next up previous index
Next: The Discussion Up: Writing on MPI Files Previous: Writing on MPI Files

   
Program mkrandpfile

So, here is the listing of the program in full glory.

/*
 * %Id: mkrandpfile.c,v 1.13 2003/10/19 19:29:59 gustav Exp %
 *
 * %Log: mkrandpfile.c,v %
 * Revision 1.13  2003/10/19 19:29:59  gustav
 * Indented the file with Emacs.
 *
 * Revision 1.12  2003/10/19 19:26:09  gustav
 * Truncated the log.
 *
 *
 */

#include <stdio.h>   /* all IO stuff lives here */
#include <stdlib.h>  /* exit lives here */
#include <unistd.h>  /* getopt lives here */
#include <string.h>  /* strcpy lives here */
#include <mpi.h>     /* MPI and MPI-IO live here */

#define MASTER_RANK 0
#define TRUE 1
#define FALSE 0
#define BOOLEAN int
#define BLOCK_SIZE 1048576
#define MBYTE 1048576
#define SYNOPSIS printf ("synopsis: %s -f <file> -l <blocks>\n", argv[0])

/*
 * mkrandpfile: each of the pool_size MPI processes writes
 * number_of_blocks blocks of BLOCK_SIZE random integers to its own,
 * non-overlapping region of a shared MPI file.
 *
 * The master (rank MASTER_RANK) parses the command line:
 *   -f <file>    name of the output file (mandatory)
 *   -l <blocks>  blocks of BLOCK_SIZE integers per process (mandatory)
 * and broadcasts the file name and the block count to the other
 * processes.  On completion the master prints the longest per-process
 * IO time and the resulting aggregate transfer rate.
 */
int main(int argc, char *argv[])
{
  /* my variables */

  int my_rank, pool_size, number_of_blocks = 0, i, count;
  BOOLEAN i_am_the_master = FALSE, input_error = FALSE;
  char *filename = NULL;   /* master: points into argv; others: malloc-ed */
  int filename_length;     /* strlen(filename) + 1, so the '\0' travels too */
  int *junk;               /* per-process buffer of random integers */
  int number_of_integers, number_of_bytes;
  long long total_number_of_integers, total_number_of_bytes;
  MPI_Offset my_offset, my_current_offset;
  MPI_File fh;
  MPI_Status status;
  double start, finish, io_time, longest_io_time;

  /* getopt variables */

  extern char *optarg;
  int c;

  /* ACTION */

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &pool_size);
  if (my_rank == MASTER_RANK) i_am_the_master = TRUE;

  if (i_am_the_master) {

    /* Read the command line.  POSIX specifies that getopt returns -1,
       not necessarily EOF, when the option list is exhausted. */

    while ((c = getopt(argc, argv, "f:l:h")) != -1)
      switch (c) {
      case 'f':
        filename = optarg;
#ifdef DEBUG
	printf("output file: %s\n", filename);
#endif
	break;
      case 'l':
	if ((sscanf (optarg, "%d", &number_of_blocks) != 1) ||
	    (number_of_blocks < 1)) {
	  SYNOPSIS;
	  input_error = TRUE;
	}
#ifdef DEBUG
	else
	  printf("each process will write %d blocks of integers\n",
		 number_of_blocks);
#endif
	break;
      case 'h':                     /* explicit cry for help ... */
      case '?':                     /* ... or an unrecognized option */
	SYNOPSIS;
	input_error = TRUE;
	break;
      }

    /* Both -f and -l are mandatory: check that the command line has
     * initialized filename and number_of_blocks.
     */

    if ((filename == NULL) || (number_of_blocks == 0)) {
      SYNOPSIS;
      input_error = TRUE;
    }

    if (input_error) MPI_Abort(MPI_COMM_WORLD, 1);
    /* This is another way of exiting, but it can be done only
       if no files have been opened yet. */

    filename_length = strlen(filename) + 1;

  } /* end of "if (i_am_the_master)"; reading the command line */

    /* If we got this far, the data read from the command line
       should be OK. */

  MPI_Bcast(&number_of_blocks, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
  MPI_Bcast(&filename_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
  if (! i_am_the_master) {
    filename = (char*) malloc(filename_length);
    if (filename == NULL) {
      fprintf(stderr, "%3d: malloc(%d) for filename failed\n",
	      my_rank, filename_length);
      MPI_Abort(MPI_COMM_WORLD, 2);
    }
  }
  MPI_Bcast(filename, filename_length, MPI_CHAR, MASTER_RANK, MPI_COMM_WORLD);
#ifdef DEBUG
  printf("%3d: received broadcast\n", my_rank);
  printf("%3d: filename = %s\n", my_rank, filename);
#endif

  number_of_integers = number_of_blocks * BLOCK_SIZE;
  number_of_bytes = sizeof(int) * number_of_integers;

  /* number_of_bytes must be just plain integer, because we are
     going to use it in malloc; the file-global totals, on the other
     hand, may exceed 2 GB and therefore live in long longs */

  total_number_of_integers =
    (long long) pool_size * (long long) number_of_integers;
  total_number_of_bytes =
    (long long) pool_size * (long long) number_of_bytes;
  my_offset = (long long) my_rank * (long long) number_of_bytes;

#ifdef DEBUG
  if (i_am_the_master) {
    printf("number_of_bytes       = %d/process\n", number_of_bytes);
    printf("total_number_of_bytes = %lld\n", total_number_of_bytes);
    /* sizeof yields a size_t, which must not be passed to %d
       directly; cast it to match the conversion specifier */
    printf("size of offset        = %d bytes\n", (int) sizeof(MPI_Offset));
  }
#endif

  /* Generate the random integers *before* opening the file, so that a
     failed malloc can still be reported with MPI_Abort while no files
     are open (see the comment next to the earlier MPI_Abort call). */

  junk = (int*) malloc(number_of_bytes);
  if (junk == NULL) {
    fprintf(stderr, "%3d: malloc(%d) for the write buffer failed\n",
	    my_rank, number_of_bytes);
    MPI_Abort(MPI_COMM_WORLD, 2);
  }
  srand(28 + my_rank);     /* a different, but reproducible, seed per rank */
  for (i = 0; i < number_of_integers; i++) *(junk + i) = rand();

  /* Open the shared file and seek to this process' own region. */

  MPI_File_open(MPI_COMM_WORLD, filename,
		MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
  MPI_File_seek(fh, my_offset, MPI_SEEK_SET);
  MPI_File_get_position(fh, &my_current_offset);
#ifdef DEBUG
  printf ("%3d: my current offset is %lld\n", my_rank, my_current_offset);
#endif

  /* write the stuff out and time the write */

  start = MPI_Wtime();
  MPI_File_write(fh, junk, number_of_integers, MPI_INT, &status);
  finish = MPI_Wtime();
  io_time = finish - start;
  MPI_Get_count(&status, MPI_INT, &count);
#ifdef DEBUG
  printf("%3d: wrote %d integers\n", my_rank, count);
#endif
  MPI_File_get_position(fh, &my_current_offset);
#ifdef DEBUG
  printf ("%3d: my current offset is %lld\n", my_rank, my_current_offset);
#endif
  MPI_File_close(&fh);

  free(junk);
  if (! i_am_the_master) free(filename);
  /* the master's filename points into argv storage and must not be freed */

  /* The aggregate bandwidth is limited by the slowest writer, hence
     the MPI_MAX reduction on the per-process IO times. */

  MPI_Allreduce(&io_time, &longest_io_time, 1, MPI_DOUBLE, MPI_MAX,
		MPI_COMM_WORLD);

  if (i_am_the_master) {
    printf("longest_io_time       = %f seconds\n", longest_io_time);
    printf("total_number_of_bytes = %lld\n", total_number_of_bytes);
    printf("transfer rate         = %f MB/s\n",
	   total_number_of_bytes / longest_io_time / MBYTE);
  }

  MPI_Finalize();
  return 0;
}

Here is how the program is installed and made:

gustav@bh1 $ pwd
/N/B/gustav/src/I590/mkrandpfile
gustav@bh1 $ make install
co  RCS/Makefile,v Makefile
RCS/Makefile,v  -->  Makefile
revision 1.2
done
co  RCS/mkrandpfile.c,v mkrandpfile.c
RCS/mkrandpfile.c,v  -->  mkrandpfile.c
revision 1.10
done
mpicc -DDEBUG -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -o mkrandpfile mkrandpfile.c
install mkrandpfile /N/B/gustav/bin
gustav@bh1 $
Observe the defines, which are added to the CFLAGS:
-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
These defines inform the Linux C compiler that we are going to work with files whose size may exceed the ancient UNIX limit of 2 GB. This limit is related to the 32-bit architecture of IA32. I have it on the explicit advice of the ROMIO authors that I should use these switches for all MPI-IO compilations on IA32 systems under Linux, yet they are not MPI-IO specific. They are mentioned, instead, in various system files in /usr/include.

And here is how the program is run. The program generates quite a lot of output when run in the DEBUG mode, of course, this apart from the file it writes, which can be very large indeed.

First I want to show you how the program captures error on the command line:

gustav@bh1 $ pwd
/N/gpfs/gustav/mkrandpfile
gustav@bh1 $ ls
gustav@bh1 $ mpiexec -n 8 mkrandpfile -l 10
each process will write 10 blocks of integers
synopsis: mkrandpfile -f <file> -l <blocks>
ABORT - process 0: application called MPI_ABORT
rank 0 in job 40  bh1_46074   caused collective abort of all ranks
  exit status of rank 0: return code 1 
gustav@bh1 $
The ABORT message is triggered by the call to MPI_Abort. In this case process of rank 0 is the only one that has called MPI_Abort, but this is enough to take all other processes down too. The return code of the master process, i.e., process of rank 0, is returned in the ABORT message.

Now let us create a relatively small 640 MB file with mkrandpfile:

gustav@bh1 $ mpiexec -n 8 mkrandpfile -f test -l 20
output file: test
each process will write 20 blocks of integers
  0: received broadcast
  0: filename = test
number_of_bytes       = 83886080/process
total_number_of_bytes = 671088640
size of offset        = 8 bytes
  4: received broadcast
  4: filename = test
  6: received broadcast
  6: filename = test
  7: received broadcast
  7: filename = test
  1: received broadcast
  1: filename = test
  2: received broadcast
  2: filename = test
  5: received broadcast
  5: filename = test
  3: received broadcast
  3: filename = test
  0: my current offset is 0
  3: my current offset is 251658240
  2: my current offset is 167772160
  4: my current offset is 335544320
  7: my current offset is 587202560
  5: my current offset is 419430400
  6: my current offset is 503316480
  1: my current offset is 83886080
  2: wrote 20971520 integers
  2: my current offset is 251658240
  5: wrote 20971520 integers
  5: my current offset is 503316480
  4: wrote 20971520 integers
  4: my current offset is 419430400
  1: wrote 20971520 integers
  1: my current offset is 167772160
  6: wrote 20971520 integers
  6: my current offset is 587202560
  3: wrote 20971520 integers
  3: my current offset is 335544320
  7: wrote 20971520 integers
  7: my current offset is 671088640
  0: wrote 20971520 integers
  0: my current offset is 83886080
longest_io_time       = 5.781979 seconds
total_number_of_bytes = 671088640
transfer rate         = 110.688746 MB/s
gustav@bh1 $
First observe the rather nice IO bandwidth of 110 MB/s. The bandwidth is so high because the file is small and we end up writing it to memory caches rather than to disk. This program does not flush the file. We will discuss the other messages, about offsets and which process wrote how many integers, in the next section.

And now we are going to create a somewhat larger 32GB file. This is going to take a little longer, the program will run on 32 nodes, and I am going to shorten the rather verbose output a little.

gustav@bh1 $ pwd  
/N/gpfs/gustav/mkrandpfile
gustav@bh1 $ rm test
gustav@bh1 $ mpiexec -n 32 mkrandpfile -f test -l 256

output file: test
each process will write 256 blocks of integers
number_of_bytes       = 1073741824/process
total_number_of_bytes = 34359738368
size of offset        = 8 bytes

[...]

  0: my current offset is 0
  1: my current offset is 1073741824
  2: my current offset is 2147483648
  3: my current offset is 3221225472
  4: my current offset is 4294967296

[...]

  0: wrote 268435456 integers
  0: my current offset is 1073741824
  1: wrote 268435456 integers
  1: my current offset is 2147483648
  2: wrote 268435456 integers
  2: my current offset is 3221225472
  3: wrote 268435456 integers
  3: my current offset is 4294967296
  4: wrote 268435456 integers
  4: my current offset is 5368709120

[...]

longest_io_time       = 509.006609 seconds
total_number_of_bytes = 34359738368
transfer rate         = 64.376374 MB/s
gustav@bh1 $
This time the IO rate has dropped to 64 MB/s, which is what we saw when we wrote a lot of data with mkrandfiles. This transfer rate is limited by the disk array. The amount of data, 1 GB/node, is too large to hide in memory buffers.


next up previous index
Next: The Discussion Up: Writing on MPI Files Previous: Writing on MPI Files
Zdzislaw Meglicki
2004-04-29