next up previous index
Next: The Discussion Up: Reading from MPI Files Previous: Reading from MPI Files

Program xrandpfile

This is the example program. It opens a file whose name has been provided on the command line. Then it finds out its length. Every process evaluates the amount of data it's going to read by dividing the size of the file by the number of processes in the pool, and allocates sufficient memory for the data. The reading itself is timed.

If any process has problems opening the file, the program aborts.

/*
 * %Id: xrandpfile.c,v 1.4 2003/10/18 21:43:05 gustav Exp %
 *
 * %Log: xrandpfile.c,v %
 * Revision 1.4  2003/10/18 21:43:05  gustav
 * integers -> bytes
 *
 * Revision 1.3  2003/10/18 21:33:35  gustav
 * Indented the program (used emacs).
 *
 * Revision 1.2  2003/10/18 20:43:24  gustav
 * Added reading of status with MPI_Get_count
 *
 * Revision 1.1  2003/10/18 19:52:44  gustav
 * Initial revision
 *
 *
 */

#include <stdio.h>   /* all IO stuff lives here */
#include <stdlib.h>  /* exit lives here */
#include <unistd.h>  /* getopt lives here */
#include <string.h>  /* strcpy lives here */
#include <limits.h>  /* INT_MAX lives here */
#include <mpi.h>     /* MPI and MPI-IO live here */

/* Rank of the process that parses the command line, acts as the root of
   the broadcasts and prints the final timing report. */
#define MASTER_RANK 0
/* Home-grown boolean type and its two values. */
#define TRUE 1
#define FALSE 0
#define BOOLEAN int
/* Divisor used to express the transfer rate in MB/s (2^20 bytes). */
#define MBYTE 1048576
/* Prints the usage line.  The expansion reads argv[0], so the macro can
   only be used where a variable called argv is in scope. */
#define SYNOPSIS printf ("synopsis: %s -f <file>\n", argv[0])

int main(argc, argv)
     int argc;
     char *argv[];
{
  /* my variables */

  int my_rank, pool_size, last_guy, i, count;
  BOOLEAN i_am_the_master = FALSE, input_error = FALSE;
  char *filename = NULL, *read_buffer;
  int filename_length;
  int *junk;
  int file_open_error, number_of_bytes;

  /* MPI_Offset is long long */

  MPI_Offset my_offset, my_current_offset, total_number_of_bytes,
    number_of_bytes_ll, max_number_of_bytes_ll;
  MPI_File fh;
  MPI_Status status;
  double start, finish, io_time, longest_io_time;

  /* getopt variables */

  extern char *optarg;
  int c;

  /* ACTION */

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &pool_size);
  last_guy = pool_size - 1;
  if (my_rank == MASTER_RANK) i_am_the_master = TRUE;

  if (i_am_the_master) {

    /* read the command line */

    while ((c = getopt(argc, argv, "f:h")) != EOF)
      switch(c) {
      case 'f': 
	filename = optarg;
#ifdef DEBUG
	printf("input file: %s\n", filename);
#endif
	break;
      case 'h':
	SYNOPSIS;
	input_error = TRUE;
	break;
      case '?':
	SYNOPSIS;
	input_error = TRUE;
	break;
      } /* end of switch(c) */

    /* Check if the command line has initialized filename and
     * number_of_blocks.
     */

    if (filename == NULL) {
      SYNOPSIS;
      input_error = TRUE;
    }

    if (input_error) MPI_Abort(MPI_COMM_WORLD, 1);

    filename_length = strlen(filename) + 1;

    /* This is another way of exiting, but it can be done only
       if no files have been opened yet. */

  } /* end of "if (i_am_the_master)"; reading the command line */

    /* If we got this far, the data read from the command line
       should be OK. */
  
  MPI_Bcast(&filename_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
  if (! i_am_the_master) filename = (char*) malloc(filename_length);
#ifdef DEBUG
  printf("%3d: allocated space for filename\n", my_rank);
#endif
  MPI_Bcast(filename, filename_length, MPI_CHAR, MASTER_RANK, MPI_COMM_WORLD);
#ifdef DEBUG
  printf("%3d: received broadcast\n", my_rank);
  printf("%3d: filename = %s\n", my_rank, filename);
#endif

  MPI_Barrier(MPI_COMM_WORLD);

  /* Default I/O error handling is MPI_ERRORS_RETURN */

  file_open_error = MPI_File_open(MPI_COMM_WORLD, filename, 
		                  MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);

  if (file_open_error != MPI_SUCCESS) {

    char error_string[BUFSIZ];
    int length_of_error_string, error_class;

    MPI_Error_class(file_open_error, &error_class);
    MPI_Error_string(error_class, error_string, &length_of_error_string);
    printf("%3d: %s\n", my_rank, error_string);

    MPI_Error_string(file_open_error, error_string, &length_of_error_string);
    printf("%3d: %s\n", my_rank, error_string);

    MPI_Abort(MPI_COMM_WORLD, file_open_error);
  }

  MPI_File_get_size(fh, &total_number_of_bytes);
#ifdef DEBUG
  printf("%3d: total_number_of_bytes = %lld\n", my_rank, total_number_of_bytes);
#endif

  number_of_bytes_ll = total_number_of_bytes / pool_size;

  /* If pool_size does not divide total_number_of_bytes evenly,
     the last process will have to read more data, i.e., to the
     end of the file. */

  max_number_of_bytes_ll = 
    number_of_bytes_ll + total_number_of_bytes % pool_size;

  if (max_number_of_bytes_ll < INT_MAX) {

    if (my_rank == last_guy)
      number_of_bytes = (int) max_number_of_bytes_ll;
    else
      number_of_bytes = (int) number_of_bytes_ll;

    read_buffer = (char*) malloc(number_of_bytes);
#ifdef DEBUG
    printf("%3d: allocated %d bytes\n", my_rank, number_of_bytes);
#endif

    my_offset = (MPI_Offset) my_rank * number_of_bytes_ll;
#ifdef DEBUG
    printf("%3d: my offset = %lld\n", my_rank, my_offset);
#endif
    MPI_File_seek(fh, my_offset, MPI_SEEK_SET);

    MPI_Barrier(MPI_COMM_WORLD);

    start = MPI_Wtime();
    MPI_File_read(fh, read_buffer, number_of_bytes, MPI_BYTE, &status);
    finish = MPI_Wtime();
    MPI_Get_count(&status, MPI_BYTE, &count);
#ifdef DEBUG
    printf("%3d: read %d bytes\n", my_rank, count);
#endif
    MPI_File_get_position(fh, &my_offset);
#ifdef DEBUG
    printf("%3d: my offset = %lld\n", my_rank, my_offset);
#endif

    io_time = finish - start;
    MPI_Allreduce(&io_time, &longest_io_time, 1, MPI_DOUBLE, MPI_MAX,
		  MPI_COMM_WORLD);
    if (i_am_the_master) {
      printf("longest_io_time       = %f seconds\n", longest_io_time);
      printf("total_number_of_bytes = %lld\n", total_number_of_bytes);
      printf("transfer rate         = %f MB/s\n", 
	     total_number_of_bytes / longest_io_time / MBYTE);
    }
  }
  else {
    if (i_am_the_master) {
      printf("Not enough memory to read the file.\n");
      printf("Consider running on more nodes.\n");
    }
  } /* of if(max_number_of_bytes_ll < INT_MAX) */

  MPI_File_close(&fh);

  MPI_Finalize();
  exit(0);
}

The program is made much the same as its sibling mkrandpfile:

/N/B/gustav/src/I590/xrandpfile
gustav@bh1 $ make install
co  RCS/Makefile,v Makefile
RCS/Makefile,v  -->  Makefile
revision 1.1
done
co  RCS/xrandpfile.c,v xrandpfile.c
RCS/xrandpfile.c,v  -->  xrandpfile.c
revision 1.3
done
mpicc -DDEBUG -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -o xrandpfile xrandpfile.c
install xrandpfile /N/B/gustav/bin
gustav@bh1 $

Now let us run this program. But first I want to show you how capturing MPI file errors works. I will ask the program to open a non-existent file for reading.

gustav@bh1 $ pwd
/N/gpfs/gustav/mkrandpfile
gustav@bh1 $ ls
test
gustav@bh1 $ mpiexec -n 8 xrandpfile -f junk
input file: junk
  0: allocated space for filename
  0: received broadcast
  0: filename = junk
  1: allocated space for filename
  1: received broadcast
  1: filename = junk
  2: allocated space for filename
  6: allocated space for filename
  2: received broadcast
  2: filename = junk
  5: allocated space for filename
  6: received broadcast
  6: filename = junk
  4: allocated space for filename
  3: allocated space for filename
  4: received broadcast
  4: filename = junk
  7: allocated space for filename
  3: received broadcast
  3: filename = junk
  7: received broadcast
  7: filename = junk
  5: received broadcast
  5: filename = junk
ABORT - process 0: application called MPI_ABORT
ABORT - process 1: application called MPI_ABORT
ABORT - process 3: application called MPI_ABORT
ABORT - process 7: application called MPI_ABORT
ABORT - process 4: application called MPI_ABORT
  0: Other I/O error 
  0: Other I/O error No such file or directory
  1: Other I/O error 
  1: Other I/O error No such file or directory
  3: Other I/O error 
  3: Other I/O error No such file or directory
ABORT - process 2: application called MPI_ABORT
ABORT - process 5: application called MPI_ABORT
ABORT - process 6: application called MPI_ABORT
  2: Other I/O error 
  2: Other I/O error No such file or directory
  5: Other I/O error 
  5: Other I/O error No such file or directory
  6: Other I/O error 
  6: Other I/O error No such file or directory
  4: Other I/O error 
  4: Other I/O error No such file or directory
  7: Other I/O error 
  7: Other I/O error No such file or directory
rank 7 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 7: return code 32 
rank 6 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 6: return code 32 
rank 5 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 5: return code 32 
rank 4 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 4: return code 32 
rank 3 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 3: return code 32 
rank 2 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 2: return code 32 
rank 1 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 1: return code 32 
rank 0 in job 43  bh1_46074   caused collective abort of all ranks
  exit status of rank 0: return code 32 
gustav@bh1 $
Observe that since every process captures the open error, every process issues the MPI_Abort call. The error class corresponds to the message:
Other I/O error
and the specific error is:
Other I/O error No such file or directory

And now let us read file test created with mkrandpfile with xrandpfile. This file is 32GB long!

gustav@bh1 $ pwd
/N/gpfs/gustav/mkrandpfile
gustav@bh1 $ ls -l
total 33554432
-rw-rw-rw-    1 gustav   ucs      34359738368 Oct 18 16:12 test
gustav@bh1 $ mpiexec -n 32 xrandpfile -f test

[... there is some simple diagnostic output here pertaining to the 
     file name ...]

  0: total_number_of_bytes = 34359738368
  0: allocated 1073741824 bytes
  0: my offset = 0
  1: total_number_of_bytes = 34359738368
  1: allocated 1073741824 bytes
  1: my offset = 1073741824
  2: total_number_of_bytes = 34359738368
  2: allocated 1073741824 bytes
  2: my offset = 2147483648
  3: total_number_of_bytes = 34359738368
  3: allocated 1073741824 bytes
  3: my offset = 3221225472
  4: total_number_of_bytes = 34359738368
  4: allocated 1073741824 bytes
  4: my offset = 4294967296

[...]

  0: read 1073741824 bytes
  0: my offset = 1073741824
  1: read 1073741824 bytes
  1: my offset = 2147483648
  2: read 1073741824 bytes
  2: my offset = 3221225472
  3: read 1073741824 bytes
  3: my offset = 4294967296
  4: read 1073741824 bytes
  4: my offset = 5368709120

[...]

longest_io_time       = 193.654002 seconds
total_number_of_bytes = 34359738368
transfer rate         = 169.209000 MB/s
 20: read 1073741824 bytes
 20: my offset = 22548578304
gustav@bh1 $
Here the transfer rate of nearly 170 MB/s is much better than the 64 MB/s we saw when mkrandpfile wrote the 32 GB of test. Neither of these two numbers is likely to get any better with the current generation and configuration of the AVIDD disk arrays. This is it. This large discrepancy between reads and writes illustrates very aptly how much slower physical writes on the magnetic media are than reads. Optical media, such as CDs and DVDs, are even worse in this respect. But the discrepancy would not be this large on, e.g., magnetic tapes, because a magnetic tape is a streaming medium. What makes writes to disk arrays so slow, apart from the physical process itself, is having to find space for the writes. Once it's been found, the head has to do quite a lot of moving around as data is written on the disk. File test is not stored on the AVIDD disk arrays contiguously.

Now let me show you on a smaller example the case when the total length of the file does not divide by the number of processes evenly. In this case one of the processes will have to read a little bit more.

gustav@bh1 $ mpiexec -n 4 mkrandpfile -f small_test -l 4
output file: small_test
each process will write 4 blocks of integers
  0: received broadcast
  0: filename = small_test
number_of_bytes       = 16777216/process
total_number_of_bytes = 67108864
size of offset        = 8 bytes
  2: received broadcast
  2: filename = small_test
  1: received broadcast
  1: filename = small_test
  3: received broadcast
  3: filename = small_test
  0: my current offset is 0
  1: my current offset is 16777216
  2: my current offset is 33554432
  3: my current offset is 50331648
  2: wrote 4194304 integers
  2: my current offset is 50331648
  3: wrote 4194304 integers
  3: my current offset is 67108864
  0: wrote 4194304 integers
  0: my current offset is 16777216
  1: wrote 4194304 integers
  1: my current offset is 33554432
longest_io_time       = 1.453121 seconds
total_number_of_bytes = 67108864
transfer rate         = 44.043134 MB/s
gustav@bh1 $ mpiexec -n 3 xrandpfile -f small_test
input file: small_test
  2: allocated space for filename
  0: allocated space for filename
  0: received broadcast
  0: filename = small_test
  1: allocated space for filename
  1: received broadcast
  1: filename = small_test
  2: received broadcast
  2: filename = small_test
  0: total_number_of_bytes = 67108864
  0: allocated 22369621 bytes
  0: my offset = 0
  2: total_number_of_bytes = 67108864
  2: allocated 22369622 bytes
  2: my offset = 44739242
  1: total_number_of_bytes = 67108864
  1: allocated 22369621 bytes
  1: my offset = 22369621
  1: read 22369621 bytes
  1: my offset = 44739242
  2: read 22369622 bytes
  2: my offset = 67108864
  0: read 22369621 bytes
  0: my offset = 22369621
longest_io_time       = 0.167047 seconds
total_number_of_bytes = 67108864
transfer rate         = 383.125653 MB/s
gustav@bh1 $
Observe that process number 2 reads 22369622 bytes, whereas processes 0 and 1 read 22369621 bytes. In this case $3 \times 22369621 = 67108863$, which is one byte less than the length of the file, 67108864 bytes. So process number 2 has to stretch just this little byte farther.


next up previous index
Next: The Discussion Up: Reading from MPI Files Previous: Reading from MPI Files
Zdzislaw Meglicki
2004-04-29