This is the example program. It opens a file whose name has been provided on the command line and then finds out its length. Every process works out how much data it is going to read by dividing the size of the file by the number of processes in the pool, and allocates sufficient memory for that data. The reading itself is timed.
If any process has problems opening the file, the program aborts.
/*
* %Id: xrandpfile.c,v 1.4 2003/10/18 21:43:05 gustav Exp %
*
* %Log: xrandpfile.c,v %
* Revision 1.4 2003/10/18 21:43:05 gustav
* integers -> bytes
*
* Revision 1.3 2003/10/18 21:33:35 gustav
* Indented the program (used emacs).
*
* Revision 1.2 2003/10/18 20:43:24 gustav
* Added reading of status with MPI_Get_count
*
* Revision 1.1 2003/10/18 19:52:44 gustav
* Initial revision
*
*
*/
#include <stdio.h> /* all IO stuff lives here */
#include <stdlib.h> /* exit lives here */
#include <unistd.h> /* getopt lives here */
#include <string.h> /* strcpy lives here */
#include <limits.h> /* INT_MAX lives here */
#include <mpi.h> /* MPI and MPI-IO live here */
/* Rank of the process that parses the command line and prints the results. */
#define MASTER_RANK 0
/* Home-made boolean: BOOLEAN is the type, TRUE and FALSE are its values. */
#define TRUE 1
#define FALSE 0
#define BOOLEAN int
/* Bytes per megabyte (2^20); used as the divisor for the transfer rate. */
#define MBYTE 1048576
/* Prints the usage line. Expands to an expression that reads argv[0], so it
   can only be used where argv is in scope. */
#define SYNOPSIS printf ("synopsis: %s -f <file>\n", argv[0])
int main(argc, argv)
int argc;
char *argv[];
{
/* my variables */
int my_rank, pool_size, last_guy, i, count;
BOOLEAN i_am_the_master = FALSE, input_error = FALSE;
char *filename = NULL, *read_buffer;
int filename_length;
int *junk;
int file_open_error, number_of_bytes;
/* MPI_Offset is long long */
MPI_Offset my_offset, my_current_offset, total_number_of_bytes,
number_of_bytes_ll, max_number_of_bytes_ll;
MPI_File fh;
MPI_Status status;
double start, finish, io_time, longest_io_time;
/* getopt variables */
extern char *optarg;
int c;
/* ACTION */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &pool_size);
last_guy = pool_size - 1;
if (my_rank == MASTER_RANK) i_am_the_master = TRUE;
if (i_am_the_master) {
/* read the command line */
while ((c = getopt(argc, argv, "f:h")) != EOF)
switch(c) {
case 'f':
filename = optarg;
#ifdef DEBUG
printf("input file: %s\n", filename);
#endif
break;
case 'h':
SYNOPSIS;
input_error = TRUE;
break;
case '?':
SYNOPSIS;
input_error = TRUE;
break;
} /* end of switch(c) */
/* Check if the command line has initialized filename and
* number_of_blocks.
*/
if (filename == NULL) {
SYNOPSIS;
input_error = TRUE;
}
if (input_error) MPI_Abort(MPI_COMM_WORLD, 1);
filename_length = strlen(filename) + 1;
/* This is another way of exiting, but it can be done only
if no files have been opened yet. */
} /* end of "if (i_am_the_master)"; reading the command line */
/* If we got this far, the data read from the command line
should be OK. */
MPI_Bcast(&filename_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
if (! i_am_the_master) filename = (char*) malloc(filename_length);
#ifdef DEBUG
printf("%3d: allocated space for filename\n", my_rank);
#endif
MPI_Bcast(filename, filename_length, MPI_CHAR, MASTER_RANK, MPI_COMM_WORLD);
#ifdef DEBUG
printf("%3d: received broadcast\n", my_rank);
printf("%3d: filename = %s\n", my_rank, filename);
#endif
MPI_Barrier(MPI_COMM_WORLD);
/* Default I/O error handling is MPI_ERRORS_RETURN */
file_open_error = MPI_File_open(MPI_COMM_WORLD, filename,
MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
if (file_open_error != MPI_SUCCESS) {
char error_string[BUFSIZ];
int length_of_error_string, error_class;
MPI_Error_class(file_open_error, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
printf("%3d: %s\n", my_rank, error_string);
MPI_Error_string(file_open_error, error_string, &length_of_error_string);
printf("%3d: %s\n", my_rank, error_string);
MPI_Abort(MPI_COMM_WORLD, file_open_error);
}
MPI_File_get_size(fh, &total_number_of_bytes);
#ifdef DEBUG
printf("%3d: total_number_of_bytes = %lld\n", my_rank, total_number_of_bytes);
#endif
number_of_bytes_ll = total_number_of_bytes / pool_size;
/* If pool_size does not divide total_number_of_bytes evenly,
the last process will have to read more data, i.e., to the
end of the file. */
max_number_of_bytes_ll =
number_of_bytes_ll + total_number_of_bytes % pool_size;
if (max_number_of_bytes_ll < INT_MAX) {
if (my_rank == last_guy)
number_of_bytes = (int) max_number_of_bytes_ll;
else
number_of_bytes = (int) number_of_bytes_ll;
read_buffer = (char*) malloc(number_of_bytes);
#ifdef DEBUG
printf("%3d: allocated %d bytes\n", my_rank, number_of_bytes);
#endif
my_offset = (MPI_Offset) my_rank * number_of_bytes_ll;
#ifdef DEBUG
printf("%3d: my offset = %lld\n", my_rank, my_offset);
#endif
MPI_File_seek(fh, my_offset, MPI_SEEK_SET);
MPI_Barrier(MPI_COMM_WORLD);
start = MPI_Wtime();
MPI_File_read(fh, read_buffer, number_of_bytes, MPI_BYTE, &status);
finish = MPI_Wtime();
MPI_Get_count(&status, MPI_BYTE, &count);
#ifdef DEBUG
printf("%3d: read %d bytes\n", my_rank, count);
#endif
MPI_File_get_position(fh, &my_offset);
#ifdef DEBUG
printf("%3d: my offset = %lld\n", my_rank, my_offset);
#endif
io_time = finish - start;
MPI_Allreduce(&io_time, &longest_io_time, 1, MPI_DOUBLE, MPI_MAX,
MPI_COMM_WORLD);
if (i_am_the_master) {
printf("longest_io_time = %f seconds\n", longest_io_time);
printf("total_number_of_bytes = %lld\n", total_number_of_bytes);
printf("transfer rate = %f MB/s\n",
total_number_of_bytes / longest_io_time / MBYTE);
}
}
else {
if (i_am_the_master) {
printf("Not enough memory to read the file.\n");
printf("Consider running on more nodes.\n");
}
} /* of if(max_number_of_bytes_ll < INT_MAX) */
MPI_File_close(&fh);
MPI_Finalize();
exit(0);
}
The program is built in much the same way as its sibling mkrandpfile:
/N/B/gustav/src/I590/xrandpfile gustav@bh1 $ make install co RCS/Makefile,v Makefile RCS/Makefile,v --> Makefile revision 1.1 done co RCS/xrandpfile.c,v xrandpfile.c RCS/xrandpfile.c,v --> xrandpfile.c revision 1.3 done mpicc -DDEBUG -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -o xrandpfile xrandpfile.c install xrandpfile /N/B/gustav/bin gustav@bh1 $
Now let us run this program. But first I want to show you how capturing MPI file errors works. I will ask the program to open a non-existent file for reading.
gustav@bh1 $ pwd /N/gpfs/gustav/mkrandpfile gustav@bh1 $ ls test gustav@bh1 $ mpiexec -n 8 xrandpfile -f junk input file: junk 0: allocated space for filename 0: received broadcast 0: filename = junk 1: allocated space for filename 1: received broadcast 1: filename = junk 2: allocated space for filename 6: allocated space for filename 2: received broadcast 2: filename = junk 5: allocated space for filename 6: received broadcast 6: filename = junk 4: allocated space for filename 3: allocated space for filename 4: received broadcast 4: filename = junk 7: allocated space for filename 3: received broadcast 3: filename = junk 7: received broadcast 7: filename = junk 5: received broadcast 5: filename = junk ABORT - process 0: application called MPI_ABORT ABORT - process 1: application called MPI_ABORT ABORT - process 3: application called MPI_ABORT ABORT - process 7: application called MPI_ABORT ABORT - process 4: application called MPI_ABORT 0: Other I/O error 0: Other I/O error No such file or directory 1: Other I/O error 1: Other I/O error No such file or directory 3: Other I/O error 3: Other I/O error No such file or directory ABORT - process 2: application called MPI_ABORT ABORT - process 5: application called MPI_ABORT ABORT - process 6: application called MPI_ABORT 2: Other I/O error 2: Other I/O error No such file or directory 5: Other I/O error 5: Other I/O error No such file or directory 6: Other I/O error 6: Other I/O error No such file or directory 4: Other I/O error 4: Other I/O error No such file or directory 7: Other I/O error 7: Other I/O error No such file or directory rank 7 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 7: return code 32 rank 6 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 6: return code 32 rank 5 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 5: return code 32 rank 4 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 4: 
return code 32 rank 3 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 3: return code 32 rank 2 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 2: return code 32 rank 1 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 1: return code 32 rank 0 in job 43 bh1_46074 caused collective abort of all ranks exit status of rank 0: return code 32 gustav@bh1 $
Observe that since every process captures the open error, every process issues the
MPI_Abort call. The error class
corresponds to the message: Other I/O error, and the specific error is:
Other I/O error No such file or directory
And now let us read file test created with mkrandpfile
with xrandpfile. This file is 32GB long!
gustav@bh1 $ pwd
/N/gpfs/gustav/mkrandpfile
gustav@bh1 $ ls -l
total 33554432
-rw-rw-rw- 1 gustav ucs 34359738368 Oct 18 16:12 test
gustav@bh1 $ mpiexec -n 32 xrandpfile -f test
[... there is some simple diagnostic output here pertaining to the
file name ...]
0: total_number_of_bytes = 34359738368
0: allocated 1073741824 bytes
0: my offset = 0
1: total_number_of_bytes = 34359738368
1: allocated 1073741824 bytes
1: my offset = 1073741824
2: total_number_of_bytes = 34359738368
2: allocated 1073741824 bytes
2: my offset = 2147483648
3: total_number_of_bytes = 34359738368
3: allocated 1073741824 bytes
3: my offset = 3221225472
4: total_number_of_bytes = 34359738368
4: allocated 1073741824 bytes
4: my offset = 4294967296
[...]
0: read 1073741824 bytes
0: my offset = 1073741824
1: read 1073741824 bytes
1: my offset = 2147483648
2: read 1073741824 bytes
2: my offset = 3221225472
3: read 1073741824 bytes
3: my offset = 4294967296
4: read 1073741824 bytes
4: my offset = 5368709120
[...]
longest_io_time = 193.654002 seconds
total_number_of_bytes = 34359738368
transfer rate = 169.209000 MB/s
20: read 1073741824 bytes
20: my offset = 22548578304
gustav@bh1 $
Here the transfer rate of nearly 170 MB/s is much better than
the 64 MB/s we saw when mkrandpfile wrote
the 32 GB of test. Neither of these two numbers is
likely to get any better with the current generation and configuration of
the AVIDD disk arrays. This is it. This large discrepancy between
reads and writes illustrates very aptly how much slower physical writes to
magnetic media are than reads. Optical media, such as CDs and DVDs,
are even worse in this respect. But the discrepancy would not be
this large on, e.g., magnetic tapes, because a magnetic tape is a
streaming medium. What makes writes to disk arrays so slow, apart
from the physical process itself, is having to find space for
the writes. Once it's been found, the head has to do quite a lot
of moving around as data is written on the disk. File test
is not stored on the AVIDD disk arrays contiguously.
Now let me show you on a smaller example the case when the total length of the file does not divide by the number of processes evenly. In this case one of the processes will have to read a little bit more.
gustav@bh1 $ mpiexec -n 4 mkrandpfile -f small_test -l 4 output file: small_test each process will write 4 blocks of integers 0: received broadcast 0: filename = small_test number_of_bytes = 16777216/process total_number_of_bytes = 67108864 size of offset = 8 bytes 2: received broadcast 2: filename = small_test 1: received broadcast 1: filename = small_test 3: received broadcast 3: filename = small_test 0: my current offset is 0 1: my current offset is 16777216 2: my current offset is 33554432 3: my current offset is 50331648 2: wrote 4194304 integers 2: my current offset is 50331648 3: wrote 4194304 integers 3: my current offset is 67108864 0: wrote 4194304 integers 0: my current offset is 16777216 1: wrote 4194304 integers 1: my current offset is 33554432 longest_io_time = 1.453121 seconds total_number_of_bytes = 67108864 transfer rate = 44.043134 MB/s gustav@bh1 $ mpiexec -n 3 xrandpfile -f small_test input file: small_test 2: allocated space for filename 0: allocated space for filename 0: received broadcast 0: filename = small_test 1: allocated space for filename 1: received broadcast 1: filename = small_test 2: received broadcast 2: filename = small_test 0: total_number_of_bytes = 67108864 0: allocated 22369621 bytes 0: my offset = 0 2: total_number_of_bytes = 67108864 2: allocated 22369622 bytes 2: my offset = 44739242 1: total_number_of_bytes = 67108864 1: allocated 22369621 bytes 1: my offset = 22369621 1: read 22369621 bytes 1: my offset = 44739242 2: read 22369622 bytes 2: my offset = 67108864 0: read 22369621 bytes 0: my offset = 22369621 longest_io_time = 0.167047 seconds total_number_of_bytes = 67108864 transfer rate = 383.125653 MB/s gustav@bh1 $Observe that process number 2 reads 22369622 bytes, whereas processes 0 and 1 read 22369621 bytes. In this case