Handling UNIX Errors

In this section we are going to discuss a parallel version of the program mkrandfile introduced in section 3.4.3. The parallel version is called mkrandfiles (plural) and it tries to capture and process possible errors just as diligently as its sequential counterpart does. The program will also prepare us for the issues raised by MPI-IO; you can think of it as a prelude to MPI-IO.

Here is the program itself:

/*
 * %Id: mkrandfiles.c,v 1.6 2003/10/19 19:02:09 gustav Exp %
 *
 * %Log: mkrandfiles.c,v %
 * Revision 1.6  2003/10/19 19:02:09  gustav
 * Forgot to initialize basename to NULL.
 *
 * Revision 1.5  2003/10/19 18:58:59  gustav
 * Corrected reading the command line.
 *
 * Revision 1.4  2003/10/13 22:49:45  gustav
 * Moved the debug messages into the if clauses.
 *
 * Revision 1.3  2003/10/13 22:46:36  gustav
 * Added more debug messages.
 *
 * Revision 1.2  2003/10/13 22:41:36  gustav
 * Finished.
 *
 * Revision 1.1  2003/10/13 21:18:00  gustav
 * Initial revision
 *
 *
 */

#include <stdio.h>   /* all IO stuff lives here */
#include <stdlib.h>  /* exit lives here */
#include <unistd.h>  /* getopt lives here */
#include <errno.h>   /* UNIX error handling lives here */
#include <string.h>  /* strcpy lives here */
#include <mpi.h>     /* MPI and MPI-IO live here */

#define MASTER_RANK 0
#define TRUE 1
#define FALSE 0
#define BOOLEAN int
#define BLOCK_SIZE 1048576
#define SYNOPSIS printf ("synopsis: %s -f <file> -l <blocks>\n", argv[0])

int main(int argc, char *argv[])
{
  /* my variables */

  int my_rank, pool_size, number_of_blocks = 0, block, i;
  BOOLEAN i_am_the_master = FALSE, input_error = FALSE, 
    my_file_open_error = FALSE, file_open_error = FALSE,
    my_write_error = FALSE, write_error = FALSE;
  char *basename = NULL, file_name[BUFSIZ], message[BUFSIZ];
  int basename_length, junk[BLOCK_SIZE];
  FILE *fp;
  double start, finish, io_time = 0.0;

  /* getopt variables */

  extern char *optarg;
  int c;

  /* error handling variables */

  extern int errno;

  /* ACTION */

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &pool_size);
  if (my_rank == MASTER_RANK) i_am_the_master = TRUE;

  if (i_am_the_master) {

    /* read the command line */

    while ((c = getopt(argc, argv, "f:l:h")) != EOF)
      switch(c) {
      case 'f': 
        basename = optarg;
	break;
      case 'l': 
	if ((sscanf (optarg, "%d", &number_of_blocks) != 1) ||
	    (number_of_blocks < 1)) 
	  input_error = TRUE;
	break;
      case 'h':
	input_error = TRUE;
	break;
      case '?':
	input_error = TRUE;
	break;
      }

    /* Check if the command line has initialized basename and
     * number_of_blocks.
     */

    if ((basename == NULL) || (number_of_blocks == 0)) input_error = TRUE;

    if (input_error)
       SYNOPSIS;
    else {
      basename_length = strlen(basename) + 1;
#ifdef DEBUG
      printf("basename         = %s\n", basename);
      printf("basename_length  = %d\n", basename_length);
      printf("number_of_blocks = %d\n", number_of_blocks);
#endif
    }
  }

  /* Transmit the effect of reading the command line to other
     processes. */

  MPI_Bcast(&input_error, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);

  if (! input_error) {
    MPI_Bcast(&number_of_blocks, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    MPI_Bcast(&basename_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    if (! i_am_the_master) basename = (char*) malloc(basename_length);
    MPI_Bcast(basename, basename_length, MPI_CHAR, MASTER_RANK, MPI_COMM_WORLD);

#ifdef DEBUG
    printf("%3d: basename = %s, number_of_blocks = %d\n", 
	   my_rank, basename, number_of_blocks);
#endif

    /* Now every process creates its own file name and attempts
       to open the file. */

    sprintf(file_name, "%s.%d", basename, my_rank);

#ifdef DEBUG
    printf("%3d: opening file %s\n", my_rank, file_name);
#endif

    if (! (fp = fopen(file_name, "w"))) {
      sprintf(message, "%3d: %s", my_rank, file_name);
      perror(message);
      my_file_open_error = TRUE;
    }

    /* Now we must ALL check that NOBODY had problems
       with opening the file. */

    MPI_Allreduce (&my_file_open_error, &file_open_error, 1, MPI_INT, 
		   MPI_LOR, MPI_COMM_WORLD);

#ifdef DEBUG
    if (i_am_the_master)
      if (file_open_error)
	fprintf(stderr, "problem opening output files\n");
#endif

    /* If all files are open for writing, write to them */

    if (! file_open_error) {
      srand(28 + my_rank);
      for (block = 0; (block < number_of_blocks) && !my_write_error; block++) {
	for (i = 0; i < BLOCK_SIZE; junk[i++] = rand());
        start = MPI_Wtime();
	if (fwrite(junk, sizeof(int), BLOCK_SIZE, fp) != BLOCK_SIZE) {
	  sprintf(message, "%3d: %s", my_rank, file_name);
	  perror(message);
	  my_write_error = TRUE;
	}
        finish = MPI_Wtime();
        io_time += finish - start;
      }
        
      /* Check if anybody had problems writing on the file */

      MPI_Allreduce (&my_write_error, &write_error, 1, MPI_INT,
		     MPI_LOR, MPI_COMM_WORLD);

#ifdef DEBUG
      if (i_am_the_master)
	if (write_error)
	  fprintf(stderr, "problem writing on files\n");
#endif
      if (i_am_the_master)
        if (!write_error)
          printf("io_time = %f\n", io_time);

    }

    /* Only processes that were successful opening the files
       need do close them here */

    if (!my_file_open_error) {
      fclose(fp);
#ifdef DEBUG
      printf ("%3d: closed %s\n", my_rank, file_name);
#endif
    }

    /* If we have either write errors or file open errors,
       then processes that managed to open their files
       are requested to throw them away */

    if ((write_error || file_open_error) && !my_file_open_error) {
      unlink(file_name);
#ifdef DEBUG
      printf("%3d: unlinked %s\n", my_rank, file_name);
#endif
    }

    /* We don't try to capture unlink or fclose errors here,
       because there is little we could do about them. */
       
  } 

  MPI_Finalize();
  exit(0);
}

Now let us discuss what the program does and how it goes about it.

The program begins the same way all other MPI programs do. All processes find out about the size of the pool and their own rank within it. Then it falls to the master process, i.e., the process of rank zero, to read the command line using function getopt. The reading goes much the same as in mkrandfile, but this time we don't call exit whenever we encounter an error. Instead we have a boolean variable input_error, which is set to FALSE by default, and whenever the master process encounters a problem while reading the command line, instead of exiting it sets input_error to TRUE.
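
Here is the relevant fragment of the listing:

    while ((c = getopt(argc, argv, "f:l:h")) != EOF)
      switch(c) {
      case 'f': 
        basename = optarg;
        break;
      case 'l': 
        if ((sscanf (optarg, "%d", &number_of_blocks) != 1) ||
            (number_of_blocks < 1)) 
          input_error = TRUE;
        break;
      case 'h':
        input_error = TRUE;
        break;
      case '?':
        input_error = TRUE;
        break;
      }

    if ((basename == NULL) || (number_of_blocks == 0)) input_error = TRUE;
The final check catches the case when -f or -l has not been given on the command line at all.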

The command line expects the same options as before: -f followed by the name of the file, -l followed by the number of blocks that will be written to the file, and -h if the user asks for quick help. But the file name is treated a little differently, because this is a parallel program and without MPI-IO we cannot make all processes write to the same file. Instead each process appends its rank number to the name obtained from the command line and opens a different file. For example, if the program is invoked with

-f test
then the files that will be opened will be called test.0, test.1, test.2 and so on.
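Each process builds its file name by appending its rank to the basename with a single sprintf, as in the listing above:

    sprintf(file_name, "%s.%d", basename, my_rank);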

The master process checks that everything on the command line has been properly specified and sets input_error accordingly. The value of input_error is then broadcast to all processes:

  MPI_Bcast(&input_error, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
and the rest of the program is one large
  if (! input_error) {
     blah... blah... blah...
  }
clause. What follows the clause is MPI_Finalize. In other words, if any input errors are encountered by the master process, nobody does anything. All processes go directly to MPI_Finalize, and the whole program exits cleanly.

Now, if there have been no input errors on the command line, we may still encounter various other errors and the logic of the program has to flow around them.

The if (! input_error) { clause begins with some broadcasts and a malloc that transmit number_of_blocks and basename (i.e., the word from which the file names will be constructed by appending the rank numbers to it) to all processes:
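
    MPI_Bcast(&number_of_blocks, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    MPI_Bcast(&basename_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    if (! i_am_the_master) basename = (char*) malloc(basename_length);
    MPI_Bcast(basename, basename_length, MPI_CHAR, MASTER_RANK, MPI_COMM_WORLD);
Observe that basename_length is known to the master process only, so it has to be broadcast before the other processes can malloc space for basename. On having received this data, the processes construct their file names and then try to open the files for writing. This is how they go about it: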

    if (! (fp = fopen(file_name, "w"))) {
      sprintf(message, "%3d: %s", my_rank, file_name);
      perror(message);
      my_file_open_error = TRUE;
    }
A problem may arise at this stage, but if it does, the affected process should not just exit. Instead the process sets its own boolean variable my_file_open_error to TRUE (its default value is FALSE).

Now the logic of the program handles such errors as follows. We check whether any of the processes failed to open its file by calling MPI_Allreduce with the reduction operation set to logical OR:

    /* Now we must ALL check that NOBODY had problems
       with opening the file. */

    MPI_Allreduce (&my_file_open_error, &file_open_error, 1, MPI_INT, 
                   MPI_LOR, MPI_COMM_WORLD);
If any instance of my_file_open_error is TRUE, everybody's instance of file_open_error will be TRUE too. But by default file_open_error is FALSE.
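
The same pattern, stripped of everything else, looks like this. The following is a small, self-contained example, not part of mkrandfiles, in which every process contributes a local error flag and MPI_Allreduce with MPI_LOR tells all of them whether anybody failed:

#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
  int my_rank, my_error, any_error;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  /* Pretend that the process of rank 2 has run into a problem. */
  my_error = (my_rank == 2);

  /* Logical OR over all processes: any_error becomes TRUE (1) everywhere
     if at least one my_error is TRUE (1). */
  MPI_Allreduce(&my_error, &any_error, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);

  if (any_error && my_rank == 0)
    fprintf(stderr, "at least one process reported an error\n");

  MPI_Finalize();
  return 0;
}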

If any of the processes failed to open its file, we don't compute anything and we don't write anything either. Instead, the processes that opened their files successfully close them and then delete them:

    if (! file_open_error) {
       blah... blah... blah...
    }
    /* Only processes that were successful opening the files
       need do close them here */

    if (!my_file_open_error) {
      fclose(fp);
#ifdef DEBUG
      printf ("%3d: closed %s\n", my_rank, file_name);
#endif
    }

    /* If we have either write errors or file open errors,
       then processes that managed to open their files
       are requested to throw them away */

    if ((write_error || file_open_error) && !my_file_open_error) {
      unlink(file_name);
#ifdef DEBUG
      printf("%3d: unlinked %s\n", my_rank, file_name);
#endif
    }

But if all processes managed to open their files without problems, we commence the computation, which works the same way it did in the sequential version of mkrandfile...

    if (! file_open_error) {
      srand(28 + my_rank);
      for (block = 0; (block < number_of_blocks) && !my_write_error; block++) {
        for (i = 0; i < BLOCK_SIZE; junk[i++] = rand());
        start = MPI_Wtime();
        if (fwrite(junk, sizeof(int), BLOCK_SIZE, fp) != BLOCK_SIZE) {
          sprintf(message, "%3d: %s", my_rank, file_name);
          perror(message);
          my_write_error = TRUE;
        }
        finish = MPI_Wtime();
        io_time += finish - start;
      }
... with one difference. If a process fails to write to its file, it sets the variable my_write_error to TRUE. The for statement checks the value of my_write_error at the top of the loop:
for (block = 0; (block < number_of_blocks) && !my_write_error; block++)
and will not execute the next iteration if an error has been detected.

Now, if there has been an error, then this information needs to be exchanged with other processes. So we call MPI_Allreduce again:

      /* Check if anybody had problems writing on the file */

      MPI_Allreduce (&my_write_error, &write_error, 1, MPI_INT,
                     MPI_LOR, MPI_COMM_WORLD);
If any instance of my_write_error is TRUE, then every instance of write_error is going to be TRUE too.

The way the program handles this condition is to discard all data. After the files have been closed, they are unlinked by this statement:

    if ((write_error || file_open_error) && !my_file_open_error) {
      unlink(file_name);
#ifdef DEBUG
      printf("%3d: unlinked %s\n", my_rank, file_name);
#endif
    }

After all these travails, whether successful or unsuccessful, the processes meet at MPI_Finalize and exit cleanly. The error conditions are reported on standard error by the perror calls. We could rewrite the program so that an appropriate exit code would be set as well, depending on the type of error encountered.
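
For example, the end of main could be rewritten along the following lines. This is only a sketch, not part of the program as listed, and the particular exit code values are arbitrary:

  /* Hypothetical ending of main: translate the error flags, which every
     process knows after the broadcasts and reductions above, into an
     exit code.  The code values chosen here are arbitrary. */
  MPI_Finalize();
  if (input_error)          exit(1);
  else if (file_open_error) exit(2);
  else if (write_error)     exit(3);
  else exit(0);
Whether and how the exit statuses of the individual MPI processes are reported back to the user depends on the process manager that runs the job.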

The program is compiled with -DDEBUG added to the CFLAGS so that we can see how it goes about detecting and handling errors. Without this flag the program runs silently and only the specific error messages are printed on standard error.

So here is the compilation and installation:

gustav@bh1 $ pwd
/N/B/gustav/src/I590/mkrandfiles
gustav@bh1 $ make install
co  RCS/Makefile,v Makefile
RCS/Makefile,v  -->  Makefile
revision 1.2
done
co  RCS/mkrandfiles.c,v mkrandfiles.c
RCS/mkrandfiles.c,v  -->  mkrandfiles.c
revision 1.4
done
mpicc -DDEBUG -o mkrandfiles mkrandfiles.c
install mkrandfiles /N/B/gustav/bin
gustav@bh1 $

Now let us run a few tests to see how the program is going to behave. First we are going to test for incorrect input on the command line:

gustav@bh1 $ mpdboot
gustav@bh1 $ mpdtrace | wc
     32      32     159
gustav@bh1 $ mpiexec -n 16 mkrandfiles
synopsis: mkrandfiles -f <file> -l <blocks>
gustav@bh1 $ mpiexec -n 16 mkrandfiles -f test -l -7
output files basename: test
synopsis: mkrandfiles -f <file> -l <blocks>
gustav@bh1 $
Now we are going to check the behaviour of the program if one of the processes fails to open its output file. To do so I am going to create a file test.7 in my working directory and will set permissions on this file to -r--r--r--. Then I'll invoke the program with the -f test option:
gustav@bh1 $ touch test.7
gustav@bh1 $ chmod 444 test.7
gustav@bh1 $ mpiexec -n 8 mkrandfiles -f test -l 7
output files basename: test
each process will write 7 blocks of integers
  0: basename = test, number_of_blocks = 7
  0: opening file test.0
  4: basename = test, number_of_blocks = 7
  4: opening file test.4
  2: basename = test, number_of_blocks = 7
  2: opening file test.2
  5: basename = test, number_of_blocks = 7
  5: opening file test.5
  3: basename = test, number_of_blocks = 7
  3: opening file test.3
  7: basename = test, number_of_blocks = 7
  7: opening file test.7
  1: basename = test, number_of_blocks = 7
  1: opening file test.1
  7: test.7: Permission denied
  6: basename = test, number_of_blocks = 7
  6: opening file test.6
problem opening output files
  0: closed test.0
  3: closed test.3
  4: closed test.4
  0: unlinked test.0
  4: unlinked test.4
  3: unlinked test.3
  1: closed test.1
  1: unlinked test.1
  6: closed test.6
  2: closed test.2
  5: closed test.5
  6: unlinked test.6
  5: unlinked test.5
  2: unlinked test.2
gustav@bh1 $
Every process writes quite rich diagnostics on standard output. You can see that they all received the basename and number_of_blocks. They all managed to construct their file names without problems, but process number 7 failed to open its file, wrote the diagnostics on standard error and passed this information to all other processes, so that the master process could write problem opening output files on standard output:
  7: opening file test.7
...
  7: test.7: Permission denied
...
problem opening output files
Then all processes that managed to open their files, i.e., all but process 7, close them and then unlink them. After the run no files called test.? are left in the directory, with the exception of test.7:
gustav@bh1 $ ls
PBS  bin  man  mpd.hosts  src  test.7  tmp
gustav@bh1 $
Now let me recompile the program without debugging, so that we can use it to test IO on the AVIDD GPFS:
gustav@bh1 $ make install
co  RCS/Makefile,v Makefile
RCS/Makefile,v  -->  Makefile
revision 1.3
done
co  RCS/mkrandfiles.c,v mkrandfiles.c
RCS/mkrandfiles.c,v  -->  mkrandfiles.c
revision 1.4
done
mpicc  -o mkrandfiles mkrandfiles.c
install mkrandfiles /N/B/gustav/bin
gustav@bh1 $
This time I am going to run the program on GPFS:
gustav@bh1 $ cd /N/gpfs/gustav/mkrandfiles
gustav@bh1 $ mpiexec -n 32 mkrandfiles -f test -l 100
io_time = 202.484912
gustav@bh1 $ ls -l
total 13107200
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.0
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.1
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.10
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.11
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.12
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:04 test.13
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.14
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.15
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.16
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.17
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.18
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.19
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.2
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.20
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.21
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.22
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.23
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.24
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.25
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.26
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.27
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.28
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.29
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:04 test.3
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.30
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.31
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:04 test.4
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.5
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.6
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:04 test.7
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.8
-rw-rw-rw-    1 gustav   ucs      419430400 Oct 13 19:05 test.9
gustav@bh1 $
The program wrote, in parallel, 32 files, each 400 MB long (100 blocks of 4 MB each), i.e., 12.5 GB altogether. The IO itself took about 202 seconds, which yields an aggregate data transfer rate on writes to GPFS of about 63 MB/s.

Would we be better off using fewer processes perhaps? This is easy to check:

gustav@bh1 $ ls
test.0   test.12  test.16  test.2   test.23  test.27  test.30  test.6
test.1   test.13  test.17  test.20  test.24  test.28  test.31  test.7
test.10  test.14  test.18  test.21  test.25  test.29  test.4   test.8
test.11  test.15  test.19  test.22  test.26  test.3   test.5   test.9
gustav@bh1 $ rm *
gustav@bh1 $ mpiexec -n 16 mkrandfiles -f test -l 200
io_time = 226.157398
gustav@bh1 $
Well, this is the same amount of data, but it took a little longer, and we got a data transfer rate on writes of about 57 MB/s this time. But observe the following:
gustav@bh1 $ rm -f *
gustav@bh1 $ mpiexec -n 32 mkrandfiles -f test -l 4
io_time = 2.663388
gustav@bh1 $
Here we created 32 files, each 16 MB long (4 blocks of 4 MB each), which is 512 MB in total, and it took us only 2.663388 seconds to write them. This yields a data transfer rate on this parallel write to GPFS of about 190 MB/s, sic! How would you explain this result?

When we get to study MPI-IO, you will learn how to write data from 32 processes to a single file, shared amongst them all.

