next up previous index
Next: The Discussion Up: File Views Previous: Defining File Views in

   
Program darray.c

This program exercises a number of new MPI function calls. They are MPI_Dims_create,  MPI_Type_create_darray,  MPI_Type_extent,  MPI_Type_size,  MPI_File_set_view,  MPI_File_write_all,  and MPI_File_read_all. 

/*
 * %Id: darray.c,v 1.2 2003/10/26 22:48:13 gustav Exp %
 *
 * %Log: darray.c,v %
 * Revision 1.2  2003/10/26 22:48:13  gustav
 * Expunged any references to "info".
 *
 * Revision 1.1  2003/10/26 22:05:50  gustav
 * Initial revision
 *
 *
 */

#include <stdio.h>     /* printf and relatives live here */
#include <stdlib.h>    /* exit lives here */
#include <unistd.h>    /* getopt lives here */
#include <string.h>    /* strlen lives here */
#include <sys/types.h> /* chmod needs these two */
#include <sys/stat.h>  
#include <mpi.h>       /* all MPI stuff lives here (including MPI-IO) */

#define MASTER_RANK 0
#define TRUE 1
#define FALSE 0
#define BOOLEAN int
#define MBYTE 1048576
#define NDIMS  3
#define SIZE 512
#define SYNOPSIS printf ("synopsis: %s -f <file>\n", argv[0])

int main(argc, argv)
     int argc;
     char *argv[];
{
  /* my variables */

  int my_rank, pool_size, i, ndims, order, file_name_length,
    array_of_gsizes[NDIMS], array_of_distribs[NDIMS], 
    array_of_dargs[NDIMS], array_of_psizes[NDIMS],
    *write_buffer, write_buffer_size, count,
    *read_buffer, read_buffer_size;
  BOOLEAN i_am_the_master = FALSE, input_error = FALSE,
    file_open_error = FALSE, file_write_error = FALSE, verbose = FALSE,
    my_read_error = FALSE, read_error = FALSE;
  char *file_name = NULL, message[BUFSIZ];

  /* MPI variables */

  MPI_Offset file_size;
  MPI_File fh;
  MPI_Status status;
  MPI_Datatype file_type;
  MPI_Aint file_type_size, file_type_extent;
  int error_string_length;
  char error_string[BUFSIZ];

  /* getopt variables */

  extern char *optarg;
  int c;

  /* error handling variables */

  extern int errno;

  /* ACTION */

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &pool_size);
  if (my_rank == MASTER_RANK) i_am_the_master = TRUE;

  if (i_am_the_master) {
    while ((c = getopt(argc, argv, "f:vh")) != EOF)
      switch(c) {
      case 'f': 
        file_name = optarg;
        break;
      case 'v':
        verbose = TRUE;
        break;
      case 'h':
        input_error = TRUE;
        break;
      case '?':
        input_error = TRUE;
        break;
      }
    if (file_name == NULL) input_error = TRUE;
    if (input_error)
      SYNOPSIS;
    else {
      file_name_length = strlen(file_name) + 1;
      if (verbose) {
	printf("file_name         = %s\n", file_name);
	printf("file_name_length  = %d\n", file_name_length);
      }
    }
  } 

  MPI_Bcast(&input_error, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);

  if (! input_error) {
    MPI_Bcast(&verbose, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    MPI_Bcast(&file_name_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    if (! i_am_the_master) file_name = (char*) malloc(file_name_length);
    MPI_Bcast(file_name, file_name_length, MPI_CHAR, MASTER_RANK, 
	      MPI_COMM_WORLD);
     
    /* Prepare for calling MPI_Type_create_darray */

    ndims = NDIMS; 
    for (i = 0; i < ndims; i++) {
      array_of_gsizes[i] = SIZE;
      array_of_distribs[i] = MPI_DISTRIBUTE_BLOCK;
      array_of_dargs[i] = MPI_DISTRIBUTE_DFLT_DARG;
      array_of_psizes[i] = 0;
    }
    MPI_Dims_create(pool_size, ndims, array_of_psizes);
    order = MPI_ORDER_C;

    /* Now call MPI_Type_create_darray */

    if (verbose) {
      printf ("%3d: calling MPI_Type_create_darray with\n", my_rank);
      printf ("%3d:    pool_size         = %d\n", my_rank, pool_size);
      printf ("%3d:    my_rank           = %d\n", my_rank, my_rank);
      printf ("%3d:    ndims             = %d\n", my_rank, ndims);
      printf ("%3d:    array_of_gsizes   = (%d, %d, %d)\n", my_rank,
	      array_of_gsizes[0],
	      array_of_gsizes[1],
	      array_of_gsizes[2]);
      printf ("%3d:    array_of_distribs = (%d, %d, %d)\n", my_rank,
	      array_of_distribs[0],
	      array_of_distribs[1],
	      array_of_distribs[2]);
      printf ("%3d:    array_of_dargs    = (%d, %d, %d)\n", my_rank,
	      array_of_dargs[0],
	      array_of_dargs[1],
	      array_of_dargs[2]);
      printf ("%3d:    array_of_psizes   = (%d, %d, %d)\n", my_rank,
	      array_of_psizes[0],
	      array_of_psizes[1],
	      array_of_psizes[2]);
      printf ("%3d:    order             = %d\n", my_rank, order);
      printf ("%3d:    type              = %d\n", my_rank, MPI_INT);
    }
    MPI_Type_create_darray(pool_size, my_rank, ndims,
			   array_of_gsizes, array_of_distribs,
			   array_of_dargs, array_of_psizes, order,
			   MPI_INT, &file_type);
    MPI_Type_commit(&file_type);

    /* Explore the returned type */

    MPI_Type_extent(file_type, &file_type_extent);
    MPI_Type_size(file_type, &file_type_size);
    if (verbose) {
      printf("%3d: file_type_size   = %d\n", my_rank, file_type_size);
      printf("%3d: file_type_extent = %d\n", my_rank, file_type_extent);
    }

    /* Allocate space for your own write buffer based on the
       return of the MPI_Type_size call. */

    write_buffer_size = file_type_size / sizeof(int);
    write_buffer = (int*) malloc(write_buffer_size * sizeof(int));

    /* We do this in case sizeof(int) does not divide file_type_size 
       exactly. But this should not happen if we have called 
       MPI_Type_create_darray with MPI_INT as the original data
       type. */

    if (! write_buffer) {
      sprintf(message, "%3d: malloc write_buffer", my_rank);
      perror(message);
      MPI_Abort(MPI_COMM_WORLD, errno);

      /* We can still abort, because we have not opened any
	 files yet. Notice that since MPI_Type_create_darray 
	 will fail if SIZE^3 * sizeof(int) exceeds MAX_INT,
	 because MPI_Aint on AVIDD is a 32-bit integer,
	 we are rather unlikely to fail on this malloc
	 anyway. */
    }

    MPI_Barrier(MPI_COMM_WORLD); 
    /* We wait here in case some procs have problems with malloc. */

    /* Initialize the buffer */

    for (i = 0; i < write_buffer_size; i++) 
      *(write_buffer + i) = my_rank * SIZE + i;

    file_open_error = MPI_File_open(MPI_COMM_WORLD, file_name, 
				    MPI_MODE_CREATE | MPI_MODE_WRONLY,
				    MPI_INFO_NULL, &fh);
    if (file_open_error != MPI_SUCCESS) {
      MPI_Error_string(file_open_error, error_string,
		       &error_string_length);
      fprintf(stderr, "%3d: %s\n", my_rank, error_string);
      MPI_Abort(MPI_COMM_WORLD, file_open_error);

      /* It is still OK to abort, because we have failed to
	 open the file. */

    }
    else {

      if (i_am_the_master)
         chmod(file_name, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
      MPI_Barrier(MPI_COMM_WORLD);

      /* We should be able to change permissions on the file by setting
         the "file_perm" hint in the info structure before passing
         it to MPI_File_open, but MPICH2 doesn't support this
         yet. All processes have to meet on the barrier before 
         further action. */

      MPI_File_set_view(fh, 0, MPI_INT, file_type, "native", MPI_INFO_NULL);
      file_write_error =
	MPI_File_write_all(fh, write_buffer, write_buffer_size, MPI_INT, 
			   &status);
      if (file_write_error != MPI_SUCCESS) {
	MPI_Error_string(file_write_error, error_string,
			 &error_string_length);
	fprintf(stderr, "%3d: %s\n", my_rank, error_string);
	MPI_File_close(&fh);
	free(write_buffer);
	if (i_am_the_master) MPI_File_delete(file_name, MPI_INFO_NULL);
      }
      else {
	MPI_Get_count(&status, MPI_INT, &count);
	MPI_File_get_size(fh, &file_size);
	if(verbose) {
	  printf("%3d: wrote %d integers\n", my_rank, count);
	  printf("%3d: file size is %lld bytes\n", my_rank, file_size);
	}
	MPI_File_close(&fh);

        /* We have managed to open, write on it and close the file.
           Now we're going to read it the same way we wrote it. */

        read_buffer_size = write_buffer_size;
        read_buffer = (int*) malloc(read_buffer_size * sizeof(int));
        if (! read_buffer) {
          sprintf(message, "%3d: malloc read_buffer", my_rank);
          perror(message);
          MPI_Abort(MPI_COMM_WORLD, errno);

          /* We can abort, because the file has been closed and
             we haven't opened it for reading yet. */

        }

        MPI_Barrier(MPI_COMM_WORLD);
        /* We wait here in case some procs have problems with malloc. */

        MPI_File_open(MPI_COMM_WORLD, file_name, MPI_MODE_RDONLY,
                      MPI_INFO_NULL, &fh);

        /* We don't check for errors here, because we've just closed
           this file a moment ago, so it should still be there. */

        MPI_File_set_view(fh, 0, MPI_INT, file_type, "native", MPI_INFO_NULL);
        MPI_File_read_all(fh, read_buffer, read_buffer_size, MPI_INT, &status);
        MPI_Get_count(&status, MPI_INT, &count);
        if (verbose)
           printf("%3d: read %d integers\n", my_rank, count);
        MPI_File_close(&fh);

        /* Now check that the integers read are the same as the ones
           we wrote. */

        for (i = 0; i < read_buffer_size; i++) {
           if (*(write_buffer + i) != *(read_buffer + i)) {
              printf("%3d: data read different from data written, i = %d\n", 
                     my_rank, i);
              my_read_error = TRUE;
           }
        }
        
        MPI_Reduce (&my_read_error, &read_error, 1, MPI_INT, MPI_LOR, 
                    MASTER_RANK, MPI_COMM_WORLD);           

        if (i_am_the_master)
           if (! read_error)
              printf("--> All data read back is correct.\n");

      } /* no problem with file write */
    } /* no problem with file open */
  } /* no input error */

  MPI_Finalize();
  exit(0);
}

Here is how this program is compiled and installed:

gustav@bh1 $ pwd
/N/B/gustav/src/I590/darray
gustav@bh1 $ make install
co  RCS/Makefile,v Makefile
RCS/Makefile,v  -->  Makefile
revision 1.2
done
co  RCS/darray.c,v darray.c
RCS/darray.c,v  -->  darray.c
revision 1.1
done
mpicc -o darray darray.c
install darray /N/B/gustav/bin
gustav@bh1 $

And now here is how the program runs. It can be made to run silently, or you can ask for verbose output by invoking it with the -v option. I'll do this to show you what's going on inside. I run this program under PBS. Not because it is very heavy computationally or in terms of IO, but because I want to run it on 27 processors. The reason for this is that $27 = 3\times3\times3$, so I can have a non-trivial 3-dimensional process grid, over which I am going to distribute a 3-dimensional array ( $512\times512\times512$) of integers. Observe that 5123 = 134,217,728 does not divide by 27: $512^3 / 27 = 4971026\frac{26}{27}$. Function MPI_Type_create_darray is going to allocate 5000211 integers to 8 processes each, 4970970 integers to 12 processes each, 4941900 integers to 6 processes each and 4913000 to one process. You can easily check that $8\times 5000211 + 12\times4970970 +
6\times4941900 + 1\times4913000 = 134217728 = 512^3$.

This is my PBS job file:

gustav@bh1 $ pwd
/N/B/gustav/PBS
gustav@bh1 $ cat darray.sh
#PBS -S /bin/bash
#PBS -N darray
#PBS -o darray_out
#PBS -e darray_err
#PBS -q bg
#PBS -m a
#PBS -V
#PBS -l nodes=27
NODES=27
HOST=`hostname`
echo Local MPD console on $HOST
cd /N/gpfs/gustav/darray
rm -f test
pwd
# Specify Myrinet interfaces on the hostfile.
grep -v $HOST $PBS_NODEFILE | sed 's/$/-myri0/' > $HOME/mpd.hosts
# Boot the MPI2 engine.
which mpdboot
mpdboot --totalnum=$NODES --file=$HOME/mpd.hosts 
sleep 10
# Inspect if all MPI nodes have been activated.
which mpdtrace
mpdtrace 
# Execute your MPI program.
which mpiexec
mpiexec -n $NODES darray -f test -v
# Shut down the MPI2 engine and exit the PBS script.
which mpdallexit
mpdallexit
exit 0
gustav@bh1 $
After I have submitted it with
gustav@bh1 $ qsub darray.sh
46101.bh1.avidd.iu.edu
gustav@bh1 $ qstat | grep gustav
46101.bh1        darray           gustav                  0 R bg              
gustav@bh1 $
I got the following output written on darray_out:
gustav@bh1 $ cat darray_out
[ ... various uninteresting junk ...]
  0: calling MPI_Type_create_darray with
  0:    pool_size         = 27
  0:    my_rank           = 0
  0:    ndims             = 3
  0:    array_of_gsizes   = (512, 512, 512)
  0:    array_of_distribs = (121, 121, 121)
  0:    array_of_dargs    = (-49767, -49767, -49767)
  0:    array_of_psizes   = (3, 3, 3)
  0:    order             = 56
  0:    type              = 1275069445
  0: file_type_size   = 20000844
  0: file_type_extent = 536870912
[...]
 16: calling MPI_Type_create_darray with
 16:    pool_size         = 27
 16:    my_rank           = 16
 16:    ndims             = 3
 16:    array_of_gsizes   = (512, 512, 512)
 16:    array_of_distribs = (121, 121, 121)
 16:    array_of_dargs    = (-49767, -49767, -49767)
 16:    array_of_psizes   = (3, 3, 3)
 16:    order             = 56
 16:    type              = 1275069445
 16: file_type_size   = 19883880
 16: file_type_extent = 536870912
[...]
  0: wrote 5000211 integers
  0: file size is 536870912 bytes
 16: wrote 4970970 integers
 16: file size is 536870912 bytes
[...]
  0: read 5000211 integers
  1: read 5000211 integers
  2: read 4970970 integers
[...]
 16: read 4970970 integers
[...]
 24: read 4941900 integers
 25: read 4941900 integers
 26: read 4913000 integers
--> All data read back is correct.
gustav@bh1 $ ls -l /N/gpfs/gustav/darray
total 524288
-rw-r--r--    1 gustav   ucs      536870912 Oct 26 17:49 test
gustav@bh1 $


next up previous index
Next: The Discussion Up: File Views Previous: Defining File Views in
Zdzislaw Meglicki
2004-04-29