
Chunking

In this last MPI-IO/HDF5 example we are again going to write data from four processes to the $8\times4$ dataset, each process writing its rank number plus 1, much as in the previous example, but this time we'll write the data in blocks:

\begin{displaymath}
\begin{array}{cccc}
1 & 1 & 2 & 2 \\
1 & 1 & 2 & 2 \\
1 & 1 & 2 & 2 \\
1 & 1 & 2 & 2 \\
3 & 3 & 4 & 4 \\
3 & 3 & 4 & 4 \\
3 & 3 & 4 & 4 \\
3 & 3 & 4 & 4
\end{array}
\end{displaymath}

Observe that although at first glance this partitioning does not seem very different from what we had in our first MPI-IO/HDF5 example:

\begin{displaymath}
\begin{array}{cccc}
1 & 1 & 1 & 1 \\
1 & 1 & 1 & 1 \\
2 & 2 & 2 & 2 \\
2 & 2 & 2 & 2 \\
3 & 3 & 3 & 3 \\
3 & 3 & 3 & 3 \\
4 & 4 & 4 & 4 \\
4 & 4 & 4 & 4
\end{array}
\end{displaymath}

it is, in fact, very different. Because C stores matrices contiguously by rows, the latter pattern can be achieved simply by dividing the dataset into four contiguous portions and assigning each to a separate process. This is, in fact, the most efficient way of partitioning a dataset, and you should stick to it unless you really have to subject yourself to the kind of equilibristics we have been discussing in the last three sections (including this one).
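
For comparison, here is roughly what that contiguous row partitioning boils down to. This is only a sketch, not part of this example; it reuses the NX, NY, mpi_rank and filespace names from the program below and assumes four processes:

    /* Sketch only: contiguous row partitioning of the 8x4 dataset.
     * Each of the 4 processes selects NX/4 = 2 whole rows; passing NULL
     * for stride and block makes both default to 1 in each dimension. */
    hsize_t  count[2];
    hssize_t offset[2];
    count[0]  = NX / 4;               /* 2 rows per process */
    count[1]  = NY;                   /* all 4 columns      */
    offset[0] = mpi_rank * (NX / 4);  /* rows 0, 2, 4 or 6  */
    offset[1] = 0;
    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);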

So, the partition we are going to implement in the following example will not be contiguous.

Before I get to the program itself, let me show you how to compile, link and run it:

gustav@bh1 $ pwd
/N/B/gustav/src/I590/HDF5/mpi-io
gustav@bh1 $ h5cc -o hyperslab_by_chunk hyperslab_by_chunk.c 
gustav@bh1 $ mv hyperslab_by_chunk ~/bin
gustav@bh1 $ cd
gustav@bh1 $ mpdboot

gustav@bh1 $ 
gustav@bh1 $ mpdtrace | wc
     20      20      99
gustav@bh1 $ cd /N/gpfs/gustav
gustav@bh1 $ ls
SDS_col.h5  SDS_pat.h5  SDS_row.h5  t_mpio_1wMr  test  tests
gustav@bh1 $ mpiexec -n 4 hyperslab_by_chunk
gustav@bh1 $ ls
SDS_chnk.h5  SDS_col.h5  SDS_pat.h5  SDS_row.h5  t_mpio_1wMr  test  tests
gustav@bh1 $ h5dump SDS_chnk.h5
HDF5 "SDS_chnk.h5" {
GROUP "/" {
   DATASET "IntArray" {
      DATATYPE  H5T_STD_I32LE
      DATASPACE  SIMPLE { ( 8, 4 ) / ( 8, 4 ) }
      DATA {
         1, 1, 2, 2,
         1, 1, 2, 2,
         1, 1, 2, 2,
         1, 1, 2, 2,
         3, 3, 4, 4,
         3, 3, 4, 4,
         3, 3, 4, 4,
         3, 3, 4, 4
      }
   }
}
}
gustav@bh1 $ mpdallexit
gustav@bh1 $

Here is the program, which I have taken from the NCSA HDF5 Tutorial.

/*  
 *  This example writes a dataset using chunking. Each process writes
 *  exactly one chunk.
 *  Number of processes is assumed to be 4.
 */
 
#include "hdf5.h"

#define H5FILE_NAME     "SDS_chnk.h5"
#define DATASETNAME 	"IntArray" 
#define NX     8                      /* dataset dimensions */
#define NY     4 
#define CH_NX  4                      /* chunk dimensions */
#define CH_NY  2 
#define RANK   2

int
main (int argc, char **argv)
{
    /*
     * HDF5 APIs definitions
     */ 	
    hid_t       file_id, dset_id;         /* file and dataset identifiers */
    hid_t       filespace, memspace;      /* file and memory dataspace identifiers */
    hsize_t     dimsf[2];                 /* dataset dimensions */
    hsize_t     chunk_dims[2];            /* chunk dimensions */
    int         *data;                    /* pointer to data buffer to write */
    hsize_t	count[2];	          /* hyperslab selection parameters */
    hsize_t	stride[2];
    hsize_t	block[2];
    hssize_t	offset[2];
    hid_t	plist_id;                 /* property list identifier */
    int         i;
    herr_t	status;

    /*
     * MPI variables
     */
    int mpi_size, mpi_rank;
    MPI_Comm comm  = MPI_COMM_WORLD;
    MPI_Info info  = MPI_INFO_NULL;

    /*
     * Initialize MPI
     */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(comm, &mpi_size);
    MPI_Comm_rank(comm, &mpi_rank);  
    /*
     * Exit if number of processes is not 4. 
     */
    if (mpi_size != 4) {
       printf("This example is set up to use only 4 processes\n");
       printf("Quitting...\n");
       MPI_Finalize();
       return 0;
    }
 
    /* 
     * Set up file access property list with parallel I/O access
     */
     plist_id = H5Pcreate(H5P_FILE_ACCESS);
     H5Pset_fapl_mpio(plist_id, comm, info);

    /*
     * Create a new file collectively and release property list identifier.
     */
    file_id = H5Fcreate(H5FILE_NAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id);
    H5Pclose(plist_id);
   

    /*
     * Create the dataspace for the dataset.
     */
    dimsf[0] = NX;
    dimsf[1] = NY;
    chunk_dims[0] = CH_NX;   
    chunk_dims[1] = CH_NY;   
    filespace = H5Screate_simple(RANK, dimsf, NULL); 
    memspace  = H5Screate_simple(RANK, chunk_dims, NULL); 

    /*
     * Create chunked dataset.
     */
    plist_id = H5Pcreate(H5P_DATASET_CREATE);
    H5Pset_chunk(plist_id, RANK, chunk_dims);
    dset_id = H5Dcreate(file_id, DATASETNAME, H5T_NATIVE_INT, filespace,
			plist_id);
    H5Pclose(plist_id);
    H5Sclose(filespace);

    /* 
     * Each process defines dataset in memory and writes it to the hyperslab
     * in the file.
     */
    count[0] = 1;
    count[1] = 1;
    stride[0] = 1;
    stride[1] = 1;
    block[0] = chunk_dims[0];
    block[1] = chunk_dims[1];
    if(mpi_rank == 0) {
       offset[0] = 0;
       offset[1] = 0;
    } 
    if(mpi_rank == 1) {
       offset[0] = 0;
       offset[1] = chunk_dims[1];
    }
    if(mpi_rank == 2) {
       offset[0] = chunk_dims[0];
       offset[1] = 0;
    } 
    if(mpi_rank == 3) {
       offset[0] = chunk_dims[0];
       offset[1] = chunk_dims[1];
    } 

    /*
     * Select hyperslab in the file.
     */
    filespace = H5Dget_space(dset_id);
    status = H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, stride, count, block);

    /*
     * Initialize data buffer 
     */
    data = (int *) malloc(sizeof(int)*chunk_dims[0]*chunk_dims[1]);
    for (i=0; i < (int)chunk_dims[0]*chunk_dims[1]; i++) {
        data[i] = mpi_rank + 1;
    }

    /*
     * Create property list for collective dataset write.
     */
    plist_id = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
    
    status = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace,
		      plist_id, data);
    free(data);

    /*
     * Close/release resources.
     */
    H5Dclose(dset_id);
    H5Sclose(filespace);
    H5Sclose(memspace);
    H5Pclose(plist_id);
    H5Fclose(file_id);
 
    MPI_Finalize();

    return 0;
}

We'll skip the preliminaries and land right where the filespace and the memory space are defined:

#define NX     8
#define NY     4 
#define CH_NX  4
#define CH_NY  2 
...
    dimsf[0] = NX;
    dimsf[1] = NY;
    chunk_dims[0] = CH_NX;   
    chunk_dims[1] = CH_NY;   
    filespace = H5Screate_simple(RANK, dimsf, NULL); 
    memspace  = H5Screate_simple(RANK, chunk_dims, NULL);

So the filespace itself is going to be $8\times4$ and the memory space for each process is going to be $4\times2$. Next we create the dataset and request that it be chunked, with chunks of the same size as the memory dataspaces. But this chunking of the dataset, i.e., its physical partitioning on disk, is quite unrelated to our logical partitioning, which we will still have to implement by selecting an appropriate hyperslab for each process:

#define DATASETNAME 	"IntArray" 
...
    plist_id = H5Pcreate(H5P_DATASET_CREATE);
    H5Pset_chunk(plist_id, RANK, chunk_dims);
    dset_id = H5Dcreate(file_id, DATASETNAME, H5T_NATIVE_INT, filespace,
			plist_id);
    H5Pclose(plist_id);
    H5Sclose(filespace);
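
To underline that this storage-level chunking is independent of the logical partitioning, note that the dataset could have been laid out differently, say in one-row chunks, without changing any of the hyperslab selections that follow. Here is a sketch of such an alternative creation call, not something the program above actually does:

    /* Sketch only: an alternative storage layout with one row per chunk.
     * The per-process hyperslab selections below would stay exactly the
     * same; whether such a layout is sensible for parallel I/O is a
     * separate question. */
    hsize_t alt_chunk[2];
    alt_chunk[0] = 1;
    alt_chunk[1] = NY;
    plist_id = H5Pcreate(H5P_DATASET_CREATE);
    H5Pset_chunk(plist_id, RANK, alt_chunk);
    dset_id = H5Dcreate(file_id, DATASETNAME, H5T_NATIVE_INT, filespace,
                        plist_id);
    H5Pclose(plist_id);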

Now we have to define counts, strides, block sizes and offsets for each hyperslab:

    count[0] = 1;
    count[1] = 1;
    stride[0] = 1;
    stride[1] = 1;
    block[0] = chunk_dims[0];
    block[1] = chunk_dims[1];
    if(mpi_rank == 0) {
       offset[0] = 0;
       offset[1] = 0;
    } 
    if(mpi_rank == 1) {
       offset[0] = 0;
       offset[1] = chunk_dims[1];
    }
    if(mpi_rank == 2) {
       offset[0] = chunk_dims[0];
       offset[1] = 0;
    } 
    if(mpi_rank == 3) {
       offset[0] = chunk_dims[0];
       offset[1] = chunk_dims[1];
    }

Each process is going to write just one block. The stride between blocks is 1 in each direction, i.e., no striding; we could just as well have passed NULL here and let the stride default, since a stride is not really meaningful when each process writes only one block. The block size is $4\times2$ for each process. The process of rank 0 writes its block at (0,0), the process of rank 1 at (0,2), the process of rank 2 at (4,0), and the process of rank 3 at (4,2).
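
As an aside, the four rank-specific branches above follow a simple $2\times2$ grid pattern, so the same offsets could be computed directly from the rank. This is just a sketch, not part of the program:

    /* Sketch only: rank r occupies row r/2 and column r%2 of the 2x2
     * process grid, so the if-ladder above could be replaced by: */
    offset[0] = (mpi_rank / 2) * chunk_dims[0];   /* 0 or 4 */
    offset[1] = (mpi_rank % 2) * chunk_dims[1];   /* 0 or 2 */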

Now each process is ready to select its hyperslab:

    filespace = H5Dget_space(dset_id);
    status = H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, stride, count, block);

At this stage the data gets initialized in memory, we add a collective MPI-IO write request to the transfer property list, and we write the data, whereupon the previously allocated data buffer is freed:

    data = (int *) malloc(sizeof(int)*chunk_dims[0]*chunk_dims[1]);
    for (i=0; i < (int)chunk_dims[0]*chunk_dims[1]; i++) {
        data[i] = mpi_rank + 1;
    }
    plist_id = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
    status = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace,
		      plist_id, data);
    free(data);
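
Observe that the return values of H5Sselect_hyperslab and H5Dwrite are stored in status but never inspected. In a production code you would want to check them; a minimal sketch, using the variables already defined in the program:

    /* Sketch only: abort all processes if the collective write failed. */
    if (status < 0) {
        fprintf(stderr, "Process %d: H5Dwrite failed\n", mpi_rank);
        MPI_Abort(comm, 1);
    }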

Finally we close all resources, finalize MPI, and exit:

    H5Dclose(dset_id);
    H5Sclose(filespace);
    H5Sclose(memspace);
    H5Pclose(plist_id);
    H5Fclose(file_id);
    MPI_Finalize();
    return 0;


Zdzislaw Meglicki
2004-04-29