next up previous index
Next: Create a Compressed Dataset Up: Property Lists Previous: Create an MPI-IO File

   
Create an Extendible Dataset

In our previous examples, the datasets were of fixed pre-defined size. HDF5 lets you define datasets that can grow. But datasets like these have to be chunked, in other words, if they are to grow, you must allow for the data itself to be physically scattered around the file in chunks.

The following program, which is a little on the large side (by the standards of this tutorial, which tries to keep programming examples very short), illustrates how to create and then manipulate such an extendible dataset. So you will see a few more new elements in it - but, read on, I'll explain it all. The program is taken from the NCSA HDF5 Tutorial.

Here is the program:

#include "hdf5.h"
#include <stdio.h>   /* printf is used below; do not rely on hdf5.h including it */

#define FILE        "ext.h5"
#define DATASETNAME "ExtendibleArray" 
#define RANK         2

/*
 * Create a chunked, extendible 2-D dataset, write a 3x3 block of ones,
 * extend the dataset to 10x3, write a 7x1 column of twos, then reopen
 * the file and read the whole dataset back.
 *
 * Uses the HDF5 1.6-era API (H5Dcreate/H5Dopen with short argument
 * lists, H5Dextend) to stay consistent with the tutorial text.
 */
int
main (void)
{
    hid_t       file;                          /* file handle */
    hid_t       dataspace, dataset;            /* memory dataspace / dataset */
    hid_t       filespace;                     /* copy of the dataset's file dataspace */
    hid_t       cparms;                        /* dataset creation property list */
    hid_t       memspace;                      /* memory dataspace used when reading */

    hsize_t      dims[2]  = { 3, 3};           /* dataset dimensions			
                                                  at creation time */
    hsize_t      dims1[2] = { 3, 3};           /* data1 dimensions */ 
    hsize_t      dims2[2] = { 7, 1};           /* data2 dimensions */  

    hsize_t      maxdims[2] = {H5S_UNLIMITED, H5S_UNLIMITED};
    hsize_t      size[2];
    hssize_t     offset[2];                    /* hssize_t: 1.6-era hyperslab offset type */
    hsize_t      i,j;
    herr_t       status, status_n;                             
    int          data1[3][3] = { {1, 1, 1},      /* data to write */
                                 {1, 1, 1},
                                 {1, 1, 1} };      

    int          data2[7]    = { 2, 2, 2, 2, 2, 2, 2};

    /* Variables used in reading data back */
    hsize_t      chunk_dims[2] ={2, 5};
    hsize_t      chunk_dimsr[2];
    hsize_t      dimsr[2];
    int          data_out[10][3];
    int          rank, rank_chunk;

    /* Create the data space with unlimited maximum dimensions: the
       dataset starts out 3x3 but may grow without bound. */
    dataspace = H5Screate_simple (RANK, dims, maxdims); 

    /* Create a new file. If file exists its contents will be overwritten. */
    file = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

    /* Extendible datasets must be chunked: enable 2x5 chunking in the
       dataset creation property list. */
    cparms = H5Pcreate (H5P_DATASET_CREATE);
    status = H5Pset_chunk ( cparms, RANK, chunk_dims);

    /* Create a new dataset within the file using cparms
       creation properties.  */
    dataset = H5Dcreate (file, DATASETNAME, H5T_NATIVE_INT, dataspace,
                         cparms);

    /* Extend the dataset. This call assures that dataset is 3 x 3
       (a no-op here since the dataspace was created 3x3). */
    size[0]   = 3; 
    size[1]   = 3; 
    status = H5Dextend (dataset, size);

    /* Select the 3x3 hyperslab at the origin of the file dataspace. */
    filespace = H5Dget_space (dataset);
    offset[0] = 0;
    offset[1] = 0;
    status = H5Sselect_hyperslab (filespace, H5S_SELECT_SET, offset, NULL,
                                  dims1, NULL);  

    /* Write data1 through the 3x3 memory dataspace into the hyperslab. */
    status = H5Dwrite (dataset, H5T_NATIVE_INT, dataspace, filespace,
                       H5P_DEFAULT, data1);

    /* Close the first file-space copy and the original memory
       dataspace before acquiring new ones -- the original version of
       this program leaked both handles here. */
    status = H5Sclose (filespace);
    status = H5Sclose (dataspace);

    /* Extend the dataset for real this time. Dataset becomes 10 x 3. */
    dims[0]   = dims1[0] + dims2[0];
    size[0]   = dims[0];  
    size[1]   = dims[1]; 
    status = H5Dextend (dataset, size);

    /* Select the 7x1 hyperslab covering rows 4-10 of the first column. */
    filespace = H5Dget_space (dataset);
    offset[0] = 3;
    offset[1] = 0;
    status = H5Sselect_hyperslab (filespace, H5S_SELECT_SET, offset, NULL,
                                  dims2, NULL);  

    /* Define a 7x1 memory space matching data2. */
    dataspace = H5Screate_simple (RANK, dims2, NULL); 

    /* Write data2 to the newly selected hyperslab. */
    status = H5Dwrite (dataset, H5T_NATIVE_INT, dataspace, filespace,
                       H5P_DEFAULT, data2);

    /* Close resources (cparms was leaked in the original version). */
    status = H5Pclose (cparms);
    status = H5Dclose (dataset);
    status = H5Sclose (dataspace);
    status = H5Sclose (filespace);
    status = H5Fclose (file);

/****************************************************************
    Read the data back 
 ***************************************************************/

    file = H5Fopen (FILE, H5F_ACC_RDONLY, H5P_DEFAULT);
    dataset = H5Dopen (file, DATASETNAME);

    /* Query the current rank and dimensions of the dataset. */
    filespace = H5Dget_space (dataset);
    rank = H5Sget_simple_extent_ndims (filespace);
    status_n = H5Sget_simple_extent_dims (filespace, dimsr, NULL);

    /* Retrieve the chunk dimensions from the creation property list.
       This is informational only; the values are not used below. */
    cparms = H5Dget_create_plist (dataset);
    if (H5D_CHUNKED == H5Pget_layout (cparms))
    {
       rank_chunk = H5Pget_chunk (cparms, 2, chunk_dimsr);
    }

    /* Read the entire dataset into data_out and print it. */
    memspace = H5Screate_simple (rank,dimsr,NULL);
    status = H5Dread (dataset, H5T_NATIVE_INT, memspace, filespace,
                      H5P_DEFAULT, data_out);
    printf("\n");
    printf("Dataset: \n");
    for (j = 0; j < dimsr[0]; j++)
    {
       for (i = 0; i < dimsr[1]; i++)
           printf("%d ", data_out[j][i]);
       printf("\n");
    }

    status = H5Pclose (cparms);
    status = H5Dclose (dataset);
    status = H5Sclose (filespace);
    status = H5Sclose (memspace);
    status = H5Fclose (file);

    return 0;
}
This is a sequential program, so it is compiled, linked and run in the usual HDF5 way:
gustav@bh1 $ h5cc -o h5_extend h5_extend.c
gustav@bh1 $ ./h5_extend

Dataset: 
1 1 1 
1 1 1 
1 1 1 
2 0 0 
2 0 0 
2 0 0 
2 0 0 
2 0 0 
2 0 0 
2 0 0 
gustav@bh1 $
But when you run h5dump on the HDF5 data file generated by this program you will notice something new:
gustav@bh1 $ h5dump ext.h5
HDF5 "ext.h5" {
GROUP "/" {
   DATASET "ExtendibleArray" {
      DATATYPE  H5T_STD_I32LE
      DATASPACE  SIMPLE { ( 10, 3 ) / ( H5S_UNLIMITED, H5S_UNLIMITED ) }
      DATA {
         1, 1, 1,
         1, 1, 1,
         1, 1, 1,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0
      }
   }
}
}
gustav@bh1 $
The new element to notice is the annotation:
      DATASPACE  SIMPLE { ( 10, 3 ) / ( H5S_UNLIMITED, H5S_UNLIMITED ) }
which says that the dataset currently of size $10\times3$ can grow to unlimited size. Previously we would have an annotation such as:
      DATASPACE  SIMPLE { ( 40, 60 ) / ( 40, 60 ) }
which says that this particular dataset is of size $40\times60$ and cannot be resized.

The above program resizes the dataset twice, before the dataset itself and then the file are closed.

So, let us discuss the program now.

The program begins with the creation of a simple $3\times3$ dataspace, which is allowed to grow without limits:

#define RANK         2
...
    hsize_t      dims[2]  = { 3, 3}; 
...
    hsize_t      maxdims[2] = {H5S_UNLIMITED, H5S_UNLIMITED};
...
    dataspace = H5Screate_simple (RANK, dims, maxdims);
Then we create the new HDF5 file in the usual way, assuming default property lists:
    file = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
and then we get down to generating the property list for the dataset, which specifies the chunking:
#define RANK         2
...
hsize_t      chunk_dims[2] ={2, 5};
...
    cparms = H5Pcreate (H5P_DATASET_CREATE);
    status = H5Pset_chunk ( cparms, RANK, chunk_dims);
Function  H5Pset_chunk takes three arguments: the first one is the property list identifier, then the rank of the array that specifies the chunking. Here we have a two-dimensional data space, and so our chunk array has to be two-dimensional too. The chunking in the first dimension is going to be 2 (of elementary data items) and the chunking in the second dimension is going to be 5 (of elementary data items).

Now we create the dataset itself:

#define DATASETNAME "ExtendibleArray"
...
    dataset = H5Dcreate (file, DATASETNAME, H5T_NATIVE_INT, dataspace,
                         cparms);
and here we also say that the data in the dataset should be stored in the native integer format. When we looked at the file itself though, we saw DATATYPE H5T_STD_I32LE there, even though the program does not specify this anywhere explicitly. From this we infer that IA32's native format is little-endian.

This call:

    size[0]   = 3; 
    size[1]   = 3; 
    status = H5Dextend (dataset, size);
is not really necessary, because we have already defined our dataspace to be $3\times3$, but it doesn't hurt either. We are merely confirming the dataset to be $3\times3$ here.

Now the program embarks on manipulations that are a little too soon in our course, because they refer to hyperslab selection. Here is what happens. The first call to  H5Dget_space

    filespace = H5Dget_space (dataset);
returns an identifier for a copy of the dataspace for the dataset. In this space we are now going to select a hyperslab by calling function  H5Sselect_hyperslab
   hsize_t      dims1[2] = { 3, 3}; 
...
    offset[0] = 0;
    offset[1] = 0;
    status = H5Sselect_hyperslab (filespace, H5S_SELECT_SET, offset, NULL,
                                  dims1, NULL);
This function call goes to the dataspace pointed to by the filespace identifier and selects a portion of data, which is $3\times3$ and begins at the beginning of the dataset, because the offsets are both zero. The fourth parameter, which is NULL, is used to define stride and the last parameter, which is NULL too, is used to define the size of the block. In both cases NULL invokes defaults, i.e., the stride is 1 and the block size is 1 too. With this function, as you see, you can pick up portions of data from the dataset, but in this case we are just selecting all there is at the moment, because our dataspace at present is $3\times3$.

Now we write the data to the dataset:

    int          data1[3][3] = { {1, 1, 1},      /* data to write */
                                 {1, 1, 1},
                                 {1, 1, 1} };      
...
    status = H5Dwrite (dataset, H5T_NATIVE_INT, dataspace, filespace,
                       H5P_DEFAULT, data1);
Well, if this was not an extendible dataset, we could not write to it any more, because it would be already full. So now we are going to extend it, for real this time:
    hsize_t      dims[2]  = { 3, 3};           
    hsize_t      dims1[2] = { 3, 3};  
    hsize_t      dims2[2] = { 7, 1};  
...
    dims[0]   = dims1[0] + dims2[0];
    size[0]   = dims[0];  
    size[1]   = dims[1]; 
    status = H5Dextend (dataset, size);
This time the dataset is going to be $10\times3$. We have already filled its top $3\times3$ portion with data. Now we want to select a hyperslab that covers the as yet unused space and write something there:
    hsize_t      dims2[2] = { 7, 1};  
...
    filespace = H5Dget_space (dataset);
    offset[0] = 3;
    offset[1] = 0;
    status = H5Sselect_hyperslab (filespace, H5S_SELECT_SET, offset, NULL,
                                  dims2, NULL);
We create new dataspace for this new hyperslab and then write new data into it:
#define RANK         2
    hsize_t      dims2[2] = { 7, 1};  
    int          data2[7]    = { 2, 2, 2, 2, 2, 2, 2};
...
    dataspace = H5Screate_simple (RANK, dims2, NULL); 
    status = H5Dwrite (dataset, H5T_NATIVE_INT, dataspace, filespace,
                       H5P_DEFAULT, data2);
Observe that this dataspace is $7\times1$, i.e., we are going to write on the first column only, and only on rows 4 through 10. Before I go any further, let me again bring back the h5dump of the file:
HDF5 "ext.h5" {
GROUP "/" {
   DATASET "ExtendibleArray" {
      DATATYPE  H5T_STD_I32LE
      DATASPACE  SIMPLE { ( 10, 3 ) / ( H5S_UNLIMITED, H5S_UNLIMITED ) }
      DATA {
         1, 1, 1,
         1, 1, 1,
         1, 1, 1,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0,
         2, 0, 0
      }
   }
}
}
As you see we have indeed written ones on the top $3\times3$ part of the space and then added a column of twos on the left-hand side of the data space.

Having done all this, we close the dataset, then the dataspace, then the filespace and finally the file itself.

The second part of the program opens the file for reading, then opens the dataset and then extracts information about it as follows:

    file = H5Fopen (FILE, H5F_ACC_RDONLY, H5P_DEFAULT);
    dataset = H5Dopen (file, DATASETNAME);
    filespace = H5Dget_space (dataset);
    rank = H5Sget_simple_extent_ndims (filespace);
    status_n = H5Sget_simple_extent_dims (filespace, dimsr, NULL);
The two new functions here are H5Sget_simple_extent_ndims  and H5Sget_simple_extent_dims . The first one returns the dimensionality of the dataspace and the second one returns the size in each dimension, here into dimsr, as well as the maximum size in each dimension; passing NULL for the latter means that we don't care about it.

Now we are going to enquire about the data layout on the file by extracting the property list from the dataset and then checking the list itself with  H5Pget_layout

    cparms = H5Dget_create_plist (dataset);
    if (H5D_CHUNKED == H5Pget_layout (cparms))
    {
       rank_chunk = H5Pget_chunk (cparms, 2, chunk_dimsr);
    }
Function  H5Dget_create_plist returns an identifier for a copy of the dataset creation property list for the dataset. Once we have the list we check if the data was chunked and if it was we retrieve the size of chunks by calling function  H5Pget_chunk. The function returns chunking on the last parameter, here it is chunk_dimsr.

Having performed these interrogations, only to show how it's done, because we don't really do anything with the returned data about chunking in this program, we get down to reading the data itself. First we create the memory space, then read the data itself and print it on standard output:

    memspace = H5Screate_simple (rank,dimsr,NULL);
    status = H5Dread (dataset, H5T_NATIVE_INT, memspace, filespace,
                      H5P_DEFAULT, data_out);
    printf("\n");
    printf("Dataset: \n");
    for (j = 0; j < dimsr[0]; j++)
    {
       for (i = 0; i < dimsr[1]; i++)
           printf("%d ", data_out[j][i]);
       printf("\n");
    }

Finally we have to close all that's opened before exiting the program:

    status = H5Pclose (cparms);
    status = H5Dclose (dataset);
    status = H5Sclose (filespace);
    status = H5Sclose (memspace);
    status = H5Fclose (file);
}


next up previous index
Next: Create a Compressed Dataset Up: Property Lists Previous: Create an MPI-IO File
Zdzislaw Meglicki
2004-04-29