From 3c6065e2f903b92a66e44a5b39ad6fb06cc090ac Mon Sep 17 00:00:00 2001 From: Marc Howison Date: Fri, 16 Oct 2009 22:37:20 +0000 Subject: [PATCH] flags are now stored in H5PartFile struct; added the 'throttle' as a runtime parameter instead of a compiler flag; merged H5Bench into the tests suite; compiles but need to perform regression tests --- src/H5MultiBlock.c | 90 +++----------------------- src/H5MultiBlockErrors.h | 6 -- src/H5Part.c | 132 ++++++++++++++++++++++++++++++++++----- src/H5Part.h | 8 +++ src/H5PartErrors.h | 6 ++ src/H5PartPrivate.h | 10 +++ src/H5PartTypes.h | 5 +- test/Makefile.am | 7 +++ 8 files changed, 159 insertions(+), 105 deletions(-) diff --git a/src/H5MultiBlock.c b/src/H5MultiBlock.c index 8255372..d840278 100644 --- a/src/H5MultiBlock.c +++ b/src/H5MultiBlock.c @@ -906,29 +906,8 @@ _H5MultiBlock_read_data ( herr = _H5Block_select_hyperslab_for_reading ( f, dataset_id ); if ( herr < 0 ) return herr; -#if H5PART_THROTTLE > 0 - int ret; - int token = 1; - if (f->myproc == 0) { - _H5Part_print_info ("Using throttled reads with factor = %d", - H5PART_THROTTLE); - } - if (f->myproc % H5PART_THROTTLE > 0) { - _H5Part_print_debug_detail ( - "[%d] Waiting on read token from %d", - f->myproc, f->myproc - 1); - // wait to receive token before continuing with read - ret = MPI_Recv( - &token, 1, MPI_INT, - f->myproc - 1, // receive from previous proc - f->myproc, // use this proc id as message tag - f->comm, - MPI_STATUS_IGNORE - ); - if ( ret != MPI_SUCCESS ) return HANDLE_MPI_SENDRECV_ERR; - } - _H5Part_print_debug_detail ("[%d] Executing read", f->myproc); -#endif + herr = _H5Part_start_throttle( f ); + if (herr < 0) return herr; herr = H5Dread ( dataset_id, @@ -939,23 +918,8 @@ _H5MultiBlock_read_data ( *data ); if ( herr < 0 ) return HANDLE_H5D_READ_ERR ( field_name, f->timestep ); -#if H5PART_THROTTLE > 0 - if (f->myproc % H5PART_THROTTLE < H5PART_THROTTLE - 1) { - // pass token to next proc - if (f->myproc + 1 < f->nprocs) { - 
_H5Part_print_debug_detail ( - "[%d] Passing read token to %d", - f->myproc, f->myproc + 1); - ret = MPI_Send( - &token, 1, MPI_INT, - f->myproc + 1, // send to next proc - f->myproc + 1, // use the id of the target as tag - f->comm - ); - } - if ( ret != MPI_SUCCESS ) return HANDLE_MPI_SENDRECV_ERR; - } -#endif + herr = _H5Part_end_throttle( f ); + if (herr < 0) return herr; herr = H5Dclose ( dataset_id ); if ( herr < 0 ) return HANDLE_H5D_CLOSE_ERR; @@ -1056,30 +1020,9 @@ _H5MultiBlock_write_data ( #endif if ( dataset < 0 ) return HANDLE_H5D_CREATE_ERR ( name, f->timestep ); -#if H5PART_THROTTLE > 0 - int ret; - int token = 1; - if (f->myproc == 0) { - _H5Part_print_info ("Using throttled writes with factor = %d", - H5PART_THROTTLE); - } - if (f->myproc % H5PART_THROTTLE > 0) { - _H5Part_print_debug_detail ( - "[%d] Waiting on write token from %d", - f->myproc, f->myproc - 1); - // wait to receive token before continuing with write - ret = MPI_Recv( - &token, 1, MPI_INT, - f->myproc - 1, // receive from previous proc - f->myproc, // use this proc id as message tag - f->comm, - MPI_STATUS_IGNORE - ); - if ( ret != MPI_SUCCESS ) return HANDLE_MPI_SENDRECV_ERR; - } - _H5Part_print_debug_detail ("[%d] Executing write", f->myproc); -#endif - + herr = _H5Part_start_throttle( f ); + if (herr < 0) return herr; + herr = H5Dwrite ( dataset, type, @@ -1089,23 +1032,8 @@ _H5MultiBlock_write_data ( data ); if ( herr < 0 ) return HANDLE_H5D_WRITE_ERR ( name, f->timestep ); -#if H5PART_THROTTLE > 0 - if (f->myproc % H5PART_THROTTLE < H5PART_THROTTLE - 1) { - // pass token to next proc - if (f->myproc + 1 < f->nprocs) { - _H5Part_print_debug_detail ( - "[%d] Passing write token to %d", - f->myproc, f->myproc + 1); - ret = MPI_Send( - &token, 1, MPI_INT, - f->myproc + 1, // send to next proc - f->myproc + 1, // use the id of the target as tag - f->comm - ); - } - if ( ret != MPI_SUCCESS ) return HANDLE_MPI_SENDRECV_ERR; - } -#endif + herr = _H5Part_end_throttle( f ); + if (herr < 
0) return herr;
 
 	herr = H5Dclose ( dataset );
 	if ( herr < 0 ) return HANDLE_H5D_CLOSE_ERR;
diff --git a/src/H5MultiBlockErrors.h b/src/H5MultiBlockErrors.h
index 8f0986c..bf8a491 100644
--- a/src/H5MultiBlockErrors.h
+++ b/src/H5MultiBlockErrors.h
@@ -22,12 +22,6 @@
 		H5PART_ERR_MPI, \
 		"Cannot create/commit/free strided vector MPI datatype." );
 
-#define HANDLE_MPI_SENDRECV_ERR \
-	(*_err_handler) ( \
-		_H5Part_get_funcname(), \
-		H5PART_ERR_MPI, \
-		"Unable to perform point-to-point MPI send/receive." );
-
 #define HANDLE_MPI_INT64_ERR \
 	(*_err_handler) ( \
 		_H5Part_get_funcname(), \
diff --git a/src/H5Part.c b/src/H5Part.c
index 9967a0f..672f158 100644
--- a/src/H5Part.c
+++ b/src/H5Part.c
@@ -147,6 +147,9 @@ _H5Part_open_file (
 	}
 	memset (f, 0, sizeof (H5PartFile));
 
+	f->flags = flags;
+	f->throttle = 0;
+
 	f->groupname_step = strdup ( H5PART_GROUPNAME_STEP );
 	if( f->groupname_step == NULL ) {
 		HANDLE_H5PART_NOMEM_ERR;
@@ -201,9 +204,13 @@ _H5Part_open_file (
 			goto error_cleanup;
 		}
 		if (flags & H5PART_VFD_MPIIO_IND) {
-			_H5Part_print_info ( "Using independent mode" );
+			if (f->myproc == 0) {
+				_H5Part_print_info ( "Using independent mode" );
+			}
 		} else {
-			_H5Part_print_info ( "Using collective mode" );
+			if (f->myproc == 0) {
+				_H5Part_print_info ( "Using collective mode" );
+			}
 			f->xfer_prop = H5Pcreate (H5P_DATASET_XFER);
 			if (f->xfer_prop < 0) {
 				HANDLE_H5P_CREATE_ERR;
@@ -762,6 +769,9 @@ _write_data (
 	if ( dataset_id < 0 )
 		return HANDLE_H5D_CREATE_ERR ( name, f->timestep );
 
+	herr = _H5Part_start_throttle( f );
+	if (herr < 0) return herr;
+
 	herr = H5Dwrite (
 		dataset_id,
 		type,
@@ -770,6 +780,9 @@ _write_data (
 		f->xfer_prop,
 		array );
 
+	if ( herr < 0 ) return HANDLE_H5D_WRITE_ERR ( name, f->timestep );
+	herr = _H5Part_end_throttle( f ); // must not clobber herr before the write-error check
+	if (herr < 0) return herr;
 
 	if ( herr < 0 ) return HANDLE_H5D_WRITE_ERR ( name, f->timestep );
 
 	herr = H5Dclose ( dataset_id );
@@ -2323,7 +2336,7 @@ h5part_int64_t
 H5PartHasView (
 	H5PartFile *f		/*!< [in]  Handle to open file */
 	) {
 
-	SET_FNAME ( "H5PartResetView" );
+	SET_FNAME ( "H5PartHasView" );
 
CHECK_FILEHANDLE( f );
 	CHECK_READONLY_MODE ( f );
 
@@ -2387,7 +2400,7 @@ _set_view (
 		return HANDLE_H5S_CREATE_SIMPLE_ERR ( total );
 
 	/* declare overall data size but then will select a subset */
-	f->diskshape= H5Screate_simple ( 1, &total, &total );
+	f->diskshape = H5Screate_simple ( 1, &total, &total );
 	if ( f->diskshape < 0 )
 		return HANDLE_H5S_CREATE_SIMPLE_ERR ( total );
 
@@ -2598,17 +2611,9 @@ _read_data (
 	memspace_id = _get_memshape_for_reading ( f, dataset_id );
 	if ( memspace_id < 0 ) return (h5part_int64_t)memspace_id;
 
-#ifdef INDEPENDENT_IO
-	herr = H5Dread (
-		dataset_id,
-		type,
-		memspace_id,		/* shape/size of data in memory (the
-					   complement to disk hyperslab) */
-		space_id,		/* shape/size of data on disk
-					   (get hyperslab if needed) */
-		H5P_DEFAULT,		/* ignore... its for parallel reads */
-		array );
-#else
+	herr = _H5Part_start_throttle( f );
+	if (herr < 0) return herr;
+
 	herr = H5Dread (
 		dataset_id,
 		type,
@@ -2618,7 +2623,9 @@
 					   (get hyperslab if needed) */
 		f->xfer_prop,		/* ignore... its for parallel reads */
 		array );
-#endif
+	if ( herr < 0 ) return HANDLE_H5D_READ_ERR ( name, f->timestep );
+	herr = _H5Part_end_throttle( f ); // must not clobber herr before the read-error check
+	if (herr < 0) return herr;
 
 	if ( herr < 0 ) return HANDLE_H5D_READ_ERR ( name, f->timestep );
 
@@ -2820,7 +2827,98 @@ H5PartReadParticleStep (
 	return H5PART_SUCCESS;
 }
 
-/****************** error handling ******************/
+/************ error handling and configuration ************/
+
+/*!
+  \ingroup h5part_errhandle
+
+  Set the `throttle` factor, which causes HDF5 write and read
+  calls to be issued in that number of batches.
+
+  This can prevent highly concurrent parallel applications that
+  use independent writes from overwhelming the underlying
+  parallel file system.
+
+  Throttling only works with the H5PART_VFD_MPIPOSIX or
+  H5PART_VFD_MPIIO_IND drivers.
+
+  \return \c H5PART_SUCCESS
+*/
+h5part_int64_t
+H5PartSetThrottle (
+	H5PartFile *f,
+	int factor
+	) {
+
+	SET_FNAME( "H5PartSetThrottle" );
+	CHECK_FILEHANDLE ( f );
+
+	if (f->flags & H5PART_VFD_MPIIO_IND || f->flags & H5PART_VFD_MPIPOSIX) {
+		f->throttle = factor;
+	} else {
+		_H5Part_print_warn ("Throttling is not permitted in MPI-IO collective mode.");
+	}
+
+	return H5PART_SUCCESS;
+}
+
+h5part_int64_t
+_H5Part_start_throttle (
+	H5PartFile *f
+	) {
+
+	if (f->throttle > 0) {
+		int ret;
+		int token = 1;
+		if (f->myproc == 0) {
+			_H5Part_print_info ("Throttling with factor = %d",
+				f->throttle);
+		}
+		if (f->myproc % f->throttle > 0) {
+			_H5Part_print_debug_detail (
+				"[%d] throttle: waiting on token from %d",
+				f->myproc, f->myproc - 1);
+			// wait to receive token before continuing with I/O
+			ret = MPI_Recv(
+				&token, 1, MPI_INT,
+				f->myproc - 1, // receive from previous proc
+				f->myproc, // use this proc id as message tag
+				f->comm,
+				MPI_STATUS_IGNORE
+				);
+			if ( ret != MPI_SUCCESS ) return HANDLE_MPI_SENDRECV_ERR;
+		}
+		_H5Part_print_debug_detail ("[%d] throttle: received token", f->myproc);
+	}
+	return H5PART_SUCCESS;
+}
+
+h5part_int64_t
+_H5Part_end_throttle (
+	H5PartFile *f
+	) {
+
+	if (f->throttle > 0) {
+		int ret = MPI_SUCCESS; // last rank sends nothing; must not test garbage
+		int token = 1;
+		if (f->myproc % f->throttle < f->throttle - 1) {
+			// pass token to next proc
+			if (f->myproc + 1 < f->nprocs) {
+				_H5Part_print_debug_detail (
+					"[%d] throttle: passing token to %d",
+					f->myproc, f->myproc + 1);
+				ret = MPI_Send(
+					&token, 1, MPI_INT,
+					f->myproc + 1, // send to next proc
+					f->myproc + 1, // use the id of the target as tag
+					f->comm
+					);
+			}
+			if ( ret != MPI_SUCCESS ) return HANDLE_MPI_SENDRECV_ERR;
+		}
+	}
+	return H5PART_SUCCESS;
+}
 
 /*!
\ingroup h5part_errhandle diff --git a/src/H5Part.h b/src/H5Part.h index 7ccc50c..4b9e9e7 100644 --- a/src/H5Part.h +++ b/src/H5Part.h @@ -349,11 +349,19 @@ H5PartReadFileAttrib ( void *value ); +/*============ Error Reporting and Configuration =============*/ + h5part_int64_t H5PartSetVerbosityLevel ( const h5part_int64_t level ); +h5part_int64_t +H5PartSetThrottle ( + H5PartFile *f, + int factor + ); + h5part_int64_t H5PartSetErrorHandler ( const h5part_error_handler handler diff --git a/src/H5PartErrors.h b/src/H5PartErrors.h index 0877e1e..422b21d 100644 --- a/src/H5PartErrors.h +++ b/src/H5PartErrors.h @@ -336,6 +336,12 @@ H5PART_ERR_MPI, \ "Cannot gather data." ); +#define HANDLE_MPI_SENDRECV_ERR \ + (*_err_handler) ( \ + _H5Part_get_funcname(), \ + H5PART_ERR_MPI, \ + "Unable to perform point-to-point MPI send/receive." ); + #define HANDLE_MPI_COMM_SIZE_ERR \ (*_err_handler) ( \ _H5Part_get_funcname(), \ diff --git a/src/H5PartPrivate.h b/src/H5PartPrivate.h index 198e6a2..ea460e4 100644 --- a/src/H5PartPrivate.h +++ b/src/H5PartPrivate.h @@ -122,6 +122,16 @@ _H5Part_have_group ( const char *name ); +h5part_int64_t +_H5Part_start_throttle ( + H5PartFile *f + ); + +h5part_int64_t +_H5Part_end_throttle ( + H5PartFile *f + ); + void _H5Part_vprint_error ( const char *fmt, diff --git a/src/H5PartTypes.h b/src/H5PartTypes.h index 29f61d0..6606f7a 100644 --- a/src/H5PartTypes.h +++ b/src/H5PartTypes.h @@ -19,7 +19,7 @@ __attribute__ ((format (printf, 3, 4))) #endif ; -#ifndef PARALLEL_IO +#ifndef MPI_INCLUDED typedef unsigned long MPI_Comm; #endif @@ -79,6 +79,9 @@ struct H5PartFile { */ MPI_Comm comm; + char flags; + int throttle; + struct H5BlockStruct *block; h5part_int64_t (*close_block)(struct H5PartFile *f); diff --git a/test/Makefile.am b/test/Makefile.am index 30343f8..80e02a9 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -53,6 +53,7 @@ EXTRA_DIST = Bench.c \ H5testFpar.F90 \ H5BlockParTestScalarField.c H5BlockParTestScalarFieldF.F90 \ 
H5BlockTestAttributes.c H5BlockTestAttributesF.F90 \ + H5PartBench.c H5BlockBench.c \ $(bin_SCRIPTS) # Specific building instruction (What compilers to use...) @@ -61,6 +62,12 @@ EXTRA_DIST = Bench.c \ Bench: Bench.c $(CC) $(CFLAGS) $(INC) -o $@ $< $(H5PLIB) -lH5Part $(LIBS) +H5PartBench: H5PartBench.c + $(CC) $(CFLAGS) $(INC) -o $@ $< $(H5PLIB) -lpH5Part $(LIBS) + +H5BlockBench: H5BlockBench.c + $(CC) $(CFLAGS) $(INC) -o $@ $< $(H5PLIB) -lpH5Part $(LIBS) + ############################################################################### H5PartTest: H5PartTest.o