From bb250e99401842af333cdcfa92d6c2c022e172ca Mon Sep 17 00:00:00 2001 From: Florez Ospina Juan Felipe Date: Fri, 16 Aug 2024 08:08:28 +0200 Subject: [PATCH] Implemented method to reformat a given column in a data table holding datetime info into a desired datetime format. During data integration this will serve to normalize datetime formats across data tables --- src/hdf5_data_extraction.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/hdf5_data_extraction.py b/src/hdf5_data_extraction.py index 32b8b1a..76607b1 100644 --- a/src/hdf5_data_extraction.py +++ b/src/hdf5_data_extraction.py @@ -80,14 +80,27 @@ class HDF5DataOpsManager(): self.file_obj[group_name].create_dataset(dataset_dict['name'], data=dataset_dict['data']) self.file_obj[group_name][dataset_dict['name']].attrs.update(dataset_dict['attributes']) - def reformat_datetime_column(self, dataset_name, column_name, src_format, desired_format = '%Y-%m-%d %H:%M:%S.%f'): - + def reformat_datetime_column(self, dataset_name, column_name, src_format, desired_format='%Y-%m-%d %H:%M:%S.%f'): + # Access the dataset dataset = self.file_obj[dataset_name] - - dt_column_data = dataset[column_name][:] - + + # Read the column data into a pandas Series and decode bytes to strings + dt_column_data = pd.Series(dataset[column_name][:]).apply(lambda x: x.decode() ) + + # Convert to datetime using the source format + dt_column_data = pd.to_datetime(dt_column_data, format=src_format, errors = 'coerce') + + # Reformat datetime objects to the desired format as strings + dt_column_data = dt_column_data.dt.strftime(desired_format) + + # Encode the strings back to bytes + #encoded_data = dt_column_data.apply(lambda x: x.encode() if not pd.isnull(x) else 'N/A').to_numpy() + + # Update the dataset in place + #dataset[column_name][:] = encoded_data + # Convert byte strings to datetime objects - timestamps = [datetime.datetime.strptime(a.decode(), src_format).strftime(desired_format) 
for a in dt_column_data] + #timestamps = [datetime.datetime.strptime(a.decode(), src_format).strftime(desired_format) for a in dt_column_data] #datetime.strptime('31/01/22 23:59:59.999999', # '%d/%m/%y %H:%M:%S.%f') @@ -109,8 +122,8 @@ class HDF5DataOpsManager(): # TODO: make this a more secure operation #dataset[column_name][:] = standardized_time_bytes - return np.array(timestamps) - + #return np.array(timestamps) + return dt_column_data.to_numpy()