Implemented a method to reformat a given column in a data table holding datetime info into a desired datetime format. During data integration, this will serve to normalize datetime formats across data tables.

This commit is contained in:
2024-08-16 08:08:28 +02:00
parent 062a688f47
commit bb250e9940

View File

@ -80,14 +80,27 @@ class HDF5DataOpsManager():
self.file_obj[group_name].create_dataset(dataset_dict['name'], data=dataset_dict['data'])
self.file_obj[group_name][dataset_dict['name']].attrs.update(dataset_dict['attributes'])
def reformat_datetime_column(self, dataset_name, column_name, src_format, desired_format = '%Y-%m-%d %H:%M:%S.%f'):
def reformat_datetime_column(self, dataset_name, column_name, src_format, desired_format='%Y-%m-%d %H:%M:%S.%f'):
# Access the dataset
dataset = self.file_obj[dataset_name]
dt_column_data = dataset[column_name][:]
# Read the column data into a pandas Series and decode bytes to strings
dt_column_data = pd.Series(dataset[column_name][:]).apply(lambda x: x.decode() )
# Convert to datetime using the source format
dt_column_data = pd.to_datetime(dt_column_data, format=src_format, errors = 'coerce')
# Reformat datetime objects to the desired format as strings
dt_column_data = dt_column_data.dt.strftime(desired_format)
# Encode the strings back to bytes
#encoded_data = dt_column_data.apply(lambda x: x.encode() if not pd.isnull(x) else 'N/A').to_numpy()
# Update the dataset in place
#dataset[column_name][:] = encoded_data
# Convert byte strings to datetime objects
timestamps = [datetime.datetime.strptime(a.decode(), src_format).strftime(desired_format) for a in dt_column_data]
#timestamps = [datetime.datetime.strptime(a.decode(), src_format).strftime(desired_format) for a in dt_column_data]
#datetime.strptime('31/01/22 23:59:59.999999',
# '%d/%m/%y %H:%M:%S.%f')
@ -109,8 +122,8 @@ class HDF5DataOpsManager():
# TODO: make this a more secure operation
#dataset[column_name][:] = standardized_time_bytes
return np.array(timestamps)
#return np.array(timestamps)
return dt_column_data.to_numpy()