Implemented a method to reformat a given column in a data table holding datetime information into a desired datetime format. During data integration, this will serve to normalize datetime formats across data tables.
This commit is contained in:
@ -80,14 +80,27 @@ class HDF5DataOpsManager():
|
|||||||
self.file_obj[group_name].create_dataset(dataset_dict['name'], data=dataset_dict['data'])
|
self.file_obj[group_name].create_dataset(dataset_dict['name'], data=dataset_dict['data'])
|
||||||
self.file_obj[group_name][dataset_dict['name']].attrs.update(dataset_dict['attributes'])
|
self.file_obj[group_name][dataset_dict['name']].attrs.update(dataset_dict['attributes'])
|
||||||
|
|
||||||
def reformat_datetime_column(self, dataset_name, column_name, src_format, desired_format = '%Y-%m-%d %H:%M:%S.%f'):
|
def reformat_datetime_column(self, dataset_name, column_name, src_format, desired_format='%Y-%m-%d %H:%M:%S.%f'):
|
||||||
|
# Access the dataset
|
||||||
dataset = self.file_obj[dataset_name]
|
dataset = self.file_obj[dataset_name]
|
||||||
|
|
||||||
dt_column_data = dataset[column_name][:]
|
# Read the column data into a pandas Series and decode bytes to strings
|
||||||
|
dt_column_data = pd.Series(dataset[column_name][:]).apply(lambda x: x.decode() )
|
||||||
|
|
||||||
|
# Convert to datetime using the source format
|
||||||
|
dt_column_data = pd.to_datetime(dt_column_data, format=src_format, errors = 'coerce')
|
||||||
|
|
||||||
|
# Reformat datetime objects to the desired format as strings
|
||||||
|
dt_column_data = dt_column_data.dt.strftime(desired_format)
|
||||||
|
|
||||||
|
# Encode the strings back to bytes
|
||||||
|
#encoded_data = dt_column_data.apply(lambda x: x.encode() if not pd.isnull(x) else 'N/A').to_numpy()
|
||||||
|
|
||||||
|
# Update the dataset in place
|
||||||
|
#dataset[column_name][:] = encoded_data
|
||||||
|
|
||||||
# Convert byte strings to datetime objects
|
# Convert byte strings to datetime objects
|
||||||
timestamps = [datetime.datetime.strptime(a.decode(), src_format).strftime(desired_format) for a in dt_column_data]
|
#timestamps = [datetime.datetime.strptime(a.decode(), src_format).strftime(desired_format) for a in dt_column_data]
|
||||||
|
|
||||||
#datetime.strptime('31/01/22 23:59:59.999999',
|
#datetime.strptime('31/01/22 23:59:59.999999',
|
||||||
# '%d/%m/%y %H:%M:%S.%f')
|
# '%d/%m/%y %H:%M:%S.%f')
|
||||||
@ -109,8 +122,8 @@ class HDF5DataOpsManager():
|
|||||||
# TODO: make this a more secure operation
|
# TODO: make this a more secure operation
|
||||||
#dataset[column_name][:] = standardized_time_bytes
|
#dataset[column_name][:] = standardized_time_bytes
|
||||||
|
|
||||||
return np.array(timestamps)
|
#return np.array(timestamps)
|
||||||
|
return dt_column_data.to_numpy()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user