"""Helpers for listing the datasets of an HDF5 file and loading them as DataFrames."""

import os

import numpy as np
import pandas as pd


def _decode_bytes_columns(df):
    """Decode, in place, every byte-string column of *df* as UTF-8.

    Columns that do not hold byte strings (for example integer or float
    fields of a compound dataset) are left untouched; calling the ``.str``
    accessor on them would raise ``AttributeError``.
    """
    for column_name in df.columns:
        column = df[column_name]
        if pd.api.types.infer_dtype(column) == 'bytes':
            df[column_name] = column.str.decode('utf-8')


def read_dataset_as_dataframe(hdf, dataset_name):
    """Load the dataset ``hdf[dataset_name]`` into a pandas DataFrame.

    Post-processing is selected by substring match on *dataset_name*:

    * names containing ``'categorical'``: byte-string columns are decoded
      as UTF-8 and, if a ``'timestamps'`` column exists, it is parsed with
      ``pd.to_datetime(..., yearfirst=True)``;
    * otherwise, names containing ``'numerical'``: every column is passed
      through ``pd.to_numeric``;
    * any other name: the raw DataFrame is returned unchanged.

    Parameters
    ----------
    hdf
        Object indexable by *dataset_name* whose items expose ``shape``,
        ``dtype`` and ``read_direct(array)`` -- presumably an open
        ``h5py.File`` or ``h5py.Group``.
    dataset_name : str
        Path of the dataset inside *hdf*.

    Returns
    -------
    pandas.DataFrame
    """
    dataset = hdf[dataset_name]
    # Read straight into a preallocated buffer of the dataset's own dtype.
    data = np.empty(dataset.shape, dtype=dataset.dtype)
    dataset.read_direct(data)
    df = pd.DataFrame(data)

    if 'categorical' in dataset_name:
        _decode_bytes_columns(df)
        if 'timestamps' in df.columns:
            df['timestamps'] = pd.to_datetime(df['timestamps'], yearfirst=True)
    elif 'numerical' in dataset_name:
        df = df.apply(pd.to_numeric)

    return df


def extract_filename(dataset_name_path):
    """Return the second-to-last ``'/'``-separated component of a path.

    For ``'a/b/c'`` this is ``'b'`` (the name of the group that contains the
    final component).  A path without any ``'/'`` is returned unchanged.
    """
    parts = dataset_name_path.split('/')
    # str.split always yields at least one element, so parts[-1] is safe.
    return parts[-2] if len(parts) > 1 else parts[-1]


def list_datasets(hdf5_filename_path):
    """Return the names of all datasets found in an HDF5 file.

    The file is opened read-only and walked with ``visititems``; the name of
    each ``h5py.Dataset`` encountered is appended to the result and reported
    on stdout, split into its head and tail parts.

    Parameters
    ----------
    hdf5_filename_path : str
        Path of the HDF5 file to inspect.

    Returns
    -------
    list of str
        Dataset names, relative to the file root, in visiting order.
    """
    # Imported here because this is the only function that needs h5py; the
    # other helpers stay importable without it.
    import h5py

    list_of_datasets = []

    def collect_dataset(name, obj):
        # Returning None (implicitly) tells visititems to keep walking.
        if isinstance(obj, h5py.Dataset):
            list_of_datasets.append(name)
            head, tail = os.path.split(name)
            print(f'Adding dataset: head: {head} tail: {tail}')

    with h5py.File(hdf5_filename_path, 'r') as file:
        file.visititems(collect_dataset)

    return list_of_datasets