Implemented functions for data extraction from hdf5 files.

2024-05-31 12:39:10 +02:00
parent e6de1ff55d
commit 69f3857936
1 changed files with 46 additions and 0 deletions
--- a/src/hdf5_data_extraction.py
+++ b/src/hdf5_data_extraction.py
@@ -0,0 +1,46 @@
+import h5py
+import pandas as pd
+import numpy as np
+import os
+
+
+def read_dataset_as_dataframe(hdf, dataset_name):
+    """Read one dataset from an HDF5 container into a pandas DataFrame.
+
+    The dataset is copied into a freshly allocated NumPy buffer and wrapped
+    in a DataFrame. Post-processing is selected by substring match on
+    ``dataset_name``:
+
+    * contains ``'categorical'``: byte-string values are decoded as UTF-8
+      and, if a ``'timestamps'`` column exists, it is parsed with
+      ``pd.to_datetime(..., yearfirst=True)``.
+    * contains ``'numerical'``: every column goes through ``pd.to_numeric``.
+    * otherwise: the frame is returned without further conversion.
+
+    Args:
+        hdf: Object indexable by dataset name whose items expose ``shape``,
+            ``dtype`` and ``read_direct`` (presumably an open ``h5py.File``
+            or group -- confirm against callers).
+        dataset_name: Key of the dataset inside ``hdf``.
+
+    Returns:
+        A ``pandas.DataFrame`` holding the dataset contents.
+    """
+    dataset = hdf[dataset_name]
+    data = np.empty(dataset.shape, dtype=dataset.dtype)
+    dataset.read_direct(data)
+    df = pd.DataFrame(data)
+
+    if 'categorical' in dataset_name:
+        for column_name in df.columns:
+            column = df[column_name]
+            # Byte strings can only live in object columns. Touching numeric
+            # columns through the ``.str`` accessor would raise, so skip them.
+            if column.dtype != object:
+                continue
+            # Decode element-wise and leave non-bytes values (already-decoded
+            # text, missing values) as they are instead of turning them to NaN.
+            df[column_name] = column.map(
+                lambda value: value.decode('utf-8')
+                if isinstance(value, bytes) else value
+            )
+        if 'timestamps' in df.columns:
+            df['timestamps'] = pd.to_datetime(df['timestamps'], yearfirst=True)
+    elif 'numerical' in dataset_name:
+        df = df.apply(pd.to_numeric)
+
+    return df
+
+def extract_filename(dataset_name_path):
+    """Return the second-to-last '/'-separated component of a path.
+
+    A path that contains no '/' is returned unchanged: it has a single
+    component, and that component is what gets selected.
+    """
+    parts = dataset_name_path.split('/')
+    if len(parts) == 1:
+        return parts[0]
+    return parts[-2]
+
+def list_datasets(hdf5_filename_path):
+    """Return the names of all datasets stored in an HDF5 file.
+
+    The file is opened read-only and traversed with ``visititems``. Each
+    object that is an ``h5py.Dataset`` has its name collected, and a
+    progress line is printed for it.
+
+    Args:
+        hdf5_filename_path: Path of the HDF5 file to inspect.
+
+    Returns:
+        A list of dataset names, in the order ``visititems`` reported them.
+    """
+    list_of_datasets = []
+
+    def get_datasets(name, obj):
+        # Only datasets are collected; any other visited object is ignored.
+        # The callback returns None so the traversal is not cut short.
+        if isinstance(obj, h5py.Dataset):
+            list_of_datasets.append(name)
+            head, tail = os.path.split(name)
+            # Labels now match the values they print (they were swapped).
+            print(f'Adding dataset: head: {head} tail: {tail}')
+
+    with h5py.File(hdf5_filename_path, 'r') as file:
+        file.visititems(get_datasets)
+
+    return list_of_datasets