From e7ed6145f0286ff0ecd72ece1228b5a67ee4fc21 Mon Sep 17 00:00:00 2001
From: Florez Ospina Juan Felipe
Date: Tue, 11 Jun 2024 10:38:04 +0200
Subject: [PATCH] Implemented a data extraction module to access data from an
 hdf5 file in the form of dataframes.

---
 src/hdf5_data_extraction.py | 55 +++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/src/hdf5_data_extraction.py b/src/hdf5_data_extraction.py
index 16d76fc..eef6d9e 100644
--- a/src/hdf5_data_extraction.py
+++ b/src/hdf5_data_extraction.py
@@ -2,45 +2,46 @@ import h5py
 import pandas as pd
 import numpy as np
 import os
+import src.hdf5_vis as hdf5_vis
 
-def read_dataset_as_dataframe(hdf, dataset_name):
-    dataset = hdf[dataset_name]
-    data = np.empty(dataset.shape, dtype=dataset.dtype)
-    dataset.read_direct(data)
-    df = pd.DataFrame(data)
-
-    if 'categorical' in dataset_name:
-        # Assuming all entries are byte strings encoded with utf-8
-        for column_name in df.columns:
-            df[column_name] = df[column_name].str.decode('utf-8')
-        # Assuming there's a 'timestamps' column that needs to be converted to datetime
-        if 'timestamps' in df.columns:
-            df['timestamps'] = pd.to_datetime(df['timestamps'],yearfirst=True)
-
-    elif 'numerical' in dataset_name:
-        df = df.apply(pd.to_numeric)
-
+def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):
+    # Open the HDF5 file
+    with h5py.File(hdf5_file_path, 'r') as hdf:
+        # Load the dataset
+        dataset = hdf[dataset_path]
+        data = np.empty(dataset.shape, dtype=dataset.dtype)
+        dataset.read_direct(data)
+        df = pd.DataFrame(data)
+
+        for col_name in df.select_dtypes(exclude='number'):
+            df[col_name] = df[col_name].str.decode('utf-8') #apply(lambda x: x.decode('utf-8') if isinstance(x,bytes) else x)
+    ## Extract metadata (attributes) and convert to a dictionary
+    #metadata = hdf5_vis.construct_attributes_dict(hdf[dataset_name].attrs)
+    ## Create a one-row DataFrame with the metadata
+    #metadata_df = pd.DataFrame.from_dict(data, orient='columns')
     return df
 
-def extract_filename(dataset_name_path):
+def read_metadata_from_hdf5obj(hdf5_file_path, obj_path):
+    # TODO: Complete this function
+    metadata_df = pd.DataFrame.empty()
+    return metadata_df
 
-    tmp = dataset_name_path.split('/')
-
-    return tmp[len(tmp)-2]
-
-def list_datasets(hdf5_filename_path):
+def list_datasets_in_hdf5file(hdf5_file_path):
 
     def get_datasets(name, obj, list_of_datasets):
         if isinstance(obj,h5py.Dataset):
             list_of_datasets.append(name)
-            head, tail = os.path.split(name)
-            print(f'Adding dataset: tail: {head} head: {tail}')
+            #print(f'Adding dataset: {name}') #tail: {head} head: {tail}')
 
-    with h5py.File(hdf5_filename_path,'r') as file:
+    with h5py.File(hdf5_file_path,'r') as file:
         list_of_datasets = []
         file.visititems(lambda name, obj: get_datasets(name, obj, list_of_datasets))
+        dataset_df = pd.DataFrame({'dataset_name':list_of_datasets})
 
-    return list_of_datasets
+    dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
+    dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])
+
+    return dataset_df
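
Example usage of the new module (a minimal sketch, not part of the patch: the
HDF5 file path below is a placeholder, and dataset paths are assumed to be
nested at least three levels deep, e.g. <instrument>/<file>/<dataset>, so that
the parent_instrument and parent_file columns can be derived):

    from src.hdf5_data_extraction import (
        list_datasets_in_hdf5file,
        read_dataset_from_hdf5file,
    )

    hdf5_file_path = 'output_files/example.h5'  # placeholder path, replace with a real file

    # Enumerate all datasets together with the group hierarchy they belong to
    dataset_df = list_datasets_in_hdf5file(hdf5_file_path)
    print(dataset_df[['parent_instrument', 'parent_file', 'dataset_name']].head())

    # Load the first dataset as a DataFrame; non-numeric (byte-string) columns
    # are decoded to UTF-8 inside read_dataset_from_hdf5file
    first_dataset = dataset_df['dataset_name'].iloc[0]
    df = read_dataset_from_hdf5file(hdf5_file_path, first_dataset)
    print(df.dtypes)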