Source code for src.hdf5_data_extraction

import h5py
import pandas as pd
import numpy as np
import os
import src.hdf5_vis as hdf5_vis


[docs]
def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):
    # Open the HDF5 file and copy the dataset into memory
    with h5py.File(hdf5_file_path, 'r') as hdf:
        # Load the dataset
        dataset = hdf[dataset_path]
        data = np.empty(dataset.shape, dtype=dataset.dtype)
        dataset.read_direct(data)

    df = pd.DataFrame(data)

    # Decode byte strings in non-numeric columns to UTF-8
    for col_name in df.select_dtypes(exclude='number'):
        df[col_name] = df[col_name].str.decode('utf-8')
        # Alternative: df[col_name].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

    ## Extract metadata (attributes) and convert to a dictionary
    #metadata = hdf5_vis.construct_attributes_dict(hdf[dataset_path].attrs)
    ## Create a one-row DataFrame with the metadata
    #metadata_df = pd.DataFrame.from_dict(metadata, orient='columns')

    return df
[docs]
def read_metadata_from_hdf5obj(hdf5_file_path, obj_path):
    # TODO: Complete this function
    # Placeholder: return an empty DataFrame until metadata extraction is implemented
    metadata_df = pd.DataFrame()
    return metadata_df
[docs]
def list_datasets_in_hdf5file(hdf5_file_path):
    def get_datasets(name, obj, list_of_datasets):
        # Collect the full path of every dataset visited
        if isinstance(obj, h5py.Dataset):
            list_of_datasets.append(name)
            #print(f'Adding dataset: {name}')

    with h5py.File(hdf5_file_path, 'r') as file:
        list_of_datasets = []
        file.visititems(lambda name, obj: get_datasets(name, obj, list_of_datasets))

    dataset_df = pd.DataFrame({'dataset_name': list_of_datasets})

    # Derive parent group names from the dataset path; assumes paths of the form
    # .../<instrument>/<file>/<dataset>
    dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
    dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])

    return dataset_df
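A minimal usage sketch follows, assuming the module is run directly against an existing HDF5 file; the file path 'experiment.h5' is a hypothetical placeholder, not part of the original module.

if __name__ == '__main__':
    # Hypothetical example file path; replace with a real HDF5 file.
    example_file = 'experiment.h5'

    # List every dataset in the file together with its inferred parent group names.
    datasets = list_datasets_in_hdf5file(example_file)
    print(datasets.head())

    # Load the first listed dataset into a DataFrame.
    if not datasets.empty:
        df = read_dataset_from_hdf5file(example_file, datasets.loc[0, 'dataset_name'])
        print(df.head())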