Implemented functions for data extraction from hdf5 files.
This commit is contained in:
46
src/hdf5_data_extraction.py
Normal file
46
src/hdf5_data_extraction.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import h5py
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def read_dataset_as_dataframe(hdf, dataset_name):
    """Read one dataset into a pandas DataFrame and normalise its columns.

    The dataset is copied into a freshly allocated NumPy array via
    ``read_direct`` and wrapped in a DataFrame. Post-processing is chosen
    from the dataset *name*:

    * name contains ``'categorical'``: bytes values are decoded as UTF-8,
      and a ``'timestamps'`` column, when present, is parsed with
      ``pd.to_datetime(..., yearfirst=True)``.
    * name contains ``'numerical'``: every column goes through
      ``pd.to_numeric``.
    * otherwise the DataFrame is returned as read.

    Args:
        hdf: Object indexable by ``dataset_name`` whose items expose
            ``shape``, ``dtype`` and ``read_direct`` (presumably an open
            ``h5py.File`` or ``h5py.Group`` -- confirm against callers).
        dataset_name: Key of the dataset inside ``hdf``.

    Returns:
        pandas.DataFrame holding the dataset contents.
    """
    dataset = hdf[dataset_name]
    data = np.empty(dataset.shape, dtype=dataset.dtype)
    dataset.read_direct(data)
    df = pd.DataFrame(data)

    if 'categorical' in dataset_name:
        for column_name in df.columns:
            column = df[column_name]
            # Only object columns can hold bytes. Using the .str accessor on
            # a numeric field raises AttributeError, so such columns are
            # skipped instead of aborting the whole read.
            if not pd.api.types.is_object_dtype(column):
                continue
            # Decode per value so that non-bytes entries are kept unchanged.
            df[column_name] = column.map(
                lambda value: value.decode('utf-8')
                if isinstance(value, bytes) else value
            )
        # A 'timestamps' column, when present, is converted to datetime.
        if 'timestamps' in df.columns:
            df['timestamps'] = pd.to_datetime(df['timestamps'], yearfirst=True)

    elif 'numerical' in dataset_name:
        df = df.apply(pd.to_numeric)

    return df
|
||||||
|
|
||||||
|
def extract_filename(dataset_name_path):
    """Return the second-to-last '/'-separated component of a path.

    For ``'a/b/c'`` this is ``'b'``. When the path contains no ``'/'`` there
    is only one component, and that component itself is returned.

    Args:
        dataset_name_path: '/'-separated path string.

    Returns:
        str: the component immediately before the last one.
    """
    segments = dataset_name_path.split('/')

    # str.split always yields at least one element; with exactly one there is
    # no parent component, so the lone segment is handed back.
    if len(segments) == 1:
        return segments[0]

    return segments[-2]
|
||||||
|
|
||||||
|
def list_datasets(hdf5_filename_path):
    """Collect the names of all datasets stored in an HDF5 file.

    The file is opened read-only and walked with ``visititems``; every object
    that is an ``h5py.Dataset`` has its name recorded, and a progress line is
    printed for it.

    Args:
        hdf5_filename_path: Path of the HDF5 file to inspect.

    Returns:
        list: dataset names in the order ``visititems`` reported them.
    """

    def get_datasets(name, obj, list_of_datasets):
        # Called for each visited object; anything that is not a dataset is
        # ignored. Returns None implicitly so the traversal keeps going.
        if isinstance(obj, h5py.Dataset):
            list_of_datasets.append(name)
            head, tail = os.path.split(name)
            # Labels now match the values they print (they were swapped).
            print(f'Adding dataset: head: {head} tail: {tail}')

    list_of_datasets = []
    with h5py.File(hdf5_filename_path, 'r') as file:
        file.visititems(lambda name, obj: get_datasets(name, obj, list_of_datasets))

    return list_of_datasets
|
Reference in New Issue
Block a user