Implemented a data extraction module that reads datasets from an HDF5 file and returns them as pandas DataFrames.
This commit is contained in:
@ -2,45 +2,46 @@ import h5py
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import src.hdf5_vis as hdf5_vis
|
||||
|
||||
|
||||
def read_dataset_as_dataframe(hdf, dataset_name):
    """Read the HDF5 dataset at *dataset_name* from the open handle *hdf*
    and return it as a pandas DataFrame.

    Parameters
    ----------
    hdf : h5py.File or h5py.Group
        Open HDF5 handle supporting item access by dataset path.
    dataset_name : str
        Path of the dataset inside the file. If the path contains
        'categorical', every column is decoded from UTF-8 byte strings and
        a 'timestamps' column (when present) is parsed to datetime; if it
        contains 'numerical', all columns are coerced to numeric dtypes.
        Any other path returns the raw data unchanged.

    Returns
    -------
    pandas.DataFrame
    """
    dataset = hdf[dataset_name]
    # Preallocate a buffer matching the on-disk layout and fill it in one read.
    data = np.empty(dataset.shape, dtype=dataset.dtype)
    dataset.read_direct(data)
    df = pd.DataFrame(data)

    if 'categorical' in dataset_name:
        # Assuming all entries are byte strings encoded with utf-8
        for column_name in df.columns:
            df[column_name] = df[column_name].str.decode('utf-8')
        # Assuming there's a 'timestamps' column that needs to be converted to datetime
        if 'timestamps' in df.columns:
            df['timestamps'] = pd.to_datetime(df['timestamps'], yearfirst=True)
    elif 'numerical' in dataset_name:
        df = df.apply(pd.to_numeric)

    # BUG FIX: the original function never returned, so callers always got
    # None regardless of the work done above.
    return df
||||
def read_dataset_from_hdf5file(hdf5_file_path, dataset_path):
    """Load one dataset from an HDF5 file into a pandas DataFrame.

    Opens the file at *hdf5_file_path* read-only, copies the dataset found
    at *dataset_path* into memory, and decodes every non-numeric column
    from UTF-8 byte strings into Python strings.

    Parameters
    ----------
    hdf5_file_path : str
        Filesystem path of the HDF5 file.
    dataset_path : str
        Path of the dataset inside the file.

    Returns
    -------
    pandas.DataFrame
    """
    with h5py.File(hdf5_file_path, 'r') as h5_file:
        source = h5_file[dataset_path]
        # Preallocate a buffer with the dataset's shape/dtype, fill it in
        # a single read, then wrap it in a DataFrame.
        buffer = np.empty(source.shape, dtype=source.dtype)
        source.read_direct(buffer)
        frame = pd.DataFrame(buffer)

        # Non-numeric columns arrive as raw bytes; decode them to text.
        text_columns = list(frame.select_dtypes(exclude='number'))
        for column in text_columns:
            frame[column] = frame[column].str.decode('utf-8')

        return frame
def read_metadata_from_hdf5obj(hdf5_file_path, obj_path):
    """Return the metadata (attributes) of the HDF5 object at *obj_path*
    in the file at *hdf5_file_path* as a DataFrame.

    TODO: Complete this function — attribute extraction is not yet
    implemented, so an empty DataFrame is returned for now.
    """
    # BUG FIX: the original wrote ``pd.DataFrame.empty()``, but ``empty`` is
    # a read-only property, so calling it raises TypeError. Construct an
    # actual empty DataFrame instead.
    metadata_df = pd.DataFrame()
    return metadata_df


def extract_filename(dataset_name_path):
    """Return the second-to-last component of a '/'-separated dataset path.

    For a path like 'instrument/file.h5/dataset' this yields the parent
    file name ('file.h5'). For a single-component path the index
    ``len(parts) - 2`` evaluates to -1, so the component itself is
    returned (behavior preserved from the original).
    """
    parts = dataset_name_path.split('/')
    return parts[len(parts) - 2]
||||
def list_datasets_in_hdf5file(hdf5_file_path):
    """Walk an HDF5 file and return a DataFrame describing every dataset.

    Parameters
    ----------
    hdf5_file_path : str
        Filesystem path of the HDF5 file to scan.

    Returns
    -------
    pandas.DataFrame
        One row per dataset with columns:
          - 'dataset_name': full path of the dataset inside the file
          - 'parent_instrument': third-from-last path component
          - 'parent_file': second-from-last path component

    NOTE(review): the parent_* columns assume every dataset path is at
    least three components deep; shallower paths raise IndexError in the
    ``split('/')`` lookups — confirm against the files this is run on.
    """

    def get_datasets(name, obj, list_of_datasets):
        # visititems callback: record only datasets, skipping groups.
        if isinstance(obj, h5py.Dataset):
            list_of_datasets.append(name)
            head, tail = os.path.split(name)
            # BUG FIX: the original message swapped the labels
            # ('tail: {head} head: {tail}'); label each part correctly.
            print(f'Adding dataset: head: {head} tail: {tail}')

    with h5py.File(hdf5_file_path, 'r') as file:
        list_of_datasets = []
        file.visititems(lambda name, obj: get_datasets(name, obj, list_of_datasets))

        dataset_df = pd.DataFrame({'dataset_name': list_of_datasets})
        # Derive the ancestry columns from the dataset path.
        dataset_df['parent_instrument'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-3])
        dataset_df['parent_file'] = dataset_df['dataset_name'].apply(lambda x: x.split('/')[-2])

    return dataset_df
|
Reference in New Issue
Block a user