Renamed to_yaml() to serialize_metadata() and introduced the input parameter output_format, which selects either YAML or JSON output.

This commit is contained in:
2024-09-26 16:23:09 +02:00
parent 85b0e5ab74
commit 96dad0bfb1
2 changed files with 59 additions and 23 deletions

View File

@ -178,7 +178,7 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
#if numerical_variables:
dataset = {}
dataset['name'] = 'data_table'#_numerical_variables'
dataset['data'] = utils.dataframe_to_np_structured_array(df) #df_numerical_attrs.to_numpy()
dataset['data'] = utils.convert_dataframe_to_np_structured_array(df) #df_numerical_attrs.to_numpy()
dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data'])
#dataset['data_units'] = file_obj['wave']['data_units']
@ -191,7 +191,7 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
for column_name in df.columns:
column_attr_dict = description_dict['table_header'].get(column_name,
{'note':'there was no description available. Review instrument files.'})
dataset['attributes'].update({column_name: utils.parse_attribute(column_attr_dict)})
dataset['attributes'].update({column_name: utils.convert_attrdict_to_np_structured_array(column_attr_dict)})
#try:
# dataset['attributes'] = description_dict['table_header'].copy()

View File

@ -10,7 +10,12 @@ import numpy as np
import utils.g5505_utils as utils
import logging
import datetime
import os
import h5py
import yaml
import json
class HDF5DataOpsManager():
def __init__(self, file_path, mode = 'r+') -> None:
@ -254,7 +259,7 @@ def construct_attributes_dict(attrs_obj):
return attr_dict
def print_metadata(name, obj, folder_depth, yaml_dict):
def __print_metadata__(name, obj, folder_depth, yaml_dict):
# TODO: should we enable deeper folders ?
if len(obj.name.split('/')) <= folder_depth:
@ -298,29 +303,60 @@ def print_metadata(name, obj, folder_depth, yaml_dict):
#elif len(obj.name.split('/')) == 3:
# print(yaml.dump())
def to_yaml(input_filename_path,folder_depth: int = 4):
def serialize_metadata(input_filename_path, folder_depth: int = 4, output_format: str = 'yaml') -> str:
"""
Serialize metadata from an HDF5 file into YAML or JSON format.
Parameters
----------
input_filename_path : str
The path to the input HDF5 file.
folder_depth : int, optional
The folder depth to control how much of the HDF5 file hierarchy is traversed (default is 4).
output_format : str, optional
The format to serialize the output, either 'yaml' or 'json' (default is 'yaml').
Returns
-------
str
The output file path where the serialized metadata is stored (either .yaml or .json).
"""
# Choose the appropriate output format (YAML or JSON)
if output_format not in ['yaml', 'json']:
raise ValueError("Unsupported format. Please choose either 'yaml' or 'json'.")
# Initialize dictionary to store YAML/JSON data
yaml_dict = {}
# Split input file path to get the output file's base name
output_filename_tail, ext = os.path.splitext(input_filename_path)
with h5py.File(input_filename_path,'r') as f:
# Open the HDF5 file and extract metadata
with h5py.File(input_filename_path, 'r') as f:
# Construct attributes dictionary and top-level structure
attrs_dict = construct_attributes_dict(f.attrs)
yaml_dict[f.name] = {"name": f.name, "attributes": attrs_dict, "datasets":{}}
f.visititems(lambda name, obj: print_metadata(name,obj,folder_depth,yaml_dict))
#with open(output_filename_tail+".json","w") as yaml_file:
# json_obj = json.dumps(yaml_dict,indent=4,sort_keys=False,)
# yaml_file.write(json_obj)
with open(output_filename_tail+".yaml","w") as yaml_file:
yaml_output = yaml.dump(yaml_dict,sort_keys=False)
#for key in yaml_dict:
# yaml_output = yaml.dump(yaml_dict[key],sort_keys=False)
yaml_file.write(yaml_output )
return output_filename_tail+".yaml"
yaml_dict[f.name] = {
"name": f.name,
"attributes": attrs_dict,
"datasets": {}
}
# Traverse HDF5 file hierarchy and add datasets
f.visititems(lambda name, obj: __print_metadata__(name, obj, folder_depth, yaml_dict))
# Serialize and write the data
output_file_path = output_filename_tail + '.' + output_format
with open(output_file_path, 'w') as output_file:
if output_format == 'json':
json_output = json.dumps(yaml_dict, indent=4, sort_keys=False)
output_file.write(json_output)
elif output_format == 'yaml':
yaml_output = yaml.dump(yaml_dict, sort_keys=False)
output_file.write(yaml_output)
return output_file_path
def get_groups_at_a_level(file: h5py.File, level: str):