diff --git a/instruments/readers/g5505_text_reader.py b/instruments/readers/g5505_text_reader.py
index d127202..be9144d 100644
--- a/instruments/readers/g5505_text_reader.py
+++ b/instruments/readers/g5505_text_reader.py
@@ -178,7 +178,7 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
         #if numerical_variables:
         dataset = {}
         dataset['name'] = 'data_table'#_numerical_variables'
-        dataset['data'] = utils.dataframe_to_np_structured_array(df) #df_numerical_attrs.to_numpy()
+        dataset['data'] = utils.convert_dataframe_to_np_structured_array(df) #df_numerical_attrs.to_numpy()
         dataset['shape'] = dataset['data'].shape
         dataset['dtype'] = type(dataset['data'])
         #dataset['data_units'] = file_obj['wave']['data_units']
@@ -191,7 +191,7 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
         for column_name in df.columns:
             column_attr_dict = description_dict['table_header'].get(column_name,
                                                                     {'note':'there was no description available. Review instrument files.'})
-            dataset['attributes'].update({column_name: utils.parse_attribute(column_attr_dict)})
+            dataset['attributes'].update({column_name: utils.convert_attrdict_to_np_structured_array(column_attr_dict)})
 
         #try:
         #    dataset['attributes'] = description_dict['table_header'].copy()
diff --git a/src/hdf5_ops.py b/src/hdf5_ops.py
index a12fcb7..eff636d 100644
--- a/src/hdf5_ops.py
+++ b/src/hdf5_ops.py
@@ -10,7 +10,12 @@ import numpy as np
 import utils.g5505_utils as utils
 import logging
 import datetime
+
+import os
+import h5py
+
 import yaml
+import json
 
 class HDF5DataOpsManager():
     def __init__(self, file_path, mode = 'r+') -> None:
@@ -254,7 +259,7 @@ def construct_attributes_dict(attrs_obj):
 
     return attr_dict
 
-def print_metadata(name, obj, folder_depth, yaml_dict):
+def _print_metadata(name, obj, folder_depth, yaml_dict):
 
     # TODO: should we enable deeper folders ?
     if len(obj.name.split('/')) <= folder_depth:
@@ -298,29 +303,60 @@
     #elif len(obj.name.split('/')) == 3:
     #    print(yaml.dump())
 
-def to_yaml(input_filename_path,folder_depth: int = 4):
-
+def serialize_metadata(input_filename_path, folder_depth: int = 4, output_format: str = 'yaml') -> str:
+    """
+    Serialize metadata from an HDF5 file into YAML or JSON format.
+
+    Parameters
+    ----------
+    input_filename_path : str
+        The path to the input HDF5 file.
+    folder_depth : int, optional
+        The folder depth to control how much of the HDF5 file hierarchy is traversed (default is 4).
+    output_format : str, optional
+        The format to serialize the output, either 'yaml' or 'json' (default is 'yaml').
+
+    Returns
+    -------
+    str
+        The output file path where the serialized metadata is stored (either .yaml or .json).
+
+    """
+
+    # Validate the requested output format (YAML or JSON)
+    if output_format not in ['yaml', 'json']:
+        raise ValueError("Unsupported format. Please choose either 'yaml' or 'json'.")
+
+    # Initialize dictionary to store YAML/JSON data
     yaml_dict = {}
-
+
+    # Split input file path to get the output file's base name
     output_filename_tail, ext = os.path.splitext(input_filename_path)
-
-    with h5py.File(input_filename_path,'r') as f:
-
+
+    # Open the HDF5 file and extract metadata
+    with h5py.File(input_filename_path, 'r') as f:
+        # Construct attributes dictionary and top-level structure
         attrs_dict = construct_attributes_dict(f.attrs)
-        yaml_dict[f.name] = {"name": f.name, "attributes": attrs_dict, "datasets":{}}
-        f.visititems(lambda name, obj: print_metadata(name,obj,folder_depth,yaml_dict))
-
-        #with open(output_filename_tail+".json","w") as yaml_file:
-        #    json_obj = json.dumps(yaml_dict,indent=4,sort_keys=False,)
-        #    yaml_file.write(json_obj)
-
-    with open(output_filename_tail+".yaml","w") as yaml_file:
-        yaml_output = yaml.dump(yaml_dict,sort_keys=False)
-        #for key in yaml_dict:
-        #    yaml_output = yaml.dump(yaml_dict[key],sort_keys=False)
-        yaml_file.write(yaml_output )
-
-    return output_filename_tail+".yaml"
+        yaml_dict[f.name] = {
+            "name": f.name,
+            "attributes": attrs_dict,
+            "datasets": {}
+        }
+        # Traverse HDF5 file hierarchy and add datasets
+        f.visititems(lambda name, obj: _print_metadata(name, obj, folder_depth, yaml_dict))
+
+
+    # Serialize and write the data
+    output_file_path = output_filename_tail + '.' + output_format
+    with open(output_file_path, 'w') as output_file:
+        if output_format == 'json':
+            json_output = json.dumps(yaml_dict, indent=4, sort_keys=False)
+            output_file.write(json_output)
+        elif output_format == 'yaml':
+            yaml_output = yaml.dump(yaml_dict, sort_keys=False)
+            output_file.write(yaml_output)
+
+    return output_file_path
 
 
 def get_groups_at_a_level(file: h5py.File, level: str):
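
Usage sketch for the renamed serializer (illustrative, not part of the patch; 'experiment.h5' is a stand-in file name, and the import assumes src/ is on the Python path):

    from hdf5_ops import serialize_metadata

    # Default: walk up to folder_depth=4 hierarchy levels and write experiment.yaml
    yaml_path = serialize_metadata('experiment.h5')

    # Same metadata serialized as JSON; the output extension follows output_format
    json_path = serialize_metadata('experiment.h5', output_format='json')

    # Any other output_format raises ValueError before the HDF5 file is opened

Both calls return the path of the file they wrote, so callers of the old to_yaml() no longer need to assume a hard-coded .yaml suffix.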