In [1]:
import os
from nbutils import add_project_path_to_sys_path


# Add project root to sys.path
add_project_path_to_sys_path()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import the project-local modules used by the rest of the notebook. These imports rely on
# add_project_path_to_sys_path() having been called above so that `src` and `utils` resolve.
try:
    import src.hdf5_writer as hdf5_writer
    # hdf5_ops provides read_mtable_as_dataframe() and HDF5DataOpsManager used in later cells.
    import src.hdf5_ops as hdf5_ops
    # h5vis provides display_group_hierarchy_on_a_treemap() used in a later cell.
    import src.hdf5_vis as h5vis
    import src.napp_plotlib as napp

    # utils provides the dataframe augmentation / grouping helpers used in later cells.
    import utils.g5505_utils as utils
    #import pipelines.metadata_revision as metadata_revision
    print("Imports successful!")
except ImportError as e:
    # NOTE(review): the error is only printed, so execution continues and any module that
    # failed to import (and every import listed after it) stays undefined; later cells will
    # then fail with a NameError. Check this cell's output before running the rest.
    print(f"Import error: {e}")



Imports successful!


Read the input file specified by input_file_path (defined in the next cell) as a dataframe. 

Since we know this file was created from Thorsten's Matlab Table format, we can use hdf5_ops.read_mtable_as_dataframe() to read it.

Then, we rename the 'name' column to 'filename', as this is the column name used to identify files in subsequent functions.
We also augment the dataframe with a few categorical columns to be used as grouping variables when creating the HDF5 file's group hierarchy. 

In [2]:
# Define the input file and the output directory (paths are relative to the notebook's
# working directory).

input_file_path = '../input_files/BeamTimeMetaData.h5'
output_dir_path = '../output_files'
# exist_ok=True makes the directory creation idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir_path, exist_ok=True)

# Read BeamTimeMetaData.h5, containing Thorsten's Matlab Table
input_data_df = hdf5_ops.read_mtable_as_dataframe(input_file_path)

# Preprocess Thorsten's input_data dataframe so that it can be used to create a newer .h5 file
# under certain grouping specifications:
#   1. rename 'name' -> 'filename' (the column name the helper functions work with),
#   2. run the three augmentation helpers from utils; judging by the column listing shown
#      below, they contribute the 'filenumber', 'filetype' and 'data_quality' columns.
input_data_df = (
    input_data_df
    .rename(columns = {'name':'filename'})
    .pipe(utils.augment_with_filenumber)
    .pipe(utils.augment_with_filetype)
    .pipe(utils.split_sample_col_into_sample_and_data_quality_cols)
)
# Convert the modification timestamps to second-resolution datetimes so that min()/max()
# can later be used to derive the campaign's start and end dates.
input_data_df['lastModifiedDatestr'] = input_data_df['lastModifiedDatestr'].astype('datetime64[s]')

# Display the resulting column names as the cell output.
input_data_df.columns


Index(['scientaEkinStep_eV', 'scientaLensMode', 'scientaRegionIterations',
       'scientaSequenceIterations', 'sampleTemp_dC', 'cellPressure_mbar',
       'iceTemp_dC', 'smplX_mm', 'smplY_mm', 'smplZ_mm', 'filename',
       'sealingTemp', 'lastModifiedDatestr', 'sample', 'xRayEkinRange_eV_1',
       'xRayEkinRange_eV_2', 'scientaPassEnergy_eV', 'scientaDwellTime_ms',
       'regionName', 'scientaAcquisitionMode', 'scientaEkinRange_eV_1',
       'scientaEkinRange_eV_2', 'filenumber', 'filetype', 'data_quality'],
      dtype='object')

We now create an HDF5 file with a 2-level group hierarchy (sample / filenumber) based on input_data_df and the grouping variables listed in grouping_by_vars. Then
we visualize the group hierarchy of the created file as a treemap.

In [6]:
# Define grouping functions to be passed into create_hdf5_file function. These can also be set
# as strings referring to categorical columns in input_data_df.
# NOTE(review): the group_by_* objects are not used by the remaining code of this cell, which
# builds the hierarchy from `grouping_by_vars` instead; they are kept in case later cells use them.

test_grouping_funcs = True
if test_grouping_funcs:
    group_by_sample = lambda x : utils.group_by_df_column(x,'sample')
    group_by_type = lambda x : utils.group_by_df_column(x,'filetype')
    group_by_filenumber = lambda x : utils.group_by_df_column(x,'filenumber')
else:
    group_by_sample = 'sample'
    group_by_type = 'filetype'
    group_by_filenumber = 'filenumber'

# NOTE(review): these imports would be better placed in the notebook's first (imports) cell.
import pandas as pd
import h5py

output_filename = os.path.normpath(os.path.join(output_dir_path, 'test.h5'))


def groupnames(df, vars):
    """Build one absolute HDF5 group path per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the grouping columns.
    vars : list of str
        Column names whose values form the successive levels of the path.
        The values in these columns must be strings.

    Returns
    -------
    list of str
        Paths of the form '/<vars[0] value>/<vars[1] value>/...', in row order.
    """
    # Rows are walked positionally (not through label-based .loc lookups), so this also
    # works when the index holds duplicate labels and avoids one lookup per row.
    return ['/' + '/'.join(row) for row in df[vars].itertuples(index=False, name=None)]


grouping_by_vars = ['sample', 'filenumber']

# Apply the groupnames function to generate the 'groupnames' column
input_data_df['groupnames'] = pd.Series(groupnames(input_data_df, grouping_by_vars), index=input_data_df.index)

# Open the HDF5 file in write mode (an existing file with the same name is overwritten)
with h5py.File(output_filename, 'w') as file:
    # A single groupby pass replaces re-filtering the whole dataframe once per group;
    # sort=False keeps the groups in order of first appearance, as .unique() did.
    for groupname, group_rows in input_data_df.groupby('groupnames', sort=False):
        # Create the group in the HDF5 file (intermediate groups are created as needed)
        group = file.create_group(groupname)

        # Exclude the grouping variables from the datatable; they are already encoded in the path
        datatable = group_rows.drop(columns=grouping_by_vars + ['groupnames'])

        # NOTE(review): groups holding exactly one row get no 'data_table' dataset because of
        # the `> 1` threshold — confirm this is intended rather than `> 0`.
        if datatable.shape[0] > 1:
            # Create a dataset in the current group
            group.create_dataset(name = 'data_table', data = utils.convert_dataframe_to_np_structured_array(datatable))


# Root-level metadata; start and end dates are taken from the earliest and latest
# 'lastModifiedDatestr' values in the input table.
annotation_dict = {'Campaign name': 'SLS-Campaign-2023',
                    'Producers':'Thorsten, Luca, Zoe',
                    'Startdate': str(input_data_df['lastModifiedDatestr'].min()),
                    'Enddate': str(input_data_df['lastModifiedDatestr'].max())
                    }

dataOpsObj = hdf5_ops.HDF5DataOpsManager(output_filename)
dataOpsObj.load_file_obj()
try:
    # Annotate root folder with annotation_dict
    dataOpsObj.append_metadata('/',annotation_dict)
finally:
    # Always release the file object, even if annotating fails, so the file is not left
    # loaded for subsequent cells.
    dataOpsObj.unload_file_obj()


# Visualize the group hierarchy of the file that was just written
h5vis.display_group_hierarchy_on_a_treemap(output_filename)


/MgO crystal
/MgO crystal/0002002
/MgO crystal/0003003
/MgO crystal/0004004
/MgO crystal/0005005
/MgO crystal/0006006
/MgO crystal/0007007
/MgO crystal/0008008
/MgO crystal/0009009
/MgO crystal/0010010
/MgO crystal/0011011
/MgO crystal/0012012
/MgO crystal/0013013
/MgO crystal/0014014
/MgO crystal/0015015
/MgO crystal/0016016
/MgO crystal/0017017
/MgO crystal/0018018
/MgO crystal/0019019
/MgO crystal/0020020
/MgO crystal/0021021
/MgO crystal/0022022
/MgO crystal/0023023
/MgO crystal,H2O
/MgO crystal,H2O/0034034
/MgO crystal,H2O/0035035
/MgO powder,H2O
/MgO powder,H2O/0036036
/MgO powder,H2O/0037037
/MgO powder,H2O/0038038
/MgO powder,H2O/0039039
/MgO powder,H2O/0040040
/MgO powder,H2O/0056056
/MgO powder,H2O/0057057
/MgO powder,H2O/0058058
/MgO powder,H2O/0059059
/MgO powder,H2O/0060060
/MgO powder,H2O/0061061
/MgO powder,H2O/0062062
/MgO powder,H2O/0063063
/MgO powder,H2O/0064064
/MgO powder,H2O/0065065
/MgO powder,H2O/0066066
/MgO powder,H2O/0067067
/MgO powder,H2O/0068068
/MgO powde