Changed variable names, rearranged pieces of code, and set up data checks.
@@ -1,5 +1,5 @@
 import numpy as np
-import pandas as np
+import pandas as pd
 import matplotlib.pyplot as plt
 import plotly.express as px
 import plotly.graph_objects as go
@@ -26,12 +26,9 @@ def read_xps_ibw_file_as_dict(filename):
 
     file_dict = {}
     path_tail, path_head = os.path.split(filename)
-    file_dict['name'] = path_head
-    file_dict['data'] = file_obj['wave']['wData']
-    file_dict['data_units'] = file_obj['wave']['data_units']
-    file_dict['shape'] = file_dict['data'].shape
-    file_dict['dtype'] = type(file_dict['data'])
 
+    # Group name and attributes
+    file_dict['name'] = path_head
     file_dict['attributes_dict'] = {}
 
     # Convert notes of bytes class to string class and split string into a list of elements separated by '\r'.
@@ -48,6 +45,21 @@ def read_xps_ibw_file_as_dict(filename):
     dimension_labels = file_obj['wave']['dimension_units'].decode("utf-8").split(']')
     file_dict['attributes_dict']['dimension_units'] = [item+']' for item in dimension_labels[0:len(dimension_labels)-1]]
 
+    # Datasets and their attributes
+
+    file_dict['datasets'] = []
+
+    dataset = {}
+    dataset['name'] = 'spectrum'
+    dataset['data'] = file_obj['wave']['wData']
+    dataset['data_units'] = file_obj['wave']['data_units']
+    dataset['shape'] = dataset['data'].shape
+    dataset['dtype'] = type(dataset['data'])
+
+    # TODO: include energy axis dataset
+
+    file_dict['datasets'].append(dataset)
+
 
     return file_dict
 
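For orientation: after this hunk, read_xps_ibw_file_as_dict returns group-level metadata plus a list of dataset dicts instead of top-level 'data' keys. A minimal sketch of how a caller might persist that structure with h5py (the filenames and the loader behind file_obj are assumptions, not shown in this commit):

    import h5py

    file_dict = read_xps_ibw_file_as_dict('example.ibw')  # hypothetical input file
    with h5py.File('output.h5', 'w') as h5_file:
        group = h5_file.create_group(file_dict['name'])
        # dimension_units and the parsed notes become group attributes
        group.attrs.update(file_dict['attributes_dict'])
        for dataset in file_dict['datasets']:
            group.create_dataset(dataset['name'], data=dataset['data'])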
@@ -63,13 +75,152 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
     if not os.path.exists(tmp_dirpath):
         os.mkdir(tmp_dirpath)
 
-    shutil.copy(source_file_path, os.path.join(tmp_dirpath,backup_filename))
+    tmp_file_path = os.path.join(tmp_dirpath,backup_filename)
+    shutil.copy(source_file_path, tmp_file_path)
     # Open backup h5 file and copy complet filesystem directory onto a group in h5file
     with h5py.File(os.path.join(tmp_dirpath,backup_filename),'r') as src_file:
-        dest_file_obj.copy(source= src_file['/'], dest= dest_group_name +'/'+filename)
+        dest_file_obj.copy(source= src_file['/'], dest= dest_group_name)
 
+    if 'tmp_files' in tmp_file_path:
+        os.remove(tmp_file_path)
+
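The new guard only deletes the temporary copy when its path contains 'tmp_files', and the copy leaks if h5py raises before the cleanup runs. A sketch of the same copy-then-mount pattern using the standard tempfile module, which cleans up unconditionally (an alternative, not what the commit does):

    import os
    import shutil
    import tempfile

    import h5py

    def copy_file_in_group_via_tempdir(source_file_path, dest_file_obj, dest_group_name):
        # The directory and the backup copy inside it are removed automatically
        # when the with-block exits, even if an exception is raised.
        with tempfile.TemporaryDirectory() as tmp_dirpath:
            tmp_file_path = os.path.join(tmp_dirpath, os.path.basename(source_file_path))
            shutil.copy(source_file_path, tmp_file_path)
            with h5py.File(tmp_file_path, 'r') as src_file:
                dest_file_obj.copy(source=src_file['/'], dest=dest_group_name)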
+def read_txt_files_as_dict(filename : str ):
+
+    #if instrument_folder == 'smps':
+    # Infer from filename whether txt file comes from smps or gas folder
+    #TODO: this may be prone to error if assumed folder structure is non compliant
+    if 'RGA' in filename:
+        #end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
+        table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
+        separator = None
+    elif 'Pressure' in filename:
+        table_header = 'Date Time Vapore-Pressure 1 in Vapore-Pressure 2 in Baratron 1 in Baratron 2 in Baratron 3 in Baratron 4 in Temp. Ice-Sample in Temp. Heated-Sample in Temp. Cooler 1 in Temp. Cooler 2 in Flow Gas 1 in Pressure Chamber in X in Y in Z in None in Temp. Sealing in Flow Ice-Sample in'
+        separator = '\t'
+    #elif 'gas' in filename:
+    #    end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+    else:
+        return {}
+        #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
+
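Since both branches only bind table_header and separator, the filename dispatch could be table-driven, making the next instrument a one-entry change. A hypothetical refactor, not part of the commit:

    # Keyword -> (table_header, separator); values mirror the branches above.
    TABLE_FORMATS = {
        'RGA': ('Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8', None),
        'Pressure': ('Date Time Vapore-Pressure 1 in ...', '\t'),  # full header elided here
    }

    def detect_table_format(filename):
        for keyword, (table_header, separator) in TABLE_FORMATS.items():
            if keyword in filename:
                return table_header, separator
        return None  # caller returns {} for unrecognized files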
+    # Read header as a dictionary and detect where data table starts
+    header_dict = {}
+    data_start = False
+
+    with open(filename,'r') as f:
+        file_encoding = f.encoding
+        for line_number, line in enumerate(f):
+            list_of_substrings = line.split(separator)
+            if table_header in line:
+                data_start = True
+                column_names = []
+                #for i, name in enumerate(line.split('\t')):
+                for i, name in enumerate(list_of_substrings):
+                    column_names.append(str(i)+'_'+name)
+
+                print(line_number, len(column_names ))
+
+                break
+            else:
+                # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
+                if list_of_substrings:
+                    key, value = list_of_substrings[0], list_of_substrings[1::]
+                    header_dict[key] = value
+
+                #if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
+                #    line_numbers.append(line_number)
+                #break
+
+    if not data_start:
+        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
+
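A note on the separator = None case: str.split(None) splits on runs of arbitrary whitespace and returns an empty list for blank lines, which is what makes the new "if list_of_substrings:" guard work; split('\t') never yields an empty list. For example:

    'a  b\tc\n'.split(None)   # ['a', 'b', 'c']   whitespace runs collapsed
    'a  b\tc\n'.split('\t')   # ['a  b', 'c\n']
    '\n'.split(None)          # []                falsy: blank header lines are skipped
    '\n'.split('\t')          # ['\n']            truthy: would create a bogus header key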
+    #if len(end_of_header) > 0:
+
+    # TODO: it does not work with separater as none :(. fix for RGA
+    try:
+        df = pd.read_csv(filename,
+                         delimiter = separator,
+                         header=line_number,
+                         #encoding='latin-1',
+                         encoding = file_encoding,
+                         names=column_names,
+                         skip_blank_lines=True)
+
+        df_numerical_attrs = df.select_dtypes(include ='number')
+        df_categorical_attrs = df.select_dtypes(exclude='number')
+        numerical_variables = [item for item in df_numerical_attrs.columns]
+
+        # TODO:
+        if 'Pressure' in filename:
+            df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
+            df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
+
+        categorical_variables = [item for item in df_categorical_attrs.columns]
+        ####
+        #elif 'RGA' in filename:
+        #    df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})
+
+        ###
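On the TODO above: pandas interprets sep/delimiter None as "sniff the delimiter with the python engine", not as whitespace splitting the way str.split(None) does, which is why the RGA case misbehaves. One plausible fix, assuming RGA tables are whitespace-delimited (an assumption, not verified in this commit):

    # Hypothetical RGA fix: r'\s+' matches any run of whitespace, mimicking
    # str.split(None); pandas selects the python engine for regex separators.
    df = pd.read_csv(filename,
                     sep=r'\s+',
                     header=line_number,
                     encoding=file_encoding,
                     names=column_names,
                     skip_blank_lines=True)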
+        file_dict = {}
+        path_tail, path_head = os.path.split(filename)
+
+        file_dict['name'] = path_head
+        # TODO: review this header dictionary, it may not be the best way to represent header data
+        file_dict['attributes_dict'] = header_dict
+        file_dict['datasets'] = []
+        ####
+
+        if numerical_variables:
+            dataset = {}
+            dataset['name'] = 'numerical_variables'
+            dataset['data'] = df_numerical_attrs.to_numpy()
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            #dataset['data_units'] = file_obj['wave']['data_units']
+            file_dict['datasets'].append(dataset)
+            rows,cols = dataset['shape']
+
+            dataset = {}
+            numerical_variables= [item.encode("utf-8") for item in numerical_variables]
+            dataset['name'] = 'numerical_variable_names'
+            dataset['data'] = np.array(numerical_variables).reshape((1,cols))
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+
+        if 'timestamps' in categorical_variables:
+            dataset = {}
+            dataset['name'] = 'timestamps'
+            dataset['data'] = df_categorical_attrs['timestamps'].to_numpy().reshape((rows,1))
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+            categorical_variables.remove('timestamps')
+
+        if categorical_variables:
+            dataset = {}
+            dataset['name'] = 'categorical_variables'
+            dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy()
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+
+            dataset = {}
+            categorical_variables = [item.encode("utf-8") for item in categorical_variables]
+            dataset['name'] = 'categorial_variable_names'
+            dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
+            dataset['shape'] = dataset['data'].shape
+            dataset['dtype'] = type(dataset['data'])
+            file_dict['datasets'].append(dataset)
+
+    except:
+        return {}
+
+    return file_dict
+
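The .encode("utf-8") passes before building the *_variable_names arrays exist because NumPy unicode ('U') arrays are not directly writable by h5py, while byte-string ('S') arrays map to fixed-length HDF5 strings. A quick illustration (assuming h5py is the eventual sink, as in copy_file_in_group):

    import numpy as np

    names = np.array([b'0_Date', b'1_Time']).reshape((1, 2))
    print(names.dtype)  # |S6: fixed-length bytes, accepted by h5py as-is
    # np.array(['0_Date', '1_Time']) would have dtype '<U6', which h5py
    # rejects unless it is converted to a string dtype first.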
 def main():
 
@@ -1,16 +1,21 @@
 
 import pandas as pd
+import numpy as np
 import matplotlib.pyplot as plt
 import os
 
-def read_txt_files_as_dict(filename : str ,instrument_folder : str):
-    if instrument_folder == 'smps':
+#def read_txt_files_as_dict(filename : str ,instrument_folder : str):
+def read_txt_files_as_dict(filename : str ):
+
+    #if instrument_folder == 'smps':
+    # Infer from filename whether txt file comes from smps or gas folder
+    #TODO: this may be prone to error if assumed folder structure is non compliant
+    if 'smps' in filename:
         end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
-    elif instrument_folder == 'gas':
+        separator = '\t'
+    elif 'gas' in filename:
         end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+        separator = '\t'
     else:
         raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
 
@@ -18,53 +23,101 @@ def read_txt_files_as_dict(filename : str ,instrument_folder : str):
     header_dict = {}
     data_start = False
     with open(filename,'r') as f:
+        file_encoding = f.encoding
         for line_number, line in enumerate(f):
-            list = line.split('\t')
+            list_of_substrings = line.split(separator)
             if end_of_header in line:
                 data_start = True
                 column_names = []
-                for i, name in enumerate(line.split('\t')):
+                for i, name in enumerate(list_of_substrings):
                     column_names.append(str(i)+'_'+name)
 
                 print(line_number, len(column_names ))
                 break
 
             # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
-            key, value = list[0], list[1::]
-            header_dict[key] = value
+            if list_of_substrings:
+                key, value = list_of_substrings[0], list_of_substrings[1::]
+                header_dict[key] = value
 
     if not data_start:
-        raise ValueError('file appears to be invalid. Data start condition in txt file was not met.')
+        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
 
     df = pd.read_csv(filename,
-                     delimiter = "\t",
+                     delimiter = separator,
                      header=line_number,
                      #encoding='latin-1',
-                     encoding='latin-1',
+                     encoding= file_encoding,
                      names=column_names,
                      skip_blank_lines=True)
 
     df_numerical_attrs = df.select_dtypes(include ='number')
     df_categorical_attrs = df.select_dtypes(exclude='number')
 
-    if instrument_folder == 'smps':
-        df_categorical_attrs['1_Timestamp'] = [ df_categorical_attrs.loc[i,'1_Date']+' '+df_categorical_attrs.loc[i,'2_Start Time'] for i in df.index]
+    if 'smps' in filename:
+        df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'1_Date']+' '+df_categorical_attrs.loc[i,'2_Start Time'] for i in df.index]
         df_categorical_attrs = df_categorical_attrs.drop(columns=['1_Date','2_Start Time'])
-    elif instrument_folder == 'gas':
-        df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Date_Time' : '0_Timestamp'})
+    elif 'gas' in filename:
+        df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Date_Time' : 'timestamps'})
 
-    data_column_names = [item.encode("utf-8") for item in df_numerical_attrs.columns]
-    output_dict = { 'header_dict':header_dict,
-                    'data': df_numerical_attrs.to_numpy(),
-                    'data_column_names':data_column_names,
-                    'categ_data_dict':df_categorical_attrs.to_dict(orient='list')
-                  }
+    #data_column_names = [item.encode("utf-8") for item in df_numerical_attrs.columns]
+    numerical_variables = [item for item in df_numerical_attrs.columns]
+    categorical_variables = [item for item in df_categorical_attrs.columns]
 
-    #output_dict = {'header_dict':header_dict,
-    # 'num_data_df':df_numerical_attrs.to_numpy(),
-    # 'categ_data_df':df_categorical_attrs.to_dict(orient='list')}
+    ###
+    file_dict = {}
+    path_tail, path_head = os.path.split(filename)
 
-    return output_dict
+    file_dict['name'] = path_head
+    # TODO: review this header dictionary, it may not be the best way to represent header data
+    file_dict['attributes_dict'] = header_dict
+    file_dict['datasets'] = []
+    ####
+
+    if numerical_variables:
+        dataset = {}
+        dataset['name'] = 'numerical_variables'
+        dataset['data'] = df_numerical_attrs.to_numpy()
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        #dataset['data_units'] = file_obj['wave']['data_units']
+        file_dict['datasets'].append(dataset)
+        rows,cols = dataset['shape']
+
+        dataset = {}
+        numerical_variables= [item.encode("utf-8") for item in numerical_variables]
+        dataset['name'] = 'numerical_variable_names'
+        dataset['data'] = np.array(numerical_variables).reshape((1,cols))
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+
+    if 'timestamps' in categorical_variables:
+        dataset = {}
+        dataset['name'] = 'timestamps'
+        dataset['data'] = df_categorical_attrs['timestamps'].to_numpy().reshape((rows,1))
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+        categorical_variables.remove('timestamps')
+
+    if categorical_variables:
+        dataset = {}
+        dataset['name'] = 'categorical_variables'
+        dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy()
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+
+        dataset = {}
+        categorical_variables = [item.encode("utf-8") for item in categorical_variables]
+        dataset['name'] = 'categorial_variable_names'
+        dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
+        dataset['shape'] = dataset['data'].shape
+        dataset['dtype'] = type(dataset['data'])
+        file_dict['datasets'].append(dataset)
+
+    return file_dict
 
 def main():
 
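Both readers now return the same file_dict shape (name, attributes_dict, datasets), so downstream code can consume txt and ibw files uniformly. A usage sketch (the path is hypothetical, and the dataset names depend on which columns the detected table header yields):

    file_dict = read_txt_files_as_dict('smps/example.txt')  # hypothetical path
    if file_dict:  # the RGA/Pressure variant returns {} for unrecognized files
        print(file_dict['name'])
        for dataset in file_dict['datasets']:
            # e.g. 'numerical_variables', 'numerical_variable_names',
            # 'timestamps', 'categorical_variables', ...
            print(dataset['name'], dataset['shape'])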