Replaced the attributes previously extracted from the table preamble in .txt and .dat files with a single dataset attribute, called table_preamble, that contains the whole table preamble.

2024-03-19 11:40:35 +01:00
parent b886066133
commit 7fe254755f
2 changed files with 57 additions and 35 deletions
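The core of the change is in read_txt_files_as_dict: instead of splitting each preamble line into key/value attributes, every non-empty line before the table header is now concatenated into a single table_preamble string and stored under header_dict["table_preamble"]. Below is a minimal, self-contained sketch of that logic; the file name 'example.dat' and the table_header value are placeholders for illustration, not files or strings from this repository.

# Minimal sketch of the new preamble handling (placeholder file name and header string)
table_header = 'Time(s) Channel#1 Channel#2'
separator = '\t'

header_dict = {}
data_start = False
table_preamble = ""

with open('example.dat', 'r') as f:
    for line_number, line in enumerate(f):
        list_of_substrings = line.split(separator)
        # Accumulate every non-empty line until the table header is found
        if not line == '\n':
            table_preamble += line.strip()
        if table_header in line:
            data_start = True
            column_names = [str(i) + '_' + name for i, name in enumerate(list_of_substrings)]
            break

# The whole preamble is stored as one attribute instead of one entry per preamble key
header_dict["table_preamble"] = table_preamble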

View File

@@ -63,6 +63,22 @@ def read_xps_ibw_file_as_dict(filename):
    return file_dict

+def make_file_copy(source_file_path):
+
+    pathtail, filename = os.path.split(source_file_path)
+    backup_filename = 'backup_'+ filename
+    # Path
+    ROOT_DIR = os.path.abspath(os.curdir)
+    tmp_dirpath = os.path.join(ROOT_DIR,'tmp_files')
+    if not os.path.exists(tmp_dirpath):
+        os.mkdir(tmp_dirpath)
+    tmp_file_path = os.path.join(tmp_dirpath,backup_filename)
+    shutil.copy(source_file_path, tmp_file_path)
+
+    return tmp_file_path
+
def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name):

    # Create copy of original file to avoid possible file corruption and work with it.
@@ -87,10 +103,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
def read_txt_files_as_dict(filename : str ):

    #if instrument_folder == 'smps':
    # Infer from filename whether txt file comes from smps or gas folder
    #TODO: this may be prone to error if assumed folder structure is non compliant
-    if 'RGA' in filename:
+    if 'RGA' in filename: #TODO: it does not work with separator as none :(. fix for RGA
        #end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status',
        table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8'
        separator = None
@@ -99,6 +114,15 @@ def read_txt_files_as_dict(filename : str ):
        separator = '\t'
    #elif 'gas' in filename:
    # end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+    elif 'Humidity_Sensors' in filename:
+        table_header = 'Date Time RH1[%] RH2[%] RH3[%] RH4[%] RH5[%] RH6[%] RH7[%] RH8[%] T1[°C] T2[°C] T3[°C] T4[°C] T5[°C] T6[°C] T7[°C] T8[°C] DP1[°C] DP2[°C] DP3[°C] DP4[°C] DP5[°C] DP6[°C] DP7[°C] DP8[°C]'
+        separator = '\t'
+    elif 'ICAD' in filename and 'HONO' in filename:
+        table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
+        separator = '\t'
+    elif 'ICAD' in filename and 'NO2' in filename:
+        table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource'
+        separator = '\t'
    else:
        return {}
        #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"')
@@ -106,43 +130,31 @@ def read_txt_files_as_dict(filename : str ):
    # Read header as a dictionary and detect where data table starts
    header_dict = {}
    data_start = False
+    # Work with copy of the file for safety
+    tmp_filename = make_file_copy(filename)
    with open(filename,'r') as f:
        file_encoding = f.encoding
+        table_preamble = ""
        for line_number, line in enumerate(f):
            list_of_substrings = line.split(separator)
-            if table_header in line:
+            if not (line == '\n'):
+                table_preamble += line.strip() #+ "\n"
+            if table_header in line:
                data_start = True
                column_names = []
-                #for i, name in enumerate(line.split('\t')):
                for i, name in enumerate(list_of_substrings):
                    column_names.append(str(i)+'_'+name)
                print(line_number, len(column_names ))
                break
-            else:
-                # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
-                if list_of_substrings:
-                    key, value = list_of_substrings[0], list_of_substrings[1::]
-                    header_dict[key] = value
-        #if len(end_of_header) > 1 and any([item in line for item in end_of_header]):
-        #    line_numbers.append(line_number)
-        #break
+        header_dict["table_preamble"] = table_preamble

+    if not data_start:
+        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')

-    #if len(end_of_header) > 0:
    # TODO: it does not work with separater as none :(. fix for RGA
    try:
-        df = pd.read_csv(filename,
+        df = pd.read_csv(tmp_filename,
                         delimiter = separator,
                         header=line_number,
                         #encoding='latin-1',
@@ -155,7 +167,7 @@ def read_txt_files_as_dict(filename : str ):
    numerical_variables = [item for item in df_numerical_attrs.columns]

    # TODO:
-    if 'Pressure' in filename:
+    if 'Pressure' in tmp_filename:
        df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
        df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
@@ -166,7 +178,7 @@ def read_txt_files_as_dict(filename : str ):
    ###
    file_dict = {}
-    path_tail, path_head = os.path.split(filename)
+    path_tail, path_head = os.path.split(tmp_filename)
    file_dict['name'] = path_head
    # TODO: review this header dictionary, it may not be the best way to represent header data
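Taken together, the modified reader now works on a throwaway copy of the input file: make_file_copy duplicates the source into a local tmp_files directory, and pandas then parses the copy rather than the original. A hedged sketch of that flow follows; the input path is a placeholder, and the delimiter and header line would normally come from the header scan shown above.

import os
import shutil
import pandas as pd

def make_file_copy(source_file_path):
    # Copy the source file into ./tmp_files/backup_<name> and return the copy's path
    pathtail, filename = os.path.split(source_file_path)
    backup_filename = 'backup_' + filename
    tmp_dirpath = os.path.join(os.path.abspath(os.curdir), 'tmp_files')
    if not os.path.exists(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    tmp_file_path = os.path.join(tmp_dirpath, backup_filename)
    shutil.copy(source_file_path, tmp_file_path)
    return tmp_file_path

# Illustrative usage: parse the backup copy, not the original file (placeholder path)
tmp_filename = make_file_copy('data/RGA_example.txt')
df = pd.read_csv(tmp_filename, delimiter='\t', header=0)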

View File

@@ -11,10 +11,10 @@ def read_txt_files_as_dict(filename : str ):
    # Infer from filename whether txt file comes from smps or gas folder
    #TODO: this may be prone to error if assumed folder structure is non compliant
    if 'smps' in filename:
-        end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
+        table_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)'
        separator = '\t'
    elif 'gas' in filename:
-        end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
+        table_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4'
        separator = '\t'
    else:
        raise ValueError('intrument_folder must be set as a either "smps" or "gas"')
@@ -24,9 +24,12 @@ def read_txt_files_as_dict(filename : str ):
    data_start = False
    with open(filename,'r') as f:
        file_encoding = f.encoding
+        table_preamble = ""
        for line_number, line in enumerate(f):
            list_of_substrings = line.split(separator)
-            if end_of_header in line:
+            if not (line == '\n'):
+                table_preamble += line.strip() #+ "\n"
+            if table_of_header in line:
                data_start = True
                column_names = []
                for i, name in enumerate(list_of_substrings):
@@ -34,11 +37,8 @@ def read_txt_files_as_dict(filename : str ):
                print(line_number, len(column_names ))
                break
-            # TODO: update to extract information from lines formed by more than two elements separaed by '\t'
-            if list_of_substrings:
-                key, value = list_of_substrings[0], list_of_substrings[1::]
-                header_dict[key] = value
+        header_dict["table_preamble"] = table_preamble

    if not data_start:
        raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.')
@@ -84,13 +84,23 @@ def read_txt_files_as_dict(filename : str ):
    file_dict['datasets'].append(dataset)

    rows,cols = dataset['shape']

+    # This lines were added to test the structured array functionality
+    tmp = [tuple(dataset['data'][i,:]) for i in range(dataset['shape'][0])]
+    dtype_tmp = [(numerical_variables[i],'f4') for i in range(dataset['shape'][1])]
+    data = np.array(tmp, dtype=dtype_tmp)
+    dataset['data'] = data
+    dataset['shape'] = dataset['data'].shape

    dataset = {}
    numerical_variables= [item.encode("utf-8") for item in numerical_variables]
    dataset['name'] = 'numerical_variable_names'
    dataset['data'] = np.array(numerical_variables).reshape((1,cols))
    dataset['shape'] = dataset['data'].shape
    dataset['dtype'] = type(dataset['data'])

    file_dict['datasets'].append(dataset)

    if 'timestamps' in categorical_variables:
        dataset = {}