diff --git a/src/g5505_file_reader.py b/src/g5505_file_reader.py index 3208535..8811d50 100644 --- a/src/g5505_file_reader.py +++ b/src/g5505_file_reader.py @@ -63,6 +63,22 @@ def read_xps_ibw_file_as_dict(filename): return file_dict +def make_file_copy(source_file_path): + + pathtail, filename = os.path.split(source_file_path) + backup_filename = 'backup_'+ filename + # Path + ROOT_DIR = os.path.abspath(os.curdir) + + tmp_dirpath = os.path.join(ROOT_DIR,'tmp_files') + if not os.path.exists(tmp_dirpath): + os.mkdir(tmp_dirpath) + + tmp_file_path = os.path.join(tmp_dirpath,backup_filename) + shutil.copy(source_file_path, tmp_file_path) + + return tmp_file_path + def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_name): # Create copy of original file to avoid possible file corruption and work with it. @@ -87,10 +103,9 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n def read_txt_files_as_dict(filename : str ): - #if instrument_folder == 'smps': - # Infer from filename whether txt file comes from smps or gas folder + #TODO: this may be prone to error if assumed folder structure is non compliant - if 'RGA' in filename: + if 'RGA' in filename: #TODO: it does not work with separator as none :(. 
fix for RGA #end_of_header = 'Channel, Mass(amu), Name, Cal Factor, Noise Floor, CEM Status', table_header = 'Time(s) Channel#1 Channel#2 Channel#3 Channel#4 Channel#5 Channel#6 Channel#7 Channel#8' separator = None @@ -99,6 +114,15 @@ def read_txt_files_as_dict(filename : str ): separator = '\t' #elif 'gas' in filename: # end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4' + elif 'Humidity_Sensors' in filename: + table_header = 'Date Time RH1[%] RH2[%] RH3[%] RH4[%] RH5[%] RH6[%] RH7[%] RH8[%] T1[°C] T2[°C] T3[°C] T4[°C] T5[°C] T6[°C] T7[°C] T8[°C] DP1[°C] DP2[°C] DP3[°C] DP4[°C] DP5[°C] DP6[°C] DP7[°C] DP8[°C]' + separator = '\t' + elif 'ICAD' in filename and 'HONO' in filename: + table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. Intensity valid MeasMode SampleSource' + separator = '\t' + elif 'ICAD' in filename and 'NO2' in filename: + table_header = 'Start Date/Time (UTC) Duration (s) NO2 (ppb) NO2 Uncertainty (ppb) H2O (ppb) H2O Uncertainty (ppb) CHOCHO (ppb) CHOCHO Uncertainty (ppb) File Number Light Intensity #ICEDOAS iter. Cell Pressure Ambient Pressure Cell Temp Spec Temp Lat Lon Height Speed GPSQuality 0-Air Ref. Time 0-Air Ref. Duration 0-Air Ref. File Number 0-Air Ref. Intensity 0-Air Ref. Rel Intensity 0-Air Ref. 
Intensity valid MeasMode SampleSource' + separator = '\t' else: return {} #raise ValueError('intrument_folder must be set as a either "RGA" or "Pressure"') @@ -106,43 +130,31 @@ def read_txt_files_as_dict(filename : str ): # Read header as a dictionary and detect where data table starts header_dict = {} data_start = False + # Work with copy of the file for safety + tmp_filename = make_file_copy(filename) with open(filename,'r') as f: file_encoding = f.encoding + table_preamble = "" for line_number, line in enumerate(f): list_of_substrings = line.split(separator) - if table_header in line: + if not (line == '\n'): + table_preamble += line.strip() #+ "\n" + if table_header in line: data_start = True column_names = [] - #for i, name in enumerate(line.split('\t')): for i, name in enumerate(list_of_substrings): column_names.append(str(i)+'_'+name) print(line_number, len(column_names )) - - - break - else: - # TODO: update to extract information from lines formed by more than two elements separaed by '\t' - if list_of_substrings: - key, value = list_of_substrings[0], list_of_substrings[1::] - header_dict[key] = value - #if len(end_of_header) > 1 and any([item in line for item in end_of_header]): - # line_numbers.append(line_number) - #break + header_dict["table_preamble"] = table_preamble - - - if not data_start: - raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.') - - #if len(end_of_header) > 0: - + # TODO: it does not work with separator as none :(. 
fix for RGA try: - df = pd.read_csv(filename, + df = pd.read_csv(tmp_filename, delimiter = separator, header=line_number, #encoding='latin-1', @@ -155,7 +167,7 @@ def read_txt_files_as_dict(filename : str ): numerical_variables = [item for item in df_numerical_attrs.columns] # TODO: - if 'Pressure' in filename: + if 'Pressure' in tmp_filename: df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index] df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time']) @@ -166,7 +178,7 @@ def read_txt_files_as_dict(filename : str ): ### file_dict = {} - path_tail, path_head = os.path.split(filename) + path_tail, path_head = os.path.split(tmp_filename) file_dict['name'] = path_head # TODO: review this header dictionary, it may not be the best way to represent header data diff --git a/src/smog_chamber_file_reader.py b/src/smog_chamber_file_reader.py index 4a834ea..9b63996 100644 --- a/src/smog_chamber_file_reader.py +++ b/src/smog_chamber_file_reader.py @@ -11,10 +11,10 @@ def read_txt_files_as_dict(filename : str ): # Infer from filename whether txt file comes from smps or gas folder #TODO: this may be prone to error if assumed folder structure is non compliant if 'smps' in filename: - end_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)' + table_of_header = 'Sample # Date Start Time Sample Temp (C) Sample Pressure (kPa)' separator = '\t' elif 'gas' in filename: - end_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4' + table_of_header = 'Date_Time HoribaNO HoribaNOy Thermo42C_NO Thermo42C_NOx APHA370 CH4' separator = '\t' else: raise ValueError('intrument_folder must be set as a either "smps" or "gas"') @@ -24,9 +24,12 @@ def read_txt_files_as_dict(filename : str ): data_start = False with open(filename,'r') as f: file_encoding = f.encoding + table_preamble = "" for line_number, line in enumerate(f): list_of_substrings = 
line.split(separator) - if end_of_header in line: + if not (line == '\n'): + table_preamble += line.strip() #+ "\n" + if table_of_header in line: data_start = True column_names = [] for i, name in enumerate(list_of_substrings): @@ -34,11 +37,8 @@ def read_txt_files_as_dict(filename : str ): print(line_number, len(column_names )) break - - # TODO: update to extract information from lines formed by more than two elements separaed by '\t' - if list_of_substrings: - key, value = list_of_substrings[0], list_of_substrings[1::] - header_dict[key] = value + + header_dict["table_preamble"] = table_preamble if not data_start: raise ValueError('Invalid table header. The table header was not found and therefore table data cannot be extracted from txt or dat file.') @@ -84,13 +84,23 @@ def read_txt_files_as_dict(filename : str ): file_dict['datasets'].append(dataset) rows,cols = dataset['shape'] + # These lines were added to test the structured array functionality + tmp = [tuple(dataset['data'][i,:]) for i in range(dataset['shape'][0])] + dtype_tmp = [(numerical_variables[i],'f4') for i in range(dataset['shape'][1])] + + data = np.array(tmp, dtype=dtype_tmp) + dataset['data'] = data + dataset['shape'] = dataset['data'].shape + dataset = {} numerical_variables= [item.encode("utf-8") for item in numerical_variables] dataset['name'] = 'numerical_variable_names' dataset['data'] = np.array(numerical_variables).reshape((1,cols)) dataset['shape'] = dataset['data'].shape dataset['dtype'] = type(dataset['data']) - file_dict['datasets'].append(dataset) + file_dict['datasets'].append(dataset) + + if 'timestamps' in categorical_variables: dataset = {}