Incorporated a feature to merge date and time data, which may originally be in separate columns in text source files, into a single timestamp column. The columns to merge are specified in the text source specification YAML file.

2024-04-30 14:50:33 +02:00
parent f3c2777bb0
commit 553c3fe946
1 changed files with 18 additions and 4 deletions
--- a/src/g5505_file_reader.py
+++ b/src/g5505_file_reader.py
@ -72,6 +72,18 @@ def copy_file_in_group(source_file_path, dest_file_obj : h5py.File, dest_group_n
    if 'tmp_files' in tmp_file_path:
        os.remove(tmp_file_path)

+import re
+
+def infer_units(column_name):
+    """Search `column_name` for a unit written as '[...]' or '(...)'.
+
+    Returns the `re.Match` of the first form found (square brackets are
+    tried first), or None when the name contains neither.
+    """
+    match = re.search(r'\[.+\]', column_name)
+    if match is None:
+        match = re.search(r'\(.+\)', column_name)
+    return match

 def read_txt_files_as_dict(filename : str ):

@ -91,6 +103,7 @@ def read_txt_files_as_dict(filename : str ):
            file_encoding = config_dict[key].get('file_encoding',file_encoding)
            separator = config_dict[key].get('separator',separator).replace('\\t','\t')
            table_header = config_dict[key].get('table_header',table_header)
+            timestamp_variables = config_dict[key].get('timestamp',[])
            break
    #if 'None' in table_header:
    #    return {}
@ -140,10 +153,11 @@ def read_txt_files_as_dict(filename : str ):
        df_categorical_attrs = df.select_dtypes(exclude='number')
        numerical_variables = [item for item in df_numerical_attrs.columns]       

-        # TODO: 
-        if 'Pressure' in tmp_filename:
-            df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
-            df_categorical_attrs = df_categorical_attrs.drop(columns=['0_Date','1_Time'])
+        # Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
+        if timestamp_variables:
+            df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
+            #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
+            df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)

        categorical_variables = [item for item in df_categorical_attrs.columns]
        ####