Fix reader txt/csv default behavior.

2025-02-07 16:25:45 +01:00
parent 0d26777732
commit f986edd4a5
3 changed files with 64 additions and 49 deletions
--- a/instruments/readers/config_text_reader.yaml
+++ b/instruments/readers/config_text_reader.yaml
@ -1,7 +1,7 @@
 default:
  file_encoding : 'utf-8'
-  separator : 'None'
-  table_header : 'None'
+  separator : ','
+  table_header : 'infer'
  desired_format: '%Y-%m-%d %H:%M:%S.%f'

 RGA:
--- a/instruments/readers/filereader_registry.py
+++ b/instruments/readers/filereader_registry.py
@ -17,8 +17,7 @@ file_readers = {
    'txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    'TXT': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    'dat': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
-    #'ACSM_TOFWARE_txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
-    #'ACSM_TOFWARE_csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False)
+    'csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False)
 }

 # Add new "instrument reader (Data flagging app data)"
--- a/instruments/readers/g5505_text_reader.py
+++ b/instruments/readers/g5505_text_reader.py
@ -37,17 +37,22 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
    file_encoding = config_dict['default']['file_encoding'] #'utf-8'
    separator = config_dict['default']['separator']
    table_header = config_dict['default']['table_header']
+    timestamp_variables = []
+    datetime_format = []
+    tb_idx = 0
+    column_names = ''
+    description_dict = {}

-    for key in config_dict.keys():
-        if key.replace('/',os.sep) in filename:
-            file_encoding = config_dict[key].get('file_encoding',file_encoding)
-            separator = config_dict[key].get('separator',separator)
-            table_header = config_dict[key].get('table_header',table_header)
-            timestamp_variables = config_dict[key].get('timestamp',[])
-            datetime_format = config_dict[key].get('datetime_format',[])
+    for instFolder in config_dict.keys():
+        if instFolder in filename.split(os.sep):
+            file_encoding = config_dict[instFolder].get('file_encoding',file_encoding)
+            separator = config_dict[instFolder].get('separator',separator)
+            table_header = config_dict[instFolder].get('table_header',table_header)
+            timestamp_variables = config_dict[instFolder].get('timestamp',[])
+            datetime_format = config_dict[instFolder].get('datetime_format',[])

-            description_dict = {}
-            link_to_description = config_dict[key].get('link_to_description', '').replace('/', os.sep) 
+            
+            link_to_description = config_dict[instFolder].get('link_to_description', '').replace('/', os.sep) 

            if link_to_description:
                path = os.path.join(instruments_dir, link_to_description)                
@ -75,49 +80,60 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
        file_encoding = [file_encoding]
        separator = [separator]

-    with open(tmp_filename,'rb') as f:
-        table_preamble = []
-        for line_number, line in enumerate(f):   
+    table_preamble = []
+    line_number = 0
+    if 'infer' not in table_header:

+        with open(tmp_filename,'rb') as f:
            
-            for tb_idx, tb in enumerate(table_header):
-                if tb in line.decode(file_encoding[tb_idx]):
+            for line_number, line in enumerate(f):   
+
+                
+                for tb_idx, tb in enumerate(table_header):
+                    if tb in line.decode(file_encoding[tb_idx]):
+                        break
+
+                if tb in line.decode(file_encoding[tb_idx]):   
+                    list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t','\t'))  
+
+                    # Count occurrences of each substring
+                    substring_counts = collections.Counter(list_of_substrings)
+                    data_start = True  
+                    # Generate column names with appended index only for repeated substrings
+                    column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]           
+
+                    #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
+                    #column_names = []
+                    #for i, name in enumerate(list_of_substrings):
+                    #    column_names.append(str(i)+'_'+name) 
+
+                    #print(line_number, len(column_names ),'\n')
                    break
-
-            if tb in line.decode(file_encoding[tb_idx]):   
-                list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t','\t'))  
-
-                # Count occurrences of each substring
-                substring_counts = collections.Counter(list_of_substrings)
-                data_start = True  
-                # Generate column names with appended index only for repeated substrings
-                column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]           
-
-                #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
-                #column_names = []
-                #for i, name in enumerate(list_of_substrings):
-                #    column_names.append(str(i)+'_'+name) 
-
-                #print(line_number, len(column_names ),'\n')
-                break
-            # Subdivide line into words, and join them by single space. 
-            # I asumme this can produce a cleaner line that contains no weird separator characters \t \r or extra spaces and so on.
-            list_of_substrings = line.decode(file_encoding[tb_idx]).split()
-            # TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character
-            #line = ' '.join(list_of_substrings+['\n'])
-            #line = ' '.join(list_of_substrings)     
-            table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line  
+                # Subdivide line into words, and join them by single space. 
+                # I assume this produces a cleaner line that contains no stray separator characters (\t, \r), extra spaces, and so on.
+                list_of_substrings = line.decode(file_encoding[tb_idx]).split()
+                # TODO: ideally we should use a multi-line string, but the YAML parser does not recognize \n as a special character
+                #line = ' '.join(list_of_substrings+['\n'])
+                #line = ' '.join(list_of_substrings)     
+                table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line  

  
    # TODO: it does not work with separator as none :(. fix for RGA
    try:
-        df = pd.read_csv(tmp_filename, 
-                        delimiter = separator[tb_idx].replace('\\t','\t'), 
-                        header=line_number, 
-                        #encoding='latin-1',
-                        encoding = file_encoding[tb_idx],
-                        names=column_names,
-                        skip_blank_lines=True)
+        if not 'infer' in table_header:
+            df = pd.read_csv(tmp_filename, 
+                            delimiter = separator[tb_idx].replace('\\t','\t'), 
+                            header=line_number, 
+                            #encoding='latin-1',
+                            encoding = file_encoding[tb_idx],
+                            names=column_names,
+                            skip_blank_lines=True)
+        else:
+            df = pd.read_csv(tmp_filename, 
+                delimiter = separator[tb_idx].replace('\\t','\t'), 
+                header=line_number, 
+                encoding = file_encoding[tb_idx],
+                skip_blank_lines=True)
   
        df_numerical_attrs = df.select_dtypes(include ='number')
        df_categorical_attrs = df.select_dtypes(exclude='number')