diff --git a/instruments/readers/config_text_reader.yaml b/instruments/readers/config_text_reader.yaml index db00bed..555c429 100644 --- a/instruments/readers/config_text_reader.yaml +++ b/instruments/readers/config_text_reader.yaml @@ -1,7 +1,7 @@ default: file_encoding : 'utf-8' - separator : 'None' - table_header : 'None' + separator : ',' + table_header : 'infer' desired_format: '%Y-%m-%d %H:%M:%S.%f' RGA: diff --git a/instruments/readers/filereader_registry.py b/instruments/readers/filereader_registry.py index a9c4774..64b4fb5 100644 --- a/instruments/readers/filereader_registry.py +++ b/instruments/readers/filereader_registry.py @@ -17,8 +17,7 @@ file_readers = { 'txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False), 'TXT': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False), 'dat': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False), - #'ACSM_TOFWARE_txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False), - #'ACSM_TOFWARE_csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False) + 'csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False) } # Add new "instrument reader (Data flagging app data)" diff --git a/instruments/readers/g5505_text_reader.py b/instruments/readers/g5505_text_reader.py index ed019e0..396e0e1 100644 --- a/instruments/readers/g5505_text_reader.py +++ b/instruments/readers/g5505_text_reader.py @@ -37,17 +37,22 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with file_encoding = config_dict['default']['file_encoding'] #'utf-8' separator = config_dict['default']['separator'] table_header = config_dict['default']['table_header'] + timestamp_variables = [] + datetime_format = [] + tb_idx = 0 + column_names = '' + 
description_dict = {} - for key in config_dict.keys(): - if key.replace('/',os.sep) in filename: - file_encoding = config_dict[key].get('file_encoding',file_encoding) - separator = config_dict[key].get('separator',separator) - table_header = config_dict[key].get('table_header',table_header) - timestamp_variables = config_dict[key].get('timestamp',[]) - datetime_format = config_dict[key].get('datetime_format',[]) + for instFolder in config_dict.keys(): + if instFolder in filename.split(os.sep): + file_encoding = config_dict[instFolder].get('file_encoding',file_encoding) + separator = config_dict[instFolder].get('separator',separator) + table_header = config_dict[instFolder].get('table_header',table_header) + timestamp_variables = config_dict[instFolder].get('timestamp',[]) + datetime_format = config_dict[instFolder].get('datetime_format',[]) - description_dict = {} - link_to_description = config_dict[key].get('link_to_description', '').replace('/', os.sep) + + link_to_description = config_dict[instFolder].get('link_to_description', '').replace('/', os.sep) if link_to_description: path = os.path.join(instruments_dir, link_to_description) @@ -75,49 +80,60 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with file_encoding = [file_encoding] separator = [separator] - with open(tmp_filename,'rb') as f: - table_preamble = [] - for line_number, line in enumerate(f): + table_preamble = [] + line_number = 0 + if 'infer' not in table_header: + with open(tmp_filename,'rb') as f: - for tb_idx, tb in enumerate(table_header): - if tb in line.decode(file_encoding[tb_idx]): + for line_number, line in enumerate(f): + + + for tb_idx, tb in enumerate(table_header): + if tb in line.decode(file_encoding[tb_idx]): + break + + if tb in line.decode(file_encoding[tb_idx]): + list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t','\t')) + + # Count occurrences of each substring + substring_counts = 
collections.Counter(list_of_substrings) + data_start = True + # Generate column names with appended index only for repeated substrings + column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)] + + #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)] + #column_names = [] + #for i, name in enumerate(list_of_substrings): + # column_names.append(str(i)+'_'+name) + + #print(line_number, len(column_names ),'\n') break - - if tb in line.decode(file_encoding[tb_idx]): - list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t','\t')) - - # Count occurrences of each substring - substring_counts = collections.Counter(list_of_substrings) - data_start = True - # Generate column names with appended index only for repeated substrings - column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)] - - #column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)] - #column_names = [] - #for i, name in enumerate(list_of_substrings): - # column_names.append(str(i)+'_'+name) - - #print(line_number, len(column_names ),'\n') - break - # Subdivide line into words, and join them by single space. - # I asumme this can produce a cleaner line that contains no weird separator characters \t \r or extra spaces and so on. - list_of_substrings = line.decode(file_encoding[tb_idx]).split() - # TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character - #line = ' '.join(list_of_substrings+['\n']) - #line = ' '.join(list_of_substrings) - table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line + # Subdivide line into words, and join them by single space. + # I assume this can produce a cleaner line that contains no weird separator characters \t \r or extra spaces and so on. 
+ list_of_substrings = line.decode(file_encoding[tb_idx]).split() + # TODO: ideally we should use a multiline string but the yaml parser is not recognizing \n as special character + #line = ' '.join(list_of_substrings+['\n']) + #line = ' '.join(list_of_substrings) + table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line # TODO: it does not work with separator as none :(. fix for RGA try: - df = pd.read_csv(tmp_filename, - delimiter = separator[tb_idx].replace('\\t','\t'), - header=line_number, - #encoding='latin-1', - encoding = file_encoding[tb_idx], - names=column_names, - skip_blank_lines=True) + if not 'infer' in table_header: + df = pd.read_csv(tmp_filename, + delimiter = separator[tb_idx].replace('\\t','\t'), + header=line_number, + #encoding='latin-1', + encoding = file_encoding[tb_idx], + names=column_names, + skip_blank_lines=True) + else: + df = pd.read_csv(tmp_filename, + delimiter = separator[tb_idx].replace('\\t','\t'), + header=line_number, + encoding = file_encoding[tb_idx], + skip_blank_lines=True) df_numerical_attrs = df.select_dtypes(include ='number') df_categorical_attrs = df.select_dtypes(exclude='number')