Fix reader txt/csv default behavior.
This commit is contained in:
@ -1,7 +1,7 @@
|
||||
default:
|
||||
file_encoding : 'utf-8'
|
||||
separator : 'None'
|
||||
table_header : 'None'
|
||||
separator : ','
|
||||
table_header : 'infer'
|
||||
desired_format: '%Y-%m-%d %H:%M:%S.%f'
|
||||
|
||||
RGA:
|
||||
|
@ -17,8 +17,7 @@ file_readers = {
|
||||
'txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
|
||||
'TXT': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
|
||||
'dat': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
|
||||
#'ACSM_TOFWARE_txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
|
||||
#'ACSM_TOFWARE_csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False)
|
||||
'csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False)
|
||||
}
|
||||
|
||||
# Add new "instrument reader (Data flagging app data)"
|
||||
|
@ -37,17 +37,22 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
file_encoding = config_dict['default']['file_encoding'] #'utf-8'
|
||||
separator = config_dict['default']['separator']
|
||||
table_header = config_dict['default']['table_header']
|
||||
timestamp_variables = []
|
||||
datetime_format = []
|
||||
tb_idx = 0
|
||||
column_names = ''
|
||||
description_dict = {}
|
||||
|
||||
for key in config_dict.keys():
|
||||
if key.replace('/',os.sep) in filename:
|
||||
file_encoding = config_dict[key].get('file_encoding',file_encoding)
|
||||
separator = config_dict[key].get('separator',separator)
|
||||
table_header = config_dict[key].get('table_header',table_header)
|
||||
timestamp_variables = config_dict[key].get('timestamp',[])
|
||||
datetime_format = config_dict[key].get('datetime_format',[])
|
||||
for instFolder in config_dict.keys():
|
||||
if instFolder in filename.split(os.sep):
|
||||
file_encoding = config_dict[instFolder].get('file_encoding',file_encoding)
|
||||
separator = config_dict[instFolder].get('separator',separator)
|
||||
table_header = config_dict[instFolder].get('table_header',table_header)
|
||||
timestamp_variables = config_dict[instFolder].get('timestamp',[])
|
||||
datetime_format = config_dict[instFolder].get('datetime_format',[])
|
||||
|
||||
description_dict = {}
|
||||
link_to_description = config_dict[key].get('link_to_description', '').replace('/', os.sep)
|
||||
|
||||
link_to_description = config_dict[instFolder].get('link_to_description', '').replace('/', os.sep)
|
||||
|
||||
if link_to_description:
|
||||
path = os.path.join(instruments_dir, link_to_description)
|
||||
@ -75,49 +80,60 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
file_encoding = [file_encoding]
|
||||
separator = [separator]
|
||||
|
||||
with open(tmp_filename,'rb') as f:
|
||||
table_preamble = []
|
||||
for line_number, line in enumerate(f):
|
||||
table_preamble = []
|
||||
line_number = 0
|
||||
if 'infer' not in table_header:
|
||||
|
||||
with open(tmp_filename,'rb') as f:
|
||||
|
||||
for tb_idx, tb in enumerate(table_header):
|
||||
if tb in line.decode(file_encoding[tb_idx]):
|
||||
for line_number, line in enumerate(f):
|
||||
|
||||
|
||||
for tb_idx, tb in enumerate(table_header):
|
||||
if tb in line.decode(file_encoding[tb_idx]):
|
||||
break
|
||||
|
||||
if tb in line.decode(file_encoding[tb_idx]):
|
||||
list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t','\t'))
|
||||
|
||||
# Count occurrences of each substring
|
||||
substring_counts = collections.Counter(list_of_substrings)
|
||||
data_start = True
|
||||
# Generate column names with appended index only for repeated substrings
|
||||
column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]
|
||||
|
||||
#column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
|
||||
#column_names = []
|
||||
#for i, name in enumerate(list_of_substrings):
|
||||
# column_names.append(str(i)+'_'+name)
|
||||
|
||||
#print(line_number, len(column_names ),'\n')
|
||||
break
|
||||
|
||||
if tb in line.decode(file_encoding[tb_idx]):
|
||||
list_of_substrings = line.decode(file_encoding[tb_idx]).split(separator[tb_idx].replace('\\t','\t'))
|
||||
|
||||
# Count occurrences of each substring
|
||||
substring_counts = collections.Counter(list_of_substrings)
|
||||
data_start = True
|
||||
# Generate column names with appended index only for repeated substrings
|
||||
column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]
|
||||
|
||||
#column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
|
||||
#column_names = []
|
||||
#for i, name in enumerate(list_of_substrings):
|
||||
# column_names.append(str(i)+'_'+name)
|
||||
|
||||
#print(line_number, len(column_names ),'\n')
|
||||
break
|
||||
# Subdivide line into words, and join them by single space.
|
||||
# I assume this produces a cleaner line that contains no stray separator characters (\t, \r) or extra spaces.
|
||||
list_of_substrings = line.decode(file_encoding[tb_idx]).split()
|
||||
# TODO: ideally we should use a multiline string, but the YAML parser does not recognize \n as a special character.
|
||||
#line = ' '.join(list_of_substrings+['\n'])
|
||||
#line = ' '.join(list_of_substrings)
|
||||
table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line
|
||||
# Subdivide line into words, and join them by single space.
|
||||
# I assume this produces a cleaner line that contains no stray separator characters (\t, \r) or extra spaces.
|
||||
list_of_substrings = line.decode(file_encoding[tb_idx]).split()
|
||||
# TODO: ideally we should use a multiline string, but the YAML parser does not recognize \n as a special character.
|
||||
#line = ' '.join(list_of_substrings+['\n'])
|
||||
#line = ' '.join(list_of_substrings)
|
||||
table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line
|
||||
|
||||
|
||||
# TODO: this does not work when the separator is set to None; needs a fix for RGA.
|
||||
try:
|
||||
df = pd.read_csv(tmp_filename,
|
||||
delimiter = separator[tb_idx].replace('\\t','\t'),
|
||||
header=line_number,
|
||||
#encoding='latin-1',
|
||||
encoding = file_encoding[tb_idx],
|
||||
names=column_names,
|
||||
skip_blank_lines=True)
|
||||
if not 'infer' in table_header:
|
||||
df = pd.read_csv(tmp_filename,
|
||||
delimiter = separator[tb_idx].replace('\\t','\t'),
|
||||
header=line_number,
|
||||
#encoding='latin-1',
|
||||
encoding = file_encoding[tb_idx],
|
||||
names=column_names,
|
||||
skip_blank_lines=True)
|
||||
else:
|
||||
df = pd.read_csv(tmp_filename,
|
||||
delimiter = separator[tb_idx].replace('\\t','\t'),
|
||||
header=line_number,
|
||||
encoding = file_encoding[tb_idx],
|
||||
skip_blank_lines=True)
|
||||
|
||||
df_numerical_attrs = df.select_dtypes(include ='number')
|
||||
df_categorical_attrs = df.select_dtypes(exclude='number')
|
||||
|
Reference in New Issue
Block a user