Optimized and included DataFrame-to-NumPy structured array conversion.
- Replaced loop-plus-append with a list comprehension.
- Replaced row-wise pandas DataFrame column concatenation with the column-wise df.agg() method.

This commit is contained in:
2024-05-23 22:18:37 +02:00
parent a45fb4476b
commit bd458c6cd0

View File

@ -86,6 +86,37 @@ def infer_units(column_name):
return match return match
def dataframe_to_np_structured_array(df: pd.DataFrame):
    """Convert a pandas DataFrame into a NumPy structured array.

    String columns are stored as fixed-length byte strings sized to the
    longest value in the column (fixed-length strings give better control
    in h5py than variable-length ones); integer columns are stored as
    32-bit integers and float columns as 32-bit floats.

    Parameters
    ----------
    df : pd.DataFrame
        Input table; every column must have a string, integer, or float dtype.

    Returns
    -------
    np.ndarray
        Structured array with one named field per DataFrame column.

    Raises
    ------
    ValueError
        If a column has a dtype other than string, integer, or float.
    """
    # Define the dtype for the structured array, ensuring compatibility with h5py.
    dtype = []
    for col in df.columns:
        col_dtype = df[col].dtype
        if pd.api.types.is_string_dtype(col_dtype):
            # Fixed-length byte strings sized to the longest entry.
            # Bug fix: max_len was previously computed but ignored in favor
            # of a hard-coded (and invalid) 'S15,936' dtype string.
            max_len = df[col].str.len().max()
            # Guard against an empty/all-NaN column, which would yield NaN
            # here and an invalid 'S0'/'Snan' dtype.
            max_len = max(int(max_len), 1) if pd.notna(max_len) else 1
            dtype.append((col, f'S{max_len}'))
        elif pd.api.types.is_integer_dtype(col_dtype):
            dtype.append((col, 'i4'))  # assuming 32-bit integers suffice
        elif pd.api.types.is_float_dtype(col_dtype):
            dtype.append((col, 'f4'))  # assuming 32-bit floats suffice
        else:
            raise ValueError(f"Unsupported dtype: {col_dtype}")

    # Each DataFrame row becomes one tuple (one record) of the structured array.
    return np.array(list(df.itertuples(index=False, name=None)), dtype=dtype)
def read_txt_files_as_dict(filename : str ): def read_txt_files_as_dict(filename : str ):
with open('src/text_data_sources.yaml','r') as stream: with open('src/text_data_sources.yaml','r') as stream:
@ -131,9 +162,10 @@ def read_txt_files_as_dict(filename : str ):
if table_header in line.decode(file_encoding): if table_header in line.decode(file_encoding):
list_of_substrings = line.decode(file_encoding).split(separator) list_of_substrings = line.decode(file_encoding).split(separator)
data_start = True data_start = True
column_names = [] column_names = [str(i)+'_'+name for i, name in enumerate(list_of_substrings)]
for i, name in enumerate(list_of_substrings): #column_names = []
column_names.append(str(i)+'_'+name) #for i, name in enumerate(list_of_substrings):
# column_names.append(str(i)+'_'+name)
#print(line_number, len(column_names ),'\n') #print(line_number, len(column_names ),'\n')
break break
@ -143,9 +175,17 @@ def read_txt_files_as_dict(filename : str ):
# TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character # TODO: ideally we should use a multilinear string but the yalm parser is not recognizing \n as special character
#line = ' '.join(list_of_substrings+['\n']) #line = ' '.join(list_of_substrings+['\n'])
#line = ' '.join(list_of_substrings) #line = ' '.join(list_of_substrings)
table_preamble.append(' '.join(list_of_substrings))# += new_line table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line
header_dict["table_preamble"] = table_preamble # Represent string values as fixed length strings in the HDF5 file, which need
# to be decoded as string when we read them. It provides better control than variable strings,
# at the expense of flexibility.
# https://docs.h5py.org/en/stable/strings.html
if table_preamble:
max_length = max(len(item) for item in table_preamble)
utf8_type = h5py.string_dtype('utf-8', max_length)
header_dict["table_preamble"] = np.array(table_preamble,dtype=utf8_type)
# TODO: it does not work with separator as none :(. fix for RGA # TODO: it does not work with separator as none :(. fix for RGA
@ -164,9 +204,14 @@ def read_txt_files_as_dict(filename : str ):
# Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml # Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
if timestamp_variables: if timestamp_variables:
df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index] #df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
#df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index] #df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
df_categorical_attrs['timestamps'] = df_categorical_attrs[timestamp_variables].astype(str).agg(' '.join, axis=1)
df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables) df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)
#print(df_categorical_attrs)
categorical_variables = [item for item in df_categorical_attrs.columns] categorical_variables = [item for item in df_categorical_attrs.columns]
#### ####
@ -185,13 +230,12 @@ def read_txt_files_as_dict(filename : str ):
if numerical_variables: if numerical_variables:
dataset = {} dataset = {}
dataset['name'] = 'numerical_variables' dataset['name'] = 'table_numerical_variables'
dataset['data'] = df_numerical_attrs.to_numpy() dataset['data'] = dataframe_to_np_structured_array(df_numerical_attrs) #df_numerical_attrs.to_numpy()
dataset['shape'] = dataset['data'].shape dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) dataset['dtype'] = type(dataset['data'])
#dataset['data_units'] = file_obj['wave']['data_units'] #dataset['data_units'] = file_obj['wave']['data_units']
file_dict['datasets'].append(dataset) file_dict['datasets'].append(dataset)
rows,cols = dataset['shape']
try: try:
dataset['attributes'] = description_dict['table_header'].copy() dataset['attributes'] = description_dict['table_header'].copy()
@ -203,29 +247,29 @@ def read_txt_files_as_dict(filename : str ):
except ValueError as err: except ValueError as err:
print(err) print(err)
dataset = {} #dataset = {}
numerical_variables= [item.encode("utf-8") for item in numerical_variables] #numerical_variables= [item.encode("utf-8") for item in numerical_variables]
dataset['name'] = 'numerical_variable_names' #dataset['name'] = 'numerical_variable_names'
dataset['data'] = np.array(numerical_variables).reshape((1,cols)) #dataset['data'] = np.array(numerical_variables).reshape((1,len(numerical_variables)))
dataset['shape'] = dataset['data'].shape #dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) #dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset) #file_dict['datasets'].append(dataset)
if categorical_variables: if categorical_variables:
dataset = {} dataset = {}
dataset['name'] = 'categorical_variables' dataset['name'] = 'table_categorical_variables'
dataset['data'] = df_categorical_attrs.loc[:,categorical_variables].to_numpy() dataset['data'] = dataframe_to_np_structured_array(df_categorical_attrs) #df_categorical_attrs.loc[:,categorical_variables].to_numpy()
dataset['shape'] = dataset['data'].shape dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset) file_dict['datasets'].append(dataset)
dataset = {} # dataset = {}
categorical_variables = [item.encode("utf-8") for item in categorical_variables] # categorical_variables = [item.encode("utf-8") for item in categorical_variables]
dataset['name'] = 'categorial_variable_names' # dataset['name'] = 'categorial_variable_names'
dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables))) # dataset['data'] = np.array(categorical_variables).reshape((1,len(categorical_variables)))
dataset['shape'] = dataset['data'].shape # dataset['shape'] = dataset['data'].shape
dataset['dtype'] = type(dataset['data']) # dataset['dtype'] = type(dataset['data'])
file_dict['datasets'].append(dataset) # file_dict['datasets'].append(dataset)
except: except:
return {} return {}