Created the g5505_utils.py module, which holds helper functions specific to the 5505 group. This removes clutter from the main module, hdf5_lib.py.

2024-02-13 16:39:05 +01:00
parent 8f757ca68c
commit d62327ba25
1 changed files with 51 additions and 0 deletions
--- a/g5505_utils.py
+++ b/g5505_utils.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import os
+
+
+def is_callable_list(x: list):
+    """Return True when every element of x is callable (True for an empty x)."""
+    return all(map(callable, x))
+
+def is_str_list(x: list):
+    """Return True when every element of x is a str (True for an empty x)."""
+    return all(isinstance(entry, str) for entry in x)
+
+def augment_with_filetype(df):
+    """Add a 'filetype' column with each filename's extension, dot removed.
+
+    The extension comes from os.path.splitext applied to every entry of
+    df['filename']; an entry without an extension yields an empty string.
+    The column is written into df itself and that same df is returned.
+    """
+    extensions = []
+    for filename in df['filename']:
+        _, dotted_extension = os.path.splitext(filename)
+        # splitext keeps the leading dot; slice it off.
+        extensions.append(dotted_extension[1:])
+    df['filetype'] = extensions
+    return df
+
+def augment_with_filenumber(df):
+    """Add a 'filenumber' column with the text before the first underscore.
+
+    For every entry of df['filename'] the part preceding the first '_' is
+    stored in df['filenumber']. A filename without any underscore is kept
+    whole, and a filename starting with '_' yields an empty string. The
+    column is written into df itself and that same df is returned.
+    """
+    # str.partition is used instead of slicing up to str.find('_'): find
+    # returns -1 when there is no underscore, which would turn the slice
+    # into item[0:-1] and silently chop off the last character.
+    df['filenumber'] = [item.partition('_')[0] for item in df['filename']]
+    return df
+
+def group_by_df_column(df, column_name: str):
+    """
+    Return the column of df named column_name after checking that it exists.
+
+    df (pandas.DataFrame): table the column is read from.
+    column_name (str): name of the column to return; must be in df.columns.
+
+    Raises ValueError when column_name is not one of df.columns.
+
+    NOTE(review): despite the name, no grouping is performed here; presumably
+    the caller groups by the returned column - confirm against call sites.
+    """
+    column_exists = column_name in df.columns
+    if not column_exists:
+        raise ValueError("column_name must be in the columns of df.")
+
+    return df[column_name]
+
+def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame):
+    """Split 'sample' entries of the form 'name(quality)' into two columns.
+
+    Each entry of input_data['sample'] is handled as follows:
+      * contains '(': the text before the first '(' becomes the sample name
+        and the text after it, minus one trailing ')', the data quality;
+      * empty string: name 'Not yet annotated', quality 'unevaluated';
+      * anything else: the entry is the name and the quality is 'good data'.
+
+    The 'sample' column is overwritten with the names and a 'data_quality'
+    column is added, both on input_data itself, which is then returned.
+    Surrounding whitespace is not stripped from either part.
+    """
+    sample_name = []
+    sample_quality = []
+    for item in input_data['sample']:
+        name, paren, quality = item.partition('(')
+        if paren:
+            # Remove the closing parenthesis only when it is really there, so
+            # an entry lacking it does not lose its last character.
+            if quality.endswith(')'):
+                quality = quality[:-1]
+        elif item == '':
+            name, quality = 'Not yet annotated', 'unevaluated'
+        else:
+            quality = 'good data'
+        sample_name.append(name)
+        sample_quality.append(quality)
+    input_data['sample'] = sample_name
+    input_data['data_quality'] = sample_quality
+
+    return input_data