"""Helpers for list type checks and for deriving columns on pandas DataFrames."""

import os

import pandas as pd


def is_callable_list(x: list) -> bool:
    """Return True if every item of ``x`` is callable (True for an empty list)."""
    return all(callable(item) for item in x)


def is_str_list(x: list) -> bool:
    """Return True if every item of ``x`` is a ``str`` (True for an empty list)."""
    return all(isinstance(item, str) for item in x)


def augment_with_filetype(df):
    """Add a ``'filetype'`` column holding each filename's extension.

    The extension is taken from ``df['filename']`` with ``os.path.splitext``
    and stored without the leading dot (empty string when there is none).
    ``df`` is modified in place and also returned.
    """
    df['filetype'] = [os.path.splitext(item)[1][1:] for item in df['filename']]
    return df


def augment_with_filenumber(df):
    """Add a ``'filenumber'`` column: the part of each filename before the first ``'_'``.

    A filename without an underscore is kept whole. (Slicing up to
    ``str.find('_')`` would use -1 in that case and silently drop the last
    character.) ``df`` is modified in place and also returned.
    """
    df['filenumber'] = [item.partition('_')[0] for item in df['filename']]
    return df


def group_by_df_column(df, column_name: str):
    """Return the column ``column_name`` of ``df`` after checking that it exists.

    NOTE(review): despite the name, no grouping is performed here; the column
    is returned as-is, presumably for the caller to use as a grouping key.

    Args:
        df (pandas.DataFrame): frame to read the column from.
        column_name (str): name of the column of ``df`` to return.

    Raises:
        ValueError: if ``column_name`` is not one of ``df.columns``.
    """
    if column_name not in df.columns:
        raise ValueError("column_name must be in the columns of df.")

    return df[column_name]


def split_sample_col_into_sample_and_data_quality_cols(input_data: pd.DataFrame):
    """Split ``'sample'`` entries of the form ``name(quality)`` into two columns.

    For each entry of ``input_data['sample']``:

    * ``"name(quality)"`` -> sample ``"name"``, data quality ``"quality"``
      (the split happens at the first ``'('``; a trailing ``')'`` is removed
      only if it is actually there);
    * ``""`` or a missing value (``None``/NaN) -> ``'Not yet annotated'`` /
      ``'unevaluated'``;
    * anything else -> the entry itself / ``'good data'``.

    ``input_data`` is modified in place (``'sample'`` is overwritten and
    ``'data_quality'`` is added) and also returned.
    """
    sample_name = []
    sample_quality = []
    for item in input_data['sample']:
        # Missing cells are not strings and have no .find(); treat them like
        # an empty annotation instead of failing with AttributeError.
        if not isinstance(item, str) and pd.isna(item):
            item = ''

        name, paren, rest = item.partition('(')
        if paren:
            sample_name.append(name)
            # Strip the closing parenthesis only when present, so an
            # unterminated "name(quality" does not lose its last character.
            sample_quality.append(rest[:-1] if rest.endswith(')') else rest)
        elif item == '':
            sample_name.append('Not yet annotated')
            sample_quality.append('unevaluated')
        else:
            sample_name.append(item)
            sample_quality.append('good data')

    input_data['sample'] = sample_name
    input_data['data_quality'] = sample_quality

    return input_data