Fix bug. Saved flags from apps were not associated with right parent variable.

This commit is contained in:
2025-02-15 18:10:29 +01:00
parent 46767996b8
commit 25f3ee12a4
2 changed files with 1122 additions and 1122 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,301 +1,301 @@
import dima.src.hdf5_ops as h5de import dima.src.hdf5_ops as h5de
from plotly.subplots import make_subplots from plotly.subplots import make_subplots
import plotly.graph_objs as go import plotly.graph_objs as go
import base64 import base64
import os import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import dima.utils.g5505_utils as utils import dima.utils.g5505_utils as utils
UPLOAD_DIRECTORY = 'data_products/' UPLOAD_DIRECTORY = 'data/'
flags_dict = { flags_dict = {
"000" : {"flag_label": 'V', "flag_description": "Valid measurement"}, "000" : {"flag_label": 'V', "flag_description": "Valid measurement"},
"100" : {"flag_label": 'V', "flag_description": "Checked by data originator. Valid measurement, overrides any invalid flags"}, "100" : {"flag_label": 'V', "flag_description": "Checked by data originator. Valid measurement, overrides any invalid flags"},
"110" : {"flag_label": 'V', "flag_description": "Episode data checked and accepted by data originator. Valid measurement"}, "110" : {"flag_label": 'V', "flag_description": "Episode data checked and accepted by data originator. Valid measurement"},
"111" : {"flag_label": 'V', "flag_description": "Irregular data checked and accepted by data originator. Valid measurement"}, "111" : {"flag_label": 'V', "flag_description": "Irregular data checked and accepted by data originator. Valid measurement"},
"456" : {"flag_label": 'I', "flag_description": "Invalidated by data originator"}, "456" : {"flag_label": 'I', "flag_description": "Invalidated by data originator"},
"460" : {"flag_label": 'I', "flag_description": "Contamination suspected"}, "460" : {"flag_label": 'I', "flag_description": "Contamination suspected"},
"559" : {"flag_label": 'V', "flag_description": "Unspecified contamination or local influence, but considered valid"}, "559" : {"flag_label": 'V', "flag_description": "Unspecified contamination or local influence, but considered valid"},
"599" : {"flag_label": 'I', "flag_description": "Unspecified contamination or local influence"}, "599" : {"flag_label": 'I', "flag_description": "Unspecified contamination or local influence"},
"652" : {"flag_label": 'V', "flag_description": "construction/activity nearby"}, "652" : {"flag_label": 'V', "flag_description": "construction/activity nearby"},
"659" : {"flag_label": 'I', "flag_description": "Unspecified instrument/sampling anomaly"}, "659" : {"flag_label": 'I', "flag_description": "Unspecified instrument/sampling anomaly"},
"660" : {"flag_label": 'V', "flag_description": "Unspecified instrument/sampling anomaly"}, "660" : {"flag_label": 'V', "flag_description": "Unspecified instrument/sampling anomaly"},
"999" : {"flag_label": 'I', "flag_description": "Missing measurement, unspecified reason"} "999" : {"flag_label": 'I', "flag_description": "Missing measurement, unspecified reason"}
} }
def save_file(name, content): def save_file(name, content):
# Decode the content and save the file # Decode the content and save the file
content_type, content_string = content.split(',') content_type, content_string = content.split(',')
decoded = base64.b64decode(content_string) decoded = base64.b64decode(content_string)
file_path = os.path.join(UPLOAD_DIRECTORY, name) file_path = os.path.join(UPLOAD_DIRECTORY, name)
if not os.path.exists(file_path): if not os.path.exists(file_path):
with open(file_path, "wb") as f: with open(file_path, "wb") as f:
f.write(decoded) f.write(decoded)
print(f"File saved successfully at {file_path}") print(f"File saved successfully at {file_path}")
return file_path return file_path
else: else:
print(f'File already exists at {file_path}.\nTo maintain the integrity of the existing file, it will not be overwritten.') print(f'File already exists at {file_path}.\nTo maintain the integrity of the existing file, it will not be overwritten.')
return file_path return file_path
def filter_flags_by_label(flags_dict, label): def filter_flags_by_label(flags_dict, label):
""" """
Filters the flags dictionary by the specified label. Filters the flags dictionary by the specified label.
Parameters: Parameters:
----------- -----------
flags_dict (dict): The dictionary containing flags. flags_dict (dict): The dictionary containing flags.
label (str): The label to filter by ('I' or 'V'). label (str): The label to filter by ('I' or 'V').
Returns: Returns:
-------- --------
list: A list of dictionaries with 'label' and 'value' for the specified label. list: A list of dictionaries with 'label' and 'value' for the specified label.
""" """
return [{'label': value['flag_description'], 'value': code} return [{'label': value['flag_description'], 'value': code}
for code, value in flags_dict.items() if value['flag_label'] == label] for code, value in flags_dict.items() if value['flag_label'] == label]
def create_loaded_file_figure(file_path, instFolder, dataset_name, datetime_var, datetime_var_format, variables): def create_loaded_file_figure(file_path, instFolder, dataset_name, datetime_var, datetime_var_format, variables):
DataOpsAPI = h5de.HDF5DataOpsManager(file_path) DataOpsAPI = h5de.HDF5DataOpsManager(file_path)
if not DataOpsAPI.file_obj: if not DataOpsAPI.file_obj:
DataOpsAPI.load_file_obj() DataOpsAPI.load_file_obj()
#target_channels = DataOpsAPI.file_obj[instfolder].attrs['target_channels']['names'][0].decode().split(',') #target_channels = DataOpsAPI.file_obj[instfolder].attrs['target_channels']['names'][0].decode().split(',')
#target_loc = DataOpsAPI.file_obj[instfolder].attrs['target_channels']['location'][0].decode() #target_loc = DataOpsAPI.file_obj[instfolder].attrs['target_channels']['location'][0].decode()
#diagnostic_channels = DataOpsAPI.file_obj[instfolder].attrs['diagnostic_channels']['names'][0].decode().split(',') #diagnostic_channels = DataOpsAPI.file_obj[instfolder].attrs['diagnostic_channels']['names'][0].decode().split(',')
#diagnostic_loc = DataOpsAPI.file_obj[instfolder].attrs['diagnostic_channels']['location'][0].decode() #diagnostic_loc = DataOpsAPI.file_obj[instfolder].attrs['diagnostic_channels']['location'][0].decode()
#fig = make_subplots(rows=(len(target_channels+diagnostic_channels)-2), cols=1, shared_xaxes=True, #fig = make_subplots(rows=(len(target_channels+diagnostic_channels)-2), cols=1, shared_xaxes=True,
# row_heights = [1 for i in range(len(target_channels+diagnostic_channels)-2)]) # row_heights = [1 for i in range(len(target_channels+diagnostic_channels)-2)])
fig = make_subplots(rows=(len(variables)), cols=1, fig = make_subplots(rows=(len(variables)), cols=1,
row_heights = [1 for i in range(len(variables))]) row_heights = [1 for i in range(len(variables))])
traces = [] traces = []
trace_idx = 1 trace_idx = 1
dataset = DataOpsAPI.file_obj[dataset_name] dataset = DataOpsAPI.file_obj[dataset_name]
time_column = DataOpsAPI.reformat_datetime_column(dataset_name, time_column = DataOpsAPI.reformat_datetime_column(dataset_name,
datetime_var, datetime_var,
datetime_var_format) datetime_var_format)
#time_column = dataset[datetime_var][:] #time_column = dataset[datetime_var][:]
for i in range(1,len(variables)): for i in range(1,len(variables)):
fig.add_trace(go.Scatter(x = time_column, fig.add_trace(go.Scatter(x = time_column,
y = dataset[variables[i]][:], y = dataset[variables[i]][:],
mode = 'lines', mode = 'lines',
name = variables[i]), row=trace_idx, col=1) name = variables[i]), row=trace_idx, col=1)
fig.update_yaxes(title_text= variables[i], row=trace_idx, col=1) fig.update_yaxes(title_text= variables[i], row=trace_idx, col=1)
trace_idx = trace_idx + 1 trace_idx = trace_idx + 1
#dataset = DataOpsAPI.file_obj[diagnostic_loc] #dataset = DataOpsAPI.file_obj[diagnostic_loc]
#time_column = DataOpsAPI.reformat_datetime_column(diagnostic_loc,diagnostic_channels[0],'%d.%m.%Y %H:%M:%S') #time_column = DataOpsAPI.reformat_datetime_column(diagnostic_loc,diagnostic_channels[0],'%d.%m.%Y %H:%M:%S')
#for i in range(1,len(diagnostic_channels)): #for i in range(1,len(diagnostic_channels)):
# fig.add_trace(go.Scatter(x = time_column, # fig.add_trace(go.Scatter(x = time_column,
# y = dataset[diagnostic_channels[i]][:], # y = dataset[diagnostic_channels[i]][:],
# mode = 'lines', # mode = 'lines',
# name = diagnostic_channels[i]), row=trace_idx, col=1) # name = diagnostic_channels[i]), row=trace_idx, col=1)
# fig.update_yaxes(title_text= diagnostic_channels[i], row=trace_idx, col=1, type="log") # fig.update_yaxes(title_text= diagnostic_channels[i], row=trace_idx, col=1, type="log")
# trace_idx = trace_idx + 1 # trace_idx = trace_idx + 1
fig.update_layout(height=1200, title_text=f"{instFolder} : Target and Diagnostic Channels", showlegend=False) fig.update_layout(height=1200, title_text=f"{instFolder} : Target and Diagnostic Channels", showlegend=False)
DataOpsAPI.unload_file_obj() DataOpsAPI.unload_file_obj()
#target_channels.remove(target_channels[0]) #target_channels.remove(target_channels[0])
#diagnostic_channels.remove(diagnostic_channels[0]) #diagnostic_channels.remove(diagnostic_channels[0])
return fig , [','.join([item,dataset_name]) for item in variables] #+ [','.join([item,diagnostic_loc]) for item in diagnostic_channels] return fig , [','.join([item,dataset_name]) for item in variables] #+ [','.join([item,diagnostic_loc]) for item in diagnostic_channels]
#import os #import os
import json import json
import h5py import h5py
def load_flags(flagFolderPath, dry_run : bool = False): #filePath, instFolder, dry_run : bool = False): def load_flags(flagFolderPath, dry_run : bool = False): #filePath, instFolder, dry_run : bool = False):
""" """
Returns a list of flags (dictionaries) based on the provided filePath and instFolder. Returns a list of flags (dictionaries) based on the provided filePath and instFolder.
Parameters: Parameters:
----------- -----------
filePath (str): The path to the uploaded file, expected to have an .h5 extension. filePath (str): The path to the uploaded file, expected to have an .h5 extension.
instFolder (str): The name of the instrument folder, which must exist as a group in the HDF5 file. instFolder (str): The name of the instrument folder, which must exist as a group in the HDF5 file.
dry_run (bool): If True, performs all operations except loading file contents. dry_run (bool): If True, performs all operations except loading file contents.
Returns: Returns:
-------- --------
list: A list of dictionaries containing flag data (or file paths in dry_run mode), list: A list of dictionaries containing flag data (or file paths in dry_run mode),
or None if conditions are not met. or None if conditions are not met.
""" """
# Return None if the flags folder does not exist # Return None if the flags folder does not exist
if not os.path.exists(flagFolderPath): if not os.path.exists(flagFolderPath):
return None return None
# List files in the flags folder # List files in the flags folder
files = [os.path.join(flagFolderPath, f) for f in os.listdir(flagFolderPath)] files = [os.path.join(flagFolderPath, f) for f in os.listdir(flagFolderPath)]
# If no files found, return None # If no files found, return None
if not files: if not files:
return None return None
# Sort files by creation time # Sort files by creation time
sortedFiles = sorted(files, key=os.path.getctime) sortedFiles = sorted(files, key=os.path.getctime)
if dry_run: if dry_run:
print(f"Dry run: Found {len(sortedFiles)} files in the flags folder:") print(f"Dry run: Found {len(sortedFiles)} files in the flags folder:")
for filePath in sortedFiles: for filePath in sortedFiles:
print(f" - {filePath}") print(f" - {filePath}")
return sortedFiles # Return file paths in dry run mode return sortedFiles # Return file paths in dry run mode
# Process and load JSON files # Process and load JSON files
flagDataList = [] flagDataList = []
for filePath in sortedFiles: for filePath in sortedFiles:
if filePath.endswith('.json'): if filePath.endswith('.json'):
try: try:
with open(filePath, 'r') as file: with open(filePath, 'r') as file:
flagDataList.append(json.load(file)) flagDataList.append(json.load(file))
except (json.JSONDecodeError, FileNotFoundError) as e: except (json.JSONDecodeError, FileNotFoundError) as e:
print(f"Error loading file {filePath}: {e}") print(f"Error loading file {filePath}: {e}")
continue # Skip invalid or missing files continue # Skip invalid or missing files
return flagDataList return flagDataList
class FlaggingAppDataManager(): class FlaggingAppDataManager():
def __init__(self, file_path, mode = 'r+') -> None: def __init__(self, file_path, mode = 'r+') -> None:
self.file_path = file_path self.file_path = file_path
self.mode = mode self.mode = mode
self._data_ops_obj = None self._data_ops_obj = None
self.file_obj = None self.file_obj = None
self.datasets_metadata_df = None self.datasets_metadata_df = None
return None return None
def load_file_obj(self): def load_file_obj(self):
self._data_ops_obj = h5de.HDF5DataOpsManager(self.file_path, self.mode) self._data_ops_obj = h5de.HDF5DataOpsManager(self.file_path, self.mode)
self._data_ops_obj.load_file_obj() self._data_ops_obj.load_file_obj()
self.file_obj = self._data_ops_obj.file_obj self.file_obj = self._data_ops_obj.file_obj
def unload_file_obj(self): def unload_file_obj(self):
self._data_ops_obj = h5de.HDF5DataOpsManager(self.file_path, self.mode) self._data_ops_obj = h5de.HDF5DataOpsManager(self.file_path, self.mode)
self._data_ops_obj.unload_file_obj() # sets __data_ops_obj.file_obj to None self._data_ops_obj.unload_file_obj() # sets __data_ops_obj.file_obj to None
def transfer_flags(self): def transfer_flags(self):
if self.file_obj is None: if self.file_obj is None:
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.") raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.")
path_to_append_dir, ext = os.path.splitext(self.file_path) path_to_append_dir, ext = os.path.splitext(self.file_path)
self._data_ops_obj.update_file(path_to_append_dir) self._data_ops_obj.update_file(path_to_append_dir)
def apply_flags(self,instFolder): def apply_flags(self,instFolder):
# TODO: apply flags to diagnostic and indivial channels. so far is all channels are cleaned # TODO: apply flags to diagnostic and indivial channels. so far is all channels are cleaned
if self.file_obj is None: if self.file_obj is None:
raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.") raise RuntimeError("File object is not loaded. Please load the HDF5 file using the 'load_file_obj' method before attempting to modify it.")
DataOpsManager = self._data_ops_obj DataOpsManager = self._data_ops_obj
file_obj = self.file_obj file_obj = self.file_obj
#with h5py.File(self.file_path, mode = self.mode, track_order=True) as file_obj: #with h5py.File(self.file_path, mode = self.mode, track_order=True) as file_obj:
try: try:
if not instFolder in file_obj: if not instFolder in file_obj:
raise ValueError(f'Invalid instrument name. Instrument folder {instFolder} was not found in file {self.file_path}.') raise ValueError(f'Invalid instrument name. Instrument folder {instFolder} was not found in file {self.file_path}.')
if '_'.join([instFolder,'flags']) not in flag_obj: if '_'.join([instFolder,'flags']) not in flag_obj:
raise RuntimeWarning(f'There is no flags to apply. ') raise RuntimeWarning(f'There is no flags to apply. ')
if not ('diagnostic_channels' in file_obj[instFolder].attrs and 'target_channels' in file_obj[instFolder].attrs): if not ('diagnostic_channels' in file_obj[instFolder].attrs and 'target_channels' in file_obj[instFolder].attrs):
raise ValueError( raise ValueError(
f'Required attributes missing. Instrument folder {instFolder} in file {self.file_path} has to be annotated with ' f'Required attributes missing. Instrument folder {instFolder} in file {self.file_path} has to be annotated with '
'attributes "diagnostic_channels" and "target_channels" that specify channels location and their names.' 'attributes "diagnostic_channels" and "target_channels" that specify channels location and their names.'
) )
dataset_name = file_obj[instFolder].attrs['target_channels']['location'][0].decode() dataset_name = file_obj[instFolder].attrs['target_channels']['location'][0].decode()
channel_names = file_obj[instFolder].attrs['target_channels']['names'][0].decode().split(',') channel_names = file_obj[instFolder].attrs['target_channels']['names'][0].decode().split(',')
dataset_obj = file_obj[dataset_name] dataset_obj = file_obj[dataset_name]
# TODO: maybe we can do this directly on dataset = dataset_obj[...], which is a structured numpy array, instead of wrapping that as dataframe # TODO: maybe we can do this directly on dataset = dataset_obj[...], which is a structured numpy array, instead of wrapping that as dataframe
dataset_df = DataOpsManager.extract_dataset_as_dataframe(dataset_name) dataset_df = DataOpsManager.extract_dataset_as_dataframe(dataset_name)
# Define datetime variable based on channel names. We assume by design the first entry of the list is the datetime variable name. # Define datetime variable based on channel names. We assume by design the first entry of the list is the datetime variable name.
datetime_var = channel_names[0] datetime_var = channel_names[0]
remaining_vars = channel_names.copy() remaining_vars = channel_names.copy()
remaining_vars.remove(datetime_var) remaining_vars.remove(datetime_var)
ref_datetime_format = dataset_obj.attrs.get(datetime_var,None)['datetime_format'][0].decode() ref_datetime_format = dataset_obj.attrs.get(datetime_var,None)['datetime_format'][0].decode()
#datetime_var_data = pd.Series([item.decode() for item in dataset_obj[datetime_var]]) #datetime_var_data = pd.Series([item.decode() for item in dataset_obj[datetime_var]])
#datetime_var_data = pd.to_datetime(datetime_var_data , format = ref_datetime_format, errors = 'coerce') #datetime_var_data = pd.to_datetime(datetime_var_data , format = ref_datetime_format, errors = 'coerce')
dataset_df[datetime_var] = dataset_df[datetime_var].apply(lambda x: x.decode() ) dataset_df[datetime_var] = dataset_df[datetime_var].apply(lambda x: x.decode() )
dataset_df[datetime_var] = pd.to_datetime(dataset_df[datetime_var], format = ref_datetime_format, errors = 'coerce') dataset_df[datetime_var] = pd.to_datetime(dataset_df[datetime_var], format = ref_datetime_format, errors = 'coerce')
flag_indicator = np.zeros(shape = dataset_df[datetime_var].shape, flag_indicator = np.zeros(shape = dataset_df[datetime_var].shape,
dtype = bool) dtype = bool)
# TODO: include this information as part of the flag's attributes in the flag recording process # TODO: include this information as part of the flag's attributes in the flag recording process
flag_datetime_format='%Y-%m-%d %H:%M:%S.%f' flag_datetime_format='%Y-%m-%d %H:%M:%S.%f'
for flag in file_obj[f'{instFolder}_flags']: for flag in file_obj[f'{instFolder}_flags']:
flag_obj = file_obj[f'{instFolder}_flags'][flag]['data_table'] flag_obj = file_obj[f'{instFolder}_flags'][flag]['data_table']
# Replace values indicated by flag NaN if flag label refers to invalidated data. # Replace values indicated by flag NaN if flag label refers to invalidated data.
if not flag_obj['flag_code'][0].decode() is 'None': if not flag_obj['flag_code'][0].decode() is 'None':
flag_label = '' flag_label = ''
else: else:
flag_label = flag_obj['flag_label'][0].decode() flag_label = flag_obj['flag_label'][0].decode()
if flag_label == 'I': if flag_label == 'I':
t1 = pd.to_datetime(flag_obj['startdate'][0].decode(), format=flag_datetime_format) t1 = pd.to_datetime(flag_obj['startdate'][0].decode(), format=flag_datetime_format)
t2 = pd.to_datetime(flag_obj['enddate'][0].decode(), format=flag_datetime_format) t2 = pd.to_datetime(flag_obj['enddate'][0].decode(), format=flag_datetime_format)
t1_idx = abs(dataset_df[datetime_var]-t1).argmin() t1_idx = abs(dataset_df[datetime_var]-t1).argmin()
t2_idx = abs(dataset_df[datetime_var]-t2).argmin() t2_idx = abs(dataset_df[datetime_var]-t2).argmin()
dataset_df.loc[t1_idx:t2_idx,remaining_vars] = np.nan dataset_df.loc[t1_idx:t2_idx,remaining_vars] = np.nan
# Apply the .strftime() method, handling NaT values by filling with an empty string or placeholder # Apply the .strftime() method, handling NaT values by filling with an empty string or placeholder
dataset_df[datetime_var] = dataset_df[datetime_var].apply( dataset_df[datetime_var] = dataset_df[datetime_var].apply(
lambda x: x.strftime(ref_datetime_format).encode('utf-8') if not pd.isnull(x) else b'' # Handle NaT/NaN by returning empty string lambda x: x.strftime(ref_datetime_format).encode('utf-8') if not pd.isnull(x) else b'' # Handle NaT/NaN by returning empty string
) )
# Split full datasetname instFolder/fileName/datatable --> [instFolder, filename, datatable] # Split full datasetname instFolder/fileName/datatable --> [instFolder, filename, datatable]
dataset_name_parts = dataset_name.split('/') dataset_name_parts = dataset_name.split('/')
# Create new instFolder name to store dataset after applying flags # Create new instFolder name to store dataset after applying flags
newInstFolder = '_'.join([dataset_name_parts[0],'cleaned']) newInstFolder = '_'.join([dataset_name_parts[0],'cleaned'])
dataset_name_parts.remove(dataset_name_parts[0]) dataset_name_parts.remove(dataset_name_parts[0])
# Put together relative datasetname. Note that instFolder is now missing. # Put together relative datasetname. Note that instFolder is now missing.
flagged_dataset_name = '/'.join(dataset_name_parts) flagged_dataset_name = '/'.join(dataset_name_parts)
dataset_dict = {'attributes':{}, dataset_dict = {'attributes':{},
'name':flagged_dataset_name, 'name':flagged_dataset_name,
'data': utils.convert_dataframe_to_np_structured_array(dataset_df)} 'data': utils.convert_dataframe_to_np_structured_array(dataset_df)}
dataset_dict['attributes'].update({'creation_date':utils.created_at().encode('utf-8')}) dataset_dict['attributes'].update({'creation_date':utils.created_at().encode('utf-8')})
dataset_dict['attributes'].update(dataset_obj.attrs) dataset_dict['attributes'].update(dataset_obj.attrs)
DataOpsManager.append_dataset(dataset_dict, newInstFolder) DataOpsManager.append_dataset(dataset_dict, newInstFolder)
except Exception as e: except Exception as e:
self._data_ops_obj.unload_file_obj() self._data_ops_obj.unload_file_obj()
print(f"An unexpected error occurred: {e}" print(f"An unexpected error occurred: {e}"
"The file object has been properly closed.") "The file object has been properly closed.")
#flag_indicator[t1_idx:t2_idx] = True #flag_indicator[t1_idx:t2_idx] = True
#(datetime_var_data-t1).min() #(datetime_var_data-t1).min()
#if not instrument_name in file_obj and not flag_name in file_obj: #if not instrument_name in file_obj and not flag_name in file_obj:
# raise ValueError(f'Invalid instrument_name {instrument_name} and flag_name {flag_name}. No object with such names in file {self.file_path}') # raise ValueError(f'Invalid instrument_name {instrument_name} and flag_name {flag_name}. No object with such names in file {self.file_path}')
#if not f'{instrument_name}_flags': #if not f'{instrument_name}_flags':
# raise ValueError(f'There is no flags to work with. Make sure {instrument_name}_flags is created first before running this method.') # raise ValueError(f'There is no flags to work with. Make sure {instrument_name}_flags is created first before running this method.')