Source code for src.data_integration_lib

import logging
import os
from datetime import datetime

import yaml

import src.hdf5_lib as hdf5_lib
import src.g5505_utils as utils



def integrate_data_sources(yaml_config_file_path, log_dir='logs/'):
    """
    Integrates data sources specified by the input configuration file into HDF5 files.

    Parameters:
        yaml_config_file_path (str): Path to the YAML configuration file.
        log_dir (str): Directory to save the log file.

    Returns:
        str: Path (or list of paths) to the created HDF5 file(s).
    """

    date = utils.created_at()
    utils.setup_logging(log_dir, f"integrate_data_sources_{date}.log")

    with open(yaml_config_file_path, 'r') as stream:
        try:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            logging.error("Error loading YAML file: %s", exc)
            raise

    def output_filename(name, date, initials):
        return f"{name}_{date}_{initials}.h5"

    exp_campaign_name = config_dict['experiment']
    initials = config_dict['contact']

    input_file_dir = config_dict['input_file_directory']
    output_dir = config_dict['output_file_directory']
    select_dir_keywords = config_dict['instrument_datafolder']

    root_metadata_dict = {
        'project': config_dict['project'],
        'experiment': config_dict['experiment'],
        'contact': config_dict['contact'],
        'actris_level': config_dict['actris_level']
    }

    def create_hdf5_file(date_str, select_file_keywords, root_metadata):
        filename = output_filename(exp_campaign_name, date_str, initials)
        output_path = os.path.join(output_dir, filename)
        logging.info("Creating HDF5 file at: %s", output_path)

        return hdf5_lib.create_hdf5_file_from_filesystem_path(
            output_path, input_file_dir, select_dir_keywords,
            select_file_keywords, root_metadata_dict=root_metadata
        )

    if config_dict.get('datetime_steps'):
        # Map each datetime step to the date-string variants used to match
        # file names (e.g. 2023-01-31, 2023_01_31, 2023.01.31, 20230131).
        datetime_augment_dict = {}
        for datetime_step in config_dict['datetime_steps']:
            tmp = datetime.strptime(datetime_step, '%Y-%m-%d %H-%M-%S')
            datetime_augment_dict[tmp] = [
                tmp.strftime('%Y-%m-%d'), tmp.strftime('%Y_%m_%d'),
                tmp.strftime('%Y.%m.%d'), tmp.strftime('%Y%m%d')
            ]
            logging.debug("Parsed datetime step: %s", tmp)

        if 'single_experiment' in config_dict['integration_mode']:
            # One HDF5 file per datetime step.
            output_filename_path = []
            for datetime_step in datetime_augment_dict.keys():
                date_str = datetime_step.strftime('%Y-%m-%d')
                select_file_keywords = datetime_augment_dict[datetime_step]
                root_metadata_dict.update({'dataset_startdate': date_str,
                                           'dataset_enddate': date_str})
                dt_step_output_filename_path = create_hdf5_file(date_str, select_file_keywords, root_metadata_dict)
                output_filename_path.append(dt_step_output_filename_path)

        elif 'collection' in config_dict['integration_mode']:
            # One HDF5 file covering all datetime steps.
            select_file_keywords = []
            for datetime_step in datetime_augment_dict.keys():
                select_file_keywords = select_file_keywords + datetime_augment_dict[datetime_step]

            config_dict['dataset_startdate'] = min(datetime_augment_dict.keys())
            config_dict['dataset_enddate'] = max(datetime_augment_dict.keys())

            startdate = config_dict['dataset_startdate'].strftime('%Y-%m-%d')
            enddate = config_dict['dataset_enddate'].strftime('%Y-%m-%d')
            root_metadata_dict.update({'dataset_startdate': startdate,
                                       'dataset_enddate': enddate})

            date_str = f'{startdate}_{enddate}'
            output_filename_path = create_hdf5_file(date_str, select_file_keywords, root_metadata_dict)
    else:
        # No datetime steps: integrate everything between the configured dates.
        startdate = config_dict['dataset_startdate']
        enddate = config_dict['dataset_enddate']
        root_metadata_dict.update({'dataset_startdate': startdate,
                                   'dataset_enddate': enddate})

        date_str = f'{startdate}_{enddate}'
        output_filename_path = create_hdf5_file(date_str, select_file_keywords=[], root_metadata=root_metadata_dict)

    return output_filename_path
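

# --- Usage sketch ------------------------------------------------------------
# A minimal, hypothetical example of driving integrate_data_sources(). The key
# names mirror the ones read by the function above; the paths, project name,
# initials, and instrument folder names are placeholder assumptions, not
# values shipped with this library.
if __name__ == '__main__':
    example_config = {
        'project': 'my_project',                    # hypothetical project name
        'experiment': 'smog_chamber_study',         # hypothetical; used in the output filename
        'contact': 'ABC',                           # initials; appended to the output filename
        'actris_level': 0,
        'input_file_directory': '/data/raw',        # hypothetical input tree
        'output_file_directory': '/data/hdf5',      # hypothetical output directory
        'instrument_datafolder': ['gas', 'smps'],   # hypothetical instrument subfolders
        'integration_mode': 'collection',           # or 'single_experiment'
        # Must match the '%Y-%m-%d %H-%M-%S' format parsed above.
        'datetime_steps': ['2023-01-31 12-00-00', '2023-02-01 12-00-00'],
    }

    # Write the config to disk, since the function takes a YAML file path.
    with open('example_config.yaml', 'w') as f:
        yaml.dump(example_config, f)

    # In 'collection' mode this returns a single path; in 'single_experiment'
    # mode it returns a list with one path per datetime step.
    hdf5_path = integrate_data_sources('example_config.yaml')
    print(hdf5_path)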