import os
import logging
from datetime import datetime

import yaml

import src.hdf5_lib as hdf5_lib
import src.g5505_utils as utils
def integrate_data_sources(yaml_config_file_path, log_dir='logs/'):
""" Integrates data sources specified by the input configuration file into HDF5 files.
Parameters:
yaml_config_file_path (str): Path to the YAML configuration file.
log_dir (str): Directory to save the log file.
Returns:
str: Path (or list of Paths) to the created HDF5 file(s).
"""
    # Timestamped log file so repeated runs do not overwrite each other.
    date = utils.created_at()
    utils.setup_logging(log_dir, f"integrate_data_sources_{date}.log")

    with open(yaml_config_file_path, 'r') as stream:
        try:
            config_dict = yaml.load(stream, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            logging.error("Error loading YAML file: %s", exc)
            raise

    def output_filename(name, date, initials):
        return f"{name}_{date}_{initials}.h5"
    exp_campaign_name = config_dict['experiment']
    initials = config_dict['contact']
    input_file_dir = config_dict['input_file_directory']
    output_dir = config_dict['output_file_directory']
    select_dir_keywords = config_dict['instrument_datafolder']

    # Metadata intended for the root level of the created HDF5 file(s).
    root_metadata_dict = {
        'project': config_dict['project'],
        'experiment': config_dict['experiment'],
        'contact': config_dict['contact'],
        'actris_level': config_dict['actris_level']
    }
    def create_hdf5_file(date_str, select_file_keywords, root_metadata):
        filename = output_filename(exp_campaign_name, date_str, initials)
        output_path = os.path.join(output_dir, filename)
        logging.info("Creating HDF5 file at: %s", output_path)
        return hdf5_lib.create_hdf5_file_from_filesystem_path(
            output_path, input_file_dir, select_dir_keywords, select_file_keywords,
            root_metadata_dict=root_metadata
        )
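
    # Note: create_hdf5_file_from_filesystem_path is assumed (from the way it is
    # called above) to scan input_file_dir, keep instrument folders matching
    # select_dir_keywords and files matching select_file_keywords, and write
    # their contents into output_path with root_metadata as root-level attributes.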
    # When explicit datetime steps are given, derive the filename spellings of
    # each date that are later used as file-selection keywords.
    if config_dict.get('datetime_steps'):
        datetime_augment_dict = {}
        for datetime_step in config_dict['datetime_steps']:
            tmp = datetime.strptime(datetime_step, '%Y-%m-%d %H-%M-%S')
            datetime_augment_dict[tmp] = [tmp.strftime('%Y-%m-%d'), tmp.strftime('%Y_%m_%d'),
                                          tmp.strftime('%Y.%m.%d'), tmp.strftime('%Y%m%d')]
            logging.debug("Parsed datetime step: %s", tmp)
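
        # At this point datetime_augment_dict maps each parsed datetime to its
        # four filename spellings, e.g.
        # datetime(2022, 3, 1, 0, 0) -> ['2022-03-01', '2022_03_01', '2022.03.01', '20220301'].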
        # 'single_experiment': one HDF5 file per datetime step.
        if 'single_experiment' in config_dict['integration_mode']:
            output_filename_path = []
            for datetime_step in datetime_augment_dict.keys():
                date_str = datetime_step.strftime('%Y-%m-%d')
                select_file_keywords = datetime_augment_dict[datetime_step]
                root_metadata_dict.update({'dataset_startdate': date_str,
                                           'dataset_enddate': date_str})
                dt_step_output_filename_path = create_hdf5_file(date_str, select_file_keywords, root_metadata_dict)
                output_filename_path.append(dt_step_output_filename_path)
        # 'collection': a single HDF5 file covering all datetime steps.
        elif 'collection' in config_dict['integration_mode']:
            select_file_keywords = []
            for datetime_step in datetime_augment_dict.keys():
                select_file_keywords = select_file_keywords + datetime_augment_dict[datetime_step]

            startdate = min(datetime_augment_dict.keys()).strftime('%Y-%m-%d')
            enddate = max(datetime_augment_dict.keys()).strftime('%Y-%m-%d')
            root_metadata_dict.update({'dataset_startdate': startdate,
                                       'dataset_enddate': enddate})
            date_str = f'{startdate}_{enddate}'
            output_filename_path = create_hdf5_file(date_str, select_file_keywords, root_metadata_dict)
    else:
        # No datetime steps: take the dataset date range directly from the config.
        startdate = config_dict['dataset_startdate']
        enddate = config_dict['dataset_enddate']
        root_metadata_dict.update({'dataset_startdate': startdate,
                                   'dataset_enddate': enddate})
        date_str = f'{startdate}_{enddate}'
        output_filename_path = create_hdf5_file(date_str, select_file_keywords=[], root_metadata=root_metadata_dict)

    return output_filename_path
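

if __name__ == '__main__':
    # Minimal usage sketch; the config path below is hypothetical and must point
    # at a file following the layout sketched near the top of this module.
    created_paths = integrate_data_sources('input_files/integration_config.yaml')
    print(f"Created HDF5 file(s): {created_paths}")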