diff --git a/pipelines/data_integration.py b/pipelines/data_integration.py
index aaf49a4..46a7e36 100644
--- a/pipelines/data_integration.py
+++ b/pipelines/data_integration.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import re
 
 try:
     thisFilePath = os.path.abspath(__file__)
@@ -66,6 +67,28 @@ def load_config_and_setup_logging(yaml_config_file_path, log_dir):
     missing_keys = [key for key in required_keys if key not in config_dict]
     if missing_keys:
         raise KeyError(f"Missing required keys in YAML configuration: {missing_keys}")
+
+    # instrument_datafolder must be a list with at least one item; reject non-list values as well as empty lists.
+    if not isinstance(config_dict['instrument_datafolder'], list) or not config_dict['instrument_datafolder']:
+        raise ValueError('Invalid value for key "instrument_datafolder". Expected a list of strings with at least one item. '
+                         'Each item represents a subfolder name in the input file directory, where the name '
+                         'must match the format "[/]". '
+                         'The first subfolder name is required, and the second is optional. '
+                         'Examples of valid values: "level1", "level1/level2".')
+
+    # Pattern for valid subfolder names: `subfolder` or `subfolder/subfolder` (per-item check below is disabled).
+    #valid_pattern = re.compile(r'^[^/]+(/[^/]+)?$')
+
+    # Validate each subfolder name (disabled; when enabled it must read the "instrument_datafolder" key)
+    #for folder in config_dict['instrument_datafolder']:
+    #    if not isinstance(folder, str) or not valid_pattern.match(folder):
+    #        raise ValueError(
+    #            'Invalid value for key "instrument_datafolder" in YAML file. '
+    #            'Each item must be a string matching the format '
+    #            '"[/]". The first subfolder name is required, and the second is optional. '
+    #            'Examples of valid values: "level1", "level1/level2". '
+    #            f'Invalid item: {folder}'
+    #        )
 
     # Validate integration_mode
     integration_mode = config_dict.get('integration_mode', 'N/A')  # Default to 'collection'