#!/usr/bin/env python3

"""
@package pmsco.pmsco
PSI Multiple-Scattering Calculation and Structural Optimization

This is the top-level interface of the PMSCO package.
All calculations (any mode, any project) start by calling the run_project function of this module.
The module also provides a command line, a run-file, and a run-dict interface.
They all, in one way or another, set up an instance of a Project class and call the run_project function.

For parallel execution, prefix the command line with mpi_exec -np NN, where NN is the number of processes to use.
Note that in parallel mode, one process takes the role of the coordinator (master).
The master does not run calculations and is idle most of the time.
To benefit from parallel execution on a work station, NN should be the number of processors.
On a cluster, the number of processes should be chosen according to the available resources.

All calculations can also be run in a single process.
PMSCO serializes the calculations automatically.

The code of the main module is independent of a particular calculation project.
All project-specific code must be in a separate python module.
The project module must implement a class derived from pmsco.project.Project.
The project module and class must be referenced in the run-file, or passed to the suitable run-function.

While they are not strictly necessary, run-files help to separate code and data.
Code is usually version-controlled, run-files contain metadata of calculations and should be kept with the results.
A git hash can be used to refer to the code used to execute the calculation.

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

@copyright (c) 2015-23 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
"""

import argparse
from collections.abc import Mapping
import importlib
import importlib.util
import json
import jsonschema
import logging
import os
from pathlib import Path
import sys
import typing

try:
    from mpi4py import MPI
    mpi_comm = MPI.COMM_WORLD
    mpi_size = mpi_comm.Get_size()
    mpi_rank = mpi_comm.Get_rank()
except ImportError:
    MPI = None
    mpi_comm = None
    mpi_size = 1
    mpi_rank = 0

pmsco_root = Path(__file__).resolve().parent.parent
if str(pmsco_root) not in sys.path:
    sys.path.insert(0, str(pmsco_root))

from pmsco.database.git import get_git_hash
import pmsco.dispatch as dispatch
from pmsco.project import Project

# the module-level logger
logger = logging.getLogger(__name__)


def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
    """
    configure the root logger. direct the logs either to a file or the null handler.

    this function must be called before the first logging command
    whether a log output is requested or not.
    to disable logging, call this function with enable=False (default).

    modules should create their own loggers, by calling
    @code logger = logging.getLogger(__name__) @endcode
    at the top of the module code.
    that logger is then used by calls like
    @code logger.debug(message) @endcode.

    @param enable: (bool) True=enable logging to the specified file,
        False=do not generate a log (null handler).
    @param filename: (Path-like) path and name of the log file.
        if this process is part of an MPI communicator,
        the function inserts a dot and the MPI rank of this process before the extension.
        if the filename is empty (or None), logging is disabled.
    @param level: (string) name of the log level.
        must be the name of one of "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL".
        if empty (or None), logging is disabled.
        if not a valid level, defaults to "WARNING".
    @return None
    """
    # treat None like an empty level name instead of failing on .upper()
    level = level or ""

    # an empty Path stringifies to ".", so test the name component rather than str(filename)
    has_filename = filename is not None and Path(filename).name != ""
    enable = bool(enable and has_filename and level)

    numeric_level = getattr(logging, level.upper(), logging.WARNING)
    if not isinstance(numeric_level, int):
        # the name matched some other attribute of the logging module - not a level
        numeric_level = logging.WARNING

    root_logger = logging.getLogger()
    root_logger.setLevel(numeric_level)
    # matplotlib is very chatty below WARNING
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    if enable:
        if mpi_size > 1:
            # one log file per MPI process: name.RANK.ext
            p = Path(filename)
            filename = p.with_suffix(f".{mpi_rank}" + p.suffix)
        log_format = '%(asctime)s (%(name)s) %(levelname)s: %(message)s'
        formatter = logging.Formatter(log_format)
        # delay=True: the file is not created before the first record is emitted
        handler = logging.FileHandler(filename, mode="w", delay=True)
        handler.setLevel(numeric_level)
        handler.setFormatter(formatter)
    else:
        handler = logging.NullHandler()

    root_logger.addHandler(handler)


def run_project(project):
    """
    run a calculation project.

    the function sets up logging, validates the project, chooses the handler classes,
    and passes control to the pmsco.dispatch module to run the calculations.

    @param project: fully initialized project object.
        the validate method is called as part of this function after setting up the logger.
    @return: None
    @raise any exception raised by the calculations is logged and re-raised.
    """
    log_file = Path(project.log_file)
    if not log_file.name:
        # fall back to a log file named after the job
        log_file = Path(project.job_name).with_suffix(".log")
    if log_file.name:
        # parents=True so that a nested log directory does not abort the job
        log_file.parent.mkdir(parents=True, exist_ok=True)
        log_level = project.log_level
    else:
        log_level = ""

    setup_logging(enable=bool(log_level), filename=log_file, level=log_level)
    if mpi_rank == 0:
        project.log_project_args()

    if not project.git_hash:
        project.git_hash = get_git_hash()

    project.validate()

    if project:
        logger.info("starting calculations")
        try:
            dispatch.run_calculations(project)
        except (SystemExit, KeyboardInterrupt):
            raise
        except Exception:
            logger.exception("unhandled exception during calculations.")
            raise
        else:
            logger.info("calculations complete")
    else:
        logger.error("undefined project, optimizer, or calculator.")


def schedule_project(project, run_dict):
    """
    schedule a calculation project.

    the function validates the project and submits a job to the scheduler.
    placeholders in run-file's directories dict are resolved.

    @param project: fully initialized project object.
        the validate method is called as part of this function.
    @param run_dict: dictionary holding the contents of the run file.
        the dictionary is updated in place and handed to the schedule object.
    @return: None
    """
    assert mpi_rank == 0
    setup_logging(enable=False)

    project.validate()

    # replace the directories of the run-file by the values held by the validated project
    try:
        dirs = run_dict['project']['directories']
        for k in dirs:
            dirs[k] = str(project.directories[k])
    except KeyError:
        pass

    # pin the code version in the submitted run-file
    if project.git_hash:
        run_dict['project']['git_hash'] = project.git_hash
    elif hsh := get_git_hash():
        run_dict['project']['git_hash'] = hsh
    if project.db_file:
        run_dict['project']['db_file'] = str(project.db_file)
    if sf := project.optimizer_params['seed_file']:
        run_dict['project']['optimizer_params']['seed_file'] = str(sf)

    # instantiate the schedule class named in the run-file and submit the job
    schedule_dict = run_dict['schedule']
    module = _load_module(schedule_dict['__module__'])
    schedule_class = getattr(module, schedule_dict['__class__'])
    schedule = schedule_class(project)
    schedule.set_properties(vars(module), schedule_dict, project)
    schedule.run_dict = run_dict
    schedule.validate()
    schedule.submit()


def _load_runfile(runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO]) -> typing.Mapping:
    """
    Load a runfile

    The function loads a runfile from a dictionary, an open json file object, or a json file specified by a file path.
    If the source is a file, the directory is added to the project directories under the `run` key.
    The contents are validated against the runfile schema in the `schema` directory next to this module.

    @param runfile: Dictionary with contents of a runfile, an open file object, or a path-like.

    @return: Dictionary with the contents of the runfile.

    @raise jsonschema.ValidationError if the contents do not conform to the schema.
    """

    def set_run_dir(contents, fileobj):
        # record the directory of the run-file; skipped if the file object has no usable name
        try:
            p = Path(fileobj.name).parent.resolve(True)
            contents['project']['directories']['run'] = p
        except (AttributeError, FileNotFoundError):
            pass

    if isinstance(runfile, Mapping):
        rf = runfile
    elif hasattr(runfile, 'read'):
        rf = json.load(runfile)
        set_run_dir(rf, runfile)
    else:
        # JSON text is UTF-8 - do not depend on the locale encoding
        with open(runfile, 'r', encoding="utf-8") as f:
            rf = json.load(f)
            set_run_dir(rf, f)

    schema_dir = Path(__file__).parent / "schema"
    schema_file = schema_dir / "runfile.schema.json"
    # base URL for resolving references to other schema files in the same directory
    schema_url = f"file://{schema_dir}/"
    with open(schema_file, encoding="utf-8") as f:
        schema = json.load(f)

    resolver = jsonschema.RefResolver(schema_url, None)
    jsonschema.validate(rf, schema, resolver=resolver)

    return rf


def _load_module(name_or_path: typing.Union[str, bytes, os.PathLike]):
    """
    Load a Python module

    The argument is first tried as a module name on the Python module search path.
    If that import fails, it is interpreted as the path of a source file.
    In that case, the module is registered in sys.modules under the stem of the file name.

    @param name_or_path: Module name or file path of the module.
        If a module name is given, the module must be in the Python module search path.

    @return: module

    @raise ValueError if the module is not found
    """
    # import_module requires a str. accept bytes and path-likes as the signature promises.
    name = os.fsdecode(name_or_path)

    # a leading dot (./module.py, ../module.py) would be taken for a relative import
    # and raise TypeError - such names can only be file paths.
    if not name.startswith('.'):
        try:
            return importlib.import_module(name)
        except ImportError:
            pass

    module_name = Path(name).stem
    spec = importlib.util.spec_from_file_location(module_name, name)
    if spec is None or spec.loader is None:
        msg = f"Can't find module {name}"
        print(msg, file=sys.stderr)
        print("sys.path:", sys.path, file=sys.stderr)
        raise ValueError(msg)

    module = importlib.util.module_from_spec(spec)
    # register before executing so that the module can be found during its own import
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module


def main_project(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None,
                 project: typing.Optional[Project] = None,
                 project_module: typing.Optional[typing.Union[str, os.PathLike]] = None,
                 project_class: typing.Optional[typing.Union[str, typing.Type[Project]]] = None,
                 runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO] = None):
    """
    Main function with optional arguments.

    This function starts the whole process based on function arguments.
    The arguments can be an existing project instance, a project class, and/or a runfile.

    The function carries out the following steps:

    1. Load a runfile - if specified.
    2. Create a project object.
    3. Apply the runfile to the project.
    4. Run or schedule the project.

    The project instance is produced from the first match of the following conditions:

    1. `project` argument is a Project instance.
    2. `project_class` is a Project class.
    3. `__class__` entry from runfile.
       The class must be listed in symbols,
       or the runfile must also contain a `__module__` entry
       with the name or file path of the project module that declares the class.

    The project is scheduled rather than executed if the corresponding section in the runfile is present.

    @param symbols: Namespace of the project module, which contains project, cluster and calculator classes.
        This is the basis for class resolution from runfiles.
        If called by the project module, it should pass vars().
    @param project: project instance.
    @param project_class: project class or name of a project class defined in `symbols`.
    @param project_module: name or file path of the project module.
        This is required if symbols is not defined
        and the project class is given as a string (project_class argument or runfile value).
    @param runfile: A file-like, path-like or dict with runfile contents.
        Runfiles must be in json-format.

    @return: None
    """
    if runfile is not None:
        rf = _load_runfile(runfile)
        rfp = rf['project']
    else:
        rf = None
        rfp = None

    # remains None unless the project module is loaded here
    module = None

    if project is None:
        # project_class may be a class name: issubclass would raise TypeError on a string
        is_project_class = isinstance(project_class, type) and issubclass(project_class, Project)
        if not is_project_class:
            project_classname = project_class
            if not project_classname:
                project_classname = rfp['__class__']

            if not symbols:
                if project_module:
                    module = _load_module(project_module)
                else:
                    module = _load_module(rfp['__module__'])
                symbols = vars(module)

            project_class = symbols[project_classname]

        project = project_class()

    project.directories['pmsco'] = Path(__file__).parent
    # the project directory is known only if the module was loaded from a file here
    module_file = getattr(module, '__file__', None)
    if module_file:
        project.directories['project'] = Path(module_file).parent

    if rfp:
        project.set_properties(symbols, rfp, project)

    # TypeError: no runfile (rf is None) - run directly
    try:
        schedule_enabled = rf['schedule']['enabled']
    except (KeyError, TypeError):
        schedule_enabled = False

    if schedule_enabled:
        schedule_project(project, rf)
    else:
        run_project(project)


def get_cli_parser():
    """
    create the argument parser of the command line interface

    @return: argparse.ArgumentParser object that declares
        the run-file, module, project-class, output-dir and job-name options.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
        PSI multiple-scattering calculations and optimization (PMSCO)

        This is the main command line entry point for PMSCO calculation jobs.
        Alternative entry points can be provided by project modules.
        The command line requires at least a run-file to define the project parameters.

        The command can run a calculation job directly or submit it to a job queue
        via the `schedule` section in the run-file.
        The program detects whether it runs in a single-process or OpenMPI multi-process environment
        and coordinates parallel processes automatically.

        All arguments should preferably be declared in the run-file.
        A small number of options can be passed on the command line
        to override the corresponding parameter of the run-file.

        Please see the documentation that is compiled in docs/html/index.html
        for instructions how to set up a project module and run-files.
        See also the projects folder for examples.
        """)

    parser.add_argument('-r', '--run-file',
                        help="Path to a run-file in JSON format which contains all calculation parameters. "
                             "This argument is mandatory. "
                        )
    parser.add_argument('-m', '--module',
                        help="File name of the custom project module. "
                             "The module must declare the project class and other project-specific classes. "
                             "This optional argument overrides the __module__ entry of the run-file. "
                        )
    parser.add_argument('-c', '--project-class',
                        help="Project class. Requires --module to be specified. "
                             "The project class is resolved in the namespace of the module. "
                             "This optional argument corresponds to the __class__ entry of the run-file. "
                        )
    parser.add_argument('-o', '--output-dir',
                        help="Output directory. "
                             "This optional argument overrides the directories['output'] entry of the run-file."
                        )
    parser.add_argument('-j', '--job-name',
                        help="Job name. Should be short and valid as a part of directory and file names. "
                             "If a persistent database is used, it must not exist in the database yet. "
                             "This optional argument overrides the job_name of the run-file."
                        )

    return parser


def parse_cli():
    """
    parse the command line interface

    @return: tuple (args, unknown_args).
        args is the Namespace object created by the argument parser,
        unknown_args is the list of command line arguments which the parser does not recognize.
    """
    parser = get_cli_parser()
    args, unknown_args = parser.parse_known_args()

    return args, unknown_args


def main(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None):
    """
    Main function with command line parsing

    This function starts the whole process with parameters from the command line.

    If the command line contains a run-file parameter, it determines the project class and the project parameters.
    Without a run-file, the project is set up from the remaining command line options only.

    The project class can be specified either in the run-file, on the command line or the function arguments.
    If the run-file specifies a class name, that class is instantiated.

    @param symbols: Namespace of the project module, passed on to main_project.

    @return: None
    """
    args, unknown_args = parse_cli()

    # argparse sets absent options to None, so test the value rather than catching AttributeError
    run_file = getattr(args, 'run_file', None)
    if run_file:
        rf = _load_runfile(run_file)
    else:
        rf = {'project': {}}

    rfp = rf['project']

    # command line options override the corresponding entries of the run-file
    module = getattr(args, 'module', None)
    if module:
        rfp['__module__'] = module

    project_class = getattr(args, 'project_class', None)
    if project_class:
        rfp['__class__'] = project_class

    output_dir = getattr(args, 'output_dir', None)
    if output_dir:
        # create the directories section if necessary rather than dropping the option silently
        rfp.setdefault('directories', {})['output'] = output_dir

    job_name = getattr(args, 'job_name', None)
    if job_name:
        rfp['job_name'] = job_name

    main_project(symbols=symbols, runfile=rf)


if __name__ == '__main__':
    main()
    sys.exit(0)