public release 4.2.0 - see README.md and CHANGES.md for details

2026-01-08 19:10:45 +01:00
parent ef781e2db4
commit b64beb694c
181 changed files with 39388 additions and 6527 deletions

pmsco/pmsco.py Executable file → Normal file

@@ -1,31 +1,35 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
@package pmsco.pmsco
PEARL Multiple-Scattering Calculation and Structural Optimization
PSI Multiple-Scattering Calculation and Structural Optimization
this is the top-level interface of the PMSCO package.
all calculations (any mode, any project) start by calling the run_project() function of this module.
the module also provides a command line and a run-file/run-dict interface.
This is the top-level interface of the PMSCO package.
All calculations (any mode, any project) start by calling the run_project function of this module.
The module also provides a command line, a run-file, and a run-dict interface.
They all, in one way or another, set up an instance of a Project class and call the run_project function.
for parallel execution, prefix the command line with mpi_exec -np NN, where NN is the number of processes to use.
note that in parallel mode, one process takes the role of the coordinator (master).
the master does not run calculations and is idle most of the time.
to benefit from parallel execution on a work station, NN should be the number of processors.
on a cluster, the number of processes is chosen according to the available resources.
For parallel execution, prefix the command line with mpi_exec -np NN, where NN is the number of processes to use.
Note that in parallel mode, one process takes the role of the coordinator (master).
The master does not run calculations and is idle most of the time.
To benefit from parallel execution on a workstation, NN should be the number of processors.
On a cluster, the number of processes should be chosen according to the available resources.
all calculations can also be run in a single process.
All calculations can also be run in a single process.
PMSCO serializes the calculations automatically.
the code of the main module is independent of a particular calculation project.
all project-specific code must be in a separate python module.
the project module must implement a class derived from pmsco.project.Project,
and call run_project() with an instance of the project class.
refer to the projects folder for examples.
The code of the main module is independent of a particular calculation project.
All project-specific code must be in a separate python module.
The project module must implement a class derived from pmsco.project.Project.
The project module and class must be referenced in the run-file, or passed to a suitable run function.
While run-files are not strictly necessary, they help to separate code and data.
Code is usually version-controlled, whereas run-files contain the metadata of a calculation and should be kept with the results.
A git hash can be used to refer to the code used to execute the calculation.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-21 by Paul Scherrer Institut @n
@copyright (c) 2015-23 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@@ -33,12 +37,16 @@ Licensed under the Apache License, Version 2.0 (the "License"); @n
"""
import argparse
from builtins import range
import logging
from collections.abc import Mapping
import importlib
import commentjson as json
import importlib.util
import json
import jsonschema
import logging
import os
from pathlib import Path
import sys
import typing
try:
from mpi4py import MPI
@@ -55,10 +63,9 @@ pmsco_root = Path(__file__).resolve().parent.parent
if str(pmsco_root) not in sys.path:
sys.path.insert(0, str(pmsco_root))
from pmsco.database.git import get_git_hash
import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
from pmsco.optimizers import genetic, swarm, grid, table
from pmsco.project import Project
# the module-level logger
logger = logging.getLogger(__name__)
@@ -94,6 +101,7 @@ def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
numeric_level = getattr(logging, level.upper(), logging.WARNING)
root_logger = logging.getLogger()
root_logger.setLevel(numeric_level)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
if enable:
if mpi_size > 1:
@@ -112,49 +120,6 @@ def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
root_logger.addHandler(handler)
def set_common_args(project, args):
"""
set common project arguments from parsed command line.
this function translates and distributes the common arguments from the command line parser
to the respective destinations.
as of this writing, there are two destinations: the global logger and the project instance.
note that run_project() is called with the project instance as the only argument.
all project-related arguments from the command line must therefore be copied to the project object.
@param args: a namespace object containing the necessary parameters.
this can be an instance of Args, or the return value of parse_cli(),
or any object which has the same attributes as the Args class.
@return: None
"""
if args.data_dir:
project.data_dir = args.data_dir
if args.output_file:
project.output_file = args.output_file
if args.db_file:
project.db_file = args.db_file
if args.log_file:
project.log_file = args.log_file
if args.log_level:
project.log_level = args.log_level
if not args.log_enable:
project.log_file = ""
project.log_level = ""
if args.mode:
project.mode = args.mode.lower()
if args.time_limit:
project.time_limit = args.time_limit
if args.keep_files:
project.keep_files = args.keep_files
if args.keep_levels:
project.keep_levels = max(args.keep_levels, project.keep_levels)
if args.keep_best:
project.keep_best = max(args.keep_best, project.keep_best)
def run_project(project):
"""
run a calculation project.
@@ -179,36 +144,18 @@ def run_project(project):
if mpi_rank == 0:
project.log_project_args()
if not project.git_hash:
project.git_hash = get_git_hash()
project.validate()
optimizer_class = None
if project.mode == 'single':
optimizer_class = handlers.SingleModelHandler
elif project.mode == 'grid':
optimizer_class = grid.GridSearchHandler
elif project.mode == 'swarm':
optimizer_class = swarm.ParticleSwarmHandler
elif project.mode == 'genetic':
optimizer_class = genetic.GeneticOptimizationHandler
elif project.mode == 'gradient':
logger.error("gradient search not implemented")
# TODO: implement gradient search
# optimizer_class = gradient.GradientSearchHandler
elif project.mode == 'table':
optimizer_class = table.TableModelHandler
else:
logger.error("invalid optimization mode '%s'.", project.mode)
project.handler_classes['model'] = optimizer_class
project.handler_classes['region'] = handlers.choose_region_handler_class(project)
if project and optimizer_class:
if project:
logger.info("starting calculations")
try:
dispatch.run_calculations(project)
except (SystemExit, KeyboardInterrupt):
raise
except Exception as __:
except Exception:
logger.exception("unhandled exception during calculations.")
raise
else:
@@ -223,6 +170,8 @@ def schedule_project(project, run_dict):
the function validates the project and submits a job to the scheduler.
placeholders in run-file's directories dict are resolved.
@param project: fully initialized project object.
the validate method is called as part of this function.
@@ -234,117 +183,234 @@ def schedule_project(project, run_dict):
setup_logging(enable=False)
project.validate()
try:
dirs = run_dict['project']['directories']
for k in dirs:
dirs[k] = str(project.directories[k])
except KeyError:
pass
if project.git_hash:
run_dict['project']['git_hash'] = project.git_hash
elif hsh := get_git_hash():
run_dict['project']['git_hash'] = hsh
if project.db_file:
run_dict['project']['db_file'] = str(project.db_file)
if sf := project.optimizer_params['seed_file']:
run_dict['project']['optimizer_params']['seed_file'] = str(sf)
schedule_dict = run_dict['schedule']
module = importlib.import_module(schedule_dict['__module__'])
module = _load_module(schedule_dict['__module__'])
schedule_class = getattr(module, schedule_dict['__class__'])
schedule = schedule_class(project)
schedule.set_properties(module, schedule_dict, project)
schedule.set_properties(vars(module), schedule_dict, project)
schedule.run_dict = run_dict
schedule.validate()
schedule.submit()
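As a point of reference, only the __module__, __class__ and enabled keys of the schedule section are read directly in this module; any further keys are handed to the schedule class through set_properties. A hedged sketch, with placeholder module and class names:
# hypothetical schedule section of a run-dict; the class named here decides
# which additional keys (queue, wall time, etc.) it understands
schedule_section = {
    "enabled": True,
    "__module__": "pmsco.schedule",   # placeholder module containing the schedule class
    "__class__": "SlurmSchedule",     # placeholder schedule class name
}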
class Args(object):
def _load_runfile(runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO]) -> typing.Mapping:
"""
arguments of the main function.
this class can be used to set up an arguments object for the main
function as an alternative to the __main__ function which parses
command line arguments.
the constructor initializes the attributes with the same default
values as the command line parser.
Load a runfile
The function loads a runfile from a dictionary, an open JSON file object, or a JSON file specified by a file path.
If the source is a file, the directory is added to the project directories under the `run` key.
@param runfile: Dictionary with contents of a runfile, an open file object, or a path-like.
@return: Dictionary with the contents of the runfile.
"""
def __init__(self):
"""
constructor.
the parameters are the same as for the command line interface.
project and mode are mandatory.
other parameters may be required depending on the project
and/or the calculation mode.
"""
self.data_dir = ""
self.output_file = ""
self.db_file = ""
self.time_limit = 24.0
self.keep_files = files.FILE_CATEGORIES_TO_KEEP
self.keep_best = 10
self.keep_levels = 1
self.log_level = "WARNING"
self.log_file = ""
self.log_enable = True
def set_run_dir(fileobj):
try:
p = Path(fileobj.name).parent.resolve(True)
rf['project']['directories']['run'] = p
except (AttributeError, FileNotFoundError):
pass
if isinstance(runfile, Mapping):
rf = runfile
elif hasattr(runfile, 'read'):
rf = json.load(runfile)
set_run_dir(runfile)
else:
with open(runfile, 'r') as f:
rf = json.load(f)
set_run_dir(f)
schema_dir = Path(__file__).parent / "schema"
schema_file = schema_dir / "runfile.schema.json"
schema_url = f"file://{schema_dir}/"
with open(schema_file) as f:
schema = json.load(f)
resolver = jsonschema.RefResolver(schema_url, None)
jsonschema.validate(rf, schema, resolver=resolver)
return rf
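A brief usage sketch under the assumption that a run-file exists at the given placeholder path; a dictionary is passed through unchanged apart from schema validation, and the run directory is only recorded when the source is an actual file:
# hypothetical calls to _load_runfile
rf_from_path = _load_runfile("runs/demo.json")         # load, validate, record run directory
with open("runs/demo.json") as f:
    rf_from_file = _load_runfile(f)                    # open file objects behave the same
rf_from_dict = _load_runfile(rf_from_path)             # a Mapping is only re-validated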
def _load_module(name_or_path: typing.Union[str, bytes, os.PathLike]):
"""
Load a Python module
@param name_or_path: Module name or file path of the module.
If a module name is given, the module must be in the Python module search path.
@return: module
@raise ValueError if the module is not found
"""
try:
return importlib.import_module(name_or_path)
except ImportError:
p = Path(name_or_path)
module_name = p.stem
spec = importlib.util.spec_from_file_location(module_name, name_or_path)
try:
module = importlib.util.module_from_spec(spec)
except AttributeError:
msg = f"Can't find module {name_or_path}"
print(msg, file=sys.stderr)
print("sys.path:", sys.path, file=sys.stderr)
raise ValueError(msg)
sys.modules[module_name] = module
spec.loader.exec_module(module)
return module
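The two accepted argument forms can be illustrated as follows; both names are placeholders, and the file-path form registers the module in sys.modules under the stem of the file name:
# hypothetical calls to _load_module
proj_by_name = _load_module("pmsco.project")                  # importable module name
proj_by_path = _load_module("projects/demo/demo_project.py")  # file path to a module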
def main_project(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None,
project: typing.Optional[Project] = None,
project_module: typing.Optional[typing.Union[str, os.PathLike]] = None,
project_class: typing.Optional[typing.Union[str, typing.Type[Project]]] = None,
runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO] = None):
"""
Main function with optional arguments.
This function starts the whole process based on function arguments.
The arguments can be an existing project instance, a project class, and/or a runfile.
The function carries out the following steps:
1. Load a runfile - if specified.
2. Create a project object.
3. Apply the runfile to the project.
4. Run or schedule the project.
The project instance is produced from the first match of the following conditions:
1. `project` argument is a Project instance.
2. `project_class` is a Project class.
3. `__class__` entry from runfile.
The class must be listed in symbols,
or the runfile must also contain a `__module__` entry
with the name or file path of the project module that declares the class.
The project is scheduled rather than executed if the runfile contains a schedule section with enabled set to true.
@param symbols: Namespace of the project module, which contains project, cluster and calculator classes.
This is the basis for class resolution from runfiles.
If called by the project module, it should pass vars().
@param project: project instance.
@param project_class: project class or name of a project class defined in `symbols`.
@param project_module: name or file path of the project module.
This is required if symbols is not defined
and the project class is given as a string (project_class argument or runfile value).
@param runfile: A file-like, path-like or dict with runfile contents.
Runfiles must be in JSON format.
@return: None
"""
if runfile is not None:
rf = _load_runfile(runfile)
rfp = rf['project']
else:
rf = None
rfp = None
if project is None:
# resolve the class by name unless a Project subclass was passed directly
if not (isinstance(project_class, type) and issubclass(project_class, Project)):
project_classname = project_class
if not project_classname:
project_classname = rfp['__class__']
if not symbols:
if project_module:
module = _load_module(project_module)
symbols = vars(module)
else:
module = _load_module(rfp['__module__'])
symbols = vars(module)
project_class = symbols[project_classname]
project = project_class()
project.directories['pmsco'] = Path(__file__).parent
try:
project.directories['project'] = Path(module.__file__).parent
except AttributeError:
pass
if rfp:
project.set_properties(symbols, rfp, project)
try:
schedule_enabled = rf['schedule']['enabled']
except KeyError:
schedule_enabled = False
if schedule_enabled:
schedule_project(project, rf)
else:
run_project(project)
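A project module could use main_project roughly as in the sketch below; the run-file path and class name are placeholders. Passing vars() exposes the module's own classes so that the __class__ entry of the run-file can be resolved, as described in the docstring above.
# hypothetical entry point at the bottom of a project module
if __name__ == '__main__':
    main_project(symbols=vars(), runfile="runs/demo.json")
# alternatively, pass the project class directly and skip name resolution:
# main_project(project_class=DemoProject, runfile="runs/demo.json")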
def get_cli_parser():
KEEP_FILES_CHOICES = files.FILE_CATEGORIES | {'all'}
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="""
multiple-scattering calculations and optimization
PSI multiple-scattering calculations and optimization (PMSCO)
This is the main command line entry point for PMSCO calculation jobs.
Alternative entry points can be provided by project modules.
The command line requires at least a run-file to define the project parameters.
you must call pmsco.py from a project file which defines the calculation project.
the project file must be a regular Python module and define:
The command can run a calculation job directly or submit it to a job queue
via the `schedule` section in the run-file.
The program detects whether it runs in a single-process or OpenMPI multi-process environment
and coordinates parallel processes automatically.
1) a project class derived from pmsco.project.Project.
the class implements/overrides all necessary methods of the calculation project,
in particular create_model_space, create_cluster, and create_params.
2) a global function named create_project.
the function accepts a namespace object from the argument parser.
it may evaluate extra, project-specific arguments.
it does not need to evaluate the common parameters described below.
the function must return an instance of the project class described above.
3) main code that parses the command line and calls pmsco.pmsco.main_pmsco().
(see the projects folder for examples).
All arguments should preferably be declared in the run-file.
A small number of options can be passed on the command line
to override the corresponding parameter of the run-file.
Please see the documentation compiled in docs/html/index.html
for instructions on how to set up a project module and run-files.
See also the projects folder for examples.
""")
# the required argument list may depend on the calculation mode.
# for simplicity, the parser does not check these requirements.
# all parameters are optional and accepted regardless of mode.
# errors may occur if implicit requirements are not met.
parser.add_argument('project_module', nargs='?',
help="path to custom module that defines the calculation project")
parser.add_argument('-r', '--run-file',
help="path to run-time parameters file which contains all program arguments. " +
"must be in JSON format.")
parser.add_argument('-m', '--mode',
choices=['single', 'grid', 'swarm', 'genetic', 'table'],
help='calculation mode')
parser.add_argument('-d', '--data-dir',
help='directory path for experimental data files (if required by project). ' +
'default: working directory')
parser.add_argument('-o', '--output-file',
help='base path for intermediate and output files.')
parser.add_argument('-b', '--db-file',
help='name of an sqlite3 database file where the results should be stored.')
parser.add_argument('-k', '--keep-files', nargs='*',
choices=KEEP_FILES_CHOICES,
help='output file categories to keep after the calculation. '
'by default, cluster and model (simulated data) '
'of a limited number of best models are kept.')
parser.add_argument('--keep-best', type=int,
help='number of best models for which to keep result files '
'(at each node from root down to keep-levels).')
parser.add_argument('--keep-levels', type=int, choices=range(5),
help='task level down to which result files of best models are kept. '
'0 = model, 1 = scan, 2 = domain, 3 = emitter, 4 = region.')
parser.add_argument('-t', '--time-limit', type=float,
help='wall time limit in hours. the optimizers try to finish before the limit.')
parser.add_argument('--log-file',
help='name of the main log file. ' +
'under MPI, the rank of the process is inserted before the extension.')
parser.add_argument('--log-level',
help='minimum level of log messages. DEBUG, INFO, WARNING, ERROR, CRITICAL.')
feature_parser = parser.add_mutually_exclusive_group(required=False)
feature_parser.add_argument('--log-enable', dest='log_enable', action="store_true",
help="enable logging. by default, logging is on.")
feature_parser.add_argument('--log-disable', dest='log_enable', action='store_false',
help="disable logging. by default, logging is on.")
parser.set_defaults(log_enable=True)
help="Path to a run-file in JSON format which contains all calculation parameters. "
"This argument is mandatory. "
)
parser.add_argument('-m', '--module',
help="File name of the custom project module. "
"The module must declare the project class and other project-specific classes. "
"This optional argument overrides the __module__ entry of the run-file. "
)
parser.add_argument('-c', '--project-class',
help="Project class. Requires --module to be specified. "
"The project class is resolved in the namespace of the module. "
"This optional argument corresponds to the __class__ entry of the run-file. "
)
parser.add_argument('-o', '--output-dir',
help="Output directory. "
"This optional argument overrides the directories['output'] entry of the run-file."
)
parser.add_argument('-j', '--job-name',
help="Job name. Should be short and valid as a part of directory and file names. "
"If a persistent database is used, it must not exist in the database yet. "
"This optional argument overrides the job_name of the run-file."
)
return parser
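For illustration, the parser can also be driven programmatically; the file and job names below are placeholders:
# hypothetical programmatic use of the argument parser defined above
parser = get_cli_parser()
args, unknown = parser.parse_known_args(
    ["-r", "runs/demo.json", "-o", "work/demo01", "-j", "demo01"])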
@@ -362,129 +428,52 @@ def parse_cli():
return args, unknown_args
def import_module(module_name):
def main(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None):
"""
import a custom module by name.
Main function with command line parsing
import a module given its file path or module name (like in an import statement).
This function starts the whole process with parameters from the command line.
preferably, the module name should be given as in an import statement.
as the top-level pmsco directory is on the python path,
the module name will begin with `projects` for a custom project module or `pmsco` for a core pmsco module.
in this case, the function just calls importlib.import_module.
If the command line contains a run-file parameter, it determines the project class and the project parameters.
if a file path is given, i.e., `module_name` links to an existing file and has a `.py` extension,
the function extracts the directory path,
inserts it into the python path,
and calls importlib.import_module on the stem of the file name.
@note the file path remains in the python path.
this option should be used carefully to avoid breaking file name resolution.
@param module_name: file path or module name.
file path is interpreted relative to the working directory.
@return: the loaded module as a python object
"""
p = Path(module_name)
if p.is_file() and p.suffix == ".py":
path = p.parent.resolve()
module_name = p.stem
if path not in sys.path:
sys.path.insert(0, path)
module = importlib.import_module(module_name)
return module
def main_dict(run_params):
"""
main function with dictionary run-time parameters
this starts the whole process with all direct parameters.
the command line is not parsed.
no run-file is loaded (just the project module).
@param run_params: dictionary with the same structure as the JSON run-file.
The project class can be specified either in the run-file, on the command line, or in the function arguments.
If the run-file specifies a class name, that class is instantiated.
@return: None
"""
project_params = run_params['project']
module = importlib.import_module(project_params['__module__'])
try:
project_class = getattr(module, project_params['__class__'])
except KeyError:
project = module.create_project()
else:
project = project_class()
project._module = module
project.directories['pmsco'] = Path(__file__).parent
project.directories['project'] = Path(module.__file__).parent
project.set_properties(module, project_params, project)
run_project(project)
def main():
"""
main function with command line parsing
this function starts the whole process with parameters from the command line.
if the command line contains a run-file parameter, it determines the module to load and the project parameters.
otherwise, the command line parameters apply.
the project class can be specified either in the run-file or the project module.
if the run-file specifies a class name, that class is looked up in the project module and instantiated.
otherwise, the module's create_project is called.
@return: None
"""
args, unknown_args = parse_cli()
try:
with open(args.run_file, 'r') as f:
rf = json.load(f)
rf = _load_runfile(args.run_file)
except AttributeError:
rfp = {'__module__': args.project_module}
else:
rfp = rf['project']
module = import_module(rfp['__module__'])
try:
project_args = module.parse_project_args(unknown_args)
except AttributeError:
project_args = None
rf = {'project': {}}
try:
project_class = getattr(module, rfp['__class__'])
except (AttributeError, KeyError):
project = module.create_project()
else:
project = project_class()
project_args = None
project._module = module
project.directories['pmsco'] = Path(__file__).parent
project.directories['project'] = Path(module.__file__).parent
project.set_properties(module, rfp, project)
set_common_args(project, args)
try:
if project_args:
module.set_project_args(project, project_args)
if args.module:
rf['project']['__module__'] = args.module
except AttributeError:
pass
try:
schedule_enabled = rf['schedule']['enabled']
except KeyError:
schedule_enabled = False
if schedule_enabled:
schedule_project(project, rf)
else:
run_project(project)
if args.project_class:
rf['project']['__class__'] = args.project_class
except AttributeError:
pass
try:
if args.output_dir:
rf['project']['directories']['output'] = args.output_dir
except (AttributeError, KeyError):
pass
try:
if args.job_name:
rf['project']['job_name'] = args.job_name
except (AttributeError, KeyError):
pass
main_project(symbols=symbols, runfile=rf)
if __name__ == '__main__':