pmsco-public/pmsco/pmsco.py

#!/usr/bin/env python3

"""
@package pmsco.pmsco
PSI Multiple-Scattering Calculation and Structural Optimization

This is the top-level interface of the PMSCO package.
All calculations (any mode, any project) start by calling the run_project function of this module.
The module also provides a command line, a run-file, and a run-dict interface.
They all, in one way or another, set up an instance of a Project class and call the run_project function.

For parallel execution, prefix the command line with mpiexec -np NN, where NN is the number of processes to use.
Note that in parallel mode, one process takes the role of the coordinator (master).
The master does not run calculations and is idle most of the time.
To benefit from parallel execution on a work station, NN should be the number of processors.
On a cluster, the number of processes should be chosen according to the available resources.

All calculations can also be run in a single process.
PMSCO serializes the calculations automatically.

The code of the main module is independent of a particular calculation project.
All project-specific code must be in a separate python module.
The project module must implement a class derived from pmsco.project.Project.
The project module and class must be referenced in the run-file, or passed to the suitable run-function.

While they are not strictly necessary, run-files help to separate code and data.
Code is usually version-controlled, run-files contain metadata of calculations and should be kept with the results.
A git hash can be used to refer to the code used to execute the calculation.

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

@copyright (c) 2015-23 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
"""

import argparse
from collections.abc import Mapping
import importlib
import importlib.util
import json
import jsonschema
import logging
import os
from pathlib import Path
import sys
import typing

# MPI support is optional.
# if mpi4py can be imported, the size and rank are taken from the world communicator.
# otherwise, the module-level variables describe a single process (size 1, rank 0).
try:
    from mpi4py import MPI
    mpi_comm = MPI.COMM_WORLD
    mpi_size = mpi_comm.Get_size()
    mpi_rank = mpi_comm.Get_rank()
except ImportError:
    MPI = None
    mpi_comm = None
    mpi_size = 1
    mpi_rank = 0

# insert the parent of the directory that contains this file at the front of the module search path
# (unless it is listed already), before the pmsco imports below are executed.
pmsco_root = Path(__file__).resolve().parent.parent
if str(pmsco_root) not in sys.path:
    sys.path.insert(0, str(pmsco_root))

from pmsco.database.git import get_git_hash
import pmsco.dispatch as dispatch
from pmsco.project import Project

# the module-level logger
logger = logging.getLogger(__name__)


def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
    """
    configure the root logger. direct the logs either to a file or the null handler.

    this function must be called before the first logging command
    whether a log output is requested or not.
    to disable logging, call this function with enable=False (default).

    modules should create their own loggers, by calling
    @code logger = logging.getLogger(__name__) @endcode
    at the top of the module code.
    that logger is then used by calls like
    @code logger.debug(message) @endcode.

    note that each call adds one more handler to the root logger.

    @param enable: (bool) True=enable logging to the specified file,
        False=do not generate a log (null handler).
    @param filename: (Path-like) path and name of the log file.
        if this process is part of an MPI communicator,
        the function inserts a dot and the MPI rank of this process before the extension.
        if the filename is empty (empty string, empty path or None), logging is disabled.
    @param level: (string) name of the log level.
        must be the name of one of "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL" (case-insensitive).
        if empty, logging is disabled.
        if not a valid level, defaults to "WARNING".
    @return None
    """
    # resolve the level name to its numeric value.
    # only integer attributes of the logging module are levels.
    # getattr alone would also return unrelated attributes (e.g. BASIC_FORMAT) which make setLevel raise.
    level_name = level.upper() if isinstance(level, str) else ""
    numeric_level = getattr(logging, level_name, None) if level_name else None
    if not isinstance(numeric_level, int):
        numeric_level = logging.WARNING

    # str(Path("")) is ".", so an empty file name must be detected by the name part of the path.
    has_file = bool(filename) and bool(Path(filename).name)
    enable = bool(enable and has_file and level)

    root_logger = logging.getLogger()
    root_logger.setLevel(numeric_level)
    # keep matplotlib quiet regardless of the requested level
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    if enable:
        if mpi_size > 1:
            # one log file per process: name.ext -> name.RANK.ext
            p = Path(filename)
            filename = p.with_suffix(f".{mpi_rank}" + p.suffix)

        log_format = '%(asctime)s (%(name)s) %(levelname)s: %(message)s'
        formatter = logging.Formatter(log_format)

        # delay=True: the file is not opened before the first record is emitted
        handler = logging.FileHandler(filename, mode="w", delay=True)
        handler.setLevel(numeric_level)
        handler.setFormatter(formatter)
    else:
        handler = logging.NullHandler()

    root_logger.addHandler(handler)


def run_project(project):
    """
    run a calculation project.

    the function sets up logging, validates the project,
    and passes control to the pmsco.dispatch module to run the calculations.

    the log file is taken from project.log_file.
    if that has no file name, the name is derived from project.job_name with a ".log" extension.
    if neither yields a file name, logging is disabled.
    missing parent directories of the log file are created.

    @param project: fully initialized project object.
        the validate method is called as part of this function after setting up the logger.
    @return: None
    @raise Exception: exceptions raised by the calculations are logged and re-raised.
    """

    log_file = Path(project.log_file)
    if not log_file.name:
        try:
            log_file = Path(project.job_name).with_suffix(".log")
        except ValueError:
            # with_suffix rejects a path that has an empty name, i.e. the job name is empty as well.
            # keep the empty log_file so that logging is disabled below.
            pass
    if log_file.name:
        # parents=True: the log file may be placed in a nested directory that does not exist yet
        log_file.parent.mkdir(parents=True, exist_ok=True)
        log_level = project.log_level
    else:
        log_level = ""
    setup_logging(enable=bool(log_level), filename=log_file, level=log_level)
    if mpi_rank == 0:
        # only the rank-0 process reports the project arguments
        project.log_project_args()

    if not project.git_hash:
        project.git_hash = get_git_hash()

    project.validate()

    if project:
        logger.info("starting calculations")
        try:
            dispatch.run_calculations(project)
        except (SystemExit, KeyboardInterrupt):
            # regular termination requests are passed on without an error entry in the log
            raise
        except Exception:
            logger.exception("unhandled exception during calculations.")
            raise
        else:
            logger.info("calculations complete")
    else:
        logger.error("undefined project, optimizer, or calculator.")


def schedule_project(project, run_dict):
    """
    schedule a calculation project.

    the function validates the project and submits a job to the scheduler.

    before submission, the project section of the run-file dictionary is updated in place:
    - entries of the directories dict are replaced by the corresponding entries of project.directories
      (as strings). entries which the project does not define are left unchanged.
    - git_hash is set from the project or, if the project has none, from get_git_hash().
    - db_file and optimizer_params['seed_file'] are copied from the project if they are set.

    the scheduler class is loaded from the module and class named in the schedule section of the run-file.

    @param project: fully initialized project object.
        the validate method is called as part of this function.

    @param run_dict: dictionary holding the contents of the run file.
        must contain the 'schedule' section with '__module__' and '__class__' entries.

    @return: None
    """
    assert mpi_rank == 0
    setup_logging(enable=False)

    project.validate()

    try:
        dirs = run_dict['project']['directories']
    except KeyError:
        dirs = {}
    for k in dirs:
        try:
            dirs[k] = str(project.directories[k])
        except KeyError:
            # a directory unknown to the project must not prevent the update of the remaining entries
            pass

    if project.git_hash:
        run_dict['project']['git_hash'] = project.git_hash
    elif hsh := get_git_hash():
        run_dict['project']['git_hash'] = hsh
    if project.db_file:
        run_dict['project']['db_file'] = str(project.db_file)

    try:
        sf = project.optimizer_params['seed_file']
    except KeyError:
        sf = None
    if sf:
        # the run-file may not declare an optimizer_params section. create it rather than fail.
        run_dict['project'].setdefault('optimizer_params', {})['seed_file'] = str(sf)

    schedule_dict = run_dict['schedule']
    module = _load_module(schedule_dict['__module__'])
    schedule_class = getattr(module, schedule_dict['__class__'])
    schedule = schedule_class(project)
    schedule.set_properties(vars(module), schedule_dict, project)
    schedule.run_dict = run_dict
    schedule.validate()
    schedule.submit()

def _load_runfile(runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO]) -> typing.Mapping:
    """
    Read a runfile and validate it against the runfile schema

    The source can be a mapping (used as is), an object with a `read` method (parsed as json),
    or anything else that `open` accepts as a file path (opened and parsed as json).
    When the contents come from a file object that has a `name`,
    the resolved parent directory of that name is stored under `['project']['directories']['run']`.
    If the file object has no name or the directory cannot be resolved, this step is skipped.

    The contents are validated with jsonschema against `schema/runfile.schema.json`
    located next to this module.

    @param runfile: Dictionary with contents of a runfile, an open file object, or a path-like.
    @return: Dictionary with the contents of the runfile.
    """

    def _record_run_dir(contents, source):
        # best effort: sources without a name or with an unresolvable directory are ignored
        try:
            run_dir = Path(source.name).parent.resolve(True)
            contents['project']['directories']['run'] = run_dir
        except (AttributeError, FileNotFoundError):
            pass

    if isinstance(runfile, Mapping):
        rf = runfile
    elif not hasattr(runfile, 'read'):
        with open(runfile, 'r') as stream:
            rf = json.load(stream)
            _record_run_dir(rf, stream)
    else:
        rf = json.load(runfile)
        _record_run_dir(rf, runfile)

    schema_dir = Path(__file__).parent / "schema"
    with open(schema_dir / "runfile.schema.json") as stream:
        schema = json.load(stream)

    # references inside the schema are resolved relative to the schema directory
    resolver = jsonschema.RefResolver(f"file://{schema_dir}/", None)
    jsonschema.validate(rf, schema, resolver=resolver)

    return rf


def _load_module(name_or_path: typing.Union[str, bytes, os.PathLike]):
    """
    Load a Python module

    The argument is first tried as a module name.
    If that import fails, it is interpreted as the path of a source file.
    In that case the module is registered in sys.modules under the stem of the file name.

    @param name_or_path: Module name or file path of the module (str, bytes or path-like).
        If a module name is given, the module must be in the Python module search path.
    @return: module
    @raise ValueError if the module is not found,
        i.e. the name cannot be imported and no module spec can be created for the path.
        Exceptions raised while the module file is executed (including a missing file) are passed on.
    """

    # import_module accepts str only - convert bytes and path-like objects up front
    name = os.fsdecode(name_or_path)

    try:
        return importlib.import_module(name)
    except ImportError:
        module_name = Path(name).stem
        spec = importlib.util.spec_from_file_location(module_name, name)
        if spec is None or spec.loader is None:
            msg = f"Can't find module {name_or_path}"
            print(msg, file=sys.stderr)
            print("sys.path:", sys.path, file=sys.stderr)
            raise ValueError(msg)

        module = importlib.util.module_from_spec(spec)
        # the module must be registered before execution so that it can be found by its own imports
        previous = sys.modules.get(module_name)
        sys.modules[module_name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            # do not leave a partially initialized module behind
            if previous is not None:
                sys.modules[module_name] = previous
            else:
                sys.modules.pop(module_name, None)
            raise
        return module


def main_project(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None,
                 project: typing.Optional[Project] = None,
                 project_module: typing.Optional[typing.Union[str, os.PathLike]] = None,
                 project_class: typing.Optional[typing.Union[str, typing.Type[Project]]] = None,
                 runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO] = None):

    """
    Main function with optional arguments.

    This function starts the whole process based on function arguments.
    The arguments can be an existing project instance, a project class, and/or a runfile.

    The function carries out the following steps:

    1. Load a runfile - if specified.
    2. Create a project object.
    3. Apply the runfile to the project.
    4. Run or schedule the project.

    The project instance is produced from the first match of the following conditions:

    1. `project` argument is a Project instance.
    2. `project_class` is a Project class.
    3. `project_class` is the name of a class, or, if it is not given, the `__class__` entry from the runfile.
        The class must be listed in symbols.
        If symbols is empty, the namespace is loaded from `project_module`,
        or from the `__module__` entry of the runfile,
        which hold the name or file path of the project module that declares the class.

    The `pmsco` entry of the project directories is set to the directory of this module.
    The `project` entry is set to the directory of the project module
    only if the module was loaded by this function.

    The project is scheduled rather than executed
    if the `schedule` section of the runfile is present and its `enabled` entry is true.

    @param symbols: Namespace of the project module, which contains project, cluster and calculator classes.
        This is the basis for class resolution from runfiles.
        If called by the project module, it should pass vars().
    @param project: project instance.
    @param project_class: project class or name of a project class defined in `symbols`.
    @param project_module: name or file path of the project module.
        This is required if symbols is not defined
        and the project class is given as a string (project_class argument or runfile value).
    @param runfile: A file-like, path-like or dict with runfile contents.
        Runfiles must be in json-format.
    @return: None
    """

    if runfile is not None:
        rf = _load_runfile(runfile)
        rfp = rf['project']
    else:
        rf = None
        rfp = None

    # the project module is known only if it is loaded below.
    # the name must be bound in any case because it is read after the project is created.
    module = None

    if project is None:
        # issubclass raises a TypeError if its first argument is not a class (e.g. a class name string)
        is_project_class = isinstance(project_class, type) and issubclass(project_class, Project)
        if not is_project_class:
            project_classname = project_class
            if not project_classname:
                project_classname = rfp['__class__']

            if not symbols:
                if project_module:
                    module = _load_module(project_module)
                else:
                    module = _load_module(rfp['__module__'])
                symbols = vars(module)

            project_class = symbols[project_classname]

        project = project_class()

    project.directories['pmsco'] = Path(__file__).parent
    try:
        project.directories['project'] = Path(module.__file__).parent
    except AttributeError:
        # no module was loaded here, or the module has no file
        pass

    if rfp:
        project.set_properties(symbols, rfp, project)

    try:
        schedule_enabled = rf['schedule']['enabled']
    except (KeyError, TypeError):
        # KeyError: no schedule section or no enabled entry. TypeError: no runfile at all (rf is None).
        schedule_enabled = False
    if schedule_enabled:
        schedule_project(project, rf)
    else:
        run_project(project)


def get_cli_parser():
    """
    Build the argument parser of the PMSCO command line

    The parser declares the options -r/--run-file, -m/--module, -c/--project-class,
    -o/--output-dir and -j/--job-name.
    All of them take one string value and none is marked as required.

    @return: argparse.ArgumentParser object
    """
    description = """
        PSI multiple-scattering calculations and optimization (PMSCO)

        This is the main command line entry point for PMSCO calculation jobs.
        Alternative entry points can be provided by project modules.
        The command line requires at least a run-file to define the project parameters.

        The command can run a calculation job directly or submit it to a job queue
        via the `schedule` section in the run-file.
        The program detects whether it runs in a single-process or OpenMPI multi-process environment
        and coordinates parallel processes automatically.

        All arguments should preferably be declared in the run-file.
        A small number of options can be passed on the command line
        to override the corresponding parameter of the run-file.

        Please see the documentation that is compiled in docs/html/index.html
        for instructions how to set up a project module and run-files.
        See also the projects folder for examples.
        """

    # (option strings, help text) in the order they appear in the help output
    options = (
        (('-r', '--run-file'),
         "Path to a run-file in JSON format which contains all calculation parameters. "
         "This argument is mandatory. "),
        (('-m', '--module'),
         "File name of the custom project module. "
         "The module must declare the project class and other project-specific classes. "
         "This optional argument overrides the __module__ entry of the run-file. "),
        (('-c', '--project-class'),
         "Project class. Requires --module to be specified. "
         "The project class is resolved in the namespace of the module. "
         "This optional argument corresponds to the __class__ entry of the run-file. "),
        (('-o', '--output-dir'),
         "Output directory. "
         "This optional argument overrides the directories['output'] entry of the run-file."),
        (('-j', '--job-name'),
         "Job name. Should be short and valid as a part of directory and file names. "
         "If a persistent database is used, it must not exist in the database yet. "
         "This optional argument overrides the job_name of the run-file."),
    )

    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                  description=description)
    for flags, help_text in options:
        cli.add_argument(*flags, help=help_text)

    return cli


def parse_cli():
    """
    Parse the command line of the current process

    The arguments are parsed with the parser from get_cli_parser.
    Arguments which the parser does not declare are not an error; they are returned separately.

    @return: tuple (args, unknown_args).
        args is the Namespace object created by the argument parser,
        unknown_args is the list of remaining argument strings.
    """
    return get_cli_parser().parse_known_args()


def main(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None):
    """
    Main function with command line parsing

    This function starts the whole process with parameters from the command line.

    If the command line contains a run-file parameter, it determines the project class and the project parameters.
    Without a run-file, the function starts from an empty project section.

    The command line options module, project-class, output-dir and job-name
    override the corresponding entries of the project section of the run-file:
    `__module__`, `__class__`, `directories['output']` and `job_name`.
    The directories section is created if the run-file does not contain it.

    The resulting run-file dictionary is passed to main_project.

    @param symbols: Namespace of the project module that is passed on to main_project for class resolution.
    @return: None
    """

    args, unknown_args = parse_cli()

    # the run-file option may be missing. opening a None path would raise a TypeError.
    run_file = getattr(args, 'run_file', None)
    if run_file:
        rf = _load_runfile(run_file)
    else:
        rf = {'project': {}}
    rfp = rf['project']

    if module := getattr(args, 'module', None):
        rfp['__module__'] = module

    if project_class := getattr(args, 'project_class', None):
        rfp['__class__'] = project_class

    if output_dir := getattr(args, 'output_dir', None):
        # an explicit command line option must not be dropped if the run-file has no directories section
        rfp.setdefault('directories', {})['output'] = output_dir

    if job_name := getattr(args, 'job_name', None):
        rfp['job_name'] = job_name

    main_project(symbols=symbols, runfile=rf)


# script entry point: run the command line interface, then exit with status 0.
if __name__ == '__main__':
    main()
    sys.exit(0)