pmsco-public/pmsco/pmsco.py

#!/usr/bin/env python
"""
@package pmsco.pmsco
PEARL Multiple-Scattering Calculation and Structural Optimization

this is the top-level interface of the PMSCO package.
all calculations (any mode, any project) start by calling the run_project() function of this module.
the module also provides a command line and a run-file/run-dict interface.

for parallel execution, prefix the command line with mpiexec -np NN, where NN is the number of processes to use.
note that in parallel mode, one process takes the role of the coordinator (master).
the master does not run calculations and is idle most of the time.
to benefit from parallel execution on a workstation, NN should be the number of processors.
on a cluster, the number of processes is chosen according to the available resources.

all calculations can also be run in a single process,
in which case PMSCO serializes the calculations automatically.
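
for example, a hypothetical project might be run like this (the project path is illustrative):

@code
python pmsco/pmsco.py projects/myproject/myproject.py -m single
mpiexec -np 8 python pmsco/pmsco.py projects/myproject/myproject.py -m swarm
@endcode
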
the code of the main module is independent of a particular calculation project.
all project-specific code must be in a separate python module.
the project module must implement a class derived from pmsco.project.Project,
and call run_project() with an instance of the project class.
refer to the projects folder for examples.
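
as an illustration, a minimal project module might look like the following sketch
(the class name and method bodies are placeholders; see pmsco.project.Project for the full interface):

@code
from pmsco.project import Project
from pmsco.pmsco import run_project

class MinimalProject(Project):
    # override the methods required by the calculation,
    # e.g. create_model_space, create_cluster and create_params.
    pass

def create_project():
    project = MinimalProject()
    # configure scans, model space, mode, output files, etc. here
    return project

if __name__ == '__main__':
    run_project(create_project())
@endcode
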
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
import argparse
from builtins import range
import logging
import importlib
import commentjson as json
from pathlib import Path
import sys
try:
    from mpi4py import MPI
    mpi_comm = MPI.COMM_WORLD
    mpi_size = mpi_comm.Get_size()
    mpi_rank = mpi_comm.Get_rank()
except ImportError:
    MPI = None
    mpi_comm = None
    mpi_size = 1
    mpi_rank = 0

pmsco_root = Path(__file__).resolve().parent.parent
if str(pmsco_root) not in sys.path:
    sys.path.insert(0, str(pmsco_root))

import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
from pmsco.optimizers import genetic, swarm, grid, table
# the module-level logger
logger = logging.getLogger(__name__)
def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
    """
    configure the root logger. direct the logs either to a file or the null handler.

    this function must be called before the first logging command,
    whether a log output is requested or not.
    to disable logging, call this function with enable=False (default).

    modules should create their own loggers, by calling
    @code logger = logging.getLogger(__name__) @endcode
    at the top of the module code.
    that logger is then used by calls like
    @code logger.debug(message) @endcode.

    @param enable: (bool) True=enable logging to the specified file,
        False=do not generate a log (null handler).
    @param filename: (Path-like) path and name of the log file.
        if this process is part of an MPI communicator with more than one process,
        the function inserts a dot and the MPI rank of this process before the extension.
        if the filename is empty, logging is disabled.
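        for example, with filename="pmsco.log", the process of rank 1 writes its log to pmsco.1.log.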
    @param level: (string) name of the log level.
        must be one of "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL".
        if empty, logging is disabled.
        if not a valid level, defaults to "WARNING".
    @return None
    """
    enable = enable and str(filename) and level
    numeric_level = getattr(logging, level.upper(), logging.WARNING)
    root_logger = logging.getLogger()
    root_logger.setLevel(numeric_level)
    if enable:
        if mpi_size > 1:
            p = Path(filename)
            filename = p.with_suffix(f".{mpi_rank}" + p.suffix)
        log_format = '%(asctime)s (%(name)s) %(levelname)s: %(message)s'
        formatter = logging.Formatter(log_format)
        handler = logging.FileHandler(filename, mode="w", delay=True)
        handler.setLevel(numeric_level)
        handler.setFormatter(formatter)
    else:
        handler = logging.NullHandler()
    root_logger.addHandler(handler)

def set_common_args(project, args):
    """
    set common project arguments from the parsed command line.

    this function translates and distributes the common arguments from the command line parser
    to the respective destinations.
    as of this writing, there are two destinations: the global logger and the project instance.
    note that run_project() is called with the project instance as the only argument.
    all project-related arguments from the command line must therefore be copied to the project object.
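
    a minimal usage sketch (the attribute values are arbitrary examples):

    @code
    args = Args()
    args.mode = "single"
    args.output_file = "output/myproject"
    set_common_args(project, args)
    @endcode
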
    @param project: pmsco.project.Project instance that receives the arguments.
    @param args: a namespace object containing the necessary parameters.
        this can be an instance of Args, or the return value of parse_cli(),
        or any object which has the same attributes as the Args class.
    @return: None
    """
    if args.data_dir:
        project.data_dir = args.data_dir
    if args.output_file:
        project.output_file = args.output_file
    if args.db_file:
        project.db_file = args.db_file
    if args.log_file:
        project.log_file = args.log_file
    if args.log_level:
        project.log_level = args.log_level
    if not args.log_enable:
        project.log_file = ""
        project.log_level = ""
    if args.mode:
        project.mode = args.mode.lower()
    if args.time_limit:
        project.time_limit = args.time_limit
    if args.keep_files:
        project.keep_files = args.keep_files
    if args.keep_levels:
        project.keep_levels = max(args.keep_levels, project.keep_levels)
    if args.keep_best:
        project.keep_best = max(args.keep_best, project.keep_best)

def run_project(project):
    """
    run a calculation project.

    the function sets up logging, validates the project, chooses the handler classes,
    and passes control to the pmsco.dispatch module to run the calculations.

    @param project: fully initialized project object.
        the validate method is called as part of this function after setting up the logger.
    @return: None
    """
    log_file = Path(project.log_file)
    if not log_file.name:
        log_file = Path(project.job_name).with_suffix(".log")
    if log_file.name:
        log_file.parent.mkdir(exist_ok=True)
        log_level = project.log_level
    else:
        log_level = ""
    setup_logging(enable=bool(log_level), filename=log_file, level=log_level)
    if mpi_rank == 0:
        project.log_project_args()

    project.validate()

    optimizer_class = None
    if project.mode == 'single':
        optimizer_class = handlers.SingleModelHandler
    elif project.mode == 'grid':
        optimizer_class = grid.GridSearchHandler
    elif project.mode == 'swarm':
        optimizer_class = swarm.ParticleSwarmHandler
    elif project.mode == 'genetic':
        optimizer_class = genetic.GeneticOptimizationHandler
    elif project.mode == 'gradient':
        logger.error("gradient search not implemented")
        # TODO: implement gradient search
        # optimizer_class = gradient.GradientSearchHandler
    elif project.mode == 'table':
        optimizer_class = table.TableModelHandler
    else:
        logger.error("invalid optimization mode '%s'.", project.mode)
    project.handler_classes['model'] = optimizer_class
    project.handler_classes['region'] = handlers.choose_region_handler_class(project)

    if project and optimizer_class:
        logger.info("starting calculations")
        try:
            dispatch.run_calculations(project)
        except (SystemExit, KeyboardInterrupt):
            raise
        except Exception as __:
            logger.exception("unhandled exception during calculations.")
            raise
        else:
            logger.info("calculations complete")
    else:
        logger.error("undefined project, optimizer, or calculator.")

def schedule_project(project, run_dict):
    """
    schedule a calculation project.

    the function validates the project and submits a job to the scheduler.
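
    the relevant part of the run file may be sketched as follows
    (the module and class names are placeholders for a concrete schedule implementation;
    further keys of the schedule section are applied to the schedule object via set_properties):

    @code
    run_dict = {
        "project": {
            # project section as described for main_dict()
        },
        "schedule": {
            "enabled": True,
            "__module__": "pmsco.schedule",   # placeholder
            "__class__": "JobSchedule"        # placeholder
        }
    }
    @endcode
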
    @param project: fully initialized project object.
        the validate method is called as part of this function.
    @param run_dict: dictionary holding the contents of the run file.
    @return: None
    """
    assert mpi_rank == 0
    setup_logging(enable=False)
    project.validate()

    schedule_dict = run_dict['schedule']
    module = importlib.import_module(schedule_dict['__module__'])
    schedule_class = getattr(module, schedule_dict['__class__'])
    schedule = schedule_class(project)
    schedule.set_properties(module, schedule_dict, project)
    schedule.run_dict = run_dict
    schedule.validate()
    schedule.submit()

class Args(object):
    """
    arguments of the main function.

    this class can be used to set up an arguments object for the main
    function as an alternative to the __main__ function which parses
    command line arguments.

    the constructor initializes the attributes with the same default
    values as the command line parser.
    """

    def __init__(self):
        """
        constructor.

        the parameters are the same as for the command line interface.
        project and mode are mandatory.
        other parameters may be required depending on the project
        and/or the calculation mode.
        """
        self.data_dir = ""
        self.output_file = ""
        self.db_file = ""
        self.time_limit = 24.0
        self.keep_files = files.FILE_CATEGORIES_TO_KEEP
        self.keep_best = 10
        self.keep_levels = 1
        self.log_level = "WARNING"
        self.log_file = ""
        self.log_enable = True

def get_cli_parser():
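    """
    create the command line argument parser for the common PMSCO arguments.

    @return: argparse.ArgumentParser instance.
    """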
    KEEP_FILES_CHOICES = files.FILE_CATEGORIES | {'all'}

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
multiple-scattering calculations and optimization

you must call pmsco.py from a project file which defines the calculation project.
the project file must be a regular Python module and define:

1) a project class derived from pmsco.project.Project.
   the class implements/overrides all necessary methods of the calculation project,
   in particular create_model_space, create_cluster, and create_params.
2) a global function named create_project.
   the function accepts a namespace object from the argument parser.
   it may evaluate extra, project-specific arguments.
   it does not need to evaluate the common parameters described below.
   the function must return an instance of the project class described above.
3) main code that parses the command line and calls pmsco.pmsco.main().

(see the projects folder for examples).
""")
    # the required argument list may depend on the calculation mode.
    # for simplicity, the parser does not check these requirements.
    # all parameters are optional and accepted regardless of mode.
    # errors may occur if implicit requirements are not met.
    parser.add_argument('project_module', nargs='?',
                        help="path to custom module that defines the calculation project")
    parser.add_argument('-r', '--run-file',
                        help="path to run-time parameters file which contains all program arguments. " +
                             "must be in JSON format.")
    parser.add_argument('-m', '--mode',
                        choices=['single', 'grid', 'swarm', 'genetic', 'table'],
                        help='calculation mode')
    parser.add_argument('-d', '--data-dir',
                        help='directory path for experimental data files (if required by project). ' +
                             'default: working directory')
    parser.add_argument('-o', '--output-file',
                        help='base path for intermediate and output files.')
    parser.add_argument('-b', '--db-file',
                        help='name of an sqlite3 database file where the results should be stored.')
    parser.add_argument('-k', '--keep-files', nargs='*',
                        choices=KEEP_FILES_CHOICES,
                        help='output file categories to keep after the calculation. '
                             'by default, cluster and model (simulated data) files '
                             'of a limited number of best models are kept.')
    parser.add_argument('--keep-best', type=int,
                        help='number of best models for which to keep result files '
                             '(at each node from root down to keep-levels).')
    parser.add_argument('--keep-levels', type=int, choices=range(5),
                        help='task level down to which result files of best models are kept. '
                             '0 = model, 1 = scan, 2 = domain, 3 = emitter, 4 = region.')
    parser.add_argument('-t', '--time-limit', type=float,
                        help='wall time limit in hours. the optimizers try to finish before the limit.')
    parser.add_argument('--log-file',
                        help='name of the main log file. ' +
                             'under MPI, the rank of the process is inserted before the extension.')
    parser.add_argument('--log-level',
                        help='minimum level of log messages. DEBUG, INFO, WARNING, ERROR, CRITICAL.')
    feature_parser = parser.add_mutually_exclusive_group(required=False)
    feature_parser.add_argument('--log-enable', dest='log_enable', action="store_true",
                                help="enable logging. by default, logging is on.")
    feature_parser.add_argument('--log-disable', dest='log_enable', action='store_false',
                                help="disable logging. by default, logging is on.")
    parser.set_defaults(log_enable=True)

    return parser

def parse_cli():
    """
    parse the command line.

    @return: tuple (args, unknown_args) where args is the Namespace object created by the argument parser
        and unknown_args is the list of unrecognized arguments (left for project-specific parsing).
    """
    parser = get_cli_parser()
    args, unknown_args = parser.parse_known_args()
    return args, unknown_args

def import_module(module_name):
    """
    import a custom module by name.

    import a module given its file path or module name (as in an import statement).
    preferably, the module name should be given as in an import statement.
    as the top-level pmsco directory is on the python path,
    the module name will begin with `projects` for a custom project module or `pmsco` for a core pmsco module.
    in this case, the function just calls importlib.import_module.

    if a file path is given, i.e., `module_name` points to an existing file and has a `.py` extension,
    the function extracts the directory path,
    inserts it into the python path,
    and calls importlib.import_module on the stem of the file name.

    @note the file path remains in the python path.
        this option should be used carefully to avoid breaking module name resolution.
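
    for example (the project path is illustrative):

    @code
    m = import_module("projects.myproject.myproject")
    # equivalent, using a file path relative to the working directory:
    m = import_module("projects/myproject/myproject.py")
    @endcode
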
    @param module_name: file path or module name.
        a file path is interpreted relative to the working directory.
    @return: the loaded module as a python object
    """
    p = Path(module_name)
    if p.is_file() and p.suffix == ".py":
        # sys.path entries are strings: convert the resolved directory path
        # so that the membership test and the import machinery work reliably.
        path = str(p.parent.resolve())
        module_name = p.stem
        if path not in sys.path:
            sys.path.insert(0, path)
    module = importlib.import_module(module_name)
    return module

def main_dict(run_params):
    """
    main function with dictionary run-time parameters.

    this starts the whole process with all direct parameters.
    the command line is not parsed.
    no run-file is loaded (just the project module).
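
    a minimal example (module and class names are placeholders;
    further keys of the project section are applied to the project via set_properties):

    @code
    run_params = {
        "project": {
            "__module__": "projects.myproject.myproject",
            "__class__": "MyProject"
        }
    }
    main_dict(run_params)
    @endcode
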
    @param run_params: dictionary with the same structure as the JSON run-file.
    @return: None
    """
    project_params = run_params['project']
    module = importlib.import_module(project_params['__module__'])
    try:
        project_class = getattr(module, project_params['__class__'])
    except KeyError:
        project = module.create_project()
    else:
        project = project_class()

    project._module = module
    project.directories['pmsco'] = Path(__file__).parent
    project.directories['project'] = Path(module.__file__).parent
    project.set_properties(module, project_params, project)

    run_project(project)

def main():
    """
    main function with command line parsing.

    this function starts the whole process with parameters from the command line.
    if the command line contains a run-file parameter, it determines the module to load and the project parameters.
    otherwise, the command line parameters apply.

    the project class can be specified either in the run-file or the project module.
    if the run-file specifies a class name, that class is looked up in the project module and instantiated.
    otherwise, the module's create_project is called.

    @return: None
    """
    args, unknown_args = parse_cli()

    try:
        with open(args.run_file, 'r') as f:
            rf = json.load(f)
    except (AttributeError, TypeError):
        # no run file given (args.run_file is missing or None): use the command line only
        rf = {}
        rfp = {'__module__': args.project_module}
    else:
        rfp = rf['project']

    module = import_module(rfp['__module__'])

    try:
        project_args = module.parse_project_args(unknown_args)
    except AttributeError:
        project_args = None

    try:
        project_class = getattr(module, rfp['__class__'])
    except (AttributeError, KeyError):
        project = module.create_project()
    else:
        project = project_class()
        project_args = None

    project._module = module
    project.directories['pmsco'] = Path(__file__).parent
    project.directories['project'] = Path(module.__file__).parent
    project.set_properties(module, rfp, project)
    set_common_args(project, args)

    try:
        if project_args:
            module.set_project_args(project, project_args)
    except AttributeError:
        pass

    try:
        schedule_enabled = rf['schedule']['enabled']
    except KeyError:
        schedule_enabled = False

    if schedule_enabled:
        schedule_project(project, rf)
    else:
        run_project(project)


if __name__ == '__main__':
    main()
    sys.exit(0)