public release 4.2.0 - see README.md and CHANGES.md for details

2026-01-08 19:10:45 +01:00
parent ef781e2db4
commit b64beb694c
181 changed files with 39388 additions and 6527 deletions

pmsco/pmsco.py Executable file → Normal file

@@ -1,31 +1,35 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
@package pmsco.pmsco
PEARL Multiple-Scattering Calculation and Structural Optimization
PSI Multiple-Scattering Calculation and Structural Optimization
this is the top-level interface of the PMSCO package.
all calculations (any mode, any project) start by calling the run_project() function of this module.
the module also provides a command line and a run-file/run-dict interface.
This is the top-level interface of the PMSCO package.
All calculations (any mode, any project) start by calling the run_project function of this module.
The module also provides a command line, a run-file, and a run-dict interface.
They all, in one way or another, set up an instance of a Project class and call the run_project function.
for parallel execution, prefix the command line with mpi_exec -np NN, where NN is the number of processes to use.
note that in parallel mode, one process takes the role of the coordinator (master).
the master does not run calculations and is idle most of the time.
to benefit from parallel execution on a work station, NN should be the number of processors.
on a cluster, the number of processes is chosen according to the available resources.
For parallel execution, prefix the command line with mpi_exec -np NN, where NN is the number of processes to use.
Note that in parallel mode, one process takes the role of the coordinator (master).
The master does not run calculations and is idle most of the time.
To benefit from parallel execution on a workstation, NN should be the number of processors.
On a cluster, the number of processes should be chosen according to the available resources.
all calculations can also be run in a single process.
All calculations can also be run in a single process.
PMSCO serializes the calculations automatically.
the code of the main module is independent of a particular calculation project.
all project-specific code must be in a separate python module.
the project module must implement a class derived from pmsco.project.Project,
and call run_project() with an instance of the project class.
refer to the projects folder for examples.
The code of the main module is independent of a particular calculation project.
All project-specific code must be in a separate python module.
The project module must implement a class derived from pmsco.project.Project.
The project module and class must be referenced in the run-file, or passed to a suitable run function.
While run-files are not strictly necessary, they help to separate code and data.
Code is usually version-controlled, whereas run-files contain the metadata of a calculation and should be kept with the results.
A git hash can be used to refer to the code used to execute the calculation.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-21 by Paul Scherrer Institut @n
@copyright (c) 2015-23 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@@ -33,12 +37,16 @@ Licensed under the Apache License, Version 2.0 (the "License"); @n
"""
import argparse
from builtins import range
import logging
from collections.abc import Mapping
import importlib
import commentjson as json
import importlib.util
import json
import jsonschema
import logging
import os
from pathlib import Path
import sys
import typing
try:
from mpi4py import MPI
@@ -55,10 +63,9 @@ pmsco_root = Path(__file__).resolve().parent.parent
if str(pmsco_root) not in sys.path:
sys.path.insert(0, str(pmsco_root))
from pmsco.database.git import get_git_hash
import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
from pmsco.optimizers import genetic, swarm, grid, table
from pmsco.project import Project
# the module-level logger
logger = logging.getLogger(__name__)
@@ -94,6 +101,7 @@ def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
numeric_level = getattr(logging, level.upper(), logging.WARNING)
root_logger = logging.getLogger()
root_logger.setLevel(numeric_level)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
if enable:
if mpi_size > 1:
@@ -112,49 +120,6 @@ def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
root_logger.addHandler(handler)
def set_common_args(project, args):
"""
set common project arguments from parsed command line.
this function translates and distributes the common arguments from the command line parser
to the respective destinations.
as of this writing, there are two destinations: the global logger and the project instance.
note that run_project() is called with the project instance as the only argument.
all project-related arguments from the command line must therefore be copied to the project object.
@param args: a namespace object containing the necessary parameters.
this can be an instance of Args, or the return value of parse_cli(),
or any object which has the same attributes as the Args class.
@return: None
"""
if args.data_dir:
project.data_dir = args.data_dir
if args.output_file:
project.output_file = args.output_file
if args.db_file:
project.db_file = args.db_file
if args.log_file:
project.log_file = args.log_file
if args.log_level:
project.log_level = args.log_level
if not args.log_enable:
project.log_file = ""
project.log_level = ""
if args.mode:
project.mode = args.mode.lower()
if args.time_limit:
project.time_limit = args.time_limit
if args.keep_files:
project.keep_files = args.keep_files
if args.keep_levels:
project.keep_levels = max(args.keep_levels, project.keep_levels)
if args.keep_best:
project.keep_best = max(args.keep_best, project.keep_best)
def run_project(project):
"""
run a calculation project.
@@ -179,36 +144,18 @@ def run_project(project):
if mpi_rank == 0:
project.log_project_args()
if not project.git_hash:
project.git_hash = get_git_hash()
project.validate()
optimizer_class = None
if project.mode == 'single':
optimizer_class = handlers.SingleModelHandler
elif project.mode == 'grid':
optimizer_class = grid.GridSearchHandler
elif project.mode == 'swarm':
optimizer_class = swarm.ParticleSwarmHandler
elif project.mode == 'genetic':
optimizer_class = genetic.GeneticOptimizationHandler
elif project.mode == 'gradient':
logger.error("gradient search not implemented")
# TODO: implement gradient search
# optimizer_class = gradient.GradientSearchHandler
elif project.mode == 'table':
optimizer_class = table.TableModelHandler
else:
logger.error("invalid optimization mode '%s'.", project.mode)
project.handler_classes['model'] = optimizer_class
project.handler_classes['region'] = handlers.choose_region_handler_class(project)
if project and optimizer_class:
if project:
logger.info("starting calculations")
try:
dispatch.run_calculations(project)
except (SystemExit, KeyboardInterrupt):
raise
except Exception as __:
except Exception:
logger.exception("unhandled exception during calculations.")
raise
else:
@@ -223,6 +170,8 @@ def schedule_project(project, run_dict):
the function validates the project and submits a job to the scheduler.
placeholders in run-file's directories dict are resolved.
@param project: fully initialized project object.
the validate method is called as part of this function.
@@ -234,117 +183,234 @@ def schedule_project(project, run_dict):
setup_logging(enable=False)
project.validate()
try:
dirs = run_dict['project']['directories']
for k in dirs:
dirs[k] = str(project.directories[k])
except KeyError:
pass
if project.git_hash:
run_dict['project']['git_hash'] = project.git_hash
elif hsh := get_git_hash():
run_dict['project']['git_hash'] = hsh
if project.db_file:
run_dict['project']['db_file'] = str(project.db_file)
if sf := project.optimizer_params['seed_file']:
run_dict['project']['optimizer_params']['seed_file'] = str(sf)
schedule_dict = run_dict['schedule']
module = importlib.import_module(schedule_dict['__module__'])
module = _load_module(schedule_dict['__module__'])
schedule_class = getattr(module, schedule_dict['__class__'])
schedule = schedule_class(project)
schedule.set_properties(module, schedule_dict, project)
schedule.set_properties(vars(module), schedule_dict, project)
schedule.run_dict = run_dict
schedule.validate()
schedule.submit()
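As a point of reference, only the __module__, __class__ and enabled keys of the schedule section are read directly in this module; any further keys are handed to the schedule class through set_properties. A hedged sketch, with placeholder module and class names:
# hypothetical schedule section of a run-dict; the class named here decides
# which additional keys (queue, wall time, etc.) it understands
schedule_section = {
    "enabled": True,
    "__module__": "pmsco.schedule",   # placeholder module containing the schedule class
    "__class__": "SlurmSchedule",     # placeholder schedule class name
}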
class Args(object):
def _load_runfile(runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO]) -> typing.Mapping:
"""
arguments of the main function.
this class can be used to set up an arguments object for the main
function as an alternative to the __main__ function which parses
command line arguments.
the constructor initializes the attributes with the same default
values as the command line parser.
Load a runfile
The function loads a runfile from a dictionary, an open JSON file object, or a JSON file specified by a file path.
If the source is a file, the directory is added to the project directories under the `run` key.
@param runfile: Dictionary with contents of a runfile, an open file object, or a path-like.
@return: Dictionary with the contents of the runfile.
"""
def __init__(self):
"""
constructor.
the parameters are the same as for the command line interface.
project and mode are mandatory.
other parameters may be required depending on the project
and/or the calculation mode.
"""
self.data_dir = ""
self.output_file = ""
self.db_file = ""
self.time_limit = 24.0
self.keep_files = files.FILE_CATEGORIES_TO_KEEP
self.keep_best = 10
self.keep_levels = 1
self.log_level = "WARNING"
self.log_file = ""
self.log_enable = True
def set_run_dir(fileobj):
try:
p = Path(fileobj.name).parent.resolve(True)
rf['project']['directories']['run'] = p
except (AttributeError, FileNotFoundError):
pass
if isinstance(runfile, Mapping):
rf = runfile
elif hasattr(runfile, 'read'):
rf = json.load(runfile)
set_run_dir(runfile)
else:
with open(runfile, 'r') as f:
rf = json.load(f)
set_run_dir(f)
schema_dir = Path(__file__).parent / "schema"
schema_file = schema_dir / "runfile.schema.json"
schema_url = f"file://{schema_dir}/"
with open(schema_file) as f:
schema = json.load(f)
resolver = jsonschema.RefResolver(schema_url, None)
jsonschema.validate(rf, schema, resolver=resolver)
return rf
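A brief usage sketch under the assumption that a run-file exists at the given placeholder path; a dictionary is passed through unchanged apart from schema validation, and the run directory is only recorded when the source is an actual file:
# hypothetical calls to _load_runfile
rf_from_path = _load_runfile("runs/demo.json")         # load, validate, record run directory
with open("runs/demo.json") as f:
    rf_from_file = _load_runfile(f)                    # open file objects behave the same
rf_from_dict = _load_runfile(rf_from_path)             # a Mapping is only re-validated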
def _load_module(name_or_path: typing.Union[str, bytes, os.PathLike]):
"""
Load a Python module
@param name_or_path: Module name or file path of the module.
If a module name is given, the module must be in the Python module search path.
@return: module
@raise ValueError if the module is not found
"""
try:
return importlib.import_module(name_or_path)
except ImportError:
p = Path(name_or_path)
module_name = p.stem
spec = importlib.util.spec_from_file_location(module_name, name_or_path)
try:
module = importlib.util.module_from_spec(spec)
except AttributeError:
msg = f"Can't find module {name_or_path}"
print(msg, file=sys.stderr)
print("sys.path:", sys.path, file=sys.stderr)
raise ValueError(msg)
sys.modules[module_name] = module
spec.loader.exec_module(module)
return module
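The two accepted argument forms can be illustrated as follows; both names are placeholders, and the file-path form registers the module in sys.modules under the stem of the file name:
# hypothetical calls to _load_module
proj_by_name = _load_module("pmsco.project")                  # importable module name
proj_by_path = _load_module("projects/demo/demo_project.py")  # file path to a module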
def main_project(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None,
project: typing.Optional[Project] = None,
project_module: typing.Optional[typing.Union[str, os.PathLike]] = None,
project_class: typing.Optional[typing.Union[str, typing.Type[Project]]] = None,
runfile: typing.Union[typing.Dict, str, bytes, os.PathLike, typing.TextIO] = None):
"""
Main function with optional arguments.
This function starts the whole process based on function arguments.
The arguments can be an existing project instance, a project class, and/or a runfile.
The function carries out the following steps:
1. Load a runfile - if specified.
2. Create a project object.
3. Apply the runfile to the project.
4. Run or schedule the project.
The project instance is produced from the first match of the following conditions:
1. `project` argument is a Project instance.
2. `project_class` is a Project class.
3. `__class__` entry from runfile.
The class must be listed in symbols,
or the runfile must also contain a `__module__` entry
with the name or file path of the project module that declares the class.
The project is scheduled rather than executed if the runfile contains a schedule section with enabled set to true.
@param symbols: Namespace of the project module, which contains project, cluster and calculator classes.
This is the basis for class resolution from runfiles.
If called by the project module, it should pass vars().
@param project: project instance.
@param project_class: project class or name of a project class defined in `symbols`.
@param project_module: name or file path of the project module.
This is required if symbols is not defined
and the project class is given as a string (project_class argument or runfile value).
@param runfile: A file-like, path-like or dict with runfile contents.
Runfiles must be in JSON format.
@return: None
"""
if runfile is not None:
rf = _load_runfile(runfile)
rfp = rf['project']
else:
rf = None
rfp = None
if project is None:
# resolve the class by name unless a Project subclass was passed directly
if not (isinstance(project_class, type) and issubclass(project_class, Project)):
project_classname = project_class
if not project_classname:
project_classname = rfp['__class__']
if not symbols:
if project_module:
module = _load_module(project_module)
symbols = vars(module)
else:
module = _load_module(rfp['__module__'])
symbols = vars(module)
project_class = symbols[project_classname]
project = project_class()
project.directories['pmsco'] = Path(__file__).parent
try:
project.directories['project'] = Path(module.__file__).parent
except AttributeError:
pass
if rfp:
project.set_properties(symbols, rfp, project)
try:
schedule_enabled = rf['schedule']['enabled']
except KeyError:
schedule_enabled = False
if schedule_enabled:
schedule_project(project, rf)
else:
run_project(project)
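A project module could use main_project roughly as in the sketch below; the run-file path and class name are placeholders. Passing vars() exposes the module's own classes so that the __class__ entry of the run-file can be resolved, as described in the docstring above.
# hypothetical entry point at the bottom of a project module
if __name__ == '__main__':
    main_project(symbols=vars(), runfile="runs/demo.json")
# alternatively, pass the project class directly and skip name resolution:
# main_project(project_class=DemoProject, runfile="runs/demo.json")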
def get_cli_parser():
KEEP_FILES_CHOICES = files.FILE_CATEGORIES | {'all'}
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="""
multiple-scattering calculations and optimization
PSI multiple-scattering calculations and optimization (PMSCO)
This is the main command line entry point for PMSCO calculation jobs.
Alternative entry points can be provided by project modules.
The command line requires at least a run-file to define the project parameters.
you must call pmsco.py from a project file which defines the calculation project.
the project file must be a regular Python module and define:
The command can run a calculation job directly or submit it to a job queue
via the `schedule` section in the run-file.
The program detects whether it runs in a single-process or OpenMPI multi-process environment
and coordinates parallel processes automatically.
1) a project class derived from pmsco.project.Project.
the class implements/overrides all necessary methods of the calculation project,
in particular create_model_space, create_cluster, and create_params.
2) a global function named create_project.
the function accepts a namespace object from the argument parser.
it may evaluate extra, project-specific arguments.
it does not need to evaluate the common parameters described below.
the function must return an instance of the project class described above.
3) main code that parses the command line and calls pmsco.pmsco.main_pmsco().
(see the projects folder for examples).
All arguments should preferably be declared in the run-file.
A small number of options can be passed on the command line
to override the corresponding parameter of the run-file.
Please see the documentation compiled in docs/html/index.html
for instructions on how to set up a project module and run-files.
See also the projects folder for examples.
""")
# the required argument list may depend on the calculation mode.
# for simplicity, the parser does not check these requirements.
# all parameters are optional and accepted regardless of mode.
# errors may occur if implicit requirements are not met.
parser.add_argument('project_module', nargs='?',
help="path to custom module that defines the calculation project")
parser.add_argument('-r', '--run-file',
help="path to run-time parameters file which contains all program arguments. " +
"must be in JSON format.")
parser.add_argument('-m', '--mode',
choices=['single', 'grid', 'swarm', 'genetic', 'table'],
help='calculation mode')
parser.add_argument('-d', '--data-dir',
help='directory path for experimental data files (if required by project). ' +
'default: working directory')
parser.add_argument('-o', '--output-file',
help='base path for intermediate and output files.')
parser.add_argument('-b', '--db-file',
help='name of an sqlite3 database file where the results should be stored.')
parser.add_argument('-k', '--keep-files', nargs='*',
choices=KEEP_FILES_CHOICES,
help='output file categories to keep after the calculation. '
'by default, cluster and model (simulated data) '
'of a limited number of best models are kept.')
parser.add_argument('--keep-best', type=int,
help='number of best models for which to keep result files '
'(at each node from root down to keep-levels).')
parser.add_argument('--keep-levels', type=int, choices=range(5),
help='task level down to which result files of best models are kept. '
'0 = model, 1 = scan, 2 = domain, 3 = emitter, 4 = region.')
parser.add_argument('-t', '--time-limit', type=float,
help='wall time limit in hours. the optimizers try to finish before the limit.')
parser.add_argument('--log-file',
help='name of the main log file. ' +
'under MPI, the rank of the process is inserted before the extension.')
parser.add_argument('--log-level',
help='minimum level of log messages. DEBUG, INFO, WARNING, ERROR, CRITICAL.')
feature_parser = parser.add_mutually_exclusive_group(required=False)
feature_parser.add_argument('--log-enable', dest='log_enable', action="store_true",
help="enable logging. by default, logging is on.")
feature_parser.add_argument('--log-disable', dest='log_enable', action='store_false',
help="disable logging. by default, logging is on.")
parser.set_defaults(log_enable=True)
help="Path to a run-file in JSON format which contains all calculation parameters. "
"This argument is mandatory. "
)
parser.add_argument('-m', '--module',
help="File name of the custom project module. "
"The module must declare the project class and other project-specific classes. "
"This optional argument overrides the __module__ entry of the run-file. "
)
parser.add_argument('-c', '--project-class',
help="Project class. Requires --module to be specified. "
"The project class is resolved in the namespace of the module. "
"This optional argument corresponds to the __class__ entry of the run-file. "
)
parser.add_argument('-o', '--output-dir',
help="Output directory. "
"This optional argument overrides the directories['output'] entry of the run-file."
)
parser.add_argument('-j', '--job-name',
help="Job name. Should be short and valid as a part of directory and file names. "
"If a persistent database is used, it must not exist in the database yet. "
"This optional argument overrides the job_name of the run-file."
)
return parser
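For illustration, the parser can also be driven programmatically; the file and job names below are placeholders:
# hypothetical programmatic use of the argument parser defined above
parser = get_cli_parser()
args, unknown = parser.parse_known_args(
    ["-r", "runs/demo.json", "-o", "work/demo01", "-j", "demo01"])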
@@ -362,129 +428,52 @@ def parse_cli():
return args, unknown_args
def import_module(module_name):
def main(symbols: typing.Optional[typing.Dict[str, typing.Any]] = None):
"""
import a custom module by name.
Main function with command line parsing
import a module given its file path or module name (like in an import statement).
This function starts the whole process with parameters from the command line.
preferably, the module name should be given as in an import statement.
as the top-level pmsco directory is on the python path,
the module name will begin with `projects` for a custom project module or `pmsco` for a core pmsco module.
in this case, the function just calls importlib.import_module.
If the command line contains a run-file parameter, it determines the project class and the project parameters.
if a file path is given, i.e., `module_name` links to an existing file and has a `.py` extension,
the function extracts the directory path,
inserts it into the python path,
and calls importlib.import_module on the stem of the file name.
@note the file path remains in the python path.
this option should be used carefully to avoid breaking file name resolution.
@param module_name: file path or module name.
file path is interpreted relative to the working directory.
@return: the loaded module as a python object
"""
p = Path(module_name)
if p.is_file() and p.suffix == ".py":
path = p.parent.resolve()
module_name = p.stem
if path not in sys.path:
sys.path.insert(0, path)
module = importlib.import_module(module_name)
return module
def main_dict(run_params):
"""
main function with dictionary run-time parameters
this starts the whole process with all direct parameters.
the command line is not parsed.
no run-file is loaded (just the project module).
@param run_params: dictionary with the same structure as the JSON run-file.
The project class can be specified either in the run-file, on the command line, or in the function arguments.
If the run-file specifies a class name, that class is instantiated.
@return: None
"""
project_params = run_params['project']
module = importlib.import_module(project_params['__module__'])
try:
project_class = getattr(module, project_params['__class__'])
except KeyError:
project = module.create_project()
else:
project = project_class()
project._module = module
project.directories['pmsco'] = Path(__file__).parent
project.directories['project'] = Path(module.__file__).parent
project.set_properties(module, project_params, project)
run_project(project)
def main():
"""
main function with command line parsing
this function starts the whole process with parameters from the command line.
if the command line contains a run-file parameter, it determines the module to load and the project parameters.
otherwise, the command line parameters apply.
the project class can be specified either in the run-file or the project module.
if the run-file specifies a class name, that class is looked up in the project module and instantiated.
otherwise, the module's create_project is called.
@return: None
"""
args, unknown_args = parse_cli()
try:
with open(args.run_file, 'r') as f:
rf = json.load(f)
rf = _load_runfile(args.run_file)
except AttributeError:
rfp = {'__module__': args.project_module}
else:
rfp = rf['project']
module = import_module(rfp['__module__'])
try:
project_args = module.parse_project_args(unknown_args)
except AttributeError:
project_args = None
rf = {'project': {}}
try:
project_class = getattr(module, rfp['__class__'])
except (AttributeError, KeyError):
project = module.create_project()
else:
project = project_class()
project_args = None
project._module = module
project.directories['pmsco'] = Path(__file__).parent
project.directories['project'] = Path(module.__file__).parent
project.set_properties(module, rfp, project)
set_common_args(project, args)
try:
if project_args:
module.set_project_args(project, project_args)
if args.module:
rf['project']['__module__'] = args.module
except AttributeError:
pass
try:
schedule_enabled = rf['schedule']['enabled']
except KeyError:
schedule_enabled = False
if schedule_enabled:
schedule_project(project, rf)
else:
run_project(project)
if args.project_class:
rf['project']['__class__'] = args.project_class
except AttributeError:
pass
try:
if args.output_dir:
rf['project']['directories']['output'] = args.output_dir
except (AttributeError, KeyError):
pass
try:
if args.job_name:
rf['project']['job_name'] = args.job_name
except (AttributeError, KeyError):
pass
main_project(symbols=symbols, runfile=rf)
if __name__ == '__main__':