public release 3.0.0 - see README and CHANGES for details

2021-02-09 12:46:20 +01:00
parent 2b3dbd8bac
commit ef781e2db4
46 changed files with 4390 additions and 1655 deletions

View File

@ -8,16 +8,13 @@ python pmsco [pmsco-arguments]
@endverbatim
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from pathlib import Path
import sys
import os.path
file_dir = os.path.dirname(__file__) or '.'
root_dir = os.path.join(file_dir, '..')
root_dir = os.path.abspath(root_dir)
sys.path[0] = root_dir
pmsco_root = Path(__file__).resolve().parent.parent
if str(pmsco_root) not in sys.path:
sys.path.insert(0, str(pmsco_root))
if __name__ == '__main__':
import pmsco.pmsco

View File

@ -13,8 +13,9 @@ SHELL=/bin/sh
.PHONY: all clean phagen
FC?=gfortran
FCOPTS?=-std=legacy
F2PY?=f2py
F2PYOPTS?=
F2PYOPTS?=--f77flags=-std=legacy --f90flags=-std=legacy
CC?=gcc
CCOPTS?=
SWIG?=swig

View File

@ -17,22 +17,20 @@ pip install --user periodictable
@author Matthias Muntwiler
@copyright (c) 2015-20 by Paul Scherrer Institut @n
@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import periodictable as pt
import sys
import pmsco.config as config
## default file format identifier
FMT_DEFAULT = 0
## MSC file format identifier
@ -227,13 +225,13 @@ class Cluster(object):
"""
self.rmax = r
def build_element(self, index, element_number, x, y, z, emitter, charge=0., scatterer_class=0):
def build_element(self, index, element, x, y, z, emitter, charge=0., scatterer_class=0):
"""
build a tuple in the format of the internal data array.
@param index: (int) index
@param element_number: (int) chemical element number
@param element: chemical element number (int) or symbol (str)
@param x, y, z: (float) atom coordinates in the cluster
@ -243,7 +241,13 @@ class Cluster(object):
@param scatterer_class: (int) scatterer class. default = 0.
"""
symbol = pt.elements[element_number].symbol
try:
element_number = int(element)
symbol = pt.elements[element_number].symbol
except ValueError:
symbol = element
element_number = pt.elements.symbol(symbol.strip()).number
element = (index, element_number, symbol, scatterer_class, x, y, z, int(emitter), charge)
return element
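
For illustration, a hedged usage sketch of the new signature (assuming a bare Cluster() constructor; both forms should produce the same internal tuple):

from pmsco.cluster import Cluster

clu = Cluster()
t_num = clu.build_element(0, 29, 0.0, 0.0, 0.0, 1)    # element given by atomic number
t_sym = clu.build_element(0, 'Cu', 0.0, 0.0, 0.0, 1)  # element given by chemical symbol
assert t_num == t_sym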
@ -251,7 +255,7 @@ class Cluster(object):
"""
add a single atom to the cluster.
@param atomtype: (int) chemical element number
@param atomtype: chemical element number (int) or symbol (str)
@param v_pos: (numpy.ndarray, shape = (3)) position vector
@ -274,7 +278,7 @@ class Cluster(object):
self.rmax (maximum distance from the origin).
all atoms are non-emitters.
@param atomtype: (int) chemical element number
@param atomtype: chemical element number (int) or symbol (str)
@param v_pos: (numpy.ndarray, shape = (3))
position vector of the first atom (basis vector)
@ -307,7 +311,7 @@ class Cluster(object):
and z_surf (position of the surface).
all atoms are non-emitters.
@param atomtype: (int) chemical element number
@param atomtype: chemical element number (int) or symbol (str)
@param v_pos: (numpy.ndarray, shape = (3))
position vector of the first atom (basis vector)
@ -1133,7 +1137,7 @@ class Cluster(object):
np.savetxt(f, data, fmt=file_format, header=header, comments="")
class ClusterGenerator(object):
class ClusterGenerator(config.ConfigurableObject):
"""
cluster generator class.
@ -1151,6 +1155,7 @@ class ClusterGenerator(object):
@param project: reference to the project object.
cluster generators may need to look up project parameters.
"""
super().__init__()
self.project = project
def count_emitters(self, model, index):
@ -1258,7 +1263,7 @@ class LegacyClusterGenerator(ClusterGenerator):
"""
def __init__(self, project):
super(LegacyClusterGenerator, self).__init__(project)
super().__init__(project)
def count_emitters(self, model, index):
"""

pmsco/config.py Normal file (120 lines)
View File

@ -0,0 +1,120 @@
"""
@package pmsco.config
infrastructure for configurable objects
@author Matthias Muntwiler
@copyright (c) 2021 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
import collections.abc
import functools
import inspect
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
def resolve_path(path, dirs):
"""
resolve a file path by replacing placeholders
placeholders are enclosed in curly braces.
values for all possible placeholders are provided in a dictionary.
@param path: str, Path or other path-like.
example: '{work}/test/testfile.dat'.
@param dirs: dictionary mapping placeholders to project paths.
the paths can be str, Path or other path-like
example: {'work': '/home/user/work'}
@return: pathlib.Path object
"""
return Path(*(p.format(**dirs) for p in Path(path).parts))
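
a usage sketch following the docstring example:

from pathlib import Path
from pmsco.config import resolve_path

dirs = {'work': '/home/user/work'}
p = resolve_path('{work}/test/testfile.dat', dirs)
assert p == Path('/home/user/work/test/testfile.dat')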
class ConfigurableObject(object):
"""
Parent class for objects that can be configured by a run file
the run file is a JSON file that contains object data in a nested dictionary structure.
in the dictionary structure the keys are property or attribute names of the object to be initialized.
keys starting with a non-alphabetic character (except for some special keys like __class__) are ignored.
these can be used as comments, or they protect private attributes.
the values can be numeric values, strings, lists or dictionaries.
simple values are assigned directly using setattr.
this may call a property setter if defined.
lists are iterated. each item is appended to the attribute.
the attribute must implement an append method in this case.
if an item is a dictionary and contains the special key '__class__',
an object of that class is instantiated and recursively initialized with the dictionary elements.
this requires that the class can be found in the module scope passed to the parser methods,
and that the class inherits from this class.
cases that can't be covered easily using this mechanism
should be implemented in a property setter.
value-checking should also be done in a property setter (or the append method in sequence-like objects).
"""
def __init__(self):
pass
def set_properties(self, module, data_dict, project):
"""
set properties of this class.
@param module: module reference that should be used to resolve class names.
this is usually the project module.
@param data_dict: dictionary of properties to set.
see the class description for details.
@param project: reference to the project object.
@return: None
"""
for key in data_dict:
if key[0].isalpha():
self.set_property(module, key, data_dict[key], project)
def set_property(self, module, key, value, project):
obj = self.parse_object(module, value, project)
if hasattr(self, key):
if obj is not None:
if isinstance(obj, collections.abc.MutableSequence):
attr = getattr(self, key)
for item in obj:
attr.append(item)
elif isinstance(obj, collections.abc.Mapping):
d = getattr(self, key)
if d is not None and isinstance(d, collections.abc.MutableMapping):
d.update(obj)
else:
setattr(self, key, obj)
else:
setattr(self, key, obj)
else:
setattr(self, key, obj)
else:
logger.warning(f"class {self.__class__.__name__} does not have attribute {key}.")
def parse_object(self, module, value, project):
if isinstance(value, collections.abc.MutableMapping) and "__class__" in value:
cn = value["__class__"].split('.')
c = functools.reduce(getattr, cn, module)
s = inspect.signature(c)
if 'project' in s.parameters:
o = c(project=project)
else:
o = c()
o.set_properties(module, value, project)
elif isinstance(value, collections.abc.MutableSequence):
o = [self.parse_object(module, i, project) for i in value]
else:
o = value
return o
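
to illustrate the mechanism, a minimal sketch of a run-file dictionary driving object initialization; the Instrument and Detector classes here are hypothetical stand-ins, not part of pmsco:

import sys
import pmsco.config as config

class Detector(config.ConfigurableObject):
    def __init__(self, project=None):
        super().__init__()
        self.polar = 0.0

class Instrument(config.ConfigurableObject):
    def __init__(self):
        super().__init__()
        self.name = ""
        self.detector = None

data = {"name": "example",
        "_comment": "keys starting with a non-alphabetic character are ignored",
        "detector": {"__class__": "Detector", "polar": 60.0}}

module = sys.modules[__name__]  # scope used to resolve '__class__' names
obj = Instrument()
obj.set_properties(module, data, None)  # no project reference needed in this sketch
assert obj.detector.polar == 60.0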

View File

@ -4,16 +4,13 @@ calculation dispatcher.
@author Matthias Muntwiler
@copyright (c) 2015 by Paul Scherrer Institut @n
@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import os.path
import datetime
@ -21,10 +18,20 @@ import signal
import collections
import copy
import logging
import math
from attrdict import AttrDict
from mpi4py import MPI
try:
from mpi4py import MPI
mpi_comm = MPI.COMM_WORLD
mpi_size = mpi_comm.Get_size()
mpi_rank = mpi_comm.Get_rank()
except ImportError:
MPI = None
mpi_comm = None
mpi_size = 1
mpi_rank = 0
from pmsco.helpers import BraceMessage as BMsg
logger = logging.getLogger(__name__)
@ -521,8 +528,7 @@ class MscoProcess(object):
#
# the default is 2 days after start.
def __init__(self, comm):
self._comm = comm
def __init__(self):
self._project = None
self._atomic_scattering = None
self._multiple_scattering = None
@ -829,12 +835,12 @@ class MscoMaster(MscoProcess):
# the values are handlers.TaskHandler objects.
# the objects can be accessed in attribute or dictionary notation.
def __init__(self, comm):
super(MscoMaster, self).__init__(comm)
def __init__(self):
super().__init__()
self._pending_tasks = collections.OrderedDict()
self._running_tasks = collections.OrderedDict()
self._complete_tasks = collections.OrderedDict()
self._slaves = self._comm.Get_size() - 1
self._slaves = mpi_size - 1
self._idle_ranks = []
self.max_calculations = 1000000
self._calculations = 0
@ -879,8 +885,8 @@ class MscoMaster(MscoProcess):
self._idle_ranks = list(range(1, self._running_slaves + 1))
self._root_task = CalculationTask()
self._root_task.file_root = project.output_file
self._root_task.model = project.create_model_space().start
self._root_task.file_root = str(project.output_file)
self._root_task.model = project.model_space.start
for level in self.task_levels:
self.task_handlers[level] = project.handler_classes[level]()
@ -1033,7 +1039,7 @@ class MscoMaster(MscoProcess):
else:
logger.debug("assigning task %s to rank %u", str(task.id), rank)
self._running_tasks[task.id] = task
self._comm.send(task.get_mpi_message(), dest=rank, tag=TAG_NEW_TASK)
mpi_comm.send(task.get_mpi_message(), dest=rank, tag=TAG_NEW_TASK)
self._calculations += 1
else:
if not self._finishing:
@ -1055,7 +1061,7 @@ class MscoMaster(MscoProcess):
while self._idle_ranks:
rank = self._idle_ranks.pop()
logger.debug("send finish tag to rank %u", rank)
self._comm.send(None, dest=rank, tag=TAG_FINISH)
mpi_comm.send(None, dest=rank, tag=TAG_FINISH)
self._running_slaves -= 1
def _receive_result(self):
@ -1065,7 +1071,7 @@ class MscoMaster(MscoProcess):
if self._running_slaves > 0:
logger.debug("waiting for calculation result")
s = MPI.Status()
data = self._comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=s)
data = mpi_comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=s)
if s.tag == TAG_NEW_RESULT:
task_id = self._accept_task_done(data)
@ -1185,8 +1191,8 @@ class MscoSlave(MscoProcess):
#
# typically, a task is aborted when an exception is encountered.
def __init__(self, comm):
super(MscoSlave, self).__init__(comm)
def __init__(self):
super().__init__()
self._errors = 0
self._max_errors = 5
@ -1199,7 +1205,7 @@ class MscoSlave(MscoProcess):
self._running = True
while self._running:
logger.debug("waiting for message")
data = self._comm.recv(source=0, tag=MPI.ANY_TAG, status=s)
data = mpi_comm.recv(source=0, tag=MPI.ANY_TAG, status=s)
if s.tag == TAG_NEW_TASK:
logger.debug("received new task")
self.accept_task(data)
@ -1229,17 +1235,17 @@ class MscoSlave(MscoProcess):
logger.exception(BMsg("unhandled exception in calculation task {0}", task.id))
self._errors += 1
if self._errors <= self._max_errors:
self._comm.send(data, dest=0, tag=TAG_INVALID_RESULT)
mpi_comm.send(data, dest=0, tag=TAG_INVALID_RESULT)
else:
logger.error("too many exceptions, aborting")
self._running = False
self._comm.send(data, dest=0, tag=TAG_ERROR_ABORTING)
mpi_comm.send(data, dest=0, tag=TAG_ERROR_ABORTING)
else:
logger.debug(BMsg("sending result of task {0} to master", result.id))
self._comm.send(result.get_mpi_message(), dest=0, tag=TAG_NEW_RESULT)
mpi_comm.send(result.get_mpi_message(), dest=0, tag=TAG_NEW_RESULT)
def run_master(mpi_comm, project):
def run_master(project):
"""
initialize and run the master calculation loop.
@ -1251,25 +1257,25 @@ def run_master(mpi_comm, project):
if an unhandled exception occurs, this function aborts the MPI communicator, killing all MPI processes.
the caller will not have a chance to handle the exception.
@param mpi_comm: MPI communicator (mpi4py.MPI.COMM_WORLD).
@param project: project instance (sub-class of project.Project).
"""
try:
master = MscoMaster(mpi_comm)
master = MscoMaster()
master.setup(project)
master.run()
master.cleanup()
except (SystemExit, KeyboardInterrupt):
mpi_comm.Abort()
if mpi_comm:
mpi_comm.Abort()
raise
except Exception:
logger.exception("unhandled exception in master calculation loop.")
mpi_comm.Abort()
if mpi_comm:
mpi_comm.Abort()
raise
def run_slave(mpi_comm, project):
def run_slave(project):
"""
initialize and run the slave calculation loop.
@ -1282,12 +1288,10 @@ def run_slave(mpi_comm, project):
unless it is a SystemExit or KeyboardInterrupt (where we expect that the master also receives the signal),
the MPI communicator is aborted, killing all MPI processes.
@param mpi_comm: MPI communicator (mpi4py.MPI.COMM_WORLD).
@param project: project instance (sub-class of project.Project).
"""
try:
slave = MscoSlave(mpi_comm)
slave = MscoSlave()
slave.setup(project)
slave.run()
slave.cleanup()
@ -1295,7 +1299,8 @@ def run_slave(mpi_comm, project):
raise
except Exception:
logger.exception("unhandled exception in slave calculation loop.")
mpi_comm.Abort()
if mpi_comm:
mpi_comm.Abort()
raise
@ -1307,12 +1312,9 @@ def run_calculations(project):
@param project: project instance (sub-class of project.Project).
"""
mpi_comm = MPI.COMM_WORLD
mpi_rank = mpi_comm.Get_rank()
if mpi_rank == 0:
logger.debug("MPI rank %u setting up master loop", mpi_rank)
run_master(mpi_comm, project)
run_master(project)
else:
logger.debug("MPI rank %u setting up slave loop", mpi_rank)
run_slave(mpi_comm, project)
run_slave(project)

View File

@ -1,7 +0,0 @@
/* EDAC interface for other programs */
%module edac
%{
extern int run_script(char *scriptfile);
%}
extern int run_script(char *scriptfile);

File diff suppressed because it is too large

View File

@ -10,6 +10,8 @@ the binding energies are compiled from Gwyn Williams' web page
(https://userweb.jlab.org/~gwyn/ebindene.html).
please refer to the original web page or the x-ray data booklet
for original sources, definitions and remarks.
binding energies of gases are replaced by the respective values of a common compound
from the 'handbook of x-ray photoelectron spectroscopy' (physical electronics, inc., 1995).
usage
-----
@ -52,15 +54,47 @@ from pmsco.compat import open
index_energy = np.zeros(0)
index_number = np.zeros(0)
index_term = []
default_data_path = os.path.join(os.path.dirname(__file__), "bindingenergy.json")
def load_data():
data_path = os.path.join(os.path.dirname(__file__), "bindingenergy.json")
def load_data(data_path=None):
"""
load binding energy data from json file
the data file must be in the same format as generated by save_data.
@param data_path: file path of the data file. default: "bindingenergy.json" next to this module file
@return dictionary
"""
if data_path is None:
data_path = default_data_path
with open(data_path) as fp:
data = json.load(fp)
return data
def save_data(data_path=None):
"""
save binding energy data to json file
@param data_path: file path of the data file. default: "bindingenergy.json" next to this module file
@return None
"""
if data_path is None:
data_path = default_data_path
data = {}
for element in pt.elements:
element_data = {}
for term, energy in element.binding_energy.items():
element_data[term] = energy
if element_data:
data[element.number] = element_data
with open(data_path, 'w', 'utf8') as fp:
json.dump(data, fp, sort_keys=True, indent='\t')
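
a short usage sketch of the new optional data_path argument (the module path and the custom file location are assumptions):

import pmsco.elements.bindingenergy as be

data = be.load_data()                      # reads bindingenergy.json next to the module
be.save_data('/tmp/bindingenergy.json')    # writes the current table to a custom path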
def init(table, reload=False):
if 'binding_energy' in table.properties and not reload:
return
@ -142,6 +176,9 @@ def export_flat_text(f):
"""
export the binding energies to a flat general text file.
the file has four space-separated columns `number`, `symbol`, `term`, `energy`.
column names are included in the first row.
@param f: file path or open file object
@return: None
"""
@ -153,3 +190,23 @@ def export_flat_text(f):
else:
with open(f, "w") as fi:
export_flat_text(fi)
def import_flat_text(f):
"""
import binding energies from a flat general text file.
data is in space-separated columns.
the first row contains column names.
at least the columns `number`, `term`, `energy` must be present.
the function updates existing entries and appends entries of non-existing terms.
existing terms that are not listed in the file remain unchanged.
@param f: file path or open file object
@return: None
"""
data = np.atleast_1d(np.genfromtxt(f, names=True, dtype=None, encoding="utf8"))
for d in data:
pt.elements[d['number']].binding_energy[d['term']] = d['energy']
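
a round-trip sketch of the flat text interface (paths are assumptions; module path as above):

import pmsco.elements.bindingenergy as be

be.export_flat_text('/tmp/energies.txt')   # columns: number symbol term energy
# edit or extend the file, then merge it back into the periodic table:
be.import_flat_text('/tmp/energies.txt')   # updates or appends terms per element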

View File

@ -92,6 +92,8 @@ def get_cross_section(photon_energy, element, nlj):
@return: (float) cross section in Mb.
"""
nl = nlj[0:2]
if not hasattr(element, "photoionization"):
element = get_element(element)
try:
pet, cst = element.photoionization.cross_section[nl]
except KeyError:
@ -196,3 +198,11 @@ def plot_spectrum(photon_energy, elements, binding_energy=False, work_function=4
ax.set_ylabel('intensity')
ax.set_title(elements)
return fig, ax
def plot_cross_section(el, nlj):
"""
plot the photoionization cross section of a core level versus photon energy.
@param el: chemical element. @param nlj: (str) core level term, e.g. '2p3/2'.
@return: figure and axes objects.
"""
energy = np.arange(100, 1500, 140)
cs = get_cross_section(energy, el, nlj)
fig, ax = plt.subplots()
ax.set_yscale("log")
ax.plot(energy, cs)
return fig, ax
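
a hedged usage sketch (the module path and the chosen core level are assumptions):

import pmsco.elements.photoionization as pi

fig, ax = pi.plot_cross_section('Cu', '2p3/2')
fig.savefig('cu_2p_cross_section.png')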

View File

@ -0,0 +1,443 @@
"""
@package pmsco.graphics.population
graphics rendering module for population dynamics.
the main function is render_genetic_chart().
this module is experimental.
interface and implementation are subject to change.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2021 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
import logging
import numpy as np
import os
from pmsco.database import regular_params, special_params
logger = logging.getLogger(__name__)
try:
from matplotlib.figure import Figure
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
# from matplotlib.backends.backend_pdf import FigureCanvasPdf
# from matplotlib.backends.backend_svg import FigureCanvasSVG
except ImportError:
Figure = None
FigureCanvas = None
logger.warning("error importing matplotlib. graphics rendering disabled.")
def _default_range(pos):
"""
determine a default range from actual values.
@param pos: (numpy.ndarray) 1-dimensional structured array of parameter values.
@return: range_min, range_max are dictionaries of the minimum and maximum values of each parameter.
"""
names = regular_params(pos.dtype.names)
range_min = {}
range_max = {}
for name in names:
range_min[name] = pos[name].min()
range_max[name] = pos[name].max()
return range_min, range_max
def _prune_constant_params(pnames, range_min, range_max):
"""
remove constant parameters from the list and range
@param pnames: (list)
@param range_min: (dict)
@param range_max: (dict)
@return:
"""
del_names = [name for name in pnames if range_max[name] <= range_min[name]]
for name in del_names:
pnames.remove(name)
del range_min[name]
del range_max[name]
def render_genetic_chart(output_file, input_data_or_file, model_space=None, generations=None, title=None, cmap=None,
canvas=None):
"""
produce a genetic chart from a given population.
a genetic chart is a pseudo-colour representation of the coordinates of each individual in the model space.
the axes are the particle number and the model parameter.
the colour is mapped from the relative position of a parameter value within the parameter range.
the chart should illustrate the diversity in the population.
converged parameters will show similar colours.
by comparing charts of different generations, the effect of the optimization algorithm can be examined.
though the chart type is designed for the genetic algorithm, it may be useful for other algorithms as well.
the function requires input in one of the following forms:
- a result (.dat) file or numpy structured array.
the array must contain regular parameters, as well as the _particle and _gen columns.
the function generates one chart per generation unless the generation argument is specified.
- a population (.pop) file or numpy structured array.
the array must contain regular parameters, as well as the _particle column.
- a pmsco.optimizers.population.Population object with valid data.
the graphics file format can be changed by providing a specific canvas. default is PNG.
this function requires the matplotlib module.
if it is not available, the function raises an error.
@param output_file: path and base name of the output file without extension.
a generation index and the file extension according to the file format are appended.
@param input_data_or_file: a numpy structured ndarray of a population or result list from an optimization run.
alternatively, the file path of a result file (.dat) or population file (.pop) can be given.
file can be any object that numpy.genfromtxt() can handle.
@param model_space: model space can be a pmsco.project.ModelSpace object,
any object that contains the same min and max attributes as pmsco.project.ModelSpace,
or a dictionary with the two keys 'min' and 'max' that provide the corresponding ModelSpace dictionaries.
by default, the model space boundaries are derived from the input data.
if a model_space is specified, only the parameters listed in it are plotted.
@param generations: (int or sequence) generation index or list of indices.
this index is used in the output file name and for filtering input data by generation.
if the input data does not contain the generation, no filtering is applied.
by default, no filtering is applied, and one graph for each generation is produced.
@param title: (str) title of the chart.
the title is a {}-style format string, where {base} is the output file name and {gen} is the generation.
default: derived from file name.
@param cmap: (str) name of colour map supported by matplotlib.
default is 'jet'.
other good-looking options are 'PiYG', 'RdBu', 'RdYlGn', 'coolwarm'.
@param canvas: a FigureCanvas class reference from a matplotlib backend.
if None, the default FigureCanvasAgg is used which produces a bitmap file in PNG format.
some other options are:
matplotlib.backends.backend_pdf.FigureCanvasPdf or
matplotlib.backends.backend_svg.FigureCanvasSVG.
@return (str) path and name of the generated graphics file.
empty string if an error occurred.
@raise TypeError if matplotlib is not available.
"""
try:
pos = np.copy(input_data_or_file.pos)
range_min = input_data_or_file.model_min
range_max = input_data_or_file.model_max
generations = [input_data_or_file.generation]
except AttributeError:
try:
pos = np.atleast_1d(np.genfromtxt(input_data_or_file, names=True))
except TypeError:
pos = np.copy(input_data_or_file)
range_min, range_max = _default_range(pos)
pnames = regular_params(pos.dtype.names)
if model_space is not None:
try:
# a ModelSpace-like object
range_min = model_space.min
range_max = model_space.max
except AttributeError:
# a dictionary-like object
range_min = model_space['min']
range_max = model_space['max']
try:
pnames = range_min.keys()
except AttributeError:
pnames = range_min.dtype.names
pnames = list(pnames)
_prune_constant_params(pnames, range_min, range_max)
if generations is None:
try:
generations = np.unique(pos['_gen'])
except ValueError:
pass
files = []
path, base = os.path.split(output_file)
if generations is not None and len(generations):
if title is None:
title = "{base} gen {gen}"
for generation in generations:
idx = np.where(pos['_gen'] == generation)
gpos = pos[idx]
gtitle = title.format(base=base, gen=int(generation))
out_filename = "{base}-{gen}".format(base=os.fspath(output_file), gen=int(generation))
out_filename = _render_genetic_chart_2(out_filename, gpos, pnames, range_min, range_max,
gtitle, cmap, canvas)
files.append(out_filename)
else:
if title is None:
title = "{base}"
gtitle = title.format(base=base, gen="")
out_filename = "{base}".format(base=os.fspath(output_file))
out_filename = _render_genetic_chart_2(out_filename, pos, pnames, range_min, range_max, gtitle, cmap, canvas)
files.append(out_filename)
return files
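
a hedged usage sketch (file names are placeholders):

from pmsco.graphics.population import render_genetic_chart

# one chart per generation found in the result file
files = render_genetic_chart('run01-genetic', 'run01.dat')
# a single generation with a custom colour map
files = render_genetic_chart('run01-gen5', 'run01.dat', generations=[5], cmap='RdYlGn')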
def _render_genetic_chart_2(out_filename, pos, pnames, range_min, range_max, title, cmap, canvas):
"""
internal part of render_genetic_chart()
this function calculates the relative position in the model space,
sorts the positions array by particle index,
and calls plot_genetic_chart().
@param out_filename:
@param pos:
@param pnames:
@param range_max:
@param range_min:
@param cmap:
@param canvas:
@return: out_filename
"""
spos = np.sort(pos, order='_particle')
rpos2d = np.zeros((spos.shape[0], len(pnames)))
for index, pname in enumerate(pnames):
rpos2d[:, index] = (spos[pname] - range_min[pname]) / (range_max[pname] - range_min[pname])
out_filename = plot_genetic_chart(out_filename, rpos2d, pnames, title=title, cmap=cmap, canvas=canvas)
return out_filename
def plot_genetic_chart(filename, rpos2d, param_labels, title=None, cmap=None, canvas=None):
"""
produce a genetic chart from the given data.
a genetic chart is a pseudo-colour representation of the coordinates of each individual in the model space.
the chart should highlight the amount of diversity in the population
and - by comparing charts of different generations - the changes due to mutation.
the axes are the model parameter (x) and particle number (y).
the colour is mapped from the relative position of a parameter value within the parameter range.
in contrast to render_genetic_chart() this function contains only the drawing code.
it requires input in the final form and does not do any checks, conversion or processing.
the graphics file format can be changed by providing a specific canvas. default is PNG.
this function requires the matplotlib module.
if it is not available, the function raises an error.
@param filename: path and name of the output file without extension.
@param rpos2d: (two-dimensional numpy array of numeric type)
relative positions of the particles in the model space.
dimension 0 (y-axis) is the particle index,
dimension 1 (x-axis) is the parameter index (in the order given by param_labels).
all values must be between 0 and 1.
@param param_labels: (sequence) list or tuple of parameter names.
@param title: (str) string to be printed as chart title. default is 'genetic chart'.
@param cmap: (str) name of colour map supported by matplotlib.
default is 'jet'.
other good-looking options are 'PiYG', 'RdBu', 'RdYlGn', 'coolwarm'.
@param canvas: a FigureCanvas class reference from a matplotlib backend.
if None, the default FigureCanvasAgg is used which produces a bitmap file in PNG format.
some other options are:
matplotlib.backends.backend_pdf.FigureCanvasPdf or
matplotlib.backends.backend_svg.FigureCanvasSVG.
@raise TypeError if matplotlib is not available.
"""
if canvas is None:
canvas = FigureCanvas
if cmap is None:
cmap = 'jet'
if title is None:
title = 'genetic chart'
fig = Figure()
canvas(fig)
ax = fig.add_subplot(111)
im = ax.imshow(rpos2d, aspect='auto', cmap=cmap, origin='lower')
im.set_clim((0.0, 1.0))
ax.set_xticks(np.arange(len(param_labels)))
ax.set_xticklabels(param_labels, rotation=45, ha="right", rotation_mode="anchor")
ax.set_ylabel('particle')
ax.set_title(title)
cb = ax.figure.colorbar(im, ax=ax)
cb.ax.set_ylabel("relative value", rotation=-90, va="bottom")
out_filename = "{base}.{ext}".format(base=filename, ext=canvas.get_default_filetype())
fig.savefig(out_filename)
return out_filename
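
the drawing routine can also be exercised directly with synthetic data, for example:

import numpy as np
from pmsco.graphics.population import plot_genetic_chart

rpos2d = np.random.uniform(0., 1., (20, 3))   # 20 particles, 3 parameters
plot_genetic_chart('random-chart', rpos2d, ['dx', 'dz', 'theta'])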
def render_swarm(output_file, input_data, model_space=None, title=None, cmap=None, canvas=None):
"""
render a two-dimensional particle swarm population.
this function generates a schematic rendering of a particle swarm in two dimensions.
particles are represented by their position and velocity, indicated by an arrow.
the model space is projected on the first two (or selected two) variable parameters.
in the background, a scatter plot of results (dots with pseudocolor representing the R-factor) can be plotted.
the chart type is designed for the particle swarm optimization algorithm.
the function requires input in one of the following forms:
- position (.pos), velocity (.vel) and result (.dat) files or the respective numpy structured arrays.
the arrays must contain regular parameters, as well as the `_particle` column.
the result file must also contain an `_rfac` column.
- a pmsco.optimizers.population.Population object with valid data.
the graphics file format can be changed by providing a specific canvas. default is PNG.
this function requires the matplotlib module.
if it is not available, the function raises an error.
@param output_file: path and base name of the output file without extension.
a generation index and the file extension according to the file format are appended.
@param input_data: a pmsco.optimizers.population.Population object with valid data,
or a sequence of position, velocity and result arrays.
the arrays must be structured ndarrays corresponding to the respective Population members.
alternatively, the arrays can be referenced as file paths
in any format that numpy.genfromtxt() can handle.
@param model_space: model space can be a pmsco.project.ModelSpace object,
any object that contains the same min and max attributes as pmsco.project.ModelSpace,
or a dictionary with the two keys 'min' and 'max' that provide the corresponding ModelSpace dictionaries.
by default, the model space boundaries are derived from the input data.
if a model_space is specified, only the parameters listed in it are plotted.
@param title: (str) title of the chart.
the title is a {}-style format string, where {base} is the output file name and {gen} is the generation.
default: derived from file name.
@param cmap: (str) name of colour map supported by matplotlib.
default is 'plasma'.
other good-looking options are 'viridis', 'plasma', 'inferno', 'magma', 'cividis'.
@param canvas: a FigureCanvas class reference from a matplotlib backend.
if None, the default FigureCanvasAgg is used which produces a bitmap file in PNG format.
some other options are:
matplotlib.backends.backend_pdf.FigureCanvasPdf or
matplotlib.backends.backend_svg.FigureCanvasSVG.
@return (str) path and name of the generated graphics file.
empty string if an error occurred.
@raise TypeError if matplotlib is not available.
"""
try:
range_min = input_data.model_min
range_max = input_data.model_max
pos = np.copy(input_data.pos)
vel = np.copy(input_data.vel)
rfac = np.copy(input_data.results)
generation = input_data.generation
except AttributeError:
try:
pos = np.atleast_1d(np.genfromtxt(input_data[0], names=True))
vel = np.atleast_1d(np.genfromtxt(input_data[1], names=True))
rfac = np.atleast_1d(np.genfromtxt(input_data[2], names=True))
except TypeError:
pos = np.copy(input_data[0])
vel = np.copy(input_data[1])
rfac = np.copy(input_data[2])
range_min, range_max = _default_range(rfac)
pnames = regular_params(pos.dtype.names)
if model_space is not None:
try:
# a ModelSpace-like object
range_min = model_space.min
range_max = model_space.max
except AttributeError:
# a dictionary-like object
range_min = model_space['min']
range_max = model_space['max']
try:
pnames = range_min.keys()
except AttributeError:
pnames = range_min.dtype.names
pnames = list(pnames)
_prune_constant_params(pnames, range_min, range_max)
pnames = pnames[0:2]
files = []
if len(pnames) == 2:
params = {pnames[0]: [range_min[pnames[0]], range_max[pnames[0]]],
pnames[1]: [range_min[pnames[1]], range_max[pnames[1]]]}
out_filename = plot_swarm(output_file, pos, vel, rfac, params, title=title, cmap=cmap, canvas=canvas)
files.append(out_filename)
else:
logger.warning("model space must be two-dimensional and non-degenerate.")
return files
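
a hedged usage sketch with the file-based input form (paths are placeholders):

from pmsco.graphics.population import render_swarm

files = render_swarm('run01-swarm', ('run01.pos', 'run01.vel', 'run01.dat'))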
def plot_swarm(filename, pos, vel, rfac, params, title=None, cmap=None, canvas=None):
"""
plot a two-dimensional particle swarm population.
this is a sub-function of render_swarm() containing just the plotting commands.
the graphics file format can be changed by providing a specific canvas. default is PNG.
this function requires the matplotlib module.
if it is not available, the function raises an error.
@param filename: path and base name of the output file without extension.
a generation index and the file extension according to the file format are appended.
@param pos: structured ndarray containing the positions of the particles.
@param vel: structured ndarray containing the velocities of the particles.
@param rfac: structured ndarray containing positions and R-factor values.
this array is independent of pos and vel.
it can also be set to None if results should be suppressed.
@param params: dictionary of two parameters to be plotted.
the keys correspond to columns of the pos, vel and rfac arrays.
the values are lists [minimum, maximum] that define the axis range.
@param title: (str) title of the chart.
the title is a {}-style format string, where {base} is the output file name and {gen} is the generation.
default: derived from file name.
@param cmap: (str) name of colour map supported by matplotlib.
default is 'plasma'.
other good-looking options are 'viridis', 'plasma', 'inferno', 'magma', 'cividis'.
@param canvas: a FigureCanvas class reference from a matplotlib backend.
if None, the default FigureCanvasAgg is used which produces a bitmap file in PNG format.
some other options are:
matplotlib.backends.backend_pdf.FigureCanvasPdf or
matplotlib.backends.backend_svg.FigureCanvasSVG.
@return (str) path and name of the generated graphics file.
empty string if an error occurred.
@raise TypeError if matplotlib is not available.
"""
if canvas is None:
canvas = FigureCanvas
if cmap is None:
cmap = 'plasma'
if title is None:
title = 'swarm map'
pnames = list(params.keys())
fig = Figure()
canvas(fig)
ax = fig.add_subplot(111)
if rfac is not None:
try:
s = ax.scatter(rfac[pnames[0]], rfac[pnames[1]], s=5, c=rfac['_rfac'], cmap=cmap, vmin=0, vmax=1)
except ValueError:
# _rfac column missing
pass
else:
cb = ax.figure.colorbar(s, ax=ax)
cb.ax.set_ylabel("R-factor", rotation=-90, va="bottom")
p = ax.plot(pos[pnames[0]], pos[pnames[1]], 'co')
q = ax.quiver(pos[pnames[0]], pos[pnames[1]], vel[pnames[0]], vel[pnames[1]], color='c')
ax.set_xlim(params[pnames[0]])
ax.set_ylim(params[pnames[1]])
ax.set_xlabel(pnames[0])
ax.set_ylabel(pnames[1])
ax.set_title(title)
out_filename = "{base}.{ext}".format(base=filename, ext=canvas.get_default_filetype())
fig.savefig(out_filename)
return out_filename

View File

@ -7,16 +7,13 @@ interface and implementation are subject to change.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2018 by Paul Scherrer Institut @n
@copyright (c) 2018-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import math
import numpy as np
@ -135,9 +132,8 @@ def render_ea_scan(filename, data, scan_mode, canvas=None, is_modf=False):
im.set_cmap("RdBu_r")
dhi = max(abs(dlo), abs(dhi))
dlo = -dhi
im.set_clim((dlo, dhi))
im.set_clim((-1., 1.))
try:
# requires matplotlib 2.1.0
ti = cb.get_ticks()
ti = [min(ti), 0., max(ti)]
cb.set_ticks(ti)
@ -213,9 +209,8 @@ def render_tp_scan(filename, data, canvas=None, is_modf=False):
# im.set_cmap("coolwarm")
dhi = max(abs(dlo), abs(dhi))
dlo = -dhi
pc.set_clim((dlo, dhi))
pc.set_clim((-1., 1.))
try:
# requires matplotlib 2.1.0
ti = cb.get_ticks()
ti = [min(ti), 0., max(ti)]
cb.set_ticks(ti)
@ -226,9 +221,12 @@ def render_tp_scan(filename, data, canvas=None, is_modf=False):
# im.set_cmap("inferno")
# im.set_cmap("viridis")
pc.set_clim((dlo, dhi))
ti = cb.get_ticks()
ti = [min(ti), max(ti)]
cb.set_ticks(ti)
try:
ti = cb.get_ticks()
ti = [min(ti), max(ti)]
cb.set_ticks(ti)
except AttributeError:
pass
out_filename = "{0}.{1}".format(filename, canvas.get_default_filetype())
fig.savefig(out_filename)

View File

@ -40,23 +40,20 @@ the scan and domain handlers call methods of the project class to invoke project
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-18 by Paul Scherrer Institut @n
@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
from functools import reduce
import logging
import math
import numpy as np
import os
from pathlib import Path
from pmsco.compat import open
import pmsco.data as md
@ -377,7 +374,7 @@ class SingleModelHandler(ModelHandler):
keys = [key for key in self.result]
keys.sort(key=lambda t: t[0].lower())
vals = (str(self.result[key]) for key in keys)
filename = self._project.output_file + ".dat"
filename = Path(self._project.output_file).with_suffix(".dat")
with open(filename, "w") as outfile:
outfile.write("# ")
outfile.write(" ".join(keys))
@ -437,11 +434,11 @@ class ScanHandler(TaskHandler):
if project.combined_scan is not None:
ext = md.format_extension(project.combined_scan)
filename = project.output_file + ext
filename = Path(project.output_file).with_suffix(ext)
md.save_data(filename, project.combined_scan)
if project.combined_modf is not None:
ext = md.format_extension(project.combined_modf)
filename = project.output_file + ".modf" + ext
filename = Path(project.output_file).with_suffix(".modf" + ext)
md.save_data(filename, project.combined_modf)
return len(self._project.scans)
@ -695,7 +692,7 @@ class EmitterHandler(TaskHandler):
the estimate is based on the start parameters, scan 0 and domain 0.
"""
super(EmitterHandler, self).setup(project, slots)
mock_model = self._project.create_model_space().start
mock_model = self._project.model_space.start
mock_index = dispatch.CalcID(-1, 0, 0, -1, -1)
n_emitters = project.cluster_generator.count_emitters(mock_model, mock_index)
return n_emitters

View File

@ -304,7 +304,7 @@ class GridSearchHandler(handlers.ModelHandler):
super(GridSearchHandler, self).setup(project, slots)
self._pop = GridPopulation()
self._pop.setup(self._project.create_model_space())
self._pop.setup(self._project.model_space)
self._invalid_limit = max(slots, self._invalid_limit)
self._outfile = open(self._project.output_file + ".dat", "w")

View File

@ -554,7 +554,7 @@ class Population(object):
however, the patch is applied only upon the next execution of advance_population().
an info or warning message is printed to the log
depending on whether the filed contained a complete dataset or not.
depending on whether the file contained a complete dataset or not.
@attention patching a live population is a potentially dangerous operation.
it may cause an optimization to abort because of an error in the file.
@ -1209,7 +1209,7 @@ class PopulationHandler(handlers.ModelHandler):
return self._pop_size
def setup_population(self):
self._pop.setup(self._pop_size, self._project.create_model_space(), **self._project.optimizer_params)
self._pop.setup(self._pop_size, self._project.model_space, **self._project.optimizer_params)
def cleanup(self):
super(PopulationHandler, self).cleanup()

View File

@ -6,12 +6,12 @@ PEARL Multiple-Scattering Calculation and Structural Optimization
this is the top-level interface of the PMSCO package.
all calculations (any mode, any project) start by calling the run_project() function of this module.
the module also provides a command line parser for common options.
the module also provides a command line and a run-file/run-dict interface.
for parallel execution, prefix the command line with mpiexec -np NN, where NN is the number of processes to use.
note that in parallel mode, one process takes the role of the coordinator (master).
the master does not run calculations and is idle most of the time.
to benefit from parallel execution on a work station, NN should be the number of processors plus one.
to benefit from parallel execution on a work station, NN should be the number of processors.
on a cluster, the number of processes is chosen according to the available resources.
all calculations can also be run in a single process.
@ -25,26 +25,35 @@ refer to the projects folder for examples.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-18 by Paul Scherrer Institut @n
@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
from builtins import range
import datetime
import logging
import importlib
import os.path
import commentjson as json
from pathlib import Path
import sys
from mpi4py import MPI
try:
from mpi4py import MPI
mpi_comm = MPI.COMM_WORLD
mpi_size = mpi_comm.Get_size()
mpi_rank = mpi_comm.Get_rank()
except ImportError:
MPI = None
mpi_comm = None
mpi_size = 1
mpi_rank = 0
pmsco_root = Path(__file__).resolve().parent.parent
if str(pmsco_root) not in sys.path:
sys.path.insert(0, str(pmsco_root))
import pmsco.dispatch as dispatch
import pmsco.files as files
@ -71,40 +80,36 @@ def setup_logging(enable=False, filename="pmsco.log", level="WARNING"):
@param enable: (bool) True=enable logging to the specified file,
False=do not generate a log (null handler).
@param filename: (string) path and name of the log file.
@param filename: (Path-like) path and name of the log file.
if this process is part of an MPI communicator,
the function inserts a dot and the MPI rank of this process before the extension.
if the filename is empty, logging is disabled.
@param level: (string) name of the log level.
must be the name of one of "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL".
if empty or invalid, the function raises a ValueError.
if empty, logging is disabled.
if not a valid level, defaults to "WARNING".
@return None
"""
numeric_level = getattr(logging, level.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError('Invalid log level: %s' % level)
logger = logging.getLogger("")
logger.setLevel(numeric_level)
logformat = '%(asctime)s (%(name)s) %(levelname)s: %(message)s'
formatter = logging.Formatter(logformat)
enable = enable and str(filename) and level
numeric_level = getattr(logging, level.upper(), logging.WARNING)
root_logger = logging.getLogger()
root_logger.setLevel(numeric_level)
if enable:
mpi_comm = MPI.COMM_WORLD
mpi_size = mpi_comm.Get_size()
if mpi_size > 1:
mpi_rank = mpi_comm.Get_rank()
root, ext = os.path.splitext(filename)
filename = root + "." + str(mpi_rank) + ext
p = Path(filename)
filename = p.with_suffix(f".{mpi_rank}" + p.suffix)
log_format = '%(asctime)s (%(name)s) %(levelname)s: %(message)s'
formatter = logging.Formatter(log_format)
handler = logging.FileHandler(filename, mode="w", delay=True)
handler.setLevel(numeric_level)
handler.setFormatter(formatter)
else:
handler = logging.NullHandler()
logger.addHandler(handler)
root_logger.addHandler(handler)
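
per the revised behaviour, an invalid level name falls back to WARNING instead of raising; a sketch:

import pmsco.pmsco as pmsco_main

pmsco_main.setup_logging(enable=True, filename='test.log', level='chatty')  # unknown level: defaults to WARNING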
def set_common_args(project, args):
@ -124,67 +129,58 @@ def set_common_args(project, args):
@return: None
"""
log_file = "pmsco.log"
if args.data_dir:
project.data_dir = args.data_dir
if args.output_file:
project.set_output(args.output_file)
log_file = args.output_file + ".log"
project.output_file = args.output_file
if args.db_file:
project.db_file = args.db_file
if args.log_file:
log_file = args.log_file
setup_logging(enable=args.log_enable, filename=log_file, level=args.log_level)
logger.debug("creating project")
mode = args.mode.lower()
if mode in {'single', 'grid', 'swarm', 'genetic', 'table'}:
project.mode = mode
else:
logger.error("invalid optimization mode '%s'.", mode)
if args.pop_size:
project.optimizer_params['pop_size'] = args.pop_size
if args.seed_file:
project.optimizer_params['seed_file'] = args.seed_file
if args.seed_limit:
project.optimizer_params['seed_limit'] = args.seed_limit
if args.table_file:
project.optimizer_params['table_file'] = args.table_file
project.log_file = args.log_file
if args.log_level:
project.log_level = args.log_level
if not args.log_enable:
project.log_file = ""
project.log_level = ""
if args.mode:
project.mode = args.mode.lower()
if args.time_limit:
project.set_timedelta_limit(datetime.timedelta(hours=args.time_limit))
project.time_limit = args.time_limit
if args.keep_files:
if "all" in args.keep_files:
cats = set([])
else:
cats = files.FILE_CATEGORIES - set(args.keep_files)
cats -= {'report'}
if mode == 'single':
cats -= {'model'}
project.files.categories_to_delete = cats
if args.keep_levels > project.keep_levels:
project.keep_levels = args.keep_levels
if args.keep_best > project.keep_best:
project.keep_best = args.keep_best
project.keep_files = args.keep_files
if args.keep_levels:
project.keep_levels = max(args.keep_levels, project.keep_levels)
if args.keep_best:
project.keep_best = max(args.keep_best, project.keep_best)
def run_project(project):
"""
run a calculation project.
@param project:
@return:
the function sets up logging, validates the project, chooses the handler classes,
and passes control to the pmsco.dispatch module to run the calculations.
@param project: fully initialized project object.
the validate method is called as part of this function after setting up the logger.
@return: None
"""
# log project arguments only in rank 0
mpi_comm = MPI.COMM_WORLD
mpi_rank = mpi_comm.Get_rank()
log_file = Path(project.log_file)
if not log_file.name:
log_file = Path(project.job_name).with_suffix(".log")
if log_file.name:
log_file.parent.mkdir(exist_ok=True)
log_level = project.log_level
else:
log_level = ""
setup_logging(enable=bool(log_level), filename=log_file, level=log_level)
if mpi_rank == 0:
project.log_project_args()
project.validate()
optimizer_class = None
if project.mode == 'single':
optimizer_class = handlers.SingleModelHandler
@ -221,6 +217,34 @@ def run_project(project):
logger.error("undefined project, optimizer, or calculator.")
def schedule_project(project, run_dict):
"""
schedule a calculation project.
the function validates the project and submits a job to the scheduler.
@param project: fully initialized project object.
the validate method is called as part of this function.
@param run_dict: dictionary holding the contents of the run file.
@return: None
"""
assert mpi_rank == 0
setup_logging(enable=False)
project.validate()
schedule_dict = run_dict['schedule']
module = importlib.import_module(schedule_dict['__module__'])
schedule_class = getattr(module, schedule_dict['__class__'])
schedule = schedule_class(project)
schedule.set_properties(module, schedule_dict, project)
schedule.run_dict = run_dict
schedule.validate()
schedule.submit()
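
a sketch of the run-file structure consumed by this function; the scheduler module and class names are hypothetical, and the remaining schedule keys depend on the scheduler class:

run_dict = {
    'project': {'__module__': 'projects.myproject', 'mode': 'swarm'},
    'schedule': {
        'enabled': True,
        '__module__': 'pmsco.schedule',    # assumed module defining the scheduler
        '__class__': 'SlurmSchedule',      # hypothetical scheduler class
        '_comment': 'other keys are set as properties of the schedule object'
    }
}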
class Args(object):
"""
arguments of the main function.
@ -233,7 +257,7 @@ class Args(object):
values as the command line parser.
"""
def __init__(self, mode="single", output_file="pmsco_data"):
def __init__(self):
"""
constructor.
@ -242,12 +266,8 @@ class Args(object):
other parameters may be required depending on the project
and/or the calculation mode.
"""
self.mode = mode
self.pop_size = 0
self.seed_file = ""
self.seed_limit = 0
self.data_dir = ""
self.output_file = output_file
self.output_file = ""
self.db_file = ""
self.time_limit = 24.0
self.keep_files = files.FILE_CATEGORIES_TO_KEEP
@ -256,13 +276,9 @@ class Args(object):
self.log_level = "WARNING"
self.log_file = ""
self.log_enable = True
self.table_file = ""
def get_cli_parser(default_args=None):
if not default_args:
default_args = Args()
def get_cli_parser():
KEEP_FILES_CHOICES = files.FILE_CATEGORIES | {'all'}
parser = argparse.ArgumentParser(
@ -290,56 +306,45 @@ def get_cli_parser(default_args=None):
# for simplicity, the parser does not check these requirements.
# all parameters are optional and accepted regardless of mode.
# errors may occur if implicit requirements are not met.
parser.add_argument('project_module',
parser.add_argument('project_module', nargs='?',
help="path to custom module that defines the calculation project")
parser.add_argument('-m', '--mode', default=default_args.mode,
parser.add_argument('-r', '--run-file',
help="path to run-time parameters file which contains all program arguments. " +
"must be in JSON format.")
parser.add_argument('-m', '--mode',
choices=['single', 'grid', 'swarm', 'genetic', 'table'],
help='calculation mode')
parser.add_argument('--pop-size', type=int, default=default_args.pop_size,
help='population size (number of particles) in swarm or genetic optimization mode. ' +
'default is the greater of 4 or the number of calculation processes.')
parser.add_argument('--seed-file',
help='path and name of population seed file. ' +
'population data of previous optimizations can be used to seed a new optimization. ' +
'the file must have the same structure as the .pop or .dat files.')
parser.add_argument('--seed-limit', type=int, default=default_args.seed_limit,
help='maximum number of models to use from the seed file. ' +
'the models with the best R-factors are selected.')
parser.add_argument('-d', '--data-dir', default=default_args.data_dir,
parser.add_argument('-d', '--data-dir',
help='directory path for experimental data files (if required by project). ' +
'default: working directory')
parser.add_argument('-o', '--output-file', default=default_args.output_file,
parser.add_argument('-o', '--output-file',
help='base path for intermediate and output files.')
parser.add_argument('-b', '--db-file', default=default_args.db_file,
parser.add_argument('-b', '--db-file',
help='name of an sqlite3 database file where the results should be stored.')
parser.add_argument('--table-file',
help='path and name of population table file for table optimization mode. ' +
'the file must have the same structure as the .pop or .dat files.')
parser.add_argument('-k', '--keep-files', nargs='*', default=default_args.keep_files,
parser.add_argument('-k', '--keep-files', nargs='*',
choices=KEEP_FILES_CHOICES,
help='output file categories to keep after the calculation. '
'by default, cluster and model (simulated data) '
'of a limited number of best models are kept.')
parser.add_argument('--keep-best', type=int, default=default_args.keep_best,
parser.add_argument('--keep-best', type=int,
help='number of best models for which to keep result files '
'(at each node from root down to keep-levels).')
parser.add_argument('--keep-levels', type=int, choices=range(5),
default=default_args.keep_levels,
help='task level down to which result files of best models are kept. '
'0 = model, 1 = scan, 2 = domain, 3 = emitter, 4 = region.')
parser.add_argument('-t', '--time-limit', type=float, default=default_args.time_limit,
parser.add_argument('-t', '--time-limit', type=float,
help='wall time limit in hours. the optimizers try to finish before the limit.')
parser.add_argument('--log-file', default=default_args.log_file,
parser.add_argument('--log-file',
help='name of the main log file. ' +
'under MPI, the rank of the process is inserted before the extension.')
parser.add_argument('--log-level', default=default_args.log_level,
parser.add_argument('--log-level',
help='minimum level of log messages. DEBUG, INFO, WARNING, ERROR, CRITICAL.')
feature_parser = parser.add_mutually_exclusive_group(required=False)
feature_parser.add_argument('--log-enable', dest='log_enable', action="store_true",
help="enable logging. by default, logging is on.")
feature_parser.add_argument('--log-disable', dest='log_enable', action='store_false',
help="disable logging. by default, logging is on.")
parser.set_defaults(log_enable=default_args.log_enable)
parser.set_defaults(log_enable=True)
return parser
@ -350,52 +355,135 @@ def parse_cli():
@return: Namespace object created by the argument parser.
"""
default_args = Args()
parser = get_cli_parser(default_args)
parser = get_cli_parser()
args, unknown_args = parser.parse_known_args()
return args, unknown_args
def import_project_module(path):
def import_module(module_name):
"""
import the custom project module.
import a custom module by name.
imports the project module given its file path.
the path is expanded to its absolute form and appended to the python path.
import a module given its file path or module name (like in an import statement).
@param path: path and name of the module to be loaded.
path is optional and defaults to the python path.
if the name includes an extension, it is stripped off.
preferably, the module name should be given as in an import statement.
as the top-level pmsco directory is on the python path,
the module name will begin with `projects` for a custom project module or `pmsco` for a core pmsco module.
in this case, the function just calls importlib.import_module.
if a file path is given, i.e., `module_name` links to an existing file and has a `.py` extension,
the function extracts the directory path,
inserts it into the python path,
and calls importlib.import_module on the stem of the file name.
@note the file path remains in the python path.
this option should be used carefully to avoid breaking file name resolution.
@param module_name: file path or module name.
file path is interpreted relative to the working directory.
@return: the loaded module as a python object
"""
path, name = os.path.split(path)
name, __ = os.path.splitext(name)
path = os.path.abspath(path)
sys.path.append(path)
project_module = importlib.import_module(name)
return project_module
p = Path(module_name)
if p.is_file() and p.suffix == ".py":
path = p.parent.resolve()
module_name = p.stem
if str(path) not in sys.path:
sys.path.insert(0, str(path))
module = importlib.import_module(module_name)
return module
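
both invocation forms, per the docstring (module names are placeholders):

module = import_module('projects.myproject')      # module name on the python path
module = import_module('projects/myproject.py')   # file path relative to the working directory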
def main_dict(run_params):
"""
main function with dictionary run-time parameters
this starts the whole process with all direct parameters.
the command line is not parsed.
no run-file is loaded (just the project module).
@param run_params: dictionary with the same structure as the JSON run-file.
@return: None
"""
project_params = run_params['project']
module = importlib.import_module(project_params['__module__'])
try:
project_class = getattr(module, project_params['__class__'])
except KeyError:
project = module.create_project()
else:
project = project_class()
project._module = module
project.directories['pmsco'] = Path(__file__).parent
project.directories['project'] = Path(module.__file__).parent
project.set_properties(module, project_params, project)
run_project(project)
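
a minimal sketch of a direct call (the project module name is a placeholder):

import pmsco.pmsco as pmsco_main

run_params = {
    'project': {
        '__module__': 'projects.myproject',   # placeholder; must define create_project()
        'mode': 'single',
        'output_file': 'run01'
    }
}
pmsco_main.main_dict(run_params)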
def main():
"""
main function with command line parsing
this function starts the whole process with parameters from the command line.
if the command line contains a run-file parameter, it determines the module to load and the project parameters.
otherwise, the command line parameters apply.
the project class can be specified either in the run-file or the project module.
if the run-file specifies a class name, that class is looked up in the project module and instantiated.
otherwise, the module's create_project is called.
@return: None
"""
args, unknown_args = parse_cli()
if args:
module = import_project_module(args.project_module)
try:
project_args = module.parse_project_args(unknown_args)
except NameError:
project_args = None
try:
with open(args.run_file, 'r') as f:
rf = json.load(f)
except (AttributeError, TypeError):
rfp = {'__module__': args.project_module}
else:
rfp = rf['project']
module = import_module(rfp['__module__'])
try:
project_args = module.parse_project_args(unknown_args)
except AttributeError:
project_args = None
try:
project_class = getattr(module, rfp['__class__'])
except (AttributeError, KeyError):
project = module.create_project()
set_common_args(project, args)
try:
module.set_project_args(project, project_args)
except NameError:
pass
else:
project = project_class()
project_args = None
project._module = module
project.directories['pmsco'] = Path(__file__).parent
project.directories['project'] = Path(module.__file__).parent
project.set_properties(module, rfp, project)
set_common_args(project, args)
try:
if project_args:
module.set_project_args(project, project_args)
except AttributeError:
pass
try:
schedule_enabled = rf['schedule']['enabled']
except (KeyError, NameError):
schedule_enabled = False
if schedule_enabled:
schedule_project(project, rf)
else:
run_project(project)
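# schematic invocation (the run-file name is hypothetical; the -r flag
# matches the job script generated in pmsco/schedule.py):
#
# python pmsco -r my_run.json
# mpirun python pmsco -r my_run.json # parallel run under MPI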


@ -19,36 +19,32 @@ the ModelSpace and CalculatorParams classes are typically used unchanged.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015 by Paul Scherrer Institut @n
@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import datetime
import git
import logging
import numpy as np
import os.path
from pathlib import Path
import socket
import sys
from pmsco.calculators.calculator import InternalAtomicCalculator
from pmsco.calculators.edac import EdacCalculator
import pmsco.cluster as mc
import pmsco.cluster
import pmsco.config as config
from pmsco.compat import open
import pmsco.data as md
import pmsco.database as database
import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
import pmsco.database
import pmsco.dispatch
import pmsco.files
import pmsco.handlers
from pmsco.helpers import BraceMessage as BMsg
logger = logging.getLogger(__name__)
@ -157,6 +153,34 @@ class ModelSpace(object):
"""
return ParamSpace(self.start[name], self.min[name], self.max[name], self.step[name])
def set_param_dict(self, d):
"""
initialize model space from dictionary.
@param d: dictionary with two levels:
the top level holds parameter names,
the second level the space descriptors 'start', 'min', 'max', 'step' and 'width'.
see add_param() for possible combinations.
@return: None
"""
self.__init__()
for k, v in d.items():
self.add_param(k, **v)
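# example (hypothetical parameter name):
#
# ms = ModelSpace()
# ms.set_param_dict({'dAB': {'start': 2.0, 'min': 1.8, 'max': 2.2, 'step': 0.05}})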
def get_param_dict(self):
"""
return model space parameters in dictionary form
the top level holds parameter names,
the second level the space descriptors 'start', 'min', 'max' and 'step'.
@return: dict
"""
d = {}
for name in self.start:
d[name] = {'start': self.start[name], 'min': self.min[name], 'max': self.max[name], 'step': self.step[name]}
return d
class CalculatorParams(object):
"""
@ -568,9 +592,166 @@ class Scan(object):
self.raw_data[dim] = grid[i].reshape(-1)
self.raw_data['i'] = 1
def load(self, dirs=None):
return self
class ScanKey(config.ConfigurableObject):
"""
create a Scan object based on a project-supplied dictionary
this class can be used in a run file to create a scan object based on the scan_dict attribute of the project.
this may be convenient if your project selects scans from a long list of data files
and you don't want to clutter up the run file with parameters that don't change.
to do so, set the key property to match an item of scan_dict.
the load method will look up the corresponding scan_dict item and construct the final Scan object.
"""
def __init__(self, project=None):
super().__init__()
self.key = ""
self.project = project
def load(self, dirs=None):
"""
load the selected scan as specified in the project's scan dictionary
the method uses ScanLoader or ScanCreator as an intermediate.
@return a new Scan object which contains the loaded data.
"""
scan_spec = self.project.scan_dict[self.key]
if 'positions' in scan_spec:
loader = ScanCreator()
else:
loader = ScanLoader()
for k, v in scan_spec.items():
setattr(loader, k, v)
scan = loader.load(dirs=dirs)
return scan
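# usage sketch (the dictionary content and key are hypothetical):
#
# project.scan_dict['holo'] = {'filename': '{data}/demo_hemi.etpi',
#                              'emitter': 'N', 'initial_state': '1s'}
# sk = ScanKey(project)
# sk.key = 'holo'
# scan = sk.load(dirs=project.directories)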
class ScanLoader(config.ConfigurableObject):
"""
create a Scan object from a data file reference
this class can be used in a run file to create a scan object from an experimental data file.
to do so, fill the properties with values as documented.
the load() method is called when the project is run.
"""
## @var filename (string)
# file name from which the scan should be loaded.
# the file name can contain a format specifier like {project} to include the base path.
## @var emitter (string)
# chemical symbol and, optionally following, further specification (chemical state, environment, ...)
# of photo-emitting atoms.
# the interpretation of this string is up to the project and its cluster generator.
# it should, however, always start with a chemical element symbol.
#
# examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.
## @var initial_state (string)
# nl term of initial state
#
# in the form expected by EDAC, for example: '2p1/2'
## @var is_modf (bool)
# declares whether the data file contains the modulation function rather than intensity values
#
# if false, the project will calculate a modulation function from the raw data
def __init__(self):
super().__init__()
self.filename = ""
self.emitter = ""
self.initial_state = "1s"
self.is_modf = False
def load(self, dirs=None):
"""
load the scan according to specification
create a new Scan object and load the file by calling Scan.import_scan_file().
@return a new Scan object which contains the loaded data file.
"""
scan = Scan()
filename = config.resolve_path(self.filename, dirs)
scan.import_scan_file(filename, self.emitter, self.initial_state)
if self.is_modf:
scan.modulation = scan.raw_data
return scan
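# usage sketch (the file name is hypothetical):
#
# sl = ScanLoader()
# sl.filename = "{data}/demo_hemi.etpi"
# sl.emitter = "N"
# sl.initial_state = "2p1/2"
# scan = sl.load(dirs=project.directories)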
class ScanCreator(config.ConfigurableObject):
"""
create a Scan object from string expressions
this class can be used in a run file to create a scan object from python expressions,
such as lists, ranges or numpy functions.
to do so, fill the properties with values as documented.
the load() method is called when the project is run.
@note the raw_data property of the scan cannot be filled this way.
thus, the class is useful in `single` calculation mode only.
"""
## @var filename (string)
# name of the file which should receive the scan data.
# the file name can contain a format specifier like {project} to include the base path.
## @var positions (dict)
# dictionary specifying the scan positions
#
# the dictionary must contain four keys: 'e', 't', 'p', 'a' representing the four scan axes.
# each key holds a string that contains a python expression.
# the string is evaluated using python's built-in eval() function.
# the expression must evaluate to an iterable object or numpy ndarray of the scan positions.
# the `np` namespace can be used to access numpy functions.
#
# example:
# the following dictionary generates a hemispherical scan
# self.positions = {'e': '100', 't': 'np.linspace(0, 90, 91)', 'p': 'range(0, 360, 2)', 'a': '0'}
## @var emitter (string)
# chemical symbol and, optionally following, further specification (chemical state, environment, ...)
# of photo-emitting atoms.
# the interpretation of this string is up to the project and its cluster generator.
# it should, however, always start with a chemical element symbol.
#
# examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.
## @var initial_state (string)
# nl term of initial state
#
# in the form expected by EDAC, for example: '2p1/2'
def __init__(self):
super().__init__()
self.filename = ""
self.positions = {'e': None, 't': None, 'p': None, 'a': None}
self.emitter = ""
self.initial_state = "1s"
def load(self, dirs=None):
"""
create the scan according to specification
@return a new Scan object which contains the created scan array.
"""
scan = Scan()
positions = {}
for axis in self.positions.keys():
positions[axis] = np.atleast_1d(np.asarray(eval(self.positions[axis])))
scan.define_scan(positions, self.emitter, self.initial_state)
scan.filename = config.resolve_path(self.filename, dirs)
return scan
# noinspection PyMethodMayBeStatic
class Project(object):
class Project(config.ConfigurableObject):
"""
base class of a calculation project.
@ -609,17 +790,18 @@ class Project(object):
#
## @var scans (list of Scan objects)
# list of experimental or scan files for which calculations are to be run.
# list of experimental scans for which calculations are to be run.
#
# the list must be populated by calling the add_scan() method.
# this should be done in the create_project() function, or through the command line arguments.
# during project initialization, this list must be populated with Scan, ScanLoader or ScanCreator objects.
# while Scan objects contain all scan data, the latter two classes contain only scan specifications
# which are expanded (i.e. files are loaded or arrays are calculated) just before the calculations start.
# the Project.add_scan() method is a short-cut to create the respective scan object from few arguments.
# before the calculation starts, all objects are converted into fully specified Scan objects
# and scan data is loaded or calculated.
#
# the modulation function is calculated internally.
# if your scan files contain the modulation function (as opposed to intensity),
# you must add the files in the create_project() function.
# the command line does not support loading modulation functions.
#
# @c scans must be considered read-only. use project methods to change it.
# there are two ways to fill this list:
# either the project code fills it as a part of its initialization (create_project),
# or the list is populated via the run-file.
## @var domains (list of arbitrary objects)
# list of domains for which calculations are to be run.
@ -661,28 +843,22 @@ class Project(object):
# set this argument to False only if the calculation is a continuation of a previous one
# without any changes to the code.
## @var data_dir
# directory path to experimental data.
## @var directories
# dictionary for various directory paths.
#
# the project should load experimental data (scan files) from this path.
# this attribute receives the --data-dir argument from the command line
# if the project parses the common arguments (pmsco.set_common_args).
#
# it is up to the project to define where to load scan files from.
# if the location of the files may depend on the machine or user account,
# the user may want to specify the data path on the command line.
## @var output_dir (string)
# directory path for data files produced during the calculation, including intermediate files.
# home: user's home directory.
# data: where to load experimental data (scan files) from.
# project: directory of the project module.
# output: where to write output and intermediate files.
# temp: for temporary files.
#
# output_dir and output_file are set at once by @ref set_output.
## @var output_file (string)
## @var output_file (Path)
# file name root for data files produced during the calculation, including intermediate files.
#
# the file name should include the path. the path must also be set in @ref output_dir.
#
# output_dir and output_file are set at once by @ref set_output.
# this is the concatenation of self.directories['output'] and self.job_name.
# assignment to this property will update the two basic attributes.
## @var db_file (string)
# name of an sqlite3 database file where the calculation results should be stored.
@ -694,14 +870,17 @@ class Project(object):
#
# the actual wall time may be longer by the remaining time of running calculations.
# running calculations will not be aborted.
#
# the time_limit property is an alternative representation as hours.
# reading and writing accesses timedelta_limit.
## @var combined_scan
# combined raw data from scans.
# updated by add_scan().
# updated by self.load_scans().
## @var combined_modf
# combined modulation function from scans.
# updated by add_scan().
# updated by self.load_scans().
## @var files
# list of all generated data files with metadata.
@ -741,14 +920,17 @@ class Project(object):
#
def __init__(self):
super().__init__()
self._module = None
self.mode = "single"
self.job_name = ""
self.job_name = "pmsco0"
self.job_tags = {}
self.git_hash = ""
self.description = ""
self.features = {}
self.cluster_format = mc.FMT_EDAC
self.cluster_generator = mc.LegacyClusterGenerator(self)
self.cluster_format = pmsco.cluster.FMT_EDAC
self.cluster_generator = pmsco.cluster.LegacyClusterGenerator(self)
self._model_space = None
self.scans = []
self.domains = []
self.optimizer_params = {
@ -758,39 +940,170 @@ class Project(object):
'recalc_seed': True,
'table_file': ""
}
self.data_dir = ""
self.output_dir = ""
self.output_file = "pmsco_data"
self.directories = {
"home": Path.home(),
"work": Path.cwd(),
"data": "",
"project": "",
"output": "",
"temp": ""}
self.log_file = ""
self.log_level = "WARNING"
self.db_file = ':memory:'
self.timedelta_limit = datetime.timedelta(days=1)
self.combined_scan = None
self.combined_modf = None
self.files = files.FileTracker()
self.files = pmsco.files.FileTracker()
self.keep_files = list(pmsco.files.FILE_CATEGORIES_TO_KEEP)
self.keep_levels = 1
self.keep_best = 10
self.handler_classes = {
'model': handlers.SingleModelHandler,
'scan': handlers.ScanHandler,
'domain': handlers.DomainHandler,
'emit': handlers.EmitterHandler,
'region': handlers.SingleRegionHandler
'model': pmsco.handlers.SingleModelHandler,
'scan': pmsco.handlers.ScanHandler,
'domain': pmsco.handlers.DomainHandler,
'emit': pmsco.handlers.EmitterHandler,
'region': pmsco.handlers.SingleRegionHandler
}
self.atomic_scattering_factory = InternalAtomicCalculator
self.multiple_scattering_factory = EdacCalculator
self._tasks_fields = []
self._db = database.ResultsDatabase()
self._db = pmsco.database.ResultsDatabase()
def validate(self):
"""
validate the project parameters before starting the calculations
the method checks and fixes attributes that may cause trouble or go unnoticed if they are wrong.
in addition, it fixes attributes which may be incomplete after loading a run-file.
failed critical checks raise an exception (AssertionError, AttributeError, KeyError, ValueError).
checks that cause an attribute to revert to its default are logged as warnings.
the following attributes are fixed silently:
- scattering factories that are declared as string are looked up in the project module.
- placeholders in the directories attribute are resolved.
- placeholders in the output_file attribute are resolved.
- output_file and output_dir are made consistent (so that output_file includes output_dir).
- the create_model_space() method is called if the model_space attribute is undefined.
- scan data are loaded.
@note to check the syntax of a run-file, set the calculation mode to 'validate' and run pmsco.
this will pass the validate method but will stop execution before calculations are started.
@raise AssertionError if a parameter is not correct.
@raise AttributeError if a class name cannot be resolved.
"""
assert self.mode in {"single", "swarm", "genetic", "grid", "table", "test", "validate"}
if isinstance(self.atomic_scattering_factory, str):
self.atomic_scattering_factory = getattr(self._module, self.atomic_scattering_factory)
if isinstance(self.multiple_scattering_factory, str):
self.multiple_scattering_factory = getattr(self._module, self.multiple_scattering_factory)
self.directories = {k: config.resolve_path(Path(v), self.directories) for k, v in self.directories.items()}
assert len(str(self.output_file))
d = config.resolve_path(self.directories['output'], self.directories)
f = config.resolve_path(self.output_file, self.directories)
self.output_file = Path(d, f)
self.directories['output'] = self.output_file.parent
if self._model_space is None or not self._model_space.start:
logger.warning("undefined model_space attribute, trying project's create_model_space")
self._model_space = self.create_model_space()
self.load_scans()
@property
def data_dir(self):
return self.directories['data']
@data_dir.setter
def data_dir(self, path):
self.directories['data'] = Path(path)
@property
def output_dir(self):
return self.directories['output']
@output_dir.setter
def output_dir(self, path):
self.directories['output'] = Path(path)
@property
def output_file(self):
return Path(self.directories['output'], self.job_name)
@output_file.setter
def output_file(self, filename):
"""
set path and base name of output file.
path is copied to the output_dir attribute.
the file stem is copied to the job_name attribute.
@param filename: (PathLike)
"""
p = Path(filename)
s = str(p.parent)
if s and s != ".":
self.directories['output'] = p.parent
s = str(p.stem)
if s:
self.job_name = s
else:
raise ValueError("invalid output file name")
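# example: assigning a path updates both underlying attributes
# (the file name is hypothetical):
#
# project.output_file = "calc/run1"
# # directories['output'] == Path("calc"), job_name == "run1"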
@property
def time_limit(self):
return self.timedelta_limit.total_seconds() / 3600
@time_limit.setter
def time_limit(self, hours):
self.timedelta_limit = datetime.timedelta(hours=hours)
def create_model_space(self):
"""
create a project.ModelSpace object which defines the allowed range for model parameters.
this method must be implemented by the actual project class.
the ModelSpace object must declare all model parameters used in the project.
there are three ways for a project to declare the model space:
1. implement the @ref create_model_space method.
this is the older way and may become deprecated in a future version.
2. assign a ModelSpace to the self.model_space property directly
(in the @ref validate method).
3. declare the model space in the run-file.
this method is called by the validate method only if self._model_space is undefined.
@return ModelSpace object
"""
return None
@property
def model_space(self):
"""
ModelSpace object that defines the allowed range for model parameters.
there are three ways for a project to declare the model space:
1. implement the @ref create_model_space method.
this is the older way and may become deprecated in a future version.
2. assign a ModelSpace to the self.model_space property directly
(in the @ref validate method).
3. declare the model space in the run-file.
initially, this property is None.
"""
return self._model_space
@model_space.setter
def model_space(self, value):
if isinstance(value, ModelSpace):
self._model_space = value
elif hasattr(value, 'items'):
self._model_space = ModelSpace()
self._model_space.set_param_dict(value)
else:
raise ValueError("incompatible object type")
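# example: declare the model space as a dictionary, which is converted
# via set_param_dict (the parameter name is hypothetical):
#
# project.model_space = {'dAB': {'start': 2.0, 'min': 1.8, 'max': 2.2, 'step': 0.05}}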
def create_params(self, model, index):
"""
create a CalculatorParams object given the model parameters and calculation index.
@ -816,11 +1129,15 @@ class Project(object):
self.combined_scan = None
self.combined_modf = None
def add_scan(self, filename, emitter, initial_state, is_modf=False, modf_model=None, positions=None):
def add_scan(self, filename, emitter, initial_state, is_modf=False, positions=None):
"""
add the file name of reference experiment and load it.
the extension must be one of msc_data.DATATYPES (case insensitive)
add a scan specification to the scans list.
this is a shortcut for adding a ScanCreator or ScanLoader object to the self.scans list.
the creator or loader are converted into full Scan objects just before the calculation starts
(in the self.setup() method).
the extension must be one of pmsco.data.DATATYPES (case insensitive)
corresponding to the meaning of the columns in the file.
caution: EDAC can only calculate equidistant, rectangular scans.
@ -831,9 +1148,6 @@ class Project(object):
* intensity vs theta, phi, or alpha
* intensity vs theta and phi (hemisphere or hologram scan)
the method calculates the modulation function if @c is_modf is @c False.
it also updates @c combined_scan and @c combined_modf which may be used as R-factor comparison targets.
@param filename: (string) file name of the experimental data, possibly including a path.
the file is not loaded when the optional positions argument is present,
but the filename may serve as basename for output files (e.g. modulation function).
@ -852,57 +1166,64 @@ class Project(object):
@param is_modf: (bool) declares whether the file contains the modulation function (True),
or intensity (False, default). In the latter case, the modulation function is calculated internally.
@param modf_model: (dict) model parameters to be passed to the modulation function.
@return (Scan) the new scan object (which is also a member of self.scans).
"""
scan = Scan()
if positions is not None:
scan.define_scan(positions, emitter, initial_state)
scan.filename = filename
scan = ScanCreator()
scan.positions = positions
else:
scan.import_scan_file(filename, emitter, initial_state)
scan = ScanLoader()
scan.is_modf = is_modf
scan.filename = filename
scan.emitter = emitter
scan.initial_state = initial_state
self.scans.append(scan)
if modf_model is None:
modf_model = {}
return scan
if scan.raw_data is not None:
if is_modf:
scan.modulation = scan.raw_data
else:
def load_scans(self):
"""
load all scan data.
initially, the self.scans list may contain objects of different classes (Scan, ScanLoader, ScanCreator)
depending on the project initialization.
this method loads all data, so that the scans list contains only Scan objects.
also, the self.combined_scan and self.combined_modf fields are calculated from the scans.
"""
has_raw_data = True
has_mod_func = True
loaded_scans = []
for idx, scan in enumerate(self.scans):
scan = scan.load(dirs=self.directories)
loaded_scans.append(scan)
if scan.modulation is None:
try:
scan.modulation = self.calc_modulation(scan.raw_data, modf_model)
scan.modulation = self.calc_modulation(scan.raw_data, self.model_space.start)
except ValueError:
logger.error("error calculating the modulation function of experimental data.")
scan.modulation = None
else:
scan.modulation = None
logger.error(f"error calculating the modulation function of scan {idx}.")
has_raw_data = has_raw_data and scan.raw_data is not None
has_mod_func = has_mod_func and scan.modulation is not None
self.scans = loaded_scans
if scan.raw_data is not None:
if self.combined_scan is not None:
dt = md.common_dtype((self.combined_scan, scan.raw_data))
d1 = md.restructure_data(self.combined_scan, dt)
d2 = md.restructure_data(scan.raw_data, dt)
self.combined_scan = np.hstack((d1, d2))
else:
self.combined_scan = scan.raw_data.copy()
if has_raw_data:
stack1 = [scan.raw_data for scan in self.scans]
dtype = md.common_dtype(stack1)
stack2 = [md.restructure_data(data, dtype) for data in stack1]
self.combined_scan = np.hstack(tuple(stack2))
else:
self.combined_scan = None
if scan.modulation is not None:
if self.combined_modf is not None:
dt = md.common_dtype((self.combined_modf, scan.modulation))
d1 = md.restructure_data(self.combined_modf, dt)
d2 = md.restructure_data(scan.modulation, dt)
self.combined_modf = np.hstack((d1, d2))
else:
self.combined_modf = scan.modulation.copy()
if has_mod_func:
stack1 = [scan.modulation for scan in self.scans]
dtype = md.common_dtype(stack1)
stack2 = [md.restructure_data(data, dtype) for data in stack1]
self.combined_modf = np.hstack(tuple(stack2))
else:
self.combined_modf = None
return scan
def clear_domains(self):
"""
clear domains.
@ -933,42 +1254,6 @@ class Project(object):
"""
self.domains.append(domain)
def set_output(self, filename):
"""
set path and base name of output file.
path and name are copied to the output_file attribute.
path is copied to the output_dir attribute.
if the path is missing, the destination is the current working directory.
"""
self.output_file = filename
path, name = os.path.split(filename)
self.output_dir = path
self.job_name = name
def set_timedelta_limit(self, timedelta, margin_minutes=10):
"""
set the walltime limit with a safety margin.
this method sets the internal self.timedelta_limit attribute.
by default, a safety margin of 10 minutes is subtracted from the main argument
in order to increase the probability that the process ends in time.
if this is not wanted, the project class may override the method and provide its own margin.
the method is typically called with the command line time limit from the main module.
@note the safety margin could be applied at various levels.
it is done here because it can easily be overridden by the project subclass.
to keep run scripts simple, the command line can be given the same time limit
as the job scheduler of the computing cluster.
@param timedelta: (datetime.timedelta) max. duration of the calculation process (wall time).
@param margin_minutes: (int) safety margin in minutes to subtract from timedelta.
"""
self.timedelta_limit = timedelta - datetime.timedelta(minutes=margin_minutes)
def log_project_args(self):
"""
send some common project attributes to the log.
@ -981,6 +1266,14 @@ class Project(object):
@return: None
"""
try:
for key in self.directories:
val = self.directories[key]
lev = logging.WARNING if val else logging.DEBUG
logger.log(lev, f"directories['{key}']: {val}")
logger.warning("output file: {0}".format(self.output_file))
logger.warning("database: {0}".format(self.db_file))
logger.warning("atomic scattering: {0}".format(self.atomic_scattering_factory))
logger.warning("multiple scattering: {0}".format(self.multiple_scattering_factory))
logger.warning("optimization mode: {0}".format(self.mode))
@ -990,15 +1283,11 @@ class Project(object):
lev = logging.WARNING if val else logging.DEBUG
logger.log(lev, "optimizer_params['{k}']: {v}".format(k=key, v=val))
logger.warning("data directory: {0}".format(self.data_dir))
logger.warning("output file: {0}".format(self.output_file))
logger.warning("database: {0}".format(self.db_file))
_files_to_keep = files.FILE_CATEGORIES - self.files.categories_to_delete
_files_to_keep = pmsco.files.FILE_CATEGORIES - self.files.categories_to_delete
logger.warning("intermediate files to keep: {0}".format(", ".join(_files_to_keep)))
for idx, scan in enumerate(self.scans):
logger.warning(f"scan {idx}: {scan.filename} ({scan.emitter} {scan.initial_state}")
logger.warning(f"scan {idx}: {scan.filename} ({scan.emitter} {scan.initial_state})")
for idx, dom in enumerate(self.domains):
logger.warning(f"domain {idx}: {dom}")
@ -1247,16 +1536,26 @@ class Project(object):
"""
self.git_hash = self.get_git_hash()
fields = ["rfac"]
fields.extend(dispatch.CalcID._fields)
fields.extend(pmsco.dispatch.CalcID._fields)
fields.append("secs")
fields = ["_" + f for f in fields]
mspace = self.create_model_space()
model_fields = list(mspace.start.keys())
model_fields = list(self.model_space.start.keys())
model_fields.sort(key=lambda name: name.lower())
fields.extend(model_fields)
self._tasks_fields = fields
with open(self.output_file + ".tasks.dat", "w") as outfile:
if 'all' in self.keep_files:
cats = set([])
else:
cats = pmsco.files.FILE_CATEGORIES - set(self.keep_files)
cats -= {'report'}
if self.mode == 'single':
cats -= {'model'}
self.files.categories_to_delete = cats
Path(self.output_file).parent.mkdir(parents=True, exist_ok=True)
tasks_file = Path(self.output_file).with_suffix(".tasks.dat")
with open(tasks_file, "w") as outfile:
outfile.write("# ")
outfile.write(" ".join(fields))
outfile.write("\n")
@ -1311,7 +1610,8 @@ class Project(object):
values_dict['_rfac'] = parent_task.rfac
values_dict['_secs'] = parent_task.time.total_seconds()
values_list = [values_dict[field] for field in self._tasks_fields]
with open(self.output_file + ".tasks.dat", "a") as outfile:
tasks_file = Path(self.output_file).with_suffix(".tasks.dat")
with open(tasks_file, "a") as outfile:
outfile.write(" ".join(format(value) for value in values_list) + "\n")
db_id = self._db.insert_result(parent_task.id, values_dict)
@ -1548,11 +1848,11 @@ class Project(object):
"""
_files = {}
xyz_filename = filename + ".xyz"
cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ)
cluster.save_to_file(xyz_filename, fmt=pmsco.cluster.FMT_XYZ)
_files[xyz_filename] = 'cluster'
xyz_filename = filename + ".emit.xyz"
cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ, emitters_only=True)
cluster.save_to_file(xyz_filename, fmt=pmsco.cluster.FMT_XYZ, emitters_only=True)
_files[xyz_filename] = 'cluster'
return _files

pmsco/schedule.py Normal file

@ -0,0 +1,309 @@
"""
@package pmsco.schedule
job schedule interface
this module defines common infrastructure to submit a pmsco calculation job to a job scheduler such as slurm.
the schedule can be defined as part of the run-file (see pmsco module).
users may derive sub-classes in a separate module to adapt to their own computing cluster.
the basic call sequence is:
1. create a schedule object.
2. initialize its properties with job parameters.
3. validate()
4. submit()
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
import collections.abc
import commentjson as json
import datetime
import logging
from pathlib import Path
import shutil
import subprocess
import pmsco.config
logger = logging.getLogger(__name__)
class JobSchedule(pmsco.config.ConfigurableObject):
"""
base class for job schedule
this class defines the abstract interface and some utilities.
derived classes may override any method, but should call the inherited method.
usage:
1. create object, assigning a project instance.
2. assign run_file.
3. call validate.
4. call submit.
this class' properties should not be listed in the run file - they will be overwritten.
"""
## @var enabled (bool)
#
# this parameter signals whether pmsco should schedule a job or run the calculation.
# it is not directly used by the schedule classes but by the pmsco module.
# it must be defined in the run file and set to true to submit the job to a scheduler.
# it is set to false in the run file copied to the job directory so that the job script starts the calculation.
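# run-file fragment (schematic; nodes, tasks_per_node and wall_time are
# properties of SlurmSchedule below, other keys depend on the derived class):
#
# "schedule": {"enabled": true, "nodes": 1, "tasks_per_node": 24, "wall_time": "12:00"}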
def __init__(self, project):
super(JobSchedule, self).__init__()
self.project = project
self.enabled = False
self.run_dict = {}
self.job_dir = Path()
self.job_file = Path()
self.run_file = Path()
# directory that contains the pmsco and projects directories
self.pmsco_root = Path(__file__).parent.parent
def validate(self):
"""
validate the job parameters.
make sure all object attributes are correct for submission.
@return: None
"""
self.pmsco_root = Path(self.project.directories['pmsco']).parent
output_dir = Path(self.project.directories['output'])
assert self.pmsco_root.is_dir()
assert (self.pmsco_root / "pmsco").is_dir()
assert (self.pmsco_root / "projects").is_dir()
assert output_dir.is_dir()
assert self.project.job_name
self.job_dir = output_dir / self.project.job_name
self.job_dir.mkdir(parents=True, exist_ok=True)
self.job_file = (self.job_dir / self.project.job_name).with_suffix(".sh")
self.run_file = (self.job_dir / self.project.job_name).with_suffix(".json")
def submit(self):
"""
submit the job to the scheduler.
in this base class, the method does the following:
1. copy source files
2. copy a patched version of the run file.
3. write the job file (_write_job_file must be implemented by a derived class).
@return: None
"""
self._copy_source()
self._fix_run_file()
self._write_run_file()
self._write_job_file()
def _copy_source(self):
"""
copy the source files to the job directory.
the pmsco_root and job_dir attributes must be correct.
the pmsco subdirectory of job_dir must not exist; it is created by the copy operation.
this is a utility method used internally by derived classes.
job_dir/pmsco/pmsco/**
job_dir/pmsco/projects/**
job_dir/job.sh
job_dir/job.json
@return: None
"""
source = self.pmsco_root
dest = self.job_dir / "pmsco"
ignore = shutil.ignore_patterns(".*", "~*", "*~")
shutil.copytree(source / "pmsco", dest / "pmsco", ignore=ignore)
shutil.copytree(source / "projects", dest / "projects", ignore=ignore)
def _fix_run_file(self):
"""
fix the run file.
patch some entries of self.run_dict so that it can be used as run file.
the following changes are made:
1. set schedule.enabled to false so that the calculation is run.
2. set the output directory to the job directory.
3. set the log file to the job directory.
@return: None
"""
self.run_dict['schedule']['enabled'] = False
self.run_dict['project']['directories']['output'] = str(self.job_dir)
self.run_dict['project']['log_file'] = str((self.job_dir / self.project.job_name).with_suffix(".log"))
def _write_run_file(self):
"""
copy the run file.
this is a JSON dump of self.run_dict to the self.run_file file.
@return: None
"""
with open(self.run_file, "wt") as f:
json.dump(self.run_dict, f, indent=2)
def _write_job_file(self):
"""
create the job script.
this method must be implemented by a derived class.
the script must be written to the self.job_file file.
don't forget to make the file executable.
@return: None
"""
pass
class SlurmSchedule(JobSchedule):
"""
job schedule for a slurm scheduler.
this class implements commonly used features of the slurm scheduler.
host-specific features and the creation of the job file should be done in a derived class.
derived classes must, in particular, implement the _write_job_file method.
they can override other methods, too, but should call the inherited method first.
1. copy the source trees (pmsco and projects) to the job directory
2. copy a patched version of the run file.
3. call the submission command
the public properties of this class should be assigned from the run file.
"""
def __init__(self, project):
super(SlurmSchedule, self).__init__(project)
self.host = ""
self.nodes = 1
self.tasks_per_node = 8
self.wall_time = datetime.timedelta(hours=1)
self.signal_time = 600
self.manual = True
@staticmethod
def parse_timedelta(td):
"""
parse time delta input formats
converts a string or dictionary from run-file into datetime.timedelta.
@param td:
str: [days-]hours[:minutes[:seconds]]
dict: days, hours, minutes, seconds - at least one needs to be defined. values must be numeric.
datetime.timedelta - native type
@return: datetime.timedelta
"""
if isinstance(td, str):
dt = {}
d = td.split("-")
if len(d) > 1:
dt['days'] = float(d.pop(0))
t = d[0].split(":")
try:
dt['hours'] = float(t.pop(0))
dt['minutes'] = float(t.pop(0))
dt['seconds'] = float(t.pop(0))
except (IndexError, ValueError):
pass
td = datetime.timedelta(**dt)
elif isinstance(td, collections.abc.Mapping):
td = datetime.timedelta(**td)
return td
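# examples:
#
# SlurmSchedule.parse_timedelta("1-12") # 1 day, 12 hours
# SlurmSchedule.parse_timedelta("2:30") # 2 hours, 30 minutes
# SlurmSchedule.parse_timedelta({"hours": 6}) # 6 hours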
def validate(self):
super(SlurmSchedule, self).validate()
self.wall_time = self.parse_timedelta(self.wall_time)
assert self.job_dir.is_absolute()
def submit(self):
"""
call the sbatch command
if manual is true, the job files are generated but the job is not submitted.
@return: None
"""
super(SlurmSchedule, self).submit()
args = ['sbatch', str(self.job_file)]
print(" ".join(args))
if self.manual:
print("manual run - job files created but not submitted")
else:
cp = subprocess.run(args)
cp.check_returncode()
class PsiRaSchedule(SlurmSchedule):
"""
job schedule for the Ra cluster at PSI.
this class selects specific features of the Ra cluster,
such as the partition and node type (24 or 32 cores).
it also implements the _write_job_file method.
"""
## @var partition (str)
#
# the partition is selected based on wall time and number of tasks by the validate() method.
# it should not be listed in the run file.
def __init__(self, project):
super(PsiRaSchedule, self).__init__(project)
self.partition = "shared"
def validate(self):
super(PsiRaSchedule, self).validate()
assert self.nodes <= 2
assert self.tasks_per_node <= 24 or self.tasks_per_node == 32
assert self.wall_time.total_seconds() >= 60
if self.wall_time.total_seconds() > 24 * 60 * 60:
self.partition = "week"
elif self.tasks_per_node < 24:
self.partition = "shared"
else:
self.partition = "day"
assert self.partition in ["day", "week", "shared"]
def _write_job_file(self):
lines = []
lines.append('#!/bin/bash')
lines.append('#SBATCH --export=NONE')
lines.append(f'#SBATCH --job-name="{self.project.job_name}"')
lines.append(f'#SBATCH --partition={self.partition}')
lines.append(f'#SBATCH --time={int(self.wall_time.total_seconds() / 60)}')
lines.append(f'#SBATCH --nodes={self.nodes}')
lines.append(f'#SBATCH --ntasks-per-node={self.tasks_per_node}')
if self.tasks_per_node > 24:
lines.append('#SBATCH --cores-per-socket=16')
# 0 - 65535 seconds
# currently, PMSCO does not react to signals properly
# lines.append(f'#SBATCH --signal=TERM@{self.signal_time}')
lines.append(f'#SBATCH --output="{self.project.job_name}.o.%j"')
lines.append(f'#SBATCH --error="{self.project.job_name}.e.%j"')
lines.append('module load psi-python36/4.4.0')
lines.append('module load gcc/4.8.5')
lines.append('module load openmpi/3.1.3')
lines.append('source activate pmsco')
lines.append(f'cd "{self.job_dir}"')
lines.append(f'mpirun python pmsco/pmsco -r {self.run_file.name}')
lines.append(f'cd "{self.job_dir}"')
lines.append('rm -rf pmsco')
lines.append('exit 0')
self.job_file.write_text("\n".join(lines))
self.job_file.chmod(0o755)