1671 lines
68 KiB
Python
1671 lines
68 KiB
Python
"""
|
|
@package pmsco.project
|
|
project-independent classes which store and handle model parameters.
|
|
|
|
the most important class defined here is Project.
|
|
each calculation project needs to derive its own project class from it.
|
|
the ModelSpace and CalculatorParams classes are typically used unchanged.
|
|
|
|
@note nomenclature: the term @e parameters has several meanings in the code and documentation.
|
|
the following distinctive terms are used in updated documentation sections.
|
|
ambiguous terms may still be present in older code sections.
|
|
@arg <em>calculation parameters</em> set of specific parameters passed as input to the calculation programs.
|
|
the amount and meaning of these parameters depend on the calculation code used.
|
|
typically, many of these parameters remain fixed, or change very rarely in the course of the study.
|
|
@arg <em>model parameters</em> concise set of independent physical parameters
|
|
that define the system in one calculation instance.
|
|
these parameters are varied systematically by the optimization process.
|
|
they are mapped to calculation parameters and a cluster by code derived from the Project class.
|
|
|
|
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
|
|
|
|
@copyright (c) 2015 by Paul Scherrer Institut @n
|
|
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import collections
|
|
import copy
|
|
import datetime
|
|
import git
|
|
import logging
|
|
import numpy as np
|
|
import os.path
|
|
import socket
|
|
import sys
|
|
|
|
from pmsco.calculators.calculator import InternalAtomicCalculator
|
|
from pmsco.calculators.edac import EdacCalculator
|
|
import pmsco.cluster as mc
|
|
from pmsco.compat import open
|
|
import pmsco.data as md
|
|
import pmsco.database as database
|
|
import pmsco.dispatch as dispatch
|
|
import pmsco.files as files
|
|
import pmsco.handlers as handlers
|
|
from pmsco.helpers import BraceMessage as BMsg
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
ParamSpace = collections.namedtuple('ParamSpace', ['start', 'min', 'max', 'step'])
|
|
|
|
|
|
class ModelSpace(object):
|
|
"""
|
|
Domain of model parameters.
|
|
|
|
Each member contains a dictionary of model parameter names and their values.
|
|
Parameter names can be defined almost freely by the project,
|
|
except that they should contain only alphanumeric and underscore characters.
|
|
furthermore, names starting with an underscore are reserved for the optimizers.
|
|
"""
|
|
|
|
## @var start (dict)
|
|
# dictionary of start values for each model parameter.
|
|
#
|
|
# the start value can be the initial guess for an optimization run,
|
|
# or the actual value for a single calculation.
|
|
#
|
|
# there must be one item for each model parameter,
|
|
# where the key is the name of the parameter, and the value its physical value.
|
|
|
|
## @var min (dict)
|
|
# dictionary of minimum values for each model parameter.
|
|
#
|
|
# the minimum defines the lower bound of the allowed interval for a model parameter.
|
|
#
|
|
# there must be one item for each model parameter,
|
|
# where the key is the name of the parameter, and the value its physical value.
|
|
|
|
## @var max (dict)
|
|
# dictionary of maximum values for each model parameter.
|
|
#
|
|
# the maximum defines the upper bound of the allowed interval for a model parameter.
|
|
#
|
|
# there must be one item for each model parameter,
|
|
# where the key is the name of the parameter, and the value its physical value.
|
|
|
|
## @var step (dict)
|
|
# dictionary of step sizes for each model parameter.
|
|
#
|
|
# depending on the optimization mode, the step is a guess of how fast values should vary,
|
|
# e.g. step size, gradient, velocity, ...
|
|
#
|
|
# there must be one item for each model parameter,
|
|
# where the key is the name of the parameter, and the value its physical value.
|
|
|
|
def __init__(self):
|
|
"""
|
|
initialize the domain object with empty dictionaries.
|
|
"""
|
|
self.start = {}
|
|
self.min = {}
|
|
self.max = {}
|
|
self.step = {}
|
|
|
|
def add_param(self, name, start, min=None, max=None, step=None, width=None):
|
|
"""
|
|
set the domain of one parameter with all necessary values at once.
|
|
|
|
the exact meaning of the arguments depends on the calculation mode.
|
|
|
|
@param name (string) name of the parameter (alphanumeric and underscore characters only).
|
|
it is recommended to use short but distinctive names.
|
|
|
|
@param start (float) start value.
|
|
|
|
@param min (float) lower bound of the parameter interval.
|
|
must be lower or equal to start.
|
|
if None, the field is set to start.
|
|
|
|
@param max (float) upper bound of the parameter interval.
|
|
must be greater or equal to start.
|
|
if None, the field is set to start.
|
|
|
|
@param width (float) width of the parameter interval.
|
|
instead of min and max, the interval can be set centered around the start value.
|
|
this is equivalent to min = start - width/2, max = start + width/2.
|
|
this argument overrides min and max. don't use both arguments.
|
|
|
|
@param step (float) step size.
|
|
must be greater or equal to zero.
|
|
if None, the field is set to zero.
|
|
"""
|
|
self.start[name] = start
|
|
self.min[name] = min if min is not None else start
|
|
self.max[name] = max if max is not None else start
|
|
if width is not None:
|
|
self.min[name] = start - width / 2.
|
|
self.max[name] = start + width / 2.
|
|
self.step[name] = step if step is not None else 0.0
|
|
|
|
def get_param(self, name):
|
|
"""
|
|
get all values of a model parameter in a tuple.
|
|
|
|
@param name (string) name of the parameter.
|
|
|
|
@return named tuple ParamSpace(start, min, max, step) of the parameter.
|
|
|
|
@raise IndexError if the parameter is not defined.
|
|
"""
|
|
return ParamSpace(self.start[name], self.min[name], self.max[name], self.step[name])
|
|
|
|
|
|
class CalculatorParams(object):
    """
    calculation parameters for a single scattering calculation job.

    this class holds all the calculation parameters that are passed via input file to the calculation program.

    the class can hold parameters for both the MSC and EDAC codes.
    some parameters are used by both codes, others are used just by one of them.
    newer features such as multiple emitters, multiple domains, and others are supported in EDAC mode only.
    MSC mode is currently not maintained.

    objects of this class are created by the implementation of the create_params() method
    of the actual project class.
    """

    ## @var angular_resolution (float)
    # FWHM angular resolution of the detector.
    # maps to the emission angle window (EDAC) or angular_broadening (MSC).

    ## @var binding_energy (float)
    # initial state binding energy with respect to the Fermi level in eV.

    ## @var initial_state (str)
    # initial state term, e.g. 1s, 2p, 2p1/2.

    ## @var phase_files (dict)
    # dictionary of phase or scattering matrix element files.
    #
    # keys are atomic numbers, values are file names.
    # whether the files contain phase shifts or matrix elements depends on the calculator;
    # EDAC determines the kind of information from the first line in the file.
    # if the dictionary is empty or the files don't exist,
    # the scattering matrix is computed by the calculator (if supported).
    # maps to scatterer (EDAC) or atomic_number, phase_file (MSC).

    ## @var phase_output_classes (int or iterable of int)
    # atom classes for which EDAC should export internally calculated scattering files.
    #
    # one of:
    # @arg None (default) no phase output,
    # @arg integer number defining a range 0:N-1 of atom classes,
    # @arg iterable (e.g., set or sequence) of atom classes to export.
    #
    # EDAC expects the user to list each atom class to export, but the number of
    # classes (between the number of elements and the number of atoms) is not known
    # before the calculation runs. leave at the default except for manual workflows.
    # a non-default value also makes EDAC write a cluster output that maps
    # atomic coordinates to atom classes.
    #
    # @note the files generated belong to the category "output".
    # you need to specify `--keep-files output` to prevent them from getting cleaned up.

    ## @var polarization (str)
    # photon polarization: 'H', 'V', 'L', 'R', 'U'.

    ## @var rme_files (dict)
    # dictionary of radial matrix element files.
    #
    # if the dictionary is empty or the files don't exist,
    # the radial matrix defaults to the rme_xxx_xxx attributes.
    # in EDAC, RME files or constants are considered only if @ref phase_files are specified.

    ## @var work_function (float)
    # work function in eV.
    #
    # the energy scale of EDAC is referenced to the vacuum level
    # but data files are referenced to the Fermi level.
    # the @ref pmsco.calculators.edac module adds the work function to the kinetic energy before it calls EDAC.

    def __init__(self):
        # identification
        self.title = "default parameters"
        self.comment = "set by project.CalculatorParams()"

        # file names exchanged with the calculator
        self.cluster_file = ""
        self.output_file = ""
        self.scan_file = ""

        # photoemission process
        self.initial_state = "1s"
        self.binding_energy = 0.0
        self.polarization = "H"
        self.angular_resolution = 1.0

        # sample and geometry
        self.z_surface = 0.0
        self.inner_potential = 10.0
        self.work_function = 0.0
        self.symmetry_range = 360.0
        self.polar_incidence_angle = 60.0
        self.azimuthal_incidence_angle = 0.0

        # thermal parameters
        self.experiment_temperature = 300.0
        self.debye_temperature = 400.0
        self.debye_wavevector = 1.0

        # scattering matrix input
        self.phase_files = {}
        self.rme_files = {}
        self.rme_minus_value = 0.1
        self.rme_minus_shift = 0.0
        self.rme_plus_value = 1.0
        self.rme_plus_shift = 0.0

        # used by MSC only
        self.spherical_order = 2
        self.scattering_level = 5
        self.fcut = 15.0
        self.cut = 15.0
        self.lattice_constant = 1.0
        self.msq_displacement = {}
        self.planewave_attenuation = 1.0
        self.vibration_model = "N"
        self.substrate_atomic_mass = 1.0

        # used by EDAC only
        self.emitters = [(0.0, 0.0, 0.0, 0)]
        self.lmax = 15
        self.dmax = 5.0
        self.orders = [20]
        self.phase_output_classes = None

    @property
    def l_init(self):
        """
        initial state l quantum number.

        this is converted from the initial_state property.

        @return: (int) 0..3
        """
        # second character of the term symbol is the orbital letter, e.g. '2p' -> 'p'.
        orbital_letter = self.initial_state[1]
        return "spdf".index(orbital_letter)
|
|
|
|
|
|
class Scan(object):
    """
    class to describe the scanning scheme or store the experimental data set.
    """

    ## @var filename (string)
    # file name from which a scan was loaded

    ## @var raw_data (numpy.ndarray)
    # original scan data (ETPAIS array)

    ## @var dtype (dict)
    # data type of self.raw_data.
    #
    # one of the data.DTYPE_Xxxx constants.

    ## @var modulation (numpy.ndarray)
    # modulation function calculated from original scan (ETPAIS array)

    ## @var mode (list of characters)
    # list of ETPAI column names which are scanned in self.raw_data.
    #
    # example: ['t','p']

    ## @var emitter (string)
    # chemical symbol and, optionally following, further specification (chemical state, environment, ...)
    # of photo-emitting atoms.
    # the interpretation of this string is up to the project and its cluster generator.
    # it should, however, always start with a chemical element symbol.
    #
    # examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.

    ## @var initial_state (string)
    # nl term of initial state
    #
    # in the form expected by EDAC, for example: '1s'

    ## @var energies (numpy.ndarray)
    # kinetic energy referenced to Fermi level.
    #
    # one-dimensional array.

    ## @var thetas (numpy.ndarray)
    # polar angle referenced to normal emission
    #
    # one-dimensional array.
    #
    # note: in the case of a hemispherical scan, the values in this array will not be unique.

    ## @var phis (numpy.ndarray)
    # azimuthal angle referenced to arbitrary origin
    #
    # one-dimensional array.
    #
    # note: in the case of a hemispherical scan, the values in this array will not be unique, and not monotonic.

    ## @var alphas (numpy.ndarray)
    # polar angle referenced to normal emission
    #
    # one-dimensional array.

    def __init__(self):
        self.filename = ""
        self.raw_data = None
        self.dtype = None
        self.modulation = None
        self.mode = []
        self.emitter = ""
        self.initial_state = "1s"
        # scan positions per dimension.
        # the energies/thetas/phis/alphas properties are views into this dictionary.
        self.positions = {
            'e': np.empty(0),
            't': np.empty(0),
            'p': np.empty(0),
            'a': np.empty(0),
        }

    @property
    def energies(self):
        return self.positions['e']

    @energies.setter
    def energies(self, value):
        self.positions['e'] = value

    @property
    def thetas(self):
        return self.positions['t']

    @thetas.setter
    def thetas(self, value):
        self.positions['t'] = value

    @property
    def phis(self):
        return self.positions['p']

    @phis.setter
    def phis(self, value):
        self.positions['p'] = value

    @property
    def alphas(self):
        return self.positions['a']

    @alphas.setter
    def alphas(self, value):
        self.positions['a'] = value

    def copy(self):
        """
        create a copy of the scan.

        @return: new independent scan object with the same attributes as the original one.
        """
        return copy.deepcopy(self)

    def import_scan_file(self, filename, emitter, initial_state):
        """
        import the reference experiment.

        the extension must be one of msc_data.DATATYPES (case insensitive)
        corresponding to the meaning of the columns in the file.

        this method does not calculate the modulation function.

        @attention EDAC can only calculate equidistant, rectangular scans.
        holo scans are transparently mapped to rectangular scans by pmsco.
        this method accepts the following scans:

        * intensity vs energy at fixed theta, phi
        * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
        * intensity vs theta, phi, or alpha
        * holo scan (theta,phi)

        @param filename: (string) file name of the experimental data, possibly including a path.

        @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

        @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

        @raise ValueError if the scan file contains no energy column.
        """
        self.filename = filename
        self.emitter = emitter
        self.initial_state = initial_state

        if self.filename:
            self.raw_data = md.load_data(self.filename)
            self.dtype = self.raw_data.dtype
            self.mode, self.positions = md.detect_scan_mode(self.raw_data)

            # constant dimensions default to the first data value,
            # except for energy which is mandatory.
            if 'e' not in self.mode:
                try:
                    self.energies = np.asarray((self.raw_data['e'][0], ))
                except ValueError:
                    logger.error("missing energy in scan file %s", self.filename)
                    raise

            if 't' not in self.mode:
                try:
                    self.thetas = np.asarray((self.raw_data['t'][0], ))
                except ValueError:
                    logger.info("missing theta in scan file %s, defaulting to 0.0", self.filename)
                    self.thetas = np.zeros(1)

            if 'p' not in self.mode:
                try:
                    self.phis = np.asarray((self.raw_data['p'][0], ))
                except ValueError:
                    logger.info("missing phi in scan file %s, defaulting to 0.0", self.filename)
                    self.phis = np.zeros(1)

            if 'a' not in self.mode:
                try:
                    self.alphas = np.asarray((self.raw_data['a'][0], ))
                except ValueError:
                    logger.info("missing alpha in scan file %s, defaulting to 0.0", self.filename)
                    self.alphas = np.zeros(1)

    def define_scan(self, positions, emitter, initial_state):
        """
        define a cartesian (rectangular/grid) scan.

        this method initializes the scan with a one- or two-dimensional cartesian scan
        of the four possible scan dimensions.
        the scan range is given as arguments, the intensity values are initialized as 1.
        the file name and modulation functions are reset to empty and None, respectively.

        the method can create the following scan schemes:

        * intensity vs energy at fixed theta, phi
        * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
        * intensity vs theta, phi, or alpha
        * intensity vs theta and phi (rectangular holo scan)

        @param positions: (dictionary of numpy arrays)
        the dictionary must contain a one-dimensional array for each scan dimension 'e', 't', 'p' and 'a'.
        these array must contain unique, equidistant positions.
        constant dimensions must contain exactly one value.
        missing angle dimensions default to 0,
        a missing energy dimension results in a KeyError.

        @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

        @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

        @raise KeyError if the positions dictionary contains no energy dimension.
        """
        self.filename = ""
        self.emitter = emitter
        self.initial_state = initial_state
        self.mode = []
        shape = 1

        try:
            self.energies = np.copy(positions['e'])
        except KeyError:
            logger.error("missing energy in define_scan arguments")
            raise
        else:
            if self.energies.shape[0] > 1:
                self.mode.append('e')
            shape *= self.energies.shape[0]

        try:
            self.thetas = np.copy(positions['t'])
        except KeyError:
            logger.info("missing theta in define_scan arguments, defaulting to 0.0")
            self.thetas = np.zeros(1)
        else:
            if self.thetas.shape[0] > 1:
                self.mode.append('t')
            shape *= self.thetas.shape[0]

        try:
            self.phis = np.copy(positions['p'])
        except KeyError:
            logger.info("missing phi in define_scan arguments, defaulting to 0.0")
            self.phis = np.zeros(1)
        else:
            if self.phis.shape[0] > 1:
                self.mode.append('p')
            shape *= self.phis.shape[0]

        try:
            self.alphas = np.copy(positions['a'])
        except KeyError:
            logger.info("missing alpha in define_scan arguments, defaulting to 0.0")
            self.alphas = np.zeros(1)
        else:
            if self.alphas.shape[0] > 1:
                self.mode.append('a')
            shape *= self.alphas.shape[0]

        assert 0 < len(self.mode) <= 2, "unacceptable number of dimensions in define_scan"
        assert not ('t' in self.mode and 'a' in self.mode), "unacceptable combination of dimensions in define_scan"

        self.dtype = md.DTYPE_ETPAI
        self.raw_data = np.zeros(shape, self.dtype)
        dimensions = [self.positions[dim] for dim in ['e', 't', 'p', 'a']]
        # use matrix ('ij') indexing so the grid axes keep the declared order
        # (e slowest, a fastest). the default 'xy' indexing swaps the first two
        # axes and would scramble the row order of raw_data.
        grid = np.meshgrid(*dimensions, indexing='ij')
        for i, dim in enumerate(['e', 't', 'p', 'a']):
            self.raw_data[dim] = grid[i].reshape(-1)
        self.raw_data['i'] = 1
|
|
|
|
|
|
# noinspection PyMethodMayBeStatic
|
|
class Project(object):
|
|
"""
|
|
base class of a calculation project.
|
|
|
|
a 'calculation project' is a coded set of prescriptions
|
|
on how to get from a set of model parameters to simulated data
|
|
which correspond to provided experimental data.
|
|
the results include a measure of the quality of the simulated data compared to experimental data.
|
|
|
|
each calculation project must derive from this class.
|
|
it must implement the create_model_space(), create_cluster(), and create_params() methods.
|
|
|
|
the other methods and attributes of this class
|
|
are for passing command line parameters to the calculation modules.
|
|
the attributes should be populated in the constructor of the derived class,
|
|
or (recommended) in the create_project() function of the module.
|
|
it is essential that the attributes are set correctly before calculation.
|
|
"""
|
|
|
|
## @var features (dictionary)
|
|
#
|
|
# calculation features and versions supported by the project.
|
|
#
|
|
# the dictionary contains key-value pairs where the key is the name of the feature and value is a version number.
|
|
# this field conditionally enables new software features that may break backward compatibility.
|
|
# derived projects should fill this field with the supported version
|
|
# upon creation (in their __init__ method or create_project() factory).
|
|
# version 0 (default) means that the feature is disabled.
|
|
#
|
|
# the following features can be enabled (list may be incomplete):
|
|
# as of this version, no optional features are defined.
|
|
#
|
|
# @note rather than introducing new features and, particularly, new versions that rely on this mechanism,
|
|
# developers of generic code should check whether backward compatibility could be achieved in a simpler way,
|
|
# e.g. by implementing addition methods whose default behaviour is the same as of the previous version.
|
|
# in some cases it may be better to refactor all current project code.
|
|
#
|
|
|
|
## @var scans (list of Scan objects)
|
|
# list of experimental or scan files for which calculations are to be run.
|
|
#
|
|
# the list must be populated by calling the add_scan() method.
|
|
# this should be done in the create_project() function, or through the command line arguments.
|
|
#
|
|
# the modulation function is calculated internally.
|
|
# if your scan files contain the modulation function (as opposed to intensity),
|
|
# you must add the files in the create_project() function.
|
|
# the command line does not support loading modulation functions.
|
|
#
|
|
# @c scans must be considered read-only. use project methods to change it.
|
|
|
|
## @var domains (list of arbitrary objects)
|
|
# list of domains for which calculations are to be run.
|
|
#
|
|
# it is up to the derived class what kind of objects are stored in the list.
|
|
# the recommended kind of objects are dictionaries which hold parameter values,
|
|
# similar to the model dictionaries.
|
|
#
|
|
# the list must be populated by calling the add_domain() method.
|
|
|
|
## @var cluster_generator (ClusterGenerator object)
|
|
# provides the cluster generator methods.
|
|
#
|
|
# a project must provide a cluster generator object that is derived from ClusterGenerator.
|
|
# at least the ClusterGenerator.create_cluster method must be implemented.
|
|
# if emitters should be run in parallel, the ClusterGenerator.count_emitters must be implemented as well.
|
|
#
|
|
# the initial value is a LegacyClusterGenerator object
|
|
# which routes cluster calls back to the project for compatibility with older project code.
|
|
|
|
## @var optimizer_params (dict)
|
|
# optional parameters of the model optimizer.
|
|
#
|
|
# this is a dictionary that can have (among others) the following values.
|
|
# for a detailed list, see the documentation of the respective model handler.
|
|
#
|
|
# @arg @c 'pop_size' (int)
|
|
# population size (number of particles) in the swarm or genetic optimization mode.
|
|
# by default, the population size is set to the number of parallel processes or 4, whichever is greater.
|
|
# you may want to override the default value in cases where the automatic choice is not appropriate.
|
|
# @arg @c 'seed_file' (string)
|
|
# name of a file containing the results from previous optimization runs.
|
|
# this can be used to resume a swarm or genetic optimization where it was interrupted before.
|
|
# the seed file is a space-delimited, multi-column, text file,
|
|
# e.g., the output file of a previous optimization.
|
|
# by default, no seed is loaded.
|
|
# @arg @c 'recalc_seed' (bool)
|
|
# select whether the R-factors of the seed models are calculated again.
|
|
# set this argument to False only if the calculation is a continuation of a previous one
|
|
# without any changes to the code.
|
|
|
|
## @var data_dir
|
|
# directory path to experimental data.
|
|
#
|
|
# the project should load experimental data (scan files) from this path.
|
|
# this attribute receives the --data-dir argument from the command line
|
|
# if the project parses the common arguments (pmsco.set_common_args).
|
|
#
|
|
# it is up to the project to define where to load scan files from.
|
|
# if the location of the files may depend on the machine or user account,
|
|
# the user may want to specify the data path on the command line.
|
|
|
|
## @var output_dir (string)
|
|
# directory path for data files produced during the calculation, including intermediate files.
|
|
#
|
|
# output_dir and output_file are set at once by @ref set_output.
|
|
|
|
## @var output_file (string)
|
|
# file name root for data files produced during the calculation, including intermediate files.
|
|
#
|
|
# the file name should include the path. the path must also be set in @ref output_dir.
|
|
#
|
|
# output_dir and output_file are set at once by @ref set_output.
|
|
|
|
## @var db_file (string)
|
|
# name of an sqlite3 database file where the calculation results should be stored.
|
|
#
|
|
# the default value is ':memory:', which creates a volatile in-memory database.
|
|
|
|
## @var timedelta_limit (datetime.timedelta)
|
|
# wall time after which no new calculations should be started.
|
|
#
|
|
# the actual wall time may be longer by the remaining time of running calculations.
|
|
# running calculations will not be aborted.
|
|
|
|
## @var combined_scan
|
|
# combined raw data from scans.
|
|
# updated by add_scan().
|
|
|
|
## @var combined_modf
|
|
# combined modulation function from scans.
|
|
# updated by add_scan().
|
|
|
|
## @var files
|
|
# list of all generated data files with metadata.
|
|
# the list is used by model handlers to decide which files can be deleted at run time to save disk space.
|
|
#
|
|
# files.categories_to_delete determines which files can be deleted.
|
|
|
|
## @var keep_best
|
|
# number of best models for which result files should be kept.
|
|
#
|
|
# this attribute determines how many models are kept based on R-factor ranking at each node of the task tree
|
|
# (up to keep_levels).
|
|
|
|
## @var keep_levels
|
|
# numeric task level down to which R-factors are considered when model files are cleaned up.
|
|
#
|
|
# @arg 0 = model level: combined results only.
|
|
# @arg 1 = scan level: scan nodes in addition to combined results (level 0).
|
|
# @arg 2 = domain level: domain nodes in addition to level 1.
|
|
# @arg 3 = emitter level: emitter nodes in addition to level 1.
|
|
# @arg 4 = region level: region nodes in addition to level 1.
|
|
|
|
## @var atomic_scattering_factory
|
|
# factory function to create an atomic scattering calculator
|
|
#
|
|
# this can also be the name of a class.
|
|
# the calculator must inherit from pmsco.calculators.calculator.AtomicCalculator.
|
|
# the name of atomic scattering calculator classes should end in AtomicCalculator.
|
|
|
|
## @var multiple_scattering_factory
|
|
# factory function to create a multiple scattering calculator
|
|
#
|
|
# this can also be the name of a class.
|
|
# the calculator must inherit from pmsco.calculators.calculator.Calculator
|
|
#
|
|
# example: pmsco.calculators.edac.EdacCalculator
|
|
#
|
|
|
|
def __init__(self):
|
|
self.mode = "single"
|
|
self.job_name = ""
|
|
self.job_tags = {}
|
|
self.git_hash = ""
|
|
self.description = ""
|
|
self.features = {}
|
|
self.cluster_format = mc.FMT_EDAC
|
|
self.cluster_generator = mc.LegacyClusterGenerator(self)
|
|
self.scans = []
|
|
self.domains = []
|
|
self.optimizer_params = {
|
|
'pop_size': 0,
|
|
'seed_file': "",
|
|
'seed_limit': 0,
|
|
'recalc_seed': True,
|
|
'table_file': ""
|
|
}
|
|
self.data_dir = ""
|
|
self.output_dir = ""
|
|
self.output_file = "pmsco_data"
|
|
self.db_file = ':memory:'
|
|
self.timedelta_limit = datetime.timedelta(days=1)
|
|
self.combined_scan = None
|
|
self.combined_modf = None
|
|
self.files = files.FileTracker()
|
|
self.keep_levels = 1
|
|
self.keep_best = 10
|
|
self.handler_classes = {
|
|
'model': handlers.SingleModelHandler,
|
|
'scan': handlers.ScanHandler,
|
|
'domain': handlers.DomainHandler,
|
|
'emit': handlers.EmitterHandler,
|
|
'region': handlers.SingleRegionHandler
|
|
}
|
|
self.atomic_scattering_factory = InternalAtomicCalculator
|
|
self.multiple_scattering_factory = EdacCalculator
|
|
self._tasks_fields = []
|
|
self._db = database.ResultsDatabase()
|
|
|
|
def create_model_space(self):
|
|
"""
|
|
create a project.ModelSpace object which defines the allowed range for model parameters.
|
|
|
|
this method must be implemented by the actual project class.
|
|
the ModelSpace object must declare all model parameters used in the project.
|
|
|
|
@return ModelSpace object
|
|
"""
|
|
return None
|
|
|
|
def create_params(self, model, index):
|
|
"""
|
|
create a CalculatorParams object given the model parameters and calculation index.
|
|
|
|
@param model (dictionary) model parameters to be used in the calculation.
|
|
|
|
@param index (named tuple CalcID) calculation index.
|
|
the method should consider only the following attributes:
|
|
@arg `scan` scan index (index into Project.scans)
|
|
@arg `domain` domain index (index into Project.domains)
|
|
"""
|
|
return None
|
|
|
|
def clear_scans(self):
|
|
"""
|
|
clear scans.
|
|
|
|
delete all scans in self.scans and empty the list.
|
|
|
|
@return: None
|
|
"""
|
|
self.scans = []
|
|
self.combined_scan = None
|
|
self.combined_modf = None
|
|
|
|
def add_scan(self, filename, emitter, initial_state, is_modf=False, modf_model=None, positions=None):
    """
    add the file name of reference experiment and load it.

    the extension must be one of msc_data.DATATYPES (case insensitive)
    corresponding to the meaning of the columns in the file.

    caution: EDAC can only calculate equidistant, rectangular scans.
    the following scans are currently supported:

    * intensity vs energy at fixed theta, phi
    * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
    * intensity vs theta, phi, or alpha
    * intensity vs theta and phi (hemisphere or hologram scan)

    the method calculates the modulation function if @c is_modf is @c False.
    it also updates @c combined_scan and @c combined_modf which may be used as R-factor comparison targets.

    @param filename: (string) file name of the experimental data, possibly including a path.
        the file is not loaded when the optional positions argument is present,
        but the filename may serve as basename for output files (e.g. modulation function).

    @param positions: (optional, dictionary of numpy arrays) scan positions.
        if specified, the file given by filename is _not_ loaded,
        and the scan positions are initialized from this dictionary.
        the dictionary keys are the possible scan dimensions: 'e', 't', 'p', 'a'.
        the arrays are one-dimensional and contain unique, equidistant positions.
        constant dimensions have shape 1. see @ref Scan.define_scan.

    @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

    @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

    @param is_modf: (bool) declares whether the file contains the modulation function (True),
        or intensity (False, default). In the latter case, the modulation function is calculated internally.

    @param modf_model: (dict) model parameters to be passed to the modulation function.

    @return (Scan) the new scan object (which is also a member of self.scans).
    """
    scan = Scan()
    if positions is not None:
        scan.define_scan(positions, emitter, initial_state)
        scan.filename = filename
    else:
        scan.import_scan_file(filename, emitter, initial_state)
    self.scans.append(scan)

    if modf_model is None:
        modf_model = {}

    if scan.raw_data is not None:
        if is_modf:
            scan.modulation = scan.raw_data
        else:
            try:
                scan.modulation = self.calc_modulation(scan.raw_data, modf_model)
            except ValueError:
                logger.error("error calculating the modulation function of experimental data.")
                scan.modulation = None
    else:
        scan.modulation = None

    # update the combined R-factor comparison targets.
    # NOTE(review): as in the original code, a scan without data (or without modulation)
    # resets the corresponding combined array to None, discarding previously
    # accumulated scans - confirm this is the intended behaviour.
    self.combined_scan = self._append_combined(self.combined_scan, scan.raw_data)
    self.combined_modf = self._append_combined(self.combined_modf, scan.modulation)

    return scan

@staticmethod
def _append_combined(combined, data):
    """
    append a per-scan data array to a combined data array.

    the arrays are converted to a common dtype before concatenation.

    @param combined: (numpy structured array or None) previously combined data.

    @param data: (numpy structured array or None) data of the new scan.

    @return: (numpy structured array or None) new combined array.
        None if data is None (the combined target is invalidated).
        a copy of data if combined is None.
    """
    if data is None:
        return None
    if combined is None:
        return data.copy()
    dt = md.common_dtype((combined, data))
    d1 = md.restructure_data(combined, dt)
    d2 = md.restructure_data(data, dt)
    return np.hstack((d1, d2))
|
|
|
|
def clear_domains(self):
    """
    remove all domains from the project.

    rebinds self.domains to a fresh empty list so that no domain remains registered.

    @return: None
    """
    self.domains = list()
|
|
|
|
def add_domain(self, domain):
    """
    register one domain with the project.

    this class only declares the list of domains;
    the meaning of each entry is defined by the derived project class.
    one entry is required per domain to be calculated.

    @attention the domains list starts out empty.
        your project must add at least one domain,
        otherwise no calculation will be executed.

    @param domain: domain descriptor, interpreted by the derived project class.
        a dictionary of domain parameters (similar to the model parameters) is recommended.

    @return: None
    """
    self.domains += [domain]
|
|
|
|
def set_output(self, filename):
    """
    set path and base name of output file.

    the full path and name go to the output_file attribute,
    the directory part to output_dir, and the name part to job_name.

    if the path is missing, the destination is the current working directory.
    """
    self.output_file = filename
    self.output_dir, self.job_name = os.path.split(filename)
|
|
|
|
def set_timedelta_limit(self, timedelta, margin_minutes=10):
    """
    set the walltime limit with a safety margin.

    stores the limit minus a safety margin in self.timedelta_limit.
    the margin (default 10 minutes) increases the probability
    that the process finishes before the scheduler kills it.
    projects that want a different policy can override this method.

    the method is typically called with the command line time limit from the main module,
    so run scripts can pass the same time limit as the job scheduler.

    @param timedelta: (datetime.timedelta) max. duration of the calculation process (wall time).

    @param margin_minutes: (int) safety margin in minutes to subtract from timedelta.
    """
    margin = datetime.timedelta(minutes=margin_minutes)
    self.timedelta_limit = timedelta - margin
|
|
|
|
def log_project_args(self):
    """
    send some common project attributes to the log.

    the attributes are normally logged at WARNING level.

    this method is called by the main pmsco module after creating the project and assigning command line arguments.
    it may be overridden to add logs of attributes of the sub-class.

    @return: None
    """
    try:
        logger.warning("atomic scattering: {0}".format(self.atomic_scattering_factory))
        logger.warning("multiple scattering: {0}".format(self.multiple_scattering_factory))
        logger.warning("optimization mode: {0}".format(self.mode))

        # falsy optimizer parameters are demoted to DEBUG to reduce log noise
        for key in sorted(self.optimizer_params):
            val = self.optimizer_params[key]
            lev = logging.WARNING if val else logging.DEBUG
            logger.log(lev, "optimizer_params['{k}']: {v}".format(k=key, v=val))

        logger.warning("data directory: {0}".format(self.data_dir))
        logger.warning("output file: {0}".format(self.output_file))
        logger.warning("database: {0}".format(self.db_file))

        _files_to_keep = files.FILE_CATEGORIES - self.files.categories_to_delete
        logger.warning("intermediate files to keep: {0}".format(", ".join(_files_to_keep)))

        for idx, scan in enumerate(self.scans):
            # bug fix: the closing parenthesis after the initial state was missing
            logger.warning(f"scan {idx}: {scan.filename} ({scan.emitter} {scan.initial_state})")
        for idx, dom in enumerate(self.domains):
            logger.warning(f"domain {idx}: {dom}")

    except AttributeError:
        # attributes may be missing if the project is not fully initialized;
        # logging must not crash the program
        logger.warning("AttributeError in log_project_args")
|
|
|
|
def combine_domains(self, parent_task, child_tasks):
    """
    merge the results of all domains into one result and calculate the modulation function.

    the per-domain results are read from the files referenced by the child tasks,
    and the weighted average is written to the files referenced by the parent task.

    by default, all domains enter with equal weight.
    weights can be defined in the model dictionary under the keys 'wdom0', 'wdom1', etc.;
    a missing weight defaults to 1.
    to avoid correlated parameters, one domain must always have a fixed weight.
    it is recommended to leave 'wdom0' at its default.

    @param parent_task: (CalculationTask) parent task of the domain tasks.
        the results are written to the files indicated
        by its @c result_filename and @c modf_filename attributes.

    @param child_tasks: (sequence of CalculationTask) tasks which identify each domain.
        the source data is read from the files
        indicated by their @c result_filename attributes.
        the sequence is sorted by task ID, i.e., essentially, by domain index.

    @return: None

    @raise IndexError if child_tasks is empty

    @raise IOError if a filename is missing

    @note the weights of the domains (in derived classes) can be part of the optimizable model parameters.
        the model parameters are available as the @c model attribute of the calculation tasks.
    """
    total = None
    weight_sum = 0.
    for child in child_tasks:
        child_data = md.load_data(child.result_filename)
        if total is None:
            # initialize the accumulator with the structure of the first domain
            total = child_data.copy()
            total['i'] = 0.
        try:
            weight = child.model['wdom{}'.format(child.id.domain)]
        except KeyError:
            weight = 1.
        total['i'] += weight * child_data['i']
        weight_sum += weight
    total['i'] /= weight_sum

    md.save_data(parent_task.result_filename, total)

    # todo : the handling of missing modulation functions may need some cleanup
    if self.scans[parent_task.id.scan].modulation is not None:
        combined_modf = self.calc_modulation(total, parent_task.model)
        md.save_data(parent_task.modf_filename, combined_modf)
    else:
        parent_task.modf_filename = ""
|
|
|
|
def combine_emitters(self, parent_task, child_tasks):
    """
    sum the results of all emitters into one result and calculate the modulation function.

    the per-emitter results are read from the files referenced by the child tasks,
    and the sum is written to the files referenced by the parent task.

    by default, all emitters are added with equal weight.

    sub-classes may override this method and implement expansion of equivalent emitters,
    unequal weights, etc.

    @param parent_task: (CalculationTask) parent task of the emitter tasks.
        the results are written to the files indicated
        by its @c result_filename and @c modf_filename attributes.

    @param child_tasks: (sequence of CalculationTask) tasks which identify each emitter.
        the source data is read from the files
        indicated by their @c result_filename attributes.
        the sequence is sorted by task ID, i.e., essentially, by the emitter index.

    @return: None

    @raise IndexError if child_tasks is empty

    @raise IOError if a filename is missing

    @note the weights of the emitters (in derived classes) can be part of the optimizable model parameters.
        the model parameters are available as the @c model attribute of the calculation tasks.
    """
    total = None
    for child in child_tasks:
        emitter_data = md.load_data(child.result_filename)
        if total is None:
            # the first emitter's array serves as the accumulator
            total = emitter_data
        else:
            total['i'] += emitter_data['i']

    md.save_data(parent_task.result_filename, total)

    # todo : the handling of missing modulation functions may need some cleanup
    if self.scans[parent_task.id.scan].modulation is not None:
        combined_modf = self.calc_modulation(total, parent_task.model)
        md.save_data(parent_task.modf_filename, combined_modf)
    else:
        parent_task.modf_filename = ""
|
|
|
|
def combine_scans(self, parent_task, child_tasks):
    """
    concatenate the results of all scans into one result, for intensity and modulation.

    the per-scan results are read from the files referenced by the child tasks,
    and the concatenated datasets are written to the files referenced by the parent task.

    intensity and modulation data are processed independently:
    if the files of one kind cannot be read,
    the corresponding file name attribute of the parent task is cleared.

    @param parent_task: (CalculationTask) parent task of the scan tasks.
        the results are written to the files indicated
        by its @c result_filename and @c modf_filename attributes.

    @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
        the source data is read from the files
        indicated by their @c result_filename and @c modf_filename attributes.
        the sequence is sorted by task ID, i.e., essentially, by scan index.

    @return: None

    @raise IndexError if child_tasks is empty.
    """
    def _concatenate(filenames):
        # load all files, convert to a common dtype, and stack the arrays
        loaded = [md.load_data(fn) for fn in filenames]
        common = md.common_dtype(loaded)
        aligned = tuple(md.restructure_data(arr, common) for arr in loaded)
        return np.hstack(aligned)

    # intensity
    try:
        intensity = _concatenate([task.result_filename for task in child_tasks])
    except IOError:
        parent_task.result_filename = ""
    else:
        md.save_data(parent_task.result_filename, intensity)

    # modulation
    try:
        modulation = _concatenate([task.modf_filename for task in child_tasks])
    except IOError:
        parent_task.modf_filename = ""
    else:
        md.save_data(parent_task.modf_filename, modulation)
|
|
|
|
def combine_regions(self, parent_task, child_tasks):
    """
    merge the results of all regions into one result, for intensity and modulation.

    the per-region results are read from the files referenced by the child tasks,
    and the combined result is written to the files referenced by the parent task.

    the datasets of the regions are concatenated and sorted in the standard order of the data module.
    if the resulting length differs from the corresponding experimental scan,
    an error is printed to the logger, but the calculation continues.

    the modulation function is calculated by calling @ref calc_modulation.

    @param parent_task: (CalculationTask) parent task of the region tasks.
        the results are written to the files indicated
        by its @c result_filename and @c modf_filename attributes.

    @param child_tasks: (sequence of CalculationTask) tasks which identify each region.
        the source data is read from the files
        indicated by their @c result_filename attributes.
        the sequence is sorted by task ID, i.e., essentially, by region index.

    @return: None

    @raise IndexError if child_tasks is empty.
    """
    # intensity: concatenate the regions and sort into the standard order
    try:
        loaded = [md.load_data(task.result_filename) for task in child_tasks]
    except IOError:
        parent_task.result_valid = False
        parent_task.result_filename = ""
    else:
        common = md.common_dtype(loaded)
        stacked = np.hstack(tuple(md.restructure_data(arr, common) for arr in loaded))
        md.sort_data(stacked)
        md.save_data(parent_task.result_filename, stacked)

        # sanity check: the combined scan should match the experimental scan length
        scan = self.scans[parent_task.id.scan]
        if stacked.shape[0] != scan.raw_data.shape[0]:
            logger.error(BMsg("scan length mismatch: combined result: {result}, experimental data: {expected}",
                              result=stacked.shape[0], expected=scan.raw_data.shape[0]))

    # modulation: re-load the combined intensity and derive the modulation function
    try:
        combined = md.load_data(parent_task.result_filename)
        modulation = self.calc_modulation(combined, parent_task.model)
    except IOError:
        parent_task.modf_filename = ""
    else:
        md.save_data(parent_task.modf_filename, modulation)
|
|
|
|
def get_git_hash(self):
    """
    get the git commit (hash) of the running code (HEAD).

    looks for a git repository in the source tree of this module
    and returns the hash string of the HEAD commit if one is found.

    @return: hexadecimal hash string.
        empty string if the file is not in a git repository.
    """
    try:
        return git.Repo(__file__, search_parent_directories=True).head.commit.hexsha
    except git.exc.InvalidGitRepositoryError:
        return ""
|
|
|
|
def setup(self, handlers):
    """
    prepare for calculations.

    this method is called in the master process before starting the task loop.
    at this point the task handlers have been created and set up.
    if the project needs to change settings of task handlers it can do so in this method.

    this instance writes the header of the tasks.dat file
    that will receive sub-task evaluation results from the evaluate_result() method.

    it also initializes the database where the task results will be stored.
    this is either a volatile in-memory database or a user-specified sqlite3 database file.

    @param handlers: dictionary listing the initialized task handler instances.
        the dictionary keys are the attribute names of pmsco.dispatch.CalcID:
        'model', 'scan', 'domain', 'emit' and 'region'.

    @return: None
    """
    self.git_hash = self.get_git_hash()
    # assemble the column list of the tasks.dat file.
    # task-index and timing columns are prefixed with an underscore
    # to distinguish them from model parameter columns.
    fields = ["rfac"]
    fields.extend(dispatch.CalcID._fields)
    fields.append("secs")
    fields = ["_" + f for f in fields]
    # model parameter columns follow, sorted case-insensitively by name
    mspace = self.create_model_space()
    model_fields = list(mspace.start.keys())
    model_fields.sort(key=lambda name: name.lower())
    fields.extend(model_fields)
    # evaluate_result() uses this list to order its output values
    self._tasks_fields = fields

    # write the header line; evaluate_result() appends one data line per task
    with open(self.output_file + ".tasks.dat", "w") as outfile:
        outfile.write("# ")
        outfile.write(" ".join(fields))
        outfile.write("\n")

    # initialize the results database and register this project and job
    self._db.connect(self.db_file)
    project_name = self.__class__.__name__
    project_module = self.__class__.__module__
    project_id = self._db.register_project(project_name, project_module)
    job_id = self._db.register_job(project_id,
                                   self.job_name,
                                   self.mode,
                                   socket.gethostname(),
                                   self.git_hash,
                                   datetime.datetime.now(),
                                   self.description)
    logger.debug(BMsg("database {db_file}, project {proj}, job {job}",
                      db_file=self.db_file, proj=project_id, job=job_id))
    self._db.insert_jobtags(job_id, self.job_tags)
    self._db.register_params(model_fields)
    self._db.create_models_view()
|
|
|
|
def evaluate_result(self, parent_task, child_tasks):
    """
    evaluate the result of a calculation task.

    this method is called from the add_result of the task handlers at each level.
    it gives the project a hook to check the progress of a model at any level of the task tree.

    the method calculates the r-factor by calling the Project.calc_rfactor method.
    the result is written to the task.rfac field and to the .tasks.dat file.
    invalid and region-level results are skipped.

    this method is called in the master process only.

    @param parent_task: (CalculationTask) a calculation task.

    @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
        the sequence must be sorted by task ID.

    @return: None
    """
    # region-level tasks (id.region != -1) and invalid results are not evaluated
    if parent_task.result_valid and parent_task.id.region == -1:
        try:
            parent_task.rfac = self.calc_rfactor(parent_task, child_tasks)
        except ValueError:
            # an undefined r-factor invalidates the whole task result
            parent_task.result_valid = False
            logger.warning(BMsg("calculation {0} resulted in an undefined R-factor.", parent_task.id))
        else:
            # assemble one output record: task index fields (underscore-prefixed),
            # model parameters, r-factor and wall time
            values_dict = parent_task.id._asdict()
            values_dict = {"_" + k: v for k, v in values_dict.items()}
            values_dict.update(parent_task.model)
            values_dict['_rfac'] = parent_task.rfac
            values_dict['_secs'] = parent_task.time.total_seconds()
            # order the values according to the header written by setup()
            values_list = [values_dict[field] for field in self._tasks_fields]
            with open(self.output_file + ".tasks.dat", "a") as outfile:
                outfile.write(" ".join(format(value) for value in values_list) + "\n")

            db_id = self._db.insert_result(parent_task.id, values_dict)
            logger.debug(BMsg("model {model}, database result {db_id}", model=parent_task.id.model, db_id=db_id))

    return None
|
|
|
|
# noinspection PyUnusedLocal
def calc_modulation(self, data, model):
    """
    calculate the project-dependent modulation function.

    the modulation function of I(x) is (I(x) - S(x)) / S(x)
    where S(x) is a smooth copy of I(x).

    by default, the modulation function is calculated by data.calc_modfunc_loess().
    override this method in your project to use a different modulation function.

    @param data structured numpy.ndarray in EI, ETPI, or ETPAI format.
        can contain a one- or multi-dimensional scan.
        the scan coordinates must be on a rectangular or hemispherical grid.
        for maximum compatibility, the array should be sorted,
        though for the default calc_modfunc_loess() function this is not required.

        if data contains a hemispherical scan, the phi dimension is ignored,
        i.e. the modulation function is calculated on a phi-average.

    @param model: (dict) model parameters of the calculation task.
        can be used to pass parameters from the project.
        this argument is a dictionary of the model parameters.
        (unused in the default implementation; available to overriding methods.)

    @return copy of the data array with the modulation function in the 'i' column.
    """

    return md.calc_modfunc_loess(data)
|
|
|
|
def calc_rfactor(self, parent_task, child_tasks):
    """
    calculate the r-factor of a task.

    the r-factor is calculated on the experimental and simulated modulation functions.
    the algorithm differs for the model level and the lower task levels.
    at the model level, the calculation is delegated to Project.combine_rfactors.
    at all other levels, the r-factor is calculated by Project.rfactor,
    where the simulated data is loaded from the file specified by parent_task.modf_filename
    and the experimental data is taken from Project.scans.

    this method is called by the task handlers.
    all child tasks belonging to the parent task must be complete.

    to select or implement a specific R-factor algorithm,
    the project sub-class should override Project.rfactor.
    to combine scan r-factors, it should override or patch Project.combine_rfactors.

    @version in earlier versions,
        projects had to override this method to implement their algorithm.
        this has lead to duplication of common code.
        the r-factor algorithm is now distributed over several methods,
        and the method signature has changed.
        new projects should override Project.rfactor and/or Project.combine_rfactors.

    @param parent_task: (CalculationTask) a calculation task.

    @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
        the sequence must be sorted by task ID.

    @return (float) calculated R-factor.

    @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
    """
    # a negative scan index marks the model level: combine the child r-factors
    if parent_task.id.scan < 0:
        return self.combine_rfactors(parent_task, child_tasks)

    # scan level and below: compare the calculated and experimental modulation functions
    theo_data = md.load_data(parent_task.modf_filename)
    exp_data = self.scans[parent_task.id.scan].modulation
    return self.rfactor(exp_data, theo_data)
|
|
|
|
def rfactor(self, exp_data, theo_data):
    """
    calculate the r-factor of simulated diffraction data.

    in this class, the method calls the data.rfactor function to calculate the r-factor.
    override this method in your project to use a different R-factor algorithm.

    the input arrays must have the same shape,
    and the coordinate columns must be identical (they are ignored, however).
    the array elements are compared element-by-element.
    terms having NaN intensity are ignored.

    if the sigma column is present in experiment and non-zero,
    the R-factor terms are weighted.

    @param exp_data: (numpy structured array)
        ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
        if an @c s field is present and non-zero,
        the R-factor terms are weighted by 1/sigma**2.

    @param theo_data: (numpy structured array)
        ETPI or ETPAI array containing the calculated modulation functions.

    @return: (float) scalar R-factor

    @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
    """
    # delegate to the default r-factor implementation of the pmsco data module
    return md.rfactor(exp_data, theo_data)
|
|
|
|
def opt_rfactor(self, exp_data, theo_data):
    """
    calculate the r-factor of simulated diffraction data, adjusting their amplitude.

    this is an alternative r-factor calculation algorithm
    using the pmsco.data.optimize_rfactor() function.

    to activate this method (replacing the default one), assign it to Project.rfactor
    in the overriding __init__ or setup method:
    @code{.py}
    self.rfactor = self.opt_rfactor
    @endcode

    @param exp_data: (numpy structured array)
        ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
        if an @c s field is present and non-zero,
        the R-factor terms are weighted by 1/sigma**2.

    @param theo_data: (numpy structured array)
        ETPI or ETPAI array containing the calculated modulation functions.

    @return: (float) scalar R-factor

    @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
    """
    # same interface as self.rfactor, but scales the theoretical amplitude for the best fit
    return md.optimize_rfactor(exp_data, theo_data)
|
|
|
|
def combine_rfactors(self, parent_task, child_tasks):
    """
    combine r-factors of child tasks.

    the r-factors are taken from the rfac attribute of the child_tasks.
    the result is an average of the child r-factors.

    to produce a balanced result, every child dataset must contain a similar amount of information.
    if this is not the case, the child r-factors must be weighted.
    weighting is currently not implemented but may be introduced in a future version.

    the method is intended to be used at the model level (children are scans),
    though it can technically be used at any level where child r-factors are available.

    @param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
        i.e. a model task.

    @param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
        that may be consulted for calculating the r-factor.

    @return: (float) r-factor, NaN if parent task is invalid

    @raise ValueError if child_tasks is empty.
    """
    if not parent_task.result_valid:
        return float('nan')
    if not child_tasks:
        # previously this fell through to a ZeroDivisionError;
        # raise ValueError as promised by the documented contract
        raise ValueError("child_tasks must not be empty")
    return sum(task.rfac for task in child_tasks) / len(child_tasks)
|
|
|
|
def alt_combine_rfactors(self, parent_task, child_tasks):
    """
    combine r-factors of child tasks by explicit calculation on the combined result.

    this is an alternative implementation of combine_rfactors.
    instead of using the r-factors from child tasks,
    it re-calculates the r-factor for the combined dataset.
    this method avoids the issue of weighting
    but can introduce bias if the amplitudes of the child datasets differ substantially.

    the calculated dataset is loaded from the file specified by the parent task,
    the corresponding experimental data is taken from self.combined_modf.

    to activate this method, assign it to combine_rfactors
    in the overriding __init__ or setup method:
    @code{.py}
    self.combine_rfactors = self.alt_combine_rfactors
    @endcode

    @param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
        i.e. a model task.

    @param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
        that may be consulted for calculating the r-factor.

    @return: (float) r-factor, NaN if parent task is invalid
    """
    if not parent_task.result_valid:
        return float('nan')
    theo_data = md.load_data(parent_task.modf_filename)
    return self.rfactor(self.combined_modf, theo_data)
|
|
|
|
def export_cluster(self, index, filename, cluster):
    """
    export the cluster of a calculation task in XYZ format for diagnostics and reporting.

    this method is called with the final cluster just before it is handed over to the calculator.
    it saves the atom coordinates in XYZ format for future reference (e.g. graphics).

    two files are created:
    @arg a file with extension '.xyz' contains the whole cluster in XYZ format.
    @arg a file with extension '.emit.xyz' contains only emitter atoms in XYZ format.

    the first part of the file name is formatted with the output name and the complete task identification.
    both files are registered with the file tracker in the 'cluster' category
    so that they will be deleted unless the cluster category is selected for keeping.

    derived project classes may override or extend this method
    to carry out further diagnostics or reporting on the cluster.

    @param index: (CalcID) calculation index to which the cluster belongs.
        region may be -1 if only one cluster is exported for all regions
        (clusters do not depend on the scan region).
        emit may be -1 if the cluster is a master from which emitter-related child clusters are derived.

    @param filename: (str) base file name for the output files.
        the filename should be formatted using pmsco.dispatch.CalculationTask.format_filename().
        extensions are appended by this method.

    @param cluster: a pmsco.cluster.Cluster() object with all atom positions and emitters.

    @return: dictionary listing the names of the created files with their category.
        the dictionary key is the file name,
        the value is the file category (cluster).
    """
    created_files = {}

    # whole cluster
    full_path = filename + ".xyz"
    cluster.save_to_file(full_path, fmt=mc.FMT_XYZ)
    created_files[full_path] = 'cluster'

    # emitter atoms only
    emitter_path = filename + ".emit.xyz"
    cluster.save_to_file(emitter_path, fmt=mc.FMT_XYZ, emitters_only=True)
    created_files[emitter_path] = 'cluster'

    return created_files
|
|
|
|
def before_atomic_scattering(self, task, par, clu):
    """
    project hook before atomic scattering factors are calculated.

    this method derives modified CalculatorParams and Cluster objects for the atomic scattering calculation
    from the original objects that will be used in the multiple scattering calculation.

    in the basic version, the method does not change the objects
    except that it returns None for the root task (reference cluster).
    subclasses may override it to modify or replace the cluster.

    @param task: @ref pmsco.dispatch.CalculationTask object representing the current calculation task.
        if the model index is -1, the project can return the global reference cluster
        (to calculate the fixed scattering factors that will be used for all models)
        or None if no global scattering factors should be calculated.
        do not modify this object!

    @param par: @ref pmsco.project.CalculatorParams object representing the preliminary
        multiple scattering input parameters of the current task.
        the method can make modifications to this object instance directly.

    @param clu: @ref pmsco.cluster.Cluster object representing the preliminary
        multiple scattering cluster of the current task.
        the method can make modifications to this object instance directly.

    @return: a tuple (par, clu) where par and clu are the input parameters and cluster
        to be used for the calculation of atomic scattering factors.
        these should either be the original function arguments,
        or copies of the original arguments.
        if atomic scattering factors should not be calculated, the return values should be None.
    """
    # model index -1 marks the root (reference) task: no scattering factors by default
    return (par, clu) if task.id.model >= 0 else (None, None)
|
|
|
|
def after_atomic_scattering(self, task, par, clu):
    """
    project hook after atomic scattering factors are calculated.

    this method cleans up the CalculatorParams and Cluster objects from the atomic scattering calculation
    so that they can be used in the multiple scattering calculation.

    in the basic version, the method just passes the input parameters for model tasks
    and returns None for the root task.
    subclasses may override it and modify the cluster and/or input parameters
    so that the desired atomic scattering factors are used.

    @param task: @ref pmsco.dispatch.CalculationTask object representing the current calculation task.
        if the model index is -1, the project should return the global reference cluster
        (to calculate the fixed scattering factors that will be used for all models)
        or None if no global scattering factors should be calculated.

    @param par: @ref pmsco.project.CalculatorParams object representing the preliminary
        multiple scattering input parameters of the current task.

    @param clu: @ref pmsco.cluster.Cluster object representing the preliminary
        multiple scattering cluster of the current task.
        do not modify this object, make a copy!

    @return: a tuple (par, clu) where par and clu are the input parameters and cluster
        to be used for the calculation of atomic scattering factors.
        these should either be the original function arguments,
        or copies of the original arguments.
    """
    # model index -1 marks the root (reference) task: nothing to pass on by default
    return (par, clu) if task.id.model >= 0 else (None, None)
|
|
|
|
def cleanup(self):
    """
    delete unwanted files at the end of a project and close the database.

    calls cleanup_files() including incomplete models, then disconnects the results database.

    @return: None
    """
    # the job is finished: incomplete models will not receive further results
    self.cleanup_files(incomplete_models=True)
    self._db.disconnect()
|
|
|
|
def cleanup_files(self, keep=0, incomplete_models=False):
    """
    delete uninteresting files (any time).

    delete all files that
    belong to one of the self.files.categories_to_delete categories or
    do not belong to one of the "best" models.

    "best" models are a number (self.keep_best) of models that gave the lowest R-factors
    at each task level from root to self.keep_levels.
    for example if `keep_best = 10` and `keep_levels = 1`
    the 10 best models at the top level, and the 10 best at the scan level are kept.
    this means that in total up to `n = 10 + 10 * n_scans` models may be kept,
    where n_scans is the number of scan files in the job.

    this method can be called at any time during the calculation process.
    it executes on complete models only
    unless incomplete_models is True.

    @param keep: minimum number of best models to keep.
        0 (default): use the project parameter self.keep_best.

    @param incomplete_models: (bool) delete files of incomplete models as well.
        by default (False), incomplete models are not deleted.

    @return None
    """
    self.files.delete_files(incomplete_models=incomplete_models)
    if 'rfac' in self.files.categories_to_delete:
        # r-factor dependent cleanup: keep only the files of the best models
        n_keep = max(keep, self.keep_best)
        best_models = self._db.query_best_task_models(self.keep_levels, n_keep)
        self.files.delete_models(keep=best_models)
|