pmsco-public/pmsco/project.py

"""
@package pmsco.project
project-independent classes which store and handle model parameters.

the most important class defined here is Project.
each calculation project needs to derive its own project class from it.
the ModelSpace and CalculatorParams classes are typically used unchanged.

@note nomenclature: the term @e parameters has several meanings in the code and documentation.
    the following distinctive terms are used in updated documentation sections.
    ambiguous terms may still be present in older code sections.
@arg <em>calculation parameters</em> set of specific parameters passed as input to the calculation programs.
    the amount and meaning of these parameters depend on the calculation code used.
    typically, many of these parameters remain fixed, or change very rarely in the course of the study.
@arg <em>model parameters</em> concise set of independent physical parameters
    that define the system in one calculation instance.
    these parameters are varied systematically by the optimization process.
    they are mapped to calculation parameters and a cluster by code derived from the Project class.

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

@copyright (c) 2015 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import datetime
import git
import logging
import numpy as np
import os.path
import socket
import sys

from pmsco.calculators.calculator import InternalAtomicCalculator
from pmsco.calculators.edac import EdacCalculator
import pmsco.cluster as mc
from pmsco.compat import open
import pmsco.data as md
import pmsco.database as database
import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
from pmsco.helpers import BraceMessage as BMsg

logger = logging.getLogger(__name__)

ParamSpace = collections.namedtuple('ParamSpace', ['start', 'min', 'max', 'step'])


class ModelSpace(object):
    """
    Domain of model parameters.

    Each member contains a dictionary of model parameter names and their values.
    Parameter names can be defined almost freely by the project,
    except that they should contain only alphanumeric and underscore characters.
    furthermore, names starting with an underscore are reserved for the optimizers.
    """

    ## @var start (dict)
    # dictionary of start values for each model parameter.
    #
    # the start value can be the initial guess for an optimization run,
    # or the actual value for a single calculation.
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    ## @var min (dict)
    # dictionary of minimum values for each model parameter.
    #
    # the minimum defines the lower bound of the allowed interval for a model parameter.
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    ## @var max (dict)
    # dictionary of maximum values for each model parameter.
    #
    # the maximum defines the upper bound of the allowed interval for a model parameter.
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    ## @var step (dict)
    # dictionary of step sizes for each model parameter.
    #
    # depending on the optimization mode, the step is a guess of how fast values should vary,
    # e.g. step size, gradient, velocity, ...
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    def __init__(self):
        """
        initialize the domain object with empty dictionaries.
        """
        self.start = {}
        self.min = {}
        self.max = {}
        self.step = {}

    def add_param(self, name, start, min=None, max=None, step=None, width=None):
        """
        set the domain of one parameter with all necessary values at once.

        the exact meaning of the arguments depends on the calculation mode.

        @param name (string) name of the parameter (alphanumeric and underscore characters only).
            it is recommended to use short but distinctive names.

        @param start (float) start value.

        @param min (float) lower bound of the parameter interval.
            must be lower or equal to start.
            if None, the field is set to start.

        @param max (float) upper bound of the parameter interval.
            must be greater or equal to start.
            if None, the field is set to start.

        @param width (float) width of the parameter interval.
            instead of min and max, the interval can be set centered around the start value.
            this is equivalent to min = start - width/2, max = start + width/2.
            this argument overrides min and max. don't use both arguments.

        @param step (float) step size.
            must be greater or equal to zero.
            if None, the field is set to zero.
        """
        self.start[name] = start
        self.min[name] = min if min is not None else start
        self.max[name] = max if max is not None else start
        if width is not None:
            self.min[name] = start - width / 2.
            self.max[name] = start + width / 2.
        self.step[name] = step if step is not None else 0.0

    def get_param(self, name):
        """
        get all values of a model parameter in a tuple.

        @param name (string) name of the parameter.

        @return named tuple ParamSpace(start, min, max, step) of the parameter.

        @raise IndexError if the parameter is not defined.
        """
        return ParamSpace(self.start[name], self.min[name], self.max[name], self.step[name])


class CalculatorParams(object):
    """
    calculation parameters for a single scattering calculation job.

    this class holds all the calculation parameters that are passed via input file to the calculation program.

    the class can hold parameters for both the MSC and EDAC codes.
    some parameters are used by both codes, others are used just by one of them.
    newer features such as multiple emitters, multiple domains, and others are supported in EDAC mode only.
    MSC mode is currently not maintained.

    objects of this class are created by the implementation of the create_params() method
    of the actual project class.
    """

    ## @var angular_resolution (float)
    # FWHM angular resolution of the detector.
    #
    # maps to:
    # @arg emission angle window (EDAC)
    # @arg angular_broadening (MSC)

    ## @var binding_energy (float)
    # initial state binding energy with respect to the Fermi level in eV
    #

    ## @var initial_state (str)
    # initial state
    #
    # 1s, 2p, 2p1/2, etc.
    #

    ## @var phase_files (dict)
    # dictionary of phase or scattering matrix element files.
    #
    # the keys are atomic numbers, the values file names.
    # whether the files contain phase shifts or matrix elements depends on the calculator.
    # EDAC determines the kind of information from the first line in the file.
    #
    # if the dictionary is empty or the files don't exist,
    # the scattering matrix is computed by the calculator (if supported).
    #
    # maps to:
    # @arg scatterer (EDAC)
    # @arg atomic_number, phase_file (MSC)

    ## @var phase_output_classes (int or iterable of int)
    # atom classes for which to output phase files
    #
    # if the atomic scattering factors are calculated internally,
    # EDAC can export them to scattering files.
    #
    # this parameter can be one of
    # @arg None (default) no phase output,
    # @arg integer number defining a range 0:N-1 of atom classes,
    # @arg iterable (e.g., set or sequence) of atom classes to export.
    #
    # the problem is that EDAC expects the user to list each atom class to export,
    # though it is not possible to know how many classes there will be
    # or which atoms belong to which class before the calculation is actually done.
    # the number of classes will be between the number of different elements and the number of atoms.
    #
    # thus, this parameter should normally be left at its default value
    # and used only in specific situations that can be processed manually.
    # if the parameter is non-default, EDAC will also produce a cluster output
    # that includes a mapping between atomic coordinates and atom classes.
    #
    # @note the files generated belong to the category "output".
    # you need to specify `--keep-files output` to prevent them from getting cleaned up.

    ## @var polarization (str)
    # photon polarization
    #
    # 'H', 'V', 'L', 'R', 'U'
    #

    ## @var rme_files (dict)
    # dictionary of radial matrix element files.
    #
    # if the dictionary is empty or the files don't exist,
    # the radial matrix defaults to the rme_xxx_xxx attributes.
    #
    # in EDAC, RME files or constants are considered only if @ref phase_files are specified.
    #

    ## @var work function (float)
    # work function in eV
    #
    # the energy scale of EDAC is referenced to the vacuum level
    # but data files are referenced to the Fermi level.
    # the @ref pmsco.calculators.edac module adds the work function to the kinetic energy before it calls EDAC.
    #

    def __init__(self):
        self.title = "default parameters"
        self.comment = "set by project.CalculatorParams()"
        self.cluster_file = ""
        self.output_file = ""
        self.scan_file = ""
        self.initial_state = "1s"
        self.binding_energy = 0.0
        self.polarization = "H"
        self.angular_resolution = 1.0
        self.z_surface = 0.0
        self.inner_potential = 10.0
        self.work_function = 0.0
        self.symmetry_range = 360.0
        self.polar_incidence_angle = 60.0
        self.azimuthal_incidence_angle = 0.0
        self.experiment_temperature = 300.0
        self.debye_temperature = 400.0
        self.debye_wavevector = 1.0
        self.phase_files = {}
        self.rme_files = {}
        self.rme_minus_value = 0.1
        self.rme_minus_shift = 0.0
        self.rme_plus_value = 1.0
        self.rme_plus_shift = 0.0
        # used by MSC only
        self.spherical_order = 2
        self.scattering_level = 5
        self.fcut = 15.0
        self.cut = 15.0
        self.lattice_constant = 1.0
        self.msq_displacement = {}
        self.planewave_attenuation = 1.0
        self.vibration_model = "N"
        self.substrate_atomic_mass = 1.0
        # used by EDAC only
        self.emitters = [(0.0, 0.0, 0.0, 0)]
        self.lmax = 15
        self.dmax = 5.0
        self.orders = [20]
        self.phase_output_classes = None

    @property
    def l_init(self):
        """
        initial state l quantum number.

        this is converted from the initial_state property.

        @return: (int) 0..3
        """
        return "spdf".index(self.initial_state[1])


class Scan(object):
    """
    class to describe the scanning scheme or store the experimental data set.
    """

    ## @var filename (string)
    # file name from which a scan was loaded

    ## @var raw_data (numpy.ndarray)
    # original scan data (ETPAIS array)

    ## @var dtype (dict)
    # data type of self.raw_data.
    #
    # one of the data.DTYPE_Xxxx constants.

    ## @var modulation (numpy.ndarray)
    # modulation function calculated from original scan (ETPAIS array)

    ## @var mode (list of characters)
    # list of ETPAI column names which are scanned in self.raw_data.
    #
    # example: ['t','p']

    ## @var emitter (string)
    # chemical symbol and, optionally following, further specification (chemical state, environment, ...)
    # of photo-emitting atoms.
    # the interpretation of this string is up to the project and its cluster generator.
    # it should, however, always start with a chemical element symbol.
    #
    # examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.

    ## @var initial_state (string)
    # nl term of initial state
    #
    # in the form expected by EDAC, for example: '1s'

    ## @var energies (numpy.ndarray)
    # kinetic energy referenced to Fermi level.
    #
    # one-dimensional array.

    ## @var thetas (numpy.ndarray)
    # polar angle referenced to normal emission
    #
    # one-dimensional array.
    #
    # note: in the case of a hemispherical scan, the values in this array will not be unique.

    ## @var phis (numpy.ndarray)
    # azimuthal angle referenced to arbitrary origin
    #
    # one-dimensional array.
    #
    # note: in the case of a hemispherical scan, the values in this array will not be unique, and not monotonic.

    ## @var alphas (numpy.ndarray)
    # polar angle referenced to normal emission
    #
    # one-dimensional array.

    def __init__(self):
        self.filename = ""
        self.raw_data = None
        self.dtype = None
        self.modulation = None
        self.mode = []
        self.emitter = ""
        self.initial_state = "1s"
        self.positions = {
            'e': np.empty(0),
            't': np.empty(0),
            'p': np.empty(0),
            'a': np.empty(0),
        }

    @property
    def energies(self):
        return self.positions['e']

    @energies.setter
    def energies(self, value):
        self.positions['e'] = value

    @property
    def thetas(self):
        return self.positions['t']

    @thetas.setter
    def thetas(self, value):
        self.positions['t'] = value

    @property
    def phis(self):
        return self.positions['p']

    @phis.setter
    def phis(self, value):
        self.positions['p'] = value

    @property
    def alphas(self):
        return self.positions['a']

    @alphas.setter
    def alphas(self, value):
        self.positions['a'] = value

    def copy(self):
        """
        create a copy of the scan.

        @return: new independent scan object with the same attributes as the original one.
        """
        return copy.deepcopy(self)

    def import_scan_file(self, filename, emitter, initial_state):
        """
        import the reference experiment.

        the extension must be one of msc_data.DATATYPES (case insensitive)
        corresponding to the meaning of the columns in the file.

        this method does not calculate the modulation function.

        @attention EDAC can only calculate equidistant, rectangular scans.
        holo scans are transparently mapped to rectangular scans by pmsco.
        this method accepts the following scans:

        * intensity vs energy at fixed theta, phi
        * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
        * intensity vs theta, phi, or alpha
        * holo scan (theta,phi)

        @param filename: (string) file name of the experimental data, possibly including a path.

        @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

        @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

        """
        self.filename = filename
        self.emitter = emitter
        self.initial_state = initial_state

        if self.filename:
            self.raw_data = md.load_data(self.filename)
            self.dtype = self.raw_data.dtype
            self.mode, self.positions = md.detect_scan_mode(self.raw_data)

            if 'e' not in self.mode:
                try:
                    self.energies = np.asarray((self.raw_data['e'][0], ))
                except ValueError:
                    logger.error("missing energy in scan file %s", self.filename)
                    raise

            if 't' not in self.mode:
                try:
                    self.thetas = np.asarray((self.raw_data['t'][0], ))
                except ValueError:
                    logger.info("missing theta in scan file %s, defaulting to 0.0", self.filename)
                    self.thetas = np.zeros(1)

            if 'p' not in self.mode:
                try:
                    self.phis = np.asarray((self.raw_data['p'][0], ))
                except ValueError:
                    logger.info("missing phi in scan file %s, defaulting to 0.0", self.filename)
                    self.phis = np.zeros(1)

            if 'a' not in self.mode:
                try:
                    self.alphas = np.asarray((self.raw_data['a'][0], ))
                except ValueError:
                    logger.info("missing alpha in scan file %s, defaulting to 0.0", self.filename)
                    self.alphas = np.zeros(1)

    def define_scan(self, positions, emitter, initial_state):
        """
        define a cartesian (rectangular/grid) scan.

        this method initializes the scan with a one- or two-dimensional cartesian scan
        of the four possible scan dimensions.
        the scan range is given as arguments, the intensity values are initialized as 1.
        the file name and modulation functions are reset to empty and None, respectively.

        the method can create the following scan schemes:

        * intensity vs energy at fixed theta, phi
        * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
        * intensity vs theta, phi, or alpha
        * intensity vs theta and phi (rectangular holo scan)

        @param positions: (dictionary of numpy arrays)
            the dictionary must contain a one-dimensional array for each scan dimension 'e', 't', 'p' and 'a'.
            these array must contain unique, equidistant positions.
            constant dimensions must contain exactly one value.
            missing angle dimensions default to 0,
            a missing energy dimension results in a KeyError.

        @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

        @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

        """
        self.filename = ""
        self.emitter = emitter
        self.initial_state = initial_state
        self.mode = []
        shape = 1

        try:
            self.energies = np.copy(positions['e'])
        except KeyError:
            logger.error("missing energy in define_scan arguments")
            raise
        else:
            if self.energies.shape[0] > 1:
                self.mode.append('e')
                shape *= self.energies.shape[0]

        try:
            self.thetas = np.copy(positions['t'])
        except KeyError:
            logger.info("missing theta in define_scan arguments, defaulting to 0.0")
            self.thetas = np.zeros(1)
        else:
            if self.thetas.shape[0] > 1:
                self.mode.append('t')
                shape *= self.thetas.shape[0]

        try:
            self.phis = np.copy(positions['p'])
        except KeyError:
            logger.info("missing phi in define_scan arguments, defaulting to 0.0")
            self.phis = np.zeros(1)
        else:
            if self.phis.shape[0] > 1:
                self.mode.append('p')
                shape *= self.phis.shape[0]

        try:
            self.alphas = np.copy(positions['a'])
        except KeyError:
            logger.info("missing alpha in define_scan arguments, defaulting to 0.0")
            self.alphas = np.zeros(1)
        else:
            if self.alphas.shape[0] > 1:
                self.mode.append('a')
                shape *= self.alphas.shape[0]

        assert 0 < len(self.mode) <= 2, "unacceptable number of dimensions in define_scan"
        assert not ('t' in self.mode and 'a' in self.mode), "unacceptable combination of dimensions in define_scan"

        self.dtype = md.DTYPE_ETPAI
        self.raw_data = np.zeros(shape, self.dtype)
        dimensions = [self.positions[dim] for dim in ['e', 't', 'p', 'a']]
        grid = np.meshgrid(*dimensions)
        for i, dim in enumerate(['e', 't', 'p', 'a']):
            self.raw_data[dim] = grid[i].reshape(-1)
        self.raw_data['i'] = 1


# noinspection PyMethodMayBeStatic
class Project(object):
    """
    base class of a calculation project.

    a 'calculation project' is a coded set of prescriptions
    on how to get from a set of model parameters to simulated data
    which correspond to provided experimental data.
    the results include a measure of the quality of the simulated data compared to experimental data.

    each calculation project must derive from this class.
    it must implement the create_model_space(), create_cluster(), and create_params() methods.

    the other methods and attributes of this class
    are for passing command line parameters to the calculation modules.
    the attributes should be populated in the constructor of the derived class,
    or (recommended) in the create_project() function of the module.
    it is essential that the attributes are set correctly before calculation.
    """

    ## @var features (dictionary)
    #
    # calculation features and versions supported by the project.
    #
    # the dictionary contains key-value pairs where the key is the name of the feature and value is a version number.
    # this field conditionally enables new software features that may break backward compatibility.
    # derived projects should fill this field with the supported version
    # upon creation (in their __init__ method or create_project() factory).
    # version 0 (default) means that the feature is disabled.
    #
    # the following features can be enabled (list may be incomplete):
    # as of this version, no optional features are defined.
    #
    # @note rather than introducing new features and, particularly, new versions that rely on this mechanism,
    # developers of generic code should check whether backward compatibility could be achieved in a simpler way,
    # e.g. by implementing addition methods whose default behaviour is the same as of the previous version.
    # in some cases it may be better to refactor all current project code.
    #

    ## @var scans (list of Scan objects)
    #  list of experimental or scan files for which calculations are to be run.
    #
    #  the list must be populated by calling the add_scan() method.
    #  this should be done in the create_project() function, or through the command line arguments.
    #
    #  the modulation function is calculated internally.
    #  if your scan files contain the modulation function (as opposed to intensity),
    #  you must add the files in the create_project() function.
    #  the command line does not support loading modulation functions.
    #
    #  @c scans must be considered read-only. use project methods to change it.

    ## @var domains (list of arbitrary objects)
    #  list of domains for which calculations are to be run.
    #
    # it is up to the derived class what kind of objects are stored in the list.
    # the recommended kind of objects are dictionaries which hold parameter values,
    # similar to the model dictionaries.
    #
    # the list must be populated by calling the add_domain() method.

    ## @var cluster_generator (ClusterGenerator object)
    #  provides the cluster generator methods.
    #
    # a project must provide a cluster generator object that is derived from ClusterGenerator.
    # at least the ClusterGenerator.create_cluster method must be implemented.
    # if emitters should be run in parallel, the ClusterGenerator.count_emitters must be implemented as well.
    #
    # the initial value is a LegacyClusterGenerator object
    # which routes cluster calls back to the project for compatibility with older project code.

    ## @var optimizer_params (dict)
    # optional parameters of the model optimizer.
    #
    # this is a dictionary that can have (among others) the following values.
    # for a detailed list, see the documentation of the respective model handler.
    #
    # @arg @c 'pop_size' (int)
    #   population size (number of particles) in the swarm or genetic optimization mode.
    #   by default, the population size is set to the number of parallel processes or 4, whichever is greater.
    #   you may want to override the default value in cases where the automatic choice is not appropriate.
    # @arg @c 'seed_file' (string)
    #   name of a file containing the results from previous optimization runs.
    #   this can be used to resume a swarm or genetic optimization where it was interrupted before.
    #   the seed file is a space-delimited, multi-column, text file,
    #   e.g., the output file of a previous optimization.
    #   by default, no seed is loaded.
    # @arg @c 'recalc_seed' (bool)
    #   select whether the R-factors of the seed models are calculated again.
    #   set this argument to False only if the calculation is a continuation of a previous one
    #   without any changes to the code.

    ## @var data_dir
    # directory path to experimental data.
    #
    # the project should load experimental data (scan files) from this path.
    # this attribute receives the --data-dir argument from the command line
    # if the project parses the common arguments (pmsco.set_common_args).
    #
    # it is up to the project to define where to load scan files from.
    # if the location of the files may depend on the machine or user account,
    # the user may want to specify the data path on the command line.

    ## @var output_dir (string)
    # directory path for data files produced during the calculation, including intermediate files.
    #
    # output_dir and output_file are set at once by @ref set_output.

    ## @var output_file (string)
    # file name root for data files produced during the calculation, including intermediate files.
    #
    # the file name should include the path. the path must also be set in @ref output_dir.
    #
    # output_dir and output_file are set at once by @ref set_output.

    ## @var db_file (string)
    # name of an sqlite3 database file where the calculation results should be stored.
    #
    # the default value is ':memory:', which creates a volatile in-memory database.

    ## @var timedelta_limit (datetime.timedelta)
    # wall time after which no new calculations should be started.
    #
    # the actual wall time may be longer by the remaining time of running calculations.
    # running calculations will not be aborted.

    ## @var combined_scan
    # combined raw data from scans.
    # updated by add_scan().

    ## @var combined_modf
    # combined modulation function from scans.
    # updated by add_scan().

    ## @var files
    # list of all generated data files with metadata.
    # the list is used by model handlers to decide which files can be deleted at run time to save disk space.
    #
    # files.categories_to_delete determines which files can be deleted.

    ## @var keep_best
    # number of best models for which result files should be kept.
    #
    # this attribute determines how many models are kept based on R-factor ranking at each node of the task tree
    # (up to keep_levels).

    ## @var keep_levels
    # numeric task level down to which R-factors are considered when model files are cleaned up.
    #
    # @arg 0 = model level: combined results only.
    # @arg 1 = scan level: scan nodes in addition to combined results (level 0).
    # @arg 2 = domain level: domain nodes in addition to level 1.
    # @arg 3 = emitter level: emitter nodes in addition to level 1.
    # @arg 4 = region level: region nodes in addition to level 1.

    ## @var atomic_scattering_factory
    # factory function to create an atomic scattering calculator
    #
    # this can also be the name of a class.
    # the calculator must inherit from pmsco.calculators.calculator.AtomicCalculator.
    # the name of atomic scattering calculator classes should end in AtomicCalculator.

    ## @var multiple_scattering_factory
    # factory function to create a multiple scattering calculator
    #
    # this can also be the name of a class.
    # the calculator must inherit from pmsco.calculators.calculator.Calculator
    #
    # example: pmsco.calculators.edac.EdacCalculator
    #

    def __init__(self):
        self.mode = "single"
        self.job_name = ""
        self.job_tags = {}
        self.git_hash = ""
        self.description = ""
        self.features = {}
        self.cluster_format = mc.FMT_EDAC
        self.cluster_generator = mc.LegacyClusterGenerator(self)
        self.scans = []
        self.domains = []
        self.optimizer_params = {
            'pop_size': 0,
            'seed_file': "",
            'seed_limit': 0,
            'recalc_seed': True,
            'table_file': ""
        }
        self.data_dir = ""
        self.output_dir = ""
        self.output_file = "pmsco_data"
        self.db_file = ':memory:'
        self.timedelta_limit = datetime.timedelta(days=1)
        self.combined_scan = None
        self.combined_modf = None
        self.files = files.FileTracker()
        self.keep_levels = 1
        self.keep_best = 10
        self.handler_classes = {
            'model': handlers.SingleModelHandler,
            'scan': handlers.ScanHandler,
            'domain': handlers.DomainHandler,
            'emit': handlers.EmitterHandler,
            'region': handlers.SingleRegionHandler
        }
        self.atomic_scattering_factory = InternalAtomicCalculator
        self.multiple_scattering_factory = EdacCalculator
        self._tasks_fields = []
        self._db = database.ResultsDatabase()

    def create_model_space(self):
        """
        create a project.ModelSpace object which defines the allowed range for model parameters.

        this method must be implemented by the actual project class.
        the ModelSpace object must declare all model parameters used in the project.

        @return ModelSpace object
        """
        return None

    def create_params(self, model, index):
        """
        create a CalculatorParams object given the model parameters and calculation index.

        @param model (dictionary) model parameters to be used in the calculation.

        @param index (named tuple CalcID) calculation index.
            the method should consider only the following attributes:
            @arg `scan`   scan index (index into Project.scans)
            @arg `domain`    domain index (index into Project.domains)
        """
        return None

    def clear_scans(self):
        """
        clear scans.

        delete all scans in self.scans and empty the list.

        @return: None
        """
        self.scans = []
        self.combined_scan = None
        self.combined_modf = None

    def add_scan(self, filename, emitter, initial_state, is_modf=False, modf_model=None, positions=None):
        """
        add the file name of reference experiment and load it.

        the extension must be one of msc_data.DATATYPES (case insensitive)
        corresponding to the meaning of the columns in the file.

        caution: EDAC can only calculate equidistant, rectangular scans.
        the following scans are currently supported:

        * intensity vs energy at fixed theta, phi
        * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
        * intensity vs theta, phi, or alpha
        * intensity vs theta and phi (hemisphere or hologram scan)

        the method calculates the modulation function if @c is_modf is @c False.
        it also updates @c combined_scan and @c combined_modf which may be used as R-factor comparison targets.

        @param filename: (string) file name of the experimental data, possibly including a path.
            the file is not loaded when the optional positions argument is present,
            but the filename may serve as basename for output files (e.g. modulation function).

        @param positions: (optional, dictionary of numpy arrays) scan positions.
            if specified, the file given by filename is _not_ loaded,
            and the scan positions are initialized from this dictionary.
            the dictionary keys are the possible scan dimensions: 'e', 't', 'p', 'a'.
            the arrays are one-dimensional and contain unique, equidistant positions.
            constant dimensions have shape 1. see @ref Scan.define_scan.

        @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

        @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

        @param is_modf: (bool) declares whether the file contains the modulation function (True),
            or intensity (False, default). In the latter case, the modulation function is calculated internally.

        @param modf_model: (dict) model parameters to be passed to the modulation function.

        @return (Scan) the new scan object (which is also a member of self.scans).
        """
        scan = Scan()
        if positions is not None:
            scan.define_scan(positions, emitter, initial_state)
            scan.filename = filename
        else:
            scan.import_scan_file(filename, emitter, initial_state)
        self.scans.append(scan)

        if modf_model is None:
            modf_model = {}

        if scan.raw_data is not None:
            if is_modf:
                scan.modulation = scan.raw_data
            else:
                try:
                    scan.modulation = self.calc_modulation(scan.raw_data, modf_model)
                except ValueError:
                    logger.error("error calculating the modulation function of experimental data.")
                    scan.modulation = None
        else:
            scan.modulation = None

        if scan.raw_data is not None:
            if self.combined_scan is not None:
                dt = md.common_dtype((self.combined_scan, scan.raw_data))
                d1 = md.restructure_data(self.combined_scan, dt)
                d2 = md.restructure_data(scan.raw_data, dt)
                self.combined_scan = np.hstack((d1, d2))
            else:
                self.combined_scan = scan.raw_data.copy()
        else:
            self.combined_scan = None

        if scan.modulation is not None:
            if self.combined_modf is not None:
                dt = md.common_dtype((self.combined_modf, scan.modulation))
                d1 = md.restructure_data(self.combined_modf, dt)
                d2 = md.restructure_data(scan.modulation, dt)
                self.combined_modf = np.hstack((d1, d2))
            else:
                self.combined_modf = scan.modulation.copy()
        else:
            self.combined_modf = None

        return scan

    def clear_domains(self):
        """
        clear domains.

        delete all domains in self.domains and empty the list.

        @return: None
        """
        self.domains = []

    def add_domain(self, domain):
        """
        add a domain to the list of domains.

        this class declares the list of domains.
        it does not define what should be in the list of domains.
        however, there must be an entry for each domain to be calculated.
        if the list is empty, no calculation will be executed.

        @attention initially, the domains list is empty.
            your project needs to add at least one domain.
            otherwise, no calculation will be executed.

        @param domain: it is up to the derived project class to specify and interpret the data stored here.
            it is recommended to store a dictionary with domain parameters similar to the model parameters.

        @return: None
        """
        self.domains.append(domain)

    def set_output(self, filename):
        """
        set path and base name of output file.

        path and name are copied to the output_file attribute.
        path is copied to the output_dir attribute.

        if the path is missing, the destination is the current working directory.
        """
        self.output_file = filename
        path, name = os.path.split(filename)
        self.output_dir = path
        self.job_name = name

    def set_timedelta_limit(self, timedelta, margin_minutes=10):
        """
        set the walltime limit with a safety margin.

        this method sets the internal self.timedelta_limit attribute.
        by default, a safety margin of 10 minutes is subtracted to the main argument
        in order to increase the probability that the process ends in time.
        if this is not wanted, the project class may override the method and provide its own margin.

        the method is typically called with the command line time limit from the main module.

        @note the safety margin could be applied at various levels.
        it is done here because it can easily be overridden by the project subclass.
        to keep run scripts simple, the command line can be given the same time limit
        as the job scheduler of the computing cluster.

        @param timedelta: (datetime.timedelta) max. duration of the calculation process (wall time).

        @param margin_minutes: (int) safety margin in minutes to subtract from timedelta.
        """
        self.timedelta_limit = timedelta - datetime.timedelta(minutes=margin_minutes)

    def log_project_args(self):
        """
        send some common project attributes to the log.

        the attributes are normally logged at WARNING level.

        this method is called by the main pmsco module after creating the project and assigning command line arguments.
        it may be overridden to add logs of attributes of the sub-class.

        @return: None
        """
        try:
            logger.warning("atomic scattering: {0}".format(self.atomic_scattering_factory))
            logger.warning("multiple scattering: {0}".format(self.multiple_scattering_factory))
            logger.warning("optimization mode: {0}".format(self.mode))

            for key in sorted(self.optimizer_params):
                val = self.optimizer_params[key]
                lev = logging.WARNING if val else logging.DEBUG
                logger.log(lev, "optimizer_params['{k}']: {v}".format(k=key, v=val))

            logger.warning("data directory: {0}".format(self.data_dir))
            logger.warning("output file: {0}".format(self.output_file))
            logger.warning("database: {0}".format(self.db_file))

            _files_to_keep = files.FILE_CATEGORIES - self.files.categories_to_delete
            logger.warning("intermediate files to keep: {0}".format(", ".join(_files_to_keep)))

            for idx, scan in enumerate(self.scans):
                logger.warning(f"scan {idx}: {scan.filename} ({scan.emitter} {scan.initial_state}")
            for idx, dom in enumerate(self.domains):
                logger.warning(f"domain {idx}: {dom}")

        except AttributeError:
            logger.warning("AttributeError in log_project_args")

    def combine_domains(self, parent_task, child_tasks):
        """
        combine results of different domain into one result and calculate the modulation function.

        the domain results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        by default, this method adds all domains with equal weight.
        weights can be defined in the model dictionary with keys 'wdom0', 'wdom1', etc.
        missing weights default to 1.
        to avoid correlated parameters, one domain must always have a fixed weight.
        it is recommended to leave 'wdom0' at its default.

        @param parent_task: (CalculationTask) parent task of the domain tasks.
            the method must write the results to the files indicated
            by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each domain.
            the method must read the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by domain index.

        @return: None

        @raise IndexError if child_tasks is empty

        @raise IOError if a filename is missing

        @note the weights of the domains (in derived classes) can be part of the optimizable model parameters.
            the model parameters are available as the @c model attribute of the calculation tasks.
        """

        result_data = None
        sum_weights = 0.
        for task in child_tasks:
            data = md.load_data(task.result_filename)
            if result_data is None:
                result_data = data.copy()
                result_data['i'] = 0.
            try:
                weight = task.model['wdom{}'.format(task.id.domain)]
            except KeyError:
                weight = 1.
            result_data['i'] += weight * data['i']
            sum_weights += weight
        result_data['i'] /= sum_weights

        md.save_data(parent_task.result_filename, result_data)

        # todo : the handling of missing modulation functions may need some cleanup
        if self.scans[parent_task.id.scan].modulation is not None:
            result_modf = self.calc_modulation(result_data, parent_task.model)
            md.save_data(parent_task.modf_filename, result_modf)
        else:
            parent_task.modf_filename = ""

    def combine_emitters(self, parent_task, child_tasks):
        """
        combine results of different emitters into one result. calculate the modulation function.

        the emitter results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        by default, this method adds all emitters with equal weight.

        sub-classes may override this method and implement expansion of equivalent emitters,
        unequal weights, etc.

        @param parent_task: (CalculationTask) parent task of the emitter tasks.
            the method must write the results to the files indicated
            by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each emitter.
            the method must read the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by the emitter index.

        @return: None

        @raise IndexError if child_tasks is empty

        @raise IOError if a filename is missing

        @note the weights of the emitters (in derived classes) can be part of the optimizable model parameters.
            the model parameters are available as the @c model attribute of the calculation tasks.
        """

        result_data = None
        for task in child_tasks:
            data = md.load_data(task.result_filename)
            if result_data is not None:
                result_data['i'] += data['i']
            else:
                result_data = data

        md.save_data(parent_task.result_filename, result_data)

        # todo : the handling of missing modulation functions may need some cleanup
        if self.scans[parent_task.id.scan].modulation is not None:
            result_modf = self.calc_modulation(result_data, parent_task.model)
            md.save_data(parent_task.modf_filename, result_modf)
        else:
            parent_task.modf_filename = ""

    def combine_scans(self, parent_task, child_tasks):
        """
        combine results of different scans into one result, for intensity and modulation.

        the scan results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        the datasets of the scans are appended.
        this is done for intensity and modulation data independently.

        @param parent_task: (CalculationTask) parent task of the scan tasks.
            the method must write the results to the files indicated
            by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
            the method must read the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by scan index.

        @return: None

        @raise IndexError if child_tasks is empty.
        """

        # intensity
        try:
            stack1 = [md.load_data(task.result_filename) for task in child_tasks]
        except IOError:
            parent_task.result_filename = ""
        else:
            dtype = md.common_dtype(stack1)
            stack2 = [md.restructure_data(data, dtype) for data in stack1]
            result_data = np.hstack(tuple(stack2))
            md.save_data(parent_task.result_filename, result_data)

        # modulation
        try:
            stack1 = [md.load_data(task.modf_filename) for task in child_tasks]
        except IOError:
            parent_task.modf_filename = ""
        else:
            dtype = md.common_dtype(stack1)
            stack2 = [md.restructure_data(data, dtype) for data in stack1]
            result_modf = np.hstack(tuple(stack2))
            md.save_data(parent_task.modf_filename, result_modf)

    def combine_regions(self, parent_task, child_tasks):
        """
        combine results from different regions into one result, for intensity and modulation.

        the scan results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        the datasets of the regions are appended and sorted in the standard order of the data module.
        if the resulting length differs from the corresponding experimental scan,
        an error is printed to the logger, but the calculation continues.

        the modulation function is calculated by calling @ref calc_modulation.

        @param parent_task: (CalculationTask) parent task of the region tasks.
            the method writes the results to the file names
            given by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each region.
            the reads the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by region index.

        @return: None

        @raise IndexError if child_tasks is empty.
        """
        # intensity
        try:
            stack1 = [md.load_data(task.result_filename) for task in child_tasks]
        except IOError:
            parent_task.result_valid = False
            parent_task.result_filename = ""
        else:
            dtype = md.common_dtype(stack1)
            stack2 = [md.restructure_data(data, dtype) for data in stack1]
            result_data = np.hstack(tuple(stack2))
            md.sort_data(result_data)
            md.save_data(parent_task.result_filename, result_data)

            scan = self.scans[parent_task.id.scan]
            if result_data.shape[0] != scan.raw_data.shape[0]:
                logger.error(BMsg("scan length mismatch: combined result: {result}, experimental data: {expected}",
                                  result=result_data.shape[0], expected=scan.raw_data.shape[0]))

        # modulation
        try:
            data = md.load_data(parent_task.result_filename)
            modf = self.calc_modulation(data, parent_task.model)
        except IOError:
            parent_task.modf_filename = ""
        else:
            md.save_data(parent_task.modf_filename, modf)

    def get_git_hash(self):
        """
        get the git commit (hash) of the running code (HEAD)

        the method looks for a git repository in the source tree of this module.
        if successful, it returns the hash string of the HEAD commit.

        @return: hexadecimal hash string.
            empty string if the file is not in a git repository.
        """
        try:
            repo = git.Repo(__file__, search_parent_directories=True)
        except git.exc.InvalidGitRepositoryError:
            return ""
        else:
            return repo.head.commit.hexsha

    def setup(self, handlers):
        """
        prepare for calculations.

        this method is called in the master process before starting the task loop.
        at this point the task handlers have been created and set up.
        if the project needs to change settings of task handlers it can do so in this method.

        this instance writes the header of the tasks.dat file
        that will receive sub-task evaluation results from the evaluate_result() method.

        it also initializes the database where the task results will be stored.
        this is either a volatile in-memory database or a user-specified sqlite3 database file.

        @param handlers: dictionary listing the initialized task handler instances.
            the dictionary keys are the attribute names of pmsco.dispatch.CalcID:
            'model', 'scan', 'domain', 'emit' and 'region'.

        @return: None
        """
        self.git_hash = self.get_git_hash()
        fields = ["rfac"]
        fields.extend(dispatch.CalcID._fields)
        fields.append("secs")
        fields = ["_" + f for f in fields]
        mspace = self.create_model_space()
        model_fields = list(mspace.start.keys())
        model_fields.sort(key=lambda name: name.lower())
        fields.extend(model_fields)
        self._tasks_fields = fields

        with open(self.output_file + ".tasks.dat", "w") as outfile:
            outfile.write("# ")
            outfile.write(" ".join(fields))
            outfile.write("\n")

        self._db.connect(self.db_file)
        project_name = self.__class__.__name__
        project_module = self.__class__.__module__
        project_id = self._db.register_project(project_name, project_module)
        job_id = self._db.register_job(project_id,
                                       self.job_name,
                                       self.mode,
                                       socket.gethostname(),
                                       self.git_hash,
                                       datetime.datetime.now(),
                                       self.description)
        logger.debug(BMsg("database {db_file}, project {proj}, job {job}",
                          db_file=self.db_file, proj=project_id, job=job_id))
        self._db.insert_jobtags(job_id, self.job_tags)
        self._db.register_params(model_fields)
        self._db.create_models_view()

    def evaluate_result(self, parent_task, child_tasks):
        """
        evaluate the result of a calculation task.

        this method is called from the add_result of the task handlers at each level.
        it gives the project a hook to check the progress of a model at any level of the task tree.

        the method calculates the r-factor by calling the Project.calc_rfactor method.
        the result is written to the task.rfac field and to the .tasks.dat file.
        invalid and region-level results are skipped.

        this method is called in the master process only.

        @param parent_task: (CalculationTask) a calculation task.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
            the sequence must be sorted by task ID.

        @return: None
        """
        if parent_task.result_valid and parent_task.id.region == -1:
            try:
                parent_task.rfac = self.calc_rfactor(parent_task, child_tasks)
            except ValueError:
                parent_task.result_valid = False
                logger.warning(BMsg("calculation {0} resulted in an undefined R-factor.", parent_task.id))
            else:
                values_dict = parent_task.id._asdict()
                values_dict = {"_" + k: v for k, v in values_dict.items()}
                values_dict.update(parent_task.model)
                values_dict['_rfac'] = parent_task.rfac
                values_dict['_secs'] = parent_task.time.total_seconds()
                values_list = [values_dict[field] for field in self._tasks_fields]
                with open(self.output_file + ".tasks.dat", "a") as outfile:
                    outfile.write(" ".join(format(value) for value in values_list) + "\n")

                db_id = self._db.insert_result(parent_task.id, values_dict)
                logger.debug(BMsg("model {model}, database result {db_id}", model=parent_task.id.model, db_id=db_id))

        return None

    # noinspection PyUnusedLocal
    def calc_modulation(self, data, model):
        """
        calculate the project-dependent modulation function.

        the modulation function of I(x) is (I(x) - S(x)) / S(x)
        where S(x) is a smooth copy of I(x).

        by default, the modulation function is calculated by data.calc_modfunc_loess().
        override this method in your project to use a different modulation function.

        @param data structured numpy.ndarray in EI, ETPI, or ETPAI format.
            can contain a one- or multi-dimensional scan.
            the scan coordinates must be on a rectangular or hemisperical grid.
            for maximum compatibility, the array should be sorted,
            though for the default calc_modfunc_loess() function this is not required.

            if data contains a hemispherical scan, the phi dimension is ignored,
            i.e. the modulation function is calcualted on a phi-average.

        @param model: (dict) model parameters of the calculation task.
            can be used to pass parameters from the project.
            this argument is a dictionary of the model parameters.

        @return copy of the data array with the modulation function in the 'i' column.
        """

        return md.calc_modfunc_loess(data)

    def calc_rfactor(self, parent_task, child_tasks):
        """
        calculate the r-factor of a task.

        the r-factor is calculated on the experimental and simulated modulation functions.
        the algorithm differs for the model level and the lower task levels.
        at the model level, the calculation is delegated to Project.combine_rfactors.
        at all other levels, the r-factor is calculated by Project.rfactor,
        where the simulated is loaded from the file specified by parent_task.modf_filename
        and the experimental data from Project.scan.

        this method is called by the task handlers.
        all child tasks belonging to the parent task must be complete.

        to select or implement a specific R-factor algorithm,
        the project sub-class should override Project.rfactor.
        to combine scan r-factors, it should override or patch Project.combine_rfactors.

        @version in earlier versions,
        projects had to override this method to implement their algorithm.
        this has lead to duplication of common code.
        the r-factor algorithm is now distributed over several methods,
        and the method signature has changed.
        new projects should override Project.rfactor and/or Project.combine_rfactors.

        @param parent_task: (CalculationTask) a calculation task.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
            the sequence must be sorted by task ID.

        @return (float) calculated R-factor.

        @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
        """
        if parent_task.id.scan >= 0:
            task_data = md.load_data(parent_task.modf_filename)
            exp_data = self.scans[parent_task.id.scan].modulation
            result_r = self.rfactor(exp_data, task_data)
        else:
            result_r = self.combine_rfactors(parent_task, child_tasks)

        return result_r

    def rfactor(self, exp_data, theo_data):
        """
        calculate the r-factor of simulated diffraction data.

        in this class, the method calls the data.rfactor function to calculate the r-factor.
        override this method in your project to use a different R-factor algorithm.

        the input arrays must have the same shape,
        and the coordinate columns must be identical (they are ignored, however).
        the array elements are compared element-by-element.
        terms having NaN intensity are ignored.

        if the sigma column is present in experiment and non-zero,
        the R-factor terms are weighted.

        @param exp_data: (numpy structured array)
            ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
            if an @c s field is present and non-zero,
            the R-factor terms are weighted by 1/sigma**2.

        @param theo_data: (numpy structured array)
            ETPI or ETPAI array containing the calculated modulation functions.

        @return: (float) scalar R-factor

        @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
        """
        return md.rfactor(exp_data, theo_data)

    def opt_rfactor(self, exp_data, theo_data):
        """
        calculate the r-factor of simulated diffraction data, adjusting their amplitude.

        this is an alternative r-factor calculation algorithm
        using the pmsco.data.optimize_rfactor() function.

        to activate this method (replacing the default one), assign it to Project.rfactor
        in the overriding __init__ or setup method:
        @code{.py}
        self.rfactor = self.opt_rfactor
        @endcode

        @param exp_data: (numpy structured array)
            ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
            if an @c s field is present and non-zero,
            the R-factor terms are weighted by 1/sigma**2.

        @param theo_data: (numpy structured array)
            ETPI or ETPAI array containing the calculated modulation functions.

        @return: (float) scalar R-factor

        @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
        """
        return md.optimize_rfactor(exp_data, theo_data)

    def combine_rfactors(self, parent_task, child_tasks):
        """
        combine r-factors of child tasks.

        the r-factors are taken from the rfac attribute of the child_tasks.
        the result is an average of the child r-rfactors.

        to produce a balanced result, every child dataset must contain a similar amount of information.
        if this is not the case, the child r-factors must be weighted.
        weighting is currently not implemented but may be introduced in a future version.

        the method is intended to be used at the model level (children are scans).
        though it can technically be used at any level where child r-factors are available.

        @param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
            i.e. a model task.

        @param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
            that may be consulted for calculating the r-factor.

        @return: (float) r-factor, NaN if parent task is invalid

        @raise ValueError or IndexError if child_tasks is empty.
        """
        if parent_task.result_valid:
            rsum = 0.
            for task in child_tasks:
                rsum += task.rfac
            return rsum / len(child_tasks)
        else:
            return float('nan')

    def alt_combine_rfactors(self, parent_task, child_tasks):
        """
        combine r-factors of child tasks by explicit calculation on the combined result.

        this is an alternative implementation of combine_rfactors.
        instead of using the r-factors from child tasks,
        it re-calculates the r-factor for the combined dataset.
        this method avoids the issue of weighting
        but can introduce bias if the amplitudes of the child datasets differ substantially.

        the experimental dataset is loaded from the file specified by the parent task,
        the corresponding experimental data is taken from self.combined_modf.

        to activate this method, assign it to combine_rfactors.
        in the overriding __init__ or setup method:
        @code{.py}
        self.combine_rfactors = self.alt_combine_rfactors
        @endcode

        @param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
            i.e. a model task.

        @param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
            that may be consulted for calculating the r-factor.

        @return: (float) r-factor, NaN if parent task is invalid
        """
        if parent_task.result_valid:
            task_data = md.load_data(parent_task.modf_filename)
            exp_data = self.combined_modf
            return self.rfactor(exp_data, task_data)
        else:
            return float('nan')

    def export_cluster(self, index, filename, cluster):
        """
        export the cluster of a calculation task in XYZ format for diagnostics and reporting.

        this method is called with the final cluster just before it is handed over to the calculator.
        it saves the atom coordinates in XYZ format for future reference (e.g. graphics).

        the method creates two files:
        @arg a file with extension '.xyz' contains the whole cluster in XYZ format.
        @arg a file with extension '.emit.xyz' contains only emitter atoms in XYZ format.

        the first part of the file name is formatted with the output name and the complete task identification.
        the file is registered with the file tracker in the 'cluster' category
        so that it will be deleted unless the cluster category is selected for keeping.

        derived project class may override or extend this method
        to carry out further diagnostics or reporting on the cluster.

        @param index: (CalcID) calculation index to which the cluster belongs.
            region may be -1 if only one cluster is exported for all regions
            (clusters do not depend on the scan region).
            emit may be -1 if the cluster is a master from which emitter-related child clusters are derived.

        @param filename: (str) base file name for the output files.
            the filename should be formatted using pmsco.dispatch.CalculationTask.format_filename().
            extensions are appended by this method.

        @param cluster: a pmsco.cluster.Cluster() object with all atom positions and emitters.

        @return: dictionary listing the names of the created files with their category.
                 the dictionary key is the file name,
                 the value is the file category (cluster).
        """
        _files = {}
        xyz_filename = filename + ".xyz"
        cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ)
        _files[xyz_filename] = 'cluster'

        xyz_filename = filename + ".emit.xyz"
        cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ, emitters_only=True)
        _files[xyz_filename] = 'cluster'

        return _files

    def before_atomic_scattering(self, task, par, clu):
        """
        project hook before atomic scattering factors are calculated.

        this method derives modified CalculatorParams and Cluster objects for the atomic scattering calculation
        from the original objects that will be used in the multiple scattering calculation.

        in the basic version, the method does not change the objects
        except that it returns None for the root task (reference cluster).
        subclasses may override it to modify or replace the cluster.

        @param task: @ref pmsco.dispatch.CalculationTask object representing the current calculation task.
            if the model index is -1, the project can return the global reference cluster
            (to calculate the fixed scattering factors that will be used for all models)
            or None if no global scattering factors should be calculated.
            do not modify this object!

        @param par: @ref pmsco.project.CalculatorParams object representing the preliminary
            multiple scattering input parameters of the current task.
            the method can make modifications to this object instance directly.

        @param clu: @ref pmsco.cluster.Cluster object representing the preliminary
            multiple scattering cluster of the current task.
            the method can make modifications to this object instance directly.

        @return: a tuple (par, clu) where par and clu are the input parameters and cluster
            to be used for the calculation of atomic scattering factors.
            these should either be the original function arguments,
            or copies of the original arguments.
            if atomic scattering factors should not be calculated, the return values should be None.
        """
        if task.id.model >= 0:
            return par, clu
        else:
            return None, None

    def after_atomic_scattering(self, task, par, clu):
        """
        project hook after atomic scattering factors are calculated.

        this method cleans up the CalculatorParams and Cluster objects from the atomic scattering calculation
        so that they can be used in the multiple scattering calculation.

        in the basic version, the method just passes the input parameters for model tasks
        and returns None for the root task.
        subclasses may override it and modify the cluster and/or input parameters
        so that the desired atomic scattering factors are used.

        @param task: @ref pmsco.dispatch.CalculationTask object representing the current calculation task.
            if the model index is -1, the project should return the global reference cluster
            (to calculate the fixed scattering factors that will be used for all models)
            or None if no global scattering factors should be calculated.

        @param par: @ref pmsco.project.CalculatorParams object representing the preliminary
            multiple scattering input parameters of the current task.

        @param clu: @ref pmsco.cluster.Cluster object representing the preliminary
            multiple scattering cluster of the current task.
            do not modify this object, make a copy!

        @return: a tuple (par, clu) where par and clu are the input parameters and cluster
            to be used for the calculation of atomic scattering factors.
            these should either be the original function arguments,
            or copies of the original arguments.
        """
        if task.id.model >= 0:
            return par, clu
        else:
            return None, None

    def cleanup(self):
        """
        delete unwanted files at the end of a project and close the database.

        @return: None
        """
        self.cleanup_files(incomplete_models=True)
        self._db.disconnect()

    def cleanup_files(self, keep=0, incomplete_models=False):
        """
        delete uninteresting files (any time).

        delete all files that
        belong to one of the self.files.categories_to_delete categories or
        do not belong to one of the "best" models.

        "best" models are a number (self.keep_best) of models that gave the lowest R-factors
        at each task level from root to self.keep_levels.
        for example if `keep_best = 10` and `keep_levels = 1`
        the 10 best models at the top level, and the 10 best at the scan level are kept.
        this means that in total up to `n = 10 + 10 * n_scans` models may be kept,
        where n_scans is the number of scan files in the job.

        this method can be called at any time during the calculation process.
        it executes on complete models only
        unless incomplete_models is True.

        @param keep: minimum number of best models to keep.
            0 (default): use the project parameter self.keep_best.

        @param incomplete_models: (bool) delete files of incomplete models as well.
            by default (False), incomplete models are not deleted.

        @return None
        """
        self.files.delete_files(incomplete_models=incomplete_models)
        if 'rfac' in self.files.categories_to_delete:
            keep = max(keep, self.keep_best)
            keepers = self._db.query_best_task_models(self.keep_levels, keep)
            self.files.delete_models(keep=keepers)