pmsco-public/pmsco/project.py

"""
@package pmsco.project
project-independent classes which store and handle model parameters.

the most important class defined here is Project.
each calculation project needs to derive its own project class from it.
the Domain and Params classes are typically used unchanged.

@note nomenclature: the term @e parameters has several meanings in the code and documentation.
    the following distinctive terms are used in updated documentation sections.
    ambiguous terms may still be present in older code sections.
@arg <em>calculation parameters</em> set of specific parameters passed as input to the calculation programs.
    the amount and meaning of these parameters depend on the calculation code used.
    typically, many of these parameters remain fixed, or change very rarely in the course of the study.
@arg <em>model parameters</em> concise set of independent physical parameters
    that define the system in one calculation instance.
    these parameters are varied systematically by the optimization process.
    they are mapped to calculation parameters and a cluster by code derived from the Project class.

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

@copyright (c) 2015 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import datetime
import logging
import numpy as np
import os.path
import socket
import sys

import pmsco.cluster as mc
from pmsco.compat import open
import pmsco.data as md
import pmsco.database as database
import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
from pmsco.helpers import BraceMessage as BMsg

logger = logging.getLogger(__name__)

ParamDomain = collections.namedtuple('ParamDomain', ['start', 'min', 'max', 'step'])


class Domain(object):
    """
    Domain of model parameters.

    Each member contains a dictionary of model parameter names and their values.
    Parameter names can be defined almost freely by the project,
    except that they should contain only alphanumeric and underscore characters.
    furthermore, names starting with an underscore are reserved for the optimizers.
    """

    ## @var start (dict)
    # dictionary of start values for each model parameter.
    #
    # the start value can be the initial guess for an optimization run,
    # or the actual value for a single calculation.
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    ## @var min (dict)
    # dictionary of minimum values for each model parameter.
    #
    # the minimum defines the lower bound of the allowed interval for a model parameter.
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    ## @var max (dict)
    # dictionary of maximum values for each model parameter.
    #
    # the maximum defines the upper bound of the allowed interval for a model parameter.
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    ## @var step (dict)
    # dictionary of step sizes for each model parameter.
    #
    # depending on the optimization mode, the step is a guess of how fast values should vary,
    # e.g. step size, gradient, velocity, ...
    #
    # there must be one item for each model parameter,
    # where the key is the name of the parameter, and the value its physical value.

    def __init__(self):
        """
        initialize the domain object with empty dictionaries.
        """
        self.start = {}
        self.min = {}
        self.max = {}
        self.step = {}

    def add_param(self, name, start, min=None, max=None, step=None, width=None):
        """
        set the domain of one parameter with all necessary values at once.

        the exact meaning of the arguments depends on the calculation mode.

        @param name (string) name of the parameter (alphanumeric and underscore characters only).
            it is recommended to use short but distinctive names.

        @param start (float) start value.

        @param min (float) lower bound of the parameter interval.
            must be lower or equal to start.
            if None, the field is set to start.

        @param max (float) upper bound of the parameter interval.
            must be greater or equal to start.
            if None, the field is set to start.

        @param width (float) width of the parameter interval.
            instead of min and max, the interval can be set centered around the start value.
            this is equivalent to min = start - width/2, max = start + width/2.
            this argument overrides min and max. don't use both arguments.

        @param step (float) step size.
            must be greater or equal to zero.
            if None, the field is set to zero.
        """
        self.start[name] = start
        self.min[name] = min if min is not None else start
        self.max[name] = max if max is not None else start
        if width is not None:
            self.min[name] = start - width / 2.
            self.max[name] = start + width / 2.
        self.step[name] = step if step is not None else 0.0

    def get_param(self, name):
        """
        get all values of a model parameter in a tuple.

        @param name (string) name of the parameter.

        @return named tuple ParamDomain(start, min, max, step) of the parameter.

        @raise IndexError if the parameter is not defined.
        """
        return ParamDomain(self.start[name], self.min[name], self.max[name], self.step[name])


class Params(object):
    """
    calculation parameters for a single scattering calculation job.

    this class holds all the calculation parameters that are passed via input file to the calculation program.

    the class can hold parameters for both the MSC and EDAC codes.
    some parameters are used by both codes, others are used just by one of them.
    newer features such as multiple emitters, multiple symmetries, and others are supported in EDAC mode only.
    MSC mode is currently not maintained.

    objects of this class are created by the implementation of the create_params() method
    of the actual project class.
    """

    ## @var angular_resolution (float)
    # FWHM angular resolution of the detector.
    #
    # maps to:
    # @arg emission angle window (EDAC)
    # @arg angular_broadening (MSC)

    ## @var phase_files (dict)
    # dictionary of phase files.
    #
    # the keys are atomic numbers, the values file names.
    # if the dictionary is empty or the files don't exist, the phases are computed internally (EDAC only).
    #
    # maps to:
    # @arg scatterer (EDAC)
    # @arg atomic_number, phase_file (MSC)

    def __init__(self):
        self.title = "default parameters"
        self.comment = "set by project.Params()"
        self.cluster_file = ""
        self.output_file = ""
        self.scan_file = ""
        # EDAC convention: 1s, 2p, 2p1/2, etc.
        self.initial_state = "1s"
        # MSC convention: H, V, L, R, U
        self.polarization = "H"
        self.angular_resolution = 1.0
        self.z_surface = 0.0
        self.inner_potential = 10.0
        # the energy scale of EDAC is referenced to the vacuum level
        # but data files are referenced to the Fermi level
        # the msc_edac module adds the work function to the kinetic energy before it calls EDAC
        self.work_function = 0.0
        self.symmetry_range = 360.0
        self.polar_incidence_angle = 60.0
        self.azimuthal_incidence_angle = 0.0
        self.experiment_temperature = 300.0
        self.debye_temperature = 400.0
        self.debye_wavevector = 1.0
        self.phase_files = {}
        # used by MSC only
        self.spherical_order = 2
        self.scattering_level = 5
        self.fcut = 15.0
        self.cut = 15.0
        self.lattice_constant = 1.0
        self.msq_displacement = {}
        self.planewave_attenuation = 1.0
        self.vibration_model = "N"
        self.substrate_atomic_mass = 1.0
        self.rme_minus_value = 0.5
        self.rme_minus_shift = 0.0
        self.rme_plus_value = 0.5
        self.rme_plus_shift = 0.0
        # used by EDAC only
        self.emitters = [(0.0, 0.0, 0.0, 0)]
        self.lmax = 15
        self.dmax = 5.0
        self.orders = [20]


class Scan(object):
    """
    class to describe the scanning scheme or store the experimental data set.
    """

    ## @var filename (string)
    # file name from which a scan was loaded

    ## @var raw_data (numpy.ndarray)
    # original scan data (ETPAIS array)

    ## @var dtype (dict)
    # data type of self.raw_data.
    #
    # one of the data.DTYPE_Xxxx constants.

    ## @var modulation (numpy.ndarray)
    # modulation function calculated from original scan (ETPAIS array)

    ## @var mode (list of characters)
    # list of ETPAI column names which are scanned in self.raw_data.
    #
    # example: ['t','p']

    ## @var emitter (string)
    # chemical symbol and, optionally following, further specification (chemical state, environment, ...)
    # of photo-emitting atoms.
    # the interpretation of this string is up to the project and its cluster generator.
    # it should, however, always start with a chemical element symbol.
    #
    # examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.

    ## @var initial_state (string)
    # nl term of initial state
    #
    # in the form expected by EDAC, for example: '1s'

    ## @var energies (numpy.ndarray)
    # kinetic energy referenced to Fermi level.
    #
    # one-dimensional array.

    ## @var thetas (numpy.ndarray)
    # polar angle referenced to normal emission
    #
    # one-dimensional array.
    #
    # note: in the case of a hemispherical scan, the values in this array will not be unique.

    ## @var phis (numpy.ndarray)
    # azimuthal angle referenced to arbitrary origin
    #
    # one-dimensional array.
    #
    # note: in the case of a hemispherical scan, the values in this array will not be unique, and not monotonic.

    ## @var alphas (numpy.ndarray)
    # polar angle referenced to normal emission
    #
    # one-dimensional array.

    def __init__(self):
        self.filename = ""
        self.raw_data = None
        self.dtype = None
        self.modulation = None
        self.mode = []
        self.emitter = ""
        self.initial_state = "1s"
        self.energies = np.zeros((0))
        self.thetas = np.zeros((0))
        self.phis = np.zeros((0))
        self.alphas = np.zeros((0))

    def copy(self):
        """
        create a copy of the scan.

        @return: new independent scan object with the same attributes as the original one.
        """
        return copy.deepcopy(self)

    def set_scan(self, filename, emitter, initial_state):
        """
        set file name of reference experiment and load it.

        the extension must be one of msc_data.DATATYPES (case insensitive)
        corresponding to the meaning of the columns in the file.

        this method does not calculate the modulation function.

        @attention EDAC can only calculate equidistant, rectangular scans.
        this version introduces holo scans as an experimental feature.
        for all other scan types, the scan file must exactly conform with a rectangular scan.
        the following scans are currently supported:

        * intensity vs energy at fixed theta, phi
        * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
        * intensity vs theta, phi, or alpha
        * holo scan (theta,phi)

        @param filename: (string) file name of the experimental data, possibly including a path.

        @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

        @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

        """
        self.filename = filename
        self.emitter = emitter
        self.initial_state = initial_state

        if self.filename:
            self.raw_data = md.load_data(self.filename)
            self.dtype = self.raw_data.dtype
            self.mode, positions = md.detect_scan_mode(self.raw_data)

            if 'e' in self.mode:
                self.energies = positions['e']
            else:
                try:
                    self.energies = np.asarray((self.raw_data['e'][0], ))
                except ValueError:
                    logger.error("missing energy in scan file %s", self.filename)
                    raise

            if 't' in self.mode:
                self.thetas = positions['t']
            else:
                try:
                    self.thetas = np.asarray((self.raw_data['t'][0], ))
                except ValueError:
                    logger.info("missing theta in scan file %s, defaulting to 0.0", self.filename)
                    self.thetas = np.zeros((1))

            if 'p' in self.mode:
                self.phis = positions['p']
            else:
                try:
                    self.phis = np.asarray((self.raw_data['p'][0], ))
                except ValueError:
                    logger.info("missing phi in scan file %s, defaulting to 0.0", self.filename)
                    self.phis = np.zeros((1))

            if 'a' in self.mode:
                self.alphas = positions['a']
            else:
                try:
                    self.alphas = np.asarray((self.raw_data['a'][0], ))
                except ValueError:
                    logger.info("missing alpha in scan file %s, defaulting to 0.0", self.filename)
                    self.alphas = np.zeros((1))


# noinspection PyMethodMayBeStatic
class Project(object):
    """
    base class of a calculation project.

    a 'calculation project' is a coded set of prescriptions
    on how to get from a set of model parameters to simulated data
    which correspond to provided experimental data.
    the results include a measure of the quality of the simulated data compared to experimental data.

    each calculation project must derive from this class.
    it must implement the create_domain(), create_cluster(), and create_params() methods.

    the other methods and attributes of this class
    are for passing command line parameters to the calculation modules.
    the attributes should be populated in the constructor of the derived class,
    or (recommended) in the create_project() function of the module.
    it is essential that the attributes are set correctly before calculation.
    """

    ## @var features (dictionary)
    #
    # calculation features and versions supported by the project.
    #
    # the dictionary contains key-value pairs where the key is the name of the feature and value is a version number.
    # this field conditionally enables new software features that may break backward compatibility.
    # derived projects should fill this field with the supported version
    # upon creation (in their __init__ method or create_project() factory).
    # version 0 (default) means that the feature is disabled.
    #
    # the following features can be enabled (list may be incomplete):
    # as of this version, no optional features are defined.
    #
    # @note rather than introducing new features and, particularly, new versions that rely on this mechanism,
    # developers of generic code should check whether backward compatibility could be achieved in a simpler way,
    # e.g. by implementing addition methods whose default behaviour is the same as of the previous version.
    # in some cases it may be better to refactor all current project code.
    #

    ## @var scans (list of Scan objects)
    #  list of experimental or scan files for which calculations are to be run.
    #
    #  the list must be populated by calling the add_scan() method.
    #  this should be done in the create_project() function, or through the command line arguments.
    #
    #  the modulation function is calculated internally.
    #  if your scan files contain the modulation function (as opposed to intensity),
    #  you must add the files in the create_project() function.
    #  the command line does not support loading modulation functions.
    #
    #  @c scans must be considered read-only. use project methods to change it.

    ## @var symmetries (list of arbitrary objects)
    #  list of symmetries for which calculations are to be run.
    #
    # it is up to the derived class what kind of objects are stored in the list.
    # the recommended kind of objects are dictionaries which hold parameter values,
    # similar to the model dictionaries.
    #
    # the list must be populated by calling the add_symmetry() method.

    ## @var cluster_generator (ClusterGenerator object)
    #  provides the cluster generator methods.
    #
    # a project must provide a cluster generator object that is derived from ClusterGenerator.
    # at least the ClusterGenerator.create_cluster method must be implemented.
    # if emitters should be run in parallel, the ClusterGenerator.count_emitters must be implemented as well.
    #
    # the initial value is a LegacyClusterGenerator object
    # which routes cluster calls back to the project for compatibility with older project code.

    ## @var optimizer_params (dict)
    # optional parameters of the model optimizer.
    #
    # this is a dictionary that can have (among others) the following values.
    # for a detailed list, see the documentation of the respective model handler.
    #
    # @arg @c 'pop_size' (int)
    #   population size (number of particles) in the swarm or genetic optimization mode.
    #   by default, the ParticleSwarmHandler chooses the population size depending on the number of parallel processes.
    #   you may want to override the default value in cases where the automatic choice is not appropriate.
    #   the value can be set by the command line.
    # @arg @c 'seed_file' (string)
    #   name of a file containing the results from previous optimization runs.
    #   this can be used to resume a swarm or genetic optimization where it was interrupted before.
    #   the seed file is a space-delimited, multi-column, text file,
    #   e.g., the output file of a previous optimization.
    #   by default, no seed is loaded.
    # @arg @c 'recalc_seed' (bool)
    #   select whether the R-factors of the seed models are calculated again.
    #   set this argument to False only if the calculation is a continuation of a previous one
    #   without any changes to the code.

    ## @var data_dir
    # directory path to experimental data.
    #
    # the project should load experimental data (scan files) from this path.
    # this attribute receives the --data-dir argument from the command line
    # if the project parses the common arguments (pmsco.set_common_args).
    #
    # it is up to the project to define where to load scan files from.
    # if the location of the files may depend on the machine or user account,
    # the user may want to specify the data path on the command line.

    ## @var output_dir (string)
    # directory path for data files produced during the calculation, including intermediate files.
    #
    # output_dir and output_file are set at once by @ref set_output.

    ## @var output_file (string)
    # file name root for data files produced during the calculation, including intermediate files.
    #
    # the file name should include the path. the path must also be set in @ref output_dir.
    #
    # output_dir and output_file are set at once by @ref set_output.

    ## @var timedelta_limit (datetime.timedelta)
    # wall time after which no new calculations should be started.
    #
    # the actual wall time may be longer by the remaining time of running calculations.
    # running calculations will not be aborted.

    ## @var combined_scan
    # combined raw data from scans.
    # updated by add_scan().

    ## @var combined_modf
    # combined modulation function from scans.
    # updated by add_scan().

    ## @var files
    # list of all generated data files with metadata.
    # the list is used by model handlers to decide which files can be deleted at run time to save disk space.
    #
    # files.categories_to_delete determines which files can be deleted.

    ## @var keep_best
    # number of best models for which result files should be kept.
    #
    # this attribute determines how many models are kept based on R-factor ranking at each node of the task tree
    # (up to keep_levels).

    ## @var keep_levels
    # numeric task level down to which R-factors are considered when model files are cleaned up.
    #
    # @arg 0 = model level: combined results only.
    # @arg 1 = scan level: scan nodes in addition to combined results (level 0).
    # @arg 2 = symmetry level: symmetry nodes in addition to level 1.
    # @arg 3 = emitter level: emitter nodes in addition to level 1.
    # @arg 4 = region level: region nodes in addition to level 1.

    def __init__(self):
        self.mode = "single"
        self.code = "edac"
        self.features = {}
        self.cluster_format = mc.FMT_EDAC
        self.cluster_generator = mc.LegacyClusterGenerator(self)
        self.scans = []
        self.symmetries = []
        self.optimizer_params = {
            'pop_size': 0,
            'seed_file': "",
            'seed_limit': 0,
            'recalc_seed': True,
            'table_file': ""
        }
        self.data_dir = ""
        self.output_dir = ""
        self.output_file = "pmsco_data"
        self.timedelta_limit = datetime.timedelta(days=1)
        self.combined_scan = None
        self.combined_modf = None
        self.files = files.FileTracker()
        self.keep_levels = 1
        self.keep_best = 10
        self.handler_classes = {
            'model': handlers.SingleModelHandler,
            'scan': handlers.ScanHandler,
            'sym': handlers.SymmetryHandler,
            'emit': handlers.EmitterHandler,
            'region': handlers.SingleRegionHandler
        }
        self.calculator_class = None
        self._tasks_fields = []
        self._db = database.ResultsDatabase()

    def create_domain(self):
        """
        create a msc_project.Domain object which defines the allowed range for model parameters.

        this method must be implemented by the actual project class.
        the Domain object must declare all model parameters used in the project.

        @return Domain object
        """
        return None

    def create_params(self, model, index):
        """
        create a Params object given the model parameters and calculation index.

        @param model (dictionary) model parameters to be used in the calculation.

        @param index (named tuple CalcID) calculation index.
            the method should consider only the following attributes:
            @arg @c scan   scan index (index into Project.scans)
            @arg @c sym    symmetry index (index into Project.symmetries)
        """
        return None

    def clear_scans(self):
        """
        clear scans.

        delete all scans in self.scans and empty the list.

        @return: None
        """
        self.scans = []
        self.combined_scan = None
        self.combined_modf = None

    def add_scan(self, filename, emitter, initial_state, is_modf=False, modf_model=None):
        """
        add the file name of reference experiment and load it.

        the extension must be one of msc_data.DATATYPES (case insensitive)
        corresponding to the meaning of the columns in the file.

        caution: EDAC can only calculate equidistant, rectangular scans.
        the following scans are currently supported:

        * intensity vs energy at fixed theta, phi
        * intensity vs analyser angle vs energy at normal emission (theta = 0, constant phi)
        * intensity vs theta, phi, or alpha
        * intensity vs theta and phi (hemisphere or hologram scan)

        the method calculates the modulation function if @c is_modf is @c False.
        it also updates @c combined_scan and @c combined_modf which may be used as R-factor comparison targets.

        @param filename: (string) file name of the experimental data, possibly including a path.

        @param emitter: (string) chemical symbol of the photo-emitting atom, e.g. "Cu".

        @param initial_state: (string) nl term of the initial state of the atom, e.g. "2p".

        @param is_modf: (bool) declares whether the file contains the modulation function (True),
            or intensity (False, default). In the latter case, the modulation function is calculated internally.

        @param modf_model: (dict) model parameters to be passed to the modulation function.

        @return (Scan) the new scan object (which is also a member of self.scans).

        @todo the accepted scanning schemes should be generalized.
        """
        scan = Scan()
        scan.set_scan(filename, emitter, initial_state)
        self.scans.append(scan)

        if modf_model is None:
            modf_model = {}

        if scan.raw_data is not None:
            if is_modf:
                scan.modulation = scan.raw_data
            else:
                try:
                    scan.modulation = self.calc_modulation(scan.raw_data, modf_model)
                except ValueError:
                    logger.error("error calculating the modulation function of experimental data.")
                    scan.modulation = None
        else:
            scan.modulation = None

        if scan.raw_data is not None:
            if self.combined_scan is not None:
                dt = md.common_dtype((self.combined_scan, scan.raw_data))
                d1 = md.restructure_data(self.combined_scan, dt)
                d2 = md.restructure_data(scan.raw_data, dt)
                self.combined_scan = np.hstack((d1, d2))
            else:
                self.combined_scan = scan.raw_data.copy()
        else:
            self.combined_scan = None

        if scan.modulation is not None:
            if self.combined_modf is not None:
                dt = md.common_dtype((self.combined_modf, scan.modulation))
                d1 = md.restructure_data(self.combined_modf, dt)
                d2 = md.restructure_data(scan.modulation, dt)
                self.combined_modf = np.hstack((d1, d2))
            else:
                self.combined_modf = scan.modulation.copy()
        else:
            self.combined_modf = None

        return scan

    def clear_symmetries(self):
        """
        clear symmetries.

        delete all symmetries in self.symmetries and empty the list.

        @return: None
        """
        self.symmetries = []

    def add_symmetry(self, symmetry):
        """
        add a symmetry to the list of symmetries.

        this class declares the list of symmetries.
        it does not define what should be in the list of symmetries.
        however, there must be an entry for each symmetry to be calculated.
        if the list is empty, no calculation will be executed.

        @attention initially, the symmetries list is empty.
            your project needs to add at least one symmetry.
            otherwise, no calculation will be executed.

        @param symmetry: it is up to the derived project class to specify and interpret the data stored here.
            it is recommended to store a dictionary with symmetry parameters similar to the model parameters.

        @return: None
        """
        self.symmetries.append(symmetry)

    def set_output(self, filename):
        """
        set path and base name of output file.

        path and name are copied to the output_file attribute.
        path is copied to the output_dir attribute.

        if the path is missing, the destination is the current working directory.
        """
        self.output_file = filename
        path, name = os.path.split(filename)
        self.output_dir = path

    def set_timedelta_limit(self, timedelta):
        """
        set the walltime limit

        timedelta (datetime.timedelta)
        """
        self.timedelta_limit = timedelta

    def combine_symmetries(self, parent_task, child_tasks):
        """
        combine results of different symmetry into one result and calculate the modulation function.

        the symmetry results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        by default, this method adds all symmetries with equal weight.
        weights can be defined in the model dictionary with keys 'wsym0', 'wsym1', etc.
        missing weights default to 1.
        note: to avoid correlated parameters, one symmetry must always have a fixed weight.

        @param parent_task: (CalculationTask) parent task of the symmetry tasks.
            the method must write the results to the files indicated
            by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each symmetry.
            the method must read the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by symmetry index.

        @return: None

        @raise IndexError if child_tasks is empty

        @raise IOError if a filename is missing

        @note the weights of the symmetries (in derived classes) can be part of the optimizable model parameters.
            the model parameters are available as the @c model attribute of the calculation tasks.
        """

        result_data = None
        sum_weights = 0.
        for task in child_tasks:
            data = md.load_data(task.result_filename)
            if result_data is None:
                result_data = data.copy()
                result_data['i'] = 0.
            try:
                weight = task.model['wsym{}'.format(task.id.sym)]
            except KeyError:
                weight = 1.
            result_data['i'] += weight * data['i']
            sum_weights += weight
        result_data['i'] /= sum_weights

        md.save_data(parent_task.result_filename, result_data)

        # todo : the handling of missing modulation functions may need some cleanup
        if self.scans[parent_task.id.scan].modulation is not None:
            result_modf = self.calc_modulation(result_data, parent_task.model)
            md.save_data(parent_task.modf_filename, result_modf)
        else:
            parent_task.modf_filename = ""

    def combine_emitters(self, parent_task, child_tasks):
        """
        combine results of different emitters into one result. calculate the modulation function.

        the emitter results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        by default, this method adds all emitters with equal weight.

        sub-classes may override this method and implement expansion of equivalent emitters,
        unequal weights, etc.

        @param parent_task: (CalculationTask) parent task of the emitter tasks.
            the method must write the results to the files indicated
            by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each emitter.
            the method must read the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by the emitter index.

        @return: None

        @raise IndexError if child_tasks is empty

        @raise IOError if a filename is missing

        @note the weights of the emitters (in derived classes) can be part of the optimizable model parameters.
            the model parameters are available as the @c model attribute of the calculation tasks.
        """

        result_data = None
        for task in child_tasks:
            data = md.load_data(task.result_filename)
            if result_data is not None:
                result_data['i'] += data['i']
            else:
                result_data = data

        md.save_data(parent_task.result_filename, result_data)

        # todo : the handling of missing modulation functions may need some cleanup
        if self.scans[parent_task.id.scan].modulation is not None:
            result_modf = self.calc_modulation(result_data, parent_task.model)
            md.save_data(parent_task.modf_filename, result_modf)
        else:
            parent_task.modf_filename = ""

    def combine_scans(self, parent_task, child_tasks):
        """
        combine results of different scans into one result, for intensity and modulation.

        the scan results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        the datasets of the scans are appended.
        this is done for intensity and modulation data independently.

        @param parent_task: (CalculationTask) parent task of the scan tasks.
            the method must write the results to the files indicated
            by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
            the method must read the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by scan index.

        @return: None

        @raise IndexError if child_tasks is empty.
        """

        # intensity
        try:
            stack1 = [md.load_data(task.result_filename) for task in child_tasks]
        except IOError:
            parent_task.result_filename = ""
        else:
            dtype = md.common_dtype(stack1)
            stack2 = [md.restructure_data(data, dtype) for data in stack1]
            result_data = np.hstack(tuple(stack2))
            md.save_data(parent_task.result_filename, result_data)

        # modulation
        try:
            stack1 = [md.load_data(task.modf_filename) for task in child_tasks]
        except IOError:
            parent_task.modf_filename = ""
        else:
            dtype = md.common_dtype(stack1)
            stack2 = [md.restructure_data(data, dtype) for data in stack1]
            result_modf = np.hstack(tuple(stack2))
            md.save_data(parent_task.modf_filename, result_modf)

    def combine_regions(self, parent_task, child_tasks):
        """
        combine results from different regions into one result, for intensity and modulation.

        the scan results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

        the datasets of the regions are appended and sorted in the standard order of the data module.
        if the resulting length differs from the corresponding experimental scan,
        an error is printed to the logger, but the calculation continues.

        the modulation function is calculated by calling @ref calc_modulation.

        @param parent_task: (CalculationTask) parent task of the region tasks.
            the method writes the results to the file names
            given by the @c result_filename and @c modf_filename attributes.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each region.
            the reads the source data from the files
            indicated by the @c result_filename attributes.
            the sequence is sorted by task ID, i.e., essentially, by region index.

        @return: None

        @raise IndexError if child_tasks is empty.
        """
        # intensity
        try:
            stack1 = [md.load_data(task.result_filename) for task in child_tasks]
        except IOError:
            parent_task.result_valid = False
            parent_task.result_filename = ""
        else:
            dtype = md.common_dtype(stack1)
            stack2 = [md.restructure_data(data, dtype) for data in stack1]
            result_data = np.hstack(tuple(stack2))
            md.sort_data(result_data)
            md.save_data(parent_task.result_filename, result_data)

            scan = self.scans[parent_task.id.scan]
            if result_data.shape[0] != scan.raw_data.shape[0]:
                logger.error(BMsg("scan length mismatch: combined result: {result}, experimental data: {expected}",
                                  result=result_data.shape[0], expected=scan.raw_data.shape[0]))

        # modulation
        try:
            data = md.load_data(parent_task.result_filename)
            modf = self.calc_modulation(data, parent_task.model)
        except IOError:
            parent_task.modf_filename = ""
        else:
            md.save_data(parent_task.modf_filename, modf)

    def setup(self, handlers):
        """
        prepare for calculations.

        this method is called in the master process before starting the task loop.
        at this point the task handlers have been created and set up.
        if the project needs to change settings of task handlers it can do so in this method.

        this instance writes the header of the tasks.dat file
        that will receive sub-task evaluation results from the evaluate_result() method.

        @param handlers: dictionary listing the initialized task handler instances.
            the dictionary keys are the attribute names of pmsco.dispatch.CalcID:
            'model', 'scan', 'sym', 'emit' and 'region'.

        @return: None
        """
        fields = ["rfac"]
        fields.extend(dispatch.CalcID._fields)
        fields = ["_" + f for f in fields]
        dom = self.create_domain()
        model_fields = dom.start.keys()
        model_fields.sort(key=lambda name: name.lower())
        fields.extend(model_fields)
        self._tasks_fields = fields

        with open(self.output_file + ".tasks.dat", "w") as outfile:
            outfile.write("# ")
            outfile.write(" ".join(fields))
            outfile.write("\n")

        # todo : fill in the descriptive fields, change to file-database
        self._db.connect(":memory:")
        project_id = self._db.register_project(self.__class__.__name__, sys.argv[0])
        job_id = self._db.register_job(project_id,
                                       "job-name",
                                       self.mode,
                                       socket.gethostname(),
                                       "git-hash",
                                       datetime.datetime.now(),
                                       "description")
        self._db.register_params(model_fields)
        self._db.create_models_view()

    def evaluate_result(self, parent_task, child_tasks):
        """
        evaluate the result of a calculation task.

        this method is called from the add_result of the task handlers at each level.
        it gives the project a hook to check the progress of a model at any level of the task tree.

        the method calculates the r-factor by calling the Project.calc_rfactor method.
        the result is written to the task.rfac field and to the .tasks.dat file.
        invalid and region-level results are skipped.

        this method is called in the master process only.

        @param parent_task: (CalculationTask) a calculation task.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
            the sequence must be sorted by task ID.

        @return: None
        """
        if parent_task.result_valid and parent_task.id.region == -1:
            try:
                parent_task.rfac = self.calc_rfactor(parent_task, child_tasks)
            except ValueError:
                parent_task.result_valid = False
                logger.warning(BMsg("calculation {0} resulted in an undefined R-factor.", parent_task.id))
            else:
                values_dict = parent_task.id._asdict()
                values_dict = {"_" + k: v for k, v in values_dict.items()}
                values_dict.update(parent_task.model)
                values_dict['_rfac'] = parent_task.rfac
                values_list = [values_dict[field] for field in self._tasks_fields]
                with open(self.output_file + ".tasks.dat", "a") as outfile:
                    outfile.write(" ".join(format(value) for value in values_list) + "\n")

                self._db.insert_result(parent_task.id, values_dict)

        return None

    # noinspection PyUnusedLocal
    def calc_modulation(self, data, model):
        """
        calculate the project-dependent modulation function.

        the modulation function of I(x) is (I(x) - S(x)) / S(x)
        where S(x) is a smooth copy of I(x).

        by default, the modulation function is calculated by data.calc_modfunc_loess().
        override this method in your project to use a different modulation function.

        @param data structured numpy.ndarray in EI, ETPI, or ETPAI format.
            can contain a one- or multi-dimensional scan.
            the scan coordinates must be on a rectangular or hemisperical grid.
            for maximum compatibility, the array should be sorted,
            though for the default calc_modfunc_loess() function this is not required.

            if data contains a hemispherical scan, the phi dimension is ignored,
            i.e. the modulation function is calcualted on a phi-average.

        @param model: (dict) model parameters of the calculation task.
            can be used to pass parameters from the project.
            this argument is a dictionary of the model parameters.

        @return copy of the data array with the modulation function in the 'i' column.
        """

        return md.calc_modfunc_loess(data)

    def calc_rfactor(self, parent_task, child_tasks):
        """
        calculate the r-factor of a task.

        the r-factor is calculated on the experimental and simulated modulation functions.
        the algorithm differs for the model level and the lower task levels.
        at the model level, the calculation is delegated to Project.combine_rfactors.
        at all other levels, the r-factor is calculated by Project.rfactor,
        where the simulated is loaded from the file specified by parent_task.modf_filename
        and the experimental data from Project.scan.

        this method is called by the task handlers.
        all child tasks belonging to the parent task must be complete.

        to select or implement a specific R-factor algorithm,
        the project sub-class should override Project.rfactor.
        to combine scan r-factors, it should override or patch Project.combine_rfactors.

        @version in earlier versions,
        projects had to override this method to implement their algorithm.
        this has lead to duplication of common code.
        the r-factor algorithm is now distributed over several methods,
        and the method signature has changed.
        new projects should override Project.rfactor and/or Project.combine_rfactors.

        @param parent_task: (CalculationTask) a calculation task.

        @param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
            the sequence must be sorted by task ID.

        @return (float) calculated R-factor.

        @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
        """
        if parent_task.id.scan >= 0:
            task_data = md.load_data(parent_task.modf_filename)
            exp_data = self.scans[parent_task.id.scan].modulation
            result_r = self.rfactor(exp_data, task_data)
        else:
            result_r = self.combine_rfactors(parent_task, child_tasks)

        return result_r

    def rfactor(self, exp_data, theo_data):
        """
        calculate the r-factor of simulated diffraction data.

        in this class, the method calls the data.rfactor function to calculate the r-factor.
        override this method in your project to use a different R-factor algorithm.

        the input arrays must have the same shape,
        and the coordinate columns must be identical (they are ignored, however).
        the array elements are compared element-by-element.
        terms having NaN intensity are ignored.

        if the sigma column is present in experiment and non-zero,
        the R-factor terms are weighted.

        @param exp_data: (numpy structured array)
            ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
            if an @c s field is present and non-zero,
            the R-factor terms are weighted by 1/sigma**2.

        @param theo_data: (numpy structured array)
            ETPI or ETPAI array containing the calculated modulation functions.

        @return: (float) scalar R-factor

        @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
        """
        return md.rfactor(exp_data, theo_data)

    def opt_rfactor(self, exp_data, theo_data):
        """
        calculate the r-factor of simulated diffraction data, adjusting their amplitude.

        this is an alternative r-factor calculation algorithm
        using the pmsco.data.optimize_rfactor() function.

        to activate this method (replacing the default one), assign it to Project.rfactor
        in the overriding __init__ or setup method:
        @code{.py}
        self.rfactor = self.opt_rfactor
        @endcode

        @param exp_data: (numpy structured array)
            ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
            if an @c s field is present and non-zero,
            the R-factor terms are weighted by 1/sigma**2.

        @param theo_data: (numpy structured array)
            ETPI or ETPAI array containing the calculated modulation functions.

        @return: (float) scalar R-factor

        @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
        """
        return md.optimize_rfactor(exp_data, theo_data)

    def combine_rfactors(self, parent_task, child_tasks):
        """
        combine r-factors of child tasks.

        the r-factors are taken from the rfac attribute of the child_tasks.
        the result is an average of the child r-rfactors.

        to produce a balanced result, every child dataset must contain a similar amount of information.
        if this is not the case, the child r-factors must be weighted.
        weighting is currently not implemented but may be introduced in a future version.

        the method is intended to be used at the model level (children are scans).
        though it can technically be used at any level where child r-factors are available.

        @param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
            i.e. a model task.

        @param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
            that may be consulted for calculating the r-factor.

        @return: (float) r-factor, NaN if parent task is invalid

        @raise ValueError or IndexError if child_tasks is empty.
        """
        if parent_task.result_valid:
            rsum = 0.
            for task in child_tasks:
                rsum += task.rfac
            return rsum / len(child_tasks)
        else:
            return float('nan')

    def alt_combine_rfactors(self, parent_task, child_tasks):
        """
        combine r-factors of child tasks by explicit calculation on the combined result.

        this is an alternative implementation of combine_rfactors.
        instead of using the r-factors from child tasks,
        it re-calculates the r-factor for the combined dataset.
        this method avoids the issue of weighting
        but can introduce bias if the amplitudes of the child datasets differ substantially.

        the experimental dataset is loaded from the file specified by the parent task,
        the corresponding experimental data is taken from self.combined_modf.

        to activate this method, assign it to combine_rfactors.
        in the overriding __init__ or setup method:
        @code{.py}
        self.combine_rfactors = self.alt_combine_rfactors
        @endcode

        @param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
            i.e. a model task.

        @param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
            that may be consulted for calculating the r-factor.

        @return: (float) r-factor, NaN if parent task is invalid
        """
        if parent_task.result_valid:
            task_data = md.load_data(parent_task.modf_filename)
            exp_data = self.combined_modf
            return self.rfactor(exp_data, task_data)
        else:
            return float('nan')

    def export_cluster(self, index, filename, cluster):
        """
        export the cluster of a calculation task in XYZ format for diagnostics and reporting.

        this method is called with the final cluster just before it is handed over to the calculator.
        it saves the atom coordinates in XYZ format for future reference (e.g. graphics).

        the method creates two files:
        @arg a file with extension '.xyz' contains the whole cluster in XYZ format.
        @arg a file with extension '.emit.xyz' contains only emitter atoms in XYZ format.

        the first part of the file name is formatted with the output name and the complete task identification.
        the file is registered with the file tracker in the 'cluster' category
        so that it will be deleted unless the cluster category is selected for keeping.

        derived project class may override or extend this method
        to carry out further diagnostics or reporting on the cluster.

        @param index: (CalcID) calculation index to which the cluster belongs.
            region may be -1 if only one cluster is exported for all regions
            (clusters do not depend on the scan region).
            emit may be -1 if the cluster is a master from which emitter-related child clusters are derived.

        @param filename: (str) base file name for the output files.
            the filename should be formatted using pmsco.dispatch.CalculationTask.format_filename().
            extensions are appended by this method.

        @param cluster: a pmsco.cluster.Cluster() object with all atom positions and emitters.

        @return: dictionary listing the names of the created files with their category.
                 the dictionary key is the file name,
                 the value is the file category (cluster).
        """
        _files = {}
        xyz_filename = filename + ".xyz"
        cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ)
        _files[xyz_filename] = 'cluster'

        xyz_filename = filename + ".emit.xyz"
        cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ, emitters_only=True)
        _files[xyz_filename] = 'cluster'

        return _files

    def cleanup(self):
        """
        delete unwanted files at the end of a project.

        @return: None
        """
        self.cleanup_files()
        self._db.disconnect()

    def cleanup_files(self, keep=0):
        """
        delete uninteresting files.

        these are all files that
        belong to one of the self.files.categories_to_delete categories or
        do not belong to one of the "best" models.

        "best" models are a number (self.keep_best) of models that gave the lowest R-factors
        at each task level from root to self.keep_levels.
        for example if `keep_best = 10` and `keep_levels = 1`
        the 10 best models at the top level, and the 10 best at the scan level are kept.
        this means that in total up to `n = 10 + 10 * n_scans` models may be kept,
        where n_scans is the number of scan files in the job.

        @param keep: minimum number of best models to keep.
            0 (default): use the project parameter self.keep_best.

        @return None
        """
        self.files.delete_files()
        if 'rfac' in self.files.categories_to_delete:
            keep = max(keep, self.keep_best)
            keepers = self._db.query_best_task_models(self.keep_levels, keep)
            self.files.delete_models(keep=keepers)