update public distribution

based on internal repository c9a2ac8 2019-01-03 16:04:57 +0100
tagged rev-master-2.0.0
2019-01-31 15:45:02 +01:00
parent bbd16d0f94
commit acea809e4e
92 changed files with 165828 additions and 143181 deletions


@ -26,16 +26,27 @@ Licensed under the Apache License, Version 2.0 (the "License"); @n
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import datetime
import logging
import numpy as np
import collections
import data as md
import cluster as mc
import files
import handlers
import os.path
import socket
import sys
import pmsco.cluster as mc
from pmsco.compat import open
import pmsco.data as md
import pmsco.database as database
import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
from pmsco.helpers import BraceMessage as BMsg
logger = logging.getLogger(__name__)
@ -95,7 +106,7 @@ class Domain(object):
self.max = {}
self.step = {}
def add_param(self, name, start, min, max, step):
def add_param(self, name, start, min=None, max=None, step=None, width=None):
"""
set the domain of one parameter with all necessary values at once.
@ -107,15 +118,29 @@ class Domain(object):
@param start (float) start value.
@param min (float) lower bound of the parameter interval.
must be less than or equal to start.
if None, the field is set to start.
@param max (float) upper bound of the parameter interval.
must be greater than or equal to start.
if None, the field is set to start.
@param width (float) width of the parameter interval.
instead of min and max, the interval can be set centered around the start value.
this is equivalent to min = start - width/2, max = start + width/2.
this argument overrides min and max. do not specify both width and min/max.
@param step (float) step size.
must be greater than or equal to zero.
if None, the field is set to zero.
"""
self.start[name] = start
self.min[name] = min
self.max[name] = max
self.step[name] = step
self.min[name] = min if min is not None else start
self.max[name] = max if max is not None else start
if width is not None:
self.min[name] = start - width / 2.
self.max[name] = start + width / 2.
self.step[name] = step if step is not None else 0.0
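for illustration, the two ways of defining the interval are equivalent (a minimal sketch; the parameter name and values are arbitrary):
@code{.py}
dom = Domain()
# centered interval: min = 2.0 - 0.5, max = 2.0 + 0.5
dom.add_param('dAB', start=2.0, width=1.0, step=0.05)
# same result with explicit bounds
dom.add_param('dAB', start=2.0, min=1.5, max=2.5, step=0.05)
@endcode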
def get_param(self, name):
"""
@ -144,9 +169,27 @@ class Params(object):
objects of this class are created by the implementation of the create_params() method
of the actual project class.
"""
## @var angular_resolution (float)
# FWHM angular resolution of the detector.
#
# maps to:
# @arg emission angle window (EDAC)
# @arg angular_broadening (MSC)
## @var phase_files (dict)
# dictionary of phase files.
#
# the keys are atomic numbers, the values file names.
# if the dictionary is empty or the files don't exist, the phases are computed internally (EDAC only).
#
# maps to:
# @arg scatterer (EDAC)
# @arg atomic_number, phase_file (MSC)
def __init__(self):
self.title = "MSC default parameters"
self.comment = "from msc_project.Params()"
self.title = "default parameters"
self.comment = "set by project.Params()"
self.cluster_file = ""
self.output_file = ""
self.scan_file = ""
@ -154,7 +197,7 @@ class Params(object):
self.initial_state = "1s"
# MSC convention: H, V, L, R, U
self.polarization = "H"
self.angular_broadening = 0.0
self.angular_resolution = 1.0
self.z_surface = 0.0
self.inner_potential = 10.0
# the energy scale of EDAC is referenced to the vacuum level
@ -167,16 +210,14 @@ class Params(object):
self.experiment_temperature = 300.0
self.debye_temperature = 400.0
self.debye_wavevector = 1.0
self.phase_files = {}
# used by MSC only
self.spherical_order = 2
self.scattering_level = 5
self.fcut = 15.0
self.cut = 15.0
self.lattice_constant = 1.0
self.atom_types = 0
self.atomic_number = [1, 2, 3, 4]
self.phase_file = ["1.pha", "2.pha", "3.pha", "4.pha"]
self.msq_displacement = [0.1, 0.1, 0.1, 0.1]
self.msq_displacement = {}
self.planewave_attenuation = 1.0
self.vibration_model = "N"
self.substrate_atomic_mass = 1.0
@ -216,9 +257,12 @@ class Scan(object):
# example: ['t','p']
## @var emitter (string)
# chemical symbol of emitter atom
# chemical symbol, optionally followed by a further specification (chemical state, environment, ...),
# of photo-emitting atoms.
# the interpretation of this string is up to the project and its cluster generator.
# it should, however, always start with a chemical element symbol.
#
# example: 'Cu'
# examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.
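# since the element symbol must come first, a project can split the symbol off the
# specification with a short regular expression. a hedged sketch (the helper is
# illustrative, not part of this module):
@code{.py}
import re

def split_emitter(emitter):
    # one capital letter plus any lowercase letters is the element symbol;
    # the remainder ('A', '1', '=O', 'III', ...) is the project-specific part.
    m = re.match(r'([A-Z][a-z]*)\s*(.*)', emitter)
    return m.group(1), m.group(2)

split_emitter('Ca')   # -> ('Ca', '')  calcium
split_emitter('CA')   # -> ('C', 'A')  carbon A
split_emitter('C 1')  # -> ('C', '1')  carbon one
@endcode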
## @var initial_state (string)
# nl term of initial state
@ -342,142 +386,6 @@ class Scan(object):
self.alphas = np.zeros((1))
class ClusterGenerator(object):
"""
cluster generator class.
this class bundles the cluster methods in one place
so that it's easier to exchange them for different kinds of clusters.
the project must override at least the create_cluster method.
if emitters should be run in parallel tasks, the count_emitters method must be implemented as well.
"""
def __init__(self, project):
"""
initialize the cluster generator.
@param project: reference to the project object.
cluster generators may need to look up project parameters.
"""
self.project = project
def count_emitters(self, model, index):
"""
return the number of emitter configurations for a particular model.
the number of emitter configurations may depend on the model parameters, scan index and symmetry index.
by default, the method returns 1, which means that there is only one emitter configuration.
emitter configurations are mainly a way to distribute the calculations to multiple processes
based on emitters since the resulting diffraction patterns add up incoherently.
for this to work, the create_cluster() method must pay attention to the emitter index
and generate either a full cluster with all emitters (single process)
or a cluster with only a subset of the emitters according to the emitter index (multiple processes).
whether all emitters are calculated in one or multiple processes is decided at run-time
based on the available resources.
note that this function returns the number of _configurations_, not _atoms_.
an emitter configuration (declared in a Cluster) may include more than one atom.
it is up to the project, what is included in a particular configuration.
to enable multiple emitter configurations, the derived project class must override this method
and return a number greater than 1.
@note in some cases it may be most efficient to call create_cluster and
return Cluster.get_emitter_count() of the generated cluster.
this is possible because the method is called with emitter index -1.
model and index can be passed unchanged to create_cluster.
@param model (dictionary) model parameters to be used in the calculation.
@param index (named tuple CalcID) calculation index.
the method should consider only the following attributes:
@arg @c scan scan index (index into Project.scans)
@arg @c sym symmetry index (index into Project.symmetries)
@arg @c emit emitter index is -1 if called by the emitter handler.
@return number of emitter configurations.
this implementation returns the default value of 1.
"""
return 1
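following the note above, a generator whose create_cluster marks all emitters can delegate the count to the cluster object (a sketch; assumes create_cluster treats emitter index -1 like the full-cluster case):
@code{.py}
class DelegatingGenerator(ClusterGenerator):
    def count_emitters(self, model, index):
        # index.emit is -1 here, so create_cluster returns the full cluster
        # with all inequivalent emitters marked.
        cluster = self.create_cluster(model, index)
        return cluster.get_emitter_count()
@endcode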
def create_cluster(self, model, index):
"""
create a Cluster object given the model parameters and calculation index.
the generated cluster will typically depend on the model parameters.
depending on the project, it may also depend on the scan index, symmetry index and emitter index.
the scan index can be used to generate a different cluster for different scan geometry,
e.g., if some atoms can be excluded due to a longer mean free path.
if this is not the case for the specific project, the scan index can be ignored.
the symmetry index may select a particular domain that has a different atomic arrangement.
in this case, depending on the value of index.sym, the function must generate a cluster corresponding
to the particular domain/symmetry.
the method can ignore the symmetry index if the project defines only one symmetry,
or if the symmetry does not correspond to a different atomic structure.
the emitter index selects a particular emitter configuration.
depending on the value of the emitter index, the method must react differently:
1. if the value is less than or equal to zero, return the full cluster and mark all inequivalent emitter atoms.
emitters which are reproduced by a symmetry expansion in combine_emitters() should not be marked.
the full diffraction scan will be calculated in one calculation.
2. if the value is greater than zero, generate the cluster with the emitter configuration
selected by the emitter index.
the index ranges from 1 to the return value of count_emitters().
the results of the individual emitter calculations are summed up in combine_emitters().
the code should ideally be written such that either case yields the same diffraction result.
if count_emitters() always returns 1 (default), the second case does not have to be implemented,
and the method can ignore the emitter index.
the method must ignore the model and energy index.
@param model (dictionary) model parameters to be used in the calculation.
@param index (named tuple CalcID) calculation index.
the method should consider only the following attributes:
@arg @c scan scan index (index into Project.scans)
@arg @c sym symmetry index (index into Project.symmetries)
@arg @c emit emitter index.
if less than or equal to zero, generate the full cluster and mark all emitters.
if greater than zero, the value is a 1-based index of the emitter configuration.
"""
return None
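a schematic override covering both emitter-index cases (a hedged sketch: the two-atom geometry, the 'd01' model parameter and the add_atom signature are illustrative assumptions, not the documented pmsco.cluster API):
@code{.py}
import numpy as np

class TwoEmitterGenerator(ClusterGenerator):
    def count_emitters(self, model, index):
        # one emitter configuration per atom of this toy cluster
        return 2

    def create_cluster(self, model, index):
        clu = mc.Cluster()
        positions = [np.array((0., 0., 0.)), np.array((0., 0., -model['d01']))]
        for iatom, pos in enumerate(positions):
            # case 1 (index.emit <= 0): mark all inequivalent emitters.
            # case 2 (index.emit >= 1): mark only the selected configuration.
            is_emitter = index.emit <= 0 or index.emit == iatom + 1
            clu.add_atom(29, pos, int(is_emitter))  # assumed signature: (atomic number, position, emitter flag)
        return clu
@endcode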
class LegacyClusterGenerator(ClusterGenerator):
"""
cluster generator class for projects that don't declare a generator.
in previous versions, the create_cluster and count_emitters methods were implemented by the project class.
this class redirects generator calls to the project methods
providing compatibility to older project code.
"""
def __init__(self, project):
super(LegacyClusterGenerator, self).__init__(project)
def count_emitters(self, model, index):
"""
redirect the call to the corresponding project method if implemented.
"""
try:
return self.project.count_emitters(model, index)
except AttributeError:
return 1
def create_cluster(self, model, index):
"""
redirect the call to the corresponding project method.
"""
return self.project.create_cluster(model, index)
# noinspection PyMethodMayBeStatic
class Project(object):
"""
@ -549,39 +457,27 @@ class Project(object):
# the initial value is a LegacyClusterGenerator object
# which routes cluster calls back to the project for compatibility with older project code.
## @var pop_size (int)
# population size (number of particles) in the particle swarm optimization.
## @var optimizer_params (dict)
# optional parameters of the model optimizer.
#
# by default, the ParticleSwarmHandler chooses the population size depending on the number of parallel processes.
# you may want to override the default value in cases where the automatic choice is not appropriate, e.g.:
# - the calculation of a model takes a long time compared to the available computing time.
# - the calculation of a model spawns many sub-tasks due to complex symmetry.
# - you want to increase the number of generations compared to the number of particles.
# this is a dictionary that can contain (among others) the following entries.
# for a detailed list, see the documentation of the respective model handler.
#
# the default value is 0.
#
# the value can be set by the command line.
## @var history_file (string)
# name of a file containing the results from previous optimization runs.
# this can be used to resume a swarm optimization where it was interrupted before.
#
# the history file is a space-delimited, multi-column, text file.
# output files of a previous optimization run can be used as is.
# there must be one column for each model parameter, and one column of R factors.
# the first row must contain the names of the model parameters.
# the name of the R-factor column must be '_rfac'.
# additional columns may be included and are ignored.
#
# by default, no history is loaded.
## @var recalc_history (bool)
# select whether the R-factors of the historic models are calculated again.
#
# this is useful if the historic data was calculated for a different cluster, different set of parameters,
# or different experimental data, and if the R-factors of the new optimization may be systematically greater.
# set this argument to False only if the calculation is a continuation of a previous one
# without any changes to the code.
# @arg @c 'pop_size' (int)
# population size (number of particles) in the swarm or genetic optimization mode.
# by default, the ParticleSwarmHandler chooses the population size depending on the number of parallel processes.
# you may want to override the default value in cases where the automatic choice is not appropriate.
# the value can be set by the command line.
# @arg @c 'seed_file' (string)
# name of a file containing the results from previous optimization runs.
# this can be used to resume a swarm or genetic optimization where it was interrupted before.
# the seed file is a space-delimited, multi-column, text file,
# e.g., the output file of a previous optimization.
# by default, no seed is loaded.
# @arg @c 'recalc_seed' (bool)
# select whether the R-factors of the seed models are calculated again.
# set this argument to False only if the calculation is a continuation of a previous one
# without any changes to the code.
## @var data_dir
# directory path to experimental data.
@ -594,9 +490,17 @@ class Project(object):
# if the location of the files may depend on the machine or user account,
# the user may want to specify the data path on the command line.
## @var output_dir (string)
# directory path for data files produced during the calculation, including intermediate files.
#
# output_dir and output_file are set at once by @ref set_output.
## @var output_file (string)
# file name root for data files produced during the calculation, including intermediate files.
#
# the file name should include the path. the path must also be set in @ref output_dir.
#
# output_dir and output_file are set at once by @ref set_output.
## @var timedelta_limit (datetime.timedelta)
# wall time after which no new calculations should be started.
@ -604,11 +508,11 @@ class Project(object):
# the actual wall time may be longer by the remaining time of running calculations.
# running calculations will not be aborted.
## @var _combined_scan
## @var combined_scan
# combined raw data from scans.
# updated by add_scan().
## @var _combined_modf
## @var combined_modf
# combined modulation function from scans.
# updated by add_scan().
@ -618,30 +522,55 @@ class Project(object):
#
# files.categories_to_delete determines which files can be deleted.
## @var keep_best
# number of best models for which result files should be kept.
#
# this attribute determines how many models are kept based on R-factor ranking at each node of the task tree
# (up to keep_levels).
## @var keep_levels
# numeric task level down to which R-factors are considered when model files are cleaned up.
#
# @arg 0 = model level: combined results only.
# @arg 1 = scan level: scan nodes in addition to combined results (level 0).
# @arg 2 = symmetry level: symmetry nodes in addition to level 1.
# @arg 3 = emitter level: emitter nodes in addition to level 2.
# @arg 4 = region level: region nodes in addition to level 3.
def __init__(self):
self.mode = "single"
self.code = "edac"
self.features = {}
self.cluster_format = mc.FMT_EDAC
self.cluster_generator = LegacyClusterGenerator(self)
self.cluster_generator = mc.LegacyClusterGenerator(self)
self.scans = []
self.symmetries = []
self.pop_size = 0
self.history_file = ""
self.recalc_history = True
self.optimizer_params = {
'pop_size': 0,
'seed_file': "",
'seed_limit': 0,
'recalc_seed': True,
'table_file': ""
}
self.data_dir = ""
self.output_dir = ""
self.output_file = "pmsco_data"
self.timedelta_limit = datetime.timedelta(days=1)
self._combined_scan = None
self._combined_modf = None
self.combined_scan = None
self.combined_modf = None
self.files = files.FileTracker()
self.handler_classes = {}
self.handler_classes['model'] = handlers.SingleModelHandler
self.handler_classes['scan'] = handlers.ScanHandler
self.handler_classes['symmetry'] = handlers.SymmetryHandler
self.handler_classes['emitter'] = handlers.EmitterHandler
self.handler_classes['region'] = handlers.SingleRegionHandler
self.keep_levels = 1
self.keep_best = 10
self.handler_classes = {
'model': handlers.SingleModelHandler,
'scan': handlers.ScanHandler,
'sym': handlers.SymmetryHandler,
'emit': handlers.EmitterHandler,
'region': handlers.SingleRegionHandler
}
self.calculator_class = None
self._tasks_fields = []
self._db = database.ResultsDatabase()
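a derived project typically adjusts these defaults in its own constructor, e.g. to fix the population size or to seed the optimizer from an earlier run (a sketch; class name, file name and values are illustrative):
@code{.py}
class MyProject(Project):
    def __init__(self):
        super(MyProject, self).__init__()
        # fix the population size instead of relying on the automatic choice
        self.optimizer_params['pop_size'] = 20
        # resume from a previous run; keep its R-factors as they are
        self.optimizer_params['seed_file'] = "run1.tasks.dat"
        self.optimizer_params['recalc_seed'] = False
@endcode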
def create_domain(self):
"""
@ -676,8 +605,8 @@ class Project(object):
@return: None
"""
self.scans = []
self._combined_scan = None
self._combined_modf = None
self.combined_scan = None
self.combined_modf = None
def add_scan(self, filename, emitter, initial_state, is_modf=False, modf_model=None):
"""
@ -695,7 +624,7 @@ class Project(object):
* intensity vs theta and phi (hemisphere or hologram scan)
the method calculates the modulation function if @c is_modf is @c False.
it also updates @c _combined_scan and @c _combined_modf which may be used as R-factor comparison targets.
it also updates @c combined_scan and @c combined_modf which may be used as R-factor comparison targets.
@param filename: (string) file name of the experimental data, possibly including a path.
@ -732,22 +661,26 @@ class Project(object):
scan.modulation = None
if scan.raw_data is not None:
if self._combined_scan is not None:
dtype = md.common_dtype((self._combined_scan, scan.raw_data))
self._combined_scan = np.hstack((self._combined_scan, md.restructure_data(scan.raw_data, dtype)))
if self.combined_scan is not None:
dt = md.common_dtype((self.combined_scan, scan.raw_data))
d1 = md.restructure_data(self.combined_scan, dt)
d2 = md.restructure_data(scan.raw_data, dt)
self.combined_scan = np.hstack((d1, d2))
else:
self._combined_scan = scan.raw_data.copy()
self.combined_scan = scan.raw_data.copy()
else:
self._combined_scan = None
self.combined_scan = None
if scan.modulation is not None:
if self._combined_modf is not None:
dtype = md.common_dtype((self._combined_modf, scan.modulation))
self._combined_modf = np.hstack((self._combined_modf, md.restructure_data(scan.modulation, dtype)))
if self.combined_modf is not None:
dt = md.common_dtype((self.combined_modf, scan.modulation))
d1 = md.restructure_data(self.combined_modf, dt)
d2 = md.restructure_data(scan.modulation, dt)
self.combined_modf = np.hstack((d1, d2))
else:
self._combined_modf = scan.modulation.copy()
self.combined_modf = scan.modulation.copy()
else:
self._combined_modf = None
self.combined_modf = None
return scan
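a typical call from a project's constructor or setup code (file name and initial state are illustrative):
@code{.py}
self.add_scan(filename="data/cu_2p_hologram.etpi",
              emitter="Cu", initial_state="2p3/2")
@endcode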
@ -783,9 +716,16 @@ class Project(object):
def set_output(self, filename):
"""
set base name of output file
set path and base name of output file.
path and name are copied to the output_file attribute.
path is copied to the output_dir attribute.
if the path is missing, the destination is the current working directory.
"""
self.output_file = filename
path, name = os.path.split(filename)
self.output_dir = path
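for example (paths are illustrative):
@code{.py}
self.set_output("work/run1/myproject")
# self.output_file -> "work/run1/myproject"
# self.output_dir  -> "work/run1"
@endcode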
def set_timedelta_limit(self, timedelta):
"""
@ -797,12 +737,15 @@ class Project(object):
def combine_symmetries(self, parent_task, child_tasks):
"""
combine results of different symmetry into one result. calculate the modulation function.
combine results of different symmetries into one result and calculate the modulation function.
the symmetry results are read from the file system using the indices defined by the child_tasks,
and the combined result is written to the file system with the index defined by parent_task.
by default, this method adds all symmetries with equal weight.
weights can be defined in the model dictionary with keys 'wsym0', 'wsym1', etc.
missing weights default to 1.
note: to avoid correlated parameters, one symmetry must always have a fixed weight.
@param parent_task: (CalculationTask) parent task of the symmetry tasks.
the method must write the results to the files indicated
@ -817,19 +760,26 @@ class Project(object):
@raise IndexError if child_tasks is empty
@raise KeyError if a filename is missing
@raise IOError if a filename is missing
@note the weights of the symmetries (in derived classes) can be part of the optimizable model parameters.
the model parameters are available as the @c model attribute of the calculation tasks.
"""
result_data = None
sum_weights = 0.
for task in child_tasks:
data = md.load_data(task.result_filename)
if result_data is not None:
result_data['i'] += data['i']
else:
result_data = data
if result_data is None:
result_data = data.copy()
result_data['i'] = 0.
try:
weight = task.model['wsym{}'.format(task.id.sym)]
except KeyError:
weight = 1.
result_data['i'] += weight * data['i']
sum_weights += weight
result_data['i'] /= sum_weights
md.save_data(parent_task.result_filename, result_data)
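to make the weights part of the optimization, a project declares them in its domain; one weight stays undeclared so that it remains fixed at 1 (a sketch; parameter names follow the 'wsym' convention above):
@code{.py}
# in a Project subclass:
def create_domain(self):
    dom = Domain()
    # 'wsym0' is not declared, so symmetry 0 keeps the fixed weight 1
    dom.add_param('wsym1', start=1.0, min=0.0, max=2.0, step=0.1)
    return dom
@endcode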
@ -865,7 +815,7 @@ class Project(object):
@raise IndexError if child_tasks is empty
@raise KeyError if a filename is missing
@raise IOError if a filename is missing
@note the weights of the emitters (in derived classes) can be part of the optimizable model parameters.
the model parameters are available as the @c model attribute of the calculation tasks.
@ -898,7 +848,7 @@ class Project(object):
the datasets of the scans are appended.
this is done for intensity and modulation data independently.
@param parent_task: (CalculationTask) parent task of the symmetry tasks.
@param parent_task: (CalculationTask) parent task of the scan tasks.
the method must write the results to the files indicated
by the @c result_filename and @c modf_filename attributes.
@ -910,14 +860,12 @@ class Project(object):
@return: None
@raise IndexError if child_tasks is empty.
@raise KeyError if a filename is missing.
"""
# intensity
try:
stack1 = [md.load_data(task.result_filename) for task in child_tasks]
except (KeyError, IOError):
except IOError:
parent_task.result_filename = ""
else:
dtype = md.common_dtype(stack1)
@ -928,7 +876,7 @@ class Project(object):
# modulation
try:
stack1 = [md.load_data(task.modf_filename) for task in child_tasks]
except (KeyError, IOError):
except IOError:
parent_task.modf_filename = ""
else:
dtype = md.common_dtype(stack1)
@ -936,6 +884,142 @@ class Project(object):
result_modf = np.hstack(tuple(stack2))
md.save_data(parent_task.modf_filename, result_modf)
def combine_regions(self, parent_task, child_tasks):
"""
combine results from different regions into one result, for intensity and modulation.
the scan results are read from the file system using the indices defined by the child_tasks,
and the combined result is written to the file system with the index defined by parent_task.
the datasets of the regions are appended and sorted in the standard order of the data module.
if the resulting length differs from the corresponding experimental scan,
an error is printed to the logger, but the calculation continues.
the modulation function is calculated by calling @ref calc_modulation.
@param parent_task: (CalculationTask) parent task of the region tasks.
the method writes the results to the file names
given by the @c result_filename and @c modf_filename attributes.
@param child_tasks: (sequence of CalculationTask) tasks which identify each region.
the method reads the source data from the files
indicated by the @c result_filename attributes.
the sequence is sorted by task ID, i.e., essentially, by region index.
@return: None
@raise IndexError if child_tasks is empty.
"""
# intensity
try:
stack1 = [md.load_data(task.result_filename) for task in child_tasks]
except IOError:
parent_task.result_valid = False
parent_task.result_filename = ""
else:
dtype = md.common_dtype(stack1)
stack2 = [md.restructure_data(data, dtype) for data in stack1]
result_data = np.hstack(tuple(stack2))
md.sort_data(result_data)
md.save_data(parent_task.result_filename, result_data)
scan = self.scans[parent_task.id.scan]
if result_data.shape[0] != scan.raw_data.shape[0]:
logger.error(BMsg("scan length mismatch: combined result: {result}, experimental data: {expected}",
result=result_data.shape[0], expected=scan.raw_data.shape[0]))
# modulation
try:
data = md.load_data(parent_task.result_filename)
modf = self.calc_modulation(data, parent_task.model)
except IOError:
parent_task.modf_filename = ""
else:
md.save_data(parent_task.modf_filename, modf)
def setup(self, handlers):
"""
prepare for calculations.
this method is called in the master process before starting the task loop.
at this point the task handlers have been created and set up.
if the project needs to change settings of task handlers it can do so in this method.
this implementation writes the header of the tasks.dat file
that will receive sub-task evaluation results from the evaluate_result() method.
@param handlers: dictionary listing the initialized task handler instances.
the dictionary keys are the attribute names of pmsco.dispatch.CalcID:
'model', 'scan', 'sym', 'emit' and 'region'.
@return: None
"""
fields = ["rfac"]
fields.extend(dispatch.CalcID._fields)
fields = ["_" + f for f in fields]
dom = self.create_domain()
# sorted() returns a list under python 2 and 3 (dict.keys() is a view in python 3)
model_fields = sorted(dom.start.keys(), key=lambda name: name.lower())
fields.extend(model_fields)
self._tasks_fields = fields
with open(self.output_file + ".tasks.dat", "w") as outfile:
outfile.write("# ")
outfile.write(" ".join(fields))
outfile.write("\n")
# todo : fill in the descriptive fields, change to file-database
self._db.connect(":memory:")
project_id = self._db.register_project(self.__class__.__name__, sys.argv[0])
job_id = self._db.register_job(project_id,
"job-name",
self.mode,
socket.gethostname(),
"git-hash",
datetime.datetime.now(),
"description")
self._db.register_params(model_fields)
self._db.create_models_view()
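for two illustrative model parameters dAB and theta, the header line written to <output>.tasks.dat would read:
@code{.py}
# _rfac _model _scan _sym _emit _region dAB theta
@endcode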
def evaluate_result(self, parent_task, child_tasks):
"""
evaluate the result of a calculation task.
this method is called from the add_result() method of the task handlers at each level.
it gives the project a hook to check the progress of a model at any level of the task tree.
the method calculates the r-factor by calling the Project.calc_rfactor method.
the result is written to the task.rfac field and to the .tasks.dat file.
invalid and region-level results are skipped.
this method is called in the master process only.
@param parent_task: (CalculationTask) a calculation task.
@param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
the sequence must be sorted by task ID.
@return: None
"""
if parent_task.result_valid and parent_task.id.region == -1:
try:
parent_task.rfac = self.calc_rfactor(parent_task, child_tasks)
except ValueError:
parent_task.result_valid = False
logger.warning(BMsg("calculation {0} resulted in an undefined R-factor.", parent_task.id))
else:
values_dict = parent_task.id._asdict()
values_dict = {"_" + k: v for k, v in values_dict.items()}
values_dict.update(parent_task.model)
values_dict['_rfac'] = parent_task.rfac
values_list = [values_dict[field] for field in self._tasks_fields]
with open(self.output_file + ".tasks.dat", "a") as outfile:
outfile.write(" ".join(format(value) for value in values_list) + "\n")
self._db.insert_result(parent_task.id, values_dict)
return None
# noinspection PyUnusedLocal
def calc_modulation(self, data, model):
"""
@ -965,31 +1049,246 @@ class Project(object):
return md.calc_modfunc_loess(data)
def calc_rfactor(self, task):
def calc_rfactor(self, parent_task, child_tasks):
"""
calculate the R-factor of a task.
calculate the r-factor of a task.
the method calculates the R-factor over the combined scans.
the corresponding experimental data is taken from self._combined_modf.
the r-factor is calculated on the experimental and simulated modulation functions.
the algorithm differs for the model level and the lower task levels.
at the model level, the calculation is delegated to Project.combine_rfactors.
at all other levels, the r-factor is calculated by Project.rfactor,
where the simulated data is loaded from the file specified by parent_task.modf_filename
and the experimental data is taken from the corresponding element of Project.scans.
this method is called by the model handler.
this method is called by the task handlers.
all child tasks belonging to the parent task must be complete.
by default, the R-factor is calculated by data.rfactor() over the combined scans.
override this method in your project to use a different R-factor algorithm.
to select or implement a specific R-factor algorithm,
the project sub-class should override Project.rfactor.
to combine scan r-factors, it should override or patch Project.combine_rfactors.
@param task: (CalculationTask) a model task.
@version in earlier versions,
projects had to override this method to implement their algorithm.
this has led to duplication of common code.
the r-factor algorithm is now distributed over several methods,
and the method signature has changed.
new projects should override Project.rfactor and/or Project.combine_rfactors.
@return (int) calculated R-factor.
@param parent_task: (CalculationTask) a calculation task.
@param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
the sequence must be sorted by task ID.
@return (float) calculated R-factor.
@raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
"""
task_data = md.load_data(task.modf_filename)
result_r = md.rfactor(self._combined_modf, task_data)
if parent_task.id.scan >= 0:
task_data = md.load_data(parent_task.modf_filename)
exp_data = self.scans[parent_task.id.scan].modulation
result_r = self.rfactor(exp_data, task_data)
else:
result_r = self.combine_rfactors(parent_task, child_tasks)
return result_r
def rfactor(self, exp_data, theo_data):
"""
calculate the r-factor of simulated diffraction data.
in this class, the method calls the data.rfactor function to calculate the r-factor.
override this method in your project to use a different R-factor algorithm.
the input arrays must have the same shape,
and the coordinate columns must be identical (they are ignored, however).
the array elements are compared element-by-element.
terms having NaN intensity are ignored.
if the sigma column is present in experiment and non-zero,
the R-factor terms are weighted.
@param exp_data: (numpy structured array)
ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
if an @c s field is present and non-zero,
the R-factor terms are weighted by 1/sigma**2.
@param theo_data: (numpy structured array)
ETPI or ETPAI array containing the calculated modulation functions.
@return: (float) scalar R-factor
@raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
"""
return md.rfactor(exp_data, theo_data)
def opt_rfactor(self, exp_data, theo_data):
"""
calculate the r-factor of simulated diffraction data, adjusting their amplitude.
this is an alternative r-factor calculation algorithm
using the pmsco.data.optimize_rfactor() function.
to activate this method (replacing the default one), assign it to Project.rfactor
in the overriding __init__ or setup method:
@code{.py}
self.rfactor = self.opt_rfactor
@endcode
@param exp_data: (numpy structured array)
ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
if an @c s field is present and non-zero,
the R-factor terms are weighted by 1/sigma**2.
@param theo_data: (numpy structured array)
ETPI or ETPAI array containing the calculated modulation functions.
@return: (float) scalar R-factor
@raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
"""
return md.optimize_rfactor(exp_data, theo_data)
def combine_rfactors(self, parent_task, child_tasks):
"""
combine r-factors of child tasks.
the r-factors are taken from the rfac attribute of the child_tasks.
the result is an average of the child r-factors.
to produce a balanced result, every child dataset must contain a similar amount of information.
if this is not the case, the child r-factors must be weighted.
weighting is currently not implemented but may be introduced in a future version.
the method is intended to be used at the model level (children are scans),
though it can technically be used at any level where child r-factors are available.
@param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
i.e. a model task.
@param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
that may be consulted for calculating the r-factor.
@return: (float) r-factor, NaN if parent task is invalid
@raise ValueError or IndexError if child_tasks is empty.
"""
if parent_task.result_valid:
rsum = 0.
for task in child_tasks:
rsum += task.rfac
return rsum / len(child_tasks)
else:
return float('nan')
def alt_combine_rfactors(self, parent_task, child_tasks):
"""
combine r-factors of child tasks by explicit calculation on the combined result.
this is an alternative implementation of combine_rfactors.
instead of using the r-factors from child tasks,
it re-calculates the r-factor for the combined dataset.
this method avoids the issue of weighting
but can introduce bias if the amplitudes of the child datasets differ substantially.
the simulated dataset is loaded from the file specified by the parent task,
and the corresponding experimental data is taken from self.combined_modf.
to activate this method, assign it to combine_rfactors
in the overriding __init__ or setup method:
@code{.py}
self.combine_rfactors = self.alt_combine_rfactors
@endcode
@param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
i.e. a model task.
@param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
that may be consulted for calculating the r-factor.
@return: (float) r-factor, NaN if parent task is invalid
"""
if parent_task.result_valid:
task_data = md.load_data(parent_task.modf_filename)
exp_data = self.combined_modf
return self.rfactor(exp_data, task_data)
else:
return float('nan')
def export_cluster(self, index, filename, cluster):
"""
export the cluster of a calculation task in XYZ format for diagnostics and reporting.
this method is called with the final cluster just before it is handed over to the calculator.
it saves the atom coordinates in XYZ format for future reference (e.g. graphics).
the method creates two files:
@arg a file with extension '.xyz' contains the whole cluster in XYZ format.
@arg a file with extension '.emit.xyz' contains only emitter atoms in XYZ format.
the first part of the file name is formatted with the output name and the complete task identification.
the file is registered with the file tracker in the 'cluster' category
so that it will be deleted unless the cluster category is selected for keeping.
a derived project class may override or extend this method
to carry out further diagnostics or reporting on the cluster.
@param index: (CalcID) calculation index to which the cluster belongs.
region may be -1 if only one cluster is exported for all regions
(clusters do not depend on the scan region).
emit may be -1 if the cluster is a master from which emitter-related child clusters are derived.
@param filename: (str) base file name for the output files.
the filename should be formatted using pmsco.dispatch.CalculationTask.format_filename().
extensions are appended by this method.
@param cluster: a pmsco.cluster.Cluster() object with all atom positions and emitters.
@return: dictionary listing the names of the created files with their category.
the dictionary key is the file name,
the value is the file category (cluster).
"""
_files = {}
xyz_filename = filename + ".xyz"
cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ)
_files[xyz_filename] = 'cluster'
xyz_filename = filename + ".emit.xyz"
cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ, emitters_only=True)
_files[xyz_filename] = 'cluster'
return _files
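a derived class can extend this hook for extra reporting, for instance (a sketch; get_atom_count is assumed by analogy with get_emitter_count mentioned above):
@code{.py}
class MyProject(Project):
    def export_cluster(self, index, filename, cluster):
        _files = super(MyProject, self).export_cluster(index, filename, cluster)
        logger.info(BMsg("cluster {0}: {1} atoms", filename, cluster.get_atom_count()))
        return _files
@endcode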
def cleanup(self):
"""
delete unwanted files at the end of a project.
@return: None
"""
self.cleanup_files()
self._db.disconnect()
def cleanup_files(self, keep=0):
"""
delete uninteresting files.
these are all files that
belong to one of the self.files.categories_to_delete categories or
do not belong to one of the "best" models.
"best" models are a number (self.keep_best) of models that gave the lowest R-factors
at each task level from root to self.keep_levels.
for example, if `keep_best = 10` and `keep_levels = 1`,
the 10 best models at the top level and the 10 best at the scan level are kept.
this means that in total up to `n = 10 + 10 * n_scans` models may be kept,
where n_scans is the number of scan files in the job.
@param keep: minimum number of best models to keep.
0 (default): use the project parameter self.keep_best.
@return None
"""
self.files.delete_files()
if 'rfac' in self.files.categories_to_delete:
keep = max(keep, self.keep_best)
keepers = self._db.query_best_task_models(self.keep_levels, keep)
self.files.delete_models(keep=keepers)
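a hedged configuration sketch tying these attributes together (assumes categories_to_delete holds a set of category names; only 'cluster' and 'rfac' appear in this excerpt):
@code{.py}
# in a Project subclass __init__, after the base constructor:
self.files.categories_to_delete = {'rfac', 'cluster'}
self.keep_best = 10    # keep the files of the 10 best models
self.keep_levels = 1   # at the model (combined) and scan levels
@endcode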