public release 3.0.0 - see README and CHANGES for details

2021-02-09 12:46:20 +01:00
parent 2b3dbd8bac
commit ef781e2db4
46 changed files with 4390 additions and 1655 deletions
--- a/pmsco/project.py
+++ b/pmsco/project.py
@ -19,36 +19,32 @@ the ModelSpace and CalculatorParams classes are typically used unchanged.

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

-@copyright (c) 2015 by Paul Scherrer Institut @n
+@copyright (c) 2015-21 by Paul Scherrer Institut @n
 Licensed under the Apache License, Version 2.0 (the "License"); @n
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import collections
 import copy
 import datetime
 import git
 import logging
 import numpy as np
-import os.path
+from pathlib import Path
 import socket
-import sys

 from pmsco.calculators.calculator import InternalAtomicCalculator
 from pmsco.calculators.edac import EdacCalculator
-import pmsco.cluster as mc
+import pmsco.cluster
+import pmsco.config as config
 from pmsco.compat import open
 import pmsco.data as md
-import pmsco.database as database
-import pmsco.dispatch as dispatch
-import pmsco.files as files
-import pmsco.handlers as handlers
+import pmsco.database
+import pmsco.dispatch
+import pmsco.files
+import pmsco.handlers
 from pmsco.helpers import BraceMessage as BMsg

 logger = logging.getLogger(__name__)
@ -157,6 +153,34 @@ class ModelSpace(object):
        """
        return ParamSpace(self.start[name], self.min[name], self.max[name], self.step[name])

+    def set_param_dict(self, d):
+        """
+        declare the parameters of the model space from a dictionary.
+
+        the object is reset by calling self.__init__() before the parameters are added.
+
+        @param d: dictionary with two levels:
+            the keys of the top level are the parameter names,
+            each value is a dictionary of the space descriptors
+            'start', 'min', 'max', 'step' and 'width'
+            which is passed to add_param() as keyword arguments.
+            see add_param() for possible combinations.
+        @return: None
+        """
+        self.__init__()
+        for param_name, descriptors in d.items():
+            self.add_param(param_name, **descriptors)
+
+    def get_param_dict(self):
+        """
+        return model space parameters in dictionary form
+
+        the top level are parameter names,
+        the second level the space descriptors 'start', 'min', 'max' and 'step'.
+        each second level is a dictionary keyed by the descriptor name,
+        i.e. the same two-level layout that set_param_dict() takes as input.
+
+        @return: dict
+        """
+        # the inner container must be a dict keyed by descriptor name.
+        # a set literal would lose the association between descriptor and value
+        # and collapse equal values (e.g. start == min).
+        return {name: {'start': self.start[name],
+                       'min': self.min[name],
+                       'max': self.max[name],
+                       'step': self.step[name]}
+                for name in self.start}
+

 class CalculatorParams(object):
    """
@ -568,9 +592,166 @@ class Scan(object):
            self.raw_data[dim] = grid[i].reshape(-1)
        self.raw_data['i'] = 1

+    def load(self, dirs=None):
+        """
+        return the scan object itself.
+
+        the method exists so that Scan objects can be handled like ScanLoader and ScanCreator objects,
+        whose load methods are called with a `dirs` keyword argument.
+
+        @param dirs: not used. accepted for compatibility with the load methods of the loader classes.
+        @return self
+        """
+        return self
+
+
+class ScanKey(config.ConfigurableObject):
+    """
+    create a Scan object based on a project-supplied dictionary
+
+    this class can be used in a run file to create a scan object based on the scan_dict attribute of the project.
+    this may be convenient if your project should selectively use scans out of a long list of data files
+    and you don't want to clutter up the run file with parameters that don't change.
+
+    to do so, set the key property to match an item of scan_dict.
+    the load method will look up the corresponding scan_dict item and construct the final Scan object.
+    """
+    def __init__(self, project=None):
+        """
+        @param project: project object whose scan_dict attribute is looked up by the load method.
+        """
+        super().__init__()
+        self.key = ""
+        self.project = project
+
+    def load(self, dirs=None):
+        """
+        load the selected scan as specified in the project's scan dictionary
+
+        the method uses ScanLoader or ScanCreator as an intermediate.
+        a ScanCreator is used if the scan specification contains a 'positions' item, else a ScanLoader.
+        all items of the scan specification are copied to attributes of the intermediate object.
+
+        @param dirs: passed on to the load method of the intermediate object.
+        @return a new Scan object which contains the loaded data.
+        """
+        scan_spec = self.project.scan_dict[self.key]
+        # scan_spec is a mapping (it is iterated with items() below).
+        # the presence of positions must be tested as a key - an attribute test would never succeed.
+        if 'positions' in scan_spec:
+            loader = ScanCreator()
+        else:
+            loader = ScanLoader()
+        for k, v in scan_spec.items():
+            setattr(loader, k, v)
+        return loader.load(dirs=dirs)
+
+
+class ScanLoader(config.ConfigurableObject):
+    """
+    specification of a scan that is read from a data file
+
+    an object of this class can be declared in a run file to describe a scan of an experimental data file.
+    the properties must be filled with values as documented below.
+    the Scan object is created by the load() method, which is called when the project is run.
+    """
+
+    ## @var filename (string)
+    # name of the file from which the scan is loaded.
+    # a format specifier like {project} can be used to include the base path.
+
+    ## @var emitter (string)
+    # chemical symbol of the photo-emitting atoms,
+    # optionally followed by a further specification (chemical state, environment, ...).
+    # the string is interpreted by the project and its cluster generator.
+    # however, it should always start with a chemical element symbol.
+    #
+    # examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.
+
+    ## @var initial_state (string)
+    # nl term of the initial state
+    #
+    # in the form expected by EDAC, for example: '2p1/2'
+
+    ## @var is_modf (bool)
+    # declares whether the data file contains the modulation function rather than intensity values
+    #
+    # if false, the project will calculate a modulation function from the raw data
+
+    def __init__(self):
+        super().__init__()
+        self.is_modf = False
+        self.initial_state = "1s"
+        self.emitter = ""
+        self.filename = ""
+
+    def load(self, dirs=None):
+        """
+        create the scan object and import the data file
+
+        the file name is resolved by config.resolve_path() and imported by Scan.import_scan_file().
+        if is_modf is set, the imported raw data is assigned to the modulation attribute of the scan.
+
+        @param dirs: passed to config.resolve_path() together with the file name.
+        @return a new Scan object which contains the loaded data file.
+        """
+        loaded = Scan()
+        resolved_name = config.resolve_path(self.filename, dirs)
+        loaded.import_scan_file(resolved_name, self.emitter, self.initial_state)
+        if not self.is_modf:
+            return loaded
+        loaded.modulation = loaded.raw_data
+        return loaded
+
+
+class ScanCreator(config.ConfigurableObject):
+    """
+    create a Scan object from string expressions
+
+    this class can be used in a run file to create a scan object from python expressions,
+    such as lists, ranges or numpy functions.
+    to do so, fill the properties with values as documented.
+    the load() method is called when the project is run.
+
+    @note the raw_data property of the scan cannot be filled this way.
+    thus, the class is useful in `single` calculation mode only.
+    """
+
+    ## @var filename (string)
+    # name of the file which should receive the scan data.
+    # the file name can contain a format specifier like {project} to include the base path.
+
+    ## @var positions (dict)
+    # dictionary specifying the scan positions
+    #
+    # the dictionary must contain four keys: 'e', 't', 'p', 'a' representing the four scan axes.
+    # each key holds a string that contains a python expression.
+    # the string is evaluated using python's built-in eval() function.
+    # the expression must evaluate to an iterable object or numpy ndarray of the scan positions.
+    # the `np` namespace can be used to access numpy functions.
+    #
+    # alternatively, a key may hold the positions directly (number, sequence or numpy ndarray).
+    # values that are not strings are used as they are.
+    #
+    # @warning eval() executes arbitrary python code. use expressions from trusted run files only.
+    #
+    # example:
+    # the following dictionary generates a hemispherical scan
+    # self.positions = {'e': '100', 't': 'np.linspace(0, 90, 91)', 'p': 'range(0, 360, 2)', 'a': '0'}
+
+    ## @var emitter (string)
+    # chemical symbol and, optionally following, further specification (chemical state, environment, ...)
+    # of photo-emitting atoms.
+    # the interpretation of this string is up to the project and its cluster generator.
+    # it should, however, always start with a chemical element symbol.
+    #
+    # examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.
+
+    ## @var initial_state (string)
+    # nl term of initial state
+    #
+    # in the form expected by EDAC, for example: '2p1/2'
+
+    def __init__(self):
+        super().__init__()
+        self.filename = ""
+        self.positions = {'e': None, 't': None, 'p': None, 'a': None}
+        self.emitter = ""
+        self.initial_state = "1s"
+
+    def load(self, dirs=None):
+        """
+        create the scan according to specification
+
+        each item of self.positions is converted to a numpy array of at least one dimension.
+        string items are evaluated as python expressions first.
+        the arrays are passed to Scan.define_scan().
+
+        @param dirs: passed to config.resolve_path() together with the file name.
+        @return a new Scan object which contains the created scan array.
+        @raise TypeError if the positions of an axis are undefined (None).
+        """
+        scan = Scan()
+        positions = {}
+        for axis, spec in self.positions.items():
+            if spec is None:
+                # without this check, the failure would surface as an anonymous TypeError from eval.
+                raise TypeError(f"scan positions of axis '{axis}' are undefined")
+            if isinstance(spec, str):
+                # NOTE(security): eval runs arbitrary code. the expressions must come from a trusted source.
+                spec = eval(spec)
+            positions[axis] = np.atleast_1d(np.asarray(spec))
+        scan.define_scan(positions, self.emitter, self.initial_state)
+        scan.filename = config.resolve_path(self.filename, dirs)
+        return scan
+

 # noinspection PyMethodMayBeStatic
-class Project(object):
+class Project(config.ConfigurableObject):
    """
    base class of a calculation project.

@ -609,17 +790,18 @@ class Project(object):
    #

    ## @var scans (list of Scan objects)
-    #  list of experimental or scan files for which calculations are to be run.
+    # list of experimental scans for which calculations are to be run.
    #
-    #  the list must be populated by calling the add_scan() method.
-    #  this should be done in the create_project() function, or through the command line arguments.
+    # during project initialization, this list must be populated with Scan, ScanLoader or ScanCreator objects.
+    # while Scan objects contain all scan data, the latter two classes contain only scan specifications
+    # which are expanded (i.e. files are loaded or arrays are calculated) just before the calculations start.
+    # the Project.add_scan() method is a short-cut to create the respective scan object from few arguments.
+    # before the calculation starts, all objects are converted into fully specified Scan objects
+    # and scan data is loaded or calculated.
    #
-    #  the modulation function is calculated internally.
-    #  if your scan files contain the modulation function (as opposed to intensity),
-    #  you must add the files in the create_project() function.
-    #  the command line does not support loading modulation functions.
-    #
-    #  @c scans must be considered read-only. use project methods to change it.
+    # there are two ways to fill this list:
+    # either the project code fills it as a part of its initialization (create_project),
+    # or the list is populated via the run-file.

    ## @var domains (list of arbitrary objects)
    #  list of domains for which calculations are to be run.
@ -661,28 +843,22 @@ class Project(object):
    #   set this argument to False only if the calculation is a continuation of a previous one
    #   without any changes to the code.

-    ## @var data_dir
-    # directory path to experimental data.
+    ## @var directories
+    # dictionary for various directory paths.
    #
-    # the project should load experimental data (scan files) from this path.
-    # this attribute receives the --data-dir argument from the command line
-    # if the project parses the common arguments (pmsco.set_common_args).
-    #
-    # it is up to the project to define where to load scan files from.
-    # if the location of the files may depend on the machine or user account,
-    # the user may want to specify the data path on the command line.
-
-    ## @var output_dir (string)
-    # directory path for data files produced during the calculation, including intermediate files.
+    # home: user's home directory.
+    # work: current working directory at the time the project object is created.
+    # data: where to load experimental data (scan files) from.
+    # project: directory of the project module.
+    # output: where to write output and intermediate files.
+    # temp: for temporary files.
    #
    # output_dir and output_file are set at once by @ref set_output.

-    ## @var output_file (string)
+    ## @var output_file (Path)
    # file name root for data files produced during the calculation, including intermediate files.
    #
-    # the file name should include the path. the path must also be set in @ref output_dir.
-    #
-    # output_dir and output_file are set at once by @ref set_output.
+    # this is the concatenation of self.directories['output'] and self.job_name.
+    # assignment to this property will update the two basic attributes.

    ## @var db_file (string)
    # name of an sqlite3 database file where the calculation results should be stored.
@ -694,14 +870,17 @@ class Project(object):
    #
    # the actual wall time may be longer by the remaining time of running calculations.
    # running calculations will not be aborted.
+    #
+    # the time_limit property is an alternative representation as hours.
+    # reading and writing accesses timedelta_limit.

    ## @var combined_scan
    # combined raw data from scans.
-    # updated by add_scan().
+    # updated by self.load_scans().

    ## @var combined_modf
    # combined modulation function from scans.
-    # updated by add_scan().
+    # updated by self.load_scans().

    ## @var files
    # list of all generated data files with metadata.
@ -741,14 +920,17 @@ class Project(object):
    #

    def __init__(self):
+        super().__init__()
+        self._module = None
        self.mode = "single"
-        self.job_name = ""
+        self.job_name = "pmsco0"
        self.job_tags = {}
        self.git_hash = ""
        self.description = ""
        self.features = {}
-        self.cluster_format = mc.FMT_EDAC
-        self.cluster_generator = mc.LegacyClusterGenerator(self)
+        self.cluster_format = pmsco.cluster.FMT_EDAC
+        self.cluster_generator = pmsco.cluster.LegacyClusterGenerator(self)
+        self._model_space = None
        self.scans = []
        self.domains = []
        self.optimizer_params = {
@ -758,39 +940,170 @@ class Project(object):
            'recalc_seed': True,
            'table_file': ""
        }
-        self.data_dir = ""
-        self.output_dir = ""
-        self.output_file = "pmsco_data"
+        self.directories = {
+            "home": Path.home(),
+            "work": Path.cwd(),
+            "data": "",
+            "project": "",
+            "output": "",
+            "temp": ""}
+        self.log_file = ""
+        self.log_level = "WARNING"
        self.db_file = ':memory:'
        self.timedelta_limit = datetime.timedelta(days=1)
        self.combined_scan = None
        self.combined_modf = None
-        self.files = files.FileTracker()
+        self.files = pmsco.files.FileTracker()
+        self.keep_files = list(pmsco.files.FILE_CATEGORIES_TO_KEEP)
        self.keep_levels = 1
        self.keep_best = 10
        self.handler_classes = {
-            'model': handlers.SingleModelHandler,
-            'scan': handlers.ScanHandler,
-            'domain': handlers.DomainHandler,
-            'emit': handlers.EmitterHandler,
-            'region': handlers.SingleRegionHandler
+            'model': pmsco.handlers.SingleModelHandler,
+            'scan': pmsco.handlers.ScanHandler,
+            'domain': pmsco.handlers.DomainHandler,
+            'emit': pmsco.handlers.EmitterHandler,
+            'region': pmsco.handlers.SingleRegionHandler
        }
        self.atomic_scattering_factory = InternalAtomicCalculator
        self.multiple_scattering_factory = EdacCalculator
        self._tasks_fields = []
-        self._db = database.ResultsDatabase()
+        self._db = pmsco.database.ResultsDatabase()
+
+    def validate(self):
+        """
+        validate the project parameters before starting the calculations
+
+        the method checks and fixes attributes that may cause trouble or go unnoticed if they are wrong.
+        in addition, it fixes attributes which may be incomplete after loading a run-file.
+        failed critical checks raise an exception (AssertionError, AttributeError, KeyError, ValueError).
+        checks that cause an attribute to revert to default are logged as warning.
+
+        the following attributes are fixed silently:
+        - scattering factories that are declared as string are looked up in the project module.
+        - place holders in the directories attribute are resolved.
+        - place holders in the output_file attribute are resolved.
+        - output_file and output_dir are made consistent (so that output_file includes output_dir).
+        - the create_model_space() method is called if the model_space attribute is undefined.
+        - scan data are loaded.
+
+        @note to check the syntax of a run-file, set the calculation mode to 'validate' and run pmsco.
+        this will pass the validate method but will stop execution before calculations are started.
+
+        @raise AssertionError if a parameter is not correct.
+        @raise AttributeError if a class name cannot be resolved.
+        """
+        # NOTE(review): assert statements are not executed if python runs with the -O option.
+        assert self.mode in {"single", "swarm", "genetic", "grid", "table", "test", "validate"}
+
+        # factories given as a name (string) are replaced by the attribute of that name in self._module.
+        if isinstance(self.atomic_scattering_factory, str):
+            self.atomic_scattering_factory = getattr(self._module, self.atomic_scattering_factory)
+        if isinstance(self.multiple_scattering_factory, str):
+            self.multiple_scattering_factory = getattr(self._module, self.multiple_scattering_factory)
+
+        # every entry is converted to a Path and resolved against the (unresolved) directories dictionary.
+        self.directories = {k: config.resolve_path(Path(v), self.directories) for k, v in self.directories.items()}
+
+        # output_file is a property:
+        # reading joins directories['output'] and job_name,
+        # assigning updates directories['output'] (if the path has a parent) and job_name.
+        assert len(str(self.output_file))
+        d = config.resolve_path(self.directories['output'], self.directories)
+        f = config.resolve_path(self.output_file, self.directories)
+        self.output_file = Path(d, f)
+        self.directories['output'] = self.output_file.parent
+
+        # fall back to the create_model_space method if no (or an empty) model space has been assigned.
+        if self._model_space is None or not self._model_space.start:
+            logger.warning("undefined model_space attribute, trying project's create_model_space")
+            self._model_space = self.create_model_space()
+
+        # the scans are loaded last, after the directories and the model space have been set above.
+        self.load_scans()
+
+    @property
+    def data_dir(self):
+        """
+        directory of experimental data (scan files).
+
+        this property reads and writes self.directories['data'].
+        on assignment, the value is converted to a pathlib.Path.
+        """
+        return self.directories['data']
+
+    @data_dir.setter
+    def data_dir(self, path):
+        self.directories['data'] = Path(path)
+
+    @property
+    def output_dir(self):
+        """
+        directory of output and intermediate files.
+
+        this property reads and writes self.directories['output'].
+        on assignment, the value is converted to a pathlib.Path.
+        """
+        return self.directories['output']
+
+    @output_dir.setter
+    def output_dir(self, path):
+        self.directories['output'] = Path(path)
+
+    @property
+    def output_file(self):
+        """
+        path and base name of the output files.
+
+        the value is composed of self.directories['output'] and self.job_name.
+        """
+        folder = self.directories['output']
+        return Path(folder, self.job_name)
+
+    @output_file.setter
+    def output_file(self, filename):
+        """
+        set path and base name of the output files.
+
+        the parent path is stored in self.directories['output'] unless it is empty or the current directory.
+        the file stem is stored in the job_name attribute.
+
+        @param filename: (PathLike)
+        @raise ValueError if the file stem is empty.
+        """
+        target = Path(filename)
+        folder = target.parent
+        if str(folder) not in ("", "."):
+            self.directories['output'] = folder
+        stem = str(target.stem)
+        if not stem:
+            raise ValueError("invalid output file name")
+        self.job_name = stem
+
+    @property
+    def time_limit(self):
+        """
+        wall time limit in hours.
+
+        this is an alternative representation of the timedelta_limit attribute.
+        reading and writing accesses timedelta_limit.
+        the getter and the setter use the same unit (hours),
+        so that a value that is assigned is read back unchanged.
+        """
+        return self.timedelta_limit.total_seconds() / 3600
+
+    @time_limit.setter
+    def time_limit(self, hours):
+        self.timedelta_limit = datetime.timedelta(hours=hours)

    def create_model_space(self):
        """
        create a project.ModelSpace object which defines the allowed range for model parameters.

-        this method must be implemented by the actual project class.
-        the ModelSpace object must declare all model parameters used in the project.
+        there are three ways for a project to declare the model space:
+        1. implement the @ref create_model_space method.
+           this is the older way and may become deprecated in a future version.
+        2. assign a ModelSpace to the self.model_space property directly
+           (in the @ref validate method).
+        3. declare the model space in the run-file.
+
+        this method is called by the validate method only if self._model_space is undefined.

        @return ModelSpace object
        """
        return None

+    @property
+    def model_space(self):
+        """
+        ModelSpace object that defines the allowed range for model parameters.
+
+        a project can declare the model space in one of three ways:
+        1. by implementing the @ref create_model_space method.
+           this is the older way and may become deprecated in a future version.
+        2. by assigning a ModelSpace to the self.model_space property directly
+           (in the @ref validate method).
+        3. by declaring the model space in the run-file.
+
+        on assignment, a ModelSpace object is stored as is.
+        an object that has an items attribute (e.g. a dictionary)
+        is converted to a new ModelSpace by ModelSpace.set_param_dict.
+        any other object raises a ValueError.
+
+        initially, this property is None.
+        """
+        return self._model_space
+
+    @model_space.setter
+    def model_space(self, value):
+        if isinstance(value, ModelSpace):
+            self._model_space = value
+            return
+        if not hasattr(value, 'items'):
+            raise ValueError("incompatible object type")
+        # the new object is assigned before it is filled, as a failure in set_param_dict must not restore the old one.
+        space = ModelSpace()
+        self._model_space = space
+        space.set_param_dict(value)
+
    def create_params(self, model, index):
        """
        create a CalculatorParams object given the model parameters and calculation index.
@ -816,11 +1129,15 @@ class Project(object):
        self.combined_scan = None
        self.combined_modf = None

-    def add_scan(self, filename, emitter, initial_state, is_modf=False, modf_model=None, positions=None):
+    def add_scan(self, filename, emitter, initial_state, is_modf=False, positions=None):
        """
-        add the file name of reference experiment and load it.
-        
-        the extension must be one of msc_data.DATATYPES (case insensitive)
+        add a scan specification to the scans list.
+
+        this is a shortcut for adding a ScanCreator or ScanLoader object to the self.scans list.
+        the creator or loader are converted into full Scan objects just before the calculation starts
+        (in the self.setup() method).
+
+        the extension must be one of pmsco.data.DATATYPES (case insensitive)
        corresponding to the meaning of the columns in the file.
        
        caution: EDAC can only calculate equidistant, rectangular scans.
@ -831,9 +1148,6 @@ class Project(object):
        * intensity vs theta, phi, or alpha
        * intensity vs theta and phi (hemisphere or hologram scan)

-        the method calculates the modulation function if @c is_modf is @c False.
-        it also updates @c combined_scan and @c combined_modf which may be used as R-factor comparison targets.
-
        @param filename: (string) file name of the experimental data, possibly including a path.
            the file is not loaded when the optional positions argument is present,
            but the filename may serve as basename for output files (e.g. modulation function).
@ -852,57 +1166,64 @@ class Project(object):
        @param is_modf: (bool) declares whether the file contains the modulation function (True),
            or intensity (False, default). In the latter case, the modulation function is calculated internally.

-        @param modf_model: (dict) model parameters to be passed to the modulation function.
-
        @return (Scan) the new scan object (which is also a member of self.scans).
        """
-        scan = Scan()
        if positions is not None:
-            scan.define_scan(positions, emitter, initial_state)
-            scan.filename = filename
+            scan = ScanCreator()
+            scan.positions = positions
        else:
-            scan.import_scan_file(filename, emitter, initial_state)
+            scan = ScanLoader()
+            scan.is_modf = is_modf
+
+        scan.filename = filename
+        scan.emitter = emitter
+        scan.initial_state = initial_state
        self.scans.append(scan)

-        if modf_model is None:
-            modf_model = {}
+        return scan

-        if scan.raw_data is not None:
-            if is_modf:
-                scan.modulation = scan.raw_data
-            else:
+    def load_scans(self):
+        """
+        load all scan data.
+
+        initially, the self.scans list may contain objects of different classes (Scan, ScanLoader, ScanCreator)
+        depending on the project initialization.
+        this method loads all data, so that the scans list contains only Scan objects.
+
+        also, the self.combined_scan and self.combined_modf fields are calculated from the scans.
+        """
+        has_raw_data = True
+        has_mod_func = True
+        loaded_scans = []
+
+        for idx, scan in enumerate(self.scans):
+            scan = scan.load(dirs=self.directories)
+            loaded_scans.append(scan)
+            if scan.modulation is None:
                try:
-                    scan.modulation = self.calc_modulation(scan.raw_data, modf_model)
+                    scan.modulation = self.calc_modulation(scan.raw_data, self.model_space.start)
                except ValueError:
-                    logger.error("error calculating the modulation function of experimental data.")
-                    scan.modulation = None
-        else:
-            scan.modulation = None
+                    logger.error(f"error calculating the modulation function of scan {idx}.")
+            has_raw_data = has_raw_data and scan.raw_data is not None
+            has_mod_func = has_mod_func and scan.modulation is not None
+        self.scans = loaded_scans

-        if scan.raw_data is not None:
-            if self.combined_scan is not None:
-                dt = md.common_dtype((self.combined_scan, scan.raw_data))
-                d1 = md.restructure_data(self.combined_scan, dt)
-                d2 = md.restructure_data(scan.raw_data, dt)
-                self.combined_scan = np.hstack((d1, d2))
-            else:
-                self.combined_scan = scan.raw_data.copy()
+        if has_raw_data:
+            stack1 = [scan.raw_data for scan in self.scans]
+            dtype = md.common_dtype(stack1)
+            stack2 = [md.restructure_data(data, dtype) for data in stack1]
+            self.combined_scan = np.hstack(tuple(stack2))
        else:
            self.combined_scan = None

-        if scan.modulation is not None:
-            if self.combined_modf is not None:
-                dt = md.common_dtype((self.combined_modf, scan.modulation))
-                d1 = md.restructure_data(self.combined_modf, dt)
-                d2 = md.restructure_data(scan.modulation, dt)
-                self.combined_modf = np.hstack((d1, d2))
-            else:
-                self.combined_modf = scan.modulation.copy()
+        if has_mod_func:
+            stack1 = [scan.modulation for scan in self.scans]
+            dtype = md.common_dtype(stack1)
+            stack2 = [md.restructure_data(data, dtype) for data in stack1]
+            self.combined_modf = np.hstack(tuple(stack2))
        else:
            self.combined_modf = None

-        return scan
-
    def clear_domains(self):
        """
        clear domains.
@ -933,42 +1254,6 @@ class Project(object):
        """
        self.domains.append(domain)

-    def set_output(self, filename):
-        """
-        set path and base name of output file.
-
-        path and name are copied to the output_file attribute.
-        path is copied to the output_dir attribute.
-
-        if the path is missing, the destination is the current working directory.
-        """
-        self.output_file = filename
-        path, name = os.path.split(filename)
-        self.output_dir = path
-        self.job_name = name
-
-    def set_timedelta_limit(self, timedelta, margin_minutes=10):
-        """
-        set the walltime limit with a safety margin.
-
-        this method sets the internal self.timedelta_limit attribute.
-        by default, a safety margin of 10 minutes is subtracted to the main argument
-        in order to increase the probability that the process ends in time.
-        if this is not wanted, the project class may override the method and provide its own margin.
-
-        the method is typically called with the command line time limit from the main module.
-
-        @note the safety margin could be applied at various levels.
-        it is done here because it can easily be overridden by the project subclass.
-        to keep run scripts simple, the command line can be given the same time limit
-        as the job scheduler of the computing cluster.
-
-        @param timedelta: (datetime.timedelta) max. duration of the calculation process (wall time).
-
-        @param margin_minutes: (int) safety margin in minutes to subtract from timedelta.
-        """
-        self.timedelta_limit = timedelta - datetime.timedelta(minutes=margin_minutes)
-
    def log_project_args(self):
        """
        send some common project attributes to the log.
@ -981,6 +1266,14 @@ class Project(object):
        @return: None
        """
        try:
+            for key in self.directories:
+                val = self.directories[key]
+                lev = logging.WARNING if val else logging.DEBUG
+                logger.log(lev, f"directories['{key}']: {val}")
+
+            logger.warning("output file: {0}".format(self.output_file))
+            logger.warning("database: {0}".format(self.db_file))
+
            logger.warning("atomic scattering: {0}".format(self.atomic_scattering_factory))
            logger.warning("multiple scattering: {0}".format(self.multiple_scattering_factory))
            logger.warning("optimization mode: {0}".format(self.mode))
@ -990,15 +1283,11 @@ class Project(object):
                lev = logging.WARNING if val else logging.DEBUG
                logger.log(lev, "optimizer_params['{k}']: {v}".format(k=key, v=val))

-            logger.warning("data directory: {0}".format(self.data_dir))
-            logger.warning("output file: {0}".format(self.output_file))
-            logger.warning("database: {0}".format(self.db_file))
-
-            _files_to_keep = files.FILE_CATEGORIES - self.files.categories_to_delete
+            _files_to_keep = pmsco.files.FILE_CATEGORIES - self.files.categories_to_delete
            logger.warning("intermediate files to keep: {0}".format(", ".join(_files_to_keep)))

            for idx, scan in enumerate(self.scans):
-                logger.warning(f"scan {idx}: {scan.filename} ({scan.emitter} {scan.initial_state}")
+                logger.warning(f"scan {idx}: {scan.filename} ({scan.emitter} {scan.initial_state})")
            for idx, dom in enumerate(self.domains):
                logger.warning(f"domain {idx}: {dom}")

@ -1247,16 +1536,26 @@ class Project(object):
        """
        self.git_hash = self.get_git_hash()
        fields = ["rfac"]
-        fields.extend(dispatch.CalcID._fields)
+        fields.extend(pmsco.dispatch.CalcID._fields)
        fields.append("secs")
        fields = ["_" + f for f in fields]
-        mspace = self.create_model_space()
-        model_fields = list(mspace.start.keys())
+        model_fields = list(self.model_space.start.keys())
        model_fields.sort(key=lambda name: name.lower())
        fields.extend(model_fields)
        self._tasks_fields = fields

-        with open(self.output_file + ".tasks.dat", "w") as outfile:
+        if 'all' in self.keep_files:
+            cats = set([])
+        else:
+            cats = pmsco.files.FILE_CATEGORIES - set(self.keep_files)
+        cats -= {'report'}
+        if self.mode == 'single':
+            cats -= {'model'}
+        self.files.categories_to_delete = cats
+
+        Path(self.output_file).parent.mkdir(parents=True, exist_ok=True)
+        tasks_file = Path(self.output_file).with_suffix(".tasks.dat")
+        with open(tasks_file, "w") as outfile:
            outfile.write("# ")
            outfile.write(" ".join(fields))
            outfile.write("\n")
@ -1311,7 +1610,8 @@ class Project(object):
                values_dict['_rfac'] = parent_task.rfac
                values_dict['_secs'] = parent_task.time.total_seconds()
                values_list = [values_dict[field] for field in self._tasks_fields]
-                with open(self.output_file + ".tasks.dat", "a") as outfile:
+                tasks_file = Path(self.output_file).with_suffix(".tasks.dat")
+                with open(tasks_file, "a") as outfile:
                    outfile.write(" ".join(format(value) for value in values_list) + "\n")

                db_id = self._db.insert_result(parent_task.id, values_dict)
@ -1548,11 +1848,11 @@ class Project(object):
        """
        _files = {}
        xyz_filename = filename + ".xyz"
-        cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ)
+        cluster.save_to_file(xyz_filename, fmt=pmsco.cluster.FMT_XYZ)
        _files[xyz_filename] = 'cluster'

        xyz_filename = filename + ".emit.xyz"
-        cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ, emitters_only=True)
+        cluster.save_to_file(xyz_filename, fmt=pmsco.cluster.FMT_XYZ, emitters_only=True)
        _files[xyz_filename] = 'cluster'

        return _files