public release 2.2.0 - see README.md and CHANGES.md for details

2020-09-04 16:22:42 +02:00
parent fbd2d4fa8c
commit 7c61eb1b41
67 changed files with 2934 additions and 682 deletions
--- a/pmsco/project.py
+++ b/pmsco/project.py
@ -4,7 +4,7 @@ project-independent classes which store and handle model parameters.

 the most important class defined here is Project.
 each calculation project needs to derive its own project class from it.
-the Domain and Params classes are typically used unchanged.
+the ModelSpace and CalculatorParams classes are typically used unchanged.

@note nomenclature: the term @e parameters has several meanings in the code and documentation.
    the following distinctive terms are used in updated documentation sections.
@ -53,10 +53,10 @@ from pmsco.helpers import BraceMessage as BMsg

 logger = logging.getLogger(__name__)

-ParamDomain = collections.namedtuple('ParamDomain', ['start', 'min', 'max', 'step'])
+ParamSpace = collections.namedtuple('ParamSpace', ['start', 'min', 'max', 'step'])


-class Domain(object):
+class ModelSpace(object):
    """
    Domain of model parameters.

@ -151,14 +151,14 @@ class Domain(object):

        @param name (string) name of the parameter.

-        @return named tuple ParamDomain(start, min, max, step) of the parameter.
+        @return named tuple ParamSpace(start, min, max, step) of the parameter.

        @raise IndexError if the parameter is not defined.
        """
-        return ParamDomain(self.start[name], self.min[name], self.max[name], self.step[name])
+        return ParamSpace(self.start[name], self.min[name], self.max[name], self.step[name])


-class Params(object):
+class CalculatorParams(object):
    """
    calculation parameters for a single scattering calculation job.

@ -166,7 +166,7 @@ class Params(object):

    the class can hold parameters for both the MSC and EDAC codes.
    some parameters are used by both codes, others are used just by one of them.
-    newer features such as multiple emitters, multiple symmetries, and others are supported in EDAC mode only.
+    newer features such as multiple emitters, multiple domains, and others are supported in EDAC mode only.
    MSC mode is currently not maintained.

    objects of this class are created by the implementation of the create_params() method
@ -253,7 +253,7 @@ class Params(object):

    def __init__(self):
        self.title = "default parameters"
-        self.comment = "set by project.Params()"
+        self.comment = "set by project.CalculatorParams()"
        self.cluster_file = ""
        self.output_file = ""
        self.scan_file = ""
@ -580,7 +580,7 @@ class Project(object):
    the results include a measure of the quality of the simulated data compared to experimental data.
    
    each calculation project must derive from this class.
-    it must implement the create_domain(), create_cluster(), and create_params() methods.
+    it must implement the create_model_space(), create_cluster(), and create_params() methods.
    
    the other methods and attributes of this class
    are for passing command line parameters to the calculation modules.
@ -621,14 +621,14 @@ class Project(object):
    #
    #  @c scans must be considered read-only. use project methods to change it.

-    ## @var symmetries (list of arbitrary objects)
-    #  list of symmetries for which calculations are to be run.
+    ## @var domains (list of arbitrary objects)
+    #  list of domains for which calculations are to be run.
    #
    # it is up to the derived class what kind of objects are stored in the list.
    # the recommended kind of objects are dictionaries which hold parameter values,
    # similar to the model dictionaries.
    #
-    # the list must be populated by calling the add_symmetry() method.
+    # the list must be populated by calling the add_domain() method.

    ## @var cluster_generator (ClusterGenerator object)
    #  provides the cluster generator methods.
@ -684,6 +684,11 @@ class Project(object):
    #
    # output_dir and output_file are set at once by @ref set_output.

+    ## @var db_file (string)
+    # name of an sqlite3 database file where the calculation results should be stored.
+    #
+    # the default value is ':memory:', which creates a volatile in-memory database.
+
    ## @var timedelta_limit (datetime.timedelta)
    # wall time after which no new calculations should be started.
    #
@ -715,7 +720,7 @@ class Project(object):
    #
    # @arg 0 = model level: combined results only.
    # @arg 1 = scan level: scan nodes in addition to combined results (level 0).
-    # @arg 2 = symmetry level: symmetry nodes in addition to level 1.
+    # @arg 2 = domain level: domain nodes in addition to level 1.
    # @arg 3 = emitter level: emitter nodes in addition to level 1.
    # @arg 4 = region level: region nodes in addition to level 1.

@ -738,13 +743,14 @@ class Project(object):
    def __init__(self):
        self.mode = "single"
        self.job_name = ""
+        self.job_tags = {}
        self.git_hash = ""
        self.description = ""
        self.features = {}
        self.cluster_format = mc.FMT_EDAC
        self.cluster_generator = mc.LegacyClusterGenerator(self)
        self.scans = []
-        self.symmetries = []
+        self.domains = []
        self.optimizer_params = {
            'pop_size': 0,
            'seed_file': "",
@ -755,6 +761,7 @@ class Project(object):
        self.data_dir = ""
        self.output_dir = ""
        self.output_file = "pmsco_data"
+        self.db_file = ':memory:'
        self.timedelta_limit = datetime.timedelta(days=1)
        self.combined_scan = None
        self.combined_modf = None
@ -764,7 +771,7 @@ class Project(object):
        self.handler_classes = {
            'model': handlers.SingleModelHandler,
            'scan': handlers.ScanHandler,
-            'sym': handlers.SymmetryHandler,
+            'domain': handlers.DomainHandler,
            'emit': handlers.EmitterHandler,
            'region': handlers.SingleRegionHandler
        }
@ -773,27 +780,27 @@ class Project(object):
        self._tasks_fields = []
        self._db = database.ResultsDatabase()

-    def create_domain(self):
+    def create_model_space(self):
        """
-        create a msc_project.Domain object which defines the allowed range for model parameters.
+        create a project.ModelSpace object which defines the allowed range for model parameters.

        this method must be implemented by the actual project class.
-        the Domain object must declare all model parameters used in the project.
+        the ModelSpace object must declare all model parameters used in the project.

-        @return Domain object
+        @return ModelSpace object
        """
        return None

    def create_params(self, model, index):
        """
-        create a Params object given the model parameters and calculation index.
+        create a CalculatorParams object given the model parameters and calculation index.

        @param model (dictionary) model parameters to be used in the calculation.

        @param index (named tuple CalcID) calculation index.
            the method should consider only the following attributes:
-            @arg @c scan   scan index (index into Project.scans)
-            @arg @c sym    symmetry index (index into Project.symmetries)
+            @arg `scan`   scan index (index into Project.scans)
+            @arg `domain`    domain index (index into Project.domains)
        """
        return None

@ -896,35 +903,35 @@ class Project(object):

        return scan

-    def clear_symmetries(self):
+    def clear_domains(self):
        """
-        clear symmetries.
+        clear domains.

-        delete all symmetries in self.symmetries and empty the list.
+        delete all domains in self.domains and empty the list.

        @return: None
        """
-        self.symmetries = []
+        self.domains = []

-    def add_symmetry(self, symmetry):
+    def add_domain(self, domain):
        """
-        add a symmetry to the list of symmetries.
+        add a domain to the list of domains.

-        this class declares the list of symmetries.
-        it does not define what should be in the list of symmetries.
-        however, there must be an entry for each symmetry to be calculated.
+        this class declares the list of domains.
+        it does not define what should be in the list of domains.
+        however, there must be an entry for each domain to be calculated.
        if the list is empty, no calculation will be executed.

-        @attention initially, the symmetries list is empty.
-            your project needs to add at least one symmetry.
+        @attention initially, the domains list is empty.
+            your project needs to add at least one domain.
            otherwise, no calculation will be executed.

-        @param symmetry: it is up to the derived project class to specify and interpret the data stored here.
-            it is recommended to store a dictionary with symmetry parameters similar to the model parameters.
+        @param domain: it is up to the derived project class to specify and interpret the data stored here.
+            it is recommended to store a dictionary with domain parameters similar to the model parameters.

        @return: None
        """
-        self.symmetries.append(symmetry)
+        self.domains.append(domain)

    def set_output(self, filename):
        """
@ -938,14 +945,29 @@ class Project(object):
        self.output_file = filename
        path, name = os.path.split(filename)
        self.output_dir = path
+        self.job_name = name

-    def set_timedelta_limit(self, timedelta):
+    def set_timedelta_limit(self, timedelta, margin_minutes=10):
        """
-        set the walltime limit
-        
-        timedelta (datetime.timedelta)
+        set the walltime limit with a safety margin.
+
+        this method sets the internal self.timedelta_limit attribute.
+        by default, a safety margin of 10 minutes is subtracted from the main argument
+        in order to increase the probability that the process ends in time.
+        if this is not wanted, the project class may override the method and provide its own margin.
+
+        the method is typically called with the command line time limit from the main module.
+
+        @note the safety margin could be applied at various levels.
+        it is done here because it can easily be overridden by the project subclass.
+        to keep run scripts simple, the command line can be given the same time limit
+        as the job scheduler of the computing cluster.
+
+        @param timedelta: (datetime.timedelta) max. duration of the calculation process (wall time).
+
+        @param margin_minutes: (int) safety margin in minutes to subtract from timedelta.
        """
-        self.timedelta_limit = timedelta
+        self.timedelta_limit = timedelta - datetime.timedelta(minutes=margin_minutes)

    def log_project_args(self):
        """
@ -970,38 +992,40 @@ class Project(object):

            logger.warning("data directory: {0}".format(self.data_dir))
            logger.warning("output file: {0}".format(self.output_file))
+            logger.warning("database: {0}".format(self.db_file))

            _files_to_keep = files.FILE_CATEGORIES - self.files.categories_to_delete
            logger.warning("intermediate files to keep: {0}".format(", ".join(_files_to_keep)))

            for idx, scan in enumerate(self.scans):
-                logger.warning(BMsg("scan {0}: {filename} ({emitter} {initial_state})", idx, **vars(scan)))
-            for idx, sym in enumerate(self.symmetries):
-                logger.warning(BMsg("symmetry {0}: {sym}", idx, sym=sym))
+                logger.warning(f"scan {idx}: {scan.filename} ({scan.emitter} {scan.initial_state})")
+            for idx, dom in enumerate(self.domains):
+                logger.warning(f"domain {idx}: {dom}")

        except AttributeError:
            logger.warning("AttributeError in log_project_args")

-    def combine_symmetries(self, parent_task, child_tasks):
+    def combine_domains(self, parent_task, child_tasks):
        """
-        combine results of different symmetry into one result and calculate the modulation function.
+        combine results of different domains into one result and calculate the modulation function.

-        the symmetry results are read from the file system using the indices defined by the child_tasks,
+        the domain results are read from the file system using the indices defined by the child_tasks,
        and the combined result is written to the file system with the index defined by parent_task.

-        by default, this method adds all symmetries with equal weight.
-        weights can be defined in the model dictionary with keys 'wsym0', 'wsym1', etc.
+        by default, this method adds all domains with equal weight.
+        weights can be defined in the model dictionary with keys 'wdom0', 'wdom1', etc.
        missing weights default to 1.
-        note: to avoid correlated parameters, one symmetry must always have a fixed weight.
+        to avoid correlated parameters, one domain must always have a fixed weight.
+        it is recommended to leave 'wdom0' at its default.

-        @param parent_task: (CalculationTask) parent task of the symmetry tasks.
+        @param parent_task: (CalculationTask) parent task of the domain tasks.
            the method must write the results to the files indicated
            by the @c result_filename and @c modf_filename attributes.

-        @param child_tasks: (sequence of CalculationTask) tasks which identify each symmetry.
+        @param child_tasks: (sequence of CalculationTask) tasks which identify each domain.
            the method must read the source data from the files
            indicated by the @c result_filename attributes.
-            the sequence is sorted by task ID, i.e., essentially, by symmetry index.
+            the sequence is sorted by task ID, i.e., essentially, by domain index.

        @return: None

@ -1009,7 +1033,7 @@ class Project(object):

        @raise IOError if a filename is missing

-        @note the weights of the symmetries (in derived classes) can be part of the optimizable model parameters.
+        @note the weights of the domains (in derived classes) can be part of the optimizable model parameters.
            the model parameters are available as the @c model attribute of the calculation tasks.
        """

@ -1021,7 +1045,7 @@ class Project(object):
                result_data = data.copy()
                result_data['i'] = 0.
            try:
-                weight = task.model['wsym{}'.format(task.id.sym)]
+                weight = task.model['wdom{}'.format(task.id.domain)]
            except KeyError:
                weight = 1.
            result_data['i'] += weight * data['i']
@ -1212,9 +1236,12 @@ class Project(object):
        this instance writes the header of the tasks.dat file
        that will receive sub-task evaluation results from the evaluate_result() method.

+        it also initializes the database where the task results will be stored.
+        this is either a volatile in-memory database or a user-specified sqlite3 database file.
+
        @param handlers: dictionary listing the initialized task handler instances.
            the dictionary keys are the attribute names of pmsco.dispatch.CalcID:
-            'model', 'scan', 'sym', 'emit' and 'region'.
+            'model', 'scan', 'domain', 'emit' and 'region'.

        @return: None
        """
@ -1223,8 +1250,8 @@ class Project(object):
        fields.extend(dispatch.CalcID._fields)
        fields.append("secs")
        fields = ["_" + f for f in fields]
-        dom = self.create_domain()
-        model_fields = list(dom.start.keys())
+        mspace = self.create_model_space()
+        model_fields = list(mspace.start.keys())
        model_fields.sort(key=lambda name: name.lower())
        fields.extend(model_fields)
        self._tasks_fields = fields
@ -1234,9 +1261,10 @@ class Project(object):
            outfile.write(" ".join(fields))
            outfile.write("\n")

-        # todo : change to file-database
-        self._db.connect(":memory:")
-        project_id = self._db.register_project(self.__class__.__name__, sys.argv[0])
+        self._db.connect(self.db_file)
+        project_name = self.__class__.__name__
+        project_module = self.__class__.__module__
+        project_id = self._db.register_project(project_name, project_module)
        job_id = self._db.register_job(project_id,
                                       self.job_name,
                                       self.mode,
@ -1244,6 +1272,9 @@ class Project(object):
                                       self.git_hash,
                                       datetime.datetime.now(),
                                       self.description)
+        logger.debug(BMsg("database {db_file}, project {proj}, job {job}",
+                          db_file=self.db_file, proj=project_id, job=job_id))
+        self._db.insert_jobtags(job_id, self.job_tags)
        self._db.register_params(model_fields)
        self._db.create_models_view()

@ -1283,7 +1314,8 @@ class Project(object):
                with open(self.output_file + ".tasks.dat", "a") as outfile:
                    outfile.write(" ".join(format(value) for value in values_list) + "\n")

-                self._db.insert_result(parent_task.id, values_dict)
+                db_id = self._db.insert_result(parent_task.id, values_dict)
+                logger.debug(BMsg("model {model}, database result {db_id}", model=parent_task.id.model, db_id=db_id))

        return None

@ -1529,7 +1561,7 @@ class Project(object):
        """
        project hook before atomic scattering factors are calculated.

-        this method derives modified Params and Cluster objects for the atomic scattering calculation
+        this method derives modified CalculatorParams and Cluster objects for the atomic scattering calculation
        from the original objects that will be used in the multiple scattering calculation.

        in the basic version, the method does not change the objects
@ -1542,7 +1574,7 @@ class Project(object):
            or None if no global scattering factors should be calculated.
            do not modify this object!

-        @param par: @ref pmsco.project.Params object representing the preliminary
+        @param par: @ref pmsco.project.CalculatorParams object representing the preliminary
            multiple scattering input parameters of the current task.
            the method can make modifications to this object instance directly.

@ -1565,7 +1597,7 @@ class Project(object):
        """
        project hook after atomic scattering factors are calculated.

-        this method cleans up the Params and Cluster objects from the atomic scattering calculation
+        this method cleans up the CalculatorParams and Cluster objects from the atomic scattering calculation
        so that they can be used in the multiple scattering calculation.

        in the basic version, the method just passes the input parameters for model tasks
@ -1578,7 +1610,7 @@ class Project(object):
            (to calculate the fixed scattering factors that will be used for all models)
            or None if no global scattering factors should be calculated.

-        @param par: @ref pmsco.project.Params object representing the preliminary
+        @param par: @ref pmsco.project.CalculatorParams object representing the preliminary
            multiple scattering input parameters of the current task.

        @param clu: @ref pmsco.cluster.Cluster object representing the preliminary
@ -1597,18 +1629,18 @@ class Project(object):

    def cleanup(self):
        """
-        delete unwanted files at the end of a project.
+        delete unwanted files at the end of a project and close the database.

        @return: None
        """
-        self.cleanup_files()
+        self.cleanup_files(incomplete_models=True)
        self._db.disconnect()

-    def cleanup_files(self, keep=0):
+    def cleanup_files(self, keep=0, incomplete_models=False):
        """
-        delete uninteresting files.
+        delete uninteresting files (any time).

-        these are all files that
+        delete all files that
        belong to one of the self.files.categories_to_delete categories or
        do not belong to one of the "best" models.

@ -1619,12 +1651,19 @@ class Project(object):
        this means that in total up to `n = 10 + 10 * n_scans` models may be kept,
        where n_scans is the number of scan files in the job.

+        this method can be called at any time during the calculation process.
+        it executes on complete models only
+        unless incomplete_models is True.
+
        @param keep: minimum number of best models to keep.
            0 (default): use the project parameter self.keep_best.

+        @param incomplete_models: (bool) delete files of incomplete models as well.
+            by default (False), incomplete models are not deleted.
+
        @return None
        """
-        self.files.delete_files()
+        self.files.delete_files(incomplete_models=incomplete_models)
        if 'rfac' in self.files.categories_to_delete:
            keep = max(keep, self.keep_best)
            keepers = self._db.query_best_task_models(self.keep_levels, keep)