public release 4.2.0 - see README.md and CHANGES.md for details
pmsco/reports/results.py (new file, 442 lines)
@@ -0,0 +1,442 @@
"""
@package pmsco.reports.results
query and filter result data for reports

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

@copyright (c) 2021 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""

import logging
import numpy as np
import pmsco.database.orm as db_orm
import pmsco.database.query as db_query
import pmsco.database.util as db_util
from pmsco.project import ModelSpace

logger = logging.getLogger(__name__)


def array_remove_columns(a, cols):
    """
    return a copy of a structured array with some columns removed.

    @param a: numpy structured array
    @param cols: sequence of column names to be removed
    @return: new array
    """
    dtb = [dt for dt in a.dtype.descr if dt[0] not in cols]
    b = np.empty(a.shape, dtype=dtb)
    for col in b.dtype.names:
        b[col] = a[col]
    return b
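
# Example (a hedged sketch, not part of the original module): dropping a
# bookkeeping column from a small structured array. The column names are
# illustrative only.
#
#   a = np.array([(0, 1.5), (1, 2.5)], dtype=[('_gen', 'i4'), ('x', 'f4')])
#   b = array_remove_columns(a, ['_gen'])
#   # b.dtype.names == ('x',) and b['x'] contains [1.5, 2.5]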


def array_range(a):
    """
    determine a default range from actual values.

    @param a: (numpy.ndarray) 1-dimensional structured array of parameter values.
    @return: range_min, range_max are dictionaries of the minimum and maximum values of each parameter.
    """
    names = db_util.regular_params(a.dtype.names)
    range_min = {}
    range_max = {}
    for name in names:
        range_min[name] = a[name].min()
        range_max[name] = a[name].max()
    return range_min, range_max
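
# Example (a hedged sketch): given a structured array with parameter columns
# 'dist' and 'angle' (hypothetical names) next to control columns such as '_gen',
# array_range returns one dictionary of per-parameter minima and one of maxima:
#
#   a = np.array([(0, 2.0, 10.0), (1, 3.0, 30.0)],
#                dtype=[('_gen', 'i4'), ('dist', 'f4'), ('angle', 'f4')])
#   mn, mx = array_range(a)
#   # assuming db_util.regular_params() drops the underscore-prefixed names:
#   # mn == {'dist': 2.0, 'angle': 10.0}, mx == {'dist': 3.0, 'angle': 30.0}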


class ResultData(object):
    """
    data structure for results

    the data is stored in the values and deltas arrays.
    the arrays are numpy structured arrays
    and contain a flat list of results (Result table of the database)
    along with parameter and system variables
    in the format returned by pmsco.database.query.query_model_results_array().

    values and deltas must have the same data type (same fields).
    deltas can be None if they are not available.

    the other attributes serve two purposes:
    some of them define filter rules that are applied by the data loading methods or by the apply_filters() method.
    after loading, the update_collections() method updates them to describe the actually loaded data.

    @attention if you want to reuse the same instance for multiple loads,
    check or reset the filter attributes before each load.
    """

    ## @var generations
    # sequence of generation numbers loaded
    #
    # on loading, data is filtered by the generation numbers in this sequence (`in` operator, see apply_filters() method).
    # by default (None), all generations are loaded.
    # after loading, the sequence contains the loaded generation numbers (update_collections() method).

    ## @var particles
    # sequence of particle numbers loaded
    #
    # on loading, data is filtered by the particle numbers in this sequence (`in` operator, see apply_filters() method).
    # by default (None), all particles are loaded.
    # after loading, the sequence contains the loaded particle numbers (update_collections() method).

    ## @var levels
    # dictionary of level indices loaded
    #
    # the dictionary is organized by task level.
    # allowed keys are: 'scan', 'domain', 'emit' (one or more).
    # the values are sequences of index numbers.
    #
    # on loading, data is filtered by level numbers (`in` operator, see apply_filters() method).
    # by default (None), all levels are loaded.
    # after loading, the sequence contains the loaded indices (update_collections() method).

    ## @var filters
    # extra database filters
    #
    # filter expressions that are not covered by the generations, particles and levels attributes can be entered here.
    # this must be a sequence of sqlalchemy expressions that are passed to the Query.filter methods.
    # these filters take effect in load_from_db() only.

    ## @var order
    # sort order in database query
    #
    # this must be a sequence of pmsco.database.orm.Model and pmsco.database.orm.Result attributes.
    # this sort order takes effect in load_from_db() only.

    ## @var model_space
    # parameter range
    #
    # pmsco.project.ModelSpace object, only min and max are used.
    #
    # @attention this may be a reference to the project's model space object rather than an independent copy.
    # do not modify the object directly! instead, copy the old one and modify the copy!

    ## @var values
    # value data
    #

    ## @var deltas
    # delta data (if loaded)
    #

    def __init__(self):
        self.generations = None
        self.params = None
        self.particles = None
        self.levels = None
        self.filters = None
        self.order = None
        self.values = None
        self.deltas = None
        self.model_space = None

    def load_any(self, values, deltas=None):
        """
        load data of any accepted type.

        the method tries to detect the type of input data and calls the specialized load method.
        the data can be one of the types accepted by
        load_from_population(), load_from_arrays(), load_from_text(), or load_from_db().

        @param values: value data
        @param deltas: delta data (optional)
        @return: None
        """
        if isinstance(values, np.ndarray):
            self.load_from_arrays(values, deltas)
        elif hasattr(values, 'pos') and hasattr(values, 'vel'):
            self.load_from_population(values)
        elif hasattr(values, 'query'):
            self.load_from_db(values)
        else:
            self.load_from_text(values, deltas)

    def load_from_population(self, pop):
        """
        load data from a population object.

        the object should be of pmsco.optimizer.population.Population type
        or have the same pos, vel, results, generation, model_min and model_max attributes.

        loaded data is filtered by generations, particles, and/or levels by the apply_filters() method.

        @param pop: Population-like object
        @return: None
        """
        # the _rfac field of pop.pos is undefined
        pos = np.copy(pop.pos)
        pos['_rfac'] = np.nan
        self.values = np.concatenate([pop.results, pos])
        self.deltas = np.copy(pop.vel)
        self.apply_filters()
        self.update_collections()
        self.generations = np.array((pop.generation,))
        self.model_space = ModelSpace()
        self.model_space.min = pop.model_min
        self.model_space.max = pop.model_max

    def load_from_arrays(self, values, deltas=None):
        """
        load data from numpy arrays.

        data type must be the same as used by pmsco.optimizer.population.Population.

        loaded data is filtered by generations, particles, and/or levels by the apply_filters() method.

        @param values: numpy structured array of result values
        @param deltas: numpy structured array of deltas (optional)
        @return: None
        """
        self.values = values
        self.deltas = deltas
        self.apply_filters()
        self.update_collections()

    def load_from_text(self, values_file, deltas_file=None):
        """
        load data from results file (.dat or .tasks.dat)

        loaded data is filtered by generations, particles, and/or levels by the apply_filters() method.

        @param values_file: path-like or open file
        @param deltas_file: path-like or open file
        @return: None
        @raise OSError if file can't be loaded.
        """
        self.values = np.atleast_1d(np.genfromtxt(values_file, names=True))
        if deltas_file is not None:
            self.deltas = np.atleast_1d(np.genfromtxt(deltas_file, names=True))
        self.apply_filters()
        self.update_collections()
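
    # Example input (a hedged sketch): load_from_text expects whitespace-delimited
    # text with a header row of column names, as read by np.genfromtxt(names=True).
    # A values file might start like this ('dist' and 'angle' are illustrative
    # parameter names; the underscore-prefixed columns are the ones used by this module):
    #
    #   _gen _particle _rfac dist angle
    #   0    0         0.62  2.01 12.5
    #   0    1         0.55  2.10 14.0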

    def load_from_db(self, session, jobs=-1, include_params=True):
        """
        load data from the database.

        data is filtered on the SQL level by self.generations, self.particles, self.levels and self.filters.
        data is ordered on the SQL level by self.order,
        which defaults to generation, particle, scan, domain, emit.

        @param session: database session, from pmsco.database.access.DatabaseAccess.session().
        @param jobs: filter by job.
            the argument can be a single orm.Job object or numeric id, or a sequence of them.
            if None, results from all jobs are loaded.
            if -1 (default), results from the most recent job (by datetime field) are loaded.
        @param include_params: include parameter values of each model in the result (True, default).
            if you're just interested in the R-factor, set this to False and parameter values are not retrieved.
        @return: None
        """
        if jobs == -1:
            jobs = db_query.query_newest_job(session)

        # work on a copy so that the self.filters attribute is not modified
        filters = list(self.filters) if self.filters is not None else []
        if self.generations is not None:
            filters.append(db_orm.Model.gen.in_(self.generations))
        if self.particles is not None:
            filters.append(db_orm.Model.particle.in_(self.particles))
        if self.levels is not None:
            for k, v in self.levels.items():
                if k[0] == '_':
                    k = k[1:]
                if hasattr(type(v), '__iter__'):
                    filters.append(getattr(db_orm.Result, k).in_(v))
                else:
                    filters.append(getattr(db_orm.Result, k) == v)

        if self.order is None:
            self.order = [db_orm.Model.gen, db_orm.Model.particle,
                          db_orm.Result.scan, db_orm.Result.domain, db_orm.Result.emit]

        hook_data = {'filters': filters}
        self.values, self.deltas = db_query.query_model_results_array(session,
                                                                      jobs=jobs,
                                                                      query_hook=self._filters_hook,
                                                                      hook_data=hook_data,
                                                                      include_params=include_params,
                                                                      order=self.order)

        self.update_collections()
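
    # Example (a hedged sketch, assuming `session` was obtained from
    # pmsco.database.access.DatabaseAccess.session() as described above):
    #
    #   rd = ResultData()
    #   rd.generations = [10, 11, 12]              # SQL-level filter on Model.gen
    #   rd.levels = {'scan': [0]}                  # SQL-level filter on Result.scan
    #   rd.filters = [db_orm.Model.particle < 20]  # extra sqlalchemy expression
    #   rd.load_from_db(session, jobs=-1, include_params=True)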

    @staticmethod
    def _filters_hook(query, filters):
        """
        hook function used in ResultData.load_from_db

        the function adds a sequence of conditions to a database query.

        @param query: sqlalchemy query object
        @param filters: sequence of filter expressions to be passed to query.filter.
            example: db_orm.Model.gen.in_([1, 2, 3])
        @return: modified query
        """
        for f in filters:
            query = query.filter(f)
        return query

    def load_from_project(self, project):
        """
        define the model space from a project object.

        @note this copies the reference. the referenced object must not be modified!

        @param project: project object that provides a model_space attribute (cf. pmsco.project).
        @return: None
        """
        self.model_space = project.model_space

    def reset_filters(self):
        """
        reset all filter attributes to default values

        this function resets all instance attributes that modify the query statement to their default values.
        in particular: generations, params, particles, levels, filters and order.
        it does not affect the values and deltas arrays.

        @return: None
        """
        self.generations = None
        self.params = None
        self.particles = None
        self.levels = None
        self.filters = None
        self.order = None

    def apply_filters(self):
        """
        apply generation, particle and level filters to the loaded arrays.

        this method acts on the loaded values and deltas arrays
        and replaces them with arrays containing only the selected rows.

        @return: None
        """
        filters = {}
        if self.generations is not None:
            filters['_gen'] = list(self.generations)
        if self.particles is not None:
            filters['_particle'] = list(self.particles)
        if self.levels is not None:
            for k, v in self.levels.items():
                if k[0] != '_':
                    k = '_' + k
                if k in self.values.dtype.names:
                    filters[k] = list(v)

        for k, v in filters.items():
            idx = np.where(np.isin(self.values[k], v))
            self.values = self.values[idx]
            if self.deltas is not None:
                idx = np.where(np.isin(self.deltas[k], v))
                self.deltas = self.deltas[idx]
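
    # Example (illustrative): narrowing already loaded arrays down to one scan,
    # assuming the arrays carry a '_scan' column.
    #
    #   rd.levels = {'scan': [0]}    # keys may be given with or without the leading underscore
    #   rd.apply_filters()
    #   rd.update_collections()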

    def update_collections(self):
        """
        update attributes that depend on the values and deltas arrays.

        namely: params, generations, particles, model_space

        this method is called by the load methods after loading data.

        @return: None
        """
        self.params = db_util.regular_params(self.values.dtype.names)

        try:
            self.generations = np.unique(self.values['_gen'])
        except (KeyError, ValueError):
            pass

        try:
            self.particles = np.unique(self.values['_particle'])
        except (KeyError, ValueError):
            pass

        self.model_space = ModelSpace()
        self.model_space.min, self.model_space.max = array_range(self.values)

    def debug_log(self):
        """
        write the main attributes to the logger at debug level.
        """
        logger.debug(f"params = {self.params}")
        logger.debug(f"generations = {self.generations}")
        logger.debug(f"particles = {self.particles}")
        logger.debug(f"levels = {self.levels}")
        logger.debug(f"model_space.min = {self.model_space.min}")
        logger.debug(f"model_space.max = {self.model_space.max}")
        logger.debug(f"values.shape = {self.values.shape}")
        logger.debug(f"values.dtype = {self.values.dtype}")

    def set_model_space(self, model_space):
        """
        set the model space (parameter value range)

        @note the model space is updated by the load methods and update_collections().

        @param model_space: model space can be a pmsco.project.ModelSpace object,
            any object that contains the same min and max attributes as pmsco.project.ModelSpace,
            or a dictionary with two keys 'min' and 'max' that provides the corresponding ModelSpace dictionaries.
        @return: None
        """
        if isinstance(model_space, ModelSpace):
            self.model_space = model_space
        else:
            self.model_space = ModelSpace()
            try:
                self.model_space.min, self.model_space.max = model_space.min, model_space.max
            except AttributeError:
                self.model_space.min, self.model_space.max = model_space['min'], model_space['max']

    def non_degenerate_params(self):
        """
        get the names of non-degenerate parameters

        the result set contains the names of all parameters
        where the upper range limit of the model space is strictly greater than the lower.

        the result is based on the params and model_space attributes.

        @return: set of strings (not ordered).
        """
        pn = set(self.params)
        rmn = set(self.model_space.min.keys())
        rmx = set(self.model_space.max.keys())
        names = pn.intersection(rmn).intersection(rmx)
        names = {name for name in names if self.model_space.max[name] > self.model_space.min[name]}
        return names

    def iterate_generations(self):
        """
        iterate over generations.

        this is a generator function.
        it yields a @ref ResultData object for each generation,
        where the data is filtered by generation.

        the @ref ResultData object is a shallow copy of `self`.
        the attributes are references to the original objects.
        the `values` and `deltas` are sub-arrays extracted from the original arrays,
        containing just the elements belonging to one generation.
        `generations` contains just one element: the generation number.

        @return: one @ref ResultData for each generation
        """
        for gen in self.generations:
            rd = ResultData()
            rd.generations = (gen,)
            rd.params = self.params
            rd.particles = self.particles
            rd.levels = self.levels
            idx = np.where(self.values['_gen'] == gen)
            rd.values = self.values[idx]
            if self.deltas is not None:
                idx = np.where(self.deltas['_gen'] == gen)
                rd.deltas = self.deltas[idx]
            rd.model_space = self.model_space
            yield rd
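
    # Example (a hedged sketch): per-generation statistics over loaded results.
    #
    #   for gen_data in rd.iterate_generations():
    #       gen = gen_data.generations[0]
    #       best = gen_data.values['_rfac'].min()
    #       logger.info(f"generation {gen}: best R-factor {best}")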