Files
pmsco-public/pmsco/reports/results.py

443 lines
16 KiB
Python

"""
@package pmsco.reports.results
query and filter result data for reports
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2021 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
import logging
import numpy as np
import pmsco.database.orm as db_orm
import pmsco.database.query as db_query
import pmsco.database.util as db_util
from pmsco.project import ModelSpace
logger = logging.getLogger(__name__)
def array_remove_columns(a, cols):
    """
    return a copy of a structured array with some columns removed.

    @param a: numpy structured array
    @param cols: sequence of column names to be removed
    @return: new array containing the remaining columns (data is copied)
    """
    # dtype.descr yields (name, format) tuples of each field.
    # note: the original code iterated a.dtype directly and called
    # b.dtype.names() - both of which raise TypeError.
    dtb = [dt for dt in a.dtype.descr if dt[0] not in cols]
    b = np.empty(a.shape, dtype=dtb)
    for col in b.dtype.names:
        b[col] = a[col]
    return b
def array_range(a):
    """
    compute a default value range from the actual data.

    only the regular (non-system) parameters, as selected by
    pmsco.database.util.regular_params(), are included.

    @param a: (numpy.ndarray) 1-dimensional structured array of parameter values.
    @return: tuple (range_min, range_max) of dictionaries mapping each
        parameter name to its minimum and maximum value, respectively.
    """
    names = db_util.regular_params(a.dtype.names)
    lower = {name: a[name].min() for name in names}
    upper = {name: a[name].max() for name in names}
    return lower, upper
class ResultData(object):
    """
    data structure for results

    the data is stored in the values and deltas arrays.
    the arrays are numpy structured arrays
    and contain a flat list of results (Result table of database)
    along with parameter and system variables
    in the format returned by pmsco.database.query.query_model_results_array().
    values and deltas must have the same data type (same fields).
    deltas can be None if they are not available.

    the other attributes serve two purposes:
    some of them define filter rules that are applied by some of the data loading methods
    or the apply_filters() method.
    after loading, the update_collections() method updates them to describe actually loaded data.

    @attention if you want to reuse the same instance for multiple loads,
    check or reset the filter attributes before each loading.
    """
    ## @var generations
    # sequence of generation numbers loaded
    #
    # on loading, data is filtered by the generation numbers in this sequence
    # (`in` operator, see apply_filters() method).
    # by default (None), all generations are loaded.
    # after loading, the sequence contains the loaded generation numbers (update_collections() method).

    ## @var particles
    # sequence of particle numbers loaded
    #
    # on loading, data is filtered by the particle numbers in this sequence
    # (`in` operator, see apply_filters() method).
    # by default (None), all particles are loaded.
    # after loading, the sequence contains the loaded particle numbers (update_collections() method).

    ## @var levels
    # dictionary of level indices loaded
    #
    # the dictionary is organized by task level.
    # allowed keys are: 'scan', 'domain', 'emit' (one or more).
    # the values are sequences of index numbers.
    #
    # on loading, data is filtered by level numbers (`in` operator, see apply_filters() method).
    # by default (None), all levels are loaded.
    # after loading, the sequence contains the loaded indices (update_collections() method).

    ## @var filters
    # extra database filters
    #
    # filter expressions that are not covered by the generations, particles and levels attributes
    # can be entered here.
    # this must be a sequence of sqlalchemy expressions that are passed to the Query.filter methods.
    # these filters take effect in load_from_db() only.
    # the sequence itself is never modified by this class.

    ## @var order
    # sort order in database query
    #
    # this must be a sequence of pmsco.database.orm.Model and pmsco.database.orm.Result attributes.
    # this sort order takes effect in load_from_db() only.

    ## @var model_space
    # parameter range
    #
    # pmsco.project.ModelSpace object, only min and max are used.
    #
    # @attention this may be a reference to the project's model space object
    # rather than an independent copy.
    # do not modify the object directly! instead, copy the old one and modify the copy!

    ## @var values
    # value data
    #

    ## @var deltas
    # delta data (if loaded)
    #

    def __init__(self):
        self.generations = None
        self.params = None
        self.particles = None
        self.levels = None
        self.filters = None
        self.order = None
        self.values = None
        self.deltas = None
        self.model_space = None

    def load_any(self, values, deltas=None):
        """
        load data of any accepted type.

        the method tries to detect the type of input data and calls the specialized load method.
        the data can be one of the types accepted by
        load_from_population(), load_from_arrays(), load_from_text(), or load_from_db().

        @param values: value data
        @param deltas: delta data (optional)
        @return: None
        """
        # duck-typed dispatch: population objects expose pos/vel,
        # database sessions expose a query method,
        # anything else is assumed to be a text file (path or open file).
        if isinstance(values, np.ndarray):
            self.load_from_arrays(values, deltas)
        elif hasattr(values, 'pos') and hasattr(values, 'vel'):
            self.load_from_population(values)
        elif hasattr(values, 'query'):
            self.load_from_db(values)
        else:
            self.load_from_text(values, deltas)

    def load_from_population(self, pop):
        """
        load data from a population object.

        the object should be of pmsco.optimizer.population.Population type
        or have the same pos, vel, results, generation, model_min and model_max attributes.

        loaded data is filtered by generations, particles, and/or levels by the apply_filters() method.

        @param pop: Population-like object
        @return: None
        """
        # the _rfac field of pop.pos is undefined - mark it explicitly
        pos = np.copy(pop.pos)
        pos['_rfac'] = np.nan
        self.values = np.concatenate([pop.results, pos])
        self.deltas = np.copy(pop.vel)
        self.apply_filters()
        self.update_collections()
        # a population represents exactly one generation -
        # override whatever update_collections() derived from the data
        self.generations = np.array((pop.generation,))
        self.model_space = ModelSpace()
        self.model_space.min = pop.model_min
        self.model_space.max = pop.model_max

    def load_from_arrays(self, values, deltas=None):
        """
        load data from numpy arrays.

        data type must be the same as used by pmsco.optimizer.population.Population.
        loaded data is filtered by generations, particles, and/or levels by the apply_filters() method.

        @param values: numpy structured array of result values
        @param deltas: numpy structured array of delta values (optional)
        @return: None
        """
        self.values = values
        self.deltas = deltas
        self.apply_filters()
        self.update_collections()

    def load_from_text(self, values_file, deltas_file=None):
        """
        load data from results file (.dat or .tasks.dat)

        loaded data is filtered by generations, particles, and/or levels by the apply_filters() method.

        @param values_file: path-like or open file
        @param deltas_file: path-like or open file
        @return: None
        @raise OSError if file can't be loaded.
        """
        # atleast_1d guards against genfromtxt collapsing a single-row file to a 0-d array
        self.values = np.atleast_1d(np.genfromtxt(values_file, names=True))
        if deltas_file is not None:
            self.deltas = np.atleast_1d(np.genfromtxt(deltas_file, names=True))
        self.apply_filters()
        self.update_collections()

    def load_from_db(self, session, jobs=-1, include_params=True):
        """
        load data from the database.

        data is filtered on the SQL level by self.generations, self.particles,
        self.levels and self.filters.
        data is ordered on the SQL level by self.order,
        which defaults to generation, particle, scan, domain, emit.

        @param session: database session, from pmsco.database.access.DatabaseAccess.session().
        @param jobs: filter by job.
        the argument can be a singleton or sequence of orm.Job objects or numeric id.
        if None, results from all jobs are loaded.
        if -1 (default), results from the most recent job (by datetime field) are loaded.
        @param include_params: include parameter values of each model in the result (True, default).
        if you're just interested in the R-factor, set this to False and parameter values are not retrieved.
        @return: None
        """
        if jobs == -1:
            jobs = db_query.query_newest_job(session)

        # work on a copy so that the caller's self.filters list is not mutated
        # (appending to it directly would accumulate duplicate filters across loads)
        filters = list(self.filters) if self.filters is not None else []
        if self.generations is not None:
            filters.append(db_orm.Model.gen.in_(self.generations))
        if self.particles is not None:
            filters.append(db_orm.Model.particle.in_(self.particles))
        if self.levels is not None:
            for k, v in self.levels.items():
                # accept keys with or without the leading underscore of array field names
                if k[0] == '_':
                    k = k[1:]
                if hasattr(type(v), '__iter__'):
                    filters.append(getattr(db_orm.Result, k).in_(v))
                else:
                    filters.append(getattr(db_orm.Result, k) == v)
        if self.order is None:
            self.order = [db_orm.Model.gen, db_orm.Model.particle,
                          db_orm.Result.scan, db_orm.Result.domain, db_orm.Result.emit]
        hook_data = {'filters': filters}
        self.values, self.deltas = db_query.query_model_results_array(session,
                                                                      jobs=jobs,
                                                                      query_hook=self._filters_hook,
                                                                      hook_data=hook_data,
                                                                      include_params=include_params,
                                                                      order=self.order)
        self.update_collections()

    @staticmethod
    def _filters_hook(query, filters):
        """
        hook function used in ResultData.load_from_db

        the function adds a sequence of conditions to a database query.

        @param query: sqlalchemy query object
        @param filters: sequence of filter expressions to be passed to query.filter.
        example: db_orm.Model.gen.in_([1,2,3])
        @return: modified query
        """
        for f in filters:
            query = query.filter(f)
        return query

    def load_from_project(self, project):
        """
        define model space from project

        @note we copy the reference. the object must not be modified!

        @param project: object with a model_space attribute (e.g. pmsco.project.Project)
        @return: None
        """
        self.model_space = project.model_space

    def reset_filters(self):
        """
        reset all filter attributes to default values

        this function resets all instance attributes that modify the query statement
        to their default values.
        in particular: generations, params, particles, levels, filters and order.
        it does not affect the values and deltas arrays.

        @return: None
        """
        self.generations = None
        self.params = None
        self.particles = None
        self.levels = None
        self.filters = None
        self.order = None

    def apply_filters(self):
        """
        apply generation, particle and level filters to loaded arrays.

        this method acts on the loaded values and deltas arrays
        and replaces them with views of the original arrays.

        @return: None
        """
        # map filter attributes to the corresponding (underscore-prefixed) array fields
        filters = {}
        if self.generations is not None:
            filters['_gen'] = list(self.generations)
        if self.particles is not None:
            filters['_particle'] = list(self.particles)
        if self.levels is not None:
            for k, v in self.levels.items():
                if k[0] != '_':
                    k = '_' + k
                if k in self.values.dtype.names:
                    filters[k] = list(v)
        for k, v in filters.items():
            idx = np.where(np.isin(self.values[k], v))
            self.values = self.values[idx]
            if self.deltas is not None:
                idx = np.where(np.isin(self.deltas[k], v))
                self.deltas = self.deltas[idx]

    def update_collections(self):
        """
        update attributes that depend on the values and deltas arrays.

        namely: params, generations, particles, model_space

        this method is called by the load methods after loading data.

        @return: None
        """
        self.params = db_util.regular_params(self.values.dtype.names)
        # the _gen and _particle fields may be missing depending on the data source -
        # leave the corresponding attributes untouched in that case
        try:
            self.generations = np.unique(self.values['_gen'])
        except (KeyError, ValueError):
            pass
        try:
            self.particles = np.unique(self.values['_particle'])
        except (KeyError, ValueError):
            pass
        self.model_space = ModelSpace()
        self.model_space.min, self.model_space.max = array_range(self.values)

    def debug_log(self):
        """
        write the main attributes to the module logger at debug level.

        @return: None
        """
        logger.debug(f"params = {self.params}")
        logger.debug(f"generations = {self.generations}")
        logger.debug(f"particles = {self.particles}")
        logger.debug(f"levels = {self.levels}")
        logger.debug(f"model_space.min = {self.model_space.min}")
        logger.debug(f"model_space.max = {self.model_space.max}")
        logger.debug(f"values.shape = {self.values.shape}")
        logger.debug(f"values.dtype = {self.values.dtype}")

    def set_model_space(self, model_space):
        """
        set the model space (parameter value range)

        @note the model space is updated by the load methods and update_collections().

        @param model_space: model space can be a pmsco.project.ModelSpace object,
        any object that contains the same min and max attributes as pmsco.project.ModelSpace,
        or a dictionary with two keys 'min' and 'max' that provides the corresponding
        ModelSpace dictionaries.
        @return: None
        """
        if isinstance(model_space, ModelSpace):
            self.model_space = model_space
        else:
            self.model_space = ModelSpace()
            try:
                self.model_space.min, self.model_space.max = model_space.min, model_space.max
            except AttributeError:
                # fall back to dictionary-style access
                self.model_space.min, self.model_space.max = model_space['min'], model_space['max']

    def non_degenerate_params(self):
        """
        get the names of non-degenerate parameters

        the result set contains the names of all parameters
        where the upper range limit of the model space is strictly greater than the lower.
        the result is based on the params and model_space attributes.

        @return: set of strings (not ordered).
        """
        pn = set(self.params)
        rmn = set(self.model_space.min.keys())
        rmx = set(self.model_space.max.keys())
        names = pn.intersection(rmn).intersection(rmx)
        names = {name for name in names if self.model_space.max[name] > self.model_space.min[name]}
        return names

    def iterate_generations(self):
        """
        iterate over generations.

        this is a generator function.
        it yields a @ref ResultData object for each generation,
        where the data is filtered by generation.

        the @ref ResultData object is a shallow copy of `self`.
        the attributes are references to the original objects.
        the `values` and `deltas` are views of the original arrays
        showing just the elements belonging to one generation.
        `generations` contains just one element: the generation number.

        @return: one @ref ResultData for each generation
        """
        for gen in self.generations:
            rd = ResultData()
            rd.generations = (gen,)
            rd.params = self.params
            rd.particles = self.particles
            rd.levels = self.levels
            idx = np.where(self.values['_gen'] == gen)
            rd.values = self.values[idx]
            if self.deltas is not None:
                idx = np.where(self.deltas['_gen'] == gen)
                rd.deltas = self.deltas[idx]
            rd.model_space = self.model_space
            yield rd