public release 4.2.0 - see README.md and CHANGES.md for details

2026-01-08 19:10:45 +01:00
parent ef781e2db4
commit b64beb694c
181 changed files with 39388 additions and 6527 deletions
--- a/pmsco/database/query.py
+++ b/pmsco/database/query.py
@@ -0,0 +1,470 @@
+"""
+@package pmsco.database.query
+specialized query functions for the pmsco database
+
+@author Matthias Muntwiler, matthias.muntwiler@psi.ch
+
+@copyright (c) 2016-21 by Paul Scherrer Institut @n
+Licensed under the Apache License, Version 2.0 (the "License"); @n
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+"""
+
+import logging
+import numpy as np
+from sqlalchemy import func
+import pmsco.database.orm as orm
+import pmsco.database.util as util
+import pmsco.dispatch as dispatch
+
+logger = logging.getLogger(__name__)
+
+
+def query_newest_job(session):
+    """
+    retrieve the entry of the newest job
+    
+    the newest entry is determined by the datetime field.
+
+    @param session:
+
+    @return: pmsco.database.orm.Job object
+    """
+    q = session.query(orm.Job)
+    q = q.order_by(orm.Job.datetime.desc(), orm.Job.id.desc())
+    job = q.first()
+    return job
+
+
+def query_model(session, job_id=None, model_id=None, model=None):
+    """
+    retrieve model parameters and control variables from the database.
+
+    @param model_id: id of the model in the database.
+
+    @return: (dict, dict) value dictionary and delta dictionary.
+        dictionary keys are parameter values.
+        the special value '_model' is included.
+    """
+    query = session.query(orm.ParamValue)
+    if job_id is not None:
+        query = query.filter(orm.Job.id == job_id)
+    if model_id is not None:
+        query = query.filter(orm.Model.id == model_id)
+    if model is not None:
+        query = query.filter(orm.Model.model == model)
+    result = query.all()
+
+    param_value = {}
+    param_delta = {}
+    model_obj = None
+    for pv in result:
+        if model_obj is None:
+            model_obj = pv.model
+        param_value[pv.param.key] = pv.value
+        param_delta[pv.param.key] = pv.delta
+
+    param_value['_model_id'] = model_obj.id
+    param_value['_model'] = model_obj.model
+    param_value['_gen'] = model_obj.gen
+    param_value['_particle'] = model_obj.particle
+    param_delta['_model_id'] = model_obj.id
+    param_delta['_model'] = model_obj.model
+    param_delta['_gen'] = model_obj.gen
+    param_delta['_particle'] = model_obj.particle
+
+    return param_value, param_delta
+
+
+def query_results(session, job_id):
+    query = session.query(orm.Result)
+    query = query.join(orm.Model)
+    query = query.filter(orm.Job == job_id)
+    return None
+
+
+def query_tasks(session, job_id):
+    """
+    query the task index used in a calculation job.
+
+    this query neglects the model index
+    and returns the unique tuples (-1, scan, domain, emit, region).
+
+    @param job_id: (int) id of the associated Jobs entry.
+
+    @return list of pmsco.dispatch.CalcID tuples of task indices.
+        the model attribute is -1 in all elements.
+    """
+    query = session.query(orm.Result.scan, orm.Result.domain, orm.Result.emit, orm.Result.region)
+    query = query.join(orm.Model)
+    query = query.filter(orm.Model.job_id == job_id)
+    query = query.distinct()
+    query = query.order_by(orm.Result.scan, orm.Result.domain, orm.Result.emit, orm.Result.region)
+    results = query.all()
+
+    output = []
+    for row in results:
+        d = row._asdict()
+        d['model'] = -1
+        output.append(dispatch.CalcID(**d))
+
+    return output
+
+
+def query_best_task_models(session, job_id, level, count):
+    """
+    query N best models per task.
+
+    this query is used by the file tracker to determine the models to keep.
+
+    @param job_id: (int) id of the associated Jobs entry.
+    @param level: level up to which to query.
+        the level can be specified by level name (str) or numeric index (0..4).
+        if it is scan (equivalent to 1), the method queries the model and scan levels.
+    @param count: number of models to query per task.
+
+    @return set of matching model numbers (Models.model field).
+    """
+
+    try:
+        level = int(level)
+    except ValueError:
+        level = dispatch.CALC_LEVELS.index(level)
+    assert 0 <= level < len(dispatch.CALC_LEVELS)
+
+    def _query_models(t):
+        query = session.query(orm.Model.model).join(orm.Job).join(orm.Result)
+        query = query.filter(orm.Job.id == job_id)
+        query = query.filter(orm.Result.scan == t.scan)
+        query = query.filter(orm.Result.domain == t.domain)
+        query = query.filter(orm.Result.emit == t.emit)
+        query = query.filter(orm.Result.region == t.region)
+        query = query.order_by(orm.Result.rfac)
+        results = query[0:count]
+        return set((row.model for row in results))
+
+    tasks = query_tasks(session, job_id)
+    models = set()
+    for task in tasks:
+        if task.numeric_level <= level:
+            q_models = _query_models(task)
+            models |= q_models
+
+    return models
+
+
+def query_model_params_array(session, jobs=None, models=None, order=None, limit=None):
+    """
+    query parameter values and return them in a numpy array
+
+    the models table can be filtered by job and/or model.
+    else, the whole database is returned (which might be huge!).
+
+    @param session:
+    @param jobs: filter by job.
+        the argument can be a singleton or sequence of orm.Job objects or numeric id.
+    @param models: filter by model.
+        the argument can be a singleton or sequence of orm.Model objects or their id.
+    @param order: ordering of results. this can be a sequence of orm.Model attributes.
+        the default order is by job_id and model.
+    @param limit: maximum number of models to return
+    @return: dict['values']: numpy values array, dict['deltas']: numpy deltas array
+    """
+    count_query = session.query(orm.Model)
+    pn_query = session.query(orm.Param.key)
+    pv_query = session.query(orm.ParamValue)
+
+    if jobs:
+        try:
+            jobs = [int(jobs)]
+        except TypeError:
+            pass
+        job_ids = [j if isinstance(j, int) else j.id for j in jobs]
+        count_query = count_query.filter(orm.Model.job_id.in_(job_ids))
+        pn_query = pn_query.filter(orm.Model.job_id.in_(job_ids))
+        pv_query = pv_query.filter(orm.Model.job_id.in_(job_ids))
+
+    if models:
+        try:
+            models = [int(models)]
+        except TypeError:
+            pass
+        model_ids = [m if isinstance(m, int) else m.id for m in models]
+        count_query = count_query.filter(orm.ParamValue.model_id.in_(model_ids))
+        pn_query = pn_query.filter(orm.ParamValue.model_id.in_(model_ids))
+        pv_query = pv_query.filter(orm.ParamValue.model_id.in_(model_ids))
+
+    if order is not None:
+        pv_query = pv_query.order_by(*order)
+    else:
+        pv_query = pv_query.order_by(orm.Model.job_id, orm.Model.model)
+    if limit:
+        pv_query = pv_query[0:limit]
+
+    n_models = count_query.count()
+    param_names = pn_query.all()
+    param_values = pv_query.all()
+
+    special_names = orm.Model().as_dict().keys()
+    dt_names = special_names + param_names
+    dt = np.dtype([(n, util.field_to_numpy_type(n)) for n in sorted(dt_names, key=str.lower)])
+    values = np.zeros((n_models,), dtype=dt)
+    deltas = np.zeros((n_models,), dtype=dt)
+
+    for i, pv in enumerate(param_values):
+        for k, v in pv.model.as_dict():
+            values[i][k] = deltas[i][k] = v
+        values[i][pv.param_key] = pv.value
+        deltas[i][pv.param_key] = pv.delta
+
+    return {'values': values, 'deltas': deltas}
+
+
+calc_id_props = {'model': orm.Model.model,
+                 'scan': orm.Result.scan,
+                 'domain': orm.Result.domain,
+                 'emit': orm.Result.emit,
+                 'region': orm.Result.region}
+
+
+def query_model_results_array(session, jobs=None, models=None, order=None, limit=None,
+                              query_hook=None, hook_data=None, include_params=False, **index):
+    """
+    query a results table with flexible filtering options
+
+    the function returns a structured numpy array of the results and, optionally, parameter values.
+    the database is fully flattened, row of the array represents one result.
+
+    the jobs and models arguments filter for specific jobs and/or models.
+
+    custom filters can be added in a query hook function.
+    the hook function receives an sqlalchemy Query object of the Result table,
+    joined with the Model and Job tables.
+    other joins must be added explicitly.
+    the hook function can add more filters and return the modified query.
+
+    the hook function is called after the filters from the other function arguments
+    (job, models, index) have been applied,
+    and before the ordering and limit are applied.
+
+    @param session:
+    @param jobs: filter by job.
+        the argument can be a singleton or sequence of orm.Job objects or numeric id.
+    @param models: filter by model.
+        the argument can be a singleton or sequence of orm.Model objects or their id.
+    @param order: ordering of results. this can be a sequence of orm.Result attributes.
+        the default order is by `orm.Result.rfac`.
+        to override the default ascending order, append a modifier, e.g., `orm.Result.rfac.desc()`.
+    @param limit: maximum number of models to return
+    @param query_hook: hook function that modifies an sqlalchemy.orm.Query object.
+        the function receives the query as first argument, and any data from hook_data as keyword arguments.
+        it must return the modified query object.
+    @param hook_data: (dict) keyword arguments to be passed to the query_hook function.
+    @param include_params: include parameter values of each model in the result.
+        by default, only data from the Model and Result records is included.
+    @param index: filters the results list by scan, domain, emit, and/or region index.
+        for example, to get only the final results per model, specify `scan=-1`.
+    @return: numpy values array
+    """
+    results_query = session.query(orm.Result).join(orm.Model).join(orm.Job)
+
+    if jobs:
+        results_query = filter_objects(results_query, orm.Job, jobs)
+
+    if models:
+        results_query = filter_objects(results_query, orm.Model, models)
+
+    for k, v in index.items():
+        results_query = results_query.filter(calc_id_props[k] == v)
+
+    if query_hook is not None:
+        results_query = query_hook(results_query, **hook_data)
+
+    if order is not None:
+        results_query = results_query.order_by(*order)
+    if limit:
+        results = results_query[0:limit]
+    else:
+        results = results_query.all()
+    n_results = len(results)
+    logger.debug(f"query_model_results_array: {results_query.statement} ({n_results} rows)")
+
+    dt_names = [n for n in util.DB_SPECIAL_PARAMS.values()]
+    if include_params:
+        model_ids = {r.model_id for r in results}
+        pn_query = session.query(orm.Param.key).join(orm.ParamValue)
+        pn_query = pn_query.filter(orm.ParamValue.model_id.in_(model_ids))
+        pn_query = pn_query.distinct()
+        pn_query = pn_query.order_by(orm.Param.key)
+        p_names = [r.key for r in pn_query.all()]
+        dt_names.extend(p_names)
+        logger.debug(f"query_model_results_array: {pn_query.statement} ({len(p_names)} rows)")
+
+    dt = []
+    v0 = []
+    for n in dt_names:
+        ft = util.field_to_numpy_type(n)
+        dt.append((n, ft))
+        v0.append(np.nan if ft[0] == 'f' else 0)
+    dt = np.dtype(dt)
+    v0 = np.array([tuple(v0)], dtype=dt)
+    values_array = np.full((n_results,), v0, dtype=dt)
+    deltas_array = np.full((n_results,), v0, dtype=dt)
+
+    for i, r in enumerate(results):
+        d = {**r.as_dict(), **r.model.as_dict()}
+        for k, v in d.items():
+            try:
+                values_array[i][k] = v
+            except TypeError:
+                values_array[i][k] = 0
+        deltas_array[i] = values_array[i]
+        if include_params:
+            for k, v in r.model.values.items():
+                values_array[i][k] = v
+            for k, v in r.model.deltas.items():
+                deltas_array[i][k] = v
+
+    return values_array, deltas_array
+
+
+def query_best_models_per_job(session, projects=None, jobs=None, task_level='model', order=None, limit=None):
+    """
+    return the best model (by rfac) of each selected job
+
+    the query gathers the R-factors of the selected jobs at the selected task levels
+    and, for each job, returns the (database) model id where the lowest R-factor is reported
+    among the gathered results.
+
+    this can be useful if you want to compile a report of the best model per job.
+
+    @param session:
+    @param projects: filter by project.
+        the argument can be a singleton or sequence of orm.Project objects or numeric id.
+    @param jobs: filter by job.
+        the argument can be a singleton or sequence of orm.Job objects or numeric id.
+    @param task_level: element of or index into @ref pmsco.dispatch.CALC_LEVELS.
+        deepest task_level to include in the query.
+        results on deeper levels are not considered.
+        e.g. if you pass 'scan', R-factors of individual scans are included in the query.
+        note that including deeper levels will not increase the number of results returned.
+        the lowest level that can be specified is `emit`.
+    @param order: ordering of results. this can be a sequence of orm.Result attributes.
+        the default order is by `orm.Result.rfac`.
+    @param limit: maximum number of models to return
+
+    @return sequence of (orm.Model, orm.Result) tuples.
+        the number of results corresponds to the number of jobs in the filter scope.
+        to find out details of the models, execute another query that filters on these model ids.
+
+    the method produces an SQL query similar to:
+    @code{.sql}
+    select Models.id from Models
+    join Results on Models.id = Results.model_id
+    join Jobs on Models.job_id = Jobs.id
+    where scan=-1
+    and project_id=1
+    and job_id in (1,2,3)
+    group by Models.job_id
+    having min(rfac)
+    order by rfac
+    @endcode
+    """
+
+    try:
+        level = dispatch.CALC_LEVELS.index(task_level) + 1
+    except ValueError:
+        level = task_level + 1
+    try:
+        level_name = dispatch.CALC_LEVELS[level]
+    except IndexError:
+        level_name = dispatch.CALC_LEVELS[4]
+
+    query = session.query(orm.Model, orm.Result).join(orm.Result)
+
+    if projects:
+        query = filter_objects(query, orm.Project, projects)
+
+    if jobs:
+        query = filter_objects(query, orm.Job, jobs)
+
+    query = query.filter(getattr(orm.Result, level_name) == -1)
+    query = query.group_by(orm.Model.job_id)
+    query = query.having(func.min(orm.Result.rfac))
+
+    if order is not None:
+        query = query.order_by(*order)
+    else:
+        query = query.order_by(orm.Result.rfac)
+    if limit:
+        query = query[0:limit]
+    else:
+        query = query.all()
+
+    return query
+
+
+def filter_objects(query, entity, objects):
+    """
+    filter a query for the given objects
+
+    apply a simple object filter to a database query.
+    the criteria can be a single object or a sequence of objects.
+    the objects can be specified either by their object representation or numeric id.
+    the query is filtered by id.
+    thus, in the first case, the objects must have a valid id.
+
+    @param query: sqlalchemy.orm.Query object that queries a table that is linked to the entity table.
+        the function joins the entity table.
+        a table with a direct foreign key relationship to the entity table must already be in the query.
+    @param entity: orm entity class, e.g. pmsco.database.orm.Project.
+    @param objects: singleton or sequence of orm objects or their numeric ids.
+
+    @return: modified query
+    """
+    # avoid duplicate joins
+    if str(query.statement).find(entity.__tablename__) < 0:
+        query = query.join(entity)
+    try:
+        objects = [p if isinstance(p, int) else p.id for p in objects]
+        query = query.filter(entity.id.in_(objects))
+    except TypeError:
+        object = objects if isinstance(objects, int) else objects.id
+        query = query.filter(entity.id == object)
+    return query
+
+
+def filter_task_levels(query, level='model', include_parents=False):
+    """
+    refine a query by filtering by task level.
+
+    @param query: sqlalchemy.orm.Query object that queries the Result table
+        (possibly joined with others).
+    @param level: element of or index into @ref pmsco.dispatch.CALC_LEVELS.
+        deepest task_level to include in the query.
+        results on deeper levels are not considered.
+        e.g. if you pass 'scan', R-factors of individual scans are included in the query.
+        the lowest level that can be specified is `emit`.
+    @param include_parents: by default, the query will return only results from the given level.
+        if True, combined results (parents) will be returned as well.
+    """
+
+    try:
+        level = dispatch.CALC_LEVELS.index(level)
+    except ValueError:
+        level = int(level)
+    child_level = level + 1
+
+    try:
+        child_level_name = dispatch.CALC_LEVELS[child_level]
+        level_name = dispatch.CALC_LEVELS[level]
+    except IndexError:
+        child_level_name = dispatch.CALC_LEVELS[4]
+        level_name = dispatch.CALC_LEVELS[3]
+
+    query = query.filter(getattr(orm.Result, child_level_name) == -1)
+    if not include_parents:
+        query = query.filter(getattr(orm.Result, level_name) >= 0)
+
+    return query