pmsco-public/pmsco/database/project.py

"""
@package pmsco.database.project
wrapper class for project-specific database operations


usage:
~~~~~~{.py}
db = DatabaseAccess()
db.connect("file.db")
with db.session() as session:
    # database access here
    # ...
    # commit transaction
    session.commit()
    # continue in new transaction
    # ...

# at the end of the context
# the session is closed and orm objects are detached from the database.
~~~~~~

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

@copyright (c) 2016-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
"""

import datetime
import logging
import socket
from pmsco.database.access import DatabaseAccess
import pmsco.database.common as db_common
import pmsco.database.ingest as db_ingest
import pmsco.database.query as db_query
from pmsco.dispatch import mpi_size

logger = logging.getLogger(__name__)


class ProjectDatabase(DatabaseAccess):
    """
    wrapper class for project specific database operations

    the purpose of this class is to bundle all specific code and run-time information
    for database access of a running calculation job.

    after calling ingest_project_metadata(),
    the class object stores the persistent project and job identifiers
    in the db_project_id and db_job_id attributes.
    the other methods provide convenient wrappers so that database code can be kept minimal in the project.
    each method opens its own session via self.session().

    usage:
    ~~~~~~{.py}
    db = ProjectDatabase()
    db.connect('file.db')
    db.ingest_project_metadata(project)
    for index, result, delta in results:
        db.ingest_result(index, result, delta)
    ~~~~~~
    """

    def __init__(self):
        super().__init__()
        # database ids of the registered project and job.
        # both remain None until ingest_project_metadata() has completed.
        self.db_project_id = None
        self.db_job_id = None

    def ingest_project_metadata(self, project):
        """
        ingest project metadata into the database

        registers the project, the job, the job tags and the model parameter names
        in one session and commits them.
        on success, the ids of the project and job records are stored
        in self.db_project_id and self.db_job_id.

        the project is registered with allow_existing=True, the job with allow_existing=False.
        NOTE(review): presumably an existing project record is reused
        while a duplicate job name is rejected - confirm in pmsco.database.common.

        the job record receives the host name of the local machine, the current local time,
        the number of MPI processes (pmsco.dispatch.mpi_size)
        and the time limit of the project converted to hours.

        @param project: pmsco.project.Project object.
            the following attributes are read:
            project_name, job_name, mode, git_hash, timedelta_limit, description,
            job_tags, model_space.start.

        @return: None
        """
        with self.session() as session:
            db_project = db_common.register_project(session=session,
                                                    name=project.project_name,
                                                    code=project.__module__,
                                                    allow_existing=True)

            # timedelta_limit is a timedelta; the job record stores the limit in hours.
            hours_limit = project.timedelta_limit.total_seconds() / 3600.
            db_job = db_common.register_job(session=session,
                                            project=db_project,
                                            job_name=project.job_name,
                                            allow_existing=False,
                                            mode=project.mode,
                                            machine=socket.gethostname(),
                                            git_hash=project.git_hash,
                                            datetime=datetime.datetime.now(),
                                            processes=mpi_size,
                                            hours=hours_limit,
                                            description=project.description)

            db_common.register_job_tags(session, db_job, project.job_tags)
            db_common.register_params(session, project.model_space.start.keys())
            session.commit()

            # read the ids while the session is still open and the objects are attached.
            self.db_project_id = db_project.id
            self.db_job_id = db_job.id

    def ingest_result(self, index, result, delta=None):
        """
        add or update a result in the database.

        the method updates the Models, Results and ParamValues tables.

        the model is identified by self.db_job_id and index.model.
        the result is identified by self.db_job_id and index.
        if the model or result exists in the database, it is updated.

        ingest_project_metadata() must have been called before,
        so that self.db_project_id and self.db_job_id are defined.

        @param index: (pmsco.dispatch.CalcID or dict)
            calculation index.
            in case of dict, the keys must be the attribute names of CalcID prefixed with an underscore, i.e.,
            '_model', '_scan', '_domain', '_emit', '_region'.
            extra values in the dictionary are ignored.
            undefined indices must be -1.

        @param result: (dict) dictionary containing the parameter values and the '_rfac' result.
            may also contain the special values '_gen', '_particle', '_timestamp'.
            '_gen' and '_particle' are integers and default to None.
            '_timestamp' can be numeric (seconds since jan 1, 1970)
            or an object that implements a timestamp function like datetime.datetime.
            it defaults to the current (local) time.

        @param delta: (dict) dictionary containing the delta values.
            the keys must correspond to model keys in the result dictionary.
            this argument is optional and defaults to None.
            the value is passed unchanged to pmsco.database.ingest.store_param_values().

        @return: None

        @raise AssertionError if the project or job has not been registered.
        """
        # explicit raise rather than an assert statement:
        # assert statements are removed when python runs with -O,
        # which would let the None ids through to the database query.
        if self.db_project_id is None:
            raise AssertionError("project is not registered: call ingest_project_metadata() first")
        if self.db_job_id is None:
            raise AssertionError("job is not registered: call ingest_project_metadata() first")

        with self.session() as session:
            job_obj = db_common.get_job(session, self.db_project_id, self.db_job_id)
            model_obj = db_ingest.store_model(session, job_obj, index, result)
            db_ingest.store_result_data(session, model_obj, index, result)
            db_ingest.store_param_values(session, model_obj, result, delta)
            session.commit()

    def query_best_task_models(self, level, count):
        """
        query N best models per task.

        this is a wrapper for pmsco.database.query.query_best_task_models().
        in addition to the wrapped function, it opens a session and uses the registered db_job_id.

        this query is used by the file tracker to determine the models to keep.

        @param level: level up to which to query.
            the level can be specified by level name (str) or numeric index (0..4).
            if it is scan (equivalent to 1), the method queries the model and scan levels.
        @param count: number of models to query per task.

        @return set of matching model numbers (model index, Models.model field).
        """
        with self.session() as session:
            models = db_query.query_best_task_models(session, self.db_job_id, level, count)

        # the session is closed at this point; the result of the query is handed to the caller as is.
        return models