# pmsco-public/pmsco/database.py
# (1640 lines, 64 KiB, Python)

"""
@package pmsco.database
experimental development of a model database.
this module maintains an sqlite3 database of calculation results.
the database provides a flexible way to query and analyse results of various calculations
under different filtering and sorting from any client that supports sqlite3.
currently, the database module is independent of the core PMSCO,
and the database has to be maintained manually.
in the future, calculation results will be added automatically by PMSCO.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2016-18 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import datetime
import logging
import sqlite3
import fasteners
import numpy as np
import pmsco.dispatch as dispatch
logger = logging.getLogger(__name__)

# teach sqlite3 how to store numpy scalar types.
# without these adapters, binding a numpy value would raise
# sqlite3.InterfaceError ("unsupported type").
for _numpy_type, _python_type in ((np.float64, float),
                                  (np.float32, float),
                                  (np.int64, int),
                                  (np.int32, int)):
    sqlite3.register_adapter(_numpy_type, _python_type)
class _DummyLock(object):
"""
dummy lock used for in memory database.
"""
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
## mapping of database fields to special parameter names
#
# special parameters are prefixed with an underscore in model dictionaries.
# `_db` parameters are returned by some query methods to identify the database records.
#
DB_SPECIAL_PARAMS = {"job_id": "_db_job",
"model_id": "_db_model",
"result_id": "_db_result",
"model": "_model",
"scan": "_scan",
"sym": "_sym",
"emit": "_emit",
"region": "_region",
"gen": "_gen",
"particle": "_particle",
"rfac": "_rfac"}
## numpy data types of special parameters by database field
#
# this dictionary helps to create a numpy array from a database record.
# all ids and indices are 64-bit integers ('i8'), the r-factor is a 64-bit float ('f8').
#
DB_SPECIAL_NUMPY_TYPES = {"job_id": "i8",
"model_id": "i8",
"result_id": "i8",
"model": "i8",
"scan": "i8",
"sym": "i8",
"emit": "i8",
"region": "i8",
"gen": "i8",
"particle": "i8",
"rfac": "f8"}
def regular_params(d):
    """
    extract the regular (non-underscore) parameters from a model container.

    regular parameters are those whose name does not start with an underscore.

    @param d: dict, numpy.void, pmsco.dispatch.CalcID, tuple or list.
        dicts and numpy.void records are filtered by key name
        (numpy.void occurs when an element of a structured array is extracted).
        a CalcID holds no regular parameters and yields an empty dictionary;
        it is supported only for compatibility with the special_params function.
        tuples and lists are treated as sequences of parameter names.

    @return filtered container; the type mirrors the input
        (dict for dict/numpy.void/CalcID inputs, tuple for tuple, list otherwise).
    """
    if isinstance(d, np.void):
        return {name: d[name] for name in d.dtype.names if name[0] != "_"}
    # CalcID is a namedtuple, so this test must precede the generic tuple test.
    if isinstance(d, dispatch.CalcID):
        return {}
    if isinstance(d, tuple):
        return tuple(name for name in d if name[0] != "_")
    if isinstance(d, dict):
        return {name: value for name, value in d.items() if name[0] != "_"}
    return [name for name in d if name[0] != "_"]
def special_params(d):
    """
    extract the special (underscore-prefixed) parameters from a model container.

    special parameters are prefixed with an underscore;
    the underscore is removed from the returned keys.
    parameters prefixed '_db_' (database record identifiers) are dropped.

    @param d: dict, numpy.void, pmsco.dispatch.CalcID, tuple or list.
        for dict and numpy.void the key names must carry the leading underscore
        (numpy.void occurs when an element of a structured array is extracted).
        for a CalcID the attribute names become the key names.
        tuples and lists are treated as sequences of parameter names.

    @return filtered container; dict for dict/numpy.void/CalcID inputs,
        tuple for tuple, list otherwise.
        keys/names are returned without the leading underscore.
    """
    def _is_special(name):
        # keep '_xyz' but drop '_db_xyz' record identifiers
        return name[0] == "_" and name[0:4] != "_db_"

    if isinstance(d, np.void):
        return {name[1:]: d[name] for name in d.dtype.names if _is_special(name)}
    # CalcID is a namedtuple, so this test must precede the generic tuple test.
    if isinstance(d, dispatch.CalcID):
        return d._asdict()
    if isinstance(d, tuple):
        return tuple(name[1:] for name in d if _is_special(name))
    if isinstance(d, dict):
        return {name[1:]: value for name, value in d.items() if _is_special(name)}
    return [name[1:] for name in d if _is_special(name)]
def field_to_param(f):
    """
    translate a database field name to a model parameter name.

    special database fields map to their underscore-prefixed parameter names
    (e.g. 'model' -> '_model', 'job_id' -> '_db_job');
    names of optimization parameters pass through unchanged.

    @param f: (str) database field name.
    @return: (str) parameter name as used in model dictionaries.
    """
    return DB_SPECIAL_PARAMS.get(f, f)
def field_to_numpy_type(f):
    """
    look up the numpy data type string of a database field.

    special fields have a fixed type (ids and indices are 'i8', 'rfac' is 'f8');
    any other field is assumed to hold a floating point parameter value ('f8').

    @param f: (str) database field name.
    @return: (str) numpy type description, e.g. 'f8'.
    """
    return DB_SPECIAL_NUMPY_TYPES.get(f, 'f8')
class ResultsDatabase(object):
    """
    interface to a sqlite3 database of calculation results.

    the class maintains a connection to a specified database file.
    results can be written to and retrieved from the database.

    to get started in a calculation job, call the following methods:

    - connect()
    - register_project()
    - register_job()
    - register_params()

    the class also maintains a local registry of model parameters.
    this is basically a dictionary that maps parameter names to parameter ids used in the database.
    because the database can only relate a parameter to a project or job after a value has been inserted,
    it is important to maintain such a local registry.

    before any access to the ParamValues table, the project parameters must be registered
    by register_params() or query_project_params().
    the first method can be used at any time.
    it should be used before data is inserted into the database.
    if a parameter is already registered in the database, the id is taken from the database
    (parameter names are unique in the whole Params table).
    the second method can be used only when the database already contains data.
    it should be used if the database is read-only, or the parameter names are unknown.
    """
# --- Projects table: one record per pmsco project ---------------------------
sql_create_projects = """CREATE TABLE IF NOT EXISTS `Projects` (
`id` INTEGER PRIMARY KEY,
`name` TEXT NOT NULL UNIQUE COLLATE NOCASE,
`code` TEXT COLLATE NOCASE
)"""
sql_insert_project = "insert into Projects(name, code) values (:name, :code)"
sql_select_project = "select name, code from Projects where id=:id"
sql_select_project_name = "select id, name, code from Projects where name=:name"
# bug fix: the primary key column of Projects is `id`, not `project_id`.
# the previous statement (`where project_id = :project_id`) raised
# sqlite3.OperationalError because that column does not exist.
sql_delete_project = """delete from Projects where id = :project_id"""
# --- Jobs table: one record per calculation job, linked to a project --------
sql_create_jobs = """CREATE TABLE IF NOT EXISTS `Jobs` (
`id` INTEGER PRIMARY KEY,
`project_id` INTEGER,
`name` TEXT NOT NULL COLLATE NOCASE,
`mode` TEXT COLLATE NOCASE,
`machine` TEXT COLLATE NOCASE,
`git_hash` TEXT,
`datetime` TEXT,
`description` TEXT,
FOREIGN KEY(project_id) REFERENCES Projects(id) ON DELETE CASCADE
)"""
sql_insert_job = """insert into Jobs(project_id, name, mode, machine, git_hash, datetime, description)
values (:project_id, :name, :mode, :machine, :git_hash, :datetime, :description)"""
sql_select_job = """select id, project_id, name, mode, machine, git_hash, datetime
from Jobs where id = :job_id"""
sql_select_job_name = """select id, project_id, name, mode, machine, git_hash, datetime
from Jobs where project_id=:project_id and name=:name"""
sql_delete_job = """delete from Jobs where id = :job_id"""
# --- Models table: one record per calculated model, linked to a job ---------
sql_create_models = """CREATE TABLE IF NOT EXISTS `Models` (
`id` INTEGER PRIMARY KEY,
`job_id` INTEGER,
`model` INTEGER,
`gen` INTEGER,
`particle` INTEGER,
FOREIGN KEY(job_id) REFERENCES Jobs(id) ON DELETE CASCADE
)"""
sql_index_models = """create index if not exists
index_models on Models
('job_id', 'model')"""
sql_drop_index_models = "drop index if exists index_models"
sql_insert_model = """insert into Models(job_id, model, gen, particle)
values (:job_id, :model, :gen, :particle)"""
sql_update_model = """update Models
set model=:model, gen=:gen, particle=:particle
where id=:model_id"""
sql_select_model = """select id, job_id, model, gen, particle
from Models where id=:id"""
sql_select_model_model = """select id, job_id, model, gen, particle
from Models where model=:model"""
sql_select_model_job = """select id, job_id, model, gen, particle
from Models where job_id=:job_id"""
# bug fix: the primary key column of Models is `id`, not `model_id`.
# the previous statement (`where model_id = :model_id`) raised
# sqlite3.OperationalError because that column does not exist.
sql_delete_model = """delete from Models where id = :model_id"""
# --- Results table: one record per calculation task, linked to a model ------
sql_create_results = """create table if not exists `Results` (
`id` INTEGER PRIMARY KEY,
`model_id` INTEGER,
`scan` integer,
`sym` integer,
`emit` integer,
`region` integer,
`rfac` REAL,
FOREIGN KEY(model_id) REFERENCES Models(id) ON DELETE CASCADE
)"""
sql_index_results_tasks = """create index if not exists
`index_results_tasks` ON `Results`
(`model_id`, `scan`,`sym`,`emit`,`region`)"""
sql_drop_index_results_tasks = "drop index if exists index_results_tasks"
sql_index_results_models = """create index if not exists
`index_results_models` ON `Results`
(`id`, `model_id`)"""
sql_drop_index_results_models = "drop index if exists index_results_models"
sql_insert_result = """insert into Results(model_id, scan, sym, emit, region, rfac)
values (:model_id, :scan, :sym, :emit, :region, :rfac)"""
sql_update_result = """update Results
set rfac=:rfac
where id=:result_id"""
sql_select_result = """select id, model_id, scan, sym, emit, region, rfac
from Results where id=:id"""
sql_select_result_index = """select id, model_id, scan, sym, emit, region, rfac
from Results where model_id=:model_id and scan=:scan and sym=:sym and emit=:emit and region=:region"""
sql_delete_result = """delete from Results where id = :result_id"""
# --- Params table: registry of model parameter names ------------------------
sql_create_params = """CREATE TABLE IF NOT EXISTS `Params` (
`id` INTEGER PRIMARY KEY,
`key` TEXT NOT NULL UNIQUE COLLATE NOCASE
)"""
sql_insert_param = "insert into Params(key) values (:key)"
sql_select_param = "select key from Params where id=:id"
sql_select_param_key = "select id, key from Params where key=:key"
# distinct parameter keys that occur in stored models of a project
sql_select_param_project = """select distinct key, param_id from Models
join Jobs on Models.job_id = Jobs.id
join ParamValues on Models.id = paramValues.model_id
join Params on Params.id = ParamValues.param_id
where Jobs.project_id = :project_id
order by key collate nocase"""
# distinct parameter keys that occur in stored models of a job
sql_select_param_job = """select distinct key, param_id from Models
join ParamValues on Models.id = paramValues.model_id
join Params on Params.id = ParamValues.param_id
where Models.job_id = :job_id
order by key collate nocase"""
# --- ParamValues table: one value per parameter and model -------------------
sql_create_paramvalues = """CREATE TABLE IF NOT EXISTS `ParamValues` (
`id` INTEGER PRIMARY KEY,
`param_id` INTEGER NOT NULL,
`model_id` INTEGER NOT NULL,
`value` REAL,
FOREIGN KEY(param_id) REFERENCES Params(id) ON DELETE CASCADE,
FOREIGN KEY(model_id) REFERENCES Models(id) ON DELETE CASCADE
)"""
sql_index_paramvalues = """create index if not exists
`index_paramvalues` ON `ParamValues`
(`param_id`, `model_id`)"""
sql_drop_index_paramvalues = "drop index if exists index_paramvalues"
sql_insert_paramvalue = """
insert into ParamValues(param_id, model_id, value)
values (:param_id, :model_id, :value)
"""
sql_update_paramvalue = """
update ParamValues set value=:value where id=:paramvalue_id
"""
# all parameter keys and values of one model
sql_select_paramvalue_model = """
select key, value from ParamValues
join Params on ParamValues.param_id = Params.id
where model_id = :model_id
"""
# a single value record of one parameter of one model
sql_select_paramvalue = """
select ParamValues.id as id, key, value from ParamValues
join Params on ParamValues.param_id = Params.id
where param_id = :param_id and model_id = :model_id
"""
# --- Tags table: registry of job tag names -----------------------------------
sql_create_tags = """CREATE TABLE IF NOT EXISTS `Tags` (
`id` INTEGER PRIMARY KEY,
`key` TEXT NOT NULL UNIQUE COLLATE NOCASE
)"""
sql_insert_tag = "insert into Tags(key) values (:key)"
sql_select_tag = "select key from Tags where id=:id"
sql_select_tag_key = "select id, key from Tags where key=:key"
# distinct tag keys attached to jobs of a project
sql_select_tag_project = """select distinct key, tag_id from Jobs
join JobTags on Jobs.id = JobTags.job_id
join Tags on Tags.id = JobTags.tag_id
where Jobs.project_id = :project_id
order by key collate nocase"""
# distinct tag keys attached to one job
sql_select_tag_job = """select distinct key, tag_id from JobTags
join Tags on Tags.id = JobTags.tag_id
where JobTags.job_id = :job_id
order by key collate nocase"""
# --- JobTags table: one value per tag and job --------------------------------
sql_create_jobtags = """CREATE TABLE IF NOT EXISTS `JobTags` (
`id` INTEGER PRIMARY KEY,
`tag_id` INTEGER NOT NULL,
`job_id` INTEGER NOT NULL,
`value` TEXT COLLATE NOCASE,
FOREIGN KEY(tag_id) REFERENCES Tags(id) ON DELETE CASCADE,
FOREIGN KEY(job_id) REFERENCES Jobs(id) ON DELETE CASCADE
)"""
sql_index_jobtags = """create index if not exists
`index_jobtags` ON `JobTags`
(`tag_id`, `job_id`)"""
sql_drop_index_jobtags = "drop index if exists index_jobtags"
sql_insert_jobtag = """
insert into JobTags(tag_id, job_id, value)
values (:tag_id, :job_id, :value)
"""
sql_update_jobtag = """
update JobTags set value=:value where id=:jobtag_id
"""
# all tag keys and values of one job
sql_select_jobtag_job = """
select key, value from JobTags
join Tags on JobTags.tag_id = Tags.id
where job_id = :job_id
"""
# a single value record of one tag of one job
sql_select_jobtag = """
select JobTags.id as id, key, value from JobTags
join Tags on JobTags.tag_id = Tags.id
where tag_id = :tag_id and job_id = :job_id
"""
# @var _conn (sqlite3.Connection).
# connection interface to the database.
#
# opened and closed by self.connect() and self.disconnect(), respectively.
# the connection can remain open during the whole process.
#
# @note sqlite3.Connection is re-usable but not re-entrant.
# Be careful not to nest contexts when calling other methods from within this class!

# @var _db_filename (str).
# path and name of the database file or ":memory:" for an in-memory database.

# @var project_id (int).
# id of the current project in the primary key of the Projects table.
#
# set by self.register_project().
# new jobs and models are linked to this project id by default.

# @var job_id (int).
# id of the current job in the primary key of the Jobs table.
#
# set by self.register_job().
# new models are linked to this job id by default.

# @var _model_params (dict).
# dictionary of model parameters used in the current project.
#
# maps parameter names to their ids in the Params table.
# set by self.register_params() and self.register_param().
# read by self.add_model().

# @var _tags (dict).
# dictionary of tag keys used in the current project.
# maps tag names to their ids in the Tags table.

# @var _lock_filename (str).
# path and name of the lock file or an empty string if no locking is used.

# @var _lock (obj).
# context manager which provides a locking mechanism for the database.
#
# this is either a fasteners.InterProcessLock or _DummyLock.
# InterProcessLock allows to serialize access to the database by means of a lock file.
# _DummyLock is used with an in-memory database which does not require locking.
#
# @note InterprocessLock is re-usable but not re-entrant.
# Be careful not to nest contexts when calling other methods from within this class!

def __init__(self):
    """
    initialize the object in a disconnected state.

    call connect() to open a database before using any other method.
    """
    self._conn = None
    self._db_filename = ""
    self.project_id = 0
    self.job_id = 0
    self._model_params = {}
    self._tags = {}
    self._lock_filename = ""
    self._lock = None
def connect(self, db_filename, lock_filename=""):
    """
    connect to a new or existing database file.

    if the file does not exist, or if it is empty, a new database schema is created.

    @param db_filename: name of a file or ":memory:" for an in-memory database.
    @param lock_filename: name of a file that is used to lock the database.
        by default, the db_filename with a suffix of ".lock" is used.
        an in-memory database uses no lock file.
        the argument is provided mainly for testing the locking functionality;
        it must name a file that is not used for anything else
        (the file does not need to exist, ideally in the same directory as the database).
    @return: None
    """
    self._db_filename = db_filename
    if lock_filename:
        self._lock_filename = lock_filename
    else:
        self._lock_filename = "" if db_filename == ":memory:" else db_filename + ".lock"
    if self._lock_filename:
        self._lock = fasteners.InterProcessLock(self._lock_filename)
    else:
        self._lock = _DummyLock()
    self._conn = sqlite3.connect(self._db_filename)
    self._conn.row_factory = sqlite3.Row
    with self._lock:
        self._conn.execute("PRAGMA foreign_keys = 1")
        self._conn.commit()
    # probe for an existing schema and create one if the Models table is missing.
    # create_schema acquires the lock itself, so it must not be called
    # inside the `with self._lock` block above (the lock is not re-entrant).
    cursor = self._conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='Models'")
    row = cursor.fetchone()
    if not row or not row[0]:
        self.create_schema()
def disconnect(self):
"""
close the connnection to the database.
uncommitted transactions are lost.
data of an in-memory database is lost.
@return: None
"""
if self._conn is not None:
self._conn.close()
self._conn = None
self._lock = None
def check_connection(self):
"""
check that the database is connected.
checks that self.connect has been called.
this method is used by most ResultsDatabase methods to check connection before the database is used.
it essentially produces a better understandable exception message if the connect call has been forgotten.
(with out it, python would raise: `AttributeError: __exit__`).
@return None
@raise AssertionError if the connection is not valid.
"""
assert self._lock is not None, "database not connected"
assert self._conn is not None, "database not connected"
def create_schema(self):
"""
create the database schema (tables and indices).
the method can safely be applied to an existing schema.
because all create commands include the "if not exists",
only missing objects will be created.
@return: None
"""
self.check_connection()
with self._lock, self._conn:
self._conn.execute(self.sql_create_projects)
self._conn.execute(self.sql_create_jobs)
self._conn.execute(self.sql_create_models)
self._conn.execute(self.sql_create_results)
self._conn.execute(self.sql_create_params)
self._conn.execute(self.sql_create_paramvalues)
self._conn.execute(self.sql_create_tags)
self._conn.execute(self.sql_create_jobtags)
self._conn.execute(self.sql_index_results_tasks)
self._conn.execute(self.sql_index_results_models)
self._conn.execute(self.sql_index_paramvalues)
self._conn.execute(self.sql_index_jobtags)
self._conn.execute(self.sql_index_models)
def register_project(self, name, code):
"""
register a project with the database.
@param name: name of the project. alphanumeric characters only. no spaces or special characters!
@param code: name of the pmsco module that defines the project.
@return: id value of the project in the database.
"""
self.check_connection()
with self._lock, self._conn:
c = self._conn.execute(self.sql_select_project_name, {'name': name})
v = c.fetchone()
if v:
project_id = v[0]
else:
c = self._conn.execute(self.sql_insert_project, {'name': name, 'code': code})
project_id = c.lastrowid
self.project_id = project_id
return project_id
def delete_project(self, project_id):
"""
delete a project from the database.
this will delete all dependent jobs, models, results and parameter values.
@param project_id: id value of the project entry in the database.
@return None
"""
self.check_connection()
with self._lock, self._conn:
param_dict = {'project_id': project_id}
self._conn.execute(self.sql_delete_project, param_dict)
def register_job(self, project_id, name, mode, machine, git_hash, _datetime, description=None):
# type: (int, basestring, basestring, basestring, basestring, datetime.datetime, basestring) -> int
"""
register a calculation job with the database.
@param project_id: identifier of the project. see register_project().
@param name: name of the job. up to the user, must be unique within a project.
@param mode: optimization mode string (should be same as command line argument).
@param machine: name of the machine executing the calculation. up to the user.
@param git_hash: git revision (hash) of the project code.
@param _datetime: (datetime.datetime) date and time.
@param description: meaningful description of the calculation job, up to the user.
@return: id value of the job in the database.
"""
self.check_connection()
with self._lock, self._conn:
c = self._conn.execute(self.sql_select_job_name, {'project_id': project_id, 'name': name})
v = c.fetchone()
if v:
job_id = v[0]
else:
v = {'project_id': project_id,
'name': name,
'mode': mode,
'machine': machine,
'git_hash': git_hash,
'datetime': _datetime,
'description': description}
c = self._conn.execute(self.sql_insert_job, v)
job_id = c.lastrowid
self.job_id = job_id
return job_id
def delete_job(self, job_id):
"""
delete a job from the database.
this will delete all dependent models, results and parameter values.
@param job_id: id value of the job entry in the database.
@return None
"""
self.check_connection()
with self._lock, self._conn:
param_dict = {'job_id': job_id}
self._conn.execute(self.sql_delete_job, param_dict)
def _query_job_name(self, job_name, project_id=0):
"""
(internal) query a job by name
this is the internal analog of @ref query_job_name
which asserts an acquired lock and open connection.
@param job_name: name of the job
@param project_id: project identifier.
by default, the current project self.project_id is used.
@return: id value of the job in the database
@raise DatabaseError if the job can't be found.
"""
if project_id == 0:
project_id = self.project_id
param_dict = {'project_id': project_id, 'name': job_name}
c = self._conn.execute(self.sql_select_job_name, param_dict)
v = c.fetchone()
return v[0]
def query_job_name(self, job_name, project_id=0):
"""
query a job by name
@param job_name: name of the job
@param project_id: project identifier.
by default, the current project self.project_id is used.
@return: id value of the job in the database
"""
self.check_connection()
with self._lock, self._conn:
job_id = self._query_job_name(job_name, project_id=project_id)
return job_id
def register_param(self, key):
"""
register a parameter key with the database.
each parameter name must be registered once before a value can be written to the database.
see the class description for an explanation.
@param key: key (name) of the parameter.
@return: id value of the parameter in the database.
"""
self.check_connection()
with self._lock, self._conn:
return self._register_param(key)
def _register_param(self, key):
"""
register a parameter key with the database.
@note this is method does not lock the database file and does not commit.
to lock the database and commit the transaction, call the public method register_param().
@param key: key (name) of the parameter.
@return: id value of the parameter in the database.
"""
c = self._conn.execute(self.sql_select_param_key, {'key': key})
v = c.fetchone()
if v:
param_id = v[0]
else:
c = self._conn.execute(self.sql_insert_param, {'key': key})
param_id = c.lastrowid
self._model_params[key] = param_id
return param_id
def register_params(self, model_params):
"""
register the model parameters of this project with the database.
each parameter name must be registered once before a value can be written to the database.
see the class description for an explanation.
@param model_params: sequence of model parameter keys, or dictionary of model parameters.
@return: None
"""
self.check_connection()
with self._lock, self._conn:
for key in model_params:
if key[0] != '_':
self._register_param(key)
def query_project_params(self, project_id=0, job_id=0, update_registry=False):
"""
query a list of model parameters used in a project or job.
optionally, the local registry can be updated with the results of the query.
this should be done if the database is read only and the client does not know the parameter names.
see the class description for a description of the registry.
@note this method returns the parameters that are used with models in the database.
if you have registered additional parameters but not stored models with those parameters,
this method will _not_ list them.
@param project_id: project identifier.
by default, the current project self.project_id is used.
@param job_id: job identifier.
by default, all jobs of the selected project are included in the query.
if a job is specified, the project_id parameter is ignored.
@param update_registry: update the local parameter registry (self._model_params).
with the query results.
@return: dictionary of project parameters.
the keys are the parameter names, the values are the parameter ids in the database.
"""
if project_id == 0:
project_id = self.project_id
if job_id == 0:
sql = self.sql_select_param_project
args = {'project_id': project_id}
else:
sql = self.sql_select_param_job
args = {'job_id': job_id}
params = {}
self.check_connection()
with self._lock, self._conn:
c = self._conn.execute(sql, args)
for row in c:
params[row['key']] = row['param_id']
if update_registry:
self._model_params.update(params)
return params
def register_tag(self, key):
"""
register a tag with the database.
tags are a way of structuring a job description.
they can be used to, for instance, distinguish calculations made with different clusters,
different experimental data, etc.
a job tag has a key and a value, and is associated to a job.
the use of tags is up to the user. pmsco does not change or read them.
each tag name must be registered once before a value can be written to the database.
see the class description for an explanation.
@param key: key (name) of the tag.
@return: id value of the tag in the database.
"""
self.check_connection()
with self._lock, self._conn:
return self._register_tag(key)
def _register_tag(self, key):
"""
register a tag with the database without committing the transaction.
@note this method does not lock the database file and does not commit.
to lock the database and commit the transaction, call the public method register_tag().
@param key: key (name) of the tag.
@return: id value of the tag in the database.
"""
c = self._conn.execute(self.sql_select_tag_key, {'key': key})
v = c.fetchone()
if v:
tag_id = v[0]
else:
c = self._conn.execute(self.sql_insert_tag, {'key': key})
tag_id = c.lastrowid
self._tags[key] = tag_id
return tag_id
def register_tags(self, tags):
"""
register the tags of this project with the database.
each tag name must be registered once before a value can be written to the database.
see the class description for an explanation.
@param tags: sequence of tag keys, or dictionary of tags.
@return: None
"""
self.check_connection()
with self._lock, self._conn:
for key in tags:
self._register_tag(key)
def query_tags(self, project_id=0, job_id=0, update_registry=False):
"""
query a list of tag keys used in a project or job.
optionally, the local registry can be updated with the results of the query.
this should be done if the database is read only and the client does not know the tag names.
see the class description for a description of the registry.
@note this method returns the tags that are used with jobs in the database.
if you have registered additional tags but not attached them to jobs,
this method will _not_ list them.
@param project_id: project identifier.
by default, the current project self.project_id is used.
@param job_id: job identifier.
by default, all jobs of the selected project are included in the query.
if a job is specified, the project_id parameter is ignored.
@param update_registry: update the local tags registry (self._tags).
with the query results.
@return: dictionary of tags.
the keys are the tag names, the values are the tag ids in the database.
"""
if project_id == 0:
project_id = self.project_id
if job_id == 0:
sql = self.sql_select_tag_project
args = {'project_id': project_id}
else:
sql = self.sql_select_tag_job
args = {'job_id': job_id}
tags = {}
self.check_connection()
with self._lock, self._conn:
c = self._conn.execute(sql, args)
for row in c:
tags[row['key']] = row['tag_id']
if update_registry:
self._tags.update(tags)
return tags
def query_job_tags(self, job_id):
"""
query a list of tags (keys and values) associated with a job.
@param job_id: job identifier.
@return: dictionary of tags.
the keys are the tag names, the values are the tag values.
"""
sql = self.sql_select_jobtag_job
args = {'job_id': job_id}
tags = {}
self.check_connection()
with self._lock, self._conn:
c = self._conn.execute(sql, args)
for row in c:
tags[row['key']] = row['value']
return tags
def insert_jobtags(self, job_id, tags):
"""
add or update job tags in the database.
the method updates the JobTags table.
@param job_id: (int) primary key of the job entry in the Jobs table.
the entry must exist.
@param tags: (dict) dictionary containing the tags.
keys are matched or added to the Tags table,
values are added to the JobTags table and linked to the job and tag key.
@return: None
"""
self.check_connection()
with self._lock, self._conn:
for key, value in tags.items():
try:
tag_id = self._tags[key]
except KeyError:
tag_id = self._register_tag(key)
v = None
else:
jobtag_entry = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
c = self._conn.execute(self.sql_select_jobtag, jobtag_entry)
v = c.fetchone()
if v:
jobtag_entry = {'jobtag_id': v[0], 'tag_id': tag_id, 'job_id': job_id, 'value': value}
self._conn.execute(self.sql_update_jobtag, jobtag_entry)
else:
jobtag_entry = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
self._conn.execute(self.sql_insert_jobtag, jobtag_entry)
def create_models_view(self, job_id=0, temporary=False):
    """
    create a flat (pivot) view of model parameters of the current project or job.

    the view is an SQL view that can be used like other tables in select statements.
    the name of the view is the project or job id prefixed by @c ViewModelsProject
    or @c ViewModelsJob, respectively.
    the view contains the 'job_id', 'model_id' and 'model' columns of the Models table
    as well as columns of the de-normalized ParamValues table
    where the column names are the respective parameter names.
    the list of parameter names is combined from the private registry and the database.

    the method generates the SQL to create the view and executes it.
    a previous view of the same name is dropped.
    the view can optionally be made temporary (it is dropped when the database
    connection is closed).
    depending on the `job_id` argument, a view of the current project
    or of the specified job is created.

    @note the view needs to be re-created whenever the parameter dictionary has changed.

    @param job_id: job identifier.
        by default, all jobs of the selected project are included in the query.
        if a job is specified, the project_id parameter is ignored.
    @param temporary: if True, the view is temporary and dropped when the database
        connection is closed.
        otherwise (by default), the view is available on all connections
        and stored with the database.
    @return sql string that was used to create the view.
    """
    project_id = self.project_id
    self.check_connection()
    # query_project_params acquires the lock itself,
    # so it must run before the `with self._lock` block below (lock is not re-entrant).
    params = self.query_project_params(project_id, job_id)
    params.update(self._model_params)
    # sort parameter columns alphabetically, case-insensitive
    param_names = sorted(params, key=lambda s: s.lower())
    with self._lock, self._conn:
        if job_id:
            view_name = "ViewModelsJob{0}".format(job_id)
        else:
            view_name = "ViewModelsProject{0}".format(project_id)
        sql = "drop view if exists {0}".format(view_name)
        self._conn.execute(sql)
        sql = "create {1} view {0} as ".format(view_name, "temp" if temporary else "")
        sql += "select job_id, Models.id as model_id, model, gen, particle, "
        # pivot: one column per parameter name (max() picks the single value per model).
        # NOTE(review): parameter names and ids are interpolated into the SQL string
        # without quoting; they must not contain untrusted characters.
        col_fmt = "max(case when key='{0}' then value end) {0}"
        cols = [col_fmt.format(param) for param in param_names]
        sql += ", ".join(cols)
        sql += " from Models "
        sql += " join ParamValues on Models.id = ParamValues.model_id "
        sql += " join Params on Params.id = ParamValues.param_id "
        sql += " join Jobs on Jobs.id = Models.job_id "
        if job_id:
            sql += " where Models.job_id = {job_id} ".format(job_id=job_id)
        else:
            sql += " where Jobs.project_id = {project_id} ".format(project_id=project_id)
        sql += " group by model_id "
        self._conn.execute(sql)
    return sql
def insert_model(self, model_params):
    """
    add calculated model parameters to the database.

    this adds one record to the Models table
    and one record per regular parameter to the ParamValues table.

    @param model_params: dictionary containing all model parameters and control variables.
        the '_model' value is required.
        '_gen' and '_particle' are optional; they are written to the database
        if provided, or default to None.
        the names of the regular model parameters must have been registered
        with register_params() beforehand.
    @return: id value of the model entry in the database.
    @raise KeyError if a parameter hasn't been registered.
    """
    self.check_connection()
    with self._lock, self._conn:
        # Models record: job link plus the special (underscore) variables
        record = {'job_id': self.job_id, 'gen': None, 'particle': None}
        record.update(special_params(model_params))
        cursor = self._conn.execute(self.sql_insert_model, record)
        model_id = cursor.lastrowid
        # one ParamValues record per regular parameter
        for name, value in model_params.items():
            if name[0] != '_':
                cursor.execute(self.sql_insert_paramvalue,
                               {'param_id': self._model_params[name],
                                'model_id': model_id,
                                'value': value})
    return model_id
def delete_model(self, model_id):
    """
    delete a model from the database.

    this will delete all dependent results and parameter values.

    @param model_id: id value of the model entry in the database.

    @return None
    """
    self.check_connection()
    with self._lock, self._conn:
        self._conn.execute(self.sql_delete_model, {'model_id': model_id})
def query_model(self, model_id):
    """
    retrieve model parameters and control variables from the database.

    @param model_id: id of the model in the database.

    @return: dict of parameter names and values.
        special values (keys with a leading underscore) are included only if they are not null.
    """
    self.check_connection()
    with self._lock, self._conn:
        # regular parameters come from the ParamValues table
        cursor = self._conn.execute(self.sql_select_paramvalue_model, {'model_id': model_id})
        params = {row['key']: row['value'] for row in cursor}
        # special values come from the Models record itself
        cursor = self._conn.execute(self.sql_select_model, {'id': model_id})
        model_row = dict(cursor.fetchone())
        params.update({key: value for key, value in model_row.items()
                       if key[0] == '_' and value is not None})
        return params
def insert_model_array(self, array):
    """
    add a numpy array of calculated model parameters to the database.

    each row of the array is inserted as one model via insert_model().

    @param array: numpy structured array as it is used, e.g., in pmsco.optimizers.population.Population.

    @return: None
    """
    assert isinstance(array, np.ndarray)
    field_names = array.dtype.names
    for row in array:
        self.insert_model({name: row[name] for name in field_names})
def query_model_array(self, filter=None, limit=0):
    """
    load a range of models from the database.

    the project id is implicitly given by the object attribute.
    it's not possible to combine results from multiple projects.

    @param filter: list of filter expressions.
        each expression is a relational expression of the form <code>field operator value</code>,
        where field is a unique field name of the Projects, Jobs, Models or Results table, e.g.
        `job_id`, `model`, `rfac`, `scan`, `sym`, etc.
        operator is one of the relational operators in SQL syntax.
        value is a numeric or string constant, the latter including single or double quotes.
        if the list is empty or None (default), no filtering is applied.
        the project should not be specified here.
        the list passed by the caller is not modified.

    @param limit: maximum number of models to read. 0 = return all models that match the filter criteria.

    @return numpy structured array as it is used, e.g., in pmsco.optimizers.population.Population.
        the function uses the data type only. data in the array is overwritten.
        the array is sorted by model_id.

    @raise unknown exception when the project does not have results.

    @raise OperationalError if the view ViewModelsProject{0} is missing in the database schema.
    """
    self.check_connection()
    # copy the filter into a local list:
    # the previous in-place `filter += [...]` mutated the caller's list and,
    # because of the mutable default argument, leaked conditions between calls.
    conditions = list(filter) if filter else []
    conditions.append(" project_id = {0} ".format(self.project_id))
    with self._lock, self._conn:
        # first pass: select the matching model ids
        sql = "select distinct Models.id as model_id, model "
        sql += "from Models "
        sql += "join Results on Models.id = Results.model_id "
        sql += "join Jobs on Models.job_id = Jobs.id "
        sql += "join Projects on Jobs.project_id = Projects.id "
        sql += "where "
        sql += " and ".join(conditions)
        sql += " "
        sql += "order by model_id "
        if limit:
            sql += "limit {0} ".format(limit)
        c = self._conn.execute(sql)
        models = c.fetchall()
        count = len(models)
        model_ids = [row['model_id'] for row in models]
        # second pass: fetch the full parameter sets from the project view
        sql = "select * "
        sql += "from ViewModelsProject{0} ".format(self.project_id)
        sql += "where model_id in ({0}) ".format(",".join("?" * len(model_ids)))
        sql += "order by model_id "
        c = self._conn.execute(sql, model_ids)
        results = c.fetchall()
        names = [desc[0] for desc in c.description]
        dt = np.dtype([(field_to_param(n), field_to_numpy_type(n)) for n in sorted(names, key=str.lower)])
        out_array = np.zeros((count,), dtype=dt)
        for idx, row in enumerate(results):
            for name in names:
                try:
                    out_array[field_to_param(name)][idx] = row[name]
                except TypeError:
                    # can't store None in integer
                    pass
        return out_array
def query_best_results(self, filter=None, limit=0):
    """
    load a range of results from the database.

    @param filter: list of filter expressions.
        each expression is a relational expression of the form <code>field operator value</code>,
        where field is a unique field name of the Projects, Jobs, Models or Results table, e.g.
        `job_id`, `model`, `rfac`, `scan`, `sym`, etc.
        operator is one of the relational operators in SQL syntax.
        value is a numeric or string constant, the latter including single or double quotes.
        if the list is empty or None (default), no filtering is applied.
        the project should not be specified here.
        the list passed by the caller is not modified.

    @param limit: maximum number of models to read. 0 = return all models that match the filter criteria.

    @return numpy structured array as it is used, e.g., in pmsco.optimizers.population.Population.
        the function uses the data type only. data in the array is overwritten.
        the array is sorted by rfac.
    """
    self.check_connection()
    # copy the filter into a local list:
    # the previous in-place `filter += [...]` mutated the caller's list and,
    # because of the mutable default argument, leaked conditions between calls.
    conditions = list(filter) if filter else []
    conditions.append(" project_id = {0} ".format(self.project_id))
    with self._lock, self._conn:
        sql = "select Results.id as result_id, model_id, job_id, "
        sql += "model, scan, sym, emit, region, rfac, gen, particle "
        sql += "from Models "
        sql += "join Results on Models.id = Results.model_id "
        sql += "join Jobs on Models.job_id = Jobs.id "
        sql += "join Projects on Jobs.project_id = Projects.id "
        sql += "where "
        sql += " and ".join(conditions)
        sql += " "
        sql += "order by rfac, job_id, model, scan, sym, emit, region "
        if limit:
            sql += "limit {0} ".format(limit)
        c = self._conn.execute(sql)
        results = c.fetchall()
        count = len(results)
        names = [desc[0] for desc in c.description]
        dt = np.dtype([(field_to_param(n), field_to_numpy_type(n)) for n in sorted(names)])
        out_array = np.zeros((count,), dtype=dt)
        for idx, row in enumerate(results):
            for name in names:
                try:
                    out_array[field_to_param(name)][idx] = row[name]
                except TypeError:
                    # can't store None in integer
                    pass
        return out_array
def query_best_models_per_jobs(self, job_ids=None, task_level='model'):
    """
    return the best model (by rfac) of each selected job.

    the query gathers the R-factors of the selected jobs at the selected task levels
    and, for each job, returns the (database) model id where the lowest R-factor is reported
    among the gathered results.
    this can be useful if you want to compile a report of the best model per job.

    @param job_ids: iterable of job ids to include in the query.
        the job ids must belong to the current project.
        if empty or non-specified, all jobs of the current project are included.

    @param task_level: element of or index into @ref pmsco.dispatch.CALC_LEVELS.
        deepest task_level to include in the query.
        results on deeper levels are not considered.
        e.g. if you pass 'scan', R-factors of individual scans are included in the query.
        note that including deeper levels will not increase the number of results returned.

    @return sequence of model_id.
        the number of results corresponds to the number of jobs in the filter scope.
        to find out details of the models, execute another query that filters on these model ids.

    the method produces an SQL query similar to:
    @code{.sql}
    select Models.id from Models
    join Results on Models.id = Results.model_id
    join Jobs on Models.job_id = Jobs.id
    where scan=-1
    and project_id=1
    and job_id in (1,2,3)
    group by Models.job_id
    having min(rfac)
    order by rfac
    @endcode
    """
    # a result belongs to the requested level (or a shallower one)
    # if the index of the next-deeper level is -1.
    if task_level in dispatch.CALC_LEVELS:
        next_level = dispatch.CALC_LEVELS.index(task_level) + 1
    else:
        next_level = task_level + 1
    try:
        level_name = dispatch.CALC_LEVELS[next_level]
    except IndexError:
        level_name = dispatch.CALC_LEVELS[4]

    self.check_connection()
    with self._lock, self._conn:
        clauses = ["select Models.id from Models",
                   "join Results on Models.id = Results.model_id",
                   "join Jobs on Models.job_id = Jobs.id",
                   "where project_id = {0}".format(self.project_id),
                   "and {0} = -1".format(level_name)]
        if job_ids:
            clauses.append("and Models.job_id in ({0})".format(",".join(map(str, job_ids))))
        clauses.append("group by Models.job_id")
        clauses.append("having min(rfac)")
        clauses.append("order by rfac, job_id, model, scan, sym, emit, region")
        cursor = self._conn.execute(" ".join(clauses) + " ")
        return [row['id'] for row in cursor]
def query_tasks(self, job_id=0):
    """
    query the task index used in a calculation job.

    this query neglects the model index
    and returns the unique tuples (-1, scan, sym, emit, region).

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return list of pmsco.dispatch.CalcID tuples of task indices.
        the model attribute is -1 in all elements.
    """
    if not job_id:
        job_id = self.job_id
    self.check_connection()
    with self._lock, self._conn:
        # `distinct` makes the result unique as documented;
        # without it, each task tuple was repeated once per model of the job.
        sql = "select distinct scan, sym, emit, region "
        sql += "from Models "
        sql += "join Results on Models.id = Results.model_id "
        sql += "join Jobs on Models.job_id = Jobs.id "
        sql += "join Projects on Jobs.project_id = Projects.id "
        sql += "where job_id = :job_id"
        c = self._conn.execute(sql, {'job_id': job_id})
        results = c.fetchall()
        output = []
        for row in results:
            d = dict(row)
            # the model index is not part of a task identifier
            d['model'] = -1
            output.append(dispatch.CalcID(**d))
        return output
def query_best_task_models(self, level, count, job_id=0):
    """
    query N best models per task.

    this query is used by the file tracker to determine the models to keep.

    @param level: level up to which to query.
        the level can be specified by level name (str) or numeric index (0..4).
        if it is scan (equivalent to 1), the method queries the model and scan levels.

    @param count: number of models to query per task.

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return set of matching model numbers (Models.model field).
    """
    if not job_id:
        job_id = self.job_id
    # accept both numeric indices and level names
    try:
        max_level = int(level)
    except ValueError:
        max_level = dispatch.CALC_LEVELS.index(level)
    assert 0 <= max_level < len(dispatch.CALC_LEVELS)

    # one parametrized statement, executed once per task below
    sql = ("select model "
           "from Models "
           "join Results on Models.id = Results.model_id "
           "where Models.job_id = :job_id "
           "and scan = :scan "
           "and sym = :sym "
           "and emit = :emit "
           "and region = :region "
           "order by rfac "
           "limit :limit ")

    # query_tasks acquires the lock itself, so call it before locking here
    tasks = self.query_tasks(job_id)
    keep = set()
    with self._lock, self._conn:
        for task in tasks:
            if task.numeric_level > max_level:
                continue
            query_args = task._asdict()
            del query_args['model']
            query_args['job_id'] = job_id
            query_args['limit'] = count
            rows = self._conn.execute(sql, query_args).fetchall()
            keep |= {row['model'] for row in rows}
    return keep
def insert_result(self, index, result, job_id=0):
    """
    add or update a result in the database.

    the method updates the Models, Results and ParamValues tables.
    the model is identified by job_id and index.model.
    the result is identified by job_id and index.
    if the model or result exists in the database, it is updated.

    @param index: (pmsco.dispatch.CalcID or dict)
        calculation index.
        in case of dict, the keys must be the attribute names of CalcID prefixed with an underscore, i.e.,
        '_model', '_scan', '_sym', '_emit', '_region'.
        extra values in the dictionary are ignored.
        undefined indices must be -1.

    @param result: (dict) dictionary containing the parameter values and the '_rfac' result.
        may also contain the special values '_gen' and '_particle'.
        '_gen' and '_particle' default to None if not present.

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return: id of the new or updated Results entry.
    """
    effective_job = job_id if job_id else self.job_id
    self.check_connection()
    with self._lock, self._conn:
        # the three steps share one transaction
        model_id = self._insert_result_model(effective_job, index, result)
        result_id = self._insert_result_data(model_id, index, result)
        self._insert_result_paramvalues(model_id, result)
    return result_id
def _insert_result_model(self, job_id, index, result):
    """
    add or update the model entry for a calculation result in the database.

    the method updates the Models table.
    the database and transaction must be locked by the caller.
    the model is identified by job_id and index.model.
    if the model exists in the database, it is updated.

    @param job_id: (int) primary key of the associated Jobs entry.
        no default value! passing an invalid job_id raises an exception.

    @param index: (pmsco.dispatch.CalcID or dict)
        calculation index.
        in case of dict, the keys must be the attribute names of CalcID prefixed with an underscore, i.e.,
        '_model', '_scan', '_sym', '_emit', '_region'.
        extra values in the dictionary are ignored.
        undefined indices must be -1.

    @param result: (dict) dictionary containing the parameter values and the '_rfac' result.
        may also contain the special values '_gen' and '_particle'.
        '_gen' and '_particle' default to None if not present.

    @return: id of the new or updated Models entry.
    """
    entry = {'gen': None, 'particle': None}
    entry.update(special_params(result))
    entry['job_id'] = job_id
    # index may be a CalcID (attribute access) or a dict (underscore keys)
    if hasattr(index, 'model'):
        entry['model'] = index.model
    else:
        entry['model'] = index['_model']

    row = self._conn.execute(self.sql_select_model_model, entry).fetchone()
    if row is None:
        # new model: insert and return the fresh row id
        cursor = self._conn.execute(self.sql_insert_model, entry)
        return cursor.lastrowid
    # known model: update in place
    model_id = row[0]
    entry['model_id'] = model_id
    self._conn.execute(self.sql_update_model, entry)
    return model_id
def _insert_result_data(self, model_id, index, result):
    """
    add or update a result in the database.

    the method updates the Results table.
    the database and transaction must be locked by the caller.
    the model is identified by model_id.
    the result is identified by model_id and index.
    if the result exists in the database, it is updated.

    @param model_id: (int) primary key of the model entry in the Models table.
        the entry must exist.

    @param index: (pmsco.dispatch.CalcID or dict)
        calculation index.
        in case of dict, the keys must be the attribute names of CalcID prefixed with an underscore, i.e.,
        '_model', '_scan', '_sym', '_emit', '_region'.
        extra values in the dictionary are ignored.
        undefined indices must be -1.

    @param result: (dict) dictionary containing the parameter values and the '_rfac' result.

    @return: id of the new or updated Results entry.
    """
    entry = special_params(result)
    entry['model_id'] = model_id
    # index values override anything with the same key from the result dict
    entry.update(special_params(index))

    row = self._conn.execute(self.sql_select_result_index, entry).fetchone()
    if row is None:
        # new result: insert and return the fresh row id
        cursor = self._conn.execute(self.sql_insert_result, entry)
        return cursor.lastrowid
    # known result: update in place
    result_id = row[0]
    entry['result_id'] = result_id
    self._conn.execute(self.sql_update_result, entry)
    return result_id
def _insert_result_paramvalues(self, model_id, result):
    """
    add or update parameter values of a model in the database.

    the method updates the ParamValues table.
    the database and transaction must be locked by the caller.

    @param model_id: (int) primary key of the model entry in the Models table.
        the entry must exist.

    @param result: (dict) dictionary containing the parameter values.
        the parameter names must exist in the Params table and in the self._model_params dictionary.
        special values (with a leading underscore) are ignored.
        extra parameters may raise a KeyError.

    @return: None

    @raise: KeyError if a parameter key is not registered.
    """
    for name, value in regular_params(result).items():
        entry = {'param_id': self._model_params[name],
                 'model_id': model_id,
                 'value': value}
        row = self._conn.execute(self.sql_select_paramvalue, entry).fetchone()
        if row is None:
            # no previous value for this (model, param) pair
            self._conn.execute(self.sql_insert_paramvalue, entry)
        else:
            entry['paramvalue_id'] = row[0]
            self._conn.execute(self.sql_update_paramvalue, entry)
def import_results_file(self, filename, job_id=0):
    """
    import a results file into the database.

    the results file is a space-delimited, general text file
    such as produced by pmsco.optimizers.population.Population.save_array().
    each line contains one result dataset, the columns correspond to the regular and special parameters.
    the first row contains the parameter names.

    a job entry with the given id must exist,
    but there must be no model entries referencing the job.
    it is not possible to update existing models, results or parameter values using this method.
    instead, you have to delete the job (which also deletes all dependent entries)
    and re-import the results.

    @param filename: path and name of the results file.

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return: None.

    @raise ValueError if the job already has model entries.
    """
    if not job_id:
        job_id = self.job_id
    # names=True reads the column names from the first line of the file
    data = np.genfromtxt(filename, names=True)
    self.register_params(data.dtype.names)
    try:
        unique_models, unique_index = np.unique(data['_model'], True)
    except ValueError:
        # no '_model' column: treat the whole file as one model numbered 0
        # NOTE(review): assumes numpy raises ValueError on a missing field name - confirm with current numpy.
        unique_models = np.array([0])
        unique_index = np.array([0])
    unique_data = data[unique_index]
    # maps file model numbers to database model ids.
    # the generators below close over this name; it is re-bound to the real mapping
    # after the Models rows are inserted and before executemany() consumes the
    # (lazy) result and param generators.
    model_ids = {}

    def model_entry_generator():
        # one Models record per unique model number;
        # special_params() overrides the defaults where the file provides values
        for result in unique_data:
            model_entry = {'job_id': job_id,
                           'model': unique_models[0],
                           'gen': None,
                           'particle': None}
            model_entry.update(special_params(result))
            yield model_entry

    def result_entry_generator():
        # one Results record per line of the file
        for result in data:
            try:
                model = result['_model']
            except ValueError:
                # no '_model' column: fall back to the single model number
                model = unique_models[0]
            result_entry = {'model_id': model_ids[model],
                           'scan': -1,
                           'sym': -1,
                           'emit': -1,
                           'region': -1,
                           'rfac': None}
            result_entry.update(special_params(result))
            yield result_entry

    def param_entry_generator():
        # one ParamValues record per regular parameter of each unique model
        for result in unique_data:
            try:
                model = result['_model']
            except ValueError:
                model = unique_models[0]
            for key, value in regular_params(result).items():
                param_entry = {'model_id': model_ids[model],
                               'param_id': self._model_params[key],
                               'value': value}
                yield param_entry

    with self._lock, self._conn:
        # refuse to import if the job already has models (no update semantics)
        c = self._conn.execute(self.sql_select_model_job, {'job_id': job_id})
        v = c.fetchone()
        if v:
            raise ValueError("database already contains model entries for job {0}".format(job_id))
        self._conn.executemany(self.sql_insert_model, model_entry_generator())
        # re-bind model_ids so the pending generators see the new database ids
        c = self._conn.execute(self.sql_select_model_job, {'job_id': job_id})
        model_ids = {row['model']: row['id'] for row in c}
        self._conn.executemany(self.sql_insert_result, result_entry_generator())
        self._conn.executemany(self.sql_insert_paramvalue, param_entry_generator())
def import_job_results(**kwargs):
    """
    import results from a calculation job.

    this function contains all steps necessary to import the results from a calculation job into a database.
    it registers the project and job, and imports the results data.
    the project may exist in the database, the job must not exist (raises an exception).

    arguments can be specified as dict (**d) or in keyword=value form.

    @param kwargs: dictionary of function arguments.
        the dictionary contains the following values.
        all arguments are required unless noted.
    @arg 'workdir' (optional) path to the working directory.
        the working directory of the operating system is changed.
        this is the root for relative paths of the database and results files.
        if not specified, the working directory is unchanged.
    @arg 'dbfile' name of the database file.
    @arg 'project' name of the project.
    @arg 'code' name of the project code.
    @arg 'job' name of the calculation job.
    @arg 'mode' pmsco optimization mode.
    @arg 'machine' name of the machine where the job ran.
    @arg 'git' git hash of the code revision.
    @arg 'datetime' (datetime.datetime) time stamp (optional).
        if not specified, the argument defaults to the time stamp of the results file.
        hint: the constructor of a datetime object is
        `datetime.datetime(year, month, day, hour, minute, second)`.
    @arg 'description' meaningful description of the calculation job, up to the user.
    @arg 'resultsfile' name of the .tasks.dat results file.

    @return dict with 'project_id' and 'job_id'

    @raise ValueError if the job already exists in the database.
    """
    if 'workdir' in kwargs:
        os.chdir(kwargs['workdir'])
    if 'datetime' in kwargs:
        time_stamp = kwargs['datetime']
    else:
        # fall back to the modification time of the results file
        time_stamp = datetime.datetime.fromtimestamp(os.path.getmtime(kwargs['resultsfile']))

    db = ResultsDatabase()
    db.connect(kwargs['dbfile'])
    project_id = db.register_project(kwargs['project'], kwargs['code'])
    job_id = db.register_job(project_id, kwargs['job'], kwargs['mode'], kwargs['machine'],
                             kwargs['git'], time_stamp, kwargs['description'])
    db.import_results_file(kwargs['resultsfile'], job_id)
    db.create_models_view()
    db.disconnect()
    return {'project_id': project_id, 'job_id': job_id}