1640 lines
64 KiB
Python
1640 lines
64 KiB
Python
"""
|
|
@package pmsco.database
|
|
experimental development of a model database.
|
|
|
|
this module maintains an sqlite3 database of calculation results.
|
|
the database provides a flexible way to query and analyse results of various calculations
|
|
under different filtering and sorting from any client that supports sqlite3.
|
|
|
|
currently, the database module is independent of the core PMSCO,
|
|
and the database has to be maintained manually.
|
|
in the future, calculation results will be added automatically by PMSCO.
|
|
|
|
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
|
|
|
|
@copyright (c) 2016-18 by Paul Scherrer Institut @n
|
|
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
import os
|
|
import datetime
|
|
import logging
|
|
import sqlite3
|
|
import fasteners
|
|
import numpy as np
|
|
import pmsco.dispatch as dispatch
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# make sure sqlite understands numpy data types
#
# numpy scalar types (as produced when indexing structured arrays) are not
# natively supported by the sqlite3 driver; these adapters convert them to
# plain python float/int before they are bound to a query parameter.
sqlite3.register_adapter(np.float64, float)
sqlite3.register_adapter(np.float32, float)
sqlite3.register_adapter(np.int64, int)
sqlite3.register_adapter(np.int32, int)
|
|
|
|
|
|
class _DummyLock(object):
    """
    no-op replacement for an interprocess lock.

    used with an in-memory database, which is private to the process
    and therefore needs no file-based locking.
    the context manager protocol is implemented but does nothing.
    """

    def __enter__(self):
        # mirror the lock interface: entering yields the lock object
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # nothing to release; exceptions are not suppressed
        pass
|
|
|
|
|
|
## mapping of database fields to special parameter names
#
# `_db` parameters are returned by some query methods to identify the database records.
#
# keys are column names of the Models/Results tables,
# values are the corresponding parameter names used in model dictionaries
# (prefixed by a single underscore, or `_db_` for primary-key references).
#
DB_SPECIAL_PARAMS = {"job_id": "_db_job",
                     "model_id": "_db_model",
                     "result_id": "_db_result",
                     "model": "_model",
                     "scan": "_scan",
                     "sym": "_sym",
                     "emit": "_emit",
                     "region": "_region",
                     "gen": "_gen",
                     "particle": "_particle",
                     "rfac": "_rfac"}
|
|
|
|
|
|
## numpy data types of special parameters by database field
#
# this dictionary helps to create a numpy array from a database record.
#
# all identifier fields are 8-byte integers ('i8'),
# the r-factor is an 8-byte float ('f8').
#
DB_SPECIAL_NUMPY_TYPES = {"job_id": "i8",
                          "model_id": "i8",
                          "result_id": "i8",
                          "model": "i8",
                          "scan": "i8",
                          "sym": "i8",
                          "emit": "i8",
                          "region": "i8",
                          "gen": "i8",
                          "particle": "i8",
                          "rfac": "f8"}
|
|
|
|
|
|
def regular_params(d):
    """
    filter regular parameters from a dictionary, record or sequence.

    regular parameters are those whose name does not start with an underscore.

    @param d: dict or numpy.void or pmsco.dispatch.CalcID or sequence.
    the numpy.void type occurs when an element of a structured array is extracted.
    a CalcID contains no regular parameters and yields an empty dictionary;
    it is supported only for symmetry with the special_params function.
    a tuple or list is interpreted as a sequence of parameter names;
    the names of regular parameters are returned unchanged,
    special (underscore-prefixed) names are dropped.

    @return the return type depends on the type of input `d`:
    @arg dict for a dict or numpy.void input (key names unchanged).
    @arg empty dict for a CalcID input.
    @arg tuple or list of names for a tuple or list input.
    """
    if isinstance(d, np.void):
        return {name: d[name] for name in d.dtype.names if name[0] != "_"}
    if isinstance(d, dispatch.CalcID):
        return {}
    if isinstance(d, tuple):
        return tuple(name for name in d if name[0] != "_")
    if isinstance(d, dict):
        return {name: value for name, value in d.items() if name[0] != "_"}
    # any other iterable: treat as a sequence of names, return a list
    return [name for name in d if name[0] != "_"]
|
|
|
|
|
|
def special_params(d):
    """
    filter special parameters from a model dictionary, numpy record or sequence.

    special parameters are those prefixed with a single underscore.
    the underscore is removed from the returned keys.
    fields starting with '_db_' are removed entirely.

    @param d: dict or numpy.void or pmsco.dispatch.CalcID or sequence.
    in the case of a dict or numpy.void,
    the key names of the special parameters must have a leading underscore.
    the numpy.void type occurs when an element of a structured array is extracted.
    in the case of a CalcID, the attribute names become the key names.
    a tuple or list is interpreted as a sequence of parameter names;
    special names are returned with the underscore removed.

    @return the return type depends on the type of input `d`:
    @arg in the case of a dict, numpy.void or CalcID it is a dictionary.
    @arg in the case of a tuple or list the return type is the same as the input.
    """
    def _is_special(name):
        # exactly one leading underscore, not a '_db_' bookkeeping field
        return name[0] == "_" and name[0:4] != "_db_"

    if isinstance(d, np.void):
        return {name[1:]: d[name] for name in d.dtype.names if _is_special(name)}
    if isinstance(d, dispatch.CalcID):
        return d._asdict()
    if isinstance(d, tuple):
        return tuple(name[1:] for name in d if _is_special(name))
    if isinstance(d, dict):
        return {name[1:]: value for name, value in d.items() if _is_special(name)}
    return [name[1:] for name in d if _is_special(name)]
|
|
|
|
|
|
def field_to_param(f):
    """
    translate a database field name to a parameter name.

    field names of optimization parameters are returned unchanged.
    special fields map to their parameter name prefixed by '_' or '_db_'
    according to DB_SPECIAL_PARAMS.

    @param f: (str) database field name.
    @return: (str) parameter name as used in model dictionaries.
    """
    return DB_SPECIAL_PARAMS.get(f, f)
|
|
|
|
|
|
def field_to_numpy_type(f):
    """
    determine the numpy data type string of a database field.

    special fields are looked up in DB_SPECIAL_NUMPY_TYPES;
    all other (model parameter) fields are 8-byte floats.

    @param f: (str) database field name.
    @return: (str) numpy type description, e.g. 'f8'.
    """
    return DB_SPECIAL_NUMPY_TYPES.get(f, 'f8')
|
|
|
|
|
|
class ResultsDatabase(object):
|
|
"""
|
|
interface to a sqlite3 database of calculation results.
|
|
|
|
the class maintains a connection to a specified database file.
|
|
results can be written to and retrieved from the database.
|
|
|
|
to get started in a calculation job, call the following methods:
|
|
- connect()
|
|
- register_project()
|
|
- register_job()
|
|
- register_params()
|
|
|
|
the class also maintains a local registry of model parameters.
|
|
this is basically a dictionary that maps parameter names to parameter ids used in the database.
|
|
because the database can only relate a parameter to a project or job after a value has been inserted,
|
|
it is important to maintain such a local registry.
|
|
|
|
before any access to the ParamValues table, the project parameters must be registered
|
|
by register_params() or query_project_params().
|
|
the first method can be used at any time.
|
|
it should be used before data is inserted into the database.
|
|
if a parameter is already registered in the database, the id is taken from the database
|
|
(parameter names are unique in the whole Params table).
|
|
the second method can be used only when the database already contains data.
|
|
it should be used if the database is read only, or the parameter names are unknown.
|
|
"""
|
|
## schema of the Projects table: one row per project, unique by name.
sql_create_projects = """CREATE TABLE IF NOT EXISTS `Projects` (
    `id` INTEGER PRIMARY KEY,
    `name` TEXT NOT NULL UNIQUE COLLATE NOCASE,
    `code` TEXT COLLATE NOCASE
    )"""
## insert a new project.
sql_insert_project = "insert into Projects(name, code) values (:name, :code)"
## look up a project by primary key.
sql_select_project = "select name, code from Projects where id=:id"
## look up a project by name (names are unique).
sql_select_project_name = "select id, name, code from Projects where name=:name"
## delete a project by primary key (cascades to jobs, models, results).
#
# bug fix: the Projects table has no `project_id` column; its primary key
# is `id`. the previous statement (`where project_id = :project_id`)
# raised sqlite3.OperationalError on every delete_project() call.
sql_delete_project = """delete from Projects where id = :project_id"""
|
|
|
|
## schema of the Jobs table: one calculation job per row, linked to a project.
sql_create_jobs = """CREATE TABLE IF NOT EXISTS `Jobs` (
    `id` INTEGER PRIMARY KEY,
    `project_id` INTEGER,
    `name` TEXT NOT NULL COLLATE NOCASE,
    `mode` TEXT COLLATE NOCASE,
    `machine` TEXT COLLATE NOCASE,
    `git_hash` TEXT,
    `datetime` TEXT,
    `description` TEXT,
    FOREIGN KEY(project_id) REFERENCES Projects(id) ON DELETE CASCADE
    )"""
## insert a new job.
sql_insert_job = """insert into Jobs(project_id, name, mode, machine, git_hash, datetime, description)
    values (:project_id, :name, :mode, :machine, :git_hash, :datetime, :description)"""
## look up a job by primary key.
sql_select_job = """select id, project_id, name, mode, machine, git_hash, datetime
    from Jobs where id = :job_id"""
## look up a job by project and name (job names are unique within a project).
sql_select_job_name = """select id, project_id, name, mode, machine, git_hash, datetime
    from Jobs where project_id=:project_id and name=:name"""
## delete a job by primary key (cascades to models, results, parameter values).
sql_delete_job = """delete from Jobs where id = :job_id"""
|
|
|
|
## schema of the Models table: one row per calculated model, linked to a job.
sql_create_models = """CREATE TABLE IF NOT EXISTS `Models` (
    `id` INTEGER PRIMARY KEY,
    `job_id` INTEGER,
    `model` INTEGER,
    `gen` INTEGER,
    `particle` INTEGER,
    FOREIGN KEY(job_id) REFERENCES Jobs(id) ON DELETE CASCADE
    )"""
## index supporting look-ups by job and model number.
sql_index_models = """create index if not exists
    index_models on Models
    ('job_id', 'model')"""
sql_drop_index_models = "drop index if exists index_models"
## insert a new model.
sql_insert_model = """insert into Models(job_id, model, gen, particle)
    values (:job_id, :model, :gen, :particle)"""
## update an existing model identified by its primary key.
sql_update_model = """update Models
    set model=:model, gen=:gen, particle=:particle
    where id=:model_id"""
## look up a model by primary key.
sql_select_model = """select id, job_id, model, gen, particle
    from Models where id=:id"""
## look up models by model number.
sql_select_model_model = """select id, job_id, model, gen, particle
    from Models where model=:model"""
## look up all models of a job.
sql_select_model_job = """select id, job_id, model, gen, particle
    from Models where job_id=:job_id"""
## delete a model by primary key (cascades to results and parameter values).
#
# bug fix: the Models table has no `model_id` column; its primary key is
# `id`. the previous statement (`where model_id = :model_id`) raised
# sqlite3.OperationalError on every delete_model call.
sql_delete_model = """delete from Models where id = :model_id"""
|
|
|
|
## schema of the Results table: one r-factor per (model, scan, sym, emit, region) task.
sql_create_results = """create table if not exists `Results` (
    `id` INTEGER PRIMARY KEY,
    `model_id` INTEGER,
    `scan` integer,
    `sym` integer,
    `emit` integer,
    `region` integer,
    `rfac` REAL,
    FOREIGN KEY(model_id) REFERENCES Models(id) ON DELETE CASCADE
    )"""
## index supporting look-ups by full task index.
sql_index_results_tasks = """create index if not exists
    `index_results_tasks` ON `Results`
    (`model_id`, `scan`,`sym`,`emit`,`region`)"""
sql_drop_index_results_tasks = "drop index if exists index_results_tasks"
## index supporting look-ups by result and model id.
sql_index_results_models = """create index if not exists
    `index_results_models` ON `Results`
    (`id`, `model_id`)"""
sql_drop_index_results_models = "drop index if exists index_results_models"
## insert a new result.
sql_insert_result = """insert into Results(model_id, scan, sym, emit, region, rfac)
    values (:model_id, :scan, :sym, :emit, :region, :rfac)"""
## update the r-factor of an existing result.
sql_update_result = """update Results
    set rfac=:rfac
    where id=:result_id"""
## look up a result by primary key.
sql_select_result = """select id, model_id, scan, sym, emit, region, rfac
    from Results where id=:id"""
## look up a result by full task index.
sql_select_result_index = """select id, model_id, scan, sym, emit, region, rfac
    from Results where model_id=:model_id and scan=:scan and sym=:sym and emit=:emit and region=:region"""
## delete a result by primary key.
sql_delete_result = """delete from Results where id = :result_id"""
|
|
|
|
## schema of the Params table: global registry of parameter names (unique keys).
sql_create_params = """CREATE TABLE IF NOT EXISTS `Params` (
    `id` INTEGER PRIMARY KEY,
    `key` TEXT NOT NULL UNIQUE COLLATE NOCASE
    )"""
## insert a new parameter key.
sql_insert_param = "insert into Params(key) values (:key)"
## look up a parameter key by primary key.
sql_select_param = "select key from Params where id=:id"
## look up a parameter id by key (keys are unique in the whole table).
sql_select_param_key = "select id, key from Params where key=:key"
## list the distinct parameter keys used by the models of a project.
sql_select_param_project = """select distinct key, param_id from Models
    join Jobs on Models.job_id = Jobs.id
    join ParamValues on Models.id = paramValues.model_id
    join Params on Params.id = ParamValues.param_id
    where Jobs.project_id = :project_id
    order by key collate nocase"""
## list the distinct parameter keys used by the models of a job.
sql_select_param_job = """select distinct key, param_id from Models
    join ParamValues on Models.id = paramValues.model_id
    join Params on Params.id = ParamValues.param_id
    where Models.job_id = :job_id
    order by key collate nocase"""
|
|
|
|
## schema of the ParamValues table: one parameter value per (param, model) pair.
sql_create_paramvalues = """CREATE TABLE IF NOT EXISTS `ParamValues` (
    `id` INTEGER PRIMARY KEY,
    `param_id` INTEGER NOT NULL,
    `model_id` INTEGER NOT NULL,
    `value` REAL,
    FOREIGN KEY(param_id) REFERENCES Params(id) ON DELETE CASCADE,
    FOREIGN KEY(model_id) REFERENCES Models(id) ON DELETE CASCADE
    )"""
## index supporting look-ups by parameter and model.
sql_index_paramvalues = """create index if not exists
    `index_paramvalues` ON `ParamValues`
    (`param_id`, `model_id`)"""
sql_drop_index_paramvalues = "drop index if exists index_paramvalues"
## insert a new parameter value.
sql_insert_paramvalue = """
    insert into ParamValues(param_id, model_id, value)
    values (:param_id, :model_id, :value)
    """
## update an existing parameter value identified by its primary key.
sql_update_paramvalue = """
    update ParamValues set value=:value where id=:paramvalue_id
    """
## list all (key, value) pairs of one model.
sql_select_paramvalue_model = """
    select key, value from ParamValues
    join Params on ParamValues.param_id = Params.id
    where model_id = :model_id
    """
## look up one parameter value of one model.
sql_select_paramvalue = """
    select ParamValues.id as id, key, value from ParamValues
    join Params on ParamValues.param_id = Params.id
    where param_id = :param_id and model_id = :model_id
    """
|
|
|
|
## schema of the Tags table: global registry of tag names (unique keys).
sql_create_tags = """CREATE TABLE IF NOT EXISTS `Tags` (
    `id` INTEGER PRIMARY KEY,
    `key` TEXT NOT NULL UNIQUE COLLATE NOCASE
    )"""
## insert a new tag key.
sql_insert_tag = "insert into Tags(key) values (:key)"
## look up a tag key by primary key.
sql_select_tag = "select key from Tags where id=:id"
## look up a tag id by key (keys are unique).
sql_select_tag_key = "select id, key from Tags where key=:key"
## list the distinct tag keys used by the jobs of a project.
sql_select_tag_project = """select distinct key, tag_id from Jobs
    join JobTags on Jobs.id = JobTags.job_id
    join Tags on Tags.id = JobTags.tag_id
    where Jobs.project_id = :project_id
    order by key collate nocase"""
## list the distinct tag keys used by a job.
sql_select_tag_job = """select distinct key, tag_id from JobTags
    join Tags on Tags.id = JobTags.tag_id
    where JobTags.job_id = :job_id
    order by key collate nocase"""
|
|
|
|
## schema of the JobTags table: one tag value per (tag, job) pair.
sql_create_jobtags = """CREATE TABLE IF NOT EXISTS `JobTags` (
    `id` INTEGER PRIMARY KEY,
    `tag_id` INTEGER NOT NULL,
    `job_id` INTEGER NOT NULL,
    `value` TEXT COLLATE NOCASE,
    FOREIGN KEY(tag_id) REFERENCES Tags(id) ON DELETE CASCADE,
    FOREIGN KEY(job_id) REFERENCES Jobs(id) ON DELETE CASCADE
    )"""
## index supporting look-ups by tag and job.
sql_index_jobtags = """create index if not exists
    `index_jobtags` ON `JobTags`
    (`tag_id`, `job_id`)"""
sql_drop_index_jobtags = "drop index if exists index_jobtags"
## insert a new job tag.
sql_insert_jobtag = """
    insert into JobTags(tag_id, job_id, value)
    values (:tag_id, :job_id, :value)
    """
## update an existing job tag identified by its primary key.
sql_update_jobtag = """
    update JobTags set value=:value where id=:jobtag_id
    """
## list all (key, value) tag pairs of one job.
sql_select_jobtag_job = """
    select key, value from JobTags
    join Tags on JobTags.tag_id = Tags.id
    where job_id = :job_id
    """
## look up one tag value of one job.
sql_select_jobtag = """
    select JobTags.id as id, key, value from JobTags
    join Tags on JobTags.tag_id = Tags.id
    where tag_id = :tag_id and job_id = :job_id
    """
|
|
|
|
# @var _conn (sqlite3.Connection).
|
|
# connection interface to the database.
|
|
#
|
|
# opened and closed by self.connect() and self.disconnect(), respectively.
|
|
# the connection can remain open during the whole process.
|
|
#
|
|
# @note sqlite3.Connection is re-usable but not re-entrant.
|
|
# Be careful not to nest contexts when calling other methods from within this class!
|
|
|
|
# @var _db_filename (str).
|
|
# path and name of the database file or ":memory:" for an in-memory database.
|
|
|
|
# @var project_id (int).
|
|
# id of the current project in the primary key of the Projects table.
|
|
#
|
|
# set by self.register_project().
|
|
# new jobs and models are linked to this project id by default.
|
|
|
|
# @var job_id (int).
|
|
# id of the current job in the primary key of the Jobs table.
|
|
#
|
|
# set by self.register_job().
|
|
# new models are linked to this job id by default.
|
|
|
|
# @var _model_params (dict).
|
|
# dictionary of model parameters used in the current project.
|
|
#
|
|
# set by self.register_params() and self.register_param().
|
|
# read by self.add_model().
|
|
|
|
# @var _lock_filename (str).
|
|
# path and name of the lock file or an empty string if no locking is used.
|
|
|
|
# @var _lock (obj).
|
|
# context manager which provides a locking mechanism for the database.
|
|
#
|
|
# this is either a fasteners.InterprocessLock or _DummyLock.
|
|
# InterprocessLock allows to serialize access to the database by means of a lock file.
|
|
# _DummyLock is used with an in-memory database which does not require locking.
|
|
#
|
|
# @note InterprocessLock is re-usable but not re-entrant.
|
|
# Be careful not to nest contexts when calling other methods from within this class!
|
|
|
|
def __init__(self):
    """initialize an unconnected database object with empty registries."""
    # connection objects: None until connect() is called
    self._conn = None
    self._lock = None
    # file names: set by connect(); empty lock filename means in-memory db
    self._db_filename = ""
    self._lock_filename = ""
    # current project and job ids, set by register_project()/register_job()
    self.project_id = 0
    self.job_id = 0
    # local registries mapping parameter/tag names to their database ids
    self._model_params = {}
    self._tags = {}
|
|
|
|
def connect(self, db_filename, lock_filename=""):
    """
    open a connection to a new or existing database file.

    if the file does not exist, or if it contains no schema,
    a new database schema is created.

    @param db_filename: name of a file or ":memory:" for an in-memory database.

    @param lock_filename: name of a file that is used to lock the database.
    by default, db_filename with a ".lock" suffix is used,
    which is fine for most uses;
    the argument exists mainly for testing the locking functionality.
    the file must not be used for anything else and does not need to exist.
    it's best kept in the same directory as the database file.
    an in-memory database uses no lock file.

    @return: None
    """
    self._db_filename = db_filename

    # choose the lock file: explicit argument wins,
    # an in-memory database needs no locking at all
    if lock_filename:
        self._lock_filename = lock_filename
    elif db_filename == ":memory:":
        self._lock_filename = ""
    else:
        self._lock_filename = db_filename + ".lock"

    if self._lock_filename:
        self._lock = fasteners.InterProcessLock(self._lock_filename)
    else:
        self._lock = _DummyLock()

    self._conn = sqlite3.connect(self._db_filename)
    # return rows as sqlite3.Row so columns can be addressed by name
    self._conn.row_factory = sqlite3.Row
    with self._lock:
        # foreign key enforcement is off by default in sqlite3
        self._conn.execute("PRAGMA foreign_keys = 1")
        self._conn.commit()
        cursor = self._conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='Models'")
        row = cursor.fetchone()
        # missing Models table means the schema has not been created yet
        if not row or not row[0]:
            self.create_schema()
|
|
|
|
def disconnect(self):
    """
    close the connection to the database.

    uncommitted transactions are lost.
    data of an in-memory database is lost.
    it is safe to call this method more than once.

    @return: None
    """
    if self._conn is not None:
        self._conn.close()
    self._conn = None
    self._lock = None
|
|
|
|
def check_connection(self):
    """
    check that the database is connected.

    verifies that self.connect has been called.
    most ResultsDatabase methods call this before using the database:
    it produces an understandable exception message when the connect call
    has been forgotten (without it, python would raise the cryptic
    `AttributeError: __exit__`).

    @return None

    @raise AssertionError if the connection is not valid.
    """
    assert self._lock is not None and self._conn is not None, "database not connected"
|
|
|
|
def create_schema(self):
    """
    create the database schema (tables and indices).

    safe to apply to an existing schema:
    all statements include "if not exists",
    so only missing objects are created.

    @return: None
    """
    self.check_connection()
    # tables first (indices refer to them), in dependency order
    statements = (self.sql_create_projects,
                  self.sql_create_jobs,
                  self.sql_create_models,
                  self.sql_create_results,
                  self.sql_create_params,
                  self.sql_create_paramvalues,
                  self.sql_create_tags,
                  self.sql_create_jobtags,
                  self.sql_index_results_tasks,
                  self.sql_index_results_models,
                  self.sql_index_paramvalues,
                  self.sql_index_jobtags,
                  self.sql_index_models)
    with self._lock, self._conn:
        for statement in statements:
            self._conn.execute(statement)
|
|
|
|
def register_project(self, name, code):
    """
    register a project with the database.

    if a project of the same name already exists, its id is returned;
    otherwise a new row is inserted.
    in both cases self.project_id is updated.

    @param name: name of the project. alphanumeric characters only. no spaces or special characters!

    @param code: name of the pmsco module that defines the project.

    @return: id value of the project in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        row = self._conn.execute(self.sql_select_project_name, {'name': name}).fetchone()
        if row:
            project_id = row[0]
        else:
            cursor = self._conn.execute(self.sql_insert_project, {'name': name, 'code': code})
            project_id = cursor.lastrowid
        self.project_id = project_id

    return project_id
|
|
|
|
def delete_project(self, project_id):
    """
    delete a project from the database.

    all dependent jobs, models, results and parameter values
    are removed by the cascading foreign keys.

    @param project_id: id value of the project entry in the database.

    @return None
    """
    self.check_connection()
    with self._lock, self._conn:
        self._conn.execute(self.sql_delete_project, {'project_id': project_id})
|
|
|
|
def register_job(self, project_id, name, mode, machine, git_hash, _datetime, description=None):
    """
    register a calculation job with the database.

    if a job of the same name already exists in the project,
    its id is returned and no row is inserted.
    in both cases self.job_id is updated.

    @param project_id: identifier of the project. see register_project().

    @param name: name of the job. up to the user, must be unique within a project.

    @param mode: optimization mode string (should be same as command line argument).

    @param machine: name of the machine executing the calculation. up to the user.

    @param git_hash: git revision (hash) of the project code.

    @param _datetime: (datetime.datetime) date and time.

    @param description: meaningful description of the calculation job, up to the user.

    @return: id value of the job in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        row = self._conn.execute(self.sql_select_job_name,
                                 {'project_id': project_id, 'name': name}).fetchone()
        if row:
            job_id = row[0]
        else:
            job_fields = {'project_id': project_id,
                          'name': name,
                          'mode': mode,
                          'machine': machine,
                          'git_hash': git_hash,
                          'datetime': _datetime,
                          'description': description}
            job_id = self._conn.execute(self.sql_insert_job, job_fields).lastrowid
        self.job_id = job_id

    return job_id
|
|
|
|
def delete_job(self, job_id):
    """
    delete a job from the database.

    all dependent models, results and parameter values
    are removed by the cascading foreign keys.

    @param job_id: id value of the job entry in the database.

    @return None
    """
    self.check_connection()
    with self._lock, self._conn:
        self._conn.execute(self.sql_delete_job, {'job_id': job_id})
|
|
|
|
def _query_job_name(self, job_name, project_id=0):
    """
    (internal) query a job by name.

    this is the internal analog of @ref query_job_name:
    the caller must already hold the lock and an open connection.

    @param job_name: name of the job.

    @param project_id: project identifier.
    by default, the current project self.project_id is used.

    @return: id value of the job in the database.

    @raise sqlite3.DatabaseError if the job can't be found.
    """
    if project_id == 0:
        project_id = self.project_id
    param_dict = {'project_id': project_id, 'name': job_name}
    c = self._conn.execute(self.sql_select_job_name, param_dict)
    v = c.fetchone()
    # bug fix: a missing job previously raised an obscure
    # "TypeError: 'NoneType' object is not subscriptable" from v[0]
    # instead of the DatabaseError promised by the docstring.
    if v is None:
        raise sqlite3.DatabaseError(
            "job '{0}' not found in project {1}".format(job_name, project_id))
    return v[0]
|
|
|
|
def query_job_name(self, job_name, project_id=0):
    """
    query a job by name.

    @param job_name: name of the job.

    @param project_id: project identifier.
    by default, the current project self.project_id is used.

    @return: id value of the job in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        return self._query_job_name(job_name, project_id=project_id)
|
|
|
|
def register_param(self, key):
    """
    register a parameter key with the database.

    each parameter name must be registered once before a value can be
    written to the database. see the class description for an explanation.

    @param key: key (name) of the parameter.

    @return: id value of the parameter in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        param_id = self._register_param(key)
    return param_id
|
|
|
|
def _register_param(self, key):
    """
    look up or create a parameter key without locking or committing.

    @note this method does not lock the database file and does not commit.
    call the public method register_param() for the locking and
    committing variant.

    @param key: key (name) of the parameter.

    @return: id value of the parameter in the database.
    """
    row = self._conn.execute(self.sql_select_param_key, {'key': key}).fetchone()
    if row:
        param_id = row[0]
    else:
        param_id = self._conn.execute(self.sql_insert_param, {'key': key}).lastrowid
    # keep the local registry in sync with the database
    self._model_params[key] = param_id
    return param_id
|
|
|
|
def register_params(self, model_params):
    """
    register the model parameters of this project with the database.

    each parameter name must be registered once before a value can be
    written to the database. see the class description for an explanation.
    special (underscore-prefixed) names are skipped.

    @param model_params: sequence of model parameter keys, or dictionary of model parameters.
    @return: None
    """
    self.check_connection()
    with self._lock, self._conn:
        for key in model_params:
            # special parameters are not stored in the Params table
            if key[0] != '_':
                self._register_param(key)
|
|
|
|
def query_project_params(self, project_id=0, job_id=0, update_registry=False):
    """
    query a list of model parameters used in a project or job.

    optionally, the local registry can be updated with the results of the query.
    this should be done if the database is read only and the client does not
    know the parameter names. see the class description for details.

    @note only parameters that are attached to stored models are returned.
    parameters that were registered but never used in a model are _not_ listed.

    @param project_id: project identifier.
    by default, the current project self.project_id is used.

    @param job_id: job identifier.
    by default, all jobs of the selected project are included in the query.
    if a job is specified, the project_id parameter is ignored.

    @param update_registry: update the local parameter registry
    (self._model_params) with the query results.

    @return: dictionary of project parameters.
    the keys are the parameter names, the values are the parameter ids in the database.
    """
    if project_id == 0:
        project_id = self.project_id
    if job_id:
        sql = self.sql_select_param_job
        args = {'job_id': job_id}
    else:
        sql = self.sql_select_param_project
        args = {'project_id': project_id}

    self.check_connection()
    params = {}
    with self._lock, self._conn:
        for row in self._conn.execute(sql, args):
            params[row['key']] = row['param_id']

    if update_registry:
        self._model_params.update(params)

    return params
|
|
|
|
def register_tag(self, key):
    """
    register a tag with the database.

    tags are a way of structuring a job description.
    they can be used to, for instance, distinguish calculations made with
    different clusters, different experimental data, etc.
    a job tag has a key and a value, and is associated to a job.
    the use of tags is up to the user. pmsco does not change or read them.

    each tag name must be registered once before a value can be written
    to the database. see the class description for an explanation.

    @param key: key (name) of the tag.

    @return: id value of the tag in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        tag_id = self._register_tag(key)
    return tag_id
|
|
|
|
def _register_tag(self, key):
    """
    look up or create a tag key without committing the transaction.

    @note this method does not lock the database file and does not commit.
    call the public method register_tag() for the locking and
    committing variant.

    @param key: key (name) of the tag.

    @return: id value of the tag in the database.
    """
    row = self._conn.execute(self.sql_select_tag_key, {'key': key}).fetchone()
    if row:
        tag_id = row[0]
    else:
        tag_id = self._conn.execute(self.sql_insert_tag, {'key': key}).lastrowid
    # keep the local registry in sync with the database
    self._tags[key] = tag_id
    return tag_id
|
|
|
|
def register_tags(self, tags):
    """
    register the tags of this project with the database.

    each tag name must be registered once before a value can be written
    to the database. see the class description for an explanation.

    @param tags: sequence of tag keys, or dictionary of tags.
    @return: None
    """
    self.check_connection()
    with self._lock, self._conn:
        for key in tags:
            self._register_tag(key)
|
|
|
|
def query_tags(self, project_id=0, job_id=0, update_registry=False):
    """
    query a list of tag keys used in a project or job.

    optionally, the local registry can be updated with the results of the query.
    this should be done if the database is read only and the client does not
    know the tag names. see the class description for details.

    @note only tags that are attached to jobs in the database are returned.
    tags that were registered but never attached to a job are _not_ listed.

    @param project_id: project identifier.
    by default, the current project self.project_id is used.

    @param job_id: job identifier.
    by default, all jobs of the selected project are included in the query.
    if a job is specified, the project_id parameter is ignored.

    @param update_registry: update the local tags registry (self._tags)
    with the query results.

    @return: dictionary of tags.
    the keys are the tag names, the values are the tag ids in the database.
    """
    if project_id == 0:
        project_id = self.project_id
    if job_id:
        sql = self.sql_select_tag_job
        args = {'job_id': job_id}
    else:
        sql = self.sql_select_tag_project
        args = {'project_id': project_id}

    self.check_connection()
    tags = {}
    with self._lock, self._conn:
        for row in self._conn.execute(sql, args):
            tags[row['key']] = row['tag_id']

    if update_registry:
        self._tags.update(tags)

    return tags
|
|
|
|
def query_job_tags(self, job_id):
    """
    query a list of tags (keys and values) associated with a job.

    @param job_id: job identifier.

    @return: dictionary of tags.
    the keys are the tag names, the values are the tag values.
    """
    self.check_connection()
    tags = {}
    with self._lock, self._conn:
        for row in self._conn.execute(self.sql_select_jobtag_job, {'job_id': job_id}):
            tags[row['key']] = row['value']

    return tags
|
|
|
|
def insert_jobtags(self, job_id, tags):
    """
    add or update job tags in the database.

    the method updates the JobTags table.
    tags that already exist for the job are updated with the new value,
    others are inserted.

    @param job_id: (int) primary key of the job entry in the Jobs table.
    the entry must exist.

    @param tags: (dict) dictionary containing the tags.
    keys are matched or added to the Tags table,
    values are added to the JobTags table and linked to the job and tag key.

    @return: None
    """
    self.check_connection()
    with self._lock, self._conn:
        for key, value in tags.items():
            try:
                # fast path: tag id is already in the local registry
                tag_id = self._tags[key]
            except KeyError:
                # unknown tag: register it.
                # a freshly registered tag cannot have a JobTags row yet,
                # so the update path below is skipped (v = None).
                tag_id = self._register_tag(key)
                v = None
            else:
                # known tag: look for an existing JobTags row to update
                jobtag_entry = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
                c = self._conn.execute(self.sql_select_jobtag, jobtag_entry)
                v = c.fetchone()

            if v:
                # row exists: update the tag value in place
                jobtag_entry = {'jobtag_id': v[0], 'tag_id': tag_id, 'job_id': job_id, 'value': value}
                self._conn.execute(self.sql_update_jobtag, jobtag_entry)
            else:
                # no row yet: link the tag value to the job
                jobtag_entry = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
                self._conn.execute(self.sql_insert_jobtag, jobtag_entry)
|
|
|
|
def create_models_view(self, job_id=0, temporary=False):
    """
    create a flat (pivot) view of model parameters of the current project or job.

    the view is an SQL view that can be used like other tables in select statements.
    the name of the view is the project or job id prefixed by
    @c ViewModelsProject or @c ViewModelsJob, respectively.
    the view contains the 'job_id', 'model_id' and 'model' columns of the Models table
    as well as columns of the de-normalized ParamValues table
    where the column names are the respective parameter names.
    the list of parameter names is combined from the private registry and the database.

    the method generates the SQL to create the view and executes it.
    a previous view of the same name is dropped.

    @note the view needs to be re-created whenever the parameter dictionary has changed.

    @param job_id: job identifier.
        by default (0), a project-level view of all jobs of the current project is created.
        if a job is specified, a view of that job is created instead.

    @param temporary: if True, the view is temporary and dropped when the
        database connection is closed.
        otherwise (by default), the view is stored with the database.

    @return sql string that was used to create the view.
    """
    project_id = self.project_id
    self.check_connection()
    params = self.query_project_params(project_id, job_id)
    params.update(self._model_params)
    param_names = sorted(params, key=lambda s: s.lower())
    with self._lock, self._conn:
        if job_id:
            view_name = "ViewModelsJob{0}".format(job_id)
        else:
            view_name = "ViewModelsProject{0}".format(project_id)

        self._conn.execute("drop view if exists {0}".format(view_name))

        # one pivot column per registered parameter:
        # picks the parameter's value out of the key/value rows
        col_fmt = "max(case when key='{0}' then value end) {0}"
        pivot_cols = ", ".join(col_fmt.format(name) for name in param_names)

        segments = [
            "create {1} view {0} as ".format(view_name, "temp" if temporary else ""),
            "select job_id, Models.id as model_id, model, gen, particle, ",
            pivot_cols,
            " from Models ",
            " join ParamValues on Models.id = ParamValues.model_id ",
            " join Params on Params.id = ParamValues.param_id ",
            " join Jobs on Jobs.id = Models.job_id ",
        ]
        if job_id:
            segments.append(" where Models.job_id = {job_id} ".format(job_id=job_id))
        else:
            segments.append(" where Jobs.project_id = {project_id} ".format(project_id=project_id))
        segments.append(" group by model_id ")
        sql = "".join(segments)

        self._conn.execute(sql)

    return sql
def insert_model(self, model_params):
    """
    add calculated model parameters to the database.

    this adds to the Models and ParamValues tables.

    @param model_params: dictionary containing all model parameters and control variables.
        the '_model' value is required.
        '_gen' and '_particle' are optional; they default to None.
        the names of the regular model parameters (no leading underscore)
        must have been registered with register_params() beforehand.

    @return: id value of the new model entry in the database.

    @raise KeyError if a parameter hasn't been registered.
    """
    self.check_connection()
    with self._lock, self._conn:
        # Models record: defaults first, then the special (_-prefixed) values
        record = {'job_id': self.job_id, 'gen': None, 'particle': None}
        record.update(special_params(model_params))
        cursor = self._conn.execute(self.sql_insert_model, record)
        model_id = cursor.lastrowid

        # one ParamValues record per regular parameter
        for name, value in model_params.items():
            if name[0] != '_':
                cursor.execute(self.sql_insert_paramvalue,
                               {'param_id': self._model_params[name],
                                'model_id': model_id,
                                'value': value})

    return model_id
def delete_model(self, model_id):
    """
    delete a model from the database.

    this will delete all dependent results and parameter values.

    @param model_id: id value of the model entry in the database.

    @return None
    """
    self.check_connection()
    with self._lock, self._conn:
        self._conn.execute(self.sql_delete_model, {'model_id': model_id})
def query_model(self, model_id):
    """
    retrieve model parameters and control variables from the database.

    regular parameters are read from ParamValues;
    special (underscore-prefixed) values are taken from the Models row.

    @param model_id: id of the model in the database.

    @return: dict of parameter names and values.
    """
    self.check_connection()
    with self._lock, self._conn:
        cursor = self._conn.execute(self.sql_select_paramvalue_model, {'model_id': model_id})
        result = {row['key']: row['value'] for row in cursor}

        cursor = self._conn.execute(self.sql_select_model, {'id': model_id})
        model_row = dict(cursor.fetchone())
        # merge the special columns; skip unset (None) values
        for key, value in model_row.items():
            if key[0] == '_' and value is not None:
                result[key] = value

    return result
def insert_model_array(self, array):
    """
    add a numpy array of calculated model parameters to the database.

    each array element is converted to a dict and passed to insert_model().

    @param array: numpy structured array as it is used, e.g.,
        in pmsco.optimizers.population.Population.

    @return: None
    """
    assert isinstance(array, np.ndarray)

    for record in array:
        self.insert_model({name: record[name] for name in record.dtype.names})
def query_model_array(self, filter=None, limit=0):
    """
    load a range of models of the current project from the database.

    the project id is implicitly given by the object attribute;
    it's not possible to combine results from multiple projects.

    @param filter: list of filter expressions.
        each expression is a relational expression of the form
        <code>field operator value</code>,
        where field is a unique field name of the Projects, Jobs, Models or Results table,
        e.g. `job_id`, `model`, `rfac`, `scan`, `sym`, etc.
        operator is one of the relational operators in SQL syntax.
        value is a numeric or string constant, the latter including single or double quotes.
        None or an empty list (default) applies no extra filtering.
        the project should not be specified here.

    @param limit: maximum number of models to read.
        0 = return all models that match the filter criteria.

    @return numpy structured array as it is used, e.g.,
        in pmsco.optimizers.population.Population.
        the function uses the data type only. data in the array is overwritten.
        the array is sorted by model_id.

    @raise OperationalError if the view ViewModelsProject{0} is missing
        in the database schema (see create_models_view).
    """
    self.check_connection()
    # work on a private copy of the filter expressions.
    # bug fix: the previous version declared `filter=[]` and appended the
    # project clause in place, so the shared default (and any caller-supplied
    # list) accumulated one project clause per call.
    conditions = list(filter) if filter else []
    conditions.append(" project_id = {0} ".format(self.project_id))
    with self._lock, self._conn:
        # step 1: select the ids of matching models.
        # the joins make fields of all related tables available to the filter.
        sql = "select distinct Models.id as model_id, model "
        sql += "from Models "
        sql += "join Results on Models.id = Results.model_id "
        sql += "join Jobs on Models.job_id = Jobs.id "
        sql += "join Projects on Jobs.project_id = Projects.id "
        sql += "where "
        sql += " and ".join(conditions)
        sql += " "
        sql += "order by model_id "
        if limit:
            sql += "limit {0} ".format(limit)
        c = self._conn.execute(sql)
        models = c.fetchall()
        count = len(models)
        model_ids = [row['model_id'] for row in models]

        # step 2: fetch the de-normalized parameter values of those models
        # from the project's pivot view (created by create_models_view).
        sql = "select * "
        sql += "from ViewModelsProject{0} ".format(self.project_id)
        sql += "where model_id in ({0}) ".format(",".join("?" * len(model_ids)))
        sql += "order by model_id "
        c = self._conn.execute(sql, model_ids)
        results = c.fetchall()

        # assemble the structured output array from the view columns
        names = [desc[0] for desc in c.description]
        dt = np.dtype([(field_to_param(n), field_to_numpy_type(n))
                       for n in sorted(names, key=str.lower)])
        out_array = np.zeros((count,), dtype=dt)
        for idx, row in enumerate(results):
            for name in names:
                try:
                    out_array[field_to_param(name)][idx] = row[name]
                except TypeError:
                    # can't store None in an integer field - leave it at 0
                    pass

    return out_array
def query_best_results(self, filter=None, limit=0):
    """
    load a range of results of the current project from the database.

    @param filter: list of filter expressions.
        each expression is a relational expression of the form
        <code>field operator value</code>,
        where field is a unique field name of the Projects, Jobs, Models or Results table,
        e.g. `job_id`, `model`, `rfac`, `scan`, `sym`, etc.
        operator is one of the relational operators in SQL syntax.
        value is a numeric or string constant, the latter including single or double quotes.
        None or an empty list (default) applies no extra filtering.
        the project should not be specified here.

    @param limit: maximum number of results to read.
        0 = return all results that match the filter criteria.

    @return numpy structured array as it is used, e.g.,
        in pmsco.optimizers.population.Population.
        the function uses the data type only. data in the array is overwritten.
        the array is sorted by rfac.
    """
    self.check_connection()
    # work on a private copy of the filter expressions.
    # bug fix: the previous version declared `filter=[]` and appended the
    # project clause in place, so the shared default (and any caller-supplied
    # list) accumulated one project clause per call.
    conditions = list(filter) if filter else []
    conditions.append(" project_id = {0} ".format(self.project_id))
    with self._lock, self._conn:
        sql = "select Results.id as result_id, model_id, job_id, "
        sql += "model, scan, sym, emit, region, rfac, gen, particle "
        sql += "from Models "
        sql += "join Results on Models.id = Results.model_id "
        sql += "join Jobs on Models.job_id = Jobs.id "
        sql += "join Projects on Jobs.project_id = Projects.id "
        sql += "where "
        sql += " and ".join(conditions)
        sql += " "
        sql += "order by rfac, job_id, model, scan, sym, emit, region "
        if limit:
            sql += "limit {0} ".format(limit)
        c = self._conn.execute(sql)
        results = c.fetchall()
        count = len(results)

        # assemble the structured output array from the result columns
        names = [desc[0] for desc in c.description]
        dt = np.dtype([(field_to_param(n), field_to_numpy_type(n)) for n in sorted(names)])
        out_array = np.zeros((count,), dtype=dt)
        for idx, row in enumerate(results):
            for name in names:
                try:
                    out_array[field_to_param(name)][idx] = row[name]
                except TypeError:
                    # can't store None in an integer field - leave it at 0
                    pass

    return out_array
def query_best_models_per_jobs(self, job_ids=None, task_level='model'):
    """
    return the best model (by rfac) of each selected job.

    the query gathers the R-factors of the selected jobs at the selected task levels
    and, for each job, returns the (database) model id where the lowest R-factor
    is reported among the gathered results.

    this can be useful if you want to compile a report of the best model per job.

    @param job_ids: iterable of job ids to include in the query.
        the job ids must belong to the current project.
        if empty or unspecified, all jobs of the current project are included.

    @param task_level: element of or index into @ref pmsco.dispatch.CALC_LEVELS.
        deepest task level to include in the query;
        results on deeper levels are not considered.
        e.g. if you pass 'scan', R-factors of individual scans are included.
        note that including deeper levels will not increase the number of
        results returned.

    @return sequence of model_id.
        one element per job in the filter scope.
        to find out details of the models, execute another query
        that filters on these model ids.
    """
    # translate task_level (name or numeric index) into the name of the
    # first level that must be "undefined" (-1) in the matching results
    try:
        level = dispatch.CALC_LEVELS.index(task_level) + 1
    except ValueError:
        # task_level is already a numeric index
        level = task_level + 1
    try:
        level_name = dispatch.CALC_LEVELS[level]
    except IndexError:
        # deepest level requested: filter on the last level
        level_name = dispatch.CALC_LEVELS[4]

    self.check_connection()
    with self._lock, self._conn:
        clauses = [
            "select Models.id from Models ",
            "join Results on Models.id = Results.model_id ",
            "join Jobs on Models.job_id = Jobs.id ",
            "where project_id = {0} ".format(self.project_id),
            "and {0} = -1 ".format(level_name),
        ]
        if job_ids:
            clauses.append("and Models.job_id in ({0}) ".format(",".join(map(str, job_ids))))
        clauses.append("group by Models.job_id ")
        # having min(rfac): keep the row with the lowest rfac per job
        clauses.append("having min(rfac) ")
        clauses.append("order by rfac, job_id, model, scan, sym, emit, region ")
        cursor = self._conn.execute("".join(clauses))
        best = [row['id'] for row in cursor]

    return best
def query_tasks(self, job_id=0):
    """
    query the task index used in a calculation job.

    this query neglects the model index
    and returns the unique tuples (-1, scan, sym, emit, region).

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return list of pmsco.dispatch.CalcID tuples of task indices.
        the model attribute is -1 in all elements.
    """
    job_id = job_id or self.job_id

    self.check_connection()
    with self._lock, self._conn:
        sql = ("select scan, sym, emit, region "
               "from Models "
               "join Results on Models.id = Results.model_id "
               "join Jobs on Models.job_id = Jobs.id "
               "join Projects on Jobs.project_id = Projects.id "
               "where job_id = :job_id")
        rows = self._conn.execute(sql, {'job_id': job_id}).fetchall()

    # wrap each row in a CalcID with the model index forced to -1
    output = []
    for row in rows:
        fields = dict(row)
        fields['model'] = -1
        output.append(dispatch.CalcID(**fields))

    return output
def query_best_task_models(self, level, count, job_id=0):
    """
    query the N best models (by rfac) per task.

    this query is used by the file tracker to determine the models to keep.

    @param level: level up to which to query.
        the level can be specified by level name (str) or numeric index (0..4).
        if it is scan (equivalent to 1), the method queries the model
        and scan levels.

    @param count: number of models to query per task.

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return set of matching model numbers (Models.model field).
    """
    job_id = job_id or self.job_id

    # accept either a numeric level or a level name
    try:
        level = int(level)
    except ValueError:
        level = dispatch.CALC_LEVELS.index(level)
    assert 0 <= level < len(dispatch.CALC_LEVELS)

    # per-task query: the N lowest-rfac models of one task index
    task_sql = ("select model "
                "from Models "
                "join Results on Models.id = Results.model_id "
                "where Models.job_id = :job_id "
                "and scan = :scan "
                "and sym = :sym "
                "and emit = :emit "
                "and region = :region "
                "order by rfac "
                "limit :limit ")

    tasks = self.query_tasks(job_id)
    best_models = set()
    with self._lock, self._conn:
        for task in tasks:
            # skip tasks deeper than the requested level
            if task.numeric_level > level:
                continue
            args = task._asdict()
            del args['model']
            args['job_id'] = job_id
            args['limit'] = count
            rows = self._conn.execute(task_sql, args).fetchall()
            best_models.update(row['model'] for row in rows)

    return best_models
def insert_result(self, index, result, job_id=0):
    """
    add or update a result in the database.

    the method updates the Models, Results and ParamValues tables.

    the model is identified by job_id and index.model;
    the result is identified by job_id and index.
    if the model or result exists in the database, it is updated.

    @param index: (pmsco.dispatch.CalcID or dict) calculation index.
        in case of dict, the keys must be the attribute names of CalcID
        prefixed with an underscore, i.e.
        '_model', '_scan', '_sym', '_emit', '_region'.
        extra values in the dictionary are ignored.
        undefined indices must be -1.

    @param result: (dict) dictionary containing the parameter values
        and the '_rfac' result.
        may also contain the special values '_gen' and '_particle',
        which default to None if not present.

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return: id of the new or updated Results entry.
    """
    job_id = job_id or self.job_id

    self.check_connection()
    # single transaction: model record, result record, parameter values
    with self._lock, self._conn:
        model_id = self._insert_result_model(job_id, index, result)
        result_id = self._insert_result_data(model_id, index, result)
        self._insert_result_paramvalues(model_id, result)

    return result_id
def _insert_result_model(self, job_id, index, result):
    """
    add or update the model entry for a calculation result in the database.

    the method updates the Models table.
    the database and transaction must be locked by the caller.

    the model is identified by job_id and index.model.
    if the model exists in the database, it is updated.

    @param job_id: (int) primary key of the associated Jobs entry.
        no default value! passing an invalid job_id raises an exception.

    @param index: (pmsco.dispatch.CalcID or dict) calculation index.
        in case of dict, the keys must be the attribute names of CalcID
        prefixed with an underscore ('_model', '_scan', ...).
        extra values in the dictionary are ignored.
        undefined indices must be -1.

    @param result: (dict) dictionary containing the parameter values
        and the '_rfac' result.
        may also contain the special values '_gen' and '_particle',
        which default to None if not present.

    @return: id of the new or updated Models entry.
    """
    entry = {'gen': None, 'particle': None}
    entry.update(special_params(result))
    entry['job_id'] = job_id
    # accept CalcID (attribute access) as well as dict index
    try:
        entry['model'] = index.model
    except AttributeError:
        entry['model'] = index['_model']

    existing = self._conn.execute(self.sql_select_model_model, entry).fetchone()
    if existing:
        entry['model_id'] = existing[0]
        self._conn.execute(self.sql_update_model, entry)
        return entry['model_id']

    cursor = self._conn.execute(self.sql_insert_model, entry)
    return cursor.lastrowid
def _insert_result_data(self, model_id, index, result):
    """
    add or update a result in the database.

    the method updates the Results table.
    the database and transaction must be locked by the caller.

    the result is identified by model_id and index.
    if the result exists in the database, it is updated.

    @param model_id: (int) primary key of the model entry in the Models table.
        the entry must exist.

    @param index: (pmsco.dispatch.CalcID or dict) calculation index.
        in case of dict, the keys must be the attribute names of CalcID
        prefixed with an underscore ('_model', '_scan', ...).
        extra values in the dictionary are ignored.
        undefined indices must be -1.

    @param result: (dict) dictionary containing the parameter values
        and the '_rfac' result.

    @return: id of the new or updated Results entry.
    """
    entry = special_params(result)
    entry['model_id'] = model_id
    # the index components (scan, sym, emit, region) identify the result
    entry.update(special_params(index))

    existing = self._conn.execute(self.sql_select_result_index, entry).fetchone()
    if existing:
        entry['result_id'] = existing[0]
        self._conn.execute(self.sql_update_result, entry)
        return entry['result_id']

    cursor = self._conn.execute(self.sql_insert_result, entry)
    return cursor.lastrowid
def _insert_result_paramvalues(self, model_id, result):
    """
    add or update parameter values of a model in the database.

    the method updates the ParamValues table.
    the database and transaction must be locked by the caller.

    @param model_id: (int) primary key of the model entry in the Models table.
        the entry must exist.

    @param result: (dict) dictionary containing the parameter values.
        the parameter names must exist in the Params table and in the
        self._model_params dictionary.
        special values (with a leading underscore) are ignored.

    @return: None

    @raise: KeyError if a parameter key is not registered.
    """
    for name, value in regular_params(result).items():
        entry = {'param_id': self._model_params[name],
                 'model_id': model_id,
                 'value': value}
        existing = self._conn.execute(self.sql_select_paramvalue, entry).fetchone()
        if existing:
            entry['paramvalue_id'] = existing[0]
            self._conn.execute(self.sql_update_paramvalue, entry)
        else:
            self._conn.execute(self.sql_insert_paramvalue, entry)
def import_results_file(self, filename, job_id=0):
    """
    import a results file into the database.

    the results file is a space-delimited, general text file
    such as produced by pmsco.optimizers.population.Population.save_array().
    each line contains one result dataset, the columns correspond to the regular and special parameters.
    the first row contains the parameter names.

    a job entry with the given id must exist,
    but there must be no model entries referencing the job.
    it is not possible to update existing models, results or parameter values using this method.
    instead, you have to delete the job (which also deletes all dependent entries)
    and re-import the results.

    @param filename: path and name of the results file.

    @param job_id: (int) id of the associated Jobs entry.
        if 0, self.job_id is used.

    @return: None.

    @raise ValueError if the job already has model entries.
    """
    if not job_id:
        job_id = self.job_id

    # names=True: column (parameter) names are read from the first row
    data = np.genfromtxt(filename, names=True)
    self.register_params(data.dtype.names)
    try:
        # one model per distinct '_model' value;
        # unique_index marks the first occurrence of each model number
        unique_models, unique_index = np.unique(data['_model'], True)
    except ValueError:
        # numpy raises ValueError for a missing '_model' column:
        # treat the whole file as a single model numbered 0
        unique_models = np.array([0])
        unique_index = np.array([0])
    unique_data = data[unique_index]
    # model number -> Models primary key; rebound after the Models insert below
    model_ids = {}

    def model_entry_generator():
        # one Models row per unique model.
        # 'model' defaults to unique_models[0], which covers the missing
        # '_model' column case; when '_model' exists, special_params
        # overrides it with the row's own model number.
        for result in unique_data:
            model_entry = {'job_id': job_id,
                           'model': unique_models[0],
                           'gen': None,
                           'particle': None}
            model_entry.update(special_params(result))
            yield model_entry

    def result_entry_generator():
        # one Results row per file line, linked to its model via model_ids.
        # the generator is evaluated lazily by executemany, after model_ids
        # has been rebound to the filled dictionary.
        for result in data:
            try:
                model = result['_model']
            except ValueError:
                # no '_model' column: all rows belong to the single model
                model = unique_models[0]
            result_entry = {'model_id': model_ids[model],
                            'scan': -1,
                            'sym': -1,
                            'emit': -1,
                            'region': -1,
                            'rfac': None}
            result_entry.update(special_params(result))
            yield result_entry

    def param_entry_generator():
        # one ParamValues row per regular parameter of each unique model.
        # also evaluated lazily, after model_ids has been filled.
        for result in unique_data:
            try:
                model = result['_model']
            except ValueError:
                model = unique_models[0]
            for key, value in regular_params(result).items():
                param_entry = {'model_id': model_ids[model],
                               'param_id': self._model_params[key],
                               'value': value}
                yield param_entry

    # NOTE(review): unlike most other methods of this class, this one does
    # not call check_connection() - presumably assumes an open connection.
    with self._lock, self._conn:
        # refuse to import over existing models of this job
        c = self._conn.execute(self.sql_select_model_job, {'job_id': job_id})
        v = c.fetchone()
        if v:
            raise ValueError("database already contains model entries for job {0}".format(job_id))
        self._conn.executemany(self.sql_insert_model, model_entry_generator())
        # map model numbers to the newly created primary keys;
        # the generators above close over this local and see the new dict
        c = self._conn.execute(self.sql_select_model_job, {'job_id': job_id})
        model_ids = {row['model']: row['id'] for row in c}
        self._conn.executemany(self.sql_insert_result, result_entry_generator())
        self._conn.executemany(self.sql_insert_paramvalue, param_entry_generator())
def import_job_results(**kwargs):
    """
    import results from a calculation job.

    this function contains all steps necessary to import the results
    from a calculation job into a database.
    it registers the project and job, and imports the results data.
    the project may exist in the database,
    the job must not exist (raises an exception).

    arguments can be specified as dict (**d) or in keyword=value form.

    @param kwargs: dictionary of function arguments.
        all arguments are required unless noted.
        @arg 'workdir' (optional) path to the working directory.
            the working directory of the operating system is changed.
            this is the root for relative paths of the database and results files.
            if not specified, the working directory is unchanged.
        @arg 'dbfile' name of the database file.
        @arg 'project' name of the project.
        @arg 'code' name of the project code.
        @arg 'job' name of the calculation job.
        @arg 'mode' pmsco optimization mode.
        @arg 'machine' name of the machine where the job ran.
        @arg 'git' git hash of the code revision.
        @arg 'datetime' (datetime.datetime) time stamp (optional).
            if not specified, defaults to the time stamp of the results file.
            hint: the constructor of a datetime object is
            `datetime.datetime(year, month, day, hour, minute, second)`.
        @arg 'description' meaningful description of the calculation job.
        @arg 'resultsfile' name of the .tasks.dat results file.

    @return dict with 'project_id' and 'job_id'

    @raise ValueError if the job already exists in the database.
    """
    if 'workdir' in kwargs:
        os.chdir(kwargs['workdir'])
    if 'datetime' in kwargs:
        dt = kwargs['datetime']
    else:
        # fall back to the modification time of the results file
        dt = datetime.datetime.fromtimestamp(os.path.getmtime(kwargs['resultsfile']))

    db = ResultsDatabase()
    db.connect(kwargs['dbfile'])
    project_id = db.register_project(kwargs['project'], kwargs['code'])
    job_id = db.register_job(project_id, kwargs['job'], kwargs['mode'], kwargs['machine'],
                             kwargs['git'], dt, kwargs['description'])
    db.import_results_file(kwargs['resultsfile'], job_id)
    db.create_models_view()
    db.disconnect()

    return {'project_id': project_id, 'job_id': job_id}