public distro 2.1.0

This commit is contained in:
2019-07-19 12:54:54 +02:00
parent acea809e4e
commit fbd2d4fa8c
40 changed files with 2813 additions and 345 deletions

View File

@ -342,6 +342,53 @@ class ResultsDatabase(object):
where param_id = :param_id and model_id = :model_id
"""
# --- tag storage -------------------------------------------------------
# `Tags` maps a unique tag key (name) to an integer id.
# COLLATE NOCASE makes the uniqueness check case-insensitive.
sql_create_tags = """CREATE TABLE IF NOT EXISTS `Tags` (
`id` INTEGER PRIMARY KEY,
`key` TEXT NOT NULL UNIQUE COLLATE NOCASE
)"""
# insert a new tag key; sqlite assigns the id.
sql_insert_tag = "insert into Tags(key) values (:key)"
# look up a tag key by its id.
sql_select_tag = "select key from Tags where id=:id"
# look up a tag id by its (case-insensitive) key.
sql_select_tag_key = "select id, key from Tags where key=:key"
# distinct tag keys used by the jobs of one project, ordered by key.
sql_select_tag_project = """select distinct key, tag_id from Jobs
join JobTags on Jobs.id = JobTags.job_id
join Tags on Tags.id = JobTags.tag_id
where Jobs.project_id = :project_id
order by key collate nocase"""
# distinct tag keys attached to one job, ordered by key.
sql_select_tag_job = """select distinct key, tag_id from JobTags
join Tags on Tags.id = JobTags.tag_id
where JobTags.job_id = :job_id
order by key collate nocase"""
# `JobTags` attaches a (tag, value) pair to a job.
# foreign keys are declared with ON DELETE CASCADE
# (cascading requires the sqlite foreign_keys pragma to be on —
# not visible in this chunk, assumed set where the connection is opened).
sql_create_jobtags = """CREATE TABLE IF NOT EXISTS `JobTags` (
`id` INTEGER PRIMARY KEY,
`tag_id` INTEGER NOT NULL,
`job_id` INTEGER NOT NULL,
`value` TEXT COLLATE NOCASE,
FOREIGN KEY(tag_id) REFERENCES Tags(id) ON DELETE CASCADE,
FOREIGN KEY(job_id) REFERENCES Jobs(id) ON DELETE CASCADE
)"""
# composite index to speed up (tag_id, job_id) lookups in JobTags.
sql_index_jobtags = """create index if not exists
`index_jobtags` ON `JobTags`
(`tag_id`, `job_id`)"""
sql_drop_index_jobtags = "drop index if exists index_jobtags"
# insert a new job tag value.
sql_insert_jobtag = """
insert into JobTags(tag_id, job_id, value)
values (:tag_id, :job_id, :value)
"""
# update the value of an existing job tag, addressed by JobTags.id.
sql_update_jobtag = """
update JobTags set value=:value where id=:jobtag_id
"""
# all (key, value) pairs attached to one job.
sql_select_jobtag_job = """
select key, value from JobTags
join Tags on JobTags.tag_id = Tags.id
where job_id = :job_id
"""
# one job tag (id, key, value), addressed by tag id and job id.
sql_select_jobtag = """
select JobTags.id as id, key, value from JobTags
join Tags on JobTags.tag_id = Tags.id
where tag_id = :tag_id and job_id = :job_id
"""
# @var _conn (sqlite3.Connection).
# connection interface to the database.
#
@ -391,6 +438,7 @@ class ResultsDatabase(object):
self.project_id = 0
self.job_id = 0
self._model_params = {}
self._tags = {}
self._lock_filename = ""
self._lock = None
@ -484,9 +532,12 @@ class ResultsDatabase(object):
self._conn.execute(self.sql_create_results)
self._conn.execute(self.sql_create_params)
self._conn.execute(self.sql_create_paramvalues)
self._conn.execute(self.sql_create_tags)
self._conn.execute(self.sql_create_jobtags)
self._conn.execute(self.sql_index_results_tasks)
self._conn.execute(self.sql_index_results_models)
self._conn.execute(self.sql_index_paramvalues)
self._conn.execute(self.sql_index_jobtags)
self._conn.execute(self.sql_index_models)
def register_project(self, name, code):
@ -583,6 +634,46 @@ class ResultsDatabase(object):
param_dict = {'job_id': job_id}
self._conn.execute(self.sql_delete_job, param_dict)
def _query_job_name(self, job_name, project_id=0):
"""
(internal) query a job by name
this is the internal analog of @ref query_job_name
which asserts an acquired lock and open connection.
@param job_name: name of the job
@param project_id: project identifier.
by default, the current project self.project_id is used.
@return: id value of the job in the database
@raise DatabaseError if the job can't be found.
"""
if project_id == 0:
project_id = self.project_id
param_dict = {'project_id': project_id, 'name': job_name}
c = self._conn.execute(self.sql_select_job_name, param_dict)
v = c.fetchone()
return v[0]
def query_job_name(self, job_name, project_id=0):
    """
    look up the database id of a job given its name.

    acquires the lock, opens a transaction and delegates to
    @ref _query_job_name.

    @param job_name: name of the job.
    @param project_id: project identifier.
        by default, the current project self.project_id is used.
    @return: id value of the job in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        return self._query_job_name(job_name, project_id=project_id)
def register_param(self, key):
"""
register a parameter key with the database.
@ -681,6 +772,165 @@ class ResultsDatabase(object):
return params
def register_tag(self, key):
    """
    register a tag key with the database (locked and committed).

    tags structure a job description: they can distinguish, e.g.,
    calculations run on different clusters or against different
    experimental data.  a tag has a key and a value and is attached
    to a job.  their use is entirely up to the user; pmsco itself
    neither reads nor changes them.

    each tag name must be registered once before values can be
    stored.  see the class description for details.

    @param key: key (name) of the tag.
    @return: id value of the tag in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        tag_id = self._register_tag(key)
    return tag_id
def _register_tag(self, key):
"""
register a tag with the database without committing the transaction.
@note this method does not lock the database file and does not commit.
to lock the database and commit the transaction, call the public method register_tag().
@param key: key (name) of the tag.
@return: id value of the tag in the database.
"""
c = self._conn.execute(self.sql_select_tag_key, {'key': key})
v = c.fetchone()
if v:
tag_id = v[0]
else:
c = self._conn.execute(self.sql_insert_tag, {'key': key})
tag_id = c.lastrowid
self._tags[key] = tag_id
return tag_id
def register_tags(self, tags):
    """
    register the tags of this project with the database.

    each tag name must be registered once before a value can be
    written to the database.  see the class description for an
    explanation.

    @param tags: sequence of tag keys, or dictionary of tags
        (only the keys are used).
    @return: None
    """
    self.check_connection()
    with self._lock, self._conn:
        for tag_key in tags:
            self._register_tag(tag_key)
def query_tags(self, project_id=0, job_id=0, update_registry=False):
    """
    query the tag keys used in a project or a single job.

    optionally update the local registry (self._tags) with the
    results — useful when the database is read-only and the client
    does not yet know the tag names (see the class description).

    @note only tags actually attached to jobs are returned;
    registered but unused tags are _not_ listed.

    @param project_id: project identifier.
        by default, the current project self.project_id is used.
    @param job_id: job identifier.
        by default, all jobs of the selected project are queried.
        if a job is given, project_id is ignored.
    @param update_registry: update self._tags with the query results.
    @return: dictionary mapping tag name to tag id in the database.
    """
    if project_id == 0:
        project_id = self.project_id
    if job_id != 0:
        sql, args = self.sql_select_tag_job, {'job_id': job_id}
    else:
        sql, args = self.sql_select_tag_project, {'project_id': project_id}
    self.check_connection()
    with self._lock, self._conn:
        tags = {row['key']: row['tag_id'] for row in self._conn.execute(sql, args)}
        if update_registry:
            self._tags.update(tags)
    return tags
def query_job_tags(self, job_id):
    """
    query the tags (keys and values) attached to one job.

    @param job_id: job identifier.
    @return: dictionary mapping tag name to tag value.
    """
    self.check_connection()
    with self._lock, self._conn:
        rows = self._conn.execute(self.sql_select_jobtag_job, {'job_id': job_id})
        return {row['key']: row['value'] for row in rows}
def insert_jobtags(self, job_id, tags):
    """
    add or update job tags in the database (JobTags table).

    unknown tag keys are registered on the fly; an existing
    (tag, job) row is updated in place, otherwise a new row
    is inserted.

    @param job_id: (int) primary key of the job entry in the Jobs
        table.  the entry must exist.
    @param tags: (dict) tags to store.  keys are matched or added
        to the Tags table; values are linked to the job and tag key
        in the JobTags table.
    @return: None
    """
    self.check_connection()
    with self._lock, self._conn:
        for key, value in tags.items():
            if key in self._tags:
                tag_id = self._tags[key]
                lookup = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
                existing = self._conn.execute(self.sql_select_jobtag, lookup).fetchone()
            else:
                # unknown key: register it; there can be no JobTags row yet.
                tag_id = self._register_tag(key)
                existing = None
            if existing:
                entry = {'jobtag_id': existing[0], 'tag_id': tag_id,
                         'job_id': job_id, 'value': value}
                self._conn.execute(self.sql_update_jobtag, entry)
            else:
                entry = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
                self._conn.execute(self.sql_insert_jobtag, entry)
def create_models_view(self, job_id=0, temporary=False):
"""
create a flat (pivot) view of model parameters of the current project or job.
@ -878,7 +1128,7 @@ class ResultsDatabase(object):
results = c.fetchall()
names = [desc[0] for desc in c.description]
dt = np.dtype([(field_to_param(n), field_to_numpy_type(n)) for n in sorted(names)])
dt = np.dtype([(field_to_param(n), field_to_numpy_type(n)) for n in sorted(names, key=str.lower)])
out_array = np.zeros((count,), dtype=dt)
for idx, row in enumerate(results):
for name in names:
@ -942,6 +1192,70 @@ class ResultsDatabase(object):
return out_array
def query_best_models_per_jobs(self, job_ids=None, task_level='model'):
    """
    return, for each selected job, the model with the lowest R-factor.

    the query gathers the R-factors of the selected jobs at the
    selected task level and reports, per job, the (database) model id
    holding the minimum — useful to compile a best-model-per-job report.

    @param job_ids: iterable of job ids to include in the query.
        the ids must belong to the current project.
        if empty or unspecified, all jobs of the current project
        are included.
    @param task_level: element of or index into
        @ref pmsco.dispatch.CALC_LEVELS; the deepest task level to
        include.  results on deeper levels are ignored, e.g. 'scan'
        includes R-factors of individual scans.  deeper levels do
        not increase the number of returned results.
    @return: sequence of model_id, one per job in the filter scope.
        query these model ids again for model details.

    the generated SQL is similar to:
    @code{.sql}
    select Models.id from Models
    join Results on Models.id = Results.model_id
    join Jobs on Models.job_id = Jobs.id
    where scan=-1
    and project_id=1
    and job_id in (1,2,3)
    group by Models.job_id
    having min(rfac)
    order by rfac
    @endcode
    """
    # resolve task_level (name or index) to the name of the first
    # level BELOW the requested one; rows with that column == -1
    # are results aggregated at the requested level.
    try:
        level = dispatch.CALC_LEVELS.index(task_level) + 1
    except ValueError:
        level = task_level + 1
    try:
        level_name = dispatch.CALC_LEVELS[level]
    except IndexError:
        level_name = dispatch.CALC_LEVELS[4]

    self.check_connection()
    with self._lock, self._conn:
        parts = ["select Models.id from Models ",
                 "join Results on Models.id = Results.model_id ",
                 "join Jobs on Models.job_id = Jobs.id ",
                 "where project_id = {0} ".format(self.project_id),
                 "and {0} = -1 ".format(level_name)]
        if job_ids:
            parts.append("and Models.job_id in ({0}) ".format(",".join(map(str, job_ids))))
        parts.append("group by Models.job_id ")
        parts.append("having min(rfac) ")
        parts.append("order by rfac, job_id, model, scan, sym, emit, region ")
        c = self._conn.execute("".join(parts))
        return [row['id'] for row in c]
def query_tasks(self, job_id=0):
"""
query the task index used in a calculation job.
@ -1213,13 +1527,18 @@ class ResultsDatabase(object):
data = np.genfromtxt(filename, names=True)
self.register_params(data.dtype.names)
unique_models, unique_index = np.unique(data['_model'], True)
try:
unique_models, unique_index = np.unique(data['_model'], True)
except ValueError:
unique_models = np.array([0])
unique_index = np.array([0])
unique_data = data[unique_index]
model_ids = {}
def model_entry_generator():
for result in unique_data:
model_entry = {'job_id': job_id,
'model': unique_models[0],
'gen': None,
'particle': None}
model_entry.update(special_params(result))
@ -1227,7 +1546,11 @@ class ResultsDatabase(object):
def result_entry_generator():
for result in data:
result_entry = {'model_id': model_ids[result['_model']],
try:
model = result['_model']
except ValueError:
model = unique_models[0]
result_entry = {'model_id': model_ids[model],
'scan': -1,
'sym': -1,
'emit': -1,
@ -1238,8 +1561,12 @@ class ResultsDatabase(object):
def param_entry_generator():
for result in unique_data:
try:
model = result['_model']
except ValueError:
model = unique_models[0]
for key, value in regular_params(result).items():
param_entry = {'model_id': model_ids[result['_model']],
param_entry = {'model_id': model_ids[model],
'param_id': self._model_params[key],
'value': value}
yield param_entry