public distro 2.1.0

This commit is contained in:
2019-07-19 12:54:54 +02:00
parent acea809e4e
commit fbd2d4fa8c
40 changed files with 2813 additions and 345 deletions

View File

@ -342,6 +342,53 @@ class ResultsDatabase(object):
where param_id = :param_id and model_id = :model_id
"""
# --- tag storage -------------------------------------------------------
# `Tags` maps a unique tag key (name) to an integer id.
# COLLATE NOCASE makes the uniqueness check case-insensitive.
sql_create_tags = """CREATE TABLE IF NOT EXISTS `Tags` (
`id` INTEGER PRIMARY KEY,
`key` TEXT NOT NULL UNIQUE COLLATE NOCASE
)"""
# insert a new tag key; sqlite assigns the id.
sql_insert_tag = "insert into Tags(key) values (:key)"
# look up a tag key by its id.
sql_select_tag = "select key from Tags where id=:id"
# look up a tag id by its (case-insensitive) key.
sql_select_tag_key = "select id, key from Tags where key=:key"
# distinct tag keys used by the jobs of one project, ordered by key.
sql_select_tag_project = """select distinct key, tag_id from Jobs
join JobTags on Jobs.id = JobTags.job_id
join Tags on Tags.id = JobTags.tag_id
where Jobs.project_id = :project_id
order by key collate nocase"""
# distinct tag keys attached to one job, ordered by key.
sql_select_tag_job = """select distinct key, tag_id from JobTags
join Tags on Tags.id = JobTags.tag_id
where JobTags.job_id = :job_id
order by key collate nocase"""
# `JobTags` attaches a (tag, value) pair to a job.
# foreign keys are declared with ON DELETE CASCADE
# (cascading requires the sqlite foreign_keys pragma to be on —
# not visible in this chunk, assumed set where the connection is opened).
sql_create_jobtags = """CREATE TABLE IF NOT EXISTS `JobTags` (
`id` INTEGER PRIMARY KEY,
`tag_id` INTEGER NOT NULL,
`job_id` INTEGER NOT NULL,
`value` TEXT COLLATE NOCASE,
FOREIGN KEY(tag_id) REFERENCES Tags(id) ON DELETE CASCADE,
FOREIGN KEY(job_id) REFERENCES Jobs(id) ON DELETE CASCADE
)"""
# composite index to speed up (tag_id, job_id) lookups in JobTags.
sql_index_jobtags = """create index if not exists
`index_jobtags` ON `JobTags`
(`tag_id`, `job_id`)"""
sql_drop_index_jobtags = "drop index if exists index_jobtags"
# insert a new job tag value.
sql_insert_jobtag = """
insert into JobTags(tag_id, job_id, value)
values (:tag_id, :job_id, :value)
"""
# update the value of an existing job tag, addressed by JobTags.id.
sql_update_jobtag = """
update JobTags set value=:value where id=:jobtag_id
"""
# all (key, value) pairs attached to one job.
sql_select_jobtag_job = """
select key, value from JobTags
join Tags on JobTags.tag_id = Tags.id
where job_id = :job_id
"""
# one job tag (id, key, value), addressed by tag id and job id.
sql_select_jobtag = """
select JobTags.id as id, key, value from JobTags
join Tags on JobTags.tag_id = Tags.id
where tag_id = :tag_id and job_id = :job_id
"""
# @var _conn (sqlite3.Connection).
# connection interface to the database.
#
@ -391,6 +438,7 @@ class ResultsDatabase(object):
self.project_id = 0
self.job_id = 0
self._model_params = {}
self._tags = {}
self._lock_filename = ""
self._lock = None
@ -484,9 +532,12 @@ class ResultsDatabase(object):
self._conn.execute(self.sql_create_results)
self._conn.execute(self.sql_create_params)
self._conn.execute(self.sql_create_paramvalues)
self._conn.execute(self.sql_create_tags)
self._conn.execute(self.sql_create_jobtags)
self._conn.execute(self.sql_index_results_tasks)
self._conn.execute(self.sql_index_results_models)
self._conn.execute(self.sql_index_paramvalues)
self._conn.execute(self.sql_index_jobtags)
self._conn.execute(self.sql_index_models)
def register_project(self, name, code):
@ -583,6 +634,46 @@ class ResultsDatabase(object):
param_dict = {'job_id': job_id}
self._conn.execute(self.sql_delete_job, param_dict)
def _query_job_name(self, job_name, project_id=0):
"""
(internal) query a job by name
this is the internal analog of @ref query_job_name
which asserts an acquired lock and open connection.
@param job_name: name of the job
@param project_id: project identifier.
by default, the current project self.project_id is used.
@return: id value of the job in the database
@raise DatabaseError if the job can't be found.
"""
if project_id == 0:
project_id = self.project_id
param_dict = {'project_id': project_id, 'name': job_name}
c = self._conn.execute(self.sql_select_job_name, param_dict)
v = c.fetchone()
return v[0]
def query_job_name(self, job_name, project_id=0):
    """
    look up the database id of a job given its name.

    acquires the lock, opens a transaction and delegates to
    @ref _query_job_name.

    @param job_name: name of the job.
    @param project_id: project identifier.
        by default, the current project self.project_id is used.
    @return: id value of the job in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        return self._query_job_name(job_name, project_id=project_id)
def register_param(self, key):
"""
register a parameter key with the database.
@ -681,6 +772,165 @@ class ResultsDatabase(object):
return params
def register_tag(self, key):
    """
    register a tag key with the database (locked and committed).

    tags structure a job description: they can distinguish, e.g.,
    calculations run on different clusters or against different
    experimental data.  a tag has a key and a value and is attached
    to a job.  their use is entirely up to the user; pmsco itself
    neither reads nor changes them.

    each tag name must be registered once before values can be
    stored.  see the class description for details.

    @param key: key (name) of the tag.
    @return: id value of the tag in the database.
    """
    self.check_connection()
    with self._lock, self._conn:
        tag_id = self._register_tag(key)
    return tag_id
def _register_tag(self, key):
"""
register a tag with the database without committing the transaction.
@note this method does not lock the database file and does not commit.
to lock the database and commit the transaction, call the public method register_tag().
@param key: key (name) of the tag.
@return: id value of the tag in the database.
"""
c = self._conn.execute(self.sql_select_tag_key, {'key': key})
v = c.fetchone()
if v:
tag_id = v[0]
else:
c = self._conn.execute(self.sql_insert_tag, {'key': key})
tag_id = c.lastrowid
self._tags[key] = tag_id
return tag_id
def register_tags(self, tags):
    """
    register the tags of this project with the database.

    each tag name must be registered once before a value can be
    written to the database.  see the class description for an
    explanation.

    @param tags: sequence of tag keys, or dictionary of tags
        (only the keys are used).
    @return: None
    """
    self.check_connection()
    with self._lock, self._conn:
        for tag_key in tags:
            self._register_tag(tag_key)
def query_tags(self, project_id=0, job_id=0, update_registry=False):
    """
    query the tag keys used in a project or a single job.

    optionally update the local registry (self._tags) with the
    results — useful when the database is read-only and the client
    does not yet know the tag names (see the class description).

    @note only tags actually attached to jobs are returned;
    registered but unused tags are _not_ listed.

    @param project_id: project identifier.
        by default, the current project self.project_id is used.
    @param job_id: job identifier.
        by default, all jobs of the selected project are queried.
        if a job is given, project_id is ignored.
    @param update_registry: update self._tags with the query results.
    @return: dictionary mapping tag name to tag id in the database.
    """
    if project_id == 0:
        project_id = self.project_id
    if job_id != 0:
        sql, args = self.sql_select_tag_job, {'job_id': job_id}
    else:
        sql, args = self.sql_select_tag_project, {'project_id': project_id}
    self.check_connection()
    with self._lock, self._conn:
        tags = {row['key']: row['tag_id'] for row in self._conn.execute(sql, args)}
        if update_registry:
            self._tags.update(tags)
    return tags
def query_job_tags(self, job_id):
    """
    query the tags (keys and values) attached to one job.

    @param job_id: job identifier.
    @return: dictionary mapping tag name to tag value.
    """
    self.check_connection()
    with self._lock, self._conn:
        rows = self._conn.execute(self.sql_select_jobtag_job, {'job_id': job_id})
        return {row['key']: row['value'] for row in rows}
def insert_jobtags(self, job_id, tags):
    """
    add or update job tags in the database (JobTags table).

    unknown tag keys are registered on the fly; an existing
    (tag, job) row is updated in place, otherwise a new row
    is inserted.

    @param job_id: (int) primary key of the job entry in the Jobs
        table.  the entry must exist.
    @param tags: (dict) tags to store.  keys are matched or added
        to the Tags table; values are linked to the job and tag key
        in the JobTags table.
    @return: None
    """
    self.check_connection()
    with self._lock, self._conn:
        for key, value in tags.items():
            if key in self._tags:
                tag_id = self._tags[key]
                lookup = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
                existing = self._conn.execute(self.sql_select_jobtag, lookup).fetchone()
            else:
                # unknown key: register it; there can be no JobTags row yet.
                tag_id = self._register_tag(key)
                existing = None
            if existing:
                entry = {'jobtag_id': existing[0], 'tag_id': tag_id,
                         'job_id': job_id, 'value': value}
                self._conn.execute(self.sql_update_jobtag, entry)
            else:
                entry = {'tag_id': tag_id, 'job_id': job_id, 'value': value}
                self._conn.execute(self.sql_insert_jobtag, entry)
def create_models_view(self, job_id=0, temporary=False):
"""
create a flat (pivot) view of model parameters of the current project or job.
@ -878,7 +1128,7 @@ class ResultsDatabase(object):
results = c.fetchall()
names = [desc[0] for desc in c.description]
dt = np.dtype([(field_to_param(n), field_to_numpy_type(n)) for n in sorted(names)])
dt = np.dtype([(field_to_param(n), field_to_numpy_type(n)) for n in sorted(names, key=str.lower)])
out_array = np.zeros((count,), dtype=dt)
for idx, row in enumerate(results):
for name in names:
@ -942,6 +1192,70 @@ class ResultsDatabase(object):
return out_array
def query_best_models_per_jobs(self, job_ids=None, task_level='model'):
    """
    return, for each selected job, the model with the lowest R-factor.

    the query gathers the R-factors of the selected jobs at the
    selected task level and reports, per job, the (database) model id
    holding the minimum — useful to compile a best-model-per-job report.

    @param job_ids: iterable of job ids to include in the query.
        the ids must belong to the current project.
        if empty or unspecified, all jobs of the current project
        are included.
    @param task_level: element of or index into
        @ref pmsco.dispatch.CALC_LEVELS; the deepest task level to
        include.  results on deeper levels are ignored, e.g. 'scan'
        includes R-factors of individual scans.  deeper levels do
        not increase the number of returned results.
    @return: sequence of model_id, one per job in the filter scope.
        query these model ids again for model details.

    the generated SQL is similar to:
    @code{.sql}
    select Models.id from Models
    join Results on Models.id = Results.model_id
    join Jobs on Models.job_id = Jobs.id
    where scan=-1
    and project_id=1
    and job_id in (1,2,3)
    group by Models.job_id
    having min(rfac)
    order by rfac
    @endcode
    """
    # resolve task_level (name or index) to the name of the first
    # level BELOW the requested one; rows with that column == -1
    # are results aggregated at the requested level.
    try:
        level = dispatch.CALC_LEVELS.index(task_level) + 1
    except ValueError:
        level = task_level + 1
    try:
        level_name = dispatch.CALC_LEVELS[level]
    except IndexError:
        level_name = dispatch.CALC_LEVELS[4]

    self.check_connection()
    with self._lock, self._conn:
        parts = ["select Models.id from Models ",
                 "join Results on Models.id = Results.model_id ",
                 "join Jobs on Models.job_id = Jobs.id ",
                 "where project_id = {0} ".format(self.project_id),
                 "and {0} = -1 ".format(level_name)]
        if job_ids:
            parts.append("and Models.job_id in ({0}) ".format(",".join(map(str, job_ids))))
        parts.append("group by Models.job_id ")
        parts.append("having min(rfac) ")
        parts.append("order by rfac, job_id, model, scan, sym, emit, region ")
        c = self._conn.execute("".join(parts))
        return [row['id'] for row in c]
def query_tasks(self, job_id=0):
"""
query the task index used in a calculation job.
@ -1213,13 +1527,18 @@ class ResultsDatabase(object):
data = np.genfromtxt(filename, names=True)
self.register_params(data.dtype.names)
unique_models, unique_index = np.unique(data['_model'], True)
try:
unique_models, unique_index = np.unique(data['_model'], True)
except ValueError:
unique_models = np.array([0])
unique_index = np.array([0])
unique_data = data[unique_index]
model_ids = {}
def model_entry_generator():
for result in unique_data:
model_entry = {'job_id': job_id,
'model': unique_models[0],
'gen': None,
'particle': None}
model_entry.update(special_params(result))
@ -1227,7 +1546,11 @@ class ResultsDatabase(object):
def result_entry_generator():
for result in data:
result_entry = {'model_id': model_ids[result['_model']],
try:
model = result['_model']
except ValueError:
model = unique_models[0]
result_entry = {'model_id': model_ids[model],
'scan': -1,
'sym': -1,
'emit': -1,
@ -1238,8 +1561,12 @@ class ResultsDatabase(object):
def param_entry_generator():
for result in unique_data:
try:
model = result['_model']
except ValueError:
model = unique_models[0]
for key, value in regular_params(result).items():
param_entry = {'model_id': model_ids[result['_model']],
param_entry = {'model_id': model_ids[model],
'param_id': self._model_params[key],
'value': value}
yield param_entry