update public distribution

based on internal repository c9a2ac8 2019-01-03 16:04:57 +0100
tagged rev-master-2.0.0
This commit is contained in:
2019-01-31 15:45:02 +01:00
parent bbd16d0f94
commit acea809e4e
92 changed files with 165828 additions and 143181 deletions

View File

@ -1,16 +1,19 @@
"""
@package pmsco.files
manage files produced by pmsco.
manage the lifetime of files produced by pmsco.
@author Matthias Muntwiler
@copyright (c) 2016 by Paul Scherrer Institut @n
@copyright (c) 2016-18 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import logging
import mpi4py
@ -53,7 +56,7 @@ FILE_CATEGORIES = {'cluster', 'phase', 'input', 'output',
#
# this constant defines the default set of file categories that are kept after the calculation.
#
# default keep-set: files of these categories survive the clean-up after the calculation.
FILE_CATEGORIES_TO_KEEP = {
    'cluster',
    'model',
    'scan',
    'report',
    'population',
}
## @var FILE_CATEGORIES_TO_DELETE
# categories of files to be deleted.
@ -67,13 +70,17 @@ FILE_CATEGORIES_TO_DELETE = FILE_CATEGORIES - FILE_CATEGORIES_TO_KEEP
class FileTracker(object):
"""
organize output files of calculations.
manage the lifetime of files produced by the calculations.
the file manager stores references to data files generated during calculations
and cleans up unused files according to a range of filter criteria.
this class identifies files by _file name_.
file names must therefore be unique over the whole calculation process.
it is possible to specify a full path that is used for communication with the operating system.
"""
## @var files_to_delete (set)
## @var categories_to_delete (set)
# categories of generated files that should be deleted after the calculation.
#
# each string of this set marks a category of files to be deleted.
@ -93,96 +100,119 @@ class FileTracker(object):
#
# the default is 10.
## @var _last_id (int)
# last used file identification number (incremental)
## @var _path_by_id (dict)
# key = file id, value = file path
## @var _model_by_id (dict)
# key = file id, value = model number
## @var _category_by_id (dict)
# key = file id, value = category (str)
## @var _file_model (dict)
# key = file name, value = model number
## @var _file_category (dict)
# key = file name, value = category (str)
## @var _file_path (dict)
# key = file name, value = absolute file path (str)
## @var _rfac_by_model (dict)
# key = model number, value = file id
# key = model number, value = R-factor
## @var _complete_by_model (dict)
# key = model number, value (boolean) = all calculations complete, files can be deleted
## @var _complete_models (set)
# this set contains the model numbers of the models that have finished all calculations.
# files of these models can be considered for clean up.
def __init__(self):
    """
    create an empty file tracker.

    files are tracked by name in three parallel dictionaries (model, category, path).
    initially, no files are tracked, no R-factors are known and no model is marked complete.
    the clean-up policy starts from the module defaults.
    """
    # per-file bookkeeping, key = file name
    self._file_model = {}
    self._file_category = {}
    self._file_path = {}
    # per-model bookkeeping, key = model number
    self._rfac_by_model = {}
    self._complete_models = set()
    # take a copy so that editing the categories of one tracker
    # does not modify the module-level default set.
    self.categories_to_delete = set(FILE_CATEGORIES_TO_DELETE)
    self.keep_rfac = 10
def add_file(self, path, model, category='default'):
def get_file_count(self):
    """
    count the files that are currently tracked.

    @return: (int) number of entries in the file path dictionary.
    """
    tracked_paths = self._file_path
    return len(tracked_paths)
def get_complete_models_count(self):
    """
    count the models that are marked as complete.

    @return: (int) number of model numbers in the set of complete models.
    """
    finished = self._complete_models
    return len(finished)
def add_file(self, name, model, category='default', path=''):
    """
    add a new data file to the list.

    if the name is already tracked, its model, category and path are overwritten.

    @param name: (str) unique identification of the file.
        this can be the file name in the file system if file names are unique without path specification.
        the name must be spelled identically
        whenever the same file is referenced in a call to another method of this class.
        the empty string is ignored.

    @param model: (int) model number

    @param category: (str) file category, e.g. 'output', etc.

    @param path: (str) file system path of the file.
        the file system path is used for communication with the operating system when the file is deleted.
        by default, the path is the name argument.
        in either case the path is expanded to an absolute path relative to the current working directory
        during the call of this method and will not change when the working directory changes.

    @return: None
    """
    if not name:
        return

    self._file_model[name] = model
    self._file_category[name] = category
    # expand explicit paths as well, so that a later change of the working directory
    # cannot redirect the deletion to a different file.
    self._file_path[name] = os.path.abspath(path if path else name)
def rename_file(self, old_name, new_name, new_path=''):
    """
    rename a data file in the list.

    the method does not rename the file in the file system.
    the old entry is removed and a new entry is registered via add_file()
    with the model and category of the old entry.

    @param old_name: name used in the original add_file() call.
        if it is not in the list, the method does nothing.

    @param new_name: new name of the file, see add_file().
        if the file is already in the list, its model and category is overwritten by the values of the old file.

    @param new_path: new file system path of the file, see add_file().
        by default, the path is derived from the new name.

    @return: None
    """
    try:
        model = self._file_model[old_name]
        cat = self._file_category[old_name]
    except KeyError:
        return

    # pop with a default: a missing path entry must not abort the rename half-way.
    self._file_model.pop(old_name, None)
    self._file_category.pop(old_name, None)
    self._file_path.pop(old_name, None)
    self.add_file(new_name, model, cat, new_path)
def remove_file(self, name):
    """
    remove a file from the list.

    the method does not delete the file from the file system.

    @param name: must match an existing file name identically.
        if the name is not found in the list, the method does nothing.

    @return: None
    """
    # remove each entry independently:
    # a name missing from one dictionary must not leave stale entries in the others.
    self._file_model.pop(name, None)
    self._file_category.pop(name, None)
    self._file_path.pop(name, None)
def update_model_rfac(self, model, rfac):
"""
@ -207,18 +237,19 @@ class FileTracker(object):
@param complete: (bool) True if all calculations of the model are complete (files can be deleted).
@return: None
"""
self._complete_by_model[model] = complete
if complete:
self._complete_models.add(model)
else:
self._complete_models.discard(model)
def delete_files(self, categories=None, keep_rfac=0):
def delete_files(self, categories=None):
"""
delete the files matching the list of categories.
@param categories: set of file categories to delete.
may include 'rfac' if bad r-factors should be deleted additionally (regardless of static category).
defaults to self.categories_to_delete.
@version this method does not act on the 'rfac' category.
@param keep_rfac: number of best models to keep if bad r-factors are to be deleted.
the effective keep number is the greater of self.keep_rfac and this argument.
@param categories: set of file categories to delete.
defaults to self.categories_to_delete.
@return: None
"""
@ -226,8 +257,6 @@ class FileTracker(object):
categories = self.categories_to_delete
for cat in categories:
self.delete_category(cat)
if 'rfac' in categories:
self.delete_bad_rfac(keep=keep_rfac)
def delete_bad_rfac(self, keep=0, force_delete=False):
"""
@ -252,8 +281,6 @@ class FileTracker(object):
@param force_delete: delete the bad files even if 'rfac' is not selected in categories_to_delete.
@return: None
@todo should clean up rfac and model dictionaries from time to time.
"""
if force_delete or 'rfac' in self.categories_to_delete:
keep = max(keep, self.keep_rfac)
@ -263,12 +290,45 @@ class FileTracker(object):
except IndexError:
return
complete_models = {_model for (_model, _complete) in self._complete_by_model.iteritems() if _complete}
del_models = {_model for (_model, _rfac) in self._rfac_by_model.iteritems() if _rfac > rfac_split}
del_models &= complete_models
del_ids = {_id for (_id, _model) in self._model_by_id.iteritems() if _model in del_models}
for _id in del_ids:
self.delete_file(_id)
keep_models = {model for (model, rfac) in self._rfac_by_model.items() if 0.0 <= rfac <= rfac_split}
del_models = self._complete_models - keep_models
del_names = {name for (name, model) in self._file_model.items() if model in del_models}
for name in del_names:
self.delete_file(name)
def delete_models(self, keep=None, delete=None):
    """
    delete all files by model.

    this involves the following steps:
    1. determine a list of complete models
       (incomplete models are still being processed and must not be deleted).
    2. intersect with the _delete_ list if specified.
    3. subtract the _keep_ list if specified.

    if neither the _keep_ nor the _delete_ list is specified,
    or if the steps above resolve to the _complete_ list
    the method considers it as an error and does nothing.

    @param keep: (sequence) model numbers to keep, i.e., delete all others.
        any iterable of model numbers (list, tuple, set) is accepted.
        None or an empty sequence means not specified.

    @param delete: (sequence) model numbers to delete.
        any iterable of model numbers (list, tuple, set) is accepted.
        None or an empty sequence means not specified.

    @return (int) number of models deleted.
    """
    # work on a copy so that the set of complete models is not modified.
    del_models = set(self._complete_models)
    # the method forms accept arbitrary iterables,
    # whereas the in-place operators (&=, -=) raise TypeError for lists and tuples.
    if delete:
        del_models.intersection_update(delete)
    if keep:
        del_models.difference_update(keep)

    if not del_models or del_models == self._complete_models:
        return 0

    # collect the names first because delete_file may modify the file dictionaries.
    del_names = {name for (name, model) in self._file_model.items() if model in del_models}
    for name in del_names:
        self.delete_file(name)

    return len(del_models)
def delete_category(self, category):
"""
@ -280,45 +340,38 @@ class FileTracker(object):
@return: None
"""
complete_models = {_model for (_model, _complete) in self._complete_by_model.iteritems() if _complete}
del_ids = {_id for (_id, cat) in self._category_by_id.iteritems() if cat == category}
del_ids &= {_id for (_id, _model) in self._model_by_id.iteritems() if _model in complete_models}
for _id in del_ids:
self.delete_file(_id)
del_names = {name for (name, cat) in self._file_category.items() if cat == category}
del_names &= {name for (name, model) in self._file_model.items() if model in self._complete_models}
for name in del_names:
self.delete_file(name)
def delete_file(self, name):
    """
    delete a specified file from the list and the file system.

    this method is unconditional. it does not consider category, completeness, nor R-factor.

    the method catches errors during file deletion and prints warnings to the logger.

    @param name: must match an existing file name identically.
        if it is not in the list, the method logs a warning and does nothing else.
        the method uses the associated path declared in add_file() to delete the file.

    @return: None
    """
    try:
        cat = self._file_category[name]
        model = self._file_model[name]
        path = self._file_path[name]
    except KeyError:
        logger.warning("tried to delete untracked file {0}".format(name))
        return

    # forget the file before touching the file system:
    # the entry is dropped even if the operating system refuses to delete the file.
    del self._file_model[name]
    del self._file_category[name]
    del self._file_path[name]

    try:
        os.remove(path)
    except OSError:
        logger.warning("file system error deleting file {0}".format(path))
    else:
        logger.debug("delete file {0} ({1}, model {2})".format(path, cat, model))