update public distribution
based on internal repository c9a2ac8 2019-01-03 16:04:57 +0100 tagged rev-master-2.0.0
This commit is contained in:
249
pmsco/files.py
249
pmsco/files.py
@ -1,16 +1,19 @@
|
||||
"""
|
||||
@package pmsco.files
|
||||
manage files produced by pmsco.
|
||||
manage the lifetime of files produced by pmsco.
|
||||
|
||||
@author Matthias Muntwiler
|
||||
|
||||
@copyright (c) 2016 by Paul Scherrer Institut @n
|
||||
@copyright (c) 2016-18 by Paul Scherrer Institut @n
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import logging
|
||||
import mpi4py
|
||||
@ -53,7 +56,7 @@ FILE_CATEGORIES = {'cluster', 'phase', 'input', 'output',
|
||||
#
|
||||
# this constant defines the default set of file categories that are kept after the calculation.
|
||||
#
|
||||
FILE_CATEGORIES_TO_KEEP = {'cluster', 'model', 'report', 'population'}
|
||||
FILE_CATEGORIES_TO_KEEP = {'cluster', 'model', 'scan', 'report', 'population'}
|
||||
|
||||
## @var FILE_CATEGORIES_TO_DELETE
|
||||
# categories of files to be deleted.
|
||||
@ -67,13 +70,17 @@ FILE_CATEGORIES_TO_DELETE = FILE_CATEGORIES - FILE_CATEGORIES_TO_KEEP
|
||||
|
||||
class FileTracker(object):
|
||||
"""
|
||||
organize output files of calculations.
|
||||
manage the lifetime of files produced by the calculations.
|
||||
|
||||
the file manager stores references to data files generated during calculations
|
||||
and cleans up unused files according to a range of filter criteria.
|
||||
|
||||
this class identifies files by _file name_.
|
||||
file names must therefore be unique over the whole calculation process.
|
||||
it is possible to specify a full path that is used for communication with the operating system.
|
||||
"""
|
||||
|
||||
## @var files_to_delete (set)
|
||||
## @var categories_to_delete (set)
|
||||
# categories of generated files that should be deleted after the calculation.
|
||||
#
|
||||
# each string of this set marks a category of files to be deleted.
|
||||
@ -93,96 +100,119 @@ class FileTracker(object):
|
||||
#
|
||||
# the default is 10.
|
||||
|
||||
## @var _last_id (int)
|
||||
# last used file identification number (incremental)
|
||||
|
||||
## @var _path_by_id (dict)
|
||||
# key = file id, value = file path
|
||||
|
||||
## @var _model_by_id (dict)
|
||||
# key = file id, value = model number
|
||||
|
||||
## @var _category_by_id (dict)
|
||||
# key = file id, value = category (str)
|
||||
## @var _file_model (dict)
|
||||
# key = file name, value = model number
|
||||
|
||||
## @var _file_category (dict)
|
||||
# key = file name, value = category (str)
|
||||
|
||||
## @var _file_path (dict)
|
||||
# key = file name, value = absolute file path (str)
|
||||
|
||||
## @var _rfac_by_model (dict)
|
||||
# key = model number, value = file id
|
||||
# key = model number, value = R-factor
|
||||
|
||||
## @var _complete_by_model (dict)
|
||||
# key = model number, value (boolean) = all calculations complete, files can be deleted
|
||||
## @var _complete_models (set)
|
||||
# this set contains the model numbers of the models that have finished all calculations.
|
||||
# files of these models can be considered for clean up.
|
||||
|
||||
def __init__(self):
|
||||
self._id_by_path = {}
|
||||
self._path_by_id = {}
|
||||
self._model_by_id = {}
|
||||
self._category_by_id = {}
|
||||
self._file_model = {}
|
||||
self._file_category = {}
|
||||
self._file_path = {}
|
||||
self._rfac_by_model = {}
|
||||
self._complete_by_model = {}
|
||||
self._last_id = 0
|
||||
self._complete_models = set([])
|
||||
self.categories_to_delete = FILE_CATEGORIES_TO_DELETE
|
||||
self.keep_rfac = 10
|
||||
|
||||
def add_file(self, path, model, category='default'):
|
||||
def get_file_count(self):
    """
    report how many files are currently tracked.

    the count is the number of entries in the name-to-path registry.

    @return: (int) number of tracked files.
    """
    tracked_paths = self._file_path
    return len(tracked_paths)
|
||||
|
||||
def get_complete_models_count(self):
    """
    report how many models have been marked as complete.

    the count is the size of the set of complete model numbers.

    @return: (int) number of complete models.
    """
    finished_models = self._complete_models
    return len(finished_models)
|
||||
|
||||
def add_file(self, name, model, category='default', path=''):
|
||||
"""
|
||||
add a new data file to the list.
|
||||
|
||||
@param path: (str) system path of the file relative to the working directory.
|
||||
@param name: (str) unique identification of the file.
|
||||
this can be the file name in the file system if file names are unique without path specification.
|
||||
the name must be spelled identically
|
||||
whenever the same file is referenced in a call to another method of this class.
|
||||
the empty string is ignored.
|
||||
|
||||
@param model: (int) model number
|
||||
|
||||
@param category: (str) file category, e.g. 'output', etc.
|
||||
|
||||
@param path: (str) file system path of the file.
|
||||
the file system path is used for communication with the operating system when the file is deleted.
|
||||
|
||||
by default, the path is the name argument expanded to a full path relative to the current working directory.
|
||||
the path is expanded during the call of this method and will not change when the working directory changes.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
self._last_id += 1
|
||||
_id = self._last_id
|
||||
self._id_by_path[path] = _id
|
||||
self._path_by_id[_id] = path
|
||||
self._model_by_id[_id] = model
|
||||
self._category_by_id[_id] = category
|
||||
if name:
|
||||
self._file_model[name] = model
|
||||
self._file_category[name] = category
|
||||
self._file_path[name] = path if path else os.path.abspath(name)
|
||||
|
||||
def rename_file(self, old_path, new_path):
|
||||
def rename_file(self, old_name, new_name, new_path=''):
|
||||
"""
|
||||
rename a data file in the list.
|
||||
|
||||
the method does not rename the file in the file system.
|
||||
|
||||
@param old_path: must match an existing file path identically.
|
||||
if old_path is not in the list, the method does nothing.
|
||||
@param old_name: name used in the original add_file() call.
|
||||
if it is not in the list, the method does nothing.
|
||||
|
||||
@param new_path: new path.
|
||||
@param new_name: new name of the file, see add_file().
|
||||
if the file is already in the list, its model and category are overwritten by the values of the old file.
|
||||
|
||||
@param new_path: new file system path of the file, see add_file().
|
||||
by default, the path is the name argument expanded to a full path relative to the current working directory.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
try:
|
||||
_id = self._id_by_path[old_path]
|
||||
model = self._file_model[old_name]
|
||||
cat = self._file_category[old_name]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
del self._id_by_path[old_path]
|
||||
self._id_by_path[new_path] = _id
|
||||
self._path_by_id[_id] = new_path
|
||||
del self._file_model[old_name]
|
||||
del self._file_category[old_name]
|
||||
del self._file_path[old_name]
|
||||
self.add_file(new_name, model, cat, new_path)
|
||||
|
||||
def remove_file(self, path):
|
||||
def remove_file(self, name):
|
||||
"""
|
||||
remove a file from the list.
|
||||
|
||||
the method does not delete the file from the file system.
|
||||
|
||||
@param path: must match an existing file path identically.
|
||||
if path is not in the list, the method does nothing.
|
||||
@param name: must match an existing file name identically.
|
||||
if the name is not found in the list, the method does nothing.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
try:
|
||||
_id = self._id_by_path[path]
|
||||
del self._file_model[name]
|
||||
del self._file_category[name]
|
||||
del self._file_path[name]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
del self._id_by_path[path]
|
||||
del self._path_by_id[_id]
|
||||
del self._model_by_id[_id]
|
||||
del self._category_by_id[_id]
|
||||
|
||||
def update_model_rfac(self, model, rfac):
|
||||
"""
|
||||
@ -207,18 +237,19 @@ class FileTracker(object):
|
||||
@param complete: (bool) True if all calculations of the model are complete (files can be deleted).
|
||||
@return: None
|
||||
"""
|
||||
self._complete_by_model[model] = complete
|
||||
if complete:
|
||||
self._complete_models.add(model)
|
||||
else:
|
||||
self._complete_models.discard(model)
|
||||
|
||||
def delete_files(self, categories=None, keep_rfac=0):
|
||||
def delete_files(self, categories=None):
|
||||
"""
|
||||
delete the files matching the list of categories.
|
||||
|
||||
@param categories: set of file categories to delete.
|
||||
may include 'rfac' if bad r-factors should be deleted additionally (regardless of static category).
|
||||
defaults to self.categories_to_delete.
|
||||
@version this method does not act on the 'rfac' category.
|
||||
|
||||
@param keep_rfac: number of best models to keep if bad r-factors are to be deleted.
|
||||
the effective keep number is the greater of self.keep_rfac and this argument.
|
||||
@param categories: set of file categories to delete.
|
||||
defaults to self.categories_to_delete.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
@ -226,8 +257,6 @@ class FileTracker(object):
|
||||
categories = self.categories_to_delete
|
||||
for cat in categories:
|
||||
self.delete_category(cat)
|
||||
if 'rfac' in categories:
|
||||
self.delete_bad_rfac(keep=keep_rfac)
|
||||
|
||||
def delete_bad_rfac(self, keep=0, force_delete=False):
|
||||
"""
|
||||
@ -252,8 +281,6 @@ class FileTracker(object):
|
||||
@param force_delete: delete the bad files even if 'rfac' is not selected in categories_to_delete.
|
||||
|
||||
@return: None
|
||||
|
||||
@todo should clean up rfac and model dictionaries from time to time.
|
||||
"""
|
||||
if force_delete or 'rfac' in self.categories_to_delete:
|
||||
keep = max(keep, self.keep_rfac)
|
||||
@ -263,12 +290,45 @@ class FileTracker(object):
|
||||
except IndexError:
|
||||
return
|
||||
|
||||
complete_models = {_model for (_model, _complete) in self._complete_by_model.iteritems() if _complete}
|
||||
del_models = {_model for (_model, _rfac) in self._rfac_by_model.iteritems() if _rfac > rfac_split}
|
||||
del_models &= complete_models
|
||||
del_ids = {_id for (_id, _model) in self._model_by_id.iteritems() if _model in del_models}
|
||||
for _id in del_ids:
|
||||
self.delete_file(_id)
|
||||
keep_models = {model for (model, rfac) in self._rfac_by_model.items() if 0.0 <= rfac <= rfac_split}
|
||||
del_models = self._complete_models - keep_models
|
||||
del_names = {name for (name, model) in self._file_model.items() if model in del_models}
|
||||
for name in del_names:
|
||||
self.delete_file(name)
|
||||
|
||||
def delete_models(self, keep=None, delete=None):
    """
    delete all files by model.

    this involves the following steps:
    1. determine a list of complete models
       (incomplete models are still being processed and must not be deleted).
    2. intersect with the _delete_ list if specified.
    3. subtract the _keep_ list if specified.

    if neither the _keep_ nor the _delete_ list is specified,
    or if the steps above resolve to the _complete_ list or to an empty list,
    the method does nothing and returns 0.

    @param keep: (iterable) model numbers to keep, i.e., delete all others.
        any iterable of model numbers (list, tuple, set) is accepted.
        None or an empty iterable means that the argument is not specified.

    @param delete: (iterable) model numbers to delete.
        any iterable of model numbers (list, tuple, set) is accepted.
        None or an empty iterable means that the argument is not specified.

    @return (int) number of models deleted.
    """
    # work on a copy so that the set of complete models is never modified here.
    del_models = set(self._complete_models)
    # the method forms accept any iterable,
    # whereas the in-place operators (&=, -=) raise TypeError for lists and tuples.
    if delete:
        del_models.intersection_update(delete)
    if keep:
        del_models.difference_update(keep)
    # refuse to delete nothing or everything: both cases indicate missing or ineffective arguments.
    if not del_models or del_models == self._complete_models:
        return 0

    # collect the names first because delete_file is expected to modify the tracking dictionaries.
    del_names = {name for (name, model) in self._file_model.items() if model in del_models}
    for name in del_names:
        self.delete_file(name)

    return len(del_models)
|
||||
|
||||
def delete_category(self, category):
|
||||
"""
|
||||
@ -280,45 +340,38 @@ class FileTracker(object):
|
||||
|
||||
@return: None
|
||||
"""
|
||||
complete_models = {_model for (_model, _complete) in self._complete_by_model.iteritems() if _complete}
|
||||
del_ids = {_id for (_id, cat) in self._category_by_id.iteritems() if cat == category}
|
||||
del_ids &= {_id for (_id, _model) in self._model_by_id.iteritems() if _model in complete_models}
|
||||
for _id in del_ids:
|
||||
self.delete_file(_id)
|
||||
del_names = {name for (name, cat) in self._file_category.items() if cat == category}
|
||||
del_names &= {name for (name, model) in self._file_model.items() if model in self._complete_models}
|
||||
for name in del_names:
|
||||
self.delete_file(name)
|
||||
|
||||
def delete_file(self, _id):
|
||||
def delete_file(self, name):
|
||||
"""
|
||||
delete a specified file from the list and the file system.
|
||||
|
||||
the file is identified by ID number.
|
||||
this method is unconditional. it does not consider category, completeness, nor R-factor.
|
||||
|
||||
@param _id: (int) ID number of the file to delete.
|
||||
the method catches errors during file deletion and prints warnings to the logger.
|
||||
|
||||
@param name: must match an existing file name identically.
|
||||
if it is not in the list, the method does nothing.
|
||||
the method uses the associated path declared in add_file() to delete the file.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
path = self._path_by_id[_id]
|
||||
cat = self._category_by_id[_id]
|
||||
model = self._model_by_id[_id]
|
||||
del self._id_by_path[path]
|
||||
del self._path_by_id[_id]
|
||||
del self._model_by_id[_id]
|
||||
del self._category_by_id[_id]
|
||||
try:
|
||||
self._os_delete_file(path)
|
||||
except OSError:
|
||||
logger.warning("error deleting file {0}".format(path))
|
||||
cat = self._file_category[name]
|
||||
model = self._file_model[name]
|
||||
path = self._file_path[name]
|
||||
except KeyError:
|
||||
logger.warning("tried to delete untracked file {0}".format(name))
|
||||
else:
|
||||
logger.debug("delete file {0} ({1}, model {2})".format(path, cat, model))
|
||||
|
||||
@staticmethod
|
||||
def _os_delete_file(path):
|
||||
"""
|
||||
have the operating system delete a file path.
|
||||
|
||||
this function is separate so that we can mock it in unit tests.
|
||||
|
||||
@param path: OS path
|
||||
@return: None
|
||||
"""
|
||||
os.remove(path)
|
||||
del self._file_model[name]
|
||||
del self._file_category[name]
|
||||
del self._file_path[name]
|
||||
try:
|
||||
os.remove(path)
|
||||
except OSError:
|
||||
logger.warning("file system error deleting file {0}".format(path))
|
||||
else:
|
||||
logger.debug("delete file {0} ({1}, model {2})".format(path, cat, model))
|
||||
|
Reference in New Issue
Block a user