add files for public distribution
based on internal repository 0a462b6 2017-11-22 14:41:39 +0100
This commit is contained in:
324
pmsco/files.py
Normal file
324
pmsco/files.py
Normal file
@ -0,0 +1,324 @@
|
||||
"""
|
||||
@package pmsco.files
|
||||
manage files produced by pmsco.
|
||||
|
||||
@author Matthias Muntwiler
|
||||
|
||||
@copyright (c) 2016 by Paul Scherrer Institut @n
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import mpi4py
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
## @var FILE_CATEGORIES
|
||||
# categories of generated files.
|
||||
#
|
||||
# these labels are used to decide which output files are kept or deleted after the calculation.
|
||||
#
|
||||
# each string of this set marks a category of files.
|
||||
#
|
||||
# @arg @c 'input' : raw input files for calculator, including cluster and phase files in custom format
|
||||
# @arg @c 'output' : raw output files from calculator
|
||||
# @arg @c 'phase' : phase files in portable format for report
|
||||
# @arg @c 'cluster' : cluster files in portable XYZ format for report
|
||||
# @arg @c 'log' : log files
|
||||
# @arg @c 'debug' : debug files
|
||||
# @arg @c 'model': output files in ETPAI format: complete simulation (a_-1_-1_-1_-1)
|
||||
# @arg @c 'scan' : output files in ETPAI format: scan (a_b_-1_-1_-1)
|
||||
# @arg @c 'symmetry' : output files in ETPAI format: symmetry (a_b_c_-1_-1)
|
||||
# @arg @c 'emitter' : output files in ETPAI format: emitter (a_b_c_d_-1)
|
||||
# @arg @c 'region' : output files in ETPAI format: region (a_b_c_d_e)
|
||||
# @arg @c 'report': final report of results
|
||||
# @arg @c 'population': final state of particle population
|
||||
# @arg @c 'rfac': files related to models which give bad r-factors (dynamic category, see below).
|
||||
#
|
||||
# @note @c 'rfac' is a dynamic category not connected to a particular file or content type.
|
||||
# no file should be marked @c 'rfac'.
|
||||
# the string is used only to specify whether bad models should be deleted or not.
|
||||
# if so, all files related to bad models are deleted, regardless of their static category.
|
||||
#
|
||||
FILE_CATEGORIES = {'cluster', 'phase', 'input', 'output',
|
||||
'report', 'region', 'emitter', 'scan', 'symmetry', 'model',
|
||||
'log', 'debug', 'population', 'rfac'}
|
||||
|
||||
## @var FILE_CATEGORIES_TO_KEEP
|
||||
# categories of files to be keep.
|
||||
#
|
||||
# this constant defines the default set of file categories that are kept after the calculation.
|
||||
#
|
||||
FILE_CATEGORIES_TO_KEEP = {'cluster', 'model', 'report', 'population'}
|
||||
|
||||
## @var FILE_CATEGORIES_TO_DELETE
|
||||
# categories of files to be deleted.
|
||||
#
|
||||
# this constant defines the default set of file categories that are deleted after the calculation.
|
||||
# it contains all values from FILE_CATEGORIES minus FILE_CATEGORIES_TO_KEEP.
|
||||
# it is used to initialize Project.files_to_delete.
|
||||
#
|
||||
FILE_CATEGORIES_TO_DELETE = FILE_CATEGORIES - FILE_CATEGORIES_TO_KEEP
|
||||
|
||||
|
||||
class FileTracker(object):
|
||||
"""
|
||||
organize output files of calculations.
|
||||
|
||||
the file manager stores references to data files generated during calculations
|
||||
and cleans up unused files according to a range of filter criteria.
|
||||
"""
|
||||
|
||||
## @var files_to_delete (set)
|
||||
# categories of generated files that should be deleted after the calculation.
|
||||
#
|
||||
# each string of this set marks a category of files to be deleted.
|
||||
# the complete set of recognized categories is files.FILE_CATEGORIES.
|
||||
# the default setting after initialization is files.FILE_CATEGORIES_TO_DELETE.
|
||||
#
|
||||
# in optimization modes, an output file is kept only
|
||||
# if its model produced one of the best R-factors and
|
||||
# its category is not listed in this set.
|
||||
# all other (bad R-factor) files are deleted regardless of their category.
|
||||
|
||||
## @var keep_rfac (int)
|
||||
# number of best models to keep.
|
||||
#
|
||||
# if @c 'rfac' is set in files_to_delete, all files of bad models (regardless of their category) are deleted.
|
||||
# this parameter specifies how many of the best models are kept.
|
||||
#
|
||||
# the default is 10.
|
||||
|
||||
## @var _last_id (int)
|
||||
# last used file identification number (incremental)
|
||||
|
||||
## @var _path_by_id (dict)
|
||||
# key = file id, value = file path
|
||||
|
||||
## @var _model_by_id (dict)
|
||||
# key = file id, value = model number
|
||||
|
||||
## @var _category_by_id (dict)
|
||||
# key = file id, value = category (str)
|
||||
|
||||
## @var _rfac_by_model (dict)
|
||||
# key = model number, value = file id
|
||||
|
||||
## @var _complete_by_model (dict)
|
||||
# key = model number, value (boolean) = all calculations complete, files can be deleted
|
||||
|
||||
def __init__(self):
|
||||
self._id_by_path = {}
|
||||
self._path_by_id = {}
|
||||
self._model_by_id = {}
|
||||
self._category_by_id = {}
|
||||
self._rfac_by_model = {}
|
||||
self._complete_by_model = {}
|
||||
self._last_id = 0
|
||||
self.categories_to_delete = FILE_CATEGORIES_TO_DELETE
|
||||
self.keep_rfac = 10
|
||||
|
||||
def add_file(self, path, model, category='default'):
|
||||
"""
|
||||
add a new data file to the list.
|
||||
|
||||
@param path: (str) system path of the file relative to the working directory.
|
||||
|
||||
@param model: (int) model number
|
||||
|
||||
@param category: (str) file category, e.g. 'output', etc.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
self._last_id += 1
|
||||
_id = self._last_id
|
||||
self._id_by_path[path] = _id
|
||||
self._path_by_id[_id] = path
|
||||
self._model_by_id[_id] = model
|
||||
self._category_by_id[_id] = category
|
||||
|
||||
def rename_file(self, old_path, new_path):
|
||||
"""
|
||||
rename a data file in the list.
|
||||
|
||||
the method does not rename the file in the file system.
|
||||
|
||||
@param old_path: must match an existing file path identically.
|
||||
if old_path is not in the list, the method does nothing.
|
||||
|
||||
@param new_path: new path.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
try:
|
||||
_id = self._id_by_path[old_path]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
del self._id_by_path[old_path]
|
||||
self._id_by_path[new_path] = _id
|
||||
self._path_by_id[_id] = new_path
|
||||
|
||||
def remove_file(self, path):
|
||||
"""
|
||||
remove a file from the list.
|
||||
|
||||
the method does not delete the file from the file system.
|
||||
|
||||
@param path: must match an existing file path identically.
|
||||
if path is not in the list, the method does nothing.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
try:
|
||||
_id = self._id_by_path[path]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
del self._id_by_path[path]
|
||||
del self._path_by_id[_id]
|
||||
del self._model_by_id[_id]
|
||||
del self._category_by_id[_id]
|
||||
|
||||
def update_model_rfac(self, model, rfac):
|
||||
"""
|
||||
update the stored R factors of all files that depend on a specified model.
|
||||
the model handler should set this flag if files with bad R factors should be deleted.
|
||||
by default (after adding files of a new model), the R factor is unset and
|
||||
delete_bad_rfac() will not act on that model.
|
||||
|
||||
@param model: (int) model number.
|
||||
@param rfac: (float) new R factor
|
||||
@return: None
|
||||
"""
|
||||
self._rfac_by_model[model] = rfac
|
||||
|
||||
def set_model_complete(self, model, complete):
|
||||
"""
|
||||
specify whether the calculations of a model are complete and its files can be deleted.
|
||||
the model handler must set this flag.
|
||||
by default (after adding files of a new model), it is False.
|
||||
|
||||
@param model: (int) model number.
|
||||
@param complete: (bool) True if all calculations of the model are complete (files can be deleted).
|
||||
@return: None
|
||||
"""
|
||||
self._complete_by_model[model] = complete
|
||||
|
||||
def delete_files(self, categories=None, keep_rfac=0):
|
||||
"""
|
||||
delete the files matching the list of categories.
|
||||
|
||||
@param categories: set of file categories to delete.
|
||||
may include 'rfac' if bad r-factors should be deleted additionally (regardless of static category).
|
||||
defaults to self.categories_to_delete.
|
||||
|
||||
@param keep_rfac: number of best models to keep if bad r-factors are to be deleted.
|
||||
the effective keep number is the greater of self.keep_rfac and this argument.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
if categories is None:
|
||||
categories = self.categories_to_delete
|
||||
for cat in categories:
|
||||
self.delete_category(cat)
|
||||
if 'rfac' in categories:
|
||||
self.delete_bad_rfac(keep=keep_rfac)
|
||||
|
||||
def delete_bad_rfac(self, keep=0, force_delete=False):
|
||||
"""
|
||||
delete the files of all models except a specified number of good models.
|
||||
|
||||
the method first determines which models to keep.
|
||||
models with R factor values of 0.0, without a specified R-factor, and
|
||||
the specified number of best ranking non-zero models are kept.
|
||||
the files belonging to the keeper models are kept, all others are deleted,
|
||||
regardless of category.
|
||||
files of incomplete models are also kept.
|
||||
|
||||
the files are deleted from the list and the file system.
|
||||
|
||||
files are deleted only if 'rfac' is specified in self.categories_to_delete
|
||||
or if force_delete is set to True.
|
||||
otherwise the method does nothing.
|
||||
|
||||
@param keep: number of files to keep.
|
||||
the effective keep number is the greater of self.keep_rfac and this argument.
|
||||
|
||||
@param force_delete: delete the bad files even if 'rfac' is not selected in categories_to_delete.
|
||||
|
||||
@return: None
|
||||
|
||||
@todo should clean up rfac and model dictionaries from time to time.
|
||||
"""
|
||||
if force_delete or 'rfac' in self.categories_to_delete:
|
||||
keep = max(keep, self.keep_rfac)
|
||||
rfacs = [r for r in sorted(self._rfac_by_model.values()) if r > 0.0]
|
||||
try:
|
||||
rfac_split = rfacs[keep-1]
|
||||
except IndexError:
|
||||
return
|
||||
|
||||
complete_models = {_model for (_model, _complete) in self._complete_by_model.iteritems() if _complete}
|
||||
del_models = {_model for (_model, _rfac) in self._rfac_by_model.iteritems() if _rfac > rfac_split}
|
||||
del_models &= complete_models
|
||||
del_ids = {_id for (_id, _model) in self._model_by_id.iteritems() if _model in del_models}
|
||||
for _id in del_ids:
|
||||
self.delete_file(_id)
|
||||
|
||||
def delete_category(self, category):
|
||||
"""
|
||||
delete all files of a specified category from the list and the file system.
|
||||
|
||||
only files of complete models (cf. set_model_complete()) are deleted, but regardless of R-factor.
|
||||
|
||||
@param category: (str) category.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
complete_models = {_model for (_model, _complete) in self._complete_by_model.iteritems() if _complete}
|
||||
del_ids = {_id for (_id, cat) in self._category_by_id.iteritems() if cat == category}
|
||||
del_ids &= {_id for (_id, _model) in self._model_by_id.iteritems() if _model in complete_models}
|
||||
for _id in del_ids:
|
||||
self.delete_file(_id)
|
||||
|
||||
def delete_file(self, _id):
|
||||
"""
|
||||
delete a specified file from the list and the file system.
|
||||
|
||||
the file is identified by ID number.
|
||||
this method is unconditional. it does not consider category, completeness, nor R-factor.
|
||||
|
||||
@param _id: (int) ID number of the file to delete.
|
||||
|
||||
@return: None
|
||||
"""
|
||||
path = self._path_by_id[_id]
|
||||
cat = self._category_by_id[_id]
|
||||
model = self._model_by_id[_id]
|
||||
del self._id_by_path[path]
|
||||
del self._path_by_id[_id]
|
||||
del self._model_by_id[_id]
|
||||
del self._category_by_id[_id]
|
||||
try:
|
||||
self._os_delete_file(path)
|
||||
except OSError:
|
||||
logger.warning("error deleting file {0}".format(path))
|
||||
else:
|
||||
logger.debug("delete file {0} ({1}, model {2})".format(path, cat, model))
|
||||
|
||||
@staticmethod
|
||||
def _os_delete_file(path):
|
||||
"""
|
||||
have the operating system delete a file path.
|
||||
|
||||
this function is separate so that we can mock it in unit tests.
|
||||
|
||||
@param path: OS path
|
||||
@return: None
|
||||
"""
|
||||
os.remove(path)
|
Reference in New Issue
Block a user