325 lines
12 KiB
Python
325 lines
12 KiB
Python
"""
|
|
@package pmsco.files
|
|
manage files produced by pmsco.
|
|
|
|
@author Matthias Muntwiler
|
|
|
|
@copyright (c) 2016 by Paul Scherrer Institut @n
|
|
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import mpi4py
|
|
|
|
# module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
## @var FILE_CATEGORIES
|
|
# categories of generated files.
|
|
#
|
|
# these labels are used to decide which output files are kept or deleted after the calculation.
|
|
#
|
|
# each string of this set marks a category of files.
|
|
#
|
|
# @arg @c 'input' : raw input files for calculator, including cluster and phase files in custom format
|
|
# @arg @c 'output' : raw output files from calculator
|
|
# @arg @c 'phase' : phase files in portable format for report
|
|
# @arg @c 'cluster' : cluster files in portable XYZ format for report
|
|
# @arg @c 'log' : log files
|
|
# @arg @c 'debug' : debug files
|
|
# @arg @c 'model': output files in ETPAI format: complete simulation (a_-1_-1_-1_-1)
|
|
# @arg @c 'scan' : output files in ETPAI format: scan (a_b_-1_-1_-1)
|
|
# @arg @c 'symmetry' : output files in ETPAI format: symmetry (a_b_c_-1_-1)
|
|
# @arg @c 'emitter' : output files in ETPAI format: emitter (a_b_c_d_-1)
|
|
# @arg @c 'region' : output files in ETPAI format: region (a_b_c_d_e)
|
|
# @arg @c 'report': final report of results
|
|
# @arg @c 'population': final state of particle population
|
|
# @arg @c 'rfac': files related to models which give bad r-factors (dynamic category, see below).
|
|
#
|
|
# @note @c 'rfac' is a dynamic category not connected to a particular file or content type.
|
|
# no file should be marked @c 'rfac'.
|
|
# the string is used only to specify whether bad models should be deleted or not.
|
|
# if so, all files related to bad models are deleted, regardless of their static category.
|
|
#
|
|
# complete set of recognized category labels, grouped by purpose.
FILE_CATEGORIES = {
    # calculator files
    'input', 'output', 'phase', 'cluster',
    # diagnostics
    'log', 'debug',
    # ETPAI results by task level
    'model', 'scan', 'symmetry', 'emitter', 'region',
    # final results
    'report', 'population',
    # dynamic category (never assigned to a file)
    'rfac',
}
|
|
|
|
## @var FILE_CATEGORIES_TO_KEEP
|
|
# categories of files to be kept.
|
|
#
|
|
# this constant defines the default set of file categories that are kept after the calculation.
|
|
#
|
|
# default categories that survive the clean-up.
FILE_CATEGORIES_TO_KEEP = {
    'cluster',
    'model',
    'population',
    'report',
}
|
|
|
|
## @var FILE_CATEGORIES_TO_DELETE
|
|
# categories of files to be deleted.
|
|
#
|
|
# this constant defines the default set of file categories that are deleted after the calculation.
|
|
# it contains all values from FILE_CATEGORIES minus FILE_CATEGORIES_TO_KEEP.
|
|
# it is used to initialize Project.files_to_delete.
|
|
#
|
|
# every recognized category that is not in the keep set.
FILE_CATEGORIES_TO_DELETE = FILE_CATEGORIES.difference(FILE_CATEGORIES_TO_KEEP)
|
|
|
|
|
|
class FileTracker(object):
    """
    organize output files of calculations.

    the file tracker stores references to data files generated during calculations
    and cleans up unused files according to a range of filter criteria.

    each tracked file is registered under an incremental ID number
    together with its path, model number and category.
    R-factors and completion flags are stored per model.
    """

    ## @var categories_to_delete (set)
    # categories of generated files that should be deleted after the calculation.
    #
    # each string of this set marks a category of files to be deleted.
    # the complete set of recognized categories is files.FILE_CATEGORIES.
    # the default setting after initialization is a copy of files.FILE_CATEGORIES_TO_DELETE.
    #
    # in optimization modes, an output file is kept only
    # if its model produced one of the best R-factors and
    # its category is not listed in this set.
    # all other (bad R-factor) files are deleted regardless of their category.

    ## @var keep_rfac (int)
    # number of best models to keep.
    #
    # if @c 'rfac' is set in categories_to_delete, all files of bad models (regardless of their category) are deleted.
    # this parameter specifies how many of the best models are kept.
    #
    # the default is 10.

    ## @var _last_id (int)
    # last used file identification number (incremental)

    ## @var _id_by_path (dict)
    # key = file path, value = file id

    ## @var _path_by_id (dict)
    # key = file id, value = file path

    ## @var _model_by_id (dict)
    # key = file id, value = model number

    ## @var _category_by_id (dict)
    # key = file id, value = category (str)

    ## @var _rfac_by_model (dict)
    # key = model number, value = R-factor

    ## @var _complete_by_model (dict)
    # key = model number, value (boolean) = all calculations complete, files can be deleted

    def __init__(self):
        """
        create an empty file tracker with default clean-up settings.
        """
        self._id_by_path = {}
        self._path_by_id = {}
        self._model_by_id = {}
        self._category_by_id = {}
        self._rfac_by_model = {}
        self._complete_by_model = {}
        self._last_id = 0
        # private copy: editing the set of one tracker in place
        # must not alter the module constant or other trackers.
        self.categories_to_delete = set(FILE_CATEGORIES_TO_DELETE)
        self.keep_rfac = 10

    def add_file(self, path, model, category='default'):
        """
        add a new data file to the list.

        if the path is already in the list, its previous entry is replaced.
        this keeps the path and id dictionaries consistent.

        @param path: (str) system path of the file relative to the working directory.

        @param model: (int) model number

        @param category: (str) file category, e.g. 'output', etc.

        @return: None
        """
        # a path must map to exactly one id.
        # a left-over id of the same path would make a later delete_file() fail.
        self.remove_file(path)

        self._last_id += 1
        _id = self._last_id
        self._id_by_path[path] = _id
        self._path_by_id[_id] = path
        self._model_by_id[_id] = model
        self._category_by_id[_id] = category

    def rename_file(self, old_path, new_path):
        """
        rename a data file in the list.

        the method does not rename the file in the file system.

        @param old_path: must match an existing file path identically.
            if old_path is not in the list, the method does nothing.

        @param new_path: new path.

        @return: None
        """
        try:
            _id = self._id_by_path.pop(old_path)
        except KeyError:
            # unknown path: nothing to rename
            return

        self._id_by_path[new_path] = _id
        self._path_by_id[_id] = new_path

    def remove_file(self, path):
        """
        remove a file from the list.

        the method does not delete the file from the file system.

        @param path: must match an existing file path identically.
            if path is not in the list, the method does nothing.

        @return: None
        """
        try:
            _id = self._id_by_path.pop(path)
        except KeyError:
            # unknown path: nothing to remove
            return

        del self._path_by_id[_id]
        del self._model_by_id[_id]
        del self._category_by_id[_id]

    def update_model_rfac(self, model, rfac):
        """
        update the stored R factor of a specified model.

        the model handler should set this value if files with bad R factors should be deleted.
        by default (after adding files of a new model), the R factor is unset and
        delete_bad_rfac() will not act on that model.

        @param model: (int) model number.
        @param rfac: (float) new R factor
        @return: None
        """
        self._rfac_by_model[model] = rfac

    def set_model_complete(self, model, complete):
        """
        specify whether the calculations of a model are complete and its files can be deleted.

        the model handler must set this flag.
        by default (after adding files of a new model), it is unset,
        and the files of the model are not deleted by delete_category() or delete_bad_rfac().

        @param model: (int) model number.
        @param complete: (bool) True if all calculations of the model are complete (files can be deleted).
        @return: None
        """
        self._complete_by_model[model] = complete

    def _get_complete_models(self):
        """
        return the set of model numbers that are flagged as complete.

        @return: (set) model numbers whose flag in _complete_by_model is true.
        """
        return {_model for (_model, _complete) in self._complete_by_model.items() if _complete}

    def delete_files(self, categories=None, keep_rfac=0):
        """
        delete the files matching the list of categories.

        @param categories: set of file categories to delete.
            may include 'rfac' if bad r-factors should be deleted additionally (regardless of static category).
            in that case, bad models are deleted even if 'rfac' is not listed in self.categories_to_delete.
            defaults to self.categories_to_delete.

        @param keep_rfac: number of best models to keep if bad r-factors are to be deleted.
            the effective keep number is the greater of self.keep_rfac and this argument.

        @return: None
        """
        if categories is None:
            categories = self.categories_to_delete
        for cat in categories:
            self.delete_category(cat)
        if 'rfac' in categories:
            # the decision is based on the effective categories argument,
            # thus the check of self.categories_to_delete in delete_bad_rfac must be bypassed.
            self.delete_bad_rfac(keep=keep_rfac, force_delete=True)

    def delete_bad_rfac(self, keep=0, force_delete=False):
        """
        delete the files of all models except a specified number of good models.

        the method first determines which models to keep.
        models with R factor values of 0.0, without a specified R-factor, and
        the specified number of best ranking non-zero models are kept.
        the files belonging to the keeper models are kept, all others are deleted,
        regardless of category.
        files of incomplete models are also kept.

        the files are deleted from the list and the file system.

        files are deleted only if 'rfac' is specified in self.categories_to_delete
        or if force_delete is set to True.
        otherwise the method does nothing.

        @note if the effective keep number is 0, the threshold is taken from the end of the sorted list,
            i.e. it is the worst R factor, and no file is deleted.

        @param keep: number of files to keep.
            the effective keep number is the greater of self.keep_rfac and this argument.

        @param force_delete: delete the bad files even if 'rfac' is not selected in categories_to_delete.

        @return: None

        @todo should clean up rfac and model dictionaries from time to time.
        """
        if not (force_delete or 'rfac' in self.categories_to_delete):
            return

        keep = max(keep, self.keep_rfac)
        rfacs = [r for r in sorted(self._rfac_by_model.values()) if r > 0.0]
        try:
            # R factor of the last model to keep. everything worse than this is deleted.
            rfac_split = rfacs[keep-1]
        except IndexError:
            # fewer rated models than the keep number: nothing to delete
            return

        # items() instead of iteritems() works under Python 2 and Python 3.
        del_models = {_model for (_model, _rfac) in self._rfac_by_model.items() if _rfac > rfac_split}
        del_models &= self._get_complete_models()
        del_ids = {_id for (_id, _model) in self._model_by_id.items() if _model in del_models}
        for _id in del_ids:
            self.delete_file(_id)

    def delete_category(self, category):
        """
        delete all files of a specified category from the list and the file system.

        only files of complete models (cf. set_model_complete()) are deleted, but regardless of R-factor.

        @param category: (str) category.

        @return: None
        """
        complete_models = self._get_complete_models()
        # items() instead of iteritems() works under Python 2 and Python 3.
        del_ids = {_id for (_id, cat) in self._category_by_id.items() if cat == category}
        del_ids &= {_id for (_id, _model) in self._model_by_id.items() if _model in complete_models}
        # del_ids is a separate set, so the dictionaries can be modified while looping.
        for _id in del_ids:
            self.delete_file(_id)

    def delete_file(self, _id):
        """
        delete a specified file from the list and the file system.

        the file is identified by ID number.
        this method is unconditional. it does not consider category, completeness, nor R-factor.

        the file is removed from the list even if the operating system fails to delete it.
        in that case, a warning is logged.

        @param _id: (int) ID number of the file to delete.

        @return: None

        @raise KeyError if the ID number is not in the list.
        """
        # look up everything before the entries are removed. the values are needed for logging.
        path = self._path_by_id[_id]
        cat = self._category_by_id[_id]
        model = self._model_by_id[_id]
        del self._id_by_path[path]
        del self._path_by_id[_id]
        del self._model_by_id[_id]
        del self._category_by_id[_id]
        try:
            self._os_delete_file(path)
        except OSError:
            logger.warning("error deleting file {0}".format(path))
        else:
            logger.debug("delete file {0} ({1}, model {2})".format(path, cat, model))

    @staticmethod
    def _os_delete_file(path):
        """
        have the operating system delete a file path.

        this function is separate so that we can mock it in unit tests.

        @param path: OS path
        @return: None
        """
        os.remove(path)
|