437 lines
17 KiB
Python
437 lines
17 KiB
Python
"""
|
|
@package pmsco.files
|
|
manage the lifetime of files produced by pmsco.
|
|
|
|
@author Matthias Muntwiler
|
|
|
|
@copyright (c) 2016-18 by Paul Scherrer Institut @n
|
|
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
import os
|
|
import logging
|
|
import mpi4py
|
|
|
|
logger = logging.getLogger(__name__)

## @var FILE_CATEGORIES
# labels that classify the files generated during a calculation.
#
# the labels decide which output files survive the clean-up after the calculation.
# every string in this set stands for one category of files.
#
# @arg 'input' : raw input files for calculator, including cluster and atomic files in custom format
# @arg 'output' : raw output files from calculator
# @arg 'atomic' : atomic scattering (phase, emission) files in portable format
# @arg 'cluster' : cluster files in portable XYZ format for report
# @arg 'log' : log files
# @arg 'debug' : debug files
# @arg 'model': output files in ETPAI format: complete simulation (a_-1_-1_-1_-1)
# @arg 'scan' : output files in ETPAI format: scan (a_b_-1_-1_-1)
# @arg 'domain' : output files in ETPAI format: domain (a_b_c_-1_-1)
# @arg 'emitter' : output files in ETPAI format: emitter (a_b_c_d_-1)
# @arg 'region' : output files in ETPAI format: region (a_b_c_d_e)
# @arg 'report': final report of results
# @arg 'population': final state of particle population
# @arg 'rfac': files related to models which give bad r-factors (dynamic category, see below).
#
# @note @c 'rfac' is a dynamic category that is not tied to a particular file or content type.
# no file should be marked @c 'rfac'.
# the string only selects whether bad models are deleted or not.
# if it is selected, all files of bad models are deleted, whatever their static category is.
#
FILE_CATEGORIES = {
    'input', 'output', 'atomic', 'cluster',
    'log', 'debug',
    'model', 'scan', 'domain', 'emitter', 'region',
    'report', 'population',
    'rfac',
}

## @var FILE_CATEGORIES_TO_KEEP
# categories of files to be kept.
#
# default set of file categories that remain on disk after the calculation.
#
FILE_CATEGORIES_TO_KEEP = {
    'model', 'scan', 'cluster', 'report', 'population',
}

## @var FILE_CATEGORIES_TO_DELETE
# categories of files to be deleted.
#
# default set of file categories that are removed after the calculation:
# every member of FILE_CATEGORIES that is not in FILE_CATEGORIES_TO_KEEP.
# it is used to initialize Project.files_to_delete.
#
FILE_CATEGORIES_TO_DELETE = FILE_CATEGORIES.difference(FILE_CATEGORIES_TO_KEEP)
|
|
|
|
|
|
class FileTracker(object):
    """
    manage the lifetime of files produced by the calculations.

    the file manager stores references to data files generated during calculations
    and cleans up unused files according to a range of filter criteria.

    this class identifies files by _file name_.
    file names must therefore be unique over the whole calculation process.
    it is possible to specify a full path that is used for communication with the operating system.
    """

    ## @var categories_to_delete (set)
    # categories of generated files that should be deleted after the calculation.
    #
    # each string of this set marks a category of files to be deleted.
    # the complete set of recognized categories is files.FILE_CATEGORIES.
    # the default setting after initialization is a copy of files.FILE_CATEGORIES_TO_DELETE.
    #
    # in optimization modes, an output file is kept only
    # if its model produced one of the best R-factors and
    # its category is not listed in this set.
    # all other (bad R-factor) files are deleted regardless of their category.

    ## @var keep_rfac (int)
    # number of best models to keep.
    #
    # if @c 'rfac' is set in categories_to_delete,
    # all files of bad models (regardless of their category) are deleted.
    # this parameter specifies how many of the best models are kept.
    #
    # the default is 10.

    ## @var _file_model (dict)
    # key = file name, value = model number

    ## @var _file_category (dict)
    # key = file name, value = category (str)

    ## @var _file_path (dict)
    # key = file name, value = absolute file path (str)

    ## @var _rfac_by_model (dict)
    # key = model number, value = R-factor

    ## @var _complete_models (set)
    # this set contains the model numbers of the models that have finished all calculations.
    # files of these models can be considered for clean up.

    def __init__(self):
        self._file_model = {}
        self._file_category = {}
        self._file_path = {}
        self._rfac_by_model = {}
        self._complete_models = set()
        # take a private copy: in-place changes on one tracker must not alter the module-level default.
        self.categories_to_delete = set(FILE_CATEGORIES_TO_DELETE)
        self.keep_rfac = 10

    def get_file_count(self):
        """
        return the number of tracked files.

        @return: (int) number of tracked files.
        """
        return len(self._file_path)

    def get_complete_models_count(self):
        """
        return the number of complete models.

        @return: (int) number of complete models.
        """
        return len(self._complete_models)

    def add_file(self, name, model, category='default', path=''):
        """
        add a new data file to the list.

        @param name: (str) unique identification of the file.
            this can be the file name in the file system if file names are unique without path specification.
            the name must be spelled identically
            whenever the same file is referenced in a call to another method of this class.
            the empty string is ignored.

        @param model: (int) model number

        @param category: (str) file category, e.g. 'output', etc.

        @param path: (str) file system path of the file.
            the file system path is used for communication with the operating system when the file is deleted.

            by default, the path is the name argument expanded to a full path relative to the current working directory.
            the path is expanded during the call of this method and will not change when the working directory changes.

        @return: None
        """
        if name:
            self._file_model[name] = model
            self._file_category[name] = category
            self._file_path[name] = path if path else os.path.abspath(name)

    def rename_file(self, old_name, new_name, new_path=''):
        """
        rename a data file in the list.

        the method does not rename the file in the file system.

        @param old_name: name used in the original add_file() call.
            if it is not in the list, the method does nothing.

        @param new_name: new name of the file, see add_file().
            if the file is already in the list, its model and category is overwritten by the values of the old file.
            note that an empty new name is ignored by add_file(), i.e. the old entry is dropped without replacement.

        @param new_path: new file system path of the file, see add_file().
            by default, the path is the name argument expanded to a full path relative to the current working directory.

        @return: None
        """
        try:
            model = self._file_model[old_name]
            cat = self._file_category[old_name]
        except KeyError:
            return

        self.remove_file(old_name)
        self.add_file(new_name, model, cat, new_path)

    def remove_file(self, name):
        """
        remove a file from the list.

        the method does not delete the file from the file system.

        @param name: must match an existing file name identically.
            if the name is not found in the list, the method does nothing.

        @return: None
        """
        # pop from every registry independently so that the three dictionaries cannot get out of sync.
        for registry in (self._file_model, self._file_category, self._file_path):
            registry.pop(name, None)

    def update_model_rfac(self, model, rfac):
        """
        store the R factor of a specified model.

        the model handler should call this method if files with bad R factors should be deleted.
        by default (after adding files of a new model), the R factor is unset and
        delete_bad_rfac() will not act on that model.

        @param model: (int) model number.
        @param rfac: (float) new R factor
        @return: None
        """
        self._rfac_by_model[model] = rfac

    def set_model_complete(self, model, complete):
        """
        specify whether the calculations of a model are complete and its files can be deleted.

        the model handler must set this flag.
        by default (after adding files of a new model), it is False.

        @param model: (int) model number.
        @param complete: (bool) True if all calculations of the model are complete (files can be deleted).
        @return: None
        """
        if complete:
            self._complete_models.add(model)
        else:
            self._complete_models.discard(model)

    def delete_files(self, categories=None, incomplete_models=False):
        """
        delete all files matching a set of categories.

        this function deletes all files that are tagged with one of the given categories.
        tags are set by the code sections that create the files.
        for a list of common categories, see FILE_CATEGORIES.
        the categories can be given as an argument or taken from the categories_to_delete property.

        files are deleted regardless of R-factor.
        be sure to specify only categories that you don't need in the output at all.

        by default, only files of complete models (cf. set_model_complete()) are deleted
        to avoid interference with running calculations.
        to clean up after calculations, the incomplete_models argument can override this.

        @note this method does not act on the special 'rfac' category (see delete_bad_rfac()).

        @param categories: set of file categories to delete.
            if the argument is None, it defaults to the categories_to_delete property.

        @param incomplete_models: (bool) delete files of incomplete models as well.
            by default (False), incomplete models are not deleted.

        @return: None
        """
        if categories is None:
            categories = self.categories_to_delete
        for cat in categories:
            self.delete_category(cat, incomplete_models=incomplete_models)

    def delete_bad_rfac(self, keep=0, force_delete=False):
        """
        delete all files of all models except for a specified number of best ranking models.

        the method first determines which models to keep.
        the specified number of best ranking non-zero models are kept.
        in addition, incomplete models, models with R factor = 0.0,
        and those without a specified R-factor are kept.
        all other files are deleted.
        the method does not consider the file category.

        the files are deleted from the list and the file system.

        the method executes only if 'rfac' is specified in self.categories_to_delete
        or if force_delete is True.
        otherwise the method does nothing.

        if fewer positive R-factors are known than models should be kept, the method does nothing.

        @param keep: number of files to keep.
            the effective keep number is the greater of self.keep_rfac and this argument.
            if the effective number is zero or negative, only models with R factor = 0.0 pass the ranking.

        @param force_delete: delete the bad files even if 'rfac' is not selected in categories_to_delete.

        @return: None
        """
        if not (force_delete or 'rfac' in self.categories_to_delete):
            return

        keep = max(keep, self.keep_rfac)
        if keep > 0:
            rfacs = sorted(r for r in self._rfac_by_model.values() if r > 0.0)
            try:
                rfac_split = rfacs[keep - 1]
            except IndexError:
                # not enough rated models yet - everything ranks among the best.
                return
        else:
            # a non-positive index would wrap around to the worst R-factor and keep everything.
            rfac_split = 0.0

        # only complete models that have an R-factor outside the accepted range are deleted.
        # models without an R-factor are not touched.
        del_models = {model for model in self._complete_models
                      if model in self._rfac_by_model
                      and not 0.0 <= self._rfac_by_model[model] <= rfac_split}
        self._delete_files_of_models(del_models)

    def delete_models(self, keep=None, delete=None):
        """
        delete all files by model.

        this involves the following steps:
        1. determine a list of complete models
           (incomplete models are still being processed and must not be deleted).
        2. intersect with the _delete_ list if specified.
        3. subtract the _keep_ list if specified.

        if neither the _keep_ nor the _delete_ list is specified,
        or if the steps above resolve to the _complete_ list
        the method considers it as an error and does nothing.

        @param keep: (sequence) model numbers to keep, i.e., delete all others.
            any iterable of model numbers (list, tuple, set) is accepted.

        @param delete: (sequence) model numbers to delete.
            any iterable of model numbers (list, tuple, set) is accepted.

        @return (int) number of models deleted.
        """
        del_models = set(self._complete_models)
        # the in-place set operators accept sets only, so convert the sequences explicitly.
        if delete:
            del_models &= set(delete)
        if keep:
            del_models -= set(keep)
        if not del_models or del_models == self._complete_models:
            return 0

        self._delete_files_of_models(del_models)
        return len(del_models)

    def delete_category(self, category, incomplete_models=False):
        """
        delete all files of a specified category from the list and the file system.

        this function deletes all files that are tagged with the given category.
        tags are set by the code sections that create the files.
        for a list of common categories, see FILE_CATEGORIES.

        files are deleted regardless of R-factor.
        be sure to specify only categories that you don't need in the output at all.

        by default, only files of complete models (cf. set_model_complete()) are deleted
        to avoid interference with running calculations.
        to clean up after calculations, the incomplete_models argument can override this.

        @param category: (str) category.
            should be one of FILE_CATEGORIES. otherwise, the function has no effect.

        @param incomplete_models: (bool) delete files of incomplete models as well.
            by default (False), incomplete models are not deleted.

        @return: None
        """
        del_names = {name for (name, cat) in self._file_category.items() if cat == category}
        if not incomplete_models:
            del_names &= {name for (name, model) in self._file_model.items() if model in self._complete_models}
        for name in del_names:
            self.delete_file(name)

    def delete_file(self, name):
        """
        delete a specified file from the list and the file system.

        this method is unconditional. it does not consider category, completeness, nor R-factor.

        the method catches errors during file deletion and prints warnings to the logger.

        @param name: must match an existing file name identically.
            if it is not in the list, the method logs a warning and does nothing else.
            the method uses the associated path declared in add_file() to delete the file.

        @return: None
        """
        try:
            cat = self._file_category[name]
            model = self._file_model[name]
            path = self._file_path[name]
        except KeyError:
            logger.warning("tried to delete untracked file {0}".format(name))
            return

        # the entry is dropped from the list even if the file system operation fails afterwards.
        self.remove_file(name)
        try:
            os.remove(path)
        except OSError:
            logger.warning("file system error deleting file {0}".format(path))
        else:
            logger.debug("delete file {0} ({1}, model {2})".format(path, cat, model))

    def _delete_files_of_models(self, models):
        """
        delete all tracked files that belong to one of the given models.

        @param models: (set) model numbers whose files are deleted from the list and the file system.

        @return: None
        """
        # collect the names first because delete_file() modifies the dictionary.
        del_names = {name for (name, model) in self._file_model.items() if model in models}
        for name in del_names:
            self.delete_file(name)
|
|
|
|
|
|
def list_files_other_models(prefix, models):
    """
    list input/output files except those of the given models.

    this can be used to clean up all files except those belonging to the given models.

    to delete the listed files:

        for f in files:
            os.remove(f)

    the function scans the current working directory (not recursively).
    a file is listed if its name splits into exactly six underscore-separated elements,
    the first element equals the prefix,
    and the second element is an integer that is not contained in models.
    directories and files whose second element is not an integer are skipped.

    @param prefix: file name prefix up to the first underscore.
        only files starting with this prefix are listed.

    @param models: sequence or set of model numbers that should not be listed.

    @return: set of file names
    """
    file_names = set()
    for entry in os.scandir():
        # is_file is a method. the bare attribute is always true and would let directories pass.
        if not entry.is_file():
            continue
        elements = entry.name.split('_')
        if len(elements) != 6 or elements[0] != prefix:
            continue
        try:
            model = int(elements[1])
        except ValueError:
            # second element is not a model number - not one of our files.
            continue
        if model not in models:
            file_names.add(entry.name)
    return file_names
|