""" @package pmsco.files manage files produced by pmsco. @author Matthias Muntwiler @copyright (c) 2016 by Paul Scherrer Institut @n Licensed under the Apache License, Version 2.0 (the "License"); @n you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 """ import os import logging import mpi4py logger = logging.getLogger(__name__) ## @var FILE_CATEGORIES # categories of generated files. # # these labels are used to decide which output files are kept or deleted after the calculation. # # each string of this set marks a category of files. # # @arg @c 'input' : raw input files for calculator, including cluster and phase files in custom format # @arg @c 'output' : raw output files from calculator # @arg @c 'phase' : phase files in portable format for report # @arg @c 'cluster' : cluster files in portable XYZ format for report # @arg @c 'log' : log files # @arg @c 'debug' : debug files # @arg @c 'model': output files in ETPAI format: complete simulation (a_-1_-1_-1_-1) # @arg @c 'scan' : output files in ETPAI format: scan (a_b_-1_-1_-1) # @arg @c 'symmetry' : output files in ETPAI format: symmetry (a_b_c_-1_-1) # @arg @c 'emitter' : output files in ETPAI format: emitter (a_b_c_d_-1) # @arg @c 'region' : output files in ETPAI format: region (a_b_c_d_e) # @arg @c 'report': final report of results # @arg @c 'population': final state of particle population # @arg @c 'rfac': files related to models which give bad r-factors (dynamic category, see below). # # @note @c 'rfac' is a dynamic category not connected to a particular file or content type. # no file should be marked @c 'rfac'. # the string is used only to specify whether bad models should be deleted or not. # if so, all files related to bad models are deleted, regardless of their static category. 
#
FILE_CATEGORIES = {'cluster', 'phase', 'input', 'output', 'report', 'region', 'emitter', 'scan', 'symmetry', 'model',
                   'log', 'debug', 'population', 'rfac'}

## @var FILE_CATEGORIES_TO_KEEP
# categories of files to be kept.
#
# this constant defines the default set of file categories that are kept after the calculation.
#
FILE_CATEGORIES_TO_KEEP = {'cluster', 'model', 'report', 'population'}

## @var FILE_CATEGORIES_TO_DELETE
# categories of files to be deleted.
#
# this constant defines the default set of file categories that are deleted after the calculation.
# it contains all values from FILE_CATEGORIES minus FILE_CATEGORIES_TO_KEEP.
# it is used to initialize Project.files_to_delete.
#
FILE_CATEGORIES_TO_DELETE = FILE_CATEGORIES - FILE_CATEGORIES_TO_KEEP


class FileTracker(object):
    """
    organize output files of calculations.

    the file manager stores references to data files generated during calculations
    and cleans up unused files according to a range of filter criteria.

    each tracked file has a unique id number.
    the internal dictionaries map between id, path, model number and category.
    a path is tracked under at most one id at any time.
    """

    ## @var categories_to_delete (set)
    # categories of generated files that should be deleted after the calculation.
    #
    # each string of this set marks a category of files to be deleted.
    # the complete set of recognized categories is files.FILE_CATEGORIES.
    # the default setting after initialization is a copy of files.FILE_CATEGORIES_TO_DELETE.
    #
    # in optimization modes, an output file is kept only
    # if its model produced one of the best R-factors and
    # its category is not listed in this set.
    # all other (bad R-factor) files are deleted regardless of their category.

    ## @var keep_rfac (int)
    # number of best models to keep.
    #
    # if @c 'rfac' is set in categories_to_delete,
    # all files of bad models (regardless of their category) are deleted.
    # this parameter specifies how many of the best models are kept.
    #
    # the default is 10.

    ## @var _last_id (int)
    # last used file identification number (incremental)

    ## @var _id_by_path (dict)
    # key = file path, value = file id

    ## @var _path_by_id (dict)
    # key = file id, value = file path

    ## @var _model_by_id (dict)
    # key = file id, value = model number

    ## @var _category_by_id (dict)
    # key = file id, value = category (str)

    ## @var _rfac_by_model (dict)
    # key = model number, value = R factor

    ## @var _complete_by_model (dict)
    # key = model number, value (boolean) = all calculations complete, files can be deleted

    def __init__(self):
        self._id_by_path = {}
        self._path_by_id = {}
        self._model_by_id = {}
        self._category_by_id = {}
        self._rfac_by_model = {}
        self._complete_by_model = {}
        self._last_id = 0
        # take a copy: in-place changes of one tracker must not alter the module-level default.
        self.categories_to_delete = set(FILE_CATEGORIES_TO_DELETE)
        self.keep_rfac = 10

    def add_file(self, path, model, category='default'):
        """
        add a new data file to the list.

        if the path is already in the list, the existing entry is replaced by the new one.

        @param path: (str) system path of the file relative to the working directory.
        @param model: (int) model number
        @param category: (str) file category, e.g. 'output', etc.
        @return: None
        """
        # a path may be tracked only once.
        # a second id for the same path would make delete_file() fail on the second deletion.
        self.remove_file(path)

        self._last_id += 1
        _id = self._last_id
        self._id_by_path[path] = _id
        self._path_by_id[_id] = path
        self._model_by_id[_id] = model
        self._category_by_id[_id] = category

    def rename_file(self, old_path, new_path):
        """
        rename a data file in the list.

        the method does not rename the file in the file system.

        @param old_path: must match an existing file path identically.
            if old_path is not in the list, the method does nothing.
        @param new_path: new path.
            if another file is already listed under this path, that entry is dropped from the list.
        @return: None
        """
        try:
            _id = self._id_by_path[old_path]
        except KeyError:
            return

        if new_path != old_path:
            # the entry that previously owned new_path is superseded by the renamed file.
            self.remove_file(new_path)
        del self._id_by_path[old_path]
        self._id_by_path[new_path] = _id
        self._path_by_id[_id] = new_path

    def remove_file(self, path):
        """
        remove a file from the list.

        the method does not delete the file from the file system.

        @param path: must match an existing file path identically.
            if path is not in the list, the method does nothing.
        @return: None
        """
        _id = self._id_by_path.pop(path, None)
        if _id is not None:
            del self._path_by_id[_id]
            del self._model_by_id[_id]
            del self._category_by_id[_id]

    def update_model_rfac(self, model, rfac):
        """
        update the stored R factors of all files that depend on a specified model.

        the model handler should set this flag if files with bad R factors should be deleted.
        by default (after adding files of a new model), the R factor is unset
        and delete_bad_rfac() will not act on that model.

        @param model: (int) model number.
        @param rfac: (float) new R factor
        @return: None
        """
        self._rfac_by_model[model] = rfac

    def set_model_complete(self, model, complete):
        """
        specify whether the calculations of a model are complete and its files can be deleted.

        the model handler must set this flag.
        by default (after adding files of a new model), it is False.

        @param model: (int) model number.
        @param complete: (bool) True if all calculations of the model are complete (files can be deleted).
        @return: None
        """
        self._complete_by_model[model] = complete

    def delete_files(self, categories=None, keep_rfac=0):
        """
        delete the files matching the list of categories.

        @param categories: set of file categories to delete.
            may include 'rfac' if bad r-factors should be deleted additionally (regardless of static category).
            in that case, the bad models are deleted even if 'rfac' is not set in self.categories_to_delete.
            defaults to self.categories_to_delete.
        @param keep_rfac: number of best models to keep if bad r-factors are to be deleted.
            the effective keep number is the greater of self.keep_rfac and this argument.
        @return: None
        """
        if categories is None:
            categories = self.categories_to_delete
        for cat in categories:
            self.delete_category(cat)
        if 'rfac' in categories:
            # the caller has asked for 'rfac' explicitly (or by default),
            # so the request must not depend on self.categories_to_delete a second time.
            self.delete_bad_rfac(keep=keep_rfac, force_delete=True)

    def delete_bad_rfac(self, keep=0, force_delete=False):
        """
        delete the files of all models except a specified number of good models.

        the method first determines which models to keep.
        models with R factor values of 0.0, without a specified R-factor,
        and the specified number of best ranking non-zero models are kept.
        the files belonging to the keeper models are kept, all others are deleted,
        regardless of category.
        files of incomplete models are also kept.

        the files are deleted from the list and the file system.

        files are deleted only if 'rfac' is specified in self.categories_to_delete
        or if force_delete is set to True.
        otherwise the method does nothing.

        @param keep: number of models to keep.
            the effective keep number is the greater of self.keep_rfac and this argument.
        @param force_delete: delete the bad files even if 'rfac' is not selected in categories_to_delete.
        @return: None

        @todo should clean up rfac and model dictionaries from time to time.
        """
        if not (force_delete or 'rfac' in self.categories_to_delete):
            return

        keep = max(keep, self.keep_rfac)
        rfacs = sorted(r for r in self._rfac_by_model.values() if r > 0.0)
        try:
            # R factor of the last model to keep.
            # note: an effective keep of 0 gives index -1 (the worst R factor), i.e. nothing is deleted.
            rfac_split = rfacs[keep - 1]
        except IndexError:
            # fewer rated models than should be kept: nothing to delete.
            return

        del_models = {_model for (_model, _rfac) in self._rfac_by_model.items() if _rfac > rfac_split}
        del_models &= self._complete_models()
        # collect the ids before deleting because delete_file() modifies the dictionaries.
        del_ids = [_id for (_id, _model) in self._model_by_id.items() if _model in del_models]
        for _id in del_ids:
            self.delete_file(_id)

    def delete_category(self, category):
        """
        delete all files of a specified category from the list and the file system.

        only files of complete models (cf. set_model_complete()) are deleted, but regardless of R-factor.

        @param category: (str) category.
        @return: None
        """
        complete_models = self._complete_models()
        # collect the ids before deleting because delete_file() modifies the dictionaries.
        del_ids = [_id for (_id, cat) in self._category_by_id.items()
                   if cat == category and self._model_by_id[_id] in complete_models]
        for _id in del_ids:
            self.delete_file(_id)

    def delete_file(self, _id):
        """
        delete a specified file from the list and the file system.

        the file is identified by ID number.
        this method is unconditional. it does not consider category, completeness, nor R-factor.

        a failure of the operating system to delete the file is logged as a warning and not raised.
        the file is removed from the list in either case.

        @param _id: (int) ID number of the file to delete.
        @return: None
        @raise KeyError if the ID number is not in the list.
        """
        # look up everything first so that an unknown id leaves the list unchanged.
        path = self._path_by_id[_id]
        cat = self._category_by_id[_id]
        model = self._model_by_id[_id]

        del self._id_by_path[path]
        del self._path_by_id[_id]
        del self._model_by_id[_id]
        del self._category_by_id[_id]

        try:
            self._os_delete_file(path)
        except OSError:
            logger.warning("error deleting file %s", path)
        else:
            logger.debug("delete file %s (%s, model %s)", path, cat, model)

    def _complete_models(self):
        """
        return the set of model numbers that are flagged complete (cf. set_model_complete()).

        @return: (set) model numbers.
        """
        return {_model for (_model, _complete) in self._complete_by_model.items() if _complete}

    @staticmethod
    def _os_delete_file(path):
        """
        have the operating system delete a file path.

        this function is separate so that we can mock it in unit tests.

        @param path: OS path
        @return: None
        """
        os.remove(path)