pmsco-public/pmsco/files.py

"""
@package pmsco.files
manage the lifetime of files produced by pmsco.

@author Matthias Muntwiler

@copyright (c) 2016-18 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import logging
import mpi4py

# module-level logger, named after this module.
logger = logging.getLogger(__name__)

## @var FILE_CATEGORIES
# categories of generated files.
#
# these labels are used to decide which output files are kept or deleted after the calculation.
#
# each string of this set marks a category of files.
#
# @arg 'input' :     raw input files for calculator, including cluster and atomic files in custom format
# @arg 'output' :    raw output files from calculator
# @arg 'atomic' :    atomic scattering (phase, emission) files in portable format
# @arg 'cluster' :   cluster files in portable XYZ format for report
# @arg 'log' :       log files
# @arg 'debug' :     debug files
# @arg 'model':      output files in ETPAI format: complete simulation  (a_-1_-1_-1_-1)
# @arg 'scan' :      output files in ETPAI format: scan (a_b_-1_-1_-1)
# @arg 'domain' :    output files in ETPAI format: domain (a_b_c_-1_-1)
# @arg 'emitter' :   output files in ETPAI format: emitter (a_b_c_d_-1)
# @arg 'region' :    output files in ETPAI format: region (a_b_c_d_e)
# @arg 'report':     final report of results
# @arg 'population': final state of particle population
# @arg 'rfac':       files related to models which give bad r-factors (dynamic category, see below).
#
# @note @c 'rfac' is a dynamic category not connected to a particular file or content type.
# no file should be marked @c 'rfac'.
# the string is used only to specify whether bad models should be deleted or not.
# if so, all files related to bad models are deleted, regardless of their static category.
#
# @note this is a regular (mutable) set. treat it as a constant and do not modify it in place.
#
FILE_CATEGORIES = {'cluster', 'atomic', 'input', 'output',
                   'report', 'region', 'emitter', 'scan', 'domain', 'model',
                   'log', 'debug', 'population', 'rfac'}

## @var FILE_CATEGORIES_TO_KEEP
# categories of files to be kept.
#
# this constant defines the default set of file categories that are kept after the calculation.
#
FILE_CATEGORIES_TO_KEEP = {'cluster', 'model', 'scan', 'report', 'population'}

## @var FILE_CATEGORIES_TO_DELETE
# categories of files to be deleted.
#
# this constant defines the default set of file categories that are deleted after the calculation.
# it contains all values from FILE_CATEGORIES minus FILE_CATEGORIES_TO_KEEP.
# it is used to initialize Project.files_to_delete.
#
# the set difference is evaluated once when the module is imported.
# later changes of FILE_CATEGORIES or FILE_CATEGORIES_TO_KEEP are not reflected here.
#
FILE_CATEGORIES_TO_DELETE = FILE_CATEGORIES - FILE_CATEGORIES_TO_KEEP


class FileTracker(object):
    """
    manage the lifetime of files produced by the calculations.

    the file manager stores references to data files generated during calculations
    and cleans up unused files according to a range of filter criteria.

    this class identifies files by _file name_.
    file names must therefore be unique over the whole calculation process.
    it is possible to specify a full path that is used for communication with the operating system.
    """

    ## @var categories_to_delete (set)
    # categories of generated files that should be deleted after the calculation.
    #
    # each string of this set marks a category of files to be deleted.
    # the complete set of recognized categories is files.FILE_CATEGORIES.
    # the default setting after initialization is a copy of files.FILE_CATEGORIES_TO_DELETE.
    #
    # in optimization modes, an output file is kept only
    # if its model produced one of the best R-factors and
    # its category is not listed in this set.
    # all other (bad R-factor) files are deleted regardless of their category.

    ## @var keep_rfac (int)
    # number of best models to keep.
    #
    # if @c 'rfac' is set in categories_to_delete,
    # all files of bad models (regardless of their category) are deleted.
    # this parameter specifies how many of the best models are kept.
    #
    # the default is 10.

    ## @var _file_model (dict)
    # key = file name, value = model number

    ## @var _file_category (dict)
    # key = file name, value = category (str)

    ## @var _file_path (dict)
    # key = file name, value = absolute file path (str)

    ## @var _rfac_by_model (dict)
    # key = model number, value = R-factor

    ## @var _complete_models (set)
    # this set contains the model numbers of the models that have finished all calculations.
    # files of these models can be considered for clean up.

    def __init__(self):
        """
        create an empty tracker with default clean-up settings.
        """
        self._file_model = {}
        self._file_category = {}
        self._file_path = {}
        self._rfac_by_model = {}
        self._complete_models = set()
        # copy the default set so that in-place changes of this attribute
        # do not leak into the module-level constant or into other trackers.
        self.categories_to_delete = set(FILE_CATEGORIES_TO_DELETE)
        self.keep_rfac = 10

    def get_file_count(self):
        """
        return the number of tracked files.

        @return: (int) number of tracked files.
        """
        return len(self._file_path)

    def get_complete_models_count(self):
        """
        return the number of complete models.

        @return: (int) number of complete models.
        """
        return len(self._complete_models)

    def add_file(self, name, model, category='default', path=''):
        """
        add a new data file to the list.

        if the name is already in the list, its model, category and path are overwritten.

        @param name: (str) unique identification of the file.
            this can be the file name in the file system if file names are unique without path specification.
            the name must be spelled identically
            whenever the same file is referenced in a call to another method of this class.
            the empty string is ignored.

        @param model: (int) model number

        @param category: (str) file category, e.g. 'output', etc.

        @param path: (str) file system path of the file.
            the file system path is used for communication with the operating system when the file is deleted.

            by default, the path is the name argument expanded to a full path relative to the current working directory.
            the path is expanded during the call of this method and will not change when the working directory changes.

        @return: None
        """
        if name:
            self._file_model[name] = model
            self._file_category[name] = category
            self._file_path[name] = path if path else os.path.abspath(name)

    def rename_file(self, old_name, new_name, new_path=''):
        """
        rename a data file in the list.

        the method does not rename the file in the file system.

        @param old_name: name used in the original add_file() call.
            if it is not in the list, the method does nothing.

        @param new_name: new name of the file, see add_file().
            if the file is already in the list, its model and category is overwritten by the values of the old file.
            if the new name is empty, the old entry is removed and no new entry is added
            (add_file() ignores empty names).

        @param new_path: new file system path of the file, see add_file().
            by default, the path is the name argument expanded to a full path relative to the current working directory.

        @return: None
        """
        try:
            model = self._file_model[old_name]
            cat = self._file_category[old_name]
        except KeyError:
            # unknown file: nothing to rename.
            return

        self.remove_file(old_name)
        self.add_file(new_name, model, cat, new_path)

    def remove_file(self, name):
        """
        remove a file from the list.

        the method does not delete the file from the file system.

        @param name: must match an existing file name identically.
            if the name is not found in the list, the method does nothing.

        @return:  None
        """
        # pop with default keeps the three dictionaries in sync
        # even if one of them is missing the entry.
        self._file_model.pop(name, None)
        self._file_category.pop(name, None)
        self._file_path.pop(name, None)

    def update_model_rfac(self, model, rfac):
        """
        update the stored R factors of all files that depend on a specified model.
        the model handler should set this flag if files with bad R factors should be deleted.
        by default (after adding files of a new model), the R factor is unset and
        delete_bad_rfac() will not act on that model.

        @param model: (int) model number.
        @param rfac: (float) new R factor
        @return: None
        """
        self._rfac_by_model[model] = rfac

    def set_model_complete(self, model, complete):
        """
        specify whether the calculations of a model are complete and its files can be deleted.
        the model handler must set this flag.
        by default (after adding files of a new model), it is False.

        @param model: (int) model number.
        @param complete: (bool) True if all calculations of the model are complete (files can be deleted).
        @return: None
        """
        if complete:
            self._complete_models.add(model)
        else:
            self._complete_models.discard(model)

    def delete_files(self, categories=None, incomplete_models=False):
        """
        delete all files matching a set of categories.

        this function deletes all files that are tagged with one of the given categories.
        tags are set by the code sections that create the files.
        for a list of common categories, see FILE_CATEGORIES.
        the categories can be given as an argument or taken from the categories_to_delete property.

        files are deleted regardless of R-factor.
        be sure to specify only categories that you don't need in the output at all.

        by default, only files of complete models (cf. set_model_complete()) are deleted
        to avoid interference with running calculations.
        to clean up after calculations, the incomplete_models argument can override this.

        @note this method does not act on the special 'rfac' category (see delete_bad_rfac()).

        @param categories: set of file categories to delete.
            if the argument is None, it defaults to the categories_to_delete property.

        @param incomplete_models: (bool) delete files of incomplete models as well.
            by default (False), incomplete models are not deleted.

        @return: None
        """
        if categories is None:
            categories = self.categories_to_delete
        for cat in categories:
            self.delete_category(cat, incomplete_models=incomplete_models)

    def delete_bad_rfac(self, keep=0, force_delete=False):
        """
        delete all files of all models except for a specified number of best ranking models.

        the method first determines the R-factor threshold of the models to keep.
        the specified number of best ranking non-zero models are kept.
        in addition, incomplete models, models with R factor = 0.0,
        and those without a specified R-factor are kept.
        the files of all other models are deleted.
        the method does not consider the file category.

        the files are deleted from the list and the file system.

        the method executes only if 'rfac' is specified in self.categories_to_delete
        or if force_delete is  True.
        otherwise the method does nothing.

        if fewer positive R-factors than the effective keep number are known, nothing is deleted.

        @note if the effective keep number is 0, the threshold is taken from the last (worst) entry
            of the sorted R-factor list, i.e., no model with a non-negative R-factor is deleted.

        @param keep: number of files to keep.
            the effective keep number is the greater of self.keep_rfac and this argument.

        @param force_delete: delete the bad files even if 'rfac' is not selected in categories_to_delete.

        @return: None
        """
        if not (force_delete or 'rfac' in self.categories_to_delete):
            return

        keep = max(keep, self.keep_rfac)
        rfacs = [r for r in sorted(self._rfac_by_model.values()) if r > 0.0]
        try:
            rfac_split = rfacs[keep-1]
        except IndexError:
            # not enough rated models yet - keep everything.
            return

        # only complete models with a known R-factor outside of the accepted range are deleted.
        # models without an R-factor are not in _rfac_by_model and must not be touched.
        del_models = {model for model in self._complete_models
                      if model in self._rfac_by_model
                      and not 0.0 <= self._rfac_by_model[model] <= rfac_split}
        del_names = {name for (name, model) in self._file_model.items() if model in del_models}
        for name in del_names:
            self.delete_file(name)

    def delete_models(self, keep=None, delete=None):
        """
        delete all files by model.

        this involves the following steps:
        1. determine a list of complete models
           (incomplete models are still being processed and must not be deleted).
        2. intersect with the _delete_ list if specified.
        3. subtract the _keep_ list if specified.

        if neither the _keep_ nor the _delete_ list is specified,
        or if the steps above resolve to the _complete_ list
        the method considers it as an error and does nothing.

        @param keep: (sequence) model numbers to keep, i.e., delete all others.
            any iterable of model numbers (list, tuple, set) is accepted.
            None or an empty sequence counts as not specified.

        @param delete: (sequence) model numbers to delete.
            any iterable of model numbers (list, tuple, set) is accepted.
            None or an empty sequence counts as not specified.

        @return (int) number of models deleted.
        """
        del_models = set(self._complete_models)
        # the set methods accept arbitrary iterables,
        # whereas the &= and -= operators raise a TypeError for lists and tuples.
        if delete:
            del_models.intersection_update(delete)
        if keep:
            del_models.difference_update(keep)
        if not del_models or del_models == self._complete_models:
            return 0

        del_names = {name for (name, model) in self._file_model.items() if model in del_models}
        for name in del_names:
            self.delete_file(name)

        return len(del_models)

    def delete_category(self, category, incomplete_models=False):
        """
        delete all files of a specified category from the list and the file system.

        this function deletes all files that are tagged with the given category.
        tags are set by the code sections that create the files.
        for a list of common categories, see FILE_CATEGORIES.

        files are deleted regardless of R-factor.
        be sure to specify only categories that you don't need in the output at all.

        by default, only files of complete models (cf. set_model_complete()) are deleted
        to avoid interference with running calculations.
        to clean up after calculations, the incomplete_models argument can override this.

        @param category: (str) category.
            should be one of FILE_CATEGORIES. otherwise, the function has no effect.

        @param incomplete_models: (bool) delete files of incomplete models as well.
            by default (False), incomplete models are not deleted.

        @return: None
        """
        del_names = {name for (name, cat) in self._file_category.items() if cat == category}
        if not incomplete_models:
            # restrict to files whose model has finished all calculations.
            del_names &= {name for (name, model) in self._file_model.items() if model in self._complete_models}
        for name in del_names:
            self.delete_file(name)

    def delete_file(self, name):
        """
        delete a specified file from the list and the file system.

        this method is unconditional. it does not consider category, completeness, nor R-factor.

        the method catches errors during file deletion and prints warnings to the logger.
        the file is removed from the list even if it cannot be deleted from the file system.

        @param name: must match an existing file name identically.
            if it is not in the list, the method logs a warning and does nothing else.
            the method uses the associated path declared in add_file() to delete the file.

        @return: None
        """
        try:
            cat = self._file_category[name]
            model = self._file_model[name]
            path = self._file_path[name]
        except KeyError:
            logger.warning("tried to delete untracked file {0}".format(name))
        else:
            del self._file_model[name]
            del self._file_category[name]
            del self._file_path[name]
            try:
                os.remove(path)
            except OSError:
                logger.warning("file system error deleting file {0}".format(path))
            else:
                logger.debug("delete file {0} ({1}, model {2})".format(path, cat, model))


def list_files_other_models(prefix, models):
    """
    list input/output files except those of the given models.

    this can be used to clean up all files except those belonging to the given models.

    the function scans the current working directory (not recursively).
    a file is listed if its name consists of exactly six underscore-separated elements,
    the first element is equal to the prefix,
    and the second element is an integer model number that is not contained in models.
    directories and files with a non-numeric model element are skipped.

    to delete the listed files:

        for f in files:
            os.remove(f)

    @param prefix: file name prefix up to the first underscore.
        only files starting with this prefix are listed.

    @param models: sequence or set of model numbers that should not be listed.

    @return: set of file names
    """
    file_names = set()
    # the with statement closes the directory iterator even if an error occurs.
    with os.scandir() as entries:
        for entry in entries:
            # is_file is a method. testing the bare attribute is always true
            # and would let directories pass.
            if not entry.is_file():
                continue
            elements = entry.name.split('_')
            if len(elements) != 6 or elements[0] != prefix:
                continue
            try:
                model = int(elements[1])
            except ValueError:
                # second element is not a model number - not one of our files.
                continue
            if model not in models:
                file_names.add(entry.name)
    return file_names