428 lines
15 KiB
Python

"""
@package pmsco.grid
grid search optimization handler.
the module starts multiple MSC calculations and varies parameters on a fixed coordinate grid.
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import math
import numpy as np
import logging
from pmsco.compat import open
import pmsco.handlers as handlers
import pmsco.graphics as graphics
from pmsco.helpers import BraceMessage as BMsg
logger = logging.getLogger(__name__)
class GridPopulation(object):
    """
    grid population.

    holds the flat list of all grid points (models) of a grid search
    together with the calculated R-factor of each point.
    """

    ## @var model_start
    # (dict) initial model parameters.
    # read-only. call setup() to change this attribute.

    ## @var model_min
    # (dict) low limits of the model parameters.
    # read-only. call setup() to change this attribute.

    ## @var model_max
    # (dict) high limits of the model parameters.
    # if min == max, the parameter is kept constant.
    # read-only. call setup() to change this attribute.

    ## @var model_step
    # (dict) step size (grid spacing) of each model parameter.
    # read-only. call setup() to change this attribute.

    ## @var model_count
    # number of models (grid points).
    # initial value = 0.

    ## @var positions
    # (numpy.ndarray) flat list of grid coordinates and results.
    #
    # the column names include the names of the model parameters, taken from model_space.start,
    # and the special names @c '_model', @c '_rfac'.
    # the special fields have the following meanings:
    #
    # * @c '_model': model number.
    #   the model number identifies the grid point.
    #   the field is used to associate the result of a calculation with the coordinate vector.
    #   the model handlers use it to derive their model ID.
    #
    # * @c '_rfac': calculated R-factor for this position.
    #   it is set by the add_result() method.
    #
    # @note if you read a single element, e.g. pos[0], from the array, you will get a numpy.void object.
    # this object is a <em>view</em> of the original array item.

    def __init__(self):
        """
        initialize the population object.
        """
        self.model_start = {}
        self.model_min = {}
        self.model_max = {}
        self.model_step = {}
        self.model_count = 0
        self.positions = None
        self.search_keys = []
        self.fixed_keys = []

    @staticmethod
    def get_model_dtype(model_params):
        """
        get numpy array data type for model parameters and grid control variables.

        @param model_params: dictionary of model parameters or list of parameter names.

        @return: dtype for use with numpy array constructors.
            this is a list of (name, type) tuples,
            sorted by case-insensitive field name.
        """
        dt = [(key, 'f4') for key in model_params]
        dt.append(('_model', 'i4'))
        dt.append(('_rfac', 'f4'))
        dt.sort(key=lambda t: t[0].lower())
        return dt

    def setup(self, model_space):
        """
        set up the population and result arrays.

        @param model_space: (pmsco.project.ModelSpace)
            definition of initial and limiting model parameters
            expected by the cluster and parameters functions.
            the attributes have the following meanings:
            @arg start: values of the fixed parameters.
            @arg min: minimum values allowed.
            @arg max: maximum values allowed.
                if abs(max - min) < step/2 , the parameter is kept constant.
            @arg step: step size (distance between two grid points).
                if step <= 0, the parameter is kept constant.
        """
        self.model_start = model_space.start
        self.model_min = model_space.min
        self.model_max = model_space.max
        self.model_step = model_space.step
        self.model_count = 1
        self.search_keys = []
        self.fixed_keys = []

        scales = []
        for p in model_space.step.keys():
            if model_space.step[p] > 0:
                n = int(np.round((model_space.max[p] - model_space.min[p]) / model_space.step[p]) + 1)
            else:
                n = 1
            if n > 1:
                self.search_keys.append(p)
                scales.append(np.linspace(model_space.min[p], model_space.max[p], n))
            else:
                self.fixed_keys.append(p)

        # scales is a list of 1D arrays that hold the coordinates of the individual dimensions
        # positions_nd is a list of N-D arrays that hold the coordinates in all multiple dimensions
        # positions_flat is a list of 1D arrays that hold the coordinates in flat sequence
        if len(scales) > 1:
            positions_nd = np.meshgrid(*scales, indexing='ij')
            positions_flat = [arr.flatten() for arr in positions_nd]
        elif len(scales) == 1:
            positions_flat = scales
        else:
            # degenerate case: every parameter is fixed.
            # use one placeholder coordinate array so that the grid contains a single point.
            positions_flat = [np.zeros(1)]
        self.model_count = positions_flat[0].shape[0]

        # shuffle the calculation order so that we may see the more interesting parts earlier
        shuffle_index = np.arange(self.model_count)
        np.random.shuffle(shuffle_index)
        positions_reordered = [pos[shuffle_index] for pos in positions_flat]

        dt = self.get_model_dtype(self.model_min)

        # positions
        self.positions = np.zeros(self.model_count, dtype=dt)
        for idx, key in enumerate(self.search_keys):
            self.positions[key] = positions_reordered[idx]
        for key in self.fixed_keys:
            self.positions[key] = self.model_start[key]
        self.positions['_model'] = np.arange(self.model_count)
        # initialize R-factors with an out-of-range marker value (valid R-factors are <= 2).
        self.positions['_rfac'] = 2.1

    def add_result(self, particle, rfac):
        """
        add a calculation result to the results array.

        @param particle: dictionary of model parameters and particle values.
            the keys must correspond to the columns of the pos array,
            i.e. the names of the model parameters plus the _rfac, and _model fields.
            only the '_model' field is read; it selects the row to update.

        @param rfac: calculated R-factor.
            the R-factor is written to the '_rfac' field.

        @return None
        """
        # rows are ordered by model number (see setup()), so the model number is also the row index.
        model = particle['_model']
        self.positions['_rfac'][model] = rfac

    def save_array(self, filename, array):
        """
        save a population array to a text file.

        @param filename: path of the destination file.

        @param array: population array to save.
            normally self.positions.
        """
        header = " ".join(self.positions.dtype.names)
        np.savetxt(filename, array, fmt='%g', header=header)

    def load_array(self, filename, array):
        """
        load a population array from a text file.

        the array to load must be compatible with the current population
        (same number of rows, same columns).
        the first row must contain column names.
        the ordering of columns may be different.
        the returned array is ordered according to the array argument.

        @param filename: path of the source file.

        @param array: population array to load into.
            normally self.positions.

        @return array with loaded data.
            this may be the same instance as on input.

        @raise AssertionError if the shapes of the two arrays differ.
        """
        data = np.atleast_1d(np.genfromtxt(filename, names=True))
        assert data.shape == array.shape
        for name in data.dtype.names:
            array[name] = data[name]
        return array

    def save_population(self, base_filename):
        """
        save the population array to a text file.

        the file name extension is .pos.
        """
        self.save_array(base_filename + ".pos", self.positions)

    def load_population(self, base_filename):
        """
        load the population array from a previously saved text file.

        this can be used to continue an optimization job.

        the file name extension is .pos.
        the file must have the same format as produced by save_population
        and the same number of rows as the current population.
        """
        self.load_array(base_filename + ".pos", self.positions)

    def save_results(self, filename):
        """
        save the complete list of calculation results.

        identical to save_array() on self.positions.
        """
        self.save_array(filename, self.positions)
class GridSearchHandler(handlers.ModelHandler):
    """
    model handler which implements the grid search algorithm.

    the handler generates one calculation task per grid point of the
    model space and records the R-factor of each completed task in a
    GridPopulation object and in a .dat output file.
    """

    ## @var _pop (GridPopulation)
    # holds the grid population object (coordinates and results of all grid points).

    ## @var _outfile (file)
    # output file for model parameters and R factor.
    # the file is open during calculations.
    # each calculation result adds one line.

    ## @var _model_time (timedelta)
    # estimated CPU time to calculate one model.
    # this value is the maximum time measured of the completed calculations.
    # it is used to determine when the optimization should be finished so that the time limit is not exceeded.

    ## @var _timeout (bool)
    # indicates when the handler has run out of time,
    # i.e. time is up before convergence has been reached.
    # if _timeout is True, create_tasks() will not create further tasks,
    # and add_result() will signal completion when the _pending_tasks queue becomes empty.

    ## @var _invalid_limit (int)
    # maximum number of invalid calculations before the search is aborted.

    ## @var _next_model (int)
    # grid index (model number) of the next task to generate.

    def __init__(self):
        super(GridSearchHandler, self).__init__()
        self._pop = None
        self._outfile = None
        self._model_time = datetime.timedelta()
        self._timeout = False
        self._invalid_limit = 10
        self._next_model = 0

    def setup(self, project, slots):
        """
        initialize the grid population and open the output file.

        the population enumerates all grid points of the project's model space.
        a header line with the column names is written to the output file;
        add_result() appends one line per completed calculation.

        @param project: project instance.
            must provide create_model_space() and an output_file attribute.

        @param slots: number of calculation processes available through MPI.
            the invalid-calculation limit is raised to at least this number
            so that one bad round of parallel calculations does not abort the search.

        @return (int) total number of models (grid points) to be calculated.
        """
        super(GridSearchHandler, self).setup(project, slots)

        self._pop = GridPopulation()
        self._pop.setup(self._project.create_model_space())
        self._invalid_limit = max(slots, self._invalid_limit)

        self._outfile = open(self._project.output_file + ".dat", "w")
        self._outfile.write("# ")
        self._outfile.write(" ".join(self._pop.positions.dtype.names))
        self._outfile.write("\n")

        return self._pop.model_count

    def cleanup(self):
        # close the results file before the base class cleans up.
        self._outfile.close()
        super(GridSearchHandler, self).cleanup()

    def create_tasks(self, parent_task):
        """
        create the next grid-point calculation task.

        despite the plural name, this method creates at most ONE new task per call
        (the next grid point in sequence).
        the process loop calls this method every time the length of the task queue drops
        below the number of calculation processes (slots), so the grid is eventually covered.

        no task is created if the estimated time of the pending and new tasks
        exceeds the remaining time budget, or if too many calculations were invalid
        (more than _invalid_limit), or if all grid points have been dispatched.

        @param parent_task: (CalculationTask) the root task.

        @return list of generated tasks. empty list if the grid is exhausted
            or the handler timed out.
        """
        super(GridSearchHandler, self).create_tasks(parent_task)

        # this is the top-level handler, we expect just one parent: root.
        parent_id = parent_task.id
        assert parent_id == (-1, -1, -1, -1, -1)
        self._parent_tasks[parent_id] = parent_task

        # estimate whether one more model fits into the remaining wall-clock budget.
        # NOTE(review): _model_time is the worst case per model; _slots scales the
        # available time because tasks run in parallel.
        # datetime_limit and _slots are presumably provided by the base class -- confirm.
        time_pending = self._model_time * len(self._pending_tasks)
        time_avail = (self.datetime_limit - datetime.datetime.now()) * max(self._slots, 1)

        out_tasks = []
        time_pending += self._model_time
        if time_pending > time_avail:
            self._timeout = True
            logger.warning("time limit reached")

        if self._invalid_count > self._invalid_limit:
            self._timeout = True
            logger.error("number of invalid calculations (%u) exceeds limit", self._invalid_count)

        model = self._next_model
        if not self._timeout and model < self._pop.model_count:
            new_task = parent_task.copy()
            new_task.parent_id = parent_id
            pos = self._pop.positions[model]
            # convert the numpy record to a plain dict of parameter values.
            new_task.model = {k: pos[k] for k in pos.dtype.names}
            new_task.change_id(model=model)

            child_id = new_task.id
            self._pending_tasks[child_id] = new_task
            out_tasks.append(new_task)
            self._next_model += 1

        return out_tasks

    def add_result(self, task):
        """
        record the R factor of a completed calculation.

        * store the R-factor in the positions array of the population.
        * append the result to the result output file.
        * update the execution time statistics.
        * remove temporary files if requested.
        * check whether the grid search is complete.

        @param task: (CalculationTask) the completed task.
            task.rfac must be set if task.result_valid.

        @return parent task (CalculationTask) if the search is complete, @c None otherwise.
        """
        super(GridSearchHandler, self).add_result(task)

        self._complete_tasks[task.id] = task
        del self._pending_tasks[task.id]
        parent_task = self._parent_tasks[task.parent_id]

        if task.result_valid:
            assert not math.isnan(task.rfac)
            task.model['_rfac'] = task.rfac
            self._pop.add_result(task.model, task.rfac)
            if self._outfile:
                s = (str(task.model[name]) for name in self._pop.positions.dtype.names)
                self._outfile.write(" ".join(s))
                self._outfile.write("\n")
                self._outfile.flush()
            self._project.files.update_model_rfac(task.id.model, task.rfac)
            self._project.files.set_model_complete(task.id.model, True)

        # track the worst-case model time for the timeout estimate in create_tasks().
        if task.result_valid:
            if task.time > self._model_time:
                self._model_time = task.time
        else:
            self._invalid_count += 1

        # grid search complete?
        if len(self._pending_tasks) == 0:
            del self._parent_tasks[parent_task.id]
        else:
            parent_task = None

        self.cleanup_files()
        return parent_task

    def save_report(self, root_task):
        """
        generate a graphical summary of the optimization.

        @param root_task: (CalculationTask) the id.model attribute is used to register the generated files.

        @return: None
        """
        super(GridSearchHandler, self).save_report(root_task)

        files = graphics.rfactor.render_results(self._project.output_file + ".dat", self._pop.positions)
        for f in files:
            self._project.files.add_file(f, root_task.id.model, "report")