""" @package pmsco.grid grid search optimization handler. the module starts multiple MSC calculations and varies parameters on a fixed coordinate grid. @author Matthias Muntwiler, matthias.muntwiler@psi.ch @copyright (c) 2015 by Paul Scherrer Institut @n Licensed under the Apache License, Version 2.0 (the "License"); @n you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import datetime import math import numpy as np import logging from pmsco.compat import open import pmsco.handlers as handlers import pmsco.graphics as graphics from pmsco.helpers import BraceMessage as BMsg logger = logging.getLogger(__name__) class GridPopulation(object): """ grid population. """ ## @var model_start # (dict) initial model parameters. # read-only. call setup() to change this attribute. ## @var model_min # (dict) low limits of the model parameters. # read-only. call setup() to change this attribute. ## @var model_max # (dict) high limits of the model parameters. # if min == max, the parameter is kept constant. # read-only. call setup() to change this attribute. ## @var model_max # (dict) high limits of the model parameters. # read-only. call setup() to change this attribute. ## @var model_step # (dict) initial velocity (difference between two steps) of the particle. # read-only. call setup() to change this attribute. ## @var model_count # number of models (grid points). # initial value = 0. ## @var positions # (numpy.ndarray) flat list of grid coordinates and results. # # the column names include the names of the model parameters, taken from model_space.start, # and the special names @c '_model', @c '_rfac'. # the special fields have the following meanings: # # * @c '_model': model number. # the model number counts identifies the grid point. # the field is used to associate the result of a calculation with the coordinate vector. # the model handlers use it to derive their model ID. # # * @c '_rfac': calculated R-factor for this position. # it is set by the add_result() method. # # @note if your read a single element, e.g. pos[0], from the array, you will get a numpy.void object. # this object is a view of the original array item def __init__(self): """ initialize the population object. """ self.model_start = {} self.model_min = {} self.model_max = {} self.model_step = {} self.model_count = 0 self.positions = None self.search_keys = [] self.fixed_keys = [] @staticmethod def get_model_dtype(model_params): """ get numpy array data type for model parameters and grid control variables. @param model_params: dictionary of model parameters or list of parameter names. @return: dtype for use with numpy array constructors. this is a sorted list of (name, type) tuples. """ dt = [] for key in model_params: dt.append((key, 'f4')) dt.append(('_model', 'i4')) dt.append(('_rfac', 'f4')) dt.sort(key=lambda t: t[0].lower()) return dt def setup(self, model_space): """ set up the population and result arrays. @param model_space: (pmsco.project.ModelSpace) definition of initial and limiting model parameters expected by the cluster and parameters functions. the attributes have the following meanings: @arg start: values of the fixed parameters. @arg min: minimum values allowed. @arg max: maximum values allowed. if abs(max - min) < step/2 , the parameter is kept constant. @arg step: step size (distance between two grid points). if step <= 0, the parameter is kept constant. """ self.model_start = model_space.start self.model_min = model_space.min self.model_max = model_space.max self.model_step = model_space.step self.model_count = 1 self.search_keys = [] self.fixed_keys = [] scales = [] for p in model_space.step.keys(): if model_space.step[p] > 0: n = int(np.round((model_space.max[p] - model_space.min[p]) / model_space.step[p]) + 1) else: n = 1 if n > 1: self.search_keys.append(p) scales.append(np.linspace(model_space.min[p], model_space.max[p], n)) else: self.fixed_keys.append(p) # scales is a list of 1D arrays that hold the coordinates of the individual dimensions # nd_positions is a list of N-D arrays that hold the coordinates in all multiple dimensions # flat_positions is a list of 1D arrays that hold the coordinates in flat sequence if len(scales) > 1: positions_nd = np.meshgrid(*scales, indexing='ij') positions_flat = [arr.flatten() for arr in positions_nd] else: positions_flat = scales self.model_count = positions_flat[0].shape[0] # shuffle the calculation order so that we may see the more interesting parts earlier shuffle_index = np.arange(self.model_count) np.random.shuffle(shuffle_index) positions_reordered = [pos[shuffle_index] for pos in positions_flat] dt = self.get_model_dtype(self.model_min) # positions self.positions = np.zeros(self.model_count, dtype=dt) for idx, key in enumerate(self.search_keys): self.positions[key] = positions_reordered[idx] for idx, key in enumerate(self.fixed_keys): self.positions[key] = self.model_start[key] self.positions['_model'] = np.arange(self.model_count) self.positions['_rfac'] = 2.1 def add_result(self, particle, rfac): """ add a calculation particle to the results array. @param particle: dictionary of model parameters and particle values. the keys must correspond to the columns of the pos array, i.e. the names of the model parameters plus the _rfac, and _model fields. @param rfac: calculated R-factor. the R-factor is written to the '_rfac' field. @return None """ model = particle['_model'] self.positions['_rfac'][model] = rfac def save_array(self, filename, array): """ saves a population array to a text file. @param array: population array to save. must be one of self.pos, self.vel, self.best, self.results """ header = " ".join(self.positions.dtype.names) np.savetxt(filename, array, fmt='%g', header=header) def load_array(self, filename, array): """ load a population array from a text file. the array to load must be compatible with the current population (same number of rows, same columns). the first row must contain column names. the ordering of columns may be different. the returned array is ordered according to the array argument. @param array: population array to load. must be one of self.pos, self.vel, self.results. @return array with loaded data. this may be the same instance as on input. @raise AssertionError if the number of rows of the two files differ. """ data = np.atleast_1d(np.genfromtxt(filename, names=True)) assert data.shape == array.shape for name in data.dtype.names: array[name] = data[name] return array def save_population(self, base_filename): """ saves the population array to a set of text files. the file name extensions are .pos, .vel, and .best """ self.save_array(base_filename + ".pos", self.positions) def load_population(self, base_filename): """ loads the population array from a set of previously saved text files. this can be used to continue an optimization job. the file name extensions are .pos, .vel, and .best. the files must have the same format as produced by save_population. the files must have the same number of rows. """ self.load_array(base_filename + ".pos", self.positions) def save_results(self, filename): """ saves the complete list of calculations results. """ self.save_array(filename, self.positions) class GridSearchHandler(handlers.ModelHandler): """ model handler which implements the grid search algorithm. """ ## @var _pop (Population) # holds the population object. ## @var _outfile (file) # output file for model parametes and R factor. # the file is open during calculations. # each calculation result adds one line. ## @var _model_time (timedelta) # estimated CPU time to calculate one model. # this value is the maximum time measured of the completed calculations. # it is used to determine when the optimization should be finished so that the time limit is not exceeded. ## @var _timeout (bool) # indicates when the handler has run out of time, # i.e. time is up before convergence has been reached. # if _timeout is True, create_tasks() will not create further tasks, # and add_result() will signal completion when the _pending_tasks queue becomes empty. def __init__(self): super(GridSearchHandler, self).__init__() self._pop = None self._outfile = None self._model_time = datetime.timedelta() self._timeout = False self._invalid_limit = 10 self._next_model = 0 def setup(self, project, slots): """ initialize the particle swarm and open an output file. @param project: @param slots: number of calculation processes available through MPI. for efficiency reasons we set the population size twice the number of available slots. the minimum number of slots is 1, the recommended value is 10 or greater. the population size is set to at least 4. @return (int) number of models to be calculated. """ super(GridSearchHandler, self).setup(project, slots) self._pop = GridPopulation() self._pop.setup(self._project.create_model_space()) self._invalid_limit = max(slots, self._invalid_limit) self._outfile = open(self._project.output_file + ".dat", "w") self._outfile.write("# ") self._outfile.write(" ".join(self._pop.positions.dtype.names)) self._outfile.write("\n") return self._pop.model_count def cleanup(self): self._outfile.close() super(GridSearchHandler, self).cleanup() def create_tasks(self, parent_task): """ develop the particle population and create a calculation task per particle. this method advances the population by one step, and generates one task per particle. during the first call, the method first sets up a new population. the process loop calls this method every time the length of the task queue drops below the number of calculation processes (slots). @return list of generated tasks. empty list if all grid points have been calculated. """ super(GridSearchHandler, self).create_tasks(parent_task) # this is the top-level handler, we expect just one parent: root. parent_id = parent_task.id assert parent_id == (-1, -1, -1, -1, -1) self._parent_tasks[parent_id] = parent_task time_pending = self._model_time * len(self._pending_tasks) time_avail = (self.datetime_limit - datetime.datetime.now()) * max(self._slots, 1) out_tasks = [] time_pending += self._model_time if time_pending > time_avail: self._timeout = True logger.warning("time limit reached") if self._invalid_count > self._invalid_limit: self._timeout = True logger.error("number of invalid calculations (%u) exceeds limit", self._invalid_count) model = self._next_model if not self._timeout and model < self._pop.model_count: new_task = parent_task.copy() new_task.parent_id = parent_id pos = self._pop.positions[model] new_task.model = {k: pos[k] for k in pos.dtype.names} new_task.change_id(model=model) child_id = new_task.id self._pending_tasks[child_id] = new_task out_tasks.append(new_task) self._next_model += 1 return out_tasks def add_result(self, task): """ calculate the R factor of the result and store it in the positions array. * append the result to the result output file. * update the execution time statistics. * remove temporary files if requested. * check whether the grid search is complete. @return parent task (CalculationTask) if the search is complete, @c None otherwise. """ super(GridSearchHandler, self).add_result(task) self._complete_tasks[task.id] = task del self._pending_tasks[task.id] parent_task = self._parent_tasks[task.parent_id] if task.result_valid: assert not math.isnan(task.rfac) task.model['_rfac'] = task.rfac self._pop.add_result(task.model, task.rfac) if self._outfile: s = (str(task.model[name]) for name in self._pop.positions.dtype.names) self._outfile.write(" ".join(s)) self._outfile.write("\n") self._outfile.flush() self._project.files.update_model_rfac(task.id.model, task.rfac) self._project.files.set_model_complete(task.id.model, True) if task.result_valid: if task.time > self._model_time: self._model_time = task.time else: self._invalid_count += 1 # grid search complete? if len(self._pending_tasks) == 0: del self._parent_tasks[parent_task.id] else: parent_task = None self.cleanup_files() return parent_task def save_report(self, root_task): """ generate a graphical summary of the optimization. @param root_task: (CalculationTask) the id.model attribute is used to register the generated files. @return: None """ super(GridSearchHandler, self).save_report(root_task) files = graphics.rfactor.render_results(self._project.output_file + ".dat", self._pop.positions) for f in files: self._project.files.add_file(f, root_task.id.model, "report")