"""
|
|
@package pmsco.grid
|
|
grid search optimization handler.
|
|
|
|
the module starts multiple MSC calculations and varies parameters on a fixed coordinate grid.
|
|
|
|
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
|
|
|
|
@copyright (c) 2015 by Paul Scherrer Institut @n
|
|
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import datetime
|
|
import math
|
|
import numpy as np
|
|
import logging
|
|
|
|
from pmsco.compat import open
|
|
import pmsco.handlers as handlers
|
|
import pmsco.graphics as graphics
|
|
from pmsco.helpers import BraceMessage as BMsg
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class GridPopulation(object):
    """
    grid population.
    """

    ## @var model_start
    # (dict) initial model parameters.
    # read-only. call setup() to change this attribute.

    ## @var model_min
    # (dict) low limits of the model parameters.
    # read-only. call setup() to change this attribute.

    ## @var model_max
    # (dict) high limits of the model parameters.
    # if min == max, the parameter is kept constant.
    # read-only. call setup() to change this attribute.

    ## @var model_step
    # (dict) step size (distance between two grid points) of each model parameter.
    # read-only. call setup() to change this attribute.

    ## @var model_count
    # number of models (grid points).
    # initial value = 0.

    ## @var positions
    # (numpy.ndarray) flat list of grid coordinates and results.
    #
    # the column names include the names of the model parameters, taken from model_space.start,
    # and the special names @c '_model' and @c '_rfac'.
    # the special fields have the following meanings:
    #
    # * @c '_model': model number.
    #   the model number identifies the grid point.
    #   the field is used to associate the result of a calculation with the coordinate vector.
    #   the model handlers use it to derive their model ID.
    #
    # * @c '_rfac': calculated R-factor for this position.
    #   it is set by the add_result() method.
    #
    # @note if you read a single element, e.g. pos[0], from the array, you will get a numpy.void object.
    #   this object is a <em>view</em> of the original array item.
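    #
    # as an illustration only (not part of the class), a sketch of how a row of the
    # positions array might be read, assuming the model space defines a hypothetical
    # parameter named 'dlat':
    # @code{.py}
    # pop = GridPopulation()
    # pop.setup(model_space)
    # row = pop.positions[0]            # numpy.void view of the first grid point
    # print(row['_model'], row['dlat'], row['_rfac'])
    # best = pop.positions[np.argmin(pop.positions['_rfac'])]
    # @endcode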
    def __init__(self):
        """
        initialize the population object.

        """
        self.model_start = {}
        self.model_min = {}
        self.model_max = {}
        self.model_step = {}

        self.model_count = 0

        self.positions = None

        self.search_keys = []
        self.fixed_keys = []

    @staticmethod
    def get_model_dtype(model_params):
        """
        get numpy array data type for model parameters and grid control variables.

        @param model_params: dictionary of model parameters or list of parameter names.

        @return: dtype for use with numpy array constructors.
            this is a sorted list of (name, type) tuples.
        """
        dt = []
        for key in model_params:
            dt.append((key, 'f4'))
        dt.append(('_model', 'i4'))
        dt.append(('_rfac', 'f4'))
        dt.sort(key=lambda t: t[0].lower())
        return dt
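
    # for illustration, a sketch of what get_model_dtype() returns; the parameter
    # names 'dlat' and 'phi' are hypothetical examples, not part of the module:
    # @code{.py}
    # GridPopulation.get_model_dtype({'dlat': 0.5, 'phi': 0.0})
    # # -> [('_model', 'i4'), ('_rfac', 'f4'), ('dlat', 'f4'), ('phi', 'f4')]
    # @endcode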

    def setup(self, model_space):
        """
        set up the population and result arrays.

        @param model_space: (pmsco.project.ModelSpace)
            definition of initial and limiting model parameters
            expected by the cluster and parameters functions.
            the attributes have the following meanings:
            @arg start: values of the fixed parameters.
            @arg min: minimum values allowed.
            @arg max: maximum values allowed.
                if abs(max - min) < step/2, the parameter is kept constant.
            @arg step: step size (distance between two grid points).
                if step <= 0, the parameter is kept constant.

        """
        self.model_start = model_space.start
        self.model_min = model_space.min
        self.model_max = model_space.max
        self.model_step = model_space.step

        self.model_count = 1
        self.search_keys = []
        self.fixed_keys = []
        scales = []
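
        # the number of grid points per parameter follows from min, max and step.
        # worked example (hypothetical values): min = -0.2, max = 0.2, step = 0.1
        # gives n = round(0.4 / 0.1) + 1 = 5 points at -0.2, -0.1, 0.0, 0.1, 0.2.
        # parameters with step <= 0 or fewer than two points are kept fixed at their start value.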
        for p in model_space.step.keys():
            if model_space.step[p] > 0:
                n = int(np.round((model_space.max[p] - model_space.min[p]) / model_space.step[p]) + 1)
            else:
                n = 1
            if n > 1:
                self.search_keys.append(p)
                scales.append(np.linspace(model_space.min[p], model_space.max[p], n))
            else:
                self.fixed_keys.append(p)

        # scales is a list of 1D arrays that hold the coordinates of the individual dimensions,
        # positions_nd is a list of N-dimensional arrays that hold the coordinates on the full grid,
        # positions_flat is a list of 1D arrays that hold the coordinates in flat sequence.
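        # illustration (hypothetical two-parameter grid): scales = [array([1., 2.]), array([5., 6., 7.])]
        # yields positions_flat = [array([1., 1., 1., 2., 2., 2.]), array([5., 6., 7., 5., 6., 7.])],
        # i.e. every combination of the two parameter values appears exactly once.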
        if len(scales) > 1:
            positions_nd = np.meshgrid(*scales, indexing='ij')
            positions_flat = [arr.flatten() for arr in positions_nd]
        else:
            positions_flat = scales
        self.model_count = positions_flat[0].shape[0]

        # shuffle the calculation order so that we may see the more interesting parts earlier
        shuffle_index = np.arange(self.model_count)
        np.random.shuffle(shuffle_index)
        positions_reordered = [pos[shuffle_index] for pos in positions_flat]

        dt = self.get_model_dtype(self.model_min)

        # positions
        self.positions = np.zeros(self.model_count, dtype=dt)

        for idx, key in enumerate(self.search_keys):
            self.positions[key] = positions_reordered[idx]
        for idx, key in enumerate(self.fixed_keys):
            self.positions[key] = self.model_start[key]

        self.positions['_model'] = np.arange(self.model_count)
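        # note: 2.1 presumably serves as a "not yet calculated" marker, assuming the
        # R-factors produced by the calculations lie between 0 and 2.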
        self.positions['_rfac'] = 2.1

    def add_result(self, particle, rfac):
        """
        add a calculation result to the results array.

        @param particle: dictionary of model parameters and control values.
            the keys must correspond to the columns of the positions array,
            i.e. the names of the model parameters plus the '_rfac' and '_model' fields.

        @param rfac: calculated R-factor.
            the R-factor is written to the '_rfac' field.

        @return None
        """
        model = particle['_model']
        self.positions['_rfac'][model] = rfac

    def save_array(self, filename, array):
        """
        save a population array to a text file.

        @param filename: path and name of the destination file.

        @param array: population array to save.
            in the grid search this is the positions array (self.positions).
        """
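        # the file is a whitespace-separated table with one header line (prefixed by numpy
        # with '# ') and one row per grid point, e.g. (with a hypothetical parameter 'dlat'):
        #
        # # _model _rfac dlat
        # 0 2.1 -0.2
        # 1 2.1 0.1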
        header = " ".join(self.positions.dtype.names)
        np.savetxt(filename, array, fmt='%g', header=header)

    def load_array(self, filename, array):
        """
        load a population array from a text file.

        the array to load must be compatible with the current population
        (same number of rows, same columns).
        the first row must contain the column names.
        the ordering of columns may be different.
        the returned array is ordered according to the array argument.

        @param filename: path and name of the file to load.

        @param array: population array to load the data into.
            in the grid search this is the positions array (self.positions).

        @return array with loaded data.
            this may be the same instance as on input.

        @raise AssertionError if the shape of the loaded data differs from the given array.
        """
        data = np.atleast_1d(np.genfromtxt(filename, names=True))
        assert data.shape == array.shape
        for name in data.dtype.names:
            array[name] = data[name]
        return array

    def save_population(self, base_filename):
        """
        save the population array to a text file.

        the file name extension .pos is appended to the base file name.
        """
        self.save_array(base_filename + ".pos", self.positions)

    def load_population(self, base_filename):
        """
        load the population array from a previously saved text file.

        this can be used to continue an optimization job.

        the file name extension .pos is appended to the base file name.
        the file must have the same format as produced by save_population
        and the same number of rows as the current population.
        """
        self.load_array(base_filename + ".pos", self.positions)

    def save_results(self, filename):
        """
        save the complete list of calculation results.
        """
        self.save_array(filename, self.positions)


class GridSearchHandler(handlers.ModelHandler):
    """
    model handler which implements the grid search algorithm.
    """

    ## @var _pop (GridPopulation)
    # holds the population object.

    ## @var _outfile (file)
    # output file for model parameters and R-factors.
    # the file is open during calculations.
    # each calculation result adds one line.

    ## @var _model_time (timedelta)
    # estimated CPU time to calculate one model.
    # this value is the maximum time measured of the completed calculations.
    # it is used to determine when the optimization should be finished so that the time limit is not exceeded.

    ## @var _timeout (bool)
    # indicates that the handler has run out of time,
    # i.e. time is up before all grid points have been calculated.
    # if _timeout is True, create_tasks() will not create further tasks,
    # and add_result() will signal completion when the _pending_tasks queue becomes empty.

    def __init__(self):
        super(GridSearchHandler, self).__init__()
        self._pop = None
        self._outfile = None
        self._model_time = datetime.timedelta()
        self._timeout = False
        self._invalid_limit = 10
        self._next_model = 0

    def setup(self, project, slots):
        """
        initialize the grid population and open an output file.

        @param project: the project object that provides the model space and output file name.

        @param slots: number of calculation processes available through MPI.
            in the grid search the number of models is determined by the model space alone;
            the number of slots is only used to scale the limit of tolerated invalid calculations.

        @return (int) number of models (grid points) to be calculated.
        """
        super(GridSearchHandler, self).setup(project, slots)

        self._pop = GridPopulation()
        self._pop.setup(self._project.create_model_space())
        self._invalid_limit = max(slots, self._invalid_limit)

        self._outfile = open(self._project.output_file + ".dat", "w")
        self._outfile.write("# ")
        self._outfile.write(" ".join(self._pop.positions.dtype.names))
        self._outfile.write("\n")

        return self._pop.model_count

    def cleanup(self):
        self._outfile.close()
        super(GridSearchHandler, self).cleanup()

    def create_tasks(self, parent_task):
        """
        create a calculation task for the next grid point.

        each call creates at most one new task, as long as grid points remain and
        neither the time limit nor the limit on invalid calculations has been reached.

        the process loop calls this method every time the length of the task queue drops
        below the number of calculation processes (slots).

        @return list of generated tasks. empty list if all grid points have been calculated.
        """

        super(GridSearchHandler, self).create_tasks(parent_task)

        # this is the top-level handler, we expect just one parent: root.
        parent_id = parent_task.id
        assert parent_id == (-1, -1, -1, -1, -1)
        self._parent_tasks[parent_id] = parent_task
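
        # a rough time-budget check (illustrative numbers): if the slowest completed model took
        # 10 minutes, 4 tasks are pending and 3 slots remain available for the last 60 minutes,
        # the projected load of (4 + 1) * 10 = 50 minutes still fits within 60 * 3 = 180 minutes,
        # so another task may be created; otherwise the handler declares a timeout.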
        time_pending = self._model_time * len(self._pending_tasks)
        time_avail = (self.datetime_limit - datetime.datetime.now()) * max(self._slots, 1)

        out_tasks = []
        time_pending += self._model_time
        if time_pending > time_avail:
            self._timeout = True
            logger.warning("time limit reached")

        if self._invalid_count > self._invalid_limit:
            self._timeout = True
            logger.error("number of invalid calculations (%u) exceeds limit", self._invalid_count)

        model = self._next_model
        if not self._timeout and model < self._pop.model_count:
            new_task = parent_task.copy()
            new_task.parent_id = parent_id
            pos = self._pop.positions[model]
            new_task.model = {k: pos[k] for k in pos.dtype.names}
            new_task.change_id(model=model)

            child_id = new_task.id
            self._pending_tasks[child_id] = new_task
            out_tasks.append(new_task)
            self._next_model += 1

        return out_tasks

    def add_result(self, task):
        """
        calculate the R-factor of the result and store it in the positions array.

        * append the result to the result output file.
        * update the execution time statistics.
        * remove temporary files if requested.
        * check whether the grid search is complete.

        @return parent task (CalculationTask) if the search is complete, @c None otherwise.
        """
        super(GridSearchHandler, self).add_result(task)

        self._complete_tasks[task.id] = task
        del self._pending_tasks[task.id]
        parent_task = self._parent_tasks[task.parent_id]

        if task.result_valid:
            assert not math.isnan(task.rfac)
            task.model['_rfac'] = task.rfac
            self._pop.add_result(task.model, task.rfac)

        if self._outfile:
            s = (str(task.model[name]) for name in self._pop.positions.dtype.names)
            self._outfile.write(" ".join(s))
            self._outfile.write("\n")
            self._outfile.flush()

        self._project.files.update_model_rfac(task.id.model, task.rfac)
        self._project.files.set_model_complete(task.id.model, True)

        if task.result_valid:
            if task.time > self._model_time:
                self._model_time = task.time
        else:
            self._invalid_count += 1

        # grid search complete?
        if len(self._pending_tasks) == 0:
            del self._parent_tasks[parent_task.id]
        else:
            parent_task = None

        self.cleanup_files()
        return parent_task

    def save_report(self, root_task):
        """
        generate a graphical summary of the optimization.

        @param root_task: (CalculationTask) the id.model attribute is used to register the generated files.

        @return: None
        """
        super(GridSearchHandler, self).save_report(root_task)

        files = graphics.rfactor.render_results(self._project.output_file + ".dat", self._pop.positions)
        for f in files:
            self._project.files.add_file(f, root_task.id.model, "report")