update public distribution

based on internal repository c9a2ac8 2019-01-03 16:04:57 +0100
tagged rev-master-2.0.0
2019-01-31 15:45:02 +01:00
parent bbd16d0f94
commit acea809e4e
92 changed files with 165828 additions and 143181 deletions


@ -26,16 +26,27 @@ Licensed under the Apache License, Version 2.0 (the "License"); @n
http://www.apache.org/licenses/LICENSE-2.0
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import datetime
import logging
import numpy as np
import collections
import data as md
import cluster as mc
import files
import handlers
import os.path
import socket
import sys
import pmsco.cluster as mc
from pmsco.compat import open
import pmsco.data as md
import pmsco.database as database
import pmsco.dispatch as dispatch
import pmsco.files as files
import pmsco.handlers as handlers
from pmsco.helpers import BraceMessage as BMsg
logger = logging.getLogger(__name__)
@ -95,7 +106,7 @@ class Domain(object):
self.max = {}
self.step = {}
def add_param(self, name, start, min, max, step):
def add_param(self, name, start, min=None, max=None, step=None, width=None):
"""
set the domain of one parameter with all necessary values at once.
@ -107,15 +118,29 @@ class Domain(object):
@param start (float) start value.
@param min (float) lower bound of the parameter interval.
must be less than or equal to start.
if None, the field is set to start.
@param max (float) upper bound of the parameter interval.
must be greater than or equal to start.
if None, the field is set to start.
@param width (float) width of the parameter interval.
instead of min and max, the interval can be set centered around the start value.
this is equivalent to min = start - width/2, max = start + width/2.
this argument overrides min and max. do not specify both width and min/max.
@param step (float) step size.
must be greater than or equal to zero.
if None, the field is set to zero.
"""
self.start[name] = start
self.min[name] = min
self.max[name] = max
self.step[name] = step
self.min[name] = min if min is not None else start
self.max[name] = max if max is not None else start
if width is not None:
self.min[name] = start - width / 2.
self.max[name] = start + width / 2.
self.step[name] = step if step is not None else 0.0
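for illustration, the two ways of defining the interval are equivalent (a minimal sketch; the parameter name and values are arbitrary):
@code{.py}
dom = Domain()
# centered interval: min = 2.0 - 0.5, max = 2.0 + 0.5
dom.add_param('dAB', start=2.0, width=1.0, step=0.05)
# same result with explicit bounds
dom.add_param('dAB', start=2.0, min=1.5, max=2.5, step=0.05)
@endcode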
def get_param(self, name):
"""
@ -144,9 +169,27 @@ class Params(object):
objects of this class are created by the implementation of the create_params() method
of the actual project class.
"""
## @var angular_resolution (float)
# FWHM angular resolution of the detector.
#
# maps to:
# @arg emission angle window (EDAC)
# @arg angular_broadening (MSC)
## @var phase_files (dict)
# dictionary of phase files.
#
# the keys are atomic numbers, the values file names.
# if the dictionary is empty or the files don't exist, the phases are computed internally (EDAC only).
#
# maps to:
# @arg scatterer (EDAC)
# @arg atomic_number, phase_file (MSC)
def __init__(self):
self.title = "MSC default parameters"
self.comment = "from msc_project.Params()"
self.title = "default parameters"
self.comment = "set by project.Params()"
self.cluster_file = ""
self.output_file = ""
self.scan_file = ""
@ -154,7 +197,7 @@ class Params(object):
self.initial_state = "1s"
# MSC convention: H, V, L, R, U
self.polarization = "H"
self.angular_broadening = 0.0
self.angular_resolution = 1.0
self.z_surface = 0.0
self.inner_potential = 10.0
# the energy scale of EDAC is referenced to the vacuum level
@ -167,16 +210,14 @@ class Params(object):
self.experiment_temperature = 300.0
self.debye_temperature = 400.0
self.debye_wavevector = 1.0
self.phase_files = {}
# used by MSC only
self.spherical_order = 2
self.scattering_level = 5
self.fcut = 15.0
self.cut = 15.0
self.lattice_constant = 1.0
self.atom_types = 0
self.atomic_number = [1, 2, 3, 4]
self.phase_file = ["1.pha", "2.pha", "3.pha", "4.pha"]
self.msq_displacement = [0.1, 0.1, 0.1, 0.1]
self.msq_displacement = {}
self.planewave_attenuation = 1.0
self.vibration_model = "N"
self.substrate_atomic_mass = 1.0
@ -216,9 +257,12 @@ class Scan(object):
# example: ['t','p']
## @var emitter (string)
# chemical symbol of emitter atom
# chemical symbol, optionally followed by a further specification (chemical state, environment, ...),
# of photo-emitting atoms.
# the interpretation of this string is up to the project and its cluster generator.
# it should, however, always start with a chemical element symbol.
#
# example: 'Cu'
# examples: 'Ca' (calcium), 'CA' (carbon A), 'C a' (carbon a), 'C 1' (carbon one), 'N=O', 'FeIII'.
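# since the element symbol must come first, a project can split the symbol off the
# specification with a short regular expression. a hedged sketch (the helper is
# illustrative, not part of this module):
@code{.py}
import re

def split_emitter(emitter):
    # one capital letter plus any lowercase letters is the element symbol;
    # the remainder ('A', '1', '=O', 'III', ...) is the project-specific part.
    m = re.match(r'([A-Z][a-z]*)\s*(.*)', emitter)
    return m.group(1), m.group(2)

split_emitter('Ca')   # -> ('Ca', '')  calcium
split_emitter('CA')   # -> ('C', 'A')  carbon A
split_emitter('C 1')  # -> ('C', '1')  carbon one
@endcode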
## @var initial_state (string)
# nl term of initial state
@ -342,142 +386,6 @@ class Scan(object):
self.alphas = np.zeros((1))
class ClusterGenerator(object):
"""
cluster generator class.
this class bundles the cluster methods in one place
so that it's easier to exchange them for different kinds of clusters.
the project must override at least the create_cluster method.
if emitters should be run in parallel tasks, the count_emitters method must be implemented as well.
"""
def __init__(self, project):
"""
initialize the cluster generator.
@param project: reference to the project object.
cluster generators may need to look up project parameters.
"""
self.project = project
def count_emitters(self, model, index):
"""
return the number of emitter configurations for a particular model.
the number of emitter configurations may depend on the model parameters, scan index and symmetry index.
by default, the method returns 1, which means that there is only one emitter configuration.
emitter configurations are mainly a way to distribute the calculations to multiple processes
based on emitters since the resulting diffraction patterns add up incoherently.
for this to work, the create_cluster() method must pay attention to the emitter index
and generate either a full cluster with all emitters (single process)
or a cluster with only a subset of the emitters according to the emitter index (multiple processes).
whether all emitters are calculated in one or multiple processes is decided at run-time
based on the available resources.
note that this function returns the number of _configurations_, not _atoms_.
an emitter configuration (declared in a Cluster) may include more than one atom.
it is up to the project, what is included in a particular configuration.
to enable multiple emitter configurations, the derived project class must override this method
and return a number greater than 1.
@note in some cases it may be most efficient to call create_cluster and
return Cluster.get_emitter_count() of the generated cluster.
this is possible because the method is called with emitter index -1.
model and index can be passed unchanged to create_cluster.
@param model (dictionary) model parameters to be used in the calculation.
@param index (named tuple CalcID) calculation index.
the method should consider only the following attributes:
@arg @c scan scan index (index into Project.scans)
@arg @c sym symmetry index (index into Project.symmetries)
@arg @c emit emitter index is -1 if called by the emitter handler.
@return number of emitter configurations.
this implementation returns the default value of 1.
"""
return 1
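following the note above, a generator whose create_cluster marks all emitters can delegate the count to the cluster object (a sketch; assumes create_cluster treats emitter index -1 like the full-cluster case):
@code{.py}
class DelegatingGenerator(ClusterGenerator):
    def count_emitters(self, model, index):
        # index.emit is -1 here, so create_cluster returns the full cluster
        # with all inequivalent emitters marked.
        cluster = self.create_cluster(model, index)
        return cluster.get_emitter_count()
@endcode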
def create_cluster(self, model, index):
"""
create a Cluster object given the model parameters and calculation index.
the generated cluster will typically depend on the model parameters.
depending on the project, it may also depend on the scan index, symmetry index and emitter index.
the scan index can be used to generate a different cluster for different scan geometry,
e.g., if some atoms can be excluded due to a longer mean free path.
if this is not the case for the specific project, the scan index can be ignored.
the symmetry index may select a particular domain that has a different atomic arrangement.
in this case, depending on the value of index.sym, the function must generate a cluster corresponding
to the particular domain/symmetry.
the method can ignore the symmetry index if the project defines only one symmetry,
or if the symmetry does not correspond to a different atomic structure.
the emitter index selects a particular emitter configuration.
depending on the value of the emitter index, the method must react differently:
1. if the value is less than or equal to zero, return the full cluster and mark all inequivalent emitter atoms.
emitters which are reproduced by a symmetry expansion in combine_emitters() should not be marked.
the full diffraction scan will be calculated in one calculation.
2. if the value is greater than zero, generate the cluster with the emitter configuration
selected by the emitter index.
the index ranges from 1 to the return value of count_emitters().
the results of the individual emitter calculations are summed up in combine_emitters().
the code should ideally be written such that either case yields the same diffraction result.
if count_emitters() always returns 1 (default), the second case does not have to be implemented,
and the method can ignore the emitter index.
the method must ignore the model and energy index.
@param model (dictionary) model parameters to be used in the calculation.
@param index (named tuple CalcID) calculation index.
the method should consider only the following attributes:
@arg @c scan scan index (index into Project.scans)
@arg @c sym symmetry index (index into Project.symmetries)
@arg @c emit emitter index.
if less than or equal to zero, generate the full cluster and mark all emitters.
if greater than zero, the value is a 1-based index of the emitter configuration.
"""
return None
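a schematic override covering both emitter-index cases (a hedged sketch: the two-atom geometry, the 'd01' model parameter and the add_atom signature are illustrative assumptions, not the documented pmsco.cluster API):
@code{.py}
import numpy as np

class TwoEmitterGenerator(ClusterGenerator):
    def count_emitters(self, model, index):
        # one emitter configuration per atom of this toy cluster
        return 2

    def create_cluster(self, model, index):
        clu = mc.Cluster()
        positions = [np.array((0., 0., 0.)), np.array((0., 0., -model['d01']))]
        for iatom, pos in enumerate(positions):
            # case 1 (index.emit <= 0): mark all inequivalent emitters.
            # case 2 (index.emit >= 1): mark only the selected configuration.
            is_emitter = index.emit <= 0 or index.emit == iatom + 1
            clu.add_atom(29, pos, int(is_emitter))  # assumed signature: (atomic number, position, emitter flag)
        return clu
@endcode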
class LegacyClusterGenerator(ClusterGenerator):
"""
cluster generator class for projects that don't declare a generator.
in previous versions, the create_cluster and count_emitters methods were implemented by the project class.
this class redirects generator calls to the project methods
providing compatibility to older project code.
"""
def __init__(self, project):
super(LegacyClusterGenerator, self).__init__(project)
def count_emitters(self, model, index):
"""
redirect the call to the corresponding project method if implemented.
"""
try:
return self.project.count_emitters(model, index)
except AttributeError:
return 1
def create_cluster(self, model, index):
"""
redirect the call to the corresponding project method.
"""
return self.project.create_cluster(model, index)
# noinspection PyMethodMayBeStatic
class Project(object):
"""
@ -549,39 +457,27 @@ class Project(object):
# the initial value is a LegacyClusterGenerator object
# which routes cluster calls back to the project for compatibility with older project code.
## @var pop_size (int)
# population size (number of particles) in the particle swarm optimization.
## @var optimizer_params (dict)
# optional parameters of the model optimizer.
#
# by default, the ParticleSwarmHandler chooses the population size depending on the number of parallel processes.
# you may want to override the default value in cases where the automatic choice is not appropriate, e.g.:
# - the calculation of a model takes a long time compared to the available computing time.
# - the calculation of a model spawns many sub-tasks due to complex symmetry.
# - you want to increase the number of generations compared to the number of particles.
# this is a dictionary that can contain (among others) the following entries.
# for a detailed list, see the documentation of the respective model handler.
#
# the default value is 0.
#
# the value can be set by the command line.
## @var history_file (string)
# name of a file containing the results from previous optimization runs.
# this can be used to resume a swarm optimization where it was interrupted before.
#
# the history file is a space-delimited, multi-column, text file.
# output files of a previous optimization run can be used as is.
# there must be one column for each model parameter, and one column of R factors.
# the first row must contain the names of the model parameters.
# the name of the R-factor column must be '_rfac'.
# additional columns may be included and are ignored.
#
# by default, no history is loaded.
## @var recalc_history (bool)
# select whether the R-factors of the historic models are calculated again.
#
# this is useful if the historic data was calculated for a different cluster, different set of parameters,
# or different experimental data, and if the R-factors of the new optimization may be systematically greater.
# set this argument to False only if the calculation is a continuation of a previous one
# without any changes to the code.
# @arg @c 'pop_size' (int)
# population size (number of particles) in the swarm or genetic optimization mode.
# by default, the ParticleSwarmHandler chooses the population size depending on the number of parallel processes.
# you may want to override the default value in cases where the automatic choice is not appropriate.
# the value can be set by the command line.
# @arg @c 'seed_file' (string)
# name of a file containing the results from previous optimization runs.
# this can be used to resume a swarm or genetic optimization where it was interrupted before.
# the seed file is a space-delimited, multi-column, text file,
# e.g., the output file of a previous optimization.
# by default, no seed is loaded.
# @arg @c 'recalc_seed' (bool)
# select whether the R-factors of the seed models are calculated again.
# set this argument to False only if the calculation is a continuation of a previous one
# without any changes to the code.
## @var data_dir
# directory path to experimental data.
@ -594,9 +490,17 @@ class Project(object):
# if the location of the files may depend on the machine or user account,
# the user may want to specify the data path on the command line.
## @var output_dir (string)
# directory path for data files produced during the calculation, including intermediate files.
#
# output_dir and output_file are set at once by @ref set_output.
## @var output_file (string)
# file name root for data files produced during the calculation, including intermediate files.
#
# the file name should include the path. the path must also be set in @ref output_dir.
#
# output_dir and output_file are set at once by @ref set_output.
## @var timedelta_limit (datetime.timedelta)
# wall time after which no new calculations should be started.
@ -604,11 +508,11 @@ class Project(object):
# the actual wall time may be longer by the remaining time of running calculations.
# running calculations will not be aborted.
## @var _combined_scan
## @var combined_scan
# combined raw data from scans.
# updated by add_scan().
## @var _combined_modf
## @var combined_modf
# combined modulation function from scans.
# updated by add_scan().
@ -618,30 +522,55 @@ class Project(object):
#
# files.categories_to_delete determines which files can be deleted.
## @var keep_best
# number of best models for which result files should be kept.
#
# this attribute determines how many models are kept based on R-factor ranking at each node of the task tree
# (up to keep_levels).
## @var keep_levels
# numeric task level down to which R-factors are considered when model files are cleaned up.
#
# @arg 0 = model level: combined results only.
# @arg 1 = scan level: scan nodes in addition to combined results (level 0).
# @arg 2 = symmetry level: symmetry nodes in addition to level 1.
# @arg 3 = emitter level: emitter nodes in addition to level 2.
# @arg 4 = region level: region nodes in addition to level 3.
def __init__(self):
self.mode = "single"
self.code = "edac"
self.features = {}
self.cluster_format = mc.FMT_EDAC
self.cluster_generator = LegacyClusterGenerator(self)
self.cluster_generator = mc.LegacyClusterGenerator(self)
self.scans = []
self.symmetries = []
self.pop_size = 0
self.history_file = ""
self.recalc_history = True
self.optimizer_params = {
'pop_size': 0,
'seed_file': "",
'seed_limit': 0,
'recalc_seed': True,
'table_file': ""
}
self.data_dir = ""
self.output_dir = ""
self.output_file = "pmsco_data"
self.timedelta_limit = datetime.timedelta(days=1)
self._combined_scan = None
self._combined_modf = None
self.combined_scan = None
self.combined_modf = None
self.files = files.FileTracker()
self.handler_classes = {}
self.handler_classes['model'] = handlers.SingleModelHandler
self.handler_classes['scan'] = handlers.ScanHandler
self.handler_classes['symmetry'] = handlers.SymmetryHandler
self.handler_classes['emitter'] = handlers.EmitterHandler
self.handler_classes['region'] = handlers.SingleRegionHandler
self.keep_levels = 1
self.keep_best = 10
self.handler_classes = {
'model': handlers.SingleModelHandler,
'scan': handlers.ScanHandler,
'sym': handlers.SymmetryHandler,
'emit': handlers.EmitterHandler,
'region': handlers.SingleRegionHandler
}
self.calculator_class = None
self._tasks_fields = []
self._db = database.ResultsDatabase()
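a derived project typically adjusts these defaults in its own constructor, e.g. to fix the population size or to seed the optimizer from an earlier run (a sketch; class name, file name and values are illustrative):
@code{.py}
class MyProject(Project):
    def __init__(self):
        super(MyProject, self).__init__()
        # fix the population size instead of relying on the automatic choice
        self.optimizer_params['pop_size'] = 20
        # resume from a previous run; keep its R-factors as they are
        self.optimizer_params['seed_file'] = "run1.tasks.dat"
        self.optimizer_params['recalc_seed'] = False
@endcode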
def create_domain(self):
"""
@ -676,8 +605,8 @@ class Project(object):
@return: None
"""
self.scans = []
self._combined_scan = None
self._combined_modf = None
self.combined_scan = None
self.combined_modf = None
def add_scan(self, filename, emitter, initial_state, is_modf=False, modf_model=None):
"""
@ -695,7 +624,7 @@ class Project(object):
* intensity vs theta and phi (hemisphere or hologram scan)
the method calculates the modulation function if @c is_modf is @c False.
it also updates @c _combined_scan and @c _combined_modf which may be used as R-factor comparison targets.
it also updates @c combined_scan and @c combined_modf which may be used as R-factor comparison targets.
@param filename: (string) file name of the experimental data, possibly including a path.
@ -732,22 +661,26 @@ class Project(object):
scan.modulation = None
if scan.raw_data is not None:
if self._combined_scan is not None:
dtype = md.common_dtype((self._combined_scan, scan.raw_data))
self._combined_scan = np.hstack((self._combined_scan, md.restructure_data(scan.raw_data, dtype)))
if self.combined_scan is not None:
dt = md.common_dtype((self.combined_scan, scan.raw_data))
d1 = md.restructure_data(self.combined_scan, dt)
d2 = md.restructure_data(scan.raw_data, dt)
self.combined_scan = np.hstack((d1, d2))
else:
self._combined_scan = scan.raw_data.copy()
self.combined_scan = scan.raw_data.copy()
else:
self._combined_scan = None
self.combined_scan = None
if scan.modulation is not None:
if self._combined_modf is not None:
dtype = md.common_dtype((self._combined_modf, scan.modulation))
self._combined_modf = np.hstack((self._combined_modf, md.restructure_data(scan.modulation, dtype)))
if self.combined_modf is not None:
dt = md.common_dtype((self.combined_modf, scan.modulation))
d1 = md.restructure_data(self.combined_modf, dt)
d2 = md.restructure_data(scan.modulation, dt)
self.combined_modf = np.hstack((d1, d2))
else:
self._combined_modf = scan.modulation.copy()
self.combined_modf = scan.modulation.copy()
else:
self._combined_modf = None
self.combined_modf = None
return scan
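a typical call from a project's constructor or setup code (file name and initial state are illustrative):
@code{.py}
self.add_scan(filename="data/cu_2p_hologram.etpi",
              emitter="Cu", initial_state="2p3/2")
@endcode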
@ -783,9 +716,16 @@ class Project(object):
def set_output(self, filename):
"""
set base name of output file
set path and base name of output file.
path and name are copied to the output_file attribute.
path is copied to the output_dir attribute.
if the path is missing, the destination is the current working directory.
"""
self.output_file = filename
path, name = os.path.split(filename)
self.output_dir = path
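for example (paths are illustrative):
@code{.py}
self.set_output("work/run1/myproject")
# self.output_file -> "work/run1/myproject"
# self.output_dir  -> "work/run1"
@endcode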
def set_timedelta_limit(self, timedelta):
"""
@ -797,12 +737,15 @@ class Project(object):
def combine_symmetries(self, parent_task, child_tasks):
"""
combine results of different symmetry into one result. calculate the modulation function.
combine results of different symmetries into one result and calculate the modulation function.
the symmetry results are read from the file system using the indices defined by the child_tasks,
and the combined result is written to the file system with the index defined by parent_task.
by default, this method adds all symmetries with equal weight.
weights can be defined in the model dictionary with keys 'wsym0', 'wsym1', etc.
missing weights default to 1.
note: to avoid correlated parameters, one symmetry must always have a fixed weight.
@param parent_task: (CalculationTask) parent task of the symmetry tasks.
the method must write the results to the files indicated
@ -817,19 +760,26 @@ class Project(object):
@raise IndexError if child_tasks is empty
@raise KeyError if a filename is missing
@raise IOError if a filename is missing
@note the weights of the symmetries (in derived classes) can be part of the optimizable model parameters.
the model parameters are available as the @c model attribute of the calculation tasks.
"""
result_data = None
sum_weights = 0.
for task in child_tasks:
data = md.load_data(task.result_filename)
if result_data is not None:
result_data['i'] += data['i']
else:
result_data = data
if result_data is None:
result_data = data.copy()
result_data['i'] = 0.
try:
weight = task.model['wsym{}'.format(task.id.sym)]
except KeyError:
weight = 1.
result_data['i'] += weight * data['i']
sum_weights += weight
result_data['i'] /= sum_weights
md.save_data(parent_task.result_filename, result_data)
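to make the weights part of the optimization, a project declares them in its domain; one weight stays undeclared so that it remains fixed at 1 (a sketch; parameter names follow the 'wsym' convention above):
@code{.py}
# in a Project subclass:
def create_domain(self):
    dom = Domain()
    # 'wsym0' is not declared, so symmetry 0 keeps the fixed weight 1
    dom.add_param('wsym1', start=1.0, min=0.0, max=2.0, step=0.1)
    return dom
@endcode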
@ -865,7 +815,7 @@ class Project(object):
@raise IndexError if child_tasks is empty
@raise KeyError if a filename is missing
@raise IOError if a filename is missing
@note the weights of the emitters (in derived classes) can be part of the optimizable model parameters.
the model parameters are available as the @c model attribute of the calculation tasks.
@ -898,7 +848,7 @@ class Project(object):
the datasets of the scans are appended.
this is done for intensity and modulation data independently.
@param parent_task: (CalculationTask) parent task of the symmetry tasks.
@param parent_task: (CalculationTask) parent task of the scan tasks.
the method must write the results to the files indicated
by the @c result_filename and @c modf_filename attributes.
@ -910,14 +860,12 @@ class Project(object):
@return: None
@raise IndexError if child_tasks is empty.
@raise KeyError if a filename is missing.
"""
# intensity
try:
stack1 = [md.load_data(task.result_filename) for task in child_tasks]
except (KeyError, IOError):
except IOError:
parent_task.result_filename = ""
else:
dtype = md.common_dtype(stack1)
@ -928,7 +876,7 @@ class Project(object):
# modulation
try:
stack1 = [md.load_data(task.modf_filename) for task in child_tasks]
except (KeyError, IOError):
except IOError:
parent_task.modf_filename = ""
else:
dtype = md.common_dtype(stack1)
@ -936,6 +884,142 @@ class Project(object):
result_modf = np.hstack(tuple(stack2))
md.save_data(parent_task.modf_filename, result_modf)
def combine_regions(self, parent_task, child_tasks):
"""
combine results from different regions into one result, for intensity and modulation.
the scan results are read from the file system using the indices defined by the child_tasks,
and the combined result is written to the file system with the index defined by parent_task.
the datasets of the regions are appended and sorted in the standard order of the data module.
if the resulting length differs from the corresponding experimental scan,
an error is printed to the logger, but the calculation continues.
the modulation function is calculated by calling @ref calc_modulation.
@param parent_task: (CalculationTask) parent task of the region tasks.
the method writes the results to the file names
given by the @c result_filename and @c modf_filename attributes.
@param child_tasks: (sequence of CalculationTask) tasks which identify each region.
the method reads the source data from the files
indicated by the @c result_filename attributes.
the sequence is sorted by task ID, i.e., essentially, by region index.
@return: None
@raise IndexError if child_tasks is empty.
"""
# intensity
try:
stack1 = [md.load_data(task.result_filename) for task in child_tasks]
except IOError:
parent_task.result_valid = False
parent_task.result_filename = ""
else:
dtype = md.common_dtype(stack1)
stack2 = [md.restructure_data(data, dtype) for data in stack1]
result_data = np.hstack(tuple(stack2))
md.sort_data(result_data)
md.save_data(parent_task.result_filename, result_data)
scan = self.scans[parent_task.id.scan]
if result_data.shape[0] != scan.raw_data.shape[0]:
logger.error(BMsg("scan length mismatch: combined result: {result}, experimental data: {expected}",
result=result_data.shape[0], expected=scan.raw_data.shape[0]))
# modulation
try:
data = md.load_data(parent_task.result_filename)
modf = self.calc_modulation(data, parent_task.model)
except IOError:
parent_task.modf_filename = ""
else:
md.save_data(parent_task.modf_filename, modf)
def setup(self, handlers):
"""
prepare for calculations.
this method is called in the master process before starting the task loop.
at this point the task handlers have been created and set up.
if the project needs to change settings of task handlers it can do so in this method.
this implementation writes the header of the tasks.dat file
that will receive sub-task evaluation results from the evaluate_result() method.
@param handlers: dictionary listing the initialized task handler instances.
the dictionary keys are the attribute names of pmsco.dispatch.CalcID:
'model', 'scan', 'sym', 'emit' and 'region'.
@return: None
"""
fields = ["rfac"]
fields.extend(dispatch.CalcID._fields)
fields = ["_" + f for f in fields]
dom = self.create_domain()
# sorted() returns a list under python 2 and 3 (dict.keys() is a view in python 3)
model_fields = sorted(dom.start.keys(), key=lambda name: name.lower())
fields.extend(model_fields)
self._tasks_fields = fields
with open(self.output_file + ".tasks.dat", "w") as outfile:
outfile.write("# ")
outfile.write(" ".join(fields))
outfile.write("\n")
# todo : fill in the descriptive fields, change to file-database
self._db.connect(":memory:")
project_id = self._db.register_project(self.__class__.__name__, sys.argv[0])
job_id = self._db.register_job(project_id,
"job-name",
self.mode,
socket.gethostname(),
"git-hash",
datetime.datetime.now(),
"description")
self._db.register_params(model_fields)
self._db.create_models_view()
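for two illustrative model parameters dAB and theta, the header line written to <output>.tasks.dat would read:
@code{.py}
# _rfac _model _scan _sym _emit _region dAB theta
@endcode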
def evaluate_result(self, parent_task, child_tasks):
"""
evaluate the result of a calculation task.
this method is called from the add_result() method of the task handlers at each level.
it gives the project a hook to check the progress of a model at any level of the task tree.
the method calculates the r-factor by calling the Project.calc_rfactor method.
the result is written to the task.rfac field and to the .tasks.dat file.
invalid and region-level results are skipped.
this method is called in the master process only.
@param parent_task: (CalculationTask) a calculation task.
@param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
the sequence must be sorted by task ID.
@return: None
"""
if parent_task.result_valid and parent_task.id.region == -1:
try:
parent_task.rfac = self.calc_rfactor(parent_task, child_tasks)
except ValueError:
parent_task.result_valid = False
logger.warning(BMsg("calculation {0} resulted in an undefined R-factor.", parent_task.id))
else:
values_dict = parent_task.id._asdict()
values_dict = {"_" + k: v for k, v in values_dict.items()}
values_dict.update(parent_task.model)
values_dict['_rfac'] = parent_task.rfac
values_list = [values_dict[field] for field in self._tasks_fields]
with open(self.output_file + ".tasks.dat", "a") as outfile:
outfile.write(" ".join(format(value) for value in values_list) + "\n")
self._db.insert_result(parent_task.id, values_dict)
return None
# noinspection PyUnusedLocal
def calc_modulation(self, data, model):
"""
@ -965,31 +1049,246 @@ class Project(object):
return md.calc_modfunc_loess(data)
def calc_rfactor(self, task):
def calc_rfactor(self, parent_task, child_tasks):
"""
calculate the R-factor of a task.
calculate the r-factor of a task.
the method calculates the R-factor over the combined scans.
the corresponding experimental data is taken from self._combined_modf.
the r-factor is calculated on the experimental and simulated modulation functions.
the algorithm differs for the model level and the lower task levels.
at the model level, the calculation is delegated to Project.combine_rfactors.
at all other levels, the r-factor is calculated by Project.rfactor,
where the simulated data is loaded from the file specified by parent_task.modf_filename
and the experimental data is taken from the corresponding element of Project.scans.
this method is called by the model handler.
this method is called by the task handlers.
all child tasks belonging to the parent task must be complete.
by default, the R-factor is calculated by data.rfactor() over the combined scans.
override this method in your project to use a different R-factor algorithm.
to select or implement a specific R-factor algorithm,
the project sub-class should override Project.rfactor.
to combine scan r-factors, it should override or patch Project.combine_rfactors.
@param task: (CalculationTask) a model task.
@version in earlier versions,
projects had to override this method to implement their algorithm.
this has led to duplication of common code.
the r-factor algorithm is now distributed over several methods,
and the method signature has changed.
new projects should override Project.rfactor and/or Project.combine_rfactors.
@return (int) calculated R-factor.
@param parent_task: (CalculationTask) a calculation task.
@param child_tasks: (sequence of CalculationTask) tasks which identify each scan.
the sequence must be sorted by task ID.
@return (float) calculated R-factor.
@raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
"""
task_data = md.load_data(task.modf_filename)
result_r = md.rfactor(self._combined_modf, task_data)
if parent_task.id.scan >= 0:
task_data = md.load_data(parent_task.modf_filename)
exp_data = self.scans[parent_task.id.scan].modulation
result_r = self.rfactor(exp_data, task_data)
else:
result_r = self.combine_rfactors(parent_task, child_tasks)
return result_r
def rfactor(self, exp_data, theo_data):
"""
calculate the r-factor of simulated diffraction data.
in this class, the method calls the data.rfactor function to calculate the r-factor.
override this method in your project to use a different R-factor algorithm.
the input arrays must have the same shape,
and the coordinate columns must be identical (they are ignored, however).
the array elements are compared element-by-element.
terms having NaN intensity are ignored.
if the sigma column is present in experiment and non-zero,
the R-factor terms are weighted.
@param exp_data: (numpy structured array)
ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
if an @c s field is present and non-zero,
the R-factor terms are weighted by 1/sigma**2.
@param theo_data: (numpy structured array)
ETPI or ETPAI array containing the calculated modulation functions.
@return: (float) scalar R-factor
@raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
"""
return md.rfactor(exp_data, theo_data)
def opt_rfactor(self, exp_data, theo_data):
"""
calculate the r-factor of simulated diffraction data, adjusting their amplitude.
this is an alternative r-factor calculation algorithm
using the pmsco.data.optimize_rfactor() function.
to activate this method (replacing the default one), assign it to Project.rfactor
in the overriding __init__ or setup method:
@code{.py}
self.rfactor = self.opt_rfactor
@endcode
@param exp_data: (numpy structured array)
ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
if an @c s field is present and non-zero,
the R-factor terms are weighted by 1/sigma**2.
@param theo_data: (numpy structured array)
ETPI or ETPAI array containing the calculated modulation functions.
@return: (float) scalar R-factor
@raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
"""
return md.optimize_rfactor(exp_data, theo_data)
def combine_rfactors(self, parent_task, child_tasks):
"""
combine r-factors of child tasks.
the r-factors are taken from the rfac attribute of the child_tasks.
the result is an average of the child r-factors.
to produce a balanced result, every child dataset must contain a similar amount of information.
if this is not the case, the child r-factors must be weighted.
weighting is currently not implemented but may be introduced in a future version.
the method is intended to be used at the model level (children are scans),
though it can technically be used at any level where child r-factors are available.
@param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
i.e. a model task.
@param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
that may be consulted for calculating the r-factor.
@return: (float) r-factor, NaN if parent task is invalid
@raise ValueError or IndexError if child_tasks is empty.
"""
if parent_task.result_valid:
rsum = 0.
for task in child_tasks:
rsum += task.rfac
return rsum / len(child_tasks)
else:
return float('nan')
def alt_combine_rfactors(self, parent_task, child_tasks):
"""
combine r-factors of child tasks by explicit calculation on the combined result.
this is an alternative implementation of combine_rfactors.
instead of using the r-factors from child tasks,
it re-calculates the r-factor for the combined dataset.
this method avoids the issue of weighting
but can introduce bias if the amplitudes of the child datasets differ substantially.
the simulated dataset is loaded from the file specified by the parent task,
and the corresponding experimental data is taken from self.combined_modf.
to activate this method, assign it to combine_rfactors
in the overriding __init__ or setup method:
@code{.py}
self.combine_rfactors = self.alt_combine_rfactors
@endcode
@param parent_task: (CalculationTask) parent task for which the r-factor is calculated,
i.e. a model task.
@param child_tasks: (sequence of CalculationTask) child tasks of parent_tasks
that may be consulted for calculating the r-factor.
@return: (float) r-factor, NaN if parent task is invalid
"""
if parent_task.result_valid:
task_data = md.load_data(parent_task.modf_filename)
exp_data = self.combined_modf
return self.rfactor(exp_data, task_data)
else:
return float('nan')
def export_cluster(self, index, filename, cluster):
"""
export the cluster of a calculation task in XYZ format for diagnostics and reporting.
this method is called with the final cluster just before it is handed over to the calculator.
it saves the atom coordinates in XYZ format for future reference (e.g. graphics).
the method creates two files:
@arg a file with extension '.xyz' contains the whole cluster in XYZ format.
@arg a file with extension '.emit.xyz' contains only emitter atoms in XYZ format.
the first part of the file name is formatted with the output name and the complete task identification.
the file is registered with the file tracker in the 'cluster' category
so that it will be deleted unless the cluster category is selected for keeping.
a derived project class may override or extend this method
to carry out further diagnostics or reporting on the cluster.
@param index: (CalcID) calculation index to which the cluster belongs.
region may be -1 if only one cluster is exported for all regions
(clusters do not depend on the scan region).
emit may be -1 if the cluster is a master from which emitter-related child clusters are derived.
@param filename: (str) base file name for the output files.
the filename should be formatted using pmsco.dispatch.CalculationTask.format_filename().
extensions are appended by this method.
@param cluster: a pmsco.cluster.Cluster() object with all atom positions and emitters.
@return: dictionary listing the names of the created files with their category.
the dictionary key is the file name,
the value is the file category (cluster).
"""
_files = {}
xyz_filename = filename + ".xyz"
cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ)
_files[xyz_filename] = 'cluster'
xyz_filename = filename + ".emit.xyz"
cluster.save_to_file(xyz_filename, fmt=mc.FMT_XYZ, emitters_only=True)
_files[xyz_filename] = 'cluster'
return _files
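a derived class can extend this hook for extra reporting, for instance (a sketch; get_atom_count is assumed by analogy with get_emitter_count mentioned above):
@code{.py}
class MyProject(Project):
    def export_cluster(self, index, filename, cluster):
        _files = super(MyProject, self).export_cluster(index, filename, cluster)
        logger.info(BMsg("cluster {0}: {1} atoms", filename, cluster.get_atom_count()))
        return _files
@endcode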
def cleanup(self):
"""
delete unwanted files at the end of a project.
@return: None
"""
self.cleanup_files()
self._db.disconnect()
def cleanup_files(self, keep=0):
"""
delete uninteresting files.
these are all files that
belong to one of the self.files.categories_to_delete categories or
do not belong to one of the "best" models.
"best" models are a number (self.keep_best) of models that gave the lowest R-factors
at each task level from root to self.keep_levels.
for example, if `keep_best = 10` and `keep_levels = 1`,
the 10 best models at the top level and the 10 best at the scan level are kept.
this means that in total up to `n = 10 + 10 * n_scans` models may be kept,
where n_scans is the number of scan files in the job.
@param keep: minimum number of best models to keep.
0 (default): use the project parameter self.keep_best.
@return None
"""
self.files.delete_files()
if 'rfac' in self.files.categories_to_delete:
keep = max(keep, self.keep_best)
keepers = self._db.query_best_task_models(self.keep_levels, keep)
self.files.delete_models(keep=keepers)
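a hedged configuration sketch tying these attributes together (assumes categories_to_delete holds a set of category names; only 'cluster' and 'rfac' appear in this excerpt):
@code{.py}
# in a Project subclass __init__, after the base constructor:
self.files.categories_to_delete = {'rfac', 'cluster'}
self.keep_best = 10    # keep the files of the 10 best models
self.keep_levels = 1   # at the model (combined) and scan levels
@endcode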