1015 lines
37 KiB
Python
1015 lines
37 KiB
Python
"""
|
|
@package pmsco.data
|
|
Import, export, evaluation of msc data.
|
|
|
|
This module provides common functions for loading/saving and manipulating PED scan data sets.
|
|
|
|
@author Matthias Muntwiler
|
|
|
|
@copyright (c) 2015-23 by Paul Scherrer Institut @n
|
|
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
"""
|
|
|
|
import logging
|
|
import math
|
|
import numpy as np
|
|
import numpy.typing as npt
|
|
import os
|
|
import scipy.special
|
|
import scipy.optimize as so
|
|
from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union
|
|
import h5py
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
try:
|
|
import loess
|
|
except (ModuleNotFoundError, ImportError) as e:
|
|
loess = None
|
|
logger.critical("Error importing the loess package.", exc_info=e)
|
|
|
|
## energy, intensity
DTYPE_EI = [('e', 'f4'), ('i', 'f4')]
## energy, theta, phi, intensity
DTYPE_ETPI = [('e', 'f4'), ('t', 'f4'), ('p', 'f4'), ('i', 'f4')]
## energy, theta, phi, intensity, sigma (standard deviation)
DTYPE_ETPIS = [('e', 'f4'), ('t', 'f4'), ('p', 'f4'), ('i', 'f4'), ('s', 'f4')]
## energy, theta, phi, alpha, intensity
DTYPE_ETPAI = [('e', 'f4'), ('t', 'f4'), ('p', 'f4'), ('a', 'f4'), ('i', 'f4')]
## energy, theta, phi, alpha, intensity, sigma (standard deviation)
DTYPE_ETPAIS = [('e', 'f4'), ('t', 'f4'), ('p', 'f4'), ('a', 'f4'), ('i', 'f4'), ('s', 'f4')]
## theta, phi
DTYPE_TP = [('t', 'f4'), ('p', 'f4')]
## theta, phi, intensity
DTYPE_TPI = [('t', 'f4'), ('p', 'f4'), ('i', 'f4')]
## theta, phi, intensity, sigma (standard deviation)
DTYPE_TPIS = [('t', 'f4'), ('p', 'f4'), ('i', 'f4'), ('s', 'f4')]
## intensity, theta, phi
DTYPE_ITP = [('i', 'f4'), ('t', 'f4'), ('p', 'f4')]

## map from data type name (file extension) to numpy dtype specification
DTYPES = {'EI': DTYPE_EI, 'ETPI': DTYPE_ETPI, 'ETPIS': DTYPE_ETPIS, 'ETPAI': DTYPE_ETPAI, 'ETPAIS': DTYPE_ETPAIS,
          'TP': DTYPE_TP, 'TPI': DTYPE_TPI, 'TPIS': DTYPE_TPIS, 'ITP': DTYPE_ITP, }

## names of the supported data types.
# bug fix: this was previously assigned the unbound method `DTYPES.keys`
# (missing call parentheses), which broke membership tests such as
# `'ETPI' in DATATYPES` with a TypeError.
# the dictionary view returned by keys() stays in sync with DTYPES.
DATATYPES = DTYPES.keys()

## supported scan types
# @arg @c 'E' energy
# @arg @c 'EA' energy - alpha (analyser)
# @arg @c 'ET' energy - theta
# @arg @c 'TP' theta - phi (holo scan)
SCANTYPES = ['E', 'EA', 'ET', 'TP']

## argument types accepted by numpy.genfromtxt/loadtxt as data source
GenTextFileLike = Union[str, os.PathLike, Iterable[str], int]
## argument types accepted by open() as data source
OSFileLike = Union[str, os.PathLike, int]
|
|
|
|
|
def create_etpi(shape: Tuple[int], sigma_column: bool = True) -> np.ndarray:
    """
    create an ETPI array of a given size.

    an ETPI array is a numpy structured array.
    all fields of the new array are initialized with zeroes.

    @param shape (tuple) shape of the array
    @param sigma_column: whether the array should include a sigma field (ETPIS type instead of ETPI)
    """
    chosen_dtype = DTYPE_ETPIS if sigma_column else DTYPE_ETPI
    return np.zeros(shape, dtype=chosen_dtype)
|
|
|
|
|
|
def create_data(shape: Tuple[int], datatype: str = '', dtype: Optional[npt.DTypeLike] = None) -> np.ndarray:
    """
    create a data array of a given size and type.

    a data array is a numpy structured array, initialized with zeroes.
    either datatype or dtype must be specified; dtype takes precedence.

    @param shape (tuple) shape of the array, only scalars (1-tuples) supported currently
    @param datatype see DATATYPES
    @param dtype see DTYPES
    """
    effective_dtype = dtype if dtype else DTYPES[datatype]
    return np.zeros(shape, dtype=effective_dtype)
|
|
|
|
|
|
def holo_grid(theta_start: float = 90., theta_step: float = 1., theta_range: float = 90.,
              phi_start: float = 0., phi_range: float = 360., phi_refinement: float = 1.):
    """
    Generator of a holo grid with constant point density in solid angle.

    The generator yields the polar coordinates of a hologram scan in the traditional Osterwalder fashion:
    the polar step is constant, while the azimuthal step grows towards the pole
    so that the points are distributed evenly on the hemisphere.

    Theta is the polar, phi the azimuthal coordinate.

    @param theta_start Maximum polar angle in degrees, 0..90. Defaults to 90 (grazing emission).
    @param theta_step Polar angle step in degrees, 1..90. Defaults to 1.
    @param theta_range Polar angle range in degrees, 1..th_start. Defaults to 90.
    @param phi_start Azimuthal start angle in degrees. Defaults to 0.
        This azimuth is included at every polar step.
    @param phi_range Azimuthal range in degrees. Defaults to 360.
    @param phi_refinement Azimuthal refinement/oversampling (scalar). Defaults to 1.
        A refinement of 2 yields a factor 2 more grid points in the azimuthal sub-scans.

    @return yield tuples (theta, phi) in degrees
    """
    deg2rad = 0.01745329

    def azimuthal_step(polar):
        # near the pole, or when the refined azimuthal density rounds down
        # to zero points, collapse the azimuthal scan to a single point
        # (a step of 360 makes np.arange yield only phi_start).
        if polar < 0.5 or int(phi_range * math.sin(polar * deg2rad) * phi_refinement / theta_step) == 0:
            step = 0.0
        else:
            step = phi_range / int(polar / theta_start * phi_range / theta_step)
        return 360. if abs(step) < 0.001 else step

    for polar in np.arange(theta_range, -theta_step, -theta_step):
        step = azimuthal_step(polar)
        for azimuth in np.arange(phi_start, phi_range, step):
            yield polar, azimuth
|
|
|
|
|
|
def holo_array(generator: Callable[..., Iterable[Tuple[float, float]]],
               generator_args: Dict,
               datatype: str = 'TP',
               dtype: Optional[npt.DTypeLike] = None) -> np.ndarray:
    """
    Create an hologram scan grid in a numpy array.

    A holo data array is a numpy structured array containing at least
    a 't' (polar angle) and a 'p' (azimuthal angle) column.
    These columns are filled from the given generator;
    any further columns (energy, intensity, ...) are initialized with zeroes.

    @param generator Generator that yields tuples (theta, phi) for each grid point,
        given the keyword arguments generator_args.
        holo_grid implements the traditional Osterwalder holo scan.
    @param generator_args Keyword arguments to be passed to the generator.
    @param datatype See DATATYPES. Must contain 'T' and 'P' dimensions. Defaults to 'TP'.
    @param dtype See DTYPES. Must contain a 't' and 'p' column. Takes precedence over datatype.
        Defaults to None (not specified).
    """
    if not dtype:
        dtype = DTYPES[datatype]

    angles = np.fromiter(generator(**generator_args), dtype=DTYPES['TP'])

    result = np.zeros(angles.shape, dtype=dtype)
    for column in ('t', 'p'):
        result[column] = angles[column]

    return result
|
|
|
|
|
|
def analyse_holoscan_steps(holoscan: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Find the polar and azimuthal steps in a holoscan.

    NOTE(review): the index arithmetic assumes the scan is sorted by theta,
    then phi (as produced by holo_grid and sort_data) -- confirm with callers.

    @param holoscan: TP-like structured numpy.ndarray with 't' and 'p' columns.

    @return: thetas: unique theta angles. sorted.
        dtheta: theta steps for each theta
        dphi: phi step for each theta
    """

    # indices points at the first row of each theta group,
    # counts holds the number of phi points per theta
    thetas, indices, counts = np.unique(holoscan['t'], return_index=True, return_counts=True)
    # repeat the last step so dtheta has one entry per theta
    dtheta = np.diff(thetas)
    dtheta = np.append(dtheta, dtheta[-1])

    # duplicate the last phi so that indices+1 stays within bounds
    # even when the last theta group starts on the last row
    adjusted_phis = np.append(holoscan['p'], holoscan['p'][-1])
    phis0 = adjusted_phis[indices]
    phis1 = adjusted_phis[indices+1]
    # phi step = difference between the first two phis of each theta group
    dphi = phis1 - phis0
    # full azimuthal range, estimated from the last theta group
    phi_range = counts[-1] * dphi[-1]
    # theta groups with a single phi point span the full azimuthal range
    dphi[counts <= 1] = phi_range

    return thetas, dtheta, dphi
|
|
|
|
|
|
def load_plt(filename: GenTextFileLike, int_column: int = -1) -> np.ndarray:
    """
    loads ETPI data from an MSC output (plt) file

    plt file format:
    5-9 columns, space or tab delimited.
    column 0: energy.
    column 1: momentum.
    column 2: theta.
    column 3: phi.
    columns 4-8: intensities.
    comment lines must start with # character.

    @param filename: path or name of the file to be read

    @param int_column: index of the column to be read as intensity.
        typical values: 4, 5, 6, 7, 8
        or negative: -1 (last), -2, (second last), ...
        default: -1

    @return a structured one-dimensional numpy.ndarray where
        data[i]['e'] = energy,
        data[i]['t'] = theta,
        data[i]['p'] = phi,
        data[i]['i'] = selected intensity column.
    """
    columns = (0, 2, 3, int_column)
    raw = np.genfromtxt(filename, usecols=columns, dtype=DTYPE_ETPI)
    data = np.atleast_1d(raw)
    sort_data(data)
    return data
|
|
|
|
|
|
def load_edac_pd(filename: OSFileLike, int_column: int = -1,
                 energy: float = 0.0, theta: float = 0.0, phi: float = 0.0, fixed_cluster: bool = False) -> np.ndarray:
    """
    load ETPI or ETPAI data from an EDAC PD output file.

    EDAC file format:
    @arg row 0: "--- scan PD"
    @arg row 1: column names
    @arg rows 2 and following: space delimited data

    @arg first columns (up to 3): energy, theta, phi depending on scan
    @arg last columns (arbitrary number): intensity at the recursion order specified in the header

    @param filename: path or name of the file to be read

    @param int_column: index of the column to be read as intensity.
        typical values: -1 (last), -2, (second last), ...
        default: -1

    @param energy: default value if energy column is missing
    @param theta: default value if theta column is missing
    @param phi: default value if phi column is missing

    @param fixed_cluster:
        if True, (theta, phi) are mapped to (alpha, phi). theta is copied from function argument.
        if False, angles are copied literally.

    @return a structured one-dimensional numpy.ndarray (ETPI or ETPAI)

    @verbatim
    data[i]['e'] = energy
    data[i]['t'] = theta
    data[i]['p'] = phi
    data[i]['i'] = selected intensity column
    @endverbatim
    """

    # read only the two header lines here; the data rows are parsed
    # by genfromtxt below, which re-opens the file with skip_header=2
    with open(filename, "rt", encoding="latin1") as f:
        header1 = f.readline().strip()
        header2 = f.readline().strip()
    if not header1 == '--- scan PD':
        logger.warning("unexpected EDAC output file header format")

    # build the dtype and the column selection from the header names.
    # the coordinate columns (eV, theta, phi) come first in fixed order;
    # "order" marks the start of the intensity columns, of which only
    # int_column is read.
    col_names = header2.split()
    dtype = []
    cols = []
    ncols = 0
    for name in col_names:
        if name == "eV":
            dtype.append(('e', 'f4'))
            cols.append(ncols)
            ncols += 1
        elif name == "theta":
            dtype.append(('t', 'f4'))
            cols.append(ncols)
            ncols += 1
        elif name == "phi":
            dtype.append(('p', 'f4'))
            cols.append(ncols)
            ncols += 1
        elif name == "order":
            # intensity columns reached: select int_column (negative index
            # counts from the end of the row) and stop parsing the header
            dtype.append(('i', 'f4'))
            cols.append(int_column)
            ncols += 1
            break
        else:
            logger.warning("unexpected EDAC output file column name")
            break
    cols = tuple(cols)
    raw = np.atleast_1d(np.genfromtxt(filename, usecols=cols, dtype=dtype, skip_header=2))

    # assemble the result array; missing coordinate columns are filled
    # with the default values from the function arguments
    if fixed_cluster:
        etpi = np.empty(raw.shape, dtype=DTYPE_ETPAI)
    else:
        etpi = np.empty(raw.shape, dtype=DTYPE_ETPI)

    if 'eV' in col_names:
        etpi['e'] = raw['e']
    else:
        etpi['e'] = energy
    if 'theta' in col_names:
        etpi['t'] = raw['t']
    else:
        etpi['t'] = theta
    if 'phi' in col_names:
        etpi['p'] = raw['p']
    else:
        etpi['p'] = phi
    etpi['i'] = raw['i']

    if fixed_cluster:
        # fixed-cluster mode: the scanned angle is the analyser angle alpha,
        # the sample polar angle is fixed at the function argument
        etpi['a'] = etpi['t']
        etpi['t'] = theta

    sort_data(etpi)
    return etpi
|
|
|
|
|
|
def load_etpi(filename: GenTextFileLike) -> np.ndarray:
    """
    loads ETPI or ETPIS data from a text file

    etpi file format:
    4 or 5 columns, space or tab delimited.
    column 0: energy.
    column 1: theta.
    column 2: phi.
    column 3: intensity.
    column 4: sigma error (standard deviation). optional defaults to 0.
    comment lines must start with # character.
    comment lines may appear anywhere, and are ignored.

    @param filename: path or name of the file to be read.
        load_etpi handles compressed files (ending .gz) transparently.

        NOTE(review): on a 4-column file the first loadtxt attempt fails and
        the file is read a second time. a one-shot iterable (generator or
        open stream) would be exhausted by then -- pass a path in that case.

    @return a structured one-dimensional numpy.ndarray where
        data[i]['e'] = energy,
        data[i]['t'] = theta,
        data[i]['p'] = phi,
        data[i]['i'] = intensity,
        data[i]['s'] = sigma (if present in the file).

    @deprecated new code should use load_data().
    """
    try:
        # optimistic attempt: 5 columns including sigma
        data = np.loadtxt(filename, dtype=DTYPE_ETPIS)
    except IndexError:
        # fall back to 4 columns without sigma
        data = np.loadtxt(filename, dtype=DTYPE_ETPI)
    sort_data(data)
    return data
|
|
|
|
|
|
def load_data(filename: GenTextFileLike, dtype: Optional[npt.DTypeLike] = None):
    """
    load column data (ETPI, and the like) from a text file.

    the file extension must spell one of DATATYPES (case insensitive)
    corresponding to the meaning of the columns in the file.

    @param filename

    @param dtype: override data type recognition if the extension cannot be used.
        must be one of the data.DTYPE constants
        DTYPE_EI, DTYPE_ETPI, DTYPE_ETPIS, DTYPE_ETPAI, or DTYPE_ETPAIS.
        by default, the function uses the extension to determine the data type.
        the actual type can be read from the dtype attribute of the returned array.
        if the extension is missing or unknown, DTYPE_EI is assumed.

    @return one-dimensional numpy structured ndarray with data

    @raise IOError if the file cannot be read.

    @raise IndexError if the number of columns is lower than expected based on the dtype or extension.
    """
    if not dtype:
        _, extension = os.path.splitext(filename)
        dtype = DTYPES.get(extension[1:].upper(), DTYPE_EI)

    data = np.loadtxt(filename, dtype=dtype)
    sort_data(data)
    return data
|
|
|
|
|
|
def format_extension(data: np.ndarray) -> str:
    """
    format the file extension based on the contents of an array.

    the extension is the concatenation of the field names of the array.

    @param data ETPI-like structured numpy.ndarray.

    @return: file extension string including the leading period.
    """
    joined_names = "".join(data.dtype.names)
    return ".{0}".format(joined_names)
|
|
|
|
|
|
def save_data(filename: OSFileLike, data: npt.ArrayLike) -> None:
|
|
"""
|
|
save column data (ETPI, and the like) to a text file.
|
|
|
|
the extension must specify one of DATATYPES (case insensitive)
|
|
corresponding to the meaning of the columns in the file.
|
|
|
|
@param filename
|
|
|
|
@param data ETPI-like structured numpy.ndarray.
|
|
|
|
@remark this function is plain numpy.savetxt, provided for convenience.
|
|
"""
|
|
np.savetxt(filename, data, fmt='%g')
|
|
|
|
|
|
def sort_data(data: np.ndarray) -> None:
    """
    sort scan data (ETPI and the like) in a consistent order.

    the array is sorted in place along the scan dimensions
    energy, theta, phi and alpha (in this precedence).
    this function should be used for all sorting of measured and calculated data
    to ensure a consistent sort order.

    the sort key is built from the scan fields present in the array;
    the intensity and sigma fields are never part of the key.
    the stable _mergesort_ algorithm preserves the relative order
    of indistinct elements.

    @warning sorting on intensity and sigma fields would mix up the scan dimensions
        and produce invalid results!

    @param data ETPI-like structured numpy.ndarray.

    @return: None. the data array is sorted in place.
    """
    scan_fields = {'e', 't', 'p', 'a'}
    key = [field for field in data.dtype.names if field in scan_fields]
    data.sort(kind='mergesort', order=key)
|
|
|
|
|
|
def restructure_data(data: np.ndarray, dtype: Optional[npt.DTypeLike] = None,
                     defaults: Optional[Mapping] = None) -> np.ndarray:
    """
    restructure the type of a data array by adding or removing columns.

    example: to combine an ETPI and an ETPAI scan, both arrays must have the same data type.
    this function adds the necessary columns and initializes them with default values.
    to find out the appropriate data type, use the common_dtype() function.
    to concatenate arrays, call numpy.hstack on a tuple of arrays.

    @param data: original data array (a structured numpy array having one of the DTYPES data types).

    @param dtype: data type of the new array. must be one out of DTYPES.
        default is DTYPE_ETPAIS which includes any possible field.

    @param defaults: default values for new fields.
        this must be a dictionary where the key is the field name and value the default value of the field.
        the dictionary can contain an arbitrary sub-set of fields.
        undefined fields are initialized to zero.
        if the parameter is unspecified, all fields are initialized to zero.

    @return: re-structured numpy array or
        @c data if the new and original data types are the same.
    """
    if dtype is None:
        dtype = DTYPE_ETPAIS
    if data.dtype == dtype:
        return data

    result = np.zeros(data.shape, dtype=dtype)

    # apply defaults first so that shared columns copied below take precedence
    if defaults is not None:
        for name, value in defaults.items():
            if name in result.dtype.names:
                result[name] = value

    shared = [spec[0] for spec in dtype if spec[0] in data.dtype.names]
    for name in shared:
        result[name] = data[name]

    return result
|
|
|
|
|
|
def common_dtype(scans: Iterable[Union[npt.ArrayLike, npt.DTypeLike]]) -> npt.DTypeLike:
    """
    determine the common data type for a number of scans.

    example: to combine an ETPI and an ETPAI scan, both arrays must have the same data type.
    this function determines the least common data type.
    to restructure each array, use the restructure_data() function.
    to concatenate arrays, call numpy.hstack on a tuple of arrays.

    @param scans: iterable of scan data or types.
        the elements of the list must be ETPI-like numpy structured arrays,
        numpy.dtype specifiers of a permitted ETPI-like array,
        or one of the DTYPE constants listed in DTYPES.

    @return: DTYPE constant which includes all the fields referred to in the input data.
    """
    fields = set()
    for item in scans:
        if isinstance(item, np.ndarray):
            names = item.dtype.names
        elif isinstance(item, np.dtype):
            names = item.names
        else:
            # a DTYPE constant: list of (name, format) tuples
            names = [spec[0] for spec in item]
        fields.update(names)

    # filter the superset type so the field order stays canonical
    return [spec for spec in DTYPE_ETPAIS if spec[0] in fields]
|
|
|
|
|
|
def detect_scan_mode(data: np.ndarray) -> Tuple[List[str], Dict[str, np.ndarray]]:
    """
    detect the scan mode and unique scan positions in a data array.

    the function detects which columns of the data array are scanning.
    if the values of a column are not constant, the column is considered to be scanning.
    the function does not require a particular ordering of the scan positions
    (although other parts of the code may do so).
    the function returns the names of the scanning columns.

    the function also extracts unique positions for each column, and returns one array per column of input data.
    in the case of a fixed (non-scanning) column, the resulting array contains one data point.
    if the input data does not contain a particular column, it is omitted from the results.

    if both theta and phi columns are non-constant, the function reports a theta-phi scan.
    in a theta-phi scan, each pair (theta, phi) is considered a scan position,
    and uniqueness is enforced with respect to the (theta, phi) pairs.
    the individual theta and phi arrays may contain duplicate values.

    @param data ETPI-like structured numpy.ndarray.
        only the 'e', 't', 'p', and 'a' columns are considered.

    @return the tuple (scan_mode, scan_positions), where
        @arg scan_mode is a list of column names that refer to the scanned variables,
            i.e. non-constant columns in the input data.
            possible values are 'e', 't', 'p', and 'a'.
        @arg scan_positions is a dictionary of scan dimensions.
            the dictionary contains one-dimensional numpy arrays, one for each dimension.
            the dictionary keys are 'e', 't', 'p', and 'a'.
            if a dimension is not scanned, the corresponding array contains just one element.
            if the input data does not contain a column at all,
            the corresponding output array is not included in the dictionary.

        note the special case of theta-phi scans.
        theta and phi are always returned as two separate arrays.
    """
    def unique_or_empty(column):
        # bug fix: missing fields raise KeyError on modern numpy
        # (older releases raised ValueError), so both must be caught,
        # otherwise data without e.g. an energy column crashes here.
        try:
            return np.unique(data[column])
        except (KeyError, ValueError):
            return np.array([])

    scan_energy = unique_or_empty('e')
    scan_theta = unique_or_empty('t')
    scan_phi = unique_or_empty('p')
    scan_alpha = unique_or_empty('a')

    # theta-phi scan: enforce uniqueness on (theta, phi) pairs
    if scan_theta.shape[0] >= 2 and scan_phi.shape[0] >= 2:
        try:
            scan_theta_phi = np.unique(data[['t', 'p']])
        except (KeyError, ValueError):
            scan_theta_phi = None
        if scan_theta_phi is not None and len(scan_theta_phi.dtype.names) == 2:
            scan_theta = scan_theta_phi['t']
            scan_phi = scan_theta_phi['p']

    scan_mode = []
    scan_positions = {}
    # a column is present if it has at least one unique value,
    # and scanning if it has at least two
    for key, positions in (('e', scan_energy), ('t', scan_theta),
                           ('p', scan_phi), ('a', scan_alpha)):
        if positions.shape[0] >= 1:
            scan_positions[key] = positions
        if positions.shape[0] >= 2:
            scan_mode.append(key)

    return scan_mode, scan_positions
|
|
|
|
|
|
def filter_tp(data: np.ndarray, _filter: np.ndarray) -> np.ndarray:
    """
    select data points from an ETPI array that match theta and phi coordinates of another ETPI array.

    the angles are compared at a resolution of 0.1 degree:
    both coordinates are rounded to the nearest multiple of 0.1 before matching.

    @param data ETPI-like structured numpy.ndarray (ETPI, ETPIS, ETPAI, ETPAIS).

    @param _filter ETPI-like structured numpy.ndarray (ETPI, ETPIS, ETPAI, ETPAIS).
        only 't' and 'p' columns are used.

    @return filtered data (numpy.ndarray)
        copy of selected data rows from input data.
        same data type as input data.
    """
    # copy theta,phi into separate structured arrays
    # so that the row-wise set intersection sees only these two columns
    data_tp = np.zeros_like(data, dtype=[('t', '<i4'), ('p', '<i4')])
    filt_tp = np.zeros_like(_filter, dtype=[('t', '<i4'), ('p', '<i4')])
    # multiply by 10, round to integer (0.1 degree resolution)
    data_tp['t'] = np.around(data['t'] * 10.0)
    data_tp['p'] = np.around(data['p'] * 10.0)
    filt_tp['t'] = np.around(_filter['t'] * 10.0)
    filt_tp['p'] = np.around(_filter['p'] * 10.0)
    # calculate intersection
    # NOTE(review): np.in1d is deprecated since numpy 2.0 in favour of
    # np.isin -- consider migrating when the numpy baseline allows.
    idx = np.in1d(data_tp, filt_tp)
    result = data[idx]
    return result
|
|
|
|
|
|
def interpolate_hemi_scan(rect_tpi: np.ndarray, hemi_tpi: np.ndarray) -> np.ndarray:
    """
    interpolate a hemispherical scan from a rectangular angle scan.

    the function interpolates in phi (azimuth) only.
    the rectangular array must contain a matching scan line for each theta (polar angle) of the hemi scan.
    this requires that the hemi scan have a linear theta axis.

    @param rect_tpi TPI structured numpy.ndarray.
        rectangular theta-phi scan.
        each azimuthal line has the same number of points and range.
        the azimuthal coordinate is monotonically increasing.
    @param hemi_tpi TPI structured numpy.ndarray.
        hemispherical theta-phi scan.
        each theta of the hemi scan must have a matching scan line in the rectangular scan.
        the array may contain additional columns (E, A, S) as long as each (theta,phi) pair is unique.
        the extra columns are not altered.
    @return hemi_tpi with the interpolation result in the I column.
    """
    for theta in np.unique(hemi_tpi['t']):
        # rows of both scans belonging to this polar angle (0.1 deg tolerance)
        hemi_rows = np.abs(hemi_tpi['t'] - theta) < 0.1
        rect_rows = np.abs(rect_tpi['t'] - theta) < 0.1

        hemi_tpi['i'][hemi_rows] = np.interp(hemi_tpi['p'][hemi_rows],
                                             rect_tpi['p'][rect_rows],
                                             rect_tpi['i'][rect_rows])
    return hemi_tpi
|
|
|
|
|
|
def reshape_2d(flat_data: np.ndarray, axis_columns: Sequence[str], return_column: str = 'i') -> \
        Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    reshape an ETPI-like array into a two-dimensional array according to the scan axes.

    @param flat_data structured, one-dimensional numpy.ndarray with column labels.
        the array must contain a rectangular scan grid.
        the array must be sorted in the order of axis_columns.

    @param axis_columns list of column names that designate the axes

    @param return_column: name of field to return in two dimensions

    @return the tuple (result_data, axis0, axis1), where
        @arg result_data (ndarray) new two-dimensional ndarray of the scan
        @arg axis0 (ndarray) scan positions along the first dimension
        @arg axis1 (ndarray) scan positions along the second dimension
    """
    axis0 = np.unique(flat_data[axis_columns[0]])
    axis1 = np.unique(flat_data[axis_columns[1]])
    grid_shape = (len(axis0), len(axis1))
    grid = np.reshape(flat_data[return_column], grid_shape, order='C')
    # return an independent copy so the caller cannot alias flat_data
    return grid.copy(), axis0, axis1
|
|
|
|
|
|
def calc_modfunc_mean(data: np.ndarray) -> np.ndarray:
    """
    calculates the modulation function using the mean value of data.

    this is a simplified calculation method
    which can be used if the I0 of the data does not have a strong variation.

    the modulation function is (I - <I>) / <I>, where <I> is the mean intensity.
    one- and two-dimensional scans are supported.

    @param data: ETPI array containing experimental or calculated intensity.

    @return ETPI array containing the modulation function.
        for unsupported scan modes (more than two scan dimensions),
        an error is logged and an unmodified copy of the input is returned.
    """

    scan_mode, scan_positions = detect_scan_mode(data)
    # single copy up front preserves all non-intensity columns
    # (the original code redundantly copied the array a second time
    # in the one-dimensional branch)
    modf = data.copy()

    if len(scan_mode) == 1:
        norm = np.mean(data['i'], dtype=np.float64)
        modf['i'] = (data['i'] - norm) / norm
    elif len(scan_mode) == 2:
        n0 = len(scan_positions[scan_mode[0]])
        n1 = len(scan_positions[scan_mode[1]])
        nd_data = np.reshape(data['i'], (n0, n1), order='C')

        # grand mean computed via the row means (equivalent on a full grid)
        prof0 = np.mean(nd_data, axis=1, dtype=np.float64)
        norm0 = np.mean(prof0, dtype=np.float64)
        nd_modf = (nd_data - norm0) / norm0

        modf['i'] = np.ravel(nd_modf, order='C')
    else:
        logger.error('unsupported scan in calc_modfunc_mean: {0}'.format(scan_mode))

    return modf
|
|
|
|
|
|
def calc_modfunc_loess(data: np.ndarray, smth: float = 0.4) -> np.ndarray:
    """
    calculate the modulation function using LOESS (locally weighted regression) smoothing.

    the modulation function of I(x) is (I(x) - S(x)) / S(x)
    where the array S(x) is a LOESS-smoothed copy of I(x).

    this function uses true multi-dimensional LOESS smoothing,
    in the same way as Igor's Loess operation.

    this function uses the LOESS algorithm implemented by
    William S. Cleveland, Eric Grosse, Ming-Jen Shyu, dated 18 August 1992.
    the code and the python interface are included in the loess package.

    @param data structured numpy.ndarray in EI, ETPI, or ETPAI format.
        can contain a one- or multi-dimensional scan.
        the algorithm does not require any specific scan mode or order
        (no rectangular grid, no particular scan hierarchy, no sorting).

        if data contains a hemispherical scan, the phi dimension is ignored,
        i.e. the function effectively applies a phi-average.

        the modulation function is calculated for the finite-valued scan points.
        NaNs are ignored and do not affect the finite values.

    @param smth: size of the smoothing window relative to the size of the scan.
        reasonable values are between 0.2 and 0.5.
        the default value 0.4 has been found to work in many cases.

    @return copy of the data array with the modulation function in the 'i' column.
    """
    # restrict the regression to finite intensities; NaN rows keep their value
    sel = np.isfinite(data['i'])
    _data = data[sel]

    modf = data.copy()
    if _data.shape[0]:
        scan_mode, __ = detect_scan_mode(_data)
        if 't' in scan_mode and 'p' in scan_mode:
            # hemispherical scan: smooth along theta only (phi-average)
            scan_mode.remove('p')

        # loess_struct(n, p): n observations in p scan dimensions
        lo = loess.loess_struct(_data.shape[0], len(scan_mode))
        factors = [_data[axis] for axis in scan_mode]
        # NOTE(review): set_x appears to take the factors as one flat array --
        # confirm the expected memory layout against the loess package docs.
        lo.set_x(np.hstack(tuple(factors)))
        lo.set_y(_data['i'])
        lo.model.span = smth
        loess.loess(lo)

        # residual / fit = (I - S) / S, written back to the finite rows only
        modf['i'][sel] = lo.get_fitted_residuals() / lo.get_fitted_values()
    else:
        # no finite input: the modulation function is undefined everywhere
        modf['i'] = np.nan

    return modf
|
|
|
|
|
|
def square_diff_rfactor(experiment: np.ndarray, theory: np.ndarray) -> float:
    """
    Calculate the R-factor from the normalized sum of squared differences.

    If the sigma column is present in experiment and non-zero,
    the R-factor terms are weighted by 1/sigma**2.

    The input arrays must have the same shape and the coordinate columns must be identical.
    The array elements are compared element-by-element.
    The values of the coordinate arrays do not influence the result.
    Terms having NaN intensity are ignored.

    This function can be specified in the Scan.rfactor_func parameter of the project.

    @param experiment: (numpy structured array)
        ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.
        If an `s` field is present and non-zero,
        the R-factor terms are weighted by 1/sigma**2.

    @param theory: (numpy structured array)
        ETPI or ETPAI array containing the theoretical function.

    @return scalar R-factor in the range from 0.0 to 2.0.

    @raise ValueError if the function fails (e.g. division by zero or all elements non-finite).
    """
    # drop terms where either intensity is non-finite
    finite = np.logical_and(np.isfinite(theory['i']), np.isfinite(experiment['i']))
    theory = theory[finite]
    experiment = experiment[finite]

    # weighting only applies when all sigmas are strictly positive
    if ('s' in experiment.dtype.names) and (experiment['s'].min()) > 0.0:
        weights = 1.0 / experiment['s'] ** 2
    else:
        weights = 1.0

    numerator = (weights * (experiment['i'] - theory['i']) ** 2).sum(dtype=np.float64)
    denominator = (weights * (experiment['i'] ** 2 + theory['i'] ** 2)).sum(dtype=np.float64)
    return numerator / denominator
|
|
|
|
|
|
def scaled_rfactor_func(scale: float, experiment: np.ndarray, weights: np.ndarray, theory: np.ndarray) -> float:
    """
    calculate the R-factor of a modulation function against the measurement with scaled amplitude.

    this function allows to apply a scaling factor to the experimental function and returns the R-factor.
    this is useful if the amplitudes of the two functions do not match due to systematic effects
    of the calculation or the measurement.

    this function is used by optimize_rfactor() as a scipy.optimize.least_squares optimization function,
    which requires a specific signature.

    NaNs will propagate to the final result.
    math exceptions are not handled.

    @param scale: scaling factor (> 0).
        the experimental modulation function is multiplied by this parameter.
        < 1 (> 1) decreases (increases) the experimental amplitude.
        the R factor is calculated using the scaled modulation function.

    @param experiment: numpy.ndarray containing the experimental modulation function

    @param weights: numpy.ndarray containing the experimental weights

    @param theory: numpy.ndarray containing the theoretical modulation function

    @return: scalar R-factor in the range from 0.0 to 2.0.
        nan if any element of the function arguments is nan.

    @raise ValueError if all experiments and theory values or all weights are zero.
    """
    squared_diffs = weights * (scale * experiment - theory) ** 2
    squared_sums = weights * (scale ** 2 * experiment ** 2 + theory ** 2)
    numerator = squared_diffs.sum(dtype=np.float64)
    denominator = squared_sums.sum(dtype=np.float64)
    return numerator / denominator
|
|
|
|
|
|
def optimize_rfactor(experiment: np.ndarray, theory: np.ndarray) -> float:
    """
    calculate the R-factor of a calculated modulation function against the measurement, adjusting their amplitude.

    if the sigma column is present in experiment and non-zero,
    the R-factor terms are weighted by 1/sigma**2.

    this function varies the scale of the experimental function and returns the minimum R-factor.
    this is useful if the amplitudes of the two functions do not match due to systematic effects
    of the calculation or the measurement.

    the optimization is done in a scipy.optimize.least_squares optimization of the scaled_rfactor_func() function.
    the initial guess of the scaling factor is 0.7, the constraining boundaries are 1/10 and 10.

    the input arrays must have the same shape and the coordinate columns must be identical (they are ignored).
    the array elements are compared element-by-element.
    terms having NaN intensity are ignored.

    This function can be specified in the Scan.rfactor_func parameter of the project.

    @param experiment: ETPI, ETPIS, ETPAI or ETPAIS array containing the experimental modulation function.

    @param theory: ETPI or ETPAI array containing the calculated modulation functions.

    @return scalar R-factor in the range from 0.0 to 2.0.

    @raise ValueError if the optimization fails (e.g. division by zero or all elements non-finite).
    """
    # discard terms where either intensity is non-finite
    keep = np.logical_and(np.isfinite(theory['i']), np.isfinite(experiment['i']))
    exp_sel = experiment[keep]
    th_sel = theory[keep]

    # weight by 1/sigma**2 only if a strictly positive sigma column is available
    if ('s' in exp_sel.dtype.names) and (exp_sel['s'].min() > 0.0):
        weights = 1.0 / exp_sel['s'] ** 2
    else:
        weights = np.ones_like(exp_sel['i'])

    fit = so.least_squares(scaled_rfactor_func, 0.7, bounds=(0.1, 10.0),
                           args=(exp_sel['i'], weights, th_sel['i']))

    # evaluate the R-factor at the optimum scale found by the fit
    return scaled_rfactor_func(fit.x, exp_sel['i'], weights, th_sel['i'])
|
|
|
|
|
|
def alpha_average(data: np.ndarray) -> np.ndarray:
    """
    average I(alpha, theta, phi) over alpha.

    @param data structured numpy.ndarray in ETPAI or ETPAIS format with a non-singular alpha dimension.
        the input array is not modified.

    @return resulting ETPI or ETPIS data array.
        if the scan mode is not supported, a copy of the input array is returned unchanged
        and an error is logged.
    """
    scan_mode, scan_positions = detect_scan_mode(data)
    result = data.copy()

    if len(scan_mode) == 2 and scan_mode[1] == 'a':
        axis0 = scan_positions[scan_mode[0]]
        n0 = len(axis0)
        axis1 = scan_positions[scan_mode[1]]
        n1 = len(axis1)
        # reshape so that axis 1 runs over alpha for each fixed outer coordinate.
        # np.reshape returns a view of the input whenever possible.
        nd_data = np.reshape(data, (n0, n1), order='C')

        # BUGFIX: take an explicit copy of the first alpha slice.
        # nd_data[:, 0] is a view into the (reshaped) input array, so assigning
        # the averages into it would silently overwrite the caller's data.
        nd_result = nd_data[:, 0].copy()
        names = list(nd_data.dtype.names)
        names.remove('a')
        for name in names:
            # each field's mean over alpha is computed before that field is overwritten
            nd_result[name] = np.mean(nd_data[name], axis=1, dtype=np.float64)
        # drop the alpha column from the result
        result = nd_result[names]
    else:
        logger.error('unsupported scan in alpha_average: {0}'.format(scan_mode))

    return result
|
|
|
|
|
|
def phi_average(data: np.ndarray) -> np.ndarray:
    """
    average I(theta, phi) over phi.

    @param data TPI-like structured numpy.ndarray containing a hemispherical scan.

    @return resulting TI or TIS data array.
        if the scan mode is not supported, a copy of the input array is returned unchanged
        and an error is logged.
    """
    scan_mode, scan_positions = detect_scan_mode(data)
    result = data.copy()

    if scan_mode == ['t', 'p']:
        theta_values = np.unique(scan_positions['t'])

        # the output keeps all fields of the input except phi
        out_names = [name for name in data.dtype.names if name != 'p']
        out_dtype = [(name, data.dtype[name].str) for name in out_names]
        result = create_data((len(theta_values),), dtype=out_dtype)

        for idx, theta in enumerate(theta_values):
            # select all points of the current theta ring (0.01 degree tolerance)
            ring = np.abs(scan_positions['t'] - theta) < 0.01
            for name in out_names:
                result[name][idx] = np.mean(data[name][ring], dtype=np.float64)
    else:
        logger.error('unsupported scan in phi_average: {0}'.format(scan_mode))

    return result
|
|
|
|
|
|
def alpha_mirror_average(data: np.ndarray) -> np.ndarray:
    """
    calculate the average of I(alpha, theta, phi) and I(-alpha, theta, phi).

    @param data structured numpy.ndarray in ETPAI or ETPAIS format.
        for each (alpha, theta, phi) the array must contain a corresponding (-alpha, theta, phi)
        within a tolerance of 0.5 degrees in alpha. otherwise, a warning is issued.

    @return resulting data array, same shape as input.
        the array is sorted.
    """
    result1 = data.copy()
    sort_data(result1)

    result2 = data.copy()
    try:
        result2['a'] = -result2['a']
        sort_data(result2)
    except (KeyError, ValueError):
        # BUGFIX: modern numpy raises KeyError for a missing structured-array field,
        # older versions raised ValueError - catch both so data without an alpha
        # column is handled gracefully rather than crashing.
        pass

    if np.allclose(result1['a'], result2['a'], atol=0.5):
        result1['i'] = (result1['i'] + result2['i']) / 2.0
        try:
            # propagate the standard deviation of the averaged intensities
            result1['s'] = np.sqrt(result1['s'] ** 2 + result2['s'] ** 2) / 2.0
        except (KeyError, ValueError):
            # no sigma column present - nothing to propagate
            pass
    else:
        logger.warning('asymmetric alpha scan. skipping alpha mirror average.')

    return result1
|
|
|
|
|
|
# select the default modulation function at import time.
# the LOESS-based estimator is used if the optional loess package imported successfully
# (see the guarded import at the top of this module).
if loess is not None:
    default_modfunc = calc_modfunc_loess
    logger.info("pmsco.data.default_modfunc = pmsco.data.calc_modfunc_loess")
else:
    # fall back to calc_modfunc_mean - presumably a simpler mean-based estimator
    # defined earlier in this module (TODO confirm its smoothing characteristics).
    default_modfunc = calc_modfunc_mean
    logger.warning("pmsco.data.default_modfunc = pmsco.data.calc_modfunc_mean")

# default R-factor function used when a project does not specify one.
default_rfactor = square_diff_rfactor
logger.info("pmsco.data.default_rfactor = pmsco.data.square_diff_rfactor")