pyTrainSeg/filter_functions.py

# -*- coding: utf-8 -*-
"""
Created on Mon Aug 15 16:35:22 2022

@author: fische_r


class to create feature stack on 4D tomo data
transient dimension, e.g. time, should be 4th dimension of input data


TODO: add GPU support (CUDA, cupy, cucim)
TODO: store git commit sha

"""

# necessary packages for feature stack
import numpy as np
from scipy import ndimage
from skimage import filters
from skimage.morphology import ball

import dask
import dask.array
# from dask.distributed import Client, LocalCluster

from itertools import combinations_with_replacement, combinations
import xarray as xr

# functions take chunked dask-array as input

# start-up cluster, TODO: option to connect to exisitng cluster
# TODO: class/function to boot up cluster with custom options, e.g. workers/threads
# esp. use SSD for memory spilling
# cluster = LocalCluster()
# client = Client(cluster)
# print('Dashboard at '+cluster.dashboard_link)

default_feature_dict = {'Gaussian': True,
               # 'Sobel': True,
               'Hessian': True,
               'Diff of Gaussians': True,
               'maximum': True,
               'minimum': True,
               'median': True,
               'extra_time_ranks': True,
              }

def ball_4d(sig):
    bnd = np.zeros((sig*2+1,sig*2+1,sig*2+1,sig*2+1), dtype = bool)
    bnd[sig,sig,sig,sig] = True
    ecd = ndimage.distance_transform_edt(~bnd)
    bnd = (ecd<sig+0.01).astype(int)
    return bnd

class image_filter:
    def __init__(self,
                 data_path = None,
                 outpath = None,
                 sigmas = [0,2, 4],
                 feature_dict = default_feature_dict,
                 mod_feat_dict = None,
                 chunksize = '20 MiB', #try to align chunks to extend far in time --> should be useful for most filters, esp. the dynamic rank filters
                 outchunks = '300 MiB',
                 ranks = ['maximum', 'minimum', 'median'], #, 'mean'
                 sigma_t = 40
                 ):
        if mod_feat_dict is not None:
            for key in mod_feat_dict:
                feature_dict[key] = mod_feat_dict[key]

        self.data_path = data_path
        self.outpath = outpath
        self.sigmas = sigmas
        self.feature_dict = feature_dict
        # TODO: allow option of custom shaped chunks
        self.chunks = chunksize
        self.outchunks = outchunks

        #wheter considering means for first and last time step
        self.take_means = True
        self.num_means = 7

        # not sure if this is clever, does dask understand that this data is reused?
        self.Gaussian_4D_dict = {}
        self.Gaussian_space_dict = {}
        self.Gaussian_time_dict = {}
        self.Gradient_dict = {}
        self.calculated_features = []
        self.feature_names = []
        self.considered_ranks = ranks
        self.sigma_t = sigma_t

        self.prepared = False
        self.computed = False


    # TODO: currently loads full dataset into memory, consider aligning desired chunk already for original dataset to avoid rechunking
    # .rechunk() causes problems downstream: "Assertion error" , WTF?!
    # if original data soe not fit in RAM, rechunk, store to disk and load again?
    def open_raw_data(self):
        data = xr.open_dataset(self.data_path)
        da = dask.array.from_array(data.tomo.data, chunks = self.chunks)

        self.original_dataset = data
        self.data = da

    def open_lazy_data(self, chunks=None):
        if chunks is None:
            chunks = self.chunks
        data = xr.open_dataset(self.data_path, chunks = chunks)
        da = dask.array.from_array(data.tomo)
        # print('maybe re-introducing rechunking, but for large datasets auto might be ok')
        # print('smaller chunks might be better for slicewise training')
        # print('currently provided chunks are ignored')
        self.original_dataset = data#.rechunk(self.chunks)
        self.data = da

    def load_raw_data(self):
        data = xr.load_dataset(self.data_path)
        # da = dask.array.from_array(data.tomo).rechunk(chunks = self.chunks)

        da = dask.array.from_array(data.tomo.data, chunks = self.chunks)

        self.original_dataset = data
        self.data = da

    def Gaussian_Blur_4D(self, sigma):
        # TODO: check on boundary mode
        deptharray = np.ones(self.data.ndim)+4*sigma
        deptharray = tuple(np.min([deptharray, self.data.shape], axis=0))
        G = self.data.map_overlap(filters.gaussian, depth=deptharray, boundary='nearest', sigma = sigma)
        self.feature_names.append('Gaussian_4D_Blur_'+f'{sigma:.1f}')
        self.calculated_features.append(G)
        self.Gaussian_4D_dict[f'{sigma:.1f}'] = G

    def Gaussian_Blur_space(self, sigma):
        deptharray = np.ones(self.data.ndim)+4*sigma
        deptharray[-1] = 0
        sigmas = np.ones(deptharray.shape)*sigma
        deptharray = tuple(np.min([deptharray, self.data.shape], axis=0))

        sigmas[-1] = 0
        G = self.data.map_overlap(filters.gaussian, depth=deptharray, boundary='nearest', sigma = sigmas)
        self.feature_names.append('Gaussian_space_'+f'{sigma:.1f}')
        self.calculated_features.append(G)
        self.Gaussian_space_dict[f'{sigma:.1f}'] = G

    def Gaussian_Blur_time(self, sigma):
        deptharray = np.ones(self.data.ndim)+4*sigma
        deptharray[:-1] = 0
        sigmas = np.ones(deptharray.shape)*sigma
        deptharray = tuple(np.min([deptharray, self.data.shape], axis=0))
        sigmas[:-1] = 0
        G = self.data.map_overlap(filters.gaussian, depth=deptharray, boundary='nearest', sigma = sigmas)
        self.feature_names.append('Gaussian_time_'+f'{sigma:.1f}')
        self.calculated_features.append(G)
        self.Gaussian_time_dict[f'{sigma:.1f}'] = G

    def Gaussian_4D_stack(self):
        flag = True
        for sigma in self.sigmas:
            if np.abs(sigma-0)<0.1:
                if flag:
                    flag = False
                    # self.Gaussian_4D_dict['original'] = self.data
                    # self.calculated_features.append(self.data)
                    # self.feature_names.append('original')
                    sig = 0
                    self.Gaussian_Blur_4D(sig)

            else:
                self.Gaussian_Blur_4D(sigma)

    def Gaussian_space_stack(self):
        flag = True
        for sigma in self.sigmas:
            if np.abs(sigma-0)<0.1:
                if flag:
                    flag = False
                    # self.Gaussian_space_dict['original'] = self.data
                    sig = 0
                    self.Gaussian_Blur_space(sig)
            else:
                self.Gaussian_Blur_space(sigma)

    def Gaussian_time_stack(self):
        flag = True
        for sigma in self.sigmas:
            if np.abs(sigma-0)<0.1:
                if flag:
                    flag = False
                    # self.Gaussian_time_dict['original'] = self.data
                    sig = 0
                    self.Gaussian_Blur_time(sig)
            else:
                self.Gaussian_Blur_time(sigma)


    def diff_Gaussian(self, mode):
        if mode == '4D':
            lookup_dict = self.Gaussian_4D_dict
        elif mode == 'space':
            lookup_dict = self.Gaussian_space_dict
        elif mode == 'time':
            lookup_dict = self.Gaussian_time_dict
        for comb in combinations(lookup_dict.keys(),2):
            G1 = lookup_dict[comb[1]]
            G0 = lookup_dict[comb[0]]
            # DG = lookup_dict[comb[1]] - lookup_dict[comb[0]]
            # DG = G1-G0
            DG = dask.array.subtract(G1,G0)
            name = ''.join(['diff_of_gauss_',mode,'_',comb[1],'_',comb[0]])
            self.calculated_features.append(DG)
            self.feature_names.append(name)

    def diff_to_first_and_last(self, take_mean, means):
#         TODO: take temporal mean/median for first and last
        DA = self.data
        if take_mean:
            first = DA[...,:means].mean(axis=-1)
            last = DA[...,-means:].mean(axis=-1)
        else:
            first = DA[...,0]
            last = DA[...,-1]
        if type(first) is not np.ndarray:
            first = first.compute()
            last = last.compute()
        # ones = dask.array.ones(DA.shape, chunks=self.chunks)
        if type(first) is np.ndarray:
            firsts = dask.array.stack([first]*DA.shape[-1], axis=-1)
            lasts = dask.array.stack([last]*DA.shape[-1], axis=-1)
            firsts = firsts.rechunk(DA.chunksize)
            lasts = lasts.rechunk(DA.chunksize)
            DF = DA - firsts
            DL = DA - lasts
            self.calculated_features.append(DF)
            self.feature_names.append('diff_to_first_')
            self.calculated_features.append(DL)
            self.feature_names.append('diff_to_last_')
            self.feature_names.append('first_')
            self.calculated_features.append(firsts)
            self.feature_names.append('last_')
            self.calculated_features.append(lasts)
        else:
            print('Diff first and last is an unexplainable pain in the ass, solve this at one point')

    def time_mean(self):
        DA = self.data
        mean = DA.mean(axis=-1)
        means = dask.array.stack([mean]*DA.shape[-1], axis=-1)
        self.calculated_features.append(means)
        self.feature_names.append('full_temporal_mean_')


    def Gradients(self):
        for key in self.Gaussian_4D_dict:
            G = self.Gaussian_4D_dict[key]
            gradients = dask.array.gradient(G)
            self.Gradient_dict[key] = gradients

    def Hessian(self):
        # TODO: add max of all dimensions
        for key in self.Gradient_dict.keys():
            axes = range(self.data.ndim)
            gradients = self.Gradient_dict[key]
            H_elems = [dask.array.gradient(gradients[ax0], axis=ax1) for ax0, ax1 in combinations_with_replacement(axes, 2)]

            gradnames = ['Gradient_sigma_'+key+'_'+str(ax0) for ax0 in axes]
            elems = [(ax0,ax1) for ax0, ax1 in combinations_with_replacement(axes, 2)]
            hessnames = [''.join(['hessian_sigma_',key,'_',str(elm[0]),str(elm[1])]) for elm in elems ]

            self.feature_names = self.feature_names + gradnames + hessnames
            self.calculated_features = self.calculated_features+gradients+H_elems

    # def rank_filter(self, option, sigma):
    #     # note: rank filters not yet available in CUCIM and don't work with dask --> figure out why
    #     da = self.data
    #     if not np.abs(sigma-0)<1:
    #         if option == 'minimum':
    #             fun = filters.rank.minimum
    #         elif option == 'maximum':
    #             fun = filters.rank.maximum
    #         elif option == 'median':
    #             fun = filters.rank.median
    #         elif option == 'mean':
    #             fun = filters.rank.mean

    #         fp = ball_4d(sigma)
    #         deptharray = np.ones(da.ndim)+sigma
    #         deptharray = tuple(np.min([deptharray, da.shape], axis=0))

    #         R = da.map_overlap(fun, depth=deptharray, footprint=fp)
    #         name = ''.join([option,'_',f'{sigma:.1f}'])
    #         self.calculated_features.append(R)
    #         self.feature_names.append(name)

    # def dynamic_rank_filter(self, option, sigma):
    #     # TODO: add custom dynamic model, eg. sigmoid
    #     da = self.data
    #     if option == 'minimum':
    #         fun = filters.rank.minimum
    #     elif option == 'maximum':
    #         fun = filters.rank.maximum
    #     elif option == 'median':
    #         fun = filters.rank.median
    #     elif option == 'mean':
    #         fun = filters.rank.mean

    #     fp_3D = ball(sigma)
    #     fp_4D = np.zeros(list(fp_3D.shape)+[2*self.sigma_t], dtype=int)
    #     fp_4D[fp_3D>0,:] = 1
    #     deptharray = np.ones(da.ndim)+sigma
    #     deptharray[-1] = self.sigma_t
    #     deptharray = tuple(np.min([deptharray, da.shape], axis=0))

    #     R = da.map_overlap(fun, depth=deptharray, footprint=fp_4D)
    #     name = ''.join([option,'_dynamic_',f'{sigma:.1f}'])
    #     self.calculated_features.append(R)
    #     self.feature_names.append(name)

    def rank_like_filter(self, option, sigma):
        # note: rank filters not yet available in CUCIM
        da = self.data
        if not np.abs(sigma-0)<1:
            if option == 'minimum':
                fun = ndimage.minimum_filter
            elif option == 'maximum':
                fun = ndimage.maximum_filter
            elif option == 'median':
                fun = ndimage.median_filter
            # elif option == 'mean':
            #     fun = filters.rank.mean

            fp = ball_4d(sigma)
            deptharray = np.ones(da.ndim)+sigma
            deptharray = tuple(np.min([deptharray, da.shape], axis=0))

            R = da.map_overlap(fun, depth=deptharray, footprint=fp)
            name = ''.join([option,'_',f'{sigma:.1f}'])
            self.calculated_features.append(R)
            self.feature_names.append(name)

    def dynamic_rank_like_filter(self, option, sigma):
        # TODO: add custom dynamic model, eg. sigmoid
        da = self.data
        if option == 'minimum':
            fun = ndimage.minimum_filter
        elif option == 'maximum':
            fun = ndimage.maximum_filter
        elif option == 'median':
            fun = ndimage.median_filter
        # elif option == 'mean':
        #     fun = filters.rank.mean

        fp_3D = ball(sigma)
        fp_4D = np.zeros(list(fp_3D.shape)+[2*self.sigma_t], dtype=int)
        fp_4D[fp_3D>0,:] = 1
        deptharray = np.ones(da.ndim)+sigma
        deptharray[-1] = self.sigma_t
        deptharray = tuple(np.min([deptharray, da.shape], axis=0))

        R = da.map_overlap(fun, depth=deptharray, footprint=fp_4D)
        name = ''.join([option,'_dynamic_',f'{sigma:.1f}'])
        self.calculated_features.append(R)
        self.feature_names.append(name)


    def rank_filter_stack(self):
        for option in self.considered_ranks:
            for sigma in self.sigmas:
                self.rank_like_filter(option, sigma)
                self.dynamic_rank_like_filter(option, sigma)

    def pixel_coordinates(self):
        #create 3 arrays with the pixel coordinates
        da = self.data
#         coords = dask.array.where(da)

#         for i in range(3):
#             loc = coords[i].compute().reshape(da.shape) #compute() to know shape of coords, maybe find another way
#             self.calculated_features.append(loc)
#             self.feature_names.append('loc_'+'xyz'[i])

        # the following looks less elegant, but seems more compatible with dask
        # TODO: check performance
        loc_x = dask.array.ones(da.shape)*dask.array.arange(da.shape[0])[:,None, None, None]
        self.calculated_features.append(loc_x)
        self.feature_names.append('loc_'+'x')

        loc_y = dask.array.ones(da.shape)*dask.array.arange(da.shape[1])[None,:, None, None]
        self.calculated_features.append(loc_y)
        self.feature_names.append('loc_'+'y')

        loc_z = dask.array.ones(da.shape)*dask.array.arange(da.shape[2])[None, None,:, None]
        self.calculated_features.append(loc_z)
        self.feature_names.append('loc_'+'z')


    # TODO: include feature selection either in compute (better) or save
    # TODO: maybe add purge function
    # TODO: maybe add iterative segmentation results, i.e. median filter of segmentation
    def prepare(self):
        self.Gaussian_4D_dict = {}
        self.Gaussian_space_dict = {}
        self.Gaussian_time_dict = {}
        self.Gradient_dict = {}
        self.calculated_features = []
        self.feature_names = []

        self.diff_to_first_and_last(self.take_means, self.num_means)
        self.Gaussian_4D_stack()
        self.diff_Gaussian('4D')
        self.Gradients()
        self.Hessian()
        self.Gaussian_time_stack()
        self.diff_Gaussian('time')
        self.Gaussian_space_stack()
        self.diff_Gaussian('space')
        self.pixel_coordinates()
        # self.time_mean()
        # self.rank_filter_stack() #you have to load the entire raw data set for this filter --> not so good for many time steps

        self.prepared = True


    def stack_features(self):
        if not self.prepared:
            print('prepare first')
        else:
            self.feature_stack = dask.array.stack(self.calculated_features, axis = 4)

    def compute(self):
        # self.feature_stack = self.feature_stack.compute()
        self.feature_stack = self.feature_stack.persist() #not sure, but persist should be preferred
        self.computed = True

    def make_xarray_nc(self, outpath = None, store=False):
        if outpath is None:
            outpath = self.outpath
        shp = self.feature_stack.shape
        coords = {'x': np.arange(shp[0]), 'y': np.arange(shp[1]), 'z': np.arange(shp[2]), 'time': np.arange(shp[3]), 'feature': self.feature_names}
        if store:
            if self.computed:
                if not type(self.feature_stack) is np.ndarray:
                    self.feature_stack.rechunk(self.outchunks)

                #TODO avoid this explcit conversion. however seems necessary ?...
                # if type(self.feature_stack) is not np.ndarray:
                    # self.feature_stack = self.feature_stack.compute()

                self.result = xr.Dataset({'feature_stack': (['x','y','z','time', 'feature'], self.feature_stack)},
                         coords = coords
                         )
                self.result.to_netcdf(outpath)
            else:
                print('maybe you have to compute the stack first ... ?!')

        else:
            self.result = xr.Dataset({'feature_stack': (['x','y','z','time', 'feature'], self.feature_stack)},
                                     coords = coords
                                                 )