# DeepLearning/src/datasets.py
# (pasted file-browser header: 234 lines, 12 KiB, Python)
from torch.utils.data import Dataset
import torch
import numpy as np
class singlePhotonDataset(Dataset):
    """Single-photon cluster dataset loaded from .npz and/or .h5 files.

    Each item is a (sample, label) pair:
      * sample: float tensor of shape (3, H, W) -- the charge map plus the
        constant x/y coordinate grids concatenated as extra channels.
      * label:  float tensor (x, y, z, c4) with x/y shifted so they are
        centered on the window, z divided by 650 (depth normalization) and a
        4th component passed through untouched (presumably energy -- TODO
        confirm against the label producer).

    Augmentation: `numberOfAugOps` (1..8) selects how many of the 8
    flip/transpose operations are cycled through; the effective length is
    nSamples * numberOfAugOps and __getitem__ decodes (sample, op) from the
    flat index.
    """

    ### (flipAxes, swapXY, labelTransform) per augmentation operation.
    ### sample spatial axes (before the channel dim is added): 0 - y, 1 - x
    ### label layout: (x, y, z, c4); a spatial flip negates x and/or y ONLY.
    _TRANSFORMS = {
        0: (None, False, lambda l: l),
        1: ([1], False, lambda l: np.array([-l[0], l[1], l[2], l[3]])),
        2: ([0], False, lambda l: np.array([l[0], -l[1], l[2], l[3]])),
        # bugfix: was `-l`, which wrongly negated z and the 4th component too
        3: ([0, 1], False, lambda l: np.array([-l[0], -l[1], l[2], l[3]])),
        4: (None, True, lambda l: np.array([l[1], l[0], l[2], l[3]])),
        5: ([1], True, lambda l: np.array([-l[1], l[0], l[2], l[3]])),
        6: ([0], True, lambda l: np.array([l[1], -l[0], l[2], l[3]])),
        # bugfix: was `-np.array([l[1], l[0], l[2], l[3]])` (negated z / c4)
        7: ([0, 1], True, lambda l: np.array([-l[1], -l[0], l[2], l[3]])),
    }

    def __init__(self, sampleList, sampleRatio, datasetName, noiseKeV=0, numberOfAugOps=1, normalize=False, noiseThreshold=0):
        """Load, optionally noise/normalize, and preprocess all sample files.

        sampleList      -- paths to .npz/.h5 files (others are silently skipped)
        sampleRatio     -- fraction of the loaded samples to expose
        datasetName     -- tag used in log messages
        noiseKeV        -- sigma of Gaussian noise added to every pixel (0 = off)
        numberOfAugOps  -- number of augmentation ops (1..8) cycled per sample
        normalize       -- divide each sample by its total charge (then * 15)
        noiseThreshold  -- zero pixels below this value (only when noiseKeV != 0)
        """
        self.sampleFileList = sampleList
        self.sampleRatio = sampleRatio
        self.datasetName = datasetName
        self.numberOfAugOps = numberOfAugOps
        self.normalize = normalize
        self.noiseThreshold = noiseThreshold
        self._init_coords()
        all_samples = []
        all_labels = []
        all_ref_pts = []
        for sampleFile in self.sampleFileList:
            loaded = self._load_file(sampleFile)
            if loaded is None:
                continue  # unrecognized extension -- skipped, as before
            samples, labels, ref_pts = loaded
            all_samples.append(samples)
            all_labels.append(labels)
            all_ref_pts.append(ref_pts)
        self.samples = np.concatenate(all_samples, axis=0)
        if noiseKeV != 0:
            print(f'Adding Gaussian noise with sigma = {noiseKeV} keV to samples in {self.datasetName} dataset')
            noise = np.random.normal(loc=0.0, scale=noiseKeV, size=self.samples.shape)
            self.samples = self.samples + noise
        if self.noiseThreshold != 0 and noiseKeV != 0:
            print(f'[{self.datasetName} dataset] \t Setting values below noise threshold ({self.noiseThreshold} keV) to zero')
            self.samples[self.samples < self.noiseThreshold] = 0  ### set values below threshold to zero
        self.labels = np.concatenate(all_labels, axis=0)
        self.referencePoint = np.concatenate(all_ref_pts, axis=0) if all_ref_pts else None
        if self.normalize:
            print(f'Normalizing samples in {self.datasetName} dataset by total charge')
            total_charge = np.sum(self.samples, axis=(1, 2), keepdims=True)  # (B, 1, 1)
            total_charge[total_charge == 0] = 1  # avoid division by zero
            self.samples = self.samples / total_charge * 15.  # normalize each sample by its total charge
        if self.samples.shape[1] == 5:  ### if sample size is 5x5, remove border pixels to make it 3x3
            self.samples = self.samples[:, 1:-1, 1:-1]  ### remove border pixels
            self.labels = self.labels - np.array([1, 1, 0, 0])  ### adjust labels accordingly
            if self.referencePoint is not None:
                self.referencePoint = self.referencePoint + np.array([1, 1])  ### adjust reference points accordingly
        self.samples = np.expand_dims(self.samples, axis=1)  # add channel dim: (B, 1, H, W)
        self.labels -= np.array([self.samples.shape[-1] / 2., self.samples.shape[-1] / 2., 0, 0])  ### center x/y at (0, 0)
        self.labels[:, 2] /= 650.  ### normalize z coordinate (depth) to [0, 1]
        ### total number of samples
        self.nSamples = int(self.samples.shape[0] * self.sampleRatio)
        self.effectiveLength = self.nSamples * self.numberOfAugOps
        print(f"[{self.datasetName} dataset] \t Total number of samples: {self.nSamples} \t Effective length (with augmentation): {self.effectiveLength}")

    def _load_file(self, sampleFile):
        """Read one sample file.

        Returns (samples, labels, referencePoints); missing 'labels' /
        'referencePoint' entries are replaced with zero-filled dummies.
        Returns None for an unrecognized file extension.
        """
        if '.npz' in sampleFile:
            data = np.load(sampleFile)
            samples = data['samples']
            if 'referencePoint' in data:
                ref_pts = data['referencePoint']
            else:
                ref_pts = np.zeros((samples.shape[0], 2), dtype=np.float32)  ### dummy reference points
            if 'labels' in data:
                labels = data['labels']
            else:
                labels = np.zeros((samples.shape[0], 4), dtype=np.float32)  ### dummy labels
            return samples, labels, ref_pts
        if '.h5' in sampleFile:
            import h5py  # local import: h5py only needed when .h5 files are used
            with h5py.File(sampleFile, 'r') as f:
                samples = f['clusters'][:]
                if 'referencePoint' in f:
                    ref_pts = f['referencePoint'][:]
                else:
                    ref_pts = np.zeros((samples.shape[0], 2), dtype=np.float32)  ### dummy reference points
                if 'labels' in f:
                    labels = f['labels'][:]
                else:
                    labels = np.zeros((samples.shape[0], 4), dtype=np.float32)  ### dummy labels
            return samples, labels, ref_pts
        return None

    def _init_coords(self):
        """Precompute the constant x/y coordinate channels for a 3x3 input."""
        x = np.linspace(-0.5, 0.5, 3)
        y = np.linspace(-0.5, 0.5, 3)
        x_grid, y_grid = np.meshgrid(x, y, indexing='ij')  # (3, 3), (3, 3)
        self.x_grid = torch.tensor(np.expand_dims(x_grid, axis=0)).float().contiguous()  # (1, 3, 3)
        self.y_grid = torch.tensor(np.expand_dims(y_grid, axis=0)).float().contiguous()  # (1, 3, 3)

    def __getitem__(self, index):
        """Return the (sample, label) pair for flat index = sample * nAugOps + op."""
        sampleIdx, operationIdx = divmod(index, self.numberOfAugOps)
        sample = self.samples[sampleIdx]
        label = self.labels[sampleIdx]
        flipAxes, doSwap, labelTransform = self._TRANSFORMS[operationIdx]
        if doSwap:
            sample = np.swapaxes(sample, -1, -2)
        if flipAxes is not None:
            # +1 skips the channel dimension added in __init__
            sample = np.flip(sample, axis=[ax + 1 for ax in flipAxes])
        label = labelTransform(label)
        sample = torch.from_numpy(np.ascontiguousarray(sample)).float()
        sample = torch.cat((sample, self.x_grid, self.y_grid), dim=0)  ### concatenate coordinate channels
        label = torch.from_numpy(label).float()
        return sample, label

    def __len__(self):
        return self.effectiveLength
class doublePhotonDataset(Dataset):
    """Synthesizes double-photon events on the fly.

    Every __getitem__ call overlays two randomly chosen single-photon
    clusters at random offsets inside an (nSize+2) x (nSize+2) canvas, then
    crops the central nSize x nSize window.  The returned label is the two
    4-component labels concatenated (8 values), with x/y shifted to the
    window-centered frame.  Note: the dataset index is intentionally ignored;
    each access draws a fresh random pair.
    """

    def __init__(self, sampleList, sampleRatio, datasetName, reuselFactor=1, noiseKeV=0, nSize=6):
        """Load the single-photon pool used for on-the-fly pair synthesis.

        sampleList   -- paths to .npz/.h5 files (others are silently skipped)
        sampleRatio  -- fraction of the pool counted toward the length
        datasetName  -- tag used in log messages
        reuselFactor -- multiplies the nominal length (photons get reused)
        noiseKeV     -- sigma of Gaussian noise added to occupied pixels only
        nSize        -- side length of the output window
        """
        self.sampleFileList = sampleList
        self.sampleRatio = sampleRatio
        self.datasetName = datasetName
        self.noiseKeV = noiseKeV
        self.nSize = nSize
        self._init_coords()
        sample_chunks = []
        label_chunks = []
        for path in self.sampleFileList:
            if '.npz' in path:
                archive = np.load(path)
                sample_chunks.append(archive['samples'])
                label_chunks.append(archive['labels'])
            elif '.h5' in path:
                import h5py  # local import: h5py only needed when .h5 files are used
                with h5py.File(path, 'r') as handle:
                    sample_chunks.append(handle['clusters'][:])
                    label_chunks.append(handle['labels'][:])
        self.samples = np.concatenate(sample_chunks, axis=0)
        if self.noiseKeV != 0:
            print(f'Adding Gaussian noise with sigma = {self.noiseKeV} keV to samples in {self.datasetName} dataset')
            noise = np.random.normal(loc=0.0, scale=self.noiseKeV, size=self.samples.shape)
            # perturb only occupied pixels; empty pixels stay exactly zero
            noise[self.samples == 0] = 0
            self.samples = self.samples + noise
        self.labels = np.concatenate(label_chunks, axis=0)
        ### total number of samples (each event consumes two photons, hence // 2)
        self.length = int(self.samples.shape[0] * self.sampleRatio) // 2 * reuselFactor
        print(f"[{self.datasetName} dataset] \t Total number of samples: {self.length}")

    def _init_coords(self):
        """Precompute pixel-center x/y coordinate channels for the nSize window."""
        centers = np.linspace(-self.nSize / 2. + 0.5, self.nSize / 2. - 0.5, self.nSize)
        gx, gy = np.meshgrid(centers, centers, indexing='ij')  # (nSize, nSize) each
        self.x_grid = torch.tensor(np.expand_dims(gx, axis=0)).float().contiguous()  # (1, nSize, nSize)
        self.y_grid = torch.tensor(np.expand_dims(gy, axis=0)).float().contiguous()  # (1, nSize, nSize)

    def __getitem__(self, index):
        """Build one random double-photon event (the index itself is unused)."""
        canvas = np.zeros((self.nSize + 2, self.nSize + 2), dtype=np.float32)
        first = np.random.randint(0, self.samples.shape[0])
        second = np.random.randint(0, self.samples.shape[0])
        cluster_a = self.samples[first]
        cluster_b = self.samples[second]
        k = cluster_a.shape[0]  # side length of a single-photon cluster
        ### random placement of each photon inside the padded canvas
        x_a = np.random.randint(1, 4)
        y_a = np.random.randint(1, 4)
        canvas[y_a:y_a + k, x_a:x_a + k] += cluster_a
        x_b = np.random.randint(1, 4)
        y_b = np.random.randint(1, 4)
        canvas[y_b:y_b + k, x_b:x_b + k] += cluster_b
        cropped = canvas[1:-1, 1:-1]  ### central nSize x nSize window
        tensor = torch.tensor(cropped, dtype=torch.float32).unsqueeze(0)
        tensor = torch.cat((tensor, self.x_grid, self.y_grid), dim=0)  ### concatenate coordinate channels
        half = self.nSize / 2.
        shift_a = np.array([x_a - 1 - half, y_a - 1 - half, 0, 0])
        shift_b = np.array([x_b - 1 - half, y_b - 1 - half, 0, 0])
        combined = np.concatenate((self.labels[first] + shift_a, self.labels[second] + shift_b), axis=0)
        return tensor, torch.tensor(combined, dtype=torch.float32)

    def __len__(self):
        return self.length
class doublePhotonInferenceDataset(Dataset):
    """Inference-time dataset of pre-built double-photon clusters.

    Items are (sample, dummy_label): sample is a (3, nSize, nSize) tensor
    (charge map plus x/y coordinate grids); labels are all-zero placeholders
    because no ground truth is available at inference.  The loaded reference
    points are kept on the instance for downstream coordinate reconstruction.
    """

    def __init__(self, sampleList, sampleRatio, datasetName, nSize=6):
        """Load clusters and reference points from .npz/.h5 files.

        sampleList  -- paths to .npz/.h5 files (others are silently skipped)
        sampleRatio -- fraction of the loaded samples to expose
        datasetName -- tag used in log messages
        nSize       -- side length of the (square) cluster window
        """
        self.sampleFileList = sampleList
        self.sampleRatio = sampleRatio
        self.datasetName = datasetName
        self.nSize = nSize
        self._init_coords()
        all_samples = []
        all_ref_pts = []
        for sampleFile in self.sampleFileList:
            if '.npz' in sampleFile:
                data = np.load(sampleFile)
                all_samples.append(data['samples'])
                all_ref_pts.append(data['referencePoint'])
            elif '.h5' in sampleFile:
                import h5py  # local import: h5py only needed when .h5 files are used
                with h5py.File(sampleFile, 'r') as f:
                    all_samples.append(f['clusters'][:])
                    all_ref_pts.append(f['referencePoint'][:])
        self.samples = np.concatenate(all_samples, axis=0) if all_samples else None
        self.referencePoint = np.concatenate(all_ref_pts, axis=0) if all_ref_pts else None
        ### total number of samples
        # bugfix: previously dereferenced self.samples.shape unconditionally,
        # crashing with AttributeError when no file matched / the list was empty
        if self.samples is None:
            self.length = 0
        else:
            self.length = int(self.samples.shape[0] * self.sampleRatio)
            self.referencePoint = self.referencePoint[:self.length]
        print(f"[{self.datasetName} dataset] \t Total number of samples: {self.length}")

    def _init_coords(self):
        """Precompute pixel-center x/y coordinate channels for the nSize window."""
        x = np.linspace(-self.nSize/2. + 0.5, self.nSize/2. - 0.5, self.nSize)
        y = np.linspace(-self.nSize/2. + 0.5, self.nSize/2. - 0.5, self.nSize)
        x_grid, y_grid = np.meshgrid(x, y, indexing='ij')  # (nSize, nSize) each
        self.x_grid = torch.tensor(np.expand_dims(x_grid, axis=0)).float().contiguous()  # (1, nSize, nSize)
        self.y_grid = torch.tensor(np.expand_dims(y_grid, axis=0)).float().contiguous()  # (1, nSize, nSize)

    def __getitem__(self, index):
        """Return (sample, dummy_label) for the cluster at `index`."""
        sample = self.samples[index]
        sample = torch.tensor(sample, dtype=torch.float32).unsqueeze(0)
        sample = torch.cat((sample, self.x_grid, self.y_grid), dim=0)  ### concatenate coordinate channels
        dummy_label = np.zeros((8,), dtype=np.float32)  ### dummy label (4 values per photon)
        return sample, torch.tensor(dummy_label, dtype=torch.float32)

    def __len__(self):
        return self.length