cristallina_analysis_package/src/cristallina/uscan.py

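"""Iterate over SwissFEL scans while they are still being recorded.

`UnfinishedScanInfo` mimics a finished `sfdata.SFScanInfo` for scans that are
still ongoing: it parses the (possibly incomplete) scan.json with a tolerant
partial-JSON parser and yields each step's `SFDataFiles` as soon as the files
appear on disk.
"""
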
import json
import time
from pathlib import Path

from sfdata.utils import print_skip_warning, json_load, adjust_shape
from sfdata.ign import remove_ignored_filetypes_scan
from sfdata import SFDataFiles, SFScanInfo
from partialjson.json_parser import JSONParser
from loguru import logger


MAX_STEP_WAIT = 300  # Maximum wait time in seconds for files to appear


class UnfinishedScanInfo(SFScanInfo):
    """
    Represents an unfinished scan from SwissFEL data, mimicking a finished SFScanInfo.
    It allows iterating over the already available scan steps and waits for new data
    if the scan is still ongoing. If the scan is already finished, it behaves like a
    regular SFScanInfo. See the usage sketch at the bottom of this file.

    Args:
        fname (str): Path of the JSON file to be parsed for scan information (e.g. scan.json).
        refresh_interval (int): Time in seconds to wait before checking for new data again.
            Default is 10 seconds.
    """

    def __init__(self, fname, refresh_interval=10):
        self.fname = fname
        self.finished = False
        self.refresh_interval = refresh_interval

        if is_finished_scan(fname):
            # simple case: the scan is finished, and our parent class can handle it
            super().__init__(fname)
            self.finished = True
        else:
            # otherwise, try to parse it as a partial JSON file
            self._parse_partial_json(fname)

    def _parse_partial_json(self, fname):
        """Parse a possibly incomplete scan.json, extracting as much information as possible."""
        with open(fname) as f:
            content = f.read()

        parser = JSONParser()
        self.info = info = parser.parse(content)

        # extract as much information as possible,
        # leaving missing parts out if not available
        fnames = info.get("scan_files", [])
        self.files = remove_ignored_filetypes_scan(fnames)

        self.parameters = info.get("scan_parameters", [])

        values = info.get("scan_values", [])
        readbacks = info.get("scan_readbacks", [])

        # filter out empty entries, which can occur in partial JSON parsing
        values = [vals for vals in values if vals]
        readbacks = [rb for rb in readbacks if rb]

        self.values = adjust_shape(values)
        self.readbacks = adjust_shape(readbacks)

    def __iter__(self):
        if self.finished:
            return super().__iter__()
        return self._generate_data()

    def _generate_data(self):
        """Generator that yields scan data step by step as it becomes available during the scan."""
        yielded_count = 0
        while True:
            retries = 0
            self._parse_partial_json(self.fname)

            # check whether there are new steps to yield
            while self.files and len(self.files) > yielded_count:
                fns = self.files[yielded_count]

                if not files_available_on_disk(fns):
                    time.sleep(self.refresh_interval)
                    retries += 1
                    if (retries * self.refresh_interval) < MAX_STEP_WAIT:
                        continue  # wait up to MAX_STEP_WAIT (5 minutes) for the files to appear, then recheck
                    # on timeout, log an error but still try to yield the step to avoid an infinite loop;
                    # handling the missing data is left to the caller
                    logger.error(f"Timeout waiting for files to become available for step {yielded_count} {fns}")

                yielded_count += 1
                retries = 0

                try:
                    with SFDataFiles(*fns) as data:
                        yield data
                except Exception as exc:
                    # TODO: Think about what could go wrong here and deal with it more specifically
                    sn = f"step {yielded_count - 1} {fns}"
                    print_skip_warning(exc, sn)
                    continue  # skip this step and try the next one

            if is_finished_scan(self.fname) and (yielded_count >= len(self.files)):
                return  # the scan is finished and all available steps were yielded: stop iteration

            # wait before checking for new steps again
            time.sleep(self.refresh_interval)


def is_finished_scan(fname):
    """
    A scan is considered finished if the scan.json file is complete and valid,
    and all files listed in it are available on disk.
    This does not cover the case where a scan is interrupted and continued later.
    It also relies on the behavior of RA that scan.json is only fully written at the end of the scan.
    TODO: A clearer criterion for a finished scan would be that another, later scan exists.
    """
    try:
        content = json_load(fname)
        files = [fn for step in content["scan_files"] for fn in step]
        # check whether all listed files are available on disk
        if not files_available_on_disk(set(files)):
            return False
    except json.JSONDecodeError:
        return False
    return True


def files_available_on_disk(fnames):
    """Check whether all files for this step are available on disk and contain some data."""
    # fnames = [fn for fn in fnames if not "PVDATA" in fn]  # PVDATA files are not written to disk at the moment!
    # logger.debug(f"Skipping PVDATA files for availability check as a workaround!")
    if all(Path(fn).exists() for fn in fnames):
        return all(Path(fn).stat().st_size > 0 for fn in fnames)
    return False
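
# Minimal usage sketch (the scan.json path below is hypothetical, and `step.names`
# assumes sfdata's SFDataFiles API, which exposes the available channel names):
# iterating an UnfinishedScanInfo blocks until each step's files appear on disk,
# then yields one SFDataFiles object per step, for live and finished scans alike.
if __name__ == "__main__":
    scan = UnfinishedScanInfo("run0001/meta/scan.json", refresh_interval=10)
    for i, step in enumerate(scan):
        # `step` is an SFDataFiles object opened for the files of this scan step
        print(f"step {i}: {len(step.names)} channels")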