From 39599bc090b531f797acdb7c18920a15ecbf09e5 Mon Sep 17 00:00:00 2001 From: leonarski_f Date: Sun, 21 Jun 2026 14:16:34 +0200 Subject: [PATCH] tests: git-LFS large-dataset harness + jfjoch_process include cleanup - .gitattributes tracks tests/data/*.h5 via git-LFS for big reference datasets. - tests/TestData.h resolves tests/data files and reports absent / unfetched-LFS-pointer so tests can SKIP() instead of failing; tests/data/README.md documents fetching + the expected lyso_rotation/lyso_serial datasets. - JFJochProcessLargeTest: a Catch start-up listener that prints dataset availability, plus [large] full-analysis runs (rotation indexing + serial) that SKIP when data is absent. Verified against the real 1800-image lyso rotation set (100% indexing, cell 78.2/78.2/37.8) and skips cleanly without it. - jfjoch_process.cpp: drop the ~15 now-unused workflow includes left after the JFJochProcess extraction. Co-Authored-By: Claude Opus 4.8 --- .gitattributes | 3 + tests/CMakeLists.txt | 2 + tests/JFJochProcessLargeTest.cpp | 120 +++++++++++++++++++++++++++++++ tests/TestData.h | 28 ++++++++ tests/data/README.md | 35 +++++++++ tools/jfjoch_process.cpp | 37 ++-------- 6 files changed, 195 insertions(+), 30 deletions(-) create mode 100644 tests/JFJochProcessLargeTest.cpp create mode 100644 tests/TestData.h create mode 100644 tests/data/README.md diff --git a/.gitattributes b/.gitattributes index 717b707e..b81e799d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,5 @@ *.mcs filter=lfs diff=lfs merge=lfs -text *.mcs.gz filter=lfs diff=lfs merge=lfs -text + +# Large reference datasets for the [large] Catch tests (git-LFS; may not be pulled in CI). 
+tests/data/*.h5 filter=lfs diff=lfs merge=lfs -text diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7707b3c8..11c98990 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -46,6 +46,8 @@ ADD_EXECUTABLE(jfjoch_test ZMQMetadataSocketTest.cpp JFJochReaderTest.cpp JFJochProcessTest.cpp + JFJochProcessLargeTest.cpp + TestData.h MovingAverageTest.cpp ImageMetadataTest.cpp JFJochReceiverLiteTest.cpp diff --git a/tests/JFJochProcessLargeTest.cpp b/tests/JFJochProcessLargeTest.cpp new file mode 100644 index 00000000..e20fe7de --- /dev/null +++ b/tests/JFJochProcessLargeTest.cpp @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-only + +// End-to-end JFJochProcess runs over real JUNGFRAU datasets that are kept in git-LFS under +// tests/data. They are tagged [large] and SKIP() when the data is not present (e.g. LFS not +// pulled), so the default test run stays fast and CI without the data still passes. + +#include +#include +#include + +#include +#include + +#include "TestData.h" +#include "../common/DiffractionExperiment.h" +#include "../common/IndexingSettings.h" +#include "../reader/JFJochHDF5Reader.h" +#include "../process/JFJochProcess.h" + +namespace { + // Start-up hook: report once whether the large datasets are available, so it is obvious why + // the [large] tests skip when they do. + class LargeDataListener : public Catch::EventListenerBase { + public: + using Catch::EventListenerBase::EventListenerBase; + void testRunStarting(Catch::TestRunInfo const &) override { + const bool rot = jfjoch_test::LargeDataFile("lyso_rotation_master.h5").has_value(); + const bool ser = jfjoch_test::LargeDataFile("lyso_serial_master.h5").has_value(); + std::cout << "[jfjoch_test] large datasets in " << jfjoch_test::LargeDataDir() + << ": rotation=" << (rot ? "yes" : "no") + << " serial=" << (ser ? 
"yes" : "no") + << " ([large] tests skip when absent)" << std::endl; + } + }; + + int default_threads() { + const unsigned hc = std::thread::hardware_concurrency(); + return hc == 0 ? 4 : static_cast<int>(hc); + } +} + +CATCH_REGISTER_LISTENER(LargeDataListener) + +TEST_CASE("JFJochProcess_LysoRotation", "[large]") { + const auto master = jfjoch_test::LargeDataFile("lyso_rotation_master.h5"); + if (!master) + SKIP("lyso_rotation_master.h5 not available (git-lfs data not pulled)"); + + RegisterHDF5Filter(); + JFJochHDF5Reader reader; + REQUIRE_NOTHROW(reader.ReadFile(*master)); + auto dataset = reader.GetDataset(); + REQUIRE(dataset); + + DiffractionExperiment experiment(dataset->experiment); + IndexingSettings indexing; + indexing.Algorithm(IndexingAlgorithmEnum::Auto); + indexing.RotationIndexing(true); + indexing.GeomRefinementAlgorithm(GeomRefinementAlgorithmEnum::BeamCenter); + experiment.ImportIndexingSettings(indexing); + + ProcessConfig config; + config.mode = ProcessMode::FullAnalysis; + config.nthreads = default_threads(); + config.spot_finding = DiffractionExperiment::DefaultDataProcessingSettings(); + config.spot_finding.indexing = true; + config.rotation_indexing = true; + config.two_pass_rotation = true; + config.reuse_rotation_spots = false; // redo spot finding (raw dataset may carry no spots) + + JFJochProcess process(reader, experiment, dataset->pixel_mask, config); + ProcessResult result; + REQUIRE_NOTHROW(result = process.Run()); + + CHECK_FALSE(result.cancelled); + CHECK(result.images_processed == reader.GetNumberOfImages()); + REQUIRE(result.indexing_rate.has_value()); + CHECK(result.indexing_rate.value() > 0.1f); // a real rotation series indexes well + CHECK(result.consensus_cell.has_value()); + + reader.Close(); + REQUIRE(H5Fget_obj_count(H5F_OBJ_ALL, H5F_OBJ_ALL) == 0); +} + +TEST_CASE("JFJochProcess_LysoSerial", "[large]") { + const auto master = jfjoch_test::LargeDataFile("lyso_serial_master.h5"); + if (!master) + SKIP("lyso_serial_master.h5 
not available (git-lfs data not pulled)"); + + RegisterHDF5Filter(); + JFJochHDF5Reader reader; + REQUIRE_NOTHROW(reader.ReadFile(*master)); + auto dataset = reader.GetDataset(); + REQUIRE(dataset); + + DiffractionExperiment experiment(dataset->experiment); + IndexingSettings indexing; + indexing.Algorithm(IndexingAlgorithmEnum::Auto); + indexing.GeomRefinementAlgorithm(GeomRefinementAlgorithmEnum::BeamCenter); + experiment.ImportIndexingSettings(indexing); + + ProcessConfig config; + config.mode = ProcessMode::FullAnalysis; + config.nthreads = default_threads(); + config.spot_finding = DiffractionExperiment::DefaultDataProcessingSettings(); + config.spot_finding.indexing = true; + + JFJochProcess process(reader, experiment, dataset->pixel_mask, config); + ProcessResult result; + REQUIRE_NOTHROW(result = process.Run()); + + CHECK_FALSE(result.cancelled); + CHECK(result.images_processed == reader.GetNumberOfImages()); + REQUIRE(result.indexing_rate.has_value()); + CHECK(result.indexing_rate.value() > 0.0f); // serial stills: at least some hits index + + reader.Close(); + REQUIRE(H5Fget_obj_count(H5F_OBJ_ALL, H5F_OBJ_ALL) == 0); +} diff --git a/tests/TestData.h b/tests/TestData.h new file mode 100644 index 00000000..9dd9ea71 --- /dev/null +++ b/tests/TestData.h @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute +// SPDX-License-Identifier: GPL-3.0-only + +#pragma once + +#include <filesystem> +#include <optional> +#include <string> + +// Helpers for the large reference datasets used by the [large] Catch tests. The datasets live +// under <repo>/tests/data and are tracked with git-LFS, so they are often NOT present (LFS not +// pulled, e.g. in CI). Tests run from <build>/tests, so the directory is reached via ../../. 
+namespace jfjoch_test { + inline std::string LargeDataDir() { return "../../tests/data"; } + + // Path to a large dataset file if it is present and looks like real data rather than an + // unfetched git-LFS pointer (those are tiny text stubs); std::nullopt otherwise. Tests should + // SKIP() when this returns nullopt. + inline std::optional<std::string> LargeDataFile(const std::string &name) { + const std::string path = LargeDataDir() + "/" + name; + std::error_code ec; + if (!std::filesystem::is_regular_file(path, ec)) + return std::nullopt; + if (ec || std::filesystem::file_size(path, ec) < 4096 || ec) + return std::nullopt; // an LFS pointer file is well under 4 kB; an HDF5 master is not + return path; + } +} diff --git a/tests/data/README.md b/tests/data/README.md new file mode 100644 index 00000000..9a711c5d --- /dev/null +++ b/tests/data/README.md @@ -0,0 +1,35 @@ +# Large reference test datasets (git-LFS) + +The `[large]` Catch tests in `tests/` run the full analysis/processing pipeline over real +JUNGFRAU datasets that are too big to keep as ordinary git blobs. They are tracked with +**git-LFS** (see the `tests/data/*.h5` rule in the top-level `.gitattributes`). + +These files are **not required** to build or to run the normal test suite: every test that +needs them resolves the path through `jfjoch_test::LargeDataFile()` (`tests/TestData.h`) and +`SKIP()`s when the file is absent or is still an unfetched LFS pointer. `jfjoch_test` also +prints, at start-up, whether this directory is populated. 
+ +## Fetching + +``` +git lfs install +git lfs pull # or: git lfs pull --include "tests/data/*.h5" +``` + +## Expected files + +Place each dataset's master **and its `_data_NNNNNN.h5` files** in this directory: + +| File | Dataset | +|-------------------------------|------------------------------------------| +| `lyso_rotation_master.h5` | lysozyme rotation series (~1800 images) | +| `lyso_serial_master.h5` | lysozyme serial / jet stills (~5000 images) | + +To point the tests at an existing local dataset without copying, symlink it: + +``` +ln -s /path/to/your_master.h5 tests/data/lyso_rotation_master.h5 +ln -s /path/to/your_data_000001.h5 tests/data/... +``` + +(The master references its data files by relative name, so keep them side by side.) diff --git a/tools/jfjoch_process.cpp b/tools/jfjoch_process.cpp index e9474cae..519fc328 100644 --- a/tools/jfjoch_process.cpp +++ b/tools/jfjoch_process.cpp @@ -1,45 +1,22 @@ // SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute // SPDX-License-Identifier: GPL-3.0-only -#include -#include -#include -#ifndef _WIN32 -#include // not on MSVC; only pulled getopt transitively on Linux ( below) -#endif -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include "../reader/JFJochHDF5Reader.h" #include "../common/Logger.h" #include "../common/DiffractionExperiment.h" #include "../common/PixelMask.h" -#include "../common/AzimuthalIntegrationMapping.h" -#include "../common/time_utc.h" #include "../common/print_license.h" -#include "../image_analysis/MXAnalysisWithoutFPGA.h" -#include "../image_analysis/indexing/IndexerFactory.h" -#include "../common/CUDAWrapper.h" -#include "../writer/FileWriter.h" -#include "../image_analysis/IndexAndRefine.h" -#include "../common/JFJochReceiverPlots.h" -#include "../compression/JFJochCompressor.h" #include "../image_analysis/LoadFCalcFromMtz.h" 
-#include "../image_analysis/scale_merge/Merge.h" -#include "../image_analysis/scale_merge/SearchSpaceGroup.h" -#include "../image_analysis/WriteReflections.h" -#include "../image_analysis/UpdateReflectionResolution.h" #include "../process/JFJochProcess.h" -#include void print_usage() { std::cout << "Usage jfjoch_process {} " << std::endl;