def ensure_retrieve_symlink(instrument, pgroup, *, sf_root="/sf", das_root="/das/work"):
    """Create or validate the ``work/retrieve`` symlink of a retrieved pgroup.

    The link

        {sf_root}/{instrument}/data/{pgroup}/work/retrieve

    has to point at the directory holding the retrieved data

        {das_root}/{pXX}/retrieve/{pgroup}

    where ``pXX`` is the first three characters of ``pgroup``
    (e.g. ``p21977`` -> ``p21``).

    A link that already resolves to that directory is left untouched, so
    calling this function repeatedly is harmless. Anything else found at the
    link location is reported and never replaced.

    Args:
        instrument: Instrument name, used as a path component below ``sf_root``.
        pgroup: Proposal group name such as ``p21977``.
        sf_root: Root of the instrument data tree (default ``/sf``).
        das_root: Root of the retrieve area (default ``/das/work``).

    Raises:
        SystemExit: If the retrieved directory does not exist, if the link
            location is occupied by something that is not a symlink, if an
            existing symlink is dangling or points elsewhere, or if the
            symlink (or its parent directories) cannot be created.
    """
    # NOTE(review): this function creates directories and a symlink whenever
    # it is called; the visible call site invokes it unconditionally, i.e.
    # also for dry runs -- confirm that this is intended.
    pxx = pgroup[:3]  # e.g. p21977 -> p21
    src = Path(f"{das_root}/{pxx}/retrieve/{pgroup}")
    link = Path(f"{sf_root}/{instrument}/data/{pgroup}/work/retrieve")

    if not src.is_dir():
        raise SystemExit(f'retrieved directory "{src}" does not exist')

    if link.is_symlink():
        # Path.resolve() is non-strict by default and does not raise for a
        # dangling link, so brokenness has to be tested explicitly:
        # exists() follows the link and is False when the target is missing.
        if not link.exists():
            raise SystemExit(f'symlink "{link}" is broken; expected target "{src}"')
        target = link.resolve()
        if target != src.resolve():
            raise SystemExit(f'symlink "{link}" points to "{target}" but expected "{src}"')
        return

    # Not a symlink: whatever occupies the location must not be replaced silently.
    if link.exists():
        raise SystemExit(f'expected "{link}" to be a symlink, but it exists and is not a symlink')

    try:
        # mkdir is inside the try so that e.g. a permission error on the
        # parent directories is reported like a failed symlink creation.
        link.parent.mkdir(parents=True, exist_ok=True)
        link.symlink_to(src)
    except OSError as e:
        raise SystemExit(f'failed to create symlink "{link}" -> "{src}": {e}') from e
raw, *remainder = fn.parts + # If already fixed, do nothing (prevents double-appending on repeated runs). + if tuple(parts[:len(already_fixed_prefix)]) == already_fixed_prefix: + continue + + # Keep original strict validation, but accept only the original raw layout here. + if tuple(parts[:len(raw_prefix)]) != raw_prefix: + raise SystemExit(f'unexpected path in scan.json: "{fn}"') + + # Unpack path components. + # Using pgroup2 to distinguish the pgroup found in the file path from the CLI pgroup argument (expected to match though) + root, sf, instr, data, pgroup2, raw, *remainder = parts try: check_equal(root, "/") @@ -119,14 +215,14 @@ for jfn in tqdm(fns): check_equal(data, "data") check_equal(raw, "raw") assert instr in INSTRUMENTS - assert is_pgroup(pgroup) + assert is_pgroup(pgroup2) except AssertionError as e: raise SystemExit(e) new_fn = [ root, - sf, instr, data, pgroup, "work", "retrieve", - sf, instr, data, pgroup, raw, *remainder + sf, instr, data, pgroup2, "work", "retrieve", + sf, instr, data, pgroup2, raw, *remainder ] new_fn = Path(*new_fn) @@ -152,9 +248,6 @@ for jfn in tqdm(fns): if new_jfn.exists() and not clargs.overwrite: warn_overwrite(f"skipping existing file:", new_jfn, "\nyou might want to set --overwrite") - continue - - json_save(jdat, new_jfn, overwrite=clargs.overwrite) - - + continue + json_save(jdat, new_jfn, overwrite=clargs.overwrite) \ No newline at end of file -- 2.49.1