Added symlink creation for work/retrieve and fixed path changes being applied again when run multiple times #1

Open
vonka_j wants to merge 1 commits from vonka_j/jakub-unarchived_data_fix:link_and_overwrite_update into main

View File

@@ -1,4 +1,47 @@
#!/usr/bin/env python
"""
unarchived_data_fix.py
Fix metadata paths for retrieved (archived) SwissFEL experiment data.
BACKGROUND
----------
Raw experiment data normally lives in:
/sf/<instrument>/data/<pgroup>/raw
After archival + retrieval, data appears in:
/das/work/pXX/retrieve/<pgroup>
This script:
1. Creates a symlink:
/sf/<instrument>/data/<pgroup>/work/retrieve
-> /das/work/pXX/retrieve/<pgroup>
2. Fixes metadata scan.json files so that raw file paths
correctly point to the retrieved location via:
/sf/<instrument>/data/<pgroup>/work/retrieve/...
It is SAFE to run multiple times:
- Already-fixed paths are not modified again.
USAGE
-----
Dry-run (recommended first):
python unarchived_data_fix.py <instrument> <pgroup>
Overwrite scan_mod.json:
python unarchived_data_fix.py --no-dryrun --overwrite <instrument> <pgroup>
Overwrite original scan.json files in-place:
python unarchived_data_fix.py --no-dryrun --inplace --overwrite <instrument> <pgroup>
IMPORTANT FLAGS
---------------
--no-dryrun Actually modify files (default is dry-run)
--overwrite Allow overwriting existing output files
--inplace Modify scan.json instead of writing scan_mod.json
"""
import argparse
import json
@@ -61,6 +104,38 @@ def check_equal(val, ref):
assert val == ref, f'expected "{ref}" but got "{val}"'
def ensure_retrieve_symlink(instrument, pgroup):
    """
    Ensure the expected symlink exists so the script can be run from anywhere:

        /sf/<instrument>/data/<pgroup>/work/retrieve -> /das/work/pXX/retrieve/<pgroup>

    (pXX means the first two digits after 'p', e.g. p21977 -> p21)

    An existing symlink is only validated and never replaced. A missing link
    is created, together with any missing parent directories. Note that this
    function has no dry-run mode: if the link is missing, it is created.

    Raises SystemExit with an explanatory message if
    - the retrieved directory does not exist,
    - the link path exists but is not a symlink,
    - the symlink is broken or points somewhere other than the retrieved directory,
    - the parent directory or the symlink cannot be created.
    """
    pxx = pgroup[:3]  # e.g. p21977 -> p21
    src = Path(f"/das/work/{pxx}/retrieve/{pgroup}")
    link = Path(f"/sf/{instrument}/data/{pgroup}/work/retrieve")

    if not src.is_dir():
        raise SystemExit(f'retrieved directory "{src}" does not exist')

    # If link exists, verify it's a symlink to the expected src (do not silently replace).
    if link.is_symlink():
        # exists() follows the link, so False here means the target is missing.
        # resolve() is non-strict by default and would not report a dangling link.
        if not link.exists():
            raise SystemExit(f'symlink "{link}" is broken; expected target "{src}"')
        target = link.resolve()
        if target != src.resolve():
            raise SystemExit(f'symlink "{link}" points to "{target}" but expected "{src}"')
        return

    if link.exists():
        raise SystemExit(f'expected "{link}" to be a symlink, but it exists and is not a symlink')

    # Creating the parent directories can fail just like creating the link itself
    # (e.g. missing permissions), so both are reported as a clean exit.
    try:
        link.parent.mkdir(parents=True, exist_ok=True)
        link.symlink_to(src)
    except OSError as e:
        raise SystemExit(f'failed to create symlink "{link}" -> "{src}": {e}') from e
example_text = """usage examples:
Dry run (nothing is changed or overwritten)
@@ -90,6 +165,9 @@ clargs = parser.parse_args()
instrument = clargs.instrument
pgroup = clargs.pgroup
# create/validate the /sf/.../work/retrieve symlink so the script can be run from anywhere
ensure_retrieve_symlink(instrument, pgroup)
path = f"/sf/{instrument}/data/{pgroup}/work/retrieve/sf/{instrument}/data/{pgroup}/raw"
path = Path(path)
if not path.is_dir():
@@ -101,6 +179,13 @@ print()
fns = path.glob("*/meta/scan.json")
fns = sorted(fns)
# path prefixes used to skip already-fixed paths and to validate the original raw paths
already_fixed_prefix = (
"/", "sf", instrument, "data", pgroup, "work", "retrieve",
"sf", instrument, "data", pgroup, "raw"
)
raw_prefix = ("/", "sf", instrument, "data", pgroup, "raw")
for jfn in tqdm(fns):
# print(jfn)
tqdm.write(str(jfn))
@@ -110,8 +195,19 @@ for jfn in tqdm(fns):
for j, fn in enumerate(step_files):
# print("old fn:", fn)
fn = Path(fn)
parts = fn.parts
root, sf, instr, data, pgroup, raw, *remainder = fn.parts
# If already fixed, do nothing (prevents double-appending on repeated runs).
if tuple(parts[:len(already_fixed_prefix)]) == already_fixed_prefix:
continue
# Keep original strict validation, but accept only the original raw layout here.
if tuple(parts[:len(raw_prefix)]) != raw_prefix:
raise SystemExit(f'unexpected path in scan.json: "{fn}"')
# Unpack path components.
# Using pgroup2 to distinguish the pgroup found in the file path from the CLI pgroup argument (expected to match though)
root, sf, instr, data, pgroup2, raw, *remainder = parts
try:
check_equal(root, "/")
@@ -119,14 +215,14 @@ for jfn in tqdm(fns):
check_equal(data, "data")
check_equal(raw, "raw")
assert instr in INSTRUMENTS
assert is_pgroup(pgroup)
assert is_pgroup(pgroup2)
except AssertionError as e:
raise SystemExit(e)
new_fn = [
root,
sf, instr, data, pgroup, "work", "retrieve",
sf, instr, data, pgroup, raw, *remainder
sf, instr, data, pgroup2, "work", "retrieve",
sf, instr, data, pgroup2, raw, *remainder
]
new_fn = Path(*new_fn)
@@ -152,9 +248,6 @@ for jfn in tqdm(fns):
if new_jfn.exists() and not clargs.overwrite:
warn_overwrite(f"skipping existing file:", new_jfn, "\nyou might want to set --overwrite")
continue
json_save(jdat, new_jfn, overwrite=clargs.overwrite)
continue
json_save(jdat, new_jfn, overwrite=clargs.overwrite)