Added symlink creation for work/retrieve and made the path fix idempotent when run multiple times #1
@@ -1,4 +1,47 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
unarchived_data_fix.py
|
||||
|
||||
Fix metadata paths for retrieved (archived) SwissFEL experiment data.
|
||||
|
||||
BACKGROUND
|
||||
----------
|
||||
Raw experiment data normally lives in:
|
||||
/sf/<instrument>/data/<pgroup>/raw
|
||||
|
||||
After archival + retrieval, data appears in:
|
||||
/das/work/pXX/retrieve/<pgroup>
|
||||
|
||||
This script:
|
||||
1. Creates a symlink:
|
||||
/sf/<instrument>/data/<pgroup>/work/retrieve
|
||||
-> /das/work/pXX/retrieve/<pgroup>
|
||||
|
||||
2. Fixes metadata scan.json files so that raw file paths
|
||||
correctly point to the retrieved location via:
|
||||
/sf/<instrument>/data/<pgroup>/work/retrieve/...
|
||||
|
||||
It is SAFE to run multiple times:
|
||||
- Already-fixed paths are not modified again.
|
||||
|
||||
USAGE
|
||||
-----
|
||||
Dry-run (recommended first):
|
||||
python unarchived_data_fix.py <instrument> <pgroup>
|
||||
|
||||
Overwrite scan_mod.json:
|
||||
python unarchived_data_fix.py --no-dryrun --overwrite <instrument> <pgroup>
|
||||
|
||||
Overwrite original scan.json files in-place:
|
||||
python unarchived_data_fix.py --no-dryrun --inplace --overwrite <instrument> <pgroup>
|
||||
|
||||
IMPORTANT FLAGS
|
||||
---------------
|
||||
--no-dryrun Actually modify files (default is dry-run)
|
||||
--overwrite Allow overwriting existing output files
|
||||
--inplace Modify scan.json instead of writing scan_mod.json
|
||||
"""
|
||||
|
||||
|
||||
import argparse
|
||||
import json
|
||||
@@ -61,6 +104,38 @@ def check_equal(val, ref):
|
||||
assert val == ref, f'expected "{ref}" but got "{val}"'
|
||||
|
||||
|
||||
def ensure_retrieve_symlink(instrument, pgroup):
    """
    Ensure the expected symlink exists so the script can be run from anywhere:

        /sf/<instrument>/data/<pgroup>/work/retrieve -> /das/work/pXX/retrieve/<pgroup>

    (pXX means 'p' plus the first two digits, e.g. p21977 -> p21)

    Safe to run multiple times: an existing correct symlink is left untouched.

    Raises SystemExit (with a descriptive message) when:
    - the retrieved source directory does not exist,
    - the link path exists but is not a symlink,
    - the link is a broken symlink or points somewhere else
      (an existing link is never silently replaced),
    - creating the symlink fails.
    """
    pxx = pgroup[:3]  # e.g. p21977 -> p21
    src = Path(f"/das/work/{pxx}/retrieve/{pgroup}")
    link = Path(f"/sf/{instrument}/data/{pgroup}/work/retrieve")

    if not src.is_dir():
        raise SystemExit(f'retrieved directory "{src}" does not exist')

    # If link exists, verify it's a symlink to the expected src (do not silently replace).
    # link.exists() follows the symlink and is False for a broken one, hence the
    # extra is_symlink() check to still catch dangling links.
    if link.exists() or link.is_symlink():
        if not link.is_symlink():
            raise SystemExit(f'expected "{link}" to be a symlink, but it exists and is not a symlink')
        # Path.resolve() does not raise for missing targets (strict=False is the
        # default since Python 3.6), so a broken link must be detected explicitly.
        if not link.exists():
            raise SystemExit(f'symlink "{link}" is broken; expected target "{src}"')
        if link.resolve() != src.resolve():
            raise SystemExit(f'symlink "{link}" points to "{link.resolve()}" but expected "{src}"')
        return

    link.parent.mkdir(parents=True, exist_ok=True)
    try:
        link.symlink_to(src)
    except OSError as e:
        raise SystemExit(f'failed to create symlink "{link}" -> "{src}": {e}') from e
|
||||
|
||||
|
||||
example_text = """usage examples:
|
||||
Dry run (nothing is changed or overwritten)
|
||||
@@ -90,6 +165,9 @@ clargs = parser.parse_args()
|
||||
instrument = clargs.instrument
|
||||
pgroup = clargs.pgroup
|
||||
|
||||
# create/validate the /sf/.../work/retrieve symlink so the script can be run from anywhere
|
||||
ensure_retrieve_symlink(instrument, pgroup)
|
||||
|
||||
path = f"/sf/{instrument}/data/{pgroup}/work/retrieve/sf/{instrument}/data/{pgroup}/raw"
|
||||
path = Path(path)
|
||||
if not path.is_dir():
|
||||
@@ -101,6 +179,13 @@ print()
|
||||
fns = path.glob("*/meta/scan.json")
|
||||
fns = sorted(fns)
|
||||
|
||||
# prefixes used to detect already-fixed paths (so they are skipped), and to validate the original raw paths
|
||||
already_fixed_prefix = (
|
||||
"/", "sf", instrument, "data", pgroup, "work", "retrieve",
|
||||
"sf", instrument, "data", pgroup, "raw"
|
||||
)
|
||||
raw_prefix = ("/", "sf", instrument, "data", pgroup, "raw")
|
||||
|
||||
for jfn in tqdm(fns):
|
||||
# print(jfn)
|
||||
tqdm.write(str(jfn))
|
||||
@@ -110,8 +195,19 @@ for jfn in tqdm(fns):
|
||||
for j, fn in enumerate(step_files):
|
||||
# print("old fn:", fn)
|
||||
fn = Path(fn)
|
||||
parts = fn.parts
|
||||
|
||||
root, sf, instr, data, pgroup, raw, *remainder = fn.parts
|
||||
# If already fixed, do nothing (prevents double-appending on repeated runs).
|
||||
if tuple(parts[:len(already_fixed_prefix)]) == already_fixed_prefix:
|
||||
continue
|
||||
|
||||
# Keep original strict validation, but accept only the original raw layout here.
|
||||
if tuple(parts[:len(raw_prefix)]) != raw_prefix:
|
||||
raise SystemExit(f'unexpected path in scan.json: "{fn}"')
|
||||
|
||||
# Unpack path components.
|
||||
# Using pgroup2 to distinguish the pgroup found in the file path from the CLI pgroup argument (expected to match though)
|
||||
root, sf, instr, data, pgroup2, raw, *remainder = parts
|
||||
|
||||
try:
|
||||
check_equal(root, "/")
|
||||
@@ -119,14 +215,14 @@ for jfn in tqdm(fns):
|
||||
check_equal(data, "data")
|
||||
check_equal(raw, "raw")
|
||||
assert instr in INSTRUMENTS
|
||||
assert is_pgroup(pgroup)
|
||||
assert is_pgroup(pgroup2)
|
||||
except AssertionError as e:
|
||||
raise SystemExit(e)
|
||||
|
||||
new_fn = [
|
||||
root,
|
||||
sf, instr, data, pgroup, "work", "retrieve",
|
||||
sf, instr, data, pgroup, raw, *remainder
|
||||
sf, instr, data, pgroup2, "work", "retrieve",
|
||||
sf, instr, data, pgroup2, raw, *remainder
|
||||
]
|
||||
|
||||
new_fn = Path(*new_fn)
|
||||
@@ -152,9 +248,6 @@ for jfn in tqdm(fns):
|
||||
|
||||
if new_jfn.exists() and not clargs.overwrite:
|
||||
warn_overwrite(f"skipping existing file:", new_jfn, "\nyou might want to set --overwrite")
|
||||
continue
|
||||
|
||||
json_save(jdat, new_jfn, overwrite=clargs.overwrite)
|
||||
|
||||
|
||||
continue
|
||||
|
||||
json_save(jdat, new_jfn, overwrite=clargs.overwrite)
|
||||
Reference in New Issue
Block a user