From 6f17c41ea39cada1f50fef839fa7e080dc739078 Mon Sep 17 00:00:00 2001 From: beale_j Date: Wed, 11 Mar 2026 09:44:19 +0100 Subject: [PATCH] combine two pdbs as altconf with a defined occupancy --- generate_combined_pdb.py | 412 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 generate_combined_pdb.py diff --git a/generate_combined_pdb.py b/generate_combined_pdb.py new file mode 100644 index 0000000..0aeeb6c --- /dev/null +++ b/generate_combined_pdb.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 + +import gemmi +import argparse +import tempfile +import os +import sys + + +# ----------------------------- +# Altloc validation +# ----------------------------- +def validate_altlocs(struct, label): + + allowed = {'A', 'B', '', ' ', '\x00'} + found = set() + + for chain in struct[0]: + for res in chain: + for atom in res: + + alt = atom.altloc + + if alt not in ('', ' ', '\x00'): + found.add(alt) + + if alt not in allowed: + + print() + print("ERROR: Unsupported altloc detected") + print("----------------------------------") + print(f"Structure : {label}") + print(f"Chain : {chain.name}") + print(f"Residue : {res.name} {res.seqid.num}") + print(f"Atom : {atom.name}") + print(f"Altloc : '{alt}'") + print() + print("Only altlocs A/B are allowed in input structures.") + print() + + sys.exit(1) + + if len(found) > 2: + + print() + print("ERROR: More than two conformations detected") + print("-------------------------------------------") + print(f"Structure : {label}") + print(f"Altlocs : {sorted(found)}") + print() + + sys.exit(1) + + print(f"{label} altlocs detected:", sorted(found)) + + +# ----------------------------- +# Remove ANISOU records +# ----------------------------- +def strip_anisou(pdb_in, pdb_out): + + with open(pdb_in) as fin, open(pdb_out, "w") as fout: + for line in fin: + if not line.startswith("ANISOU"): + fout.write(line) + + +# ----------------------------- +# Altloc conversion helper +# ----------------------------- +def convert_light_altloc(alt): + + if alt in ('', ' ', '\x00', 'A'): + return 'C' + + if alt == 'B': + return 'D' + + raise RuntimeError(f"Unsupported altloc '{alt}' in light structure") + + +# ----------------------------- +# report differences between states +# ----------------------------- +def report_state_differences(dark, light): + + dark_atoms = set() + light_atoms = set() + + for chain in dark[0]: + for res in chain: + for atom in res: + key = ( + chain.name, + res.seqid.num, + res.name, + atom.name + ) + dark_atoms.add(key) + + for chain in light[0]: + for res in chain: + for atom in res: + key = ( + chain.name, + res.seqid.num, + res.name, + atom.name + ) + light_atoms.add(key) + + dark_only = dark_atoms - light_atoms + light_only = light_atoms - dark_atoms + + print() + print("State difference summary") + print("------------------------") + print("Atoms only in dark :", len(dark_only)) + print("Atoms only in light:", len(light_only)) + print() + + # Print a few examples + if dark_only: + print("Example dark-only atoms:") + for x in list(dark_only)[:5]: + print(" ", x) + + if light_only: + print("Example light-only atoms:") + for x in list(light_only)[:5]: + print(" ", x) + + print() + + +# ----------------------------- +# occupancy sanity check +# ----------------------------- +def check_occupancies(struct): + + occ_map = {} + + for chain in struct[0]: + for res in chain: + for atom in res: + + key = ( + chain.name, + res.seqid.num, + res.name, + atom.name + ) + + occ_map.setdefault(key, 0.0) + occ_map[key] += atom.occ + + problems = [] + + for key, occ in occ_map.items(): + if occ > 1.01: # small tolerance + problems.append((key, occ)) + + if problems: + + print() + print("WARNING: Occupancy > 1 detected") + print("-------------------------------") + + for p in problems[:10]: + print(p) + + print() + + else: + print("Occupancy check passed (no atoms >1).") + + +# ----------------------------- +# chain/residue check +# ----------------------------- +def check_missing_light_residues(dark, light, ensemble): + + dark_res = {(c.name, r.seqid.num, r.name) for c in dark[0] for r in c} + light_res = {(c.name, r.seqid.num, r.name) for c in light[0] for r in c} + ens_res = {(c.name, r.seqid.num, r.name) for c in ensemble[0] for r in c} + + missing = light_res - ens_res + + if missing: + + print() + print("ERROR: residues lost from light structure during merge") + print("------------------------------------------------------") + + for r in sorted(missing): + print(" ", r) + + print() + print("Ensemble construction aborted.") + print("Check residue creation or atom filtering.") + print() + + sys.exit(1) + +# ----------------------------- +# Build ensemble model +# ----------------------------- +def build_ensemble(dark, light, light_occ, occ_cutoff=0.005, verbose=False): + + dark_occ = 1.0 - light_occ + + ensemble = dark.clone() + model = ensemble[0] + + dark_atoms = 0 + + # ----------------- + # Scale dark atoms + # ----------------- + for chain in model: + for res in chain: + for atom in res: + + atom.occ *= dark_occ + + if atom.occ < occ_cutoff: + continue + + if atom.altloc in ('', ' ', '\x00'): + atom.altloc = 'A' + + dark_atoms += 1 + + light_atoms = 0 + + # ----------------- + # Add light atoms + # ----------------- + for chain in light[0]: + + target_chain = model.find_chain(chain.name) + + if target_chain is None: + target_chain = gemmi.Chain(chain.name) + model.add_chain(target_chain) + + # residue lookup dictionary (O(1) lookup) + res_lookup = { + (r.seqid.num, r.seqid.icode, r.name): r + for r in target_chain + } + + for res in chain: + + key = (res.seqid.num, res.seqid.icode, res.name) + + target_res = res_lookup.get(key) + + if target_res is None: + + target_res = gemmi.Residue() + target_res.name = res.name + target_res.seqid = gemmi.SeqId(res.seqid.num, res.seqid.icode) + + target_chain.add_residue(target_res) + res_lookup[key] = target_res + + for atom in res: + + new_occ = atom.occ * light_occ + + if new_occ < occ_cutoff: + + if verbose: + print( + "Skipping low-occupancy atom:", + f"{chain.name} {res.name} {res.seqid.num}", + f"atom {atom.name}", + f"orig_occ={atom.occ:.3f}", + f"scaled_occ={new_occ:.4f}" + ) + + continue + + new_atom = gemmi.Atom() + + new_atom.name = atom.name + new_atom.pos = gemmi.Position(atom.pos.x, atom.pos.y, atom.pos.z) + new_atom.element = atom.element + new_atom.b_iso = atom.b_iso + new_atom.occ = new_occ + new_atom.altloc = convert_light_altloc(atom.altloc) + + target_res.add_atom(new_atom) + + light_atoms += 1 + + print() + print("Ensemble build summary") + print("----------------------") + print("Dark atoms written :", dark_atoms) + print("Light atoms written:", light_atoms) + print() + + return ensemble + + +# ----------------------------- +# Mainfsortdcdds +# ----------------------------- +def main(): + + parser = argparse.ArgumentParser( + description="Generate multicopy ensemble PDB (Barends-style refinement model).", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "-r", + "--reference", + type=os.path.abspath, + help="reference-state PDB file", + required=True + ) + + parser.add_argument( + "-t", + "--triggered", + type=os.path.abspath, + help="Triggered-state PDB file", + required=True + ) + + parser.add_argument( + "-f", + "--fraction", + type=float, + help="Triggered-state occupancy number < 1", + required=True + ) + + parser.add_argument( + "-o", + "--output", + default="ensemble.pdb", + help="Output ensemble PDB", + ) + + parser.add_argument( + "-c", + "--occ-cutoff", + type=float, + default=0.005, + help="Atoms with scaled occupancy below this value are removed" + ) + + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Print detailed filtering information" + ) + + args = parser.parse_args() + + # Validate occupancy + if not (0.0 <= args.fraction <= 1.0): + print("ERROR: light_occ must be between 0 and 1") + sys.exit(1) + + # Temporary files to remove any ANISOU records + with tempfile.NamedTemporaryFile(suffix=".pdb", delete=False) as tmp_dark, \ + tempfile.NamedTemporaryFile(suffix=".pdb", delete=False) as tmp_light: + + strip_anisou(args.reference, tmp_dark.name) + strip_anisou(args.triggered, tmp_light.name) + + dark = gemmi.read_structure(tmp_dark.name) + light = gemmi.read_structure(tmp_light.name) + + os.remove(tmp_dark.name) + os.remove(tmp_light.name) + + # Validate the altocs and check only two conformations present in structure + validate_altlocs(dark, "dark") + validate_altlocs(light, "light") + + # report any residue differences + report_state_differences(dark, light) + + ensemble = build_ensemble( + dark, + light, + args.fraction, + occ_cutoff=args.occ_cutoff, + verbose=args.verbose + ) + + # check all the residues are there + check_missing_light_residues(dark, light, ensemble) + + # check occupancies in the ensemble = 1 + check_occupancies(ensemble) + + # Write to file + ensemble.write_pdb(args.output) + + print("Ensemble written:", args.output) + + +if __name__ == "__main__": + main() \ No newline at end of file