From 813ad30e07d55bd123f6d163a5cf107bd1d50b04 Mon Sep 17 00:00:00 2001 From: beale_j Date: Fri, 6 Mar 2026 08:25:50 +0100 Subject: [PATCH] automatic analysis of pdb time-reoslved database --- process_database_v1.py | 604 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100644 process_database_v1.py diff --git a/process_database_v1.py b/process_database_v1.py new file mode 100644 index 0000000..15cd325 --- /dev/null +++ b/process_database_v1.py @@ -0,0 +1,604 @@ +import os +import subprocess +import pandas as pd +import numpy as np +import regex as re +import glob +import datetime + +XTRAPOL8_ROOT = "xtrapol8_runs_2" +TEMPLATE_PHIL = "xtrapol8.phil" # same directory as script + +def extract_replicate(filename): + """ + Extract replicate number from filename. + Examples: + dark_2.mtz -> 2 + raw_3.mtz -> 3 + dark.mtz -> 1 + """ + m = re.search(r'_(\d+)\.mtz$', filename.lower()) + if m: + return int(m.group(1)) + return 1 + +def collect_cif_files(folder): + """ + Return list of absolute paths to .cif files in folder. + """ + cif_files = [] + + for f in os.listdir(folder): + if f.lower().endswith(".cif"): + cif_files.append(os.path.abspath(os.path.join(folder, f))) + + return cif_files + +def replace_phil_field(content, field_name, new_value): + """ + Replace a single-line PHIL field assignment. + """ + pattern = rf'({field_name}\s*=\s*)(.*)' + replacement = rf'\1"{new_value}"' + return re.sub(pattern, replacement, content) + + +def replace_additional_files_block(content, new_block): + """ + Replace the additional_files line(s). + """ + pattern = r'additional_files\s*=.*' + return re.sub(pattern, new_block, content) + +def write_run_phil( + dark_mtz, + raw_mtz, + pdb_file, + cif_files, + output_dir, + top_dir, + dmin=None +): + """ + Copy template PHIL and modify required fields for this run. + Returns path to new phil file. + """ + + dark_mtz = os.path.abspath(dark_mtz) + raw_mtz = os.path.abspath(raw_mtz) + pdb_file = os.path.abspath(pdb_file) + cif_files = [os.path.abspath(c) for c in cif_files] + output_dir = os.path.abspath(output_dir) + + # Read template + with open(TEMPLATE_PHIL, "r") as f: + content = f.read() + + # Replace input paths + content = replace_phil_field(content, "reference_mtz", dark_mtz) + content = replace_phil_field(content, "triggered_mtz", raw_mtz) + content = replace_phil_field(content, "reference_pdb", pdb_file) + + # Replace resolution if requested + if dmin is not None: + content = replace_phil_field(content, "high_resolution", str(dmin)) + + # Handle CIF files + if cif_files: + cif_block = "\n".join( + [f' additional_files = "{c}"' for c in cif_files] + ) + content = replace_additional_files_block(content, cif_block) + else: + content = replace_phil_field(content, "additional_files", "None") + + # Replace output directory + content = replace_phil_field(content, "outdir", output_dir) + + # Write run phil + run_phil_path = os.path.join(top_dir, "xtrapol8_run.phil") + + with open(run_phil_path, "w") as f: + f.write(content) + + return run_phil_path + +def create_xtrapol8_shell_script(output_dir): + """ + Creates shell script that: + - Loads modules + - Runs Xtrapol8 with phil file + """ + + output_dir = os.path.abspath(output_dir) + script_path = os.path.join(output_dir, "run_xtrapol8.sh") + + script_content = f"""#!/bin/bash +module purge +module load Xtrapol8/1.2.1 +module load phenix/phenix-1.19.1-4122 +module load ccp4 + +cd {output_dir} + +phenix.python ${{XTRAPOL8_DIR}}/bin/Fextr.py xtrapol8_run.phil +""" + + with open(script_path, "w") as f: + f.write(script_content) + + os.chmod(script_path, 0o755) + + return script_path + +def run_shell_script(script_path): + """ + Executes a shell script and returns (success, stdout, stderr). + """ + + result = subprocess.run( + ["bash", script_path], + capture_output=True, + text=True + ) + + success = result.returncode == 0 + return success, result.stdout, result.stderr + +def parse_peak_file(peak_file): + + if not os.path.exists(peak_file): + return None, None + + top_pos_int = None + top_neg_int = None + header_count = 0 + + with open(peak_file, "r") as f: + for line in f: + + line = line.strip() + if not line: + continue + + # Detect start of table + if line.startswith("Resn"): + header_count += 1 + continue + + parts = line.split() + + # Need at least 4 columns for tail parsing + if len(parts) < 4: + continue + + # Check last 3 tokens are numeric + try: + float(parts[-1]) # voxels + Int_value = float(parts[-2]) + float(parts[-3]) # peak + except ValueError: + continue + + # First block = positive + if header_count == 1 and top_pos_int is None: + top_pos_int = Int_value + + # Second block = negative + elif header_count == 2 and top_neg_int is None: + top_neg_int = Int_value + + if top_pos_int is not None and top_neg_int is not None: + break + + return top_pos_int, top_neg_int + +def parse_xtrapol8_outputs(timestamp_dir): + + log_file = None + peak_file = os.path.join(timestamp_dir, "peakintegration.txt") + + for f in os.listdir(timestamp_dir): + if f.endswith("_Xtrapol8.log"): + log_file = os.path.join(timestamp_dir, f) + break + + if log_file is None: + return (None,) * 8 + + Riso = None + CCiso = None + alpha_max = None + alpha_pearson = None + high_res = None + Rwork = None + Rfree = None + top_pos_int = None + top_neg_int = None + + # -------------------------------------------------- + # Parse LOG FILE + # -------------------------------------------------- + with open(log_file, "r") as f: + for line in f: + + # --- Resolution --- + if "Resolution range=" in line: + m = re.search(r"Resolution range=\s*[0-9.]+\s*-\s*([0-9.]+)", line) + if m: + high_res = float(m.group(1)) + + # --- Riso --- + if "Overall R_iso" in line: + m = re.search(r"Overall R_iso\s*=\s*([0-9.]+)", line) + if m: + Riso = float(m.group(1)) + + # --- CCiso --- + if "Overall cc_iso" in line: + m = re.search(r"Overall cc_iso\s*=\s*([0-9.]+)", line) + if m: + CCiso = float(m.group(1)) + + # --- Rwork --- + if "R_work:" in line: + m = re.search(r"R_work:\s*([0-9.]+)", line) + if m: + Rwork = float(m.group(1)) + + # --- Rfree --- + if "R_free:" in line: + m = re.search(r"R_free:\s*([0-9.]+)", line) + if m: + Rfree = float(m.group(1)) + + # --- Alpha (maximization) --- + if "Difference map maximization" in line: + m = re.search(r"Best alpha value is\s*([0-9.]+)", line) + if m: + alpha_max = float(m.group(1)) + + # --- Alpha (PearsonCC) --- + if "Difference map PearsonCC" in line: + m = re.search(r"Best alpha value is\s*([0-9.]+)", line) + if m: + alpha_pearson = float(m.group(1)) + + # -------------------------------------------------- + # Parse peak integration file + # -------------------------------------------------- + if os.path.exists(peak_file): + + top_pos_int, top_neg_int = parse_peak_file(peak_file) + + return ( + Riso, + CCiso, + alpha_max, + alpha_pearson, + high_res, + Rwork, + Rfree, + top_pos_int, + top_neg_int + ) + +def find_successful_run(pair_root, debug=False): + + if not os.path.isdir(pair_root): + print("not -", pair_root) + return None + + # Get timestamp folders + subdirs = [ + os.path.join(pair_root, d) + for d in os.listdir(pair_root) + if os.path.isdir(os.path.join(pair_root, d)) + ] + + if not subdirs: + return None + + # Newest first (timestamps sort lexicographically) + subdirs.sort(reverse=True) + + for folder in subdirs: + + if debug: + print("Checking:", folder) + + # Look for *Xtrapol8.log exactly + log_files = glob.glob(os.path.join(folder, "*Xtrapol8.log")) + + if not log_files: + if debug: + print(" No *Xtrapol8.log found") + continue + + # Look for peakintegration.txt (correct spelling) + peak_file = os.path.join(folder, "peakintegration.txt") + + if not os.path.exists(peak_file): + if debug: + print(" No peakintegration.txt found") + continue + + if debug: + print(" SUCCESS:", folder) + + return folder + + return None + +def build_pair_root(dark_mtz, raw_mtz, pdb_file, data_root="pdb_data"): + + # -------------------------------------------------- + # Absolute paths + # -------------------------------------------------- + dark_mtz = os.path.abspath(dark_mtz) + raw_mtz = os.path.abspath(raw_mtz) + pdb_file = os.path.abspath(pdb_file) + data_root = os.path.abspath(data_root) + + # -------------------------------------------------- + # Extract dark code from pdb filename + # -------------------------------------------------- + dark_code = os.path.basename(pdb_file).replace(".pdb", "").upper() + + # -------------------------------------------------- + # Extract raw code safely from MTZ filename + # -------------------------------------------------- + m = re.search(r"r([0-9a-zA-Z]{4})", os.path.basename(raw_mtz)) + if m: + raw_code = m.group(1).upper() + else: + raise ValueError(f"Could not extract raw PDB code from {raw_mtz}") + + pair_name = f"{raw_code}_minus_{dark_code}" + + # -------------------------------------------------- + # Mirror directory structure relative to pdb_data + # -------------------------------------------------- + rel_path = os.path.relpath(dark_mtz, start=data_root) + rel_dir = os.path.dirname(rel_path) + + pair_root = os.path.abspath( + os.path.join(XTRAPOL8_ROOT, rel_dir, pair_name) + ) + + os.makedirs(pair_root, exist_ok=True) + + return pair_root + +def run_xtrapol8_pair( + dark_mtz, + raw_mtz, + pdb_file, + dmin, + data_root="pdb_data" +): + + # -------------------------------------------------- + # Absolute paths + # -------------------------------------------------- + dark_mtz = os.path.abspath(dark_mtz) + raw_mtz = os.path.abspath(raw_mtz) + pdb_file = os.path.abspath(pdb_file) + + # -------------------------------------------------- + # Extract PDB codes + # -------------------------------------------------- + dark_code = os.path.basename(pdb_file).replace(".pdb", "").upper() + + # safer extraction than slicing + m = re.search(r"r([0-9a-zA-Z]{4})", os.path.basename(raw_mtz)) + if m: + raw_code = m.group(1).upper() + else: + raw_code = "UNKNOWN" + + pair_name = f"{raw_code}_minus_{dark_code}" + + # -------------------------------------------------- + # Build mirrored directory structure + # -------------------------------------------------- + rel_path = os.path.relpath(dark_mtz, start=os.path.abspath(data_root)) + rel_dir = os.path.dirname(rel_path) + + pair_root = os.path.abspath( + os.path.join(XTRAPOL8_ROOT, rel_dir, pair_name) + ) + + os.makedirs(pair_root, exist_ok=True) + + print(" Pair directory:", pair_root) + + # -------------------------------------------------- + # Collect CIF files from dataset folder + # -------------------------------------------------- + dataset_folder = os.path.dirname(dark_mtz) + cif_files = [ + os.path.join(dataset_folder, f) + for f in os.listdir(dataset_folder) + if f.lower().endswith(".cif") + ] + + # -------------------------------------------------- + # Write PHIL file + # -------------------------------------------------- + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + xtrapol8_dir = "xtrapol8_{0}".format( timestamp ) + output_dir = os.path.join(pair_root, xtrapol8_dir) + + run_phil = write_run_phil( + dark_mtz, + raw_mtz, + pdb_file, + cif_files, + output_dir, + pair_root, + dmin=dmin + ) + + # -------------------------------------------------- + # Write shell script + # -------------------------------------------------- + script_path = create_xtrapol8_shell_script(pair_root) + + # -------------------------------------------------- + # Execute Xtrapol8 via shell + # -------------------------------------------------- + + result = subprocess.run( + ["bash", script_path], + capture_output=True, + text=True + ) + + if result.returncode != 0: + print("Xtrapol8 failed:") + print(result.stderr) + +def analyze_folder(folder_path, dmin_cutoff=None): + + paper_folder = os.path.basename(folder_path) + results = [] + + # -------------------------------------------------- + # Find all MTZ files recursively + # -------------------------------------------------- + mtz_files = [] + for root, dirs, files in os.walk(folder_path): + for f in files: + if f.lower().endswith(".mtz"): + mtz_files.append(os.path.join(root, f)) + + if not mtz_files: + return pd.DataFrame() + + # -------------------------------------------------- + # Group by replicate (explicit dict) + # -------------------------------------------------- + grouped = {} + + for f in mtz_files: + name = os.path.basename(f).lower() + rep = extract_replicate(name) + + if rep not in grouped: + grouped[rep] = {"dark": [], "raw": []} + + if "dark" in name: + grouped[rep]["dark"].append(f) + elif "raw" in name: + grouped[rep]["raw"].append(f) + + # -------------------------------------------------- + # Loop through replicate groups + # -------------------------------------------------- + df = pd.DataFrame() + + for rep in sorted(grouped.keys()): + + dark_files = grouped[rep]["dark"] + raw_files = grouped[rep]["raw"] + + if not dark_files or not raw_files: + continue + + for dark_file in dark_files: + + print(f"Working on dark file: {dark_file}") + + # Determine PDB path from dark file + folder = os.path.dirname(dark_file) + pdb_code = os.path.basename(dark_file)[1:5].upper() + pdb_path = os.path.join(folder, f"{pdb_code}.pdb") + + if not os.path.exists(pdb_path): + print("Missing PDB:", pdb_path) + continue + + for raw_file in raw_files: + + print(f" Pairing with raw file: {raw_file}") + + pair_root = build_pair_root(dark_file, raw_file, pdb_path) + + successful_dir = find_successful_run(pair_root) + + if successful_dir is None: + print(f" running Xtrapol8") + run_xtrapol8_pair( + dark_file, + raw_file, + pdb_path, + dmin_cutoff, + data_root="pdb_data" + ) + successful_dir = find_successful_run(pair_root) + + if successful_dir is None: + print("Pair failed") + return None + + # -------------------------------------------------- + # Parse log + # -------------------------------------------------- + ( + Riso, + CCiso, + alpha_max, + alpha_pearson, + high_res, + Rwork, + Rfree, + top_pos_int, + top_neg_int + ) = parse_xtrapol8_outputs(successful_dir) + + if alpha_max is not None: + print( + f" Overall Riso : {Riso}\n" + f" Overall CCiso : {CCiso}\n" + f" Alpha (maximization): {alpha_max} 1/b : {1/alpha_max} \n" + f" Alpha (Pearson CC) : {alpha_pearson} 1/b : {1/alpha_pearson} \n" + f" High resolution : {high_res}\n" + f" Rwork : {Rwork}\n" + f" Rfree : {Rfree}\n" + f" Top +ve peak : {top_pos_int}\n" + f" Top -ve peak : {top_neg_int}\n" + ) + + results = [{ + "paper_folder": paper_folder, + "replicate": rep, + "dark_file": os.path.basename(dark_file), + "raw_file": os.path.basename(raw_file), + "Riso": Riso, + "CCiso": CCiso, + "alpha_max": alpha_max, + "alpha_person": alpha_pearson, + "high_res": high_res, + "Rwork": Rwork, + "Rfree": Rfree, + "top_pos_int": top_pos_int, + "top_neg_int": top_neg_int + }] + df_1 = pd.DataFrame( results ) + df = pd.concat( ( df, df_1 ) ) + + return df + +root = "pdb_data" +results_df = pd.DataFrame() + +for sub in os.listdir(root): + path = os.path.join(root, sub) + if os.path.isdir(path): + df = analyze_folder(path, dmin_cutoff=None) + + results_df = pd.concat( ( results_df, df ) ) + +print( results_df ) +results_df.to_csv( "xtrapol8_analysis.csv", sep="," ) \ No newline at end of file