#!/usr/bin/env python

# author J.Beale

"""
# aim
Automatically index, integrate, scale/merge and output MTZs for individual
SwissFEL runs.  Combines, hopefully intelligently, indexamajig, partialator,
f2mtz and truncate (via helper scripts) to produce complete MTZ files.

# usage
python auto_process.py

# output
in each run dir
- integrated .stream files + logs
- scaled/merged files
- an mtz file
- useful plots
- log file of output
"""

# modules
import sys
sys.path.insert( 1, "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/" )
import partialator as part

import errno
import glob
import os
import re   # stdlib re suffices; the original's third-party `regex` added nothing
import shutil
import subprocess

import numpy as np
import pandas as pd
from loguru import logger


def read_cell( cell_file ):
    """Parse a CrystFEL .cell file and return [ a, b, c, alpha, beta, gamma ].

    cell_file -- path to the .cell file

    Raises ValueError with a clear message if any of the six parameters is
    missing (the original raised an opaque NameError).
    """

    def get_parameter( line, parameter ):
        # general parameter search, e.g. "a = 79.10 A"
        pattern = r"{0}\s=\s(\d+\.\d+)".format( parameter )
        return float( re.search( pattern, line ).group(1) )

    # (name, line prefix) pairs in the order they appear in the output list
    wanted = [ ( "a", "a " ), ( "b", "b " ), ( "c", "c " ),
               ( "alpha", "al " ), ( "beta", "be " ), ( "gamma", "ga " ) ]

    params = {}
    # with-block guarantees the handle is closed (original leaked it)
    with open( cell_file, "r" ) as cell:
        for line in cell:
            for name, prefix in wanted:
                if line.startswith( prefix ):
                    params[ name ] = get_parameter( line, prefix.strip() )

    missing = [ name for name, _ in wanted if name not in params ]
    if missing:
        raise ValueError( "cell file {0} missing parameters: {1}".format( cell_file, missing ) )

    return [ params[ name ] for name, _ in wanted ]


def cat_lst_files( script_dir, run_no, endstation, pgroup, jfj, label, output ):
    """Create the .lst file for one run by driving the cat_lst.py helper script."""

    cat_lst = "{0}/cat_lst.py".format( script_dir )

    submit_cmd = [ "python", cat_lst,
                   "{0}".format( run_no ),
                   "--endstation={0}".format( endstation ),
                   "--pgroup={0}".format( pgroup ),
                   "--jfj={0}".format( jfj ),
                   "--label={0}".format( label ),
                   "--output={0}".format( output ),
                   ]

    subprocess.run( submit_cmd )


def run_index( script_dir, name, lst_file, chunk_size, geom_file, cell_file,
               indexer, peakfinder, integrator, tolerance, threshold,
               min_snr, int_radius, multi, retry, min_pix,
               bg_rad, min_res, reservation
               ):
    """Submit an indexamajig job via crystfel_split_var.py.

    reservation -- optional batch reservation; the flag is appended only
                   when set (original duplicated the whole command list in
                   two nearly identical branches)

    Returns the expected stream file name, "<name>.stream".
    """

    indexamajig = "{0}/crystfel_split_var.py".format( script_dir )

    submit_cmd = [ "python", indexamajig,
                   "--job_name={0}".format( name ),
                   "--lst_file={0}".format( lst_file ),
                   "--chunk_size={0}".format( chunk_size ),
                   "--geom_file={0}".format( geom_file ),
                   "--cell_file={0}".format( cell_file ),
                   "--indexer={0}".format( indexer ),
                   "--peakfinder={0}".format( peakfinder ),
                   "--integrator={0}".format( integrator ),
                   "--tolerance={0}".format( tolerance ),
                   "--threshold={0}".format( threshold ),
                   "--min_snr={0}".format( min_snr ),
                   "--int_radius={0}".format( int_radius ),
                   "--multi={0}".format( multi ),
                   "--retry={0}".format( retry ),
                   "--min_pix={0}".format( min_pix ),
                   "--bg_rad={0}".format( bg_rad ),
                   "--min_res={0}".format( min_res ),
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )

    subprocess.run( submit_cmd )

    # make stream name
    stream_file = "{0}.stream".format( name )

    return stream_file


def run_partialator( script_dir, name, stream_file, pointgroup, model, iterations,
                     cell_file, bins, resolution, max_adu, reservation
                     ):
    """Submit a partialator scale/merge job via the partialator.py helper.

    reservation -- optional batch reservation, appended only when set
    """

    partialator = "{0}/partialator.py".format( script_dir )

    submit_cmd = [ "python", partialator,
                   "--name={0}".format( name ),
                   "--stream_file={0}".format( stream_file ),
                   "--pointgroup={0}".format( pointgroup ),
                   "--model={0}".format( model ),
                   "--iterations={0}".format( iterations ),
                   "--cell_file={0}".format( cell_file ),
                   "--bins={0}".format( bins ),
                   "--resolution={0}".format( resolution ),
                   "--max_adu={0}".format( max_adu ),
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )

    subprocess.run( submit_cmd )


def scrub_cc( part_dir_name ):
    """Return the resolution at CC1/2 = 0.3 scrubbed from "<part_dir_name>.log".

    Raises AttributeError when the pattern is absent -- the caller relies on
    this to fall back to a default resolution.
    """

    # with-block guarantees the handle is closed (original leaked it)
    with open( "{0}.log".format( part_dir_name ) ) as part_log:
        log_text = part_log.read()

    # regex example = resolution at CC0.5 at 0.3 = 1.599
    pattern = r"resolution\sat\sCC0\.5\sat\s0\.3\s=\s(\d+\.\d+)"
    cc = re.search( pattern, log_text ).group(1)

    return float( cc )


def scrub_index( index_dir_name ):
    """Scrub indexing statistics from "<index_dir_name>.log" into a one-row DataFrame.

    Columns: images, indexed, then value/_err pairs for the resolution,
    observation count and the six mean cell parameters.
    """

    # with-block guarantees the handle is closed (original leaked it)
    with open( "{0}.log".format( index_dir_name ) ) as index_log_file:
        index_log = index_log_file.read()

    # regex example = images = 1000
    images = int( re.search( r"images\s=\s(\d+)", index_log ).group(1) )

    # regex example = crystals = 0
    indexed = int( re.search( r"crystals\s=\s(\d+)", index_log ).group(1) )

    def get_index_cell( parameter ):
        # general parameter search, e.g. "mean a = 79.10 +/- 0.15"
        pattern = r"mean\s{0}\s=\s(\d+\.\d+)\s\+/-\s(\d+\.\d+)".format( parameter )
        parameters = re.search( pattern, index_log )
        return float( parameters.group(1) ), float( parameters.group(2) )

    # (output column, log parameter name) pairs
    columns = [ ( "index_res", "resolution" ), ( "obs", "observations" ),
                ( "a", "a" ), ( "b", "b" ), ( "c", "c" ),
                ( "alpha", "alpha" ), ( "beta", "beta" ), ( "gamma", "gamma" ) ]

    # put results into a one-row df
    row = { "images" : images, "indexed" : indexed }
    for column, parameter in columns:
        value, error = get_index_cell( parameter )
        row[ column ] = value
        row[ "{0}_err".format( column ) ] = error

    stream_df = pd.DataFrame( [ row ] )

    return stream_df


def run_make_mtz( script_dir, hklin_file, project, crystal, dataset, cell, spacegroup, residues, res_range ):
    """Convert an .hkl file to mtz via the make_mtz.py helper (f2mtz + truncate).

    cell      -- six-element [ a, b, c, alpha, beta, gamma ] list
    res_range -- [ low, high ] resolution limits in Angstrom
    """

    make_mtz = "{0}/make_mtz.py".format( script_dir )

    submit_cmd = [ "python", make_mtz,
                   "--hklin={0}".format( hklin_file ),
                   "--project={0}".format( project ),
                   "--crystal={0}".format( crystal ),
                   "--dataset={0}".format( dataset ),
                   "--cell={0}".format( ",".join( str( p ) for p in cell ) ),
                   "--spacegroup={0}".format( spacegroup ),
                   "--residues={0}".format( residues ),
                   "--resolution_range={0},{1}".format( res_range[0], res_range[1] ),
                   ]
    subprocess.run( submit_cmd )


def scrub_prefix( endstation, pgroup, run_name ):
    """Return the sample prefix of the raw run directory, or np.nan if not found.

    regex example: /sf/cristallina/data/p22216/raw/run0001-hewl_det-calib_1
    yields "hewl_det-calib_1".
    """

    file_path = "/sf/{0}/data/{1}/raw/{2}-*/".format( endstation, pgroup, run_name )
    try:
        file_path = glob.glob( file_path )[0]
        pattern = r"/sf/{0}/data/{1}/raw/{2}-(.*)/".format( endstation, pgroup, run_name )
        prefix = re.search( pattern, file_path ).group(1)
    except ( IndexError, AttributeError ):
        # IndexError: glob matched nothing (original let this escape);
        # AttributeError: the regex did not match the path
        prefix = np.nan

    return prefix


def make_process_dir( proc_dir ):
    """Create proc_dir (and parents), tolerating an already-existing directory."""
    try:
        os.makedirs( proc_dir )
    except OSError as e:
        # EEXIST is fine (re-processing a run); anything else is fatal
        if e.errno != errno.EEXIST:
            logger.debug( "making directory error" )
            raise


def main( script_dir, cwd, runs,
          endstation, pgroup, jfj, label,
          chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
          min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
          pointgroup, model, iterations, bins, resolution, max_adu,
          project, crystal, spacegroup, residues
          ):
    """Process each run end-to-end: lst file -> indexamajig -> partialator -> mtz.

    Steps whose outputs already exist on disk (lst, stream, hkl) are skipped,
    so the script can be re-run safely after a partial failure.
    """

    # collate per-run results for the summary (the original created a df and
    # per-run `data` records but never joined or used them)
    records = []

    print( "beginning autoprocessing for runs {0}".format( runs ) )

    for run in runs:

        # make and enter the per-run processing directory
        run_name = "run{0}".format( str( run ).zfill(4) )
        print( "making run directory {0}".format( run_name ) )
        proc_dir = "{0}/{1}".format( cwd, run_name )
        make_process_dir( proc_dir )
        os.chdir( proc_dir )
        print( "done" )

        # create lst file unless a previous invocation already made it
        lst_name = "{0}_{1}.lst".format( run_name, label )
        if not os.path.isfile( "{0}/{1}".format( proc_dir, lst_name ) ):
            print( "making lst file" )
            cat_lst_files( script_dir, run, endstation, pgroup, jfj, label, run_name )
            print( "done" )
        else:
            print( "lst file already created" )

        # scrub run prefix from the raw directory name
        prefix = scrub_prefix( endstation, pgroup, run_name )

        # run crystfel split with variable, unless the stream already exists
        index_run_name = "{0}_index".format( run_name )
        if not os.path.isfile( "{0}/{1}.stream".format( proc_dir, index_run_name ) ):
            print( "running indexamajig" )
            stream_file = run_index( script_dir, index_run_name, lst_name, chunk_size, geom_file, cell_file,
                                     indexer, peakfinder, integrator, tolerance, threshold,
                                     min_snr, int_radius, multi, retry, min_pix,
                                     bg_rad, min_res, reservation
                                     )
            print( "done" )
        else:
            print( "indexamajig already run" )
            stream_file = "{0}.stream".format( index_run_name )

        # scrub stream log
        stream_df = scrub_index( index_run_name )
        indexed = stream_df.indexed[0]

        # run partialator only when there are enough indexed crystals
        if indexed > 100:
            print( "indexed {0} - now run partialator and make mtz".format( indexed ) )

            part_run_name = "{0}_merge".format( run_name )
            hkl_file = "{0}.hkl".format( part_run_name )
            if not os.path.isfile( "{0}/{1}/{2}".format( proc_dir, part_run_name, hkl_file ) ):
                run_partialator( script_dir, part_run_name, stream_file, pointgroup, model, iterations,
                                 cell_file, bins, resolution, max_adu, reservation
                                 )
            else:
                print( "partialator has run" )

            # get resolution at CC1/2 0.3 from the log; fall back to 3.0 A when
            # absent, and clamp optimistic estimates to 1.2 A
            try:
                cc = scrub_cc( part_run_name )
                print( "scrubbed resolution = {0}".format( cc ) )
                if cc < 1.2:
                    print( "predicted resolution too high. reset to 1.2 A" )
                    cc = 1.2
            except AttributeError:
                cc = 3.0

            # re-run check_hkl / compare_hkl at the scrubbed resolution
            print( "running cc check/compare" )
            os.chdir( part_run_name )
            check_run_file = part.run_compare_check( "{0}/{1}".format( proc_dir, part_run_name ),
                                                     part_run_name, cell_file, bins, cc )
            subprocess.call( [ "{0}".format( check_run_file ) ] )
            print( "done" )

            # make mtz and move the results up into the run directory
            print( "making final mtz" )
            cell = read_cell( cell_file )
            mtz_file = "{0}/{1}/{1}.mtz".format( proc_dir, part_run_name )
            F_file = "{0}/{1}/{1}_F.mtz".format( proc_dir, part_run_name )
            res_range = [ 50.0, cc ]
            run_make_mtz( script_dir, hkl_file, project, crystal, run_name, cell, spacegroup, residues, res_range )
            for made_file in ( mtz_file, F_file ):
                try:
                    shutil.move( made_file, proc_dir )
                except ( shutil.Error, FileNotFoundError ):
                    # best effort: the file may be missing or already moved
                    # (original swallowed shutil.Error silently and crashed
                    # on a missing source file)
                    logger.debug( "could not move {0}".format( made_file ) )
            print( "done {0}".format( run_name ) )

        else:
            print( "partialator not run. < 100 indexed" )

        records.append( { "run" : run,
                          "prefix" : prefix,
                          } )

        # move back to cwd for the next run
        os.chdir( cwd )

    # summary of all processed runs
    df = pd.DataFrame( records )
    logger.info( "processed runs:\n{0}".format( df ) )

    print( "finished all!" )


if __name__ == "__main__":
    # entry-point guard: importing this module must not submit jobs

    # ---- site / experiment configuration --------------------------------
    script_dir = "/sf/cristallina/data/p22216/work/processing/bach"
    cwd = os.getcwd()
    runs = [ 11 ]
    endstation = "cristallina"
    pgroup = "p22216"
    jfj = True          # (was assigned twice in the original)
    label = "off"       # (was assigned twice in the original)

    # ---- indexing parameters ---------------------------------------------
    geom_file = "{0}/{1}".format( cwd, "JF17T16V01_cf10_shift.geom" )
    chunk_size = 250
    cell_file = "/sf/cristallina/data/p22216/work/processing/bach/p1.cell"
    indexer = "xgandalf"
    peakfinder = "peakfinder8"
    integrator = "rings-nocen-nograd"
    tolerance = "10.0,10.0,10.0,2.0,3.0,2.0"
    threshold = 20
    min_snr = 5
    int_radius = "2,4,5"
    multi = True
    retry = True
    min_pix = 2
    bg_rad = 4
    min_res = 85
    reservation = None

    # ---- merging parameters ----------------------------------------------
    pointgroup = 1
    model = "unity"
    iterations = 1
    bins = 20
    resolution = 2.0
    max_adu = 12000

    # ---- mtz parameters ----------------------------------------------------
    project = "Cr_fragment"
    crystal = "bach"
    spacegroup = "P1"
    residues = 124

    # log INFO and above to auto_process.log
    logfile = "{0}.log".format( "auto_process" )
    logger.remove()
    logger.add( logfile, format="{message}", level="INFO" )

    main( script_dir, cwd, runs,
          endstation, pgroup, jfj, label,
          chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
          min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
          pointgroup, model, iterations, bins, resolution, max_adu,
          project, crystal, spacegroup, residues
          )