#!/usr/bin/env python

# author J.Beale

"""
# aim
Automatically index, integrate, scale/merge and output MTZs for individual
SwissFEL runs.  Combines, hopefully intelligently, indexamajig, partialator,
f2mtz and truncate (via helper scripts) to produce complete MTZ files.

# usage
python auto_process.py

# output
in each run dir
- integrated .stream files + logs
- scaled/merged files
- an mtz file
- useful plots
- log file of output
"""

# modules
import sys
sys.path.insert( 1, "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/" )
import partialator as part

import errno
import glob
import os
import re   # stdlib re suffices; the original's third-party `regex` added nothing
import shutil
import subprocess

import numpy as np
import pandas as pd
from loguru import logger


def read_cell( cell_file ):
    """Parse a CrystFEL .cell file and return [ a, b, c, alpha, beta, gamma ].

    cell_file -- path to the .cell file

    Raises ValueError with a clear message if any of the six parameters is
    missing (the original raised an opaque NameError).
    """

    def get_parameter( line, parameter ):
        # general parameter search, e.g. "a = 79.10 A"
        pattern = r"{0}\s=\s(\d+\.\d+)".format( parameter )
        return float( re.search( pattern, line ).group(1) )

    # (name, line prefix) pairs in the order they appear in the output list
    wanted = [ ( "a", "a " ), ( "b", "b " ), ( "c", "c " ),
               ( "alpha", "al " ), ( "beta", "be " ), ( "gamma", "ga " ) ]

    params = {}
    # with-block guarantees the handle is closed (original leaked it)
    with open( cell_file, "r" ) as cell:
        for line in cell:
            for name, prefix in wanted:
                if line.startswith( prefix ):
                    params[ name ] = get_parameter( line, prefix.strip() )

    missing = [ name for name, _ in wanted if name not in params ]
    if missing:
        raise ValueError( "cell file {0} missing parameters: {1}".format( cell_file, missing ) )

    return [ params[ name ] for name, _ in wanted ]


def cat_lst_files( script_dir, run_no, endstation, pgroup, jfj, label, output ):
    """Create the .lst file for one run by driving the cat_lst.py helper script."""

    cat_lst = "{0}/cat_lst.py".format( script_dir )

    submit_cmd = [ "python", cat_lst,
                   "{0}".format( run_no ),
                   "--endstation={0}".format( endstation ),
                   "--pgroup={0}".format( pgroup ),
                   "--jfj={0}".format( jfj ),
                   "--label={0}".format( label ),
                   "--output={0}".format( output ),
                   ]

    subprocess.run( submit_cmd )


def run_index( script_dir, name, lst_file, chunk_size, geom_file, cell_file,
               indexer, peakfinder, integrator, tolerance, threshold,
               min_snr, int_radius, multi, retry, min_pix,
               bg_rad, min_res, reservation
               ):
    """Submit an indexamajig job via crystfel_split_var.py.

    reservation -- optional batch reservation; the flag is appended only
                   when set (original duplicated the whole command list in
                   two nearly identical branches)

    Returns the expected stream file name, "<name>.stream".
    """

    indexamajig = "{0}/crystfel_split_var.py".format( script_dir )

    submit_cmd = [ "python", indexamajig,
                   "--job_name={0}".format( name ),
                   "--lst_file={0}".format( lst_file ),
                   "--chunk_size={0}".format( chunk_size ),
                   "--geom_file={0}".format( geom_file ),
                   "--cell_file={0}".format( cell_file ),
                   "--indexer={0}".format( indexer ),
                   "--peakfinder={0}".format( peakfinder ),
                   "--integrator={0}".format( integrator ),
                   "--tolerance={0}".format( tolerance ),
                   "--threshold={0}".format( threshold ),
                   "--min_snr={0}".format( min_snr ),
                   "--int_radius={0}".format( int_radius ),
                   "--multi={0}".format( multi ),
                   "--retry={0}".format( retry ),
                   "--min_pix={0}".format( min_pix ),
                   "--bg_rad={0}".format( bg_rad ),
                   "--min_res={0}".format( min_res ),
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )

    subprocess.run( submit_cmd )

    # make stream name
    stream_file = "{0}.stream".format( name )

    return stream_file


def run_partialator( script_dir, name, stream_file, pointgroup, model, iterations,
                     cell_file, bins, resolution, max_adu, reservation
                     ):
    """Submit a partialator scale/merge job via the partialator.py helper.

    reservation -- optional batch reservation, appended only when set
    """

    partialator = "{0}/partialator.py".format( script_dir )

    submit_cmd = [ "python", partialator,
                   "--name={0}".format( name ),
                   "--stream_file={0}".format( stream_file ),
                   "--pointgroup={0}".format( pointgroup ),
                   "--model={0}".format( model ),
                   "--iterations={0}".format( iterations ),
                   "--cell_file={0}".format( cell_file ),
                   "--bins={0}".format( bins ),
                   "--resolution={0}".format( resolution ),
                   "--max_adu={0}".format( max_adu ),
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )

    subprocess.run( submit_cmd )


def scrub_cc( part_dir_name ):
    """Return the resolution at CC1/2 = 0.3 scrubbed from "<part_dir_name>.log".

    Raises AttributeError when the pattern is absent -- the caller relies on
    this to fall back to a default resolution.
    """

    # with-block guarantees the handle is closed (original leaked it)
    with open( "{0}.log".format( part_dir_name ) ) as part_log:
        log_text = part_log.read()

    # regex example = resolution at CC0.5 at 0.3 = 1.599
    pattern = r"resolution\sat\sCC0\.5\sat\s0\.3\s=\s(\d+\.\d+)"
    cc = re.search( pattern, log_text ).group(1)

    return float( cc )


def scrub_index( index_dir_name ):
    """Scrub indexing statistics from "<index_dir_name>.log" into a one-row DataFrame.

    Columns: images, indexed, then value/_err pairs for the resolution,
    observation count and the six mean cell parameters.
    """

    # with-block guarantees the handle is closed (original leaked it)
    with open( "{0}.log".format( index_dir_name ) ) as index_log_file:
        index_log = index_log_file.read()

    # regex example = images = 1000
    images = int( re.search( r"images\s=\s(\d+)", index_log ).group(1) )

    # regex example = crystals = 0
    indexed = int( re.search( r"crystals\s=\s(\d+)", index_log ).group(1) )

    def get_index_cell( parameter ):
        # general parameter search, e.g. "mean a = 79.10 +/- 0.15"
        pattern = r"mean\s{0}\s=\s(\d+\.\d+)\s\+/-\s(\d+\.\d+)".format( parameter )
        parameters = re.search( pattern, index_log )
        return float( parameters.group(1) ), float( parameters.group(2) )

    # (output column, log parameter name) pairs
    columns = [ ( "index_res", "resolution" ), ( "obs", "observations" ),
                ( "a", "a" ), ( "b", "b" ), ( "c", "c" ),
                ( "alpha", "alpha" ), ( "beta", "beta" ), ( "gamma", "gamma" ) ]

    # put results into a one-row df
    row = { "images" : images, "indexed" : indexed }
    for column, parameter in columns:
        value, error = get_index_cell( parameter )
        row[ column ] = value
        row[ "{0}_err".format( column ) ] = error

    stream_df = pd.DataFrame( [ row ] )

    return stream_df


def run_make_mtz( script_dir, hklin_file, project, crystal, dataset, cell, spacegroup, residues, res_range ):
    """Convert an .hkl file to mtz via the make_mtz.py helper (f2mtz + truncate).

    cell      -- six-element [ a, b, c, alpha, beta, gamma ] list
    res_range -- [ low, high ] resolution limits in Angstrom
    """

    make_mtz = "{0}/make_mtz.py".format( script_dir )

    submit_cmd = [ "python", make_mtz,
                   "--hklin={0}".format( hklin_file ),
                   "--project={0}".format( project ),
                   "--crystal={0}".format( crystal ),
                   "--dataset={0}".format( dataset ),
                   "--cell={0}".format( ",".join( str( p ) for p in cell ) ),
                   "--spacegroup={0}".format( spacegroup ),
                   "--residues={0}".format( residues ),
                   "--resolution_range={0},{1}".format( res_range[0], res_range[1] ),
                   ]
    subprocess.run( submit_cmd )


def scrub_prefix( endstation, pgroup, run_name ):
    """Return the sample prefix of the raw run directory, or np.nan if not found.

    regex example: /sf/cristallina/data/p22216/raw/run0001-hewl_det-calib_1
    yields "hewl_det-calib_1".
    """

    file_path = "/sf/{0}/data/{1}/raw/{2}-*/".format( endstation, pgroup, run_name )
    try:
        file_path = glob.glob( file_path )[0]
        pattern = r"/sf/{0}/data/{1}/raw/{2}-(.*)/".format( endstation, pgroup, run_name )
        prefix = re.search( pattern, file_path ).group(1)
    except ( IndexError, AttributeError ):
        # IndexError: glob matched nothing (original let this escape);
        # AttributeError: the regex did not match the path
        prefix = np.nan

    return prefix


def make_process_dir( proc_dir ):
    """Create proc_dir (and parents), tolerating an already-existing directory."""
    try:
        os.makedirs( proc_dir )
    except OSError as e:
        # EEXIST is fine (re-processing a run); anything else is fatal
        if e.errno != errno.EEXIST:
            logger.debug( "making directory error" )
            raise


def main( script_dir, cwd, runs,
          endstation, pgroup, jfj, label,
          chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
          min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
          pointgroup, model, iterations, bins, resolution, max_adu,
          project, crystal, spacegroup, residues
          ):
    """Process each run end-to-end: lst file -> indexamajig -> partialator -> mtz.

    Steps whose outputs already exist on disk (lst, stream, hkl) are skipped,
    so the script can be re-run safely after a partial failure.
    """

    # collate per-run results for the summary (the original created a df and
    # per-run `data` records but never joined or used them)
    records = []

    print( "beginning autoprocessing for runs {0}".format( runs ) )

    for run in runs:

        # make and enter the per-run processing directory
        run_name = "run{0}".format( str( run ).zfill(4) )
        print( "making run directory {0}".format( run_name ) )
        proc_dir = "{0}/{1}".format( cwd, run_name )
        make_process_dir( proc_dir )
        os.chdir( proc_dir )
        print( "done" )

        # create lst file unless a previous invocation already made it
        lst_name = "{0}_{1}.lst".format( run_name, label )
        if not os.path.isfile( "{0}/{1}".format( proc_dir, lst_name ) ):
            print( "making lst file" )
            cat_lst_files( script_dir, run, endstation, pgroup, jfj, label, run_name )
            print( "done" )
        else:
            print( "lst file already created" )

        # scrub run prefix from the raw directory name
        prefix = scrub_prefix( endstation, pgroup, run_name )

        # run crystfel split with variable, unless the stream already exists
        index_run_name = "{0}_index".format( run_name )
        if not os.path.isfile( "{0}/{1}.stream".format( proc_dir, index_run_name ) ):
            print( "running indexamajig" )
            stream_file = run_index( script_dir, index_run_name, lst_name, chunk_size, geom_file, cell_file,
                                     indexer, peakfinder, integrator, tolerance, threshold,
                                     min_snr, int_radius, multi, retry, min_pix,
                                     bg_rad, min_res, reservation
                                     )
            print( "done" )
        else:
            print( "indexamajig already run" )
            stream_file = "{0}.stream".format( index_run_name )

        # scrub stream log
        stream_df = scrub_index( index_run_name )
        indexed = stream_df.indexed[0]

        # run partialator only when there are enough indexed crystals
        if indexed > 100:
            print( "indexed {0} - now run partialator and make mtz".format( indexed ) )

            part_run_name = "{0}_merge".format( run_name )
            hkl_file = "{0}.hkl".format( part_run_name )
            if not os.path.isfile( "{0}/{1}/{2}".format( proc_dir, part_run_name, hkl_file ) ):
                run_partialator( script_dir, part_run_name, stream_file, pointgroup, model, iterations,
                                 cell_file, bins, resolution, max_adu, reservation
                                 )
            else:
                print( "partialator has run" )

            # get resolution at CC1/2 0.3 from the log; fall back to 3.0 A when
            # absent, and clamp optimistic estimates to 1.2 A
            try:
                cc = scrub_cc( part_run_name )
                print( "scrubbed resolution = {0}".format( cc ) )
                if cc < 1.2:
                    print( "predicted resolution too high. reset to 1.2 A" )
                    cc = 1.2
            except AttributeError:
                cc = 3.0

            # re-run check_hkl / compare_hkl at the scrubbed resolution
            print( "running cc check/compare" )
            os.chdir( part_run_name )
            check_run_file = part.run_compare_check( "{0}/{1}".format( proc_dir, part_run_name ),
                                                     part_run_name, cell_file, bins, cc )
            subprocess.call( [ "{0}".format( check_run_file ) ] )
            print( "done" )

            # make mtz and move the results up into the run directory
            print( "making final mtz" )
            cell = read_cell( cell_file )
            mtz_file = "{0}/{1}/{1}.mtz".format( proc_dir, part_run_name )
            F_file = "{0}/{1}/{1}_F.mtz".format( proc_dir, part_run_name )
            res_range = [ 50.0, cc ]
            run_make_mtz( script_dir, hkl_file, project, crystal, run_name, cell, spacegroup, residues, res_range )
            for made_file in ( mtz_file, F_file ):
                try:
                    shutil.move( made_file, proc_dir )
                except ( shutil.Error, FileNotFoundError ):
                    # best effort: the file may be missing or already moved
                    # (original swallowed shutil.Error silently and crashed
                    # on a missing source file)
                    logger.debug( "could not move {0}".format( made_file ) )
            print( "done {0}".format( run_name ) )

        else:
            print( "partialator not run. < 100 indexed" )

        records.append( { "run" : run,
                          "prefix" : prefix,
                          } )

        # move back to cwd for the next run
        os.chdir( cwd )

    # summary of all processed runs
    df = pd.DataFrame( records )
    logger.info( "processed runs:\n{0}".format( df ) )

    print( "finished all!" )


if __name__ == "__main__":
    # entry-point guard: importing this module must not submit jobs

    # ---- site / experiment configuration --------------------------------
    script_dir = "/sf/cristallina/data/p22216/work/processing/bach"
    cwd = os.getcwd()
    runs = [ 11 ]
    endstation = "cristallina"
    pgroup = "p22216"
    jfj = True          # (was assigned twice in the original)
    label = "off"       # (was assigned twice in the original)

    # ---- indexing parameters ---------------------------------------------
    geom_file = "{0}/{1}".format( cwd, "JF17T16V01_cf10_shift.geom" )
    chunk_size = 250
    cell_file = "/sf/cristallina/data/p22216/work/processing/bach/p1.cell"
    indexer = "xgandalf"
    peakfinder = "peakfinder8"
    integrator = "rings-nocen-nograd"
    tolerance = "10.0,10.0,10.0,2.0,3.0,2.0"
    threshold = 20
    min_snr = 5
    int_radius = "2,4,5"
    multi = True
    retry = True
    min_pix = 2
    bg_rad = 4
    min_res = 85
    reservation = None

    # ---- merging parameters ----------------------------------------------
    pointgroup = 1
    model = "unity"
    iterations = 1
    bins = 20
    resolution = 2.0
    max_adu = 12000

    # ---- mtz parameters ----------------------------------------------------
    project = "Cr_fragment"
    crystal = "bach"
    spacegroup = "P1"
    residues = 124

    # log INFO and above to auto_process.log
    logfile = "{0}.log".format( "auto_process" )
    logger.remove()
    logger.add( logfile, format="{message}", level="INFO" )

    main( script_dir, cwd, runs,
          endstation, pgroup, jfj, label,
          chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
          min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
          pointgroup, model, iterations, bins, resolution, max_adu,
          project, crystal, spacegroup, residues
          )