#!/usr/bin/python
# author J.Beale
"""
# aim
automatically index, integrate, scale/merge and output mtzs for individual SwissFEL runs
combines, hopefully intelligently, indexamajig, partialator, f2mtz and truncate to produce complete mtz files
# usage
python auto_process.py
# output
in run dir
- integrated .stream files + logs
- scaled/merged files
- an mtz file
- useful plots
- log file of output
"""
# modules
import sys
sys.path.insert( 1, "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/" )
import partialator as part
import subprocess
import os, errno
import shutil
import pandas as pd
import numpy as np
import glob
import argparse
import regex as re
from loguru import logger
def read_cell( cell_file ):
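    """Read the six unit cell parameters from a CrystFEL .cell file.

    Expects CrystFEL-style lines such as 'a = 79.10 A' and 'al = 90.00 deg';
    returns [ a, b, c, alpha, beta, gamma ] as floats.
    """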
    # nested helper: search a line for a single named parameter
    def get_parameter( line, parameter ):
        pattern = r"{0}\s=\s(\d+\.\d+)".format( parameter )
        value = re.search( pattern, line ).group(1)
        return value
    # loop through the file and collect the six cell parameters
    with open( cell_file, "r" ) as cell_handle:
        for line in cell_handle:
            if line.startswith( "a " ):
                a = float( get_parameter( line, "a" ) )
            if line.startswith( "b " ):
                b = float( get_parameter( line, "b" ) )
            if line.startswith( "c " ):
                c = float( get_parameter( line, "c" ) )
            if line.startswith( "al " ):
                alpha = float( get_parameter( line, "al" ) )
            if line.startswith( "be " ):
                beta = float( get_parameter( line, "be" ) )
            if line.startswith( "ga " ):
                gamma = float( get_parameter( line, "ga" ) )
    cell = [ a, b, c, alpha, beta, gamma ]
    return cell
def cat_lst_files( script_dir, run_no, endstation, pgroup, jfj, label, output ):
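    """Wrap the cat_lst.py helper to write a single .lst file of images for a run."""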
cat_lst = "{0}/cat_lst.py".format( script_dir )
submit_cmd = [ "python", cat_lst,
"{0}".format( run_no ),
"--endstation={0}".format( endstation ),
"--pgroup={0}".format( pgroup ),
"--jfj={0}".format( jfj ),
"--label={0}".format( label ),
"--output={0}".format( output ),
]
subprocess.run( submit_cmd )
def run_index( script_dir, name, lst_file, chunk_size, geom_file, cell_file,
indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix,
bg_rad, min_res, reservation
):
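    """Submit indexamajig jobs via the crystfel_split_var.py helper and return the
    name of the .stream file it is expected to produce."""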
indexamajig = "{0}/crystfel_split_var.py".format( script_dir )
    # build the command once and append the reservation flag only when set
    submit_cmd = [ "python", indexamajig,
                   "--job_name={0}".format( name ),
                   "--lst_file={0}".format( lst_file ),
                   "--chunk_size={0}".format( chunk_size ),
                   "--geom_file={0}".format( geom_file ),
                   "--cell_file={0}".format( cell_file ),
                   "--indexer={0}".format( indexer ),
                   "--peakfinder={0}".format( peakfinder ),
                   "--integrator={0}".format( integrator ),
                   "--tolerance={0}".format( tolerance ),
                   "--threshold={0}".format( threshold ),
                   "--min_snr={0}".format( min_snr ),
                   "--int_radius={0}".format( int_radius ),
                   "--multi={0}".format( multi ),
                   "--retry={0}".format( retry ),
                   "--min_pix={0}".format( min_pix ),
                   "--bg_rad={0}".format( bg_rad ),
                   "--min_res={0}".format( min_res )
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )
subprocess.run( submit_cmd )
# make stream name
stream_file = "{0}.stream".format( name )
return stream_file
def run_partialator( script_dir, name, stream_file, pointgroup, model, iterations,
cell_file, bins, resolution, max_adu, reservation
):
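    """Submit scaling/merging via the partialator.py helper, forwarding the
    reservation flag only when one is set."""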
    partialator = "{0}/partialator.py".format( script_dir )
    # build the command once and append the reservation flag only when set
    submit_cmd = [ "python", partialator,
                   "--name={0}".format( name ),
                   "--stream_file={0}".format( stream_file ),
                   "--pointgroup={0}".format( pointgroup ),
                   "--model={0}".format( model ),
                   "--iterations={0}".format( iterations ),
                   "--cell_file={0}".format( cell_file ),
                   "--bins={0}".format( bins ),
                   "--resolution={0}".format( resolution ),
                   "--max_adu={0}".format( max_adu )
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )
subprocess.run( submit_cmd )
def scrub_cc( part_dir_name ):
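    """Scrub the resolution (in A) at which CC0.5 falls to 0.3 from the partialator log."""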
    # regex example = resolution at CC0.5 at 0.3 = 1.599
    pattern = r"resolution\sat\sCC0\.5\sat\s0\.3\s=\s(\d+\.\d+)"
    with open( "{0}.log".format( part_dir_name ) ) as part_log:
        cc = re.search( pattern, part_log.read() ).group(1)
    return float( cc )
def scrub_index( index_dir_name ):
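    """Scrub image/crystal counts and mean cell statistics from the indexamajig
    log into a single-row DataFrame."""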
    # open the indexamajig log file
    index_log_file = open( "{0}.log".format( index_dir_name ) )
    index_log = index_log_file.read()
    # regex example = images = 0
    images_pattern = r"images?\s=\s(\d+)"
    images = re.search( images_pattern, index_log ).group(1)
    # regex example = crystals = 0
    xtals_pattern = r"crystals\s=\s(\d+)"
    indexed = re.search( xtals_pattern, index_log ).group(1)
def get_index_cell( index_log, parameter ):
# general parameter search
pattern = r"mean\s{0}\s=\s(\d+\.\d+)\s\+/-\s(\d+\.\d+)".format( parameter )
parameters = re.search( pattern, index_log )
value, error = parameters.group(1), parameters.group(2)
return value, error
ind_res, ind_res_err = get_index_cell( index_log, "resolution" )
obs, obs_err = get_index_cell( index_log, "observations" )
a, a_err = get_index_cell( index_log, "a" )
b, b_err = get_index_cell( index_log, "b" )
c, c_err = get_index_cell( index_log, "c" )
alpha, alpha_err = get_index_cell( index_log, "alpha" )
beta, beta_err = get_index_cell( index_log, "beta" )
gamma, gamma_err = get_index_cell( index_log, "gamma" )
# put results into df
per_index = round( int( indexed )/int( images )*100, 2 )
data = [ { "images" : int( images ), "indexed" : int( indexed ), "indexed%" : per_index,
"index_res" : float( ind_res ), "index_res_err" : float( ind_res_err ),
"obs" : float( obs ), "obs_err" : float( obs_err ),
"a" : float( a ), "a_err" : float( a_err ),
"b" : float( b ), "b_err" : float( b_err ),
"c" : float( c ), "c_err": float( c_err ),
"alpha" : float( alpha ), "alpha_err" : float( alpha_err ),
"beta" : float( beta ), "beta_err" : float( beta_err ),
"gamma" : float( gamma ), "gamma_err" : float( gamma_err )
} ]
stream_df = pd.DataFrame( data )
index_log_file.close()
return stream_df
def run_make_mtz( script_dir, hklin_file, project, crystal, dataset, cell, spacegroup, residues, res_range ):
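    """Wrap the make_mtz.py helper (f2mtz/truncate) to turn an .hkl file into MTZ files."""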
make_mtz = "{0}/make_mtz.py".format( script_dir )
submit_cmd = [ "python", make_mtz,
"--hklin={0}".format( hklin_file ),
"--project={0}".format( project ),
"--crystal={0}".format( crystal ),
"--dataset={0}".format( dataset ),
"--cell={0},{1},{2},{3},{4},{5}".format( cell[0], cell[1], cell[2], cell[3], cell[4], cell[5] ),
"--spacegroup={0}".format( spacegroup ),
"--residues={0}".format( residues ),
"--resolution_range={0},{1}".format( res_range[0], res_range[1] )
]
subprocess.run( submit_cmd )
def scrub_prefix( endstation, pgroup, run_name ):
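    """Recover the user-supplied suffix (e.g. 'hewl_det-calib_1') from the raw run
    directory name; returns np.nan when no matching directory is found."""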
# regex example = /sf/cristallina/data/p22216/raw/run0001-hewl_det-calib_1
file_path = "/sf/{0}/data/{1}/raw/{2}-*/".format( endstation, pgroup, run_name )
try:
file_path = glob.glob( file_path )[0]
pattern = r"/sf/{0}/data/{1}/raw/{2}-(.*)/".format( endstation, pgroup, run_name )
prefix = re.search( pattern, file_path ).group(1)
    except ( IndexError, AttributeError ):
        # no matching raw directory, or the prefix pattern did not match
        prefix = np.nan
return prefix
def make_process_dir( proc_dir ):
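    """Create the processing directory, tolerating the case where it already exists."""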
# make process directory
try:
os.makedirs( proc_dir )
except OSError as e:
if e.errno != errno.EEXIST:
logger.debug( "making directory error" )
raise
def main( script_dir, cwd, runs,
endstation, pgroup, jfj, label,
chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
pointgroup, model, iterations, bins, resolution, max_adu,
project, crystal, spacegroup, residues
):
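    """Process each run end to end: build the .lst file, index with indexamajig,
    scale/merge with partialator (only if more than 250 crystals indexed), make
    MTZ files, and collate per-run statistics into auto_process_summary.csv."""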
# collate results for summary csv
df = pd.DataFrame()
print( "beginning autoprocessing for runs {0}".format( runs ) )
for run in runs:
# make process dir
run_name = "run{0}".format( str( run ).zfill(4) )
print( "making run directory {0}".format( run_name ) )
proc_dir = "{0}/{1}".format( cwd, run_name )
make_process_dir( proc_dir )
# move to proc dir
os.chdir( proc_dir )
print( "done" )
# create lst file
# make lst output name
lst_name = "{0}_{1}.lst".format( run_name, label )
if not os.path.isfile( "{0}/{1}".format( proc_dir, lst_name ) ):
print( "makeing lst file" )
cat_lst_files( script_dir, run, endstation, pgroup, jfj, label, run_name )
print( "done" )
else:
print( "lst file already created" )
# scrub run prefix
prefix = scrub_prefix( endstation, pgroup, run_name )
# run crystfel split with variable
index_run_name = "{0}_index".format( run_name )
if not os.path.isfile( "{0}/{1}.stream".format( proc_dir, index_run_name ) ):
print( "running indexamajig" )
stream_file = run_index( script_dir, index_run_name, lst_name, chunk_size, geom_file, cell_file,
indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix,
bg_rad, min_res, reservation
)
print( "done" )
else:
print( "indexamajig already run" )
stream_file = "{0}.stream".format( index_run_name )
# scrub stream log
stream_df = scrub_index( index_run_name )
indexed = stream_df.indexed[0]
# run partialator
partialator_limit = 250
part_run_name = "{0}_merge".format( run_name )
hkl_file = "{0}.hkl".format( part_run_name )
if indexed > partialator_limit:
print( "indexed {0} - now run partialator and make mtz".format( indexed ) )
if not os.path.isfile( "{0}/{1}/{2}".format( proc_dir, part_run_name, hkl_file ) ):
run_partialator( script_dir, part_run_name, stream_file, pointgroup, model, iterations,
cell_file, bins, resolution, max_adu, reservation
)
else:
print( "partialator has run" )
# get resolution at cc 0.5
# scrub log file
try:
cc = scrub_cc( part_run_name )
print( "scrubbed resolution = {0}".format( cc ) )
                if cc < 1.2:
                    print( "predicted resolution better than 1.2 A - capping at 1.2 A" )
                    cc = 1.2
            except AttributeError:
                # no CC0.5 line found in the log - fall back to 3.0 A
                cc = 3.0
# re-run check hkl
print( "running cc check/compare" )
# move to partialator directory
os.chdir( part_run_name )
check_run_file = part.run_compare_check( "{0}/{1}".format( proc_dir, part_run_name ), part_run_name, cell_file, bins, cc )
submit_cmd = [ "{0}".format( check_run_file ) ]
subprocess.call( submit_cmd )
print( "done" )
            # get partialator metrics, falling back to NaN when a value cannot be scrubbed
            try:
                overcc = part.get_overall_cc()
            except Exception:
                overcc = np.nan
            try:
                overrsplit = part.get_overall_rsplit()
            except Exception:
                overrsplit = np.nan
            try:
                b_factor = part.get_b()
            except AttributeError:
                b_factor = np.nan
# make mtz
print( "making final mtz" )
cell = read_cell( cell_file )
mtz_file = "{0}/{1}/{1}.mtz".format( proc_dir, part_run_name )
F_file = "{0}/{1}/{1}_F.mtz".format( proc_dir, part_run_name )
res_range = [ 50.0, cc ]
run_make_mtz( script_dir, hkl_file, project, crystal, run_name, cell, spacegroup, residues, res_range )
try:
shutil.move( mtz_file, proc_dir )
except shutil.Error as e:
pass
try:
shutil.move( F_file, proc_dir )
except shutil.Error as e:
pass
# move back to top directory
os.chdir( proc_dir )
else:
print( "partialator not run. {0} > indexed".format( partialator_limit ) )
cc = np.nan
overcc = np.nan
overrsplit = np.nan
b_factor = np.nan
# collate meta data
run_data = [ { "run" : run,
"prefix" : prefix
} ]
run_df = pd.DataFrame( run_data )
part_data = [ { "overall_cc" : overcc,
"cc" : cc,
"overall_rsplit" : overrsplit,
"overall_b" : b_factor,
} ]
part_df = pd.DataFrame( part_data )
df_1 = pd.concat( [ run_df, stream_df, part_df ], axis=1 )
df = pd.concat( [ df, df_1 ] )
print( "done {0}".format( run_name ) )
# move back to cwd
os.chdir( cwd )
# output stats table
print( df )
df = df.reset_index( drop=True )
df.to_csv( "auto_process_summary.csv", sep="," )
print( "finished all" )
#script_dir = "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/"
script_dir = "/sf/cristallina/data/p22216/work/processing/bach"
cwd = os.getcwd()
runs = [ 11, 12, 13, 14, 23, 24, 25, 26, 59, 60, 61, 63, 64, 65,
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
113, 114, 115, 116, 117, 118, 119, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138 ]
#runs = [ 11, 12 ]
endstation = "cristallina"
pgroup = "p22216"
jfj = True
label = "off"
geom_file = "{0}/{1}".format( cwd, "JF17T16V01_cf10_shift.geom" )
chunk_size = 250
cell_file = "/sf/cristallina/data/p22216/work/processing/bach/p1.cell"
indexer = "xgandalf"
peakfinder = "peakfinder8"
integrator = "rings-nocen-nograd"
tolerance = "10.0,10.0,10.0,2.0,3.0,2.0"
threshold = 20
min_snr = 5
int_radius = "2,4,5"
multi = True
retry = True
min_pix = 2
bg_rad = 4
min_res = 85
reservation = None
pointgroup = 1
model = "unity"
iterations = 1
bins = 20
resolution = 2.0
max_adu = 12000
project = "Cr_fragment"
crystal = "bach"
spacegroup = "P1"
residues = 124
logfile = "auto_process.log"
logger.remove()
logger.add( logfile, format="{message}", level="INFO")
main( script_dir, cwd, runs,
endstation, pgroup, jfj, label,
chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
pointgroup, model, iterations, bins, resolution, max_adu,
project, crystal, spacegroup, residues
)