486 lines
18 KiB
Python
486 lines
18 KiB
Python
#!/usr/bin/python
|
|
|
|
# author J.Beale
|
|
|
|
"""
|
|
# aim
|
|
automatically index, integrate, scale/merge and output mtzs for individual SwissFEL runs
|
|
combines, hopefully intelligently, indexamajig, partialator, f2mtz and truncate to produce complete mtz files
|
|
|
|
# usage
|
|
python auto_process.py
|
|
|
|
# output
|
|
in run dir
|
|
- integrated .stream files + logs
|
|
- scaled/merged files
|
|
- an mtz file
|
|
- useful plots
|
|
- log file of output
|
|
"""
|
|
|
|
# modules
|
|
import sys
|
|
sys.path.insert( 1, "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/" )
|
|
import partialator as part
|
|
import subprocess
|
|
import os, errno
|
|
import shutil
|
|
import pandas as pd
|
|
import numpy as np
|
|
import glob
|
|
import argparse
|
|
import regex as re
|
|
from loguru import logger
|
|
|
|
def read_cell( cell_file ):
    """
    Read a CrystFEL unit-cell file and return the cell constants.

    Parameters
    ----------
    cell_file : str
        path to a CrystFEL .cell file with lines like "a = 79.10 A"

    Returns
    -------
    list of float
        [ a, b, c, alpha, beta, gamma ]

    Raises
    ------
    ValueError
        if a cell line cannot be parsed or any parameter is absent
        (previously these surfaced as confusing AttributeError/NameError)
    """

    # function to get cell parameter
    def get_parameter( line, parameter ):

        # general parameter search, e.g. "a = 79.10 A" -> 79.10
        pattern = r"{0}\s=\s(\d+\.\d+)".format( parameter )
        match = re.search( pattern, line )
        if match is None:
            raise ValueError( "could not parse cell parameter '{0}' from line: {1}".format( parameter, line ) )
        return float( match.group(1) )

    # line prefixes / parameter tags, in output order
    tags = ( "a", "b", "c", "al", "be", "ga" )
    values = {}

    # loop through file and get parameters; with-block guarantees the
    # handle is closed (the original leaked it)
    with open( cell_file, "r" ) as cell:
        for line in cell:
            for tag in tags:
                if line.startswith( "{0} ".format( tag ) ):
                    values[ tag ] = get_parameter( line, tag )

    missing = [ tag for tag in tags if tag not in values ]
    if missing:
        raise ValueError( "cell file {0} is missing parameters: {1}".format( cell_file, missing ) )

    return [ values[ tag ] for tag in tags ]
|
|
|
|
def cat_lst_files( script_dir, run_no, endstation, pgroup, jfj, label, output ):
    """
    Build the per-run .lst file by running the cat_lst.py helper.

    Parameters
    ----------
    script_dir : str   directory containing cat_lst.py
    run_no     : int   run number (passed as the positional argument)
    endstation : str   e.g. "cristallina"
    pgroup     : str   proposal group, e.g. "p22216"
    jfj        : bool  forwarded as --jfj
    label      : str   forwarded as --label
    output     : str   forwarded as --output (lst file stem)

    Side effects: launches a blocking subprocess; cat_lst.py writes the
    .lst file into the current working directory.
    """

    cat_lst = "{0}/cat_lst.py".format( script_dir )

    # sys.executable guarantees the helper runs under the same interpreter
    # as this script (a bare "python" may be absent or a different version)
    submit_cmd = [ sys.executable, cat_lst,
                   "{0}".format( run_no ),
                   "--endstation={0}".format( endstation ),
                   "--pgroup={0}".format( pgroup ),
                   "--jfj={0}".format( jfj ),
                   "--label={0}".format( label ),
                   "--output={0}".format( output ),
                   ]

    subprocess.run( submit_cmd )
|
|
|
|
def run_index( script_dir, name, lst_file, chunk_size, geom_file, cell_file,
               indexer, peakfinder, integrator, tolerance, threshold,
               min_snr, int_radius, multi, retry, min_pix,
               bg_rad, min_res, reservation
               ):
    """
    Run indexamajig through the crystfel_split_var.py wrapper.

    All parameters are forwarded verbatim as --flag=value options;
    `reservation` (SLURM reservation name or None/falsy) is appended
    only when set.

    Returns
    -------
    str
        the expected output stream file name, "<name>.stream"

    Side effects: launches a blocking subprocess which writes the stream
    and log files into the current working directory.
    """

    indexamajig = "{0}/crystfel_split_var.py".format( script_dir )

    # build the argument list once (the original duplicated the whole
    # 17-flag list in two branches differing only by --reservation)
    submit_cmd = [ sys.executable, indexamajig,
                   "--job_name={0}".format( name ),
                   "--lst_file={0}".format( lst_file ),
                   "--chunk_size={0}".format( chunk_size ),
                   "--geom_file={0}".format( geom_file ),
                   "--cell_file={0}".format( cell_file ),
                   "--indexer={0}".format( indexer ),
                   "--peakfinder={0}".format( peakfinder ),
                   "--integrator={0}".format( integrator ),
                   "--tolerance={0}".format( tolerance ),
                   "--threshold={0}".format( threshold ),
                   "--min_snr={0}".format( min_snr ),
                   "--int_radius={0}".format( int_radius ),
                   "--multi={0}".format( multi ),
                   "--retry={0}".format( retry ),
                   "--min_pix={0}".format( min_pix ),
                   "--bg_rad={0}".format( bg_rad ),
                   "--min_res={0}".format( min_res )
                   ]

    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )

    subprocess.run( submit_cmd )

    # make stream name
    stream_file = "{0}.stream".format( name )

    return stream_file
|
|
|
|
def run_partialator( script_dir, name, stream_file, pointgroup, model, iterations,
                     cell_file, bins, resolution, max_adu, reservation
                     ):
    """
    Scale/merge a stream file by running the partialator.py wrapper.

    All parameters are forwarded verbatim as --flag=value options;
    `reservation` (SLURM reservation name or None/falsy) is appended
    only when set.

    Side effects: launches a blocking subprocess which writes the merged
    hkl files and logs under a directory named after `name`.
    """

    partialator = "{0}/partialator.py".format( script_dir )

    # build the argument list once (the original duplicated the whole
    # flag list in two branches differing only by --reservation)
    submit_cmd = [ sys.executable, partialator,
                   "--name={0}".format( name ),
                   "--stream_file={0}".format( stream_file ),
                   "--pointgroup={0}".format( pointgroup ),
                   "--model={0}".format( model ),
                   "--iterations={0}".format( iterations ),
                   "--cell_file={0}".format( cell_file ),
                   "--bins={0}".format( bins ),
                   "--resolution={0}".format( resolution ),
                   "--max_adu={0}".format( max_adu )
                   ]

    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )

    subprocess.run( submit_cmd )
|
|
|
|
def scrub_cc( part_dir_name ):
    """
    Scrape the resolution at CC1/2 = 0.3 from "<part_dir_name>.log".

    Returns
    -------
    float
        the resolution in Angstroms

    Raises
    ------
    AttributeError
        if the log contains no matching line (re.search returns None);
        the caller relies on catching this to fall back to a default
    """

    # read the partialator log; with-block guarantees the handle is
    # closed (the original leaked it)
    with open( "{0}.log".format( part_dir_name ) ) as part_log:
        log_text = part_log.read()

    # regex example = resolution at CC0.5 at 0.3 = 1.599
    # dots are now escaped so e.g. "1a599" can no longer match
    pattern = r"resolution\sat\sCC0\.5\sat\s0\.3\s=\s(\d+\.\d+)"
    cc = re.search( pattern, log_text ).group(1)

    return float( cc )
|
|
|
|
def scrub_index( index_dir_name ):
    """
    Scrape indexing statistics from "<index_dir_name>.log".

    Parses image/crystal counts and the mean cell parameters (value and
    +/- error) printed by the indexing wrapper.

    Returns
    -------
    pandas.DataFrame
        a one-row frame with columns: images, indexed, indexed%,
        index_res(_err), obs(_err), a/b/c/alpha/beta/gamma (_err)

    Raises
    ------
    AttributeError
        if any expected line is absent from the log
    """

    # read the whole log; with-block guarantees the handle is closed
    # even when a regex below fails to match
    with open( "{0}.log".format( index_dir_name ) ) as index_log_file:
        index_log = index_log_file.read()

    # regex example = images = 1000
    images_pattern = r"images?\s=\s(\d+)"
    images = re.search( images_pattern, index_log ).group(1)

    # regex example = crystals = 0
    xtals_pattern = r"crystals\s=\s(\d+)"
    indexed = re.search( xtals_pattern, index_log ).group(1)

    def get_index_cell( index_log, parameter ):

        # general "mean <parameter> = <value> +/- <error>" search
        pattern = r"mean\s{0}\s=\s(\d+\.\d+)\s\+/-\s(\d+\.\d+)".format( parameter )
        parameters = re.search( pattern, index_log )
        value, error = parameters.group(1), parameters.group(2)
        return value, error

    ind_res, ind_res_err = get_index_cell( index_log, "resolution" )
    obs, obs_err = get_index_cell( index_log, "observations" )
    a, a_err = get_index_cell( index_log, "a" )
    b, b_err = get_index_cell( index_log, "b" )
    c, c_err = get_index_cell( index_log, "c" )
    alpha, alpha_err = get_index_cell( index_log, "alpha" )
    beta, beta_err = get_index_cell( index_log, "beta" )
    gamma, gamma_err = get_index_cell( index_log, "gamma" )

    # indexing rate in percent; a run with zero images previously
    # crashed with ZeroDivisionError -- report 0.0 instead
    if int( images ):
        per_index = round( int( indexed )/int( images )*100, 2 )
    else:
        per_index = 0.0

    # put results into df
    data = [ { "images" : int( images ), "indexed" : int( indexed ), "indexed%" : per_index,
               "index_res" : float( ind_res ), "index_res_err" : float( ind_res_err ),
               "obs" : float( obs ), "obs_err" : float( obs_err ),
               "a" : float( a ), "a_err" : float( a_err ),
               "b" : float( b ), "b_err" : float( b_err ),
               "c" : float( c ), "c_err": float( c_err ),
               "alpha" : float( alpha ), "alpha_err" : float( alpha_err ),
               "beta" : float( beta ), "beta_err" : float( beta_err ),
               "gamma" : float( gamma ), "gamma_err" : float( gamma_err )
               } ]

    stream_df = pd.DataFrame( data )

    return stream_df
|
|
|
|
def run_make_mtz( script_dir, hklin_file, project, crystal, dataset, cell, spacegroup, residues, res_range ):
    """
    Convert a merged hkl file to mtz by running the make_mtz.py helper
    (which wraps f2mtz/truncate).

    Parameters
    ----------
    script_dir : str   directory containing make_mtz.py
    hklin_file : str   input hkl file
    project, crystal, dataset : str   mtz hierarchy labels
    cell       : sequence of 6 floats  [a, b, c, alpha, beta, gamma]
    spacegroup : str   e.g. "P1"
    residues   : int   residue count for truncate
    res_range  : sequence of 2 floats  [low, high] resolution limits

    Side effects: launches a blocking subprocess that writes the mtz files.
    """

    make_mtz = "{0}/make_mtz.py".format( script_dir )

    # sys.executable guarantees the helper runs under the same interpreter
    # as this script (a bare "python" may be absent or a different version)
    submit_cmd = [ sys.executable, make_mtz,
                   "--hklin={0}".format( hklin_file ),
                   "--project={0}".format( project ),
                   "--crystal={0}".format( crystal ),
                   "--dataset={0}".format( dataset ),
                   "--cell={0},{1},{2},{3},{4},{5}".format( cell[0], cell[1], cell[2], cell[3], cell[4], cell[5] ),
                   "--spacegroup={0}".format( spacegroup ),
                   "--residues={0}".format( residues ),
                   "--resolution_range={0},{1}".format( res_range[0], res_range[1] )
                   ]

    subprocess.run( submit_cmd )
|
|
|
|
def scrub_prefix( endstation, pgroup, run_name ):
    """
    Extract the free-text suffix of a run's raw-data directory name.

    Example path: /sf/cristallina/data/p22216/raw/run0001-hewl_det-calib_1
    -> returns "hewl_det-calib_1".

    Returns
    -------
    str or float
        the prefix string, or numpy nan when no matching directory
        exists or the name does not fit the expected pattern
    """

    # regex example = /sf/cristallina/data/p22216/raw/run0001-hewl_det-calib_1
    file_path = "/sf/{0}/data/{1}/raw/{2}-*/".format( endstation, pgroup, run_name )

    try:
        file_path = glob.glob( file_path )[0]
        pattern = r"/sf/{0}/data/{1}/raw/{2}-(.*)/".format( endstation, pgroup, run_name )
        prefix = re.search( pattern, file_path ).group(1)

    # IndexError: glob matched nothing (the original only caught
    # AttributeError, so a missing directory crashed the whole run);
    # AttributeError: the directory name did not match the pattern
    except ( AttributeError, IndexError ) as e:
        prefix = np.nan

    return prefix
|
|
|
|
def make_process_dir( proc_dir ):
    """Create the per-run processing directory; an existing directory is fine."""

    try:
        os.makedirs( proc_dir )
    except OSError as e:
        # tolerate "already exists" so re-runs are idempotent
        if e.errno == errno.EEXIST:
            return
        # anything else (permissions, bad path, ...) is logged and fatal
        logger.debug( "making directory error" )
        raise
|
|
|
|
def main( script_dir, cwd, runs,
          endstation, pgroup, jfj, label,
          chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
          min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
          pointgroup, model, iterations, bins, resolution, max_adu,
          project, crystal, spacegroup, residues
          ):
    """
    Auto-process a list of SwissFEL runs end to end.

    For each run number in `runs` this: creates a per-run processing
    directory under `cwd`, builds a .lst file of images, runs
    indexamajig (via the crystfel_split_var.py wrapper), scrapes the
    indexing log, and -- when more than 250 crystals were indexed --
    runs partialator, the CC check/compare step and mtz generation.
    A one-row summary per run is collated and written to
    auto_process_summary.csv.

    The many parameters mirror the CLI options of the wrapped helper
    scripts; see run_index() / run_partialator() / run_make_mtz().

    Side effects: changes the process working directory while running,
    launches subprocesses, writes files and directories under `cwd`.
    Existing outputs are detected and the corresponding step skipped,
    so re-runs are incremental.
    """

    # collate results for summary csv
    df = pd.DataFrame()

    print( "beginning autoprocessing for runs {0}".format( runs ) )

    for run in runs:

        # make process dir, e.g. run0011
        run_name = "run{0}".format( str( run ).zfill(4) )
        print( "making run directory {0}".format( run_name ) )
        proc_dir = "{0}/{1}".format( cwd, run_name )
        make_process_dir( proc_dir )

        # move to proc dir -- the scrapers below open log files by
        # relative path, so cwd must be the run directory from here on
        os.chdir( proc_dir )
        print( "done" )

        # create lst file
        # make lst output name; skip if a previous invocation made it
        lst_name = "{0}_{1}.lst".format( run_name, label )
        if not os.path.isfile( "{0}/{1}".format( proc_dir, lst_name ) ):
            print( "makeing lst file" )
            cat_lst_files( script_dir, run, endstation, pgroup, jfj, label, run_name )
            print( "done" )
        else:
            print( "lst file already created" )

        # scrub run prefix (free-text part of the raw directory name;
        # nan when the raw directory is absent)
        prefix = scrub_prefix( endstation, pgroup, run_name )

        # run crystfel split with variable; skip if the stream exists
        index_run_name = "{0}_index".format( run_name )
        if not os.path.isfile( "{0}/{1}.stream".format( proc_dir, index_run_name ) ):
            print( "running indexamajig" )
            stream_file = run_index( script_dir, index_run_name, lst_name, chunk_size, geom_file, cell_file,
                                     indexer, peakfinder, integrator, tolerance, threshold,
                                     min_snr, int_radius, multi, retry, min_pix,
                                     bg_rad, min_res, reservation
                                     )
            print( "done" )
        else:
            print( "indexamajig already run" )
            stream_file = "{0}.stream".format( index_run_name )

        # scrub stream log into a one-row DataFrame of indexing stats
        stream_df = scrub_index( index_run_name )
        indexed = stream_df.indexed[0]

        # run partialator only when enough crystals indexed to merge
        partialator_limit = 250
        part_run_name = "{0}_merge".format( run_name )
        hkl_file = "{0}.hkl".format( part_run_name )
        if indexed > partialator_limit:

            print( "indexed {0} - now run partialator and make mtz".format( indexed ) )

            # skip merging if the hkl output already exists
            if not os.path.isfile( "{0}/{1}/{2}".format( proc_dir, part_run_name, hkl_file ) ):
                run_partialator( script_dir, part_run_name, stream_file, pointgroup, model, iterations,
                                 cell_file, bins, resolution, max_adu, reservation
                                 )
            else:
                print( "partialator has run" )

            # get resolution at cc 0.5
            # scrub log file; AttributeError means the line was absent,
            # in which case fall back to a conservative 3.0 A cutoff
            try:
                cc = scrub_cc( part_run_name )
                print( "scrubbed resolution = {0}".format( cc ) )
                # clamp over-optimistic estimates to 1.2 A
                if cc < 1.2:
                    print( "predicted resolution too high. reset to 1.2 A" )
                    cc = 1.2
            except AttributeError as e:
                cc = 3.0

            # re-run check hkl
            print( "running cc check/compare" )
            # move to partialator directory (relative chdir from proc_dir)
            os.chdir( part_run_name )
            check_run_file = part.run_compare_check( "{0}/{1}".format( proc_dir, part_run_name ), part_run_name, cell_file, bins, cc )
            submit_cmd = [ "{0}".format( check_run_file ) ]
            subprocess.call( submit_cmd )
            print( "done" )

            # get partialator metrics, defaulting to NaN when absent.
            # NOTE(review): the first two bare excepts swallow every
            # exception, not just parse failures -- presumably a
            # deliberate best-effort scrape; confirm before tightening
            try:
                overcc = part.get_overall_cc()
            except:
                overcc = np.nan
            try:
                overrsplit = part.get_overall_rsplit()
            except:
                overrsplit = np.nan
            try:
                b_factor = part.get_b()
            except AttributeError as e:
                b_factor = np.nan

            # make mtz, cutting resolution at the scrubbed CC1/2 value
            print( "making final mtz" )
            cell = read_cell( cell_file )
            mtz_file = "{0}/{1}/{1}.mtz".format( proc_dir, part_run_name )
            F_file = "{0}/{1}/{1}_F.mtz".format( proc_dir, part_run_name )
            res_range = [ 50.0, cc ]
            run_make_mtz( script_dir, hkl_file, project, crystal, run_name, cell, spacegroup, residues, res_range )
            # lift the mtzs up into the run directory; shutil.Error
            # (e.g. destination exists on a re-run) is non-fatal
            try:
                shutil.move( mtz_file, proc_dir )
            except shutil.Error as e:
                pass
            try:
                shutil.move( F_file, proc_dir )
            except shutil.Error as e:
                pass

            # move back to top directory
            os.chdir( proc_dir )

        else:
            # not enough crystals: record NaN merging metrics
            print( "partialator not run. {0} > indexed".format( partialator_limit ) )
            cc = np.nan
            overcc = np.nan
            overrsplit = np.nan
            b_factor = np.nan

        # collate meta data into this run's summary row
        run_data = [ { "run" : run,
                       "prefix" : prefix
                       } ]
        run_df = pd.DataFrame( run_data )
        part_data = [ { "overall_cc" : overcc,
                        "cc" : cc,
                        "overall_rsplit" : overrsplit,
                        "overall_b" : b_factor,
                        } ]
        part_df = pd.DataFrame( part_data )
        df_1 = pd.concat( [ run_df, stream_df, part_df ], axis=1 )
        df = pd.concat( [ df, df_1 ] )

        print( "done {0}".format( run_name ) )

        # move back to cwd
        os.chdir( cwd )

    # output stats table
    print( df )
    df = df.reset_index( drop=True )
    df.to_csv( "auto_process_summary.csv", sep="," )

    print( "finished all" )
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# beamtime configuration
# ---------------------------------------------------------------------------

# directory holding the helper scripts (cat_lst.py, crystfel_split_var.py, ...)
#script_dir = "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/"
script_dir = "/sf/cristallina/data/p22216/work/processing/bach"

cwd = os.getcwd()

# run numbers to process
runs = [ 11, 12, 13, 14, 23, 24, 25, 26, 59, 60, 61, 63, 64, 65,
         67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
         83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
         100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
         113, 114, 115, 116, 117, 118, 119, 122, 123, 124, 125, 126, 127,
         128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138 ]
#runs = [ 11, 12 ]

# data source (jfj/label were previously assigned twice with the same
# values; the duplicates have been removed)
endstation = "cristallina"
pgroup = "p22216"
jfj = True
label = "off"

# indexing parameters
geom_file = "{0}/{1}".format( cwd, "JF17T16V01_cf10_shift.geom" )
chunk_size = 250
cell_file = "/sf/cristallina/data/p22216/work/processing/bach/p1.cell"
indexer = "xgandalf"
peakfinder = "peakfinder8"
integrator = "rings-nocen-nograd"
tolerance = "10.0,10.0,10.0,2.0,3.0,2.0"
threshold = 20
min_snr = 5
int_radius = "2,4,5"
multi = True
retry = True
min_pix = 2
bg_rad = 4
min_res = 85
reservation = None

# merging (partialator) parameters
pointgroup = 1
model = "unity"
iterations = 1
bins = 20
resolution = 2.0
max_adu = 12000

# mtz parameters
project = "Cr_fragment"
crystal = "bach"
spacegroup = "P1"
residues = 124


if __name__ == "__main__":

    # log only the bare message text to auto_process.log
    logfile = "{0}.log".format( "auto_process" )
    logger.remove()
    logger.add( logfile, format="{message}", level="INFO")

    main( script_dir, cwd, runs,
          endstation, pgroup, jfj, label,
          chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
          min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
          pointgroup, model, iterations, bins, resolution, max_adu,
          project, crystal, spacegroup, residues
          )
|
|
|