534 lines
16 KiB
Python
534 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
|
|
# authors T.Mason and J.Beale
|
|
|
|
"""
|
|
# aim
|
|
to refine the detector distance using crystfel
|
|
- naming convention = #.###/#.###.stream
|
|
|
|
# usage
|
|
python detector-distance-refinement.py -l <path to lst file generated by daq>
|
|
-g <path to geom file>
|
|
-d central clen to refine around
|
|
-c cell_file
|
|
-s sample size
|
|
-e endstation - alvra or cristallina
|
|
|
|
# output
|
|
plot files of the analysis and a suggested value for the clen
|
|
"""
|
|
|
|
# modules
|
|
import pandas as pd
|
|
import subprocess
|
|
import os, errno
|
|
import regex as re
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import time
|
|
from tqdm import tqdm
|
|
import argparse
|
|
from scipy.optimize import curve_fit
|
|
from scipy.signal import peak_widths, find_peaks
|
|
|
|
def h5_sample( lst, sample ):
    """Create a random sample of images from a run list.

    Reads the daq-generated .lst file (rows like "file.h5 //image"),
    draws ``sample`` random rows when the list is longer than that,
    and writes the result to ``h5_<sample>_sample.lst`` in the cwd.

    Parameters
    ----------
    lst : str
        Path to the h5 list file generated by the daq.
    sample : int
        Number of images to keep.

    Returns
    -------
    str
        Name of the sample list file that was written.
    """

    # read h5.lst - the "\s//" separator strips the // prefix from the
    # image column; raw string so \s is a regex token, not an
    # (invalid) string escape
    cols = [ "h5", "image" ]
    sample_df = pd.read_csv( lst, sep=r"\s//", engine="python", names=cols )

    # take sample - if sample required
    if len( sample_df ) > sample:
        # take defined sample, then restore the original row order
        sample_df = sample_df.sample( sample )
        sample_df = sample_df.sort_index()

    # re-add // to image column
    sample_df[ "image" ] = "//" + sample_df.image.astype(str)

    # write sample to file
    sample_file = "h5_{0}_sample.lst".format( sample )
    sample_df.to_csv( sample_file, sep=" ", index=False, header=False )

    # return sample file name
    return sample_file
|
|
|
|
def geom_amend( lab6_geom_file, clen ):
    """Write a copy of the geometry file with its clen replaced.

    Parameters
    ----------
    lab6_geom_file : str
        Path to the reference (LaB6) geometry file.
    clen : float
        New camera length to substitute, e.g. 0.1217.

    Returns
    -------
    str
        Name of the new geom file, "<clen>.geom", written in the cwd.
    """

    # read the reference geom; the context manager guarantees the
    # handle is closed even if re.sub raises
    with open( lab6_geom_file, "r" ) as lab6_geom:
        # clen example => clen = 0.1217 ; raw string so \. and \d are
        # regex tokens rather than invalid string escapes
        clen_geom = re.sub( r"clen = 0\.\d+", "clen = {0}".format( clen ), lab6_geom.read() )

    # write new clen_geom to file
    clen_geom_file = "{0}.geom".format( clen )
    with open( clen_geom_file, "w" ) as geom:
        geom.write( clen_geom )

    # return clen_geom file name
    return clen_geom_file
|
|
|
|
def write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file, threshold ):
    """Write an executable indexamajig run script for one clen.

    Parameters
    ----------
    clen : float
        Camera length this run refines; names the script and stream.
    sample_h5_file : str
        Path to the sampled image list.
    clen_geom_file : str
        Geometry file with this clen substituted in.
    cell_file : str
        Crystal cell file passed via --pdb.
    threshold : int
        peakfinder8 threshold (endstation-dependent).

    Returns
    -------
    str
        Name of the run script, "<clen>_run.sh".
    """

    # crystfel file name
    cryst_run_file = "{0}_run.sh".format( clen )

    # write file - context manager closes it even on error
    with open( cryst_run_file, "w" ) as run_sh:
        run_sh.write( "#!/bin/sh\n\n" )
        run_sh.write( "module purge\n" )
        run_sh.write( "module load crystfel/0.11.1\n" )
        run_sh.write( "indexamajig -i {0} \\\n".format( sample_h5_file ) )
        run_sh.write( "            --output={0}.stream \\\n".format( clen ) )
        run_sh.write( "            --geometry={0}\\\n".format( clen_geom_file ) )
        run_sh.write( "            --pdb={0} \\\n".format( cell_file ) )
        run_sh.write( "            --indexing=mosflm --peaks=peakfinder8 \\\n" )
        run_sh.write( "            --threshold={0} --min-snr=5 --int-radius=3,5,9 \\\n".format( threshold ) )
        run_sh.write( "            -j 32 --no-multi --no-retry --max-res=3000 --min-pix-count=2 --min-res=85\n\n" )

    # make file executable (rwxr-xr-x) without spawning a chmod process
    os.chmod( cryst_run_file, 0o755 )

    # return crystfel file name
    return cryst_run_file
|
|
|
|
def make_sample( lst, sample, cwd=None ):
    """Create a sample list file and return its absolute path.

    Parameters
    ----------
    lst : str
        Path to the daq list file.
    sample : int
        Sample size.
    cwd : str, optional
        Directory the sample file is written in.  Defaults to the
        current working directory; the original implementation read a
        module-level ``cwd`` global, which raised NameError when this
        module was imported rather than run as a script.

    Returns
    -------
    str
        Path of the written sample list file.
    """
    if cwd is None:
        cwd = os.getcwd()

    # make sample list
    print("making {0} sample of images".format(sample))
    sample_h5 = h5_sample(lst, sample)
    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
    print("done")

    return sample_h5_file
|
|
|
|
def make_process_dir(proc_dir):
    """Create proc_dir (and any parents), tolerating an existing dir.

    ``exist_ok=True`` replaces the old try/except errno.EEXIST dance;
    any other OSError (permissions, bad path) still propagates.
    """
    os.makedirs( proc_dir, exist_ok=True )
|
|
|
|
def make_step_range(centre_clen, step_size, steps):
    """Build an array of clen values centred on centre_clen.

    Returns ``steps`` values spaced ``step_size`` apart, running from
    half the total span below the centre up to half the span above it.
    """
    print( "make clen array around {0}".format( centre_clen ) )

    span = step_size * steps
    start = centre_clen - span / 2
    stop = start + span

    clens = np.arange( start, stop, step_size )
    # round to 6 dp - otherwise np sometimes yields .99999999 instead of 1
    clens = clens.round( 6 )

    print( "done" )

    return clens
|
|
|
|
def submit_job( job_file ):
    """Submit job_file to slurm and return its numeric job id.

    Runs ``sbatch`` on the day partition with 32 cpus per task and
    parses the id out of slurm's "Submitted batch job <id>" reply.
    """
    sbatch_cmd = [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", job_file ]
    reply = subprocess.check_output( sbatch_cmd ).decode().strip()

    # scrub job id from - example Submitted batch job 742403
    match = re.search( r"Submitted batch job (\d+)", reply )
    job_id = match.group(1)

    return int( job_id )
|
|
|
|
def wait_for_jobs( job_ids, total_jobs ):
    """Block until every slurm job in job_ids has left the queue.

    Polls ``squeue`` every two seconds; a job whose squeue output is
    empty has finished.  ``job_ids`` is mutated in place as jobs
    complete, and a tqdm bar tracks progress against total_jobs.
    """
    with tqdm(total=total_jobs, desc="Jobs Completed", unit="job") as pbar:
        while job_ids:
            finished = set()
            for job_id in job_ids:
                queue_entry = subprocess.check_output(
                    ["squeue", "-h", "-j", str(job_id)]
                )
                # empty squeue output means the job is no longer queued/running
                if not queue_entry:
                    finished.add(job_id)
                    pbar.update(1)
            job_ids.difference_update(finished)
            time.sleep(2)
|
|
|
|
def scrub_clen( stream_pwd ):
    """Extract the clen encoded in a stream file's path.

    Example:
    /sf/.../coarse_scan/0.115/0.115.stream  ->  0.115

    Parameters
    ----------
    stream_pwd : str
        Full path of a .stream file.

    Returns
    -------
    float or int
        The clen, or 1 when the path does not follow the
        <clen>/<clen>.stream naming convention (fallback preserved
        from the original implementation).
    """
    # NOTE: the original guarded the success path with
    # "if AttributeError:" - always truthy - and relied on .group()
    # raising to reach the fallback; an explicit None check is clearer
    re_search = re.search( r"0\.\d+/(0\.\d+)\.stream", stream_pwd )

    if re_search is None:
        return 1

    return float( re_search.group( 1 ) )
|
|
|
|
def find_streams( top_dir ):
    """Walk top_dir and collect every .stream file with its clen.

    Parameters
    ----------
    top_dir : str
        Directory tree to search.

    Returns
    -------
    pandas.DataFrame
        Columns "stream_pwd" and "clen", sorted by clen, with a fresh
        integer index.  Empty (but with both columns) when no streams
        are found - the original empty DataFrame raised KeyError in
        sort_values in that case.
    """
    # collect rows first and build the frame once - the original
    # pd.concat inside the loop is quadratic in the number of streams
    rows = []

    # search for all files that end with .stream
    for path, dirs, files in os.walk( top_dir ):
        for name in files:
            if name.endswith( ".stream" ):
                stream_pwd = os.path.join( path, name )
                # scrub clen from stream path
                rows.append( { "stream_pwd" : stream_pwd,
                               "clen" : scrub_clen( stream_pwd ) } )

    stream_df = pd.DataFrame( rows, columns=[ "stream_pwd", "clen" ] )

    # sort df based on clen and reset the index
    stream_df = stream_df.sort_values( by="clen" )
    stream_df = stream_df.reset_index( drop=True )

    # return df of streams and clens
    return stream_df
|
|
|
|
def scrub_us( stream ):
    """Extract every unit cell from the text of a stream file.

    Example line:
    Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg

    Parameters
    ----------
    stream : str
        Full text of a CrystFEL stream file.

    Returns
    -------
    list of tuple of str
        One (a, b, c, alpha, beta, gamma) tuple per indexed crystal;
        empty list when nothing matched.
    """
    pattern = ( r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)"
                r"\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg" )

    # findall returns [] on no match and never raises AttributeError,
    # so the original try/except (and its always-true
    # "if AttributeError:") was dead code
    return re.findall( pattern, stream )
|
|
|
|
def scrub_helper( top_dir ):
    """Collect per-clen indexing statistics from all stream files.

    For every stream found under top_dir, parses the unit cells and
    records the indexed count plus the standard deviation of each
    cell parameter.

    Parameters
    ----------
    top_dir : str
        Directory tree containing <clen>/<clen>.stream results.

    Returns
    -------
    pandas.DataFrame
        One row per stream: clen, indexed, std_a, std_b, std_c,
        std_alpha, std_beta, std_gamma.
    """
    # find stream files from process directory
    print( "finding stream files" )
    stream_df = find_streams( top_dir )
    print( "done" )

    # loop through stream files and collect unit_cell information
    print( "looping through stream files to collect unit cell, indexed information" )

    cols = [ "a", "b", "c", "alpha", "beta", "gamma" ]
    stats_rows = []

    for index, row in stream_df.iterrows():

        stream_pwd, clen = row[ "stream_pwd" ], row[ "clen" ]

        # context manager - the original bare open() leaked the handle
        with open( stream_pwd, "r" ) as stream_fh:
            cells = scrub_us( stream_fh.read() )

        # put cells in df, as floats, to compute the statistics
        cells_df = pd.DataFrame( cells, columns=cols )
        cells_df = cells_df.astype( float )

        # one stats row per stream; building the frame once at the end
        # avoids the original quadratic pd.concat-in-a-loop
        stats_rows.append( { "clen" : clen,
                             "indexed" : len( cells_df ),
                             "std_a" : cells_df.a.std(),
                             "std_b" : cells_df.b.std(),
                             "std_c" : cells_df.c.std(),
                             "std_alpha" : cells_df.alpha.std(),
                             "std_beta" : cells_df.beta.std(),
                             "std_gamma" : cells_df.gamma.std(),
                             } )

    stats_df = pd.DataFrame( stats_rows )
    print( "done" )

    return stats_df
|
|
|
|
def find_clen_values( stats_df, scan ):
    """Suggest a refined clen from the per-clen scan statistics.

    Fits a gaussian to indexed-vs-clen, restricts the data to the
    FWHM of that fit, then averages the clens at which the alpha,
    beta and gamma standard deviations are minimal.

    Parameters
    ----------
    stats_df : pandas.DataFrame
        Output of scrub_helper: clen, indexed, std_* columns.
        NOTE(review): the ``iloc[..., 0]`` calls below assume "clen"
        is column 0 - confirm against scrub_helper's column order.
    scan : str
        Scan label, used only in the printed report.

    Returns
    -------
    float
        Suggested clen, rounded to 4 decimal places.
    """

    def find_min_clen(col_name):
        # clen at which stats_df[col_name] takes its smallest value
        # (first row wins on ties)
        min_val = stats_df[col_name].min()
        min_row = stats_df[stats_df[col_name] == min_val]
        min_clen = min_row['clen'].values[0]
        return min_val, min_clen

    def gauss(x, *p):
        # plain gaussian with no baseline offset
        A, mu, sigma = p
        return A * np.exp(-(x-mu)**2/(2.*sigma**2))

    # initial guess: amplitude, centre, width - presumably 0.111 m is
    # a typical clen for these endstations; verify for other setups
    p0 = [ 30, 0.111, 0.01 ]
    parameters, covariance = curve_fit( gauss, stats_df.clen, stats_df.indexed, p0=p0 )

    # Get the fitted curve
    stats_df[ "gaus" ] = gauss( stats_df.clen, *parameters)
    # find peak centre (indices into the fitted curve)
    peaks = find_peaks( stats_df.gaus.values )

    # find full peak width at half maximum, as fractional indices
    fwhm = peak_widths( stats_df.gaus.values, peaks[0], rel_height=0.5 )

    # round the fractional FWHM edges to usable row indices
    fwhm_str = int( round( fwhm[2][0], 0 ) )
    fwhm_end = int( round( fwhm[3][0], 0 ) )

    # translate width into motor values (column 0 assumed to be clen)
    indexed_start = stats_df.iloc[ fwhm_str, 0 ]
    indexed_end = stats_df.iloc[ fwhm_end, 0 ]
    mid_gauss = stats_df.clen.iloc[ peaks[0] ].values[0]

    # cut df to only include well-indexed patterns - strictly inside
    # the FWHM, so the edge rows themselves are excluded
    stats_df = stats_df[ ( stats_df.clen < indexed_end ) & ( stats_df.clen > indexed_start ) ]

    # calculate minimum values for each cell parameter's std dev
    min_alpha_val, min_alpha_clen = find_min_clen('std_alpha')
    min_beta_val, min_beta_clen = find_min_clen('std_beta')
    min_gamma_val, min_gamma_clen = find_min_clen('std_gamma')
    # NOTE(review): min_c_val / min_c_clen are computed but never used
    min_c_val, min_c_clen = find_min_clen('std_c')

    # suggested clen = mean of the three angle-minimising clens
    suggested_clen = (min_alpha_clen + min_beta_clen + min_gamma_clen )/3
    suggested_clen = round(suggested_clen, 4)

    print( "middle of indexing gaussion fit of {0} scan = {1}".format( scan, mid_gauss ) )
    print( "mean minimum of alpha, beta, gamma of {0} scan = {1}".format( scan, suggested_clen ) )

    return suggested_clen
|
|
|
|
def plot_indexed_std( stats_df, ax1, ax2 ):
    """Plot indexed counts (ax1, red) and a/b/c std devs (ax2, blues).

    ax2 is expected to be a twinx of ax1 so both share the clen axis.
    """

    # indexed images on the left axis
    red = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=red)
    ax1.plot(stats_df.clen, stats_df.indexed, color=red)
    ax1.tick_params(axis="y", labelcolor=red)

    # cell-edge standard deviations share the right axis
    blue = "tab:blue"
    ax2.set_ylabel("a,b,c st.deviation", color=blue)
    ax2.tick_params(axis='y', labelcolor=blue)

    # one shade of blue per cell edge; label is the edge name
    for column, shade in ( ("std_a", "turquoise"),
                           ("std_b", "deepskyblue"),
                           ("std_c", "royalblue") ):
        ax2.plot(stats_df.clen, stats_df[column], color=shade, label=column[4:])
|
|
|
|
def plot_indexed_std_alpha_beta_gamma( stats_df, ax1, ax2 ):
    """Plot indexed counts (ax1, red) and angle std devs (ax2, greens).

    ax2 is expected to be a twinx of ax1 so both share the clen axis.
    """

    # indexed images on the left axis
    red = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=red)
    ax1.plot(stats_df.clen, stats_df.indexed, color=red)
    ax1.tick_params(axis="y", labelcolor=red)

    # cell-angle standard deviations share the right axis
    green = "tab:green"
    ax2.set_ylabel("alpha, beta, gamma st.deviation", color=green)
    ax2.tick_params(axis='y', labelcolor=green)

    # one shade per cell angle; label is the angle name
    for column, shade in ( ("std_alpha", "yellow"),
                           ("std_beta", "green"),
                           ("std_gamma", "darkolivegreen") ):
        ax2.plot(stats_df.clen, stats_df[column], color=shade, label=column[4:])
|
|
|
|
def scan( cwd, lst, sample, lab6_geom_file, centre_clen, cell_file, threshold, step_size ):
    """Run a CrystFEL clen scan around centre_clen on slurm.

    For each clen step a directory <cwd>/scan/<clen>/ is created
    containing an amended geom file and an indexamajig run script,
    which is submitted to slurm; the call blocks until all jobs leave
    the queue.

    Parameters
    ----------
    cwd : str
        Top working directory.
    lst : str
        Path to the daq list file.
    sample : int
        Number of images to sample.
    lab6_geom_file : str
        Reference geometry file.
    centre_clen : float
        Clen to scan around.
    cell_file : str
        Crystal cell file.
    threshold : int
        peakfinder8 threshold.
    step_size : str
        "coarse" (30 steps of 0.5 mm) or "fine" (50 steps of 0.05 mm).

    Raises
    ------
    ValueError
        If step_size is neither "coarse" nor "fine".  The original
        pair of independent ifs silently fell through here and later
        crashed with a NameError on ``steps``.
    """
    # define coarse or fine scan - elif, so the numeric step_size
    # assigned in the first branch is never re-tested against "fine"
    if step_size == "coarse":
        steps = 30
        step_size = 0.0005  # m
    elif step_size == "fine":
        steps = 50
        step_size = 0.00005  # m
    else:
        raise ValueError( "step_size must be 'coarse' or 'fine', got {0!r}".format( step_size ) )

    # both scan types share one output directory tree (behaviour
    # preserved from the original, which set scan_name = "scan" twice)
    scan_name = "scan"

    # make sample list
    sample_h5_file = make_sample(lst, sample)

    # make list of clen steps above and below the central clen
    step_range = make_step_range(centre_clen, step_size, steps)

    # submitted job set and job_list
    submitted_job_ids = set()
    job_list = []

    # make directorys for results
    print( "begin CrystFEL anaylsis of different clens" )

    # loop to cycle through clen steps
    for clen in step_range:

        # define and create the per-clen process directory
        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name, clen )
        make_process_dir(proc_dir)

        # move to process directory
        os.chdir( proc_dir )

        # make geom file and crystfel run file for this clen
        clen_geom_file = geom_amend( lab6_geom_file, clen )
        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file, threshold )

        # submit crystfel run to slurm
        job_list.append( cryst_run_file )
        job_id = submit_job( cryst_run_file )
        submitted_job_ids.add( job_id )

        # move back to cwd
        os.chdir( cwd )

    # wait for jobs to complete
    wait_for_jobs(submitted_job_ids, len(job_list))
    print("slurm processing done")
|
|
|
|
def scrub_scan( scan_top_dir, scan ):
    """Analyse a finished scan: compute stats, plot, suggest a clen.

    Writes <scan>.png containing two plots (edge std devs and angle
    std devs, each against the indexed count) and returns the clen
    suggested by find_clen_values.
    """
    stats_df = scrub_helper(scan_top_dir)

    # calculate suggested clen from the fitted statistics
    suggested_clen = find_clen_values( stats_df, scan )

    # side-by-side plots, each sharing an indexed-count axis via twinx
    fig, (ax_left, ax_right) = plt.subplots(1, 2)
    ax_left_std = ax_left.twinx()
    ax_right_std = ax_right.twinx()

    plot_indexed_std(stats_df, ax_left, ax_left_std)
    plot_indexed_std_alpha_beta_gamma(stats_df, ax_right, ax_right_std)

    fig.legend(loc="upper center")
    fig.tight_layout()
    plt.savefig("{0}.png".format(scan))

    return suggested_clen
|
|
|
|
def main( cwd, lst, sample, geom, centre_clen, cell_file, threshold ):
    """Drive the two-stage (coarse, then fine) clen refinement.

    When <cwd>/scan already exists the slurm scans are skipped and
    only the analysis is re-run, so repeat invocations never resubmit
    jobs.  Returns the suggested clen.
    """
    if os.path.isdir( "{0}/scan".format( cwd ) ):
        print( "scan already performed" )
    else:
        # run initial coarse scan
        scan( cwd, lst, sample, geom, centre_clen, cell_file, threshold, "coarse" )

        # get approximate centre from the coarse results
        coarse_clen = scrub_scan( cwd, scan="scan" )

        # perform 2nd, fine scan around the coarse estimate
        scan( cwd, lst, sample, geom, coarse_clen, cell_file, threshold, "fine" )

    # check results
    suggested_clen = scrub_scan( cwd, scan="scan" )

    return suggested_clen
|
|
|
|
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-l",
        "--lst",
        help="path to crystfel list file containing enough patterns for detector distance refinement",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-g",
        "--geom",
        help="path to geom file to be used in the refinement",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-d",
        "--central_distance",
        help="intial clen to use for refinement - usually from detector shift refinement",
        type=float,
        required=True
    )
    parser.add_argument(
        "-c",
        "--cell_file",
        help="path to cell file of the crystals used in the refinement",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-s",
        "--sample",
        help="sample size to use in the refinement",
        type=int,
        default=500
    )
    parser.add_argument(
        "-e",
        "--endstation",
        help="which endstation did you collect these data from, e.g., alvra or cristallina",
        type=str,
        default="cristallina"
    )
    args = parser.parse_args()

    # set threshold based on endstation - the peakfinder8 threshold
    # differs by orders of magnitude between the two detectors
    if args.endstation == "alvra":
        threshold = 3000
    elif args.endstation == "cristallina":
        threshold = 10
    else:
        # parser.error prints usage + message and exits with status 2;
        # the old print() + exit() exited with status 0, signalling
        # success to any calling pipeline despite the invalid input
        parser.error( "you must say which beamline you collected the data on, alvra or cristallina, to set the threshold value correctly for crystfel" )

    # run main
    cwd = os.getcwd()
    print( "top working directory = {0}".format( cwd ) )
    main( cwd, args.lst, args.sample, args.geom, args.central_distance, args.cell_file, threshold )
|
|
|
|
|