#!/usr/bin/env python3
# authors T.Mason and J.Beale
"""
# aim    to refine the detector distance using CrystFEL - naming convention = #.###/#.###.stream
# usage  python detector-distance-refinement.py -l list_file -g geom_file -d central_clen -c cell_file -s sample_size -e endstation (alvra or cristallina)
# output plot files of the analysis and a suggested clen
"""

# modules
import argparse
import errno
import os
import re
import subprocess
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from scipy.signal import peak_widths, find_peaks
from tqdm import tqdm


def h5_sample(lst, sample):
    # create a sample of images from the run
    # read h5.lst - note - splitting on " //" removes // from the image column
    cols = ["h5", "image"]
    sample_df = pd.read_csv(lst, sep=r"\s//", engine="python", names=cols)
    # take a sample - if one is required
    if len(sample_df) > sample:
        # take the defined sample, then restore the original order
        sample_df = sample_df.sample(sample)
        sample_df = sample_df.sort_index()
    # re-add // to the image column
    sample_df["image"] = "//" + sample_df.image.astype(str)
    # write the sample to file and return its name
    sample_file = "h5_{0}_sample.lst".format(sample)
    sample_df.to_csv(sample_file, sep=" ", index=False, header=False)
    return sample_file


def geom_amend(lab6_geom_file, clen):
    # read the lab6 geom and use regex to swap in the new clen
    # clen example => clen = 0.1217
    with open(lab6_geom_file, "r") as lab6_geom:
        clen_geom = re.sub(r"clen = 0\.\d+", "clen = {0}".format(clen), lab6_geom.read())
    # write the new clen geom to file and return its name
    clen_geom_file = "{0}.geom".format(clen)
    with open(clen_geom_file, "w") as geom:
        geom.write(clen_geom)
    return clen_geom_file


def write_crystfel_run(clen, sample_h5_file, clen_geom_file, cell_file, threshold):
    # write the crystfel (indexamajig) batch script for this clen
    cryst_run_file = "{0}_run.sh".format(clen)
    with open(cryst_run_file, "w") as run_sh:
        run_sh.write("#!/bin/sh\n\n")
        run_sh.write("module purge\n")
        run_sh.write("module use MX unstable\n")
        run_sh.write("module load crystfel/0.10.2-rhel8\n")
        run_sh.write("indexamajig -i {0} \\\n".format(sample_h5_file))
        run_sh.write("    --output={0}.stream \\\n".format(clen))
        run_sh.write("    --geometry={0} \\\n".format(clen_geom_file))
        run_sh.write("    --pdb={0} \\\n".format(cell_file))
        run_sh.write("    --indexing=mosflm --peaks=peakfinder8 \\\n")
        run_sh.write("    --threshold={0} --min-snr=5 --int-radius=3,5,9 \\\n".format(threshold))
        run_sh.write("    -j 32 --no-multi --no-retry --max-res=3000 --min-pix-count=2 --min-res=85\n\n")
    # make the file executable and return its name
    subprocess.call(["chmod", "+x", cryst_run_file])
    return cryst_run_file


def make_sample(cwd, lst, sample):
    # make the sample list and return its absolute path
    print("making {0} sample of images".format(sample))
    sample_h5 = h5_sample(lst, sample)
    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
    print("done")
    return sample_h5_file


def make_process_dir(proc_dir):
    # make the process directory - ignore it if it already exists
    try:
        os.makedirs(proc_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def make_step_range(centre_clen, step_size, steps):
    # make an array of clen steps above and below the central clen
    print("make clen array around {0}".format(centre_clen))
    step_range = step_size * steps
    bottom_step = centre_clen - step_range / 2
    top_step = bottom_step + step_range
    step_range = np.arange(bottom_step, top_step, step_size)
    # important - otherwise np sometimes gives e.g. 0.99999999 instead of 1
    step_range = step_range.round(6)
    print("done")
    return step_range
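
# a minimal usage sketch of make_step_range (the numbers are illustrative
# only, not from a real refinement): a centre of 0.120 m with 0.0005 m steps
# over 4 steps yields a window of clens centred on the input, roughly
#   make_step_range(0.120, 0.0005, 4)  ->  [0.119, 0.1195, 0.12, 0.1205]
# (np.arange's float endpoint behaviour can occasionally add or drop the
# final value, which is one reason the result is rounded to 6 d.p. above)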
def submit_job(job_file):
    # submit the job to slurm
    submit_cmd = ["sbatch", "-p", "day", "--cpus-per-task=32", "--", job_file]
    job_output = subprocess.check_output(submit_cmd)
    # scrub the job id from the output - example: Submitted batch job 742403
    pattern = r"Submitted batch job (\d+)"
    job_id = re.search(pattern, job_output.decode().strip()).group(1)
    return int(job_id)


def wait_for_jobs(job_ids, total_jobs):
    # poll squeue until every submitted job has left the queue
    with tqdm(total=total_jobs, desc="Jobs Completed", unit="job") as pbar:
        while job_ids:
            completed_jobs = set()
            for job_id in job_ids:
                status_cmd = ["squeue", "-h", "-j", str(job_id)]
                status = subprocess.check_output(status_cmd)
                # an empty response means the job is no longer queued or running
                if not status:
                    completed_jobs.add(job_id)
                    pbar.update(1)
            job_ids.difference_update(completed_jobs)
            time.sleep(2)


def scrub_clen(stream_pwd):
    # get the clen from the stream path
    # example - /sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan/0.115/0.115.stream
    # scrub the clen and return it - else nan
    try:
        pattern = r"0\.\d+/(0\.\d+)\.stream"
        re_search = re.search(pattern, stream_pwd)
        clen = re_search.group(1)
        return float(clen)
    except AttributeError:
        # re.search returned None - the path carries no clen
        return np.nan


def find_streams(top_dir):
    # create a df for the streams
    stream_df = pd.DataFrame()
    # search for all files that end with .stream
    for path, dirs, files in os.walk(top_dir):
        for name in files:
            if name.endswith(".stream"):
                # get the stream path and scrub its clen
                stream_pwd = os.path.join(path, name)
                clen = scrub_clen(stream_pwd)
                # put the clen and stream path into the df
                data = [{"stream_pwd": stream_pwd, "clen": clen}]
                stream_df_1 = pd.DataFrame(data)
                stream_df = pd.concat((stream_df, stream_df_1))
    # sort the df by clen, reset its index and return it
    stream_df = stream_df.sort_values(by="clen")
    stream_df = stream_df.reset_index(drop=True)
    return stream_df


def scrub_us(stream):
    # get the unit cell values from the stream file
    # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
    # scrub the cells and return them - an empty list if nothing matches
    pattern = (r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,"
               r"\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg")
    cells = re.findall(pattern, stream)
    return cells
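
# a sketch of what scrub_us extracts: re.findall returns one tuple of six
# strings per indexed crystal, e.g.
#   scrub_us("Cell parameters 7.71784 7.78870 3.75250 nm, "
#            "90.19135 90.77553 90.19243 deg")
#   -> [('7.71784', '7.78870', '3.75250', '90.19135', '90.77553', '90.19243')]
# scrub_helper below casts these strings to floats before taking the
# per-column standard deviations.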
def scrub_helper(top_dir):
    # find the stream files from the process directory
    print("finding stream files")
    stream_df = find_streams(top_dir)
    print("done")
    # make a results df for the unit cell and indexing stats
    stats_df = pd.DataFrame()
    # loop through the stream files and collect the unit cell information
    print("looping through stream files to collect unit cell, indexed information")
    for index, row in stream_df.iterrows():
        stream_pwd, clen = row["stream_pwd"], row["clen"]
        # open the stream file
        with open(stream_pwd, "r") as stream_file:
            stream = stream_file.read()
        # scrub the unit cell information
        cells = scrub_us(stream)
        # put the cells in a df
        cols = ["a", "b", "c", "alpha", "beta", "gamma"]
        cells_df = pd.DataFrame(cells, columns=cols)
        cells_df = cells_df.astype(float)
        # calc the stats
        indexed = len(cells_df)
        std_a = cells_df.a.std()
        std_b = cells_df.b.std()
        std_c = cells_df.c.std()
        std_alpha = cells_df.alpha.std()
        std_beta = cells_df.beta.std()
        std_gamma = cells_df.gamma.std()
        # put the stats in the results df
        stats = [{
            "clen": clen,
            "indexed": indexed,
            "std_a": std_a,
            "std_b": std_b,
            "std_c": std_c,
            "std_alpha": std_alpha,
            "std_beta": std_beta,
            "std_gamma": std_gamma,
        }]
        stats_df_1 = pd.DataFrame(stats)
        stats_df = pd.concat((stats_df, stats_df_1))
    # reset the index
    stats_df = stats_df.reset_index(drop=True)
    print("done")
    return stats_df


def find_clen_values(stats_df, scan):
    def find_min_clen(col_name):
        # clen at which the given std column reaches its minimum
        min_val = stats_df[col_name].min()
        min_row = stats_df[stats_df[col_name] == min_val]
        min_clen = min_row["clen"].values[0]
        return min_val, min_clen

    def gauss(x, *p):
        A, mu, sigma = p
        return A * np.exp(-(x - mu) ** 2 / (2. * sigma ** 2))

    # fit a gaussian to indexed vs clen
    p0 = [30, 0.111, 0.01]
    parameters, covariance = curve_fit(gauss, stats_df.clen, stats_df.indexed, p0=p0)
    # get the fitted curve
    stats_df["gaus"] = gauss(stats_df.clen, *parameters)
    # find the peak centre
    peaks = find_peaks(stats_df.gaus.values)
    # find the full width at half maximum of the peak
    fwhm = peak_widths(stats_df.gaus.values, peaks[0], rel_height=0.5)
    fwhm_str = int(round(fwhm[2][0], 0))
    fwhm_end = int(round(fwhm[3][0], 0))
    # translate the width into motor values - column 0 is clen
    indexed_start = stats_df.iloc[fwhm_str, 0]
    indexed_end = stats_df.iloc[fwhm_end, 0]
    mid_gauss = stats_df.clen.iloc[peaks[0]].values[0]
    # cut the df to only include well-indexed clens
    stats_df = stats_df[(stats_df.clen < indexed_end) & (stats_df.clen > indexed_start)]
    # calculate the minimum std values
    min_alpha_val, min_alpha_clen = find_min_clen("std_alpha")
    min_beta_val, min_beta_clen = find_min_clen("std_beta")
    min_gamma_val, min_gamma_clen = find_min_clen("std_gamma")
    min_c_val, min_c_clen = find_min_clen("std_c")
    # suggest a clen from the mean of the angle minima
    suggested_clen = (min_alpha_clen + min_beta_clen + min_gamma_clen) / 3
    suggested_clen = round(suggested_clen, 4)
    print("middle of indexing gaussian fit of {0} scan = {1}".format(scan, mid_gauss))
    print("mean minimum of alpha, beta, gamma of {0} scan = {1}".format(scan, suggested_clen))
    return suggested_clen
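
# the fit above models the indexing rate as a gaussian in clen,
#   indexed(clen) ~ A * exp(-(clen - mu)**2 / (2 * sigma**2))
# a hedged, made-up illustration: if curve_fit returned A = 450, mu = 0.1203
# and sigma = 0.002, the FWHM would be
#   2 * sqrt(2 * ln 2) * sigma ~ 2.355 * 0.002 ~ 0.0047 m,
# so only clens within about +/- 0.0024 m of 0.1203 survive the cut before
# the minima of the cell-parameter standard deviations are located.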
def plot_indexed_std(stats_df, ax1, ax2):
    # indexed images plot
    color = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=color)
    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
    ax1.tick_params(axis="y", labelcolor=color)
    # label colour for the cell-edge axis
    color = "tab:blue"
    ax2.set_ylabel("a,b,c st.deviation", color=color)
    ax2.tick_params(axis="y", labelcolor=color)
    # std_a plot
    color = "lightsteelblue"
    ax2.plot(stats_df.clen, stats_df.std_a, color=color)
    # std_b plot
    color = "cornflowerblue"
    ax2.plot(stats_df.clen, stats_df.std_b, color=color)
    # std_c plot
    color = "royalblue"
    ax2.plot(stats_df.clen, stats_df.std_c, color=color)


def plot_indexed_std_alpha_beta_gamma(stats_df, ax1, ax2):
    # indexed images plot
    color = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=color)
    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
    ax1.tick_params(axis="y", labelcolor=color)
    # label colour for the cell-angle axis
    color = "tab:green"
    ax2.set_ylabel("alpha, beta, gamma st.deviation", color=color)
    ax2.tick_params(axis="y", labelcolor=color)
    # std_alpha plot
    color = "limegreen"
    ax2.plot(stats_df.clen, stats_df.std_alpha, color=color)
    # std_beta plot
    color = "darkgreen"
    ax2.plot(stats_df.clen, stats_df.std_beta, color=color)
    # std_gamma plot
    color = "green"
    ax2.plot(stats_df.clen, stats_df.std_gamma, color=color)


def scan(cwd, lst, sample, lab6_geom_file, centre_clen, cell_file, threshold, scan_type):
    # define the coarse or fine scan parameters
    if scan_type == "coarse":
        steps = 30
        step_size = 0.0005  # m
    elif scan_type == "fine":
        steps = 50
        step_size = 0.00005  # m
    scan_name = "scan"
    # make the sample list
    sample_h5_file = make_sample(cwd, lst, sample)
    # make the list of clen steps above and below the central clen
    step_range = make_step_range(centre_clen, step_size, steps)
    # submitted job set and job list
    submitted_job_ids = set()
    job_list = []
    # make directories for the results
    print("begin CrystFEL analysis of different clens")
    # loop to cycle through the clen steps
    for clen in step_range:
        # define and make the process directory, then move into it
        proc_dir = "{0}/{1}/{2}".format(cwd, scan_name, clen)
        make_process_dir(proc_dir)
        os.chdir(proc_dir)
        # make the geom file
        clen_geom_file = geom_amend(lab6_geom_file, clen)
        # make the crystfel run file
        cryst_run_file = write_crystfel_run(clen, sample_h5_file, clen_geom_file, cell_file, threshold)
        # submit the crystfel run file
        job_list.append(cryst_run_file)
        job_id = submit_job(cryst_run_file)
        submitted_job_ids.add(job_id)
        # move back to the cwd
        os.chdir(cwd)
    # wait for the jobs to complete
    wait_for_jobs(submitted_job_ids, len(job_list))
    print("slurm processing done")


def scrub_scan(scan_top_dir, scan):
    # collect the stats from all stream files under the scan directory
    stats_df = scrub_helper(scan_top_dir)
    # calculate the suggested clen
    suggested_clen = find_clen_values(stats_df, scan)
    # plot the results
    fig, (ax1, ax3) = plt.subplots(1, 2)
    ax2 = ax1.twinx()
    ax4 = ax3.twinx()
    plot_indexed_std(stats_df, ax1, ax2)
    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)
    fig.tight_layout()
    plt.savefig("{0}.png".format(scan))
    return suggested_clen


def main(cwd, lst, sample, geom, centre_clen, cell_file, threshold):
    # run the scans if they have not been performed already
    if not os.path.isdir("{0}/scan".format(cwd)):
        # run the initial coarse scan
        scan(cwd, lst, sample, geom, centre_clen, cell_file, threshold, "coarse")
        # get the approximate centre from the coarse results
        coarse_clen = scrub_scan(cwd, scan="scan")
        # perform the second, fine scan around it
        scan(cwd, lst, sample, geom, coarse_clen, cell_file, threshold, "fine")
    else:
        print("scan already performed")
    # check the results - both scans write under ./scan, so this combines them
    suggested_clen = scrub_scan(cwd, scan="scan")
    return suggested_clen
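
# an example invocation (the file names here are hypothetical placeholders):
#   python detector-distance-refinement.py \
#       -l runs/h5.lst -g lab6.geom -d 0.120 -c lysozyme.cell \
#       -s 500 -e cristallina
# the script writes one <clen>/<clen>.stream per step under ./scan/, prints
# the suggested clen and saves a scan.png summary plot.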
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--lst",
                        help="path to crystfel list file containing enough patterns for detector distance refinement",
                        type=os.path.abspath, required=True)
    parser.add_argument("-g", "--geom",
                        help="path to geom file to be used in the refinement",
                        type=os.path.abspath, required=True)
    parser.add_argument("-d", "--central_distance",
                        help="initial clen to use for refinement - usually from detector shift refinement",
                        type=float, required=True)
    parser.add_argument("-c", "--cell_file",
                        help="path to cell file of the crystals used in the refinement",
                        type=os.path.abspath, required=True)
    parser.add_argument("-s", "--sample",
                        help="sample size to use in the refinement",
                        type=int, default=500)
    parser.add_argument("-e", "--endstation",
                        help="which endstation these data were collected at, e.g., alvra or cristallina",
                        type=str, default="cristallina")
    args = parser.parse_args()

    # set the peak-finding threshold based on the endstation
    if args.endstation == "alvra":
        threshold = 3000
    elif args.endstation == "cristallina":
        threshold = 10
    else:
        print("you must say which beamline you collected the data on, alvra or cristallina, "
              "to set the threshold value correctly for crystfel")
        exit()

    # run main
    cwd = os.getcwd()
    print("top working directory = {0}".format(cwd))
    main(cwd, args.lst, args.sample, args.geom, args.central_distance, args.cell_file, threshold)
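
# a hedged usage note: because main() skips the scans once ./scan exists,
# the analysis stage can be re-run on its own (e.g. after removing a bad
# stream file) without resubmitting any slurm jobs:
#   suggested_clen = scrub_scan(os.getcwd(), scan="scan")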