#!/usr/bin/env python3 # authors T.Mason and J.Beale """ # aim to refine the detector distance using crystfel - naming covention = #.###/#.###.stream # usage python detector-distance-refinement.py -l -g -d central clen to refine around -c cell_file -s sample size -e endstation - alvra or cristallina # other variables -f = fine only = only perform fine scan # output plot files of the analysis and a suggest for the clen """ # modules import pandas as pd import subprocess import os, errno import regex as re import numpy as np import matplotlib.pyplot as plt import time from tqdm import tqdm import argparse def h5_sample( lst, sample ): # create sample of images from run # read h5.lst - note - removes // from image column cols = [ "h5", "image" ] sample_df = pd.read_csv( lst, sep="\s//", engine="python", names=cols ) # take defined sample sample_df = sample_df.sample( sample ) # sort list sample_df = sample_df.sort_index() # re-add // to image columm sample_df[ "image" ] = "//" + sample_df.image.astype(str) # write sample to file sample_file = "h5_{0}_sample.lst".format( sample ) sample_df.to_csv( sample_file, sep=" ", index=False, header=False ) # return sample file name return sample_file def geom_amend( lab6_geom_file, clen ): # read lab6 geom lab6_geom = open( lab6_geom_file, "r" ) # use regex to find clen and replace with new # clen example => clen = 0.1217 clen_geom = re.sub( "clen = 0\.\d+", "clen = {0}".format( clen ), lab6_geom.read() ) # close lab6 geom file lab6_geom.close() # write new clen_geom to file clen_geom_file = "{0}.geom".format( clen ) geom = open( clen_geom_file, "w" ) geom.write( clen_geom ) geom.close() # return clen_geom file name return clen_geom_file def write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file, threshold ): # crystfel file name cryst_run_file = "{0}_run.sh".format( clen ) # write file run_sh = open( cryst_run_file, "w" ) run_sh.write( "#!/bin/sh\n\n" ) run_sh.write( "module purge\n" ) run_sh.write( "module load crystfel/0.10.2\n" ) run_sh.write( "indexamajig -i {0} \\\n".format( sample_h5_file ) ) run_sh.write( " --output={0}.stream \\\n".format( clen ) ) run_sh.write( " --geometry={0}\\\n".format( clen_geom_file ) ) run_sh.write( " --pdb={0} \\\n".format( cell_file ) ) run_sh.write( " --indexing=mosflm --peaks=peakfinder8 \\\n" ) run_sh.write( " --threshold={0} --min-snr=5 --int-radius=3,5,9 \\\n".format( threshold ) ) run_sh.write( " -j 32 --no-multi --no-retry --max-res=3000 --min-pix-count=2 --min-res=85\n\n" ) run_sh.close() # make file executable subprocess.call( [ "chmod", "+x", "{0}".format( cryst_run_file ) ] ) # return crystfel file name return cryst_run_file def make_sample( lst, sample ): # make sample list print("making {0} sample of images".format(sample)) sample_h5 = h5_sample(lst, sample) sample_h5_file = "{0}/{1}".format(cwd, sample_h5) print("done") return sample_h5_file def make_process_dir(proc_dir): # make process directory try: os.makedirs( proc_dir ) except OSError as e: if e.errno != errno.EEXIST: raise def make_step_range(centre_clen, step_size, steps): # make list of clen steps above and below the central clen print( "make clen array around {0}".format( centre_clen ) ) step_range = step_size*steps bottom_step = centre_clen-step_range/2 top_step = bottom_step+step_range step_range = np.arange( bottom_step, top_step, step_size ) step_range = step_range.round( 6 ) # important - otherwise np gives your .99999999 instead of 1 somethimes print( "done" ) return step_range def submit_job( job_file ): # submit the job submit_cmd = [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", job_file ] job_output = subprocess.check_output(submit_cmd) # scrub job id from - example Submitted batch job 742403 pattern = r"Submitted batch job (\d+)" job_id = re.search( pattern, job_output.decode().strip() ).group(1) return int(job_id) def wait_for_jobs( job_ids, total_jobs ): with tqdm(total=total_jobs, desc="Jobs Completed", unit="job") as pbar: while job_ids: completed_jobs = set() for job_id in job_ids: status_cmd = ["squeue", "-h", "-j", str(job_id)] status = subprocess.check_output(status_cmd) if not status: completed_jobs.add(job_id) pbar.update(1) job_ids.difference_update(completed_jobs) time.sleep(30) def scrub_clen( stream_pwd ): # get clen from stream name # example - /sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan/0.115/0.115.stream # scrub clen and return - else nan try: pattern = r"0\.\d+/(0\.\d+)\.stream" re_search = re.search( pattern, stream_pwd ) clen = re_search.group( 1 ) if AttributeError: return float( clen ) except AttributeError: return 1 def find_streams( top_dir ): # create df for streams stream_df = pd.DataFrame() # search for all files that end with .stream for path, dirs, files in os.walk( top_dir ): for name in files: if name.endswith( ".stream" ): # get stream pwd stream_pwd = os.path.join( path, name ) # scrub clen from stream clen = scrub_clen( stream_pwd ) # put clen and stream pwd into df data = [ { "stream_pwd" : stream_pwd, "clen" : clen } ] stream_df_1 = pd.DataFrame( data ) stream_df = pd.concat( ( stream_df, stream_df_1 ) ) # sort df based on clen stream_df = stream_df.sort_values( by="clen" ) # reset df index stream_df = stream_df.reset_index( drop=True ) # return df of streams and clens return stream_df def scrub_us( stream ): # get uc values from stream file # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg # scrub clen and return - else nan try: pattern = r"Cell\sparameters\s(\d\.\d+)\s(\d\.\d+)\s(\d\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg" cells = re.findall( pattern, stream ) if AttributeError: return cells except AttributeError: return np.nan def scrub_helper( top_dir ): # find stream files from process directory print( "finding stream files" ) stream_df = find_streams( top_dir ) print( "done" ) # making results df for unit cell and index no. stats_df = pd.DataFrame() # loop through stream files and collect unit_cell information print( "looping through stream files to collect unit cell, indexed information" ) for index, row in stream_df.iterrows(): stream_pwd, clen = row[ "stream_pwd" ], row[ "clen" ] # open stream file stream = open( stream_pwd, "r" ).read() # scrub unit cell information cells = scrub_us( stream ) # put cells in df cols = [ "a", "b", "c", "alpha", "beta", "gamma" ] cells_df = pd.DataFrame( cells, columns=cols ) cells_df = cells_df.astype( float ) # calc stats indexed = len( cells_df ) std_a = cells_df.a.std() std_b = cells_df.b.std() std_c = cells_df.c.std() std_alpha = cells_df.alpha.std() std_beta = cells_df.beta.std() std_gamma = cells_df.gamma.std() # put stats in results df stats = [ { "clen" : clen, "indexed" : indexed, "std_a" : std_a, "std_b" : std_b, "std_c" : std_c, "std_alpha" : std_alpha, "std_beta" : std_beta, "std_gamma" : std_gamma, } ] stats_df_1 = pd.DataFrame( stats ) stats_df = pd.concat( ( stats_df, stats_df_1 ) ) # reset index stats_df = stats_df.reset_index( drop=True ) print( "done" ) return stats_df def find_clen_values(stats_df): def find_min_clen(col_name): min_val = stats_df[col_name].min() min_row = stats_df[stats_df[col_name] == min_val] min_clen = min_row['clen'].values[0] return min_val, min_clen min_alpha_val, min_alpha_clen = find_min_clen('std_alpha') min_beta_val, min_beta_clen = find_min_clen('std_beta') min_gamma_val, min_gamma_clen = find_min_clen('std_gamma') min_c_val, min_c_clen = find_min_clen('std_c') print("The value of clen for the minimum alpha value of {} is {}".format(min_alpha_val, min_alpha_clen)) print("The value of clen for the minimum beta value of {} is {}".format(min_beta_val, min_beta_clen)) print("The value of clen for the minimum gamma value of {} is {}".format(min_gamma_val, min_gamma_clen)) print("The value of clen for the minimum c value of {} is {}".format(min_c_val, min_c_clen)) return min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val def plot_indexed_std( stats_df, ax1, ax2 ): # indexed images plot color = "tab:red" ax1.set_xlabel("clen") ax1.set_ylabel("indexed", color=color) ax1.plot(stats_df.clen, stats_df.indexed, color=color) ax1.tick_params(axis="y", labelcolor=color) # label color color = "tab:blue" ax2.set_ylabel("a,b,c st.deviation", color=color) ax2.tick_params(axis='y', labelcolor=color) # std_a plot color = "lightsteelblue" ax2.plot(stats_df.clen, stats_df.std_a, color=color) # std_b plot color = "cornflowerblue" ax2.plot(stats_df.clen, stats_df.std_b, color=color) # std_c plot color = "royalblue" ax2.plot(stats_df.clen, stats_df.std_c, color=color) def plot_indexed_std_alpha_beta_gamma( stats_df, ax1, ax2 ): # indexed images plot color = "tab:red" ax1.set_xlabel("clen") ax1.set_ylabel("indexed", color=color) ax1.plot(stats_df.clen, stats_df.indexed, color=color) ax1.tick_params(axis="y", labelcolor=color) # label color color = "tab:green" ax2.set_ylabel("alpha, beta, gamma st.deviation", color=color) ax2.tick_params(axis='y', labelcolor=color) # std_alpha plot color = "limegreen" ax2.plot(stats_df.clen, stats_df.std_alpha, color=color) # std_beta plot color = "darkgreen" ax2.plot(stats_df.clen, stats_df.std_beta, color=color) # std_gamma plot color = "green" ax2.plot(stats_df.clen, stats_df.std_gamma, color=color) def scan( cwd, lst, sample, lab6_geom_file, centre_clen, cell_file, threshold, step_size ): # define coarse or fine scan if step_size == "coarse": steps = 20 step_size = 0.0005 # m scan_name = "coarse" if step_size == "fine": steps = 50 step_size = 0.00005 # m scan_name = "fine" #make sample list sample_h5_file = make_sample(lst, sample) # make list of clen steps above and below the central clen step_range = make_step_range(centre_clen, step_size, steps) # submitted job set and job_list submitted_job_ids = set() job_list = [] # make directorys for results print( "begin CrystFEL anaylsis of different clens" ) # loop to cycle through clen steps for clen in step_range: # define process directory proc_dir = "{0}/{1}/{2}".format( cwd, scan_name, clen ) # make process directory make_process_dir(proc_dir) # move to process directory os.chdir( proc_dir ) # make geom file clen_geom_file = geom_amend( lab6_geom_file, clen ) # make crystfel run file cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file, threshold ) # run crystfel file job_list.append( cryst_run_file ) job_id = submit_job( cryst_run_file ) submitted_job_ids.add( job_id ) # move back to cwd os.chdir( cwd ) #wait for jobs to complete wait_for_jobs(submitted_job_ids, len(job_list)) print("slurm processing done") def scrub_scan( scan_top_dir, scan ): stats_df = scrub_helper(scan_top_dir) #print clen for minimum alpha, beta, and gamma values min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df) # print suggested clen suggested_clen = (min_alpha_clen + min_beta_clen + min_gamma_clen )/3 suggested_clen = round(suggested_clen, 4) print ("The suggested clen = {0}".format(suggested_clen)) # plot results fig, (ax1, ax3) = plt.subplots(1, 2) ax2 = ax1.twinx() ax4 = ax3.twinx() plot_indexed_std(stats_df, ax1, ax2) plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4) fig.tight_layout() plt.savefig("{0}.png".format(scan)) return suggested_clen def main( cwd, lst, sample, geom, centre_clen, cell_file, fine_only, threshold ): # if statement to do coarse if fine_only==False if fine_only == False: top_dir_coarse = "{0}/coarse".format( cwd ) scan( cwd, lst, sample, geom, centre_clen, cell_file, threshold, step_size="coarse" ) suggested_clen = scrub_scan( top_dir_coarse, scan="coarse" ) else: suggested_clen = centre_clen top_dir_fine = "{0}/fine".format( cwd ) scan( cwd, lst, sample, geom, suggested_clen, cell_file, threshold, step_size="fine" ) scrub_scan( top_dir_fine, scan="fine" ) plt.show() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-l", "--lst", help="path to crystfel list file containing enough patterns for detector distance refinement", type=os.path.abspath, required=True ) parser.add_argument( "-g", "--geom", help="path to geom file to be used in the refinement", type=os.path.abspath, required=True ) parser.add_argument( "-d", "--central_distance", help="intial clen to use for refinement - usually from detector shift refinement", type=float, required=True ) parser.add_argument( "-c", "--cell_file", help="path to cell file of the crystals used in the refinement", type=os.path.abspath, required=True ) parser.add_argument( "-s", "--sample", help="sample size to use in the refinement", type=int, default=500 ) parser.add_argument( "-f", "--fine_only", help="True = only do fine scan", type=bool, default=False ) parser.add_argument( "-e", "--endstation", help="which endstation did you collect these data from, e.g., alvra or cristallina", type=str, default="cristallina" ) args = parser.parse_args() # set threshold based on endstation if args.endstation == "alvra": threshold = 3000 elif args.endstation == "cristallina": threshold = 10 else: print( "you must say which beamline you collected the data on, alvra or cristallina, to set the threshold value correctly for crystfel" ) exit() # run main cwd = os.getcwd() print( "top working directory = {0}".format( cwd ) ) main( cwd, args.lst, args.sample, args.geom, args.central_distance, args.cell_file, args.fine_only, threshold )