#!/usr/bin/env python3
# authors T. Mason and J. Beale

# modules
import pandas as pd
import subprocess
import os, errno
import regex as re
import numpy as np
import matplotlib.pyplot as plt
import time


def h5_sample( lst, sample ):
    # create sample of images from run
    # read h5.lst - note - removes // from image column
    cols = [ "h5", "image" ]
    sample_df = pd.read_csv( lst, sep=r"\s//", engine="python", names=cols )
    # take defined sample
    sample_df = sample_df.sample( sample )
    # sort list
    sample_df = sample_df.sort_index()
    # re-add // to image column
    sample_df[ "image" ] = "//" + sample_df.image.astype(str)
    # write sample to file
    sample_file = "h5_{0}_sample.lst".format( sample )
    sample_df.to_csv( sample_file, sep=" ", index=False, header=False )
    # return sample file name
    return sample_file


def geom_amend( lab6_geom_file, clen ):
    # read lab6 geom
    lab6_geom = open( lab6_geom_file, "r" )
    # use regex to find clen and replace with new
    # clen example => clen = 0.1217
    clen_geom = re.sub( r"clen = 0\.\d+", "clen = {0}".format( clen ), lab6_geom.read() )
    # close lab6 geom file
    lab6_geom.close()
    # write new clen_geom to file
    clen_geom_file = "{0}.geom".format( clen )
    geom = open( clen_geom_file, "w" )
    geom.write( clen_geom )
    geom.close()
    # return clen_geom file name
    return clen_geom_file


def write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file ):
    # crystfel file name
    cryst_run_file = "{0}_cryst_run.sh".format( clen )
    # write file
    run_sh = open( cryst_run_file, "w" )
    run_sh.write( "#!/bin/sh\n\n" )
    run_sh.write( "module purge\n" )
    run_sh.write( "module load crystfel/0.10.2\n" )
    run_sh.write( "indexamajig -i {0} \\\n".format( sample_h5_file ) )
    run_sh.write( " --output={0}.stream \\\n".format( clen ) )
    run_sh.write( " --geometry={0} \\\n".format( clen_geom_file ) )
    run_sh.write( " --pdb={0} \\\n".format( cell_file ) )
    run_sh.write( " --indexing=xgandalf-latt-cell --peaks=peakfinder8 \\\n" )
    run_sh.write( " --integration=rings-grad --tolerance=10.0,10.0,10.0,2,3,2 --threshold=10 --min-snr=5 --int-radius=2,3,6 \\\n" )
    run_sh.write( " -j 36 --no-multi --no-retry --check-peaks --max-res=3000 --min-pix-count=1 --local-bg-radius=4 --min-res=85\n\n" )
    run_sh.close()
    # make file executable
    subprocess.call( [ "chmod", "+x", "{0}".format( cryst_run_file ) ] )
    # return crystfel file name
    return cryst_run_file


def make_sample(lst, sample):
    # set current working directory
    os.chdir("/sf/cristallina/data/p20590/work/process/jhb/detector_refinement")
    cwd = os.getcwd()
    # make sample list
    print("making {0} sample of images".format(sample))
    sample_h5 = h5_sample(lst, sample)
    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
    print("done")
    return cwd, sample_h5_file


def make_process_dir(proc_dir):
    # make process directory
    try:
        os.makedirs( proc_dir )
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def make_step_range(centre_clen, step_size, steps):
    # make list of clen steps above and below the central clen
    print( "make clen array around {0}".format( centre_clen ) )
    total_range = step_size*steps
    bottom_step = centre_clen-total_range/2
    top_step = bottom_step+total_range
    step_range = np.arange( bottom_step, top_step, step_size )
    step_range = step_range.round( 6 )  # important - otherwise np sometimes gives you 0.99999999 instead of 1
    print( "done" )
    return step_range
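
# Illustrative example (not part of the workflow): with the default parameters set at the
# bottom of this script, make_step_range(0.122, 0.0005, 20) returns 20 clen values,
#   array([0.117, 0.1175, ..., 0.1265])
# i.e. 0.0005 m steps spanning 0.117-0.1265 m around centre_clen = 0.122 m.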
def check_job_status(username):
    # wait for jobs to complete
    jobs_completed = False
    while not jobs_completed:
        # Get the status of the jobs using "squeue"
        result = subprocess.run(['squeue', '--user', '{0}'.format(username)], stdout=subprocess.PIPE)
        output = result.stdout.decode('utf-8')
        # Check if there are no jobs running for the user
        if '{0}'.format(username) not in output:
            jobs_completed = True
        else:
            # Sleep for some time and check again
            print("waiting for jobs to finish")
            time.sleep(30)  # sleep for 30 seconds
    print("All jobs completed.")


def scrub_clen( stream_pwd ):
    # get clen from stream name
    # example - /sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan/0.115/0.115.stream
    # scrub clen and return - else nan
    try:
        pattern = r"0\.\d+/(0\.\d+)\.stream"
        re_search = re.search( pattern, stream_pwd )
        clen = re_search.group( 1 )
        return float( clen )
    except AttributeError:
        # re.search returned None - no clen found in the path
        return np.nan


def find_streams( top_dir ):
    # create df for streams
    stream_df = pd.DataFrame()
    # search for all files that end with .stream
    for path, dirs, files in os.walk( top_dir ):
        for name in files:
            if name.endswith( ".stream" ):
                # get stream pwd
                stream_pwd = os.path.join( path, name )
                # scrub clen from stream
                clen = scrub_clen( stream_pwd )
                # put clen and stream pwd into df
                data = [ { "stream_pwd" : stream_pwd, "clen" : clen } ]
                stream_df_1 = pd.DataFrame( data )
                stream_df = pd.concat( ( stream_df, stream_df_1 ) )
    # sort df based on clen
    stream_df = stream_df.sort_values( by="clen" )
    # reset df index
    stream_df = stream_df.reset_index( drop=True )
    # return df of streams and clens
    return stream_df


def scrub_us( stream ):
    # get uc values from stream file
    # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
    # scrub unit cells and return (an empty list if none are found)
    pattern = r"Cell\sparameters\s(\d\.\d+)\s(\d\.\d+)\s(\d\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
    cells = re.findall( pattern, stream )
    return cells
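
# Illustrative examples (hypothetical inputs, shown only to document the regexes above):
#   scrub_clen(".../coarse/0.115/0.115.stream")
#       -> 0.115   (returns np.nan if the path does not match)
#   scrub_us("Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg")
#       -> [('7.71784', '7.78870', '3.75250', '90.19135', '90.77553', '90.19243')]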
def scrub_helper(top_dir):
    # find stream files from process directory
    print( "finding stream files" )
    stream_df = find_streams( top_dir )
    print( "done" )
    # making results df for unit cell and index no.
    stats_df = pd.DataFrame()
    # loop through stream files and collect unit_cell information
    print( "looping through stream files to collect unit cell, indexed information" )
    for index, row in stream_df.iterrows():
        stream_pwd, clen = row[ "stream_pwd" ], row[ "clen" ]
        # open stream file
        print( "scrubbing stream for clen={0}".format( clen ) )
        with open( stream_pwd, "r" ) as stream_file:
            stream = stream_file.read()
        # scrub unit cell information
        cells = scrub_us( stream )
        # put cells in df
        cols = [ "a", "b", "c", "alpha", "beta", "gamma" ]
        cells_df = pd.DataFrame( cells, columns=cols )
        cells_df = cells_df.astype( float )
        # calc stats
        indexed = len( cells_df )
        std_a = cells_df.a.std()
        std_b = cells_df.b.std()
        std_c = cells_df.c.std()
        std_alpha = cells_df.alpha.std()
        std_beta = cells_df.beta.std()
        std_gamma = cells_df.gamma.std()
        # put stats in results df
        stats = [ { "clen" : clen,
                    "indexed" : indexed,
                    "std_a" : std_a,
                    "std_b" : std_b,
                    "std_c" : std_c,
                    "std_alpha" : std_alpha,
                    "std_beta" : std_beta,
                    "std_gamma" : std_gamma } ]
        stats_df_1 = pd.DataFrame( stats )
        stats_df = pd.concat( ( stats_df, stats_df_1 ) )
    print( "done" )
    # reset index
    stats_df = stats_df.reset_index( drop=True )
    return stats_df


def find_clen_values(stats_df):
    def find_min_clen(col_name):
        min_val = stats_df[col_name].min()
        min_row = stats_df[stats_df[col_name] == min_val]
        min_clen = min_row['clen'].values[0]
        return min_val, min_clen

    min_alpha_val, min_alpha_clen = find_min_clen('std_alpha')
    min_beta_val, min_beta_clen = find_min_clen('std_beta')
    min_gamma_val, min_gamma_clen = find_min_clen('std_gamma')
    min_c_val, min_c_clen = find_min_clen('std_c')

    print(f"The value of clen for the minimum alpha value of {min_alpha_val} is {min_alpha_clen}")
    print(f"The value of clen for the minimum beta value of {min_beta_val} is {min_beta_clen}")
    print(f"The value of clen for the minimum gamma value of {min_gamma_val} is {min_gamma_clen}")
    print(f"The value of clen for the minimum c value of {min_c_val} is {min_c_clen}")

    return min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val


def plot_indexed_std(stats_df, ax1, ax2):
    # indexed images plot
    color = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=color)
    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
    ax1.tick_params(axis="y", labelcolor=color)
    # label color
    color = "tab:blue"
    ax2.set_ylabel("a,b,c st.deviation", color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    # std_a plot
    color = "lightsteelblue"
    ax2.plot(stats_df.clen, stats_df.std_a, color=color)
    # std_b plot
    color = "cornflowerblue"
    ax2.plot(stats_df.clen, stats_df.std_b, color=color)
    # std_c plot
    color = "royalblue"
    ax2.plot(stats_df.clen, stats_df.std_c, color=color)


def plot_indexed_std_alpha_beta_gamma(stats_df, ax1, ax2):
    # indexed images plot
    color = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=color)
    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
    ax1.tick_params(axis="y", labelcolor=color)
    # label color
    color = "tab:green"
    ax2.set_ylabel("alpha, beta, gamma st.deviation", color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    # std_alpha plot
    color = "limegreen"
    ax2.plot(stats_df.clen, stats_df.std_alpha, color=color)
    # std_beta plot
    color = "darkgreen"
    ax2.plot(stats_df.clen, stats_df.std_beta, color=color)
    # std_gamma plot
    color = "green"
    ax2.plot(stats_df.clen, stats_df.std_gamma, color=color)
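
# Minimal usage sketch (assuming streams already exist under top_dir_coarse defined below),
# mirroring the pattern used in scrub_main_coarse/scrub_main_fine further down:
#   stats_df = scrub_helper(top_dir_coarse)
#   find_clen_values(stats_df)
#   fig, (ax1, ax3) = plt.subplots(1, 2)
#   plot_indexed_std(stats_df, ax1, ax1.twinx())
#   plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax3.twinx())
#   plt.show()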
def main_coarse( lst, sample, lab6_geom_file, centre_clen, cell_file, steps_coarse, scan_name_coarse, step_size_coarse ):
    # make sample list
    cwd, sample_h5_file = make_sample(lst, sample)
    # make list of clen steps above and below the central clen
    step_range = make_step_range(centre_clen, step_size_coarse, steps_coarse)
    # make directories for results
    print( "begin CrystFEL analysis of different clens" )
    # loop to cycle through clen steps
    for clen in step_range:
        # move back to cwd
        os.chdir( cwd )
        print( "processing clen = {0}".format( clen ) )
        # define process directory
        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name_coarse, clen )
        # make process directory
        make_process_dir(proc_dir)
        # move to process directory
        os.chdir( proc_dir )
        # make geom file
        clen_geom_file = geom_amend( lab6_geom_file, clen )
        # make crystfel run file
        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file )
        # run crystfel file
        subprocess.call( [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", "./{0}".format( cryst_run_file ) ] )
    print( "done" )
    # wait for jobs to complete (username is taken from the module level below)
    check_job_status(username)


def main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine ):
    # set current working directory
    os.chdir("/sf/cristallina/data/p20590/work/process/jhb/detector_refinement")
    cwd = os.getcwd()
    # define the sample_h5_file location for this function (sample is taken from the module level below)
    sample_h5 = "h5_{0}_sample.lst".format(sample)
    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
    # make list of clen steps above and below the central clen
    step_range = make_step_range(centre_clen, step_size_fine, steps_fine)
    # make directories for results
    print( "begin CrystFEL analysis of different clens" )
    # loop to cycle through clen steps
    for clen in step_range:
        # move back to cwd
        os.chdir( cwd )
        print( "processing clen = {0}".format( clen ) )
        # define process directory
        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name_fine, clen )
        # make process directory
        make_process_dir(proc_dir)
        # move to process directory
        os.chdir( proc_dir )
        # make geom file
        clen_geom_file = geom_amend( lab6_geom_file, clen )
        # make crystfel run file
        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file )
        # run crystfel file
        subprocess.call( [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", "./{0}".format( cryst_run_file ) ] )
    print( "done" )
    # wait for jobs to complete
    check_job_status(username)


def scrub_main_coarse( top_dir_coarse ):
    stats_df = scrub_helper(top_dir_coarse)
    # print clen for minimum alpha, beta, and gamma values
    min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df)
    # plot results
    fig, (ax1, ax3) = plt.subplots(1, 2)
    ax2 = ax1.twinx()
    ax4 = ax3.twinx()
    plot_indexed_std(stats_df, ax1, ax2)
    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)
    fig.tight_layout()
    plt.show()


def scrub_main_fine( top_dir_fine ):
    stats_df = scrub_helper(top_dir_fine)
    # print clen for minimum alpha, beta, and gamma values
    min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df)
    # print suggested clen
    suggested_clen = (min_alpha_clen + min_beta_clen + min_gamma_clen)/3
    suggested_clen = round(suggested_clen, 4)
    print("The suggested clen = {0}".format(suggested_clen))
    # plot results
    fig, (ax1, ax3) = plt.subplots(1, 2)
    ax2 = ax1.twinx()
    ax4 = ax3.twinx()
    plot_indexed_std(stats_df, ax1, ax2)
    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)
    fig.tight_layout()
    plt.show()
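
# For reference, each clen step generates and submits a shell script of roughly this form
# (sketch for clen = 0.117; the actual sample list and cell file come from the parameters below):
#
#   #!/bin/sh
#   module purge
#   module load crystfel/0.10.2
#   indexamajig -i <sample_h5_file> \
#    --output=0.117.stream \
#    --geometry=0.117.geom \
#    --pdb=<cell_file> \
#    --indexing=xgandalf-latt-cell --peaks=peakfinder8 \
#    ...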

# location to which the data from coarse and fine scans will be saved
# !! the scan_name must match the final folder name for the respective coarse or fine scans !!
top_dir = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement"
scan_name_coarse = "coarse"
scan_name_fine = "fine"
top_dir_coarse = "{0}/{1}".format( top_dir, scan_name_coarse )
top_dir_fine = "{0}/{1}".format( top_dir, scan_name_fine )

# General parameters for the scans
lst = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/acq0001.JF17T16V01.dark.lst"
lab6_geom_file = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/8M_p-op_c-op_p20590.geom"
centre_clen = 0.122  # in m
cell_file = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/hewl.cell"
username = "beale_j"
# note that the timer only checks if the user has ANY jobs running,
# so the user should ONLY be running the jobs related to this script on the cluster
# to avoid a very long wait

# stepping parameters for coarse and fine scan (generally not to be changed)
sample = 500
steps_coarse = 20
step_size_coarse = 0.0005  # m
steps_fine = 50
step_size_fine = 0.00005  # m

# Calling the functions
main_coarse( lst, sample, lab6_geom_file, centre_clen, cell_file, steps_coarse, scan_name_coarse, step_size_coarse )
scrub_main_coarse( top_dir_coarse )
main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine )
scrub_main_fine( top_dir_fine )
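
# Example invocation (a sketch; the script file name is hypothetical and the paths/username
# above must match your own setup):
#   $ python3 clen_scan.py
# The coarse scan is submitted and analysed first; the fine scan is then run around the same
# centre_clen, and scrub_main_fine prints the suggested clen at the end.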