From 7ea6c3c302a89e8dbb99818fffc40b2ebb81fa49 Mon Sep 17 00:00:00 2001
From: Beale John Henry
Date: Wed, 26 Apr 2023 14:54:14 +0200
Subject: [PATCH] complete detector refinement script by Tom

---
 clen_tools/detector-distance-refinement.py | 483 +++++++++++++++++++++
 1 file changed, 483 insertions(+)
 create mode 100644 clen_tools/detector-distance-refinement.py

diff --git a/clen_tools/detector-distance-refinement.py b/clen_tools/detector-distance-refinement.py
new file mode 100644
index 0000000..f2280b2
--- /dev/null
+++ b/clen_tools/detector-distance-refinement.py
@@ -0,0 +1,483 @@
+#!/usr/bin/env python3
+
+# authors T. Mason and J. Beale
+
+
+# modules
+import pandas as pd
+import subprocess
+import os, errno
+import regex as re   # third-party 'regex' package, used here as a drop-in for the standard re module
+import numpy as np
+import matplotlib.pyplot as plt
+import time
+
+def h5_sample( lst, sample ):
+
+    # create a sample of images from the run
+    # read h5.lst - note - removes // from the image column
+    cols = [ "h5", "image" ]
+    sample_df = pd.read_csv( lst, sep=r"\s//", engine="python", names=cols )
+
+    # take the defined sample
+    sample_df = sample_df.sample( sample )
+
+    # sort list
+    sample_df = sample_df.sort_index()
+
+    # re-add // to the image column
+    sample_df[ "image" ] = "//" + sample_df.image.astype(str)
+
+    # write sample to file
+    sample_file = "h5_{0}_sample.lst".format( sample )
+    sample_df.to_csv( sample_file, sep=" ", index=False, header=False )
+
+    # return sample file name
+    return sample_file
+
+def geom_amend( lab6_geom_file, clen ):
+
+    # read lab6 geom
+    lab6_geom = open( lab6_geom_file, "r" )
+
+    # use regex to find the clen line and replace it with the new value
+    # clen example => clen = 0.1217
+    clen_geom = re.sub( r"clen = 0\.\d+", "clen = {0}".format( clen ), lab6_geom.read() )
+
+    # close lab6 geom file
+    lab6_geom.close()
+
+    # write new clen_geom to file
+    clen_geom_file = "{0}.geom".format( clen )
+    geom = open( clen_geom_file, "w" )
+    geom.write( clen_geom )
+    geom.close()
+
+    # return clen_geom file name
+    return clen_geom_file
+
+def write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file ):
+
+    # crystfel run file name
+    cryst_run_file = "{0}_cryst_run.sh".format( clen )
+
+    # write file
+    run_sh = open( cryst_run_file, "w" )
+    run_sh.write( "#!/bin/sh\n\n" )
+    run_sh.write( "module purge\n" )
+    run_sh.write( "module load crystfel/0.10.2\n" )
+    run_sh.write( "indexamajig -i {0} \\\n".format( sample_h5_file ) )
+    run_sh.write( "    --output={0}.stream \\\n".format( clen ) )
+    run_sh.write( "    --geometry={0} \\\n".format( clen_geom_file ) )
+    run_sh.write( "    --pdb={0} \\\n".format( cell_file ) )
+    run_sh.write( "    --indexing=xgandalf-latt-cell --peaks=peakfinder8 \\\n" )
+    run_sh.write( "    --integration=rings-grad --tolerance=10.0,10.0,10.0,2,3,2 --threshold=10 --min-snr=5 --int-radius=2,3,6 \\\n" )
+    run_sh.write( "    -j 36 --no-multi --no-retry --check-peaks --max-res=3000 --min-pix-count=1 --local-bg-radius=4 --min-res=85\n\n" )
+    run_sh.close()
+
+    # make the file executable
+    subprocess.call( [ "chmod", "+x", "{0}".format( cryst_run_file ) ] )
+
+    # return crystfel run file name
+    return cryst_run_file
+
+def make_sample(lst, sample):
+
+    # set current working directory
+    os.chdir("/sf/cristallina/data/p20590/work/process/jhb/detector_refinement")
+    cwd = os.getcwd()
+
+    # make sample list
+    print("making a sample of {0} images".format(sample))
+    sample_h5 = h5_sample(lst, sample)
+    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
+    print("done")
+    return cwd, sample_h5_file
+
+def make_process_dir(proc_dir):
+    # make the process directory if it does not already exist
+    try:
+        os.makedirs( proc_dir )
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+def make_step_range(centre_clen, step_size, steps):
+    # make an array of clen steps above and below the central clen
+    print( "make clen array around {0}".format( centre_clen ) )
+    step_range = step_size*steps
+    bottom_step = centre_clen-step_range/2
+    top_step = bottom_step+step_range
+    step_range = np.arange( bottom_step, top_step, step_size )
+    step_range = step_range.round( 6 ) # important - otherwise np sometimes gives 0.99999999... instead of the intended value
+    print( "done" )
+
+    return step_range
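+
+# For orientation, a rough illustration (not executed by the script) of the grid
+# that make_step_range() produces with the coarse-scan defaults defined at the
+# bottom of this file (centre_clen=0.122, step_size=0.0005, steps=20): values
+# from roughly 0.117 m up to, but not including, 0.127 m in 0.0005 m steps,
+# i.e. 0.117, 0.1175, 0.118, ..., 0.1265.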
+
+def check_job_status(username):
+    # wait for the submitted jobs to complete
+    jobs_completed = False
+    while not jobs_completed:
+        # get the status of the user's jobs using "squeue"
+        result = subprocess.run(['squeue', '--user', username], stdout=subprocess.PIPE)
+        output = result.stdout.decode('utf-8')
+
+        # check if there are no jobs running for the user
+        if username not in output:
+            jobs_completed = True
+        else:
+            # sleep for some time and check again
+            print("waiting for jobs to finish")
+            time.sleep(30) # sleep for 30 seconds
+
+    print("All jobs completed.")
+
+def scrub_clen( stream_pwd ):
+
+    # get the clen from the stream path
+    # example - /sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan/0.115/0.115.stream
+    # scrub the clen and return it as a float - otherwise return nan
+    try:
+        pattern = r"0\.\d+/(0\.\d+)\.stream"
+        re_search = re.search( pattern, stream_pwd )
+        clen = re_search.group( 1 )
+        return float( clen )
+    except AttributeError:
+        # re.search returned None - no clen found in the path
+        return np.nan
+
+def find_streams( top_dir ):
+
+    # create df for streams
+    stream_df = pd.DataFrame()
+
+    # search for all files that end with .stream
+    for path, dirs, files in os.walk( top_dir ):
+        for name in files:
+            if name.endswith( ".stream" ):
+
+                # get stream pwd
+                stream_pwd = os.path.join( path, name )
+
+                # scrub clen from stream
+                clen = scrub_clen( stream_pwd )
+
+                # put clen and stream pwd into df
+                data = [ { "stream_pwd" : stream_pwd,
+                           "clen" : clen
+                         } ]
+                stream_df_1 = pd.DataFrame( data )
+                stream_df = pd.concat( ( stream_df, stream_df_1 ) )
+
+    # sort df based on clen
+    stream_df = stream_df.sort_values( by="clen" )
+
+    # reset df index
+    stream_df = stream_df.reset_index( drop=True )
+
+    # return df of streams and clens
+    return stream_df
+
+def scrub_us( stream ):
+
+    # get unit cell values from the stream file
+    # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
+    # returns a list of (a, b, c, alpha, beta, gamma) tuples - empty if none are found
+    pattern = r"Cell\sparameters\s(\d\.\d+)\s(\d\.\d+)\s(\d\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
+    cells = re.findall( pattern, stream )
+    return cells
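+
+# A hedged illustration (not executed by the script) of the two scrubbing
+# helpers above, using the example strings quoted in their own comments:
+#
+#   scrub_clen( ".../coarse_scan/0.115/0.115.stream" )
+#       -> 0.115
+#   scrub_us( "Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg" )
+#       -> [('7.71784', '7.78870', '3.75250', '90.19135', '90.77553', '90.19243')]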
+
+def scrub_helper(top_dir):
+    # find stream files from the process directory
+    print( "finding stream files" )
+    stream_df = find_streams( top_dir )
+    print( "done" )
+
+    # make a results df for the unit cell and indexing statistics
+    stats_df = pd.DataFrame()
+
+    # loop through stream files and collect unit cell information
+    print( "looping through stream files to collect unit cell, indexed information" )
+    for index, row in stream_df.iterrows():
+
+        stream_pwd, clen = row[ "stream_pwd" ], row[ "clen" ]
+
+        # open stream file
+        print( "scrubbing stream for clen={0}".format( clen ) )
+        stream = open( stream_pwd, "r" ).read()
+
+        # scrub unit cell information
+        cells = scrub_us( stream )
+
+        # put cells in df
+        cols = [ "a", "b", "c", "alpha", "beta", "gamma" ]
+        cells_df = pd.DataFrame( cells, columns=cols )
+        cells_df = cells_df.astype( float )
+
+        # calc stats
+        indexed = len( cells_df )
+        std_a = cells_df.a.std()
+        std_b = cells_df.b.std()
+        std_c = cells_df.c.std()
+        std_alpha = cells_df.alpha.std()
+        std_beta = cells_df.beta.std()
+        std_gamma = cells_df.gamma.std()
+
+        # put stats in results df
+        stats = [ { "clen" : clen,
+                    "indexed" : indexed,
+                    "std_a" : std_a,
+                    "std_b" : std_b,
+                    "std_c" : std_c,
+                    "std_alpha" : std_alpha,
+                    "std_beta" : std_beta,
+                    "std_gamma" : std_gamma,
+                  } ]
+        stats_df_1 = pd.DataFrame( stats )
+        stats_df = pd.concat( ( stats_df, stats_df_1 ) )
+
+    print( "done" )
+
+    # reset index
+    stats_df = stats_df.reset_index( drop=True )
+
+    return stats_df
+
+def find_clen_values(stats_df):
+
+    def find_min_clen(col_name):
+        min_val = stats_df[col_name].min()
+        min_row = stats_df[stats_df[col_name] == min_val]
+        min_clen = min_row['clen'].values[0]
+        return min_val, min_clen
+
+    min_alpha_val, min_alpha_clen = find_min_clen('std_alpha')
+    min_beta_val, min_beta_clen = find_min_clen('std_beta')
+    min_gamma_val, min_gamma_clen = find_min_clen('std_gamma')
+    min_c_val, min_c_clen = find_min_clen('std_c')
+
+    print(f"The value of clen for the minimum alpha std of {min_alpha_val} is {min_alpha_clen}")
+    print(f"The value of clen for the minimum beta std of {min_beta_val} is {min_beta_clen}")
+    print(f"The value of clen for the minimum gamma std of {min_gamma_val} is {min_gamma_clen}")
+    print(f"The value of clen for the minimum c std of {min_c_val} is {min_c_clen}")
+
+    return min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val
+
+def plot_indexed_std(stats_df, ax1, ax2):
+    # indexed images plot
+    color = "tab:red"
+    ax1.set_xlabel("clen")
+    ax1.set_ylabel("indexed", color=color)
+    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
+    ax1.tick_params(axis="y", labelcolor=color)
+
+    # label color for the twinned axis
+    color = "tab:blue"
+    ax2.set_ylabel("a,b,c st.deviation", color=color)
+    ax2.tick_params(axis='y', labelcolor=color)
+
+    # std_a plot
+    color = "lightsteelblue"
+    ax2.plot(stats_df.clen, stats_df.std_a, color=color)
+
+    # std_b plot
+    color = "cornflowerblue"
+    ax2.plot(stats_df.clen, stats_df.std_b, color=color)
+
+    # std_c plot
+    color = "royalblue"
+    ax2.plot(stats_df.clen, stats_df.std_c, color=color)
+
+def plot_indexed_std_alpha_beta_gamma(stats_df, ax1, ax2):
+    # indexed images plot
+    color = "tab:red"
+    ax1.set_xlabel("clen")
+    ax1.set_ylabel("indexed", color=color)
+    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
+    ax1.tick_params(axis="y", labelcolor=color)
+
+    # label color for the twinned axis
+    color = "tab:green"
+    ax2.set_ylabel("alpha, beta, gamma st.deviation", color=color)
+    ax2.tick_params(axis='y', labelcolor=color)
+
+    # std_alpha plot
+    color = "limegreen"
+    ax2.plot(stats_df.clen, stats_df.std_alpha, color=color)
+
+    # std_beta plot
+    color = "darkgreen"
+    ax2.plot(stats_df.clen, stats_df.std_beta, color=color)
+
+    # std_gamma plot
+    color = "green"
+    ax2.plot(stats_df.clen, stats_df.std_gamma, color=color)
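+
+# For reference, a minimal sketch of how the two plotting helpers above are
+# combined in scrub_main_coarse()/scrub_main_fine() further down: two subplots
+# side by side, each with a twinned y-axis for the standard deviations:
+#
+#   fig, (ax1, ax3) = plt.subplots(1, 2)
+#   ax2 = ax1.twinx()
+#   ax4 = ax3.twinx()
+#   plot_indexed_std(stats_df, ax1, ax2)
+#   plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)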
+
+def main_coarse( lst, sample, lab6_geom_file, centre_clen, cell_file, steps_coarse, scan_name_coarse, step_size_coarse ):
+
+    # make sample list
+    cwd, sample_h5_file = make_sample(lst, sample)
+
+    # make list of clen steps above and below the central clen
+    step_range = make_step_range(centre_clen, step_size_coarse, steps_coarse)
+
+    # make directories for results
+    print( "begin CrystFEL analysis of different clens" )
+
+    # loop to cycle through clen steps
+    for clen in step_range:
+
+        # move back to cwd
+        os.chdir( cwd )
+
+        print( "processing clen = {0}".format( clen ) )
+        # define process directory
+        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name_coarse, clen )
+
+        # make process directory
+        make_process_dir(proc_dir)
+
+        # move to process directory
+        os.chdir( proc_dir )
+
+        # make geom file
+        clen_geom_file = geom_amend( lab6_geom_file, clen )
+
+        # make crystfel run file
+        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file )
+
+        # run crystfel file
+        subprocess.call( [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", "./{0}".format( cryst_run_file ) ] )
+    print( "done" )
+
+    # wait for jobs to complete - username is defined at module level below
+    check_job_status(username)
+
+def main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine ):
+
+    # set current working directory
+    os.chdir("/sf/cristallina/data/p20590/work/process/jhb/detector_refinement")
+    cwd = os.getcwd()
+
+    # define the sample_h5_file location for this function
+    # note: this reuses the sample list written during the coarse scan and relies
+    # on the module-level "sample" variable defined below
+    sample_h5 = "h5_{0}_sample.lst".format(sample)
+    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
+
+    # make list of clen steps above and below the central clen
+    step_range = make_step_range(centre_clen, step_size_fine, steps_fine)
+
+    # make directories for results
+    print( "begin CrystFEL analysis of different clens" )
+
+    # loop to cycle through clen steps
+    for clen in step_range:
+
+        # move back to cwd
+        os.chdir( cwd )
+
+        print( "processing clen = {0}".format( clen ) )
+        # define process directory
+        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name_fine, clen )
+
+        # make process directory
+        make_process_dir(proc_dir)
+
+        # move to process directory
+        os.chdir( proc_dir )
+
+        # make geom file
+        clen_geom_file = geom_amend( lab6_geom_file, clen )
+
+        # make crystfel run file
+        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file )
+
+        # run crystfel file
+        subprocess.call( [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", "./{0}".format( cryst_run_file ) ] )
+    print( "done" )
+
+    # wait for jobs to complete - username is defined at module level below
+    check_job_status(username)
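+
+# For orientation, a rough sketch (paths abbreviated, not part of the script) of
+# the per-clen run script that write_crystfel_run() generates and that the two
+# main_* functions above submit via sbatch, here for a hypothetical clen of 0.122:
+#
+#   #!/bin/sh
+#
+#   module purge
+#   module load crystfel/0.10.2
+#   indexamajig -i <sample .lst> \
+#       --output=0.122.stream \
+#       --geometry=0.122.geom \
+#       --pdb=<cell file> \
+#       --indexing=xgandalf-latt-cell --peaks=peakfinder8 \
+#       ... (integration, threshold and resolution options as written above)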
+
+def scrub_main_coarse( top_dir_coarse ):
+
+    stats_df = scrub_helper(top_dir_coarse)
+
+    # print clen for the minimum alpha, beta, gamma and c std values
+    min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df)
+
+    # plot results
+    fig, (ax1, ax3) = plt.subplots(1, 2)
+    ax2 = ax1.twinx()
+    ax4 = ax3.twinx()
+
+    plot_indexed_std(stats_df, ax1, ax2)
+    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)
+
+    fig.tight_layout()
+    plt.show()
+
+def scrub_main_fine( top_dir_fine ):
+
+    stats_df = scrub_helper(top_dir_fine)
+
+    # print clen for the minimum alpha, beta, gamma and c std values
+    min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df)
+
+    # print the suggested clen - the mean of the clens that minimise the alpha, beta and gamma stds
+    suggested_clen = (min_alpha_clen + min_beta_clen + min_gamma_clen)/3
+    suggested_clen = round(suggested_clen, 4)
+    print("The suggested clen = {0}".format(suggested_clen))
+
+    # plot results
+    fig, (ax1, ax3) = plt.subplots(1, 2)
+    ax2 = ax1.twinx()
+    ax4 = ax3.twinx()
+
+    plot_indexed_std(stats_df, ax1, ax2)
+    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)
+
+    fig.tight_layout()
+    plt.show()
+
+
+# location to which the data from the coarse and fine scans will be saved
+# !! the scan_name must match the final folder name for the respective coarse or fine scan !!
+top_dir = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement"
+scan_name_coarse = "coarse"
+scan_name_fine = "fine"
+top_dir_coarse = "{0}/{1}".format( top_dir, scan_name_coarse )
+top_dir_fine = "{0}/{1}".format( top_dir, scan_name_fine )
+
+# general parameters for the scans
+lst = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/acq0001.JF17T16V01.dark.lst"
+lab6_geom_file = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/8M_p-op_c-op_p20590.geom"
+centre_clen = 0.122 # in m
+cell_file = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/hewl.cell"
+username = "beale_j" # note that the job-status check only looks at whether the user has ANY jobs running,
+                     # so the user should ONLY be running the jobs related to this script on the cluster
+                     # to avoid a very long wait
+
+# stepping parameters for the coarse and fine scans (generally not to be changed)
+sample = 500
+steps_coarse = 20
+step_size_coarse = 0.0005 # m
+steps_fine = 50
+step_size_fine = 0.00005 # m
+
+# calling the functions
+main_coarse( lst, sample, lab6_geom_file, centre_clen, cell_file, steps_coarse, scan_name_coarse, step_size_coarse )
+
+scrub_main_coarse( top_dir_coarse )
+
+main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine )
+
+scrub_main_fine( top_dir_fine )
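+
+# Hedged usage note (an assumption, not part of the original workflow): to repeat
+# only the fine scan around a clen read off the coarse-scan plots, one could
+# comment out the two coarse-scan calls above and adjust centre_clen first, e.g.
+#
+#   centre_clen = 0.1215   # hypothetical value taken from the coarse-scan plots
+#   main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine )
+#   scrub_main_fine( top_dir_fine )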