From 7ea6c3c302a89e8dbb99818fffc40b2ebb81fa49 Mon Sep 17 00:00:00 2001
From: Beale John Henry
Date: Wed, 26 Apr 2023 14:54:14 +0200
Subject: [PATCH] complete detector refinement script by Tom

---
 clen_tools/detector-distance-refinement.py | 483 +++++++++++++++++++++
 1 file changed, 483 insertions(+)
 create mode 100644 clen_tools/detector-distance-refinement.py

diff --git a/clen_tools/detector-distance-refinement.py b/clen_tools/detector-distance-refinement.py
new file mode 100644
index 0000000..f2280b2
--- /dev/null
+++ b/clen_tools/detector-distance-refinement.py
@@ -0,0 +1,483 @@
+#!/usr/bin/env python3
+
+# authors T. Mason and J. Beale
+
+
+# modules
+import pandas as pd
+import subprocess
+import os, errno
+import regex as re   # third-party 'regex' package, used here as a drop-in for the standard re module
+import numpy as np
+import matplotlib.pyplot as plt
+import time
+
+def h5_sample( lst, sample ):
+
+    # create a sample of images from the run
+    # read h5.lst - note - removes // from the image column
+    cols = [ "h5", "image" ]
+    sample_df = pd.read_csv( lst, sep=r"\s//", engine="python", names=cols )
+
+    # take the defined sample
+    sample_df = sample_df.sample( sample )
+
+    # sort list
+    sample_df = sample_df.sort_index()
+
+    # re-add // to the image column
+    sample_df[ "image" ] = "//" + sample_df.image.astype(str)
+
+    # write sample to file
+    sample_file = "h5_{0}_sample.lst".format( sample )
+    sample_df.to_csv( sample_file, sep=" ", index=False, header=False )
+
+    # return sample file name
+    return sample_file
+
+def geom_amend( lab6_geom_file, clen ):
+
+    # read lab6 geom
+    lab6_geom = open( lab6_geom_file, "r" )
+
+    # use regex to find the clen line and replace it with the new value
+    # clen example => clen = 0.1217
+    clen_geom = re.sub( r"clen = 0\.\d+", "clen = {0}".format( clen ), lab6_geom.read() )
+
+    # close lab6 geom file
+    lab6_geom.close()
+
+    # write new clen_geom to file
+    clen_geom_file = "{0}.geom".format( clen )
+    geom = open( clen_geom_file, "w" )
+    geom.write( clen_geom )
+    geom.close()
+
+    # return clen_geom file name
+    return clen_geom_file
+
+def write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file ):
+
+    # crystfel run file name
+    cryst_run_file = "{0}_cryst_run.sh".format( clen )
+
+    # write file
+    run_sh = open( cryst_run_file, "w" )
+    run_sh.write( "#!/bin/sh\n\n" )
+    run_sh.write( "module purge\n" )
+    run_sh.write( "module load crystfel/0.10.2\n" )
+    run_sh.write( "indexamajig -i {0} \\\n".format( sample_h5_file ) )
+    run_sh.write( "    --output={0}.stream \\\n".format( clen ) )
+    run_sh.write( "    --geometry={0} \\\n".format( clen_geom_file ) )
+    run_sh.write( "    --pdb={0} \\\n".format( cell_file ) )
+    run_sh.write( "    --indexing=xgandalf-latt-cell --peaks=peakfinder8 \\\n" )
+    run_sh.write( "    --integration=rings-grad --tolerance=10.0,10.0,10.0,2,3,2 --threshold=10 --min-snr=5 --int-radius=2,3,6 \\\n" )
+    run_sh.write( "    -j 36 --no-multi --no-retry --check-peaks --max-res=3000 --min-pix-count=1 --local-bg-radius=4 --min-res=85\n\n" )
+    run_sh.close()
+
+    # make the file executable
+    subprocess.call( [ "chmod", "+x", "{0}".format( cryst_run_file ) ] )
+
+    # return crystfel run file name
+    return cryst_run_file
+
+def make_sample(lst, sample):
+
+    # set current working directory
+    os.chdir("/sf/cristallina/data/p20590/work/process/jhb/detector_refinement")
+    cwd = os.getcwd()
+
+    # make sample list
+    print("making a sample of {0} images".format(sample))
+    sample_h5 = h5_sample(lst, sample)
+    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
+    print("done")
+    return cwd, sample_h5_file
+
+def make_process_dir(proc_dir):
+    # make the process directory if it does not already exist
+    try:
+        os.makedirs( proc_dir )
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+def make_step_range(centre_clen, step_size, steps):
+    # make an array of clen steps above and below the central clen
+    print( "make clen array around {0}".format( centre_clen ) )
+    step_range = step_size*steps
+    bottom_step = centre_clen-step_range/2
+    top_step = bottom_step+step_range
+    step_range = np.arange( bottom_step, top_step, step_size )
+    step_range = step_range.round( 6 ) # important - otherwise np sometimes gives 0.99999999... instead of the intended value
+    print( "done" )
+
+    return step_range
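+
+# For orientation, a rough illustration (not executed by the script) of the grid
+# that make_step_range() produces with the coarse-scan defaults defined at the
+# bottom of this file (centre_clen=0.122, step_size=0.0005, steps=20): values
+# from roughly 0.117 m up to, but not including, 0.127 m in 0.0005 m steps,
+# i.e. 0.117, 0.1175, 0.118, ..., 0.1265.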
+
+def check_job_status(username):
+    # wait for the submitted jobs to complete
+    jobs_completed = False
+    while not jobs_completed:
+        # get the status of the user's jobs using "squeue"
+        result = subprocess.run(['squeue', '--user', username], stdout=subprocess.PIPE)
+        output = result.stdout.decode('utf-8')
+
+        # check if there are no jobs running for the user
+        if username not in output:
+            jobs_completed = True
+        else:
+            # sleep for some time and check again
+            print("waiting for jobs to finish")
+            time.sleep(30) # sleep for 30 seconds
+
+    print("All jobs completed.")
+
+def scrub_clen( stream_pwd ):
+
+    # get the clen from the stream path
+    # example - /sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan/0.115/0.115.stream
+    # scrub the clen and return it as a float - otherwise return nan
+    try:
+        pattern = r"0\.\d+/(0\.\d+)\.stream"
+        re_search = re.search( pattern, stream_pwd )
+        clen = re_search.group( 1 )
+        return float( clen )
+    except AttributeError:
+        # re.search returned None - no clen found in the path
+        return np.nan
+
+def find_streams( top_dir ):
+
+    # create df for streams
+    stream_df = pd.DataFrame()
+
+    # search for all files that end with .stream
+    for path, dirs, files in os.walk( top_dir ):
+        for name in files:
+            if name.endswith( ".stream" ):
+
+                # get stream pwd
+                stream_pwd = os.path.join( path, name )
+
+                # scrub clen from stream
+                clen = scrub_clen( stream_pwd )
+
+                # put clen and stream pwd into df
+                data = [ { "stream_pwd" : stream_pwd,
+                           "clen" : clen
+                         } ]
+                stream_df_1 = pd.DataFrame( data )
+                stream_df = pd.concat( ( stream_df, stream_df_1 ) )
+
+    # sort df based on clen
+    stream_df = stream_df.sort_values( by="clen" )
+
+    # reset df index
+    stream_df = stream_df.reset_index( drop=True )
+
+    # return df of streams and clens
+    return stream_df
+
+def scrub_us( stream ):
+
+    # get unit cell values from the stream file
+    # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
+    # returns a list of (a, b, c, alpha, beta, gamma) tuples - empty if none are found
+    pattern = r"Cell\sparameters\s(\d\.\d+)\s(\d\.\d+)\s(\d\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
+    cells = re.findall( pattern, stream )
+    return cells
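+
+# A hedged illustration (not executed by the script) of the two scrubbing
+# helpers above, using the example strings quoted in their own comments:
+#
+#   scrub_clen( ".../coarse_scan/0.115/0.115.stream" )
+#       -> 0.115
+#   scrub_us( "Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg" )
+#       -> [('7.71784', '7.78870', '3.75250', '90.19135', '90.77553', '90.19243')]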
+
+def scrub_helper(top_dir):
+    # find stream files from the process directory
+    print( "finding stream files" )
+    stream_df = find_streams( top_dir )
+    print( "done" )
+
+    # make a results df for the unit cell and indexing statistics
+    stats_df = pd.DataFrame()
+
+    # loop through stream files and collect unit cell information
+    print( "looping through stream files to collect unit cell, indexed information" )
+    for index, row in stream_df.iterrows():
+
+        stream_pwd, clen = row[ "stream_pwd" ], row[ "clen" ]
+
+        # open stream file
+        print( "scrubbing stream for clen={0}".format( clen ) )
+        stream = open( stream_pwd, "r" ).read()
+
+        # scrub unit cell information
+        cells = scrub_us( stream )
+
+        # put cells in df
+        cols = [ "a", "b", "c", "alpha", "beta", "gamma" ]
+        cells_df = pd.DataFrame( cells, columns=cols )
+        cells_df = cells_df.astype( float )
+
+        # calc stats
+        indexed = len( cells_df )
+        std_a = cells_df.a.std()
+        std_b = cells_df.b.std()
+        std_c = cells_df.c.std()
+        std_alpha = cells_df.alpha.std()
+        std_beta = cells_df.beta.std()
+        std_gamma = cells_df.gamma.std()
+
+        # put stats in results df
+        stats = [ { "clen" : clen,
+                    "indexed" : indexed,
+                    "std_a" : std_a,
+                    "std_b" : std_b,
+                    "std_c" : std_c,
+                    "std_alpha" : std_alpha,
+                    "std_beta" : std_beta,
+                    "std_gamma" : std_gamma,
+                  } ]
+        stats_df_1 = pd.DataFrame( stats )
+        stats_df = pd.concat( ( stats_df, stats_df_1 ) )
+
+    print( "done" )
+
+    # reset index
+    stats_df = stats_df.reset_index( drop=True )
+
+    return stats_df
+
+def find_clen_values(stats_df):
+
+    def find_min_clen(col_name):
+        min_val = stats_df[col_name].min()
+        min_row = stats_df[stats_df[col_name] == min_val]
+        min_clen = min_row['clen'].values[0]
+        return min_val, min_clen
+
+    min_alpha_val, min_alpha_clen = find_min_clen('std_alpha')
+    min_beta_val, min_beta_clen = find_min_clen('std_beta')
+    min_gamma_val, min_gamma_clen = find_min_clen('std_gamma')
+    min_c_val, min_c_clen = find_min_clen('std_c')
+
+    print(f"The value of clen for the minimum alpha std of {min_alpha_val} is {min_alpha_clen}")
+    print(f"The value of clen for the minimum beta std of {min_beta_val} is {min_beta_clen}")
+    print(f"The value of clen for the minimum gamma std of {min_gamma_val} is {min_gamma_clen}")
+    print(f"The value of clen for the minimum c std of {min_c_val} is {min_c_clen}")
+
+    return min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val
+
+def plot_indexed_std(stats_df, ax1, ax2):
+    # indexed images plot
+    color = "tab:red"
+    ax1.set_xlabel("clen")
+    ax1.set_ylabel("indexed", color=color)
+    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
+    ax1.tick_params(axis="y", labelcolor=color)
+
+    # label color for the twinned axis
+    color = "tab:blue"
+    ax2.set_ylabel("a,b,c st.deviation", color=color)
+    ax2.tick_params(axis='y', labelcolor=color)
+
+    # std_a plot
+    color = "lightsteelblue"
+    ax2.plot(stats_df.clen, stats_df.std_a, color=color)
+
+    # std_b plot
+    color = "cornflowerblue"
+    ax2.plot(stats_df.clen, stats_df.std_b, color=color)
+
+    # std_c plot
+    color = "royalblue"
+    ax2.plot(stats_df.clen, stats_df.std_c, color=color)
+
+def plot_indexed_std_alpha_beta_gamma(stats_df, ax1, ax2):
+    # indexed images plot
+    color = "tab:red"
+    ax1.set_xlabel("clen")
+    ax1.set_ylabel("indexed", color=color)
+    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
+    ax1.tick_params(axis="y", labelcolor=color)
+
+    # label color for the twinned axis
+    color = "tab:green"
+    ax2.set_ylabel("alpha, beta, gamma st.deviation", color=color)
+    ax2.tick_params(axis='y', labelcolor=color)
+
+    # std_alpha plot
+    color = "limegreen"
+    ax2.plot(stats_df.clen, stats_df.std_alpha, color=color)
+
+    # std_beta plot
+    color = "darkgreen"
+    ax2.plot(stats_df.clen, stats_df.std_beta, color=color)
+
+    # std_gamma plot
+    color = "green"
+    ax2.plot(stats_df.clen, stats_df.std_gamma, color=color)
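+
+# For reference, a minimal sketch of how the two plotting helpers above are
+# combined in scrub_main_coarse()/scrub_main_fine() further down: two subplots
+# side by side, each with a twinned y-axis for the standard deviations:
+#
+#   fig, (ax1, ax3) = plt.subplots(1, 2)
+#   ax2 = ax1.twinx()
+#   ax4 = ax3.twinx()
+#   plot_indexed_std(stats_df, ax1, ax2)
+#   plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)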
+
+def main_coarse( lst, sample, lab6_geom_file, centre_clen, cell_file, steps_coarse, scan_name_coarse, step_size_coarse ):
+
+    # make sample list
+    cwd, sample_h5_file = make_sample(lst, sample)
+
+    # make list of clen steps above and below the central clen
+    step_range = make_step_range(centre_clen, step_size_coarse, steps_coarse)
+
+    # make directories for results
+    print( "begin CrystFEL analysis of different clens" )
+
+    # loop to cycle through clen steps
+    for clen in step_range:
+
+        # move back to cwd
+        os.chdir( cwd )
+
+        print( "processing clen = {0}".format( clen ) )
+        # define process directory
+        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name_coarse, clen )
+
+        # make process directory
+        make_process_dir(proc_dir)
+
+        # move to process directory
+        os.chdir( proc_dir )
+
+        # make geom file
+        clen_geom_file = geom_amend( lab6_geom_file, clen )
+
+        # make crystfel run file
+        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file )
+
+        # run crystfel file
+        subprocess.call( [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", "./{0}".format( cryst_run_file ) ] )
+    print( "done" )
+
+    # wait for jobs to complete - username is defined at module level below
+    check_job_status(username)
+
+def main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine ):
+
+    # set current working directory
+    os.chdir("/sf/cristallina/data/p20590/work/process/jhb/detector_refinement")
+    cwd = os.getcwd()
+
+    # define the sample_h5_file location for this function
+    # note: this reuses the sample list written during the coarse scan and relies
+    # on the module-level "sample" variable defined below
+    sample_h5 = "h5_{0}_sample.lst".format(sample)
+    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
+
+    # make list of clen steps above and below the central clen
+    step_range = make_step_range(centre_clen, step_size_fine, steps_fine)
+
+    # make directories for results
+    print( "begin CrystFEL analysis of different clens" )
+
+    # loop to cycle through clen steps
+    for clen in step_range:
+
+        # move back to cwd
+        os.chdir( cwd )
+
+        print( "processing clen = {0}".format( clen ) )
+        # define process directory
+        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name_fine, clen )
+
+        # make process directory
+        make_process_dir(proc_dir)
+
+        # move to process directory
+        os.chdir( proc_dir )
+
+        # make geom file
+        clen_geom_file = geom_amend( lab6_geom_file, clen )
+
+        # make crystfel run file
+        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file )
+
+        # run crystfel file
+        subprocess.call( [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", "./{0}".format( cryst_run_file ) ] )
+    print( "done" )
+
+    # wait for jobs to complete - username is defined at module level below
+    check_job_status(username)
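+
+# For orientation, a rough sketch (paths abbreviated, not part of the script) of
+# the per-clen run script that write_crystfel_run() generates and that the two
+# main_* functions above submit via sbatch, here for a hypothetical clen of 0.122:
+#
+#   #!/bin/sh
+#
+#   module purge
+#   module load crystfel/0.10.2
+#   indexamajig -i <sample .lst> \
+#       --output=0.122.stream \
+#       --geometry=0.122.geom \
+#       --pdb=<cell file> \
+#       --indexing=xgandalf-latt-cell --peaks=peakfinder8 \
+#       ... (integration, threshold and resolution options as written above)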
+
+def scrub_main_coarse( top_dir_coarse ):
+
+    stats_df = scrub_helper(top_dir_coarse)
+
+    # print clen for the minimum alpha, beta, gamma and c std values
+    min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df)
+
+    # plot results
+    fig, (ax1, ax3) = plt.subplots(1, 2)
+    ax2 = ax1.twinx()
+    ax4 = ax3.twinx()
+
+    plot_indexed_std(stats_df, ax1, ax2)
+    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)
+
+    fig.tight_layout()
+    plt.show()
+
+def scrub_main_fine( top_dir_fine ):
+
+    stats_df = scrub_helper(top_dir_fine)
+
+    # print clen for the minimum alpha, beta, gamma and c std values
+    min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df)
+
+    # print the suggested clen - the mean of the clens that minimise the alpha, beta and gamma stds
+    suggested_clen = (min_alpha_clen + min_beta_clen + min_gamma_clen)/3
+    suggested_clen = round(suggested_clen, 4)
+    print("The suggested clen = {0}".format(suggested_clen))
+
+    # plot results
+    fig, (ax1, ax3) = plt.subplots(1, 2)
+    ax2 = ax1.twinx()
+    ax4 = ax3.twinx()
+
+    plot_indexed_std(stats_df, ax1, ax2)
+    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)
+
+    fig.tight_layout()
+    plt.show()
+
+
+# location to which the data from the coarse and fine scans will be saved
+# !! the scan_name must match the final folder name for the respective coarse or fine scan !!
+top_dir = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement"
+scan_name_coarse = "coarse"
+scan_name_fine = "fine"
+top_dir_coarse = "{0}/{1}".format( top_dir, scan_name_coarse )
+top_dir_fine = "{0}/{1}".format( top_dir, scan_name_fine )
+
+# general parameters for the scans
+lst = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/acq0001.JF17T16V01.dark.lst"
+lab6_geom_file = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/8M_p-op_c-op_p20590.geom"
+centre_clen = 0.122 # in m
+cell_file = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/hewl.cell"
+username = "beale_j" # note that the job-status check only looks at whether the user has ANY jobs running,
+                     # so the user should ONLY be running the jobs related to this script on the cluster
+                     # to avoid a very long wait
+
+# stepping parameters for the coarse and fine scans (generally not to be changed)
+sample = 500
+steps_coarse = 20
+step_size_coarse = 0.0005 # m
+steps_fine = 50
+step_size_fine = 0.00005 # m
+
+# calling the functions
+main_coarse( lst, sample, lab6_geom_file, centre_clen, cell_file, steps_coarse, scan_name_coarse, step_size_coarse )
+
+scrub_main_coarse( top_dir_coarse )
+
+main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine )
+
+scrub_main_fine( top_dir_fine )
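+
+# Hedged usage note (an assumption, not part of the original workflow): to repeat
+# only the fine scan around a clen read off the coarse-scan plots, one could
+# comment out the two coarse-scan calls above and adjust centre_clen first, e.g.
+#
+#   centre_clen = 0.1215   # hypothetical value taken from the coarse-scan plots
+#   main_fine( lst, lab6_geom_file, centre_clen, cell_file, steps_fine, scan_name_fine, step_size_fine )
+#   scrub_main_fine( top_dir_fine )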