From 9ffee8ff7d3b2641cb21c0f114538900e2b06a52 Mon Sep 17 00:00:00 2001
From: Beale John Henry
Date: Thu, 1 Feb 2024 13:29:49 +0100
Subject: [PATCH] now works with improved stats and does not overwrite

---
 reduction_tools/push_res_scan.py | 156 ++++++++++++++++++++-----------
 1 file changed, 99 insertions(+), 57 deletions(-)

diff --git a/reduction_tools/push_res_scan.py b/reduction_tools/push_res_scan.py
index bb8dd66..c4da4bb 100644
--- a/reduction_tools/push_res_scan.py
+++ b/reduction_tools/push_res_scan.py
@@ -38,7 +38,7 @@ from scipy.optimize import curve_fit
 
 def submit_job( job_file ):
     # submit the job
-    submit_cmd = ["sbatch", "--cpus-per-task=32", "--" ,job_file]
+    submit_cmd = ["sbatch", "-p", "day", "--cpus-per-task=32", "--", job_file]
     job_output = subprocess.check_output(submit_cmd)
 
     # scrub job id from - example Submitted batch job 742403
@@ -59,7 +59,7 @@ def wait_for_jobs( job_ids, total_jobs ):
                 completed_jobs.add(job_id)
                 pbar.update(1)
             job_ids.difference_update(completed_jobs)
-            time.sleep(2)
+            time.sleep(5)
 
 
 def run_partialator( proc_dir, stream, pointgroup, model, iterations, push_res, cell, bins, part_h_res, flag ):
@@ -116,30 +116,33 @@ def run_push_res( cwd, name, stream, pointgroup, model, iterations, cell, bins,
     for res in push_res_list:
 
         push_res_dir = "{0}/{1}/push-res_{2}".format( cwd, name, res )
 
-        # make push-res directories
-        make_process_dir( push_res_dir )
+
+        # check to see if directories already made
+        if not os.path.exists( push_res_dir ):
 
-        # move to push-res directory
-        os.chdir( push_res_dir )
+            # make push-res directories
+            make_process_dir( push_res_dir )
 
-        # make partialator run file
-        part_run_file = run_partialator( push_res_dir, stream, pointgroup, model, iterations, res, cell, bins, part_h_res, flag="push-res" )
+            # move to push-res directory
+            os.chdir( push_res_dir )
 
-        # submit job
-        job_id = submit_job( part_run_file )
-        print( "push-res {0} job submitted: {1}".format( res, job_id ) )
-        submitted_job_ids.add( job_id )
+            # make partialator run file
+            part_run_file = run_partialator( push_res_dir, stream, pointgroup, model, iterations, res, cell, bins, part_h_res, flag="push-res" )
 
-        # move back to top dir
-        os.chdir( cwd )
-
-    print( "DONE" )
+            # submit job
+            job_id = submit_job( part_run_file )
+            print( "push-res {0} job submitted: {1}".format( res, job_id ) )
+            submitted_job_ids.add( job_id )
+
+            # move back to top dir
+            os.chdir( cwd )
 
     # use progress bar to track job completion
+    time.sleep(30)
     wait_for_jobs(submitted_job_ids, len(push_res_list) )
     print("slurm processing done")
 
 
-def summary_stats( cc_dat, ccstar_dat, mult_dat ):
+def summary_stats( cc_dat, ccstar_dat, mult_dat, rsplit_dat ):
 
     # read all files into pd
     # function to sort out different column names
@@ -153,21 +156,27 @@
         elif var == "mult":
             cols = [ "d(nm)", "nref", "poss", "comp", "obs", "mult", "snr", "I", "d", "min", "max" ]
+        elif var == "rsplit":
+            cols = [ "d(nm)", "rsplit", "nref", "d", "min", "max" ]
 
         df = pd.read_csv( dat, names=cols, skiprows=1, sep="\s+" )
 
+        return df
+
     # make df
     cc_df = read_dat( cc_dat, "cc" )
     ccstar_df = read_dat( ccstar_dat, "ccstar" )
     mult_df = read_dat( mult_dat, "mult" )
+    rsplit_df = read_dat( rsplit_dat, "rsplit" )
 
     # remove unwanted cols
     cc_df = cc_df[ [ "cc" ] ]
     ccstar_df = ccstar_df[ [ "ccstar" ] ]
+    rsplit_df = rsplit_df[ [ "rsplit" ] ]
 
     # merge dfs
-    stats_df = pd.concat( [ mult_df, cc_df, ccstar_df], axis=1, join="inner" )
+    stats_df = pd.concat( [ mult_df, cc_df, ccstar_df, rsplit_df ], axis=1, join="inner" )
 
     # make 1/d, 1/d^2 column
     stats_df[ "1_d" ] = 1 / stats_df.d
@@ -177,7 +186,7 @@
 
     stats_df = stats_df[ [ "1_d", "1_d2", "d", "min", "max", "nref", "poss", "comp", "obs", "mult",
-                           "snr", "I", "cc", "ccstar"] ]
+                           "snr", "I", "cc", "ccstar", "rsplit" ] ]
 
     # change nan to 0
     stats_df = stats_df.fillna(0)
@@ -208,30 +217,48 @@ def get_metric( d2_series, cc_series, cut_off ):
     return cc_stat
 
 
-def summary_cc_fig( stats_df ):
+def summary_fig( stats_df, plot_dir ):
 
     # plot results
-    cc_fig, (ax1, ax2) = plt.subplots(1, 2)
-    cc_fig.suptitle( "cc and cc* vs resolution" )
+    cc_fig, axs = plt.subplots(2, 2)
+    cc_fig.suptitle( "summary statistics vs resolution" )
 
-    # CC
+    # cc plot
     color = "tab:red"
-    ax1.set_xlabel( "1/d2 (1/A)" )
-    ax1.set_ylabel("CC" )
-    ax1.axhline(y = 0.3, color="black", linestyle = "dashed")
-    ax1.plot(stats_df[ "1_d" ], stats_df.cc, color=color)
-    ax1.tick_params(axis="y", labelcolor=color)
+    axs[0,0].set_xlabel( "1/d (1/A)" )
+    axs[0,0].set_ylabel( "CC" )
+    axs[0,0].set_ylim( 0, 1 )
+    axs[0,0].axhline(y = 0.3, color="black", linestyle = "dashed")
+    axs[0,0].plot(stats_df[ "1_d" ], stats_df.cc, color=color)
+    axs[0,0].tick_params(axis="y", labelcolor=color)
 
-    # CC*
+    # cc* plot
     color = "tab:blue"
-    ax2.set_xlabel( "1/d (1/A)" )
-    ax2.set_ylabel("CC*", color=color)
-    ax2.axhline(y = 0.7, color="black", linestyle = "dashed")
-    ax2.plot(stats_df[ "1_d" ], stats_df.ccstar, color=color)
-    ax2.tick_params(axis='y', labelcolor=color)
+    axs[0,1].set_xlabel( "1/d (1/A)" )
+    axs[0,1].set_ylabel( "CC*", color=color )
+    axs[0,1].set_ylim( 0, 1 )
+    axs[0,1].axhline(y = 0.7, color="black", linestyle = "dashed")
+    axs[0,1].plot(stats_df[ "1_d" ], stats_df.ccstar, color=color)
+    axs[0,1].tick_params(axis='y', labelcolor=color)
 
-    plot_file = "final_cc.png"
-    plt.savefig( plot_file )
+    # rsplit plot
+    color = "tab:green"
+    axs[1,0].set_xlabel( "1/d (1/A)" )
+    axs[1,0].set_ylabel( "Rsplit", color=color )
+    axs[1,0].plot(stats_df[ "1_d" ], stats_df.rsplit, color=color)
+    axs[1,0].tick_params(axis='y', labelcolor=color)
+
+    # multiplicity plot
+    color = "tab:purple"
+    axs[1,1].set_xlabel( "1/d (1/A)" )
+    axs[1,1].set_ylabel( "Multiplicity", color=color )
+    axs[1,1].plot(stats_df[ "1_d" ], stats_df.mult, color=color)
+    axs[1,1].tick_params(axis='y', labelcolor=color)
 
+    # save figure
+    plt.tight_layout()
+    plt.savefig( "{0}/plots.png".format( plot_dir ) )
 
 
 def cc_cut_fig( analysis_df, cc_push_res, ccstar_push_res, plot_pwd ):
@@ -267,9 +294,10 @@ def push_res_analysis( cwd, name ):
         cc_dat = "{0}/cc.dat".format( dir )
         ccstar_dat = "{0}/ccstar.dat".format( dir )
         mult_dat = "{0}/mult.dat".format( dir )
+        rsplit_dat = "{0}/Rsplit.dat".format( dir )
 
         # make summary data table
-        stats_df = summary_stats( cc_dat, ccstar_dat, mult_dat )
+        stats_df = summary_stats( cc_dat, ccstar_dat, mult_dat, rsplit_dat )
 
         # calculate cc metrics
         cc_cut = get_metric( stats_df[ "1_d2" ], stats_df.cc, 0.3 )
@@ -314,44 +342,55 @@
 def main( cwd, name, stream, pointgroup, model, iterations, cell, bins, part_h_res ):
 
     # run push-res scan
+    print( "begin push-res scan" )
     run_push_res( cwd, name, stream, pointgroup, model, iterations, cell, bins, part_h_res )
 
     # run analysis
     push_res, high_res = push_res_analysis( cwd, name )
+    print( "done" )
+    print( "best push-res = {0} and a resolution of {1}".format( push_res, high_res ) )
 
     # make final run file
    final_dir = "{0}/{1}/final".format( cwd, name )
-    make_process_dir( final_dir )
 
-    # move to push-res directory
-    os.chdir( final_dir )
+    print( "rerunning partialator with {0} A cut off".format( high_res ) )
+    # check to see if the final directory already exists
+    if not os.path.exists( final_dir ):
+
+        # make the final directory and move into it
+        make_process_dir( final_dir )
+        os.chdir( final_dir )
 
-    # re-run final push-res
-    submitted_job_ids = set()
-    final_file = run_partialator( final_dir, stream, pointgroup, model, iterations, push_res, cell, bins, part_h_res=high_res, flag="final" )
+        # re-run final push-res
+        submitted_job_ids = set()
+        final_file = run_partialator( final_dir, stream, pointgroup, model, iterations, push_res, cell, bins, part_h_res=high_res, flag="final" )
 
-    # submit job
-    job_id = submit_job( final_file )
-    print(f"final job submitted: {1}".format( job_id ) )
-    submitted_job_ids.add( job_id )
+        # submit job
+        job_id = submit_job( final_file )
+        print( "final job submitted: {0}".format( job_id ) )
+        submitted_job_ids.add( job_id )
 
-    # use progress bar to track job completion
-    wait_for_jobs(submitted_job_ids, 1 )
-    print("slurm processing done")
+        # use progress bar to track job completion
+        time.sleep(30)
+        wait_for_jobs(submitted_job_ids, 1 )
+        print("slurm processing done")
+
+    print( "done" )
 
     # stats files names
-    cc_dat = "cc.dat".format( dir )
-    ccstar_dat = "ccstar.dat".format( dir )
-    mult_dat = "mult.dat".format( dir )
+    cc_dat = "{0}/cc.dat".format( final_dir )
+    ccstar_dat = "{0}/ccstar.dat".format( final_dir )
+    mult_dat = "{0}/mult.dat".format( final_dir )
+    rsplit_dat = "{0}/Rsplit.dat".format( final_dir )
 
     # make summary data table
-    stats_df = summary_stats( cc_dat, ccstar_dat, mult_dat )
+    print( "calculating statistics" )
+    stats_df = summary_stats( cc_dat, ccstar_dat, mult_dat, rsplit_dat )
     print( stats_df.to_string() )
 
     print_df = stats_df[ [ "1_d", "d", "min", "max", "nref", "poss", "comp", "obs", "mult",
-                           "snr", "I", "cc", "ccstar"] ]
-    print_df.to_csv( "summary_table.csv", sep="\t", index=False )
+                           "snr", "I", "rsplit", "cc", "ccstar"] ]
+    print_df.to_csv( "{0}/summary_table.csv".format( final_dir ), sep="\t", index=False )
 
     # calculate cc metrics
     cc_cut = get_metric( stats_df[ "1_d2" ], stats_df.cc, 0.3 )
@@ -360,10 +399,11 @@ def main( cwd, name, stream, pointgroup, model, iterations, cell, bins, part_h_r
     print( "resolution at CC* at 0.7 = {0}".format( ccstar_cut ) )
 
     # show plots
-    summary_cc_fig( stats_df )
+    summary_fig( stats_df, final_dir )
 
     # move back to top dir
     os.chdir( cwd )
+    print( "done" )
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -378,13 +418,15 @@
     parser.add_argument(
         "-s",
         "--stream_file",
         help="path to stream file",
-        type=os.path.abspath
+        type=os.path.abspath,
+        required=True
     )
     parser.add_argument(
         "-p",
         "--pointgroup",
         help="pointgroup used by CrystFEL for partialator run",
-        type=str
+        type=str,
+        required=True
     )
     parser.add_argument(
         "-m",
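
Reviewer note on the cut-offs used by get_metric above: the dashed guide lines at CC = 0.3 and CC* = 0.7 are consistent with each other, since CC* = sqrt(2*CC / (1 + CC)) maps CC = 0.3 to a CC* of roughly 0.68. The patch does not show get_metric's body, so the sketch below is only an illustration of the kind of interpolation its inputs (d2_series, cc_series, cut_off) imply, not the script's actual implementation (which imports scipy's curve_fit and may fit a smooth curve instead); the helper name resolution_at_cutoff is hypothetical.

    # hypothetical sketch, not from this patch: resolution (in A) at which a
    # per-bin statistic crosses a cut-off, given the bins' 1/d^2 values;
    # assumes the statistic falls monotonically as 1/d^2 increases
    import numpy as np

    def resolution_at_cutoff( d2_series, stat_series, cut_off ):
        d2 = np.asarray( d2_series, dtype=float )
        stat = np.asarray( stat_series, dtype=float )

        # np.interp needs ascending x values, so interpolate on the
        # statistic after sorting it into ascending order
        order = np.argsort( stat )
        d2_at_cut = np.interp( cut_off, stat[order], d2[order] )

        # convert 1/d^2 back to a resolution d in Angstrom
        return 1.0 / np.sqrt( d2_at_cut )

    # example: four bins at 4.0, 3.0, 2.5 and 2.0 A with falling CC
    d = np.array( [ 4.0, 3.0, 2.5, 2.0 ] )
    print( resolution_at_cutoff( 1.0 / d**2, [ 0.95, 0.80, 0.45, 0.15 ], 0.3 ) )  # ~2.2 A

Linear interpolation in 1/d^2 is just the simplest defensible choice here; a curve_fit-based version would instead fit a sigmoid to the statistic and solve it for the cut-off, which is less sensitive to noisy bins.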