#!/usr/bin/python # author J.Beale """ # aim to process a batch of data very fast by splitting it into a number of chunks and submitting these jobs separately to the cluster # usage python crystfel_split.py -l -k -g -c -n # crystfel parameter may need some editing in the function - write_crystfel_run # output a series of stream files from crystfel in the current working directory """ # modules import pandas as pd import subprocess import os, errno import time import argparse from tqdm import tqdm import regex as re import numpy as np from loguru import logger def count_chunks( stream ): # get number of chunks # example - ----- Begin chunk ----- # count them try: pattern = r"-----\sBegin\schunk\s-----" chunks = re.findall( pattern, stream ) if AttributeError: return len( chunks ) except AttributeError: logger.debug( "count_chunks error" ) return np.nan def scrub_cells( stream ): # get uc values from stream file # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg # scrub clen and return - else nan try: pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg" cell_lst = re.findall( pattern, stream ) xtals = len( cell_lst ) if AttributeError: return cell_lst, xtals except AttributeError: logger.debug( "scrub_cells error" ) return np.nan def scrub_res( stream ): # get diffraction limit # example - diffraction_resolution_limit = 4.07 nm^-1 or 2.46 A # scrub res_lst or return np.nan try: pattern = r"diffraction_resolution_limit\s=\s\d\.\d+\snm\^-1\sor\s(\d\.\d+)\sA" res_lst = re.findall( pattern, stream ) if AttributeError: return res_lst except AttributeError: logger.debug( "scrub_res error" ) return np.nan def scrub_obs( stream ): # get number of reflections # example - num_reflections = 308 # scrub reflections or return np.nan try: pattern = r"num_reflections\s=\s(\d+)" obs_lst = re.findall( pattern, stream ) if AttributeError: return obs_lst except AttributeError: logger.debug( "scrub_obs error" ) return np.nan def calculate_stats( stream_pwd ): # open stream file stream = open( stream_pwd, "r" ).read() # get total number chunks chunks = count_chunks( stream ) # get list of cells cell_lst, xtals = scrub_cells( stream ) # get list of cells res_lst = scrub_res( stream ) # get list of cells obs_lst = scrub_obs( stream ) # res_df cols = [ "a", "b", "c", "alpha", "beta", "gamma" ] df = pd.DataFrame( cell_lst, columns=cols ) df[ "resolution" ] = res_lst df[ "obs" ] = obs_lst # convert all to floats df = df.astype(float) return df, xtals, chunks def h5_split( lst, chunk_size ): # read h5.lst - note - removes // from image column # scrub file name lst_name = os.path.basename( lst ) cols = [ "h5", "image" ] df = pd.read_csv( lst, sep="\s//", engine="python", names=cols ) # re-add // to image columm and drop other columns df[ "h5_path" ] = df.h5 + " //" + df.image.astype( str ) df = df[ [ "h5_path" ] ] # split df into a lst list_df = [df[i:i + chunk_size] for i in range( 0, len(df), chunk_size)] return list_df def write_crystfel_run( proc_dir, name, chunk, chunk_lst_file, geom_file, cell_file ): """ crystfel run file - spot-finding and indexing parameters may need some editing only change from inside the quote ("") """ # stream file name stream_file = "{0}_{1}.stream".format( name, chunk ) # crystfel file name cryst_run_file = "{0}/{1}_{2}.sh".format( proc_dir, name, chunk ) # write file run_sh = open( cryst_run_file, "w" ) run_sh.write( "#!/bin/sh\n\n" ) run_sh.write( "module purge\n" ) run_sh.write( "module load crystfel/0.10.2\n" ) run_sh.write( "indexamajig -i {0} \\\n".format( chunk_lst_file ) ) run_sh.write( " --output={0} \\\n".format( stream_file ) ) run_sh.write( " --geometry={0} \\\n".format( geom_file ) ) run_sh.write( " --pdb={0} \\\n".format( cell_file ) ) run_sh.write( " --indexing=xgandalf-latt-cell \\\n" ) run_sh.write( " --peaks=peakfinder8 \\\n" ) run_sh.write( " --integration=rings-nograd \\\n" ) run_sh.write( " --tolerance=10.0,10.0,10.0,2,3,2 \\\n" ) run_sh.write( " --threshold=10 \\\n" ) run_sh.write( " --min-snr=5 \\\n" ) run_sh.write( " --int-radius=5,7,9 \\\n" ) run_sh.write( " -j 32 \\\n" ) run_sh.write( " --multi \\\n" ) run_sh.write( " --check-peaks \\\n" ) run_sh.write( " --retry \\\n" ) run_sh.write( " --max-res=3000 \\\n" ) run_sh.write( " --min-pix-count=2 \\\n" ) run_sh.write( " --local-bg-radius=4 \\\n" ) run_sh.write( " --min-res=85" ) run_sh.close() # make file executable subprocess.call( [ "chmod", "+x", "{0}".format( cryst_run_file ) ] ) # return crystfel file name return cryst_run_file, stream_file def make_process_dir( proc_dir ): # make process directory try: os.makedirs( proc_dir ) except OSError as e: if e.errno != errno.EEXIST: logger.debug( "making directory error" ) raise def submit_job( job_file ): # submit the job submit_cmd = ["sbatch", "--cpus-per-task=32", "--" ,job_file] job_output = subprocess.check_output(submit_cmd) # scrub job id from - example Submitted batch job 742403 pattern = r"Submitted batch job (\d+)" job_id = re.search( pattern, job_output.decode().strip() ).group(1) return int( job_id ) def wait_for_jobs( job_ids, total_jobs ): with tqdm(total=total_jobs, desc="Jobs Completed", unit="job") as pbar: while job_ids: completed_jobs = set() for job_id in job_ids: status_cmd = ["squeue", "-h", "-j", str(job_id)] status = subprocess.check_output(status_cmd) if not status: completed_jobs.add(job_id) pbar.update(1) job_ids.difference_update(completed_jobs) time.sleep(5) def run_splits( list_df, cwd, name, lst, chunk_size, geom_file, cell_file ): # set chunk counter chunk = 0 # submitted job set submitted_job_ids = set() # stream file list stream_lst = [] for chunk_lst in list_df: logger.info( "chunk {0} = {1} images".format( chunk, len( chunk_lst ) ) ) # define process directory proc_dir = "{0}/{1}/{1}_{2}".format( cwd, name, chunk ) # make process directory make_process_dir(proc_dir) # move to process directory os.chdir( proc_dir ) # write list to file chunk_lst_file = "{0}/{1}_{2}.lst".format( proc_dir, name, chunk ) chunk_lst.to_csv( chunk_lst_file, index=False, header=False ) # write crystfel file and append path to list cryst_run_file, stream_file = write_crystfel_run( proc_dir, name, chunk, chunk_lst_file, geom_file, cell_file ) stream_lst.append( "{0}/{1}".format( proc_dir, stream_file ) ) # submit jobs job_id = submit_job( cryst_run_file ) logger.info( f"Job submitted: { job_id }" ) submitted_job_ids.add( job_id ) # increase chunk counter chunk = chunk +1 # move back to top dir os.chdir( cwd ) return submitted_job_ids, chunk, stream_lst def main( cwd, name, lst, chunk_size, geom_file, cell_file ): print( "reading SwissFEL lst file" ) print( "creating {0} image chunks of lst".format( chunk_size ) ) list_df = h5_split( lst, chunk_size ) print( "DONE" ) # run crystfel runs on individual splits submitted_job_ids, chunk, stream_lst = run_splits( list_df, cwd, name, lst, chunk_size, geom_file, cell_file ) # monitor progress of jobs wait_for_jobs(submitted_job_ids, chunk) # make composite .stream file output_file = "{0}.stream".format( name ) try: # Open the output file in 'append' mode with open(output_file, "a") as output: for file_name in stream_lst: try: with open(file_name, "r") as input_file: # Read the contents of the input file and append to the output file output.write(input_file.read()) logger.info( f"Appended contents from {file_name} to {output_file}") except FileNotFoundError: logger.debug(f"File {file_name} not found. Skipping.") except IOError as e: logger.debug(f"An error occurred while appending files: {e}") df, xtals, chunks = calculate_stats( output_file ) # stats index_rate = round( xtals/chunks*100, 2 ) mean_res, std_res = round( df.resolution.mean(), 2 ), round( df.resolution.std(), 2 ) median_res = df.resolution.median() mean_obs, std_obs = round( df.obs.mean(), 2 ), round( df.obs.std(), 2) mean_a, std_a = round( df.a.mean()*10, 2 ), round( df.a.std()*10, 2 ) mean_b, std_b = round( df.b.mean()*10, 2 ), round( df.b.std()*10, 2 ) mean_c, std_c = round( df.c.mean()*10, 2 ), round( df.c.std()*10, 2 ) mean_alpha, std_alpha = round( df.alpha.mean(), 2 ), round( df.alpha.std(), 2 ) mean_beta, std_beta = round(df.beta.mean(), 2 ), round( df.beta.std(), 2 ) mean_gamma, std_gamma = round( df.gamma.mean(), 2 ), round( df.gamma.std(), 2 ) logger.info( "image = {0}".format( chunks ) ) logger.info( "crystals = {0}".format( xtals ) ) logger.info( "indexing rate = {0} %".format( index_rate ) ) logger.info( "mean resolution = {0} +/- {1} A".format( mean_res, std_res ) ) logger.info( "median resolution = {0} A".format( median_res ) ) logger.info( "mean observations = {0} +/- {1}".format( mean_obs, std_obs ) ) logger.info( "mean a = {0} +/- {1} A".format( mean_a, std_a ) ) logger.info( "mean b = {0} +/- {1} A".format( mean_b, std_b ) ) logger.info( "mean c = {0} +/- {1} A".format( mean_c, std_c ) ) logger.info( "mean alpha = {0} +/- {1} A".format( mean_alpha, std_alpha ) ) logger.info( "mean beta = {0} +/- {1} A".format( mean_beta, std_beta ) ) logger.info( "mean gamma = {0} +/- {1} A".format( mean_gamma, std_gamma ) ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-l", "--lst_file", help="file from SwissFEL output to be processed quickly", type=os.path.abspath ) parser.add_argument( "-k", "--chunk_size", help="how big should each chunk be? - the bigger the chunk, the fewer jobs, the slower it will be", type=int, default=500 ) parser.add_argument( "-g", "--geom_file", help="path to geom file to be used in the refinement", type=os.path.abspath ) parser.add_argument( "-c", "--cell_file", help="path to cell file of the crystals used in the refinement", type=os.path.abspath ) parser.add_argument( "-n", "--job_name", help="the name of the job to be done", type=str, default="split" ) parser.add_argument( "-d", "--debug", help="output debug to terminal", type=bool, default=False ) args = parser.parse_args() # set current working directory cwd = os.getcwd() # set loguru if not args.debug: logger.remove() logfile = "{0}.log".format( args.job_name ) logger.add( logfile, format="{message}", level="INFO") # log geometry file geom = open( args.geom_file, "r" ).read() logger.info( geom ) main( cwd, args.job_name, args.lst_file, args.chunk_size, args.geom_file, args.cell_file )