#!/usr/bin/python # author J.Beale """ # aim automatically index, integrate, scale/merge and output mtzs for individual SwissFEL runs combines, hopefully intelligently, indexamajig, partialator, f2mtz and truncate to produce complete mtz files # usage python auto_process.py # output in run dir - integrated .stream files + logs - scaled/merged files - an mtz file - useful plots - log file of output """ # modules import sys sys.path.insert( 1, "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/" ) import partialator as part import subprocess import os, errno import shutil import pandas as pd import numpy as np import glob import argparse import regex as re from loguru import logger def read_cell( cell_file ): # function to get cell parameter def get_parameter( line, parameter ): # general parameter search pattern = r"{0}\s=\s(\d+\.\d+)".format( parameter ) value = re.search( pattern, line ).group(1) return value # loop through file and get parameters cell = open( cell_file, "r" ) for line in cell: if line.startswith( "a " ): a = float( get_parameter( line, "a" ) ) if line.startswith( "b " ): b = float( get_parameter( line, "b" ) ) if line.startswith( "c " ): c = float( get_parameter( line, "c" ) ) if line.startswith( "al " ): alpha = float( get_parameter( line, "al" ) ) if line.startswith( "be " ): beta = float( get_parameter( line, "be" ) ) if line.startswith( "ga " ): gamma = float( get_parameter( line, "ga" ) ) cell = [ a, b, c, alpha, beta, gamma ] return cell def cat_lst_files( script_dir, run_no, endstation, pgroup, jfj, label, output ): cat_lst = "{0}/cat_lst.py".format( script_dir ) submit_cmd = [ "python", cat_lst, "{0}".format( run_no ), "--endstation={0}".format( endstation ), "--pgroup={0}".format( pgroup ), "--jfj={0}".format( jfj ), "--label={0}".format( label ), "--output={0}".format( output ), ] subprocess.run( submit_cmd ) def run_index( script_dir, name, lst_file, chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold, min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation ): indexamajig = "{0}/crystfel_split_var.py".format( script_dir ) if reservation: submit_cmd = [ "python", indexamajig, "--job_name={0}".format( name ), "--lst_file={0}".format( lst_file ), "--chunk_size={0}".format( chunk_size ), "--geom_file={0}".format( geom_file ), "--cell_file={0}".format( cell_file ), "--indexer={0}".format( indexer ), "--peakfinder={0}".format( peakfinder ), "--integrator={0}".format( integrator ), "--tolerance={0}".format( tolerance ), "--threshold={0}".format( threshold ), "--min_snr={0}".format( min_snr ), "--int_radius={0}".format( int_radius ), "--multi={0}".format( multi ), "--retry={0}".format( retry ), "--min_pix={0}".format( min_pix ), "--bg_rad={0}".format( bg_rad ), "--min_res={0}".format( min_res ), "--reservation={0}".format( reservation ) ] else: submit_cmd = [ "python", indexamajig, "--job_name={0}".format( name ), "--lst_file={0}".format( lst_file ), "--chunk_size={0}".format( chunk_size ), "--geom_file={0}".format( geom_file ), "--cell_file={0}".format( cell_file ), "--indexer={0}".format( indexer ), "--peakfinder={0}".format( peakfinder ), "--integrator={0}".format( integrator ), "--tolerance={0}".format( tolerance ), "--threshold={0}".format( threshold ), "--min_snr={0}".format( min_snr ), "--int_radius={0}".format( int_radius ), "--multi={0}".format( multi ), "--retry={0}".format( retry ), "--min_pix={0}".format( min_pix ), "--bg_rad={0}".format( bg_rad ), "--min_res={0}".format( min_res ) ] subprocess.run( submit_cmd ) # make stream name stream_file = "{0}.stream".format( name ) return stream_file def run_partialator( script_dir, name, stream_file, pointgroup, model, iterations, cell_file, bins, resolution, max_adu, reservation ): partialator = "{0}/partialator.py".format( script_dir, ) if reservation: submit_cmd = [ "python", partialator, "--name={0}".format( name ), "--stream_file={0}".format( stream_file ), "--pointgroup={0}".format( pointgroup ), "--model={0}".format( model ), "--iterations={0}".format( iterations ), "--cell_file={0}".format( cell_file ), "--bins={0}".format( bins ), "--resolution={0}".format( resolution ), "--max_adu={0}".format( max_adu ), "--reservation={0}".format( reservation ) ] else: submit_cmd = [ "python", partialator, "--name={0}".format( name ), "--stream_file={0}".format( stream_file ), "--pointgroup={0}".format( pointgroup ), "--model={0}".format( model ), "--iterations={0}".format( iterations ), "--cell_file={0}".format( cell_file ), "--bins={0}".format( bins ), "--resolution={0}".format( resolution ), "--max_adu={0}".format( max_adu ) ] subprocess.run( submit_cmd ) def scrub_cc( part_dir_name ): # open 1st partialator log file part_log = open( "{0}.log".format( part_dir_name ) ) # regex example = resolution at CC0.5 at 0.3 = 1.599 pattern = r"resolution\sat\sCC0.5\sat\s0\.3\s=\s(\d+.\d+)" cc = re.search( pattern, part_log.read() ).group(1) return float( cc ) def scrub_index( index_dir_name ): # open 1st partialator log file index_log_file = open( "{0}.log".format( index_dir_name ) ) index_log = index_log_file.read() # regex example = crystals = 0 images_pattern = r"images\s=\s(\d+)" images = re.search( images_pattern, index_log ).group(1) # regex example = crystals = 0 xtals_pattern = r"crystals\s=\s(\d+)" indexed = re.search( xtals_pattern, index_log ).group(1) def get_index_cell( index_log, parameter ): # general parameter search pattern = r"mean\s{0}\s=\s(\d+\.\d+)\s\+/-\s(\d+\.\d+)".format( parameter ) parameters = re.search( pattern, index_log ) value, error = parameters.group(1), parameters.group(2) return value, error ind_res, ind_res_err = get_index_cell( index_log, "resolution" ) obs, obs_err = get_index_cell( index_log, "observations" ) a, a_err = get_index_cell( index_log, "a" ) b, b_err = get_index_cell( index_log, "b" ) c, c_err = get_index_cell( index_log, "c" ) alpha, alpha_err = get_index_cell( index_log, "alpha" ) beta, beta_err = get_index_cell( index_log, "beta" ) gamma, gamma_err = get_index_cell( index_log, "gamma" ) # put results into df data = [ { "images" : int( images ), "indexed" : int( indexed ), "index_res" : float( ind_res ), "index_res_err" : float( ind_res_err ), "obs" : float( obs ), "obs_err" : float( obs_err ), "a" : float( a ), "a_err" : float( a_err ), "b" : float( b ), "b_err" : float( b_err ), "c" : float( c ), "c_err": float( c_err ), "alpha" : float( alpha ), "alpha_err" : float( alpha_err ), "beta" : float( beta ), "beta_err" : float( beta_err ), "gamma" : float( gamma ), "gamma_err" : float( gamma_err ) } ] stream_df = pd.DataFrame( data ) return stream_df def run_make_mtz( script_dir, hklin_file, project, crystal, dataset, cell, spacegroup, residues, res_range ): make_mtz = "{0}/make_mtz.py".format( script_dir ) submit_cmd = [ "python", make_mtz, "--hklin={0}".format( hklin_file ), "--project={0}".format( project ), "--crystal={0}".format( crystal ), "--dataset={0}".format( dataset ), "--cell={0},{1},{2},{3},{4},{5}".format( cell[0], cell[1], cell[2], cell[3], cell[4], cell[5] ), "--spacegroup={0}".format( spacegroup ), "--residues={0}".format( residues ), "--resolution_range={0},{1}".format( res_range[0], res_range[1] ) ] subprocess.run( submit_cmd ) def scrub_prefix( endstation, pgroup, run_name ): # regex example = /sf/cristallina/data/p22216/raw/run0001-hewl_det-calib_1 file_path = "/sf/{0}/data/{1}/raw/{2}-*/".format( endstation, pgroup, run_name ) try: file_path = glob.glob( file_path )[0] pattern = r"/sf/{0}/data/{1}/raw/{2}-(.*)/".format( endstation, pgroup, run_name ) prefix = re.search( pattern, file_path ).group(1) except AttributeError as e: prefix = np.nan return prefix def make_process_dir( proc_dir ): # make process directory try: os.makedirs( proc_dir ) except OSError as e: if e.errno != errno.EEXIST: logger.debug( "making directory error" ) raise def main( script_dir, cwd, runs, endstation, pgroup, jfj, label, chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold, min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation, pointgroup, model, iterations, bins, resolution, max_adu, project, crystal, spacegroup, residues ): # collate results for summary csv df = pd.DataFrame() print( "beginning autoprocessing for runs {0}".format( runs ) ) for run in runs: # make process dir run_name = "run{0}".format( str( run ).zfill(4) ) print( "making run directory {0}".format( run_name ) ) proc_dir = "{0}/{1}".format( cwd, run_name ) make_process_dir( proc_dir ) # move to proc dir os.chdir( proc_dir ) print( "done" ) # create lst file # make lst output name lst_name = "{0}_{1}.lst".format( run_name, label ) if not os.path.isfile( "{0}/{1}".format( proc_dir, lst_name ) ): print( "makeing lst file" ) cat_lst_files( script_dir, run, endstation, pgroup, jfj, label, run_name ) print( "done" ) else: print( "lst file already created" ) # scrub run prefix prefix = scrub_prefix( endstation, pgroup, run_name ) # run crystfel split with variable index_run_name = "{0}_index".format( run_name ) if not os.path.isfile( "{0}/{1}.stream".format( proc_dir, index_run_name ) ): print( "running indexamajig" ) stream_file = run_index( script_dir, index_run_name, lst_name, chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold, min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation ) print( "done" ) else: print( "indexamajig already run" ) stream_file = "{0}.stream".format( index_run_name ) # scrub stream log stream_df = scrub_index( index_run_name ) indexed = stream_df.indexed[0] # run partialator if indexed > 100: print( "indexed {0} - now run partialator and make mtz".format( indexed ) ) part_run_name = "{0}_merge".format( run_name ) hkl_file = "{0}.hkl".format( part_run_name ) if not os.path.isfile( "{0}/{1}/{2}".format( proc_dir, part_run_name, hkl_file ) ): run_partialator( script_dir, part_run_name, stream_file, pointgroup, model, iterations, cell_file, bins, resolution, max_adu, reservation ) else: print( "partialator has run" ) # get resolution at cc 0.5 # scrub log file try: cc = scrub_cc( part_run_name ) print( "scrubbed resolution = {0}".format( cc ) ) if cc < 1.2: print( "predicted resolution too high. reset to 1.2 A" ) cc = 1.2 except AttributeError as e: cc = 3.0 # re-run check hkl print( "running cc check/compare" ) os.chdir( part_run_name ) check_run_file = part.run_compare_check( "{0}/{1}".format( proc_dir, part_run_name ), part_run_name, cell_file, bins, cc ) submit_cmd = [ "{0}".format( check_run_file ) ] subprocess.call( submit_cmd ) print( "done" ) # make mtz print( "making final mtz" ) cell = read_cell( cell_file ) mtz_file = "{0}/{1}/{1}.mtz".format( proc_dir, part_run_name ) F_file = "{0}/{1}/{1}_F.mtz".format( proc_dir, part_run_name ) res_range = [ 50.0, cc ] run_make_mtz( script_dir, hkl_file, project, crystal, run_name, cell, spacegroup, residues, res_range ) try: shutil.move( mtz_file, proc_dir ) except shutil.Error as e: pass try: shutil.move( F_file, proc_dir ) except shutil.Error as e: pass print( "done {0}".format( run_name ) ) else: print( "partialator not run. < 100 indexed" ) data = [ { "run" : run, "prefix" : prefix, } ] # move back to cwd os.chdir( cwd ) print( "finished all!" ) #script_dir = "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/" script_dir = "/sf/cristallina/data/p22216/work/processing/bach" cwd = os.getcwd() #runs = [ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, # 113, 114, 115, 116, 117, 118, 119, 122, 123, 124, 125, 126, 127, # 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138 ] runs = [ 11 ] endstation = "cristallina" pgroup = "p22216" jfj = True label = "off" geom_file = "{0}/{1}".format( cwd, "JF17T16V01_cf10_shift.geom" ) jfj = True label = "off" chunk_size = 250 cell_file = "/sf/cristallina/data/p22216/work/processing/bach/p1.cell" indexer = "xgandalf" peakfinder = "peakfinder8" integrator = "rings-nocen-nograd" tolerance = "10.0,10.0,10.0,2.0,3.0,2.0" threshold = 20 min_snr = 5 int_radius = "2,4,5" multi = True retry = True min_pix = 2 bg_rad = 4 min_res = 85 reservation = None pointgroup = 1 model = "unity" iterations = 1 bins = 20 resolution = 2.0 max_adu = 12000 project = "Cr_fragment" crystal = "bach" spacegroup = "P1" residues = 124 logfile = "{0}.log".format( "auto_process" ) logger.remove() logger.add( logfile, format="{message}", level="INFO") main( script_dir, cwd, runs, endstation, pgroup, jfj, label, chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold, min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation, pointgroup, model, iterations, bins, resolution, max_adu, project, crystal, spacegroup, residues )