#!/usr/bin/python
# author J.Beale
"""
# aim
automatically index, integrate, scale/merge and output mtzs for individual SwissFEL runs
combines, hopefully intelligently, indexamajig, partialator, f2mtz and truncate to produce complete mtz files
# usage
python auto_process.py
# output
in run dir
- integrated .stream files + logs
- scaled/merged files
- an mtz file
- useful plots
- log file of output
"""
# modules
import sys
sys.path.insert( 1, "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/" )
import partialator as part
import subprocess
import os, errno
import shutil
import pandas as pd
import numpy as np
import glob
import argparse
import regex as re
from loguru import logger
def read_cell( cell_file ):
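    """Read the six unit cell parameters from a CrystFEL .cell file.

    Expects CrystFEL-style lines such as 'a = 79.10 A' and 'al = 90.00 deg';
    returns [ a, b, c, alpha, beta, gamma ] as floats.
    """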
    # nested helper: search a line for a single named parameter
    def get_parameter( line, parameter ):
        pattern = r"{0}\s=\s(\d+\.\d+)".format( parameter )
        value = re.search( pattern, line ).group(1)
        return value
    # loop through the file and collect the six cell parameters
    with open( cell_file, "r" ) as cell_handle:
        for line in cell_handle:
            if line.startswith( "a " ):
                a = float( get_parameter( line, "a" ) )
            if line.startswith( "b " ):
                b = float( get_parameter( line, "b" ) )
            if line.startswith( "c " ):
                c = float( get_parameter( line, "c" ) )
            if line.startswith( "al " ):
                alpha = float( get_parameter( line, "al" ) )
            if line.startswith( "be " ):
                beta = float( get_parameter( line, "be" ) )
            if line.startswith( "ga " ):
                gamma = float( get_parameter( line, "ga" ) )
    cell = [ a, b, c, alpha, beta, gamma ]
    return cell
def cat_lst_files( script_dir, run_no, endstation, pgroup, jfj, label, output ):
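    """Wrap the cat_lst.py helper to write a single .lst file of images for a run."""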
cat_lst = "{0}/cat_lst.py".format( script_dir )
submit_cmd = [ "python", cat_lst,
"{0}".format( run_no ),
"--endstation={0}".format( endstation ),
"--pgroup={0}".format( pgroup ),
"--jfj={0}".format( jfj ),
"--label={0}".format( label ),
"--output={0}".format( output ),
]
subprocess.run( submit_cmd )
def run_index( script_dir, name, lst_file, chunk_size, geom_file, cell_file,
indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix,
bg_rad, min_res, reservation
):
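    """Submit indexamajig jobs via the crystfel_split_var.py helper and return the
    name of the .stream file it is expected to produce."""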
indexamajig = "{0}/crystfel_split_var.py".format( script_dir )
    # build the command once and append the reservation flag only when set
    submit_cmd = [ "python", indexamajig,
                   "--job_name={0}".format( name ),
                   "--lst_file={0}".format( lst_file ),
                   "--chunk_size={0}".format( chunk_size ),
                   "--geom_file={0}".format( geom_file ),
                   "--cell_file={0}".format( cell_file ),
                   "--indexer={0}".format( indexer ),
                   "--peakfinder={0}".format( peakfinder ),
                   "--integrator={0}".format( integrator ),
                   "--tolerance={0}".format( tolerance ),
                   "--threshold={0}".format( threshold ),
                   "--min_snr={0}".format( min_snr ),
                   "--int_radius={0}".format( int_radius ),
                   "--multi={0}".format( multi ),
                   "--retry={0}".format( retry ),
                   "--min_pix={0}".format( min_pix ),
                   "--bg_rad={0}".format( bg_rad ),
                   "--min_res={0}".format( min_res )
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )
subprocess.run( submit_cmd )
# make stream name
stream_file = "{0}.stream".format( name )
return stream_file
def run_partialator( script_dir, name, stream_file, pointgroup, model, iterations,
cell_file, bins, resolution, max_adu, reservation
):
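    """Submit scaling/merging via the partialator.py helper, forwarding the
    reservation flag only when one is set."""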
    partialator = "{0}/partialator.py".format( script_dir )
    # build the command once and append the reservation flag only when set
    submit_cmd = [ "python", partialator,
                   "--name={0}".format( name ),
                   "--stream_file={0}".format( stream_file ),
                   "--pointgroup={0}".format( pointgroup ),
                   "--model={0}".format( model ),
                   "--iterations={0}".format( iterations ),
                   "--cell_file={0}".format( cell_file ),
                   "--bins={0}".format( bins ),
                   "--resolution={0}".format( resolution ),
                   "--max_adu={0}".format( max_adu )
                   ]
    if reservation:
        submit_cmd.append( "--reservation={0}".format( reservation ) )
subprocess.run( submit_cmd )
def scrub_cc( part_dir_name ):
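    """Scrub the resolution (in A) at which CC0.5 falls to 0.3 from the partialator log."""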
    # regex example = resolution at CC0.5 at 0.3 = 1.599
    pattern = r"resolution\sat\sCC0\.5\sat\s0\.3\s=\s(\d+\.\d+)"
    with open( "{0}.log".format( part_dir_name ) ) as part_log:
        cc = re.search( pattern, part_log.read() ).group(1)
    return float( cc )
def scrub_index( index_dir_name ):
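    """Scrub image/crystal counts and mean cell statistics from the indexamajig
    log into a single-row DataFrame."""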
    # open the indexamajig log file
    index_log_file = open( "{0}.log".format( index_dir_name ) )
    index_log = index_log_file.read()
    # regex example = images = 0
    images_pattern = r"images?\s=\s(\d+)"
    images = re.search( images_pattern, index_log ).group(1)
    # regex example = crystals = 0
    xtals_pattern = r"crystals\s=\s(\d+)"
    indexed = re.search( xtals_pattern, index_log ).group(1)
def get_index_cell( index_log, parameter ):
# general parameter search
pattern = r"mean\s{0}\s=\s(\d+\.\d+)\s\+/-\s(\d+\.\d+)".format( parameter )
parameters = re.search( pattern, index_log )
value, error = parameters.group(1), parameters.group(2)
return value, error
ind_res, ind_res_err = get_index_cell( index_log, "resolution" )
obs, obs_err = get_index_cell( index_log, "observations" )
a, a_err = get_index_cell( index_log, "a" )
b, b_err = get_index_cell( index_log, "b" )
c, c_err = get_index_cell( index_log, "c" )
alpha, alpha_err = get_index_cell( index_log, "alpha" )
beta, beta_err = get_index_cell( index_log, "beta" )
gamma, gamma_err = get_index_cell( index_log, "gamma" )
# put results into df
per_index = round( int( indexed )/int( images )*100, 2 )
data = [ { "images" : int( images ), "indexed" : int( indexed ), "indexed%" : per_index,
"index_res" : float( ind_res ), "index_res_err" : float( ind_res_err ),
"obs" : float( obs ), "obs_err" : float( obs_err ),
"a" : float( a ), "a_err" : float( a_err ),
"b" : float( b ), "b_err" : float( b_err ),
"c" : float( c ), "c_err": float( c_err ),
"alpha" : float( alpha ), "alpha_err" : float( alpha_err ),
"beta" : float( beta ), "beta_err" : float( beta_err ),
"gamma" : float( gamma ), "gamma_err" : float( gamma_err )
} ]
stream_df = pd.DataFrame( data )
index_log_file.close()
return stream_df
def run_make_mtz( script_dir, hklin_file, project, crystal, dataset, cell, spacegroup, residues, res_range ):
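    """Wrap the make_mtz.py helper (f2mtz/truncate) to turn an .hkl file into MTZ files."""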
make_mtz = "{0}/make_mtz.py".format( script_dir )
submit_cmd = [ "python", make_mtz,
"--hklin={0}".format( hklin_file ),
"--project={0}".format( project ),
"--crystal={0}".format( crystal ),
"--dataset={0}".format( dataset ),
"--cell={0},{1},{2},{3},{4},{5}".format( cell[0], cell[1], cell[2], cell[3], cell[4], cell[5] ),
"--spacegroup={0}".format( spacegroup ),
"--residues={0}".format( residues ),
"--resolution_range={0},{1}".format( res_range[0], res_range[1] )
]
subprocess.run( submit_cmd )
def scrub_prefix( endstation, pgroup, run_name ):
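    """Recover the user-supplied suffix (e.g. 'hewl_det-calib_1') from the raw run
    directory name; returns np.nan when no matching directory is found."""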
# regex example = /sf/cristallina/data/p22216/raw/run0001-hewl_det-calib_1
file_path = "/sf/{0}/data/{1}/raw/{2}-*/".format( endstation, pgroup, run_name )
try:
file_path = glob.glob( file_path )[0]
pattern = r"/sf/{0}/data/{1}/raw/{2}-(.*)/".format( endstation, pgroup, run_name )
prefix = re.search( pattern, file_path ).group(1)
    except ( IndexError, AttributeError ):
        # no matching raw directory, or the prefix pattern did not match
        prefix = np.nan
return prefix
def make_process_dir( proc_dir ):
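    """Create the processing directory, tolerating the case where it already exists."""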
# make process directory
try:
os.makedirs( proc_dir )
except OSError as e:
if e.errno != errno.EEXIST:
logger.debug( "making directory error" )
raise
def main( script_dir, cwd, runs,
endstation, pgroup, jfj, label,
chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
pointgroup, model, iterations, bins, resolution, max_adu,
project, crystal, spacegroup, residues
):
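    """Process each run end to end: build the .lst file, index with indexamajig,
    scale/merge with partialator (only if more than 250 crystals indexed), make
    MTZ files, and collate per-run statistics into auto_process_summary.csv."""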
# collate results for summary csv
df = pd.DataFrame()
print( "beginning autoprocessing for runs {0}".format( runs ) )
for run in runs:
# make process dir
run_name = "run{0}".format( str( run ).zfill(4) )
print( "making run directory {0}".format( run_name ) )
proc_dir = "{0}/{1}".format( cwd, run_name )
make_process_dir( proc_dir )
# move to proc dir
os.chdir( proc_dir )
print( "done" )
# create lst file
# make lst output name
lst_name = "{0}_{1}.lst".format( run_name, label )
if not os.path.isfile( "{0}/{1}".format( proc_dir, lst_name ) ):
print( "makeing lst file" )
cat_lst_files( script_dir, run, endstation, pgroup, jfj, label, run_name )
print( "done" )
else:
print( "lst file already created" )
# scrub run prefix
prefix = scrub_prefix( endstation, pgroup, run_name )
# run crystfel split with variable
index_run_name = "{0}_index".format( run_name )
if not os.path.isfile( "{0}/{1}.stream".format( proc_dir, index_run_name ) ):
print( "running indexamajig" )
stream_file = run_index( script_dir, index_run_name, lst_name, chunk_size, geom_file, cell_file,
indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix,
bg_rad, min_res, reservation
)
print( "done" )
else:
print( "indexamajig already run" )
stream_file = "{0}.stream".format( index_run_name )
# scrub stream log
stream_df = scrub_index( index_run_name )
indexed = stream_df.indexed[0]
# run partialator
partialator_limit = 250
part_run_name = "{0}_merge".format( run_name )
hkl_file = "{0}.hkl".format( part_run_name )
if indexed > partialator_limit:
print( "indexed {0} - now run partialator and make mtz".format( indexed ) )
if not os.path.isfile( "{0}/{1}/{2}".format( proc_dir, part_run_name, hkl_file ) ):
run_partialator( script_dir, part_run_name, stream_file, pointgroup, model, iterations,
cell_file, bins, resolution, max_adu, reservation
)
else:
print( "partialator has run" )
# get resolution at cc 0.5
# scrub log file
try:
cc = scrub_cc( part_run_name )
print( "scrubbed resolution = {0}".format( cc ) )
                if cc < 1.2:
                    print( "predicted resolution better than 1.2 A - capping at 1.2 A" )
                    cc = 1.2
            except AttributeError:
                # no CC0.5 line found in the log - fall back to 3.0 A
                cc = 3.0
# re-run check hkl
print( "running cc check/compare" )
# move to partialator directory
os.chdir( part_run_name )
check_run_file = part.run_compare_check( "{0}/{1}".format( proc_dir, part_run_name ), part_run_name, cell_file, bins, cc )
submit_cmd = [ "{0}".format( check_run_file ) ]
subprocess.call( submit_cmd )
print( "done" )
            # get partialator metrics, falling back to NaN when a value cannot be scrubbed
            try:
                overcc = part.get_overall_cc()
            except Exception:
                overcc = np.nan
            try:
                overrsplit = part.get_overall_rsplit()
            except Exception:
                overrsplit = np.nan
            try:
                b_factor = part.get_b()
            except AttributeError:
                b_factor = np.nan
# make mtz
print( "making final mtz" )
cell = read_cell( cell_file )
mtz_file = "{0}/{1}/{1}.mtz".format( proc_dir, part_run_name )
F_file = "{0}/{1}/{1}_F.mtz".format( proc_dir, part_run_name )
res_range = [ 50.0, cc ]
run_make_mtz( script_dir, hkl_file, project, crystal, run_name, cell, spacegroup, residues, res_range )
try:
shutil.move( mtz_file, proc_dir )
except shutil.Error as e:
pass
try:
shutil.move( F_file, proc_dir )
except shutil.Error as e:
pass
# move back to top directory
os.chdir( proc_dir )
else:
print( "partialator not run. {0} > indexed".format( partialator_limit ) )
cc = np.nan
overcc = np.nan
overrsplit = np.nan
b_factor = np.nan
# collate meta data
run_data = [ { "run" : run,
"prefix" : prefix
} ]
run_df = pd.DataFrame( run_data )
part_data = [ { "overall_cc" : overcc,
"cc" : cc,
"overall_rsplit" : overrsplit,
"overall_b" : b_factor,
} ]
part_df = pd.DataFrame( part_data )
df_1 = pd.concat( [ run_df, stream_df, part_df ], axis=1 )
df = pd.concat( [ df, df_1 ] )
print( "done {0}".format( run_name ) )
# move back to cwd
os.chdir( cwd )
# output stats table
print( df )
df = df.reset_index( drop=True )
df.to_csv( "auto_process_summary.csv", sep="," )
print( "finished all" )
#script_dir = "/sf/cristallina/applications/mx/crystfel_tools/reduction_tools/"
script_dir = "/sf/cristallina/data/p22216/work/processing/bach"
cwd = os.getcwd()
runs = [ 11, 12, 13, 14, 23, 24, 25, 26, 59, 60, 61, 63, 64, 65,
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
113, 114, 115, 116, 117, 118, 119, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138 ]
#runs = [ 11, 12 ]
endstation = "cristallina"
pgroup = "p22216"
jfj = True
label = "off"
geom_file = "{0}/{1}".format( cwd, "JF17T16V01_cf10_shift.geom" )
chunk_size = 250
cell_file = "/sf/cristallina/data/p22216/work/processing/bach/p1.cell"
indexer = "xgandalf"
peakfinder = "peakfinder8"
integrator = "rings-nocen-nograd"
tolerance = "10.0,10.0,10.0,2.0,3.0,2.0"
threshold = 20
min_snr = 5
int_radius = "2,4,5"
multi = True
retry = True
min_pix = 2
bg_rad = 4
min_res = 85
reservation = None
pointgroup = 1
model = "unity"
iterations = 1
bins = 20
resolution = 2.0
max_adu = 12000
project = "Cr_fragment"
crystal = "bach"
spacegroup = "P1"
residues = 124
logfile = "auto_process.log"
logger.remove()
logger.add( logfile, format="{message}", level="INFO")
main( script_dir, cwd, runs,
endstation, pgroup, jfj, label,
chunk_size, geom_file, cell_file, indexer, peakfinder, integrator, tolerance, threshold,
min_snr, int_radius, multi, retry, min_pix, bg_rad, min_res, reservation,
pointgroup, model, iterations, bins, resolution, max_adu,
project, crystal, spacegroup, residues
)