runs partialator and makes mtz after taking boot stapped sample

2025-11-28 17:45:07 +01:00
parent d7fa7e25da
commit 4140cad8f2
1 changed files with 362 additions and 0 deletions
--- a/reduction_tools/stream_random_mtz.py
+++ b/reduction_tools/stream_random_mtz.py
@@ -0,0 +1,362 @@
 #!/usr/bin/env python3
 # author J.Beale
 """
 # aim
 randomly select a series of crystals from a stream file and
 then compile them into the correctly formated .stream
 run partialator and make an mtz
 # usage
 python stream_random.py -s <path to stream>
                        -o output file names
                        -n sample size
                        -r how many repeat random samples do you want?
                        -p pointgroup                  
                        -c <path-to-cell-file>
                        -a max-adu. Default = 12000
                        -g spacegroup
                        -r number of residues
 # output
 .stream file with random sample of xtals
 """
 # modules
 import re
 import argparse
 import pandas as pd
 import numpy as np
 import os, errno
 import subprocess
 def extract_chunks( input_file ):
    # setup
    chunk_df = pd.DataFrame()
    image_no = []
    chunks = []
    hits = []
    collect_lines = False
    # Open the input file for reading
    with open(input_file, 'r') as f:
        for line in f:
            # Check for the start condition
            if line.startswith('----- Begin chunk -----'):
                hit = False
                collect_lines = True
                chunk_lines = []
            if collect_lines:
                chunk_lines.append(line)
            # find image_no
            if line.startswith( "Event:" ):
                image_search = re.findall( r"Event: //(\d+)", line )
                image = int(image_search[0])
                image_no.append( image )
            # is there a hit in chunk
            if line.startswith( "Cell parameters" ):
                hit = True
            if line.startswith('----- End chunk -----'):
                collect_lines = False  # Stop collecting lines
                chunks.append( chunk_lines )
                hits.append( hit )
    chunk_df[ "chunks" ] = chunks
    chunk_df[ "image_no" ] = image_no
    chunk_df[ "hit" ] = hits
    return chunk_df
 def extract_xtals( chunk ):
    # setup
    xtals = []
    collect_crystal_lines = False
    # Open the input file for reading
    for line in chunk:
        # Check for the xtals start condition
        if line.startswith('--- Begin crystal'):
            collect_crystal_lines = True
            xtal_lines = []
        if collect_crystal_lines:
            xtal_lines.append(line)
        if line.startswith('--- End crystal\n'):
            collect_crystal_lines = False  # Stop collecting lines
            xtals.append( xtal_lines )
    return xtals
 def extract_header( chunk ):
    # setup
    header = []
    collect_header_lines = False
    # Open the input file for reading
    for line in chunk:
        # Check for the xtals start condition
        if line.startswith('----- Begin chunk -----'):
            collect_header_lines = True
            header_lines = []
        if collect_header_lines:
            header_lines.append(line)
        if line.startswith('End of peak list'):
            collect_header_lines = False  # Stop collecting lines
            header.append( header_lines )
    return header
 def get_header( header, input_file ):
    if header == "geom":
        start_keyword = "----- Begin geometry file -----"
        end_keyword = "----- End geometry file -----"
    if header == "cell":
        start_keyword = "----- Begin unit cell -----"
        end_keyword = "----- End unit cell -----"
    # setup
    collect_lines = False
    headers = []
    # Open the input file for reading
    with open(input_file, 'r') as f:
        for line in f:
            # Check for the start condition
            if line.strip() == start_keyword:
                collect_lines = True
                headers_lines = []
            # Collect lines between start and end conditions
            if collect_lines:
                headers_lines.append(line)
            # Check for the end condition
            if line.strip() == end_keyword:
                collect_lines = False  # Stop collecting lines
                headers.append(headers_lines)
    return headers[0]
 def write_to_file( geom, cell, chunk_header, crystals, output_file ):
    # Write sections with matching cell parameters to the output file
    with open(output_file, 'w') as out_file:
        out_file.write('CrystFEL stream format 2.3\n')
        out_file.write('Generated by CrystFEL 0.10.2\n')
        out_file.writelines(geom)
        out_file.writelines(cell)
        for crystal, header in zip( crystals, chunk_header ):
            out_file.writelines( header )
            out_file.writelines( crystal )
            out_file.writelines( "----- End chunk -----\n" )
 def run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues ):
    # partialator file name
    part_run_file = "merge_{0}.sh".format( part_name )
    # write file
    part_sh = open( part_run_file, "w" )
    part_sh.write( "#!/bin/sh\n\n" )
    part_sh.write( "source /sf/cristallina/applications/mx/conda/miniconda/bin/activate\n" )
    part_sh.write( "conda activate crmx38-analysis\n\n" )
    part_sh.write( "python {0} -n merge_{1}".format( part_mtz_py, part_name ) )
    part_sh.write( " -s {0}".format( stream_file ) )
    part_sh.write( " -p {0}".format( pointgroup ) )
    part_sh.write( " -c {0}".format( cell_file ) )
    part_sh.write( " -r 1.3" )
    part_sh.write( " -g {0}".format( spacegroup ) )
    part_sh.write( " -R {0}".format( residues ) )
    part_sh.close()
    # make file executable
    subprocess.call( [ "chmod", "+x", "{0}".format( part_run_file ) ] )
    # return partialator file name
    return part_run_file
 def strip_stream( input_stream, samples, output, repeat ):
    # get geom and cell file headers
    print( "getting header info from .stream file" )
    geom = get_header( "geom", input_stream )
    cell = get_header( "cell", input_stream )
    print( "done" )
    # extract chunks
    print( "finding chucks" )
    chunk_df = extract_chunks( input_stream )
    # display no. of chunks
    print( "found {0} chunks".format( len(chunk_df) ) )
    # remove rows without xtals
    chunk_df = chunk_df.loc[chunk_df.hit, :]
    print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
    print( "done" )
    # extract xtals
    print( "get xtals from chunks" )
    xtal_df = pd.DataFrame()
    counter = 0
    for index, row in chunk_df.iterrows():
        chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ]
        if hit:
            # find xtals and header
            header = extract_header( chunk )
            xtals = extract_xtals( chunk )
            # make header same length as xtals
            header = header*len(xtals)
            # concat results
            xtal_df_1 = pd.DataFrame()
            xtal_df_1[ "header" ] = header
            xtal_df_1[ "xtals" ] = xtals
            xtal_df_1[ "image_no" ] = image_no
            xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) )
            # add count and print every 1000s
            counter = counter + len(xtals)
            if counter % 1000 == 0:
                print( counter, end='\r' )
    print( "done" )
    # sort by image no and reindex
    xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
    xtal_df = xtal_df.reset_index( drop=True )
    stream_df = pd.DataFrame()
    # randomly n number of sample of xtals with replacement
    for sample in samples:
        print( "taking {0} {1} sample".format( repeat, sample ) )
        for x in range( 0, repeat ):
            try:
                sample_df = xtal_df.sample( sample, replace=True )
            except ValueError:
                print( "input image sample larger than number of hits. Sample should be less than {0}".format( len(xtal_df) ) )
            # rebuild .stream from sample
            print( "writing {0} to output file".format( x ) )
            crystals = sample_df.xtals.to_list()
            chunk_header = sample_df.header.to_list()
            output_file = "{0}_{1}_{2}.stream".format( output, sample ,x )
            write_to_file( geom, cell, chunk_header, crystals, output_file )
            # append stream file to stream list
            data = [ { "stream_file" : output_file,
                       "sample" : sample,
                       "number" : x                    
                    } ]
            stream_df_1 = pd.DataFrame( data )
            stream_df = pd.concat( ( stream_df, stream_df_1 ) )
            print( "done {0}".format( x ) )
    print( "done" )
    return stream_df
 def make_process_dir( dir ):
    # make process directory
    try:
        os.makedirs( dir )
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
 def main( cwd, input_stream, samples, output, repeat, script_dir, pointgroup, cell_file, spacegroup, residues ):
    # make sample stream files
    stream_df = strip_stream( input_stream, samples, output, repeat )
    # run partialator and make mtz
    for index, row in stream_df.iterrows():
        stream_file, sample, number = row[ "stream_file" ], row[ "sample" ], row[ "number" ]
        print( "running partialatory for sample={0}, run={1}".format( sample, number ) )
        # run partialator and make mtz
        part_mtz_py = "{0}/partialator_mtz.py".format( script_dir )
        part_name = "{0}_{1}".format( sample, number )
        part_run_file = run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues)
        subprocess.run( "./{0}".format( part_run_file ) )
        os.chdir( cwd )
 def list_of_floats(arg):
    return list(map(int, arg.split(',')))
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--stream",
        help="input stream file",
        required=True,
        type=os.path.abspath
    )
    parser.add_argument(
        "-o",
        "--output",
        help="output stream file with sampled xtals",
        type=str,
        default="sample"
    )
    parser.add_argument(
        "-n",
        "--sample",
        help="size of sample to take from input.stream",
        type=list_of_floats,
        required=True
    )
    parser.add_argument(
        "-r",
        "--repeat",
        help="how many samples would you like?",
        type=int
    )
    parser.add_argument(
        "-p",
        "--pointgroup",
        help="pointgroup used by CrystFEL for partialator run",
        type=str,
        required=True
    )    
    parser.add_argument(
        "-c",
        "--cell_file",
        help="path to CrystFEL cell file for partialator.",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-g",
        "--spacegroup",
        help="spacegroup for making mtz, e.g P43212",
        type=str,
        required=True
    )
    parser.add_argument(
        "-R",
        "--residues",
        help="number of residues for truncate, e.g., hewl = 129",
        type=int,
        required=True
    )
    args = parser.parse_args()
    # does if need to be run multiple times?
    if args.repeat is None:
        repeat = 1
    else:
        repeat = args.repeat
    # run main
    cwd = os.getcwd()
    main( cwd, args.stream, args.sample, args.output, repeat, args.pointgroup, args.cell_file, args.spacegroup, args.residues )