From 4140cad8f214b7494bb2ad4982b74572c5bce40c Mon Sep 17 00:00:00 2001
From: beale_j
Date: Fri, 28 Nov 2025 17:45:07 +0100
Subject: [PATCH] runs partialator and makes mtz after taking bootstrapped sample

---
 reduction_tools/stream_random_mtz.py | 362 +++++++++++++++++++++++++++
 1 file changed, 362 insertions(+)
 create mode 100644 reduction_tools/stream_random_mtz.py

diff --git a/reduction_tools/stream_random_mtz.py b/reduction_tools/stream_random_mtz.py
new file mode 100644
index 0000000..62220f2
--- /dev/null
+++ b/reduction_tools/stream_random_mtz.py
@@ -0,0 +1,362 @@
+#!/usr/bin/env python3
+
+# author J.Beale
+
+"""
+# aim
+randomly select a series of crystals from a stream file,
+compile them into a correctly formatted .stream,
+then run partialator and make an mtz for each sample
+
+# usage
+python stream_random_mtz.py -s input stream file
+    -o output file name stem
+    -n sample size(s), comma separated
+    -r how many repeat random samples do you want? (default 1)
+    -p pointgroup
+    -c cell file
+    -g spacegroup
+    -R number of residues
+
+# output
+a .stream file with a random sample of xtals for every sample size/repeat,
+plus the partialator merge and mtz made from each
+"""
+
+# modules
+import re
+import argparse
+import pandas as pd
+import numpy as np
+import os, errno
+import subprocess
+
+def extract_chunks( input_file ):
+
+    # setup
+    chunk_df = pd.DataFrame()
+    image_no = []
+    chunks = []
+    hits = []
+    collect_lines = False
+    # Open the input file for reading
+    with open(input_file, 'r') as f:
+        for line in f:
+
+            # Check for the start condition
+            if line.startswith('----- Begin chunk -----'):
+                hit = False
+                collect_lines = True
+                chunk_lines = []
+            if collect_lines:
+                chunk_lines.append(line)
+
+            # find image_no
+            if line.startswith( "Event:" ):
+                image_search = re.findall( r"Event: //(\d+)", line )
+                image = int(image_search[0])
+                image_no.append( image )
+
+            # is there a hit in the chunk
+            if line.startswith( "Cell parameters" ):
+                hit = True
+
+            if line.startswith('----- End chunk -----'):
+                collect_lines = False # Stop collecting lines
+                chunks.append( chunk_lines )
+                hits.append( hit )
+
+    chunk_df[ "chunks" ] = chunks
+    chunk_df[ "image_no" ] = image_no
+    chunk_df[ "hit" ] = hits
+
+    return chunk_df
+
+def extract_xtals( chunk ):
+
+    # setup
+    xtals = []
+    collect_crystal_lines = False
+    # Loop over the lines of the chunk
+    for line in chunk:
+
+        # Check for the crystal start condition
+        if line.startswith('--- Begin crystal'):
+            collect_crystal_lines = True
+            xtal_lines = []
+        if collect_crystal_lines:
+            xtal_lines.append(line)
+        if line.startswith('--- End crystal\n'):
+            collect_crystal_lines = False # Stop collecting lines
+            xtals.append( xtal_lines )
+
+    return xtals
+
+def extract_header( chunk ):
+
+    # setup
+    header = []
+    collect_header_lines = False
+    # Loop over the lines of the chunk
+    for line in chunk:
+
+        # Check for the chunk header start condition
+        if line.startswith('----- Begin chunk -----'):
+            collect_header_lines = True
+            header_lines = []
+        if collect_header_lines:
+            header_lines.append(line)
+        if line.startswith('End of peak list'):
+            collect_header_lines = False # Stop collecting lines
+            header.append( header_lines )
+
+    return header
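+
+# For orientation, the extractor functions above key on the plain-text
+# delimiters that CrystFEL writes into a stream. Schematically (an
+# illustrative sketch, not a verbatim chunk):
+#
+#   ----- Begin chunk -----
+#   Event: //123
+#   ...peak list...
+#   End of peak list
+#   --- Begin crystal
+#   Cell parameters ...      <- presence anywhere in the chunk marks it as a hit
+#   ...reflections...
+#   --- End crystal
+#   ----- End chunk -----
+#
+# extract_chunks() splits the stream on the Begin/End chunk markers,
+# extract_header() keeps everything up to "End of peak list", and
+# extract_xtals() keeps each Begin/End crystal block (a chunk may hold
+# more than one crystal).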
+
+def get_header( header, input_file ):
+
+    if header == "geom":
+        start_keyword = "----- Begin geometry file -----"
+        end_keyword = "----- End geometry file -----"
+    if header == "cell":
+        start_keyword = "----- Begin unit cell -----"
+        end_keyword = "----- End unit cell -----"
+
+    # setup
+    collect_lines = False
+    headers = []
+
+    # Open the input file for reading
+    with open(input_file, 'r') as f:
+        for line in f:
+            # Check for the start condition
+            if line.strip() == start_keyword:
+                collect_lines = True
+                headers_lines = []
+            # Collect lines between start and end conditions
+            if collect_lines:
+                headers_lines.append(line)
+            # Check for the end condition
+            if line.strip() == end_keyword:
+                collect_lines = False # Stop collecting lines
+                headers.append(headers_lines)
+
+    return headers[0]
+
+def write_to_file( geom, cell, chunk_header, crystals, output_file ):
+
+    # Write the stream header plus the sampled chunk headers and crystals
+    with open(output_file, 'w') as out_file:
+        out_file.write('CrystFEL stream format 2.3\n')
+        out_file.write('Generated by CrystFEL 0.10.2\n')
+        out_file.writelines(geom)
+        out_file.writelines(cell)
+        for crystal, header in zip( crystals, chunk_header ):
+            out_file.writelines( header )
+            out_file.writelines( crystal )
+            out_file.write( "----- End chunk -----\n" )
+
+def run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues ):
+
+    # partialator file name
+    part_run_file = "merge_{0}.sh".format( part_name )
+
+    # write file
+    part_sh = open( part_run_file, "w" )
+    part_sh.write( "#!/bin/sh\n\n" )
+    part_sh.write( "source /sf/cristallina/applications/mx/conda/miniconda/bin/activate\n" )
+    part_sh.write( "conda activate crmx38-analysis\n\n" )
+    part_sh.write( "python {0} -n merge_{1}".format( part_mtz_py, part_name ) )
+    part_sh.write( " -s {0}".format( stream_file ) )
+    part_sh.write( " -p {0}".format( pointgroup ) )
+    part_sh.write( " -c {0}".format( cell_file ) )
+    part_sh.write( " -r 1.3" )
+    part_sh.write( " -g {0}".format( spacegroup ) )
+    part_sh.write( " -R {0}".format( residues ) )
+    part_sh.close()
+
+    # make file executable
+    subprocess.call( [ "chmod", "+x", "{0}".format( part_run_file ) ] )
+
+    # return partialator file name
+    return part_run_file
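+
+# For reference, run_partialator_mtz() writes one small shell wrapper per
+# sample; for a part_name of "500_0" the generated merge_500_0.sh looks
+# roughly like this (a sketch: the actual values come from the arguments
+# passed in, and the python command is written as one long line):
+#
+#   #!/bin/sh
+#
+#   source /sf/cristallina/applications/mx/conda/miniconda/bin/activate
+#   conda activate crmx38-analysis
+#
+#   python <script_dir>/partialator_mtz.py -n merge_500_0 -s <stream file> -p <pointgroup> -c <cell file> -r 1.3 -g <spacegroup> -R <residues>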
+
+def strip_stream( input_stream, samples, output, repeat ):
+
+    # get geom and cell file headers
+    print( "getting header info from .stream file" )
+    geom = get_header( "geom", input_stream )
+    cell = get_header( "cell", input_stream )
+    print( "done" )
+
+    # extract chunks
+    print( "finding chunks" )
+    chunk_df = extract_chunks( input_stream )
+    # display no. of chunks
+    print( "found {0} chunks".format( len(chunk_df) ) )
+    # remove rows without xtals
+    chunk_df = chunk_df.loc[chunk_df.hit, :]
+    print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
+    print( "done" )
+
+    # extract xtals
+    print( "getting xtals from chunks" )
+    xtal_df = pd.DataFrame()
+    counter = 0
+    for index, row in chunk_df.iterrows():
+
+        chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ]
+
+        if hit:
+
+            # find xtals and header
+            header = extract_header( chunk )
+            xtals = extract_xtals( chunk )
+
+            # make header list the same length as xtals
+            header = header*len(xtals)
+
+            # concat results
+            xtal_df_1 = pd.DataFrame()
+            xtal_df_1[ "header" ] = header
+            xtal_df_1[ "xtals" ] = xtals
+            xtal_df_1[ "image_no" ] = image_no
+            xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) )
+
+            # update count and print progress roughly every 1000 xtals
+            counter = counter + len(xtals)
+            if counter % 1000 == 0:
+                print( counter, end='\r' )
+    print( "done" )
+
+    # sort by image no and reindex
+    xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
+    xtal_df = xtal_df.reset_index( drop=True )
+
+    stream_df = pd.DataFrame()
+
+    # randomly sample n xtals (with replacement) for every requested sample size
+    for sample in samples:
+        print( "taking {0} random sample(s) of size {1}".format( repeat, sample ) )
+        for x in range( 0, repeat ):
+
+            try:
+                sample_df = xtal_df.sample( sample, replace=True )
+            except ValueError:
+                print( "input sample could not be drawn from {0} hits, skipping".format( len(xtal_df) ) )
+                continue
+
+            # rebuild .stream from sample
+            print( "writing {0} to output file".format( x ) )
+            crystals = sample_df.xtals.to_list()
+            chunk_header = sample_df.header.to_list()
+            output_file = "{0}_{1}_{2}.stream".format( output, sample, x )
+            write_to_file( geom, cell, chunk_header, crystals, output_file )
+
+            # append stream file to stream list
+            data = [ { "stream_file" : output_file,
+                       "sample" : sample,
+                       "number" : x
+                       } ]
+            stream_df_1 = pd.DataFrame( data )
+            stream_df = pd.concat( ( stream_df, stream_df_1 ) )
+
+            print( "done {0}".format( x ) )
+    print( "done" )
+
+    return stream_df
+
+def make_process_dir( dir ):
+    # make process directory
+    try:
+        os.makedirs( dir )
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+def main( cwd, input_stream, samples, output, repeat, script_dir, pointgroup, cell_file, spacegroup, residues ):
+
+    # make sample stream files
+    stream_df = strip_stream( input_stream, samples, output, repeat )
+
+    # run partialator and make mtz for every sampled stream
+    for index, row in stream_df.iterrows():
+
+        stream_file, sample, number = row[ "stream_file" ], row[ "sample" ], row[ "number" ]
+        print( "running partialator for sample={0}, run={1}".format( sample, number ) )
+
+        # write and run the partialator/mtz wrapper script
+        part_mtz_py = "{0}/partialator_mtz.py".format( script_dir )
+        part_name = "{0}_{1}".format( sample, number )
+        part_run_file = run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues )
+
+        subprocess.run( "./{0}".format( part_run_file ) )
+        os.chdir( cwd )
+
+def list_of_ints(arg):
+    # parse a comma separated list of integers, e.g. "500,1000" -> [500, 1000]
+    return list(map(int, arg.split(',')))
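+
+# Example: "-n 500,1000,2000" is parsed by list_of_ints() into [500, 1000, 2000],
+# so a set of bootstrap samples is taken for each requested size. An illustrative
+# call (file names and pointgroup are placeholders) might be:
+#
+#   python stream_random_mtz.py -s run0001.stream -o sample -n 500,1000 -r 10 \
+#       -p 422 -c lysozyme.cell -g P43212 -R 129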
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--stream",
+        help="input stream file",
+        required=True,
+        type=os.path.abspath
+        )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="output stream file name stem for the sampled xtals",
+        type=str,
+        default="sample"
+        )
+    parser.add_argument(
+        "-n",
+        "--sample",
+        help="size(s) of sample to take from input .stream, comma separated",
+        type=list_of_ints,
+        required=True
+        )
+    parser.add_argument(
+        "-r",
+        "--repeat",
+        help="how many repeat samples would you like? (default 1)",
+        type=int
+        )
+    parser.add_argument(
+        "-p",
+        "--pointgroup",
+        help="pointgroup used by CrystFEL for the partialator run",
+        type=str,
+        required=True
+        )
+    parser.add_argument(
+        "-c",
+        "--cell_file",
+        help="path to CrystFEL cell file for partialator",
+        type=os.path.abspath,
+        required=True
+        )
+    parser.add_argument(
+        "-g",
+        "--spacegroup",
+        help="spacegroup for making mtz, e.g. P43212",
+        type=str,
+        required=True
+        )
+    parser.add_argument(
+        "-R",
+        "--residues",
+        help="number of residues for truncate, e.g. hewl = 129",
+        type=int,
+        required=True
+        )
+    args = parser.parse_args()
+    # does it need to be run multiple times?
+    if args.repeat is None:
+        repeat = 1
+    else:
+        repeat = args.repeat
+    # run main
+    cwd = os.getcwd()
+    # directory holding this script; partialator_mtz.py is expected to sit alongside it
+    script_dir = os.path.dirname( os.path.abspath( __file__ ) )
+    main( cwd, args.stream, args.sample, args.output, repeat, script_dir, args.pointgroup, args.cell_file, args.spacegroup, args.residues )