From 4140cad8f214b7494bb2ad4982b74572c5bce40c Mon Sep 17 00:00:00 2001
From: beale_j
Date: Fri, 28 Nov 2025 17:45:07 +0100
Subject: [PATCH] runs partialator and makes mtz after taking bootstrapped sample

---
 reduction_tools/stream_random_mtz.py | 362 +++++++++++++++++++++++++++
 1 file changed, 362 insertions(+)
 create mode 100644 reduction_tools/stream_random_mtz.py

diff --git a/reduction_tools/stream_random_mtz.py b/reduction_tools/stream_random_mtz.py
new file mode 100644
index 0000000..62220f2
--- /dev/null
+++ b/reduction_tools/stream_random_mtz.py
@@ -0,0 +1,362 @@
+#!/usr/bin/env python3
+
+# author J.Beale
+
+"""
+# aim
+randomly select a series of crystals from a stream file,
+compile them into a correctly formatted .stream,
+then run partialator and make an mtz for each sample
+
+# usage
+python stream_random_mtz.py -s input stream file
+    -o output file name stem
+    -n sample size(s), comma separated
+    -r how many repeat random samples do you want? (default 1)
+    -p pointgroup
+    -c cell file
+    -g spacegroup
+    -R number of residues
+
+# output
+a .stream file with a random sample of xtals for every sample size/repeat,
+plus the partialator merge and mtz made from each
+"""
+
+# modules
+import re
+import argparse
+import pandas as pd
+import numpy as np
+import os, errno
+import subprocess
+
+def extract_chunks( input_file ):
+
+    # setup
+    chunk_df = pd.DataFrame()
+    image_no = []
+    chunks = []
+    hits = []
+    collect_lines = False
+    # Open the input file for reading
+    with open(input_file, 'r') as f:
+        for line in f:
+
+            # Check for the start condition
+            if line.startswith('----- Begin chunk -----'):
+                hit = False
+                collect_lines = True
+                chunk_lines = []
+            if collect_lines:
+                chunk_lines.append(line)
+
+            # find image_no
+            if line.startswith( "Event:" ):
+                image_search = re.findall( r"Event: //(\d+)", line )
+                image = int(image_search[0])
+                image_no.append( image )
+
+            # is there a hit in the chunk
+            if line.startswith( "Cell parameters" ):
+                hit = True
+
+            if line.startswith('----- End chunk -----'):
+                collect_lines = False # Stop collecting lines
+                chunks.append( chunk_lines )
+                hits.append( hit )
+
+    chunk_df[ "chunks" ] = chunks
+    chunk_df[ "image_no" ] = image_no
+    chunk_df[ "hit" ] = hits
+
+    return chunk_df
+
+def extract_xtals( chunk ):
+
+    # setup
+    xtals = []
+    collect_crystal_lines = False
+    # Loop over the lines of the chunk
+    for line in chunk:
+
+        # Check for the crystal start condition
+        if line.startswith('--- Begin crystal'):
+            collect_crystal_lines = True
+            xtal_lines = []
+        if collect_crystal_lines:
+            xtal_lines.append(line)
+        if line.startswith('--- End crystal\n'):
+            collect_crystal_lines = False # Stop collecting lines
+            xtals.append( xtal_lines )
+
+    return xtals
+
+def extract_header( chunk ):
+
+    # setup
+    header = []
+    collect_header_lines = False
+    # Loop over the lines of the chunk
+    for line in chunk:
+
+        # Check for the chunk header start condition
+        if line.startswith('----- Begin chunk -----'):
+            collect_header_lines = True
+            header_lines = []
+        if collect_header_lines:
+            header_lines.append(line)
+        if line.startswith('End of peak list'):
+            collect_header_lines = False # Stop collecting lines
+            header.append( header_lines )
+
+    return header
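+
+# For orientation, the extractor functions above key on the plain-text
+# delimiters that CrystFEL writes into a stream. Schematically (an
+# illustrative sketch, not a verbatim chunk):
+#
+#   ----- Begin chunk -----
+#   Event: //123
+#   ...peak list...
+#   End of peak list
+#   --- Begin crystal
+#   Cell parameters ...      <- presence anywhere in the chunk marks it as a hit
+#   ...reflections...
+#   --- End crystal
+#   ----- End chunk -----
+#
+# extract_chunks() splits the stream on the Begin/End chunk markers,
+# extract_header() keeps everything up to "End of peak list", and
+# extract_xtals() keeps each Begin/End crystal block (a chunk may hold
+# more than one crystal).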
+
+def get_header( header, input_file ):
+
+    if header == "geom":
+        start_keyword = "----- Begin geometry file -----"
+        end_keyword = "----- End geometry file -----"
+    if header == "cell":
+        start_keyword = "----- Begin unit cell -----"
+        end_keyword = "----- End unit cell -----"
+
+    # setup
+    collect_lines = False
+    headers = []
+
+    # Open the input file for reading
+    with open(input_file, 'r') as f:
+        for line in f:
+            # Check for the start condition
+            if line.strip() == start_keyword:
+                collect_lines = True
+                headers_lines = []
+            # Collect lines between start and end conditions
+            if collect_lines:
+                headers_lines.append(line)
+            # Check for the end condition
+            if line.strip() == end_keyword:
+                collect_lines = False # Stop collecting lines
+                headers.append(headers_lines)
+
+    return headers[0]
+
+def write_to_file( geom, cell, chunk_header, crystals, output_file ):
+
+    # Write the stream header plus the sampled chunk headers and crystals
+    with open(output_file, 'w') as out_file:
+        out_file.write('CrystFEL stream format 2.3\n')
+        out_file.write('Generated by CrystFEL 0.10.2\n')
+        out_file.writelines(geom)
+        out_file.writelines(cell)
+        for crystal, header in zip( crystals, chunk_header ):
+            out_file.writelines( header )
+            out_file.writelines( crystal )
+            out_file.write( "----- End chunk -----\n" )
+
+def run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues ):
+
+    # partialator file name
+    part_run_file = "merge_{0}.sh".format( part_name )
+
+    # write file
+    part_sh = open( part_run_file, "w" )
+    part_sh.write( "#!/bin/sh\n\n" )
+    part_sh.write( "source /sf/cristallina/applications/mx/conda/miniconda/bin/activate\n" )
+    part_sh.write( "conda activate crmx38-analysis\n\n" )
+    part_sh.write( "python {0} -n merge_{1}".format( part_mtz_py, part_name ) )
+    part_sh.write( " -s {0}".format( stream_file ) )
+    part_sh.write( " -p {0}".format( pointgroup ) )
+    part_sh.write( " -c {0}".format( cell_file ) )
+    part_sh.write( " -r 1.3" )
+    part_sh.write( " -g {0}".format( spacegroup ) )
+    part_sh.write( " -R {0}".format( residues ) )
+    part_sh.close()
+
+    # make file executable
+    subprocess.call( [ "chmod", "+x", "{0}".format( part_run_file ) ] )
+
+    # return partialator file name
+    return part_run_file
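+
+# For reference, run_partialator_mtz() writes one small shell wrapper per
+# sample; for a part_name of "500_0" the generated merge_500_0.sh looks
+# roughly like this (a sketch: the actual values come from the arguments
+# passed in, and the python command is written as one long line):
+#
+#   #!/bin/sh
+#
+#   source /sf/cristallina/applications/mx/conda/miniconda/bin/activate
+#   conda activate crmx38-analysis
+#
+#   python <script_dir>/partialator_mtz.py -n merge_500_0 -s <stream file> -p <pointgroup> -c <cell file> -r 1.3 -g <spacegroup> -R <residues>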
+
+def strip_stream( input_stream, samples, output, repeat ):
+
+    # get geom and cell file headers
+    print( "getting header info from .stream file" )
+    geom = get_header( "geom", input_stream )
+    cell = get_header( "cell", input_stream )
+    print( "done" )
+
+    # extract chunks
+    print( "finding chunks" )
+    chunk_df = extract_chunks( input_stream )
+    # display no. of chunks
+    print( "found {0} chunks".format( len(chunk_df) ) )
+    # remove rows without xtals
+    chunk_df = chunk_df.loc[chunk_df.hit, :]
+    print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
+    print( "done" )
+
+    # extract xtals
+    print( "getting xtals from chunks" )
+    xtal_df = pd.DataFrame()
+    counter = 0
+    for index, row in chunk_df.iterrows():
+
+        chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ]
+
+        if hit:
+
+            # find xtals and header
+            header = extract_header( chunk )
+            xtals = extract_xtals( chunk )
+
+            # make header list the same length as xtals
+            header = header*len(xtals)
+
+            # concat results
+            xtal_df_1 = pd.DataFrame()
+            xtal_df_1[ "header" ] = header
+            xtal_df_1[ "xtals" ] = xtals
+            xtal_df_1[ "image_no" ] = image_no
+            xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) )
+
+            # update count and print progress roughly every 1000 xtals
+            counter = counter + len(xtals)
+            if counter % 1000 == 0:
+                print( counter, end='\r' )
+    print( "done" )
+
+    # sort by image no and reindex
+    xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
+    xtal_df = xtal_df.reset_index( drop=True )
+
+    stream_df = pd.DataFrame()
+
+    # randomly sample n xtals (with replacement) for every requested sample size
+    for sample in samples:
+        print( "taking {0} random sample(s) of size {1}".format( repeat, sample ) )
+        for x in range( 0, repeat ):
+
+            try:
+                sample_df = xtal_df.sample( sample, replace=True )
+            except ValueError:
+                print( "input sample could not be drawn from {0} hits, skipping".format( len(xtal_df) ) )
+                continue
+
+            # rebuild .stream from sample
+            print( "writing {0} to output file".format( x ) )
+            crystals = sample_df.xtals.to_list()
+            chunk_header = sample_df.header.to_list()
+            output_file = "{0}_{1}_{2}.stream".format( output, sample, x )
+            write_to_file( geom, cell, chunk_header, crystals, output_file )
+
+            # append stream file to stream list
+            data = [ { "stream_file" : output_file,
+                       "sample" : sample,
+                       "number" : x
+                       } ]
+            stream_df_1 = pd.DataFrame( data )
+            stream_df = pd.concat( ( stream_df, stream_df_1 ) )
+
+            print( "done {0}".format( x ) )
+    print( "done" )
+
+    return stream_df
+
+def make_process_dir( dir ):
+    # make process directory
+    try:
+        os.makedirs( dir )
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+def main( cwd, input_stream, samples, output, repeat, script_dir, pointgroup, cell_file, spacegroup, residues ):
+
+    # make sample stream files
+    stream_df = strip_stream( input_stream, samples, output, repeat )
+
+    # run partialator and make mtz for every sampled stream
+    for index, row in stream_df.iterrows():
+
+        stream_file, sample, number = row[ "stream_file" ], row[ "sample" ], row[ "number" ]
+        print( "running partialator for sample={0}, run={1}".format( sample, number ) )
+
+        # write and run the partialator/mtz wrapper script
+        part_mtz_py = "{0}/partialator_mtz.py".format( script_dir )
+        part_name = "{0}_{1}".format( sample, number )
+        part_run_file = run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues )
+
+        subprocess.run( "./{0}".format( part_run_file ) )
+        os.chdir( cwd )
+
+def list_of_ints(arg):
+    # parse a comma separated list of integers, e.g. "500,1000" -> [500, 1000]
+    return list(map(int, arg.split(',')))
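+
+# Example: "-n 500,1000,2000" is parsed by list_of_ints() into [500, 1000, 2000],
+# so a set of bootstrap samples is taken for each requested size. An illustrative
+# call (file names and pointgroup are placeholders) might be:
+#
+#   python stream_random_mtz.py -s run0001.stream -o sample -n 500,1000 -r 10 \
+#       -p 422 -c lysozyme.cell -g P43212 -R 129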
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--stream",
+        help="input stream file",
+        required=True,
+        type=os.path.abspath
+        )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="output stream file name stem for the sampled xtals",
+        type=str,
+        default="sample"
+        )
+    parser.add_argument(
+        "-n",
+        "--sample",
+        help="size(s) of sample to take from input .stream, comma separated",
+        type=list_of_ints,
+        required=True
+        )
+    parser.add_argument(
+        "-r",
+        "--repeat",
+        help="how many repeat samples would you like? (default 1)",
+        type=int
+        )
+    parser.add_argument(
+        "-p",
+        "--pointgroup",
+        help="pointgroup used by CrystFEL for the partialator run",
+        type=str,
+        required=True
+        )
+    parser.add_argument(
+        "-c",
+        "--cell_file",
+        help="path to CrystFEL cell file for partialator",
+        type=os.path.abspath,
+        required=True
+        )
+    parser.add_argument(
+        "-g",
+        "--spacegroup",
+        help="spacegroup for making mtz, e.g. P43212",
+        type=str,
+        required=True
+        )
+    parser.add_argument(
+        "-R",
+        "--residues",
+        help="number of residues for truncate, e.g. hewl = 129",
+        type=int,
+        required=True
+        )
+    args = parser.parse_args()
+    # does it need to be run multiple times?
+    if args.repeat is None:
+        repeat = 1
+    else:
+        repeat = args.repeat
+    # run main
+    cwd = os.getcwd()
+    # directory holding this script; partialator_mtz.py is expected to sit alongside it
+    script_dir = os.path.dirname( os.path.abspath( __file__ ) )
+    main( cwd, args.stream, args.sample, args.output, repeat, script_dir, args.pointgroup, args.cell_file, args.spacegroup, args.residues )