From 2acd84d8a941a74aa9a1eacf61d7a2d99d804ec4 Mon Sep 17 00:00:00 2001 From: Beale John Henry Date: Thu, 8 May 2025 00:23:12 +0200 Subject: [PATCH] replacement for select xtals - works very similar to stream_random --- reduction_tools/stream_select.py | 314 +++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 reduction_tools/stream_select.py diff --git a/reduction_tools/stream_select.py b/reduction_tools/stream_select.py new file mode 100644 index 0000000..4f7d46a --- /dev/null +++ b/reduction_tools/stream_select.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 + +# author J.Beale + +""" +# aim +randomly select a series of crystals from a stream file and +then compile them into the correctly formated .stream + +# usage +python stream_random.py -s + -o output file names + -n sample size + -r how many repeat random samples do you want? + +# output +.stream file with random sample of xtals +""" + +# modules +import re +import argparse +import pandas as pd +import numpy as np +import os + +def extract_chunks( input_file ): + + # setup + chunk_df = pd.DataFrame() + image_no = [] + chunks = [] + hits = [] + collect_lines = False + # Open the input file for reading + with open(input_file, 'r') as f: + for line in f: + + # Check for the start condition + if line.startswith('----- Begin chunk -----'): + hit = False + collect_lines = True + chunk_lines = [] + if collect_lines: + chunk_lines.append(line) + + # find image_no + if line.startswith( "Event:" ): + image_search = re.findall( r"Event: //(\d+)", line ) + image = int(image_search[0]) + image_no.append( image ) + + # is there a hit in chunk + if line.startswith( "Cell parameters" ): + hit = True + + if line.startswith('----- End chunk -----'): + collect_lines = False # Stop collecting lines + chunks.append( chunk_lines ) + hits.append( hit ) + + chunk_df[ "chunks" ] = chunks + chunk_df[ "image_no" ] = image_no + chunk_df[ "hit" ] = hits + + return chunk_df + +def scrub_cells( line ): + + # get uc values from stream file + # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg + pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg" + a = re.search( pattern, line ).group(1) + b = re.search( pattern, line ).group(2) + c = re.search( pattern, line ).group(3) + alpha = re.search( pattern, line ).group(4) + beta = re.search( pattern, line ).group(5) + gamma = re.search( pattern, line ).group(6) + + data = [ { "a" : float( a )*10, + "b" : float( b )*10, + "c" : float( c )*10, + "alpha" : float( alpha ), + "beta" : float( beta ), + "gamma" : float( gamma ) + } ] + cell_df = pd.DataFrame( data ) + + return cell_df + +def extract_xtals( chunk ): + + # setup + xtals = [] + cells_df = pd.DataFrame() + collect_crystal_lines = False + # Open the input file for reading + for line in chunk: + + # Check for the xtals start condition + if line.startswith('--- Begin crystal'): + collect_crystal_lines = True + xtal_lines = [] + if collect_crystal_lines: + xtal_lines.append(line) + if line.startswith('--- End crystal\n'): + collect_crystal_lines = False # Stop collecting lines + xtals.append( xtal_lines ) + if line.startswith( "Cell" ): + cell_df = scrub_cells( line ) + cells_df = pd.concat( ( cells_df, cell_df ) ) + + return xtals, cells_df + +def extract_header( chunk ): + + # setup + header = [] + collect_header_lines = False + # Open the input file for reading + for line in chunk: + + # Check for the xtals start condition + if line.startswith('----- Begin chunk -----'): + collect_header_lines = True + header_lines = [] + if collect_header_lines: + header_lines.append(line) + if line.startswith('End of peak list'): + collect_header_lines = False # Stop collecting lines + header.append( header_lines ) + + return header + +def get_header( header, input_file ): + + if header == "geom": + start_keyword = "----- Begin geometry file -----" + end_keyword = "----- End geometry file -----" + if header == "cell": + start_keyword = "----- Begin unit cell -----" + end_keyword = "----- End unit cell -----" + + # setup + collect_lines = False + headers = [] + + # Open the input file for reading + with open(input_file, 'r') as f: + for line in f: + # Check for the start condition + if line.strip() == start_keyword: + collect_lines = True + headers_lines = [] + # Collect lines between start and end conditions + if collect_lines: + headers_lines.append(line) + # Check for the end condition + if line.strip() == end_keyword: + collect_lines = False # Stop collecting lines + headers.append(headers_lines) + + return headers[0] + +def write_to_file( geom, cell, chunk_header, crystals, output_file ): + + # Write sections with matching cell parameters to the output file + with open(output_file, 'w') as out_file: + out_file.write('CrystFEL stream format 2.3\n') + out_file.write('Generated by CrystFEL 0.10.2\n') + out_file.writelines(geom) + out_file.writelines(cell) + for crystal, header in zip( crystals, chunk_header ): + out_file.writelines( header ) + out_file.writelines( crystal ) + out_file.writelines( "----- End chunk -----\n" ) + +def is_within_range( a, b, c, alpha, beta, gamma, cell_tolerance, angle_tolerance, ideal_centre ): + + cell = [ a, b, c, alpha, beta, gamma ] + returner = False + within_range = [] + + # Function to check if cell parameters are within the specified range + for i in range(3): + if cell[i] < ideal_centre[i]+cell_tolerance and cell[i] > ideal_centre[i]-cell_tolerance: + within_range.append(True) + else: + within_range.append(False) + + for i in range(3, 6): + if cell[i] < ideal_centre[i]+angle_tolerance and cell[i] > ideal_centre[i]-angle_tolerance: + within_range.append(True) + else: + within_range.append(False) + + if all(within_range): + returner = True + + return returner + +def sort_xtals( chunk_df ): + + # extract xtals + xtal_df = pd.DataFrame() + counter = 0 + for index, row in chunk_df.iterrows(): + + chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ] + + if hit: + + # find xtals and header + header = extract_header( chunk ) + xtals, cells_df = extract_xtals( chunk ) + + # make header same length as xtals + header = header*len(xtals) + + # concat results + xtal_df_1 = pd.DataFrame() + xtal_df_1[ "header" ] = header + xtal_df_1[ "xtals" ] = xtals + xtal_df_1[ "image_no" ] = image_no + xtal_df_1 = pd.concat( ( xtal_df_1, cells_df ), axis=1 ) + xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) ) + + # add count and print every 1000s + counter = counter + len(xtals) + if counter % 1000 == 0: + print( counter, end='\r' ) + print( "done" ) + + # sort by image no and reindex + xtal_df = xtal_df.sort_values( by=[ "image_no" ] ) + xtal_df = xtal_df.reset_index( drop=True ) + + return xtal_df + +def main( input_file, output, cell_tolerance, angle_tolerance, ideal_centre ): + + # get geom and cell file headers + print( "getting header info from .stream file" ) + geom = get_header( "geom", input_file ) + cell = get_header( "cell", input_file ) + print( "done" ) + + # extract chunks + print( "finding chucks" ) + chunk_df = extract_chunks( input_file ) + # display no. of chunks + print( "found {0} chunks".format( len(chunk_df) ) ) + # remove rows without xtals + chunk_df = chunk_df.loc[chunk_df.hit, :] + print( "found {0} hits (not including multiples)".format( len(chunk_df) ) ) + print( "done" ) + + print( "sorting xtals from chunks" ) + xtal_df = sort_xtals( chunk_df ) + print( "done" ) + + print( "finding cells that are within tolerance" ) + # select cells that are within tolerance + xtal_df[ "select" ] = xtal_df.apply(lambda x: is_within_range( x[ "a" ], x[ "b" ], x[ "c" ], + x[ "alpha" ], x[ "beta" ], x[ "gamma" ], + cell_tolerance, + angle_tolerance, + ideal_centre ), axis=1 + ) + select_df = xtal_df[ xtal_df[ "select" ] == True ] + print( "done" ) + + print( "writing {0} to output file".format( len( select_df ) ) ) + crystals = select_df.xtals.to_list() + chunk_header = select_df.header.to_list() + output_file = "{0}.stream".format( output ) + write_to_file( geom, cell, chunk_header, crystals, output_file ) + print( "done" ) + +def list_of_floats(arg): + return list(map(int, arg.split(','))) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-s", + "--stream", + help="input stream file", + required=True, + type=os.path.abspath + ) + parser.add_argument( + "-o", + "--output", + help="output stream file with sampled xtals", + type=str, + default="sample" + ) + # parser.add_argument( + # "-n", + # "--sample", + # help="size of sample to take from input.stream", + # type=int, + # required=True + # ) + + args = parser.parse_args() + + cell_tolerance = 0.2 + angle_tolerance = 0.5 + ideal_centre = [ 78.5, 78.5, 38.45, 90, 90, 90 ] + + # run main + main( args.stream, args.output, cell_tolerance, angle_tolerance, ideal_centre ) \ No newline at end of file