#!/usr/bin/env python3 # author J.Beale """ # aim randomly select a series of crystals from a stream file and then compile them into the correctly formated .stream run partialator and make an mtz # usage python stream_random.py -s -o output file names -n sample size -r how many repeat random samples do you want? -p pointgroup -c -a max-adu. Default = 12000 -g spacegroup -r number of residues # output .stream file with random sample of xtals """ # modules import re import argparse import pandas as pd import numpy as np import os, errno import subprocess def extract_chunks( input_file ): # setup chunk_df = pd.DataFrame() image_no = [] chunks = [] hits = [] collect_lines = False # Open the input file for reading with open(input_file, 'r') as f: for line in f: # Check for the start condition if line.startswith('----- Begin chunk -----'): hit = False collect_lines = True chunk_lines = [] if collect_lines: chunk_lines.append(line) # find image_no if line.startswith( "Event:" ): image_search = re.findall( r"Event: //(\d+)", line ) image = int(image_search[0]) image_no.append( image ) # is there a hit in chunk if line.startswith( "Cell parameters" ): hit = True if line.startswith('----- End chunk -----'): collect_lines = False # Stop collecting lines chunks.append( chunk_lines ) hits.append( hit ) chunk_df[ "chunks" ] = chunks chunk_df[ "image_no" ] = image_no chunk_df[ "hit" ] = hits return chunk_df def extract_xtals( chunk ): # setup xtals = [] collect_crystal_lines = False # Open the input file for reading for line in chunk: # Check for the xtals start condition if line.startswith('--- Begin crystal'): collect_crystal_lines = True xtal_lines = [] if collect_crystal_lines: xtal_lines.append(line) if line.startswith('--- End crystal\n'): collect_crystal_lines = False # Stop collecting lines xtals.append( xtal_lines ) return xtals def extract_header( chunk ): # setup header = [] collect_header_lines = False # Open the input file for reading for line in chunk: # Check for the xtals start condition if line.startswith('----- Begin chunk -----'): collect_header_lines = True header_lines = [] if collect_header_lines: header_lines.append(line) if line.startswith('End of peak list'): collect_header_lines = False # Stop collecting lines header.append( header_lines ) return header def get_header( header, input_file ): if header == "geom": start_keyword = "----- Begin geometry file -----" end_keyword = "----- End geometry file -----" if header == "cell": start_keyword = "----- Begin unit cell -----" end_keyword = "----- End unit cell -----" # setup collect_lines = False headers = [] # Open the input file for reading with open(input_file, 'r') as f: for line in f: # Check for the start condition if line.strip() == start_keyword: collect_lines = True headers_lines = [] # Collect lines between start and end conditions if collect_lines: headers_lines.append(line) # Check for the end condition if line.strip() == end_keyword: collect_lines = False # Stop collecting lines headers.append(headers_lines) return headers[0] def write_to_file( geom, cell, chunk_header, crystals, output_file ): # Write sections with matching cell parameters to the output file with open(output_file, 'w') as out_file: out_file.write('CrystFEL stream format 2.3\n') out_file.write('Generated by CrystFEL 0.10.2\n') out_file.writelines(geom) out_file.writelines(cell) for crystal, header in zip( crystals, chunk_header ): out_file.writelines( header ) out_file.writelines( crystal ) out_file.writelines( "----- End chunk -----\n" ) def run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues ): # partialator file name part_run_file = "merge_{0}.sh".format( part_name ) # write file part_sh = open( part_run_file, "w" ) part_sh.write( "#!/bin/sh\n\n" ) part_sh.write( "source /sf/cristallina/applications/mx/conda/miniconda/bin/activate\n" ) part_sh.write( "conda activate crmx38-analysis\n\n" ) part_sh.write( "python {0} -n merge_{1}".format( part_mtz_py, part_name ) ) part_sh.write( " -s {0}".format( stream_file ) ) part_sh.write( " -p {0}".format( pointgroup ) ) part_sh.write( " -c {0}".format( cell_file ) ) part_sh.write( " -r 1.3" ) part_sh.write( " -g {0}".format( spacegroup ) ) part_sh.write( " -R {0}".format( residues ) ) part_sh.close() # make file executable subprocess.call( [ "chmod", "+x", "{0}".format( part_run_file ) ] ) # return partialator file name return part_run_file def strip_stream( input_stream, samples, output, repeat ): # get geom and cell file headers print( "getting header info from .stream file" ) geom = get_header( "geom", input_stream ) cell = get_header( "cell", input_stream ) print( "done" ) # extract chunks print( "finding chucks" ) chunk_df = extract_chunks( input_stream ) # display no. of chunks print( "found {0} chunks".format( len(chunk_df) ) ) # remove rows without xtals chunk_df = chunk_df.loc[chunk_df.hit, :] print( "found {0} hits (not including multiples)".format( len(chunk_df) ) ) print( "done" ) # extract xtals print( "get xtals from chunks" ) xtal_df = pd.DataFrame() counter = 0 for index, row in chunk_df.iterrows(): chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ] if hit: # find xtals and header header = extract_header( chunk ) xtals = extract_xtals( chunk ) # make header same length as xtals header = header*len(xtals) # concat results xtal_df_1 = pd.DataFrame() xtal_df_1[ "header" ] = header xtal_df_1[ "xtals" ] = xtals xtal_df_1[ "image_no" ] = image_no xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) ) # add count and print every 1000s counter = counter + len(xtals) if counter % 1000 == 0: print( counter, end='\r' ) print( "done" ) # sort by image no and reindex xtal_df = xtal_df.sort_values( by=[ "image_no" ] ) xtal_df = xtal_df.reset_index( drop=True ) stream_df = pd.DataFrame() # randomly n number of sample of xtals with replacement for sample in samples: print( "taking {0} {1} sample".format( repeat, sample ) ) for x in range( 0, repeat ): try: sample_df = xtal_df.sample( sample, replace=True ) except ValueError: print( "input image sample larger than number of hits. Sample should be less than {0}".format( len(xtal_df) ) ) # rebuild .stream from sample print( "writing {0} to output file".format( x ) ) crystals = sample_df.xtals.to_list() chunk_header = sample_df.header.to_list() output_file = "{0}_{1}_{2}.stream".format( output, sample ,x ) write_to_file( geom, cell, chunk_header, crystals, output_file ) # append stream file to stream list data = [ { "stream_file" : output_file, "sample" : sample, "number" : x } ] stream_df_1 = pd.DataFrame( data ) stream_df = pd.concat( ( stream_df, stream_df_1 ) ) print( "done {0}".format( x ) ) print( "done" ) return stream_df def make_process_dir( dir ): # make process directory try: os.makedirs( dir ) except OSError as e: if e.errno != errno.EEXIST: raise def main( cwd, input_stream, samples, output, repeat, script_dir, pointgroup, cell_file, spacegroup, residues ): # make sample stream files stream_df = strip_stream( input_stream, samples, output, repeat ) # run partialator and make mtz for index, row in stream_df.iterrows(): stream_file, sample, number = row[ "stream_file" ], row[ "sample" ], row[ "number" ] print( "running partialatory for sample={0}, run={1}".format( sample, number ) ) # run partialator and make mtz part_mtz_py = "{0}/partialator_mtz.py".format( script_dir ) part_name = "{0}_{1}".format( sample, number ) part_run_file = run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues) subprocess.run( "./{0}".format( part_run_file ) ) os.chdir( cwd ) def list_of_floats(arg): return list(map(int, arg.split(','))) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-s", "--stream", help="input stream file", required=True, type=os.path.abspath ) parser.add_argument( "-o", "--output", help="output stream file with sampled xtals", type=str, default="sample" ) parser.add_argument( "-n", "--sample", help="size of sample to take from input.stream", type=list_of_floats, required=True ) parser.add_argument( "-r", "--repeat", help="how many samples would you like?", type=int ) parser.add_argument( "-p", "--pointgroup", help="pointgroup used by CrystFEL for partialator run", type=str, required=True ) parser.add_argument( "-c", "--cell_file", help="path to CrystFEL cell file for partialator.", type=os.path.abspath, required=True ) parser.add_argument( "-g", "--spacegroup", help="spacegroup for making mtz, e.g P43212", type=str, required=True ) parser.add_argument( "-R", "--residues", help="number of residues for truncate, e.g., hewl = 129", type=int, required=True ) args = parser.parse_args() # does if need to be run multiple times? if args.repeat is None: repeat = 1 else: repeat = args.repeat # run main cwd = os.getcwd() main( cwd, args.stream, args.sample, args.output, repeat, args.pointgroup, args.cell_file, args.spacegroup, args.residues )