#!/usr/bin/env python3 # author J.Beale """ # aim to select crystals from a stram file based on the uc distances and angles based on a tolerance to them # usage python # output .stream file with extracted crystals based on tolerance of uc """ # modules import re import argparse import pandas as pd def get_tolerence(input_file): stream = open(input_file, 'r').read() cell_lst, xtals = scrub_cells( stream ) # res_df cols = [ "a", "b", "c", "alpha", "beta", "gamma" ] df = pd.DataFrame( cell_lst, columns=cols ) # convert all to floats df = df.astype(float) mean_a, std_a = round( df.a.mean()*10, 2 ), round( df.a.std()*10, 2 ) mean_b, std_b = round( df.b.mean()*10, 2 ), round( df.b.std()*10, 2 ) mean_c, std_c = round( df.c.mean()*10, 2 ), round( df.c.std()*10, 2 ) mean_alpha, std_alpha = round( df.alpha.mean(), 2 ), round( df.alpha.std(), 2 ) mean_beta, std_beta = round(df.beta.mean(), 2 ), round( df.beta.std(), 2 ) mean_gamma, std_gamma = round( df.gamma.mean(), 2 ), round( df.gamma.std(), 2 ) return [(std_a/10)*2,(std_b/10)*2,(std_c/10)*2,std_alpha*3,std_beta*3,std_gamma*3] # Function to check if cell parameters are within the specified range def is_within_range( params, target_cell_params, target_angles, tolerance_list ): returner = False within_range = [] for i in range(3): if abs(params[i] - target_cell_params[i]) < tolerance_list[i]: within_range.append(True) else: within_range.append(False) for i in range(3, 6): if abs(params[i] - target_angles[i - 3]) < tolerance_list[i]: within_range.append(True) else: within_range.append(False) if all(within_range): returner = True return returner def scrub_cells( stream ): # get uc values from stream file # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg # scrub clen and return - else nan try: pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg" cell_lst = re.findall( pattern, stream ) xtals = len( cell_lst ) if AttributeError: return cell_lst, xtals except AttributeError: return np.nan def extract_chunks( input_file, target_cell_params, target_angles, tolerance_list ): # setup chunks = [] collect_lines = False check_crystal=False # Open the input file for reading with open(input_file, 'r') as f: for line in f: # Check for the start condition if line.strip() == '----- Begin chunk -----': collect_lines = True target_chunk = False chunk_lines = [] results = [] check_crystal=False crystal=0 # Collect lines between start and end conditions # Check for the end condition if collect_lines: chunk_lines.append(line) if line.startswith('--- Begin crystal'): check_crystal=True crystal_remover=False crystal_lines=[] elif line.startswith('--- End crystal'): check_crystal=False if crystal_remover: chunk_lines = [l for l in chunk_lines if l not in crystal_lines] if check_crystal: crystal_lines.append(line) if line.startswith('Cell parameters'): cells = re.findall(r"(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg", line) crystal+=1 if cells: for cell in cells: cell = list( cell ) param = [float(i) for i in cell] result = is_within_range( param, target_cell_params, target_angles, tolerance_list ) #print(result) results.append( result ) if all( results ): #print(results) target_chunk = True else: crystal_remover=True #print('removing cell: ', param) if line.strip() == '----- End chunk -----': collect_lines = False # Stop collecting lines #for line in chunk_lines: if target_chunk == True: chunks.append( chunk_lines ) return chunks def get_header( header, input_file ): if header == "geom": start_keyword = "----- Begin geometry file -----" end_keyword = "----- End geometry file -----" if header == "cell": start_keyword = "----- Begin unit cell -----" end_keyword = "----- End unit cell -----" # setup collect_lines = False headers = [] # Open the input file for reading with open(input_file, 'r') as f: for line in f: # Check for the start condition if line.strip() == start_keyword: collect_lines = True headers_lines = [] # Collect lines between start and end conditions if collect_lines: headers_lines.append(line) # Check for the end condition if line.strip() == end_keyword: collect_lines = False # Stop collecting lines headers.append(headers_lines) return headers[0] def write_to_file( geom, cell, sections, output_file ): # Write sections with matching cell parameters to the output file with open(output_file, 'w') as out_file: out_file.write('CrystFEL stream format 2.3\n') out_file.write('Generated by CrystFEL 0.10.2\n') out_file.writelines(geom) out_file.writelines(cell) for section in sections: out_file.write('--- Begin crystal\n') # out_file.writelines('--- Begin crystal\n' + section + "\n--- End crystal") out_file.writelines(section) out_file.write('--- End crystal\n') out_file.write('----- End chunk -----\n') def main( input_file, output_file, target_cell_params, target_angles ): tolerance_list = get_tolerence(input_file) # get geom and cell file headers print( "getting header info from .stream file" ) geom = get_header( "geom", input_file ) cell = get_header( "cell", input_file ) print( "done" ) # get chunks with defined uc tolerances print( "finding crystals with unit cells of around:" ) print( "{0} nm".format( target_cell_params ) ) print( "{0} deg".format( target_angles ) ) chunks = extract_chunks( input_file, target_cell_params, target_angles, tolerance_list ) print( "found {0} crystals".format( len(chunks) ) ) # write to file print( "writing to new file" ) write_to_file( geom, cell, chunks, output_file ) print( "done" ) def list_of_floats(arg): return list(map(float, arg.split(','))) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-s", "--stream", help="input stream file", required=True, type=str, ) parser.add_argument( "-o", "--output", help="output stream file with selected crystals", type=str, default="selected_crystals.stream" ) parser.add_argument( "-c", "--cell", help="cell lengths to be found in nm, e.g. 73.4,68.7,75.6 - must be a float/no spaces", type=list_of_floats, required=True ) parser.add_argument( "-a", "--angles", help="anlges to be found, e.g., 90.0,105.6,90.0 - must be a float/no spaces", type=list_of_floats, required=True ) args = parser.parse_args() cell = [ round( x/10,4 ) for x in args.cell ] # run main main( args.stream, args.output, cell, args.angles )