#!/usr/bin/env python3 # author J.Beale """ # aim analyses and selects crystals based on their reported resolution from crystfel # usage python stream_random.py -s -o output file names -p plots histogram of images resolution -r selects all images with higher resolution than value # output either - histogram of images by resolution - .stream file of selected images """ # modules import re import argparse import pandas as pd import numpy as np import os import matplotlib.pyplot as plt def extract_chunks( input_file ): # setup chunk_df = pd.DataFrame() image_no = [] chunks = [] hits = [] collect_lines = False # Open the input file for reading with open(input_file, 'r') as f: for line in f: # Check for the start condition if line.startswith('----- Begin chunk -----'): hit = False collect_lines = True chunk_lines = [] if collect_lines: chunk_lines.append(line) # find image_no if line.startswith( "Event:" ): image_search = re.findall( r"Event: //(\d+)", line ) image = int(image_search[0]) image_no.append( image ) # is there a hit in chunk if line.startswith( "Cell parameters" ): hit = True if line.startswith('----- End chunk -----'): collect_lines = False # Stop collecting lines chunks.append( chunk_lines ) hits.append( hit ) chunk_df[ "chunks" ] = chunks chunk_df[ "image_no" ] = image_no chunk_df[ "hit" ] = hits return chunk_df def scrub_res( line ): # get resolution try: pattern = r"diffraction_resolution_limit\s=\s\d\.\d+\snm\^-1\sor\s(\d+\.\d+)\sA" res = re.search( pattern, line ).group(1) except AttributeError as e: res = np.nan return float( res ) def extract_xtals( chunk ): # setup xtals = [] resolutions = [] collect_crystal_lines = False # Open the input file for reading for line in chunk: # Check for the xtals start condition if line.startswith('--- Begin crystal'): collect_crystal_lines = True xtal_lines = [] if collect_crystal_lines: xtal_lines.append(line) if line.startswith('--- End crystal\n'): collect_crystal_lines = False # Stop collecting lines xtals.append( xtal_lines ) if line.startswith( "diffraction_resolution_limit" ): res = scrub_res( line ) resolutions.append( res ) return xtals, resolutions def extract_header( chunk ): # setup header = [] collect_header_lines = False # Open the input file for reading for line in chunk: # Check for the xtals start condition if line.startswith('----- Begin chunk -----'): collect_header_lines = True header_lines = [] if collect_header_lines: header_lines.append(line) if line.startswith('End of peak list'): collect_header_lines = False # Stop collecting lines header.append( header_lines ) return header def get_header( header, input_file ): if header == "geom": start_keyword = "----- Begin geometry file -----" end_keyword = "----- End geometry file -----" if header == "cell": start_keyword = "----- Begin unit cell -----" end_keyword = "----- End unit cell -----" # setup collect_lines = False headers = [] # Open the input file for reading with open(input_file, 'r') as f: for line in f: # Check for the start condition if line.strip() == start_keyword: collect_lines = True headers_lines = [] # Collect lines between start and end conditions if collect_lines: headers_lines.append(line) # Check for the end condition if line.strip() == end_keyword: collect_lines = False # Stop collecting lines headers.append(headers_lines) return headers[0] def write_to_file( geom, cell, chunk_header, crystals, output_file ): # Write sections with matching cell parameters to the output file with open(output_file, 'w') as out_file: out_file.write('CrystFEL stream format 2.3\n') out_file.write('Generated by CrystFEL 0.10.2\n') out_file.writelines(geom) out_file.writelines(cell) for crystal, header in zip( crystals, chunk_header ): out_file.writelines( header ) out_file.writelines( crystal ) out_file.writelines( "----- End chunk -----\n" ) def sort_xtals( chunk_df ): # extract xtals xtal_df = pd.DataFrame() counter = 0 for index, row in chunk_df.iterrows(): chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ] if hit: # find xtals and header header = extract_header( chunk ) xtals, resolutions = extract_xtals( chunk ) # make header same length as xtals header = header*len(xtals) # concat results xtal_df_1 = pd.DataFrame() xtal_df_1[ "header" ] = header xtal_df_1[ "xtals" ] = xtals xtal_df_1[ "image_no" ] = image_no xtal_df_1[ "resolution" ] = resolutions xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) ) # add count and print every 1000s counter = counter + len(xtals) if counter % 1000 == 0: print( counter, end='\r' ) print( "done" ) # sort by image no and reindex xtal_df = xtal_df.sort_values( by=[ "image_no" ] ) xtal_df = xtal_df.reset_index( drop=True ) return xtal_df def plot_res_histogram( res_df, res_median, res_q1, res_q3 ): # calculate relative numbers of bins bin_range = 0.25 res_min = res_df.min().values[0] res_max = res_df.max().values[0] q1_bins = round( ( res_q1 - res_min )/bin_range ) q2_bins = round( ( res_median - res_q1 )/bin_range ) q3_bins = round( ( res_q3 - res_median )/bin_range ) q4_bins = round( ( res_max - res_q3 )/bin_range ) # cut data by quantile df_q1 = res_df[ res_df.resolution <= res_q1 ] df_q2 = res_df[ ( res_df.resolution > res_q1 ) & ( res_df.resolution <= res_median ) ] df_q3 = res_df[ ( res_df.resolution > res_median ) & ( res_df.resolution <= res_q3 ) ] df_q4 = res_df[ res_df.resolution > res_q3 ] # plot histogram of resolution fig, axs = plt.subplots() axs.hist( df_q1, bins=q1_bins, rwidth=1, stacked=True, color="blue", label="x