runs partialator and makes an mtz after taking a bootstrapped sample
This commit is contained in:
362
reduction_tools/stream_random_mtz.py
Normal file
362
reduction_tools/stream_random_mtz.py
Normal file
@@ -0,0 +1,362 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# author J.Beale
|
||||||
|
|
||||||
|
"""
|
||||||
|
# aim
|
||||||
|
randomly select a series of crystals from a stream file and
|
||||||
|
then compile them into the correctly formatted .stream
|
||||||
|
run partialator and make an mtz
|
||||||
|
|
||||||
|
# usage
|
||||||
|
python stream_random_mtz.py -s <path to stream>
|
||||||
|
-o output file names
|
||||||
|
-n sample size
|
||||||
|
-r how many repeat random samples do you want?
|
||||||
|
-p pointgroup
|
||||||
|
-c <path-to-cell-file>
|
||||||
|
-a max-adu. Default = 12000
|
||||||
|
-g spacegroup
|
||||||
|
-R number of residues
|
||||||
|
|
||||||
|
# output
|
||||||
|
.stream file with random sample of xtals
|
||||||
|
"""
|
||||||
|
|
||||||
|
# modules
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import os, errno
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
def extract_chunks( input_file ):
    """
    Split a CrystFEL .stream file into its chunks.

    Every run of lines from a '----- Begin chunk -----' line up to and
    including the next '----- End chunk -----' line becomes one row of the
    returned table. Lines outside of a chunk (the file header) are ignored,
    so the three columns always describe the same chunk.

    Parameters
    ----------
    input_file : path of the .stream file to read

    Returns
    -------
    pandas.DataFrame with one row per chunk and the columns
        chunks   - list of the raw lines of the chunk (markers included)
        image_no - int taken from the chunk's "Event: //<n>" line
        hit      - True if any line of the chunk starts with "Cell parameters"

    Raises
    ------
    ValueError if a chunk ends without a parsable "Event: //<n>" line
    """

    # setup
    event_pattern = re.compile( r"Event: //(\d+)" )
    chunks = []
    image_no = []
    hits = []

    # state of the chunk being read; chunk_lines is None between chunks
    chunk_lines = None
    image = None
    hit = False

    # Open the input file for reading
    with open( input_file, 'r' ) as f:
        for line in f:

            # Check for the start condition
            if line.startswith( '----- Begin chunk -----' ):
                chunk_lines = []
                image = None
                hit = False

            # anything outside of a chunk is not part of the table
            if chunk_lines is None:
                continue
            chunk_lines.append( line )

            # find image_no - the first event line of the chunk wins
            if line.startswith( "Event:" ) and image is None:
                image_search = event_pattern.search( line )
                if image_search:
                    image = int( image_search.group( 1 ) )

            # is there a hit in chunk
            elif line.startswith( "Cell parameters" ):
                hit = True

            elif line.startswith( '----- End chunk -----' ):
                if image is None:
                    raise ValueError(
                        "chunk {0} in {1} has no 'Event: //<n>' line".format(
                            len( chunks ) + 1, input_file ) )
                chunks.append( chunk_lines )
                image_no.append( image )
                hits.append( hit )
                chunk_lines = None # Stop collecting lines

    chunk_df = pd.DataFrame( { "chunks" : chunks,
                               "image_no" : image_no,
                               "hit" : hits } )
    # keep the hit column boolean even when no chunk was found
    chunk_df[ "hit" ] = chunk_df[ "hit" ].astype( bool )

    return chunk_df
|
||||||
|
|
||||||
|
def extract_xtals( chunk ):
    """
    Pull the individual crystal blocks out of one chunk.

    A crystal block runs from a line starting with '--- Begin crystal' up to
    and including the next line starting with '--- End crystal'. The end
    marker is matched by prefix only, so a final line without a newline or a
    line with a CRLF ending still closes the block.

    Parameters
    ----------
    chunk : iterable of the lines of one chunk

    Returns
    -------
    list with one entry per crystal, each entry being the list of lines of
    that crystal (markers included). Empty if the chunk has no crystal.
    """

    # setup
    xtals = []
    xtal_lines = None # None while outside of a crystal block

    for line in chunk:

        # Check for the xtals start condition
        if line.startswith( '--- Begin crystal' ):
            xtal_lines = []

        # ignore everything that is not inside a crystal block, including
        # an end marker that was never opened
        if xtal_lines is None:
            continue
        xtal_lines.append( line )

        if line.startswith( '--- End crystal' ):
            xtals.append( xtal_lines )
            xtal_lines = None # Stop collecting lines

    return xtals
|
||||||
|
|
||||||
|
def extract_header( chunk ):
    """
    Collect the part of a chunk that precedes its crystals.

    Lines are recorded from a line starting with '----- Begin chunk -----'
    up to and including the next line starting with 'End of peak list'.

    Parameters
    ----------
    chunk : iterable of the lines of one chunk

    Returns
    -------
    list holding one list of lines for every completed header section
    """

    found = []
    recording = False

    for text in chunk:

        # a begin marker starts a fresh header section
        if text.startswith( '----- Begin chunk -----' ):
            recording, current = True, []

        if recording:
            current.append( text )

        # the peak list is the last thing before the crystals
        if text.startswith( 'End of peak list' ):
            recording = False
            found.append( current )

    return found
|
||||||
|
|
||||||
|
def get_header( header, input_file ):
    """
    Return the first geometry or unit-cell section of a .stream file.

    Parameters
    ----------
    header     : "geom" for the section between '----- Begin geometry file -----'
                 and '----- End geometry file -----', or "cell" for the section
                 between '----- Begin unit cell -----' and '----- End unit cell -----'
    input_file : path of the .stream file to read

    Returns
    -------
    list of the raw lines of the section, both marker lines included

    Raises
    ------
    ValueError if header is neither "geom" nor "cell"
    IndexError if the file holds no complete section of that kind
    """

    keywords = {
        "geom" : ( "----- Begin geometry file -----", "----- End geometry file -----" ),
        "cell" : ( "----- Begin unit cell -----", "----- End unit cell -----" ),
    }
    try:
        start_keyword, end_keyword = keywords[ header ]
    except KeyError:
        raise ValueError(
            "header must be 'geom' or 'cell', not {0!r}".format( header ) ) from None

    # setup - None while outside of the wanted section
    headers_lines = None

    # Open the input file for reading
    with open( input_file, 'r' ) as f:
        for line in f:
            stripped = line.strip()

            # Check for the start condition
            if stripped == start_keyword:
                headers_lines = []

            # Collect lines between start and end conditions
            if headers_lines is None:
                continue
            headers_lines.append( line )

            # Check for the end condition - only the first section is
            # wanted, so the rest of the file does not need to be read
            if stripped == end_keyword:
                return headers_lines

    raise IndexError(
        "no complete '{0}' section found in {1}".format( header, input_file ) )
|
||||||
|
|
||||||
|
def write_to_file( geom, cell, chunk_header, crystals, output_file ):
    """
    Write a .stream file made of the given headers and crystals.

    The file starts with two fixed CrystFEL banner lines, followed by the
    geometry lines and the unit cell lines. After that every crystal is
    written as its own chunk: chunk header, crystal lines, end-of-chunk marker.

    Parameters
    ----------
    geom         : lines of the geometry section
    cell         : lines of the unit cell section
    chunk_header : one list of header lines per crystal
    crystals     : one list of crystal lines per crystal
    output_file  : path of the file to (over)write
    """

    chunk_end = "----- End chunk -----\n"

    with open( output_file, 'w' ) as stream_out:

        # file banner and the two global sections
        stream_out.write( 'CrystFEL stream format 2.3\n' )
        stream_out.write( 'Generated by CrystFEL 0.10.2\n' )
        stream_out.writelines( geom )
        stream_out.writelines( cell )

        # one chunk per sampled crystal
        for xtal_lines, head_lines in zip( crystals, chunk_header ):
            stream_out.writelines( head_lines )
            stream_out.writelines( xtal_lines )
            stream_out.write( chunk_end )
|
||||||
|
|
||||||
|
def run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues ):
    """
    Write an executable shell script that runs the partialator/mtz script.

    The script is written to "merge_<part_name>.sh" in the current working
    directory. It activates the conda environment and then calls
    part_mtz_py with the given arguments and a fixed "-r 1.3" option.
    Every argument is shell-quoted, so paths containing spaces or other
    special characters survive; plain values are written unchanged.

    Parameters
    ----------
    part_mtz_py : path of the python script to call
    stream_file : .stream file passed with -s
    part_name   : label used for the script name and the -n merge_<part_name> option
    pointgroup  : value passed with -p
    cell_file   : cell file passed with -c
    spacegroup  : value passed with -g
    residues    : value passed with -R

    Returns
    -------
    name of the shell script that was written
    """
    import shlex

    # partialator file name
    part_run_file = "merge_{0}.sh".format( part_name )

    # the command is written as a single line without a trailing newline
    command = [ "python", shlex.quote( str( part_mtz_py ) ),
                "-n", shlex.quote( "merge_{0}".format( part_name ) ),
                "-s", shlex.quote( str( stream_file ) ),
                "-p", shlex.quote( str( pointgroup ) ),
                "-c", shlex.quote( str( cell_file ) ),
                "-r", "1.3",
                "-g", shlex.quote( str( spacegroup ) ),
                "-R", shlex.quote( str( residues ) ) ]

    # write file - the context manager closes it even if a write fails
    with open( part_run_file, "w" ) as part_sh:
        part_sh.write( "#!/bin/sh\n\n" )
        part_sh.write( "source /sf/cristallina/applications/mx/conda/miniconda/bin/activate\n" )
        part_sh.write( "conda activate crmx38-analysis\n\n" )
        part_sh.write( " ".join( command ) )

    # make file executable
    subprocess.call( [ "chmod", "+x", part_run_file ] )

    # return partialator file name
    return part_run_file
|
||||||
|
|
||||||
|
def strip_stream( input_stream, samples, output, repeat ):
    """
    Write bootstrap-sampled .stream files drawn from one input .stream file.

    All crystals of the input file are collected, each one paired with the
    header of the chunk it came from. For every size in samples, repeat
    random samples are drawn with replacement and each sample is written to
    "<output>_<size>_<run>.stream" together with the geometry and unit cell
    sections of the input file.

    Parameters
    ----------
    input_stream : path of the .stream file to sample from
    samples      : iterable of sample sizes (number of crystals per file)
    output       : prefix of the written .stream files
    repeat       : number of samples to draw for every sample size

    Returns
    -------
    pandas.DataFrame with one row per written file and the columns
    stream_file, sample and number (the run index). The frame is empty if
    the input holds no crystals; a sample size that cannot be drawn is
    reported and skipped.
    """

    stream_columns = [ "stream_file", "sample", "number" ]

    # get geom and cell file headers
    print( "getting header info from .stream file" )
    geom = get_header( "geom", input_stream )
    cell = get_header( "cell", input_stream )
    print( "done" )

    # extract chunks
    print( "finding chucks" )
    chunk_df = extract_chunks( input_stream )
    # display no. of chunks
    print( "found {0} chunks".format( len(chunk_df) ) )
    # remove rows without xtals
    chunk_df = chunk_df.loc[ chunk_df[ "hit" ].astype( bool ), : ]
    print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
    print( "done" )

    # extract xtals - rows are gathered in a list and turned into a table
    # once, because concatenating inside the loop copies the table each time
    print( "get xtals from chunks" )
    xtal_rows = []
    counter = 0
    next_report = 1000
    for chunk, image_no in zip( chunk_df[ "chunks" ], chunk_df[ "image_no" ] ):

        # find xtals and header
        header = extract_header( chunk )
        xtals = extract_xtals( chunk )

        # a crystal cannot be written back without its chunk header
        if not header:
            print( "skipping image {0}: no chunk header found".format( image_no ) )
            continue

        # every xtal of the chunk is paired with the same header
        for xtal in xtals:
            xtal_rows.append( { "header" : header[0],
                                "xtals" : xtal,
                                "image_no" : image_no } )

        # add count and print every 1000s, also when a multi-crystal chunk
        # steps over the exact multiple
        counter = counter + len(xtals)
        if counter >= next_report:
            print( counter, end='\r' )
            next_report = ( counter // 1000 + 1 ) * 1000
    print( "done" )

    # sort by image no and reindex
    xtal_df = pd.DataFrame( xtal_rows, columns=[ "header", "xtals", "image_no" ] )
    xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
    xtal_df = xtal_df.reset_index( drop=True )

    stream_rows = []

    if xtal_df.empty:
        print( "no crystals found in {0} - nothing to sample".format( input_stream ) )
        return pd.DataFrame( stream_rows, columns=stream_columns )

    # randomly n number of sample of xtals with replacement
    for sample in samples:
        print( "taking {0} {1} sample".format( repeat, sample ) )
        for x in range( 0, repeat ):

            try:
                sample_df = xtal_df.sample( sample, replace=True )
            except ValueError as error:
                # sampling with replacement may exceed the number of xtals,
                # so this only happens for an invalid size - every repeat
                # of this size would fail the same way
                print( "could not take a sample of {0} xtals from {1}: {2}".format( sample, len(xtal_df), error ) )
                break

            # rebuild .stream from sample
            print( "writing {0} to output file".format( x ) )
            crystals = sample_df.xtals.to_list()
            chunk_header = sample_df.header.to_list()
            output_file = "{0}_{1}_{2}.stream".format( output, sample ,x )
            write_to_file( geom, cell, chunk_header, crystals, output_file )

            # append stream file to stream list
            stream_rows.append( { "stream_file" : output_file,
                                  "sample" : sample,
                                  "number" : x
                                  } )

            print( "done {0}".format( x ) )
    print( "done" )

    return pd.DataFrame( stream_rows, columns=stream_columns )
|
||||||
|
|
||||||
|
def make_process_dir( dir ):
    """
    Create the directory dir, including missing parent directories.

    A path that already exists is accepted silently; any other failure
    to create the directory is passed on to the caller.
    """
    try:
        os.makedirs( dir )
    except FileExistsError:
        # FileExistsError is the OSError raised for errno EEXIST
        pass
|
||||||
|
|
||||||
|
def main( cwd, input_stream, samples, output, repeat, script_dir, pointgroup, cell_file, spacegroup, residues ):
    """
    Make the sampled .stream files and run the merge script on each of them.

    strip_stream writes the sampled .stream files. For every written file a
    shell script is generated with run_partialator_mtz, calling
    "<script_dir>/partialator_mtz.py", and executed from the current
    directory. After each run the working directory is set back to cwd.
    """

    # make sample stream files
    stream_df = strip_stream( input_stream, samples, output, repeat )

    # the script that is called is the same for every run
    part_mtz_py = "{0}/partialator_mtz.py".format( script_dir )

    # run partialator and make mtz
    for _, run in stream_df.iterrows():

        stream_file = run[ "stream_file" ]
        sample = run[ "sample" ]
        number = run[ "number" ]
        print( "running partialatory for sample={0}, run={1}".format( sample, number ) )

        # write the run script for this sample and execute it
        part_name = "{0}_{1}".format( sample, number )
        part_run_file = run_partialator_mtz(
            part_mtz_py, stream_file, part_name,
            pointgroup, cell_file, spacegroup, residues )
        subprocess.run( "./{0}".format( part_run_file ) )

        os.chdir( cwd )
|
||||||
|
|
||||||
|
def list_of_floats(arg):
    """
    Turn a comma separated string such as "100,200,500" into a list.

    Despite the name, every item is converted with int(), so the result is
    a list of ints. An item that is not an integer raises ValueError.
    """
    return [ int( item ) for item in arg.split( ',' ) ]
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # command line interface
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--stream",
        help="input stream file",
        required=True,
        type=os.path.abspath
    )
    parser.add_argument(
        "-o",
        "--output",
        help="output stream file with sampled xtals",
        type=str,
        default="sample"
    )
    parser.add_argument(
        "-n",
        "--sample",
        help="size of sample to take from input.stream",
        type=list_of_floats,
        required=True
    )
    parser.add_argument(
        "-r",
        "--repeat",
        help="how many samples would you like?",
        type=int
    )
    parser.add_argument(
        "-p",
        "--pointgroup",
        help="pointgroup used by CrystFEL for partialator run",
        type=str,
        required=True
    )
    parser.add_argument(
        "-c",
        "--cell_file",
        help="path to CrystFEL cell file for partialator.",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-g",
        "--spacegroup",
        help="spacegroup for making mtz, e.g P43212",
        type=str,
        required=True
    )
    parser.add_argument(
        "-R",
        "--residues",
        help="number of residues for truncate, e.g., hewl = 129",
        type=int,
        required=True
    )
    args = parser.parse_args()

    # does it need to be run multiple times? a single sample is the default
    repeat = 1 if args.repeat is None else args.repeat

    # main() builds "<script_dir>/partialator_mtz.py"; this argument was
    # missing from the call, which made every run fail with a TypeError.
    # NOTE(review): assumes partialator_mtz.py sits next to this script - confirm
    script_dir = os.path.dirname( os.path.abspath( __file__ ) )

    # run main
    cwd = os.getcwd()
    main( cwd, args.stream, args.sample, args.output, repeat, script_dir, args.pointgroup, args.cell_file, args.spacegroup, args.residues )
|
||||||
Reference in New Issue
Block a user