runs partialator and makes mtz after taking boot stapped sample
This commit is contained in:
362
reduction_tools/stream_random_mtz.py
Normal file
362
reduction_tools/stream_random_mtz.py
Normal file
@@ -0,0 +1,362 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# author J.Beale
|
||||
|
||||
"""
|
||||
# aim
|
||||
randomly select a series of crystals from a stream file and
|
||||
then compile them into the correctly formated .stream
|
||||
run partialator and make an mtz
|
||||
|
||||
# usage
|
||||
python stream_random.py -s <path to stream>
|
||||
-o output file names
|
||||
-n sample size
|
||||
-r how many repeat random samples do you want?
|
||||
-p pointgroup
|
||||
-c <path-to-cell-file>
|
||||
-a max-adu. Default = 12000
|
||||
-g spacegroup
|
||||
-r number of residues
|
||||
|
||||
# output
|
||||
.stream file with random sample of xtals
|
||||
"""
|
||||
|
||||
# modules
|
||||
import re
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os, errno
|
||||
import subprocess
|
||||
|
||||
def extract_chunks( input_file ):
|
||||
|
||||
# setup
|
||||
chunk_df = pd.DataFrame()
|
||||
image_no = []
|
||||
chunks = []
|
||||
hits = []
|
||||
collect_lines = False
|
||||
# Open the input file for reading
|
||||
with open(input_file, 'r') as f:
|
||||
for line in f:
|
||||
|
||||
# Check for the start condition
|
||||
if line.startswith('----- Begin chunk -----'):
|
||||
hit = False
|
||||
collect_lines = True
|
||||
chunk_lines = []
|
||||
if collect_lines:
|
||||
chunk_lines.append(line)
|
||||
|
||||
# find image_no
|
||||
if line.startswith( "Event:" ):
|
||||
image_search = re.findall( r"Event: //(\d+)", line )
|
||||
image = int(image_search[0])
|
||||
image_no.append( image )
|
||||
|
||||
# is there a hit in chunk
|
||||
if line.startswith( "Cell parameters" ):
|
||||
hit = True
|
||||
|
||||
if line.startswith('----- End chunk -----'):
|
||||
collect_lines = False # Stop collecting lines
|
||||
chunks.append( chunk_lines )
|
||||
hits.append( hit )
|
||||
|
||||
chunk_df[ "chunks" ] = chunks
|
||||
chunk_df[ "image_no" ] = image_no
|
||||
chunk_df[ "hit" ] = hits
|
||||
|
||||
return chunk_df
|
||||
|
||||
def extract_xtals( chunk ):
|
||||
|
||||
# setup
|
||||
xtals = []
|
||||
collect_crystal_lines = False
|
||||
# Open the input file for reading
|
||||
for line in chunk:
|
||||
|
||||
# Check for the xtals start condition
|
||||
if line.startswith('--- Begin crystal'):
|
||||
collect_crystal_lines = True
|
||||
xtal_lines = []
|
||||
if collect_crystal_lines:
|
||||
xtal_lines.append(line)
|
||||
if line.startswith('--- End crystal\n'):
|
||||
collect_crystal_lines = False # Stop collecting lines
|
||||
xtals.append( xtal_lines )
|
||||
|
||||
return xtals
|
||||
|
||||
def extract_header( chunk ):
|
||||
|
||||
# setup
|
||||
header = []
|
||||
collect_header_lines = False
|
||||
# Open the input file for reading
|
||||
for line in chunk:
|
||||
|
||||
# Check for the xtals start condition
|
||||
if line.startswith('----- Begin chunk -----'):
|
||||
collect_header_lines = True
|
||||
header_lines = []
|
||||
if collect_header_lines:
|
||||
header_lines.append(line)
|
||||
if line.startswith('End of peak list'):
|
||||
collect_header_lines = False # Stop collecting lines
|
||||
header.append( header_lines )
|
||||
|
||||
return header
|
||||
|
||||
def get_header( header, input_file ):
|
||||
|
||||
if header == "geom":
|
||||
start_keyword = "----- Begin geometry file -----"
|
||||
end_keyword = "----- End geometry file -----"
|
||||
if header == "cell":
|
||||
start_keyword = "----- Begin unit cell -----"
|
||||
end_keyword = "----- End unit cell -----"
|
||||
|
||||
# setup
|
||||
collect_lines = False
|
||||
headers = []
|
||||
|
||||
# Open the input file for reading
|
||||
with open(input_file, 'r') as f:
|
||||
for line in f:
|
||||
# Check for the start condition
|
||||
if line.strip() == start_keyword:
|
||||
collect_lines = True
|
||||
headers_lines = []
|
||||
# Collect lines between start and end conditions
|
||||
if collect_lines:
|
||||
headers_lines.append(line)
|
||||
# Check for the end condition
|
||||
if line.strip() == end_keyword:
|
||||
collect_lines = False # Stop collecting lines
|
||||
headers.append(headers_lines)
|
||||
|
||||
return headers[0]
|
||||
|
||||
def write_to_file( geom, cell, chunk_header, crystals, output_file ):
|
||||
|
||||
# Write sections with matching cell parameters to the output file
|
||||
with open(output_file, 'w') as out_file:
|
||||
out_file.write('CrystFEL stream format 2.3\n')
|
||||
out_file.write('Generated by CrystFEL 0.10.2\n')
|
||||
out_file.writelines(geom)
|
||||
out_file.writelines(cell)
|
||||
for crystal, header in zip( crystals, chunk_header ):
|
||||
out_file.writelines( header )
|
||||
out_file.writelines( crystal )
|
||||
out_file.writelines( "----- End chunk -----\n" )
|
||||
|
||||
def run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues ):
|
||||
|
||||
# partialator file name
|
||||
part_run_file = "merge_{0}.sh".format( part_name )
|
||||
|
||||
# write file
|
||||
part_sh = open( part_run_file, "w" )
|
||||
part_sh.write( "#!/bin/sh\n\n" )
|
||||
part_sh.write( "source /sf/cristallina/applications/mx/conda/miniconda/bin/activate\n" )
|
||||
part_sh.write( "conda activate crmx38-analysis\n\n" )
|
||||
part_sh.write( "python {0} -n merge_{1}".format( part_mtz_py, part_name ) )
|
||||
part_sh.write( " -s {0}".format( stream_file ) )
|
||||
part_sh.write( " -p {0}".format( pointgroup ) )
|
||||
part_sh.write( " -c {0}".format( cell_file ) )
|
||||
part_sh.write( " -r 1.3" )
|
||||
part_sh.write( " -g {0}".format( spacegroup ) )
|
||||
part_sh.write( " -R {0}".format( residues ) )
|
||||
part_sh.close()
|
||||
|
||||
# make file executable
|
||||
subprocess.call( [ "chmod", "+x", "{0}".format( part_run_file ) ] )
|
||||
|
||||
# return partialator file name
|
||||
return part_run_file
|
||||
|
||||
def strip_stream( input_stream, samples, output, repeat ):
|
||||
|
||||
# get geom and cell file headers
|
||||
print( "getting header info from .stream file" )
|
||||
geom = get_header( "geom", input_stream )
|
||||
cell = get_header( "cell", input_stream )
|
||||
print( "done" )
|
||||
|
||||
# extract chunks
|
||||
print( "finding chucks" )
|
||||
chunk_df = extract_chunks( input_stream )
|
||||
# display no. of chunks
|
||||
print( "found {0} chunks".format( len(chunk_df) ) )
|
||||
# remove rows without xtals
|
||||
chunk_df = chunk_df.loc[chunk_df.hit, :]
|
||||
print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
|
||||
print( "done" )
|
||||
|
||||
# extract xtals
|
||||
print( "get xtals from chunks" )
|
||||
xtal_df = pd.DataFrame()
|
||||
counter = 0
|
||||
for index, row in chunk_df.iterrows():
|
||||
|
||||
chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ]
|
||||
|
||||
if hit:
|
||||
|
||||
# find xtals and header
|
||||
header = extract_header( chunk )
|
||||
xtals = extract_xtals( chunk )
|
||||
|
||||
# make header same length as xtals
|
||||
header = header*len(xtals)
|
||||
|
||||
# concat results
|
||||
xtal_df_1 = pd.DataFrame()
|
||||
xtal_df_1[ "header" ] = header
|
||||
xtal_df_1[ "xtals" ] = xtals
|
||||
xtal_df_1[ "image_no" ] = image_no
|
||||
xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) )
|
||||
|
||||
# add count and print every 1000s
|
||||
counter = counter + len(xtals)
|
||||
if counter % 1000 == 0:
|
||||
print( counter, end='\r' )
|
||||
print( "done" )
|
||||
|
||||
# sort by image no and reindex
|
||||
xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
|
||||
xtal_df = xtal_df.reset_index( drop=True )
|
||||
|
||||
stream_df = pd.DataFrame()
|
||||
|
||||
# randomly n number of sample of xtals with replacement
|
||||
for sample in samples:
|
||||
print( "taking {0} {1} sample".format( repeat, sample ) )
|
||||
for x in range( 0, repeat ):
|
||||
|
||||
try:
|
||||
sample_df = xtal_df.sample( sample, replace=True )
|
||||
except ValueError:
|
||||
print( "input image sample larger than number of hits. Sample should be less than {0}".format( len(xtal_df) ) )
|
||||
|
||||
# rebuild .stream from sample
|
||||
print( "writing {0} to output file".format( x ) )
|
||||
crystals = sample_df.xtals.to_list()
|
||||
chunk_header = sample_df.header.to_list()
|
||||
output_file = "{0}_{1}_{2}.stream".format( output, sample ,x )
|
||||
write_to_file( geom, cell, chunk_header, crystals, output_file )
|
||||
|
||||
# append stream file to stream list
|
||||
data = [ { "stream_file" : output_file,
|
||||
"sample" : sample,
|
||||
"number" : x
|
||||
} ]
|
||||
stream_df_1 = pd.DataFrame( data )
|
||||
stream_df = pd.concat( ( stream_df, stream_df_1 ) )
|
||||
|
||||
print( "done {0}".format( x ) )
|
||||
print( "done" )
|
||||
|
||||
return stream_df
|
||||
|
||||
def make_process_dir( dir ):
|
||||
# make process directory
|
||||
try:
|
||||
os.makedirs( dir )
|
||||
except OSError as e:
|
||||
if e.errno != errno.EEXIST:
|
||||
raise
|
||||
|
||||
def main( cwd, input_stream, samples, output, repeat, script_dir, pointgroup, cell_file, spacegroup, residues ):
|
||||
|
||||
# make sample stream files
|
||||
stream_df = strip_stream( input_stream, samples, output, repeat )
|
||||
|
||||
# run partialator and make mtz
|
||||
for index, row in stream_df.iterrows():
|
||||
|
||||
stream_file, sample, number = row[ "stream_file" ], row[ "sample" ], row[ "number" ]
|
||||
print( "running partialatory for sample={0}, run={1}".format( sample, number ) )
|
||||
|
||||
# run partialator and make mtz
|
||||
part_mtz_py = "{0}/partialator_mtz.py".format( script_dir )
|
||||
part_name = "{0}_{1}".format( sample, number )
|
||||
part_run_file = run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues)
|
||||
|
||||
subprocess.run( "./{0}".format( part_run_file ) )
|
||||
os.chdir( cwd )
|
||||
|
||||
def list_of_floats(arg):
|
||||
return list(map(int, arg.split(',')))
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--stream",
|
||||
help="input stream file",
|
||||
required=True,
|
||||
type=os.path.abspath
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output",
|
||||
help="output stream file with sampled xtals",
|
||||
type=str,
|
||||
default="sample"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--sample",
|
||||
help="size of sample to take from input.stream",
|
||||
type=list_of_floats,
|
||||
required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--repeat",
|
||||
help="how many samples would you like?",
|
||||
type=int
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
"--pointgroup",
|
||||
help="pointgroup used by CrystFEL for partialator run",
|
||||
type=str,
|
||||
required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--cell_file",
|
||||
help="path to CrystFEL cell file for partialator.",
|
||||
type=os.path.abspath,
|
||||
required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-g",
|
||||
"--spacegroup",
|
||||
help="spacegroup for making mtz, e.g P43212",
|
||||
type=str,
|
||||
required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-R",
|
||||
"--residues",
|
||||
help="number of residues for truncate, e.g., hewl = 129",
|
||||
type=int,
|
||||
required=True
|
||||
)
|
||||
args = parser.parse_args()
|
||||
# does if need to be run multiple times?
|
||||
if args.repeat is None:
|
||||
repeat = 1
|
||||
else:
|
||||
repeat = args.repeat
|
||||
# run main
|
||||
cwd = os.getcwd()
|
||||
main( cwd, args.stream, args.sample, args.output, repeat, args.pointgroup, args.cell_file, args.spacegroup, args.residues )
|
||||
Reference in New Issue
Block a user