runs partialator and makes mtz after taking boot stapped sample

This commit is contained in:
2025-11-28 17:45:07 +01:00
parent d7fa7e25da
commit 4140cad8f2

View File

@@ -0,0 +1,362 @@
#!/usr/bin/env python3
# author J.Beale
"""
# aim
randomly select a series of crystals from a stream file and
then compile them into the correctly formated .stream
run partialator and make an mtz
# usage
python stream_random.py -s <path to stream>
-o output file names
-n sample size
-r how many repeat random samples do you want?
-p pointgroup
-c <path-to-cell-file>
-a max-adu. Default = 12000
-g spacegroup
-r number of residues
# output
.stream file with random sample of xtals
"""
# modules
import re
import argparse
import pandas as pd
import numpy as np
import os, errno
import subprocess
def extract_chunks( input_file ):
# setup
chunk_df = pd.DataFrame()
image_no = []
chunks = []
hits = []
collect_lines = False
# Open the input file for reading
with open(input_file, 'r') as f:
for line in f:
# Check for the start condition
if line.startswith('----- Begin chunk -----'):
hit = False
collect_lines = True
chunk_lines = []
if collect_lines:
chunk_lines.append(line)
# find image_no
if line.startswith( "Event:" ):
image_search = re.findall( r"Event: //(\d+)", line )
image = int(image_search[0])
image_no.append( image )
# is there a hit in chunk
if line.startswith( "Cell parameters" ):
hit = True
if line.startswith('----- End chunk -----'):
collect_lines = False # Stop collecting lines
chunks.append( chunk_lines )
hits.append( hit )
chunk_df[ "chunks" ] = chunks
chunk_df[ "image_no" ] = image_no
chunk_df[ "hit" ] = hits
return chunk_df
def extract_xtals( chunk ):
# setup
xtals = []
collect_crystal_lines = False
# Open the input file for reading
for line in chunk:
# Check for the xtals start condition
if line.startswith('--- Begin crystal'):
collect_crystal_lines = True
xtal_lines = []
if collect_crystal_lines:
xtal_lines.append(line)
if line.startswith('--- End crystal\n'):
collect_crystal_lines = False # Stop collecting lines
xtals.append( xtal_lines )
return xtals
def extract_header( chunk ):
# setup
header = []
collect_header_lines = False
# Open the input file for reading
for line in chunk:
# Check for the xtals start condition
if line.startswith('----- Begin chunk -----'):
collect_header_lines = True
header_lines = []
if collect_header_lines:
header_lines.append(line)
if line.startswith('End of peak list'):
collect_header_lines = False # Stop collecting lines
header.append( header_lines )
return header
def get_header( header, input_file ):
if header == "geom":
start_keyword = "----- Begin geometry file -----"
end_keyword = "----- End geometry file -----"
if header == "cell":
start_keyword = "----- Begin unit cell -----"
end_keyword = "----- End unit cell -----"
# setup
collect_lines = False
headers = []
# Open the input file for reading
with open(input_file, 'r') as f:
for line in f:
# Check for the start condition
if line.strip() == start_keyword:
collect_lines = True
headers_lines = []
# Collect lines between start and end conditions
if collect_lines:
headers_lines.append(line)
# Check for the end condition
if line.strip() == end_keyword:
collect_lines = False # Stop collecting lines
headers.append(headers_lines)
return headers[0]
def write_to_file( geom, cell, chunk_header, crystals, output_file ):
# Write sections with matching cell parameters to the output file
with open(output_file, 'w') as out_file:
out_file.write('CrystFEL stream format 2.3\n')
out_file.write('Generated by CrystFEL 0.10.2\n')
out_file.writelines(geom)
out_file.writelines(cell)
for crystal, header in zip( crystals, chunk_header ):
out_file.writelines( header )
out_file.writelines( crystal )
out_file.writelines( "----- End chunk -----\n" )
def run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues ):
# partialator file name
part_run_file = "merge_{0}.sh".format( part_name )
# write file
part_sh = open( part_run_file, "w" )
part_sh.write( "#!/bin/sh\n\n" )
part_sh.write( "source /sf/cristallina/applications/mx/conda/miniconda/bin/activate\n" )
part_sh.write( "conda activate crmx38-analysis\n\n" )
part_sh.write( "python {0} -n merge_{1}".format( part_mtz_py, part_name ) )
part_sh.write( " -s {0}".format( stream_file ) )
part_sh.write( " -p {0}".format( pointgroup ) )
part_sh.write( " -c {0}".format( cell_file ) )
part_sh.write( " -r 1.3" )
part_sh.write( " -g {0}".format( spacegroup ) )
part_sh.write( " -R {0}".format( residues ) )
part_sh.close()
# make file executable
subprocess.call( [ "chmod", "+x", "{0}".format( part_run_file ) ] )
# return partialator file name
return part_run_file
def strip_stream( input_stream, samples, output, repeat ):
# get geom and cell file headers
print( "getting header info from .stream file" )
geom = get_header( "geom", input_stream )
cell = get_header( "cell", input_stream )
print( "done" )
# extract chunks
print( "finding chucks" )
chunk_df = extract_chunks( input_stream )
# display no. of chunks
print( "found {0} chunks".format( len(chunk_df) ) )
# remove rows without xtals
chunk_df = chunk_df.loc[chunk_df.hit, :]
print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
print( "done" )
# extract xtals
print( "get xtals from chunks" )
xtal_df = pd.DataFrame()
counter = 0
for index, row in chunk_df.iterrows():
chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ]
if hit:
# find xtals and header
header = extract_header( chunk )
xtals = extract_xtals( chunk )
# make header same length as xtals
header = header*len(xtals)
# concat results
xtal_df_1 = pd.DataFrame()
xtal_df_1[ "header" ] = header
xtal_df_1[ "xtals" ] = xtals
xtal_df_1[ "image_no" ] = image_no
xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) )
# add count and print every 1000s
counter = counter + len(xtals)
if counter % 1000 == 0:
print( counter, end='\r' )
print( "done" )
# sort by image no and reindex
xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
xtal_df = xtal_df.reset_index( drop=True )
stream_df = pd.DataFrame()
# randomly n number of sample of xtals with replacement
for sample in samples:
print( "taking {0} {1} sample".format( repeat, sample ) )
for x in range( 0, repeat ):
try:
sample_df = xtal_df.sample( sample, replace=True )
except ValueError:
print( "input image sample larger than number of hits. Sample should be less than {0}".format( len(xtal_df) ) )
# rebuild .stream from sample
print( "writing {0} to output file".format( x ) )
crystals = sample_df.xtals.to_list()
chunk_header = sample_df.header.to_list()
output_file = "{0}_{1}_{2}.stream".format( output, sample ,x )
write_to_file( geom, cell, chunk_header, crystals, output_file )
# append stream file to stream list
data = [ { "stream_file" : output_file,
"sample" : sample,
"number" : x
} ]
stream_df_1 = pd.DataFrame( data )
stream_df = pd.concat( ( stream_df, stream_df_1 ) )
print( "done {0}".format( x ) )
print( "done" )
return stream_df
def make_process_dir( dir ):
# make process directory
try:
os.makedirs( dir )
except OSError as e:
if e.errno != errno.EEXIST:
raise
def main( cwd, input_stream, samples, output, repeat, script_dir, pointgroup, cell_file, spacegroup, residues ):
# make sample stream files
stream_df = strip_stream( input_stream, samples, output, repeat )
# run partialator and make mtz
for index, row in stream_df.iterrows():
stream_file, sample, number = row[ "stream_file" ], row[ "sample" ], row[ "number" ]
print( "running partialatory for sample={0}, run={1}".format( sample, number ) )
# run partialator and make mtz
part_mtz_py = "{0}/partialator_mtz.py".format( script_dir )
part_name = "{0}_{1}".format( sample, number )
part_run_file = run_partialator_mtz( part_mtz_py, stream_file, part_name, pointgroup, cell_file, spacegroup, residues)
subprocess.run( "./{0}".format( part_run_file ) )
os.chdir( cwd )
def list_of_floats(arg):
return list(map(int, arg.split(',')))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-s",
"--stream",
help="input stream file",
required=True,
type=os.path.abspath
)
parser.add_argument(
"-o",
"--output",
help="output stream file with sampled xtals",
type=str,
default="sample"
)
parser.add_argument(
"-n",
"--sample",
help="size of sample to take from input.stream",
type=list_of_floats,
required=True
)
parser.add_argument(
"-r",
"--repeat",
help="how many samples would you like?",
type=int
)
parser.add_argument(
"-p",
"--pointgroup",
help="pointgroup used by CrystFEL for partialator run",
type=str,
required=True
)
parser.add_argument(
"-c",
"--cell_file",
help="path to CrystFEL cell file for partialator.",
type=os.path.abspath,
required=True
)
parser.add_argument(
"-g",
"--spacegroup",
help="spacegroup for making mtz, e.g P43212",
type=str,
required=True
)
parser.add_argument(
"-R",
"--residues",
help="number of residues for truncate, e.g., hewl = 129",
type=int,
required=True
)
args = parser.parse_args()
# does if need to be run multiple times?
if args.repeat is None:
repeat = 1
else:
repeat = args.repeat
# run main
cwd = os.getcwd()
main( cwd, args.stream, args.sample, args.output, repeat, args.pointgroup, args.cell_file, args.spacegroup, args.residues )