selection missing is current version
This commit is contained in:
@@ -24,21 +24,6 @@ import pandas as pd
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import os
|
import os
|
||||||
|
|
||||||
def scrub_cells( stream ):
|
|
||||||
|
|
||||||
# get uc values from stream file
|
|
||||||
# example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
|
|
||||||
# scrub clen and return - else nan
|
|
||||||
try:
|
|
||||||
pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
|
|
||||||
cell_lst = re.findall( pattern, stream )
|
|
||||||
xtals = len( cell_lst )
|
|
||||||
if AttributeError:
|
|
||||||
return cell_lst, xtals
|
|
||||||
except AttributeError:
|
|
||||||
logger.debug( "scrub_cells error" )
|
|
||||||
return np.nan
|
|
||||||
|
|
||||||
def extract_chunks( input_file ):
|
def extract_chunks( input_file ):
|
||||||
|
|
||||||
# setup
|
# setup
|
||||||
@@ -78,10 +63,6 @@ def extract_chunks( input_file ):
|
|||||||
chunk_df[ "image_no" ] = image_no
|
chunk_df[ "image_no" ] = image_no
|
||||||
chunk_df[ "hit" ] = hits
|
chunk_df[ "hit" ] = hits
|
||||||
|
|
||||||
# sort values and set image_no as index
|
|
||||||
chunk_df = chunk_df.sort_values( "image_no" )
|
|
||||||
chunk_df = chunk_df.set_index( "image_no" )
|
|
||||||
|
|
||||||
return chunk_df
|
return chunk_df
|
||||||
|
|
||||||
def extract_xtals( chunk ):
|
def extract_xtals( chunk ):
|
||||||
@@ -124,19 +105,69 @@ def extract_header( chunk ):
|
|||||||
|
|
||||||
return header
|
return header
|
||||||
|
|
||||||
|
def get_header( header, input_file ):
|
||||||
|
|
||||||
def main( input_file ):
|
if header == "geom":
|
||||||
|
start_keyword = "----- Begin geometry file -----"
|
||||||
|
end_keyword = "----- End geometry file -----"
|
||||||
|
if header == "cell":
|
||||||
|
start_keyword = "----- Begin unit cell -----"
|
||||||
|
end_keyword = "----- End unit cell -----"
|
||||||
|
|
||||||
|
# setup
|
||||||
|
collect_lines = False
|
||||||
|
headers = []
|
||||||
|
|
||||||
|
# Open the input file for reading
|
||||||
|
with open(input_file, 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
# Check for the start condition
|
||||||
|
if line.strip() == start_keyword:
|
||||||
|
collect_lines = True
|
||||||
|
headers_lines = []
|
||||||
|
# Collect lines between start and end conditions
|
||||||
|
if collect_lines:
|
||||||
|
headers_lines.append(line)
|
||||||
|
# Check for the end condition
|
||||||
|
if line.strip() == end_keyword:
|
||||||
|
collect_lines = False # Stop collecting lines
|
||||||
|
headers.append(headers_lines)
|
||||||
|
|
||||||
|
return headers[0]
|
||||||
|
|
||||||
|
def write_to_file( geom, cell, chunk_header, crystals, output_file ):
|
||||||
|
|
||||||
|
# Write sections with matching cell parameters to the output file
|
||||||
|
with open(output_file, 'w') as out_file:
|
||||||
|
out_file.write('CrystFEL stream format 2.3\n')
|
||||||
|
out_file.write('Generated by CrystFEL 0.10.2\n')
|
||||||
|
out_file.writelines(geom)
|
||||||
|
out_file.writelines(cell)
|
||||||
|
for crystal, header in zip( crystals, chunk_header ):
|
||||||
|
out_file.writelines( header )
|
||||||
|
out_file.writelines( crystal )
|
||||||
|
out_file.writelines( "----- End chunk -----\n" )
|
||||||
|
|
||||||
|
def main( input_file, samples, output, repeat ):
|
||||||
|
|
||||||
|
# get geom and cell file headers
|
||||||
|
print( "getting header info from .stream file" )
|
||||||
|
geom = get_header( "geom", input_file )
|
||||||
|
cell = get_header( "cell", input_file )
|
||||||
|
print( "done" )
|
||||||
|
|
||||||
# extract chunks
|
# extract chunks
|
||||||
print( "finding chucks" )
|
print( "finding chucks" )
|
||||||
chunk_df = extract_chunks( input_file )
|
chunk_df = extract_chunks( input_file )
|
||||||
# display no. of chunks
|
# display no. of chunks
|
||||||
print( "found {0} chunks".format( len(chunk_df) ) )
|
print( "found {0} chunks".format( len(chunk_df) ) )
|
||||||
print( "found {0} crystals".format( chunk_df.hits.sum() ) )
|
# remove rows without xtals
|
||||||
|
chunk_df = chunk_df.loc[chunk_df.hit, :]
|
||||||
|
print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
|
||||||
print( "done" )
|
print( "done" )
|
||||||
|
|
||||||
# extract xtals
|
# extract xtals
|
||||||
print( "geting xtal data from from chunks" )
|
print( "get xtals from chunks" )
|
||||||
xtal_df = pd.DataFrame()
|
xtal_df = pd.DataFrame()
|
||||||
counter = 0
|
counter = 0
|
||||||
for index, row in chunk_df.iterrows():
|
for index, row in chunk_df.iterrows():
|
||||||
@@ -165,6 +196,28 @@ def main( input_file ):
|
|||||||
print( counter, end='\r' )
|
print( counter, end='\r' )
|
||||||
print( "done" )
|
print( "done" )
|
||||||
|
|
||||||
|
# sort by image no and reindex
|
||||||
|
xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
|
||||||
|
xtal_df = xtal_df.reset_index( drop=True )
|
||||||
|
|
||||||
|
# randomly n number of sample of xtals
|
||||||
|
for sample in samples:
|
||||||
|
print( "taking {0} {1} sample".format( repeat, sample ) )
|
||||||
|
for x in range( 0, repeat ):
|
||||||
|
|
||||||
|
try:
|
||||||
|
sample_df = xtal_df.sample( sample )
|
||||||
|
except ValueError:
|
||||||
|
print( "input image sample larger than number of hits. Sample should be less than {0}".format( len(xtal_df) ) )
|
||||||
|
|
||||||
|
# rebuild .stream from sample
|
||||||
|
print( "writing {0} to output file".format( x ) )
|
||||||
|
crystals = sample_df.xtals.to_list()
|
||||||
|
chunk_header = sample_df.header.to_list()
|
||||||
|
output_file = "{0}_{1}_{2}.stream".format( output, sample ,x )
|
||||||
|
write_to_file( geom, cell, chunk_header, crystals, output_file )
|
||||||
|
print( "done {0}".format( x ) )
|
||||||
|
print( "done" )
|
||||||
|
|
||||||
def list_of_floats(arg):
|
def list_of_floats(arg):
|
||||||
return list(map(int, arg.split(',')))
|
return list(map(int, arg.split(',')))
|
||||||
@@ -178,6 +231,31 @@ if __name__ == "__main__":
|
|||||||
required=True,
|
required=True,
|
||||||
type=os.path.abspath
|
type=os.path.abspath
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-o",
|
||||||
|
"--output",
|
||||||
|
help="output stream file with sampled xtals",
|
||||||
|
type=str,
|
||||||
|
default="sample"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-n",
|
||||||
|
"--sample",
|
||||||
|
help="size of sample to take from input.stream",
|
||||||
|
type=list_of_floats,
|
||||||
|
required=True
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-r",
|
||||||
|
"--repeat",
|
||||||
|
help="how many samples would you like?",
|
||||||
|
type=int
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
# does if need to be run multiple times?
|
||||||
|
if args.repeat is None:
|
||||||
|
repeat = 1
|
||||||
|
else:
|
||||||
|
repeat = args.repeat
|
||||||
# run main
|
# run main
|
||||||
main( args.stream )
|
main( args.stream, args.sample, args.output, repeat)
|
||||||
Reference in New Issue
Block a user