selection missing is current version
This commit is contained in:
@@ -24,21 +24,6 @@ import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
def scrub_cells( stream ):
    """Collect every unit-cell line found in the text of a stream file.

    A matching line looks like:
    Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg

    Parameters
    ----------
    stream : str
        Text to search.

    Returns
    -------
    tuple or float
        ``(cell_lst, xtals)`` where ``cell_lst`` is a list of 6-tuples of
        strings (the three values before "nm" followed by the three values
        before "deg", exactly as matched) and ``xtals`` is the number of
        matches. Text without any match gives ``([], 0)``.
        ``np.nan`` is returned only if the search raises AttributeError.
    """
    pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"

    # only the search itself is guarded; the former "if AttributeError:" test
    # was always true (a class object is truthy) and has been dropped
    try:
        cell_lst = re.findall( pattern, stream )
    except AttributeError:
        logger.debug( "scrub_cells error" )
        return np.nan

    return cell_lst, len( cell_lst )
|
||||
|
||||
def extract_chunks( input_file ):
|
||||
|
||||
# setup
|
||||
@@ -78,10 +63,6 @@ def extract_chunks( input_file ):
|
||||
chunk_df[ "image_no" ] = image_no
|
||||
chunk_df[ "hit" ] = hits
|
||||
|
||||
# sort values and set image_no as index
|
||||
chunk_df = chunk_df.sort_values( "image_no" )
|
||||
chunk_df = chunk_df.set_index( "image_no" )
|
||||
|
||||
return chunk_df
|
||||
|
||||
def extract_xtals( chunk ):
|
||||
@@ -124,19 +105,69 @@ def extract_header( chunk ):
|
||||
|
||||
return header
|
||||
|
||||
def get_header( header, input_file ):
    """Return the first geometry or unit-cell section of a stream file.

    Parameters
    ----------
    header : str
        "geom" for the section between "----- Begin geometry file -----" and
        "----- End geometry file -----", or "cell" for the section between
        "----- Begin unit cell -----" and "----- End unit cell -----".
    input_file : str
        Path of the file to read.

    Returns
    -------
    list of str
        The lines of the first complete section, including the begin and end
        marker lines, each with its original line ending.

    Raises
    ------
    ValueError
        If ``header`` is neither "geom" nor "cell".
    IndexError
        If no complete section is found in the file.
    """
    keywords = {
        "geom": ( "----- Begin geometry file -----", "----- End geometry file -----" ),
        "cell": ( "----- Begin unit cell -----", "----- End unit cell -----" ),
    }
    if header not in keywords:
        raise ValueError( "header must be 'geom' or 'cell', got {0!r}".format( header ) )
    start_keyword, end_keyword = keywords[ header ]

    # setup
    collect_lines = False
    headers_lines = []

    # Open the input file for reading
    with open( input_file, 'r' ) as f:
        for line in f:
            stripped = line.strip()

            # Check for the start condition - a repeated start marker restarts the section
            if stripped == start_keyword:
                collect_lines = True
                headers_lines = []

            # Collect lines between start and end conditions
            if collect_lines:
                headers_lines.append( line )

                # Only the first section is wanted, so stop reading as soon as
                # it is complete instead of scanning the rest of the file
                if stripped == end_keyword:
                    return headers_lines

    # IndexError is kept because that is what indexing an empty result list
    # raised before; the message now says what was missing
    raise IndexError( "no complete '{0}' section found in {1}".format( header, input_file ) )
|
||||
|
||||
def write_to_file( geom, cell, chunk_header, crystals, output_file ):
    """Write a stream file from header sections and per-crystal line lists.

    The output starts with two fixed lines ("CrystFEL stream format 2.3" and
    "Generated by CrystFEL 0.10.2"), followed by ``geom`` and ``cell``, then
    one header/crystal pair after another.

    Parameters
    ----------
    geom, cell : iterable of str
        Lines written verbatim after the two fixed lines.
    chunk_header : iterable
        One iterable of lines per crystal, written before that crystal.
    crystals : iterable
        One iterable of lines per crystal.
    output_file : str
        Path of the file to create; an existing file is overwritten.

    Notes
    -----
    ``crystals`` and ``chunk_header`` are paired with ``zip``, so surplus
    entries in the longer one are not written.
    """
    # Write sections with matching cell parameters to the output file
    with open( output_file, 'w' ) as out_file:
        out_file.write( 'CrystFEL stream format 2.3\n' )
        out_file.write( 'Generated by CrystFEL 0.10.2\n' )
        out_file.writelines( geom )
        out_file.writelines( cell )
        for crystal, header in zip( crystals, chunk_header ):
            out_file.writelines( header )
            out_file.writelines( crystal )
            # NOTE(review): indentation is lost in the reviewed view; the end
            # marker is assumed to close every header/crystal pair - confirm.
            # write() replaces writelines(), which iterated the string
            # one character at a time
            out_file.write( "----- End chunk -----\n" )
|
||||
|
||||
def main( input_file, samples, output, repeat ):
|
||||
|
||||
# get geom and cell file headers
|
||||
print( "getting header info from .stream file" )
|
||||
geom = get_header( "geom", input_file )
|
||||
cell = get_header( "cell", input_file )
|
||||
print( "done" )
|
||||
|
||||
# extract chunks
|
||||
print( "finding chucks" )
|
||||
chunk_df = extract_chunks( input_file )
|
||||
# display no. of chunks
|
||||
print( "found {0} chunks".format( len(chunk_df) ) )
|
||||
print( "found {0} crystals".format( chunk_df.hits.sum() ) )
|
||||
# remove rows without xtals
|
||||
chunk_df = chunk_df.loc[chunk_df.hit, :]
|
||||
print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
|
||||
print( "done" )
|
||||
|
||||
# extract xtals
|
||||
print( "geting xtal data from from chunks" )
|
||||
print( "get xtals from chunks" )
|
||||
xtal_df = pd.DataFrame()
|
||||
counter = 0
|
||||
for index, row in chunk_df.iterrows():
|
||||
@@ -165,6 +196,28 @@ def main( input_file ):
|
||||
print( counter, end='\r' )
|
||||
print( "done" )
|
||||
|
||||
# sort by image no and reindex
|
||||
xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
|
||||
xtal_df = xtal_df.reset_index( drop=True )
|
||||
|
||||
# take random samples of n xtals
|
||||
for sample in samples:
|
||||
print( "taking {0} {1} sample".format( repeat, sample ) )
|
||||
for x in range( 0, repeat ):
|
||||
|
||||
try:
|
||||
sample_df = xtal_df.sample( sample )
|
||||
except ValueError:
|
||||
print( "input image sample larger than number of hits. Sample should be less than {0}".format( len(xtal_df) ) )
|
||||
|
||||
# rebuild .stream from sample
|
||||
print( "writing {0} to output file".format( x ) )
|
||||
crystals = sample_df.xtals.to_list()
|
||||
chunk_header = sample_df.header.to_list()
|
||||
output_file = "{0}_{1}_{2}.stream".format( output, sample ,x )
|
||||
write_to_file( geom, cell, chunk_header, crystals, output_file )
|
||||
print( "done {0}".format( x ) )
|
||||
print( "done" )
|
||||
|
||||
def list_of_floats(arg):
    """Parse a comma-separated string of whole numbers into a list of ints.

    Despite the name, every value is converted with ``int``. Empty items,
    such as the one left by a trailing comma in "10,20,", are skipped.

    Parameters
    ----------
    arg : str
        Comma-separated values, e.g. "100,500,1000".

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If an item is not an integer or if no values are given.
    """
    values = [int(item) for item in arg.split(',') if item.strip()]
    if not values:
        raise ValueError("expected at least one comma-separated integer")
    return values
|
||||
@@ -178,6 +231,31 @@ if __name__ == "__main__":
|
||||
required=True,
|
||||
type=os.path.abspath
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output",
|
||||
help="output stream file with sampled xtals",
|
||||
type=str,
|
||||
default="sample"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--sample",
|
||||
help="size of sample to take from input.stream",
|
||||
type=list_of_floats,
|
||||
required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--repeat",
|
||||
help="how many samples would you like?",
|
||||
type=int
|
||||
)
|
||||
args = parser.parse_args()
|
||||
# does it need to be run multiple times?
|
||||
if args.repeat is None:
|
||||
repeat = 1
|
||||
else:
|
||||
repeat = args.repeat
|
||||
# run main
|
||||
main( args.stream )
|
||||
main( args.stream, args.sample, args.output, repeat)
|
||||
Reference in New Issue
Block a user