selection missing is current version

This commit is contained in:
Beale John Henry
2025-05-07 22:23:14 +02:00
parent 5ed62f2d98
commit cb25bbcb44

View File

@@ -24,21 +24,6 @@ import pandas as pd
import numpy as np import numpy as np
import os import os
def scrub_cells( stream ):
    """
    Collect every unit-cell record found in the text of a stream file.

    A record looks like:
    Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg

    Parameters
    ----------
    stream : str
        text to search

    Returns
    -------
    cell_lst : list of tuple of str
        one 6-tuple ( a, b, c, alpha, beta, gamma ) per record, values kept
        as the strings matched in the text
    xtals : int
        number of records found ( 0 when there are none )
    """
    # the previous "if AttributeError:" test was always true and the
    # "except AttributeError" handler could never fire - re.findall returns
    # an empty list when nothing matches - so both have been removed
    pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
    cell_lst = re.findall( pattern, stream )
    xtals = len( cell_lst )
    return cell_lst, xtals
def extract_chunks( input_file ): def extract_chunks( input_file ):
# setup # setup
@@ -78,10 +63,6 @@ def extract_chunks( input_file ):
chunk_df[ "image_no" ] = image_no chunk_df[ "image_no" ] = image_no
chunk_df[ "hit" ] = hits chunk_df[ "hit" ] = hits
# sort values and set image_no as index
chunk_df = chunk_df.sort_values( "image_no" )
chunk_df = chunk_df.set_index( "image_no" )
return chunk_df return chunk_df
def extract_xtals( chunk ): def extract_xtals( chunk ):
@@ -124,19 +105,69 @@ def extract_header( chunk ):
return header return header
def get_header( header, input_file ):
    """
    Return the lines of the first complete header section of a .stream file.

    Parameters
    ----------
    header : str
        "geom" for the geometry file section, "cell" for the unit cell section
    input_file : str
        path of the .stream file to read

    Returns
    -------
    list of str
        the section's lines, including the begin and end keyword lines, each
        with its original line ending

    Raises
    ------
    ValueError
        if header is neither "geom" nor "cell"
    IndexError
        if the file holds no complete section of the requested kind ( same
        exception type the previous headers[0] lookup raised )
    """
    # begin / end keywords delimiting each section
    keywords = {
        "geom": ( "----- Begin geometry file -----", "----- End geometry file -----" ),
        "cell": ( "----- Begin unit cell -----", "----- End unit cell -----" ),
    }
    if header not in keywords:
        raise ValueError( "unknown header '{0}', expected 'geom' or 'cell'".format( header ) )
    start_keyword, end_keyword = keywords[ header ]

    # setup
    collect_lines = False
    headers_lines = []

    # Open the input file for reading
    with open( input_file, 'r' ) as f:
        for line in f:
            stripped = line.strip()

            # Check for the start condition - a repeated start restarts the section
            if stripped == start_keyword:
                collect_lines = True
                headers_lines = []

            # Collect lines between start and end conditions
            if collect_lines:
                headers_lines.append( line )

                # only the first section is wanted, so stop reading at its end
                if stripped == end_keyword:
                    return headers_lines

    raise IndexError( "no complete '{0}' section found in {1}".format( header, input_file ) )
def write_to_file( geom, cell, chunk_header, crystals, output_file ):
    """
    Write a .stream file from header sections and per-chunk data.

    The file starts with two fixed CrystFEL banner lines, followed by the
    geometry section, the unit cell section and then one chunk per
    ( header, crystal ) pair, each closed by an "End chunk" line.

    Parameters
    ----------
    geom : iterable of str
        lines of the geometry file section
    cell : iterable of str
        lines of the unit cell section
    chunk_header : list
        per-chunk header text, paired by position with crystals
    crystals : list
        per-chunk crystal text, paired by position with chunk_header
    output_file : str
        path of the file to write - an existing file is overwritten
    """
    with open( output_file, 'w' ) as out_file:
        out_file.write( 'CrystFEL stream format 2.3\n' )
        out_file.write( 'Generated by CrystFEL 0.10.2\n' )
        out_file.writelines( geom )
        out_file.writelines( cell )
        # zip stops at the shorter list, so unpaired entries are not written
        for crystal, header in zip( crystals, chunk_header ):
            out_file.writelines( header )
            out_file.writelines( crystal )
            # a single string is written with write(), not writelines(),
            # which would emit it one character at a time
            out_file.write( "----- End chunk -----\n" )
def main( input_file, samples, output, repeat ):
# get geom and cell file headers
print( "getting header info from .stream file" )
geom = get_header( "geom", input_file )
cell = get_header( "cell", input_file )
print( "done" )
# extract chunks # extract chunks
print( "finding chucks" ) print( "finding chucks" )
chunk_df = extract_chunks( input_file ) chunk_df = extract_chunks( input_file )
# display no. of chunks # display no. of chunks
print( "found {0} chunks".format( len(chunk_df) ) ) print( "found {0} chunks".format( len(chunk_df) ) )
print( "found {0} crystals".format( chunk_df.hits.sum() ) ) # remove rows without xtals
chunk_df = chunk_df.loc[chunk_df.hit, :]
print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
print( "done" ) print( "done" )
# extract xtals # extract xtals
print( "geting xtal data from from chunks" ) print( "get xtals from chunks" )
xtal_df = pd.DataFrame() xtal_df = pd.DataFrame()
counter = 0 counter = 0
for index, row in chunk_df.iterrows(): for index, row in chunk_df.iterrows():
@@ -165,6 +196,28 @@ def main( input_file ):
print( counter, end='\r' ) print( counter, end='\r' )
print( "done" ) print( "done" )
# sort by image no and reindex
xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
xtal_df = xtal_df.reset_index( drop=True )
# draw `repeat` random samples of xtals for each requested sample size
for sample in samples:
print( "taking {0} {1} sample".format( repeat, sample ) )
for x in range( 0, repeat ):
try:
sample_df = xtal_df.sample( sample )
except ValueError:
print( "input image sample larger than number of hits. Sample should be less than {0}".format( len(xtal_df) ) )
# rebuild .stream from sample
print( "writing {0} to output file".format( x ) )
crystals = sample_df.xtals.to_list()
chunk_header = sample_df.header.to_list()
output_file = "{0}_{1}_{2}.stream".format( output, sample ,x )
write_to_file( geom, cell, chunk_header, crystals, output_file )
print( "done {0}".format( x ) )
print( "done" )
def list_of_floats(arg):
    """
    Parse a comma-separated string into a list of integers.

    NOTE: despite the name, the values are converted with int(), not float().

    Empty entries, such as the one left by a trailing comma in "100,200,",
    are ignored instead of failing on int('').

    Parameters
    ----------
    arg : str
        comma-separated integers, e.g. "1000,2000,5000"

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        if an entry is not an integer or if no entries are given
    """
    sizes = [int(token) for token in arg.split(',') if token.strip()]
    if not sizes:
        raise ValueError("expected at least one integer, got {0!r}".format(arg))
    return sizes
@@ -178,6 +231,31 @@ if __name__ == "__main__":
required=True, required=True,
type=os.path.abspath type=os.path.abspath
) )
parser.add_argument(
"-o",
"--output",
help="output stream file with sampled xtals",
type=str,
default="sample"
)
parser.add_argument(
"-n",
"--sample",
help="size of sample to take from input.stream",
type=list_of_floats,
required=True
)
parser.add_argument(
"-r",
"--repeat",
help="how many samples would you like?",
type=int
)
args = parser.parse_args() args = parser.parse_args()
# does it need to be run multiple times?
if args.repeat is None:
repeat = 1
else:
repeat = args.repeat
# run main # run main
main( args.stream ) main( args.stream, args.sample, args.output, repeat)