updated way to open large .stream files without crashing

Beale John Henry
2025-01-26 21:44:34 +01:00
parent 3fbfa5c31f
commit d0607a5cdc


@@ -34,85 +34,77 @@ import regex as re
 import numpy as np
 from loguru import logger
 
-def count_chunks( stream ):
-    # get number of chunks
-    # example - ----- Begin chunk -----
-    # count them
-    try:
-        pattern = r"-----\sBegin\schunk\s-----"
-        chunks = re.findall( pattern, stream )
-        if AttributeError:
-            return len( chunks )
-    except AttributeError:
-        logger.debug( "count_chunks error" )
-        return np.nan
-
-def scrub_cells( stream ):
+def scrub_cells( line ):
     # get uc values from stream file
     # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
-    # scrub clen and return - else nan
-    try:
-        pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
-        cell_lst = re.findall( pattern, stream )
-        xtals = len( cell_lst )
-        if AttributeError:
-            return cell_lst, xtals
-    except AttributeError:
-        logger.debug( "scrub_cells error" )
-        return np.nan
+    pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
+    a = re.search( pattern, line ).group(1)
+    b = re.search( pattern, line ).group(2)
+    c = re.search( pattern, line ).group(3)
+    alpha = re.search( pattern, line ).group(4)
+    beta = re.search( pattern, line ).group(5)
+    gamma = re.search( pattern, line ).group(6)
+
+    return [ a, b, c, alpha, beta, gamma ]
 
 def scrub_res( stream ):
     # get diffraction limit
     # example - diffraction_resolution_limit = 4.07 nm^-1 or 2.46 A
-    # scrub res_lst or return np.nan
-    try:
-        pattern = r"diffraction_resolution_limit\s=\s\d+\.\d+\snm\^-1\sor\s(\d+\.\d+)\sA"
-        res_lst = re.findall( pattern, stream )
-        if AttributeError:
-            return res_lst
-    except AttributeError:
-        logger.debug( "scrub_res error" )
-        return np.nan
+    pattern = r"diffraction_resolution_limit\s=\s\d\.\d+\snm\^-1\sor\s(\d+\.\d+)\sA"
+    res = re.search( pattern, stream ).group(1)
+    return res
 
 def scrub_obs( stream ):
     # get number of reflections
     # example - num_reflections = 308
-    # scrub reflections or return np.nan
-    try:
-        pattern = r"num_reflections\s=\s(\d+)"
-        obs_lst = re.findall( pattern, stream )
-        if AttributeError:
-            return obs_lst
-    except AttributeError:
-        logger.debug( "scrub_obs error" )
-        return np.nan
+    pattern = r"num_reflections\s=\s(\d+)"
+    obs = re.search( pattern, stream ).group(1)
+    return obs
 
 def calculate_stats( stream_pwd ):
+    chunks = 0
+    xtals = 0
+    cells = []
+    obs_list = []
+    res_list = []
+    print( "scrubing data" )
+
     # open stream file
-    stream = open( stream_pwd, "r" ).read()
-
-    # get total number chunks
-    chunks = count_chunks( stream )
-
-    # get list of cells
-    cell_lst, xtals = scrub_cells( stream )
-
-    # get list of cells
-    res_lst = scrub_res( stream )
-
-    # get list of cells
-    obs_lst = scrub_obs( stream )
+    with open( stream_pwd ) as stream:
+        for line in stream:
+            # count chunks
+            if line.startswith( "----- Begin chunk -----" ):
+                chunks = chunks + 1
+            # get cell
+            if line.startswith( "Cell parameters" ):
+                cell = scrub_cells( line )
+                cells.append( cell )
+                xtals = xtals + 1
+            # get res
+            if line.startswith( "diffraction_resolution_limit" ):
+                res = scrub_res( line )
+                res_list.append( res )
+            # get obs
+            if line.startswith( "num_reflections" ):
+                obs = scrub_obs( line )
+                obs_list.append( obs )
+            if chunks % 1000 == 0:
+                print( "scrubbed {0} chunks".format( chunks ), end='\r' )
 
     # res_df
     cols = [ "a", "b", "c", "alpha", "beta", "gamma" ]
-    df = pd.DataFrame( cell_lst, columns=cols )
-    df[ "resolution" ] = res_lst
-    df[ "obs" ] = obs_lst
+    df = pd.DataFrame( cells, columns=cols )
+    df[ "resolution" ] = res_list
+    df[ "obs" ] = obs_list
 
     # convert all to floats
     df = df.astype(float)
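
The fix the commit message describes is all in calculate_stats(): the old open( stream_pwd, "r" ).read() pulled the entire .stream file into memory before running re.findall() over it, while the new with open(...)/for line in stream iterates lazily, so only one line is held at a time no matter how large the file is. A minimal, self-contained sketch of that pattern, with invented sample data and the stdlib re standing in for the regex package:

import io
import re

# Stand-in for a large .stream file on disk; a real file handle
# iterates the same way, one line at a time.
sample = io.StringIO(
    "----- Begin chunk -----\n"
    "Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg\n"
    "diffraction_resolution_limit = 4.07 nm^-1 or 2.46 A\n"
    "num_reflections = 308\n"
    "----- End chunk -----\n"
)

cell_pat = re.compile(
    r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)"
    r"\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
)

chunks, cells = 0, []
for line in sample:  # lazy iteration: only the current line is in memory
    if line.startswith("----- Begin chunk -----"):
        chunks += 1
    elif line.startswith("Cell parameters"):
        # one search with six capture groups instead of six re.search calls
        cells.append([float(v) for v in cell_pat.search(line).groups()])

print(chunks, cells[0])
# -> 1 [7.71784, 7.7887, 3.7525, 90.19135, 90.77553, 90.19243]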
@@ -159,13 +151,14 @@ def write_crystfel_run( proc_dir, name, chunk, chunk_lst_file, geom_file, cell_f
     run_sh.write( " --output={0} \\\n".format( stream_file ) )
     run_sh.write( " --geometry={0} \\\n".format( geom_file ) )
     run_sh.write( " --pdb={0} \\\n".format( cell_file ) )
+    run_sh.write( " --push-res=0.5 \\\n" )
     run_sh.write( " --indexing=xgandalf-latt-cell \\\n" )
     run_sh.write( " --peaks=peakfinder8 \\\n" )
     run_sh.write( " --integration=rings-nocen-nograd \\\n" )
     run_sh.write( " --tolerance=10.0,10.0,10.0,2,3,2 \\\n" )
     run_sh.write( " --threshold={0} \\\n".format( threshold ) )
     run_sh.write( " --min-snr=5 \\\n" )
-    run_sh.write( " --int-radius=5,7,9 \\\n" )
+    run_sh.write( " --int-radius=4,6,7 \\\n" )
     run_sh.write( " -j 32 \\\n" )
     run_sh.write( " --multi \\\n" )
     run_sh.write( " --check-peaks \\\n" )