"""Collect unit-cell statistics from .stream files of a clen scan and plot them.

For every ``.stream`` file found below a top directory the script reads the
clen value from the file path, extracts every "Cell parameters" line from the
file, and plots the number of extracted cells together with the standard
deviation of the a, b and c axes against clen.
"""

# modules
import os

import numpy as np
import pandas as pd

try:
    import regex as re
except ImportError:
    # the patterns below only use syntax shared with the standard library,
    # so the built-in module is a drop-in replacement when regex is missing
    import re

# clen encoded in the path, e.g. ".../coarse_scan/0.115/0.115.stream"
_CLEN_PATTERN = re.compile( r"0\.\d+/(0\.\d+)\.stream" )

# e.g. "Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg"
# axis lengths accept any number of leading digits so axes >= 10 nm are kept
_CELL_PATTERN = re.compile(
    r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,"
    r"\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
)

_CELL_COLUMNS = [ "a", "b", "c", "alpha", "beta", "gamma" ]
_STREAM_COLUMNS = [ "stream_pwd", "clen" ]
_STATS_COLUMNS = [ "clen", "indexed", "std_a", "std_b", "std_c" ]


def scrub_clen( stream_pwd ):
    """Return the clen encoded in a stream file path as a float.

    Example path:
    /sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan/0.115/0.115.stream

    Returns np.nan when the path does not contain the "<clen>/<clen>.stream"
    pattern.
    """
    re_search = _CLEN_PATTERN.search( stream_pwd )
    if re_search is None:
        return np.nan
    return float( re_search.group( 1 ) )


def find_streams( top_dir ):
    """Return a DataFrame of all .stream files below top_dir.

    The frame has the columns "stream_pwd" (full path) and "clen" (value
    scrubbed from the path, np.nan if absent), is sorted by clen and has a
    fresh 0..n-1 index. When no stream file is found the frame is empty but
    still carries both columns.
    """
    records = []

    # search for all files that end with .stream
    for path, dirs, files in os.walk( top_dir ):
        for name in files:
            if not name.endswith( ".stream" ):
                continue
            stream_pwd = os.path.join( path, name )
            records.append( { "stream_pwd" : stream_pwd,
                              "clen" : scrub_clen( stream_pwd ) } )

    # build the frame once; explicit columns keep the sort valid when empty
    stream_df = pd.DataFrame( records, columns=_STREAM_COLUMNS )

    # sort df based on clen and reset df index
    stream_df = stream_df.sort_values( by="clen" )
    stream_df = stream_df.reset_index( drop=True )

    return stream_df


def scrub_us( stream ):
    """Return every unit cell found in the text of a stream file.

    Example line:
    Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg

    Returns a list with one tuple of six strings (a, b, c, alpha, beta,
    gamma) per matching line; the list is empty when nothing matches.
    """
    return _CELL_PATTERN.findall( stream )


def _collect_stats( stream_df ):
    """Return one row of unit-cell statistics per stream file in stream_df."""
    stats = []

    for stream_pwd, clen in zip( stream_df[ "stream_pwd" ], stream_df[ "clen" ] ):

        # open stream file - the context manager closes the handle again
        print( "scrubbing stream for clen={0}".format( clen ) )
        with open( stream_pwd, "r" ) as stream_file:
            stream = stream_file.read()

        # scrub unit cell information and put cells in df
        cells = scrub_us( stream )
        cells_df = pd.DataFrame( cells, columns=_CELL_COLUMNS ).astype( float )

        # calc stats - one extracted cell line is counted as one indexed entry
        stats.append( { "clen" : clen,
                        "indexed" : len( cells_df ),
                        "std_a" : cells_df.a.std(),
                        "std_b" : cells_df.b.std(),
                        "std_c" : cells_df.c.std() } )

    return pd.DataFrame( stats, columns=_STATS_COLUMNS )


def _plot_results( results_df ):
    """Plot indexed count (left axis) and axis std deviations (right axis) vs clen."""
    # imported here so the parsing helpers above stay usable without matplotlib
    import matplotlib.pyplot as plt

    fig, ax1 = plt.subplots()

    # indexed images plot
    color = "tab:red"
    ax1.set_xlabel( "clen" )
    ax1.set_ylabel( "indexed", color=color )
    ax1.plot( results_df.clen, results_df.indexed, color=color )
    ax1.tick_params( axis="y", labelcolor=color )

    # instantiate a second axes that shares the same x-axis
    ax2 = ax1.twinx()

    # std_a, std_b and std_c plots - same colour as their axis label, told
    # apart by line style and legend entry
    color = "tab:blue"
    ax2.set_ylabel( "st.deviation", color=color )
    for column, linestyle in ( ( "std_a", "-" ), ( "std_b", "--" ), ( "std_c", ":" ) ):
        ax2.plot( results_df.clen, results_df[ column ],
                  color=color, linestyle=linestyle, label=column )
    ax2.tick_params( axis="y", labelcolor=color )
    ax2.legend()

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()


def main( top_dir ):
    """Find the stream files below top_dir, collect their statistics and plot them."""
    # find stream files from process directory
    print( "finding stream files" )
    stream_df = find_streams( top_dir )
    print( "done" )

    # nothing to read or plot without stream files
    if stream_df.empty:
        print( "no stream files found in {0}".format( top_dir ) )
        return

    # loop through stream files and collect unit_cell information
    print( "looping through stream files to collect unit cell, indexed information" )
    results_df = _collect_stats( stream_df )
    print( "done" )

    # plot results
    _plot_results( results_df )


# variables
top_dir = "/sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan"

# only run the scan when executed as a script, not when imported
if __name__ == "__main__":
    main( top_dir )