From a5bf02ec69dea3d3dec54f1e5573b438ff97b499 Mon Sep 17 00:00:00 2001 From: Barbara Bertozzi Date: Fri, 6 Jun 2025 11:39:59 +0200 Subject: [PATCH] Functions updated --- SP2XR_toolkit.py | 65 +++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/SP2XR_toolkit.py b/SP2XR_toolkit.py index 861f312..fa928ab 100644 --- a/SP2XR_toolkit.py +++ b/SP2XR_toolkit.py @@ -75,7 +75,7 @@ def find_files(directory, string, avoid='xxxxxxxxxx'): return filtered_paths -def get_file_dict(directory, file_type): +def get_file_dict(directory, file_type, level='hour'): """ Creates a dictionary with date and hour as keys and file paths as values for the given directory. @@ -87,7 +87,10 @@ def get_file_dict(directory, file_type): parts = file_path.split(os.sep) date = parts[-3] hour = parts[-2] - file_dict[(date, hour)] = os.path.join('/', *parts[:-1]) + if level == 'hour': + file_dict[(date, hour)] = os.path.join('/', *parts[:-1]) + elif level == 'date': + file_dict[(date)] = os.path.join('/', *parts[:-2]) return file_dict @@ -1508,7 +1511,10 @@ def calculate_calib_coeff(pbp_data, calib_dict, if do_peak_histogram_plots: fig, axs = plt.subplots() - axs.set_title(f'file:{folder_name} - mass={mass} fg - aerosol:{aerosol_type}') + if size_selection_method == 'APM': + axs.set_title(f'file:{folder_name} - mass={mass} fg - aerosol:{aerosol_type}') + elif size_selection_method == 'DMA': + axs.set_title(f'file:{folder_name} - diam={diam} nm - aerosol:{aerosol_type}') sns.histplot(tmp_pbp, stat='frequency', log_scale=(True, False), bins=nbins, color='grey', alpha=0.5) axst = axs.twinx() @@ -1520,7 +1526,10 @@ def calculate_calib_coeff(pbp_data, calib_dict, axs.axvline(pu[i], c='g', lw=2) axst.set_ylim(0, ) if save_peak_histogram_plots: - plt.savefig(plot_dir+f'\\hist_plot_mass_{mass}.png', dpi=600) + if size_selection_method == 'APM': + plt.savefig(plot_dir+f'/hist_plot_mass_{mass}.png', dpi=600) + elif size_selection_method == 'DMA': + plt.savefig(plot_dir+f'/hist_plot_diam_{diam}.png', dpi=600) tmp_peak_height_fit = np.concatenate( tmp_peak_height_fit, axis=0 ) if len(tmp_mass_fit)>0: tmp_mass_or_diam_fit = np.concatenate( tmp_mass_fit, axis=0 ) @@ -1560,7 +1569,7 @@ def calculate_calib_coeff(pbp_data, calib_dict, axs.plot(bin_centers, calib_curve_fit, lw=2, c='C0') if save_calib_curve_plot: - plt.savefig(plot_dir+'\\mass_peakH.png', dpi=600) + plt.savefig(plot_dir+'mass_peakH.png', dpi=600) return calib_dict, popt @@ -1753,7 +1762,7 @@ def process_pbp_parquet(dir_path_pbp, dir_path_hk, minM=None, maxM=None, n_incbins=None, minOptD=None, maxOptD=None, n_scattbins=None, minTL=None, maxTL=None, n_timelag=None, - save_final_data=True, path_parquet='' + save_final_data=True, path_parquet='', partition_on=['date', 'hour'] ): read_dir_pbp = dir_path_pbp @@ -1828,9 +1837,9 @@ def process_pbp_parquet(dir_path_pbp, dir_path_hk, flag_scatt_in_range = flag_scatt & (ddf_pbp['Opt diam'] >= minOptD) & (ddf_pbp['Opt diam'] <= maxOptD) - flag_negative_timelag = ddf_pbp['time_lag_new']<0 + flag_negative_timelag = ddf_pbp['time_lag_new']<-10 flag_extreme_positive_timelag = ddf_pbp['time_lag_new']>=400 - flag_timelag_0_50 = (ddf_pbp['time_lag_new']<50) & (ddf_pbp['time_lag_new']>=0) + flag_timelag_0_50 = (ddf_pbp['time_lag_new']<50) & (ddf_pbp['time_lag_new']>=-10) flag_timelag_greater_50 = (ddf_pbp['time_lag_new']>=50) & (ddf_pbp['time_lag_new']<400) ddf_pbp['ratio_inc_scatt'] = np.log10(ddf_pbp['Incand relPeak']) / np.log10(ddf_pbp['Scatter relPeak']) @@ -1871,9 +1880,9 @@ def process_pbp_parquet(dir_path_pbp, dir_path_hk, ddf_pbp.loc[flag_scatt & flag_scatt_not_sat & flag_inc_in_range & ((flag_timelag_0_50 & ~flag_low_ratio_inc_scatt) | (flag_timelag_greater_50 & ~flag_extreme_positive_timelag)), 'cnts_particles_for_tl_dist'] = 1 - ddf_pbp['cnts_thin_total'] = ddf_pbp['cnts_thin'] - ddf_pbp['cnts_thick_total'] = ddf_pbp['cnts_thick'] + ddf_pbp['cnts_thick_sat'] + ddf_pbp['cnts_ntl_sat'] + ddf_pbp['cnts_ntl'] - ddf_pbp['cnts_unclassified'] = ddf_pbp['cnts_thin_noScatt'] + ddf_pbp['cnts_extreme_positive_timelag'] + ddf_pbp['cnts_thin_low_inc_scatt_ratio'] + ddf_pbp['cnts_thin_sat'] + ddf_pbp['cnts_thin_total'] = ddf_pbp['cnts_thin'] + ddf_pbp['cnts_thin_noScatt'] + ddf_pbp['cnts_thick_total'] = ddf_pbp['cnts_thick'] + ddf_pbp['cnts_thick_sat'] + ddf_pbp['cnts_ntl_sat'] + ddf_pbp['cnts_ntl'] + ddf_pbp['cnts_thin_sat'] + ddf_pbp['cnts_unclassified'] = ddf_pbp['cnts_extreme_positive_timelag'] + ddf_pbp['cnts_thin_low_inc_scatt_ratio'] @@ -1978,25 +1987,25 @@ def process_pbp_parquet(dir_path_pbp, dir_path_hk, # Calculate histograms of different classifications/flags: ddf_pbp['temporary_col'] = 1 - dNdlogDmev, dMdlogDmev = process_hist_and_dist(ddf_pbp, 'BC mass within range', None, None, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thin, dMdlogDmev_thin = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thin_noScatt, dMdlogDmev_thin_noScatt = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_noScatt', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thick, dMdlogDmev_thick = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thick', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thick_sat, dMdlogDmev_thick_sat = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thick_sat', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thin_sat, dMdlogDmev_thin_sat = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_sat', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_ntl_sat, dMdlogDmev_ntl_sat = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_ntl_sat', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_ntl, dMdlogDmev_ntl = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_ntl', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_extreme_positive_timelag, dMdlogDmev_extreme_positive_timelag = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_extreme_positive_timelag', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thin_low_inc_scatt_ratio, dMdlogDmev_thin_low_inc_scatt_ratio = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_low_inc_scatt_ratio', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thin_total, dMdlogDmev_thin_total = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_total', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_thick_total, dMdlogDmev_thick_total = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thick_total', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) - dNdlogDmev_unclassified, dMdlogDmev_unclassified = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_unclassified', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type) + dNdlogDmev, dMdlogDmev = process_hist_and_dist(ddf_pbp, 'BC mass within range', None, None, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thin, dMdlogDmev_thin = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thin_noScatt, dMdlogDmev_thin_noScatt = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_noScatt', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thick, dMdlogDmev_thick = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thick', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thick_sat, dMdlogDmev_thick_sat = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thick_sat', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thin_sat, dMdlogDmev_thin_sat = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_sat', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_ntl_sat, dMdlogDmev_ntl_sat = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_ntl_sat', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_ntl, dMdlogDmev_ntl = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_ntl', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_extreme_positive_timelag, dMdlogDmev_extreme_positive_timelag = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_extreme_positive_timelag', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thin_low_inc_scatt_ratio, dMdlogDmev_thin_low_inc_scatt_ratio = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_low_inc_scatt_ratio', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thin_total, dMdlogDmev_thin_total = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thin_total', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_thick_total, dMdlogDmev_thick_total = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_thick_total', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) + dNdlogDmev_unclassified, dMdlogDmev_unclassified = process_hist_and_dist(ddf_pbp, 'BC mass within range', 'cnts_unclassified', 1, inc_mass_bin_lims, inc_mass_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=rho_eff, BC_type=BC_type, t=dt) scatt_bin_lims = np.logspace(np.log10(minOptD), np.log10(maxOptD), n_scattbins) scatt_bin_ctrs = bin_lims_to_ctrs(scatt_bin_lims) - dNdlogDsc, _ = process_hist_and_dist(ddf_pbp, 'Opt diam scatt only', None, None, scatt_bin_lims, scatt_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=None, BC_type=None) + dNdlogDsc, _ = process_hist_and_dist(ddf_pbp, 'Opt diam scatt only', None, None, scatt_bin_lims, scatt_bin_ctrs, dt_str, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=None, BC_type=None, t=dt) @@ -2005,8 +2014,8 @@ def process_pbp_parquet(dir_path_pbp, dir_path_hk, list_hist = [] for idx, (name, group) in enumerate(ddf_pbp.groupby('BC mass bin')): - a, _ = process_hist_and_dist(group, 'time_lag_new', 'cnts_particles_for_tl_dist', 1, timelag_bins_lims, timelag_bin_ctrs, dt_str, calculate_conc=False, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=None, BC_type=None, t=1) - a.columns = [f'BC_mass_{inc_mass_bin_ctrs[idx]:.2f}_timelag_{i:.1f}' for i in timelag_bin_ctrs] + a, _ = process_hist_and_dist(group, 'time_lag_new', 'cnts_particles_for_tl_dist', 1, timelag_bins_lims, timelag_bin_ctrs, dt_str, calculate_conc=True, flow=ddf_pbp_hk['Sample Flow Controller Read (vccm)'], rho_eff=None, BC_type=None, t=dt) + a.columns = [f'dNdlogDmev_{inc_mass_bin_ctrs[idx]:.2f}_timelag_{i:.1f}' for i in timelag_bin_ctrs] list_hist.append(a) time_lag_hists = pd.concat(list_hist, axis=1) @@ -2045,7 +2054,7 @@ def process_pbp_parquet(dir_path_pbp, dir_path_hk, if save_final_data == True: dd.from_pandas(final_df.sort_index(), npartitions=1).to_parquet(path = path_parquet, engine='pyarrow', - partition_on=['date', 'hour'], + partition_on=partition_on, write_index=True )