458 lines
16 KiB
Python
458 lines
16 KiB
Python
from __future__ import annotations
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
import gc
|
||
import dask.dataframe as dd
|
||
from dask.dataframe.utils import make_meta
|
||
from .calibration import BC_mass_to_diam
|
||
from sp2xr.helpers import delete_partition_if_exists
|
||
from sp2xr.schema import cast_and_arrow, DEFAULT_FLOAT
|
||
|
||
|
||
def make_bin_arrays(
|
||
*,
|
||
lims: np.ndarray | None = None,
|
||
ctrs: np.ndarray | None = None,
|
||
n: int = 50,
|
||
log: bool = True,
|
||
) -> tuple[np.ndarray, np.ndarray]:
|
||
if lims is None and ctrs is None:
|
||
lims = (
|
||
np.logspace(np.log10(0.3), np.log10(400), n)
|
||
if log
|
||
else np.linspace(0.3, 400, n)
|
||
)
|
||
elif lims is None:
|
||
lims = np.append(
|
||
ctrs[0] - (ctrs[1] - ctrs[0]) / 2,
|
||
(ctrs[:-1] + ctrs[1:]) / 2,
|
||
)
|
||
lims = np.append(lims, ctrs[-1] + (ctrs[-1] - ctrs[-2]) / 2)
|
||
ctrs = (lims[:-1] + lims[1:]) / 2
|
||
return lims, ctrs
|
||
|
||
|
||
def dNdlogDp_to_dMdlogDp(N, Dp, rho=1800):
    """Convert a number size distribution to a mass size distribution.

    Given a normalized number size distribution [dN/dlogDp, cm^-3] defined
    over the diameters given by Dp [nm], return the corresponding mass size
    distribution [dM/dlogDp, ug/m3].  Optional density input ``rho`` is in
    units of kg/m3.  (Originally from Rob's functions.py.)
    """
    # Spherical volume times density; the 1e-12 factor folds together the
    # nm^3 -> m^3 and kg/m^3 -> ug/cm^3->ug/m^3 unit conversions, i.e. it is
    # equivalent to N * ((pi * (Dp * 1e-9) ** 3) / 6) * rho * 1e18.
    return N * (np.pi / 6) * rho * (Dp**3) * 1e-12
|
||
|
||
|
||
def bin_ctrs_to_lims(x_ctrs):
    """Take an array of bin centers and return an array one element larger
    of bin limits.  (Originally from Rob's functions.py.)
    """
    half_steps = np.diff(x_ctrs) / 2
    return np.concatenate(
        (
            [x_ctrs[0] - half_steps[0]],   # extrapolated lower edge
            x_ctrs[:-1] + half_steps,      # interior midpoints
            [x_ctrs[-1] + half_steps[-1]], # extrapolated upper edge
        )
    )
|
||
|
||
|
||
def bin_lims_to_ctrs(x_lims):
    """
    Take an array of bin limits and return an array 1 element smaller of bin centers
    """
    # Each center is the midpoint of a consecutive pair of limits.
    return 0.5 * (x_lims[:-1] + x_lims[1:])
|
||
|
||
|
||
def get_dlogp(Dp):
    """Per-bin width in log10 space for an array of diameters ``Dp``.

    Returns log10(upper bin limit) - log10(lower bin limit) for each bin,
    with the limits derived from the centers via ``bin_ctrs_to_lims``.
    (Originally from Rob's functions.py.)
    """
    # Hoisted: the original called bin_ctrs_to_lims(Dp) twice for two
    # slices of the same array; compute the limits once instead.
    lims = bin_ctrs_to_lims(Dp)
    return np.diff(np.log10(lims))
|
||
|
||
|
||
def counts2numConc(numb, flow, t=1):
    """
    Calculate number concentration from counts (cnts), flow rate in [ccm] (Q) and sampling time in [secs] (t)
    Return numConc with units of [cm^-3]

    (Originally from Rob's functions.py.)
    """
    # Sampled volume per row: flow [ccm] * t [s] / 60 [s/min].
    sampled_volume = flow * (t / 60)
    return numb.divide(sampled_volume, axis=0)
|
||
|
||
|
||
def calculate_histogram(series, bin_lims=None):
    """Histogram *series* over the bin limits *bin_lims*.

    NaNs are dropped first.  If the series is empty after dropping NaNs,
    an all-NaN float array of length ``len(bin_lims) - 1`` is returned so
    that empty time bins stay distinguishable from genuinely zero counts.

    Parameters
    ----------
    series : pd.Series
        Values to histogram.
    bin_lims : np.ndarray, optional
        Monotonic bin limits.  Defaults to 50 log-spaced limits between
        0.3 and 400 — same as the original default, but built lazily here
        instead of as a mutable array evaluated once in the signature at
        import time.

    Returns
    -------
    np.ndarray of per-bin counts (ints) or NaNs (floats) of length
    ``len(bin_lims) - 1``.
    """
    if bin_lims is None:
        bin_lims = np.logspace(np.log10(0.3), np.log10(400), 50)
    series = series.dropna()
    if series.empty:
        return np.full(len(bin_lims) - 1, np.nan, dtype=float)
    counts, _ = np.histogram(series, bins=bin_lims)
    return counts
|
||
|
||
|
||
def process_hist_and_dist_partition(
    df: pd.DataFrame,
    col: str,
    flag_col: str | None,
    flag_value: int | None,
    bin_lims: np.ndarray,
    bin_ctrs: np.ndarray,
    dt,
    calculate_conc: bool = True,
    flow=None,  # kept for API compatibility; not used
    rho_eff: float = 1800,
    BC_type: str = "",
    # t: float = 1.0,
    name_prefix: str | None = None,  # <-- NEW: force exact output name prefix
) -> pd.DataFrame:
    """Per-partition worker: histogram ``col`` into ``bin_lims`` for each
    ``dt`` time bin and optionally convert the counts into number (and,
    for BC, mass) size distributions.

    Parameters
    ----------
    df : pd.DataFrame
        One partition of particle-by-particle data with a DatetimeIndex.
        Must contain ``col``, a "temporary_col" column (used only to count
        original rows per time bin) and
        "Sample Flow Controller Read (vccm)" — assumed present; TODO
        confirm against upstream schema.
    col : str
        Column whose values are histogrammed.
    flag_col, flag_value :
        When both are provided, only rows where ``df[flag_col] ==
        flag_value`` are histogrammed.  A leading "cnts_" on ``flag_col``
        is stripped when building output column names.
    bin_lims, bin_ctrs : np.ndarray
        Histogram bin limits and centers; the code assumes
        ``len(bin_lims) == len(bin_ctrs) + 1``.
    dt :
        Resample period; a bare number is interpreted as seconds.
    calculate_conc : bool
        If False, return the raw per-bin count matrix instead of
        concentrations.
    rho_eff, BC_type :
        When *both* are not None, ``bin_ctrs`` are treated as BC masses
        and converted to mass-equivalent diameters (Dmev) via
        ``BC_mass_to_diam``, and a mass distribution is returned alongside
        the number distribution.  NOTE(review): the defaults (1800, "")
        also pass this test, so callers wanting the non-BC branch must
        pass None explicitly.
    name_prefix : str | None
        When given, forces the exact output column prefix instead of the
        default "dNdlog..._<tag>" scheme.

    Returns
    -------
    pd.DataFrame indexed by time bin: raw counts (calculate_conc=False),
    dN/dlogD columns, or dN/dlogD plus dM/dlogD columns for the BC case.
    """
    # normalize dt -> "Xs" (numbers are taken as seconds)
    dt_str = f"{dt}s" if isinstance(dt, (int, float)) else str(dt)
    t_sec: float = pd.to_timedelta(dt_str).total_seconds()

    # filter by flag (when provided)
    if flag_col and flag_value is not None:
        df_filtered = df.loc[df[flag_col] == flag_value]
    else:
        df_filtered = df

    # adjust flag_col "cnts_*" -> "*" (tag used in output column names)
    if flag_col is not None and flag_col.startswith("cnts_"):
        flag_core = flag_col[5:]
    else:
        flag_core = flag_col

    # histogram of 'col' per time bin; each cell holds a counts array
    # (or an all-NaN array for time bins that had no non-NaN values)
    df_resampled = (
        df_filtered[[col]]
        .resample(dt_str)
        .apply(lambda x: calculate_histogram(x[col], bin_lims=bin_lims))
    )

    # flow per time bin — taken from the UNfiltered frame (this
    # partition’s full index), not from df_filtered
    flow_dt = df["Sample Flow Controller Read (vccm)"].resample(dt_str).mean()

    # ensure df_resampled is a single-column DataFrame named "result"
    # (resample().apply() may return a Series or a DataFrame depending on
    # the partition's content)
    if isinstance(df_resampled, pd.Series):
        df_resampled = df_resampled.to_frame(name="result")
    else:
        df_resampled.columns = ["result"]

    # drop time bins with zero original rows (matches your prior logic);
    # "temporary_col" is only used as a row counter here
    original_idx_df = df_filtered[["temporary_col"]].resample(dt_str).count()
    original_idx_df.columns = ["original_idx"]
    df_resampled = df_resampled.join(original_idx_df, how="left")
    if "original_idx" in df_resampled.columns:
        df_resampled = df_resampled[df_resampled["original_idx"] != 0]
        df_resampled = df_resampled.drop(columns=["original_idx"])

    # expand list-of-counts -> wide matrix (one col per bin)
    max_list_length = len(bin_ctrs)
    if df_resampled.empty:
        # keep the column layout stable even when nothing survived
        ddf_hist = pd.DataFrame(
            columns=[f"result_{i}" for i in range(max_list_length)],
            index=pd.DatetimeIndex([], name=df.index.name),
        )
    else:
        ddf_hist = df_resampled["result"].apply(pd.Series)
        ddf_hist.index = df_resampled.index
        ddf_hist.columns = [f"result_{i}" for i in range(max_list_length)]

    if not calculate_conc:
        return ddf_hist

    # join flow (becomes the last column)
    inc_hist_flow = ddf_hist.join(flow_dt)

    # compute number concentration (per-log-bin)
    # last column is flow; left part are counts per bin
    dNd = counts2numConc(inc_hist_flow.iloc[:, :-1], inc_hist_flow.iloc[:, -1], t=t_sec)

    # choose the abscissa used for dlog (Dmev for BC, or bin_ctrs otherwise)
    if rho_eff is not None and BC_type is not None:
        # BC branch: bins are masses; convert to mass-equivalent diameters
        density, Dmev = BC_mass_to_diam(bin_ctrs, rho_eff=rho_eff, BC_type=BC_type)
        dNdlog = dNd / get_dlogp(Dmev)
        # names: forced prefix wins, otherwise "dNdlogDmev_<tag>_<diam>"
        if name_prefix is not None:
            dNdlog.columns = [f"{name_prefix}_{b:.2f}" for b in Dmev]
        else:
            base = "dNdlogDmev"
            tag = "all" if flag_core is None else flag_core
            dNdlog.columns = [f"{base}_{tag}_{b:.2f}" for b in Dmev]

        # mass concentration
        dMdlog = dNdlogDp_to_dMdlogDp(dNdlog, Dmev, rho=density)
        if name_prefix is not None:
            # mirror the naming with a mass prefix
            mass_prefix = name_prefix.replace("dNdlog", "dMdlog")
            dMdlog.columns = [f"{mass_prefix}_{b:.2f}" for b in Dmev]
        else:
            base = "dMdlogDmev"
            tag = "all" if flag_core is None else flag_core
            dMdlog.columns = [f"{base}_{tag}_{b:.2f}" for b in Dmev]

        return pd.concat([dNdlog, dMdlog], axis=1)

    else:
        # e.g., timelag/scattering case (no BC diameter conversion)
        dNdlog = dNd / get_dlogp(bin_ctrs)
        if name_prefix is not None:
            dNdlog.columns = [f"{name_prefix}_{b:.2f}" for b in bin_ctrs]
        else:
            base = "dNdlogDsc"
            tag = "all" if flag_core is None else flag_core
            dNdlog.columns = [f"{base}_{tag}_{b:.2f}" for b in bin_ctrs]
        return dNdlog
|
||
|
||
|
||
def make_hist_meta(
    *,
    bin_ctrs: np.ndarray,
    kind: str,  # "mass", "scatt", "timelag"
    flag_col: str | None,  # e.g. "cnts_thin" or None
    name_prefix=None,
    rho_eff=None,
    BC_type=None,
    dtype=np.float64,
):
    """
    Return an empty DataFrame whose column names AND dtypes are exactly
    what `process_hist_and_dist_partition()` will emit, without ever
    calling that function.
    """
    # ---- pick prefix exactly like the partition function --------------
    if flag_col is None:
        prefix = "all_"
    elif flag_col.startswith("cnts_"):
        prefix = f"{flag_col[5:]}_"
    else:
        prefix = f"{flag_col}_"

    # ---- per-kind labels and column-name prefixes ----------------------
    if kind == "mass" and rho_eff is not None and BC_type is not None:
        # same mass -> diameter conversion used inside the partition function
        _, labels = BC_mass_to_diam(bin_ctrs, rho_eff=rho_eff, BC_type=BC_type)
        n_prefix = "dNdlogDmev"
        m_prefix = "dMdlogDmev"
    elif kind == "scatt":
        labels = bin_ctrs
        n_prefix = "dNdlogDsc"
        m_prefix = None
    elif kind == "timelag":
        labels = bin_ctrs
        m_prefix = None
        if name_prefix is None:
            n_prefix = f"{prefix}"
        else:
            # a forced prefix replaces the flag-derived one entirely
            n_prefix = f"{name_prefix}"
            prefix = ""
    else:
        raise ValueError(f"Unknown histogram kind '{kind}'")

    # ---- build the column list -----------------------------------------
    cols = [f"{n_prefix}_{prefix}{v:.2f}" for v in labels]
    if kind == "mass" and m_prefix:
        cols += [f"{m_prefix}_{prefix}{v:.2f}" for v in labels]

    empty_df = pd.DataFrame(
        {name: pd.Series(dtype=dtype) for name in cols},
        index=pd.DatetimeIndex([], name="calculated_time"),
    )
    return make_meta(empty_df)
|
||
|
||
|
||
def process_histograms(
    ddf_pbp_with_flow,
    run_config,
    inc_mass_bin_lims,
    inc_mass_bin_ctrs,
    scatt_bin_lims,
    scatt_bin_ctrs,
    timelag_bins_lims,
    timelag_bin_ctrs,
    chunk_start,
    client,
):
    """Compute the enabled histogram products for one chunk and write them
    to parquet.  Kept as a separate function to avoid dask graph buildup.

    Parameters
    ----------
    ddf_pbp_with_flow : dask DataFrame
        Particle-by-particle data including the flow column.
    run_config : dict
        Must provide "do_BC_hist", "do_scatt_hist", "do_timelag_hist",
        "dt", "rho_eff", "BC_type", "output" and "saving_schema".
    chunk_start : datetime-like
        Start of the chunk; its date/hour partition is deleted before the
        new results are appended.
    client :
        Dask client; kept for API compatibility, not referenced here.

    BUG FIX vs the previous version: the time-lag loop iterated
    ``enumerate(mass_bins)`` over a *DataFrame*, which yields column
    labels (the single string "BC mass bin") instead of the bin values —
    so the per-mass-bin time-lag histograms were never computed for a
    real bin.  The column is now selected as a Series and its unique,
    sorted values are iterated.
    """
    computed_results = []

    if run_config["do_BC_hist"]:
        print("Computing BC distributions...")
        # --- Mass histogram: one distribution per particle-classification flag
        BC_hist_configs = [
            {"flag_col": None, "flag_value": None},
            {"flag_col": "cnts_thin", "flag_value": 1},
            {"flag_col": "cnts_thin_noScatt", "flag_value": 1},
            {"flag_col": "cnts_thick", "flag_value": 1},
            {"flag_col": "cnts_thick_sat", "flag_value": 1},
            {"flag_col": "cnts_thin_sat", "flag_value": 1},
            {"flag_col": "cnts_ntl_sat", "flag_value": 1},
            {"flag_col": "cnts_ntl", "flag_value": 1},
            {"flag_col": "cnts_extreme_positive_timelag", "flag_value": 1},
            {"flag_col": "cnts_thin_low_inc_scatt_ratio", "flag_value": 1},
            {"flag_col": "cnts_thin_total", "flag_value": 1},
            {"flag_col": "cnts_thick_total", "flag_value": 1},
            {"flag_col": "cnts_unclassified", "flag_value": 1},
        ]

        for cfg_hist in BC_hist_configs:
            meta_hist = (
                make_hist_meta(
                    bin_ctrs=inc_mass_bin_ctrs,
                    kind="mass",
                    flag_col=cfg_hist["flag_col"],
                    rho_eff=run_config["rho_eff"],
                    BC_type=run_config["BC_type"],
                )
                .astype(DEFAULT_FLOAT, copy=False)
                .convert_dtypes(dtype_backend="pyarrow")
            )
            ddf_out = ddf_pbp_with_flow.map_partitions(
                process_hist_and_dist_partition,
                col="BC mass within range",
                flag_col=cfg_hist["flag_col"],
                flag_value=cfg_hist["flag_value"],
                bin_lims=inc_mass_bin_lims,
                bin_ctrs=inc_mass_bin_ctrs,
                dt=run_config["dt"],
                calculate_conc=True,
                flow=None,
                rho_eff=run_config["rho_eff"],
                BC_type=run_config["BC_type"],
                meta=meta_hist,
            ).map_partitions(cast_and_arrow, meta=meta_hist)

            # compute per config so each task graph stays small
            computed_results.append(ddf_out.compute())
            del ddf_out

    if run_config["do_scatt_hist"]:
        print("Computing scattering distribution...")
        meta_hist = (
            make_hist_meta(
                bin_ctrs=scatt_bin_ctrs,
                kind="scatt",
                flag_col=None,
                rho_eff=None,
                BC_type=None,
            )
            .astype(DEFAULT_FLOAT, copy=False)
            .convert_dtypes(dtype_backend="pyarrow")
        )
        ddf_scatt = ddf_pbp_with_flow.map_partitions(
            process_hist_and_dist_partition,
            col="Opt diam scatt only",
            flag_col=None,
            flag_value=None,
            bin_lims=scatt_bin_lims,
            bin_ctrs=scatt_bin_ctrs,
            dt=run_config["dt"],
            calculate_conc=True,
            flow=None,
            rho_eff=None,  # None disables the BC mass->diameter branch
            BC_type=None,
            meta=meta_hist,
        ).map_partitions(cast_and_arrow, meta=meta_hist)
        computed_results.append(ddf_scatt.compute())

    if run_config["do_timelag_hist"]:
        print("Computing time delay distribution...")
        # FIX: select the column as a Series (the old code selected a
        # one-column DataFrame and iterating it yielded the column label,
        # not the values).  Sort so idx below is deterministic.
        mass_bins = (
            ddf_pbp_with_flow["BC mass bin"]
            .compute()
            .astype("Int64")
            .dropna()
            .drop_duplicates()
            .sort_values()
        )

        for idx, mass_bin in enumerate(mass_bins):
            ddf_bin = ddf_pbp_with_flow[ddf_pbp_with_flow["BC mass bin"] == mass_bin]

            # NOTE(review): idx indexes inc_mass_bin_ctrs positionally;
            # this assumes the sorted unique bins are contiguous starting
            # at the first mass bin — confirm against how "BC mass bin"
            # is assigned upstream.
            name_prefix = f"dNdlogDmev_{inc_mass_bin_ctrs[idx]:.2f}_timelag"

            meta_hist = make_hist_meta(
                bin_ctrs=timelag_bin_ctrs,
                kind="timelag",
                flag_col="cnts_particles_for_tl_dist",
                name_prefix=name_prefix,
                rho_eff=None,
                BC_type=None,
            )

            tl_ddf = ddf_bin.map_partitions(
                process_hist_and_dist_partition,
                col="time_lag",
                flag_col="cnts_particles_for_tl_dist",
                flag_value=1,
                bin_lims=timelag_bins_lims,
                bin_ctrs=timelag_bin_ctrs,
                dt=run_config["dt"],
                calculate_conc=True,
                flow=None,
                rho_eff=None,
                BC_type=None,
                name_prefix=name_prefix,
                meta=meta_hist,
            ).map_partitions(cast_and_arrow, meta=meta_hist)

            computed_results.append(tl_ddf.compute())

    if computed_results:
        # Normalize every datetime index/column to ns precision so the
        # concat and the parquet write don't mix datetime resolutions.
        for i, result in enumerate(computed_results):
            if hasattr(result.index, "dtype") and "datetime" in str(result.index.dtype):
                result.index = result.index.astype("datetime64[ns]")
            for col_name in result.columns:
                if hasattr(result[col_name], "dtype") and "datetime" in str(
                    result[col_name].dtype
                ):
                    computed_results[i][col_name] = result[col_name].astype(
                        "datetime64[ns]"
                    )

        merged_df = pd.concat(computed_results, axis=1)
        # double-check the merged result's index precision
        if hasattr(merged_df.index, "dtype") and "datetime" in str(merged_df.index.dtype):
            merged_df.index = merged_df.index.astype("datetime64[ns]")

        merged_ddf = dd.from_pandas(merged_df, npartitions=1)
        time_index = dd.to_datetime(merged_ddf.index.to_series())
        # Use string format to avoid Windows path issues with datetime partitions
        merged_ddf["date"] = time_index.dt.strftime("%Y-%m-%d")
        merged_ddf["hour"] = time_index.dt.hour.astype("int64")

        # --- Save hists to parquet: drop this chunk's partition first so
        # append=True below cannot duplicate it
        delete_partition_if_exists(
            output_path=f"{run_config['output']}/hists_{run_config['dt']}s",
            partition_values={
                "date": chunk_start.strftime("%Y-%m-%d"),
                "hour": chunk_start.hour,
            },
        )

        hist_future = merged_ddf.to_parquet(
            f"{run_config['output']}/hists_{run_config['dt']}s",
            partition_on=run_config["saving_schema"],
            engine="pyarrow",
            write_index=True,
            write_metadata_file=True,
            append=True,
            schema="infer",
            compute=False,
        )
        hist_future.compute()

        del merged_df, merged_ddf, computed_results
        gc.collect()
|