diff --git a/src/sp2xr/schema.py b/src/sp2xr/schema.py index ff576a8..48f77d0 100644 --- a/src/sp2xr/schema.py +++ b/src/sp2xr/schema.py @@ -20,7 +20,6 @@ CANONICAL_DTYPES = { "Scatt numb within range": pd.Float64Dtype(), "BC mass": pd.Float64Dtype(), "BC mass within range": pd.Float64Dtype(), - # "BC mass bin": pd.Int64Dtype(), "BC numb": pd.Int64Dtype(), "BC numb from file": pd.Int64Dtype(), "BC numb within range": pd.Float64Dtype(), @@ -42,20 +41,6 @@ CANONICAL_DTYPES = { "BC_massConc_within_range_vol": pd.Float64Dtype(), "BC_massConc_std": pd.Float64Dtype(), "BC_massConc_vol": pd.Float64Dtype(), - # counters / QC flags - # "cnts_thin": pd.Int64Dtype(), - # "cnts_thick": pd.Int64Dtype(), - # "cnts_unclassified": pd.Int64Dtype(), - # "cnts_thin_noScatt": pd.Int64Dtype(), - # "cnts_thick_sat": pd.Int64Dtype(), - # "cnts_thin_sat": pd.Int64Dtype(), - # "cnts_ntl_sat": pd.Int64Dtype(), - # "cnts_ntl": pd.Int64Dtype(), - # "cnts_extreme_positive_timelag": pd.Int64Dtype(), - # "cnts_thin_low_inc_scatt_ratio": pd.Int64Dtype(), - # "cnts_particles_for_tl_dist": pd.Int64Dtype(), - # "cnts_thin_total": pd.Int64Dtype(), - # "cnts_thick_total": pd.Int64Dtype(), **{ c: pd.Int64Dtype() for c in [ @@ -77,17 +62,6 @@ CANONICAL_DTYPES = { "temporary_col", ] }, - # "flag_valid_inc_signal_in_range": 'boolean', - # "flag_valid_inc_signal": 'boolean', - # "flag_inc_not_sat": 'boolean', - # "flag_valid_scatt_signal": 'boolean', - # "flag_extreme_positive_timelag": 'boolean', - # "flag_scatt_not_sat": 'boolean', - # "flag_valid_scatt_signal_in_range": 'boolean', - # "flag_negative_timelag": 'boolean', - # "flag_valid_timelag_thin": 'boolean', - # "flag_valid_timelag_thick":'boolean', - # "flag_low_ratio_inc_scatt": 'boolean', **{ c: "boolean" for c in [ @@ -107,7 +81,6 @@ CANONICAL_DTYPES = { # timing "calculated_time": "datetime64[us]", "time_lag": pd.Float64Dtype(), - # "temporary_col": pd.Float64Dtype(), "Sample Flow Controller Read (vccm)": pd.Float64Dtype(), "Sample Flow Controller Read (sccm)": pd.Float64Dtype(), "Dropped Records": pd.Float64Dtype(), @@ -116,27 +89,6 @@ CANONICAL_DTYPES = { DEFAULT_FLOAT = pd.Float64Dtype() -'''def enforce_schema(ddf: dd.DataFrame) -> dd.DataFrame: - """Return a dataframe that matches the hard-coded CANONICAL_DTYPES.""" - # 1. add columns that are missing in this partition - """for col, dtype in CANONICAL_DTYPES.items(): - if col not in ddf.columns: - ddf[col] = pd.Series(pd.NA, dtype=dtype)""" - - # 2. cast *only* the columns whose dtype differs - mismatched = { - col: dtype - for col, dtype in CANONICAL_DTYPES.items() - if ddf[col].dtype != dtype - } - if mismatched: - ddf = ddf.astype(mismatched) - - # 3. drop unexpected columns (optional but safest) - ddf = ddf[[*CANONICAL_DTYPES]] - - return ddf''' - def enforce_schema( ddf: dd.DataFrame,