Change date format to string for compatibility with Windows systems

This commit is contained in:
2025-09-30 00:31:59 +02:00
parent d46f3319f3
commit 3431bc8a4d
5 changed files with 18 additions and 13 deletions

View File

@@ -213,7 +213,7 @@ def main():
"Sample Flow Controller Read (vccm)": pd.Series(
dtype="float64"
),
"date": pd.Series(dtype="datetime64[ns]"),
"date": pd.Series(dtype="str"),
"hour": pd.Series(dtype="int64"),
},
index=pd.DatetimeIndex([]),
@@ -340,7 +340,7 @@ def main():
delete_partition_if_exists(
output_path=f"{run_config['output']}/pbp_calibrated",
partition_values={
"date": chunk_start.strftime("%Y-%m-%d 00:00:00"),
"date": chunk_start.strftime("%Y-%m-%d"),
"hour": chunk_start.hour,
},
)
@@ -411,9 +411,8 @@ def main():
)
# 2) cast partition columns *before* Dask strips them off
ddf_conc["date"] = dd.to_datetime(ddf_conc["date"]).astype(
"datetime64[ns]"
)
# Keep date as string to avoid Windows path issues with datetime partitions
ddf_conc["date"] = ddf_conc["date"].astype("str")
ddf_conc["hour"] = ddf_conc["hour"].astype("int64")
conc_future = ddf_conc.to_parquet(

View File

@@ -182,7 +182,9 @@ def calibrate_particle_data(
curve_type=scatt_conf.get("curve_type"),
params=scatt_conf.get("parameters", []),
)
df["date"] = df.index.date.astype("datetime64[ns]")
# Use string format for date to avoid Windows path issues with datetime partitions
# PyArrow creates partition dirs like "date=2019-05-08 00:00:00" which has colons (invalid on Windows)
df["date"] = df.index.strftime("%Y-%m-%d")
df["hour"] = df.index.hour.astype("int8")
return df
@@ -218,8 +220,7 @@ def calibrate_single_particle(
meta_cal = ddf_raw._meta.copy()
meta_cal["BC mass"] = pd.Series([], dtype="float64")
meta_cal["Opt diam"] = pd.Series([], dtype="float64")
meta_cal["time_lag"] = pd.Series([], dtype="int8")
meta_cal["date"] = pd.Series([], dtype="datetime64[ns]")
meta_cal["date"] = pd.Series([], dtype="str")
meta_cal["hour"] = pd.Series([], dtype="int8")
ddf_cal = ddf_raw.map_partitions(

View File

@@ -427,7 +427,8 @@ def process_histograms(
# merged_ddf["date"] = dd.to_datetime(merged_ddf.index.to_series()).dt.normalize()
# merged_ddf["hour"] = merged_ddf["hour"].astype("int64")
time_index = dd.to_datetime(merged_ddf.index.to_series())
merged_ddf["date"] = time_index.dt.normalize() # works on Series
# Use string format to avoid Windows path issues with datetime partitions
merged_ddf["date"] = time_index.dt.strftime("%Y-%m-%d")
merged_ddf["hour"] = time_index.dt.hour.astype("int64")
# --- Save hists to parquet

View File

@@ -92,7 +92,8 @@ def build_dt_summary(pdf: pd.DataFrame, dt_s: int = 1) -> pd.DataFrame:
out = out[out["original_idx"] != 0].drop(columns="original_idx")
# add date / hour helper cols
out["date"] = out.index.normalize()
# Use string format to avoid Windows path issues with datetime partitions
out["date"] = out.index.strftime("%Y-%m-%d")
out["hour"] = out.index.hour.astype("int64")
return out
@@ -130,7 +131,8 @@ def resample_hk_partition(pdf: pd.DataFrame, dt="1s") -> pd.DataFrame:
out = out.asfreq(dt).ffill(limit=1)
# --- add date/hour columns ---
out["date"] = out.index.normalize()
# Use string format to avoid Windows path issues with datetime partitions
out["date"] = out.index.strftime("%Y-%m-%d")
out["hour"] = out.index.hour.astype("int64")
return out
@@ -176,7 +178,8 @@ def aggregate_dt(ddf_pbp_dt, ddf_hk_dt, run_config):
)
time_index = dd.to_datetime(ddf_pbp_hk_dt.index.to_series())
ddf_pbp_hk_dt["date"] = time_index.dt.normalize() # works on Series
# Use string format to avoid Windows path issues with datetime partitions
ddf_pbp_hk_dt["date"] = time_index.dt.strftime("%Y-%m-%d")
ddf_pbp_hk_dt["hour"] = time_index.dt.hour.astype("int64")
# Optionally drop the old columns

View File

@@ -6,7 +6,8 @@ from typing import Mapping
# schema.py ── central truth table ───────────────────────────────────────────
CANONICAL_DTYPES = {
# identifiers / file bookkeeping
"date": "datetime64[ns]",
# Use string for date to avoid Windows path issues with datetime partitions
"date": "string[pyarrow]",
"hour": pd.Int64Dtype(),
"file": "string[pyarrow]",
"folder_name": "string[pyarrow]",