fix: correct handling of file path structures across different operating systems

2025-09-09 19:13:14 +02:00
parent f437b1c5fe
commit 6621236ea4
2 changed files with 63 additions and 9 deletions
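For context, a minimal sketch (not part of the commit) of the behavior this change targets: the old code split paths on the hard-coded Windows separator `"\\"`, while `pathlib` extracts the filename using the host OS's own rules. The sample path is taken from the docstring added below.

```python
from pathlib import Path

p = "/data/SP2XR/20240101/PbP_20240101_001.csv"

# Old approach: hard-coded Windows separator. A POSIX path contains no
# backslash, so the "filename" ends up being the entire path string.
print(p.split("\\")[-1])   # /data/SP2XR/20240101/PbP_20240101_001.csv

# Cross-platform approach used by this commit: pathlib applies the host
# OS's separator rules when extracting the final path component.
print(Path(p).name)        # PbP_20240101_001.csv
```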

View File

@@ -455,3 +455,54 @@ def partition_rowcount(ddf: dd.DataFrame) -> int:
         meta=pd.Series(dtype="int64"),
     )
     return int(row_series.sum().compute())
+
+
+def extract_sp2xr_filename_parts(file_path: str | Path) -> tuple[str, str]:
+    """
+    Extract the standardized filename and folder name from an SP2XR file path.
+
+    This function replicates the original logic:
+      - file_path.split("\\")[-1].split("_")[-2]
+      - file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
+    but uses cross-platform path handling.
+
+    Args:
+        file_path: Path to the SP2XR file (string or Path object).
+
+    Returns:
+        tuple: (file_name_cut, folder_name)
+
+    Example:
+        >>> extract_sp2xr_filename_parts("/data/SP2XR/20240101/PbP_20240101_001.csv")
+        ('20240101_001', '20240101')
+    """
+    file_path_obj = Path(file_path)
+    filename_with_ext = file_path_obj.name  # Filename with extension, regardless of OS separator
+    filename_parts = filename_with_ext.split("_")
+
+    if len(filename_parts) >= 2:
+        # Replicate the original logic exactly
+        folder_name = filename_parts[-2]
+
+        # For file_name_cut: take the last part, then strip the extension
+        last_part_with_ext = filename_parts[-1]
+        # Split by "." and take the second-to-last element (removes the extension)
+        ext_parts = last_part_with_ext.split(".")
+        if len(ext_parts) >= 2:
+            last_part = ext_parts[-2]  # Extension removed
+        else:
+            last_part = ext_parts[0]  # No extension to remove
+
+        file_name_cut = f"{folder_name}_{last_part}"
+
+        # Handle the edge case where folder_name or last_part is empty
+        # (e.g. "_.csv".split("_") -> ["", ".csv"]), which would otherwise
+        # yield a malformed file_name_cut such as "_"; fall back to sane defaults.
+        if not folder_name or not last_part:
+            file_name_cut = file_path_obj.stem
+            folder_name = file_path_obj.parent.name
+    else:
+        # Fallback for unexpected filename formats
+        file_name_cut = file_path_obj.stem
+        folder_name = file_path_obj.parent.name
+
+    return file_name_cut, folder_name
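A short usage sketch of the fallback branch added above, using a hypothetical path whose filename contains no underscore (the normal case is already covered by the docstring example). The import path is an assumption and depends on the package layout.

```python
# Hypothetical: the actual import path depends on the package layout.
from helpers import extract_sp2xr_filename_parts

# "single.csv" has no "_" separators, so len(filename_parts) < 2 and the
# fallback defaults (Path.stem and Path.parent.name) are used.
file_name_cut, folder_name = extract_sp2xr_filename_parts("/data/20240101/single.csv")
print(file_name_cut)   # single
print(folder_name)     # 20240101
```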

View File

@@ -9,7 +9,7 @@ import dask.dataframe as dd
 import logging
 from .toolkit_legacy import calculate_delta_sec, extract_datetime
-from .helpers import find_matching_hk_file
+from .helpers import find_matching_hk_file, extract_sp2xr_filename_parts
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -95,13 +95,15 @@ def enrich_sp2xr_dataframe(
df["first_val"] = first_val
df["t0"] = t0
file_name_cut = (
file_path.split("\\")[-1].split("_")[-2]
+ "_"
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
)
file_path_obj = Path(file_path)
filename = file_path_obj.name # Gets just the filename (last part after separator)
filename_parts = filename.split("_")
file_name_cut = f"{filename_parts[-2]}_{file_path_obj.stem.split('_')[-1]}"
folder_name = filename_parts[-2]
df["file"] = file_name_cut
folder_name = file_path.split("\\")[-1].split("_")[-2]
# folder_name = file_path.split("\\")[-1].split("_")[-2]
df["folder_name"] = folder_name
# Ensure Time Stamp is in datetime format (if present)
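For reference, a quick check (assuming the sample path from the helper's docstring) that the inline expressions above produce the same values as the new `extract_sp2xr_filename_parts` helper:

```python
from pathlib import Path

file_path = "/data/SP2XR/20240101/PbP_20240101_001.csv"

# Inline logic from enrich_sp2xr_dataframe
file_path_obj = Path(file_path)
filename_parts = file_path_obj.name.split("_")
inline_cut = f"{filename_parts[-2]}_{file_path_obj.stem.split('_')[-1]}"
inline_folder = filename_parts[-2]

print(inline_cut, inline_folder)   # 20240101_001 20240101
# extract_sp2xr_filename_parts(file_path) returns the same pair:
# ('20240101_001', '20240101')
```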
@@ -132,11 +134,12 @@ def enrich_sp2xr_dataframe(
 def save_sp2xr_parquet(df, file_path, target_directory):
-    fn = (
+    """fn = (
         file_path.split("\\")[-1].split("_")[-2]
         + "_"
         + file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
-    )
+    )"""
+    fn, _ = extract_sp2xr_filename_parts(file_path)
 
     def name(part_idx):
         return f"{fn}.parquet"