fix: correct handling of file path structures across different operating systems

2025-09-09 19:13:14 +02:00
parent f437b1c5fe
commit 6621236ea4
2 changed files with 63 additions and 9 deletions
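For context, a minimal sketch (not part of the commit) of the behavior this change targets: the old code split paths on the hard-coded Windows separator `"\\"`, while `pathlib` extracts the filename using the host OS's own rules. The sample path is taken from the docstring added below.

```python
from pathlib import Path

p = "/data/SP2XR/20240101/PbP_20240101_001.csv"

# Old approach: hard-coded Windows separator. A POSIX path contains no
# backslash, so the "filename" ends up being the entire path string.
print(p.split("\\")[-1])   # /data/SP2XR/20240101/PbP_20240101_001.csv

# Cross-platform approach used by this commit: pathlib applies the host
# OS's separator rules when extracting the final path component.
print(Path(p).name)        # PbP_20240101_001.csv
```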

View File

@@ -455,3 +455,54 @@ def partition_rowcount(ddf: dd.DataFrame) -> int:
         meta=pd.Series(dtype="int64"),
     )
     return int(row_series.sum().compute())
+
+
+def extract_sp2xr_filename_parts(file_path: str | Path) -> tuple[str, str]:
+    """
+    Extract the standardized filename and folder name from an SP2XR file path.
+
+    This function replicates the original logic:
+      - file_path.split("\\")[-1].split("_")[-2]
+      - file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
+    but uses cross-platform path handling.
+
+    Args:
+        file_path: Path to the SP2XR file (string or Path object).
+
+    Returns:
+        tuple: (file_name_cut, folder_name)
+
+    Example:
+        >>> extract_sp2xr_filename_parts("/data/SP2XR/20240101/PbP_20240101_001.csv")
+        ('20240101_001', '20240101')
+    """
+    file_path_obj = Path(file_path)
+    filename_with_ext = file_path_obj.name  # Filename with extension, regardless of OS separator
+    filename_parts = filename_with_ext.split("_")
+
+    if len(filename_parts) >= 2:
+        # Replicate the original logic exactly
+        folder_name = filename_parts[-2]
+
+        # For file_name_cut: take the last part, then strip the extension
+        last_part_with_ext = filename_parts[-1]
+        # Split by "." and take the second-to-last element (removes the extension)
+        ext_parts = last_part_with_ext.split(".")
+        if len(ext_parts) >= 2:
+            last_part = ext_parts[-2]  # Extension removed
+        else:
+            last_part = ext_parts[0]  # No extension to remove
+
+        file_name_cut = f"{folder_name}_{last_part}"
+
+        # Handle the edge case where folder_name or last_part is empty
+        # (e.g. "_.csv".split("_") -> ["", ".csv"]), which would otherwise
+        # yield a malformed file_name_cut such as "_"; fall back to sane defaults.
+        if not folder_name or not last_part:
+            file_name_cut = file_path_obj.stem
+            folder_name = file_path_obj.parent.name
+    else:
+        # Fallback for unexpected filename formats
+        file_name_cut = file_path_obj.stem
+        folder_name = file_path_obj.parent.name
+
+    return file_name_cut, folder_name
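A short usage sketch of the fallback branch added above, using a hypothetical path whose filename contains no underscore (the normal case is already covered by the docstring example). The import path is an assumption and depends on the package layout.

```python
# Hypothetical: the actual import path depends on the package layout.
from helpers import extract_sp2xr_filename_parts

# "single.csv" has no "_" separators, so len(filename_parts) < 2 and the
# fallback defaults (Path.stem and Path.parent.name) are used.
file_name_cut, folder_name = extract_sp2xr_filename_parts("/data/20240101/single.csv")
print(file_name_cut)   # single
print(folder_name)     # 20240101
```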

View File

@@ -9,7 +9,7 @@ import dask.dataframe as dd
 import logging
 from .toolkit_legacy import calculate_delta_sec, extract_datetime
-from .helpers import find_matching_hk_file
+from .helpers import find_matching_hk_file, extract_sp2xr_filename_parts
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -95,13 +95,15 @@ def enrich_sp2xr_dataframe(
df["first_val"] = first_val
df["t0"] = t0
file_name_cut = (
file_path.split("\\")[-1].split("_")[-2]
+ "_"
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
)
file_path_obj = Path(file_path)
filename = file_path_obj.name # Gets just the filename (last part after separator)
filename_parts = filename.split("_")
file_name_cut = f"{filename_parts[-2]}_{file_path_obj.stem.split('_')[-1]}"
folder_name = filename_parts[-2]
df["file"] = file_name_cut
folder_name = file_path.split("\\")[-1].split("_")[-2]
# folder_name = file_path.split("\\")[-1].split("_")[-2]
df["folder_name"] = folder_name
# Ensure Time Stamp is in datetime format (if present)
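For reference, a quick check (assuming the sample path from the helper's docstring) that the inline expressions above produce the same values as the new `extract_sp2xr_filename_parts` helper:

```python
from pathlib import Path

file_path = "/data/SP2XR/20240101/PbP_20240101_001.csv"

# Inline logic from enrich_sp2xr_dataframe
file_path_obj = Path(file_path)
filename_parts = file_path_obj.name.split("_")
inline_cut = f"{filename_parts[-2]}_{file_path_obj.stem.split('_')[-1]}"
inline_folder = filename_parts[-2]

print(inline_cut, inline_folder)   # 20240101_001 20240101
# extract_sp2xr_filename_parts(file_path) returns the same pair:
# ('20240101_001', '20240101')
```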
@@ -132,11 +134,12 @@ def enrich_sp2xr_dataframe(
 def save_sp2xr_parquet(df, file_path, target_directory):
-    fn = (
+    """fn = (
         file_path.split("\\")[-1].split("_")[-2]
         + "_"
         + file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
-    )
+    )"""
+    fn, _ = extract_sp2xr_filename_parts(file_path)
 
     def name(part_idx):
         return f"{fn}.parquet"