fix: correct handling of file path structures in different operating systems
This commit is contained in:
@@ -455,3 +455,54 @@ def partition_rowcount(ddf: dd.DataFrame) -> int:
|
||||
meta=pd.Series(dtype="int64"),
|
||||
)
|
||||
return int(row_series.sum().compute())
|
||||
|
||||
|
||||
def extract_sp2xr_filename_parts(file_path: str | Path) -> tuple[str, str]:
|
||||
"""
|
||||
Extract standardized filename and folder name from SP2XR file path.
|
||||
|
||||
This function replicates the original logic:
|
||||
- file_path.split("\\")[-1].split("_")[-2]
|
||||
- file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
|
||||
|
||||
But uses cross-platform path handling.
|
||||
|
||||
Args:
|
||||
file_path: Path to SP2XR file (string or Path object)
|
||||
|
||||
Returns:
|
||||
tuple: (file_name_cut, folder_name)
|
||||
|
||||
Example:
|
||||
>>> extract_sp2xr_filename_parts("/data/SP2XR/20240101/PbP_20240101_001.csv")
|
||||
("20240101_001", "20240101")
|
||||
"""
|
||||
file_path_obj = Path(file_path)
|
||||
filename_with_ext = file_path_obj.name # Gets filename with extension
|
||||
filename_parts = filename_with_ext.split("_")
|
||||
|
||||
if len(filename_parts) >= 2:
|
||||
# Replicate original logic exactly
|
||||
folder_name = filename_parts[-2]
|
||||
# For file_name_cut: take last part, then remove extension
|
||||
last_part_with_ext = filename_parts[-1]
|
||||
# Split by "." and take second-to-last (removes extension)
|
||||
ext_parts = last_part_with_ext.split(".")
|
||||
if len(ext_parts) >= 2:
|
||||
last_part = ext_parts[-2] # Remove extension
|
||||
else:
|
||||
last_part = ext_parts[0] # No extension to remove
|
||||
|
||||
file_name_cut = f"{folder_name}_{last_part}"
|
||||
|
||||
# Handle edge case where folder_name is empty (e.g., "_.csv" -> ["", "csv"])
|
||||
# This creates malformed results like "_csv", so fall back to reasonable defaults
|
||||
if not folder_name or not last_part:
|
||||
file_name_cut = file_path_obj.stem
|
||||
folder_name = file_path_obj.parent.name
|
||||
else:
|
||||
# Fallback for unexpected filename formats
|
||||
file_name_cut = file_path_obj.stem
|
||||
folder_name = file_path_obj.parent.name
|
||||
|
||||
return file_name_cut, folder_name
|
||||
|
||||
@@ -9,7 +9,7 @@ import dask.dataframe as dd
|
||||
import logging
|
||||
|
||||
from .toolkit_legacy import calculate_delta_sec, extract_datetime
|
||||
from .helpers import find_matching_hk_file
|
||||
from .helpers import find_matching_hk_file, extract_sp2xr_filename_parts
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
@@ -95,13 +95,15 @@ def enrich_sp2xr_dataframe(
|
||||
df["first_val"] = first_val
|
||||
df["t0"] = t0
|
||||
|
||||
file_name_cut = (
|
||||
file_path.split("\\")[-1].split("_")[-2]
|
||||
+ "_"
|
||||
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
|
||||
)
|
||||
file_path_obj = Path(file_path)
|
||||
filename = file_path_obj.name # Gets just the filename (last part after separator)
|
||||
filename_parts = filename.split("_")
|
||||
|
||||
file_name_cut = f"{filename_parts[-2]}_{file_path_obj.stem.split('_')[-1]}"
|
||||
folder_name = filename_parts[-2]
|
||||
|
||||
df["file"] = file_name_cut
|
||||
folder_name = file_path.split("\\")[-1].split("_")[-2]
|
||||
# folder_name = file_path.split("\\")[-1].split("_")[-2]
|
||||
df["folder_name"] = folder_name
|
||||
|
||||
# Ensure Time Stamp is in datetime format (if present)
|
||||
@@ -132,11 +134,12 @@ def enrich_sp2xr_dataframe(
|
||||
|
||||
|
||||
def save_sp2xr_parquet(df, file_path, target_directory):
|
||||
fn = (
|
||||
"""fn = (
|
||||
file_path.split("\\")[-1].split("_")[-2]
|
||||
+ "_"
|
||||
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
|
||||
)
|
||||
)"""
|
||||
fn, _ = extract_sp2xr_filename_parts(file_path)
|
||||
|
||||
def name(part_idx):
|
||||
return f"{fn}.parquet"
|
||||
|
||||
Reference in New Issue
Block a user