diff --git a/src/sp2xr/helpers.py b/src/sp2xr/helpers.py index 7c2d6bd..bbd40e1 100644 --- a/src/sp2xr/helpers.py +++ b/src/sp2xr/helpers.py @@ -455,3 +455,54 @@ def partition_rowcount(ddf: dd.DataFrame) -> int: meta=pd.Series(dtype="int64"), ) return int(row_series.sum().compute()) + + +def extract_sp2xr_filename_parts(file_path: str | Path) -> tuple[str, str]: + """ + Extract standardized filename and folder name from SP2XR file path. + + This function replicates the original logic: + - file_path.split("\\")[-1].split("_")[-2] + - file_path.split("\\")[-1].split("_")[-1].split(".")[-2] + + But uses cross-platform path handling. + + Args: + file_path: Path to SP2XR file (string or Path object) + + Returns: + tuple: (file_name_cut, folder_name) + + Example: + >>> extract_sp2xr_filename_parts("/data/SP2XR/20240101/PbP_20240101_001.csv") + ("20240101_001", "20240101") + """ + file_path_obj = Path(file_path) + filename_with_ext = file_path_obj.name # Gets filename with extension + filename_parts = filename_with_ext.split("_") + + if len(filename_parts) >= 2: + # Replicate original logic exactly + folder_name = filename_parts[-2] + # For file_name_cut: take last part, then remove extension + last_part_with_ext = filename_parts[-1] + # Split by "." and take second-to-last (removes extension) + ext_parts = last_part_with_ext.split(".") + if len(ext_parts) >= 2: + last_part = ext_parts[-2] # Remove extension + else: + last_part = ext_parts[0] # No extension to remove + + file_name_cut = f"{folder_name}_{last_part}" + + # Handle edge case where folder_name is empty (e.g., "_.csv" -> ["", "csv"]) + # This creates malformed results like "_csv", so fall back to reasonable defaults + if not folder_name or not last_part: + file_name_cut = file_path_obj.stem + folder_name = file_path_obj.parent.name + else: + # Fallback for unexpected filename formats + file_name_cut = file_path_obj.stem + folder_name = file_path_obj.parent.name + + return file_name_cut, folder_name diff --git a/src/sp2xr/io.py b/src/sp2xr/io.py index 9e6939f..e131296 100644 --- a/src/sp2xr/io.py +++ b/src/sp2xr/io.py @@ -9,7 +9,7 @@ import dask.dataframe as dd import logging from .toolkit_legacy import calculate_delta_sec, extract_datetime -from .helpers import find_matching_hk_file +from .helpers import find_matching_hk_file, extract_sp2xr_filename_parts logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -95,13 +95,15 @@ def enrich_sp2xr_dataframe( df["first_val"] = first_val df["t0"] = t0 - file_name_cut = ( - file_path.split("\\")[-1].split("_")[-2] - + "_" - + file_path.split("\\")[-1].split("_")[-1].split(".")[-2] - ) + file_path_obj = Path(file_path) + filename = file_path_obj.name # Gets just the filename (last part after separator) + filename_parts = filename.split("_") + + file_name_cut = f"{filename_parts[-2]}_{file_path_obj.stem.split('_')[-1]}" + folder_name = filename_parts[-2] + df["file"] = file_name_cut - folder_name = file_path.split("\\")[-1].split("_")[-2] + # folder_name = file_path.split("\\")[-1].split("_")[-2] df["folder_name"] = folder_name # Ensure Time Stamp is in datetime format (if present) @@ -132,11 +134,12 @@ def enrich_sp2xr_dataframe( def save_sp2xr_parquet(df, file_path, target_directory): - fn = ( + """fn = ( file_path.split("\\")[-1].split("_")[-2] + "_" + file_path.split("\\")[-1].split("_")[-1].split(".")[-2] - ) + )""" + fn, _ = extract_sp2xr_filename_parts(file_path) def name(part_idx): return f"{fn}.parquet"