# Source: SP2XR/tests/test_path_extraction.py
# (repository file-viewer listing: 250 lines, 9.6 KiB, Python)

import pytest
from pathlib import Path
import sys
# Add src to path to import sp2xr modules
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from sp2xr.helpers import extract_sp2xr_filename_parts
class TestExtractSP2XRFilenameParts:
    """Unit tests for ``extract_sp2xr_filename_parts``.

    Each test passes a file path (``str`` or ``pathlib.Path``) to the function
    and checks the returned ``(file_name_cut, folder_name)`` pair against the
    expected values.
    """

    def test_standard_windows_path(self):
        """Test with typical Windows path format."""
        file_path = r"C:\data\SP2XR\20240101\file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_standard_unix_path(self):
        """Test with typical Unix path format."""
        file_path = "/data/SP2XR/20240101/file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_zip_file_extension(self):
        """Test with .zip extension."""
        file_path = "/data/SP2XR/20240101/pbp_20240101_002.zip"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == "20240101_002"
        assert folder_name == "20240101"

    def test_multiple_underscores(self):
        """Test filename with multiple underscores."""
        file_path = "/data/SP2XR/20240101/sp2_data_file_20240101_003.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == "20240101_003"
        assert folder_name == "20240101"

    def test_pathlib_object_input(self):
        """Test with Path object as input."""
        file_path = Path("/data/SP2XR/20240101/hk_20240101_004.csv")
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == "20240101_004"
        assert folder_name == "20240101"

    def test_mixed_path_separators(self):
        """Test handling of mixed path separators."""
        file_path = r"C:\data\SP2XR/20240101\file_20240101_005.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == "20240101_005"
        assert folder_name == "20240101"

    def test_single_underscore_filename(self):
        """Test filename with only one underscore."""
        file_path = "/data/SP2XR/20240101/data_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        # With a single underscore the whole stem is expected as the cut name,
        # and the part before the underscore (not the parent directory) is
        # expected as the folder name.
        assert file_name_cut == "data_001"
        assert folder_name == "data"

    def test_no_underscore_filename(self):
        """Test filename with no underscores - edge case."""
        file_path = "/data/SP2XR/20240101/datafile.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        # Should fallback to stem and parent name
        assert file_name_cut == "datafile"
        assert folder_name == "20240101"

    def test_empty_filename_parts(self):
        """Test edge case with unusual filename structure."""
        file_path = "/data/SP2XR/folder/_.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        # For "_.csv": a pathological case where the stem is a lone underscore.
        # The expected result is the most reasonable interpretation:
        #   file_name_cut = "_"       (the stem of the file)
        #   folder_name   = "folder"  (falls back to the parent directory name)
        assert file_name_cut == "_"
        assert folder_name == "folder"

    @pytest.mark.parametrize("file_extension", [".csv", ".zip", ".parquet", ".txt"])
    def test_different_extensions(self, file_extension):
        """Test with different file extensions."""
        file_path = f"/data/SP2XR/20240101/test_20240101_001{file_extension}"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    # Parametrized (like test_different_extensions) so that every pattern is
    # reported as its own test case and one failure does not hide the others.
    @pytest.mark.parametrize(
        ("file_path", "expected_cut", "expected_folder"),
        [
            ("/data/SP2XR/20240101/PbP_20240101_001.csv", "20240101_001", "20240101"),
            ("/data/SP2XR/20240101/hk_20240101_002.zip", "20240101_002", "20240101"),
            (r"C:\SP2XR\20240101\SP2_20240101_003.csv", "20240101_003", "20240101"),
        ],
        ids=["pbp_csv_unix", "hk_zip_unix", "sp2_csv_windows"],
    )
    def test_real_sp2xr_filename_patterns(
        self, file_path, expected_cut, expected_folder
    ):
        """Test with realistic SP2XR filename patterns."""
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
        assert file_name_cut == expected_cut, f"Failed for {file_path}"
        assert folder_name == expected_folder, f"Failed for {file_path}"
class TestExtractSP2XRFilenamePartsIntegration:
    """Integration tests using existing test files in the repository.

    The tests look for files below the ``data`` directory that sits next to
    this module and are skipped when the directory or the files are missing.
    """

    @pytest.fixture
    def test_data_dir(self):
        """Get the path to the test data directory."""
        # Directory containing this test file, then its "data" sub-directory.
        return Path(__file__).parent / "data"

    @staticmethod
    def _find_files(directory, *patterns):
        """Return the regular files under *directory* matching any glob pattern.

        The result is de-duplicated and sorted:

        * patterns that differ only in letter case (e.g. ``*PbP*`` / ``*pbp*``)
          match the same entries on case-insensitive file systems;
        * glob patterns also match directories, which are not data files;
        * glob order is unspecified, so sorting keeps any ``[:n]`` sample
          stable between runs.
        """
        unique = {
            path
            for pattern in patterns
            for path in directory.glob(pattern)
            if path.is_file()
        }
        return sorted(unique)

    def test_with_real_pbp_files(self, test_data_dir):
        """Test function with actual PbP test files."""
        pbp_dir = test_data_dir / "pbp_files_test"
        if not pbp_dir.exists():
            pytest.skip(f"Test data directory not found: {pbp_dir}")

        # Find actual PbP files in the test directory
        pbp_files = self._find_files(pbp_dir, "**/*PbP*", "**/*pbp*")
        if not pbp_files:
            pytest.skip("No PbP test files found")

        for pbp_file in pbp_files[:3]:  # Test first 3 files to avoid long test times
            file_name_cut, folder_name = extract_sp2xr_filename_parts(pbp_file)
            # Verify the function doesn't crash and returns reasonable values
            assert isinstance(file_name_cut, str)
            assert isinstance(folder_name, str)
            assert len(file_name_cut) > 0
            assert len(folder_name) > 0
            print(
                f"PbP File: {pbp_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_real_hk_files(self, test_data_dir):
        """Test function with actual HK test files."""
        hk_dir = test_data_dir / "hk_files_test"
        if not hk_dir.exists():
            pytest.skip(f"Test data directory not found: {hk_dir}")

        # Find actual HK files in the test directory
        hk_files = self._find_files(hk_dir, "**/*hk*", "**/*HK*")
        if not hk_files:
            pytest.skip("No HK test files found")

        for hk_file in hk_files[:3]:  # Test first 3 files
            file_name_cut, folder_name = extract_sp2xr_filename_parts(hk_file)
            # Verify the function doesn't crash and returns reasonable values
            assert isinstance(file_name_cut, str)
            assert isinstance(folder_name, str)
            assert len(file_name_cut) > 0
            assert len(folder_name) > 0
            print(
                f"HK File: {hk_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_all_test_files(self, test_data_dir):
        """Test function with any CSV/ZIP files in test data."""
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Find all CSV and ZIP files in test data
        test_files = self._find_files(test_data_dir, "**/*.csv", "**/*.zip")
        if not test_files:
            pytest.skip("No test files found")

        successful_extractions = 0
        for test_file in test_files:
            # Deliberately best-effort: any error for a single file (including
            # a failed type assertion) is reported on stdout but does not fail
            # the test. Only the final success count is enforced.
            try:
                file_name_cut, folder_name = extract_sp2xr_filename_parts(test_file)
                # Basic validation
                assert isinstance(file_name_cut, str)
                assert isinstance(folder_name, str)
                successful_extractions += 1
                print(f"{test_file.name} -> {file_name_cut} | {folder_name}")
            except Exception as e:
                print(f"✗ Failed on {test_file.name}: {e}")
                # Don't fail the test, just report issues

        # Ensure we processed at least some files successfully
        assert successful_extractions > 0, "No files were processed successfully"
        print(
            f"\nSuccessfully processed {successful_extractions}/{len(test_files)} files"
        )
class TestRealFilenamePatterns:
    """Report which filename layouts occur in the bundled test data."""

    @pytest.fixture
    def test_data_dir(self):
        """Return the ``data`` directory located next to this test module."""
        return Path(__file__).parent / "data"

    def test_analyze_filename_patterns(self, test_data_dir):
        """Group CSV/ZIP filenames by their underscore-split layout and print them.

        Purely informational: the test makes no assertions. It is skipped when
        the data directory does not exist.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Map "<n> parts: a | b | c..." labels to the filenames that produce them.
        grouped = {}
        for candidate in test_data_dir.glob("**/*.*"):
            if candidate.suffix not in (".csv", ".zip"):
                continue
            parts = candidate.name.split("_")
            label = f"{len(parts)} parts: {' | '.join(parts[:3])}{'...' if len(parts) > 3 else ''}"
            grouped.setdefault(label, []).append(candidate.name)

        print("\nFilename patterns found:")
        for label, names in grouped.items():
            print(f"\n{label}")
            # Show at most three examples per layout, then a remainder count.
            for name in names[:3]:
                print(f" - {name}")
            remaining = len(names) - 3
            if remaining > 0:
                print(f" ... and {remaining} more")
if __name__ == "__main__":
    # Allow running the test file directly. Pass pytest's exit code on to the
    # shell so that a failing run does not exit with status 0.
    sys.exit(pytest.main([__file__, "-v", "-s"]))