"""Tests for ``sp2xr.helpers.extract_sp2xr_filename_parts``."""
import pytest
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Add src to path to import sp2xr modules
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from sp2xr.helpers import extract_sp2xr_filename_parts
|
|
|
|
|
|
class TestExtractSP2XRFilenameParts:
    """Unit tests for ``extract_sp2xr_filename_parts`` on hard-coded paths.

    Each test passes a path (a string or a ``Path``) to the function and
    checks the returned ``(file_name_cut, folder_name)`` pair against the
    expected values.
    """

    def test_standard_windows_path(self):
        """Test with typical Windows path format (backslash separators)."""
        file_path = r"C:\data\SP2XR\20240101\file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_standard_unix_path(self):
        """Test with typical Unix path format (forward-slash separators)."""
        file_path = "/data/SP2XR/20240101/file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_zip_file_extension(self):
        """Test with .zip extension."""
        file_path = "/data/SP2XR/20240101/pbp_20240101_002.zip"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_002"
        assert folder_name == "20240101"

    def test_multiple_underscores(self):
        """Test filename whose prefix itself contains underscores."""
        file_path = "/data/SP2XR/20240101/sp2_data_file_20240101_003.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_003"
        assert folder_name == "20240101"

    def test_pathlib_object_input(self):
        """Test with a ``Path`` object as input instead of a string."""
        file_path = Path("/data/SP2XR/20240101/hk_20240101_004.csv")
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_004"
        assert folder_name == "20240101"

    def test_mixed_path_separators(self):
        """Test handling of a path mixing backslashes and forward slashes."""
        file_path = r"C:\data\SP2XR/20240101\file_20240101_005.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_005"
        assert folder_name == "20240101"

    def test_single_underscore_filename(self):
        """Test filename with only one underscore."""
        file_path = "/data/SP2XR/20240101/data_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # With a single underscore the whole stem is expected as the cut
        # name, and the token before the underscore as the folder name.
        assert file_name_cut == "data_001"
        assert folder_name == "data"

    def test_no_underscore_filename(self):
        """Test filename with no underscores - edge case."""
        file_path = "/data/SP2XR/20240101/datafile.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # Should fall back to the stem and the parent directory name.
        assert file_name_cut == "datafile"
        assert folder_name == "20240101"

    def test_empty_filename_parts(self):
        """Test edge case with unusual filename structure."""
        file_path = "/data/SP2XR/folder/_.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # "_.csv" is a pathological case that the original code could not
        # handle well. The expected result is the most reasonable
        # interpretation:
        #   file_name_cut = "_"      (the stem of the file)
        #   folder_name   = "folder" (falls back to the parent directory)
        assert file_name_cut == "_"
        assert folder_name == "folder"  # Falls back to parent directory name

    @pytest.mark.parametrize("file_extension", [".csv", ".zip", ".parquet", ".txt"])
    def test_different_extensions(self, file_extension):
        """Test that the expected parts do not depend on the file extension."""
        file_path = f"/data/SP2XR/20240101/test_20240101_001{file_extension}"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    # Parametrized (rather than looped inside one test) so that every pattern
    # is reported independently and one failure does not hide the others.
    @pytest.mark.parametrize(
        ("file_path", "expected_cut", "expected_folder"),
        [
            ("/data/SP2XR/20240101/PbP_20240101_001.csv", "20240101_001", "20240101"),
            ("/data/SP2XR/20240101/hk_20240101_002.zip", "20240101_002", "20240101"),
            (r"C:\SP2XR\20240101\SP2_20240101_003.csv", "20240101_003", "20240101"),
        ],
    )
    def test_real_sp2xr_filename_patterns(
        self, file_path, expected_cut, expected_folder
    ):
        """Test with realistic SP2XR filename patterns."""
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == expected_cut, f"Failed for {file_path}"
        assert folder_name == expected_folder, f"Failed for {file_path}"
|
|
|
|
|
|
class TestExtractSP2XRFilenamePartsIntegration:
    """Integration tests using existing test files in the repository.

    Every test skips (instead of failing) when the expected data directory
    or matching files are not present.
    """

    @pytest.fixture
    def test_data_dir(self):
        """Get the path to the test data directory."""
        # The data directory sits next to this test file.
        return Path(__file__).parent / "data"

    @staticmethod
    def _find_files(directory, patterns):
        """Return the sorted, de-duplicated regular files matching any pattern.

        Args:
            directory: ``Path`` to search in.
            patterns: Iterable of glob patterns relative to ``directory``.

        Returns:
            A sorted list of ``Path`` objects. Collecting into a set avoids
            duplicates when case-variant patterns (e.g. ``*hk*`` / ``*HK*``)
            both match the same file on a case-insensitive filesystem, and
            sorting makes any ``[:n]`` slice of the result deterministic.
        """
        matches = set()
        for pattern in patterns:
            matches.update(p for p in directory.glob(pattern) if p.is_file())
        return sorted(matches)

    @staticmethod
    def _check_non_empty_parts(file_path, label):
        """Assert that both extracted parts are non-empty strings, then print them."""
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # Verify the function doesn't crash and returns reasonable values
        assert isinstance(file_name_cut, str)
        assert isinstance(folder_name, str)
        assert len(file_name_cut) > 0
        assert len(folder_name) > 0

        print(
            f"{label} File: {file_path.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
        )

    def test_with_real_pbp_files(self, test_data_dir):
        """Test function with actual PbP test files."""
        pbp_dir = test_data_dir / "pbp_files_test"

        if not pbp_dir.exists():
            pytest.skip(f"Test data directory not found: {pbp_dir}")

        # Find actual PbP files in the test directory
        pbp_files = self._find_files(pbp_dir, ("**/*PbP*", "**/*pbp*"))

        if not pbp_files:
            pytest.skip("No PbP test files found")

        for pbp_file in pbp_files[:3]:  # Test first 3 files to avoid long test times
            self._check_non_empty_parts(pbp_file, "PbP")

    def test_with_real_hk_files(self, test_data_dir):
        """Test function with actual HK test files."""
        hk_dir = test_data_dir / "hk_files_test"

        if not hk_dir.exists():
            pytest.skip(f"Test data directory not found: {hk_dir}")

        # Find actual HK files in the test directory
        hk_files = self._find_files(hk_dir, ("**/*hk*", "**/*HK*"))

        if not hk_files:
            pytest.skip("No HK test files found")

        for hk_file in hk_files[:3]:  # Test first 3 files
            self._check_non_empty_parts(hk_file, "HK")

    def test_with_all_test_files(self, test_data_dir):
        """Test function with any CSV/ZIP files in test data.

        Per-file problems are printed rather than raised; the test only
        fails when not a single file could be processed.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Find all CSV and ZIP files in test data
        test_files = self._find_files(test_data_dir, ("**/*.csv", "**/*.zip"))

        if not test_files:
            pytest.skip("No test files found")

        successful_extractions = 0

        for test_file in test_files:
            # Deliberately broad: any error on a single file (including a
            # failed type assertion below) is reported, not propagated.
            try:
                file_name_cut, folder_name = extract_sp2xr_filename_parts(test_file)

                # Basic validation
                assert isinstance(file_name_cut, str)
                assert isinstance(folder_name, str)

                successful_extractions += 1
                print(f"✓ {test_file.name} -> {file_name_cut} | {folder_name}")

            except Exception as e:
                print(f"✗ Failed on {test_file.name}: {e}")
                # Don't fail the test, just report issues

        # Ensure we processed at least some files successfully
        assert successful_extractions > 0, "No files were processed successfully"
        print(
            f"\nSuccessfully processed {successful_extractions}/{len(test_files)} files"
        )
|
|
|
|
|
|
class TestRealFilenamePatterns:
    """Report on the filename shapes present in the test data directory."""

    @pytest.fixture
    def test_data_dir(self):
        """Provide the ``data`` directory that sits beside this test module."""
        return Path(__file__).parent / "data"

    def test_analyze_filename_patterns(self, test_data_dir):
        """Group CSV/ZIP filenames by underscore-split shape and print a summary.

        The test makes no assertions; it skips when the data directory is
        missing and otherwise only prints what it finds.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        grouped = {}
        for candidate in test_data_dir.glob("**/*.*"):
            # Only CSV and ZIP files take part in the analysis.
            if candidate.suffix not in (".csv", ".zip"):
                continue

            name = candidate.name
            tokens = name.split("_")
            # The key shows the token count and at most the first three tokens.
            ellipsis = "..." if len(tokens) > 3 else ""
            key = f"{len(tokens)} parts: {' | '.join(tokens[:3])}{ellipsis}"
            grouped.setdefault(key, []).append(name)

        print("\nFilename patterns found:")
        for key, names in grouped.items():
            print(f"\n{key}")
            for shown in names[:3]:  # Show first 3 examples
                print(f" - {shown}")
            hidden = len(names) - 3
            if hidden > 0:
                print(f" ... and {hidden} more")
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running the test file directly. pytest.main() returns the exit
    # status of the run; raising SystemExit with it makes a failing run
    # visible to the calling shell instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))
|