test: add test for path extraction from file directory
This commit is contained in:
249
tests/test_path_extraction.py
Normal file
249
tests/test_path_extraction.py
Normal file
@@ -0,0 +1,249 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
# Add src to path to import sp2xr modules
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from sp2xr.helpers import extract_sp2xr_filename_parts
|
||||
|
||||
|
||||
class TestExtractSP2XRFilenameParts:
    """Unit tests for ``extract_sp2xr_filename_parts`` on hand-written paths.

    Each test passes a single path (a string or a ``Path``) to the function
    and checks the two values it returns: the shortened file name and the
    folder name.
    """

    def test_standard_windows_path(self):
        """A backslash-separated Windows path is split as expected."""
        windows_path = r"C:\data\SP2XR\20240101\file_20240101_001.csv"
        name_cut, folder = extract_sp2xr_filename_parts(windows_path)

        assert name_cut == "20240101_001"
        assert folder == "20240101"

    def test_standard_unix_path(self):
        """A forward-slash Unix path is split as expected."""
        unix_path = "/data/SP2XR/20240101/file_20240101_001.csv"
        name_cut, folder = extract_sp2xr_filename_parts(unix_path)

        assert name_cut == "20240101_001"
        assert folder == "20240101"

    def test_zip_file_extension(self):
        """A ``.zip`` file gives the same kind of result as a ``.csv`` file."""
        zip_path = "/data/SP2XR/20240101/pbp_20240101_002.zip"
        name_cut, folder = extract_sp2xr_filename_parts(zip_path)

        assert name_cut == "20240101_002"
        assert folder == "20240101"

    def test_multiple_underscores(self):
        """Extra underscore-separated prefixes in the file name are dropped."""
        long_name_path = "/data/SP2XR/20240101/sp2_data_file_20240101_003.csv"
        name_cut, folder = extract_sp2xr_filename_parts(long_name_path)

        assert name_cut == "20240101_003"
        assert folder == "20240101"

    def test_pathlib_object_input(self):
        """A ``pathlib.Path`` argument is accepted in place of a string."""
        path_obj = Path("/data/SP2XR/20240101/hk_20240101_004.csv")
        name_cut, folder = extract_sp2xr_filename_parts(path_obj)

        assert name_cut == "20240101_004"
        assert folder == "20240101"

    def test_mixed_path_separators(self):
        """A path mixing backslashes and forward slashes is still parsed."""
        mixed_path = r"C:\data\SP2XR/20240101\file_20240101_005.csv"
        name_cut, folder = extract_sp2xr_filename_parts(mixed_path)

        assert name_cut == "20240101_005"
        assert folder == "20240101"

    def test_single_underscore_filename(self):
        """A file name containing exactly one underscore."""
        one_underscore_path = "/data/SP2XR/20240101/data_001.csv"
        name_cut, folder = extract_sp2xr_filename_parts(one_underscore_path)

        # With only two underscore-separated parts, the expected cut is the
        # whole stem and the expected folder is the part before the underscore.
        assert name_cut == "data_001"
        assert folder == "data"

    def test_no_underscore_filename(self):
        """Edge case: a file name without any underscore."""
        plain_name_path = "/data/SP2XR/20240101/datafile.csv"
        name_cut, folder = extract_sp2xr_filename_parts(plain_name_path)

        # Expected fallback: the file stem and the parent directory name.
        assert name_cut == "datafile"
        assert folder == "20240101"

    def test_empty_filename_parts(self):
        """Edge case: a file whose stem is a lone underscore."""
        odd_path = "/data/SP2XR/folder/_.csv"
        name_cut, folder = extract_sp2xr_filename_parts(odd_path)

        # "_.csv" is a pathological name that the original code could not
        # handle well. The most reasonable interpretation is expected here:
        #   name_cut -> "_"       (the stem of the file)
        #   folder   -> "folder"  (falls back to the parent directory name)
        assert name_cut == "_"
        assert folder == "folder"

    @pytest.mark.parametrize("file_extension", [".csv", ".zip", ".parquet", ".txt"])
    def test_different_extensions(self, file_extension):
        """The result does not depend on the file extension."""
        file_path = f"/data/SP2XR/20240101/test_20240101_001{file_extension}"
        name_cut, folder = extract_sp2xr_filename_parts(file_path)

        assert name_cut == "20240101_001"
        assert folder == "20240101"

    def test_real_sp2xr_filename_patterns(self):
        """Several realistic SP2XR file names, checked in one loop."""
        # Each entry is (input path, expected cut, expected folder).
        test_cases = (
            ("/data/SP2XR/20240101/PbP_20240101_001.csv", "20240101_001", "20240101"),
            ("/data/SP2XR/20240101/hk_20240101_002.zip", "20240101_002", "20240101"),
            (r"C:\SP2XR\20240101\SP2_20240101_003.csv", "20240101_003", "20240101"),
        )

        for file_path, expected_cut, expected_folder in test_cases:
            name_cut, folder = extract_sp2xr_filename_parts(file_path)
            # The assertion message names the failing path because all cases
            # share a single test.
            assert name_cut == expected_cut, f"Failed for {file_path}"
            assert folder == expected_folder, f"Failed for {file_path}"
|
||||
|
||||
|
||||
class TestExtractSP2XRFilenamePartsIntegration:
    """Integration tests that run the function on files shipped with the tests.

    Each test is skipped when its data directory is absent or contains no
    matching files.
    """

    @pytest.fixture
    def test_data_dir(self):
        """Return the ``data`` directory located next to this test module."""
        here = Path(__file__).parent
        return here / "data"

    def test_with_real_pbp_files(self, test_data_dir):
        """Run the function on up to three PbP entries from the test data."""
        pbp_dir = test_data_dir / "pbp_files_test"
        if not pbp_dir.exists():
            pytest.skip(f"Test data directory not found: {pbp_dir}")

        # Collect matches for both spellings, mixed-case first.
        pbp_files = [*pbp_dir.glob("**/*PbP*"), *pbp_dir.glob("**/*pbp*")]
        if not pbp_files:
            pytest.skip("No PbP test files found")

        # Only the first three matches are checked to keep the test short.
        for pbp_file in pbp_files[:3]:
            file_name_cut, folder_name = extract_sp2xr_filename_parts(pbp_file)

            # Sanity checks only: both results must be non-empty strings.
            assert isinstance(file_name_cut, str)
            assert isinstance(folder_name, str)
            assert len(file_name_cut) > 0
            assert len(folder_name) > 0

            print(
                f"PbP File: {pbp_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_real_hk_files(self, test_data_dir):
        """Run the function on up to three HK entries from the test data."""
        hk_dir = test_data_dir / "hk_files_test"
        if not hk_dir.exists():
            pytest.skip(f"Test data directory not found: {hk_dir}")

        # Collect matches for both spellings, lower-case first.
        hk_files = [*hk_dir.glob("**/*hk*"), *hk_dir.glob("**/*HK*")]
        if not hk_files:
            pytest.skip("No HK test files found")

        # Only the first three matches are checked.
        for hk_file in hk_files[:3]:
            file_name_cut, folder_name = extract_sp2xr_filename_parts(hk_file)

            # Sanity checks only: both results must be non-empty strings.
            assert isinstance(file_name_cut, str)
            assert isinstance(folder_name, str)
            assert len(file_name_cut) > 0
            assert len(folder_name) > 0

            print(
                f"HK File: {hk_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_all_test_files(self, test_data_dir):
        """Run the function on every CSV and ZIP file below the data directory.

        Per-file failures are printed rather than raised; the test only fails
        when not a single file could be processed.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # CSV files first, then ZIP files.
        test_files = [
            *test_data_dir.glob("**/*.csv"),
            *test_data_dir.glob("**/*.zip"),
        ]
        if not test_files:
            pytest.skip("No test files found")

        successful_extractions = 0
        for test_file in test_files:
            try:
                file_name_cut, folder_name = extract_sp2xr_filename_parts(test_file)

                # Basic validation of the returned types.
                assert isinstance(file_name_cut, str)
                assert isinstance(folder_name, str)

                successful_extractions += 1
                print(f"✓ {test_file.name} -> {file_name_cut} | {folder_name}")
            except Exception as e:
                # Deliberately best-effort: report the problem (including a
                # failed assertion above) and move on to the next file.
                print(f"✗ Failed on {test_file.name}: {e}")

        # At least one file must have gone through cleanly.
        assert successful_extractions > 0, "No files were processed successfully"
        print(
            f"\nSuccessfully processed {successful_extractions}/{len(test_files)} files"
        )
|
||||
|
||||
|
||||
class TestRealFilenamePatterns:
    """Report the underscore-based naming patterns present in the test data."""

    @pytest.fixture
    def test_data_dir(self):
        """Return the ``data`` directory located next to this test module."""
        here = Path(__file__).parent
        return here / "data"

    def test_analyze_filename_patterns(self, test_data_dir):
        """Group CSV/ZIP file names by underscore pattern and print a summary.

        This test makes no assertions; it is skipped when the data directory
        does not exist.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Maps a pattern description to the file names that share it.
        patterns = {}

        for file_path in test_data_dir.glob("**/*.*"):
            # Only CSV and ZIP files are analysed.
            if file_path.suffix not in (".csv", ".zip"):
                continue

            filename = file_path.name
            parts = filename.split("_")
            # Description: number of parts plus the first three parts, with
            # an ellipsis when there are more.
            pattern = f"{len(parts)} parts: {' | '.join(parts[:3])}{'...' if len(parts) > 3 else ''}"
            patterns.setdefault(pattern, []).append(filename)

        print("\nFilename patterns found:")
        for pattern, files in patterns.items():
            print(f"\n{pattern}")
            # Show at most three example names per pattern.
            for file in files[:3]:
                print(f" - {file}")
            if len(files) > 3:
                print(f" ... and {len(files) - 3} more")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Running this module as a script hands it to pytest: "-v" enables
    # verbose output and "-s" disables output capturing so the print()
    # calls in the tests are shown.
    pytest_args = [__file__, "-v", "-s"]
    pytest.main(pytest_args)
|
||||
Reference in New Issue
Block a user