# SP2XR/tests/test_path_extraction.py

import pytest
from pathlib import Path
import sys

# Add src to path to import sp2xr modules
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from sp2xr.helpers import extract_sp2xr_filename_parts


class TestExtractSP2XRFilenameParts:
    """Unit tests for ``extract_sp2xr_filename_parts``.

    Every test passes a path (``str`` or ``Path``) to the function and checks
    the ``(file_name_cut, folder_name)`` pair it returns. The paths are purely
    synthetic; nothing here touches the filesystem.
    """

    def test_standard_windows_path(self):
        """A backslash-separated Windows path yields the expected parts."""
        file_path = r"C:\data\SP2XR\20240101\file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_standard_unix_path(self):
        """A forward-slash-separated Unix path yields the expected parts."""
        file_path = "/data/SP2XR/20240101/file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_zip_file_extension(self):
        """A ``.zip`` file is handled the same way as a ``.csv`` file."""
        file_path = "/data/SP2XR/20240101/pbp_20240101_002.zip"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_002"
        assert folder_name == "20240101"

    def test_multiple_underscores(self):
        """Extra underscore-separated prefix parts do not affect the result."""
        file_path = "/data/SP2XR/20240101/sp2_data_file_20240101_003.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_003"
        assert folder_name == "20240101"

    def test_pathlib_object_input(self):
        """A ``pathlib.Path`` input is accepted as well as a plain string."""
        file_path = Path("/data/SP2XR/20240101/hk_20240101_004.csv")
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_004"
        assert folder_name == "20240101"

    def test_mixed_path_separators(self):
        """A path mixing ``\\`` and ``/`` separators yields the expected parts."""
        file_path = r"C:\data\SP2XR/20240101\file_20240101_005.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_005"
        assert folder_name == "20240101"

    def test_single_underscore_filename(self):
        """A stem with exactly one underscore (two parts) is still split."""
        file_path = "/data/SP2XR/20240101/data_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # With only two underscore-separated parts the whole stem is the cut
        # name, and the first part ("data") is reported as the folder name --
        # not the parent directory ("20240101").
        assert file_name_cut == "data_001"
        assert folder_name == "data"

    def test_no_underscore_filename(self):
        """A stem without underscores falls back to stem and parent name."""
        file_path = "/data/SP2XR/20240101/datafile.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # Should fall back to the stem and the parent directory name
        assert file_name_cut == "datafile"
        assert folder_name == "20240101"

    def test_empty_filename_parts(self):
        """A stem consisting only of ``_`` falls back to stem and parent name."""
        file_path = "/data/SP2XR/folder/_.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # For "_.csv": this is a pathological case that the original code
        # couldn't handle well. The function returns the most reasonable
        # interpretation:
        #   file_name_cut = "_"      (the stem of the file)
        #   folder_name   = "folder" (falls back to the parent directory name)
        assert file_name_cut == "_"
        assert folder_name == "folder"

    @pytest.mark.parametrize("file_extension", [".csv", ".zip", ".parquet", ".txt"])
    def test_different_extensions(self, file_extension):
        """The extracted parts do not depend on the file extension."""
        file_path = f"/data/SP2XR/20240101/test_20240101_001{file_extension}"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    # Parametrized (rather than looping inside one test) so that every pattern
    # is reported individually and one failure does not hide the others.
    @pytest.mark.parametrize(
        ("file_path", "expected_cut", "expected_folder"),
        [
            ("/data/SP2XR/20240101/PbP_20240101_001.csv", "20240101_001", "20240101"),
            ("/data/SP2XR/20240101/hk_20240101_002.zip", "20240101_002", "20240101"),
            (r"C:\SP2XR\20240101\SP2_20240101_003.csv", "20240101_003", "20240101"),
        ],
    )
    def test_real_sp2xr_filename_patterns(
        self, file_path, expected_cut, expected_folder
    ):
        """Realistic SP2XR file names (PbP, hk, SP2 prefixes) are parsed."""
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == expected_cut, f"Failed for {file_path}"
        assert folder_name == expected_folder, f"Failed for {file_path}"


class TestExtractSP2XRFilenamePartsIntegration:
    """Integration tests using existing test files in the repository.

    The tests look for files under the ``data`` directory next to this module
    and are skipped when the directory or matching files are missing.
    """

    @pytest.fixture
    def test_data_dir(self):
        """Return the ``data`` directory located next to this test file."""
        return Path(__file__).parent / "data"

    @staticmethod
    def _find_files(directory, patterns):
        """Return the sorted, de-duplicated regular files matching *patterns*.

        Args:
            directory: ``Path`` of the directory to search.
            patterns: Iterable of glob patterns passed to ``Path.glob``.

        Returns:
            A sorted list of ``Path`` objects. Sorting makes any ``[:n]``
            sample reproducible; the set removes duplicates that appear when
            case-variant patterns (e.g. ``*PbP*`` and ``*pbp*``) both match
            the same entry; ``is_file()`` drops matching directories.
        """
        matches = set()
        for pattern in patterns:
            matches.update(p for p in directory.glob(pattern) if p.is_file())
        return sorted(matches)

    @staticmethod
    def _assert_non_empty_parts(file_name_cut, folder_name):
        """Assert that both extracted parts are non-empty strings."""
        assert isinstance(file_name_cut, str)
        assert isinstance(folder_name, str)
        assert len(file_name_cut) > 0
        assert len(folder_name) > 0

    def test_with_real_pbp_files(self, test_data_dir):
        """Test function with actual PbP test files."""
        pbp_dir = test_data_dir / "pbp_files_test"

        if not pbp_dir.exists():
            pytest.skip(f"Test data directory not found: {pbp_dir}")

        # Find actual PbP files in the test directory
        pbp_files = self._find_files(pbp_dir, ("**/*PbP*", "**/*pbp*"))

        if not pbp_files:
            pytest.skip("No PbP test files found")

        for pbp_file in pbp_files[:3]:  # Test first 3 files to avoid long test times
            file_name_cut, folder_name = extract_sp2xr_filename_parts(pbp_file)

            # Verify the function doesn't crash and returns reasonable values
            self._assert_non_empty_parts(file_name_cut, folder_name)

            print(
                f"PbP File: {pbp_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_real_hk_files(self, test_data_dir):
        """Test function with actual HK test files."""
        hk_dir = test_data_dir / "hk_files_test"

        if not hk_dir.exists():
            pytest.skip(f"Test data directory not found: {hk_dir}")

        # Find actual HK files in the test directory
        hk_files = self._find_files(hk_dir, ("**/*hk*", "**/*HK*"))

        if not hk_files:
            pytest.skip("No HK test files found")

        for hk_file in hk_files[:3]:  # Test first 3 files
            file_name_cut, folder_name = extract_sp2xr_filename_parts(hk_file)

            # Verify the function doesn't crash and returns reasonable values
            self._assert_non_empty_parts(file_name_cut, folder_name)

            print(
                f"HK File: {hk_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_all_test_files(self, test_data_dir):
        """Test function with any CSV/ZIP files in test data.

        This is a best-effort sweep: a failure on an individual file is
        printed but does not fail the test. The test only fails when not a
        single file could be processed.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Find all CSV and ZIP files in test data (sorted for stable output)
        test_files = self._find_files(test_data_dir, ("**/*.csv", "**/*.zip"))

        if not test_files:
            pytest.skip("No test files found")

        successful_extractions = 0

        for test_file in test_files:
            # Deliberately broad: both exceptions raised by the function and
            # failed type checks (AssertionError) are reported, not raised.
            try:
                file_name_cut, folder_name = extract_sp2xr_filename_parts(test_file)

                # Basic validation
                assert isinstance(file_name_cut, str)
                assert isinstance(folder_name, str)
            except Exception as e:
                print(f"✗ Failed on {test_file.name}: {e}")
                # Don't fail the test, just report issues
            else:
                successful_extractions += 1
                print(f"✓ {test_file.name} -> {file_name_cut} | {folder_name}")

        # Ensure we processed at least some files successfully
        assert successful_extractions > 0, "No files were processed successfully"
        print(
            f"\nSuccessfully processed {successful_extractions}/{len(test_files)} files"
        )


class TestRealFilenamePatterns:
    """Report the filename patterns that occur in the test data.

    This class makes no assertions about the patterns; it only prints a
    summary (visible when pytest runs with ``-s``).
    """

    @pytest.fixture
    def test_data_dir(self):
        """Return the ``data`` directory located next to this test file."""
        return Path(__file__).parent.joinpath("data")

    def test_analyze_filename_patterns(self, test_data_dir):
        """Group CSV/ZIP file names by underscore-split shape and print them."""
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Maps a pattern label to the file names exhibiting it, in the order
        # the files were discovered.
        grouped = {}

        for candidate in list(test_data_dir.glob("**/*.*")):
            if candidate.suffix not in (".csv", ".zip"):
                continue

            name = candidate.name
            pieces = name.split("_")
            # Label = number of parts plus a preview of the first three.
            preview = " | ".join(pieces[:3])
            ellipsis = "..." if len(pieces) > 3 else ""
            label = f"{len(pieces)} parts: {preview}{ellipsis}"

            grouped.setdefault(label, []).append(name)

        print("\nFilename patterns found:")
        for label, names in grouped.items():
            print(f"\n{label}")
            # Show at most three examples per pattern
            for example in names[:3]:
                print(f"  - {example}")
            remaining = len(names) - 3
            if remaining > 0:
                print(f"  ... and {remaining} more")


if __name__ == "__main__":
    # Allow running the test file directly. pytest.main() returns the exit
    # code of the run; pass it to sys.exit() so that failing tests produce a
    # non-zero process exit status instead of always exiting with 0.
    sys.exit(pytest.main([__file__, "-v", "-s"]))