test: add test for path extraction from file directory

2025-09-09 19:14:30 +02:00
parent 6621236ea4
commit 641871a567
1 changed files with 249 additions and 0 deletions
--- a/tests/test_path_extraction.py
+++ b/tests/test_path_extraction.py
@@ -0,0 +1,249 @@
+import pytest
+from pathlib import Path
+import sys
+
+# Add src to path to import sp2xr modules
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from sp2xr.helpers import extract_sp2xr_filename_parts
+
+
+class TestExtractSP2XRFilenameParts:
+    """Unit tests for ``extract_sp2xr_filename_parts`` on synthetic path inputs."""
+
+    def test_standard_windows_path(self):
+        """Test with typical Windows path format."""
+        file_path = r"C:\data\SP2XR\20240101\file_20240101_001.csv"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        assert file_name_cut == "20240101_001"
+        assert folder_name == "20240101"
+
+    def test_standard_unix_path(self):
+        """Test with typical Unix path format."""
+        file_path = "/data/SP2XR/20240101/file_20240101_001.csv"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        assert file_name_cut == "20240101_001"
+        assert folder_name == "20240101"
+
+    def test_zip_file_extension(self):
+        """Test with .zip extension."""
+        file_path = "/data/SP2XR/20240101/pbp_20240101_002.zip"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        assert file_name_cut == "20240101_002"
+        assert folder_name == "20240101"
+
+    def test_multiple_underscores(self):
+        """Test filename with multiple underscores."""
+        file_path = "/data/SP2XR/20240101/sp2_data_file_20240101_003.csv"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        assert file_name_cut == "20240101_003"
+        assert folder_name == "20240101"
+
+    def test_pathlib_object_input(self):
+        """Test with Path object as input."""
+        file_path = Path("/data/SP2XR/20240101/hk_20240101_004.csv")
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        assert file_name_cut == "20240101_004"
+        assert folder_name == "20240101"
+
+    def test_mixed_path_separators(self):
+        """Test handling of mixed path separators."""
+        file_path = r"C:\data\SP2XR/20240101\file_20240101_005.csv"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        assert file_name_cut == "20240101_005"
+        assert folder_name == "20240101"
+
+    def test_single_underscore_filename(self):
+        """Test filename with only one underscore."""
+        file_path = "/data/SP2XR/20240101/data_001.csv"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        # Whole stem is the cut; the part before the underscore is the folder name
+        assert file_name_cut == "data_001"
+        assert folder_name == "data"
+
+    def test_no_underscore_filename(self):
+        """Test filename with no underscores - edge case."""
+        file_path = "/data/SP2XR/20240101/datafile.csv"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        # Should fallback to stem and parent name
+        assert file_name_cut == "datafile"
+        assert folder_name == "20240101"
+
+    def test_empty_filename_parts(self):
+        """Test edge case with unusual filename structure."""
+        file_path = "/data/SP2XR/folder/_.csv"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        # "_.csv" is a pathological case that the original code couldn't handle well.
+        # The function returns the most reasonable interpretation:
+        # file_name_cut = "_" (the stem of the file)
+        # folder_name = "folder" (falls back to parent directory)
+        assert file_name_cut == "_"
+        assert folder_name == "folder"  # Falls back to parent directory name
+
+    @pytest.mark.parametrize("file_extension", [".csv", ".zip", ".parquet", ".txt"])
+    def test_different_extensions(self, file_extension):
+        """Test with different file extensions."""
+        file_path = f"/data/SP2XR/20240101/test_20240101_001{file_extension}"
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)
+
+        assert file_name_cut == "20240101_001"
+        assert folder_name == "20240101"
+
+    @pytest.mark.parametrize(
+        ("path", "cut", "folder"),
+        [
+            ("/data/SP2XR/20240101/PbP_20240101_001.csv", "20240101_001", "20240101"),
+            ("/data/SP2XR/20240101/hk_20240101_002.zip", "20240101_002", "20240101"),
+            (r"C:\SP2XR\20240101\SP2_20240101_003.csv", "20240101_003", "20240101"),
+        ],
+    )
+    def test_real_sp2xr_filename_patterns(self, path, cut, folder):
+        """Test with realistic SP2XR filename patterns, one test case per path."""
+        file_name_cut, folder_name = extract_sp2xr_filename_parts(path)
+        assert (file_name_cut, folder_name) == (cut, folder)
+
+
+class TestExtractSP2XRFilenamePartsIntegration:
+    """Integration tests using existing test files in the repository."""
+
+    @pytest.fixture
+    def test_data_dir(self):
+        """Get the path to the test data directory."""
+        # The data directory is the "data" folder next to this test module
+        return Path(__file__).parent / "data"
+
+    def test_with_real_pbp_files(self, test_data_dir):
+        """Test function with actual PbP test files."""
+        pbp_dir = test_data_dir / "pbp_files_test"
+        if not pbp_dir.exists():
+            pytest.skip(f"Test data directory not found: {pbp_dir}")
+
+        # De-duplicate (a file can match both patterns, e.g. on Windows where
+        # globbing is case-insensitive), keep files only, sort for a stable pick.
+        matches = {*pbp_dir.glob("**/*PbP*"), *pbp_dir.glob("**/*pbp*")}
+        pbp_files = sorted(path for path in matches if path.is_file())
+
+        if not pbp_files:
+            pytest.skip("No PbP test files found")
+
+        for pbp_file in pbp_files[:3]:  # Test first 3 files to avoid long test times
+            file_name_cut, folder_name = extract_sp2xr_filename_parts(pbp_file)
+            # Verify the function doesn't crash and returns reasonable values
+            assert isinstance(file_name_cut, str)
+            assert isinstance(folder_name, str)
+            assert len(file_name_cut) > 0
+            assert len(folder_name) > 0
+
+            print(
+                f"PbP File: {pbp_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
+            )
+
+    def test_with_real_hk_files(self, test_data_dir):
+        """Test function with actual HK test files."""
+        hk_dir = test_data_dir / "hk_files_test"
+        if not hk_dir.exists():
+            pytest.skip(f"Test data directory not found: {hk_dir}")
+
+        # De-duplicate (a file can match both patterns, e.g. on Windows where
+        # globbing is case-insensitive), keep files only, sort for a stable pick.
+        matches = {*hk_dir.glob("**/*hk*"), *hk_dir.glob("**/*HK*")}
+        hk_files = sorted(path for path in matches if path.is_file())
+
+        if not hk_files:
+            pytest.skip("No HK test files found")
+
+        for hk_file in hk_files[:3]:  # Test first 3 files
+            file_name_cut, folder_name = extract_sp2xr_filename_parts(hk_file)
+            # Verify the function doesn't crash and returns reasonable values
+            assert isinstance(file_name_cut, str)
+            assert isinstance(folder_name, str)
+            assert len(file_name_cut) > 0
+            assert len(folder_name) > 0
+
+            print(
+                f"HK File: {hk_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
+            )
+
+    def test_with_all_test_files(self, test_data_dir):
+        """Test function with any CSV/ZIP files in test data."""
+        if not test_data_dir.exists():
+            pytest.skip(f"Test data directory not found: {test_data_dir}")
+
+        # Find all CSV and ZIP files in test data
+        test_files = [
+            *test_data_dir.glob("**/*.csv"),
+            *test_data_dir.glob("**/*.zip"),
+        ]
+        if not test_files:
+            pytest.skip("No test files found")
+
+        successful_extractions = 0
+
+        for test_file in test_files:
+            try:
+                file_name_cut, folder_name = extract_sp2xr_filename_parts(test_file)
+            except Exception as e:
+                # Best effort: report extraction errors without failing the test
+                print(f"✗ Failed on {test_file.name}: {e}")
+                continue
+
+            # Validate outside the try so a failing assert is not swallowed
+            assert isinstance(file_name_cut, str)
+            assert isinstance(folder_name, str)
+
+            successful_extractions += 1
+            print(f"✓ {test_file.name} -> {file_name_cut} | {folder_name}")
+
+        # Ensure we processed at least some files successfully
+        assert successful_extractions > 0, "No files were processed successfully"
+        print(
+            f"\nSuccessfully processed {successful_extractions}/{len(test_files)} files"
+        )
+
+
+class TestRealFilenamePatterns:
+    """Report the filename patterns that occur in the repository's test data."""
+
+    @pytest.fixture
+    def test_data_dir(self):
+        """Return the ``data`` directory located next to this test module."""
+        return Path(__file__).parent / "data"
+
+    def test_analyze_filename_patterns(self, test_data_dir):
+        """Group CSV/ZIP filenames by part count and leading parts, then print them."""
+        if not test_data_dir.exists():
+            pytest.skip(f"Test data directory not found: {test_data_dir}")
+
+        # Maps a pattern description to the filenames that share it
+        grouped = {}
+        for candidate in test_data_dir.glob("**/*.*"):
+            if candidate.suffix not in [".csv", ".zip"]:
+                continue
+            pieces = candidate.name.split("_")
+            more_marker = "..." if len(pieces) > 3 else ""
+            label = f"{len(pieces)} parts: {' | '.join(pieces[:3])}{more_marker}"
+            grouped.setdefault(label, []).append(candidate.name)
+
+        print("\nFilename patterns found:")
+        for label, names in grouped.items():
+            print(f"\n{label}")
+            # Show at most three example filenames per pattern
+            for name in names[:3]:
+                print(f"  - {name}")
+            extra = len(names) - 3
+            if extra > 0:
+                print(f"  ... and {extra} more")
+
+
+if __name__ == "__main__":
+    # Allow running the test file directly; propagate pytest's exit code
+    sys.exit(pytest.main([__file__, "-v", "-s"]))