"""Tests for ``sp2xr.helpers.extract_sp2xr_filename_parts``.

Module path in the repository: ``tests/test_path_extraction.py``.

The unit tests pin the expected ``(file_name_cut, folder_name)`` pair for a
range of synthetic paths; the integration tests run the function over any
sample files found under ``tests/data`` and are skipped when that data is
absent.
"""

import sys
from pathlib import Path

import pytest

# Add src to path to import sp2xr modules
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from sp2xr.helpers import extract_sp2xr_filename_parts  # noqa: E402


def _find_files(directory, *patterns, limit=None):
    """Return the regular files under *directory* matching any of *patterns*.

    Args:
        directory: ``Path`` to search from.
        *patterns: Glob patterns passed to ``Path.glob`` (e.g. ``"**/*.csv"``).
        limit: Optional maximum number of files to return.

    Returns:
        A sorted list of unique ``Path`` objects. Sorting makes ``limit``
        select the same files on every run, and de-duplicating matters when
        two patterns differ only in case and the filesystem is
        case-insensitive.
    """
    matches = sorted(
        {
            path
            for pattern in patterns
            for path in directory.glob(pattern)
            if path.is_file()
        }
    )
    return matches if limit is None else matches[:limit]


class TestExtractSP2XRFilenameParts:
    """Test cases for extract_sp2xr_filename_parts function."""

    def test_standard_windows_path(self):
        """Test with typical Windows path format."""
        file_path = r"C:\data\SP2XR\20240101\file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_standard_unix_path(self):
        """Test with typical Unix path format."""
        file_path = "/data/SP2XR/20240101/file_20240101_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    def test_zip_file_extension(self):
        """Test with .zip extension."""
        file_path = "/data/SP2XR/20240101/pbp_20240101_002.zip"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_002"
        assert folder_name == "20240101"

    def test_multiple_underscores(self):
        """Test filename with multiple underscores."""
        file_path = "/data/SP2XR/20240101/sp2_data_file_20240101_003.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_003"
        assert folder_name == "20240101"

    def test_pathlib_object_input(self):
        """Test with Path object as input."""
        file_path = Path("/data/SP2XR/20240101/hk_20240101_004.csv")
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_004"
        assert folder_name == "20240101"

    def test_mixed_path_separators(self):
        """Test handling of mixed path separators."""
        file_path = r"C:\data\SP2XR/20240101\file_20240101_005.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_005"
        assert folder_name == "20240101"

    def test_single_underscore_filename(self):
        """Test filename with only one underscore."""
        file_path = "/data/SP2XR/20240101/data_001.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # With only two underscore-separated parts, the whole stem is the cut
        # and the first part (not the parent directory) is the folder name.
        assert file_name_cut == "data_001"
        assert folder_name == "data"

    def test_no_underscore_filename(self):
        """Test filename with no underscores - edge case."""
        file_path = "/data/SP2XR/20240101/datafile.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # Should fallback to stem and parent name
        assert file_name_cut == "datafile"
        assert folder_name == "20240101"

    def test_empty_filename_parts(self):
        """Test edge case with unusual filename structure."""
        file_path = "/data/SP2XR/folder/_.csv"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        # For "_.csv": This is a pathological case that the original code
        # couldn't handle well. The function returns the most reasonable
        # interpretation:
        #   file_name_cut = "_" (the stem of the file)
        #   folder_name = "folder" (falls back to parent directory)
        assert file_name_cut == "_"
        assert folder_name == "folder"  # Falls back to parent directory name

    @pytest.mark.parametrize("file_extension", [".csv", ".zip", ".parquet", ".txt"])
    def test_different_extensions(self, file_extension):
        """Test with different file extensions."""
        file_path = f"/data/SP2XR/20240101/test_20240101_001{file_extension}"
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == "20240101_001"
        assert folder_name == "20240101"

    # Parametrized so each pattern is reported on its own instead of the first
    # failing case hiding the remaining ones.
    @pytest.mark.parametrize(
        ("file_path", "expected_cut", "expected_folder"),
        [
            ("/data/SP2XR/20240101/PbP_20240101_001.csv", "20240101_001", "20240101"),
            ("/data/SP2XR/20240101/hk_20240101_002.zip", "20240101_002", "20240101"),
            (r"C:\SP2XR\20240101\SP2_20240101_003.csv", "20240101_003", "20240101"),
        ],
    )
    def test_real_sp2xr_filename_patterns(
        self, file_path, expected_cut, expected_folder
    ):
        """Test with realistic SP2XR filename patterns."""
        file_name_cut, folder_name = extract_sp2xr_filename_parts(file_path)

        assert file_name_cut == expected_cut, f"Failed for {file_path}"
        assert folder_name == expected_folder, f"Failed for {file_path}"


class TestExtractSP2XRFilenamePartsIntegration:
    """Integration tests using existing test files in the repository."""

    @pytest.fixture
    def test_data_dir(self):
        """Get the path to the test data directory."""
        # Get the directory where this test file is located, then go to data
        return Path(__file__).parent / "data"

    def test_with_real_pbp_files(self, test_data_dir):
        """Test function with actual PbP test files."""
        pbp_dir = test_data_dir / "pbp_files_test"

        if not pbp_dir.exists():
            pytest.skip(f"Test data directory not found: {pbp_dir}")

        # Find actual PbP files in the test directory; only the first 3 are
        # checked to avoid long test times.
        pbp_files = _find_files(pbp_dir, "**/*PbP*", "**/*pbp*", limit=3)

        if not pbp_files:
            pytest.skip("No PbP test files found")

        for pbp_file in pbp_files:
            file_name_cut, folder_name = extract_sp2xr_filename_parts(pbp_file)

            # Verify the function doesn't crash and returns reasonable values
            assert isinstance(file_name_cut, str)
            assert isinstance(folder_name, str)
            assert len(file_name_cut) > 0
            assert len(folder_name) > 0

            print(
                f"PbP File: {pbp_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_real_hk_files(self, test_data_dir):
        """Test function with actual HK test files."""
        hk_dir = test_data_dir / "hk_files_test"

        if not hk_dir.exists():
            pytest.skip(f"Test data directory not found: {hk_dir}")

        # Find actual HK files in the test directory; only the first 3 are
        # checked.
        hk_files = _find_files(hk_dir, "**/*hk*", "**/*HK*", limit=3)

        if not hk_files:
            pytest.skip("No HK test files found")

        for hk_file in hk_files:
            file_name_cut, folder_name = extract_sp2xr_filename_parts(hk_file)

            # Verify the function doesn't crash and returns reasonable values
            assert isinstance(file_name_cut, str)
            assert isinstance(folder_name, str)
            assert len(file_name_cut) > 0
            assert len(folder_name) > 0

            print(
                f"HK File: {hk_file.name} -> Cut: {file_name_cut}, Folder: {folder_name}"
            )

    def test_with_all_test_files(self, test_data_dir):
        """Test function with any CSV/ZIP files in test data.

        Per-file problems are only reported, not raised; the test fails only
        if not a single file could be processed.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Find all CSV and ZIP files in test data
        test_files = _find_files(test_data_dir, "**/*.csv", "**/*.zip")

        if not test_files:
            pytest.skip("No test files found")

        successful_extractions = 0

        for test_file in test_files:
            try:
                file_name_cut, folder_name = extract_sp2xr_filename_parts(test_file)

                # Basic validation
                assert isinstance(file_name_cut, str)
                assert isinstance(folder_name, str)

                successful_extractions += 1
                print(f"✓ {test_file.name} -> {file_name_cut} | {folder_name}")

            except Exception as e:
                # Deliberately best-effort: don't fail the test, just report
                # issues (this includes the validation asserts above).
                print(f"✗ Failed on {test_file.name}: {e}")

        # Ensure we processed at least some files successfully
        assert successful_extractions > 0, "No files were processed successfully"
        print(
            f"\nSuccessfully processed {successful_extractions}/{len(test_files)} files"
        )


class TestRealFilenamePatterns:
    """Analyze actual filename patterns in test data."""

    @pytest.fixture
    def test_data_dir(self):
        """Get the path to the test data directory."""
        return Path(__file__).parent / "data"

    def test_analyze_filename_patterns(self, test_data_dir):
        """Analyze and report on actual filename patterns in test data.

        Report-only: prints the patterns (visible with ``-s``) and makes no
        assertions.
        """
        if not test_data_dir.exists():
            pytest.skip(f"Test data directory not found: {test_data_dir}")

        # Maps a pattern summary to the filenames that share it.
        patterns = {}

        # Sorted so the printed report is the same on every run.
        for file_path in sorted(test_data_dir.glob("**/*.*")):
            if file_path.suffix not in (".csv", ".zip"):
                continue

            filename = file_path.name
            parts = filename.split("_")
            ellipsis = "..." if len(parts) > 3 else ""
            pattern = f"{len(parts)} parts: {' | '.join(parts[:3])}{ellipsis}"
            patterns.setdefault(pattern, []).append(filename)

        print("\nFilename patterns found:")
        for pattern, files in patterns.items():
            print(f"\n{pattern}")
            for example in files[:3]:  # Show first 3 examples
                print(f"  - {example}")
            if len(files) > 3:
                print(f"  ... and {len(files) - 3} more")


if __name__ == "__main__":
    # Allow running the test file directly; propagate pytest's exit code so
    # failures are visible to the calling shell.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))