# SP2XR/meta_files/generate_config.py
from __future__ import annotations
import pandas as pd
import yaml
import os
from pathlib import Path
from typing import Any


def infer_general_dtype(dtype: Any) -> str:
    """Infer a general data type name from a pandas dtype."""
    if pd.api.types.is_integer_dtype(dtype):
        return "int"
    elif pd.api.types.is_float_dtype(dtype):
        return "float"
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        return "datetime"
    else:
        return "string"


def load_schema(input_file: str | Path) -> dict[str, str]:
    """Load a schema from an input file by inferring column types."""
    ext = os.path.splitext(str(input_file))[1].lower()
    if ext in [".csv", ".zip"]:
        # pandas transparently decompresses a zip archive containing a single CSV
        df = pd.read_csv(input_file, nrows=100)
    elif ext == ".parquet":
        df = pd.read_parquet(input_file)
    else:
        raise ValueError(f"Unsupported file format: {ext}")
    return {col: infer_general_dtype(dtype) for col, dtype in df.dtypes.items()}
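
# The result is a flat name -> general-type dict, e.g. (illustrative):
# {"Time (sec)": "float", "Flag": "int", "Status": "string"}.
# Note that CSV/zip inputs are sampled (nrows=100), so a column whose first
# 100 values are empty may be inferred differently than in the full file.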


def get_canonical_schemas() -> dict[str, dict[str, str]]:
    """Return canonical column schemas for SP2XR data."""
    pbp_canonical = {
        "Time (sec)": "float",
        "Packet Time Stamp": "float",
        "Flag": "float",
        "Dropped Records": "float",
        "Record Count": "float",
        "Record Size": "float",
        "Particle Time Stamp": "float",
        "Particle Flags": "float",
        "Scatter relPeak": "float",
        "Scatter Transit Time": "float",
        "Scatter Peak Time": "float",
        "Scatter FWHM": "float",
        "Scatter Size (nm)": "float",
        "Incand relPeak": "float",
        "Incand Transit Time": "float",
        "Incand Peak Time": "float",
        "Incand FWHM": "float",
        "Incand Delay": "float",
        "Incand Mass (fg)": "float",
        "Reserved": "float",
    }
    hk_canonical = {
        "Time Stamp": "datetime",
        "Time (sec)": "float",
        "Sample Flow Controller Read (sccm)": "float",
        "Sample Flow Controller Read (vccm)": "float",
        # Core HK columns that are commonly used
        "Time Stamp (UTC sec)": "float",
        "Elapsed Time": "float",
        "Error Code": "float",
        "Packet Time Stamp": "float",
        "Laser TEC Temp (C)": "float",
        "Crystal TEC Temp (C)": "float",
        "Inlet Air Temp (C)": "float",
        "Computer Heatsink Temp (C)": "float",
        "Laser Heatsink Temp (C)": "float",
        "Outlet Air Temp (C)": "float",
        "YAG Output Monitor (V)": "float",
        "Cavity Pressure (hPa)": "float",
        "Laser Driver Power Monitor (uA)": "float",
        "Laser Driver Current Limit Monitor (A)": "float",
        "Laser Driver Current Monitor (A)": "float",
        # ... (other HK columns can be added as needed)
    }
    return {"pbp_canonical": pbp_canonical, "hk_canonical": hk_canonical}


def generate_combined_config(
    pbp_file: str | Path, hk_file: str | Path, output_file: str = "config.yaml"
) -> None:
    """Generate a config file containing the inferred schemas of both input files."""
    config = {
        "pbp_schema": load_schema(pbp_file),
        "hk_schema": load_schema(hk_file),
    }
    with open(output_file, "w") as f:
        yaml.dump(config, f, sort_keys=False)
    print(f"Unified config saved to: {output_file}")


def _placeholder_for(canonical_col: str) -> str:
    """Build the YOUR_COLUMN_NAME_FOR_* placeholder for an unmatched column."""
    cleaned = canonical_col.replace(" ", "_").replace("(", "").replace(")", "")
    return f"YOUR_COLUMN_NAME_FOR_{cleaned.upper()}"


def _build_column_mapping(
    canonical_schema: dict[str, str], file_schema: dict[str, str]
) -> dict[str, str]:
    """Map canonical column names to file column names (case-insensitive match)."""
    mapping = {}
    for canonical_col in canonical_schema:
        # Try to find an exact (case-insensitive) match first
        matching_file_col = next(
            (c for c in file_schema if c.lower() == canonical_col.lower()), None
        )
        # If a match is found, use it; otherwise leave a placeholder template
        mapping[canonical_col] = matching_file_col or _placeholder_for(canonical_col)
    return mapping


def generate_mapping_template(
    pbp_file: str | Path,
    hk_file: str | Path,
    output_file: str = "config_with_mapping.yaml",
) -> None:
    """
    Generate an enhanced config with column mapping templates.

    This creates a config file that lets users map their instrument-specific
    column names to the canonical column names used in the main processing
    pipeline.
    """
    # Load actual file schemas
    pbp_schema = load_schema(pbp_file)
    hk_schema = load_schema(hk_file)
    # Get canonical schemas
    canonical_schemas = get_canonical_schemas()
    # Create column mapping templates for PbP and HK
    pbp_mapping = _build_column_mapping(canonical_schemas["pbp_canonical"], pbp_schema)
    hk_mapping = _build_column_mapping(canonical_schemas["hk_canonical"], hk_schema)
    # Build the enhanced config. Note: "# INSTRUCTIONS" is an ordinary (quoted)
    # YAML key, not a YAML comment, so it survives a yaml.safe_load round trip.
    config = {
        "# INSTRUCTIONS": [
            "This config file contains both schema definitions and column mappings.",
            "1. The *_schema sections define the data types for your input files.",
            "2. The *_column_mapping sections map your file columns to canonical names.",
            "3. Replace placeholder values (YOUR_COLUMN_NAME_FOR_*) with actual column names from your files.",
            "4. If your file doesn't have a particular canonical column, set it to null or remove the line.",
            "5. The output parquet files will use the canonical column names for consistency.",
        ],
        "pbp_schema": pbp_schema,
        "hk_schema": hk_schema,
        "pbp_canonical_schema": canonical_schemas["pbp_canonical"],
        "hk_canonical_schema": canonical_schemas["hk_canonical"],
        "pbp_column_mapping": pbp_mapping,
        "hk_column_mapping": hk_mapping,
    }
    with open(output_file, "w") as f:
        yaml.dump(config, f, sort_keys=False, default_flow_style=False)
    print(f"Enhanced config with column mapping saved to: {output_file}")
    print("\nNext steps:")
    print(
        "1. Open the config file and replace placeholder column mappings with your actual column names"
    )
    print(
        "2. Remove or set to null any canonical columns that don't exist in your data"
    )
    print("3. Use this config file with the updated CSV-to-Parquet conversion process")


def apply_column_mapping(
    df: pd.DataFrame, column_mapping: dict[str, str | None]
) -> pd.DataFrame:
    """
    Apply a column name mapping to standardize column names.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with instrument-specific column names
    column_mapping : dict[str, str | None]
        Mapping from canonical names to file column names

    Returns
    -------
    pd.DataFrame
        DataFrame with standardized column names
    """
    # Create the reverse mapping (file_column_name -> canonical_name), skipping
    # null entries, unfilled placeholders, and columns absent from df
    reverse_mapping = {}
    for canonical_name, file_column in column_mapping.items():
        if (
            file_column
            and file_column in df.columns
            and not file_column.startswith("YOUR_COLUMN_NAME_FOR_")
        ):
            reverse_mapping[file_column] = canonical_name
    # Rename columns using the reverse mapping
    return df.rename(columns=reverse_mapping)
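
# Illustrative usage with a config produced by generate_mapping_template
# (assumes the user has already edited the placeholders):
#
#   with open("config_with_mapping.yaml") as f:
#       cfg = yaml.safe_load(f)
#   pbp_df = apply_column_mapping(pbp_df, cfg["pbp_column_mapping"])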


# Example usage
if __name__ == "__main__":
    # Legacy function for backward compatibility
    # generate_combined_config("pbp_meta.parquet", "hk_meta.parquet")
    # New enhanced function
    pbp_tmp_file = "/data/user/bertoz_b/merlin6data/SP2XR_code/tests/data/mini_SP2XR_PbP_20190409110737_x0001.zip"
    hk_tmp_file = "/data/user/bertoz_b/merlin6data/SP2XR_code/tests/data/mini_SP2XR_hk_20190409110737_x0001.zip"
    generate_mapping_template(pbp_tmp_file, hk_tmp_file)