from __future__ import annotations

import os
from pathlib import Path
from typing import Any

import pandas as pd
import yaml


def infer_general_dtype(dtype: Any) -> str:
    """Infer a general data type ("int", "float", "datetime", or "string") from a pandas dtype."""
    if pd.api.types.is_integer_dtype(dtype):
        return "int"
    elif pd.api.types.is_float_dtype(dtype):
        return "float"
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        return "datetime"
    else:
        return "string"


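# A minimal doctest-style sketch of infer_general_dtype on a toy frame
# (illustrative data, not SP2XR output):
#
#   >>> df = pd.DataFrame({"a": [1], "b": [1.5], "c": ["x"]})
#   >>> {col: infer_general_dtype(dt) for col, dt in df.dtypes.items()}
#   {'a': 'int', 'b': 'float', 'c': 'string'}

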
def load_schema(input_file: str | Path) -> dict[str, str]:
    """Load a schema from an input file by inferring its column types."""
    ext = os.path.splitext(str(input_file))[1].lower()

    if ext in [".csv", ".zip"]:
        # Sample only the first 100 rows; pandas decompresses single-file
        # ZIP archives transparently.
        df = pd.read_csv(input_file, nrows=100)
    elif ext == ".parquet":
        df = pd.read_parquet(input_file)
    else:
        raise ValueError(f"Unsupported file format: {ext}")

    return {col: infer_general_dtype(dtype) for col, dtype in df.dtypes.items()}


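# Hedged usage sketch (the file name and columns below are hypothetical):
#
#   >>> load_schema("SP2XR_PbP_20190409.csv")  # doctest: +SKIP
#   {'Time (sec)': 'float', 'Flag': 'int', ...}
#
# Caveat: type inference is sample-based (first 100 rows for CSV/ZIP), and a
# column that is empty in the sample is read by pandas as float64, so it is
# reported as "float".

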
def get_canonical_schemas() -> dict[str, dict[str, str]]:
    """Return canonical column schemas for SP2XR data."""
    pbp_canonical = {
        "Time (sec)": "float",
        "Packet Time Stamp": "float",
        "Flag": "float",
        "Dropped Records": "float",
        "Record Count": "float",
        "Record Size": "float",
        "Particle Time Stamp": "float",
        "Particle Flags": "float",
        "Scatter relPeak": "float",
        "Scatter Transit Time": "float",
        "Scatter Peak Time": "float",
        "Scatter FWHM": "float",
        "Scatter Size (nm)": "float",
        "Incand relPeak": "float",
        "Incand Transit Time": "float",
        "Incand Peak Time": "float",
        "Incand FWHM": "float",
        "Incand Delay": "float",
        "Incand Mass (fg)": "float",
        "Reserved": "float",
    }

    hk_canonical = {
        "Time Stamp": "datetime",
        "Time (sec)": "float",
        "Sample Flow Controller Read (sccm)": "float",
        "Sample Flow Controller Read (vccm)": "float",
        # Core HK columns that are commonly used
        "Time Stamp (UTC sec)": "float",
        "Elapsed Time": "float",
        "Error Code": "float",
        "Packet Time Stamp": "float",
        "Laser TEC Temp (C)": "float",
        "Crystal TEC Temp (C)": "float",
        "Inlet Air Temp (C)": "float",
        "Computer Heatsink Temp (C)": "float",
        "Laser Heatsink Temp (C)": "float",
        "Outlet Air Temp (C)": "float",
        "YAG Output Monitor (V)": "float",
        "Cavity Pressure (hPa)": "float",
        "Laser Driver Power Monitor (uA)": "float",
        "Laser Driver Current Limit Monitor (A)": "float",
        "Laser Driver Current Monitor (A)": "float",
        # ... (other HK columns can be added as needed)
    }

    return {"pbp_canonical": pbp_canonical, "hk_canonical": hk_canonical}


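# Shape of the returned structure:
#
#   >>> schemas = get_canonical_schemas()
#   >>> sorted(schemas)
#   ['hk_canonical', 'pbp_canonical']
#   >>> schemas["hk_canonical"]["Time Stamp"]
#   'datetime'

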
def generate_combined_config(
    pbp_file: str | Path, hk_file: str | Path, output_file: str = "config.yaml"
) -> None:
    """Generate a config file containing the inferred schemas for both input files."""
    config = {
        "pbp_schema": load_schema(pbp_file),
        "hk_schema": load_schema(hk_file),
    }

    with open(output_file, "w") as f:
        yaml.dump(config, f, sort_keys=False)

    print(f"Unified config saved to: {output_file}")


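# The emitted YAML has two top-level keys; the column names and types depend
# entirely on the input files. An abbreviated, illustrative example:
#
#   pbp_schema:
#     Time (sec): float
#     Scatter relPeak: float
#   hk_schema:
#     Time Stamp: string
#     Time (sec): float

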
def generate_mapping_template(
    pbp_file: str | Path,
    hk_file: str | Path,
    output_file: str = "config_with_mapping.yaml",
) -> None:
    """
    Generate an enhanced config with column mapping templates.

    This creates a config file that lets users map their instrument-specific
    column names to the canonical column names used in the main processing
    pipeline.
    """
    # Load the actual file schemas
    pbp_schema = load_schema(pbp_file)
    hk_schema = load_schema(hk_file)

    # Get canonical schemas
    canonical_schemas = get_canonical_schemas()

    # Create column mapping templates
    pbp_mapping = {}
    hk_mapping = {}

    # For PbP: map file columns to canonical columns
    for canonical_col in canonical_schemas["pbp_canonical"]:
        # Try to find an exact (case-insensitive) match first
        matching_file_col = None
        for file_col in pbp_schema:
            if file_col.lower() == canonical_col.lower():
                matching_file_col = file_col
                break

        # If an exact match was found, use it; otherwise leave a placeholder
        pbp_mapping[canonical_col] = (
            matching_file_col
            or f"YOUR_COLUMN_NAME_FOR_{canonical_col.replace(' ', '_').replace('(', '').replace(')', '').upper()}"
        )

    # For HK: same matching, against the HK file schema
    for canonical_col in canonical_schemas["hk_canonical"]:
        matching_file_col = None
        for file_col in hk_schema:
            if file_col.lower() == canonical_col.lower():
                matching_file_col = file_col
                break

        hk_mapping[canonical_col] = (
            matching_file_col
            or f"YOUR_COLUMN_NAME_FOR_{canonical_col.replace(' ', '_').replace('(', '').replace(')', '').upper()}"
        )

    # Build the enhanced config
    config = {
        "# INSTRUCTIONS": [
            "This config file contains both schema definitions and column mappings.",
            "1. The *_schema sections define the data types for your input files.",
            "2. The *_column_mapping sections map your file columns to canonical names.",
            "3. Replace placeholder values (YOUR_COLUMN_NAME_FOR_*) with actual column names from your files.",
            "4. If your file doesn't have a particular canonical column, set it to null or remove the line.",
            "5. The output parquet files will use the canonical column names for consistency.",
        ],
        "pbp_schema": pbp_schema,
        "hk_schema": hk_schema,
        "pbp_canonical_schema": canonical_schemas["pbp_canonical"],
        "hk_canonical_schema": canonical_schemas["hk_canonical"],
        "pbp_column_mapping": pbp_mapping,
        "hk_column_mapping": hk_mapping,
    }

    with open(output_file, "w") as f:
        yaml.dump(config, f, sort_keys=False, default_flow_style=False)

    print(f"Enhanced config with column mapping saved to: {output_file}")
    print("\nNext steps:")
    print(
        "1. Open the config file and replace placeholder column mappings with your actual column names"
    )
    print(
        "2. Remove or set to null any canonical columns that don't exist in your data"
    )
    print("3. Use this config file with the updated CSV to Parquet conversion process")


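# Sketch of the placeholder naming for a canonical column with no match in
# the input file (spaces become underscores, parentheses are dropped, and
# the result is upper-cased):
#
#   "Incand Mass (fg)"  ->  "YOUR_COLUMN_NAME_FOR_INCAND_MASS_FG"

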
def apply_column_mapping(
    df: pd.DataFrame, column_mapping: dict[str, str | None]
) -> pd.DataFrame:
    """
    Apply a column name mapping to standardize column names.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with instrument-specific column names
    column_mapping : dict[str, str | None]
        Mapping from canonical names to file column names

    Returns
    -------
    pd.DataFrame
        DataFrame with standardized column names
    """
    # Build the reverse mapping (file_column_name -> canonical_name),
    # skipping null entries, columns absent from df, and unfilled placeholders
    reverse_mapping = {}
    for canonical_name, file_column in column_mapping.items():
        if (
            file_column
            and file_column in df.columns
            and not file_column.startswith("YOUR_COLUMN_NAME_FOR_")
        ):
            reverse_mapping[file_column] = canonical_name

    # Rename columns using the reverse mapping
    return df.rename(columns=reverse_mapping)


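# Minimal usage sketch on a toy frame (hypothetical column names):
#
#   >>> raw = pd.DataFrame({"incand_mass": [1.2], "extra": [0]})
#   >>> mapping = {"Incand Mass (fg)": "incand_mass", "Flag": None}
#   >>> apply_column_mapping(raw, mapping).columns.tolist()
#   ['Incand Mass (fg)', 'extra']
#
# Unmapped columns ("extra") pass through unchanged, and null mappings are
# ignored.

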
# Example usage
if __name__ == "__main__":
    # Legacy combined config, kept for backward compatibility:
    # generate_combined_config("pbp_meta.parquet", "hk_meta.parquet")

    # New enhanced function
    pbp_tmp_file = "/data/user/bertoz_b/merlin6data/SP2XR_code/tests/data/mini_SP2XR_PbP_20190409110737_x0001.zip"
    hk_tmp_file = "/data/user/bertoz_b/merlin6data/SP2XR_code/tests/data/mini_SP2XR_hk_20190409110737_x0001.zip"
    generate_mapping_template(pbp_tmp_file, hk_tmp_file)