From cde421edda0a5eb5eea1adea98b10a860fcc7c5c Mon Sep 17 00:00:00 2001 From: Barbara Bertozzi Date: Tue, 30 Sep 2025 00:23:12 +0200 Subject: [PATCH] cleanup config organization --- config/complete_example.yaml | 121 +++++ .../my_data_schema.yaml | 503 +++++++++--------- .../my_instrument_settings.yaml | 35 +- .../20190417210227 Calibration 20181005.ini | 266 --------- meta_files/config.yaml | 159 ------ meta_files/generate_config.py | 228 -------- meta_files/hk_meta.parquet | Bin 80166 -> 0 bytes meta_files/pbp_meta.parquet | Bin 11434 -> 0 bytes scripts/sp2xr_generate_config.py | 458 ++++++++++++++++ src/sp2xr/helpers.py | 160 +++++- tests/config.yaml | 159 ------ tests/run_config.yaml | 59 -- 12 files changed, 996 insertions(+), 1152 deletions(-) create mode 100644 config/complete_example.yaml rename meta_files/config_with_mapping.yaml => config/my_data_schema.yaml (95%) rename tests/instrument_config.yaml => config/my_instrument_settings.yaml (55%) delete mode 100644 meta_files/20190417210227 Calibration 20181005.ini delete mode 100644 meta_files/config.yaml delete mode 100644 meta_files/generate_config.py delete mode 100644 meta_files/hk_meta.parquet delete mode 100644 meta_files/pbp_meta.parquet create mode 100644 scripts/sp2xr_generate_config.py delete mode 100644 tests/config.yaml delete mode 100644 tests/run_config.yaml diff --git a/config/complete_example.yaml b/config/complete_example.yaml new file mode 100644 index 0000000..652ad65 --- /dev/null +++ b/config/complete_example.yaml @@ -0,0 +1,121 @@ +# SP2XR Complete Configuration Example + +# ============================================================================ +# METADATA - Optional documentation for this configuration +# ============================================================================ +metadata: + campaign: "Example_Campaign" + instrument: "PSI SP2XR" + operator: "Barbara Bertozzi" + description: "Complete example configuration with all features enabled" + created: "2025-09-29" + +# 
============================================================================ +# FILE PATHS - All input and output locations +# ============================================================================ +paths: + input_pbp: data/pbp_files_parquet + input_hk: data/hk_files_parquet + output: data/SP2XR_processed_1min + instrument_config: config/my_instrument_settings.yaml + + +# ============================================================================ +# WORKFLOW SETTINGS - Complete analysis pipeline +# ============================================================================ +workflow: + # Analysis components + conc: true # Number and mass concentrations + BC_hist: true # BC mass distributions (dNdlogDmev, dMdlogDmev) + scatt_hist: true # Scattering size distributions (dNdlogDsc) + timelag_hist: false # Time-lag distributions for mixing state analysis + + # Time and data management + dt: 60 # Time resolution (seconds): 1, 10, 60, 300, 3600 + repartition: '1h' # Repartition frequency: '15min', '1h', '6h', '1d' + max_partition_size: "200MB" # Memory management + saving_schema: ['date', 'hour'] # Output partitioning scheme + + +# ============================================================================ +# CLUSTER SETTINGS - HPC and local computing options +# ============================================================================ +cluster: + # Execution mode + use_local: true # true for local, false for SLURM cluster + + # Resource allocation + cores: 32 # Total CPU cores + processes: 16 # Worker processes (usually cores/2) + memory: 256GB # Total memory + + # SLURM-specific settings + walltime: "4-00:00:00" # Job time limit (days-hours:minutes:seconds) + partition: general # SLURM partition: general, bigmem, gpu, etc. 
+ log_dir: ./slurm_out # SLURM log directory + + +# ============================================================================ +# TIME CHUNKING - Temporal data processing strategy +# ============================================================================ +chunking: + freq: '7d' # Chunk frequency: '1d', '3d', '7d', '1M' + start_date: null # Start date (YYYY-MM-DD format) + end_date: null # End date (YYYY-MM-DD format) + + +# ============================================================================ +# BLACK CARBON PROPERTIES - Physical and optical parameters +# ============================================================================ +bc: + # Density settings + rho_eff: 1800 # Effective density (kg/m³) + type: constant_effective_density # Density model + +# ============================================================================ +# DETAILED HISTOGRAM SETTINGS - Size and mass distribution parameters +# ============================================================================ +histo: + # Incandescence (BC mass) distributions + inc: + min_mass: 0.3 # Minimum mass (fg) + max_mass: 400 # Maximum mass (fg) + n_bins: 50 # Number of bins + #log_spacing: true # Logarithmic bin spacing + + # Scattering (optical diameter) distributions + scatt: + min_D: 100 # Minimum diameter (nm) + max_D: 500 # Maximum diameter (nm) + n_bins: 20 # Number of bins + #log_spacing: true # Logarithmic bin spacing + + # Time-lag distributions (mixing state analysis) + timelag: + min: -10 # Minimum time lag (ns) + max: 400 # Maximum time lag (ns) + n_bins: 100 # Number of bins + #log_spacing: false # Linear bin spacing for time-lag + +# ============================================================================ +# MIXING STATE ANALYSIS - Particle coating classification +# ============================================================================ +mixing_state: + # Classification thresholds + threshold: 50 # Thick coating threshold (nm) + inc_scatt_ratio: 1.1 # Minimum 
incandescence/scattering ratio + + +# ============================================================================ +# ADVANCED CALIBRATION PARAMETERS +# ============================================================================ +calibration: + # Incandescence calibration + incandescence: + curve_type: "polynomial" # "polynomial", "powerlaw", or "spline" + parameters: [0.05, 2.047e-07, -1.2e-15] # Calibration coefficients + + # Scattering calibration + scattering: + curve_type: "powerlaw" # "polynomial", "powerlaw", or "spline" + parameters: [17.22, 0.169, -1.494] # [a, b, c] for powerlaw \ No newline at end of file diff --git a/meta_files/config_with_mapping.yaml b/config/my_data_schema.yaml similarity index 95% rename from meta_files/config_with_mapping.yaml rename to config/my_data_schema.yaml index 1b3edf1..5acd583 100644 --- a/meta_files/config_with_mapping.yaml +++ b/config/my_data_schema.yaml @@ -1,252 +1,251 @@ -'# INSTRUCTIONS': -- This config file contains both schema definitions and column mappings. -- 1. The *_schema sections define the data types for your input files. -- 2. The *_column_mapping sections map your file columns to canonical names. -- 3. Replace placeholder values (YOUR_COLUMN_NAME_FOR_*) with actual column names - from your files. -- 4. If your file doesn't have a particular canonical column, set it to null or remove - the line. -- 5. The output parquet files will use the canonical column names for consistency. 
-pbp_schema: - Time (sec): float - Packet Time Stamp: float - Flag: int - Dropped Records: int - Record Count: int - Record Size: int - Particle Time Stamp: float - Particle Flags: int - Scatter relPeak: float - Scatter Transit Time: int - Scatter Peak Time: int - Scatter FWHM: int - Scatter Size (nm): float - Incand relPeak: float - Incand Transit Time: float - Incand Peak Time: float - Incand FWHM: float - Incand Delay: float - Incand Mass (fg): float - Reserved: int -hk_schema: - Time Stamp: string - Time (sec): float - Time Stamp (UTC sec): float - Elapsed Time: float - Error Code: int - Packet Time Stamp: float - Laser TEC Temp (C): float - Crystal TEC Temp (C): int - Inlet Air Temp (C): float - Computer Heatsink Temp (C): float - Laser Heatsink Temp (C): float - Outlet Air Temp (C): float - YAG Output Monitor (V): float - Cavity Pressure (hPa): float - Laser Driver Power Monitor (uA): int - Laser Driver Current Limit Monitor (A): float - Laser Driver Current Monitor (A): float - Laser TEC Sense: float - Laser Over Temp (On/Off): int - +5V Laser Rail (V): float - ' +5V Rail (V)': float - +12V Rail (V): float - High Voltage (V): float - Battery Temp (C): float - UPS Output (V): float - 12V Iso Rail (V): float - 5V Iso Rail (V): float - 3.3V Iso Rail (V): float - Spare 22: int - Spare 23: int - 408 Board Spare 0: int - 408 Board Spare 1: int - 408 Board Spare 2: int - 408 Board Spare 3: int - 408 Board Spare 4: int - Purge Flow Monitor (sccm): float - System Input Voltage (V): float - Board Temperature (C): float - 408 Board Spare 8: int - 408 Board Spare 9: int - 408 Board Spare 10: int - 408 Board Spare 11: int - 408 Board Spare 12: int - 408 Board Spare 13: int - 408 Board Spare 14: int - 408 Board Spare 15: int - Sheath Flow Controller Read (vccm): int - Sheath Flow Controller Read (sccm): int - Sheath Flow Controller Pressure (psia): float - Sheath Flow Controller Temperature (C): float - Sample Flow Controller Read (vccm): float - Sample Flow Controller 
Read (sccm): float - Sample Flow Controller Pressure (psia): float - Sample Flow Controller Temperature (C): float - Fan 1 (RPM): int - Fan 2 (RPM): int - Laser Fan (RPM): int - Spare tach: int - Threshold Crossing Events: int - Dual Qualified Scatter and Incand Particles: int - Qualified Scatter Only Particles: int - Qualified Incand Only Particles: int - Disqualified Due to Scatter Saturation: int - Disqualified Due to Scatter Transit Time Min: int - Disqualified Due to Scatter Transit Time Max: int - Disqualified Due to Scatter FWHM Min: int - Disqualified Due to Scatter FWHM Max: int - Scatter Inter Part Period Min Violation: int - Disqualified Due to Incand Saturation: int - Disqualified Due to Incand Transit Time Min: int - Disqualified Due to Incand Transit Time Max: int - Disqualified Due to Incand FWHM Min: int - Disqualified Due to Incand FWHM Max: int - Incand Inter Part Period Min Violation: int - Baseline Sizer Lo: int - Baseline Sizer Hi: int - Baseline Incand Lo: int - Baseline Incand Hi: int - Bandwidth Sizer Hi: int - Bandwidth Sizer Lo: int - Bandwidth Incand Lo: int - Bandwidth Incand Hi: int - ABD-0408 HK ADCs min: int - ABD-0436 HK ADCs min: int - ABD-0408 HK ADCs max: int - ABD-0436 HK ADCs max: int - Incand Particle Conc (cts/ccm): float - Scattering Particle Conc (cts/ccm): float - Incand Mass Conc (fg/sccm): float - Scattering Mass Conc (fg/sccm): float - Sheath Flow Set Point: int - Sample Flow Set Point: int - Laser Temp Set Point: int - Laser Current Set Point: float - Spare 4 Set Point: int - Spare 5 Set Point: int - PMT HV Set Point: float - Particle Density (g/ccm): float - PbP Packet Time: float - Scatter Bin 1: int - Scatter Bin 2: int - Scatter Bin 3: int - Scatter Bin 4: int - Scatter Bin 5: int - Scatter Bin 6: int - Scatter Bin 7: int - Scatter Bin 8: int - Scatter Bin 9: int - Scatter Bin 10: int - Scatter Bin 11: int - Scatter Bin 12: int - Scatter Bin 13: int - Scatter Bin 14: int - Scatter Bin 15: int - Scatter Bin 16: int - 
Scatter Bin 17: int - Scatter Bin 18: int - Scatter Bin 19: int - Scatter Bin 20: int - Incand Bin 1: int - Incand Bin 2: int - Incand Bin 3: int - Incand Bin 4: int - Incand Bin 5: int - Incand Bin 6: int - Incand Bin 7: int - Incand Bin 8: int - Incand Bin 9: int - Incand Bin 10: int - Incand Bin 11: int - Incand Bin 12: int - Incand Bin 13: int - Incand Bin 14: int - Incand Bin 15: int - Incand Bin 16: int - Incand Bin 17: int - Incand Bin 18: int - Incand Bin 19: int - Incand Bin 20: int -pbp_canonical_schema: - Time (sec): float - Packet Time Stamp: float - Flag: float - Dropped Records: float - Record Count: float - Record Size: float - Particle Time Stamp: float - Particle Flags: float - Scatter relPeak: float - Scatter Transit Time: float - Scatter Peak Time: float - Scatter FWHM: float - Scatter Size (nm): float - Incand relPeak: float - Incand Transit Time: float - Incand Peak Time: float - Incand FWHM: float - Incand Delay: float - Incand Mass (fg): float - Reserved: float -hk_canonical_schema: - Time Stamp: datetime - Time (sec): float - Sample Flow Controller Read (sccm): float - Sample Flow Controller Read (vccm): float - Time Stamp (UTC sec): float - Elapsed Time: float - Error Code: float - Packet Time Stamp: float - Laser TEC Temp (C): float - Crystal TEC Temp (C): float - Inlet Air Temp (C): float - Computer Heatsink Temp (C): float - Laser Heatsink Temp (C): float - Outlet Air Temp (C): float - YAG Output Monitor (V): float - Cavity Pressure (hPa): float - Laser Driver Power Monitor (uA): float - Laser Driver Current Limit Monitor (A): float - Laser Driver Current Monitor (A): float -pbp_column_mapping: - Time (sec): Time (sec) - Packet Time Stamp: Packet Time Stamp - Flag: Flag - Dropped Records: Dropped Records - Record Count: Record Count - Record Size: Record Size - Particle Time Stamp: Particle Time Stamp - Particle Flags: Particle Flags - Scatter relPeak: Scatter relPeak - Scatter Transit Time: Scatter Transit Time - Scatter Peak Time: 
Scatter Peak Time - Scatter FWHM: Scatter FWHM - Scatter Size (nm): Scatter Size (nm) - Incand relPeak: Incand relPeak - Incand Transit Time: Incand Transit Time - Incand Peak Time: Incand Peak Time - Incand FWHM: Incand FWHM - Incand Delay: Incand Delay - Incand Mass (fg): Incand Mass (fg) - Reserved: Reserved -hk_column_mapping: - Time Stamp: Time Stamp - Time (sec): Time (sec) - Sample Flow Controller Read (sccm): Sample Flow Controller Read (sccm) - Sample Flow Controller Read (vccm): Sample Flow Controller Read (vccm) - Time Stamp (UTC sec): Time Stamp (UTC sec) - Elapsed Time: Elapsed Time - Error Code: Error Code - Packet Time Stamp: Packet Time Stamp - Laser TEC Temp (C): Laser TEC Temp (C) - Crystal TEC Temp (C): Crystal TEC Temp (C) - Inlet Air Temp (C): Inlet Air Temp (C) - Computer Heatsink Temp (C): Computer Heatsink Temp (C) - Laser Heatsink Temp (C): Laser Heatsink Temp (C) - Outlet Air Temp (C): Outlet Air Temp (C) - YAG Output Monitor (V): YAG Output Monitor (V) - Cavity Pressure (hPa): Cavity Pressure (hPa) - Laser Driver Power Monitor (uA): Laser Driver Power Monitor (uA) - Laser Driver Current Limit Monitor (A): Laser Driver Current Limit Monitor (A) - Laser Driver Current Monitor (A): Laser Driver Current Monitor (A) +'# INSTRUCTIONS': +- This config file contains both schema definitions and column mappings. +- 1. The *_schema sections define the data types for your input files. +- 2. The *_column_mapping sections map your file columns to canonical names. +- 3. Update column mappings if your files use different column names. +- 4. If your file doesn't have a particular canonical column, set it to null or remove + the line. +- 5. The output parquet files will use the canonical column names for consistency. 
+pbp_schema: + Time (sec): float + Packet Time Stamp: float + Flag: int + Dropped Records: int + Record Count: int + Record Size: int + Particle Time Stamp: float + Particle Flags: int + Scatter relPeak: float + Scatter Transit Time: int + Scatter Peak Time: int + Scatter FWHM: int + Scatter Size (nm): float + Incand relPeak: float + Incand Transit Time: float + Incand Peak Time: float + Incand FWHM: float + Incand Delay: float + Incand Mass (fg): float + Reserved: int +hk_schema: + Time Stamp: string + Time (sec): float + Time Stamp (UTC sec): float + Elapsed Time: float + Error Code: int + Packet Time Stamp: float + Laser TEC Temp (C): float + Crystal TEC Temp (C): int + Inlet Air Temp (C): float + Computer Heatsink Temp (C): float + Laser Heatsink Temp (C): float + Outlet Air Temp (C): float + YAG Output Monitor (V): float + Cavity Pressure (hPa): float + Laser Driver Power Monitor (uA): int + Laser Driver Current Limit Monitor (A): float + Laser Driver Current Monitor (A): float + Laser TEC Sense: float + Laser Over Temp (On/Off): int + +5V Laser Rail (V): float + ' +5V Rail (V)': float + +12V Rail (V): float + High Voltage (V): float + Battery Temp (C): float + UPS Output (V): float + 12V Iso Rail (V): float + 5V Iso Rail (V): float + 3.3V Iso Rail (V): float + Spare 22: int + Spare 23: int + 408 Board Spare 0: int + 408 Board Spare 1: int + 408 Board Spare 2: int + 408 Board Spare 3: int + 408 Board Spare 4: int + Purge Flow Monitor (sccm): float + System Input Voltage (V): float + Board Temperature (C): float + 408 Board Spare 8: int + 408 Board Spare 9: int + 408 Board Spare 10: int + 408 Board Spare 11: int + 408 Board Spare 12: int + 408 Board Spare 13: int + 408 Board Spare 14: int + 408 Board Spare 15: int + Sheath Flow Controller Read (vccm): int + Sheath Flow Controller Read (sccm): int + Sheath Flow Controller Pressure (psia): float + Sheath Flow Controller Temperature (C): float + Sample Flow Controller Read (vccm): float + Sample Flow Controller 
Read (sccm): float + Sample Flow Controller Pressure (psia): float + Sample Flow Controller Temperature (C): float + Fan 1 (RPM): int + Fan 2 (RPM): int + Laser Fan (RPM): int + Spare tach: int + Threshold Crossing Events: int + Dual Qualified Scatter and Incand Particles: int + Qualified Scatter Only Particles: int + Qualified Incand Only Particles: int + Disqualified Due to Scatter Saturation: int + Disqualified Due to Scatter Transit Time Min: int + Disqualified Due to Scatter Transit Time Max: int + Disqualified Due to Scatter FWHM Min: int + Disqualified Due to Scatter FWHM Max: int + Scatter Inter Part Period Min Violation: int + Disqualified Due to Incand Saturation: int + Disqualified Due to Incand Transit Time Min: int + Disqualified Due to Incand Transit Time Max: int + Disqualified Due to Incand FWHM Min: int + Disqualified Due to Incand FWHM Max: int + Incand Inter Part Period Min Violation: int + Baseline Sizer Lo: int + Baseline Sizer Hi: int + Baseline Incand Lo: int + Baseline Incand Hi: int + Bandwidth Sizer Hi: int + Bandwidth Sizer Lo: int + Bandwidth Incand Lo: int + Bandwidth Incand Hi: int + ABD-0408 HK ADCs min: int + ABD-0436 HK ADCs min: int + ABD-0408 HK ADCs max: int + ABD-0436 HK ADCs max: int + Incand Particle Conc (cts/ccm): float + Scattering Particle Conc (cts/ccm): float + Incand Mass Conc (fg/sccm): float + Scattering Mass Conc (fg/sccm): float + Sheath Flow Set Point: int + Sample Flow Set Point: int + Laser Temp Set Point: int + Laser Current Set Point: float + Spare 4 Set Point: int + Spare 5 Set Point: int + PMT HV Set Point: float + Particle Density (g/ccm): float + PbP Packet Time: float + Scatter Bin 1: int + Scatter Bin 2: int + Scatter Bin 3: int + Scatter Bin 4: int + Scatter Bin 5: int + Scatter Bin 6: int + Scatter Bin 7: int + Scatter Bin 8: int + Scatter Bin 9: int + Scatter Bin 10: int + Scatter Bin 11: int + Scatter Bin 12: int + Scatter Bin 13: int + Scatter Bin 14: int + Scatter Bin 15: int + Scatter Bin 16: int + 
Scatter Bin 17: int + Scatter Bin 18: int + Scatter Bin 19: int + Scatter Bin 20: int + Incand Bin 1: int + Incand Bin 2: int + Incand Bin 3: int + Incand Bin 4: int + Incand Bin 5: int + Incand Bin 6: int + Incand Bin 7: int + Incand Bin 8: int + Incand Bin 9: int + Incand Bin 10: int + Incand Bin 11: int + Incand Bin 12: int + Incand Bin 13: int + Incand Bin 14: int + Incand Bin 15: int + Incand Bin 16: int + Incand Bin 17: int + Incand Bin 18: int + Incand Bin 19: int + Incand Bin 20: int +pbp_canonical_schema: + Time (sec): float + Packet Time Stamp: float + Flag: float + Dropped Records: float + Record Count: float + Record Size: float + Particle Time Stamp: float + Particle Flags: float + Scatter relPeak: float + Scatter Transit Time: float + Scatter Peak Time: float + Scatter FWHM: float + Scatter Size (nm): float + Incand relPeak: float + Incand Transit Time: float + Incand Peak Time: float + Incand FWHM: float + Incand Delay: float + Incand Mass (fg): float + Reserved: float +hk_canonical_schema: + Time Stamp: datetime + Time (sec): float + Sample Flow Controller Read (sccm): float + Sample Flow Controller Read (vccm): float + Time Stamp (UTC sec): float + Elapsed Time: float + Error Code: float + Packet Time Stamp: float + Laser TEC Temp (C): float + Crystal TEC Temp (C): float + Inlet Air Temp (C): float + Computer Heatsink Temp (C): float + Laser Heatsink Temp (C): float + Outlet Air Temp (C): float + YAG Output Monitor (V): float + Cavity Pressure (hPa): float + Laser Driver Power Monitor (uA): float + Laser Driver Current Limit Monitor (A): float + Laser Driver Current Monitor (A): float +pbp_column_mapping: + Time (sec): Time (sec) + Packet Time Stamp: Packet Time Stamp + Flag: Flag + Dropped Records: Dropped Records + Record Count: Record Count + Record Size: Record Size + Particle Time Stamp: Particle Time Stamp + Particle Flags: Particle Flags + Scatter relPeak: Scatter relPeak + Scatter Transit Time: Scatter Transit Time + Scatter Peak Time: 
Scatter Peak Time + Scatter FWHM: Scatter FWHM + Scatter Size (nm): Scatter Size (nm) + Incand relPeak: Incand relPeak + Incand Transit Time: Incand Transit Time + Incand Peak Time: Incand Peak Time + Incand FWHM: Incand FWHM + Incand Delay: Incand Delay + Incand Mass (fg): Incand Mass (fg) + Reserved: Reserved +hk_column_mapping: + Time Stamp: Time Stamp + Time (sec): Time (sec) + Sample Flow Controller Read (sccm): Sample Flow Controller Read (sccm) + Sample Flow Controller Read (vccm): Sample Flow Controller Read (vccm) + Time Stamp (UTC sec): Time Stamp (UTC sec) + Elapsed Time: Elapsed Time + Error Code: Error Code + Packet Time Stamp: Packet Time Stamp + Laser TEC Temp (C): Laser TEC Temp (C) + Crystal TEC Temp (C): Crystal TEC Temp (C) + Inlet Air Temp (C): Inlet Air Temp (C) + Computer Heatsink Temp (C): Computer Heatsink Temp (C) + Laser Heatsink Temp (C): Laser Heatsink Temp (C) + Outlet Air Temp (C): Outlet Air Temp (C) + YAG Output Monitor (V): YAG Output Monitor (V) + Cavity Pressure (hPa): Cavity Pressure (hPa) + Laser Driver Power Monitor (uA): Laser Driver Power Monitor (uA) + Laser Driver Current Limit Monitor (A): Laser Driver Current Limit Monitor (A) + Laser Driver Current Monitor (A): Laser Driver Current Monitor (A) diff --git a/tests/instrument_config.yaml b/config/my_instrument_settings.yaml similarity index 55% rename from tests/instrument_config.yaml rename to config/my_instrument_settings.yaml index eae4592..f2f8f25 100644 --- a/tests/instrument_config.yaml +++ b/config/my_instrument_settings.yaml @@ -1,15 +1,20 @@ -instrument_parameters: - ScattTransitMin: 10.0 - ScattTransitMax: 65535.0 - ScattFWHMMin: 30.0 - ScattFWHMMax: 65535.0 - ScattInterTimeMin: 10.0 - IncTransitMin: 5.0 - IncTransitMax: 65535.0 - IncFWHMMin: 30.0 - IncFWHMMax: 65535.0 - IncInterTimeMin: 10.0 - SaveRate: 1.0 -Signal_saturation: - IncSatPoint: 1700000000.0 - ScattSatPoint: 1700000000.0 +metadata: + source_ini_file: 
C:\Users\Baccandr\Documents\SP2XR_code\SP2XR_code\tests\data\SP2XR_orig_files\20190508\20190508172218\20190508172218 + Calibration 20181005.ini + generated_on: '2025-09-29T22:52:49.725057' + generated_by: sp2xr_generate_config.py +instrument_parameters: + ScattTransitMin: 10.0 + ScattTransitMax: 65535.0 + ScattFWHMMin: 30.0 + ScattFWHMMax: 65535.0 + ScattInterTimeMin: 10.0 + IncTransitMin: 5.0 + IncTransitMax: 65535.0 + IncFWHMMin: 30.0 + IncFWHMMax: 65535.0 + IncInterTimeMin: 10.0 + SaveRate: 1.0 +Signal_saturation: + IncSatPoint: 1700000000.0 + ScattSatPoint: 1700000000.0 diff --git a/meta_files/20190417210227 Calibration 20181005.ini b/meta_files/20190417210227 Calibration 20181005.ini deleted file mode 100644 index 096fba9..0000000 --- a/meta_files/20190417210227 Calibration 20181005.ini +++ /dev/null @@ -1,266 +0,0 @@ -[Custom] -Display Tab=TRUE -Display Names=<2> -Display Names 0=set 1 -Display Names 1=set 2 -Sets=<2> -Sets 0.Cluster.Graph 1=<8> -Sets 0.Cluster.Graph 1 0.Plot.Channel=+5V Laser Rail (V) -Sets 0.Cluster.Graph 1 0.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 1 1.Plot.Channel= +5V Rail (V) -Sets 0.Cluster.Graph 1 1.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 1 2.Plot.Channel=+12V Rail (V) -Sets 0.Cluster.Graph 1 2.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 1 3.Plot.Channel=3.3V Iso Rail (V) -Sets 0.Cluster.Graph 1 3.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 1 4.Plot.Channel=UPS Output (V) -Sets 0.Cluster.Graph 1 4.Plot.Left/Right=TRUE -Sets 0.Cluster.Graph 1 5.Plot.Channel=Inlet Air Temp (C) -Sets 0.Cluster.Graph 1 5.Plot.Left/Right=TRUE -Sets 0.Cluster.Graph 1 6.Plot.Channel=Crystal TEC Temp (C) -Sets 0.Cluster.Graph 1 6.Plot.Left/Right=TRUE -Sets 0.Cluster.Graph 1 7.Plot.Channel=Laser Heatsink Temp (C) -Sets 0.Cluster.Graph 1 7.Plot.Left/Right=TRUE -Sets 0.Cluster.Graph 2=<8> -Sets 0.Cluster.Graph 2 0.Plot.Channel=Laser TEC Temp (C) -Sets 0.Cluster.Graph 2 0.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 2 1.Plot.Channel=Crystal TEC Temp (C) 
-Sets 0.Cluster.Graph 2 1.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 2 2.Plot.Channel=Inlet Air Temp (C) -Sets 0.Cluster.Graph 2 2.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 2 3.Plot.Channel=Computer Heatsink Temp (C) -Sets 0.Cluster.Graph 2 3.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 2 4.Plot.Channel=Laser Heatsink Temp (C) -Sets 0.Cluster.Graph 2 4.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 2 5.Plot.Channel=Outlet Air Temp (C) -Sets 0.Cluster.Graph 2 5.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 2 6.Plot.Channel=Battery Temp (C) -Sets 0.Cluster.Graph 2 6.Plot.Left/Right=FALSE -Sets 0.Cluster.Graph 2 7.Plot.Channel=Laser TEC Sense -Sets 0.Cluster.Graph 2 7.Plot.Left/Right=TRUE -Sets 1.Cluster.Graph 1=<4> -Sets 1.Cluster.Graph 1 0.Plot.Channel=Threshold Crossing Events -Sets 1.Cluster.Graph 1 0.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 1 1.Plot.Channel=Dual Qualified Scatter and Incand Particles -Sets 1.Cluster.Graph 1 1.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 1 2.Plot.Channel=Qualified Scatter Only Particles -Sets 1.Cluster.Graph 1 2.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 1 3.Plot.Channel=Qualified Incand Only Particles -Sets 1.Cluster.Graph 1 3.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 2=<8> -Sets 1.Cluster.Graph 2 0.Plot.Channel=Baseline Sizer Lo -Sets 1.Cluster.Graph 2 0.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 2 1.Plot.Channel=Baseline Sizer Hi -Sets 1.Cluster.Graph 2 1.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 2 2.Plot.Channel=Baseline Incand Lo -Sets 1.Cluster.Graph 2 2.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 2 3.Plot.Channel=Baseline Incand Hi -Sets 1.Cluster.Graph 2 3.Plot.Left/Right=FALSE -Sets 1.Cluster.Graph 2 4.Plot.Channel=Bandwidth Sizer Hi -Sets 1.Cluster.Graph 2 4.Plot.Left/Right=TRUE -Sets 1.Cluster.Graph 2 5.Plot.Channel=Bandwidth Sizer Lo -Sets 1.Cluster.Graph 2 5.Plot.Left/Right=TRUE -Sets 1.Cluster.Graph 2 6.Plot.Channel=Bandwidth Incand Lo -Sets 1.Cluster.Graph 2 6.Plot.Left/Right=TRUE -Sets 1.Cluster.Graph 2 
7.Plot.Channel=Bandwidth Incand Hi -Sets 1.Cluster.Graph 2 7.Plot.Left/Right=TRUE -[Raw Options] -byte 0: data mux=High Dynamic Range Traces -Raw Data Particle Selection=First Scatter -Scatter relPeak=0 -Incand relPeak=0 -Inter-raw Period (ms)=100 -leader sample count=400 -footer sample count=400 -[Scatter Parameters] -Graph=Counts -X Mode=Size -Norm?=FALSE -Cumulative=FALSE -[Versions] -Instrument Name=SP2XR -SP2 Version=2.01.01.19 -Acq Version=2.00.00.00 -Last Date Updated=4/11/2019 6:24:36 AM -[Trigger Settings] -Scatter Transit Time Min=10 -Scatter Transit Time Max=65535 -Scatter FWHM Min=30 -Scatter FWHM Max=65535 -Scatter Inter Particle Time Min=10 -Incand Transit Time Min=5 -Incand Transit Time Max=65535 -Incand FWHM Min=30 -Incand FWHM Max=65535 -Incand Inter Particle Time Min=10 -Paired Particle Delay Max=10 -Scatter Threshold Min=36100 -Scatter Hysteresis Min=2703 -Incand Threshold Min=50700 -Incand Hysteresis Min=5394 -Scatter Threshold Max=559000000 -Scatter Hysteresis Max=0 -Incand Threshold Max=2147483647 -Incand Hysteresis Max=0 -Forced Trigger=FALSE -Forced Trigger Interval(ms)=1000 -[# Samples S] -# Samples S=0 -[Program] -Data File Path=D:\DMT\SP2XR Data -Restart Files=FALSE -Graph 0 Left=YAG Output Monitor (V) -Graph 0 Right=Laser Driver Current Monitor (A) -Graph 1 Left=Scattering Particle Conc (cts/ccm) -Graph 1 Right=Incand Particle Conc (cts/ccm) -Control Cycle Time=0 -NTP Server= -Write File?=FALSE -Graph Backgrounds=16448250 -Graph 2 Left=Sheath Flow Controller Read (sccm) -Graph 2 Right=Sample Flow Controller Read (sccm) -Description= -Serial Number=0001 -2 or 3 Graphs=TRUE -Time Range=12 Hours -OSDS Format= -Num to Avg=0 -Global 2=0 -Global 3=0 -Global 4=0 -Global 5=0 -Shut Down Sequence= -Crisis Shut Down Seq=turn off pump and laser -Write SP2b Data File=TRUE -Write HK File=TRUE -Write Raw Binary Data=TRUE -TabChannelNum=0 -OptimizeChannelNum=0 -Write HDF5 File=TRUE -NumParticlesPerHDF5File=100000 -Laser Temp Set=29 -Laser Current 
Set=1.9 -Spare 4 Set=0 -Spare 5 Set=0 -PMT HV Set=0.46 -Interface Board Scaling=<24> -Interface Board Scaling 0=1/(0.000849+0.000261*ln(10000/(65536/VAR-1))+0.000000125*ln(10000/(65536/VAR-1))^3)-273.15 -Interface Board Scaling 1= -Interface Board Scaling 2=(1.0 / (1.1135E-3 + 2.368E-4 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001)))) + 7.396E-8 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001))))^3)) - 273.15 -Interface Board Scaling 3=(1.0 / (1.1135E-3 + 2.368E-4 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001)))) + 7.396E-8 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001))))^3)) - 273.15 -Interface Board Scaling 4=(1.0 / (1.1135E-3 + 2.368E-4 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001)))) + 7.396E-8 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001))))^3)) - 273.15 -Interface Board Scaling 5=(1.0 / (1.1135E-3 + 2.368E-4 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001)))) + 7.396E-8 * (ln(1E4 * (1-(VAR / (65536.0 * 1.001))) / (VAR / (65536.0 * 1.001))))^3)) - 273.15 -Interface Board Scaling 6=0.0000625*VAR -Interface Board Scaling 7=VAR/72+105.55 -Interface Board Scaling 8=0.125*VAR -Interface Board Scaling 9=0.000125*VAR -Interface Board Scaling 10=0.000125*VAR -Interface Board Scaling 11=2.01*0.0000625 *VAR -Interface Board Scaling 12= -Interface Board Scaling 13=0.000125*VAR -Interface Board Scaling 14=0.000125*VAR -Interface Board Scaling 15=4.57*0.0000625*VAR -Interface Board Scaling 16=0.0152587890625*VAR -Interface Board Scaling 17=1/(0.000894+0.00025*ln((1662.22* VAR)/(39897.3 - VAR))+0.0000002*ln((1662.22*VAR)/(39897.3 -VAR))^3)-273.15 -Interface Board Scaling 18=(69.8+11.5) /11.5*0.0000625*VAR -Interface Board Scaling 19=4.57*0.0000625*VAR -Interface Board Scaling 20=0.000125*VAR -Interface Board Scaling 21=1.1*0.0000625 *VAR -Interface Board Scaling 22= -Interface Board Scaling 23= -ABD 0408 Scaling=<16> -ABD 
0408 Scaling 0= -ABD 0408 Scaling 1= -ABD 0408 Scaling 2= -ABD 0408 Scaling 3= -ABD 0408 Scaling 4= -ABD 0408 Scaling 5=VAR*0.00625 -ABD 0408 Scaling 6=7.98*(6.25E-5*VAR) -ABD 0408 Scaling 7=-84.962*(6.25E-5*VAR-1.8639) -ABD 0408 Scaling 8= -ABD 0408 Scaling 9= -ABD 0408 Scaling 10= -ABD 0408 Scaling 11= -ABD 0408 Scaling 12= -ABD 0408 Scaling 13= -ABD 0408 Scaling 14= -ABD 0408 Scaling 15= -Save Every Nth Particle=1 -zip files=TRUE -Particle Density (g/cc)=1.8 -Pump Start-Up State=FALSE -[Detector DAC Settings] -Scatter unused A=23790 -Scatter unused B=65535 -[Incand Parameters] -Graph=Counts -X Mode=Mass -Norm?=FALSE -Cumulative=FALSE -[Control] -Alarms=<3> -Alarms 0.Alarm.Name=TurnPumpON -Alarms 0.Alarm.Channel=Elapsed Time -Alarms 0.Alarm.Condition=> -Alarms 0.Alarm.Threshold=0 -Alarms 0.Alarm.Action=Turn Pump On -Alarms 0.Alarm.Hysteresis=0 -Alarms 0.Alarm.Target Channel= -Alarms 0.Alarm.Set Value=0 -Alarms 0.Alarm.Min Time=0 -Alarms 0.Alarm.Sequence=turn off pump and laser -Alarms 0.Alarm.Target Alarm=TurnPumpON -Alarms 1.Alarm.Name=Turn Laser On -Alarms 1.Alarm.Channel=Elapsed Time -Alarms 1.Alarm.Condition=> -Alarms 1.Alarm.Threshold=5 -Alarms 1.Alarm.Action=Turn Laser On -Alarms 1.Alarm.Hysteresis=0 -Alarms 1.Alarm.Target Channel= -Alarms 1.Alarm.Set Value=0 -Alarms 1.Alarm.Min Time=0 -Alarms 1.Alarm.Sequence= -Alarms 1.Alarm.Target Alarm=Turn Laser On -Alarms 2.Alarm.Name=StartRecording -Alarms 2.Alarm.Channel=Elapsed Time -Alarms 2.Alarm.Condition=> -Alarms 2.Alarm.Threshold=10 -Alarms 2.Alarm.Action=Start Writing Data -Alarms 2.Alarm.Hysteresis=0 -Alarms 2.Alarm.Target Channel= -Alarms 2.Alarm.Set Value=0 -Alarms 2.Alarm.Min Time=0 -Alarms 2.Alarm.Sequence= -Alarms 2.Alarm.Target Alarm=Turn Laser On -Sequences=<0> -Timers=<0> -[Pump] -Pump=TRUE -[# Samples I] -# Samples I=0 -[SampleFlow] -SampleFlow (sccm)=30 -[SheathFlow] -SheathFlow (sccm)=600 -[Polling Interval] -HK Stream Interval (ms)=1000 -PbP Stream Interval (ms)=1000 -[Fans Settings] -Case Fan 
Mode=normal -Case Fan On Threshold=35 -Case Fan Off Threshold=33 -Laser Fan Mode=forced off -Laser Fan On Threshold=27 -Laser Fan Off Threshold=24 -[Channel Order] -Channel Order=<0> -Digits=<0> -[Streaming Data] -Port=0 -Baud Rate=0 -Channels=<0> -Bus=Serial Port -[Calculated Channels] -Calculated Channels=<0> -[Calculations] -Calculations=<0> diff --git a/meta_files/config.yaml b/meta_files/config.yaml deleted file mode 100644 index 701e851..0000000 --- a/meta_files/config.yaml +++ /dev/null @@ -1,159 +0,0 @@ -pbp_schema: - Time (sec): float - Packet Time Stamp: float - Flag: float - Dropped Records: float - Record Count: float - Record Size: float - Particle Time Stamp: float - Particle Flags: float - Scatter relPeak: float - Scatter Transit Time: float - Scatter Peak Time: float - Scatter FWHM: float - Scatter Size (nm): float - Incand relPeak: float - Incand Transit Time: float - Incand Peak Time: float - Incand FWHM: float - Incand Delay: float - Incand Mass (fg): float - Reserved: float -hk_schema: - Time Stamp: datetime - Time (sec): float - Time Stamp (UTC sec): float - Elapsed Time: float - Error Code: float - Packet Time Stamp: float - Laser TEC Temp (C): float - Crystal TEC Temp (C): float - Inlet Air Temp (C): float - Computer Heatsink Temp (C): float - Laser Heatsink Temp (C): float - Outlet Air Temp (C): float - YAG Output Monitor (V): float - Cavity Pressure (hPa): float - Laser Driver Power Monitor (uA): float - Laser Driver Current Limit Monitor (A): float - Laser Driver Current Monitor (A): float - Laser TEC Sense: float - Laser Over Temp (On/Off): float - +5V Laser Rail (V): float - ' +5V Rail (V)': float - +12V Rail (V): float - High Voltage (V): float - Battery Temp (C): float - UPS Output (V): float - 12V Iso Rail (V): float - 5V Iso Rail (V): float - 3.3V Iso Rail (V): float - Spare 22: float - Spare 23: float - 408 Board Spare 0: float - 408 Board Spare 1: float - 408 Board Spare 2: float - 408 Board Spare 3: float - 408 Board Spare 4: 
float - Purge Flow Monitor (sccm): float - System Input Voltage (V): float - Board Temperature (C): float - 408 Board Spare 8: float - 408 Board Spare 9: float - 408 Board Spare 10: float - 408 Board Spare 11: float - 408 Board Spare 12: float - 408 Board Spare 13: float - 408 Board Spare 14: float - 408 Board Spare 15: float - Sheath Flow Controller Read (vccm): float - Sheath Flow Controller Read (sccm): float - Sheath Flow Controller Pressure (psia): float - Sheath Flow Controller Temperature (C): float - Sample Flow Controller Read (vccm): float - Sample Flow Controller Read (sccm): float - Sample Flow Controller Pressure (psia): float - Sample Flow Controller Temperature (C): float - Fan 1 (RPM): float - Fan 2 (RPM): float - Laser Fan (RPM): float - Spare tach: float - Threshold Crossing Events: float - Dual Qualified Scatter and Incand Particles: float - Qualified Scatter Only Particles: float - Qualified Incand Only Particles: float - Disqualified Due to Scatter Saturation: float - Disqualified Due to Scatter Transit Time Min: float - Disqualified Due to Scatter Transit Time Max: float - Disqualified Due to Scatter FWHM Min: float - Disqualified Due to Scatter FWHM Max: float - Scatter Inter Part Period Min Violation: float - Disqualified Due to Incand Saturation: float - Disqualified Due to Incand Transit Time Min: float - Disqualified Due to Incand Transit Time Max: float - Disqualified Due to Incand FWHM Min: float - Disqualified Due to Incand FWHM Max: float - Incand Inter Part Period Min Violation: float - Baseline Sizer Lo: float - Baseline Sizer Hi: float - Baseline Incand Lo: float - Baseline Incand Hi: float - Bandwidth Sizer Hi: float - Bandwidth Sizer Lo: float - Bandwidth Incand Lo: float - Bandwidth Incand Hi: float - ABD-0408 HK ADCs min: float - ABD-0436 HK ADCs min: float - ABD-0408 HK ADCs max: float - ABD-0436 HK ADCs max: float - Incand Particle Conc (cts/ccm): float - Scattering Particle Conc (cts/ccm): float - Incand Mass Conc (fg/sccm): 
float - Scattering Mass Conc (fg/sccm): float - Sheath Flow Set Point: float - Sample Flow Set Point: float - Laser Temp Set Point: float - Laser Current Set Point: float - Spare 4 Set Point: float - Spare 5 Set Point: float - PMT HV Set Point: float - Particle Density (g/ccm): float - PbP Packet Time: float - Scatter Bin 1: float - Scatter Bin 2: float - Scatter Bin 3: float - Scatter Bin 4: float - Scatter Bin 5: float - Scatter Bin 6: float - Scatter Bin 7: float - Scatter Bin 8: float - Scatter Bin 9: float - Scatter Bin 10: float - Scatter Bin 11: float - Scatter Bin 12: float - Scatter Bin 13: float - Scatter Bin 14: float - Scatter Bin 15: float - Scatter Bin 16: float - Scatter Bin 17: float - Scatter Bin 18: float - Scatter Bin 19: float - Incand Bin 1: float - Incand Bin 2: float - Incand Bin 3: float - Incand Bin 4: float - Incand Bin 5: float - Incand Bin 6: float - Incand Bin 7: float - Incand Bin 8: float - Incand Bin 9: float - Incand Bin 10: float - Incand Bin 11: float - Incand Bin 12: float - Incand Bin 13: float - Incand Bin 14: float - Incand Bin 15: float - Incand Bin 16: float - Incand Bin 17: float - Incand Bin 18: float - Incand Bin 19: float diff --git a/meta_files/generate_config.py b/meta_files/generate_config.py deleted file mode 100644 index 41bafef..0000000 --- a/meta_files/generate_config.py +++ /dev/null @@ -1,228 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import yaml -import os -from pathlib import Path -from typing import Any - - -def infer_general_dtype(dtype: Any) -> str: - """Infer general data type from pandas dtype.""" - if pd.api.types.is_integer_dtype(dtype): - return "int" - elif pd.api.types.is_float_dtype(dtype): - return "float" - elif pd.api.types.is_datetime64_any_dtype(dtype): - return "datetime" - else: - return "string" - - -def load_schema(input_file: str | Path) -> dict[str, str]: - """Load schema from input file by inferring column types.""" - ext = 
os.path.splitext(str(input_file))[1].lower() - - if ext in [".csv", ".zip"]: - df = pd.read_csv(input_file, nrows=100) - elif ext == ".parquet": - df = pd.read_parquet(input_file) - else: - raise ValueError(f"Unsupported file format: {ext}") - - schema = {col: infer_general_dtype(dtype) for col, dtype in df.dtypes.items()} - return schema - - -def get_canonical_schemas() -> dict[str, dict[str, str]]: - """Return canonical column schemas for SP2XR data.""" - pbp_canonical = { - "Time (sec)": "float", - "Packet Time Stamp": "float", - "Flag": "float", - "Dropped Records": "float", - "Record Count": "float", - "Record Size": "float", - "Particle Time Stamp": "float", - "Particle Flags": "float", - "Scatter relPeak": "float", - "Scatter Transit Time": "float", - "Scatter Peak Time": "float", - "Scatter FWHM": "float", - "Scatter Size (nm)": "float", - "Incand relPeak": "float", - "Incand Transit Time": "float", - "Incand Peak Time": "float", - "Incand FWHM": "float", - "Incand Delay": "float", - "Incand Mass (fg)": "float", - "Reserved": "float", - } - - hk_canonical = { - "Time Stamp": "datetime", - "Time (sec)": "float", - "Sample Flow Controller Read (sccm)": "float", - "Sample Flow Controller Read (vccm)": "float", - # Core HK columns that are commonly used - "Time Stamp (UTC sec)": "float", - "Elapsed Time": "float", - "Error Code": "float", - "Packet Time Stamp": "float", - "Laser TEC Temp (C)": "float", - "Crystal TEC Temp (C)": "float", - "Inlet Air Temp (C)": "float", - "Computer Heatsink Temp (C)": "float", - "Laser Heatsink Temp (C)": "float", - "Outlet Air Temp (C)": "float", - "YAG Output Monitor (V)": "float", - "Cavity Pressure (hPa)": "float", - "Laser Driver Power Monitor (uA)": "float", - "Laser Driver Current Limit Monitor (A)": "float", - "Laser Driver Current Monitor (A)": "float", - # ... 
(other HK columns can be added as needed) - } - - return {"pbp_canonical": pbp_canonical, "hk_canonical": hk_canonical} - - -def generate_combined_config( - pbp_file: str | Path, hk_file: str | Path, output_file: str = "config.yaml" -) -> None: - """Generate config file with both schema definitions and column mappings.""" - config = { - "pbp_schema": load_schema(pbp_file), - "hk_schema": load_schema(hk_file), - } - - with open(output_file, "w") as f: - yaml.dump(config, f, sort_keys=False) - - print(f"Unified config saved to: {output_file}") - - -def generate_mapping_template( - pbp_file: str | Path, - hk_file: str | Path, - output_file: str = "config_with_mapping.yaml", -) -> None: - """ - Generate enhanced config with column mapping templates. - - This creates a config file that allows users to map their instrument-specific - column names to the canonical column names used in the main processing pipeline. - """ - # Load actual file schemas - pbp_schema = load_schema(pbp_file) - hk_schema = load_schema(hk_file) - - # Get canonical schemas - canonical_schemas = get_canonical_schemas() - - # Create column mapping templates - pbp_mapping = {} - hk_mapping = {} - - # For PbP: map file columns to canonical columns - for canonical_col in canonical_schemas["pbp_canonical"]: - # Try to find exact match first - matching_file_col = None - for file_col in pbp_schema.keys(): - if file_col.lower() == canonical_col.lower(): - matching_file_col = file_col - break - - # If exact match found, use it; otherwise leave as template - pbp_mapping[canonical_col] = ( - matching_file_col - or f"YOUR_COLUMN_NAME_FOR_{canonical_col.replace(' ', '_').replace('(', '').replace(')', '').upper()}" - ) - - # For HK: map file columns to canonical columns - for canonical_col in canonical_schemas["hk_canonical"]: - matching_file_col = None - for file_col in hk_schema.keys(): - if file_col.lower() == canonical_col.lower(): - matching_file_col = file_col - break - - hk_mapping[canonical_col] = ( - 
matching_file_col - or f"YOUR_COLUMN_NAME_FOR_{canonical_col.replace(' ', '_').replace('(', '').replace(')', '').upper()}" - ) - - # Build enhanced config - config = { - "# INSTRUCTIONS": [ - "This config file contains both schema definitions and column mappings.", - "1. The *_schema sections define the data types for your input files.", - "2. The *_column_mapping sections map your file columns to canonical names.", - "3. Replace placeholder values (YOUR_COLUMN_NAME_FOR_*) with actual column names from your files.", - "4. If your file doesn't have a particular canonical column, set it to null or remove the line.", - "5. The output parquet files will use the canonical column names for consistency.", - ], - "pbp_schema": pbp_schema, - "hk_schema": hk_schema, - "pbp_canonical_schema": canonical_schemas["pbp_canonical"], - "hk_canonical_schema": canonical_schemas["hk_canonical"], - "pbp_column_mapping": pbp_mapping, - "hk_column_mapping": hk_mapping, - } - - with open(output_file, "w") as f: - yaml.dump(config, f, sort_keys=False, default_flow_style=False) - - print(f"Enhanced config with column mapping saved to: {output_file}") - print("\nNext steps:") - print( - "1. Open the config file and replace placeholder column mappings with your actual column names" - ) - print( - "2. Remove or set to null any canonical columns that don't exist in your data" - ) - print("3. Use this config file with the updated CSV to Parquet conversion process") - - -def apply_column_mapping( - df: pd.DataFrame, column_mapping: dict[str, str | None] -) -> pd.DataFrame: - """ - Apply column name mapping to standardize column names. 
- - Parameters - ---------- - df : pd.DataFrame - Input dataframe with instrument-specific column names - column_mapping : dict[str, str | None] - Mapping from canonical names to file column names - - Returns - ------- - pd.DataFrame - DataFrame with standardized column names - """ - # Create reverse mapping: file_column_name -> canonical_name - reverse_mapping = {} - for canonical_name, file_column in column_mapping.items(): - if ( - file_column - and file_column in df.columns - and not file_column.startswith("YOUR_COLUMN_NAME_FOR_") - ): - reverse_mapping[file_column] = canonical_name - - # Rename columns using reverse mapping - df_renamed = df.rename(columns=reverse_mapping) - - return df_renamed - - -# Example usage -if __name__ == "__main__": - # Legacy function for backward compatibility - # generate_combined_config("pbp_meta.parquet", "hk_meta.parquet") - - # New enhanced function - pbp_tmp_file = "/data/user/bertoz_b/merlin6data/SP2XR_code/tests/data/mini_SP2XR_PbP_20190409110737_x0001.zip" - hk_tmp_file = "/data/user/bertoz_b/merlin6data/SP2XR_code/tests/data/mini_SP2XR_hk_20190409110737_x0001.zip" - generate_mapping_template(pbp_tmp_file, hk_tmp_file) diff --git a/meta_files/hk_meta.parquet b/meta_files/hk_meta.parquet deleted file mode 100644 index 1fc1eb7c08f4b785e37ca658b18696df5cb32a5b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 80166 zcmeHw3y37wdG1Pa97pj>vQ}O>vfWwRm33CqcK6Ke?l?D(s($xO_jLF4V|pA%>DPAm zOuy8R>FM!Rq9}@DD~jT56h%>dg$c%27-NJn!d#3oCI};h2}YQ3G2&v35ypfNgb8K| z_dj(`Rh_Cj=X7^%bT5S{QTsTR`?1>u*I-idz`Xh^2F%@|;cPUnba+5t`4s`s58q(`-IeHdfGN2RG}-C- zmPko{`l|-a2SWYsU@`{I;*H99)b1U!;9mPx0ac&*H3RPbSV|(SpWY$B`ocR6toQ9q z#sU@}d#3>BrFR)P_mt(B53qnuU#j10k3p9%6zST#{w@L3EAKX-&W0*S?eWZ)88$|v z$*|$O(8^S3q!-^UK>FYBF_1ot1r;8)k3jaB{uF-e#7$%x^PBGxU_N%Ifhn0VLzCgK z(Hr}=+ub(BxC#2l?-f9Q<1Pc*$As1iHi15WmjLuf?=wK(gF#V~+HLekbmRW#_X#k5 zdbfdb4rA;fDdKjp)4Q~D?HVn>*X|a8J${b?cGty?BH=VwX?HAI@{_XynCtI1VBY0J 
zI1Gfh?hzn7dyYc57zi>E{`36;gy-)y5Z)VaH(S19zca2hK}}8Se&d_~g0OYH`Zh*YIklEFmpP7!&{kj0+>-QND??uu`jQT8@K6{@4g8Gil4g3Pb^?3vf7%wP2h zGQaX0M&>)${a1ZTzcQ@*Fb#eR<;C9+pnU(l03|@7eEqxt<&BRBP-t;{`y&FBpMF$; zLd)f~j|xzJ{xJc{I>phyeoTP!*l!vr_hu$TXl&6=e`?tBQLWacjnR)K0on@}4778* zFi>c8eTg3GxY#ZE(F+2wZ@@QgEO-`IIIkzTZH(%c{-+TAyFJUjXwT4~SrBg?{}30nDS9L@=~Q|L8#h z%+r1m46V}ten|lH`G5$9*6IK93t+w+6v1pzHT6_b0P~GC1Lpj03x=01+%`h}-gwyW zbfC-5H7a%Ag(FKFd45d*`;+xWVXaN?yXykjXD%DC5{Fu2*f$upP2;|wZwR1Yy)qw~ z-!i{)MF9QirU89^7v_l_Vt?k!_(xX-uwVVeqOevOf9?|k*dPA30V{E+<;(b6zb$~i z{yX!b`7(a%zX_nf`@078uSF|8U%+=Emr2n^{mZ{A%Kv+o{6TyE-xKBkeIq{^G9xN% zH2l5*<6mS0;||>Qk1Mqn)s(*`3$lMtF|yyAZ^7)R)$i1Op<#anQ^uw*as)Gt5ry`W zB0&3T$UwUop1@={3%}ae+OS5jTT_RezDlnSJzov|lc@~H?OLZnROex$RJ*ih~~{1qHmkZ4~fP5eLyy zJmoV}{@k_z_|4Q(z$S`cO$mUX-!XuHQ&Z$b4=t`Bb@OE!!*;)pNc9!l{f@OhZ=?m- z-_ID>A7e>_C?Zo+zMc^vKan$#FA9(ait=Vwfd9&_6MU1@U)&YoKT$C7KO&%?p(bzU z1-LI4m%%mB`&?0g`;(G^E0J1TTuHuL5CXO|4;z)vxf|r_bJfFVQN^y`QAeUoEv{6!ZES+#vciAo~jx+ zcgsrn0Y92p#kYKNI5hHgO`XEORRlmU)NDX&mpMR>)&xK=)onmDMW)tvy)FRyPQwO7 zQ)B}AVnYD*`ZWXSL!_p(K?H0~^lH8fwejeZWhV3TH38%=ng--~(qEv_)O^?((TGrat$E2hXn(RTfP3Y@fcp?dr5N(X0|DgE4h_h&^jvEfb~G~mc5h5s zneQD6kRI*QNYwm?h4iD20O^^Yfpj-nm`3|5ETErv1wdcw8$e_`L|ckyVST1A!1~s} zz`7F;+t-;CePJL#dG%o%$_5kVD-R1$elRpp-kV9~eeog_q`j09l67#m>k$y77YaP*BnN`Zbj7GOMfWMI5Y8?z}eLZg>}emoH%JTc`U z(5pXh9tjYhJmw(K3qL=b3J{*2aS-S=pMO6VAY4D;AkfP_Z_NY<&wiSNaG9!$|2z>O zJpbQ02v;bC8=n>+yzmGIfnKD1^brBVKm9QWfnK0|{Er0)U-}aR;oT;Uq8BGW^Ctp` zuRKa^BDtPtJxFEECe&x?Zl)U~p z7lc;Pmme2_yz%E;5L!#W{pTW(H=p2w(2Dx%6C#lR_zNxwt*fv9g$U%vXSg7=y1wxl z5y%ryQXuan;{~e)eDkv+h^PLN13@d=zdb2}xc(Fef>yJ){!#?-++T4ZXeIl_QzD2L zp5{Q%D)#8pB8V^kH3x!LuqXan1o7q1aUf{*dg^l`h?o9`13@d-^}i88ynLMlL95nt z*F_ND{96j*U9@V^rtpQo6=D4IGdv916n^O$5yoqO$HSl%^s9d-!g&2z9tN$UFFz~7 z_~GC4FlZJ1*58XTe*7E{gVxbko)cmG>>qd-w32@BA4C{$JMN6gz@#>i&XXTOC^PTf2)2ey=;7m><_jF6&OZpYuMzc^~Aw=i#s8 z^n8~I{GICrfj_g|8aSj}<-Fg-?-|YnybnYO_8deC_76l1_76l3_75Al@C7!`7fGnD z_~3*KZf=VcTeea)5~u|j_6GDK2~D)J5iOg<8m;WaOj_xUT%28fL$+q|8|!8(eoJki 
z;uri4QT%5NV9 zNIz_PN4j9syUUC+t}xz_!G8EBXn^vFQV{E`eX%r6Nm=9h#Q^Gm{v`6c1T{F1O^ zeo6Q-za$KqUlNW?t43HD^eaL*{fdxHzam7_uL#NXD?%{+ijYgcBE-_K2&v{(2vTwM z$^;R;DV?T7bUICm=y#eD(eE@RqTgvsM8DINh<>Lj5&drS%sprBz3<)|pS^9lNHJ;2 z&@jnyESR)(@Rzh?w3p;8yi0Nz&m}ny;F26hY)Lm6q9xr-OqS#{2ur$&s4K~Nn3d#4 zT$SWvAeH1w^pmuFxRc~4zDaTv&?GsCV3Hh#EJ;pclq6?C zNzzhLNYYYaN0PHRBgt7{k>o6rNOBfBBsq%}l5Q?INV=(LAjx_7kK`=gM{*Y6BRP!d zksOBbNDgCmB!@vdlEbJR$zd3dO7F%Cs?J@7#SEwn^( zEtW)bGk8RDFIq%$J3K^kJ03)GD*!}tE8;_PF~mc17Ska)3*wL*Mr}wA!!#s^aT$`s zKn%%g11Jd&G0@CsU0@Csk0n+jz z0Mhb4|I+eK{*nW)e#wy=zvReoUvlKAFFEqemmIm`OOAZ+B}dNok|XbW$&ov~Ufm^%ez>i&W;fil= z{$~UI8F=JV@E+qmq=GjHtQiFkOzG4gGQY!gJv5#TP;uucy><n@{sTzxgV$`EaO5{k9-aR`Zo&!*E;>m`0~2FeCh)w7k0twxxB{F zxEvA~saz(8K3>;Og!QZ2h3iN2GKw~RbGcwD))yb@YcHRf#APy!BTSK-F*+64P6s=U z%Y?Ab6}FSYOXV^#_HYzCDeN>Z6T&~;*iMi*t;>XAov@vgY^O?)&}EVs$1U5*;UsdI z6!wYDc53+PTqcNl?z5d9ZZen2!AikBJlam!0vT3GqILj=?JVj!lnV2CK`?c~^@5~Y za;$_%{lsh@O{N4ZA;Es&Hjfr|N|%YUAKYyxhn>b{LQF@0+i9?UcDZ;{0HVEk+h>=H zw;nofr+}5hWg@JRm3AUnDO@JPeCWBI1p9ZF39!bD+DTxgaG40pk?VFE7ztb^!FFQ1 zod{+Um&wqMe7BcQe_2hTYurNBFD@5qizK#}&P=jmG6M6-P?M~fjNp7S)Fdk=gP{`E zO7TKB)D;->!jY8>HHpj8;GWZOuN+}gm&xKC_;065l+tCQm=_1wDdHw`nH;NnT(y(J zOtNA!Hs_O}CRs5V0snkDaGF&UvWU6$!rlfJFt|9)swJ|3$qjg6a{~()U7Tjs64_Wl zS=BTy6GERj%08yTPS|33yd+dTCSyrL^*r7XYB{E3VPYBiydl_fRLH^vGxE7jFcrAJ zu+A=&$4f$W{WFjGI9Gw-%VG39-Vn;ry@iNn&q&}h3C0C4_R`JYTrQYIB_rr4#qcs>sawqC`u9NDbgr$&6` z_-2U5xjiIayF_L=qUl_o6T7U0F|jL=tk&S>Tz6v^?h+Y2kT0=W%bE>lHt)z~HGA_w zk!9-$(X7m;Mf#!&%_My#dMywf-SiihHl4{kDbo#qNm5rLRZ@RT8;iyBxhz}#mMiVX zExRFOCRr^R@%CIYaFSJ#LF!|>FQCJeX2paE1wsf}kwna=T(SvarEy&%=0ez2!%DL< zLQD!^wmyst(vT*tniO&EtWr-{H?Y~;4lk92#Who824JYwvrRzi&VkX9Zs_Eak& zhUS?yGO^2Vm7S%2rIz_dI=hUB{tD8&EG;sG7u;`WCvveoE)z;5!?@ngPM|og%Y;$V zSctg!>D?xfy5n6xT-Yu&dMnM8%x!Y0?GIBq=I({wd@v<*nH;?Ai&ou@YxC^F;iXy$ zF`9d|iEXoVFg?tKE|Y{8rm5=m2CV^O1pL$E0odDbC6&BBMZ=s7{g)bqE&v0t7eO@8=3|AJq z_*M80()bJq7Tf0)a(`K@dg|vD7Jo%ty!v^C#WOtrXyL1$S6Dp5(|wB&Khk|rLQEUg2rczwQQeP$i3Litz@7@vnMumD;Z|d#fwedN(NbU@nU1Qk|7pd 
zyx82WWPn8%FE)598D7!Fi%s5223K_PVx#BU&`LdSY}lxb`$P2rteeqGNONvDrx)-zf~0VdqS3{GUS7Q zRS@`B%>=*8@V_!Gmo^T|dnx#{1^JaLiEd}IA1ki!1=B~>cyV0qrP@1frP`P!hZRVh zIOsPMy=1G}t#=abX*1E?XjMxElx9*d9gns{a=W&cYt?#LcqdQE^HD97xSR;fqeLiq zQjZm<2q(1{sH5-Y;O3}a3QYEbxa|Esz066u6zHI|)n0K_4WaV(2m7Vt!@VG*+*lN7 zkDlzU2K^QDsG~?*{*bp2lbHpeZ0SyYXOG8 zybJWkNA3O6#z8fh8&tbB6|Y###kTX!_B9LbwHc(3jzNAJU2?k^>x}nzm3%!G9RvTU z{BEUG9PLLpb-ofEz%CV<1&~>FPj0L7V(@`z0@_eoH4VsPT#I+E)#IHh@OBxnp&oOc zIw~_?_LqViCwrJ@Gf%ONKs9!}xgA1vZKYN2*Mh}^Eu4=_>sCD7DbEi1w04h=%6r*h z8uQGM`TnRHjQZ0DS(RB{X%lpgC#U9Cv{=Z_l#~5ZYJm07sux!I z%YCT3aN;y*KP-;Qdx+L-6YB&=7ue}Y8%@)7-L1wpTlH|+PugIhx~FW=dcSY3gN1Y; zx0jq%*AmUF(O&F2a9Etxx|_2^Bz?3GZH%k0=JH+A0i{(Ajyv^`Qm7!i5KV*L1g5 zHRum33+W8>)q6?+`c-I`tqzoP0)6$+#=CdCq{@3~2YisWa%=FAfx1{qkMe6pe>Dd7 zCK4S$9a`JEXZl|%MB|CbW}qGiod_xEYH(vvkAck&wMtyO)9EO=nSyu*9n3cKrOioY zFLzXHL%-SfxUWA~_`GQCAUCV*?W1;-t*+J4zjR#&vb$c_i;b>u9mWhW zMh2ZfhOub}{-t<~M7U!!n~WmqK&_YTRC}O%FovyIbnw>V)de1@_H}B!;W0@b>hFZi zj~L@UQs?%xp0}s`n%WjLyPB!CH#7U)@?0Gg=JTviObpY-GUzt84TWxWP=)c7W?Sv! 
zDRKH5YmT^z@oQUF>0W@|=GIBEl!zu=_;FMkM))#XPyqTX}%@&nXIcbY!_3X@fxvb z`kc(doDjw-N(|Ph*G%)lc>16m^mE7Sg*eRfgN0@e#j>@_7uOMw`%#{b;62yHb;RpE+H@c7 zUBUVW>HlGGs+XI=S_kZ&lSkm2f!6tVL57F0 zZgD!}9x-jEJ(*fOJt#Y_M`gCZmp*~@LE36TvF$-VCDoWd58>Dz zs(pKCd&IPH+tc#(7*BR;Yp^E=>nB@r1^fxfUfwOkE4Mu@&8zC$0{sfCTVsATUlH$k zjg?F5%@nNd7dlFszc0Mss3#HNO*8o?FpmHqY;y zxM@AKS9pF3YYOEco};qHLO>6!1uL*ul%&3tGeP@WZRF#XZGs=z0rH0ie+}?o&Vk=BjKz3!o5Nh74EvmgJ;051kj+8s z4F`DdE74SOVBM%31fS%Pl7;mMf2EX$F>uP;wTMm>=sX@Ey+Ui;K#w2$iEU1*@xvzg zNY21#_yES$+`8oPm=A7@kdMwS!dtT1g|P|LBabdd+cN2cUjGq}Y`n9zk;-2BnATO$ z-%cW)gFQk+#|o$}(RyC2b5!N6y&vWVwKbCtMC#ygUXNX+^*@jMKA5+GUy8>zW+7gS zafmtSr3bq2ZzM3dNuCQZ7N5YJL-!BPmpcEi@ZM_h_OuqOZwBvCyrz1K={Q6*B=WmT zx>8!_tc@iX(jP01XMOx)^#NX^tu*YF+-$vCaXjl^slGG`;{w%Z5%mkum%^O4a|C@4 zszdN&0b2#JEV`Xxj2#?f-3q#E^!aw1%HTB@>zvf+F+?O6Kw~}d@6h`*hMo7UUKgRy zE!(X3ztP+b#!n~62b9E6>FxhkXtyFd%X{2$Jl{?kI&*8=WrI%5&N%9Wb@@g1_jV(G z@HM}M`dvGZj%>1>N6-6L-F^Tq;ExAu?`W(Ka}n4B9#n(K#~K8I^S})hLpEq2AN|Pjk;mg+7R`wY&+i9( zFzqzAurI~w$XVB~eC#T3Y)12FOYfa6r(|ce@5N+oPw!dZn}zwwe!M=|kLUV^o{~Ce zj3tEWz0?s{L#YQlhiLz<5bRt=wnwNVg7jYM2*mlVRZ2OucWKjA#@ay$t?_7l8utDa z>+mdzc-k!m6KJVt1w}xk&mX5!2TK zyla~p?I++Xyd3Q(X^8KzMEpL9@{ zmpiP_)Yej}uSR($_xHQCqd9xYMn6)Do`7xBcnY;t6bowU7I9LsqN(4K`DbBHwuc`rr35&wLOJZNz)>BVT*;#^>D z)~08k>5NV@%-0ubtI!rvJXGLRX^sR6aJ~k_nA+sVXrD^D7*Y}tXKLfmJG)}+V<#fu zFN5~aV9$3eR-^k-7VY}}w;-Q)x)Muf(Vo2N>>u^)G5eWM1zuVY`yykH9oE^9zYdR9 z!~YD=PZmHAm-3x9|mjr*oT7Aujpe@hyg*e_c&fFX)T}Y zW3G>*+KO8DB&O@s>ORd|ceK^L=lNMrLa32-% zIKG%6SKj`UT?bCPpE~Bo6x(U_I1b7#*Tws)NZ^$FfaCPy`g+>^)iLRZ7O?}kZ#zXh zuy~&p@!GG=Iu1DAS$J#}@w(sA*C+AVD#9CEG1GZAk3H1=5AZyR*QVI%JR4IR>%`RD zgRzFnOy^x&veYGaeUzE*wCbE|^DT8w(w$bFV;erd&b_wvmb$Dh*v@gf)2ef>jknY} zNq1Uxj&1h*IzMgO&OImFVvjsO!@*WQufMd^dn%9ntJBiY6!*M_q@}%iJ{Pf;&yDS| z$=>6&2y6M=d5Ep(lLn zRQsBG2Tmx7?z=e*!8JLNX$$^LCE$G^1b=Oz3vrY~bRo_bdfsVXljYDT4%g;Xl&kP3 zdJd1`kZP!`%W`bmfK-R*IiUFAMxACI@@M0K7e>!%xdxfaaY$5F03?)!=Wss;e~y%q zEXT7|_;eLL%jh{Zt)TQVz}r*$@TUyV;lmiliK=l4Du;OD@0{ 
zxN!*N7n^3_&t@3#0&tg+oXGA#d5JLOhayx_i}P$3MtN;1e#nZk3p8HP*)Kma@- zLOw*#8#2giE4vMeY9aU(LC>iv+>sQDpu3TBGRRNP!gDwh0=$+C@{2T4-KWsAA60c? z3QQ;kRMm0xypx5h3XM=z)lt=jr>N>8==lWQjg-vBP~he#CL=sA-G(nD7fD=isViA)hI1@ydzu4CHD0x6&ffS*RsnJHuq zM^M%Sbgfe~i=b-gc?15Cw;-d86b)59`hgE}=x{xA=@R^*$j9g%LPHPw54sadw)U|* zC~LCXymSfb3Ca%cgfCq}@Yxjlf~bKX^-uLLR7bMmCyM9bc~n$*i(ha{_vfi+UK3$rS@|%jher51nO=X?nEv0X$ zA}tiQwRL0IxBB&>-=mu z;78S2Ib72DQ4S8p`o`i1Tb|G)kL3v+(T3Q*vFZm~!tfPRenM{U5ImlJLux_A)f3wU zW>=Y!@>_Ks->ZE?4K`1d&en?In5ImkeQ2SHsUV}PRXiP}?&Cu03 z{HWsttG=m~wsl44=juFu(D=aCen4mCa9igml<$f4jmZzvSvfiZrA6kVsgyUw_L|8L z(pfn&B;_aMgB^m$vj6ir|;ukx?8eeXpLJpOEsKp)A4E+7ArWAs70Ee_$XD_Dw%? z2F_TE71c918gTX%_{6L64{FlH6+%x!_FMQd9%Lf^Np=y+G>;oXDH?u@U4{P?eV!=}f1(f{IoZYZb&@aRJRe4@K29#Zy z5c-;-(PeXgVAbaeV$B#Ia^piloQcvkz9M;T0U zyhc4GDp{pL^27WzJQCXiMiu7&BtM$}6Z%O1VY9EO+9K%AXFWdFh_MGfJJI=* zvsc9Sb=St99)I5&z_%F55A)T7v6w$JR{;KuxPN`Fw;KbohOdzPX#BrsY7adA^!R#1 z=BX<1SD%pl&CvQB{?z^h>JRSvh5PW}>`X2GS*1x1O?3V;;W_-N{RhpTN#@Z1>-=r) zTru&dj^8MIl2q$Y_n}z$3uRrjWolT3H>2Cd@?oHkJ?vI^@HRms2{3)?W`}YzsVhG>x-ep zRkJ;}%1gD!0(nvME#vGOAPm-1w+a0TIgl~0r{bD7>K_)eLG$?aPtC{Jx`y)aRQ2+< zGUoM8cgyQcP;qj2c&zEeHA{ImeNgSqLj0M|$O1DHyNde{m|q=`^0fAF;dgFH}_2bY>z|@Ak7HRmzm@nI@m21)tOYot_^mmyZbj z(EhJt`>eHp>-ZGyA<1*kVnywD%b-P3^X#z{@L~u z&^K-V5mC=H#%Bs=<=9nh&rtslgA&SKqm3EbUZXuAST9k}P1epN&dNbOe|(Aj+PTYF zrK{)PvgAh_S9JLyG?j(*KH*v6j5DgCagL6Ne41fDDNn0Et3B$XLcunR&TVI%#}D&? 
zD@1+?Ih)Cw>cgh*xIWGEH*_umx^$OYL!i>3J`HfR!#yYedE-(CC=P(Haz3}b*S&;(H`lqmHwuAe2f;o==oC{J`Anysu5B+^OXp%SDsHNK%+H;1j zTjqU!c}NR4RB8HG$(Yw$nd5tWW{h^Pm9{Q(jP@<|j`a1?=BC-6F2mmph$@l%Avt|; z6Z}Cpm%5_!7s-IqQ1)~w{y>Pz5y{_F5}W4nqLn{&{=%+5HR(l?AJ(VCFe#$!v#tKr zu9{hYYWs~>Nq$)WxA3=xYMz?EAfE-8f08@B*fv$(-q!IpEqLm9i^7W`_-J~gwgyT`$M8(&H=JKL^K0pkkk1CJkIuXM4i%j`($-5A|3z$nH2c3qeINpr zhe>`|4>+;Vr!qi|m-zLe?%i!|ll%#JBV%4qw&@#!X16D31F{^{`IBoG&Gv2y{#1H* zNq$(L&Tp9IzXX4XsK}7yhxPxAdA-#(zCr1=@l80PuG!0RBmdEI`Bj;~`k!9@6-$1$ zeA)NVBGT562>ng@@S@pXse=@1{-+K-S$FWtO7*g4KTDaR*uJ1b!|I*c$yyN34L9yC zM_*uOP4dI~{~UVMre7>7`9x$hP>-W_r`~4`{r$F1&+#Jkz15jJwVO>rkgBT(Hn%oj;&KuB@H!l3xQ~yS9SQ zYzeh8)%Ip)zgr%lG%)_x>5FE}>xshge>4}(&fts;IPZ5K#{ZYJ@mS;lhQ^exU|B&)D`>u8xRQm)ULqoo~cydrjXOTvgqk0gXNg9B&!8*ua67JwB0uSMa zPJbv2T@xiAby7yHXCa?ISqYc-_SG{#3iN$Gu~ruD>_SCXE@AtO_W$PKAyrWJvjx~Y z(e72J>5C#gP)a&p^1y7Lv6qHB zVW!Puq!>2wioVCM=y+2Lo~^%8`6c^{xIJoZJdE1S_=w1>8D1y+YWl1y5v9*~|J1xc z4EJ#7U~Y+s3Qu%-g<*}1mX~dQ--q-{6!`;#&4GDV4y|Q2AQP5%%#z>cZ-^>biNaa) zaIYM?nmelPjxd3&B?Qy6;@HBow5{QX(yqBA48vF!oF2lDkWm z%MX!;fuN@#haQ5SdI)kVeCVOapoboV{sAcv^j!4NV-XZR%l+bqBSMq~iQJu; z_j|v2^XAQa10ADcd}D1Rv~lGKK0iBlx>3 z3k1J<*A={#>b5%_MGhAgsoj-L55{NjF2MNXnv1b+$%g56-{^S&K3`h^@cKO$;D!xQ zQpbu%{)_h($zOHl?;Q)>o+@d|Y%6|obpg&d@4GlR9UN>yoe2XD0)T(5y8sRiyuRUwhVI=be_LOG@}Cb~l$)#}35Gnwu-|>S z0OEf)U5I-mZ+#8X=6kXVO7}~i0^DEwdnYi|*$q~Ez`w1=SsrWkOja+T(Cby!K%bxZWu29`nw_-_ZHBfV zZ2x%we5YJ=xnnK*lh3i}G6z@m+$UA^yhl^?^^#{%bcw?!y3A=4UHW*5E_a?p&wYSI zmpd_{OB@l=WzU4@QinkFyiq^;<$1$=bg8%=Jy#&#{PWr-q_vyhuiX!YwjXU>?Z9a) zm|K6}d>K&etY2;k_W>iVf>WIaL)+e&KJkXuar-I#^w#Z>UmRwixvF&dBL#i%mu9 zVP?A06J(($!0?w5#em_eQOC07ll0?fkyjoveijio^T#+oHi5nXt5dBl^q%YyyZBgn zIy|(n(Ox|QcJ@>wQ$@qH{+UBAm`uZ21?gd+Xilp$3KpJ9E(6kH!-}YGXZ_OkyoLubyO@q=RUFS zO}4)Iq9|Yg5VsvT{p|3_dWTPpgh)S&zd;VZ z2?%@>A^5DpA2qCT$yTkJLnvJadFm;vYki)n>{S!_ftacEL?fp@S1C~$WxE25Wt;6f zYh)XuE^Dkhth0KuA#!CrrZ016<`IolX}8#ri~_Wwq5VFPX!Z%4LMBUR$1+nHVw_wx zF5~`MV&9axc)yykbn{h5XTQe9H9S@{DyB$deSC-K&RW&P482SmxTnX}-C|N=FejwS 
zT(SfBExyEJ-M}$ER4V&1rAiT8R^L|zz0#Cv(%<8&#Rg!+Y9-*CF;$*RHpN7-BkB^t zt5k~2(MerB4RDu6Fh1P_`%QQuwZdpUzJ%IRqS8FH^y>mwG5PeqGcQ&HY_42C%;dG& zsHGBn&6o$TpABl&LLzVRoG}mH6cg!KzF8m`B3FskxMD+Q4xF_)Ee-3ijvn1qqn7At zGEJ2QZ0B?yWa2|>U4wQ;YMDtlg+wLBrY+88SZA?{E|+tQH*1_kbn?2UiA?cS)YD*R zus@w;L?&$z{ssj;3Vs;(u64jhMx`fZv{N~w4Q+W{WcC|!sur_s5Z7cT4f~_maC|Tm zGqGQIbRJ0h{!n5nEjFFiz}H56CO41Y+_BQ7t+~ySrc&Z=rEmNF48NQ?&W;!P1vX2@ z#VVEb?!kcX`u7+EJytM`y6t-r^q8JUkBYev-!v_PjI4qJGxk zcP-z(Jb%qMVgJSWDqENIidkkV4Pi9TzH`T3o=+t54LMWPB$X;ldb%$qv`M_2=ffv8 zh}&=`j=(2Ia2}6^YQgp=GS9ccy})znuI-~gfj6bqIP-0DW-75pey$(8>*9{PyfebH z#w4EA51C|deoo-}u@%=)ThwEnOFq}KqMnB{wD9JA*V6jn!+oBnMv$9XzJqha%d@Y; zwUU9{)W~<7oQCA6)45Cu@;aRRzU@8d@|fB~)8~h4=c)a9Y4OUyZuK8LuKW6EM&R=@Hkkz1; z8Ob{2x^wmw%`1?H)7d8E1W@Dbk8pktwT&ro*-@>UP2zWmKS_8V ziMv+40K7+V9uGtn*9z=euCzBiY9bZaMV?rd0RCfv+wT*MCfm=^dt=s!;nKN`KTfOy z8*1-V0hif2g8G?3Dd;ls`!if_AJu7u;1|m70)9sYcuD23XbYF^={l6I&~0`Z{YzRd2&-Q(+^1x zdSc7dC^LSxv-1q34^qIMObx%_N9%|1cj9o7shG7YQHMl_4?PU<3C;)KVIY?ei5t`H%tbd?;8`L^<;bo+}7FwdGkfkTb9nsJXCJuSf*+!gJi#TGkI%HNp`Sj2dot{QEaC%rQ})KL=$W5WHw-Cm zXJFeq+qCiK`7_>?Q~}~gOyKe+U!us}f&|#=z@AS%m9z0C`!nc|Cjf%r-o&Y)e03s9 z(GDMU0+)cE{YBwPc=wkHuav*uEu=<{d@3Lh(Z(-N_NOCXNbJW&!&-wZ*u;((ZFus= z8**vri5JHhw3x{bqzt7&zGveV$0*$eNo#!!h*<>B-VY2G@~k((BXuX){;1*TPq6|0 zL36bFoin4kF};kqTi>V uhPJ{Tq5GS@(t9j*I*-YPaM-@^4O>^i@V{0MLZPL9L&`5gp=0 str: + """Infer general data type from pandas dtype.""" + if pd.api.types.is_integer_dtype(dtype): + return "int" + elif pd.api.types.is_float_dtype(dtype): + return "float" + elif pd.api.types.is_datetime64_any_dtype(dtype): + return "datetime" + else: + return "string" + + +def find_sp2xr_files(directory: str | Path) -> tuple[list[Path], list[Path]]: + """ + Find PbP and HK files in the given directory and all subdirectories. 
+ + Parameters + ---------- + directory : str | Path + Directory to search for SP2XR files (searches recursively) + + Returns + ------- + tuple[list[Path], list[Path]] + Lists of PbP files and HK files found + """ + directory = Path(directory) + if not directory.exists(): + raise FileNotFoundError(f"Directory not found: {directory}") + + # Common SP2XR file patterns + pbp_patterns = ["*PbP*", "*pbp*", "*Pbp*"] + hk_patterns = ["*hk*", "*HK*", "*Hk*"] + file_extensions = ["*.csv", "*.zip", "*.parquet"] + + pbp_files = [] + hk_files = [] + + # Search for files matching patterns (including subdirectories) + for ext in file_extensions: + for pattern in pbp_patterns: + pbp_files.extend(directory.glob(f"**/{pattern}{ext}")) + for pattern in hk_patterns: + hk_files.extend(directory.glob(f"**/{pattern}{ext}")) + + # Remove duplicates and sort + pbp_files = sorted(list(set(pbp_files))) + hk_files = sorted(list(set(hk_files))) + + return pbp_files, hk_files + + +def load_schema(input_file: str | Path, nrows: int = 100) -> dict[str, str]: + """ + Load schema from input file by inferring column types. 
+ + Parameters + ---------- + input_file : str | Path + Path to the input file + nrows : int + Number of rows to read for type inference (for CSV files) + + Returns + ------- + dict[str, str] + Mapping of column names to data types + """ + ext = os.path.splitext(str(input_file))[1].lower() + + if ext in [".csv", ".zip"]: + df = pd.read_csv(input_file, nrows=nrows) + elif ext == ".parquet": + # For parquet, we can just read the schema without loading data + pf = pd.read_parquet(input_file, engine="pyarrow") + df = pf.head(0) # Empty dataframe with schema + else: + raise ValueError(f"Unsupported file format: {ext}") + + schema = {col: infer_general_dtype(dtype) for col, dtype in df.dtypes.items()} + return schema + + +def get_canonical_schemas() -> dict[str, dict[str, str]]: + """Return canonical column schemas for SP2XR data.""" + pbp_canonical = { + "Time (sec)": "float", + "Packet Time Stamp": "float", + "Flag": "float", + "Dropped Records": "float", + "Record Count": "float", + "Record Size": "float", + "Particle Time Stamp": "float", + "Particle Flags": "float", + "Scatter relPeak": "float", + "Scatter Transit Time": "float", + "Scatter Peak Time": "float", + "Scatter FWHM": "float", + "Scatter Size (nm)": "float", + "Incand relPeak": "float", + "Incand Transit Time": "float", + "Incand Peak Time": "float", + "Incand FWHM": "float", + "Incand Delay": "float", + "Incand Mass (fg)": "float", + "Reserved": "float", + } + + hk_canonical = { + "Time Stamp": "datetime", + "Time (sec)": "float", + "Sample Flow Controller Read (sccm)": "float", + "Sample Flow Controller Read (vccm)": "float", + "Time Stamp (UTC sec)": "float", + "Elapsed Time": "float", + "Error Code": "float", + "Packet Time Stamp": "float", + "Laser TEC Temp (C)": "float", + "Crystal TEC Temp (C)": "float", + "Inlet Air Temp (C)": "float", + "Computer Heatsink Temp (C)": "float", + "Laser Heatsink Temp (C)": "float", + "Outlet Air Temp (C)": "float", + "YAG Output Monitor (V)": "float", + "Cavity 
Pressure (hPa)": "float", + "Laser Driver Power Monitor (uA)": "float", + "Laser Driver Current Limit Monitor (A)": "float", + "Laser Driver Current Monitor (A)": "float", + } + + return {"pbp_canonical": pbp_canonical, "hk_canonical": hk_canonical} + + +def generate_basic_config( + pbp_file: Path, + hk_file: Path, + schema_output: str = "config_schema.yaml", + ini_file: str = None, + instrument_output: str = None, +) -> None: + """Generate basic config schema file with data type definitions only.""" + print(f"Reading PbP schema from: {pbp_file}") + pbp_schema = load_schema(pbp_file) + + print(f"Reading HK schema from: {hk_file}") + hk_schema = load_schema(hk_file) + + config = { + "pbp_schema": pbp_schema, + "hk_schema": hk_schema, + } + + # Create output directory if it doesn't exist + schema_path = Path(schema_output) + schema_path.parent.mkdir(parents=True, exist_ok=True) + + with open(schema_output, "w") as f: + yaml.dump(config, f, sort_keys=False) + + print(f"Data schema config saved to: {schema_output}") + + # Generate separate instrument settings config from INI file + if ini_file: + # Determine instrument settings output filename + if instrument_output: + instrument_path = Path(instrument_output) + else: + instrument_path = ( + schema_path.parent / f"{schema_path.stem}_instrument_settings.yaml" + ) + + try: + from sp2xr.helpers import export_xr_ini_to_yaml_with_source + + export_xr_ini_to_yaml_with_source(ini_file, str(instrument_path)) + print(f"Instrument settings config saved to: {instrument_path}") + except ImportError: + # Fallback to original function if new one doesn't exist yet + from sp2xr.helpers import export_xr_ini_to_yaml + + export_xr_ini_to_yaml(ini_file, str(instrument_path)) + print(f"Instrument settings config saved to: {instrument_path}") + except Exception as e: + print(f"Warning: Could not convert INI to YAML: {e}") + # Still reference the original INI file as fallback + config["calibration_file"] = ini_file + with open(schema_output, 
"w") as f: + yaml.dump(config, f, sort_keys=False) + print(f"Added INI file reference as fallback: {Path(ini_file).name}") + + +def generate_mapping_config( + pbp_file: Path, + hk_file: Path, + schema_output: str = "config_schema_with_mapping.yaml", + ini_file: str = None, + instrument_output: str = None, +) -> None: + """Generate enhanced config schema with column mapping templates.""" + print(f"Reading PbP schema from: {pbp_file}") + pbp_schema = load_schema(pbp_file) + + print(f"Reading HK schema from: {hk_file}") + hk_schema = load_schema(hk_file) + + # Get canonical schemas + canonical_schemas = get_canonical_schemas() + + # Create column mapping templates + pbp_mapping = {} + hk_mapping = {} + + # For PbP: map file columns to canonical columns + for canonical_col in canonical_schemas["pbp_canonical"]: + matching_file_col = None + for file_col in pbp_schema.keys(): + if file_col.lower() == canonical_col.lower(): + matching_file_col = file_col + break + + pbp_mapping[canonical_col] = ( + matching_file_col or canonical_col # Use canonical name as default + ) + + # For HK: map file columns to canonical columns + for canonical_col in canonical_schemas["hk_canonical"]: + matching_file_col = None + for file_col in hk_schema.keys(): + if file_col.lower() == canonical_col.lower(): + matching_file_col = file_col + break + + hk_mapping[canonical_col] = ( + matching_file_col or canonical_col # Use canonical name as default + ) + + # Build enhanced config + config = { + "# INSTRUCTIONS": [ + "This config file contains both schema definitions and column mappings.", + "1. The *_schema sections define the data types for your input files.", + "2. The *_column_mapping sections map your file columns to canonical names.", + "3. Update column mappings if your files use different column names.", + "4. If your file doesn't have a particular canonical column, set it to null or remove the line.", + "5. 
The output parquet files will use the canonical column names for consistency.", + ], + "pbp_schema": pbp_schema, + "hk_schema": hk_schema, + "pbp_canonical_schema": canonical_schemas["pbp_canonical"], + "hk_canonical_schema": canonical_schemas["hk_canonical"], + "pbp_column_mapping": pbp_mapping, + "hk_column_mapping": hk_mapping, + } + + # Create output directory if it doesn't exist + schema_path = Path(schema_output) + schema_path.parent.mkdir(parents=True, exist_ok=True) + + with open(schema_output, "w") as f: + yaml.dump(config, f, sort_keys=False, default_flow_style=False) + + print(f"Enhanced data schema config with column mapping saved to: {schema_output}") + + # Generate separate instrument settings config from INI file + if ini_file: + # Determine instrument settings output filename + if instrument_output: + instrument_path = Path(instrument_output) + else: + instrument_path = ( + schema_path.parent / f"{schema_path.stem}_instrument_settings.yaml" + ) + + try: + from sp2xr.helpers import export_xr_ini_to_yaml_with_source + + export_xr_ini_to_yaml_with_source(ini_file, str(instrument_path)) + print(f"Instrument settings config saved to: {instrument_path}") + except ImportError: + # Fallback to original function if new one doesn't exist yet + from sp2xr.helpers import export_xr_ini_to_yaml + + export_xr_ini_to_yaml(ini_file, str(instrument_path)) + print(f"Instrument settings config saved to: {instrument_path}") + except Exception as e: + print(f"Warning: Could not convert INI to YAML: {e}") + # Still reference the original INI file as fallback + config["calibration_file"] = ini_file + with open(schema_output, "w") as f: + yaml.dump(config, f, sort_keys=False, default_flow_style=False) + print(f"Added INI file reference as fallback: {Path(ini_file).name}") + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Generate SP2XR configuration files from data directory", + 
formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Generate basic config schema from files in current directory + python sp2xr_generate_config.py . + + # Generate config schema from specific directory + python sp2xr_generate_config.py /path/to/sp2xr/data + + # Generate config schema with column mapping support + python sp2xr_generate_config.py /path/to/data --mapping + + # Specify custom schema and instrument settings filenames + python sp2xr_generate_config.py /path/to/data --schema-output my_schema.yaml --instrument-output my_settings.yaml + + # Generate mapping config with custom names + python sp2xr_generate_config.py /path/to/data --mapping --schema-output campaign_schema.yaml --instrument-output campaign_settings.yaml + """, + ) + + parser.add_argument( + "directory", help="Directory containing SP2XR files (PbP and HK files)" + ) + + parser.add_argument( + "--schema-output", + "-s", + default="config_schema.yaml", + help="Output filename for data schema config (default: config_schema.yaml)", + ) + + parser.add_argument( + "--instrument-output", + "-i", + default=None, + help="Output filename for instrument settings config (default: {schema_output}_instrument_settings.yaml)", + ) + + parser.add_argument( + "--mapping", + "-m", + action="store_true", + help="Generate config with column mapping support (creates config_with_mapping.yaml)", + ) + + parser.add_argument( + "--pbp-file", help="Specify specific PbP file instead of auto-detection" + ) + + parser.add_argument( + "--hk-file", help="Specify specific HK file instead of auto-detection" + ) + + return parser.parse_args() + + +def main(): + """Main entry point.""" + args = parse_args() + + try: + # Use specific files if provided, otherwise auto-detect + if args.pbp_file and args.hk_file: + pbp_file = Path(args.pbp_file) + hk_file = Path(args.hk_file) + + if not pbp_file.exists(): + raise FileNotFoundError(f"PbP file not found: {pbp_file}") + if not hk_file.exists(): + raise 
FileNotFoundError(f"HK file not found: {hk_file}") + + else: + # Auto-detect files in directory + print(f"Searching for SP2XR files in: {args.directory}") + pbp_files, hk_files = find_sp2xr_files(args.directory) + + if not pbp_files: + raise FileNotFoundError( + "No PbP files found. Looking for files with 'PbP', 'pbp', or 'Pbp' in the name." + ) + if not hk_files: + raise FileNotFoundError( + "No HK files found. Looking for files with 'hk', 'HK', or 'Hk' in the name." + ) + + # Use the first file found for each type + pbp_file = pbp_files[0] + hk_file = hk_files[0] + + print(f"Found {len(pbp_files)} PbP file(s), using: {pbp_file.name}") + print(f"Found {len(hk_files)} HK file(s), using: {hk_file.name}") + + # Check for INI files in the directory + try: + from sp2xr.helpers import find_and_validate_ini_files + + ini_file = find_and_validate_ini_files(str(args.directory)) + + if ini_file: + print(f"Found consistent INI calibration file: {Path(ini_file).name}") + else: + print("No INI calibration files found in directory") + + except ValueError as e: + print(f"WARNING: {e}") + print("You should process data with different calibrations separately.") + except ImportError: + print("Could not import INI validation function") + + # Generate configuration + if args.mapping: + schema_file = ( + "config_schema_with_mapping.yaml" + if args.schema_output == "config_schema.yaml" + else args.schema_output + ) + generate_mapping_config( + pbp_file, + hk_file, + schema_file, + ini_file if "ini_file" in locals() else None, + args.instrument_output, + ) + else: + generate_basic_config( + pbp_file, + hk_file, + args.schema_output, + ini_file if "ini_file" in locals() else None, + args.instrument_output, + ) + + print("\nConfiguration generation completed successfully!") + + except Exception as e: + print(f"Error: {e}") + return 1 + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/src/sp2xr/helpers.py b/src/sp2xr/helpers.py index ca68750..e11bbc6 100644 --- 
a/src/sp2xr/helpers.py +++ b/src/sp2xr/helpers.py @@ -4,6 +4,7 @@ import re import yaml import argparse import pandas as pd +import numpy as np import dask.dataframe as dd from pathlib import Path from dask_jobqueue import SLURMCluster @@ -43,7 +44,7 @@ def load_and_resolve_config(args): "rho_eff": choose(args.BC_rho, base, "bc.rho_eff", None), "BC_type": choose(args.BC_type, base, "bc.type", None), "cluster": { - "use_local": choose(args.local, base, "cluster.use_local"), + "use_local": args.local or get(base, "cluster.use_local", False), "cores": choose(args.cores, base, "cluster.cores", None), "memory": choose(args.memory, base, "cluster.memory", None), "walltime": choose(args.walltime, base, "cluster.walltime", None), @@ -98,9 +99,9 @@ def load_and_resolve_config(args): def initialize_cluster(config): if config["cluster"].get("use_local", False): - return make_slurm_cluster(config) - else: return make_local_cluster(config) + else: + return make_slurm_cluster(config) def make_slurm_cluster(config): @@ -137,14 +138,12 @@ def make_local_cluster(config): total_cores = multiprocessing.cpu_count() total_memory = psutil.virtual_memory().total # in bytes - # Use all cores or config override - cores = config["cluster"].get("cores", total_cores) - memory_limit = config["cluster"].get("memory") + # For local clusters, always auto-detect resources (ignore config values meant for SLURM) + cores = total_cores + memory_limit_bytes = int(total_memory * 0.8) + memory_limit = f"{memory_limit_bytes // (1024**3)}GB" - # If memory not provided, use 80% of total - if memory_limit is None: - memory_limit_bytes = int(total_memory * 0.8) - memory_limit = f"{memory_limit_bytes // (1024**3)}GB" + print(f"Auto-detected local resources: {cores} cores, {memory_limit} memory") cluster = LocalCluster( n_workers=cores, @@ -161,7 +160,7 @@ def make_local_cluster(config): timeout="300s", ) print(f"Dask LOCAL dashboard: {client.dashboard_link}") - return client + return client, cluster def 
def find_and_validate_ini_files(directory):
    """
    Find all calibration .ini files in a directory and validate they are consistent.

    Parameters
    ----------
    directory : str
        Directory searched recursively for files matching ``*Calibration*.ini``.

    Returns
    -------
    str or None
        Path to a representative .ini file if all readable files have identical
        parameters; None if no file is found or none could be read.

    Raises
    ------
    ValueError
        If two readable .ini files contain different parameters.
    """
    import glob
    import os
    import warnings

    ini_pattern = os.path.join(directory, "**", "*Calibration*.ini")
    # Sorted so the chosen reference does not depend on directory listing order.
    ini_files = sorted(glob.glob(ini_pattern, recursive=True))

    if not ini_files:
        return None

    if len(ini_files) == 1:
        return ini_files[0]

    reference_params = None
    reference_file = None

    for ini_file in ini_files:
        # Only the read is best effort. The consistency check below must stay
        # outside this try block, otherwise its ValueError would be swallowed
        # and reported as an unreadable file.
        try:
            params = read_xr_ini_file(ini_file)
        except Exception as e:
            warnings.warn(f"Could not read .ini file {ini_file}: {e}")
            continue

        if reference_file is None:
            reference_params = params
            reference_file = ini_file
        elif params != reference_params:
            raise ValueError(
                f"Multiple different .ini files found in {directory}:\n"
                f" Reference: {reference_file}\n"
                f" Different: {ini_file}\n"
                f"Please process data with different calibrations separately."
            )

    return reference_file


def export_xr_ini_to_yaml_with_source(ini_path, yaml_path):
    """
    Convert an SP2-XR .ini file to a structured YAML configuration with source traceability.

    The written YAML has three top-level sections:
    - 'metadata': absolute path of the source INI file, generation timestamp
      and generator name
    - 'instrument_parameters': the parameters returned by ``read_xr_ini_file``
    - 'Signal_saturation': default saturation points for user editing

    Any existing file at *yaml_path* is overwritten.
    """
    from datetime import datetime

    ini_params = read_xr_ini_file(ini_path)
    yaml_path = Path(yaml_path)

    # Base structure with metadata
    params = {
        "metadata": {
            "source_ini_file": str(Path(ini_path).resolve()),
            "generated_on": datetime.now().isoformat(),
            "generated_by": "sp2xr_generate_config.py",
        },
        "instrument_parameters": ini_params,
        "Signal_saturation": {"IncSatPoint": 1.7e9, "ScattSatPoint": 1.7e9},
    }

    # Save YAML
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(params, f, sort_keys=False)

    print(
        f"Parameters exported to {yaml_path} with structured sections and editable placeholders"
    )
def calculate_delta_sec(df):
    """
    Calculate the difference between the floored 'Time (sec)' column and the
    'first_val' column of the input dataframe.

    Parameters
    ----------
    df : pandas DataFrame
        The columns 'Time (sec)' and 'first_val' must be present.

    Returns
    -------
    pandas Series
        ``floor(df['Time (sec)']) - df['first_val']``, element-wise.

    """
    return np.floor(df["Time (sec)"]) - df["first_val"]


def extract_datetime(df):
    """
    Extract the datetime encoded in an SP2XR file path.

    The path is split on underscores and the second-to-last token is parsed
    with ``pandas.to_datetime``.

    Parameters
    ----------
    df : mapping or pandas Series
        A single record whose 'path' entry is a string.
        NOTE(review): presumably called once per row (e.g. via
        ``DataFrame.apply(axis=1)``) - confirm against the caller.

    Returns
    -------
    pandas Timestamp
        Date and time parsed from the second-to-last '_'-separated token of
        the 'path' entry.

    """
    return pd.to_datetime(df["path"].split("_")[-2])
- 408 Board Spare 13: float - 408 Board Spare 14: float - 408 Board Spare 15: float - Sheath Flow Controller Read (vccm): float - Sheath Flow Controller Read (sccm): float - Sheath Flow Controller Pressure (psia): float - Sheath Flow Controller Temperature (C): float - Sample Flow Controller Read (vccm): float - Sample Flow Controller Read (sccm): float - Sample Flow Controller Pressure (psia): float - Sample Flow Controller Temperature (C): float - Fan 1 (RPM): float - Fan 2 (RPM): float - Laser Fan (RPM): float - Spare tach: float - Threshold Crossing Events: float - Dual Qualified Scatter and Incand Particles: float - Qualified Scatter Only Particles: float - Qualified Incand Only Particles: float - Disqualified Due to Scatter Saturation: float - Disqualified Due to Scatter Transit Time Min: float - Disqualified Due to Scatter Transit Time Max: float - Disqualified Due to Scatter FWHM Min: float - Disqualified Due to Scatter FWHM Max: float - Scatter Inter Part Period Min Violation: float - Disqualified Due to Incand Saturation: float - Disqualified Due to Incand Transit Time Min: float - Disqualified Due to Incand Transit Time Max: float - Disqualified Due to Incand FWHM Min: float - Disqualified Due to Incand FWHM Max: float - Incand Inter Part Period Min Violation: float - Baseline Sizer Lo: float - Baseline Sizer Hi: float - Baseline Incand Lo: float - Baseline Incand Hi: float - Bandwidth Sizer Hi: float - Bandwidth Sizer Lo: float - Bandwidth Incand Lo: float - Bandwidth Incand Hi: float - ABD-0408 HK ADCs min: float - ABD-0436 HK ADCs min: float - ABD-0408 HK ADCs max: float - ABD-0436 HK ADCs max: float - Incand Particle Conc (cts/ccm): float - Scattering Particle Conc (cts/ccm): float - Incand Mass Conc (fg/sccm): float - Scattering Mass Conc (fg/sccm): float - Sheath Flow Set Point: float - Sample Flow Set Point: float - Laser Temp Set Point: float - Laser Current Set Point: float - Spare 4 Set Point: float - Spare 5 Set Point: float - PMT HV Set 
Point: float - Particle Density (g/ccm): float - PbP Packet Time: float - Scatter Bin 1: float - Scatter Bin 2: float - Scatter Bin 3: float - Scatter Bin 4: float - Scatter Bin 5: float - Scatter Bin 6: float - Scatter Bin 7: float - Scatter Bin 8: float - Scatter Bin 9: float - Scatter Bin 10: float - Scatter Bin 11: float - Scatter Bin 12: float - Scatter Bin 13: float - Scatter Bin 14: float - Scatter Bin 15: float - Scatter Bin 16: float - Scatter Bin 17: float - Scatter Bin 18: float - Scatter Bin 19: float - Incand Bin 1: float - Incand Bin 2: float - Incand Bin 3: float - Incand Bin 4: float - Incand Bin 5: float - Incand Bin 6: float - Incand Bin 7: float - Incand Bin 8: float - Incand Bin 9: float - Incand Bin 10: float - Incand Bin 11: float - Incand Bin 12: float - Incand Bin 13: float - Incand Bin 14: float - Incand Bin 15: float - Incand Bin 16: float - Incand Bin 17: float - Incand Bin 18: float - Incand Bin 19: float diff --git a/tests/run_config.yaml b/tests/run_config.yaml deleted file mode 100644 index 4c56b70..0000000 --- a/tests/run_config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -paths: - input_pbp: /data/user/bertoz_b/merlin6data/SP2XR/data/NyA/SP2XR_pbp_parquet - input_hk: /data/user/bertoz_b/merlin6data/SP2XR/data/NyA/SP2XR_hk_parquet - output: tests/SP2XR_NyA_processed_data_60s - instrument_config: tests/instrument_config.yaml - -workflow: - conc: true - BC_hist: true - scatt_hist: true - timelag_hist: false - dt: 60 # seconds - repartition: '1h' - max_partition_size: "200MB" - saving_schema: ['date'] - -cluster: - use_local: false - cores: 16 - processes: 8 - memory: 128GB - walltime: "2-00:59:00" - partition: general - log_dir: ./slurm_out - -chunking: - freq: '5d' - start_date: '2020-08-24' - end_date: null - -bc: - rho_eff: 1800 - type: constant_effective_density - -histo: - inc: - min_mass: 0.3 - max_mass: 400 - n_bins: 50 - scatt: - min_D: 100 - max_D: 500 - n_bins: 20 - timelag: - min: -10 - max: 400 - n_bins: 100 - -mixing_state: - 
threshold: 50 - inc_scatt_ratio: 1.1 - -calibration: - incandescence: - curve_type: "polynomial" - parameters: [0.05, 2.0470000507725255e-07] - scattering: - curve_type: "powerlaw" - parameters: [17.21724257, 0.16908516, -1.49431104] \ No newline at end of file