# SP2XR CSV/ZIP to Parquet Conversion Configuration Template
#
# This file contains all parameters for converting raw SP2XR data files
# to time-indexed Parquet format.
#
# USAGE:
#   1. Update the paths and parameters below for your dataset
#   2. Run: python scripts/sp2xr_csv2parquet.py --config conversion_config_template.yaml
#
# NOTES:
#   - For local execution, the script auto-detects available CPU cores and memory
#   - Output files are organized by date and hour: target_directory/date=YYYY-MM-DD/hour=HH/
#   - Processing is parallelized with Dask for efficient handling of large datasets
#   - You can monitor progress via the Dask dashboard (URL printed when the script starts)

# Directory containing your raw SP2XR files (CSV or ZIP format)
source_directory: data/SP2XR_orig_files

# Output directory for converted Parquet files
target_directory: data/pbp_files_parquet_2

# Path to your data schema config file (generated by sp2xr_generate_config.py)
schema_config: config/new_data_schema_with_mapping.yaml

# Pattern used to filter which files to process:
# "PbP" for particle-by-particle data, "hk" for housekeeping data
file_filter: PbP

# Number of files to process in each batch.
# Larger values = more memory usage but potentially faster;
# smaller values = less memory usage but more scheduling overhead.
chunk_size: 100

# Execution mode: "local" or "slurm"
# - local: use your local machine (laptop/desktop)
# - slurm: use a SLURM cluster (HPC environment)
execution_mode: local

# --- SLURM-specific parameters (ignored if execution_mode: local) ---

# Number of CPU cores per SLURM job
slurm_cores: 64

# Memory per SLURM job (e.g., "128GB", "256GB")
slurm_memory: 128GB

# SLURM partition to use.
# Common values: "hourly", "daily", "general"
slurm_partition: daily

# Walltime for the SLURM job (e.g., "01:00:00" for 1 hour, "23:59:00" for just under 1 day).
# If not specified, the default is chosen based on the partition:
# - hourly: 00:59:00
# - daily: 23:59:00
# - general: 7-00:00:00
slurm_walltime: null
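
# --- Example output layout and read-back (illustrative only; not read by the script) ---
# A minimal sketch of what the partitioned output described in the NOTES can look like
# with the settings above. The date/hour values are hypothetical examples; actual
# partitions follow the timestamps in your data, and part-file names depend on the writer.
#   data/pbp_files_parquet_2/date=2024-06-15/hour=07/<part files>.parquet
# A hive-partitioned tree like this can be read back with Dask, for example:
#   import dask.dataframe as dd
#   ddf = dd.read_parquet("data/pbp_files_parquet_2",
#                         filters=[("date", "==", "2024-06-15")])  # load one day's partitions
# Column names come from your schema config, so adjust any downstream selections accordingly.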