# SP2XR CSV/ZIP to Parquet Conversion Configuration Template
#
# This file contains all parameters for converting raw SP2XR data files
# to time-indexed Parquet format.
#
# USAGE:
# 1. Update the paths and parameters below for your dataset
# 2. Run: python scripts/sp2xr_csv2parquet.py --config conversion_config_template.yaml
#
# NOTES:
# - For local execution, the script auto-detects available CPU cores and memory
# - Output files are organized by date and hour: target_directory/date=YYYY-MM-DD/hour=HH/
# - Processing is parallelized using Dask for efficient handling of large datasets
# - You can monitor progress via the Dask dashboard (URL printed when script starts)

# Directory containing your raw SP2XR files (CSV or ZIP format)
source_directory: data/SP2XR_orig_files

# Output directory for converted Parquet files
# Files are organized by date and hour: target_directory/date=YYYY-MM-DD/hour=HH/
target_directory: data/pbp_files_parquet_2

# Path to your data schema config file (generated by sp2xr_generate_config.py)
schema_config: config/new_data_schema_with_mapping.yaml

# Pattern to filter which files to process
# "PbP" for particle-by-particle data, "hk" for housekeeping data
file_filter: PbP

# Number of files to process in each batch
# Larger values = more memory usage but potentially faster
# Smaller values = less memory usage but more overhead
chunk_size: 100

# Execution mode: "local" or "slurm"
# - local: Use your local machine (laptop/desktop)
# - slurm: Use a SLURM cluster (HPC environment)
execution_mode: local

# --- SLURM-specific parameters (ignored if execution_mode: local) ---

# Number of CPU cores per SLURM job
slurm_cores: 64

# Memory per SLURM job (e.g., "128GB", "256GB")
slurm_memory: 128GB

# SLURM partition to use
# Common values: "hourly", "daily", "general"
slurm_partition: daily

# Walltime for SLURM job (e.g., "01:00:00" for 1 hour, "23:59:00" for 1 day)
# Keep the quotes when setting a value: YAML 1.1 loaders (e.g. PyYAML) read an
# unquoted digits-and-colons value such as 23:59:00 as a base-60 integer.
# If not specified, defaults based on partition:
# - hourly: 00:59:00
# - daily: 23:59:00
# - general: 7-00:00:00
slurm_walltime: null