style: Ruff auto‑fixes in SP2XR_toolkit

This commit is contained in:
2025-07-25 08:35:03 +02:00
parent 51a005f2da
commit b443aa5db8
12 changed files with 783 additions and 172 deletions

1
.gitignore vendored
View File

@@ -7,4 +7,5 @@ sp2xr/__pycache__/
core.*
src/*.egg-info/
.venv/

159
meta_files/config.yaml Normal file
View File

@@ -0,0 +1,159 @@
pbp_schema:
Time (sec): string
Packet Time Stamp: string
Flag: string
Dropped Records: string
Record Count: string
Record Size: string
Particle Time Stamp: string
Particle Flags: string
Scatter relPeak: string
Scatter Transit Time: string
Scatter Peak Time: string
Scatter FWHM: string
Scatter Size (nm): string
Incand relPeak: string
Incand Transit Time: string
Incand Peak Time: string
Incand FWHM: string
Incand Delay: string
Incand Mass (fg): string
Reserved: string
hk_schema:
Time Stamp: string
Time (sec): string
Time Stamp (UTC sec): string
Elapsed Time: string
Error Code: string
Packet Time Stamp: string
Laser TEC Temp (C): string
Crystal TEC Temp (C): string
Inlet Air Temp (C): string
Computer Heatsink Temp (C): string
Laser Heatsink Temp (C): string
Outlet Air Temp (C): string
YAG Output Monitor (V): string
Cavity Pressure (hPa): string
Laser Driver Power Monitor (uA): string
Laser Driver Current Limit Monitor (A): string
Laser Driver Current Monitor (A): string
Laser TEC Sense: string
Laser Over Temp (On/Off): string
+5V Laser Rail (V): string
' +5V Rail (V)': string
+12V Rail (V): string
High Voltage (V): string
Battery Temp (C): string
UPS Output (V): string
12V Iso Rail (V): string
5V Iso Rail (V): string
3.3V Iso Rail (V): string
Spare 22: string
Spare 23: string
408 Board Spare 0: string
408 Board Spare 1: string
408 Board Spare 2: string
408 Board Spare 3: string
408 Board Spare 4: string
Purge Flow Monitor (sccm): string
System Input Voltage (V): string
Board Temperature (C): string
408 Board Spare 8: string
408 Board Spare 9: string
408 Board Spare 10: string
408 Board Spare 11: string
408 Board Spare 12: string
408 Board Spare 13: string
408 Board Spare 14: string
408 Board Spare 15: string
Sheath Flow Controller Read (vccm): string
Sheath Flow Controller Read (sccm): string
Sheath Flow Controller Pressure (psia): string
Sheath Flow Controller Temperature (C): string
Sample Flow Controller Read (vccm): string
Sample Flow Controller Read (sccm): string
Sample Flow Controller Pressure (psia): string
Sample Flow Controller Temperature (C): string
Fan 1 (RPM): string
Fan 2 (RPM): string
Laser Fan (RPM): string
Spare tach: string
Threshold Crossing Events: string
Dual Qualified Scatter and Incand Particles: string
Qualified Scatter Only Particles: string
Qualified Incand Only Particles: string
Disqualified Due to Scatter Saturation: string
Disqualified Due to Scatter Transit Time Min: string
Disqualified Due to Scatter Transit Time Max: string
Disqualified Due to Scatter FWHM Min: string
Disqualified Due to Scatter FWHM Max: string
Scatter Inter Part Period Min Violation: string
Disqualified Due to Incand Saturation: string
Disqualified Due to Incand Transit Time Min: string
Disqualified Due to Incand Transit Time Max: string
Disqualified Due to Incand FWHM Min: string
Disqualified Due to Incand FWHM Max: string
Incand Inter Part Period Min Violation: string
Baseline Sizer Lo: string
Baseline Sizer Hi: string
Baseline Incand Lo: string
Baseline Incand Hi: string
Bandwidth Sizer Hi: string
Bandwidth Sizer Lo: string
Bandwidth Incand Lo: string
Bandwidth Incand Hi: string
ABD-0408 HK ADCs min: string
ABD-0436 HK ADCs min: string
ABD-0408 HK ADCs max: string
ABD-0436 HK ADCs max: string
Incand Particle Conc (cts/ccm): string
Scattering Particle Conc (cts/ccm): string
Incand Mass Conc (fg/sccm): string
Scattering Mass Conc (fg/sccm): string
Sheath Flow Set Point: string
Sample Flow Set Point: string
Laser Temp Set Point: string
Laser Current Set Point: string
Spare 4 Set Point: string
Spare 5 Set Point: string
PMT HV Set Point: string
Particle Density (g/ccm): string
PbP Packet Time: string
Scatter Bin 1: string
Scatter Bin 2: string
Scatter Bin 3: string
Scatter Bin 4: string
Scatter Bin 5: string
Scatter Bin 6: string
Scatter Bin 7: string
Scatter Bin 8: string
Scatter Bin 9: string
Scatter Bin 10: string
Scatter Bin 11: string
Scatter Bin 12: string
Scatter Bin 13: string
Scatter Bin 14: string
Scatter Bin 15: string
Scatter Bin 16: string
Scatter Bin 17: string
Scatter Bin 18: string
Scatter Bin 19: string
Incand Bin 1: string
Incand Bin 2: string
Incand Bin 3: string
Incand Bin 4: string
Incand Bin 5: string
Incand Bin 6: string
Incand Bin 7: string
Incand Bin 8: string
Incand Bin 9: string
Incand Bin 10: string
Incand Bin 11: string
Incand Bin 12: string
Incand Bin 13: string
Incand Bin 14: string
Incand Bin 15: string
Incand Bin 16: string
Incand Bin 17: string
Incand Bin 18: string
Incand Bin 19: string

1
meta_files/hk_meta.csv Normal file
View File

@@ -0,0 +1 @@
Time Stamp,Time (sec),Time Stamp (UTC sec),Elapsed Time,Error Code,Packet Time Stamp,Laser TEC Temp (C),Crystal TEC Temp (C),Inlet Air Temp (C),Computer Heatsink Temp (C),Laser Heatsink Temp (C),Outlet Air Temp (C),YAG Output Monitor (V),Cavity Pressure (hPa),Laser Driver Power Monitor (uA),Laser Driver Current Limit Monitor (A),Laser Driver Current Monitor (A),Laser TEC Sense,Laser Over Temp (On/Off),+5V Laser Rail (V), +5V Rail (V),+12V Rail (V),High Voltage (V),Battery Temp (C),UPS Output (V),12V Iso Rail (V),5V Iso Rail (V),3.3V Iso Rail (V),Spare 22,Spare 23,408 Board Spare 0,408 Board Spare 1,408 Board Spare 2,408 Board Spare 3,408 Board Spare 4,Purge Flow Monitor (sccm),System Input Voltage (V),Board Temperature (C),408 Board Spare 8,408 Board Spare 9,408 Board Spare 10,408 Board Spare 11,408 Board Spare 12,408 Board Spare 13,408 Board Spare 14,408 Board Spare 15,Sheath Flow Controller Read (vccm),Sheath Flow Controller Read (sccm),Sheath Flow Controller Pressure (psia),Sheath Flow Controller Temperature (C),Sample Flow Controller Read (vccm),Sample Flow Controller Read (sccm),Sample Flow Controller Pressure (psia),Sample Flow Controller Temperature (C),Fan 1 (RPM),Fan 2 (RPM),Laser Fan (RPM),Spare tach,Threshold Crossing Events,Dual Qualified Scatter and Incand Particles,Qualified Scatter Only Particles,Qualified Incand Only Particles,Disqualified Due to Scatter Saturation,Disqualified Due to Scatter Transit Time Min,Disqualified Due to Scatter Transit Time Max,Disqualified Due to Scatter FWHM Min,Disqualified Due to Scatter FWHM Max,Scatter Inter Part Period Min Violation,Disqualified Due to Incand Saturation,Disqualified Due to Incand Transit Time Min,Disqualified Due to Incand Transit Time Max,Disqualified Due to Incand FWHM Min,Disqualified Due to Incand FWHM Max,Incand Inter Part Period Min Violation,Baseline Sizer Lo,Baseline Sizer Hi,Baseline Incand Lo,Baseline Incand Hi,Bandwidth Sizer Hi,Bandwidth Sizer Lo,Bandwidth Incand Lo,Bandwidth Incand 
Hi,ABD-0408 HK ADCs min,ABD-0436 HK ADCs min,ABD-0408 HK ADCs max,ABD-0436 HK ADCs max,Incand Particle Conc (cts/ccm),Scattering Particle Conc (cts/ccm),Incand Mass Conc (fg/sccm),Scattering Mass Conc (fg/sccm),Sheath Flow Set Point,Sample Flow Set Point,Laser Temp Set Point,Laser Current Set Point,Spare 4 Set Point,Spare 5 Set Point,PMT HV Set Point,Particle Density (g/ccm),PbP Packet Time,Scatter Bin 1,Scatter Bin 2,Scatter Bin 3,Scatter Bin 4,Scatter Bin 5,Scatter Bin 6,Scatter Bin 7,Scatter Bin 8,Scatter Bin 9,Scatter Bin 10,Scatter Bin 11,Scatter Bin 12,Scatter Bin 13,Scatter Bin 14,Scatter Bin 15,Scatter Bin 16,Scatter Bin 17,Scatter Bin 18,Scatter Bin 19,Incand Bin 1,Incand Bin 2,Incand Bin 3,Incand Bin 4,Incand Bin 5,Incand Bin 6,Incand Bin 7,Incand Bin 8,Incand Bin 9,Incand Bin 10,Incand Bin 11,Incand Bin 12,Incand Bin 13,Incand Bin 14,Incand Bin 15,Incand Bin 16,Incand Bin 17,Incand Bin 18,Incand Bin 19
1 Time Stamp Time (sec) Time Stamp (UTC sec) Elapsed Time Error Code Packet Time Stamp Laser TEC Temp (C) Crystal TEC Temp (C) Inlet Air Temp (C) Computer Heatsink Temp (C) Laser Heatsink Temp (C) Outlet Air Temp (C) YAG Output Monitor (V) Cavity Pressure (hPa) Laser Driver Power Monitor (uA) Laser Driver Current Limit Monitor (A) Laser Driver Current Monitor (A) Laser TEC Sense Laser Over Temp (On/Off) +5V Laser Rail (V) +5V Rail (V) +12V Rail (V) High Voltage (V) Battery Temp (C) UPS Output (V) 12V Iso Rail (V) 5V Iso Rail (V) 3.3V Iso Rail (V) Spare 22 Spare 23 408 Board Spare 0 408 Board Spare 1 408 Board Spare 2 408 Board Spare 3 408 Board Spare 4 Purge Flow Monitor (sccm) System Input Voltage (V) Board Temperature (C) 408 Board Spare 8 408 Board Spare 9 408 Board Spare 10 408 Board Spare 11 408 Board Spare 12 408 Board Spare 13 408 Board Spare 14 408 Board Spare 15 Sheath Flow Controller Read (vccm) Sheath Flow Controller Read (sccm) Sheath Flow Controller Pressure (psia) Sheath Flow Controller Temperature (C) Sample Flow Controller Read (vccm) Sample Flow Controller Read (sccm) Sample Flow Controller Pressure (psia) Sample Flow Controller Temperature (C) Fan 1 (RPM) Fan 2 (RPM) Laser Fan (RPM) Spare tach Threshold Crossing Events Dual Qualified Scatter and Incand Particles Qualified Scatter Only Particles Qualified Incand Only Particles Disqualified Due to Scatter Saturation Disqualified Due to Scatter Transit Time Min Disqualified Due to Scatter Transit Time Max Disqualified Due to Scatter FWHM Min Disqualified Due to Scatter FWHM Max Scatter Inter Part Period Min Violation Disqualified Due to Incand Saturation Disqualified Due to Incand Transit Time Min Disqualified Due to Incand Transit Time Max Disqualified Due to Incand FWHM Min Disqualified Due to Incand FWHM Max Incand Inter Part Period Min Violation Baseline Sizer Lo Baseline Sizer Hi Baseline Incand Lo Baseline Incand Hi Bandwidth Sizer Hi Bandwidth Sizer Lo Bandwidth Incand Lo Bandwidth Incand Hi 
ABD-0408 HK ADCs min ABD-0436 HK ADCs min ABD-0408 HK ADCs max ABD-0436 HK ADCs max Incand Particle Conc (cts/ccm) Scattering Particle Conc (cts/ccm) Incand Mass Conc (fg/sccm) Scattering Mass Conc (fg/sccm) Sheath Flow Set Point Sample Flow Set Point Laser Temp Set Point Laser Current Set Point Spare 4 Set Point Spare 5 Set Point PMT HV Set Point Particle Density (g/ccm) PbP Packet Time Scatter Bin 1 Scatter Bin 2 Scatter Bin 3 Scatter Bin 4 Scatter Bin 5 Scatter Bin 6 Scatter Bin 7 Scatter Bin 8 Scatter Bin 9 Scatter Bin 10 Scatter Bin 11 Scatter Bin 12 Scatter Bin 13 Scatter Bin 14 Scatter Bin 15 Scatter Bin 16 Scatter Bin 17 Scatter Bin 18 Scatter Bin 19 Incand Bin 1 Incand Bin 2 Incand Bin 3 Incand Bin 4 Incand Bin 5 Incand Bin 6 Incand Bin 7 Incand Bin 8 Incand Bin 9 Incand Bin 10 Incand Bin 11 Incand Bin 12 Incand Bin 13 Incand Bin 14 Incand Bin 15 Incand Bin 16 Incand Bin 17 Incand Bin 18 Incand Bin 19

277
meta_files/hk_meta.yaml Normal file
View File

@@ -0,0 +1,277 @@
file_type: csv
name: hk_meta
columns:
- name: Time Stamp
type: string
- name: Time (sec)
type: string
- name: Time Stamp (UTC sec)
type: string
- name: Elapsed Time
type: string
- name: Error Code
type: string
- name: Packet Time Stamp
type: string
- name: Laser TEC Temp (C)
type: string
- name: Crystal TEC Temp (C)
type: string
- name: Inlet Air Temp (C)
type: string
- name: Computer Heatsink Temp (C)
type: string
- name: Laser Heatsink Temp (C)
type: string
- name: Outlet Air Temp (C)
type: string
- name: YAG Output Monitor (V)
type: string
- name: Cavity Pressure (hPa)
type: string
- name: Laser Driver Power Monitor (uA)
type: string
- name: Laser Driver Current Limit Monitor (A)
type: string
- name: Laser Driver Current Monitor (A)
type: string
- name: Laser TEC Sense
type: string
- name: Laser Over Temp (On/Off)
type: string
- name: +5V Laser Rail (V)
type: string
- name: ' +5V Rail (V)'
type: string
- name: +12V Rail (V)
type: string
- name: High Voltage (V)
type: string
- name: Battery Temp (C)
type: string
- name: UPS Output (V)
type: string
- name: 12V Iso Rail (V)
type: string
- name: 5V Iso Rail (V)
type: string
- name: 3.3V Iso Rail (V)
type: string
- name: Spare 22
type: string
- name: Spare 23
type: string
- name: 408 Board Spare 0
type: string
- name: 408 Board Spare 1
type: string
- name: 408 Board Spare 2
type: string
- name: 408 Board Spare 3
type: string
- name: 408 Board Spare 4
type: string
- name: Purge Flow Monitor (sccm)
type: string
- name: System Input Voltage (V)
type: string
- name: Board Temperature (C)
type: string
- name: 408 Board Spare 8
type: string
- name: 408 Board Spare 9
type: string
- name: 408 Board Spare 10
type: string
- name: 408 Board Spare 11
type: string
- name: 408 Board Spare 12
type: string
- name: 408 Board Spare 13
type: string
- name: 408 Board Spare 14
type: string
- name: 408 Board Spare 15
type: string
- name: Sheath Flow Controller Read (vccm)
type: string
- name: Sheath Flow Controller Read (sccm)
type: string
- name: Sheath Flow Controller Pressure (psia)
type: string
- name: Sheath Flow Controller Temperature (C)
type: string
- name: Sample Flow Controller Read (vccm)
type: string
- name: Sample Flow Controller Read (sccm)
type: string
- name: Sample Flow Controller Pressure (psia)
type: string
- name: Sample Flow Controller Temperature (C)
type: string
- name: Fan 1 (RPM)
type: string
- name: Fan 2 (RPM)
type: string
- name: Laser Fan (RPM)
type: string
- name: Spare tach
type: string
- name: Threshold Crossing Events
type: string
- name: Dual Qualified Scatter and Incand Particles
type: string
- name: Qualified Scatter Only Particles
type: string
- name: Qualified Incand Only Particles
type: string
- name: Disqualified Due to Scatter Saturation
type: string
- name: Disqualified Due to Scatter Transit Time Min
type: string
- name: Disqualified Due to Scatter Transit Time Max
type: string
- name: Disqualified Due to Scatter FWHM Min
type: string
- name: Disqualified Due to Scatter FWHM Max
type: string
- name: Scatter Inter Part Period Min Violation
type: string
- name: Disqualified Due to Incand Saturation
type: string
- name: Disqualified Due to Incand Transit Time Min
type: string
- name: Disqualified Due to Incand Transit Time Max
type: string
- name: Disqualified Due to Incand FWHM Min
type: string
- name: Disqualified Due to Incand FWHM Max
type: string
- name: Incand Inter Part Period Min Violation
type: string
- name: Baseline Sizer Lo
type: string
- name: Baseline Sizer Hi
type: string
- name: Baseline Incand Lo
type: string
- name: Baseline Incand Hi
type: string
- name: Bandwidth Sizer Hi
type: string
- name: Bandwidth Sizer Lo
type: string
- name: Bandwidth Incand Lo
type: string
- name: Bandwidth Incand Hi
type: string
- name: ABD-0408 HK ADCs min
type: string
- name: ABD-0436 HK ADCs min
type: string
- name: ABD-0408 HK ADCs max
type: string
- name: ABD-0436 HK ADCs max
type: string
- name: Incand Particle Conc (cts/ccm)
type: string
- name: Scattering Particle Conc (cts/ccm)
type: string
- name: Incand Mass Conc (fg/sccm)
type: string
- name: Scattering Mass Conc (fg/sccm)
type: string
- name: Sheath Flow Set Point
type: string
- name: Sample Flow Set Point
type: string
- name: Laser Temp Set Point
type: string
- name: Laser Current Set Point
type: string
- name: Spare 4 Set Point
type: string
- name: Spare 5 Set Point
type: string
- name: PMT HV Set Point
type: string
- name: Particle Density (g/ccm)
type: string
- name: PbP Packet Time
type: string
- name: Scatter Bin 1
type: string
- name: Scatter Bin 2
type: string
- name: Scatter Bin 3
type: string
- name: Scatter Bin 4
type: string
- name: Scatter Bin 5
type: string
- name: Scatter Bin 6
type: string
- name: Scatter Bin 7
type: string
- name: Scatter Bin 8
type: string
- name: Scatter Bin 9
type: string
- name: Scatter Bin 10
type: string
- name: Scatter Bin 11
type: string
- name: Scatter Bin 12
type: string
- name: Scatter Bin 13
type: string
- name: Scatter Bin 14
type: string
- name: Scatter Bin 15
type: string
- name: Scatter Bin 16
type: string
- name: Scatter Bin 17
type: string
- name: Scatter Bin 18
type: string
- name: Scatter Bin 19
type: string
- name: Incand Bin 1
type: string
- name: Incand Bin 2
type: string
- name: Incand Bin 3
type: string
- name: Incand Bin 4
type: string
- name: Incand Bin 5
type: string
- name: Incand Bin 6
type: string
- name: Incand Bin 7
type: string
- name: Incand Bin 8
type: string
- name: Incand Bin 9
type: string
- name: Incand Bin 10
type: string
- name: Incand Bin 11
type: string
- name: Incand Bin 12
type: string
- name: Incand Bin 13
type: string
- name: Incand Bin 14
type: string
- name: Incand Bin 15
type: string
- name: Incand Bin 16
type: string
- name: Incand Bin 17
type: string
- name: Incand Bin 18
type: string
- name: Incand Bin 19
type: string

1
meta_files/pbp_meta.csv Normal file
View File

@@ -0,0 +1 @@
Time (sec),Packet Time Stamp,Flag,Dropped Records,Record Count,Record Size,Particle Time Stamp,Particle Flags,Scatter relPeak,Scatter Transit Time,Scatter Peak Time,Scatter FWHM,Scatter Size (nm),Incand relPeak,Incand Transit Time,Incand Peak Time,Incand FWHM,Incand Delay,Incand Mass (fg),Reserved
1 Time (sec) Packet Time Stamp Flag Dropped Records Record Count Record Size Particle Time Stamp Particle Flags Scatter relPeak Scatter Transit Time Scatter Peak Time Scatter FWHM Scatter Size (nm) Incand relPeak Incand Transit Time Incand Peak Time Incand FWHM Incand Delay Incand Mass (fg) Reserved

43
meta_files/pbp_meta.yaml Normal file
View File

@@ -0,0 +1,43 @@
file_type: csv
name: pbp_meta
columns:
- name: Time (sec)
type: string
- name: Packet Time Stamp
type: string
- name: Flag
type: string
- name: Dropped Records
type: string
- name: Record Count
type: string
- name: Record Size
type: string
- name: Particle Time Stamp
type: string
- name: Particle Flags
type: string
- name: Scatter relPeak
type: string
- name: Scatter Transit Time
type: string
- name: Scatter Peak Time
type: string
- name: Scatter FWHM
type: string
- name: Scatter Size (nm)
type: string
- name: Incand relPeak
type: string
- name: Incand Transit Time
type: string
- name: Incand Peak Time
type: string
- name: Incand FWHM
type: string
- name: Incand Delay
type: string
- name: Incand Mass (fg)
type: string
- name: Reserved
type: string

44
meta_files/read.py Normal file
View File

@@ -0,0 +1,44 @@
import pandas as pd
import yaml
import os
def infer_dtype(dtype):
    """Translate a pandas/NumPy dtype into a coarse schema label.

    Parameters
    ----------
    dtype
        A dtype object, e.g. ``df[col].dtype``.

    Returns
    -------
    str
        ``"int"``, ``"float"`` or ``"datetime"`` when the matching
        ``pandas.api.types`` predicate accepts ``dtype`` (checked in that
        order); ``"string"`` for everything else.
    """
    type_checks = pd.api.types
    # Order matters: the first predicate that accepts the dtype wins.
    labelled_predicates = (
        (type_checks.is_integer_dtype, "int"),
        (type_checks.is_float_dtype, "float"),
        (type_checks.is_datetime64_any_dtype, "datetime"),
    )
    for accepts, label in labelled_predicates:
        if accepts(dtype):
            return label
    return "string"
def load_schema(input_file):
    """Build a ``{column name: type label}`` mapping for a CSV or Parquet file.

    Parameters
    ----------
    input_file : str
        Path of the file to inspect. The (case-insensitive) extension selects
        the reader: ``.csv`` or ``.parquet``.

    Returns
    -------
    dict
        One entry per column, in file order, whose value is the label produced
        by ``infer_dtype`` for that column's dtype.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``.
    """
    suffix = os.path.splitext(input_file)[1].lower()
    if suffix not in (".csv", ".parquet"):
        raise ValueError(f"Unsupported file format: {suffix}")

    if suffix == ".csv":
        # Only the first 100 rows are sampled for dtype inference.
        frame = pd.read_csv(input_file, nrows=100)
    else:
        frame = pd.read_parquet(input_file)

    schema = {}
    for column in frame.columns:
        schema[column] = infer_dtype(frame[column].dtype)
    return schema
def generate_combined_config(pbp_file, hk_file, output_file="config.yaml"):
    """Infer the PbP and hk schemas and dump them into one YAML file.

    Parameters
    ----------
    pbp_file : str
        File whose inferred schema is stored under the ``pbp_schema`` key.
    hk_file : str
        File whose inferred schema is stored under the ``hk_schema`` key.
    output_file : str, optional
        Destination of the YAML document (default ``"config.yaml"``).

    Returns
    -------
    None
        The result is written to ``output_file`` and a confirmation line is
        printed.
    """
    pbp_schema = load_schema(pbp_file)
    hk_schema = load_schema(hk_file)
    combined = {"pbp_schema": pbp_schema, "hk_schema": hk_schema}

    with open(output_file, "w") as handle:
        # sort_keys=False keeps the columns in the order they were inferred.
        yaml.dump(combined, handle, sort_keys=False)

    print(f"Unified config saved to: {output_file}")
# Example usage:
# NOTE(review): this looks like a module-level call, so it would run every time
# the file is executed or imported. Both CSV paths and the default
# "config.yaml" output are relative to the current working directory — confirm
# this is intended.
generate_combined_config("pbp_meta.csv", "hk_meta.csv")

29
pyproject.toml Normal file
View File

@@ -0,0 +1,29 @@
[project]
name = "sp2xr"
version = "0.0.0"
description = "SP2-XR toolkit (placeholder until full v2 metadata)"
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
"dask[dataframe]>=2024.6",
"pandas>=2.2",
"numpy>=1.26",
"scipy>=1.11",
"matplotlib>=3.8",
"seaborn>=0.13",
# add others as you discover they're imported at top level
]
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[tool.setuptools]
# tell setuptools our code lives in src/
package-dir = {"" = "src"}
[project.optional-dependencies]
notebook = [
"ipython>=8",
"ipywidgets>=8",
]

16
src/sp2xr/__init__.py Normal file
View File

@@ -0,0 +1,16 @@
"""
Thin wrapper so `import sp2xr` keeps working
while we refactor the legacy monolith.
"""
from importlib import import_module

# 1. Load the legacy monolith (makes the old symbols available).
_toolkit = import_module(".toolkit_legacy", package=__name__)

# Re-export the legacy names at package level, but skip the attributes that
# describe the legacy *module itself*. Copying the whole ``__dict__`` would
# overwrite this package's own ``__name__``, ``__doc__``, ``__file__``,
# ``__spec__`` ... with the values of ``sp2xr.toolkit_legacy``.
_MODULE_IDENTITY_ATTRS = frozenset(
    {
        "__name__",
        "__doc__",
        "__package__",
        "__loader__",
        "__spec__",
        "__file__",
        "__cached__",
        "__builtins__",
        "__path__",
    }
)
globals().update(
    {
        _key: _value
        for _key, _value in vars(_toolkit).items()
        if _key not in _MODULE_IDENTITY_ATTRS
    }
)

# 2. Import the new helpers that should be public at the package root.
from .io import csv_to_parquet, read_csv_files_with_dask  # noqa: F401,E402

# Clean up the names that are only needed while this module initialises.
del import_module, _toolkit, _MODULE_IDENTITY_ATTRS

202
src/sp2xr/io.py Normal file
View File

@@ -0,0 +1,202 @@
import pandas as pd
from pathlib import Path
import os
import re
import zipfile
import warnings
import numpy as np
import dask.dataframe as dd
from .toolkit_legacy import calculate_delta_sec, extract_datetime
def csv_to_parquet(csv_path: Path, parquet_path: Path, **read_csv_kwargs) -> None:
    """
    Convert one CSV file into a Parquet file.

    Parameters
    ----------
    csv_path : Path | str
        Source CSV to read.
    parquet_path : Path | str
        Output Parquet path. Missing parent directories are created.
    **read_csv_kwargs
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    None
        The data is written to ``parquet_path`` without the DataFrame index.
    """
    source = Path(csv_path)
    destination = Path(parquet_path)

    # Read first: if the CSV cannot be loaded, no output directory is created.
    frame = pd.read_csv(source, **read_csv_kwargs)

    destination.parent.mkdir(parents=True, exist_ok=True)
    frame.to_parquet(destination, index=False)
def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory):
"""
Read one SP2XR PbP or hk file with Dask, add time and bookkeeping columns,
and write the result to Parquet.

The first row of the matching hk "_x0001" file (".csv" if present, otherwise
".zip") supplies the reference "Time (sec)" / "Time Stamp" pair that is stored
in the ``first_val`` and ``t0`` columns.

Parameters
----------
file_path : str
    Complete path of the file to read. The reader branch is chosen by whether
    the path contains "PbP" or "hk".
meta_pbp : pandas DataFrame
    Its column dtypes are passed as ``dtype`` when a PbP file is read.
meta_hk : pandas DataFrame
    Its column dtypes, except ``datetime64[ns]`` ones, are passed as ``dtype``
    when an hk file is read.
target_directory : str
    Passed as ``path`` to ``to_parquet``; the output is partitioned on
    ``date`` and ``hour``.

Returns
-------
Dask DataFrame or pandas DataFrame
    The processed Dask DataFrame, or an empty pandas DataFrame from the
    error-handling branches and when no hk reference row is available
    (see the inline review notes on the exact nesting).

Raises
------
ValueError
    With the message "No CSV files found." when ``file_path`` is falsy.
"""
if file_path:
tmp_hk = pd.DataFrame()

# Derive the extension-less path of the first hk file ("_x0001") that
# corresponds to file_path.
hk_0001 = re.sub(r"PbP", "hk", file_path)
hk_0001 = re.sub(r"(_x)\d{4}", r"\g<1>0001", hk_0001)
hk_0001 = re.sub(r"\.(csv|zip)$", "", hk_0001)
if os.path.exists(f"{hk_0001}.csv"):
try:
# Only the first row is needed: it provides the reference time pair.
tmp_hk = pd.read_csv(
f"{hk_0001}.csv",
nrows=1,
parse_dates=["Time Stamp"],
usecols=["Time Stamp", "Time (sec)"],
)
except pd.errors.EmptyDataError:
tmp_hk = pd.DataFrame()
except zipfile.BadZipFile:
# NOTE(review): the message reports file_path, not the hk file that was read.
print(f"!! Bad file: {file_path}")
tmp_hk = pd.DataFrame()
elif os.path.exists(f"{hk_0001}.zip"):
try:
tmp_hk = pd.read_csv(
f"{hk_0001}.zip",
nrows=1,
parse_dates=["Time Stamp"],
usecols=["Time Stamp", "Time (sec)"],
)
except pd.errors.EmptyDataError:
tmp_hk = pd.DataFrame()
if not tmp_hk.empty:
# first_val: "Time (sec)" of the first hk row; t0: its "Time Stamp".
first_val, t0 = tmp_hk[["Time (sec)", "Time Stamp"]].values[0]
if "PbP" in file_path:
temp = meta_pbp
# Map column name -> dtype, taken from the meta DataFrame.
data_type = pd.Series(temp.dtypes.values, index=temp.columns).to_dict()
try:
df = dd.read_csv(
file_path, dtype=data_type, blocksize=None
) # , include_path_column=True)
df = df.fillna(
0
) # is this because otherwise we cannot calculate the time_lag?
# df['time_lag'] = df['Incand Peak Time'] - df['Scatter Peak Time'] # 02.09.2024 this line implies that for particles with nan in the scatt transit time time_lag=incand transit time. better to calculate timelag for particles with both scatt and incand and set 0 for particles with only incand
#!!! MISSING CORRECT TIME LAG CALCULATIONS
except zipfile.BadZipFile:
print(f"!! Bad zip file: {file_path}")
df = pd.DataFrame()
return df
elif "hk" in file_path:
temp = meta_hk
data_type = pd.Series(temp.dtypes.values, index=temp.columns).to_dict()
# Leave datetime columns out of the dtype map; "Time Stamp" is passed
# through parse_dates instead.
filtered_dtype_dict = {
key: value
for key, value in data_type.items()
if value != "datetime64[ns]"
}
try:
df = dd.read_csv(
file_path,
dtype=filtered_dtype_dict,
parse_dates=["Time Stamp"],
blocksize=None,
assume_missing=True,
)
# df = dd.read_csv(file_path, dtype=data_type, parse_dates=['Time Stamp'], blocksize=None)#, assume_missing=True)
# The bare string below is disabled code kept as a string expression; it is
# evaluated and discarded.
"""if 'Time Stamp' in df.columns:
datetime_format = '%m/%d/%Y %H:%M:%S.%f'
df['Time Stamp'] = df['Time Stamp'].map_partitions(pd.to_datetime, format=datetime_format, meta=('Time Stamp', 'datetime64[ns]'))
"""
except ValueError as e:
# Handle the error if the 'Time Stamp' column is missing or any other parsing error occurs
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# df assignment and return below belong to the `if`; confirm that df is
# bound for every other ValueError.
if "Missing column provided to 'parse_dates'" in str(e):
print(
f"Error for {file_path}: Missing column provided to 'parse_dates': 'Time Stamp'"
)
df = pd.DataFrame()
return df
except pd.errors.EmptyDataError:
df = pd.DataFrame()
return df
except zipfile.BadZipFile:
print(f"!! Bad zip file: {file_path}")
df = pd.DataFrame()
return df
# NOTE(review): df is only assigned in the "PbP" and "hk" branches above; a
# path containing neither would reach this line with df unbound — confirm
# callers never pass such a path.
if len(df.columns) > 0:
# Drop rows in which every column is NaN.
df = df.loc[~df.isna().all(axis=1)]
df["path"] = str(file_path)
df["first_val"] = first_val
df["t0"] = t0
# Join the last two "_"-separated tokens of the file name, without the
# extension. The path is split on backslashes only.
file_name_cut = (
file_path.split("\\")[-1].split("_")[-2]
+ "_"
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
)
df["file"] = file_name_cut
folder_name = file_path.split("\\")[-1].split("_")[-2]
df["folder_name"] = folder_name
if "Time Stamp" in df.columns:
df["Time Stamp"] = df["Time Stamp"].map_partitions(
pd.to_datetime, meta=("Time Stamp", "datetime64[ns]")
)
# delta_sec is computed per partition by calculate_delta_sec; it is then added
# to t0 to obtain calculated_time.
df["delta_sec"] = df.map_partitions(
calculate_delta_sec, meta=("delta_sec", "float64")
)
df["calculated_time"] = df["t0"] + dd.to_timedelta(
df["delta_sec"], unit="s"
)
df["file_datetime"] = df.apply(
extract_datetime, axis=1, meta=("file_datetime", "datetime64[ns]")
)
# NOTE(review): "H" and "S" are deprecated frequency aliases in pandas >= 2.2
# (lowercase "h" / "s" are preferred) — consider updating.
df["date_floored"] = df["calculated_time"].dt.floor("H")
df["date"] = df["calculated_time"].dt.date.astype("date64[pyarrow]")
df["hour"] = df["calculated_time"].dt.hour.astype("i8")
df["floor_time"] = df["calculated_time"].dt.floor("S")
# NOTE(review): np.floor yields floats while meta declares "i8" — confirm the
# intended dtype of Secs_2GB.
df["Secs_2GB"] = df["Time (sec)"].apply(
np.floor, meta=("Secs_2GB", "i8")
)
# Same "<token>_<token>" stem as file_name_cut; used as the Parquet file name.
fn = (
file_path.split("\\")[-1].split("_")[-2]
+ "_"
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
)
# Every partition gets the same file name; part_idx is ignored.
def name(part_idx):
return f"{fn}.parquet"
# sorted=True: Dask is told to treat calculated_time as already sorted —
# TODO confirm this holds for all input files.
df = df.set_index("calculated_time", drop=True, sort=False, sorted=True)
df.to_parquet(
path=target_directory,
engine="pyarrow",
partition_on=["date", "hour"],
coerce_timestamps="us",
allow_truncated_timestamps=True,
name_function=name,
write_index=True,
append=False,
)
return df
else:
# Presumably pairs with `if not tmp_hk.empty` above — confirm nesting.
warnings.warn("tmp_hk empty or not existing")
return pd.DataFrame()
else:
raise ValueError("No CSV files found.")

View File

@@ -9,8 +9,6 @@ import dask.dataframe as dd
# import dask.delayed
import pandas as pd
import numpy as np
import re
import warnings
from scipy.optimize import curve_fit
from numpy.polynomial import Polynomial
import matplotlib.pyplot as plt
@@ -23,8 +21,14 @@ from dask import delayed
import time
import ipywidgets as widgets
from IPython.display import display
try:
import ipywidgets as widgets
except ImportError: # pragma: no cover
widgets = None # interactive features will raise later if used
try:
from IPython.display import display, clear_output
except ImportError: # pragma: no cover
display = clear_output = None # or define noop fallbacks
from matplotlib.backends.backend_pdf import PdfPages
@@ -179,174 +183,6 @@ def calculate_delta_sec(df):
return np.floor(df["Time (sec)"]) - df["first_val"]
@delayed
def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory):
"""
Read one SP2XR PbP or hk file with Dask, add time and bookkeeping columns,
and write the result to Parquet.

The function is wrapped with ``@delayed`` and contains no ``return``
statement; its effect is the Parquet output written under
``target_directory``.

The first row of the matching hk "_x0001" file (".csv" if present, otherwise
".zip") supplies the reference "Time (sec)" / "Time Stamp" pair that is stored
in the ``first_val`` and ``t0`` columns.

Parameters
----------
file_path : str
    Complete path of the file to read. The reader branch is chosen by whether
    the path contains "PbP" or "hk".
meta_pbp : pandas DataFrame
    Its column dtypes are passed as ``dtype`` when a PbP file is read.
meta_hk : pandas DataFrame
    Its column dtypes, except ``datetime64[ns]`` ones, are passed as ``dtype``
    when an hk file is read.
target_directory : str
    Passed as ``path`` to ``to_parquet``; the output is partitioned on
    ``date`` and ``hour``.

Raises
------
ValueError
    With the message "No CSV files found." when ``file_path`` is falsy.
"""
if file_path:
tmp_hk = pd.DataFrame()

# Derive the extension-less path of the first hk file ("_x0001") that
# corresponds to file_path.
hk_0001 = re.sub(r"PbP", "hk", file_path)
hk_0001 = re.sub(r"(_x)\d{4}", r"\g<1>0001", hk_0001)
hk_0001 = re.sub(r"\.(csv|zip)$", "", hk_0001)
if os.path.exists(f"{hk_0001}.csv"):
try:
# Only the first row is needed: it provides the reference time pair.
tmp_hk = pd.read_csv(
f"{hk_0001}.csv",
nrows=1,
parse_dates=["Time Stamp"],
usecols=["Time Stamp", "Time (sec)"],
)
except pd.errors.EmptyDataError:
tmp_hk = pd.DataFrame()
except zipfile.BadZipFile:
# NOTE(review): the message reports file_path, not the hk file that was read.
print(f"!! Bad file: {file_path}")
tmp_hk = pd.DataFrame()
elif os.path.exists(f"{hk_0001}.zip"):
try:
tmp_hk = pd.read_csv(
f"{hk_0001}.zip",
nrows=1,
parse_dates=["Time Stamp"],
usecols=["Time Stamp", "Time (sec)"],
)
except pd.errors.EmptyDataError:
tmp_hk = pd.DataFrame()
if not tmp_hk.empty:
# first_val: "Time (sec)" of the first hk row; t0: its "Time Stamp".
first_val, t0 = tmp_hk[["Time (sec)", "Time Stamp"]].values[0]
if "PbP" in file_path:
temp = meta_pbp
# Map column name -> dtype, taken from the meta DataFrame.
data_type = pd.Series(temp.dtypes.values, index=temp.columns).to_dict()
try:
df = dd.read_csv(
file_path, dtype=data_type, blocksize=None
) # , include_path_column=True)
df = df.fillna(
0
) # is this because otherwise we cannot calculate the time_lag?
# df['time_lag'] = df['Incand Peak Time'] - df['Scatter Peak Time'] # 02.09.2024 this line implies that for particles with nan in the scatt transit time time_lag=incand transit time. better to calculate timelag for particles with both scatt and incand and set 0 for particles with only incand
#!!! MISSING CORRECT TIME LAG CALCULATIONS
except zipfile.BadZipFile:
print(f"!! Bad zip file: {file_path}")
df = pd.DataFrame()
elif "hk" in file_path:
temp = meta_hk
data_type = pd.Series(temp.dtypes.values, index=temp.columns).to_dict()
# Leave datetime columns out of the dtype map; "Time Stamp" is passed
# through parse_dates instead.
filtered_dtype_dict = {
key: value
for key, value in data_type.items()
if value != "datetime64[ns]"
}
try:
df = dd.read_csv(
file_path,
dtype=filtered_dtype_dict,
parse_dates=["Time Stamp"],
blocksize=None,
assume_missing=True,
)
# df = dd.read_csv(file_path, dtype=data_type, parse_dates=['Time Stamp'], blocksize=None)#, assume_missing=True)
# The bare string below is disabled code kept as a string expression; it is
# evaluated and discarded.
"""if 'Time Stamp' in df.columns:
datetime_format = '%m/%d/%Y %H:%M:%S.%f'
df['Time Stamp'] = df['Time Stamp'].map_partitions(pd.to_datetime, format=datetime_format, meta=('Time Stamp', 'datetime64[ns]'))
"""
except ValueError as e:
# Handle the error if the 'Time Stamp' column is missing or any other parsing error occurs
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# df assignment below belongs to the `if`; confirm that df is bound for
# every other ValueError.
if "Missing column provided to 'parse_dates'" in str(e):
print(
f"Error for {file_path}: Missing column provided to 'parse_dates': 'Time Stamp'"
)
df = pd.DataFrame()
except pd.errors.EmptyDataError:
df = pd.DataFrame()
except zipfile.BadZipFile:
print(f"!! Bad zip file: {file_path}")
df = pd.DataFrame()
# An empty pandas DataFrame (set by the handlers above) has no columns and so
# skips the processing below.
# NOTE(review): df is only assigned in the "PbP" and "hk" branches above; a
# path containing neither would reach this line with df unbound — confirm
# callers never pass such a path.
if len(df.columns) > 0:
# Drop rows in which every column is NaN.
df = df.loc[~df.isna().all(axis=1)]
df["path"] = str(file_path)
df["first_val"] = first_val
df["t0"] = t0
# Join the last two "_"-separated tokens of the file name, without the
# extension. The path is split on backslashes only.
file_name_cut = (
file_path.split("\\")[-1].split("_")[-2]
+ "_"
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
)
df["file"] = file_name_cut
folder_name = file_path.split("\\")[-1].split("_")[-2]
df["folder_name"] = folder_name
if "Time Stamp" in df.columns:
df["Time Stamp"] = df["Time Stamp"].map_partitions(
pd.to_datetime, meta=("Time Stamp", "datetime64[ns]")
)
# delta_sec is computed per partition by calculate_delta_sec; it is then added
# to t0 to obtain calculated_time.
df["delta_sec"] = df.map_partitions(
calculate_delta_sec, meta=("delta_sec", "float64")
)
df["calculated_time"] = df["t0"] + dd.to_timedelta(
df["delta_sec"], unit="s"
)
df["file_datetime"] = df.apply(
extract_datetime, axis=1, meta=("file_datetime", "datetime64[ns]")
)
# NOTE(review): "H" and "S" are deprecated frequency aliases in pandas >= 2.2
# (lowercase "h" / "s" are preferred) — consider updating.
df["date_floored"] = df["calculated_time"].dt.floor("H")
df["date"] = df["calculated_time"].dt.date.astype("date64[pyarrow]")
df["hour"] = df["calculated_time"].dt.hour.astype("i8")
df["floor_time"] = df["calculated_time"].dt.floor("S")
# NOTE(review): np.floor yields floats while meta declares "i8" — confirm the
# intended dtype of Secs_2GB.
df["Secs_2GB"] = df["Time (sec)"].apply(
np.floor, meta=("Secs_2GB", "i8")
)
# Same "<token>_<token>" stem as file_name_cut; used as the Parquet file name.
fn = (
file_path.split("\\")[-1].split("_")[-2]
+ "_"
+ file_path.split("\\")[-1].split("_")[-1].split(".")[-2]
)
# Every partition gets the same file name; part_idx is ignored.
def name(part_idx):
return f"{fn}.parquet"
# sorted=True: Dask is told to treat calculated_time as already sorted —
# TODO confirm this holds for all input files.
df = df.set_index("calculated_time", drop=True, sort=False, sorted=True)
df.to_parquet(
path=target_directory,
engine="pyarrow",
partition_on=["date", "hour"],
coerce_timestamps="us",
allow_truncated_timestamps=True,
name_function=name,
write_index=True,
append=False,
)
# Drop the local reference once the Parquet output has been written.
del df
else:
# Presumably pairs with `if not tmp_hk.empty` above — confirm nesting.
warnings.warn("tmp_hk empty or not existing")
else:
raise ValueError("No CSV files found.")
# %% Functions to read sp2b files

2
tests/test_import.py Normal file
View File

@@ -0,0 +1,2 @@
def test_import_package():
"""Smoke test: importing the ``sp2xr`` package must not raise."""
import sp2xr # noqa: F401