From 399df18e81e239fe85e5fccd8a0ea72a146c6d02 Mon Sep 17 00:00:00 2001 From: Barbara Bertozzi Date: Fri, 25 Jul 2025 15:33:33 +0200 Subject: [PATCH] test: add real-data tests for PbP and HK ZIP input files --- src/sp2xr/io.py | 58 +++++-- tests/data/config.yaml | 159 ++++++++++++++++++ .../mini_SP2XR_PbP_20190409110737_x0001.zip | Bin 0 -> 1658 bytes .../mini_SP2XR_hk_20190409110737_x0001.zip | Bin 0 -> 7526 bytes tests/test_io_real.py | 34 ++++ tests/test_roundtrip.py | 23 +++ 6 files changed, 256 insertions(+), 18 deletions(-) create mode 100644 tests/data/config.yaml create mode 100644 tests/data/mini_SP2XR_PbP_20190409110737_x0001.zip create mode 100644 tests/data/mini_SP2XR_hk_20190409110737_x0001.zip create mode 100644 tests/test_io_real.py create mode 100644 tests/test_roundtrip.py diff --git a/src/sp2xr/io.py b/src/sp2xr/io.py index 3b108c6..bb8e100 100644 --- a/src/sp2xr/io.py +++ b/src/sp2xr/io.py @@ -4,6 +4,7 @@ import os import re import zipfile import warnings +import yaml import numpy as np import dask.dataframe as dd @@ -30,7 +31,7 @@ def csv_to_parquet(csv_path: Path, parquet_path: Path, **read_csv_kwargs) -> Non df.to_parquet(parquet_path, index=False) -def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory): +def read_csv_files_with_dask(file_path, config_path, target_directory): """ This function reads Pbp or HK files from the SP2XR @@ -49,6 +50,15 @@ def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory): Content of the file as Dask DataFrame. """ + if config_path: + with open(config_path) as f: + schema = yaml.safe_load(f) + + pbp_schema = schema["pbp_schema"] + hk_schema = schema["hk_schema"] + else: + raise ValueError("No config file found.") + if file_path: tmp_hk = pd.DataFrame() @@ -82,12 +92,28 @@ def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory): if not tmp_hk.empty: first_val, t0 = tmp_hk[["Time (sec)", "Time Stamp"]].values[0] if "PbP" in file_path: - temp = meta_pbp - data_type = pd.Series(temp.dtypes.values, index=temp.columns).to_dict() + """ + temp = pbp_schema + data_type = { + col: ( + "float64" + if typ == "float" + else "int64" if typ == "int" else "string" + ) # default fallback + for col, typ in pbp_schema.items() + if typ != "datetime" + } + parse_dates = [ + col for col, typ in pbp_schema.items() if typ == "datetime" + ] + """ try: df = dd.read_csv( - file_path, dtype=data_type, blocksize=None - ) # , include_path_column=True) + file_path, + dtype=pbp_schema, # data_type, + # parse_dates=parse_dates, + blocksize=None, + ) df = df.fillna( 0 ) # is this because otherwise we cannot calculate the time_lag? @@ -99,18 +125,17 @@ def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory): return df elif "hk" in file_path: - temp = meta_hk - data_type = pd.Series(temp.dtypes.values, index=temp.columns).to_dict() - filtered_dtype_dict = { - key: value - for key, value in data_type.items() - if value != "datetime64[ns]" + datetime_cols = [ + col for col, typ in hk_schema.items() if typ == "datetime" + ] + dtype_cols = { + col: typ for col, typ in hk_schema.items() if typ != "datetime" } try: df = dd.read_csv( file_path, - dtype=filtered_dtype_dict, - parse_dates=["Time Stamp"], + dtype=dtype_cols, # filtered_dtype_dict, + parse_dates=datetime_cols, blocksize=None, assume_missing=True, ) @@ -164,10 +189,10 @@ def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory): df["file_datetime"] = df.apply( extract_datetime, axis=1, meta=("file_datetime", "datetime64[ns]") ) - df["date_floored"] = df["calculated_time"].dt.floor("H") + df["date_floored"] = df["calculated_time"].dt.floor("h") df["date"] = df["calculated_time"].dt.date.astype("date64[pyarrow]") df["hour"] = df["calculated_time"].dt.hour.astype("i8") - df["floor_time"] = df["calculated_time"].dt.floor("S") + df["floor_time"] = df["calculated_time"].dt.floor("s") df["Secs_2GB"] = df["Time (sec)"].apply( np.floor, meta=("Secs_2GB", "i8") ) @@ -200,6 +225,3 @@ def read_csv_files_with_dask(file_path, meta_pbp, meta_hk, target_directory): else: raise ValueError("No CSV files found.") - - -# TEMP:test commit separation diff --git a/tests/data/config.yaml b/tests/data/config.yaml new file mode 100644 index 0000000..701e851 --- /dev/null +++ b/tests/data/config.yaml @@ -0,0 +1,159 @@ +pbp_schema: + Time (sec): float + Packet Time Stamp: float + Flag: float + Dropped Records: float + Record Count: float + Record Size: float + Particle Time Stamp: float + Particle Flags: float + Scatter relPeak: float + Scatter Transit Time: float + Scatter Peak Time: float + Scatter FWHM: float + Scatter Size (nm): float + Incand relPeak: float + Incand Transit Time: float + Incand Peak Time: float + Incand FWHM: float + Incand Delay: float + Incand Mass (fg): float + Reserved: float +hk_schema: + Time Stamp: datetime + Time (sec): float + Time Stamp (UTC sec): float + Elapsed Time: float + Error Code: float + Packet Time Stamp: float + Laser TEC Temp (C): float + Crystal TEC Temp (C): float + Inlet Air Temp (C): float + Computer Heatsink Temp (C): float + Laser Heatsink Temp (C): float + Outlet Air Temp (C): float + YAG Output Monitor (V): float + Cavity Pressure (hPa): float + Laser Driver Power Monitor (uA): float + Laser Driver Current Limit Monitor (A): float + Laser Driver Current Monitor (A): float + Laser TEC Sense: float + Laser Over Temp (On/Off): float + +5V Laser Rail (V): float + ' +5V Rail (V)': float + +12V Rail (V): float + High Voltage (V): float + Battery Temp (C): float + UPS Output (V): float + 12V Iso Rail (V): float + 5V Iso Rail (V): float + 3.3V Iso Rail (V): float + Spare 22: float + Spare 23: float + 408 Board Spare 0: float + 408 Board Spare 1: float + 408 Board Spare 2: float + 408 Board Spare 3: float + 408 Board Spare 4: float + Purge Flow Monitor (sccm): float + System Input Voltage (V): float + Board Temperature (C): float + 408 Board Spare 8: float + 408 Board Spare 9: float + 408 Board Spare 10: float + 408 Board Spare 11: float + 408 Board Spare 12: float + 408 Board Spare 13: float + 408 Board Spare 14: float + 408 Board Spare 15: float + Sheath Flow Controller Read (vccm): float + Sheath Flow Controller Read (sccm): float + Sheath Flow Controller Pressure (psia): float + Sheath Flow Controller Temperature (C): float + Sample Flow Controller Read (vccm): float + Sample Flow Controller Read (sccm): float + Sample Flow Controller Pressure (psia): float + Sample Flow Controller Temperature (C): float + Fan 1 (RPM): float + Fan 2 (RPM): float + Laser Fan (RPM): float + Spare tach: float + Threshold Crossing Events: float + Dual Qualified Scatter and Incand Particles: float + Qualified Scatter Only Particles: float + Qualified Incand Only Particles: float + Disqualified Due to Scatter Saturation: float + Disqualified Due to Scatter Transit Time Min: float + Disqualified Due to Scatter Transit Time Max: float + Disqualified Due to Scatter FWHM Min: float + Disqualified Due to Scatter FWHM Max: float + Scatter Inter Part Period Min Violation: float + Disqualified Due to Incand Saturation: float + Disqualified Due to Incand Transit Time Min: float + Disqualified Due to Incand Transit Time Max: float + Disqualified Due to Incand FWHM Min: float + Disqualified Due to Incand FWHM Max: float + Incand Inter Part Period Min Violation: float + Baseline Sizer Lo: float + Baseline Sizer Hi: float + Baseline Incand Lo: float + Baseline Incand Hi: float + Bandwidth Sizer Hi: float + Bandwidth Sizer Lo: float + Bandwidth Incand Lo: float + Bandwidth Incand Hi: float + ABD-0408 HK ADCs min: float + ABD-0436 HK ADCs min: float + ABD-0408 HK ADCs max: float + ABD-0436 HK ADCs max: float + Incand Particle Conc (cts/ccm): float + Scattering Particle Conc (cts/ccm): float + Incand Mass Conc (fg/sccm): float + Scattering Mass Conc (fg/sccm): float + Sheath Flow Set Point: float + Sample Flow Set Point: float + Laser Temp Set Point: float + Laser Current Set Point: float + Spare 4 Set Point: float + Spare 5 Set Point: float + PMT HV Set Point: float + Particle Density (g/ccm): float + PbP Packet Time: float + Scatter Bin 1: float + Scatter Bin 2: float + Scatter Bin 3: float + Scatter Bin 4: float + Scatter Bin 5: float + Scatter Bin 6: float + Scatter Bin 7: float + Scatter Bin 8: float + Scatter Bin 9: float + Scatter Bin 10: float + Scatter Bin 11: float + Scatter Bin 12: float + Scatter Bin 13: float + Scatter Bin 14: float + Scatter Bin 15: float + Scatter Bin 16: float + Scatter Bin 17: float + Scatter Bin 18: float + Scatter Bin 19: float + Incand Bin 1: float + Incand Bin 2: float + Incand Bin 3: float + Incand Bin 4: float + Incand Bin 5: float + Incand Bin 6: float + Incand Bin 7: float + Incand Bin 8: float + Incand Bin 9: float + Incand Bin 10: float + Incand Bin 11: float + Incand Bin 12: float + Incand Bin 13: float + Incand Bin 14: float + Incand Bin 15: float + Incand Bin 16: float + Incand Bin 17: float + Incand Bin 18: float + Incand Bin 19: float diff --git a/tests/data/mini_SP2XR_PbP_20190409110737_x0001.zip b/tests/data/mini_SP2XR_PbP_20190409110737_x0001.zip new file mode 100644 index 0000000000000000000000000000000000000000..7634d40233cea22bfb23d0436b0fd813146ba6e2 GIT binary patch literal 1658 zcma*oe>Bqz9|!R7j0|l^D|Lo#`mrgQ7H7LU6z zbZ1zyWZbSAA(2Fso8Omi_ae83bn)ms|2^kC=RD_g-skiF>+|R5MW=zaHv<3w0S@?m zVMeK<{C?B{fFw8o>;@3PxwDaHS-y0R2PQOFueI6u<>l#jD_^G6T~4}JSK|_*c~_QJ zq4~C@ggca$>6ZCS99;{*9dZx^ClNHMsrG1+&bOkFb%NM^cu6?POyZwE*EaNzmXLHmf zGF;}Bi|6y3_&w21mZPbH!3neQEWxZ3j;rdH3mh(`C!QP*%Y+KR*08xTo|8JxxGYq&2ed z?=g?@3QcQgbd))QD)qKX27QW7@08fr&`X$Lrq^MT=a{?&*}^zmw6d{n8@#VA$gZk2 zMaiJadQHHoA4AdqSTX%(M93n~!tW8edpy;)#4ry3qnO&2{UUGrb)xlH>ah>@jEfp)Z&~S4b*=L zH^L{_6w)|Bg7Mqgi0vHLt8)Et8Y28oK~UWe16!K@pgVS=mSFy-+^W!@B6cu8T4X}` z+Q*UY4ZISVC-{!AzX27Ck(68~F5qQ*F%_a?!`kJ?C4&w)*RZ)#wriXmG-7vQIs zk!7ekxux74n-2<+75$w75lc2A=mKiNDqy=9^qoI~3!snAZ{ocI#IPyF03#H#{Aea^1q0dD~DW zeoJ(uznN`xc2;*G%FR1<`^GH8cp{VP5&Wl`K@)h}M4f{(`mw^kNI5bl?xd)<-$Pk| zy=YK^ZGIwYBQEKnO5#mTvMHmmLD%!EN6Z7kTKUCl8$9|BI6eIhN_qBVa{`lU)<$YV z!_tdk=(2vvD`z=p{Cu6U{(*yW#QH}<^NZ*XxWPGylvWG2kyz8D|2?9&H7oEy|gXF28$0#IJIO%UT&HBAAuQb)+ zHoj%wuGdiEW+>6?W&t6ipy^Wby~pG|<1b}vtQWpFyJexhWRi1x7TG<-VL0hB+|Ja5 zt_cc^L8-z%cX~h$q^y_4wTO)2)tLdl76wN%z z1>rMOu*sLuQ}7^hNsPcyJI8{cjN`q|+;$=Pmocf9?9U*3<%ZqA3p1_7=ysn{J$9Xo zAsd-(HAJ|ZnQ8W}5qm2pjVN_LB*t(KyNKq=0d_O`cmEhNJsGd7qg;c9d836M{FTa&$<0 zYjA3Sz;3!X-3f{4zAGqs7Ft&4xPTEAwmdg7c6>y|&>(ACbQ(y@9Q?l?zv(&%003H# i^!m50%-9|r!506I-)!?!b_^*Oh`7uSUWfPVw*M(}R{ literal 0 HcmV?d00001 diff --git a/tests/data/mini_SP2XR_hk_20190409110737_x0001.zip b/tests/data/mini_SP2XR_hk_20190409110737_x0001.zip new file mode 100644 index 0000000000000000000000000000000000000000..608d7c0106f0d0b727075319598a50ffe701dcc5 GIT binary patch literal 7526 zcma*sRZJWV6DVvbltR%0#TICBitFO;?(XhV+$qrF?u!IGltGCI`~Ri4A0`qqsz4(W z5-LR;64HGBO6sbtN8}mBmhXWQ6H(+*&{V#(VtMbjBXoCUYZNGeFXY2dGl^A}QMi6@ zx24TbI2A?Z%d3f5DeFsIElT?`GR6cI2x?vL2w4rig9Jp3+u<&{b;;)%w*}rT@=f|0 z-yz0d7Nd@Rg`ft(fuW(kU4bFDr)_P^UC%4mJGHOM*L^Qr4k?hXO>BQWKOU5OxEeiv-s#)AaeFp8G^*GW3kk{#8V?P9J`-Z9 zs9Wx6YZnpLG%0+R-spTqoZk(e9=$wlZS9EUBOZSRKMt^;;&x5H)}Dq`ES^ez>Z0s; zecE|`zDDevM)gJYZH{K)JpI6hEXrJ7ldtjlQ_| z@op5=tURV^M|5OGnXR9Nw6*)MzIu$m-8lJFh4FfjyHlrExAM39@$Tvt*Y}xkX}Yua zx^nlrF3^#^Y#G~x^n=gFD&lMdp_!7MKBN$nldjw~sIB+u>`=P1{m)~ETW3^@o0vK) zdhs!C`e}jC9>QlN`4a;m0)zlYtqcOxF?u_L+E0I(?x5Z#MYXSNTezLxw%|s=syhYAJ$2)}yErMNH|B_f zLgV!Ewoi{9UfgeS%o-jK-F<%tv8n9T!r@!1N>wTi3ArI6Z~J*mq-;gjAR z-^z6Lje0FX32g;rL4~jA#(i;c9b@k*SA4rZtxtaxyfZiL|KuDOX$E8}NeX!$j|(NJ z=VyPly1kCu$8E>;TCY-X2lZ~Hc-_p>JkmRC!QMXl+@2tI`mzpF)q*w^jRZ^YBlRAm zqW*#lL+>vvo(|zH?_kHgYuk0FI4wz_`vaxiT;@~1o!%Q$;k#I`3Q+oz#PIwtzV<9J zY|Q1$z^~X3FH0hibora}uf_;8>dJQ8PAYrr=8DGu4i)4)X&cD4atSvby0LsYrrsEL zqPaX8f{Jq0C677LUNQ|sRk`ZY$DGQfN2MK$dJ@S_54I9XPnWh50jGCcswAiVCvule z*51hmbk@3tq8NYh416j*P9h#1D zf&q(AO|YbUCX=+c@A8xbzs=$I1HYoG$c5S4=eSRZl&1dyf}tmczN*sKm_tWm`#!V3k5Ier`Va)X2L}wzN*%% zx{$&H>2jGGJeF)k8!@O;`2v0Qy!`~R)s~R82(KoGw3SOy2hdQM0R+1OM%_N$^T~mo zss z+ry7`2hRmZi56$=hSTw(0;3WdxWrLep8@0`QjFBr1L8%~6HyEDweH>ALdg8v} zHwd>#y2UbyyJT#~W^7{2H5MK!_FFBQZ* z&%x^9#Vl+JuA@3b$r};FlV{wMBTqd}6XjAzd9=6~S&(kr35l(3FuV-2=NTxE`tUmy4%9Fe{J_4e9w2$A%Ja)p-wr!iK|OBNOy>+AD5Wo@8@E&1p2F&3(%H@`cq9(1scY+0j5Xjgt)cGXdfAO=s0 z#Uvn+fhoIJvS$1jGFihHa{yK+?}!9b!@67j!b}>qeukgMWYxFZ^|Xs-a$6wP6>_#S z4Z$*O(>+@y(zYNZBQhThgTC@nZawygm^(AVw}qe6q?k{Frr)OpWT6esfQsHhZkACQ zD(Nu}_0wvS6Djnal+!d)6RcEpuG+`T=zj;8#5-Y$-rF_w{AA))9}fPK@8tqdJCteL zrMLKl8P;K*u_vvbtkXdVzq%%9l!7aq9KLBGsD~UEd$^C&eYj|E`eaH164omVkXo)N3Ma9XFkivA_y@KS{~UiO zAMFb4a!B{sI?c2#dymL8UTQXB&lEKm?IxfX^7Ul>rAn2r1@YLF+f%c(z5A z-pJ_TAW^lx5=$}}AU@9=2CXC(1WcKL8I!y`z4x~gFi<^oL(9YKO}^r^x=vOivcJ!d zj}WMliu2AM5X7nn($owVbt+kOe1=pM{r4sLKl!YR_%9EryL#>}_h2L7-gaDn3g9Oy zB?Bco7zB#j{0^rdb1Ael{WxL5T6BU71*L>*Ns#D2EGZfb+6HnRR#(-hL7S{*DKn$Y;JgSIhsO!+zv|VQ3*3*>u=db{yRzJ+5xM7Bh zGR?@D*+~kP#D=($)fN|6AaN3mgkP93*GGk?lF~Y7{$0BAFIdMs% z)8r*>V4Qt47UfdHChOW%#WGP{PBNJz>-^xRy*b*9 z1?(x+bmPZgDo|-Uc`3IZ^=yT8M1&Fo@^&#Q1#8?3o6zetg_pka|!XVkMq05p6p;hyPH<~?AR+WM;v&_e-Zk|5S zC1~2zu1_{DZxzi)$S0vvP;l30O5Te#U0(f*k9``fJl9|#po%F0Q1TS*zJ<@iv9Kg&yca~+WSq3FP2qQd9H)K^pl^QoRj{Q{DA3Nbw#h?EAb>~ zO~DeVzhBGQn=SkV9#i)(+I#FlXoq`6@zm&-8I?++?H8ee{n)M#4)q7jt3&~}>-^!N z>Fj}t7F>!!C`n7nCooSG#B_#ZA{pAigvj8s6}c9aw%=z>-iUQ|plQ&nhLQBxvNP2_ zGDf8LOQrk+3i^uw;`$%@0L*dc8(kjs_8VtTrX45aRUXRn#A=qW4hqf=R(K(k%1uQk z)t?Oxi8kDc@|F?HY*LTa+?*Ip3CP8(HRhLu@e0m8b?i zm(~eE0=bCQE?>!)RA00<1hvti`fzupQouu?+vW7Tlu^mv+Uh7F@Y(2DBY9Q9~$LU6#gU+TIo~49Ln&zCk zI?c*|1V}E_C$|I>aw*KQ5o3vtZ{MwDx*Xq{D}bp4$ZfSOHrSVrZCci~J!%f@fzI_V zOs*U=B`kQ{J_b%RIV8`iRNQkq;kBG?c0$9lg3l4i?&lEgiFZl;KY7bp)eKM;!-g}* zydBoZNF$da##ViS6Wj+{7Y~|EXzJmq)OeC(3Pe04@#+-xS)X&?e{w!-2vGRC7 z&pcI6f0v>B`erG=MGys0Dc}Uw z(%;>_*G;wx+tY_YMa2@yVwUAo7Mz_*;!5HYz335KlORQ;V@AB8`d@6KZ(?#V%Y`gg zE8*B&6oNRf6H~NB3deCvn?;rfD4Y?J>I#;a>Ou={OYGUM#@A}fyrc@RgxYRCmp{w_ z?C1#(3Vl2wq8S%b=M<*k=+y9;f^ROE|2(n!y{aw&oW;gO`gwj}havRETHa|1Z4INT zO%4*b;9A+d-u%@RV_sUth2-*y)A1&&^y_|cT)G=lOIqNe>IXocs!RX3gHFOA!*O2; z(%Gd_>8r?~3hHSx z_eHp!iX0jJh)--yg`q;1Z3I{bhr9_w4Pe(^_sUGsp4YU5otvO7-AKx#&U8+m zRRTi~av(VvnD{LK#-9`%KzV83@S6CRovw zLqSr|)8quFW_-O{BJX9D30BIO?tIV1XgNTO5HSBrkIuH{Q2pR0y< zUPHj((_ONevD^sty6K+Zad_s$UX;CT4T&>JHTt2w@(7`=j3T$~a@qaPcfNeCW4uKY z#8;hMz}TT*Zh9@lTVTB{zA%}{y2%$NJ)<~AEYN4w*pI*b+6|d3FqCC(2jg~v;brNn z2E;|6=-jUf4z)4IO-$JZ9>J;j2K9?VKK`UPyE||bfr*nWT~z_OYa#4?6?Mu> zzIXL1{z{tWC~W(i*XH6&S-P^8{<1uO_T)d_EFuFJdK*No3g~pBsoZz{41~^^`Ocl- zJ?C?O|8&>kpzTmZ2$b^-T^A9PM-PPMSll!fq9>{p{d0)~on_}qS-eMYq8)@_v?{44 z#_x*Lu6!=xJ1F|t=ZqFMf+ z4+x-OH%BA~Wl80iqcUzh@aGv_Nt$s0*bVHyHTHNi1PeGbHeA$wD4=~~a!4ey6Kx-( z`biv&rFlw=Y*A|TPv-Y@Jl8n3yYREF=d#=+A|5E1@36s@fE`utZTT7apm|l`T_fZr z4aioQI`>f5Y8T_3a~j=uMJH=^;;_+20ogtqn_d|tEt(r*)t+;_{dY%T+eRDZRYSHh zm;+{G0R2rPWF@M^gJGQ6h|2On@!zbC9cnLD(*Its?b+3rR4Nb1$?}bnrvIpEThOm9 zGj0igK%O_L+L195RZX@S*4yHUXbU3U?jZ_^ClkcQm zGE~b#F+l%aV{U@=yT0MCpV9CTuLTk3MNljLJk49KX!48i9;GKT5dpRC2+mSFD+K8Q z51lx%Y-B|gPRa+Dd%cFR`zSA*+QBwvNUI}uU8Uq-0JhzNR1||Jh3&#pz6W@20ADrA zLD|SEN`=dt?bmD&@ibVMiT=HHPf|=EeTmt zwpP?rNDO&m%9&NRipq@|Gpc)~iC${9xx^t^j57hXcC%|Z$Di?uEfI>ZYc;M+bU;)) zwH4Qo75t0?bk1+t$*ezsxzhBv2e59--x5rqkP8$;p@)`ls?E4pn~7;U)`0H2DQh=~ z91K@-$`&Q5!NxUH18yq1&2F+HLKwS}Sb zxcR=Mjkaq`t5zE-lJ$b|SRjbTsM&fH^iHS0GacvRK7whC7J64|FLqdr;Wa99!TAxL zctsl6xaeirtT&K&j5jixmeG3ZnVnd`I;H$Q<(GpBiF{;o#kbZN-%_)JF~J%mc4{ss zxnBj2UOyXVhVZTxiz$`uX;k>Z!@w}uJ^%e`j;h=icxE{zwXnL^>S{Bvt-W{X%Pt8f z!C@lyE$zfzX>8yHZJbhV7Dw8f8U7a_V_Ly!dH@+~9YxIjp3miSN(EDz0E#O=Ygf9? zbRvUExiT!tVmK&teKC@sK_l#gWUdK?|9}U}ef<6z|IKO5l**6DCLE#HO~w=qXA{$i zJFjkUOn#s+`^(n5n|4<0?kU9rDWt;8&1ai#f!u5B?usPR9DnUN(Fiv(6pYMuB{l>E z#Q98SL%IcB$M*h|`$GTjy5`^R^l~n-*0)UFV|*FIT!z?mVR~+h!Ay6@T0_fui?-kz zI>WwH*~J3_+H$`ewDPD?EUBcv(Mzn(+-og5%0c|4eZHZlM3XNJz4`_6c)}A&w}r81 z*l171PTyJ7fM%v*8SArFZ35es-p8z*T93h}Lo;kz;}iH;SB-&>@oT`M)>tjd-=wU% zvZeNji0%^rz%lO+14t``ntW$KpsMrhO9o|-dK;Ubxhs)MM)nMefgk@yl;eZuOj7H9 zJvk?(m^}m4keVa7P>{wOui8NWdoYt+y$^ss5Cr^I3uK4tQA#&H=^hqE5u^vdM|VRP zyMWekP)M;U;x!JNGHwdK7h|e(G?pIWmoC?pFVDP2#9D*D>W=pYUMd$_6qux4(Kc*J zRJx-Sw`;~Y3@-fK6PZbRdlstt7}PW1dXSTWDNT&(Q>q40R;tK}`dCF+rBQ9tU-h|B z4tc1An%AnvpDtNLI~Xq6-b1a*E!w5*7tgKz&REFEN>(Hvy!dM*2UfUFYeRu%ejILg zhE*S&c1rs^rWx44PIp~;S$5;>u)IAR5e(OHwN1^}{nYzo9^zRE&|EOE>zb@U+OWN)9IXpATh#jEV6su2(NADFEMdF(Xwii|GH83K&Y zxTA4crlu8tNTX-DF({+8Cc&b@p06X+7qD9w!D;Ci% zuxY4YE2mA}ptue$`aazlqawLL3X)%s`2&Z(6 zw1!YpDw;P3<8wT-BpFRx0k zoc(^3m^jg)Y@GOy^7q;$EFe71xSkj&07gSI6$qkQ+j3Q#CLNMQIM=M<&UvtNvJ z0Zm8e7-hi?mw%}xrTC`t`EeQ^>Uj;VI>!DL-Q>(63dKqEfqsw0-HH*? zq^lG)$s8#|i2NitjO=k{Bym9;vUeEAEFajGCRlZFB(6J1U%Dop+Z`vYvtctFDDo5$@$s!uNK{ov@nEjn=Pl@Nk!9fS4; zfG^YP@Z^r({8xsnR72l!WZ3Yt0G{u>hAR{0w1a)TO@vW?%aFfK`S2>{+CxC2VF1sV zKujR4NxMQpK;KI3`paU3rtoEY)=C`D`p%#6TwB(HNJwrj#SRR|aSR)pyt0$75T{tp zPa|r{HFt6+9!mgoS)m`wBbB!Pmy-F}$lx6+5OC*~?=G4M#gdI78Z`_YPTBJ|adBJ| z6Gy2EY>b0eI;u*2;#7PJRaZiOLy7W#Q%;iq^b-jQ`M)d;`(O9}YMyWZKlA+md8fM4 VTeSa8LH;kC|Es(IX(|%Z{{e_#vf%&# literal 0 HcmV?d00001 diff --git a/tests/test_io_real.py b/tests/test_io_real.py new file mode 100644 index 0000000..2f2f613 --- /dev/null +++ b/tests/test_io_real.py @@ -0,0 +1,34 @@ +from pathlib import Path +from sp2xr.io import read_csv_files_with_dask + +DATA = Path(__file__).parent / "data" + + +def test_read_real_pbp_zip(tmp_path): + mini_zip = DATA / "mini_SP2XR_PbP_20190409110737_x0001.zip" + config_file = DATA / "config.yaml" + + df = read_csv_files_with_dask( + file_path=str(mini_zip), + config_path=str(config_file), + target_directory=str(tmp_path / "pq_out"), + ) + + # 50 lines in file = 50 rows returned + assert len(df) == 50 + # parquet really written + assert list((tmp_path / "pq_out").rglob("*.parquet")), "No parquet output" + + +def test_read_real_hk_zip(tmp_path): + mini_zip = DATA / "mini_SP2XR_hk_20190409110737_x0001.zip" + config_file = DATA / "config.yaml" + + df = read_csv_files_with_dask( + file_path=str(mini_zip), + config_path=str(config_file), + target_directory=str(tmp_path / "pq_out"), + ) + + assert len(df) == 50 # or assert exact number if known + assert list((tmp_path / "pq_out").rglob("*.parquet")), "No parquet output" diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py new file mode 100644 index 0000000..4ef8818 --- /dev/null +++ b/tests/test_roundtrip.py @@ -0,0 +1,23 @@ +import pandas as pd +from sp2xr import csv_to_parquet + + +def test_csv_to_parquet_roundtrip(tmp_path): + # --- create synthetic mini‑dataset --- + original = pd.DataFrame( + { + "particle_id": [1, 2, 3], + "incand": [123.4, 234.5, 345.6], + "scat": [10.1, 11.2, 12.3], + } + ) + csv_file = tmp_path / "sample.csv" + pq_file = tmp_path / "sample.parquet" + original.to_csv(csv_file, index=False) + + # --- run the code under test --- + csv_to_parquet(csv_file, pq_file) + + # --- validate --- + roundtrip = pd.read_parquet(pq_file) + pd.testing.assert_frame_equal(roundtrip, original)