Compare commits
298 Commits
v1.0.0...edabcb57f8
.gitignore (vendored): 1 line changed
@@ -7,4 +7,3 @@ logs/
envs/
hidden.py
output_files/
.env
CHANGELOG.md: 22 lines changed
@@ -1,22 +0,0 @@
# Changelog

All notable changes to this project will be documented in this file, which is a **cumulative record**.

Each version entry follows a consistent structure with the following optional sections:

- **Added** – New features
- **Changed** – Modifications to existing functionality
- **Deprecated** – Features marked for future removal
- **Removed** – Features removed in this version
- **Fixed** – Bug fixes
- **Security** – Vulnerability fixes

Format based on [Keep a Changelog](https://keepachangelog.com) and [Semantic Versioning](https://semver.org).

## [1.0.0] - 2025-06-26
### Added
- Multi-format, multi-instrument file reading system for FAIR data processing
- Data integration pipeline with YAML-based configuration for cross-project adaptability
- Metadata revision and normalization pipeline
- HDF5 manager object for data extraction, handling, and visualization
README.md: 13 lines changed
@@ -32,10 +32,7 @@ For **Windows** users, the following are required:

2. **Conda**: Install [Anaconda](https://www.anaconda.com/products/individual) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html).

3. **PSI Network Access**

Ensure you have access to the PSI internal network and the necessary permissions to access the source directories. See [notebooks/demo_data_integration.ipynb](notebooks/demo_data_integration.ipynb) for details on how to set up data integration from network drives.

3. **PSI Network Access**: Ensure access to PSI’s network and access rights to source drives for retrieving campaign data from YAML files in the `input_files/` folder.

:bulb: **Tip**: Editing your system’s PATH variable ensures both Conda and Git are available in the terminal environment used by Git Bash.

@@ -46,11 +43,11 @@ For **Windows** users, the following are required:

Open a **Git Bash** terminal.

Navigate to your `Gitea` folder, clone the repository, and navigate to the `dima` folder as follows:
Navigate to your `GitLab` folder, clone the repository, and navigate to the `dima` folder as follows:

```bash
cd path/to/Gitea
git clone --recurse-submodules https://gitea.psi.ch/5505-public/dima.git
cd path/to/GitLab
git clone --recurse-submodules https://gitlab.psi.ch/5505/dima.git
cd dima
```

@@ -209,7 +206,7 @@ This section is in progress!
| actris_level | - | Indicates the processing level of the data within the ACTRIS (Aerosol, Clouds and Trace Gases Research Infrastructure) framework. |
| dataset_startdate | - | Denotes the start datetime of the dataset collection. |
| dataset_enddate | - | Denotes the end datetime of the dataset collection. |
| processing_script | - | Denotes the name of the file used to process an initial version (e.g., original version) of the dataset into a processed dataset. |
| processing_file | - | Denotes the name of the file used to process an initial version (e.g., original version) of the dataset into a processed dataset. |
| processing_date | - | The date when the data processing was completed. |
## Adaptability to Experimental Campaign Needs
@@ -1,8 +1,8 @@
# Path to the directory where raw data is stored
input_file_directory: '${NETWORK_MOUNT}/Data'
input_file_directory: '//fs101/5505/Data'

# Path to directory where raw data is copied and converted to HDF5 format for local analysis.
output_file_directory: '../data/'
output_file_directory: '../output_files/'

# Project metadata for data lineage and provenance
project: 'Photoenhanced uptake of NO2 driven by Fe(III)-carboxylate'
@@ -1,8 +1,8 @@
# Path to the directory where raw data is stored
input_file_directory: '${NETWORK_MOUNT}/Chamber Data/L0 -raw data'
input_file_directory: '//fs03/Iron_Sulphate'

# Path to directory where raw data is copied and converted to HDF5 format for local analysis.
output_file_directory: '../data/'
output_file_directory: 'output_files/'

# Project metadata for data lineage and provenance
project: 'Fe SOA project'
@@ -1,8 +1,8 @@
# Path to the directory where raw data is stored
input_file_directory: '${NETWORK_MOUNT}/People/Juan/TypicalBeamTime'
input_file_directory: '//fs101/5505/People/Juan/TypicalBeamTime'

# Path to directory where raw data is copied and converted to HDF5 format for local analysis.
output_file_directory: '../data/'
output_file_directory: 'output_files/'

# Project metadata for data lineage and provenance
project: 'Beamtime May 2024, Ice Napp'
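The `${NETWORK_MOUNT}` placeholders above replace the previously hard-coded network shares and are resolved against environment variables (loaded from the project's `.env`) before the input directory is used; the pipeline code further down in this diff performs that substitution with a regular expression. A minimal sketch of the same idea follows; the helper name and the example mount point are illustrative, not DIMA's:

```python
import os
import re

def resolve_placeholders(path: str) -> str:
    """Replace ${VAR} placeholders in a configured path with environment values."""
    for var in re.findall(r'\$\{([^}^{]+)\}', path):  # same pattern used by the pipeline
        value = os.environ.get(var)
        if value is None:
            raise ValueError(f"Environment variable '{var}' is not set but used in the config.")
        path = path.replace(f"${{{var}}}", value)
    return path

# Illustrative use; the mount point below is an example, not a real share.
os.environ.setdefault('NETWORK_MOUNT', '//fileserver/share')
print(resolve_placeholders('${NETWORK_MOUNT}/Data'))  # -> //fileserver/share/Data
```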
@@ -1,42 +0,0 @@
table_header:
  w_CenterTime:
    description: time between start and stop of the measurement
    units: YYYY/MM/DD HH:MM:SS
    rename_as: center_time
  w_StartTime:
    description: Start time of the measurement
    units: YYYY/MM/DD HH:MM:SS
    rename_as: start_time
  w_StopTime:
    description: Stop time of the measurement
    units: YYYY/MM/DD HH:MM:SS
    rename_as: stop_time
  w_I2_molec_cm3:
    description: I2 concentration
    units: cm^-1
    rename_as: i2_concentration
  w_I2_SlCol:
    description: I2 concentration sl #?
    units: ppb #?
    rename_as: i2_sl
  w_I2_SlErr:
    description: Uncertainty in I2 concentration sl #?
    units: ppb #?
    rename_as: i2_sl_uncertainty
  w_I2_VMR:
    description: I2 concentration vmr #?
    units: ppb #?
    rename_as: i2_vmr
  w_I2_VMRErr:
    description: Uncertainty in I2 concentration vmr
    units: ppb #?
    rename_as: i2_vmr_uncertainty
  w_Rho:
    description: Rho #?
    units: ppb #?
    rename_as: rho
  w_RMS:
    description: RMS #?
    units: ppb #?
    rename_as: rms
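The removed dictionary above attaches a description, units, and a `rename_as` target to each raw CEDOAS column. As a rough illustration only (the helper and the tiny DataFrame below are invented, not DIMA code), such a mapping could be applied to a loaded table like this:

```python
import pandas as pd
import yaml

def apply_renames(df: pd.DataFrame, dictionary_path: str) -> pd.DataFrame:
    """Rename raw instrument columns to the 'rename_as' names given in a dictionary YAML."""
    with open(dictionary_path, 'r') as stream:
        table_header = yaml.safe_load(stream)['table_header']
    rename_map = {raw: meta['rename_as'] for raw, meta in table_header.items() if 'rename_as' in meta}
    return df.rename(columns=rename_map)

# Example with a tiny made-up frame:
df = pd.DataFrame({'w_I2_VMR': [0.1, 0.2], 'w_RMS': [1e-4, 2e-4]})
# apply_renames(df, 'instruments/dictionaries/CEDOAS.yaml') would yield columns i2_vmr, rms
```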
@@ -16,9 +16,8 @@ from instruments.readers.g5505_text_reader import read_txt_files_as_dict
from instruments.readers.acsm_tofware_reader import read_acsm_files_as_dict
from instruments.readers.acsm_flag_reader import read_jsonflag_as_dict
from instruments.readers.nasa_ames_reader import read_nasa_ames_as_dict
from instruments.readers.structured_file_reader import read_structured_file_as_dict

file_extensions = ['.ibw','.txt','.dat','.h5','.TXT','.csv','.pkl','.json','.yaml','yml','.nas']
file_extensions = ['.ibw','.txt','.dat','.h5','.TXT','.csv','.pkl','.json','.yaml','.nas']

# Define the instruments directory (modify this as needed or set to None)
default_instruments_dir = None # or provide an absolute path
@@ -28,16 +27,11 @@ file_readers = {
    'txt': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    'dat': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    'csv': lambda a1: read_txt_files_as_dict(a1, instruments_dir=default_instruments_dir, work_with_copy=False),
    'yaml': lambda a1: read_structured_file_as_dict(a1),
    'yml': lambda a1: read_structured_file_as_dict(a1),
    'json': lambda a1: read_structured_file_as_dict(a1),
    'ACSM_TOFWARE_txt' : lambda x: read_acsm_files_as_dict(x, instruments_dir=default_instruments_dir, work_with_copy=False),
    'ACSM_TOFWARE_csv' : lambda x: read_acsm_files_as_dict(x, instruments_dir=default_instruments_dir, work_with_copy=False),
    'ACSM_TOFWARE_flags_json' : lambda x: read_jsonflag_as_dict(x),
    'ACSM_TOFWARE_nas' : lambda x: read_nasa_ames_as_dict(x)}

file_readers.update({'CEDOAS_txt' : lambda x: read_txt_files_as_dict(x, instruments_dir=default_instruments_dir, work_with_copy=False)})

REGISTRY_FILE = "registry.yaml" #os.path.join(os.path.dirname(__file__), "registry.yaml")

def load_registry():
@@ -58,7 +52,7 @@ def find_reader(instrument_folder, file_extension):
    registry = load_registry()

    for entry in registry:
        if entry["instrumentFolderName"] == instrument_folder and (file_extension in entry["fileExtension"].split(sep=',')):
        if entry["instrumentFolderName"] == instrument_folder and entry["fileExtension"] == file_extension:
            return entry["fileReaderPath"], entry["InstrumentDictionaryPath"]

    return None, None # Not found
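The `find_reader` hunk above shows both the old exact-match condition and the new one, which treats `fileExtension` as a comma-separated list, so one registry entry (for example `fileExtension: yaml,yml,json` in `registry.yaml` further down) can serve several extensions. A self-contained sketch of that matching logic, with the registry hard-coded for illustration:

```python
# Minimal illustration of comma-separated extension matching (registry loading omitted).
registry = [
    {"instrumentFolderName": "ACSM_TOFWARE",
     "fileExtension": "yaml,yml,json",
     "fileReaderPath": "instruments/readers/read_structured_file_as_dict.py",
     "InstrumentDictionaryPath": "instruments/dictionaries/EBAS.yaml"},
]

def find_reader(instrument_folder, file_extension):
    for entry in registry:
        if entry["instrumentFolderName"] == instrument_folder and \
           file_extension in entry["fileExtension"].split(sep=','):
            return entry["fileReaderPath"], entry["InstrumentDictionaryPath"]
    return None, None  # Not found

print(find_reader("ACSM_TOFWARE", "json"))  # matched via the comma-separated list
print(find_reader("ACSM_TOFWARE", "nas"))   # (None, None) for this single entry
```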
@@ -81,18 +81,32 @@ gas:
datetime_format: '%Y.%m.%d %H:%M:%S'
link_to_description: 'dictionaries/gas.yaml'

CEDOAS: #CE-DOAS/I2:
formats:
- table_header: 'w_CenterTime w_StartTime w_StopTime w_I2_molec_cm3 w_I2_SlCol w_I2_SlErr w_I2_VMR w_I2_VMRErr w_Rho w_RMS'
separator: '\t'
file_encoding: 'utf-8'
timestamp: ['w_CenterTime']
datetime_format: '%Y/%m/%d %H:%M:%S'
ACSM_TOFWARE:
table_header:
#txt:
- 't_base VaporizerTemp_C HeaterBias_V FlowRefWave FlowRate_mb FlowRate_ccs FilamentEmission_mA Detector_V AnalogInput06_V ABRefWave ABsamp ABCorrFact'
- 't_start_Buf,Chl_11000,NH4_11000,SO4_11000,NO3_11000,Org_11000,SO4_48_11000,SO4_62_11000,SO4_82_11000,SO4_81_11000,SO4_98_11000,NO3_30_11000,Org_60_11000,Org_43_11000,Org_44_11000'
#csv:
- "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
- "X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59 X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74 X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85 X86 X87 X88 X89 X90 X91 X92 X93 X94 X95 X96 X97 X98 X99 X100 X101 X102 X103 X104 X105 X106 X107 X108 X109 X110 X111 X112 X113 X114 X115 X116 X117 X118 X119 X120 X121 X122 X123 X124 X125 X126 X127 X128 X129 X130 X131 X132 X133 X134 X135 X136 X137 X138 X139 X140 X141 X142 X143 X144 X145 X146 X147 X148 X149 X150 X151 X152 X153 X154 X155 X156 X157 X158 X159 X160 X161 X162 X163 X164 X165 X166 X167 X168 X169 X170 X171 X172 X173 X174 X175 X176 X177 X178 X179 X180 X181 X182 X183 X184 X185 X186 X187 X188 X189 X190 X191 X192 X193 X194 X195 X196 X197 X198 X199 X200 X201 X202 X203 X204 X205 X206 X207 X208 X209 X210 X211 X212 X213 X214 X215 X216 X217 X218 X219"
- 'MSS_base'
- 'tseries'
separator:
#txt:
- "\t"
- ","
#csv:
- "\t"
- "\t"
- "None"
- "None"
file_encoding:
#txt:
- "utf-8"
- "utf-8"
#csv:
- "utf-8"
- "utf-8"
- "utf-8"
- "utf-8"

- table_header: 'TimeStamp,Seconds_Midnight,Year,Month,Day,Hour,Minute,Second,HK0,HK1,HK2,HK3,HK4,HK5,HK6,HK7,HK8,HK9,HK10,HK11,HK12,HK13,HK14,HK15,RTD0_OO1,RTD1_LED,RTD2,RTD3_CBox,RTD4_Gas1,RTD5,RTD6,RTD7,Temp0,Temp1,Temp2,Temp3,DutyCycle0,DutyCycle1,DutyCycle2,DutyCycle3,Relay4,Relay5,Shutter0,Shutter1,Diode0Threshold,Diode0Hysteresis,Diode1Threshold,Diode1Hysteresis,SWTargetPosition,SWCurrentPosition,ELTargetPosition'
separator: ','
file_encoding: 'utf-8'
#timestamp: []
#datetime_format:

link_to_description: 'dictionaries/CEDOAS.yaml'
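The new `formats:` list under `CEDOAS`, together with the parallel `table_header`/`separator`/`file_encoding` lists under `ACSM_TOFWARE`, is what the text reader later turns into a list of format variants to try. A small sketch of inspecting those variants after loading the config, assuming it is run from the repository root; the fallback for old-style flat entries mirrors `load_file_reader_parameters` further down:

```python
import yaml

# Sketch: load the reader config and show how the CEDOAS entry resolves to format variants.
with open('instruments/readers/config_text_reader.yaml', 'r') as stream:
    config = yaml.safe_load(stream)

cedoas = config['CEDOAS']
variants = cedoas.get('formats', [cedoas])  # new-style 'formats' list vs. old flat entry
for fmt in variants:
    print(fmt.get('separator'), fmt.get('file_encoding'), str(fmt.get('table_header'))[:40])
```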
@ -19,7 +19,16 @@ import yaml
|
||||
import h5py
|
||||
import argparse
|
||||
import logging
|
||||
import warnings
|
||||
# Import project modules
|
||||
#root_dir = os.path.abspath(os.curdir)
|
||||
#sys.path.append(root_dir)
|
||||
|
||||
|
||||
#try:
|
||||
# from dima.utils import g5505_utils as utils
|
||||
#except ModuleNotFoundError:
|
||||
# import utils.g5505_utils as utils
|
||||
# import src.hdf5_ops as hdf5_ops
|
||||
import utils.g5505_utils as utils
|
||||
|
||||
|
||||
@ -32,19 +41,56 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
module_dir = os.path.dirname(__file__)
|
||||
instruments_dir = os.path.join(module_dir, '..')
|
||||
|
||||
#(config_dict,
|
||||
#file_encoding,
|
||||
#separator,
|
||||
#table_header,
|
||||
#timestamp_variables,
|
||||
#datetime_format,
|
||||
#description_dict) = load_file_reader_parameters(filename, instruments_dir)
|
||||
# Normalize the path (resolves any '..' in the path)
|
||||
instrument_configs_path = os.path.abspath(os.path.join(instruments_dir,'readers','config_text_reader.yaml'))
|
||||
|
||||
format_variants, description_dict = load_file_reader_parameters(filename, instruments_dir)
|
||||
print(instrument_configs_path)
|
||||
|
||||
with open(instrument_configs_path,'r') as stream:
|
||||
try:
|
||||
config_dict = yaml.load(stream, Loader=yaml.FullLoader)
|
||||
except yaml.YAMLError as exc:
|
||||
print(exc)
|
||||
# Verify whether the file can be read by the available instrument configurations.
|
||||
#if not any(key in filename.replace(os.sep,'/') for key in config_dict.keys()):
|
||||
# return {}
|
||||
|
||||
|
||||
#TODO: this may be prone to error if the assumed folder structure is non-compliant
|
||||
file_encoding = config_dict['default']['file_encoding'] #'utf-8'
|
||||
separator = config_dict['default']['separator']
|
||||
table_header = config_dict['default']['table_header']
|
||||
timestamp_variables = []
|
||||
datetime_format = []
|
||||
tb_idx = 0
|
||||
column_names = ''
|
||||
description_dict = {}
|
||||
|
||||
for instFolder in config_dict.keys():
|
||||
|
||||
if instFolder in filename.split(os.sep):
|
||||
|
||||
file_encoding = config_dict[instFolder].get('file_encoding',file_encoding)
|
||||
separator = config_dict[instFolder].get('separator',separator)
|
||||
table_header = config_dict[instFolder].get('table_header',table_header)
|
||||
timestamp_variables = config_dict[instFolder].get('timestamp',[])
|
||||
datetime_format = config_dict[instFolder].get('datetime_format',[])
|
||||
|
||||
|
||||
link_to_description = config_dict[instFolder].get('link_to_description', '').replace('/', os.sep)
|
||||
|
||||
if link_to_description:
|
||||
path = os.path.join(instruments_dir, link_to_description)
|
||||
try:
|
||||
with open(path, 'r') as stream:
|
||||
description_dict = yaml.load(stream, Loader=yaml.FullLoader)
|
||||
except (FileNotFoundError, yaml.YAMLError) as exc:
|
||||
print(exc)
|
||||
#if 'None' in table_header:
|
||||
# return {}
|
||||
|
||||
# Read header as a dictionary and detect where data table starts
|
||||
header_dict = {'actris_level': 0, 'processing_date':utils.created_at(), 'processing_script' : os.path.relpath(thisFilePath,dimaPath)}
|
||||
header_dict = {}
|
||||
data_start = False
|
||||
# Work with copy of the file for safety
|
||||
if work_with_copy:
|
||||
@ -52,35 +98,77 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
else:
|
||||
tmp_filename = filename
|
||||
|
||||
# Run header detection
|
||||
header_line_number, column_names, fmt_dict, table_preamble = detect_table_header_line(tmp_filename, format_variants)
|
||||
#with open(tmp_filename,'rb',encoding=file_encoding,errors='ignore') as f:
|
||||
|
||||
# Unpack validated format info
|
||||
table_header = fmt_dict['table_header']
|
||||
separator = fmt_dict['separator']
|
||||
file_encoding = fmt_dict['file_encoding']
|
||||
timestamp_variables = fmt_dict.get('timestamp', [])
|
||||
datetime_format = fmt_dict.get('datetime_format', None)
|
||||
desired_datetime_fmt = fmt_dict.get('desired_datetime_format', None)
|
||||
if not isinstance(table_header, list):
|
||||
|
||||
# Ensure separator is valid
|
||||
if not isinstance(separator, str) or not separator.strip():
|
||||
raise ValueError(f"Invalid separator found in format: {repr(separator)}")
|
||||
table_header = [table_header]
|
||||
file_encoding = [file_encoding]
|
||||
separator = [separator]
|
||||
|
||||
# Load DataFrame
|
||||
try:
|
||||
table_preamble = []
|
||||
line_number = 0
|
||||
if 'infer' not in table_header:
|
||||
|
||||
with open(tmp_filename,'rb') as f:
|
||||
|
||||
for line_number, line in enumerate(f):
|
||||
decoded_line = line.decode(file_encoding[tb_idx])
|
||||
|
||||
|
||||
for tb_idx, tb in enumerate(table_header):
|
||||
print(tb)
|
||||
if tb in decoded_line:
|
||||
break
|
||||
|
||||
if tb in decoded_line:
|
||||
|
||||
list_of_substrings = decoded_line.split(separator[tb_idx].replace('\\t','\t'))
|
||||
|
||||
# Count occurrences of each substring
|
||||
substring_counts = collections.Counter(list_of_substrings)
|
||||
data_start = True
|
||||
# Generate column names with appended index only for repeated substrings
|
||||
column_names = [f"{i}_{name.strip()}" if substring_counts[name] > 1 else name.strip() for i, name in enumerate(list_of_substrings)]
|
||||
|
||||
#column_names = [str(i)+'_'+name.strip() for i, name in enumerate(list_of_substrings)]
|
||||
#column_names = []
|
||||
#for i, name in enumerate(list_of_substrings):
|
||||
# column_names.append(str(i)+'_'+name)
|
||||
|
||||
#print(line_number, len(column_names ),'\n')
|
||||
break
|
||||
else:
|
||||
print('Table header was not detected.')
|
||||
# Subdivide line into words, and join them by single space.
|
||||
# I assume this produces a cleaner line with no stray separator characters (\t, \r) or extra spaces.
|
||||
list_of_substrings = decoded_line.split()
|
||||
# TODO: ideally we would use a multiline string, but the YAML parser does not recognize \n as a special character
|
||||
#line = ' '.join(list_of_substrings+['\n'])
|
||||
#line = ' '.join(list_of_substrings)
|
||||
table_preamble.append(' '.join([item for item in list_of_substrings]))# += new_line
|
||||
|
||||
|
||||
# TODO: this does not work when the separator is None; fix needed for RGA
|
||||
|
||||
try:
|
||||
print(column_names)
|
||||
if not 'infer' in table_header:
|
||||
#print(table_header)
|
||||
#print(file_encoding[tb_idx])
|
||||
|
||||
df = pd.read_csv(tmp_filename,
|
||||
delimiter=separator,
|
||||
header=header_line_number,
|
||||
encoding=file_encoding,
|
||||
delimiter = separator[tb_idx].replace('\\t','\t'),
|
||||
header=line_number,
|
||||
#encoding='latin-1',
|
||||
encoding = file_encoding[tb_idx],
|
||||
names=column_names,
|
||||
skip_blank_lines=True)
|
||||
else:
|
||||
df = pd.read_csv(tmp_filename,
|
||||
delimiter=separator,
|
||||
header=header_line_number,
|
||||
encoding=file_encoding,
|
||||
delimiter = separator[tb_idx].replace('\\t','\t'),
|
||||
header=line_number,
|
||||
encoding = file_encoding[tb_idx],
|
||||
skip_blank_lines=True)
|
||||
|
||||
df_numerical_attrs = df.select_dtypes(include ='number')
|
||||
@ -89,10 +177,6 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
|
||||
# Consolidate into single timestamp column the separate columns 'date' 'time' specified in text_data_source.yaml
|
||||
if timestamp_variables:
|
||||
|
||||
if not all(col in df_categorical_attrs.columns for col in timestamp_variables):
|
||||
raise ValueError(f"Invalid timestamp columns: {[col for col in timestamp_variables if col not in df_categorical_attrs.columns]}.")
|
||||
|
||||
#df_categorical_attrs['timestamps'] = [' '.join(df_categorical_attrs.loc[i,timestamp_variables].to_numpy()) for i in df.index]
|
||||
#df_categorical_attrs['timestamps'] = [ df_categorical_attrs.loc[i,'0_Date']+' '+df_categorical_attrs.loc[i,'1_Time'] for i in df.index]
|
||||
|
||||
@ -108,7 +192,7 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
df_categorical_attrs = df_categorical_attrs.loc[valid_indices,:]
|
||||
df_numerical_attrs = df_numerical_attrs.loc[valid_indices,:]
|
||||
|
||||
df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(desired_datetime_fmt)
|
||||
df_categorical_attrs[timestamps_name] = df_categorical_attrs[timestamps_name].dt.strftime(config_dict['default']['desired_format'])
|
||||
startdate = df_categorical_attrs[timestamps_name].min()
|
||||
enddate = df_categorical_attrs[timestamps_name].max()
|
||||
|
||||
@ -121,6 +205,12 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
df_categorical_attrs = df_categorical_attrs.drop(columns = timestamp_variables)
|
||||
|
||||
|
||||
#df_categorical_attrs.reindex(drop=True)
|
||||
#df_numerical_attrs.reindex(drop=True)
|
||||
|
||||
|
||||
|
||||
categorical_variables = [item for item in df_categorical_attrs.columns]
|
||||
####
|
||||
#elif 'RGA' in filename:
|
||||
# df_categorical_attrs = df_categorical_attrs.rename(columns={'0_Time(s)' : 'timestamps'})
|
||||
@ -195,169 +285,13 @@ def read_txt_files_as_dict(filename: str, instruments_dir: str = None, work_with
|
||||
# if timestamps_name in categorical_variables:
|
||||
# dataset['attributes'] = {timestamps_name: utils.parse_attribute({'unit':'YYYY-MM-DD HH:MM:SS.ffffff'})}
|
||||
# file_dict['datasets'].append(dataset)
|
||||
#except Exception as e:
|
||||
except Exception as e:
|
||||
#raise RuntimeError(f"Failed to read file with detected format: {e}")
|
||||
print(e)
|
||||
return {}
|
||||
|
||||
return file_dict
|
||||
|
||||
## Supporting functions
|
||||
|
||||
def detect_table_header_line(filepath, format_variants, verbose=False):
|
||||
"""
|
||||
Tries multiple format variants to detect the table header line in the file.
|
||||
|
||||
Args:
|
||||
filepath (str): Path to file.
|
||||
format_variants (List[Dict]): Each must contain:
|
||||
- 'file_encoding' (str)
|
||||
- 'separator' (str)
|
||||
- 'table_header' (str or list of str)
|
||||
verbose (bool): If True, prints debug info.
|
||||
|
||||
Returns:
|
||||
Tuple:
|
||||
- header_line_idx (int)
|
||||
- column_names (List[str])
|
||||
- matched_format (Dict[str, Any]) # full format dict (validated)
|
||||
- preamble_lines (List[str])
|
||||
"""
|
||||
import collections
|
||||
import warnings
|
||||
|
||||
for idx, fmt in enumerate(format_variants):
|
||||
# Validate format dict
|
||||
if 'file_encoding' not in fmt or not isinstance(fmt['file_encoding'], str):
|
||||
raise ValueError(f"[Format {idx}] 'file_encoding' must be a string.")
|
||||
if 'separator' not in fmt or not isinstance(fmt['separator'], str):
|
||||
raise ValueError(f"[Format {idx}] 'separator' must be a string.")
|
||||
if 'table_header' not in fmt or not isinstance(fmt['table_header'], (str, list)):
|
||||
raise ValueError(f"[Format {idx}] 'table_header' must be a string or list of strings.")
|
||||
|
||||
encoding = fmt['file_encoding']
|
||||
separator = fmt['separator']
|
||||
header_patterns = fmt['table_header']
|
||||
if isinstance(header_patterns, str):
|
||||
header_patterns = [header_patterns]
|
||||
|
||||
preamble_lines = []
|
||||
try:
|
||||
with open(filepath, 'rb') as f:
|
||||
for line_number, line in enumerate(f):
|
||||
try:
|
||||
decoded_line = line.decode(encoding)
|
||||
except UnicodeDecodeError:
|
||||
break # Try next format
|
||||
|
||||
for pattern in header_patterns:
|
||||
if pattern in decoded_line:
|
||||
substrings = decoded_line.split(separator.replace('\\t', '\t'))
|
||||
counts = collections.Counter(substrings)
|
||||
column_names = [
|
||||
f"{i}_{name.strip()}" if counts[name] > 1 else name.strip()
|
||||
for i, name in enumerate(substrings)
|
||||
]
|
||||
if verbose:
|
||||
print(f"[Detected header] Line {line_number}: {column_names}")
|
||||
return line_number, column_names, fmt, preamble_lines
|
||||
|
||||
preamble_lines.append(' '.join(decoded_line.split()))
|
||||
except Exception as e:
|
||||
if verbose:
|
||||
print(f"[Format {idx}] Attempt failed: {e}")
|
||||
continue
|
||||
|
||||
warnings.warn("Table header was not detected using known patterns. Will attempt inference mode.")
|
||||
|
||||
# Return fallback format with 'infer' but retain encoding/separator from first variant
|
||||
fallback_fmt = {
|
||||
'file_encoding': 'utf-8',
|
||||
'separator': ',',
|
||||
'table_header': ['infer']
|
||||
}
|
||||
return -1, [], fallback_fmt, []
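A brief usage sketch of `detect_table_header_line`; the file path and the single format variant are illustrative, not repository fixtures:

```python
# Illustrative call; the path and variant are examples only.
format_variants = [{
    'file_encoding': 'utf-8',
    'separator': '\\t',  # literal backslash-t, as stored in the YAML config
    'table_header': 'w_CenterTime w_StartTime w_StopTime',
}]
header_idx, column_names, fmt, preamble = detect_table_header_line(
    'data/CEDOAS/example.txt', format_variants, verbose=True
)
if header_idx == -1:
    print('No known header found; falling back to', fmt['table_header'])  # ['infer']
```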
|
||||
|
||||
|
||||
def load_file_reader_parameters(filename: str, instruments_dir: str) -> tuple:
|
||||
"""
|
||||
Load file reader configuration parameters based on the file and instrument directory.
|
||||
|
||||
Returns:
|
||||
- format_variants: List of dicts with keys:
|
||||
'file_encoding', 'separator', 'table_header', 'timestamp', 'datetime_format', 'desired_datetime_format'
|
||||
- description_dict: Dict loaded from instrument's description YAML
|
||||
"""
|
||||
config_path = os.path.abspath(os.path.join(instruments_dir, 'readers', 'config_text_reader.yaml'))
|
||||
if not os.path.exists(config_path):
|
||||
config_path = os.path.join(dimaPath,'instruments','readers', 'config_text_reader.yaml')
|
||||
|
||||
|
||||
try:
|
||||
with open(config_path, 'r') as stream:
|
||||
config_dict = yaml.load(stream, Loader=yaml.FullLoader)
|
||||
except yaml.YAMLError as exc:
|
||||
print(f"[YAML Load Error] {exc}")
|
||||
return {}, [], {}
|
||||
|
||||
default_config = config_dict.get('default', {})
|
||||
default_format = {
|
||||
'file_encoding': default_config.get('file_encoding', 'utf-8'),
|
||||
'separator': default_config.get('separator', ',').replace('\\t','\t'),
|
||||
'table_header': default_config.get('table_header', 'infer'),
|
||||
'timestamp': [],
|
||||
'datetime_format': default_config.get('datetime_format', '%Y-%m-%d %H:%M:%S.%f'),
|
||||
'desired_datetime_format' : default_config.get('desired_format', '%Y-%m-%d %H:%M:%S.%f')
|
||||
}
|
||||
|
||||
format_variants = []
|
||||
description_dict = {}
|
||||
|
||||
# Match instrument key by folder name in file path
|
||||
filename = os.path.normpath(filename)
|
||||
|
||||
for instFolder in config_dict.keys():
|
||||
if instFolder in filename.split(os.sep):
|
||||
inst_config = config_dict[instFolder]
|
||||
|
||||
# New style: has 'formats' block
|
||||
if 'formats' in inst_config:
|
||||
for fmt in inst_config['formats']:
|
||||
format_variants.append({
|
||||
'file_encoding': fmt.get('file_encoding', default_format['file_encoding']),
|
||||
'separator': fmt.get('separator', default_format['separator']),
|
||||
'table_header': fmt.get('table_header', default_format['table_header']),
|
||||
'timestamp': fmt.get('timestamp', []),
|
||||
'datetime_format': fmt.get('datetime_format', default_format['desired_datetime_format']),
|
||||
'desired_datetime_format' :default_format['desired_datetime_format']
|
||||
})
|
||||
|
||||
else:
|
||||
# Old style: flat format
|
||||
format_variants.append({
|
||||
'file_encoding': inst_config.get('file_encoding', default_format['file_encoding']),
|
||||
'separator': inst_config.get('separator', default_format['separator']),
|
||||
'table_header': inst_config.get('table_header', default_format['table_header']),
|
||||
'timestamp': inst_config.get('timestamp', []),
|
||||
'datetime_format': inst_config.get('datetime_format', default_format['desired_datetime_format']),
|
||||
'desired_datetime_format' : default_format['desired_datetime_format']
|
||||
})
|
||||
|
||||
# Description loading
|
||||
link_to_description = inst_config.get('link_to_description', '').replace('/', os.sep)
|
||||
if link_to_description:
|
||||
desc_path = os.path.join(instruments_dir, link_to_description)
|
||||
try:
|
||||
with open(desc_path, 'r') as desc_stream:
|
||||
description_dict = yaml.load(desc_stream, Loader=yaml.FullLoader)
|
||||
except (FileNotFoundError, yaml.YAMLError) as exc:
|
||||
print(f"[Description Load Error] {exc}")
|
||||
|
||||
break # Stop after first match
|
||||
|
||||
# Always return config_dict + list of formats + description
|
||||
return format_variants, description_dict
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
@ -1,115 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
try:
|
||||
thisFilePath = os.path.abspath(__file__)
|
||||
except NameError:
|
||||
print("Error: __file__ is not available. Ensure the script is being run from a file.")
|
||||
print("[Notice] Path to DIMA package may not be resolved properly.")
|
||||
thisFilePath = os.getcwd() # Use current directory or specify a default
|
||||
|
||||
dimaPath = os.path.normpath(os.path.join(thisFilePath, "..",'..','..')) # Move up to project root
|
||||
|
||||
if dimaPath not in sys.path: # Avoid duplicate entries
|
||||
sys.path.insert(0,dimaPath)
|
||||
|
||||
import pandas as pd
|
||||
import json, yaml
|
||||
import h5py
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
import utils.g5505_utils as utils
|
||||
|
||||
def read_structured_file_as_dict(path_to_file):
|
||||
"""
|
||||
Reads a JSON or YAML file, flattens nested structures using pandas.json_normalize,
|
||||
converts to a NumPy structured array via utils.convert_attrdict_to_np_structured_array,
|
||||
and returns a standardized dictionary.
|
||||
"""
|
||||
|
||||
file_dict = {}
|
||||
_, path_head = os.path.split(path_to_file)
|
||||
|
||||
file_dict['name'] = path_head
|
||||
file_dict['attributes_dict'] = {'actris_level': 0, 'processing_date': utils.created_at(), 'processing_script' : os.path.relpath(thisFilePath,dimaPath)}
|
||||
file_dict['datasets'] = []
|
||||
|
||||
try:
|
||||
with open(path_to_file, 'r') as stream:
|
||||
if path_to_file.endswith(('.yaml', '.yml')):
|
||||
raw_data = yaml.safe_load(stream)
|
||||
elif path_to_file.endswith('.json'):
|
||||
raw_data = json.load(stream)
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {path_to_file}")
|
||||
except Exception as exc:
|
||||
logging.error("Failed to load input file %s: %s", path_to_file, exc)
|
||||
raise
|
||||
|
||||
try:
|
||||
df = pd.json_normalize(raw_data)
|
||||
except Exception as exc:
|
||||
logging.error("Failed to normalize data structure: %s", exc)
|
||||
raise
|
||||
|
||||
for item_idx, item in enumerate(df.to_dict(orient='records')):
|
||||
try:
|
||||
structured_array = utils.convert_attrdict_to_np_structured_array(item)
|
||||
except Exception as exc:
|
||||
logging.error("Failed to convert to structured array: %s", exc)
|
||||
raise
|
||||
|
||||
dataset = {
|
||||
'name': f'data_table_{item_idx}',
|
||||
'data': structured_array,
|
||||
'shape': structured_array.shape,
|
||||
'dtype': type(structured_array)
|
||||
}
|
||||
|
||||
file_dict['datasets'].append(dataset)
|
||||
|
||||
return file_dict
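A short usage sketch of `read_structured_file_as_dict`; the flag-file path is illustrative:

```python
# Illustrative: read a JSON flag file and inspect the standardized dict it returns.
file_dict = read_structured_file_as_dict('ACSM_TOFWARE/flags/example_flags.json')
print(file_dict['name'], file_dict['attributes_dict'])
for ds in file_dict['datasets']:
    print(ds['name'], ds['shape'])
```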
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
from src.hdf5_ops import save_file_dict_to_hdf5
|
||||
from utils.g5505_utils import created_at
|
||||
|
||||
parser = argparse.ArgumentParser(description="Data ingestion process to HDF5 files.")
|
||||
parser.add_argument('dst_file_path', type=str, help="Path to the target HDF5 file.")
|
||||
parser.add_argument('src_file_path', type=str, help="Relative path to source file to be saved to target HDF5 file.")
|
||||
parser.add_argument('dst_group_name', type=str, help="Group name '/instFolder/[category]/fileName' in the target HDF5 file.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
hdf5_file_path = args.dst_file_path
|
||||
src_file_path = args.src_file_path
|
||||
dst_group_name = args.dst_group_name
|
||||
default_mode = 'r+'
|
||||
|
||||
try:
|
||||
idr_dict = read_structured_file_as_dict(src_file_path)
|
||||
|
||||
if not os.path.exists(hdf5_file_path):
|
||||
default_mode = 'w'
|
||||
|
||||
print(f'Opening HDF5 file: {hdf5_file_path} in mode {default_mode}')
|
||||
|
||||
with h5py.File(hdf5_file_path, mode=default_mode, track_order=True) as hdf5_file_obj:
|
||||
try:
|
||||
if dst_group_name not in hdf5_file_obj:
|
||||
hdf5_file_obj.create_group(dst_group_name)
|
||||
hdf5_file_obj[dst_group_name].attrs['creation_date'] = created_at().encode('utf-8')
|
||||
print(f'Created new group: {dst_group_name}')
|
||||
else:
|
||||
print(f'Group {dst_group_name} already exists. Proceeding with data transfer...')
|
||||
except Exception as inst:
|
||||
logging.error('Failed to create group %s in HDF5: %s', dst_group_name, inst)
|
||||
|
||||
save_file_dict_to_hdf5(hdf5_file_obj, dst_group_name, idr_dict)
|
||||
print(f'Completed saving file dict with keys: {idr_dict.keys()}')
|
||||
|
||||
except Exception as e:
|
||||
logging.error('File reader failed to process %s: %s', src_file_path, e)
|
||||
print(f'File reader failed to process {src_file_path}. See logs for details.')
|
@ -1,27 +1,10 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
try:
|
||||
thisFilePath = os.path.abspath(__file__)
|
||||
except NameError:
|
||||
print("Error: __file__ is not available. Ensure the script is being run from a file.")
|
||||
print("[Notice] Path to DIMA package may not be resolved properly.")
|
||||
thisFilePath = os.getcwd() # Use current directory or specify a default
|
||||
|
||||
dimaPath = os.path.normpath(os.path.join(thisFilePath, "..",'..','..')) # Move up to project root
|
||||
|
||||
if dimaPath not in sys.path: # Avoid duplicate entries
|
||||
sys.path.insert(0,dimaPath)
|
||||
|
||||
|
||||
|
||||
import sys
|
||||
import h5py
|
||||
|
||||
from igor2.binarywave import load as loadibw
|
||||
import logging
|
||||
import argparse
|
||||
import utils.g5505_utils as utils
|
||||
|
||||
|
||||
def read_xps_ibw_file_as_dict(filename):
|
||||
"""
|
||||
@ -66,7 +49,7 @@ def read_xps_ibw_file_as_dict(filename):
|
||||
|
||||
# Group name and attributes
|
||||
file_dict['name'] = path_head
|
||||
file_dict['attributes_dict'] = {'actris_level': 0, 'processing_date':utils.created_at(), 'processing_script' : os.path.relpath(thisFilePath,dimaPath)}
|
||||
file_dict['attributes_dict'] = {}
|
||||
|
||||
# Convert notes of bytes class to string class and split string into a list of elements separated by '\r'.
|
||||
notes_list = file_obj['wave']['note'].decode("utf-8").split('\r')
|
||||
@ -102,11 +85,22 @@ def read_xps_ibw_file_as_dict(filename):
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
||||
try:
|
||||
thisFilePath = os.path.abspath(__file__)
|
||||
except NameError:
|
||||
print("Error: __file__ is not available. Ensure the script is being run from a file.")
|
||||
print("[Notice] Path to DIMA package may not be resolved properly.")
|
||||
thisFilePath = os.getcwd() # Use current directory or specify a default
|
||||
|
||||
dimaPath = os.path.normpath(os.path.join(thisFilePath, "..",'..','..')) # Move up to project root
|
||||
|
||||
if dimaPath not in sys.path: # Avoid duplicate entries
|
||||
sys.path.insert(0,dimaPath)
|
||||
|
||||
from src.hdf5_ops import save_file_dict_to_hdf5
|
||||
from utils.g5505_utils import created_at
|
||||
|
||||
|
||||
|
||||
# Set up argument parsing
|
||||
parser = argparse.ArgumentParser(description="Data ingestion process to HDF5 files.")
|
||||
parser.add_argument('dst_file_path', type=str, help="Path to the target HDF5 file.")
|
||||
|
@@ -78,13 +78,3 @@ instruments:
  fileExtension: nas
  fileReaderPath: instruments/readers/nasa_ames_reader.py
  InstrumentDictionaryPath: instruments/dictionaries/EBAS.yaml

- instrumentFolderName: ACSM_TOFWARE
  fileExtension: yaml,yml,json
  fileReaderPath: instruments/readers/read_structured_file_as_dict.py
  InstrumentDictionaryPath: instruments/dictionaries/EBAS.yaml

- instrumentFolderName: CEDOAS
  fileExtension: txt
  fileReaderPath: instruments/readers/g5505_text_reader.py
  InstrumentDictionaryPath: instruments/dictionaries/CEDOAS.yaml
@@ -40,32 +40,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 1: Configure Your Data Integration Task\n",
"\n",
"1. Based on one of the example `.yaml` files found in the `input_files/` folder, define the input and output directory paths inside the file.\n",
"\n",
"2. When working with network drives, create a `.env` file in the root of the `dima/` project with the following line:\n",
"\n",
" ```dotenv\n",
" NETWORK_MOUNT=//your-server/your-share\n",
" ```\n",
"3. Execute the cell.\n",
"\n",
"**Note:** Ensure `.env` is listed in `.gitignore` and `.dockerignore`.\n",
"## Step 1: Specify data integration task through YAML configuration file\n",
"\n",
"* Create your configuration file (i.e., a *.yaml file) adhering to the example yaml file in the input folder.\n",
"* Set up the input and output directory paths and execute the cell.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"number, initials = 2, 'TBR' # Set as either 2, 'TBR' or 3, 'NG'\n",
"campaign_descriptor_path = f'../input_files/campaignDescriptor{number}_{initials}.yaml'\n",
"#output_filename_path = 'output_files/unified_file_smog_chamber_2024-04-07_UTC-OFST_+0200_NG.h5'\n",
"yaml_config_file_path = '../input_files/data_integr_config_file_TBR.yaml'\n",
"\n",
"print(campaign_descriptor_path)\n"
"#path_to_input_directory = 'output_files/kinetic_flowtube_study_2022-01-31_LuciaI'\n",
"#path_to_hdf5_file = hdf5_lib.create_hdf5_file_from_filesystem_path(path_to_input_directory)\n"
]
},
{
@@ -74,9 +66,7 @@
"source": [
"## Step 2: Create an integrated HDF5 file of the experimental campaign.\n",
"\n",
"* Execute the cell. Here we run the function `integrate_data_sources`, passing the previously specified YAML config file as input.\n",
"\n",
" "
"* Execute the cell. Here we run the function `integrate_data_sources`, passing the previously specified YAML config file as input."
]
},
{
@@ -86,7 +76,7 @@
"outputs": [],
"source": [
"\n",
"hdf5_file_path = data_integration.run_pipeline(campaign_descriptor_path)"
"hdf5_file_path = data_integration.run_pipeline(yaml_config_file_path)"
]
},
{
@@ -156,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
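Outside the notebook, the same two steps reduce to pointing `run_pipeline` at a campaign descriptor. A minimal script sketch; the import path is an assumption, since the notebook only references `data_integration.run_pipeline`:

```python
# Assumed import location; adjust to wherever data_integration lives in the package.
from src import data_integration

# Campaign descriptor as in the notebook's Step 1 (file name is illustrative).
campaign_descriptor_path = '../input_files/campaignDescriptor2_TBR.yaml'

# Step 2: build the integrated HDF5 file(s); run_pipeline returns a list of output paths.
hdf5_paths = data_integration.run_pipeline(campaign_descriptor_path)
print(hdf5_paths)
```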
output_files/smog_chamber_study_2022-07-26_NatashaG.yaml: new file, 7579 lines (diff suppressed because it is too large)
@ -38,19 +38,12 @@ def _generate_datetime_dict(datetime_steps):
|
||||
""" Generate the datetime augment dictionary from datetime steps. """
|
||||
datetime_augment_dict = {}
|
||||
for datetime_step in datetime_steps:
|
||||
#tmp = datetime.strptime(datetime_step, '%Y-%m-%d %H-%M-%S')
|
||||
datetime_augment_dict[datetime_step] = [
|
||||
datetime_step.strftime('%Y-%m-%d'), datetime_step.strftime('%Y_%m_%d'),
|
||||
datetime_step.strftime('%Y.%m.%d'), datetime_step.strftime('%Y%m%d')
|
||||
datetime_step.strftime('%Y-%m-%d'), datetime_step.strftime('%Y_%m_%d'), datetime_step.strftime('%Y.%m.%d'), datetime_step.strftime('%Y%m%d')
|
||||
]
|
||||
return datetime_augment_dict
|
||||
|
||||
def _generate_output_path_fragment(filename_prefix, integration_mode, dataset_startdate, dataset_enddate, index=None):
|
||||
"""Generate consistent directory or file name fragment based on mode."""
|
||||
if integration_mode == 'collection':
|
||||
return f'collection_{index}_{filename_prefix}_{dataset_enddate}'
|
||||
else:
|
||||
return f'{filename_prefix}_{dataset_enddate}'
|
||||
|
||||
def load_config_and_setup_logging(yaml_config_file_path, log_dir):
|
||||
"""Load YAML configuration file, set up logging, and validate required keys and datetime_steps."""
|
||||
|
||||
@ -81,22 +74,6 @@ def load_config_and_setup_logging(yaml_config_file_path, log_dir):
|
||||
if missing_keys:
|
||||
raise KeyError(f"Missing required keys in YAML configuration: {missing_keys}")
|
||||
|
||||
# Look for all placeholders like ${VAR_NAME}
|
||||
input_dir = config_dict['input_file_directory']
|
||||
placeholders = re.findall(r'\$\{([^}^{]+)\}', input_dir)
|
||||
|
||||
success = utils.load_env_from_root()
|
||||
print(f'Success : {success}')
|
||||
|
||||
for var in placeholders:
|
||||
env_value = os.environ.get(var)
|
||||
if env_value is None:
|
||||
raise ValueError(f"Environment variable '{var}' is not set but used in the config.")
|
||||
input_dir = input_dir.replace(f"${{{var}}}", env_value)
|
||||
|
||||
config_dict['input_file_directory'] = input_dir
|
||||
|
||||
|
||||
# Check the instrument_datafolder required type and ensure the list is of at least length one.
|
||||
if isinstance(config_dict['instrument_datafolder'], list) and not len(config_dict['instrument_datafolder'])>=1:
|
||||
raise ValueError('Invalid value for key "instrument_datafolder". Expected a list of strings with at least one item.'
|
||||
@ -212,6 +189,17 @@ def copy_subtree_and_create_hdf5(src, dst, select_dir_keywords, select_file_keyw
|
||||
|
||||
|
||||
def run_pipeline(path_to_config_yamlFile, log_dir='logs/'):
|
||||
|
||||
"""Integrates data sources specified by the input configuration file into HDF5 files.
|
||||
|
||||
Parameters:
|
||||
yaml_config_file_path (str): Path to the YAML configuration file.
|
||||
log_dir (str): Directory to save the log file.
|
||||
|
||||
Returns:
|
||||
list: List of Paths to the created HDF5 file(s).
|
||||
"""
|
||||
|
||||
config_dict = load_config_and_setup_logging(path_to_config_yamlFile, log_dir)
|
||||
|
||||
path_to_input_dir = config_dict['input_file_directory']
|
||||
@ -225,27 +213,22 @@ def run_pipeline(path_to_config_yamlFile, log_dir='logs/'):
|
||||
dataset_startdate = config_dict['dataset_startdate']
|
||||
dataset_enddate = config_dict['dataset_enddate']
|
||||
|
||||
integration_mode = config_dict.get('integration_mode', 'single_experiment')
|
||||
filename_prefix = config_dict['filename_prefix']
|
||||
|
||||
# Determine mode and process accordingly
|
||||
output_filename_path = []
|
||||
campaign_name_template = lambda filename_prefix, suffix: '_'.join([filename_prefix, suffix])
|
||||
date_str = f'{dataset_startdate}_{dataset_enddate}'
|
||||
|
||||
# Determine top-level campaign folder path
|
||||
top_level_foldername = _generate_output_path_fragment(
|
||||
filename_prefix, integration_mode, dataset_startdate, dataset_enddate, index=1
|
||||
)
|
||||
|
||||
# Create path to new raw datafolder and standardize with forward slashes
|
||||
path_to_rawdata_folder = os.path.join(
|
||||
path_to_output_dir, top_level_foldername, ""
|
||||
).replace(os.sep, '/')
|
||||
path_to_output_dir, 'collection_' + campaign_name_template(config_dict['filename_prefix'], date_str), "").replace(os.sep, '/')
|
||||
|
||||
# Process individual datetime steps if available, regardless of mode
|
||||
if config_dict.get('datetime_steps_dict', {}):
|
||||
# Single experiment mode
|
||||
for datetime_step, file_keywords in config_dict['datetime_steps_dict'].items():
|
||||
single_date_str = datetime_step.strftime('%Y%m%d')
|
||||
subfolder_name = f"{filename_prefix}_{single_date_str}"
|
||||
subfolder_name = f"experimental_step_{single_date_str}"
|
||||
path_to_rawdata_subfolder = os.path.join(path_to_rawdata_folder, subfolder_name, "")
|
||||
date_str = datetime_step.strftime('%Y-%m-%d')
|
||||
single_campaign_name = campaign_name_template(config_dict['filename_prefix'], date_str)
|
||||
path_to_rawdata_subfolder = os.path.join(path_to_rawdata_folder, single_campaign_name, "")
|
||||
|
||||
path_to_integrated_stepwise_hdf5_file = copy_subtree_and_create_hdf5(
|
||||
path_to_input_dir, path_to_rawdata_subfolder, select_dir_keywords,
|
||||
@ -253,12 +236,11 @@ def run_pipeline(path_to_config_yamlFile, log_dir='logs/'):
|
||||
|
||||
output_filename_path.append(path_to_integrated_stepwise_hdf5_file)
|
||||
|
||||
# Collection mode post-processing
|
||||
if integration_mode == 'collection':
|
||||
# Collection mode processing if specified
|
||||
if 'collection' in config_dict.get('integration_mode', 'single_experiment'):
|
||||
path_to_filenames_dict = {path_to_rawdata_folder: [os.path.basename(path) for path in output_filename_path]} if output_filename_path else {}
|
||||
hdf5_path = hdf5_lib.create_hdf5_file_from_filesystem_path(
|
||||
path_to_rawdata_folder, path_to_filenames_dict, [], root_metadata_dict
|
||||
)
|
||||
#hdf5_path = hdf5_lib.create_hdf5_file_from_filesystem_path_new(path_to_rawdata_folder, path_to_filenames_dict, [], root_metadata_dict)
|
||||
hdf5_path = hdf5_lib.create_hdf5_file_from_filesystem_path(path_to_rawdata_folder, path_to_filenames_dict, [], root_metadata_dict)
|
||||
output_filename_path.append(hdf5_path)
|
||||
else:
|
||||
path_to_integrated_stepwise_hdf5_file = copy_subtree_and_create_hdf5(
|
||||
@ -268,16 +250,24 @@ def run_pipeline(path_to_config_yamlFile, log_dir='logs/'):
|
||||
|
||||
return output_filename_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python data_integration.py <function_name> <function_args>")
|
||||
sys.exit(1)
|
||||
|
||||
# Extract the function name from the command line arguments
|
||||
function_name = sys.argv[1]
|
||||
|
||||
# Handle function execution based on the provided function name
|
||||
if function_name == 'run':
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
print("Usage: python data_integration.py run <path_to_config_yamlFile>")
|
||||
sys.exit(1)
|
||||
# Extract path to configuration file, specifying the data integration task
|
||||
path_to_config_yamlFile = sys.argv[2]
|
||||
run_pipeline(path_to_config_yamlFile)
|
||||
|
||||
|
||||
|
@ -19,10 +19,17 @@ import pandas as pd
|
||||
import numpy as np
|
||||
import logging
|
||||
import datetime
|
||||
|
||||
import h5py
|
||||
|
||||
import yaml
|
||||
import json
|
||||
import copy
|
||||
|
||||
#try:
|
||||
# from dima.utils import g5505_utils as utils
|
||||
# from dima.src import hdf5_writer as hdf5_lib
|
||||
#except ModuleNotFoundError:
|
||||
import utils.g5505_utils as utils
|
||||
import src.hdf5_writer as hdf5_lib
|
||||
|
||||
@ -737,30 +744,10 @@ def save_file_dict_to_hdf5(h5file, group_name, file_dict):
|
||||
try:
|
||||
# Create group and add their attributes
|
||||
filename = file_dict['name']
|
||||
|
||||
# Base filename to use as group name
|
||||
base_filename = file_dict['name']
|
||||
candidate_name = base_filename
|
||||
replicate_index = 0
|
||||
|
||||
# Check for existing group and find a free name
|
||||
parent_group = h5file.require_group(group_name)
|
||||
while candidate_name in parent_group:
|
||||
replicate_index += 1
|
||||
candidate_name = f"{base_filename}_{replicate_index}"
|
||||
|
||||
group = h5file[group_name].create_group(name=candidate_name )
|
||||
group = h5file[group_name].create_group(name=filename)
|
||||
# Add group attributes
|
||||
group.attrs.update(file_dict['attributes_dict'])
|
||||
|
||||
# Annotate replicate if renamed
|
||||
if replicate_index > 0:
|
||||
group.attrs['replicate_of'] = base_filename
|
||||
group.attrs['replicate_info'] = (
|
||||
f"Renamed due to existing group with same name. "
|
||||
f"This is replicate #{replicate_index}."
|
||||
)
|
||||
|
||||
# Add datasets to the just created group
|
||||
for dataset in file_dict['datasets']:
|
||||
dataset_obj = group.create_dataset(
|
||||
|
@ -100,20 +100,6 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
|
||||
print(message)
|
||||
logging.error(message)
|
||||
else:
|
||||
# Step 1: Preprocess all metadata.json files into a lookup dict
|
||||
all_metadata_dict = {}
|
||||
|
||||
for dirpath, filenames in path_to_filenames_dict.items():
|
||||
metadata_file = next((f for f in filenames if f.endswith('metadata.json')), None)
|
||||
if metadata_file:
|
||||
metadata_path = os.path.join(dirpath, metadata_file)
|
||||
try:
|
||||
with open(metadata_path, 'r') as metafile:
|
||||
all_metadata_dict[dirpath] = json.load(metafile)
|
||||
except json.JSONDecodeError:
|
||||
logging.warning(f"Invalid JSON in metadata file: {metadata_path}")
|
||||
all_metadata_dict[dirpath] = {}
|
||||
|
||||
with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:
|
||||
|
||||
number_of_dirs = len(path_to_filenames_dict.keys())
|
||||
@ -152,15 +138,22 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
|
||||
stdout = inst
|
||||
logging.error('Failed to create group %s into HDF5: %s', group_name, inst)
|
||||
|
||||
# Step 3: During ingestion, attach metadata per file
|
||||
metadata_dict = all_metadata_dict.get(dirpath, {})
|
||||
if 'data_lineage_metadata.json' in filtered_filenames_list:
|
||||
idx = filtered_filenames_list.index('data_lineage_metadata.json')
|
||||
data_lineage_file = filtered_filenames_list[idx]
|
||||
try:
|
||||
with open('/'.join([dirpath,data_lineage_file]),'r') as dlf:
|
||||
data_lineage_dict = json.load(dlf)
|
||||
filtered_filenames_list.pop(idx)
|
||||
except json.JSONDecodeError:
|
||||
data_lineage_dict = {} # Start fresh if file is invalid
|
||||
|
||||
else:
|
||||
data_lineage_dict = {}
|
||||
|
||||
|
||||
for filenumber, filename in enumerate(filtered_filenames_list):
|
||||
|
||||
# Skip any file that itself ends in metadata.json
|
||||
if filename.endswith('metadata.json'):
|
||||
continue
|
||||
|
||||
# hdf5 path to filename group
|
||||
dest_group_name = f'{group_name}/{filename}'
|
||||
source_file_path = os.path.join(dirpath,filename)
|
||||
@ -170,10 +163,6 @@ def create_hdf5_file_from_filesystem_path(path_to_input_directory: str,
|
||||
#file_dict = ext_to_reader_dict[file_ext](os.path.join(dirpath,filename))
|
||||
file_dict = filereader_registry.select_file_reader(dest_group_name)(source_file_path)
|
||||
|
||||
# Attach per-file metadata if available
|
||||
if filename in metadata_dict:
|
||||
file_dict.get("attributes_dict",{}).update(metadata_dict[filename])
|
||||
file_dict.get("attributes_dict",{}).update({'original_path' : dirpath})
|
||||
stdout = hdf5_ops.save_file_dict_to_hdf5(dest_file_obj, group_name, file_dict)
|
||||
|
||||
else:
|
||||
@@ -281,21 +270,6 @@ def create_hdf5_file_from_filesystem_path_new(path_to_input_directory: str,
        print(message)
        logging.error(message)
    else:

        # Step 1: Preprocess all metadata.json files into a lookup dict
        all_metadata_dict = {}

        for dirpath, filenames in path_to_filenames_dict.items():
            metadata_file = next((f for f in filenames if f.endswith('metadata.json')), None)
            if metadata_file:
                metadata_path = os.path.join(dirpath, metadata_file)
                try:
                    with open(metadata_path, 'r') as metafile:
                        all_metadata_dict[dirpath] = json.load(metafile)
                except json.JSONDecodeError:
                    logging.warning(f"Invalid JSON in metadata file: {metadata_path}")
                    all_metadata_dict[dirpath] = {}

        with h5py.File(path_to_output_file, mode=mode, track_order=True) as h5file:
            print('Created file')

@@ -335,15 +309,8 @@ def create_hdf5_file_from_filesystem_path_new(path_to_input_directory: str,
            # stdout = inst
            # logging.error('Failed to create group %s into HDF5: %s', group_name, inst)

            # Step 3: During ingestion, attach metadata per file
            # TODO: pass this metadata dict to run_file_reader line 363
            metadata_dict = all_metadata_dict.get(dirpath, {})

            for filenumber, filename in enumerate(filtered_filenames_list):

                if filename.endswith('metadata.json'):
                    continue

                #file_ext = os.path.splitext(filename)[1]
                #try:

@@ -1,7 +0,0 @@
exclude_paths:
  containing :
    - .ipynb_checkpoints
    - .renku
    - .git
    # - params
    - .Trash
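For reference, when a file with this structure is read via yaml.safe_load, as copy_directory_with_contraints does further down, it parses into a nested dict from which the keyword list is extracted. A small, simplified round-trip sketch (inline text instead of a file, shortened keyword list):

import yaml

text = "exclude_paths:\n  containing:\n    - .ipynb_checkpoints\n    - .git\n"
exclude_path_dict = yaml.safe_load(text)
# -> {'exclude_paths': {'containing': ['.ipynb_checkpoints', '.git']}}
exclude_path_keywords = exclude_path_dict.get('exclude_paths', {}).get('containing', [])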
@@ -1,18 +1,3 @@
import sys
import os

try:
    thisFilePath = os.path.abspath(__file__)
except NameError:
    print("Error: __file__ is not available. Ensure the script is being run from a file.")
    print("[Notice] Path to DIMA package may not be resolved properly.")
    thisFilePath = os.getcwd()  # Use current directory or specify a default

dimaPath = os.path.normpath(os.path.join(thisFilePath, "..",'..','..'))  # Move up to project root

if dimaPath not in sys.path:  # Avoid duplicate entries
    sys.path.insert(0,dimaPath)

import pandas as pd
import os
import sys
@@ -22,7 +7,7 @@ import logging
import numpy as np
import h5py
import re
import yaml


def setup_logging(log_dir, log_filename):
    """Sets up logging to a specified directory and file.
@@ -217,49 +202,43 @@ def convert_string_to_bytes(input_list: list):

def convert_attrdict_to_np_structured_array(attr_value: dict):
    """
    Converts a dictionary of attributes into a NumPy structured array with byte-encoded fields.
    Handles UTF-8 encoding to avoid UnicodeEncodeError with non-ASCII characters.
    Converts a dictionary of attributes into a numpy structured array for HDF5
    compound type compatibility.

    Each dictionary key is mapped to a field in the structured array, with the
    data type (S) determined by the longest string representation of the values.
    If the dictionary is empty, the function returns 'missing'.

    Parameters
    ----------
    attr_value : dict
        Dictionary with scalar values (int, float, str).
        Dictionary containing the attributes to be converted. Example:
        attr_value = {
            'name': 'Temperature',
            'unit': 'Celsius',
            'value': 23.5,
            'timestamp': '2023-09-26 10:00'
        }

    Returns
    -------
    new_attr_value : ndarray
        1-row structured array with fixed-size byte fields (dtype='S').
    new_attr_value : ndarray or str
        Numpy structured array with UTF-8 encoded fields. Returns 'missing' if
        the input dictionary is empty.
    """
    if not isinstance(attr_value, dict):
        raise ValueError(f"Input must be a dictionary, got {type(attr_value)}")

    if not attr_value:
        return np.array(['missing'], dtype=[('value', 'S16')])  # placeholder

    dtype = []
    values_list = []

    max_str_len = max(len(str(v)) for v in attr_value.values())
    byte_len = max_str_len * 4  # UTF-8 worst-case

    for key, val in attr_value.items():
        if key == 'rename_as':
            continue
        if isinstance(val, (int, float, str)):
            dtype.append((key, f'S{byte_len}'))
            try:
                encoded_val = str(val).encode('utf-8')  # explicit UTF-8
                values_list.append(encoded_val)
            except UnicodeEncodeError as e:
                logging.error(f"Failed to encode {key}={val}: {e}")
                raise
        else:
            logging.warning(f"Skipping unsupported type for key {key}: {type(val)}")

    max_length = max(len(str(attr_value[key])) for key in attr_value.keys())
    for key in attr_value.keys():
        if key != 'rename_as':
            dtype.append((key, f'S{max_length}'))
            values_list.append(attr_value[key])
    if values_list:
        return np.array([tuple(values_list)], dtype=dtype)
        new_attr_value = np.array([tuple(values_list)], dtype=dtype)
    else:
        return np.array(['missing'], dtype=[('value', 'S16')])
        new_attr_value = 'missing'

    return new_attr_value

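To make the docstring's example concrete: a one-row structured array of the kind this helper produces can be built directly with NumPy, which is also the form h5py stores as a compound-type attribute. This sketch only mirrors the representation; it does not call the project function, and the 64-byte field width is an illustrative choice:

import numpy as np

attr_value = {'name': 'Temperature', 'unit': 'Celsius', 'value': 23.5, 'timestamp': '2023-09-26 10:00'}
dtype = [(key, 'S64') for key in attr_value]                        # one fixed-size byte field per key
row = tuple(str(v).encode('utf-8') for v in attr_value.values())    # UTF-8 encode every value
new_attr_value = np.array([row], dtype=dtype)                       # 1-row structured array
# new_attr_value['name'] -> array([b'Temperature'], dtype='|S64')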
def infer_units(column_name):
@@ -313,19 +292,6 @@ def copy_directory_with_contraints(input_dir_path, output_dir_path,
    output_dir_path = os.path.normpath(output_dir_path)
    select_dir_keywords = [keyword.replace('/',os.sep) for keyword in select_dir_keywords]

    try:
        with open(os.path.join(dimaPath, 'dima/utils/exclude_path_keywords.yaml'), 'r') as stream:
            exclude_path_dict = yaml.safe_load(stream)
            if isinstance(exclude_path_dict, dict):
                exclude_path_keywords = exclude_path_dict.get('exclude_paths',{}).get('containing', [])
                if not all(isinstance(keyword, str) for keyword in exclude_path_keywords):
                    exclude_path_keywords = []
            else:
                exclude_path_keywords = []
    except (FileNotFoundError, yaml.YAMLError) as e:
        print(f"Warning. Unable to load YAML file: {e}")
        exclude_path_keywords = []

    date = created_at('%Y_%m').replace(":", "-")
    log_dir='logs/'
    setup_logging(log_dir, f"copy_directory_with_contraints_{date}.log")
@@ -336,7 +302,6 @@ def copy_directory_with_contraints(input_dir_path, output_dir_path,

    def file_is_selected(filename):
        return not select_file_keywords or any(keyword in filename for keyword in select_file_keywords)
    # Exclude path keywords


    # Collect paths of directories, which are directly connected to the root dir and match select_dir_keywords
@@ -355,10 +320,6 @@ def copy_directory_with_contraints(input_dir_path, output_dir_path,

    for dirpath, _, filenames in os.walk(subpath,topdown=False):

        # Exclude any dirpath containing a keyword in exclude_path_keywords
        if any(excluded in dirpath for excluded in exclude_path_keywords):
            continue

        # Ensure composite keywords e.g., <keyword>/<keyword> are contained in the path
        if select_dir_keywords and not any([keyword in dirpath for keyword in select_dir_keywords]):
            continue
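The two checks above are plain substring filters over the walked dirpath. A tiny worked example with made-up keyword lists and a made-up path (illustrative values only):

exclude_path_keywords = ['.git', '.ipynb_checkpoints']
select_dir_keywords = ['campaign_A']

dirpath = '/data/campaign_A/.git/objects'
skip_excluded = any(excluded in dirpath for excluded in exclude_path_keywords)                  # True: path is skipped
matches_selection = not select_dir_keywords or any(k in dirpath for k in select_dir_keywords)   # True: a selected keyword is present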
@@ -451,56 +412,3 @@ def is_structured_array(attr_val):
        return True if attr_val.dtype.names is not None else False
    else:
        return False

import os
from pathlib import Path

def find_env_file(start_path=None):
    """
    Find .env file by walking up the directory tree.
    Looks for .env in current dir, then parent dirs up to filesystem root.

    Args:
        start_path: Starting directory (defaults to current working directory)

    Returns:
        Path to .env file or None if not found
    """
    if start_path is None:
        start_path = os.getcwd()

    current_path = Path(start_path).resolve()

    # Walk up the directory tree
    for path in [current_path] + list(current_path.parents):
        env_file = path / '.env'
        if env_file.exists():
            return str(env_file)

    return None

import os

def load_env_from_root():
    """Load environment variables from .env file found in project root or parent."""
    env_file = find_env_file()

    if env_file:
        try:
            from dotenv import load_dotenv
            load_dotenv(env_file, override=True)  # override existing values
            print(f"Loaded .env from: {env_file}")
            return True
        except ImportError:
            with open(env_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith('#') and '=' in line:
                        key, value = line.split('=', 1)
                        os.environ[key.strip()] = value.strip()
            print(f"Manually loaded .env from: {env_file}")
            return True

    else:
        print("No .env file found in project hierarchy")
        return False
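A brief usage sketch for the two helpers above; the environment-variable name is a placeholder and the returned path depends on where the project is checked out:

# Hypothetical caller, e.g. near the top of a script
env_path = find_env_file()                       # e.g. '/home/user/dima/.env', or None if absent
if load_env_from_root():
    token = os.environ.get('EXAMPLE_API_TOKEN')  # placeholder variable name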