Files
dima/docs/build/html/_modules/utils/g5505_utils.html

565 lines
67 KiB
HTML

<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../../">
<head>
  <!-- Document metadata -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>utils.g5505_utils — DIMA 1.0.0 documentation</title>

  <!-- Stylesheets: pygments.css first, then the theme -->
  <link rel="stylesheet" href="../../_static/pygments.css?v=80d5e7a1">
  <link rel="stylesheet" href="../../_static/css/theme.css?v=19f00094">

  <!-- Conditional comment: the shim below is only read by Internet Explorer older than 9 -->
  <!--[if lt IE 9]>
  <script src="../../_static/js/html5shiv.min.js"></script>
  <![endif]-->

  <!-- Classic (non-deferred) scripts, kept in their original load order -->
  <script src="../../_static/jquery.js?v=5d32c60e"></script>
  <script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
  <script src="../../_static/documentation_options.js?v=8d563738"></script>
  <script src="../../_static/doctools.js?v=9a2dae69"></script>
  <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
  <!-- Third-party script: integrity hash plus crossorigin let the browser verify the CDN copy -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" crossorigin="anonymous"></script>
  <script src="../../_static/js/theme.js"></script>

  <!-- Related pages: general index and search -->
  <link rel="index" href="../../genindex.html" title="Index">
  <link rel="search" href="../../search.html" title="Search">
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<!-- Sidebar: project home link, search form and the top-level table of contents.
     The aria-label gives this landmark a name, like the other navigation landmarks on the page. -->
<nav class="wy-nav-side" data-toggle="wy-nav-shift" aria-label="Sidebar">
  <div class="wy-side-scroll">
    <div class="wy-side-nav-search">
      <a class="icon icon-home" href="../../index.html">
        DIMA
      </a>
      <!-- Search form: submits the query "q" to search.html with GET -->
      <div role="search">
        <form class="wy-form" id="rtd-search-form" action="../../search.html" method="get">
          <input type="text" name="q" placeholder="Search docs" aria-label="Search docs">
          <input type="hidden" name="check_keywords" value="yes">
          <input type="hidden" name="area" value="default">
        </form>
      </div>
    </div>
    <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
      <!-- role="heading" requires aria-level; 2 is the fallback assistive technology uses when it is missing,
           so stating it explicitly does not change how the caption is announced. -->
      <p class="caption" role="heading" aria-level="2"><span class="caption-text">Contents:</span></p>
      <ul>
        <li class="toctree-l1"><a class="reference internal" href="../../modules/src.html">HDF5 data operations</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../modules/src.html#module-src.hdf5_writer">Data integration with HDF5</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../modules/src.html#module-src.hdf5_vis">Data visualization</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../modules/pipelines.html">Pipelines and workflows</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../modules/utils.html">Utilities</a></li>
      </ul>
    </div>
  </div>
</nav>
<section class="wy-nav-content-wrap" data-toggle="wy-nav-shift">
  <!-- Top bar labelled as the mobile navigation menu: a bars icon and a link to the documentation home page -->
  <nav class="wy-nav-top" aria-label="Mobile navigation menu">
    <i class="fa fa-bars" data-toggle="wy-nav-top"></i>
    <a href="../../index.html">DIMA</a>
  </nav>
<div class="wy-nav-content">
<div class="rst-content">
<!-- Breadcrumb trail (home, module index, current module) followed by a horizontal rule -->
<div role="navigation" aria-label="Page navigation">
  <ul class="wy-breadcrumbs">
    <li><a class="icon icon-home" href="../../index.html" aria-label="Home"></a></li>
    <li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
    <!-- Current page: plain text, not a link -->
    <li class="breadcrumb-item active">utils.g5505_utils</li>
    <li class="wy-breadcrumbs-aside">
    </li>
  </ul>
  <hr>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for utils.g5505_utils</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">shutil</span>
<span class="kn">import</span> <span class="nn">datetime</span>
<span class="kn">import</span> <span class="nn">logging</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">h5py</span>
<span class="kn">import</span> <span class="nn">re</span>
<div class="viewcode-block" id="setup_logging"><!-- Highlighted source of setup_logging. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.setup_logging">[docs]</a>
<span class="k">def</span> <span class="nf">setup_logging</span><span class="p">(</span><span class="n">log_dir</span><span class="p">,</span> <span class="n">log_filename</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;Sets up logging to a specified directory and file.</span>
<span class="sd">    Parameters:</span>
<span class="sd">    log_dir (str): Directory to save the log file.</span>
<span class="sd">    log_filename (str): Name of the log file.</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="c1"># Ensure the log directory exists</span>
    <span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">log_dir</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
    <span class="c1"># Create a logger instance</span>
    <span class="n">logger</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">()</span>
    <span class="n">logger</span><span class="o">.</span><span class="n">setLevel</span><span class="p">(</span><span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">)</span>
    <span class="c1"># Create a file handler</span>
    <span class="n">log_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">log_dir</span><span class="p">,</span> <span class="n">log_filename</span><span class="p">)</span>
    <span class="n">file_handler</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">FileHandler</span><span class="p">(</span><span class="n">log_path</span><span class="p">)</span>
    <span class="c1"># Create a formatter and set it for the handler</span>
    <span class="n">formatter</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">Formatter</span><span class="p">(</span><span class="s1">&#39;</span><span class="si">%(asctime)s</span><span class="s1"> - </span><span class="si">%(name)s</span><span class="s1"> - </span><span class="si">%(levelname)s</span><span class="s1"> - </span><span class="si">%(message)s</span><span class="s1">&#39;</span><span class="p">)</span>
    <span class="n">file_handler</span><span class="o">.</span><span class="n">setFormatter</span><span class="p">(</span><span class="n">formatter</span><span class="p">)</span>
    <span class="c1"># Add the handler to the logger</span>
    <span class="n">logger</span><span class="o">.</span><span class="n">addHandler</span><span class="p">(</span><span class="n">file_handler</span><span class="p">)</span></div>
<div class="viewcode-block" id="is_callable_list"><!-- Highlighted source of is_callable_list. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.is_callable_list">[docs]</a>
<span class="k">def</span> <span class="nf">is_callable_list</span><span class="p">(</span><span class="n">x</span> <span class="p">:</span> <span class="nb">list</span><span class="p">):</span>
    <span class="k">return</span> <span class="nb">all</span><span class="p">([</span><span class="nb">callable</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">x</span><span class="p">])</span></div>
<div class="viewcode-block" id="is_str_list"><!-- Highlighted source of is_str_list. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.is_str_list">[docs]</a>
<span class="k">def</span> <span class="nf">is_str_list</span><span class="p">(</span><span class="n">x</span> <span class="p">:</span> <span class="nb">list</span><span class="p">):</span>
    <span class="k">return</span> <span class="nb">all</span><span class="p">([</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span><span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">x</span><span class="p">])</span></div>
<div class="viewcode-block" id="augment_with_filetype"><!-- Highlighted source of augment_with_filetype. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.augment_with_filetype">[docs]</a>
<span class="k">def</span> <span class="nf">augment_with_filetype</span><span class="p">(</span><span class="n">df</span><span class="p">):</span>
    <span class="n">df</span><span class="p">[</span><span class="s1">&#39;filetype&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">item</span><span class="p">)[</span><span class="mi">1</span><span class="p">][</span><span class="mi">1</span><span class="p">::]</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;filename&#39;</span><span class="p">]]</span>
    <span class="c1">#return [os.path.splitext(item)[1][1::] for item in df[&#39;filename&#39;]]</span>
    <span class="k">return</span> <span class="n">df</span></div>
<div class="viewcode-block" id="augment_with_filenumber"><!-- Highlighted source of augment_with_filenumber. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.augment_with_filenumber">[docs]</a>
<span class="k">def</span> <span class="nf">augment_with_filenumber</span><span class="p">(</span><span class="n">df</span><span class="p">):</span>
    <span class="n">df</span><span class="p">[</span><span class="s1">&#39;filenumber&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">item</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">item</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s1">&#39;_&#39;</span><span class="p">)]</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">df</span><span class="p">[</span><span class="s1">&#39;filename&#39;</span><span class="p">]]</span>
    <span class="c1">#return [item[0:item.find(&#39;_&#39;)] for item in df[&#39;filename&#39;]]</span>
    <span class="k">return</span> <span class="n">df</span></div>
<div class="viewcode-block" id="group_by_df_column"><!-- Highlighted source of group_by_df_column. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.group_by_df_column">[docs]</a>
<span class="k">def</span> <span class="nf">group_by_df_column</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">column_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    df (pandas.DataFrame): </span>
<span class="sd">    column_name (str): column_name of df by which grouping operation will take place. </span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="k">if</span> <span class="ow">not</span> <span class="n">column_name</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
        <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;column_name must be in the columns of df.&quot;</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">df</span><span class="p">[</span><span class="n">column_name</span><span class="p">]</span></div>
<div class="viewcode-block" id="split_sample_col_into_sample_and_data_quality_cols"><!-- Highlighted source of split_sample_col_into_sample_and_data_quality_cols. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.split_sample_col_into_sample_and_data_quality_cols">[docs]</a>
<span class="k">def</span> <span class="nf">split_sample_col_into_sample_and_data_quality_cols</span><span class="p">(</span><span class="n">input_data</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
    <span class="n">sample_name</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="n">sample_quality</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">input_data</span><span class="p">[</span><span class="s1">&#39;sample&#39;</span><span class="p">]:</span>
        <span class="k">if</span> <span class="n">item</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s1">&#39;(&#39;</span><span class="p">)</span><span class="o">!=-</span><span class="mi">1</span><span class="p">:</span>
            <span class="c1">#print(item)</span>
            <span class="n">sample_name</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">item</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">item</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s1">&#39;(&#39;</span><span class="p">)])</span>
            <span class="n">sample_quality</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">item</span><span class="p">[</span><span class="n">item</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s1">&#39;(&#39;</span><span class="p">)</span><span class="o">+</span><span class="mi">1</span><span class="p">:</span><span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>
        <span class="k">else</span><span class="p">:</span>
            <span class="k">if</span> <span class="n">item</span><span class="o">==</span><span class="s1">&#39;&#39;</span><span class="p">:</span>
                <span class="n">sample_name</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">&#39;Not yet annotated&#39;</span><span class="p">)</span>
                <span class="n">sample_quality</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">&#39;unevaluated&#39;</span><span class="p">)</span>
            <span class="k">else</span><span class="p">:</span>
                <span class="n">sample_name</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
                <span class="n">sample_quality</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">&#39;good data&#39;</span><span class="p">)</span>
    <span class="n">input_data</span><span class="p">[</span><span class="s1">&#39;sample&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">sample_name</span>
    <span class="n">input_data</span><span class="p">[</span><span class="s1">&#39;data_quality&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">sample_quality</span>
    <span class="k">return</span> <span class="n">input_data</span></div>
<div class="viewcode-block" id="make_file_copy"><!-- Highlighted source of make_file_copy. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.make_file_copy">[docs]</a>
<span class="k">def</span> <span class="nf">make_file_copy</span><span class="p">(</span><span class="n">source_file_path</span><span class="p">,</span> <span class="n">output_folder_name</span> <span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">&#39;tmp_files&#39;</span><span class="p">):</span>
    <span class="n">pathtail</span><span class="p">,</span> <span class="n">filename</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">source_file_path</span><span class="p">)</span>
    <span class="c1">#backup_filename = &#39;backup_&#39;+ filename</span>
    <span class="n">backup_filename</span> <span class="o">=</span> <span class="n">filename</span>
    <span class="c1"># Path </span>
    <span class="n">ROOT_DIR</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">abspath</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">curdir</span><span class="p">)</span>
    <span class="n">tmp_dirpath</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ROOT_DIR</span><span class="p">,</span><span class="n">output_folder_name</span><span class="p">)</span>
    <span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">tmp_dirpath</span><span class="p">):</span>
        <span class="n">os</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">tmp_dirpath</span><span class="p">)</span>
    <span class="n">tmp_file_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">tmp_dirpath</span><span class="p">,</span><span class="n">backup_filename</span><span class="p">)</span>
    <span class="n">shutil</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">source_file_path</span><span class="p">,</span> <span class="n">tmp_file_path</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">tmp_file_path</span></div>
<div class="viewcode-block" id="created_at"><!-- Highlighted source of created_at. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.created_at">[docs]</a>
<span class="k">def</span> <span class="nf">created_at</span><span class="p">(</span><span class="n">datetime_format</span> <span class="o">=</span> <span class="s1">&#39;%Y-%m-</span><span class="si">%d</span><span class="s1"> %H:%M:%S&#39;</span><span class="p">):</span>
    <span class="n">now</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">()</span>
    <span class="c1"># Populate now object with time zone information obtained from the local system</span>
    <span class="n">now_tz_aware</span> <span class="o">=</span> <span class="n">now</span><span class="o">.</span><span class="n">astimezone</span><span class="p">()</span>
    <span class="n">tz</span> <span class="o">=</span> <span class="n">now_tz_aware</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s1">&#39;%z&#39;</span><span class="p">)</span>
    <span class="c1"># Replace colons in the time part of the timestamp with hyphens to make it file name friendly</span>
    <span class="n">created_at</span> <span class="o">=</span> <span class="n">now_tz_aware</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="n">datetime_format</span><span class="p">)</span> <span class="c1">#+ &#39;_UTC-OFST_&#39; + tz</span>
    <span class="k">return</span> <span class="n">created_at</span></div>
<div class="viewcode-block" id="sanitize_dataframe"><!-- Highlighted source of sanitize_dataframe. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. NOTE(review): the NaN-handling section is placed at loop-body level because it uses the loop variable; confirm against the upstream .py file. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.sanitize_dataframe">[docs]</a>
<span class="k">def</span> <span class="nf">sanitize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
    <span class="c1"># Handle datetime columns (convert to string in &#39;yyyy-mm-dd hh:mm:ss&#39; format)</span>
    <span class="n">datetime_cols</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select_dtypes</span><span class="p">(</span><span class="n">include</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;datetime&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">columns</span>
    <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">datetime_cols</span><span class="p">:</span>
        <span class="c1"># Convert datetime to string in the specified format, handling NaT</span>
        <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">dt</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s1">&#39;%Y-%m-</span><span class="si">%d</span><span class="s1"> %H-%M-%S&#39;</span><span class="p">)</span>
    <span class="c1"># Handle object columns with mixed types</span>
    <span class="n">otype_cols</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select_dtypes</span><span class="p">(</span><span class="n">include</span><span class="o">=</span><span class="s1">&#39;O&#39;</span><span class="p">)</span>
    <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">otype_cols</span><span class="p">:</span>
        <span class="n">col_data</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span>
        <span class="c1"># Check if all elements in the column are strings</span>
        <span class="k">if</span> <span class="n">col_data</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="nb">str</span><span class="p">))</span><span class="o">.</span><span class="n">all</span><span class="p">():</span>
            <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">str</span><span class="p">)</span>
        <span class="k">else</span><span class="p">:</span>
            <span class="c1"># If the column contains mixed types, attempt to convert to numeric, coercing errors to NaN</span>
            <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_numeric</span><span class="p">(</span><span class="n">col_data</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s1">&#39;coerce&#39;</span><span class="p">)</span>
        <span class="c1"># Handle NaN values differently based on dtype</span>
        <span class="k">if</span> <span class="n">pd</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">types</span><span class="o">.</span><span class="n">is_string_dtype</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]):</span>
            <span class="c1"># Replace NaN in string columns with empty string</span>
            <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="s1">&#39;&#39;</span><span class="p">)</span> <span class="c1"># Replace NaN with empty string</span>
        <span class="k">elif</span> <span class="n">pd</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">types</span><span class="o">.</span><span class="n">is_numeric_dtype</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]):</span>
            <span class="c1"># For numeric columns, we want to keep NaN as it is</span>
            <span class="c1"># But if integer column has NaN, consider casting to float</span>
            <span class="k">if</span> <span class="n">pd</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">types</span><span class="o">.</span><span class="n">is_integer_dtype</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]):</span>
                <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span> <span class="c1"># Cast to float to allow NaN</span>
            <span class="k">else</span><span class="p">:</span>
                <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">)</span> <span class="c1"># Keep NaN in float columns</span>
    <span class="k">return</span> <span class="n">df</span></div>
<div class="viewcode-block" id="convert_dataframe_to_np_structured_array"><!-- Highlighted source of convert_dataframe_to_np_structured_array. This sits inside a pre element, so the leading spaces are rendered and carry the Python indentation. -->
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.convert_dataframe_to_np_structured_array">[docs]</a>
<span class="k">def</span> <span class="nf">convert_dataframe_to_np_structured_array</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
    <span class="n">df</span> <span class="o">=</span> <span class="n">sanitize_dataframe</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
    <span class="c1"># Define the dtype for the structured array, ensuring compatibility with h5py</span>
    <span class="n">dtype</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
        <span class="n">col_data</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span>
        <span class="n">col_dtype</span> <span class="o">=</span> <span class="n">col_data</span><span class="o">.</span><span class="n">dtype</span>
        <span class="k">try</span><span class="p">:</span>
            <span class="k">if</span> <span class="n">pd</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">types</span><span class="o">.</span><span class="n">is_string_dtype</span><span class="p">(</span><span class="n">col_dtype</span><span class="p">):</span>
                <span class="c1"># Convert string dtype to fixed-length strings</span>
                <span class="n">max_len</span> <span class="o">=</span> <span class="n">col_data</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">len</span><span class="p">()</span><span class="o">.</span><span class="n">max</span><span class="p">()</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">col_data</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">all</span><span class="p">()</span> <span class="k">else</span> <span class="mi">0</span>
                <span class="n">dtype</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">col</span><span class="p">,</span> <span class="sa">f</span><span class="s1">&#39;S</span><span class="si">{</span><span class="n">max_len</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">))</span>
            <span class="k">elif</span> <span class="n">pd</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">types</span><span class="o">.</span><span class="n">is_integer_dtype</span><span class="p">(</span><span class="n">col_dtype</span><span class="p">):</span>
                <span class="n">dtype</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">col</span><span class="p">,</span> <span class="s1">&#39;i4&#39;</span><span class="p">))</span> <span class="c1"># Assuming 32-bit integer</span>
            <span class="k">elif</span> <span class="n">pd</span><span class="o">.</span><span class="n">api</span><span class="o">.</span><span class="n">types</span><span class="o">.</span><span class="n">is_float_dtype</span><span class="p">(</span><span class="n">col_dtype</span><span class="p">):</span>
                <span class="n">dtype</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">col</span><span class="p">,</span> <span class="s1">&#39;f4&#39;</span><span class="p">))</span> <span class="c1"># Assuming 32-bit float</span>
            <span class="k">else</span><span class="p">:</span>
                <span class="c1"># Handle unsupported data types</span>
                <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Unsupported dtype found in column &#39;</span><span class="si">{</span><span class="n">col</span><span class="si">}</span><span class="s2">&#39;: </span><span class="si">{</span><span class="n">col_data</span><span class="o">.</span><span class="n">dtype</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Unsupported data type: </span><span class="si">{</span><span class="n">col_data</span><span class="o">.</span><span class="n">dtype</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
        <span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
            <span class="c1"># Log more detailed error message</span>
            <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Error processing column &#39;</span><span class="si">{</span><span class="n">col</span><span class="si">}</span><span class="s2">&#39;: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
            <span class="k">raise</span>
    <span class="c1"># Convert the DataFrame to a structured array</span>
    <span class="n">structured_array</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">itertuples</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="kc">None</span><span class="p">)),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">structured_array</span></div>
<div class="viewcode-block" id="convert_string_to_bytes">
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.convert_string_to_bytes">[docs]</a>
<span class="k">def</span> <span class="nf">convert_string_to_bytes</span><span class="p">(</span><span class="n">input_list</span><span class="p">:</span> <span class="nb">list</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Convert a list of strings into a numpy array with utf8-type entries.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> input_list (list) : list of string objects</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> input_array_bytes (ndarray): array of utf8-type entries.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">utf8_type</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">max_length</span><span class="p">:</span> <span class="n">h5py</span><span class="o">.</span><span class="n">string_dtype</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">,</span> <span class="n">max_length</span><span class="p">)</span>
<span class="k">if</span> <span class="n">input_list</span><span class="p">:</span>
<span class="n">max_length</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">input_list</span><span class="p">)</span>
<span class="c1"># Convert the strings to bytes with utf-8 encoding, specifying errors=&#39;ignore&#39; to skip characters that cannot be encoded</span>
<span class="n">input_list_bytes</span> <span class="o">=</span> <span class="p">[</span><span class="n">item</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s1">&#39;ignore&#39;</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">input_list</span><span class="p">]</span>
<span class="n">input_array_bytes</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">input_list_bytes</span><span class="p">,</span><span class="n">dtype</span><span class="o">=</span><span class="n">utf8_type</span><span class="p">(</span><span class="n">max_length</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">input_array_bytes</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([],</span><span class="n">dtype</span><span class="o">=</span><span class="n">utf8_type</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="k">return</span> <span class="n">input_array_bytes</span></div>
<div class="viewcode-block" id="convert_attrdict_to_np_structured_array">
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.convert_attrdict_to_np_structured_array">[docs]</a>
<span class="k">def</span> <span class="nf">convert_attrdict_to_np_structured_array</span><span class="p">(</span><span class="n">attr_value</span><span class="p">:</span> <span class="nb">dict</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a dictionary of attributes into a numpy structured array for HDF5 </span>
<span class="sd"> compound type compatibility.</span>
<span class="sd"> Each dictionary key is mapped to a field in the structured array, with the </span>
<span class="sd"> data type (S) determined by the longest string representation of the values. </span>
<span class="sd"> If the dictionary is empty, the function returns &#39;missing&#39;.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> attr_value : dict</span>
<span class="sd"> Dictionary containing the attributes to be converted. Example:</span>
<span class="sd"> attr_value = {</span>
<span class="sd"> &#39;name&#39;: &#39;Temperature&#39;,</span>
<span class="sd"> &#39;unit&#39;: &#39;Celsius&#39;,</span>
<span class="sd"> &#39;value&#39;: 23.5,</span>
<span class="sd"> &#39;timestamp&#39;: &#39;2023-09-26 10:00&#39;</span>
<span class="sd"> }</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> new_attr_value : ndarray or str</span>
<span class="sd"> Numpy structured array with UTF-8 encoded fields. Returns &#39;missing&#39; if </span>
<span class="sd"> the input dictionary is empty.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">values_list</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">max_length</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">attr_value</span><span class="p">[</span><span class="n">key</span><span class="p">]))</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">attr_value</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">attr_value</span><span class="o">.</span><span class="n">keys</span><span class="p">():</span>
<span class="k">if</span> <span class="n">key</span> <span class="o">!=</span> <span class="s1">&#39;rename_as&#39;</span><span class="p">:</span>
<span class="n">dtype</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">key</span><span class="p">,</span> <span class="sa">f</span><span class="s1">&#39;S</span><span class="si">{</span><span class="n">max_length</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">))</span>
<span class="n">values_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">attr_value</span><span class="p">[</span><span class="n">key</span><span class="p">])</span>
<span class="k">if</span> <span class="n">values_list</span><span class="p">:</span>
<span class="n">new_attr_value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="nb">tuple</span><span class="p">(</span><span class="n">values_list</span><span class="p">)],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_attr_value</span> <span class="o">=</span> <span class="s1">&#39;missing&#39;</span>
<span class="k">return</span> <span class="n">new_attr_value</span></div>
<div class="viewcode-block" id="infer_units">
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.infer_units">[docs]</a>
<span class="k">def</span> <span class="nf">infer_units</span><span class="p">(</span><span class="n">column_name</span><span class="p">):</span>
<span class="c1"># TODO: complete or remove</span>
<span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">&#39;\[.+\]&#39;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">match</span><span class="p">:</span>
<span class="k">return</span> <span class="n">match</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">&#39;\(.+\)&#39;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">match</span></div>
<div class="viewcode-block" id="progressBar">
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.progressBar">[docs]</a>
<span class="k">def</span> <span class="nf">progressBar</span><span class="p">(</span><span class="n">count_value</span><span class="p">,</span> <span class="n">total</span><span class="p">,</span> <span class="n">suffix</span><span class="o">=</span><span class="s1">&#39;&#39;</span><span class="p">):</span>
<span class="n">bar_length</span> <span class="o">=</span> <span class="mi">100</span>
<span class="n">filled_up_Length</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="nb">round</span><span class="p">(</span><span class="n">bar_length</span><span class="o">*</span> <span class="n">count_value</span> <span class="o">/</span> <span class="nb">float</span><span class="p">(</span><span class="n">total</span><span class="p">)))</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="mf">100.0</span> <span class="o">*</span> <span class="n">count_value</span><span class="o">/</span><span class="nb">float</span><span class="p">(</span><span class="n">total</span><span class="p">),</span><span class="mi">1</span><span class="p">)</span>
<span class="n">bar</span> <span class="o">=</span> <span class="s1">&#39;=&#39;</span> <span class="o">*</span> <span class="n">filled_up_Length</span> <span class="o">+</span> <span class="s1">&#39;-&#39;</span> <span class="o">*</span> <span class="p">(</span><span class="n">bar_length</span> <span class="o">-</span> <span class="n">filled_up_Length</span><span class="p">)</span>
<span class="n">sys</span><span class="o">.</span><span class="n">stdout</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">&#39;[</span><span class="si">%s</span><span class="s1">] </span><span class="si">%s%s</span><span class="s1"> ...</span><span class="si">%s</span><span class="se">\r</span><span class="s1">&#39;</span> <span class="o">%</span><span class="p">(</span><span class="n">bar</span><span class="p">,</span> <span class="n">percentage</span><span class="p">,</span> <span class="s1">&#39;%&#39;</span><span class="p">,</span> <span class="n">suffix</span><span class="p">))</span>
<span class="n">sys</span><span class="o">.</span><span class="n">stdout</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span></div>
<div class="viewcode-block" id="copy_directory_with_contraints">
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.copy_directory_with_contraints">[docs]</a>
<span class="k">def</span> <span class="nf">copy_directory_with_contraints</span><span class="p">(</span><span class="n">input_dir_path</span><span class="p">,</span> <span class="n">output_dir_path</span><span class="p">,</span>
<span class="n">select_dir_keywords</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">select_file_keywords</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">allowed_file_extensions</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">dry_run</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Copies files from input_dir_path to output_dir_path based on specified constraints.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> input_dir_path (str): Path to the input directory.</span>
<span class="sd"> output_dir_path (str): Path to the output directory.</span>
<span class="sd"> select_dir_keywords (list): optional, List of keywords for selecting directories.</span>
<span class="sd"> select_file_keywords (list): optional, List of keywords for selecting files.</span>
<span class="sd"> allowed_file_extensions (list): optional, List of allowed file extensions.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> path_to_files_dict (dict): dictionary mapping directory paths to lists of copied file names satisfying the constraints.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Unconstrained default behavior: No filters, make sure variables are lists even when defined as None in function signature</span>
<span class="n">select_dir_keywords</span> <span class="o">=</span> <span class="n">select_dir_keywords</span> <span class="ow">or</span> <span class="p">[]</span>
<span class="n">select_file_keywords</span> <span class="o">=</span> <span class="n">select_file_keywords</span> <span class="ow">or</span> <span class="p">[]</span>
<span class="n">allowed_file_extensions</span> <span class="o">=</span> <span class="n">allowed_file_extensions</span> <span class="ow">or</span> <span class="p">[]</span>
<span class="n">date</span> <span class="o">=</span> <span class="n">created_at</span><span class="p">(</span><span class="s1">&#39;%Y_%m&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;:&quot;</span><span class="p">,</span> <span class="s2">&quot;-&quot;</span><span class="p">)</span>
<span class="n">log_dir</span><span class="o">=</span><span class="s1">&#39;logs/&#39;</span>
<span class="n">setup_logging</span><span class="p">(</span><span class="n">log_dir</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;copy_directory_with_contraints_</span><span class="si">{</span><span class="n">date</span><span class="si">}</span><span class="s2">.log&quot;</span><span class="p">)</span>
<span class="c1"># Define helper functions. Return by default true when filtering lists are either None or []</span>
<span class="k">def</span> <span class="nf">has_allowed_extension</span><span class="p">(</span><span class="n">filename</span><span class="p">):</span>
<span class="k">return</span> <span class="ow">not</span> <span class="n">allowed_file_extensions</span> <span class="ow">or</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">filename</span><span class="p">)[</span><span class="mi">1</span><span class="p">]</span> <span class="ow">in</span> <span class="n">allowed_file_extensions</span>
<span class="k">def</span> <span class="nf">file_is_selected</span><span class="p">(</span><span class="n">filename</span><span class="p">):</span>
<span class="k">return</span> <span class="ow">not</span> <span class="n">select_file_keywords</span> <span class="ow">or</span> <span class="nb">any</span><span class="p">(</span><span class="n">keyword</span> <span class="ow">in</span> <span class="n">filename</span> <span class="k">for</span> <span class="n">keyword</span> <span class="ow">in</span> <span class="n">select_file_keywords</span><span class="p">)</span>
<span class="c1"># Collect paths of directories, which are directly connected to the root dir and match select_dir_keywords</span>
<span class="n">paths</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="n">select_dir_keywords</span><span class="p">:</span>
<span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">listdir</span><span class="p">(</span><span class="n">input_dir_path</span><span class="p">):</span> <span class="c1">#Path(input_dir_path).iterdir():</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">([</span><span class="n">item</span> <span class="ow">in</span> <span class="n">keyword</span> <span class="k">for</span> <span class="n">keyword</span> <span class="ow">in</span> <span class="n">select_dir_keywords</span><span class="p">]):</span>
<span class="n">paths</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">input_dir_path</span><span class="p">,</span><span class="n">item</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">paths</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">input_dir_path</span><span class="p">)</span> <span class="c1">#paths.append(Path(input_dir_path))</span>
<span class="n">path_to_files_dict</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># Dictionary to store directory-file pairs satisfying constraints</span>
<span class="k">for</span> <span class="n">subpath</span> <span class="ow">in</span> <span class="n">paths</span><span class="p">:</span>
<span class="k">for</span> <span class="n">dirpath</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">filenames</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">walk</span><span class="p">(</span><span class="n">subpath</span><span class="p">,</span><span class="n">topdown</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="c1"># Reduce filenames to those that are admissible</span>
<span class="n">admissible_filenames</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">filename</span> <span class="k">for</span> <span class="n">filename</span> <span class="ow">in</span> <span class="n">filenames</span>
<span class="k">if</span> <span class="n">file_is_selected</span><span class="p">(</span><span class="n">filename</span><span class="p">)</span> <span class="ow">and</span> <span class="n">has_allowed_extension</span><span class="p">(</span><span class="n">filename</span><span class="p">)</span>
<span class="p">]</span>
<span class="k">if</span> <span class="n">admissible_filenames</span><span class="p">:</span> <span class="c1"># Only create directory if there are files to copy</span>
<span class="n">relative_dirpath</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">relpath</span><span class="p">(</span><span class="n">dirpath</span><span class="p">,</span> <span class="n">input_dir_path</span><span class="p">)</span>
<span class="n">target_dirpath</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">output_dir_path</span><span class="p">,</span> <span class="n">relative_dirpath</span><span class="p">)</span>
<span class="n">path_to_files_dict</span><span class="p">[</span><span class="n">target_dirpath</span><span class="p">]</span> <span class="o">=</span> <span class="n">admissible_filenames</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">dry_run</span><span class="p">:</span>
<span class="c1"># Perform the actual copying</span>
<span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">target_dirpath</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">filename</span> <span class="ow">in</span> <span class="n">admissible_filenames</span><span class="p">:</span>
<span class="n">src_file_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">dirpath</span><span class="p">,</span> <span class="n">filename</span><span class="p">)</span>
<span class="n">dest_file_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">target_dirpath</span><span class="p">,</span> <span class="n">filename</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">shutil</span><span class="o">.</span><span class="n">copy2</span><span class="p">(</span><span class="n">src_file_path</span><span class="p">,</span> <span class="n">dest_file_path</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="n">logging</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="s2">&quot;Failed to copy </span><span class="si">%s</span><span class="s2">: </span><span class="si">%s</span><span class="s2">&quot;</span><span class="p">,</span> <span class="n">src_file_path</span><span class="p">,</span> <span class="n">e</span><span class="p">)</span>
<span class="k">return</span> <span class="n">path_to_files_dict</span> </div>
<div class="viewcode-block" id="to_serializable_dtype">
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.to_serializable_dtype">[docs]</a>
<span class="k">def</span> <span class="nf">to_serializable_dtype</span><span class="p">(</span><span class="n">value</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Transform value&#39;s dtype into YAML/JSON compatible dtype</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
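<span class="sd"> value : numpy scalar or numpy.ndarray</span>
<span class="sd"> Value to convert. Values of any other type are returned unchanged.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> str, int, float, list or dict</span>
<span class="sd"> Converted value. NaN is returned when no compatible type is found or the conversion fails.</span>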
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">generic</span><span class="p">):</span>
<span class="k">if</span> <span class="n">np</span><span class="o">.</span><span class="n">issubdtype</span><span class="p">(</span><span class="n">value</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">bytes_</span><span class="p">):</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">np</span><span class="o">.</span><span class="n">issubdtype</span><span class="p">(</span><span class="n">value</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">unicode_</span><span class="p">):</span>
<span class="n">value</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">value</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">np</span><span class="o">.</span><span class="n">issubdtype</span><span class="p">(</span><span class="n">value</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">number</span><span class="p">):</span>
<span class="n">value</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">value</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;Yaml-compatible data-type was not found. Value has been set to NaN.&#39;</span><span class="p">)</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">):</span>
<span class="c1"># Handling structured array types (with fields)</span>
<span class="k">if</span> <span class="n">value</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">names</span><span class="p">:</span>
<span class="n">value</span> <span class="o">=</span> <span class="p">{</span><span class="n">field</span><span class="p">:</span> <span class="n">to_serializable_dtype</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="n">field</span><span class="p">])</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">value</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">names</span><span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Handling regular array NumPy types with assumption of uniform dtype across array elements</span>
<span class="c1"># TODO: evaluate a more general way to check for individual dtypes </span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">bytes</span><span class="p">):</span>
<span class="c1"># Decode bytes</span>
<span class="n">value</span> <span class="o">=</span> <span class="p">[</span><span class="n">item</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">str</span><span class="p">):</span>
<span class="c1"># Already a string type</span>
<span class="n">value</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="nb">str</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">int</span><span class="p">):</span>
<span class="c1"># Integer type</span>
<span class="n">value</span> <span class="o">=</span> <span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="nb">int</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">float</span><span class="p">):</span>
<span class="c1"># Floating type</span>
<span class="n">value</span> <span class="o">=</span> <span class="p">[</span><span class="nb">float</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="nb">float</span><span class="p">(</span><span class="n">value</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;Yaml-compatible data-type was not found. Value has been set to NaN.&#39;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Debug: value.dtype is&quot;</span><span class="p">,</span> <span class="n">value</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;Error converting value: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s1">. Value has been set to NaN.&#39;</span><span class="p">)</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span>
<span class="k">return</span> <span class="n">value</span></div>
<div class="viewcode-block" id="is_structured_array">
<a class="viewcode-back" href="../../modules/utils.html#utils.g5505_utils.is_structured_array">[docs]</a>
<span class="k">def</span> <span class="nf">is_structured_array</span><span class="p">(</span><span class="n">attr_val</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">attr_val</span><span class="p">,</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">):</span>
<span class="k">return</span> <span class="kc">True</span> <span class="k">if</span> <span class="n">attr_val</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">False</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">False</span></div>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>&#169; Copyright 2024, JFFO.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>