Added InfluxDataGetter method get_curves_data_frame to return curves as a pandas DataFrame

This commit is contained in:
l_samenv
2024-08-26 11:45:00 +02:00
parent 07946fc851
commit c706b78d07

View File

@@ -3,6 +3,8 @@ from configparser import ConfigParser
from chart_config import ChartConfig
import ast
from datetime import datetime
from pandas import DataFrame as df, merge_ordered
from numpy import NaN
class InfluxDB:
    """
@@ -31,7 +33,10 @@ class InfluxDB:
    TableList : an InfluxDB list of the tables returned by the query
    """
    return self._client.query_api().query(query_str)
def query_data_frame(self, query_str):
    """Runs a Flux query and returns its result as pandas DataFrame(s).

    Parameters :
        query_str (str) : the Flux query to execute.

    Returns :
        pandas.DataFrame (or list of pandas.DataFrame) : the query result,
        as produced by the InfluxDB client's query API.
    """
    query_api = self._client.query_api()
    return query_api.query_data_frame(query_str)
class PrettyFloat(float):
    """saves bandwidth when converting to JSON
@@ -137,6 +142,113 @@ class InfluxDataGetter:
    components = self._get_device_name_components(time)
    return "/".join(components)
def get_curves_data_frame(self, variables, times, interval):
    """
    Gets the curves for the given variables within a timerange times, as a pandas dataframe.

    All curves are on a single common time axis.
    The first column is called "relative", and consists of floating seconds, relative to the beginning of the query.
    The "timestamp" column (absolute floating UNIX timestamps in seconds, precise to the nanosecond) is the last one.
    If a curve does not have a point at a given time, the last known value for this curve is used.
    If a curve gets expired, it is filled with NaN values for the corresponding expired time windows.
    The first value (for each variable) is the last known value before the time interval.

    Parameters :
        variables ([str]) : an array of variable names (Influx) to get the curves for.
            Each entry is "<measurement>" or "<measurement>.<parameter>" ("value" by default).
        times ([int]) : the timerange we want the values in. It consists of two values which
            are Unix timestamps in seconds, first included, second excluded.
        interval (int) : the interval (resolution) of the values to get, in seconds (it is
            converted to nanoseconds for Flux). Allows data binning. The string "None"
            disables binning (NOTE(review): callers apparently pass this as a string).

    Returns :
        pandas.DataFrame : the curves in a single pandas DataFrame

    Raises :
        ValueError : if variables is empty.
    """
    if not variables:
        # Fail early with a clear message instead of an obscure IndexError below.
        raise ValueError("variables must not be empty")

    variables_info = {}
    for variable in variables:
        var_param = variable.split(".")
        measurement = var_param[0]
        parameter = "value" if len(var_param) == 1 else var_param[1]
        field = parameter + "_float"
        variables_info[variable] = {"expired_ranges": []}

        # Optional data binning: keep the last point of each interval-wide window.
        binning = ""
        if interval != "None":
            binning = ("|> aggregateWindow(every: duration(v: "
                       + str(self._seconds_to_nanoseconds(interval))
                       + "), fn: last, createEmpty:false)")
        query = f"""
            from(bucket: "{self._bucket}")
                |> range(start: {times[0]}, stop: {times[1] + 1})
                |> filter(fn : (r) => r._measurement == "{measurement}")
                |> filter(fn : (r) => r._field == "{field}")
                {binning}
                |> map(fn: (r) => ({{r with relative: ( float(v: uint(v: r._time) - uint(v:{self._seconds_to_nanoseconds(times[0])})) / 1000000000.0 )}}))
                |> map(fn: (r) => ({{r with timestamp: float(v: uint(v: r._time)) / 1000000000.0}}))
                |> drop(columns:["_start", "_stop", "_field"])
                |> pivot(rowKey:["relative", "timestamp", "expired"], columnKey: ["_measurement"], valueColumn: "_value")
        """
        data_frame = self._db.query_data_frame(query)

        # Needed for the last value known before the start of the window.
        query_last_known = f"""
            from(bucket: "{self._bucket}")
                |> range(start: 0, stop: {times[0] + 1})
                |> filter(fn : (r) => r._measurement == "{measurement}")
                |> filter(fn : (r) => r._field == "{field}")
                |> last()
                |> map(fn: (r) => ({{r with relative: 0.0}}))
                |> map(fn: (r) => ({{r with timestamp: float(v: uint(v: r._time)) / 1000000000.0}}))
                |> drop(columns:["_start", "_stop", "_field"])
                |> pivot(rowKey:["relative", "timestamp", "expired"], columnKey: ["_measurement"], valueColumn: "_value")
        """
        data_frame_last_known = self._db.query_data_frame(query_last_known)

        # Pick the most recent "last known" row. "is None" is required here:
        # comparing a pandas Series with "== None" yields an elementwise
        # Series whose truth value raises ValueError.
        row_to_insert = None
        for _, row in data_frame_last_known.iterrows():
            if row_to_insert is None or row["timestamp"] > row_to_insert["timestamp"]:
                row_to_insert = row
        if row_to_insert is not None and not row_to_insert.empty:
            # Anchor the last known value at the start of the requested window.
            row_to_insert["timestamp"] = float(times[0])
            data_frame.loc[-1] = row_to_insert

        data_frame.drop(["result", "table"], axis=1, inplace=True)
        data_frame.sort_values(by=["timestamp"], inplace=True)
        data_frame.reset_index(drop=True, inplace=True)

        # Identify the time windows for which the curve is expired: a range
        # opens on an "expired" point and closes on the next live point (it
        # may stay open, meaning "expired until the end of the data").
        for index, row in data_frame.iterrows():
            if row["expired"] == "True":
                data_frame.loc[index, variable] = NaN
                variables_info[variable]["expired_ranges"].append([row["timestamp"]])
            elif row["expired"] == "False":
                if variables_info[variable]["expired_ranges"]:
                    variables_info[variable]["expired_ranges"][-1].append(row["timestamp"])
        data_frame.drop(["expired"], axis=1, inplace=True)
        variables_info[variable]["df"] = data_frame

    # Merge the single-curve dataframes into one on the shared time axis.
    res = variables_info[variables[0]]["df"]
    for variable in variables[1:]:
        res = merge_ordered(res, variables_info[variable]["df"],
                            on=["timestamp", "relative"], suffixes=(None, None))

    # Forward fill missing values, then blank out the data points falling
    # into an expired window (forward filling must not resurrect them).
    if len(variables) > 1:
        res.ffill(inplace=True)
    for variable, info in variables_info.items():
        for expired_range in info["expired_ranges"]:
            in_range = res["timestamp"] >= expired_range[0]
            if len(expired_range) == 2:
                in_range &= res["timestamp"] < expired_range[1]
            res.loc[in_range, variable] = NaN

    # Change order of columns: "relative" first, "timestamp" last.
    cols = res.columns.tolist()
    res = res[[cols[0]] + cols[2:] + [cols[1]]]
    return res
# ----- PRIVATE METHODS
def _get_all_setup_info_as_dict(self, times):
@@ -476,4 +588,7 @@ class InfluxDataGetter:
name = ast.literal_eval(record.get_value())
if name != None and name != '':
    res.append(ast.literal_eval(record.get_value()))
return res
def _seconds_to_nanoseconds(self, seconds):
return seconds * 1000000000