Merge pull request #140 from predict-idlab/tsdownsample

jonasvdd · web-flow · commit 71b4efe7ed1a · 2022-12-02T09:57:54.000+01:00
✨ add check_nans to add_trace(s)
diff --git a/plotly_resampler/figure_resampler/figure_resampler_interface.py b/plotly_resampler/figure_resampler/figure_resampler_interface.py
@@ -595,6 +595,7 @@ def _parse_get_trace_props(
         hf_y: Iterable = None,
         hf_text: Iterable = None,
         hf_hovertext: Iterable = None,
+        check_nans: bool = True,
     ) -> _hf_data_container:
         """Parse and capture the possibly high-frequency trace-props in a datacontainer.
 
@@ -603,14 +604,19 @@ def _parse_get_trace_props(
         trace : BaseTraceType
             The trace which will be parsed.
         hf_x : Iterable, optional
-            high-frequency trace "x" data, overrides the current trace its x-data.
+            High-frequency trace "x" data, overrides the current trace its x-data.
         hf_y : Iterable, optional
-            high-frequency trace "y" data, overrides the current trace its y-data.
+            High-frequency trace "y" data, overrides the current trace its y-data.
         hf_text : Iterable, optional
-            high-frequency trace "text" data, overrides the current trace its text-data.
+            High-frequency trace "text" data, overrides the current trace its text-data.
         hf_hovertext : Iterable, optional
-            high-frequency trace "hovertext" data, overrides the current trace its
+            High-frequency trace "hovertext" data, overrides the current trace its
             hovertext data.
+        check_nans : bool, optional
+            Whether the `hf_y` should be checked for NaNs, by default True.
+            As checking for NaNs is expensive, this can be disabled when the `hf_y` is
+            already known to contain no NaNs (or when the downsampler can handle NaNs,
+            e.g., EveryNthPoint).
 
         Returns
         -------
@@ -680,7 +686,7 @@ def _parse_get_trace_props(
             # Remove NaNs for efficiency (storing less meaningless data)
             # NaNs introduce gaps between enclosing non-NaN data points & might distort
             # the resampling algorithms
-            if pd.isna(hf_y).any():
+            if check_nans and pd.isna(hf_y).any():
                 not_nan_mask = ~pd.isna(hf_y)
                 hf_x = hf_x[not_nan_mask]
                 hf_y = hf_y[not_nan_mask]
@@ -821,6 +827,7 @@ def add_trace(
         hf_y: Iterable = None,
         hf_text: Union[str, Iterable] = None,
         hf_hovertext: Union[str, Iterable] = None,
+        check_nans: bool = True,
         **trace_kwargs,
     ):
         """Add a trace to the figure.
@@ -848,7 +855,7 @@ def add_trace(
             .. note::
                 If this variable is not set, ``_global_downsampler`` will be used.
         limit_to_view: boolean, optional
-            If set to True the trace's datapoints will be cut to the corresponding
+            If set to True, the trace's datapoints will be cut to the corresponding
             front-end view, even if the total number of samples is lower than
             ``max_n_samples``, By default False.\n
             Remark that setting this parameter to True ensures that low frequency traces
@@ -866,6 +873,13 @@ def add_trace(
         hf_hovertext: Iterable, optional
             The original high frequency hovertext. If set, this has priority over the
             trace its ```hovertext`` argument.
+        check_nans: boolean, optional
+            If set to True, the trace's data will be checked for NaNs - which will be
+            removed. By default True.
+            As this is a costly operation, it is recommended to set this parameter to
+            False if you are sure that your data does not contain NaNs (or when the
+            downsampler can handle NaNs, e.g., EveryNthPoint). This should considerably
+            speed up the graph construction time.
         **trace_kwargs: dict
             Additional trace related keyword arguments.
             e.g.: row=.., col=..., secondary_y=...
@@ -937,7 +951,7 @@ def add_trace(
 
         # construct the hf_data_container
         # TODO in future version -> maybe regex on kwargs which start with `hf_`
-        dc = self._parse_get_trace_props(trace, hf_x, hf_y, hf_text, hf_hovertext)
+        dc = self._parse_get_trace_props(trace, hf_x, hf_y, hf_text, hf_hovertext, check_nans)
 
         # These traces will determine the autoscale RANGE!
         #   -> so also store when `limit_to_view` is set.
@@ -996,6 +1010,7 @@ def add_traces(
         | List[AbstractSeriesAggregator]
         | AbstractFigureAggregator = None,
         limit_to_views: List[bool] | bool = False,
+        check_nans: List[bool] | bool = True,
         **traces_kwargs,
     ):
         """Add traces to the figure.
@@ -1030,13 +1045,22 @@ def add_traces(
             aggregator is passed, all traces will use this aggregator.
             If this variable is not set, ``_global_downsampler`` will be used.
         limit_to_views : None | List[bool] | bool, optional
-            List of limit_to_view booleans for the added traces.  If set to True
-            the trace's datapoints will be cut to the corresponding front-end view,
-            even if the total number of samples is lower than ``max_n_samples``. If a
-            single boolean is passed, all to be added traces will use this value,
+            List of limit_to_view booleans for the added traces. If set to True the
+            trace's datapoints will be cut to the corresponding front-end view, even if
+            the total number of samples is lower than ``max_n_samples``.
+            If a single boolean is passed, all to be added traces will use this value,
             by default False.\n
             Remark that setting this parameter to True ensures that low frequency traces
             are added to the ``hf_data`` property.
+        check_nans : List[bool] | bool, optional
+            List of check_nans booleans for the added traces. If set to True, the
+            trace's datapoints will be checked for NaNs. If a single boolean is passed,
+            all to be added traces will use this value, by default True.\n
+            As this is a costly operation, it is recommended to set this parameter to
+            False if the data is known to contain no NaNs (or when the downsampler can
+            handle NaNs, e.g., EveryNthPoint). This will considerably speed up the graph
+            construction time.
+
         **traces_kwargs: dict
             Additional trace related keyword arguments.
             e.g.: rows=.., cols=..., secondary_ys=...
@@ -1076,9 +1100,11 @@ def add_traces(
             downsamplers = [downsamplers] * len(data)
         if isinstance(limit_to_views, bool):
             limit_to_views = [limit_to_views] * len(data)
+        if isinstance(check_nans, bool):
+            check_nans = [check_nans] * len(data)
 
-        for i, (trace, max_out, downsampler, limit_to_view) in enumerate(
-            zip(data, max_n_samples, downsamplers, limit_to_views)
+        for i, (trace, max_out, downsampler, limit_to_view, check_nan) in enumerate(
+            zip(data, max_n_samples, downsamplers, limit_to_views, check_nans)
         ):
             if (
                 trace.type.lower() not in self._high_frequency_traces
@@ -1090,7 +1116,7 @@ def add_traces(
             if not limit_to_view and (trace.y is None or len(trace.y) <= max_out_s):
                 continue
 
-            dc = self._parse_get_trace_props(trace)
+            dc = self._parse_get_trace_props(trace, check_nans=check_nan)
             self._hf_data[trace.uid] = self._construct_hf_data_dict(
                 dc,
                 trace=trace,
diff --git a/plotly_resampler/registering.py b/plotly_resampler/registering.py
@@ -33,7 +33,7 @@ def _get_plotly_constr(constr):
     Parameters
     ----------
     constr : callable
-        The constructor of a instantiatedplotly-object.
+        The constructor of an instantiated plotly-object.
 
     Returns
     -------
@@ -98,10 +98,10 @@ def register_plotly_resampler(mode="auto", **aggregator_kwargs):
         The mode of the plotly-resampler.
         Possible values are: 'auto', 'figure', 'widget', None.
         If 'auto' is used, the mode is determined based on the environment; if it is in
-        an ipython environment, the mode is 'widget', otherwise it is 'figure'.
+        an IPython environment, the mode is 'widget', otherwise it is 'figure'.
         If 'figure' is used, all plotly figures are wrapped as FigureResampler objects.
         If 'widget' is used, all plotly figure widgets are wrapped as
-        FigureWidgetResampler objects (we advise to use this mode in ipython environment
+        FigureWidgetResampler objects (we advise to use this mode in IPython environment
         with a kernel).
         If None is used, wrapping is done as expected (go.Figure -> FigureResampler,
         go.FigureWidget -> FigureWidgetResampler).
diff --git a/tests/test_figure_resampler.py b/tests/test_figure_resampler.py
@@ -343,14 +343,17 @@ def test_nan_removed_input(float_series):
     )
 
     float_series = float_series.copy()
-    float_series.iloc[np.random.choice(len(float_series), 100)] = np.nan
+    float_series.iloc[np.random.choice(len(float_series), 100, replace=False)] = np.nan
     fig.add_trace(
         go.Scatter(x=float_series.index, y=float_series, name="float_series"),
         row=1,
         col=1,
         hf_text="text",
         hf_hovertext="hovertext",
     )
+    # Check the desired behavior
+    assert len(fig.hf_data[0]["y"]) == len(float_series) - 100
+    assert ~pd.isna(fig.hf_data[0]["y"]).any()
 
     # here we test whether we are able to deal with not-nan output
     float_series.iloc[np.random.choice(len(float_series), 100)] = np.nan
@@ -374,6 +377,37 @@ def test_nan_removed_input(float_series):
         col=2,
     )
 
+def test_nan_removed_input_check_nans_false(float_series):
+    # see: https://plotly.com/python/subplots/#custom-sized-subplot-with-subplot-titles
+    base_fig = make_subplots(
+        rows=2,
+        cols=2,
+        specs=[[{}, {}], [{"colspan": 2}, None]],
+    )
+
+    fig = FigureResampler(
+        base_fig,
+        default_n_shown_samples=1000,
+        resampled_trace_prefix_suffix=(
+            '<b style="color:sandybrown">[R]</b>',
+            '<b style="color:sandybrown">[R]</b>',
+        ),
+    )
+
+    float_series = float_series.copy()
+    float_series.iloc[np.random.choice(len(float_series), 100)] = np.nan
+    fig.add_trace(
+        go.Scatter(x=float_series.index, y=float_series, name="float_series"),
+        row=1,
+        col=1,
+        hf_text="text",
+        hf_hovertext="hovertext",
+        check_nans=False
+    )
+    # With check_nans=False, the NaNs are kept in the stored high-frequency data
+    assert len(fig.hf_data[0]["y"]) == len(float_series)
+    assert pd.isna(fig.hf_data[0]["y"]).any()
+
 
 def test_hf_text():
     y = np.arange(10_000)
diff --git a/tests/test_figurewidget_resampler.py b/tests/test_figurewidget_resampler.py
@@ -263,14 +263,17 @@ def test_nan_removed_input(float_series):
     )
 
     float_series = float_series.copy()
-    float_series.iloc[np.random.choice(len(float_series), 100)] = np.nan
+    float_series.iloc[np.random.choice(len(float_series), 100, replace=False)] = np.nan
     fig.add_trace(
         go.Scatter(x=float_series.index, y=float_series, name="float_series"),
         row=1,
         col=1,
         hf_text="text",
         hf_hovertext="hovertext",
     )
+    # Check the desired behavior
+    assert len(fig.hf_data[0]["y"]) == len(float_series) - 100
+    assert ~pd.isna(fig.hf_data[0]["y"]).any()
 
     # here we test whether we are able to deal with not-nan output
     float_series.iloc[np.random.choice(len(float_series), 100)] = np.nan
@@ -295,6 +298,38 @@ def test_nan_removed_input(float_series):
     )
 
 
+def test_nan_removed_input_check_nans_false(float_series):
+    # see: https://plotly.com/python/subplots/#custom-sized-subplot-with-subplot-titles
+    base_fig = make_subplots(
+        rows=2,
+        cols=2,
+        specs=[[{}, {}], [{"colspan": 2}, None]],
+    )
+
+    fig = FigureWidgetResampler(
+        base_fig,
+        default_n_shown_samples=1000,
+        resampled_trace_prefix_suffix=(
+            '<b style="color:sandybrown">[R]</b>',
+            '<b style="color:sandybrown">[R]</b>',
+        ),
+    )
+
+    float_series = float_series.copy()
+    float_series.iloc[np.random.choice(len(float_series), 100)] = np.nan
+    fig.add_trace(
+        go.Scatter(x=float_series.index, y=float_series, name="float_series"),
+        row=1,
+        col=1,
+        hf_text="text",
+        hf_hovertext="hovertext",
+        check_nans=False
+    )
+    # With check_nans=False, the NaNs are kept in the stored high-frequency data
+    assert len(fig.hf_data[0]["y"]) == len(float_series)
+    assert pd.isna(fig.hf_data[0]["y"]).any()
+
+
 def test_hf_text():
     y = np.arange(10_000)
 
@@ -795,6 +830,7 @@ def test_hf_data_subplots_non_shared_xaxes_row_col_none():
     assert 40_000 <= x_1[0] <= 40_000 + (20_000 / 1000)  
     assert (60_000 - 20_000 / 1_000) <= x_1[-1] <= 60_000
 
+
 def test_updates_two_traces():
     n = 1_000_000
     X = np.arange(n)