🐛 parse object arrays for hf_x (#116)

jvdd · jonasvdd · web-flow · commit 1b8ee36ca5e8 · 2022-12-02T12:20:59.000+01:00
* 🐛 parse object arrays for datetime or numeric dtypes, fixes #115 * 🧹 * 🙏 update poetry install in ci-cd testing (#117) * 🙏 * 🙈 * 🙏 * ♻️ improve object array parsing for multiple tzs * ♻️ could be optimized * 🧹 optimized + extend testing * 📝 make exceptions more explicit * 🧹 cleanup tests * 🙈 also parse datetime.datetime * 🐌 fix slow pandas backend bug + 🧹 * ♻️ raise ValueError when multiple time zones in hf_x * 🧹 * 🖊️ review * 🖊️ review Co-authored-by: jonasvdd <jonvdrdo.vanderdonckt@ugent.be> Co-authored-by: Jonas Van Der Donckt <38005924+jonasvdd@users.noreply.github.com>
diff --git a/plotly_resampler/aggregation/__init__.py b/plotly_resampler/aggregation/__init__.py
@@ -16,3 +16,13 @@
     MinMaxAggregator,
     MinMaxOverlapAggregator,
 )
+
+__all__ = [
+    "AbstractSeriesAggregator",
+    "LTTB",
+    "EfficientLTTB",
+    "EveryNthPoint",
+    "FuncAggregator",
+    "MinMaxAggregator",
+    "MinMaxOverlapAggregator",
+]
diff --git a/plotly_resampler/figure_resampler/figure_resampler_interface.py b/plotly_resampler/figure_resampler/figure_resampler_interface.py
@@ -210,8 +210,8 @@ def _get_current_graph(self) -> dict:
     def _check_update_trace_data(
         self,
         trace: dict,
-        start=None,
-        end=None,
+        start: Optional[Union[str, float]] = None,
+        end: Optional[Union[str, float]] = None,
     ) -> Optional[Union[dict, BaseTraceType]]:
         """Check and update the passed ``trace`` its data properties based on the
         slice range.
@@ -624,7 +624,7 @@ def _parse_get_trace_props(
             A namedtuple which serves as a datacontainer.
 
         """
-        hf_x = (
+        hf_x: np.ndarray = (
             trace["x"]
             if hasattr(trace, "x") and hf_x is None
             else hf_x.values
@@ -641,7 +641,7 @@ def _parse_get_trace_props(
             if isinstance(hf_y, (pd.Series, pd.Index))
             else hf_y
         )
-        hf_y = np.asarray(hf_y)
+        hf_y : np.ndarray = np.asarray(hf_y)
 
         hf_text = (
             hf_text
@@ -695,6 +695,29 @@ def _parse_get_trace_props(
                 if isinstance(hf_hovertext, np.ndarray):
                     hf_hovertext = hf_hovertext[not_nan_mask]
 
+            # Try to parse the hf_x data if it is of object or string dtype
+            if len(hf_x) and (
+                hf_x.dtype.type is np.str_ or hf_x.dtype == "object"
+            ):  
+                try:
+                    # Try to parse to numeric
+                    hf_x = pd.to_numeric(hf_x, errors="raise")
+                except (ValueError, TypeError):
+                    try:
+                        # Try to parse to datetime
+                        hf_x = pd.to_datetime(hf_x, utc=False, errors="raise")
+                        # Will be cast to object array if it contains multiple timezones.
+                        if hf_x.dtype == "object":
+                            raise ValueError(
+                                "The x-data contains multiple timezones, which is not "
+                                "supported by plotly-resampler!"
+                            )
+                    except (ValueError, TypeError):
+                        raise ValueError(
+                            "plotly-resampler requires the x-data to be numeric or "
+                            "datetime-like \nMore details in the stacktrace above."
+                        )
+
             # If the categorical or string-like hf_y data is of type object (happens
             # when y argument is used for the trace constructor instead of hf_y), we
             # transform it to type string as such it will be sent as categorical data
@@ -812,9 +835,8 @@ def _add_trace_to_add_traces_kwargs(kwargs: dict) -> dict:
                 updated_kwargs[f"{keyword}s"] = [value]
             else:
                 updated_kwargs[f"{keyword}s"] = None
-    
-        return {**kwargs, **updated_kwargs}
 
+        return {**kwargs, **updated_kwargs}
 
     def add_trace(
         self,
@@ -944,16 +966,16 @@ def add_trace(
         if not isinstance(trace, BaseTraceType):
             trace = self._data_validator.validate_coerce(trace)[0]
 
-        # First add an UUID, as each (even the non-hf_data traces), must contain this
-        # key for comparison. If the trace already has an UUID, we will keep it.
+        # First add a UUID, as each (even the non-hf_data traces), must contain this
+        # key for comparison. If the trace already has a UUID, we will keep it.
         uuid_str = str(uuid4()) if trace.uid is None else trace.uid
         trace.uid = uuid_str
 
         # construct the hf_data_container
         # TODO in future version -> maybe regex on kwargs which start with `hf_`
         dc = self._parse_get_trace_props(trace, hf_x, hf_y, hf_text, hf_hovertext, check_nans)
 
-        # These traces will determine the autoscale RANGE!
+        # These traces will determine the autoscale's RANGE!
         #   -> so also store when `limit_to_view` is set.
         if trace["type"].lower() in self._high_frequency_traces:
             n_samples = len(dc.x)
@@ -1087,8 +1109,8 @@ def add_traces(
             for trace in data
         ]
 
-        # First add an UUID, as each (even the non-hf_data traces), must contain this
-        # key for comparison. If the trace already has an UUID, we will keep it.
+        # First add a UUID, as each (even the non-hf_data traces), must contain this
+        # key for comparison. If the trace already has a UUID, we will keep it.
         for trace in data:
             uuid_str = str(uuid4()) if trace.uid is None else trace.uid
             trace.uid = uuid_str
@@ -1236,7 +1258,7 @@ def construct_update_data(
             cl_k = relayout_data.keys()
 
             # ------------------ HF DATA aggregation ---------------------
-            # 1. Base case - there is a x-range specified in the front-end
+            # 1. Base case - there is an x-range specified in the front-end
             start_matches = self._re_matches(re.compile(r"xaxis\d*.range\[0]"), cl_k)
             stop_matches = self._re_matches(re.compile(r"xaxis\d*.range\[1]"), cl_k)
             if len(start_matches) and len(stop_matches):
@@ -1370,4 +1392,4 @@ def __reduce__(self):
         props["pr_props"] = {}
         for k in self._get_pr_props_keys():
             props["pr_props"][k] = getattr(self, k)
-        return (self.__class__, (props,))  # (props,) to comply with plotly magic
+        return self.__class__, (props,)  # (props,) to comply with plotly magic
diff --git a/tests/test_figure_resampler.py b/tests/test_figure_resampler.py
@@ -5,6 +5,7 @@
 
 import pytest
 import time
+import datetime
 import multiprocessing
 
 import numpy as np
@@ -17,7 +18,7 @@
 from plotly.subplots import make_subplots
 from plotly_resampler import FigureResampler, LTTB, EveryNthPoint
 
-# Note: this will be used to skip / alter behavior when running browser tests on 
+# Note: this will be used to skip / alter behavior when running browser tests on
 # non-linux platforms.
 from .utils import not_on_linux
 
@@ -101,6 +102,7 @@ def test_add_trace_not_resampling(float_series):
         hf_hovertext="hovertext",
     )
 
+
 def test_various_dtypes(float_series):
     # List of dtypes supported by orjson >= 3.8
     valid_dtype_list = [
@@ -131,11 +133,11 @@ def test_various_dtypes(float_series):
         fig.full_figure_for_development()
 
     # List of dtypes not supported by orjson >= 3.8
-    invalid_dtype_list = [ np.float16 ]
+    invalid_dtype_list = [np.float16]
     for invalid_dtype in invalid_dtype_list:
         fig = FigureResampler(go.Figure(), default_n_shown_samples=1000)
         # nb. datapoints < default_n_shown_samples
-        with pytest.raises(TypeError):  
+        with pytest.raises(TypeError):
             # if this test fails -> orjson supports f16 => remove casting frome code
             fig.add_trace(
                 go.Scatter(name="float_series"),
@@ -144,6 +146,7 @@ def test_various_dtypes(float_series):
             )
             fig.full_figure_for_development()
 
+
 def test_max_n_samples(float_series):
     s = float_series[:5000]
 
@@ -513,6 +516,9 @@ def test_multiple_timezones():
         dr.tz_convert("Australia/Canberra"),
     ]
 
+    plain_plotly_fig = make_subplots(rows=len(cs), cols=1, shared_xaxes=True)
+    plain_plotly_fig.update_layout(height=min(300, 250 * len(cs)))
+
     fr_fig = FigureResampler(
         make_subplots(rows=len(cs), cols=1, shared_xaxes=True),
         default_n_shown_samples=500,
@@ -522,13 +528,76 @@ def test_multiple_timezones():
     fr_fig.update_layout(height=min(300, 250 * len(cs)))
 
     for i, date_range in enumerate(cs, 1):
+        name = date_range.dtype.name.split(", ")[-1][:-1]
+        plain_plotly_fig.add_trace(
+            go.Scattergl(x=date_range, y=dr_v, name=name), row=i, col=1
+        )
         fr_fig.add_trace(
-            go.Scattergl(name=date_range.dtype.name.split(", ")[-1]),
+            go.Scattergl(name=name),
             hf_x=date_range,
             hf_y=dr_v,
             row=i,
             col=1,
         )
+        # Assert that the time parsing is exactly the same
+        assert plain_plotly_fig.data[0].x[0] == fr_fig.data[0].x[0]
+
+
+def test_multiple_timezones_in_single_x_index__datetimes_and_timestamps():
+    # TODO: can be improved with pytest parametrize
+    y = np.arange(20)
+
+    index1 = pd.date_range("2018-01-01", periods=10, freq="H", tz="US/Eastern")
+    index2 = pd.date_range("2018-01-02", periods=10, freq="H", tz="Asia/Dubai")
+    index_timestamps = index1.append(index2)
+    assert all(isinstance(x, pd.Timestamp) for x in index_timestamps)
+    index1_datetimes = pd.Index([x.to_pydatetime() for x in index1])
+    index_datetimes = pd.Index([x.to_pydatetime() for x in index_timestamps])
+    assert not any(isinstance(x, pd.Timestamp) for x in index_datetimes)
+    assert all(isinstance(x, datetime.datetime) for x in index_datetimes)
+
+    ## Demonstrate why we raise a ValueError if the array is still of object
+    ## dtype after a successful pd.to_datetime call
+    # String array of datetimes with same tz -> NOT object array
+    assert not pd.to_datetime(index1.astype("str")).dtype == "object"
+    assert not pd.to_datetime(index1_datetimes.astype("str")).dtype == "object"
+    # String array of datetimes with multiple tz -> object array
+    assert pd.to_datetime(index_timestamps.astype("str")).dtype == "object"
+    assert pd.to_datetime(index_datetimes.astype("str")).dtype == "object"
+
+    for index in [index_timestamps, index_datetimes]:
+        fig = go.Figure()
+        fig.add_trace(go.Scattergl(x=index, y=y))
+        with pytest.raises(ValueError):
+            fr_fig = FigureResampler(fig, default_n_shown_samples=10)
+        # Add the index as hf_x
+        fr_fig = FigureResampler(default_n_shown_samples=10)
+        with pytest.raises(ValueError):
+            fr_fig.add_trace(go.Scattergl(), hf_x=index, hf_y=y)
+        # Add hf_x as an object array of datetime values
+        fr_fig = FigureResampler(default_n_shown_samples=10)
+        with pytest.raises(ValueError):
+            fr_fig.add_trace(go.Scattergl(), hf_x=index.values.astype("object"), hf_y=y)
+        # Add hf_x as a string array
+        fr_fig = FigureResampler(default_n_shown_samples=10)
+        with pytest.raises(ValueError):
+            fr_fig.add_trace(go.Scattergl(), hf_x=index.astype(str), hf_y=y)
+        # Add hf_x as an object array of strings
+        fr_fig = FigureResampler(default_n_shown_samples=10)
+        with pytest.raises(ValueError):
+            fr_fig.add_trace(
+                go.Scattergl(), hf_x=index.astype(str).astype("object"), hf_y=y
+            )
+
+        fig = go.Figure()
+        fig.add_trace(go.Scattergl(x=index.astype("object"), y=y))
+        with pytest.raises(ValueError):
+            fr_fig = FigureResampler(fig, default_n_shown_samples=10)
+
+        fig = go.Figure()
+        fig.add_trace(go.Scattergl(x=index.astype("str"), y=y))
+        with pytest.raises(ValueError):
+            fr_fig = FigureResampler(fig, default_n_shown_samples=10)
 
 
 def test_proper_copy_of_wrapped_fig(float_series):
@@ -575,6 +644,82 @@ def test_2d_input_y():
         assert "1 dimensional" in e_info
 
 
+def test_hf_x_object_array():
+    y = np.random.randn(100)
+
+    ## Object array of datetime
+    ### Should be parsed to a pd.DatetimeIndex (is more efficient than object array)
+    x = pd.date_range("2020-01-01", freq="s", periods=100).astype("object")
+    assert x.dtype == "object"
+    assert isinstance(x[0], pd.Timestamp)
+    # Add in the scatter
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
+    assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
+    assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
+    # Add as hf_x
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
+    assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
+    assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
+
+    ## Object array of datetime strings
+    ### Should be parsed to a pd.DatetimeIndex (is more efficient than object array)
+    x = pd.date_range("2020-01-01", freq="s", periods=100).astype(str).astype("object")
+    assert x.dtype == "object"
+    assert isinstance(x[0], str)
+    # Add in the scatter
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
+    assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
+    assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
+    # Add as hf_x
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
+    assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
+    assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
+
+    ## Object array of ints
+    ### Should be parsed to an int array (is more efficient than object array)
+    x = np.arange(100).astype("object")
+    assert x.dtype == "object"
+    assert isinstance(x[0], int)
+    # Add in the scatter
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
+    assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
+    # Add as hf_x
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
+    assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
+
+    ## Object array of ints as strings
+    ### Should be parsed to an integer-dtype array (not an object array of ints)
+    x = np.arange(100).astype(str).astype("object")
+    assert x.dtype == "object"
+    assert isinstance(x[0], str)
+    # Add in the scatter
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
+    assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
+    # Add as hf_x
+    fig = FigureResampler(default_n_shown_samples=50)
+    fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
+    assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
+
+    ## Object array of strings
+    x = np.array(["x", "y"] * 50).astype("object")
+    assert x.dtype == "object"
+    # Add in the scatter
+    with pytest.raises(ValueError):
+        fig = FigureResampler(default_n_shown_samples=50)
+        fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
+    # Add as hf_x
+    with pytest.raises(ValueError):
+        fig = FigureResampler(default_n_shown_samples=50)
+        fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
+
+
 def test_time_tz_slicing():
     n = 5050
     dr = pd.Series(
diff --git a/tests/test_figurewidget_resampler.py b/tests/test_figurewidget_resampler.py