Skip to content

Commit 1b8ee36

Browse files
jvdd and jonasvdd
authored
🐛 parse object arrays for hf_x (#116)
* 🐛 parse object arrays for datetime or numeric dtypes, fixes #115 * 🧹 * 🙏 update poetry install in ci-cd testing (#117) * 🙏 * 🙈 * 🙏 * ♻️ improve object array parsing for multiple tzs * ♻️ could be optimized * 🧹 optimized + extend testing * 📝 make exceptions more explicit * 🧹 cleanup tests * 🙈 also parse datetime.datetime * 🐌 fix slow pandas backend bug + 🧹 * ♻️ raise ValueError when multiple time zones in hf_x * 🧹 * 🖊️ review * 🖊️ review Co-authored-by: jonasvdd <[email protected]> Co-authored-by: Jonas Van Der Donckt <[email protected]>
1 parent 71b4efe commit 1b8ee36

File tree

4 files changed

+338
-24
lines changed

4 files changed

+338
-24
lines changed

plotly_resampler/aggregation/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,13 @@
1616
MinMaxAggregator,
1717
MinMaxOverlapAggregator,
1818
)
19+
20+
__all__ = [
21+
"AbstractSeriesAggregator",
22+
"LTTB",
23+
"EfficientLTTB",
24+
"EveryNthPoint",
25+
"FuncAggregator",
26+
"MinMaxAggregator",
27+
"MinMaxOverlapAggregator",
28+
]

plotly_resampler/figure_resampler/figure_resampler_interface.py

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,8 @@ def _get_current_graph(self) -> dict:
210210
def _check_update_trace_data(
211211
self,
212212
trace: dict,
213-
start=None,
214-
end=None,
213+
start: Optional[Union[str, float]] = None,
214+
end: Optional[Union[str, float]] = None,
215215
) -> Optional[Union[dict, BaseTraceType]]:
216216
"""Check and update the data properties of the passed ``trace`` based on the
217217
slice range.
@@ -624,7 +624,7 @@ def _parse_get_trace_props(
624624
A namedtuple which serves as a datacontainer.
625625
626626
"""
627-
hf_x = (
627+
hf_x: np.ndarray = (
628628
trace["x"]
629629
if hasattr(trace, "x") and hf_x is None
630630
else hf_x.values
@@ -641,7 +641,7 @@ def _parse_get_trace_props(
641641
if isinstance(hf_y, (pd.Series, pd.Index))
642642
else hf_y
643643
)
644-
hf_y = np.asarray(hf_y)
644+
hf_y : np.ndarray = np.asarray(hf_y)
645645

646646
hf_text = (
647647
hf_text
@@ -695,6 +695,29 @@ def _parse_get_trace_props(
695695
if isinstance(hf_hovertext, np.ndarray):
696696
hf_hovertext = hf_hovertext[not_nan_mask]
697697

698+
# Try to parse the hf_x data if it is of object or string type
699+
if len(hf_x) and (
700+
hf_x.dtype.type is np.str_ or hf_x.dtype == "object"
701+
):
702+
try:
703+
# Try to parse to numeric
704+
hf_x = pd.to_numeric(hf_x, errors="raise")
705+
except (ValueError, TypeError):
706+
try:
707+
# Try to parse to datetime
708+
hf_x = pd.to_datetime(hf_x, utc=False, errors="raise")
709+
# Will be cast to object array if it contains multiple timezones.
710+
if hf_x.dtype == "object":
711+
raise ValueError(
712+
"The x-data contains multiple timezones, which is not "
713+
"supported by plotly-resampler!"
714+
)
715+
except (ValueError, TypeError):
716+
raise ValueError(
717+
"plotly-resampler requires the x-data to be numeric or "
718+
"datetime-like \nMore details in the stacktrace above."
719+
)
720+
698721
# If the categorical or string-like hf_y data is of type object (happens
699722
# when y argument is used for the trace constructor instead of hf_y), we
700723
# transform it to type string as such it will be sent as categorical data
@@ -812,9 +835,8 @@ def _add_trace_to_add_traces_kwargs(kwargs: dict) -> dict:
812835
updated_kwargs[f"{keyword}s"] = [value]
813836
else:
814837
updated_kwargs[f"{keyword}s"] = None
815-
816-
return {**kwargs, **updated_kwargs}
817838

839+
return {**kwargs, **updated_kwargs}
818840

819841
def add_trace(
820842
self,
@@ -944,16 +966,16 @@ def add_trace(
944966
if not isinstance(trace, BaseTraceType):
945967
trace = self._data_validator.validate_coerce(trace)[0]
946968

947-
# First add an UUID, as each (even the non-hf_data traces), must contain this
948-
# key for comparison. If the trace already has an UUID, we will keep it.
969+
# First add a UUID, as each (even the non-hf_data traces), must contain this
970+
# key for comparison. If the trace already has a UUID, we will keep it.
949971
uuid_str = str(uuid4()) if trace.uid is None else trace.uid
950972
trace.uid = uuid_str
951973

952974
# construct the hf_data_container
953975
# TODO in future version -> maybe regex on kwargs which start with `hf_`
954976
dc = self._parse_get_trace_props(trace, hf_x, hf_y, hf_text, hf_hovertext, check_nans)
955977

956-
# These traces will determine the autoscale RANGE!
978+
# These traces will determine the autoscale's RANGE!
957979
# -> so also store when `limit_to_view` is set.
958980
if trace["type"].lower() in self._high_frequency_traces:
959981
n_samples = len(dc.x)
@@ -1087,8 +1109,8 @@ def add_traces(
10871109
for trace in data
10881110
]
10891111

1090-
# First add an UUID, as each (even the non-hf_data traces), must contain this
1091-
# key for comparison. If the trace already has an UUID, we will keep it.
1112+
# First add a UUID, as each (even the non-hf_data traces), must contain this
1113+
# key for comparison. If the trace already has a UUID, we will keep it.
10921114
for trace in data:
10931115
uuid_str = str(uuid4()) if trace.uid is None else trace.uid
10941116
trace.uid = uuid_str
@@ -1236,7 +1258,7 @@ def construct_update_data(
12361258
cl_k = relayout_data.keys()
12371259

12381260
# ------------------ HF DATA aggregation ---------------------
1239-
# 1. Base case - there is a x-range specified in the front-end
1261+
# 1. Base case - there is an x-range specified in the front-end
12401262
start_matches = self._re_matches(re.compile(r"xaxis\d*.range\[0]"), cl_k)
12411263
stop_matches = self._re_matches(re.compile(r"xaxis\d*.range\[1]"), cl_k)
12421264
if len(start_matches) and len(stop_matches):
@@ -1370,4 +1392,4 @@ def __reduce__(self):
13701392
props["pr_props"] = {}
13711393
for k in self._get_pr_props_keys():
13721394
props["pr_props"][k] = getattr(self, k)
1373-
return (self.__class__, (props,)) # (props,) to comply with plotly magic
1395+
return self.__class__, (props,) # (props,) to comply with plotly magic

tests/test_figure_resampler.py

Lines changed: 149 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pytest
77
import time
8+
import datetime
89
import multiprocessing
910

1011
import numpy as np
@@ -17,7 +18,7 @@
1718
from plotly.subplots import make_subplots
1819
from plotly_resampler import FigureResampler, LTTB, EveryNthPoint
1920

20-
# Note: this will be used to skip / alter behavior when running browser tests on
21+
# Note: this will be used to skip / alter behavior when running browser tests on
2122
# non-linux platforms.
2223
from .utils import not_on_linux
2324

@@ -101,6 +102,7 @@ def test_add_trace_not_resampling(float_series):
101102
hf_hovertext="hovertext",
102103
)
103104

105+
104106
def test_various_dtypes(float_series):
105107
# List of dtypes supported by orjson >= 3.8
106108
valid_dtype_list = [
@@ -131,11 +133,11 @@ def test_various_dtypes(float_series):
131133
fig.full_figure_for_development()
132134

133135
# List of dtypes not supported by orjson >= 3.8
134-
invalid_dtype_list = [ np.float16 ]
136+
invalid_dtype_list = [np.float16]
135137
for invalid_dtype in invalid_dtype_list:
136138
fig = FigureResampler(go.Figure(), default_n_shown_samples=1000)
137139
# nb. datapoints < default_n_shown_samples
138-
with pytest.raises(TypeError):
140+
with pytest.raises(TypeError):
139141
# if this test fails -> orjson supports f16 => remove casting from code
140142
fig.add_trace(
141143
go.Scatter(name="float_series"),
@@ -144,6 +146,7 @@ def test_various_dtypes(float_series):
144146
)
145147
fig.full_figure_for_development()
146148

149+
147150
def test_max_n_samples(float_series):
148151
s = float_series[:5000]
149152

@@ -513,6 +516,9 @@ def test_multiple_timezones():
513516
dr.tz_convert("Australia/Canberra"),
514517
]
515518

519+
plain_plotly_fig = make_subplots(rows=len(cs), cols=1, shared_xaxes=True)
520+
plain_plotly_fig.update_layout(height=min(300, 250 * len(cs)))
521+
516522
fr_fig = FigureResampler(
517523
make_subplots(rows=len(cs), cols=1, shared_xaxes=True),
518524
default_n_shown_samples=500,
@@ -522,13 +528,76 @@ def test_multiple_timezones():
522528
fr_fig.update_layout(height=min(300, 250 * len(cs)))
523529

524530
for i, date_range in enumerate(cs, 1):
531+
name = date_range.dtype.name.split(", ")[-1][:-1]
532+
plain_plotly_fig.add_trace(
533+
go.Scattergl(x=date_range, y=dr_v, name=name), row=i, col=1
534+
)
525535
fr_fig.add_trace(
526-
go.Scattergl(name=date_range.dtype.name.split(", ")[-1]),
536+
go.Scattergl(name=name),
527537
hf_x=date_range,
528538
hf_y=dr_v,
529539
row=i,
530540
col=1,
531541
)
542+
# Assert that the time parsing is exactly the same
543+
assert plain_plotly_fig.data[0].x[0] == fr_fig.data[0].x[0]
544+
545+
546+
def test_multiple_timezones_in_single_x_index__datetimes_and_timestamps():
547+
# TODO: can be improved with pytest parametrize
548+
y = np.arange(20)
549+
550+
index1 = pd.date_range("2018-01-01", periods=10, freq="H", tz="US/Eastern")
551+
index2 = pd.date_range("2018-01-02", periods=10, freq="H", tz="Asia/Dubai")
552+
index_timestamps = index1.append(index2)
553+
assert all(isinstance(x, pd.Timestamp) for x in index_timestamps)
554+
index1_datetimes = pd.Index([x.to_pydatetime() for x in index1])
555+
index_datetimes = pd.Index([x.to_pydatetime() for x in index_timestamps])
556+
assert not any(isinstance(x, pd.Timestamp) for x in index_datetimes)
557+
assert all(isinstance(x, datetime.datetime) for x in index_datetimes)
558+
559+
## Test why we throw ValueError if array is still of object type after
560+
## successful pd.to_datetime call
561+
# String array of datetimes with same tz -> NOT object array
562+
assert not pd.to_datetime(index1.astype("str")).dtype == "object"
563+
assert not pd.to_datetime(index1_datetimes.astype("str")).dtype == "object"
564+
# String array of datetimes with multiple tz -> object array
565+
assert pd.to_datetime(index_timestamps.astype("str")).dtype == "object"
566+
assert pd.to_datetime(index_datetimes.astype("str")).dtype == "object"
567+
568+
for index in [index_timestamps, index_datetimes]:
569+
fig = go.Figure()
570+
fig.add_trace(go.Scattergl(x=index, y=y))
571+
with pytest.raises(ValueError):
572+
fr_fig = FigureResampler(fig, default_n_shown_samples=10)
573+
# Add as hf_x as index
574+
fr_fig = FigureResampler(default_n_shown_samples=10)
575+
with pytest.raises(ValueError):
576+
fr_fig.add_trace(go.Scattergl(), hf_x=index, hf_y=y)
577+
# Add as hf_x as object array of datetime values
578+
fr_fig = FigureResampler(default_n_shown_samples=10)
579+
with pytest.raises(ValueError):
580+
fr_fig.add_trace(go.Scattergl(), hf_x=index.values.astype("object"), hf_y=y)
581+
# Add as hf_x as string array
582+
fr_fig = FigureResampler(default_n_shown_samples=10)
583+
with pytest.raises(ValueError):
584+
fr_fig.add_trace(go.Scattergl(), hf_x=index.astype(str), hf_y=y)
585+
# Add as hf_x as object array of strings
586+
fr_fig = FigureResampler(default_n_shown_samples=10)
587+
with pytest.raises(ValueError):
588+
fr_fig.add_trace(
589+
go.Scattergl(), hf_x=index.astype(str).astype("object"), hf_y=y
590+
)
591+
592+
fig = go.Figure()
593+
fig.add_trace(go.Scattergl(x=index.astype("object"), y=y))
594+
with pytest.raises(ValueError):
595+
fr_fig = FigureResampler(fig, default_n_shown_samples=10)
596+
597+
fig = go.Figure()
598+
fig.add_trace(go.Scattergl(x=index.astype("str"), y=y))
599+
with pytest.raises(ValueError):
600+
fr_fig = FigureResampler(fig, default_n_shown_samples=10)
532601

533602

534603
def test_proper_copy_of_wrapped_fig(float_series):
@@ -575,6 +644,82 @@ def test_2d_input_y():
575644
assert "1 dimensional" in e_info
576645

577646

647+
def test_hf_x_object_array():
648+
y = np.random.randn(100)
649+
650+
## Object array of datetime
651+
### Should be parsed to a pd.DatetimeIndex (is more efficient than object array)
652+
x = pd.date_range("2020-01-01", freq="s", periods=100).astype("object")
653+
assert x.dtype == "object"
654+
assert isinstance(x[0], pd.Timestamp)
655+
# Add in the scatter
656+
fig = FigureResampler(default_n_shown_samples=50)
657+
fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
658+
assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
659+
assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
660+
# Add as hf_x
661+
fig = FigureResampler(default_n_shown_samples=50)
662+
fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
663+
assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
664+
assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
665+
666+
## Object array of datetime strings
667+
### Should be parsed to a pd.DatetimeIndex (is more efficient than object array)
668+
x = pd.date_range("2020-01-01", freq="s", periods=100).astype(str).astype("object")
669+
assert x.dtype == "object"
670+
assert isinstance(x[0], str)
671+
# Add in the scatter
672+
fig = FigureResampler(default_n_shown_samples=50)
673+
fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
674+
assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
675+
assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
676+
# Add as hf_x
677+
fig = FigureResampler(default_n_shown_samples=50)
678+
fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
679+
assert isinstance(fig.hf_data[0]["x"], pd.DatetimeIndex)
680+
assert isinstance(fig.hf_data[0]["x"][0], pd.Timestamp)
681+
682+
## Object array of ints
683+
### Should be parsed to an int array (is more efficient than object array)
684+
x = np.arange(100).astype("object")
685+
assert x.dtype == "object"
686+
assert isinstance(x[0], int)
687+
# Add in the scatter
688+
fig = FigureResampler(default_n_shown_samples=50)
689+
fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
690+
assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
691+
# Add as hf_x
692+
fig = FigureResampler(default_n_shown_samples=50)
693+
fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
694+
assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
695+
696+
## Object array of ints as strings
697+
### Should be an integer array where the values are int objects
698+
x = np.arange(100).astype(str).astype("object")
699+
assert x.dtype == "object"
700+
assert isinstance(x[0], str)
701+
# Add in the scatter
702+
fig = FigureResampler(default_n_shown_samples=50)
703+
fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
704+
assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
705+
# Add as hf_x
706+
fig = FigureResampler(default_n_shown_samples=50)
707+
fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
708+
assert np.issubdtype(fig.hf_data[0]["x"].dtype, np.integer)
709+
710+
## Object array of strings
711+
x = np.array(["x", "y"] * 50).astype("object")
712+
assert x.dtype == "object"
713+
# Add in the scatter
714+
with pytest.raises(ValueError):
715+
fig = FigureResampler(default_n_shown_samples=50)
716+
fig.add_trace(go.Scatter(name="blabla", x=x, y=y))
717+
# Add as hf_x
718+
with pytest.raises(ValueError):
719+
fig = FigureResampler(default_n_shown_samples=50)
720+
fig.add_trace(go.Scatter(name="blabla"), hf_x=x, hf_y=y)
721+
722+
578723
def test_time_tz_slicing():
579724
n = 5050
580725
dr = pd.Series(

0 commit comments

Comments
 (0)