Skip to content

🐛 parse object arrays for hf_x #116

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Dec 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions plotly_resampler/aggregation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,13 @@
MinMaxAggregator,
MinMaxOverlapAggregator,
)

# Explicit list of the names this package exports via ``import *``.
__all__ = [
"AbstractSeriesAggregator",
"LTTB",
"EfficientLTTB",
"EveryNthPoint",
"FuncAggregator",
"MinMaxAggregator",
"MinMaxOverlapAggregator",
]
48 changes: 35 additions & 13 deletions plotly_resampler/figure_resampler/figure_resampler_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,8 @@ def _get_current_graph(self) -> dict:
def _check_update_trace_data(
self,
trace: dict,
start=None,
end=None,
start: Optional[Union[str, float]] = None,
end: Optional[Union[str, float]] = None,
) -> Optional[Union[dict, BaseTraceType]]:
"""Check and update the passed ``trace`` its data properties based on the
slice range.
Expand Down Expand Up @@ -624,7 +624,7 @@ def _parse_get_trace_props(
A namedtuple which serves as a datacontainer.

"""
hf_x = (
hf_x: np.ndarray = (
trace["x"]
if hasattr(trace, "x") and hf_x is None
else hf_x.values
Expand All @@ -641,7 +641,7 @@ def _parse_get_trace_props(
if isinstance(hf_y, (pd.Series, pd.Index))
else hf_y
)
hf_y = np.asarray(hf_y)
hf_y : np.ndarray = np.asarray(hf_y)

hf_text = (
hf_text
Expand Down Expand Up @@ -695,6 +695,29 @@ def _parse_get_trace_props(
if isinstance(hf_hovertext, np.ndarray):
hf_hovertext = hf_hovertext[not_nan_mask]

# Try to parse the hf_x data if it is of object or string type
if len(hf_x) and (
hf_x.dtype.type is np.str_ or hf_x.dtype == "object"
):
try:
# Try to parse to numeric
hf_x = pd.to_numeric(hf_x, errors="raise")
except (ValueError, TypeError):
try:
# Try to parse to datetime
hf_x = pd.to_datetime(hf_x, utc=False, errors="raise")
# Will be cast to object array if it contains multiple timezones.
if hf_x.dtype == "object":
raise ValueError(
"The x-data contains multiple timezones, which is not "
"supported by plotly-resampler!"
)
except (ValueError, TypeError):
raise ValueError(
"plotly-resampler requires the x-data to be numeric or "
"datetime-like \nMore details in the stacktrace above."
)

# If the categorical or string-like hf_y data is of type object (happens
# when y argument is used for the trace constructor instead of hf_y), we
# transform it to type string as such it will be sent as categorical data
Expand Down Expand Up @@ -812,9 +835,8 @@ def _add_trace_to_add_traces_kwargs(kwargs: dict) -> dict:
updated_kwargs[f"{keyword}s"] = [value]
else:
updated_kwargs[f"{keyword}s"] = None

return {**kwargs, **updated_kwargs}

return {**kwargs, **updated_kwargs}

def add_trace(
self,
Expand Down Expand Up @@ -944,16 +966,16 @@ def add_trace(
if not isinstance(trace, BaseTraceType):
trace = self._data_validator.validate_coerce(trace)[0]

# First add an UUID, as each (even the non-hf_data traces), must contain this
# key for comparison. If the trace already has an UUID, we will keep it.
# First add a UUID, as each (even the non-hf_data traces), must contain this
# key for comparison. If the trace already has a UUID, we will keep it.
uuid_str = str(uuid4()) if trace.uid is None else trace.uid
trace.uid = uuid_str

# construct the hf_data_container
# TODO in future version -> maybe regex on kwargs which start with `hf_`
dc = self._parse_get_trace_props(trace, hf_x, hf_y, hf_text, hf_hovertext, check_nans)

# These traces will determine the autoscale RANGE!
# These traces will determine the autoscale's RANGE!
# -> so also store when `limit_to_view` is set.
if trace["type"].lower() in self._high_frequency_traces:
n_samples = len(dc.x)
Expand Down Expand Up @@ -1087,8 +1109,8 @@ def add_traces(
for trace in data
]

# First add an UUID, as each (even the non-hf_data traces), must contain this
# key for comparison. If the trace already has an UUID, we will keep it.
# First add a UUID, as each (even the non-hf_data traces), must contain this
# key for comparison. If the trace already has a UUID, we will keep it.
for trace in data:
uuid_str = str(uuid4()) if trace.uid is None else trace.uid
trace.uid = uuid_str
Expand Down Expand Up @@ -1236,7 +1258,7 @@ def construct_update_data(
cl_k = relayout_data.keys()

# ------------------ HF DATA aggregation ---------------------
# 1. Base case - there is a x-range specified in the front-end
# 1. Base case - there is an x-range specified in the front-end
start_matches = self._re_matches(re.compile(r"xaxis\d*.range\[0]"), cl_k)
stop_matches = self._re_matches(re.compile(r"xaxis\d*.range\[1]"), cl_k)
if len(start_matches) and len(stop_matches):
Expand Down Expand Up @@ -1370,4 +1392,4 @@ def __reduce__(self):
props["pr_props"] = {}
for k in self._get_pr_props_keys():
props["pr_props"][k] = getattr(self, k)
return (self.__class__, (props,)) # (props,) to comply with plotly magic
return self.__class__, (props,) # (props,) to comply with plotly magic
153 changes: 149 additions & 4 deletions tests/test_figure_resampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pytest
import time
import datetime
import multiprocessing

import numpy as np
Expand All @@ -17,7 +18,7 @@
from plotly.subplots import make_subplots
from plotly_resampler import FigureResampler, LTTB, EveryNthPoint

# Note: this will be used to skip / alter behavior when running browser tests on
# Note: this will be used to skip / alter behavior when running browser tests on
# non-linux platforms.
from .utils import not_on_linux

Expand Down Expand Up @@ -101,6 +102,7 @@ def test_add_trace_not_resampling(float_series):
hf_hovertext="hovertext",
)


def test_various_dtypes(float_series):
# List of dtypes supported by orjson >= 3.8
valid_dtype_list = [
Expand Down Expand Up @@ -131,11 +133,11 @@ def test_various_dtypes(float_series):
fig.full_figure_for_development()

# List of dtypes not supported by orjson >= 3.8
invalid_dtype_list = [ np.float16 ]
invalid_dtype_list = [np.float16]
for invalid_dtype in invalid_dtype_list:
fig = FigureResampler(go.Figure(), default_n_shown_samples=1000)
# nb. datapoints < default_n_shown_samples
with pytest.raises(TypeError):
with pytest.raises(TypeError):
# if this test fails -> orjson supports f16 => remove casting from code
fig.add_trace(
go.Scatter(name="float_series"),
Expand All @@ -144,6 +146,7 @@ def test_various_dtypes(float_series):
)
fig.full_figure_for_development()


def test_max_n_samples(float_series):
s = float_series[:5000]

Expand Down Expand Up @@ -513,6 +516,9 @@ def test_multiple_timezones():
dr.tz_convert("Australia/Canberra"),
]

plain_plotly_fig = make_subplots(rows=len(cs), cols=1, shared_xaxes=True)
plain_plotly_fig.update_layout(height=min(300, 250 * len(cs)))

fr_fig = FigureResampler(
make_subplots(rows=len(cs), cols=1, shared_xaxes=True),
default_n_shown_samples=500,
Expand All @@ -522,13 +528,76 @@ def test_multiple_timezones():
fr_fig.update_layout(height=min(300, 250 * len(cs)))

for i, date_range in enumerate(cs, 1):
name = date_range.dtype.name.split(", ")[-1][:-1]
plain_plotly_fig.add_trace(
go.Scattergl(x=date_range, y=dr_v, name=name), row=i, col=1
)
fr_fig.add_trace(
go.Scattergl(name=date_range.dtype.name.split(", ")[-1]),
go.Scattergl(name=name),
hf_x=date_range,
hf_y=dr_v,
row=i,
col=1,
)
# Assert that the time parsing is exactly the same
assert plain_plotly_fig.data[0].x[0] == fr_fig.data[0].x[0]


def test_multiple_timezones_in_single_x_index__datetimes_and_timestamps():
# Checks that x-data which mixes two timezones inside a single index is rejected
# with a ValueError, both when wrapping an existing figure in a FigureResampler
# and when passing the data through ``hf_x``.
# NOTE(review): the indentation of this block is not visible here; the nesting
# of the ``for`` / ``with`` bodies below should be confirmed against the repo.
# TODO: can be improved with pytest parametrize
y = np.arange(20)

# Two timezone-aware hourly ranges (US/Eastern and Asia/Dubai), appended into a
# single index that therefore holds values from two different timezones.
index1 = pd.date_range("2018-01-01", periods=10, freq="H", tz="US/Eastern")
index2 = pd.date_range("2018-01-02", periods=10, freq="H", tz="Asia/Dubai")
index_timestamps = index1.append(index2)
assert all(isinstance(x, pd.Timestamp) for x in index_timestamps)
# The same values, but stored as plain ``datetime.datetime`` objects instead of
# ``pd.Timestamp`` objects.
index1_datetimes = pd.Index([x.to_pydatetime() for x in index1])
index_datetimes = pd.Index([x.to_pydatetime() for x in index_timestamps])
assert not any(isinstance(x, pd.Timestamp) for x in index_datetimes)
assert all(isinstance(x, datetime.datetime) for x in index_datetimes)

## Test why we throw ValueError if array is still of object type after
## successful pd.to_datetime call
# String array of datetimes with same tz -> NOT object array
assert not pd.to_datetime(index1.astype("str")).dtype == "object"
assert not pd.to_datetime(index1_datetimes.astype("str")).dtype == "object"
# String array of datetimes with multiple tz -> object array
assert pd.to_datetime(index_timestamps.astype("str")).dtype == "object"
assert pd.to_datetime(index_datetimes.astype("str")).dtype == "object"

# Run the same rejection checks for the Timestamp-based and the datetime-based
# multi-timezone index.
for index in [index_timestamps, index_datetimes]:
# Wrap a plain figure whose trace already carries the index as x-data
fig = go.Figure()
fig.add_trace(go.Scattergl(x=index, y=y))
with pytest.raises(ValueError):
fr_fig = FigureResampler(fig, default_n_shown_samples=10)
# Add as hf_x as index
fr_fig = FigureResampler(default_n_shown_samples=10)
with pytest.raises(ValueError):
fr_fig.add_trace(go.Scattergl(), hf_x=index, hf_y=y)
# Add as hf_x as object array of datetime values
fr_fig = FigureResampler(default_n_shown_samples=10)
with pytest.raises(ValueError):
fr_fig.add_trace(go.Scattergl(), hf_x=index.values.astype("object"), hf_y=y)
# Add as hf_x as string array
fr_fig = FigureResampler(default_n_shown_samples=10)
with pytest.raises(ValueError):
fr_fig.add_trace(go.Scattergl(), hf_x=index.astype(str), hf_y=y)
# Add as hf_x as object array of strings
fr_fig = FigureResampler(default_n_shown_samples=10)
with pytest.raises(ValueError):
fr_fig.add_trace(
go.Scattergl(), hf_x=index.astype(str).astype("object"), hf_y=y
)

# NOTE(review): presumably the two stanzas below still belong to the loop body
# (they use the loop variable ``index``) — confirm.
# Wrap a plain figure whose x-data is the index cast to an object array
fig = go.Figure()
fig.add_trace(go.Scattergl(x=index.astype("object"), y=y))
with pytest.raises(ValueError):
fr_fig = FigureResampler(fig, default_n_shown_samples=10)

# Wrap a plain figure whose x-data is the index cast to strings
fig = go.Figure()
fig.add_trace(go.Scattergl(x=index.astype("str"), y=y))
with pytest.raises(ValueError):
fr_fig = FigureResampler(fig, default_n_shown_samples=10)


def test_proper_copy_of_wrapped_fig(float_series):
Expand Down Expand Up @@ -575,6 +644,82 @@ def test_2d_input_y():
assert "1 dimensional" in e_info


def test_hf_x_object_array():
    """Object-dtype x-data is parsed to datetime / integer data, or rejected.

    Every input is added twice: first through the ``x`` property of the scatter
    trace, then through the ``hf_x`` argument of ``add_trace``.
    """
    samples = np.random.randn(100)

    def stored_x(x_values, via_hf_x):
        # Add one trace to a fresh figure and hand back the x-data that ended
        # up in the figure's ``hf_data``.
        figure = FigureResampler(default_n_shown_samples=50)
        if via_hf_x:
            figure.add_trace(go.Scatter(name="blabla"), hf_x=x_values, hf_y=samples)
        else:
            figure.add_trace(go.Scatter(name="blabla", x=x_values, y=samples))
        return figure.hf_data[0]["x"]

    def assert_datetime_index(parsed):
        assert isinstance(parsed, pd.DatetimeIndex)
        assert isinstance(parsed[0], pd.Timestamp)

    def assert_integer_array(parsed):
        assert np.issubdtype(parsed.dtype, np.integer)

    # The order matters for readability only: trace property first, hf_x second.
    insertion_modes = (False, True)

    ## Object array of datetime
    ### Should be parsed to a pd.DatetimeIndex (more efficient than object array)
    obj_x = pd.date_range("2020-01-01", freq="s", periods=100).astype("object")
    assert obj_x.dtype == "object"
    assert isinstance(obj_x[0], pd.Timestamp)
    for via_hf_x in insertion_modes:
        assert_datetime_index(stored_x(obj_x, via_hf_x))

    ## Object array of datetime strings
    ### Should be parsed to a pd.DatetimeIndex (more efficient than object array)
    obj_x = (
        pd.date_range("2020-01-01", freq="s", periods=100)
        .astype(str)
        .astype("object")
    )
    assert obj_x.dtype == "object"
    assert isinstance(obj_x[0], str)
    for via_hf_x in insertion_modes:
        assert_datetime_index(stored_x(obj_x, via_hf_x))

    ## Object array of ints
    ### Should be parsed to an int array (more efficient than object array)
    obj_x = np.arange(100).astype("object")
    assert obj_x.dtype == "object"
    assert isinstance(obj_x[0], int)
    for via_hf_x in insertion_modes:
        assert_integer_array(stored_x(obj_x, via_hf_x))

    ## Object array of ints as strings
    ### The numeric strings should be parsed to an integer array
    obj_x = np.arange(100).astype(str).astype("object")
    assert obj_x.dtype == "object"
    assert isinstance(obj_x[0], str)
    for via_hf_x in insertion_modes:
        assert_integer_array(stored_x(obj_x, via_hf_x))

    ## Object array of strings
    ### Neither numeric nor datetime-like, so adding the trace must fail
    obj_x = np.array(["x", "y"] * 50).astype("object")
    assert obj_x.dtype == "object"
    for via_hf_x in insertion_modes:
        with pytest.raises(ValueError):
            stored_x(obj_x, via_hf_x)


def test_time_tz_slicing():
n = 5050
dr = pd.Series(
Expand Down
Loading