
Lttbv2 🍒 ⛏️ branch #103


Merged · 8 commits · Jul 17, 2022
84 changes: 84 additions & 0 deletions build.py
@@ -0,0 +1,84 @@
import os
import shutil
import sys

from distutils.command.build_ext import build_ext
from distutils.core import Distribution
from distutils.core import Extension
from distutils.errors import CCompilerError
from distutils.errors import DistutilsExecError
from distutils.errors import DistutilsPlatformError

import numpy as np

# C Extensions
with_extensions = True


def get_script_path():
    return os.path.dirname(os.path.realpath(sys.argv[0]))


extensions = []
if with_extensions:
    extensions = [
        Extension(
            name="plotly_resampler.aggregation.algorithms.lttbcv2",
            sources=["plotly_resampler/aggregation/algorithms/lttbcv2.c"],
            define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
            include_dirs=[np.get_include(), get_script_path()],
        ),
    ]


class BuildFailed(Exception):

    pass


class ExtBuilder(build_ext):
    # This class allows C extension building to fail.

    built_extensions = []

    def run(self):
        try:
            build_ext.run(self)
        except (DistutilsPlatformError, FileNotFoundError) as e:
            print(" Unable to build the C extensions.")
            raise e

    def build_extension(self, ext):
        try:
            build_ext.build_extension(self, ext)
        except (CCompilerError, DistutilsExecError, DistutilsPlatformError, ValueError) as e:
            print(' Unable to build the "{}" C extension, '.format(ext.name))
            raise e


def build(setup_kwargs):
    """
    This function is mandatory in order to build the extensions.
    """
    distribution = Distribution({"name": "plotly_resampler", "ext_modules": extensions})
    distribution.package_dir = "plotly_resampler"

    cmd = ExtBuilder(distribution)
    cmd.ensure_finalized()
    cmd.run()

    # Copy built extensions back to the project
    for output in cmd.get_outputs():
        relative_extension = os.path.relpath(output, cmd.build_lib)
        if not os.path.exists(output):
            continue

        shutil.copyfile(output, relative_extension)
        mode = os.stat(relative_extension).st_mode
        mode |= (mode & 0o444) >> 2
        os.chmod(relative_extension, mode)

    return setup_kwargs


if __name__ == "__main__":
    build({})
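This script follows the build-script convention used by Poetry-based projects: the packaging tool imports the module and calls build(setup_kwargs) at build/install time (the pyproject.toml wiring is not part of this diff, so that part is an assumption). A minimal sketch of exercising the same entry point by hand:

# Hypothetical manual invocation; a Poetry-style build normally calls
# build(setup_kwargs) itself. Run from the repository root.
from build import build

build({})  # compiles lttbcv2.c via ExtBuilder and copies it next to its package

# once built, the extension is importable from its package path
from plotly_resampler.aggregation.algorithms import lttbcv2
print(lttbcv2.downsample_return_index.__doc__)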
2 changes: 1 addition & 1 deletion plotly_resampler/aggregation/aggregation_interface.py
@@ -95,7 +95,7 @@ def _insert_gap_none(self, s: pd.Series) -> pd.Series:
         df_gap_idx = s.index.values[s_idx_diff > 3 * med_diff]
         if len(df_gap_idx):
             df_res_gap = pd.Series(
-                index=df_gap_idx, data=None, name=s.name, copy=False
+                index=df_gap_idx, data=None, name=s.name, copy=False, dtype=s.dtype
             )
 
             if isinstance(df_res_gap.index, pd.DatetimeIndex):
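The added dtype=s.dtype matters because an all-None Series otherwise falls back to a default dtype (object, or float64 depending on the pandas version), which can upcast the series the gap markers are merged into. A minimal sketch of the difference, with synthetic data:

import numpy as np
import pandas as pd

s = pd.Series(np.arange(3, dtype="float32"))

gap_default = pd.Series(index=[10, 20], data=None)               # dtype decided by pandas
gap_typed = pd.Series(index=[10, 20], data=None, dtype=s.dtype)  # dtype pinned to the parent

print(pd.concat([s, gap_default]).dtype)  # object or float64 -> upcast
print(pd.concat([s, gap_typed]).dtype)    # float32 -> preserved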
61 changes: 19 additions & 42 deletions plotly_resampler/aggregation/aggregators.py
@@ -11,11 +11,11 @@
 
 import math
 
-import lttbc
 import numpy as np
 import pandas as pd
 
 from ..aggregation.aggregation_interface import AbstractSeriesAggregator
+from .algorithms import lttbcv2

class LTTB(AbstractSeriesAggregator):
@@ -73,42 +73,19 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
         )
 
     def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # if we have categorical data, LTTB will convert the categorical values into
         # their numeric codes, i.e., the index position of the category array
         s_v = s.cat.codes.values if str(s.dtype) == "category" else s.values
-        s_i = s.index.values
-
-        if s_i.dtype.type == np.datetime64:
-            # lttbc does not support this datatype -> convert to int
-            # (where the time is represented in ns)
-            # REMARK:
-            #   -> additional logic is needed to mitigate rounding errors
-            #   First, the start offset is subtracted, after which the input series
-            #   is set in the already requested format, i.e. np.float64
-
-            # NOTE -> Rounding errors can still persist, but this approach is already
-            #   significantly less prone to it than the previous implementation.
-            s_i0 = s_i[0].astype(np.int64)
-            idx, data = lttbc.downsample(
-                (s_i.astype(np.int64) - s_i0).astype(np.float64), s_v, n_out
-            )
-
-            # add the start-offset and convert back to datetime
-            idx = pd.to_datetime(
-                idx.astype(np.int64) + s_i0, unit="ns", utc=True
-            ).tz_convert(s.index.tz)
-        else:
-            idx, data = lttbc.downsample(s_i, s_v, n_out)
-            idx = idx.astype(s_i.dtype)
+        s_i = s.index.values
+        s_i = s_i.astype(np.int64) if s_i.dtype.type == np.datetime64 else s_i
 
-        if str(s.dtype) == "category":
-            # reconvert the downsampled numeric codes to the category array
-            data = np.vectorize(s.dtype.categories.values.item)(data.astype(s_v.dtype))
-        else:
-            # default case, use the series its dtype as return type
-            data = data.astype(s.dtype)
+        index = lttbcv2.downsample_return_index(s_i, s_v, n_out)
 
-        return pd.Series(index=idx, data=data, name=str(s.name), copy=False)
+        return pd.Series(
+            index=s.index[index],
+            data=s.values[index],
+            name=str(s.name),
+            copy=False,
+        )


class MinMaxOverlapAggregator(AbstractSeriesAggregator):
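The payoff of the new downsample_return_index API shows in the hunk above: because only positions come back, s.index[index] and s.values[index] slice the original series directly, so timezone-aware indices and categorical values survive without the old round-trip conversions. A rough usage sketch against the new extension (synthetic data, illustrative only):

import numpy as np
import pandas as pd
from plotly_resampler.aggregation.algorithms import lttbcv2

idx = pd.date_range("2022-01-01", periods=100_000, freq="ms", tz="UTC")
s = pd.Series(np.random.randn(idx.size), index=idx)

# datetimes go in as int64 nanoseconds; only *indices* come back out
selected = lttbcv2.downsample_return_index(
    s.index.values.astype(np.int64), s.values, 1_000
)
out = pd.Series(index=s.index[selected], data=s.values[selected])
print(out.size, out.index.tz)  # 1000 UTC -> the timezone survives untouched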
@@ -166,14 +143,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # Calculate the argmin & argmax on the reshaped view of `s` &
         # add the corresponding offset
         argmin = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmin(axis=1)
             + offset
         )
         argmax = (
-            s.iloc[argmax_offset : block_size * offset.shape[0] + argmax_offset]
-            .values.reshape(-1, block_size)
+            s.values[argmax_offset : block_size * offset.shape[0] + argmax_offset]
+            .reshape(-1, block_size)
             .argmax(axis=1)
             + offset
             + argmax_offset
@@ -231,14 +208,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
         # Calculate the argmin & argmax on the reshaped view of `s` &
         # add the corresponding offset
         argmin = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmin(axis=1)
             + offset
         )
         argmax = (
-            s.iloc[: block_size * offset.shape[0]]
-            .values.reshape(-1, block_size)
+            s.values[: block_size * offset.shape[0]]
+            .reshape(-1, block_size)
             .argmax(axis=1)
             + offset
         )
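Both hunks replace s.iloc[...].values with s.values[...], dropping a pandas indexing layer before the reshape. The underlying bin trick, in isolation and with synthetic data: reshape the flat array into (n_bins, block_size) rows and let NumPy compute one argmin/argmax per row.

import numpy as np

values = np.random.randn(1_000_003)
block_size = 1_000
n_bins = values.shape[0] // block_size
offset = np.arange(n_bins) * block_size  # start position of each bin

# one extremum per bin; adding `offset` maps row-local positions back to
# global positions in `values`
argmin = values[: block_size * n_bins].reshape(-1, block_size).argmin(axis=1) + offset
argmax = values[: block_size * n_bins].reshape(-1, block_size).argmax(axis=1) + offset
print(argmin[:3], argmax[:3])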
@@ -297,7 +274,7 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
         )
 
     def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
-        if s.shape[0] > n_out * 1_000:
+        if s.shape[0] > n_out * 2_000:
             s = self.minmax._aggregate(s, n_out * 50)
         return self.lttb._aggregate(s, n_out)
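Raising the constant from 1_000 to 2_000 makes the min-max prefilter kick in later: for n_out = 1_000, the series must now exceed 2_000_000 samples before it is first reduced to n_out * 50 = 50_000 points, and only then does the (more expensive) LTTB pass bring it down to n_out.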

165 changes: 165 additions & 0 deletions plotly_resampler/aggregation/algorithms/lttbcv2.c
@@ -0,0 +1,165 @@
#define PY_SSIZE_T_CLEAN
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <Python.h>  // pull the python API into the current namespace
#include <numpy/arrayobject.h>
#include <numpy/npy_math.h>
#include <math.h>

// This code is adapted from https://github.com/dgoeries/lttbc
// Most credits are due to https://github.com/dgoeries

// The method below assumes the x-deltas are equidistant
static PyObject* downsample_return_index(PyObject *self, PyObject *args) {
    int threshold;
    PyObject *x_obj = NULL, *y_obj = NULL;
    PyArrayObject *x_array = NULL, *y_array = NULL;

    if (!PyArg_ParseTuple(args, "OOi", &x_obj, &y_obj, &threshold))
        return NULL;

    if ((!PyArray_Check(x_obj) && !PyList_Check(x_obj)) || (!PyArray_Check(y_obj) && !PyList_Check(y_obj))) {
        PyErr_SetString(PyExc_TypeError, "Function requires x and y input to be of type list or ndarray ...");
        goto fail;
    }

    // Interpret the input objects as numpy arrays, with reqs (contiguous, aligned, and writeable ...)
    x_array = (PyArrayObject *)PyArray_FROM_OTF(x_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    y_array = (PyArrayObject *)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    if (x_array == NULL || y_array == NULL) {
        goto fail;
    }

    if (PyArray_NDIM(x_array) != 1 || PyArray_NDIM(y_array) != 1) {
        PyErr_SetString(PyExc_ValueError, "Both x and y must have a single dimension ...");
        goto fail;
    }

    if (!PyArray_SAMESHAPE(x_array, y_array)) {
        PyErr_SetString(PyExc_ValueError, "Input x and y must have the same shape ...");
        goto fail;
    }

    // Declare data length and check if we actually have to downsample!
    const Py_ssize_t data_length = (Py_ssize_t)PyArray_DIM(y_array, 0);
    if (threshold >= data_length || threshold <= 0) {
        // Nothing to do!
        PyObject *value = Py_BuildValue("O", x_array);
        Py_DECREF(x_array);
        Py_DECREF(y_array);
        return value;
    }

    // Access the data in the NDArray!
    double *y = (double*)PyArray_DATA(y_array);

    // Create an empty output array with shape and dim for the output!
    npy_intp dims[1];
    dims[0] = threshold;
    PyArrayObject *sampled_x = (PyArrayObject *)PyArray_Empty(1, dims,
        PyArray_DescrFromType(NPY_INT64), 0);

    // Get a pointer to its data
    long long *sampled_x_data = (long long*)PyArray_DATA(sampled_x);

    // The main loop here!
    Py_ssize_t sampled_index = 0;
    const double every = (double)(data_length - 2) / (threshold - 2);

    Py_ssize_t a = 0;
    Py_ssize_t next_a = 0;

    // Always add the first point!
    sampled_x_data[sampled_index] = 0;

    sampled_index++;
    Py_ssize_t i;
    for (i = 0; i < threshold - 2; ++i) {
        // Calculate point average for next bucket (containing c)
        double avg_x = threshold;
        double avg_y = 0;
        Py_ssize_t avg_range_start = (Py_ssize_t)(floor((i + 1) * every) + 1);
        Py_ssize_t avg_range_end = (Py_ssize_t)(floor((i + 2) * every) + 1);
        if (avg_range_end >= data_length) {
            avg_range_end = data_length;
        }
        Py_ssize_t avg_range_length = avg_range_end - avg_range_start;

        for (; avg_range_start < avg_range_end; avg_range_start++) {
            avg_y += y[avg_range_start];
        }
        avg_y /= avg_range_length;
        avg_x = avg_range_start + every / 2;

        // Get the range for this bucket
        Py_ssize_t range_offs = (Py_ssize_t)(floor((i + 0) * every) + 1);
        Py_ssize_t range_to = (Py_ssize_t)(floor((i + 1) * every) + 1);

        // Point a
        double point_a_y = y[a];

        double max_area = -1.0;
        for (; range_offs < range_to; range_offs++) {
            // Calculate triangle area over three buckets
            double area = fabs((a - avg_x) * (y[range_offs] - point_a_y) - (a - range_offs) * (avg_y - point_a_y)) * 0.5;
            if (area > max_area) {
                max_area = area;
                next_a = range_offs;  // Next a is this b
            }
        }
        // Pick this point from the bucket
        sampled_x_data[sampled_index] = next_a;

        sampled_index++;

        // Current a becomes the next_a (chosen b)
        a = next_a;
    }

    // Always add the last point!
    sampled_x_data[sampled_index] = data_length - 1;

    // Provide our return value
    PyObject *value = Py_BuildValue("O", sampled_x);

    // And remove the references!
    Py_DECREF(x_array);
    Py_DECREF(y_array);
    Py_XDECREF(sampled_x);

    return value;

fail:
    // Py_XDECREF (not Py_DECREF) here: either array may still be NULL when a
    // type or shape check above jumps to this label
    Py_XDECREF(x_array);
    Py_XDECREF(y_array);
    return NULL;
}


// Method definition object
static PyMethodDef lttbcv2Methods[] = {
    {
        "downsample_return_index",  // The name of the method
        downsample_return_index,    // Function pointer to the method implementation
        METH_VARARGS,
        "Compute the largest triangle three buckets (LTTB) algorithm in a C extension."
    },
    {NULL, NULL, 0, NULL}  /* Sentinel */
};

static struct PyModuleDef lttbc_module_definition = {
    PyModuleDef_HEAD_INIT,
    "lttbcv2",  /* name of module */
    "A Python module that computes the largest triangle three buckets algorithm (LTTB) using C code.",
    -1,  /* size of per-interpreter state of the module,
            or -1 if the module keeps state in global variables. */
    lttbcv2Methods
};

// Module initialization
PyMODINIT_FUNC PyInit_lttbcv2(void) {
    Py_Initialize();
    import_array();
    return PyModule_Create(&lttbc_module_definition);
}
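For readers less fluent in the NumPy C API, here is a rough pure-Python re-statement of the selection loop above: a sketch that mirrors the C logic, including its equidistant-x assumption, not the shipped code.

import numpy as np

def lttb_indices(y: np.ndarray, threshold: int) -> np.ndarray:
    n = len(y)
    if threshold >= n or threshold <= 0:
        return np.arange(n)  # nothing to do (the C version hands back its x input here)
    every = (n - 2) / (threshold - 2)
    out = np.empty(threshold, dtype=np.int64)
    out[0], out[-1] = 0, n - 1  # the first and last point are always kept
    a = 0
    for i in range(threshold - 2):
        # average of the *next* bucket (the "c" point); x is treated as equidistant
        lo = int(np.floor((i + 1) * every)) + 1
        hi = min(int(np.floor((i + 2) * every)) + 1, n)
        avg_x, avg_y = hi + every / 2, y[lo:hi].mean()
        # candidate "b" points of the current bucket
        b = np.arange(int(np.floor(i * every)) + 1, int(np.floor((i + 1) * every)) + 1)
        # area of the triangle spanned by a, each candidate b, and the averaged c
        area = np.abs((a - avg_x) * (y[b] - y[a]) - (a - b) * (avg_y - y[a])) * 0.5
        a = out[i + 1] = b[area.argmax()]
    return out

# e.g. lttb_indices(np.random.randn(10_000), 250) -> 250 monotonically
# increasing int64 positions into the original array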