
Commit 8df5361

Merge branch 'main' into fea-values-query

2 parents fe72445 + ab7340b · commit 8df5361

15 files changed: +240 −52 lines changed

.github/workflows/test-upstream.yml

Lines changed: 6 additions & 2 deletions

@@ -112,6 +112,11 @@ jobs:
           use-mamba: true
           python-version: "3.8"
           channel-priority: strict
+      - name: Install Protoc
+        uses: arduino/setup-protoc@v1
+        with:
+          version: '3.x'
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
       - name: Optionally update upstream cargo dependencies
         if: env.which_upstream == 'DataFusion'
         env:
@@ -138,12 +143,11 @@ jobs:

   report-failures:
     name: Open issue for upstream dev failures
-    needs: [test-dev, cluster-dev, import-dev]
+    needs: [test-dev, import-dev]
     if: |
       always()
       && (
         needs.test-dev.result == 'failure'
-        || needs.cluster-dev.result == 'failure'
         || needs.import-dev.result == 'failure'
       )
       && github.repository == 'dask-contrib/dask-sql'

conftest.py

Lines changed: 2 additions & 0 deletions

@@ -15,6 +15,8 @@ def pytest_runtest_setup(item):
             pytest.skip("need --rungpu option to run")
         # FIXME: P2P shuffle isn't fully supported on GPU, so we must explicitly disable it
         dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
+        # manually enable cudf decimal support
+        dask.config.set({"sql.mappings.decimal_support": "cudf"})
     else:
         dask.config.set({"dataframe.shuffle.algorithm": None})
     if "queries" in item.keywords and not item.config.getoption("--runqueries"):

continuous_integration/environment-3.10-dev.yaml

Lines changed: 3 additions & 0 deletions

@@ -13,6 +13,9 @@ dependencies:
   - maturin>=0.12.8
   - mlflow
   - mock
+  # tpot imports fail with numpy >=1.24.0
+  # https://github.com/EpistasisLab/tpot/issues/1281
+  - numpy<1.24.0
   - pandas>=1.4.0
   - pre-commit
   - prompt_toolkit>=3.0.8

continuous_integration/environment-3.8-dev.yaml

Lines changed: 3 additions & 0 deletions

@@ -12,6 +12,9 @@ dependencies:
   - maturin=0.12.8
   - mlflow
   - mock
+  # tpot imports fail with numpy >=1.24.0
+  # https://github.com/EpistasisLab/tpot/issues/1281
+  - numpy<1.24.0
   - pandas=1.4.0
   - pre-commit
   - prompt_toolkit=3.0.8

continuous_integration/environment-3.9-dev.yaml

Lines changed: 3 additions & 0 deletions

@@ -13,6 +13,9 @@ dependencies:
   - maturin>=0.12.8
   - mlflow
   - mock
+  # tpot imports fail with numpy >=1.24.0
+  # https://github.com/EpistasisLab/tpot/issues/1281
+  - numpy<1.24.0
   - pandas>=1.4.0
   - pre-commit
   - prompt_toolkit>=3.0.8

continuous_integration/gpuci/environment-3.10.yaml

Lines changed: 3 additions & 1 deletion

@@ -41,7 +41,9 @@ dependencies:
   - cuml=23.06
   - dask-cudf=23.06
   - dask-cuda=23.06
-  - numpy>=1.20.1
+  # tpot imports fail with numpy >=1.24.0
+  # https://github.com/EpistasisLab/tpot/issues/1281
+  - numpy>=1.20.1, <1.24.0
   - ucx-proc=*=gpu
   - ucx-py=0.32
   - xgboost=*rapidsai23.06

continuous_integration/gpuci/environment-3.9.yaml

Lines changed: 3 additions & 1 deletion

@@ -41,7 +41,9 @@ dependencies:
   - cuml=23.06
   - dask-cudf=23.06
   - dask-cuda=23.06
-  - numpy>=1.20.1
+  # tpot imports fail with numpy >=1.24.0
+  # https://github.com/EpistasisLab/tpot/issues/1281
+  - numpy>=1.20.1, <1.24.0
   - ucx-proc=*=gpu
   - ucx-py=0.32
   - xgboost=*rapidsai23.06

dask_planner/Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default.

dask_planner/src/error.rs

Lines changed: 2 additions & 0 deletions

@@ -14,6 +14,7 @@ pub enum DaskPlannerError {
     ParserError(ParserError),
     TokenizerError(TokenizerError),
     Internal(String),
+    InvalidIOFilter(String),
 }
 
 impl Display for DaskPlannerError {
@@ -23,6 +24,7 @@ impl Display for DaskPlannerError {
             Self::ParserError(e) => write!(f, "SQL Parser Error: {e}"),
             Self::TokenizerError(e) => write!(f, "SQL Tokenizer Error: {e}"),
             Self::Internal(e) => write!(f, "Internal Error: {e}"),
+            Self::InvalidIOFilter(e) => write!(f, "Invalid pyarrow filter: {e} encountered. Defaulting to Dask CPU/GPU bound task operation"),
         }
     }
 }

dask_planner/src/sql/logical/table_scan.rs

Lines changed: 141 additions & 2 deletions

@@ -1,12 +1,13 @@
 use std::sync::Arc;
 
 use datafusion_python::{
-    datafusion_common::DFSchema,
-    datafusion_expr::{logical_plan::TableScan, LogicalPlan},
+    datafusion_common::{DFSchema, ScalarValue},
+    datafusion_expr::{logical_plan::TableScan, Expr, LogicalPlan},
 };
 use pyo3::prelude::*;
 
 use crate::{
+    error::DaskPlannerError,
     expression::{py_expr_list, PyExpr},
     sql::exceptions::py_type_err,
 };
@@ -18,6 +19,138 @@ pub struct PyTableScan {
     input: Arc<LogicalPlan>,
 }
 
+#[pyclass(name = "FilteredResult", module = "dask_planner", subclass)]
+#[derive(Debug, Clone)]
+pub struct PyFilteredResult {
+    // Certain Expr(s) do not have supporting logic in pyarrow for IO filtering
+    // at read time. Those Expr(s) cannot be ignored however. This field stores
+    // those Expr(s) so that they can be used on the Python side to create
+    // Dask operations that handle that filtering as an extra task in the graph.
+    #[pyo3(get)]
+    pub io_unfilterable_exprs: Vec<PyExpr>,
+    // Expr(s) that can have their filtering logic performed in the pyarrow IO logic
+    // are stored here in a DNF format that is expected by pyarrow.
+    #[pyo3(get)]
+    pub filtered_exprs: Vec<(String, String, Vec<PyObject>)>,
+}
+
+impl PyTableScan {
+    /// Ensures that a valid Expr variant type is present
+    fn _valid_expr_type(expr: &[Expr]) -> bool {
+        expr.iter()
+            .all(|f| matches!(f, Expr::Column(_) | Expr::Literal(_)))
+    }
+
+    /// Transform the singular Expr instance into its DNF form serialized in a Vec instance. Possibly recursively expanding
+    /// it as well if needed.
+    pub fn _expand_dnf_filter(
+        filter: &Expr,
+        py: Python,
+    ) -> Result<Vec<(String, String, Vec<PyObject>)>, DaskPlannerError> {
+        let mut filter_tuple: Vec<(String, String, Vec<PyObject>)> = Vec::new();
+
+        match filter {
+            Expr::InList {
+                expr,
+                list,
+                negated,
+            } => {
+                // Only handle simple Expr(s) for InList operations for now
+                if PyTableScan::_valid_expr_type(list) {
+                    // While ANSI SQL would not allow for anything other than a Column or Literal
+                    // value in this "identifying" `expr` we explicitly check that here just to be sure.
+                    // IF it is something else it is returned to Dask to handle
+                    let ident = match *expr.clone() {
+                        Expr::Column(col) => Ok(col.name),
+                        Expr::Alias(_, name) => Ok(name),
+                        Expr::Literal(val) => Ok(format!("{}", val)),
+                        _ => Err(DaskPlannerError::InvalidIOFilter(format!(
+                            "Invalid InList Expr type `{}`. using in Dask instead",
+                            filter
+                        ))),
+                    };
+
+                    let op = if *negated { "not in" } else { "in" };
+                    let il: Result<Vec<PyObject>, DaskPlannerError> = list
+                        .iter()
+                        .map(|f| match f {
+                            Expr::Column(col) => Ok(col.name.clone().into_py(py)),
+                            Expr::Alias(_, name) => Ok(name.clone().into_py(py)),
+                            Expr::Literal(val) => match val {
+                                ScalarValue::Boolean(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::Float32(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::Float64(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::Int8(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::Int16(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::Int32(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::Int64(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::UInt8(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::UInt16(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::UInt32(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::UInt64(val) => Ok(val.unwrap().into_py(py)),
+                                ScalarValue::Utf8(val) => Ok(val.clone().unwrap().into_py(py)),
+                                ScalarValue::LargeUtf8(val) => Ok(val.clone().unwrap().into_py(py)),
+                                _ => Err(DaskPlannerError::InvalidIOFilter(format!(
+                                    "Unsupported ScalarValue `{}` encountered. using in Dask instead",
+                                    filter
+                                ))),
+                            },
+                            _ => Ok(f.canonical_name().into_py(py)),
+                        })
+                        .collect();
+
+                    filter_tuple.push((
+                        ident.unwrap_or(expr.canonical_name()),
+                        op.to_string(),
+                        il?,
+                    ));
+                    Ok(filter_tuple)
+                } else {
+                    let er = DaskPlannerError::InvalidIOFilter(format!(
+                        "Invalid identifying column Expr instance `{}`. using in Dask instead",
+                        filter
+                    ));
+                    Err::<Vec<(String, String, Vec<PyObject>)>, DaskPlannerError>(er)
+                }
+            }
+            _ => {
+                let er = DaskPlannerError::InvalidIOFilter(format!(
+                    "Unable to apply filter: `{}` to IO reader, using in Dask instead",
+                    filter
+                ));
+                Err::<Vec<(String, String, Vec<PyObject>)>, DaskPlannerError>(er)
+            }
+        }
+    }
+
+    /// Consume the `TableScan` filters (Expr(s)) and convert them into a PyArrow understandable
+    /// DNF format that can be directly passed to PyArrow IO readers for Predicate Pushdown. Expr(s)
+    /// that cannot be converted to correlating PyArrow IO calls will be returned as is and can be
+    /// used in the Python logic to form Dask tasks for the graph to do computational filtering.
+    pub fn _expand_dnf_filters(
+        input: &Arc<LogicalPlan>,
+        filters: &[Expr],
+        py: Python,
+    ) -> PyFilteredResult {
+        let mut filtered_exprs: Vec<(String, String, Vec<PyObject>)> = Vec::new();
+        let mut unfiltered_exprs: Vec<PyExpr> = Vec::new();
+
+        filters
+            .iter()
+            .for_each(|f| match PyTableScan::_expand_dnf_filter(f, py) {
+                Ok(mut expanded_dnf_filter) => filtered_exprs.append(&mut expanded_dnf_filter),
+                Err(_e) => {
+                    unfiltered_exprs.push(PyExpr::from(f.clone(), Some(vec![input.clone()])))
+                }
+            });
+
+        PyFilteredResult {
+            io_unfilterable_exprs: unfiltered_exprs,
+            filtered_exprs,
+        }
+    }
+}
+
 #[pymethods]
 impl PyTableScan {
     #[pyo3(name = "getTableScanProjects")]
@@ -45,6 +178,12 @@ impl PyTableScan {
     fn scan_filters(&self) -> PyResult<Vec<PyExpr>> {
         py_expr_list(&self.input, &self.table_scan.filters)
     }
+
+    #[pyo3(name = "getDNFFilters")]
+    fn dnf_io_filters(&self, py: Python) -> PyResult<PyFilteredResult> {
+        let results = PyTableScan::_expand_dnf_filters(&self.input, &self.table_scan.filters, py);
+        Ok(results)
+    }
 }
 
 impl TryFrom<LogicalPlan> for PyTableScan {
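
For context, each tuple in `filtered_exprs` follows the `(column, operator, values)` DNF convention that PyArrow-backed readers accept for predicate pushdown. A minimal sketch of how such a filter is consumed on the Python side (the column and file names here are hypothetical, not part of this commit):

    import dask.dataframe as dd

    # A predicate such as `WHERE id IN (1, 2, 3)` would come back from
    # getDNFFilters() as the DNF tuple ("id", "in", [1, 2, 3]).
    filters = [("id", "in", [1, 2, 3])]

    # Dask's parquet reader accepts DNF filter tuples and pushes them into the
    # pyarrow scan, so rows are dropped at IO time rather than in a later task.
    df = dd.read_parquet("data.parquet", filters=filters)

Expressions that cannot be expressed this way are returned in `io_unfilterable_exprs` and applied as ordinary Dask filtering tasks instead.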

dask_sql/mappings.py

Lines changed: 21 additions & 10 deletions

@@ -1,19 +1,14 @@
 import logging
-from decimal import Decimal
 from typing import Any
 
 import dask.array as da
+import dask.config as dask_config
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 
 from dask_planner.rust import DaskTypeMap, SqlTypeName
 
-try:
-    import cudf
-except ImportError:
-    cudf = None
-
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +49,7 @@
 _SQL_TO_PYTHON_SCALARS = {
     "SqlTypeName.DOUBLE": np.float64,
     "SqlTypeName.FLOAT": np.float32,
-    "SqlTypeName.DECIMAL": Decimal,
+    "SqlTypeName.DECIMAL": np.float32,
     "SqlTypeName.BIGINT": np.int64,
     "SqlTypeName.INTEGER": np.int32,
     "SqlTypeName.SMALLINT": np.int16,
@@ -71,8 +66,7 @@
 _SQL_TO_PYTHON_FRAMES = {
     "SqlTypeName.DOUBLE": np.float64,
     "SqlTypeName.FLOAT": np.float32,
-    # a column of Decimals in pandas is `object`, but cuDF has a dedicated dtype
-    "SqlTypeName.DECIMAL": object if not cudf else cudf.Decimal128Dtype(38, 10),
+    "SqlTypeName.DECIMAL": np.float64,  # We use np.float64 always, even though we might be able to use a smaller type
     "SqlTypeName.BIGINT": pd.Int64Dtype(),
     "SqlTypeName.INTEGER": pd.Int32Dtype(),
     "SqlTypeName.SMALLINT": pd.Int16Dtype(),
@@ -151,6 +145,14 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any:
 
         return literal_value
 
+    elif (
+        sql_type == SqlTypeName.DECIMAL
+        and dask_config.get("sql.mappings.decimal_support") == "cudf"
+    ):
+        from decimal import Decimal
+
+        python_type = Decimal
+
     elif sql_type == SqlTypeName.INTERVAL_DAY:
         return np.timedelta64(literal_value[0], "D") + np.timedelta64(
             literal_value[1], "ms"
@@ -219,7 +221,16 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any:
 def sql_to_python_type(sql_type: "SqlTypeName", *args) -> type:
     """Turn an SQL type into a dataframe dtype"""
     try:
-        if str(sql_type) == "SqlTypeName.DECIMAL":
+        if (
+            sql_type == SqlTypeName.DECIMAL
+            and dask_config.get("sql.mappings.decimal_support") == "cudf"
+        ):
+            try:
+                import cudf
+            except ImportError:
+                raise ModuleNotFoundError(
+                    "Setting `sql.mappings.decimal_support=cudf` requires cudf"
+                )
             return cudf.Decimal128Dtype(*args)
         return _SQL_TO_PYTHON_FRAMES[str(sql_type)]
     except KeyError:  # pragma: no cover
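
A rough sketch of how the new `sql.mappings.decimal_support` setting changes `sql_to_python_type` (assuming a dask-sql installation with the Rust planner built; cudf is only needed for the second call):

    import dask.config
    from dask_planner.rust import SqlTypeName
    from dask_sql.mappings import sql_to_python_type

    # Default ("pandas") handling: DECIMAL maps to plain numpy.float64.
    print(sql_to_python_type(SqlTypeName.DECIMAL, 38, 10))

    # Opting in to cudf handling returns cudf.Decimal128Dtype(38, 10) instead,
    # and raises ModuleNotFoundError if cudf is not installed.
    with dask.config.set({"sql.mappings.decimal_support": "cudf"}):
        print(sql_to_python_type(SqlTypeName.DECIMAL, 38, 10))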

dask_sql/sql-schema.yaml

Lines changed: 9 additions & 0 deletions

@@ -75,3 +75,12 @@ properties:
       optimization (when possible). ``nelem`` is defined as the limit or ``k`` value times the
       number of columns. Default is 1000000, corresponding to a LIMIT clause of 1 million in a
       1 column table.
+
+  mappings:
+    type: object
+    properties:
+
+      decimal_support:
+        type: string
+        description:
+          Decides how to handle decimal scalars/columns. ``"pandas"`` handling will treat decimal scalars and columns as floats and float64 columns, respectively, while ``"cudf"`` handling treats decimal scalars as ``decimal.Decimal`` objects and decimal columns as ``cudf.Decimal128Dtype`` columns, handling precision/scale accordingly. Default is ``"pandas"``, but ``"cudf"`` should be used if attempting to work with decimal columns on GPU.

dask_sql/sql.yaml

Lines changed: 3 additions & 0 deletions

@@ -18,3 +18,6 @@ sql:
 
   sort:
     topk-nelem-limit: 1000000
+
+  mappings:
+    decimal_support: "pandas"
