From 0e8e96c74e3c476c3c4a18f005a8a7db2db5b6df Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 8 May 2023 19:59:58 -0400 Subject: [PATCH 01/44] Remove local expression.rs file and refactor codebase to use datafusion_python version --- dask_planner/Cargo.lock | 176 ++-- dask_planner/Cargo.toml | 2 +- dask_planner/src/expression.rs | 930 ------------------ dask_planner/src/lib.rs | 2 - dask_planner/src/sql/logical.rs | 1 + dask_planner/src/sql/logical/aggregate.rs | 18 +- dask_planner/src/sql/logical/filter.rs | 12 +- dask_planner/src/sql/logical/join.rs | 11 +- dask_planner/src/sql/logical/limit.rs | 19 +- dask_planner/src/sql/logical/projection.rs | 27 +- .../src/sql/logical/repartition_by.rs | 20 +- dask_planner/src/sql/logical/sort.rs | 11 +- .../src/sql/logical/subquery_alias.rs | 2 +- dask_planner/src/sql/logical/table_scan.rs | 7 +- dask_planner/src/sql/logical/utils.rs | 37 + dask_planner/src/sql/logical/window.rs | 8 +- dask_planner/src/sql/table.rs | 8 +- dask_planner/src/sql/types.rs | 126 +-- 18 files changed, 248 insertions(+), 1169 deletions(-) delete mode 100644 dask_planner/src/expression.rs create mode 100644 dask_planner/src/sql/logical/utils.rs diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 234cc1d08..a1692e52f 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -113,9 +113,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "arrow" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990dfa1a9328504aa135820da1c95066537b69ad94c04881b785f64328e0fa6b" +checksum = "1aea9fcb25bbb70f7f922f95b99ca29c1013dab47f6df61a6f24861842dd7f2e" dependencies = [ "ahash", "arrow-arith", @@ -136,9 +136,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b2e52de0ab54173f9b08232b7184c26af82ee7ab4ac77c83396633c90199fa" +checksum = "8d967b42f7b12c91fd78acd396b20c2973b184c8866846674abbb00c963e93ab" dependencies = [ "arrow-array", "arrow-buffer", @@ -151,9 +151,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10849b60c17dbabb334be1f4ef7550701aa58082b71335ce1ed586601b2f423" +checksum = "3190f208ee7aa0f3596fa0098d42911dec5e123ca88c002a08b24877ad14c71e" dependencies = [ "ahash", "arrow-buffer", @@ -168,9 +168,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0746ae991b186be39933147117f8339eb1c4bbbea1c8ad37e7bf5851a1a06ba" +checksum = "5d33c733c5b6c44a0fc526f29c09546e04eb56772a7a21e48e602f368be381f6" dependencies = [ "half", "num", @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b88897802515d7b193e38b27ddd9d9e43923d410a9e46307582d756959ee9595" +checksum = "abd349520b6a1ed4924ae2afc9d23330a3044319e4ec3d5b124c09e4d440ae87" dependencies = [ "arrow-array", "arrow-buffer", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c8220d9741fc37961262710ceebd8451a5b393de57c464f0267ffdda1775c0a" +checksum = "c80af3c3e290a2a7e1cc518f1471dff331878cb4af9a5b088bf030b89debf649" dependencies = [ "arrow-array", "arrow-buffer", @@ -214,9 +214,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533f937efa1aaad9dc86f6a0e382c2fa736a4943e2090c946138079bdf060cef" +checksum = "b1c8361947aaa96d331da9df3f7a08bdd8ab805a449994c97f5c4d24c4b7e2cf" dependencies = [ "arrow-buffer", "arrow-schema", @@ -226,9 +226,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18b75296ff01833f602552dff26a423fc213db8e5049b540ca4a00b1c957e41c" +checksum = "9a46ee000b9fbd1e8db6e8b26acb8c760838512b39d8c9f9d73892cb55351d50" dependencies = [ "arrow-array", "arrow-buffer", @@ -240,9 +240,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e501d3de4d612c90677594896ca6c0fa075665a7ff980dc4189bb531c17e19f6" +checksum = "4bf2366607be867ced681ad7f272371a5cf1fc2941328eef7b4fee14565166fb" dependencies = [ "arrow-array", "arrow-buffer", @@ -254,14 +254,15 @@ dependencies = [ "indexmap", "lexical-core", "num", + "serde", "serde_json", ] [[package]] name = "arrow-ord" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33d2671eb3793f9410230ac3efb0e6d36307be8a2dac5fad58ac9abde8e9f01e" +checksum = "304069901c867200e21ec868ae7521165875470ef2f1f6d58f979a443d63997e" dependencies = [ "arrow-array", "arrow-buffer", @@ -274,9 +275,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc11fa039338cebbf4e29cf709c8ac1d6a65c7540063d4a25f991ab255ca85c8" +checksum = "0d57fe8ceef3392fdd493269d8a2d589de17bafce151aacbffbddac7a57f441a" dependencies = [ "ahash", "arrow-array", @@ -289,18 +290,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d04f17f7b86ded0b5baf98fe6123391c4343e031acc3ccc5fa604cc180bff220" +checksum = "a16b88a93ac8350f0200b1cd336a1f887315925b8dd7aa145a37b8bdbd8497a4" dependencies = [ "bitflags 2.2.1", ] [[package]] name = "arrow-select" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "163e35de698098ff5f5f672ada9dc1f82533f10407c7a11e2cd09f3bcf31d18a" +checksum = "98e8a4d6ca37d5212439b24caad4d80743fcbb706706200dd174bb98e68fe9d8" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,9 +312,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdfbed1b10209f0dc68e6aa4c43dc76079af65880965c7c3b73f641f23d4aba" +checksum = "cbb594efa397eb6a546f42b1f8df3d242ea84dbfda5232e06035dc2b2e2c8459" dependencies = [ "arrow-array", "arrow-buffer", @@ -523,12 +524,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" dependencies = [ "iana-time-zone", - "js-sys", "num-integer", "num-traits", "serde", - "time 0.1.45", - "wasm-bindgen", "winapi", ] @@ -742,13 +740,15 @@ dependencies = [ [[package]] name = "datafusion" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bdb93fee4f30368f1f71bfd5cd28882ec9fab0183db7924827b76129d33227c" +checksum = "a8a7d4b334f4512ff2fdbce87f511f570ae895af1ac7c729e77c12583253b22a" dependencies = [ "ahash", "apache-avro", "arrow", + "arrow-array", + "arrow-schema", "async-compression", "async-trait", "bytes", @@ -792,9 +792,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82401ce129e601d406012b6d718f8978ba84c386e1c342fa155877120d68824" +checksum = "80abfcb1dbc6390f952f21de9069e6177ad6318fcae5fbceabb50666d96533dd" dependencies = [ "apache-avro", "arrow", @@ -809,9 +809,9 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08b2078aed21a27239cd93f3015e492a58b0d50ebeeaf8d2236cf108ef583ce" +checksum = "df2524f1b4b58319895b112809d2a59e54fa662d0e46330a455f22882c2cb7b9" dependencies = [ "dashmap", "datafusion-common", @@ -827,9 +827,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b5b977ce9695fb4c67614266ec57f384fc11e9a9f9b3e6d0e62b9c5a9f2c1f" +checksum = "af8040b7a75b04685f4db0a1b11ffa93cd163c1bc13751df3f5cf76baabaf5a1" dependencies = [ "ahash", "arrow", @@ -839,9 +839,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0b2bb9e73ed778d1bc5af63a270f0154bf6eab5099c77668a6362296888e46b" +checksum = "74ceae25accc0f640a4238283f55f3a9fd181d55398703a4330fb2c46261e6a2" dependencies = [ "arrow", "async-trait", @@ -857,9 +857,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80cd8ea5ab0a07b1b2a3e17d5909f1b1035bd129ffeeb5c66842a32e682f8f79" +checksum = "df4cf228b312f2758cb78e93fe3d2dc602345028efdf7cfa5b338cb370d0a347" dependencies = [ "ahash", "arrow", @@ -877,6 +877,7 @@ dependencies = [ "indexmap", "itertools", "lazy_static", + "libc", "md-5", "paste", "petgraph", @@ -889,8 +890,8 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "22.0.0" -source = "git+https://github.com/apache/arrow-datafusion-python.git?rev=9493638#94936380e58a266f5dd5de6b70a06d3aa36fbe22" +version = "23.0.0" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=expr_mods#9b60d1a445efb9c9eddadd8936a54e726494b849" dependencies = [ "async-trait", "datafusion", @@ -903,6 +904,8 @@ dependencies = [ "mimalloc", "object_store", "parking_lot", + "prost", + "prost-types", "pyo3", "pyo3-build-config", "rand", @@ -915,9 +918,9 @@ dependencies = [ [[package]] name = "datafusion-row" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a95d6badab19fd6e9195fdc5209ac0a7e5ce9bcdedc67767b9ffc1b4e645760" +checksum = "b52b486fb3d81bb132e400304be01af5aba0ad6737e3518045bb98944991fe32" dependencies = [ "arrow", "datafusion-common", @@ -927,9 +930,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37a78f8fc67123c4357e63bc0c87622a2a663d26f074958d749a633d0ecde90f" +checksum = "773e985c182e41cfd68f7a7b483ab6bfb68beaac241c348cd4b1bf9f9d61b762" dependencies = [ "arrow", "arrow-schema", @@ -941,9 +944,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "22.0.0" +version = "23.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae6ed64a2005f0d78f2b1b3ec3f8148183f4523d5d364e5367115f8d8a82b7df" +checksum = "836e9b1c0ea430199c9bd4b88024cb8d617e3768ffdb412064169e2504a850ed" dependencies = [ "async-recursion", "chrono", @@ -1222,7 +1225,7 @@ checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", ] [[package]] @@ -1383,7 +1386,7 @@ dependencies = [ "bstr", "itoa", "thiserror", - "time 0.3.20", + "time", ] [[package]] @@ -2147,9 +2150,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.142" +version = "0.2.144" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" +checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" [[package]] name = "libflate" @@ -2198,9 +2201,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b64f40e5e03e0d54f03845c8197d0291253cdbedfb1cb46b13c2c117554a9f4c" +checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" [[package]] name = "lock_api" @@ -2323,7 +2326,7 @@ checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys 0.45.0", ] @@ -2519,9 +2522,9 @@ dependencies = [ [[package]] name = "parquet" -version = "36.0.0" +version = "37.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "321a15f8332645759f29875b07f8233d16ed8ec1b3582223de81625a9f8506b7" +checksum = "b5022d98333271f4ca3e87bab760498e61726bf5a6ca919123c80517e20ded29" dependencies = [ "ahash", "arrow-array", @@ -2633,9 +2636,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "ppv-lite86" @@ -2815,9 +2818,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.26" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" dependencies = [ "proc-macro2", ] @@ -2993,9 +2996,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.18" +version = "0.37.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bbfc1d1c7c40c01715f47d71444744a81669ca84e8b63e25a55e169b1f86433" +checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" dependencies = [ "bitflags 1.3.2", "errno", @@ -3107,18 +3110,18 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.160" +version = "1.0.162" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "71b2f6e1ab5c2b98c05f0f35b236b22e8df7ead6ffbf51d7808da7f8817e7ab6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.162" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "a2a0814352fd64b58489904a44ea8d90cb1a91dcb6b4f5ebabc32c8318e93cb6" dependencies = [ "proc-macro2", "quote", @@ -3286,9 +3289,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "sqlparser" -version = "0.32.0" +version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0366f270dbabb5cc2e4c88427dc4c08bba144f81e32fbd459a013f26a4d16aa0" +checksum = "355dc4d4b6207ca8a3434fc587db0a8016130a574dbcdbfb93d7f7b5bc5b211a" dependencies = [ "log", "sqlparser_derive", @@ -3441,20 +3444,9 @@ dependencies = [ [[package]] name = "time" -version = "0.1.45" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "time" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", "libc", @@ -3466,15 +3458,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" [[package]] name = "time-macros" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" +checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" dependencies = [ "time-core", ] @@ -3787,12 +3779,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index 0e45b0732..1f8cd386c 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", rev = "9493638" } +datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "expr_mods" } env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } diff --git a/dask_planner/src/expression.rs b/dask_planner/src/expression.rs deleted file mode 100644 index 150d332b8..000000000 --- a/dask_planner/src/expression.rs +++ /dev/null @@ -1,930 +0,0 @@ -use std::{borrow::Cow, convert::From, sync::Arc}; - -use datafusion_python::{ - datafusion::arrow::datatypes::DataType, - datafusion_common::{Column, DFField, DFSchema, ScalarValue}, - datafusion_expr::{ - expr::{AggregateFunction, BinaryExpr, Cast, Sort, TryCast, WindowFunction}, - lit, - utils::exprlist_to_fields, - Between, - BuiltinScalarFunction, - Case, - Expr, - GetIndexedField, - Like, - LogicalPlan, - Operator, - }, - datafusion_sql::TableReference, -}; -use pyo3::prelude::*; - -use crate::{ - error::{DaskPlannerError, Result}, - sql::{ - exceptions::{py_runtime_err, py_type_err}, - logical, - types::RexType, - }, -}; - -/// An PyExpr that can be used on a DataFrame -#[pyclass(name = "Expression", module = "datafusion", subclass)] -#[derive(Debug, Clone)] -pub struct PyExpr { - pub expr: Expr, - // Why a Vec here? Because BinaryExpr on Join might have multiple LogicalPlans - pub input_plan: Option>>, -} - -impl From for Expr { - fn from(expr: PyExpr) -> Expr { - expr.expr - } -} - -#[pyclass(name = "ScalarValue", module = "datafusion", subclass)] -#[derive(Debug, Clone)] -pub struct PyScalarValue { - pub scalar_value: ScalarValue, -} - -impl From for ScalarValue { - fn from(pyscalar: PyScalarValue) -> ScalarValue { - pyscalar.scalar_value - } -} - -impl From for PyScalarValue { - fn from(scalar_value: ScalarValue) -> PyScalarValue { - PyScalarValue { scalar_value } - } -} - -/// Convert a list of DataFusion Expr to PyExpr -pub fn py_expr_list(input: &Arc, expr: &[Expr]) -> PyResult> { - Ok(expr - .iter() - .map(|e| PyExpr::from(e.clone(), Some(vec![input.clone()]))) - .collect()) -} - -impl PyExpr { - /// Generally we would implement the `From` trait offered by Rust - /// However in this case Expr does not contain the contextual - /// `LogicalPlan` instance that we need so we need to make a instance - /// function to take and create the PyExpr. - pub fn from(expr: Expr, input: Option>>) -> PyExpr { - PyExpr { - input_plan: input, - expr, - } - } - - /// Determines the name of the `Expr` instance by examining the LogicalPlan - pub fn _column_name(&self, plan: &LogicalPlan) -> Result { - let field = expr_to_field(&self.expr, plan)?; - Ok(field.qualified_column().flat_name()) - } - - fn _rex_type(&self, expr: &Expr) -> RexType { - match expr { - Expr::Alias(..) => RexType::Alias, - Expr::Column(..) | Expr::QualifiedWildcard { .. } | Expr::GetIndexedField { .. } => { - RexType::Reference - } - Expr::ScalarVariable(..) | Expr::Literal(..) => RexType::Literal, - Expr::BinaryExpr { .. } - | Expr::Not(..) - | Expr::IsNotNull(..) - | Expr::Negative(..) - | Expr::IsNull(..) - | Expr::Like { .. } - | Expr::ILike { .. } - | Expr::SimilarTo { .. } - | Expr::Between { .. } - | Expr::Case { .. } - | Expr::Cast { .. } - | Expr::TryCast { .. } - | Expr::Sort { .. } - | Expr::ScalarFunction { .. } - | Expr::AggregateFunction { .. } - | Expr::WindowFunction { .. } - | Expr::AggregateUDF { .. } - | Expr::InList { .. } - | Expr::Wildcard - | Expr::ScalarUDF { .. } - | Expr::Exists { .. } - | Expr::InSubquery { .. } - | Expr::GroupingSet(..) - | Expr::IsTrue(..) - | Expr::IsFalse(..) - | Expr::IsUnknown(_) - | Expr::IsNotTrue(..) - | Expr::IsNotFalse(..) - | Expr::Placeholder { .. } - | Expr::OuterReferenceColumn(_, _) - | Expr::IsNotUnknown(_) => RexType::Call, - Expr::ScalarSubquery(..) => RexType::ScalarSubquery, - } - } -} - -macro_rules! extract_scalar_value { - ($self: expr, $variant: ident) => { - match $self.get_scalar_value()? { - ScalarValue::$variant(value) => Ok(*value), - other => Err(unexpected_literal_value(other)), - } - }; -} - -#[pymethods] -impl PyExpr { - #[staticmethod] - pub fn literal(value: PyScalarValue) -> PyExpr { - PyExpr::from(lit(value.scalar_value), None) - } - - /// Extracts the LogicalPlan from a Subquery, or supported Subquery sub-type, from - /// the expression instance - #[pyo3(name = "getSubqueryLogicalPlan")] - pub fn subquery_plan(&self) -> PyResult { - match &self.expr { - Expr::ScalarSubquery(subquery) => Ok(subquery.subquery.as_ref().clone().into()), - _ => Err(py_type_err(format!( - "Attempted to extract a LogicalPlan instance from invalid Expr {:?}. - Only Subquery and related variants are supported for this operation.", - &self.expr - ))), - } - } - - /// If this Expression instances references an existing - /// Column in the SQL parse tree or not - #[pyo3(name = "isInputReference")] - pub fn is_input_reference(&self) -> PyResult { - Ok(matches!(&self.expr, Expr::Column(_col))) - } - - #[pyo3(name = "toString")] - pub fn to_string(&self) -> PyResult { - Ok(format!("{}", &self.expr)) - } - - /// Gets the positional index of the Expr instance from the LogicalPlan DFSchema - #[pyo3(name = "getIndex")] - pub fn index(&self) -> PyResult { - let input: &Option>> = &self.input_plan; - match input { - Some(input_plans) if !input_plans.is_empty() => { - let mut schema: DFSchema = (**input_plans[0].schema()).clone(); - for plan in input_plans.iter().skip(1) { - schema.merge(plan.schema().as_ref()); - } - let name = get_expr_name(&self.expr).map_err(py_runtime_err)?; - schema - .index_of_column(&Column::from_qualified_name(name.clone())) - .or_else(|_| { - // Handles cases when from_qualified_name doesn't format the Column correctly. - // "name" will always contain the name of the column. Anything in addition to - // that will be separated by a '.' and should be further referenced. - let parts = name.split('.').collect::>(); - let tbl_reference = match parts.len() { - // Single element means name contains just the column name so no TableReference - 1 => None, - // Tablename.column_name - 2 => Some( - TableReference::Bare { - table: Cow::Borrowed(parts[0]), - } - .to_owned_reference(), - ), - // Schema_name.table_name.column_name - 3 => Some( - TableReference::Partial { - schema: Cow::Borrowed(parts[0]), - table: Cow::Borrowed(parts[1]), - } - .to_owned_reference(), - ), - // catalog_name.schema_name.table_name.column_name - 4 => Some( - TableReference::Full { - catalog: Cow::Borrowed(parts[0]), - schema: Cow::Borrowed(parts[1]), - table: Cow::Borrowed(parts[2]), - } - .to_owned_reference(), - ), - _ => None, - }; - - let col = Column { - relation: tbl_reference.clone(), - name: parts[parts.len() - 1].to_string(), - }; - schema.index_of_column(&col).map_err(py_runtime_err) - }) - } - _ => Err(py_runtime_err( - "We need a valid LogicalPlan instance to get the Expr's index in the schema", - )), - } - } - - /// Examine the current/"self" PyExpr and return its "type" - /// In this context a "type" is what Dask-SQL Python - /// RexConverter plugin instance should be invoked to handle - /// the Rex conversion - #[pyo3(name = "getExprType")] - pub fn get_expr_type(&self) -> PyResult { - Ok(String::from(match &self.expr { - Expr::Alias(..) - | Expr::Column(..) - | Expr::Literal(..) - | Expr::BinaryExpr { .. } - | Expr::Between { .. } - | Expr::Cast { .. } - | Expr::Sort { .. } - | Expr::ScalarFunction { .. } - | Expr::AggregateFunction { .. } - | Expr::InList { .. } - | Expr::InSubquery { .. } - | Expr::ScalarUDF { .. } - | Expr::AggregateUDF { .. } - | Expr::Exists { .. } - | Expr::ScalarSubquery(..) - | Expr::QualifiedWildcard { .. } - | Expr::Not(..) - | Expr::OuterReferenceColumn(_, _) - | Expr::GroupingSet(..) => self.expr.variant_name(), - Expr::ScalarVariable(..) - | Expr::IsNotNull(..) - | Expr::Negative(..) - | Expr::GetIndexedField { .. } - | Expr::IsNull(..) - | Expr::IsTrue(_) - | Expr::IsFalse(_) - | Expr::IsUnknown(_) - | Expr::IsNotTrue(_) - | Expr::IsNotFalse(_) - | Expr::Like { .. } - | Expr::ILike { .. } - | Expr::SimilarTo { .. } - | Expr::IsNotUnknown(_) - | Expr::Case { .. } - | Expr::TryCast { .. } - | Expr::WindowFunction { .. } - | Expr::Placeholder { .. } - | Expr::Wildcard => { - return Err(py_type_err(format!( - "Encountered unsupported expression type: {}", - &self.expr.variant_name() - ))) - } - })) - } - - /// Determines the type of this Expr based on its variant - #[pyo3(name = "getRexType")] - pub fn rex_type(&self) -> PyResult { - Ok(self._rex_type(&self.expr)) - } - - /// Python friendly shim code to get the name of a column referenced by an expression - pub fn column_name(&self, mut plan: logical::PyLogicalPlan) -> PyResult { - self._column_name(&plan.current_node()) - .map_err(py_runtime_err) - } - - /// Row expressions, Rex(s), operate on the concept of operands. This maps to expressions that are used in - /// the "call" logic of the Dask-SQL python codebase. Different variants of Expressions, Expr(s), - /// store those operands in different datastructures. This function examines the Expr variant and returns - /// the operands to the calling logic as a Vec of PyExpr instances. - #[pyo3(name = "getOperands")] - pub fn get_operands(&self) -> PyResult> { - match &self.expr { - // Expr variants that are themselves the operand to return - Expr::Column(..) | Expr::ScalarVariable(..) | Expr::Literal(..) => { - Ok(vec![PyExpr::from( - self.expr.clone(), - self.input_plan.clone(), - )]) - } - - // Expr(s) that house the Expr instance to return in their bounded params - Expr::Alias(expr, ..) - | Expr::Not(expr) - | Expr::IsNull(expr) - | Expr::IsNotNull(expr) - | Expr::IsTrue(expr) - | Expr::IsFalse(expr) - | Expr::IsUnknown(expr) - | Expr::IsNotTrue(expr) - | Expr::IsNotFalse(expr) - | Expr::IsNotUnknown(expr) - | Expr::Negative(expr) - | Expr::GetIndexedField(GetIndexedField { expr, .. }) - | Expr::Cast(Cast { expr, .. }) - | Expr::TryCast(TryCast { expr, .. }) - | Expr::Sort(Sort { expr, .. }) - | Expr::InSubquery { expr, .. } => { - Ok(vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]) - } - - // Expr variants containing a collection of Expr(s) for operands - Expr::AggregateFunction(AggregateFunction { args, .. }) - | Expr::AggregateUDF { args, .. } - | Expr::ScalarFunction { args, .. } - | Expr::ScalarUDF { args, .. } - | Expr::WindowFunction(WindowFunction { args, .. }) => Ok(args - .iter() - .map(|arg| PyExpr::from(arg.clone(), self.input_plan.clone())) - .collect()), - - // Expr(s) that require more specific processing - Expr::Case(Case { - expr, - when_then_expr, - else_expr, - }) => { - let mut operands: Vec = Vec::new(); - - if let Some(e) = expr { - operands.push(PyExpr::from(*e.clone(), self.input_plan.clone())); - }; - - for (when, then) in when_then_expr { - operands.push(PyExpr::from(*when.clone(), self.input_plan.clone())); - operands.push(PyExpr::from(*then.clone(), self.input_plan.clone())); - } - - if let Some(e) = else_expr { - operands.push(PyExpr::from(*e.clone(), self.input_plan.clone())); - }; - - Ok(operands) - } - Expr::InList { expr, list, .. } => { - let mut operands: Vec = - vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]; - for list_elem in list { - operands.push(PyExpr::from(list_elem.clone(), self.input_plan.clone())); - } - - Ok(operands) - } - Expr::BinaryExpr(BinaryExpr { left, right, .. }) => Ok(vec![ - PyExpr::from(*left.clone(), self.input_plan.clone()), - PyExpr::from(*right.clone(), self.input_plan.clone()), - ]), - Expr::Like(Like { expr, pattern, .. }) => Ok(vec![ - PyExpr::from(*expr.clone(), self.input_plan.clone()), - PyExpr::from(*pattern.clone(), self.input_plan.clone()), - ]), - Expr::ILike(Like { expr, pattern, .. }) => Ok(vec![ - PyExpr::from(*expr.clone(), self.input_plan.clone()), - PyExpr::from(*pattern.clone(), self.input_plan.clone()), - ]), - Expr::SimilarTo(Like { expr, pattern, .. }) => Ok(vec![ - PyExpr::from(*expr.clone(), self.input_plan.clone()), - PyExpr::from(*pattern.clone(), self.input_plan.clone()), - ]), - Expr::Between(Between { - expr, - negated: _, - low, - high, - }) => Ok(vec![ - PyExpr::from(*expr.clone(), self.input_plan.clone()), - PyExpr::from(*low.clone(), self.input_plan.clone()), - PyExpr::from(*high.clone(), self.input_plan.clone()), - ]), - - // Currently un-support/implemented Expr types for Rex Call operations - Expr::GroupingSet(..) - | Expr::OuterReferenceColumn(_, _) - | Expr::Wildcard - | Expr::QualifiedWildcard { .. } - | Expr::ScalarSubquery(..) - | Expr::Placeholder { .. } - | Expr::Exists { .. } => Err(py_runtime_err(format!( - "Unimplemented Expr type: {}", - self.expr - ))), - } - } - - #[pyo3(name = "getOperatorName")] - pub fn get_operator_name(&self) -> PyResult { - Ok(match &self.expr { - Expr::BinaryExpr(BinaryExpr { - left: _, - op, - right: _, - }) => format!("{op}"), - Expr::ScalarFunction { fun, args: _ } => format!("{fun}"), - Expr::ScalarUDF { fun, .. } => fun.name.clone(), - Expr::Cast { .. } => "cast".to_string(), - Expr::Between { .. } => "between".to_string(), - Expr::Case { .. } => "case".to_string(), - Expr::IsNull(..) => "is null".to_string(), - Expr::IsNotNull(..) => "is not null".to_string(), - Expr::IsTrue(_) => "is true".to_string(), - Expr::IsFalse(_) => "is false".to_string(), - Expr::IsUnknown(_) => "is unknown".to_string(), - Expr::IsNotTrue(_) => "is not true".to_string(), - Expr::IsNotFalse(_) => "is not false".to_string(), - Expr::IsNotUnknown(_) => "is not unknown".to_string(), - Expr::InList { .. } => "in list".to_string(), - Expr::Negative(..) => "negative".to_string(), - Expr::Not(..) => "not".to_string(), - Expr::Like(Like { negated, .. }) => { - if *negated { - "not like".to_string() - } else { - "like".to_string() - } - } - Expr::ILike(Like { negated, .. }) => { - if *negated { - "not ilike".to_string() - } else { - "ilike".to_string() - } - } - Expr::SimilarTo(Like { negated, .. }) => { - if *negated { - "not similar to".to_string() - } else { - "similar to".to_string() - } - } - _ => { - return Err(py_type_err(format!( - "Catch all triggered in get_operator_name: {:?}", - &self.expr - ))) - } - }) - } - - /// Gets the ScalarValue represented by the Expression - #[pyo3(name = "getType")] - pub fn get_type(&self) -> PyResult { - Ok(String::from(match &self.expr { - Expr::BinaryExpr(BinaryExpr { - left: _, - op, - right: _, - }) => match op { - Operator::Eq - | Operator::NotEq - | Operator::Lt - | Operator::LtEq - | Operator::Gt - | Operator::GtEq - | Operator::And - | Operator::Or - | Operator::IsDistinctFrom - | Operator::IsNotDistinctFrom - | Operator::RegexMatch - | Operator::RegexIMatch - | Operator::RegexNotMatch - | Operator::RegexNotIMatch => "BOOLEAN", - Operator::Plus | Operator::Minus | Operator::Multiply | Operator::Modulo => { - "BIGINT" - } - Operator::Divide => "FLOAT", - Operator::StringConcat => "VARCHAR", - Operator::BitwiseShiftLeft - | Operator::BitwiseShiftRight - | Operator::BitwiseXor - | Operator::BitwiseAnd - | Operator::BitwiseOr => { - // the type here should be the same as the type of the left expression - // but we can only compute that if we have the schema available - return Err(py_type_err( - "Bitwise operators unsupported in get_type".to_string(), - )); - } - }, - Expr::Literal(scalar_value) => match scalar_value { - ScalarValue::Boolean(_value) => "Boolean", - ScalarValue::Float32(_value) => "Float32", - ScalarValue::Float64(_value) => "Float64", - ScalarValue::Decimal128(_value, ..) => "Decimal128", - ScalarValue::Dictionary(..) => "Dictionary", - ScalarValue::Int8(_value) => "Int8", - ScalarValue::Int16(_value) => "Int16", - ScalarValue::Int32(_value) => "Int32", - ScalarValue::Int64(_value) => "Int64", - ScalarValue::UInt8(_value) => "UInt8", - ScalarValue::UInt16(_value) => "UInt16", - ScalarValue::UInt32(_value) => "UInt32", - ScalarValue::UInt64(_value) => "UInt64", - ScalarValue::Utf8(_value) => "Utf8", - ScalarValue::LargeUtf8(_value) => "LargeUtf8", - ScalarValue::Binary(_value) => "Binary", - ScalarValue::LargeBinary(_value) => "LargeBinary", - ScalarValue::Date32(_value) => "Date32", - ScalarValue::Date64(_value) => "Date64", - ScalarValue::Time32Second(_value) => "Time32", - ScalarValue::Time32Millisecond(_value) => "Time32", - ScalarValue::Time64Microsecond(_value) => "Time64", - ScalarValue::Time64Nanosecond(_value) => "Time64", - ScalarValue::Null => "Null", - ScalarValue::TimestampSecond(..) => "TimestampSecond", - ScalarValue::TimestampMillisecond(..) => "TimestampMillisecond", - ScalarValue::TimestampMicrosecond(..) => "TimestampMicrosecond", - ScalarValue::TimestampNanosecond(..) => "TimestampNanosecond", - ScalarValue::IntervalYearMonth(..) => "IntervalYearMonth", - ScalarValue::IntervalDayTime(..) => "IntervalDayTime", - ScalarValue::IntervalMonthDayNano(..) => "IntervalMonthDayNano", - ScalarValue::List(..) => "List", - ScalarValue::Struct(..) => "Struct", - ScalarValue::FixedSizeBinary(_, _) => "FixedSizeBinary", - }, - Expr::ScalarFunction { fun, args: _ } => match fun { - BuiltinScalarFunction::Abs => "Abs", - BuiltinScalarFunction::DatePart => "DatePart", - _ => { - return Err(py_type_err(format!( - "Catch all triggered for ScalarFunction in get_type; {fun:?}" - ))) - } - }, - Expr::Cast(Cast { expr: _, data_type }) => match data_type { - DataType::Null => "NULL", - DataType::Boolean => "BOOLEAN", - DataType::Int8 | DataType::UInt8 => "TINYINT", - DataType::Int16 | DataType::UInt16 => "SMALLINT", - DataType::Int32 | DataType::UInt32 => "INTEGER", - DataType::Int64 | DataType::UInt64 => "BIGINT", - DataType::Float32 => "FLOAT", - DataType::Float64 => "DOUBLE", - DataType::Timestamp { .. } => "TIMESTAMP", - DataType::Date32 | DataType::Date64 => "DATE", - DataType::Time32(..) => "TIME32", - DataType::Time64(..) => "TIME64", - DataType::Duration(..) => "DURATION", - DataType::Interval(..) => "INTERVAL", - DataType::Binary => "BINARY", - DataType::FixedSizeBinary(..) => "FIXEDSIZEBINARY", - DataType::LargeBinary => "LARGEBINARY", - DataType::Utf8 => "VARCHAR", - DataType::LargeUtf8 => "BIGVARCHAR", - DataType::List(..) => "LIST", - DataType::FixedSizeList(..) => "FIXEDSIZELIST", - DataType::LargeList(..) => "LARGELIST", - DataType::Struct(..) => "STRUCT", - DataType::Union(..) => "UNION", - DataType::Dictionary(..) => "DICTIONARY", - DataType::Decimal128(..) => "DECIMAL", - DataType::Decimal256(..) => "DECIMAL", - DataType::Map(..) => "MAP", - _ => { - return Err(py_type_err(format!( - "Catch all triggered for Cast in get_type; {data_type:?}" - ))) - } - }, - _ => { - return Err(py_type_err(format!( - "Catch all triggered in get_type; {:?}", - &self.expr - ))) - } - })) - } - - /// Gets the precision/scale represented by the Expression's decimal datatype - #[pyo3(name = "getPrecisionScale")] - pub fn get_precision_scale(&self) -> PyResult<(u8, i8)> { - Ok(match &self.expr { - Expr::Cast(Cast { expr: _, data_type }) => match data_type { - DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { - (*precision, *scale) - } - _ => { - return Err(py_type_err(format!( - "Catch all triggered for Cast in get_precision_scale; {data_type:?}" - ))) - } - }, - _ => { - return Err(py_type_err(format!( - "Catch all triggered in get_precision_scale; {:?}", - &self.expr - ))) - } - }) - } - - #[pyo3(name = "getFilterExpr")] - pub fn get_filter_expr(&self) -> PyResult> { - // TODO refactor to avoid duplication - match &self.expr { - Expr::Alias(expr, _) => match expr.as_ref() { - Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { - Some(filter) => { - Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))) - } - None => Ok(None), - }, - _ => Err(py_type_err( - "getFilterExpr() - Non-aggregate expression encountered", - )), - }, - Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { - Some(filter) => Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))), - None => Ok(None), - }, - _ => Err(py_type_err( - "getFilterExpr() - Non-aggregate expression encountered", - )), - } - } - - #[pyo3(name = "getFloat32Value")] - pub fn float_32_value(&self) -> PyResult> { - extract_scalar_value!(self, Float32) - } - - #[pyo3(name = "getFloat64Value")] - pub fn float_64_value(&self) -> PyResult> { - extract_scalar_value!(self, Float64) - } - - #[pyo3(name = "getDecimal128Value")] - pub fn decimal_128_value(&mut self) -> PyResult<(Option, u8, i8)> { - match self.get_scalar_value()? { - ScalarValue::Decimal128(value, precision, scale) => Ok((*value, *precision, *scale)), - other => Err(unexpected_literal_value(other)), - } - } - - #[pyo3(name = "getInt8Value")] - pub fn int_8_value(&self) -> PyResult> { - extract_scalar_value!(self, Int8) - } - - #[pyo3(name = "getInt16Value")] - pub fn int_16_value(&self) -> PyResult> { - extract_scalar_value!(self, Int16) - } - - #[pyo3(name = "getInt32Value")] - pub fn int_32_value(&self) -> PyResult> { - extract_scalar_value!(self, Int32) - } - - #[pyo3(name = "getInt64Value")] - pub fn int_64_value(&self) -> PyResult> { - extract_scalar_value!(self, Int64) - } - - #[pyo3(name = "getUInt8Value")] - pub fn uint_8_value(&self) -> PyResult> { - extract_scalar_value!(self, UInt8) - } - - #[pyo3(name = "getUInt16Value")] - pub fn uint_16_value(&self) -> PyResult> { - extract_scalar_value!(self, UInt16) - } - - #[pyo3(name = "getUInt32Value")] - pub fn uint_32_value(&self) -> PyResult> { - extract_scalar_value!(self, UInt32) - } - - #[pyo3(name = "getUInt64Value")] - pub fn uint_64_value(&self) -> PyResult> { - extract_scalar_value!(self, UInt64) - } - - #[pyo3(name = "getDate32Value")] - pub fn date_32_value(&self) -> PyResult> { - extract_scalar_value!(self, Date32) - } - - #[pyo3(name = "getDate64Value")] - pub fn date_64_value(&self) -> PyResult> { - extract_scalar_value!(self, Date64) - } - - #[pyo3(name = "getTime64Value")] - pub fn time_64_value(&self) -> PyResult> { - extract_scalar_value!(self, Time64Nanosecond) - } - - #[pyo3(name = "getTimestampValue")] - pub fn timestamp_value(&mut self) -> PyResult<(Option, Option)> { - match self.get_scalar_value()? { - ScalarValue::TimestampNanosecond(iv, tz) - | ScalarValue::TimestampMicrosecond(iv, tz) - | ScalarValue::TimestampMillisecond(iv, tz) - | ScalarValue::TimestampSecond(iv, tz) => Ok((*iv, tz.clone())), - other => Err(unexpected_literal_value(other)), - } - } - - #[pyo3(name = "getBoolValue")] - pub fn bool_value(&self) -> PyResult> { - extract_scalar_value!(self, Boolean) - } - - #[pyo3(name = "getStringValue")] - pub fn string_value(&self) -> PyResult> { - match self.get_scalar_value()? { - ScalarValue::Utf8(value) => Ok(value.clone()), - other => Err(unexpected_literal_value(other)), - } - } - - #[pyo3(name = "getIntervalDayTimeValue")] - pub fn interval_day_time_value(&self) -> PyResult> { - match self.get_scalar_value()? { - ScalarValue::IntervalDayTime(Some(iv)) => { - let interval = *iv as u64; - let days = (interval >> 32) as i32; - let ms = interval as i32; - Ok(Some((days, ms))) - } - ScalarValue::IntervalDayTime(None) => Ok(None), - other => Err(unexpected_literal_value(other)), - } - } - - #[pyo3(name = "getIntervalMonthDayNanoValue")] - pub fn interval_month_day_nano_value(&self) -> PyResult> { - match self.get_scalar_value()? { - ScalarValue::IntervalMonthDayNano(Some(iv)) => { - let interval = *iv as u128; - let months = (interval >> 32) as i32; - let days = (interval >> 64) as i32; - let ns = interval as i64; - Ok(Some((months, days, ns))) - } - ScalarValue::IntervalMonthDayNano(None) => Ok(None), - other => Err(unexpected_literal_value(other)), - } - } - - #[pyo3(name = "isNegated")] - pub fn is_negated(&self) -> PyResult { - match &self.expr { - Expr::Between(Between { negated, .. }) - | Expr::Exists { negated, .. } - | Expr::InList { negated, .. } - | Expr::InSubquery { negated, .. } => Ok(*negated), - _ => Err(py_type_err(format!( - "unknown Expr type {:?} encountered", - &self.expr - ))), - } - } - - #[pyo3(name = "isDistinctAgg")] - pub fn is_distinct_aggregation(&self) -> PyResult { - // TODO refactor to avoid duplication - match &self.expr { - Expr::AggregateFunction(funct) => Ok(funct.distinct), - Expr::AggregateUDF { .. } => Ok(false), - Expr::Alias(expr, _) => match expr.as_ref() { - Expr::AggregateFunction(funct) => Ok(funct.distinct), - Expr::AggregateUDF { .. } => Ok(false), - _ => Err(py_type_err( - "isDistinctAgg() - Non-aggregate expression encountered", - )), - }, - _ => Err(py_type_err( - "getFilterExpr() - Non-aggregate expression encountered", - )), - } - } - - /// Returns if a sort expressions is an ascending sort - #[pyo3(name = "isSortAscending")] - pub fn is_sort_ascending(&self) -> PyResult { - match &self.expr { - Expr::Sort(Sort { asc, .. }) => Ok(*asc), - _ => Err(py_type_err(format!( - "Provided Expr {:?} is not a sort type", - &self.expr - ))), - } - } - - /// Returns if nulls should be placed first in a sort expression - #[pyo3(name = "isSortNullsFirst")] - pub fn is_sort_nulls_first(&self) -> PyResult { - match &self.expr { - Expr::Sort(Sort { nulls_first, .. }) => Ok(*nulls_first), - _ => Err(py_type_err(format!( - "Provided Expr {:?} is not a sort type", - &self.expr - ))), - } - } - - /// Returns the escape char for like/ilike/similar to expr variants - #[pyo3(name = "getEscapeChar")] - pub fn get_escape_char(&self) -> PyResult> { - match &self.expr { - Expr::Like(Like { escape_char, .. }) - | Expr::ILike(Like { escape_char, .. }) - | Expr::SimilarTo(Like { escape_char, .. }) => Ok(*escape_char), - _ => Err(py_type_err(format!( - "Provided Expr {:?} not one of Like/ILike/SimilarTo", - &self.expr - ))), - } - } -} - -impl PyExpr { - /// Get the scalar value represented by this literal expression, returning an error - /// if this is not a literal expression - fn get_scalar_value(&self) -> Result<&ScalarValue> { - match &self.expr { - Expr::Literal(v) => Ok(v), - _ => Err(DaskPlannerError::Internal( - "get_scalar_value() called on non-literal expression".to_string(), - )), - } - } -} - -fn unexpected_literal_value(value: &ScalarValue) -> PyErr { - DaskPlannerError::Internal(format!("getValue() - Unexpected value: {value}")).into() -} - -fn get_expr_name(expr: &Expr) -> Result { - match expr { - Expr::Alias(expr, _) => get_expr_name(expr), - _ => Ok(expr.canonical_name()), - } -} - -/// Create a [DFField] representing an [Expr], given an input [LogicalPlan] to resolve against -pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { - match expr { - Expr::Sort(Sort { expr, .. }) => { - // DataFusion does not support create_name for sort expressions (since they never - // appear in projections) so we just delegate to the contained expression instead - expr_to_field(expr, input_plan) - } - _ => { - let fields = - exprlist_to_fields(&[expr.clone()], input_plan).map_err(DaskPlannerError::from)?; - Ok(fields[0].clone()) - } - } -} - -#[cfg(test)] -mod test { - use datafusion_python::{ - datafusion_common::{Column, ScalarValue}, - datafusion_expr::Expr, - }; - - use crate::{error::Result, expression::PyExpr}; - - #[test] - fn get_value_u32() -> Result<()> { - test_get_value(ScalarValue::UInt32(None))?; - test_get_value(ScalarValue::UInt32(Some(123))) - } - - #[test] - fn get_value_utf8() -> Result<()> { - test_get_value(ScalarValue::Utf8(None))?; - test_get_value(ScalarValue::Utf8(Some("hello".to_string()))) - } - - #[test] - fn get_value_non_literal() -> Result<()> { - let expr = PyExpr::from(Expr::Column(Column::from_qualified_name("a.b")), None); - let error = expr - .get_scalar_value() - .expect_err("cannot get scalar value from column"); - assert_eq!( - "Internal(\"get_scalar_value() called on non-literal expression\")", - &format!("{:?}", error) - ); - Ok(()) - } - - fn test_get_value(value: ScalarValue) -> Result<()> { - let expr = PyExpr::from(Expr::Literal(value.clone()), None); - assert_eq!(&value, expr.get_scalar_value()?); - Ok(()) - } -} diff --git a/dask_planner/src/lib.rs b/dask_planner/src/lib.rs index f5305d900..828737b17 100644 --- a/dask_planner/src/lib.rs +++ b/dask_planner/src/lib.rs @@ -3,7 +3,6 @@ use pyo3::prelude::*; mod dialect; mod error; -mod expression; mod parser; mod sql; @@ -18,7 +17,6 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { pyo3_log::init(); // Register the python classes - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index d2096ba9b..aa7baa544 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -33,6 +33,7 @@ pub mod sort; pub mod subquery_alias; pub mod table_scan; pub mod use_schema; +pub mod utils; pub mod window; use datafusion_python::{ diff --git a/dask_planner/src/sql/logical/aggregate.rs b/dask_planner/src/sql/logical/aggregate.rs index 0acc8b86e..cdc65f854 100644 --- a/dask_planner/src/sql/logical/aggregate.rs +++ b/dask_planner/src/sql/logical/aggregate.rs @@ -1,15 +1,15 @@ -use datafusion_python::datafusion_expr::{ - expr::AggregateFunction, - logical_plan::{Aggregate, Distinct}, - Expr, - LogicalPlan, +use datafusion_python::{ + datafusion_expr::{ + expr::AggregateFunction, + logical_plan::{Aggregate, Distinct}, + Expr, + LogicalPlan, + }, + expr::PyExpr, }; use pyo3::prelude::*; -use crate::{ - expression::{py_expr_list, PyExpr}, - sql::exceptions::py_type_err, -}; +use crate::sql::{exceptions::py_type_err, logical::utils::py_expr_list}; #[pyclass(name = "Aggregate", module = "dask_planner", subclass)] #[derive(Clone)] diff --git a/dask_planner/src/sql/logical/filter.rs b/dask_planner/src/sql/logical/filter.rs index a50d508ff..6de97c432 100644 --- a/dask_planner/src/sql/logical/filter.rs +++ b/dask_planner/src/sql/logical/filter.rs @@ -1,7 +1,10 @@ -use datafusion_python::datafusion_expr::{logical_plan::Filter, LogicalPlan}; +use datafusion_python::{ + datafusion_expr::{logical_plan::Filter, LogicalPlan}, + expr::PyExpr, +}; use pyo3::prelude::*; -use crate::{expression::PyExpr, sql::exceptions::py_type_err}; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "Filter", module = "dask_planner", subclass)] #[derive(Clone)] @@ -14,10 +17,7 @@ impl PyFilter { /// LogicalPlan::Filter: The PyExpr, predicate, that represents the filtering condition #[pyo3(name = "getCondition")] pub fn get_condition(&mut self) -> PyResult { - Ok(PyExpr::from( - self.filter.predicate.clone(), - Some(vec![self.filter.input.clone()]), - )) + Ok(PyExpr::from(self.filter.predicate.clone())) } } diff --git a/dask_planner/src/sql/logical/join.rs b/dask_planner/src/sql/logical/join.rs index d6c31b55b..e7744f212 100644 --- a/dask_planner/src/sql/logical/join.rs +++ b/dask_planner/src/sql/logical/join.rs @@ -7,13 +7,11 @@ use datafusion_python::{ Expr, Operator, }, + expr::PyExpr, }; use pyo3::prelude::*; -use crate::{ - expression::PyExpr, - sql::{column, exceptions::py_type_err}, -}; +use crate::sql::{column, exceptions::py_type_err}; #[pyclass(name = "Join", module = "dask_planner", subclass)] #[derive(Clone)] @@ -61,10 +59,7 @@ impl PyJoin { .iter() .fold(filters[0].clone(), |acc, expr| and(acc, expr.clone())); - Ok(Some(PyExpr::from( - root_expr, - Some(vec![self.join.left.clone(), self.join.right.clone()]), - ))) + Ok(Some(PyExpr::from(root_expr))) } else { Ok(None) } diff --git a/dask_planner/src/sql/logical/limit.rs b/dask_planner/src/sql/logical/limit.rs index 189fdeea0..12da12091 100644 --- a/dask_planner/src/sql/logical/limit.rs +++ b/dask_planner/src/sql/logical/limit.rs @@ -1,10 +1,11 @@ use datafusion_python::{ datafusion_common::ScalarValue, datafusion_expr::{logical_plan::Limit, Expr, LogicalPlan}, + expr::PyExpr, }; use pyo3::prelude::*; -use crate::{expression::PyExpr, sql::exceptions::py_type_err}; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "Limit", module = "dask_planner", subclass)] #[derive(Clone)] @@ -17,21 +18,17 @@ impl PyLimit { /// `OFFSET` specified in the query #[pyo3(name = "getSkip")] pub fn skip(&self) -> PyResult { - Ok(PyExpr::from( - Expr::Literal(ScalarValue::UInt64(Some(self.limit.skip as u64))), - Some(vec![self.limit.input.clone()]), - )) + Ok(PyExpr::from(Expr::Literal(ScalarValue::UInt64(Some( + self.limit.skip as u64, + ))))) } /// `LIMIT` specified in the query #[pyo3(name = "getFetch")] pub fn fetch(&self) -> PyResult { - Ok(PyExpr::from( - Expr::Literal(ScalarValue::UInt64(Some( - self.limit.fetch.unwrap_or(0) as u64 - ))), - Some(vec![self.limit.input.clone()]), - )) + Ok(PyExpr::from(Expr::Literal(ScalarValue::UInt64(Some( + self.limit.fetch.unwrap_or(0) as u64, + ))))) } } diff --git a/dask_planner/src/sql/logical/projection.rs b/dask_planner/src/sql/logical/projection.rs index 99ed0d684..f8d3a3ffc 100644 --- a/dask_planner/src/sql/logical/projection.rs +++ b/dask_planner/src/sql/logical/projection.rs @@ -1,7 +1,11 @@ -use datafusion_python::datafusion_expr::{logical_plan::Projection, Expr, LogicalPlan}; +use datafusion_python::{ + datafusion_expr::{logical_plan::Projection, Expr, LogicalPlan}, + expr::PyExpr, +}; use pyo3::prelude::*; -use crate::{expression::PyExpr, sql::exceptions::py_type_err}; +use super::utils::column_name; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "Projection", module = "dask_planner", subclass)] #[derive(Clone)] @@ -11,13 +15,12 @@ pub struct PyProjection { impl PyProjection { /// Projection: Gets the names of the fields that should be projected - fn projected_expressions(&mut self, local_expr: &PyExpr) -> Vec { + fn projected_expressions(local_expr: &PyExpr) -> Vec { let mut projs: Vec = Vec::new(); match &local_expr.expr { Expr::Alias(expr, _name) => { - let py_expr: PyExpr = - PyExpr::from(*expr.clone(), Some(vec![self.projection.input.clone()])); - projs.extend_from_slice(self.projected_expressions(&py_expr).as_slice()); + let py_expr: PyExpr = PyExpr::from(*expr.clone()); + projs.extend_from_slice(PyProjection::projected_expressions(&py_expr).as_slice()); } _ => projs.push(local_expr.clone()), } @@ -31,16 +34,12 @@ impl PyProjection { fn named_projects(&mut self) -> PyResult> { let mut named: Vec<(String, PyExpr)> = Vec::new(); for expression in self.projection.expr.clone() { - let py_expr: PyExpr = - PyExpr::from(expression, Some(vec![self.projection.input.clone()])); - for expr in self.projected_expressions(&py_expr) { + let py_expr: PyExpr = PyExpr::from(expression); + for expr in PyProjection::projected_expressions(&py_expr) { match expr.expr { - Expr::Alias(ex, name) => named.push(( - name.to_string(), - PyExpr::from(*ex, Some(vec![self.projection.input.clone()])), - )), + Expr::Alias(ex, name) => named.push((name.to_string(), PyExpr::from(*ex))), _ => { - if let Ok(name) = expr._column_name(&self.projection.input) { + if let Ok(name) = column_name(&expr.expr, &self.projection.input) { named.push((name, expr.clone())); } } diff --git a/dask_planner/src/sql/logical/repartition_by.rs b/dask_planner/src/sql/logical/repartition_by.rs index e931b88e7..55e8de234 100644 --- a/dask_planner/src/sql/logical/repartition_by.rs +++ b/dask_planner/src/sql/logical/repartition_by.rs @@ -1,18 +1,18 @@ -use datafusion_python::datafusion_expr::{ - logical_plan::{Partitioning, Repartition}, - Expr, - LogicalPlan, +use datafusion_python::{ + datafusion_expr::{ + logical_plan::{Partitioning, Repartition}, + Expr, + LogicalPlan, + }, + expr::PyExpr, }; use pyo3::prelude::*; -use crate::{ - expression::PyExpr, - sql::{exceptions::py_type_err, logical}, -}; +use crate::sql::{exceptions::py_type_err, logical}; #[pyclass(name = "RepartitionBy", module = "dask_planner", subclass)] pub struct PyRepartitionBy { - pub(crate) repartition: Repartition, + pub repartition: Repartition, } #[pymethods] @@ -28,7 +28,7 @@ impl PyRepartitionBy { match &self.repartition.partitioning_scheme { Partitioning::DistributeBy(distribute_list) => Ok(distribute_list .iter() - .map(|e| PyExpr::from(e.clone(), Some(vec![self.repartition.input.clone()]))) + .map(|e| PyExpr::from(e.clone())) .collect()), _ => Err(py_type_err("unexpected repartition strategy")), } diff --git a/dask_planner/src/sql/logical/sort.rs b/dask_planner/src/sql/logical/sort.rs index 9abcd3906..a92422d06 100644 --- a/dask_planner/src/sql/logical/sort.rs +++ b/dask_planner/src/sql/logical/sort.rs @@ -1,10 +1,11 @@ -use datafusion_python::datafusion_expr::{logical_plan::Sort, LogicalPlan}; +use datafusion_python::{ + datafusion_expr::{logical_plan::Sort, LogicalPlan}, + expr::PyExpr, +}; use pyo3::prelude::*; -use crate::{ - expression::{py_expr_list, PyExpr}, - sql::exceptions::py_type_err, -}; +use super::utils::py_expr_list; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "Sort", module = "dask_planner", subclass)] #[derive(Clone)] diff --git a/dask_planner/src/sql/logical/subquery_alias.rs b/dask_planner/src/sql/logical/subquery_alias.rs index 1b23e5dc4..003e02045 100644 --- a/dask_planner/src/sql/logical/subquery_alias.rs +++ b/dask_planner/src/sql/logical/subquery_alias.rs @@ -14,7 +14,7 @@ impl PySubqueryAlias { /// Returns a Vec of the sort expressions #[pyo3(name = "getAlias")] pub fn alias(&self) -> PyResult { - Ok(self.subquery_alias.alias.clone()) + Ok(self.subquery_alias.alias.clone().to_string()) } } diff --git a/dask_planner/src/sql/logical/table_scan.rs b/dask_planner/src/sql/logical/table_scan.rs index 7f9253bcd..330f54f86 100644 --- a/dask_planner/src/sql/logical/table_scan.rs +++ b/dask_planner/src/sql/logical/table_scan.rs @@ -3,13 +3,12 @@ use std::sync::Arc; use datafusion_python::{ datafusion_common::DFSchema, datafusion_expr::{logical_plan::TableScan, LogicalPlan}, + expr::PyExpr, }; use pyo3::prelude::*; -use crate::{ - expression::{py_expr_list, PyExpr}, - sql::exceptions::py_type_err, -}; +use super::utils::py_expr_list; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "TableScan", module = "dask_planner", subclass)] #[derive(Clone)] diff --git a/dask_planner/src/sql/logical/utils.rs b/dask_planner/src/sql/logical/utils.rs new file mode 100644 index 000000000..c98324095 --- /dev/null +++ b/dask_planner/src/sql/logical/utils.rs @@ -0,0 +1,37 @@ +use std::sync::Arc; + +use datafusion_python::{ + datafusion_common::DFField, + datafusion_expr::{expr::Sort, utils::exprlist_to_fields, Expr, LogicalPlan}, + expr::PyExpr, +}; +use pyo3::PyResult; + +use crate::error::DaskPlannerError; + +/// Convert a list of DataFusion Expr to PyExpr +pub fn py_expr_list(_input: &Arc, expr: &[Expr]) -> PyResult> { + Ok(expr.iter().map(|e| PyExpr::from(e.clone())).collect()) +} + +/// Determines the name of the `Expr` instance by examining the LogicalPlan +pub fn column_name(expr: &Expr, plan: &LogicalPlan) -> Result { + let field = expr_to_field(expr, plan)?; + Ok(field.qualified_column().flat_name()) +} + +/// Create a [DFField] representing an [Expr], given an input [LogicalPlan] to resolve against +pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { + match expr { + Expr::Sort(Sort { expr, .. }) => { + // DataFusion does not support create_name for sort expressions (since they never + // appear in projections) so we just delegate to the contained expression instead + expr_to_field(expr, input_plan) + } + _ => { + let fields = + exprlist_to_fields(&[expr.clone()], input_plan).map_err(DaskPlannerError::from)?; + Ok(fields[0].clone()) + } + } +} diff --git a/dask_planner/src/sql/logical/window.rs b/dask_planner/src/sql/logical/window.rs index e104ccdb3..bf589f6ac 100644 --- a/dask_planner/src/sql/logical/window.rs +++ b/dask_planner/src/sql/logical/window.rs @@ -8,14 +8,12 @@ use datafusion_python::{ WindowFrame, WindowFrameBound, }, + expr::PyExpr, }; use pyo3::prelude::*; -use crate::{ - error::DaskPlannerError, - expression::{py_expr_list, PyExpr}, - sql::exceptions::py_type_err, -}; +use super::utils::py_expr_list; +use crate::{error::DaskPlannerError, sql::exceptions::py_type_err}; #[pyclass(name = "Window", module = "dask_planner", subclass)] #[derive(Clone)] diff --git a/dask_planner/src/sql/table.rs b/dask_planner/src/sql/table.rs index f25f891ec..9807a906b 100644 --- a/dask_planner/src/sql/table.rs +++ b/dask_planner/src/sql/table.rs @@ -2,7 +2,7 @@ use std::{any::Any, sync::Arc}; use async_trait::async_trait; use datafusion_python::{ - datafusion::arrow::datatypes::{DataType, Field, SchemaRef}, + datafusion::arrow::datatypes::{DataType, SchemaRef}, datafusion_common::DFField, datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableSource}, datafusion_optimizer::utils::split_conjunction, @@ -185,9 +185,7 @@ impl DaskTable { } /// Traverses the logical plan to locate the Table associated with the query -pub(crate) fn table_from_logical_plan( - plan: &LogicalPlan, -) -> Result, DaskPlannerError> { +pub fn table_from_logical_plan(plan: &LogicalPlan) -> Result, DaskPlannerError> { match plan { LogicalPlan::Projection(projection) => table_from_logical_plan(&projection.input), LogicalPlan::Filter(filter) => table_from_logical_plan(&filter.input), @@ -195,7 +193,7 @@ pub(crate) fn table_from_logical_plan( // Get the TableProvider for this Table instance let tbl_provider: Arc = table_scan.source.clone(); let tbl_schema: SchemaRef = tbl_provider.schema(); - let fields: &Vec = tbl_schema.fields(); + let fields = tbl_schema.fields(); let mut cols: Vec<(String, DaskTypeMap)> = Vec::new(); for field in fields { diff --git a/dask_planner/src/sql/types.rs b/dask_planner/src/sql/types.rs index ceff904a6..1453a5465 100644 --- a/dask_planner/src/sql/types.rs +++ b/dask_planner/src/sql/types.rs @@ -2,7 +2,7 @@ pub mod rel_data_type; pub mod rel_data_type_field; use datafusion_python::{ - datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}, + datafusion::arrow::datatypes::{DataType, IntervalUnit}, datafusion_sql::sqlparser::{ast::DataType as SQLType, parser::Parser, tokenizer::Tokenizer}, }; use pyo3::{prelude::*, types::PyDict}; @@ -51,68 +51,68 @@ impl DaskTypeMap { #[pyo3(signature = (sql_type, **py_kwargs))] fn new(sql_type: SqlTypeName, py_kwargs: Option<&PyDict>) -> PyResult { let d_type: DataType = match sql_type { - SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { - let (unit, tz) = match py_kwargs { - Some(dict) => { - let tz: Option = match dict.get_item("tz") { - Some(e) => { - let res: PyResult = e.extract(); - Some(res.unwrap()) - } - None => None, - }; - let unit: TimeUnit = match dict.get_item("unit") { - Some(e) => { - let res: PyResult<&str> = e.extract(); - match res.unwrap() { - "Second" => TimeUnit::Second, - "Millisecond" => TimeUnit::Millisecond, - "Microsecond" => TimeUnit::Microsecond, - "Nanosecond" => TimeUnit::Nanosecond, - _ => TimeUnit::Nanosecond, - } - } - // Default to Nanosecond which is common if not present - None => TimeUnit::Nanosecond, - }; - (unit, tz) - } - // Default to Nanosecond and None for tz which is common if not present - None => (TimeUnit::Nanosecond, None), - }; - DataType::Timestamp(unit, tz) - } - SqlTypeName::TIMESTAMP => { - let (unit, tz) = match py_kwargs { - Some(dict) => { - let tz: Option = match dict.get_item("tz") { - Some(e) => { - let res: PyResult = e.extract(); - Some(res.unwrap()) - } - None => None, - }; - let unit: TimeUnit = match dict.get_item("unit") { - Some(e) => { - let res: PyResult<&str> = e.extract(); - match res.unwrap() { - "Second" => TimeUnit::Second, - "Millisecond" => TimeUnit::Millisecond, - "Microsecond" => TimeUnit::Microsecond, - "Nanosecond" => TimeUnit::Nanosecond, - _ => TimeUnit::Nanosecond, - } - } - // Default to Nanosecond which is common if not present - None => TimeUnit::Nanosecond, - }; - (unit, tz) - } - // Default to Nanosecond and None for tz which is common if not present - None => (TimeUnit::Nanosecond, None), - }; - DataType::Timestamp(unit, tz) - } + // SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { + // let (unit, tz) = match py_kwargs { + // Some(dict) => { + // let tz = match dict.get_item("tz") { + // Some(e) => { + // let res = e.extract().unwrap(); + // Some(Arc::new(res.to_owned())) + // } + // None => None, + // }; + // let unit: TimeUnit = match dict.get_item("unit") { + // Some(e) => { + // let res: PyResult<&str> = e.extract(); + // match res.unwrap() { + // "Second" => TimeUnit::Second, + // "Millisecond" => TimeUnit::Millisecond, + // "Microsecond" => TimeUnit::Microsecond, + // "Nanosecond" => TimeUnit::Nanosecond, + // _ => TimeUnit::Nanosecond, + // } + // } + // // Default to Nanosecond which is common if not present + // None => TimeUnit::Nanosecond, + // }; + // (unit, tz) + // } + // // Default to Nanosecond and None for tz which is common if not present + // None => (TimeUnit::Nanosecond, None), + // }; + // DataType::Timestamp(unit, tz) + // } + // SqlTypeName::TIMESTAMP => { + // let (unit, tz) = match py_kwargs { + // Some(dict) => { + // let tz = match dict.get_item("tz") { + // Some(e) => { + // let res: String = e.extract::().unwrap().to_owned(); + // Some(Arc::new(*res.as_str())) + // } + // None => None, + // }; + // let unit: TimeUnit = match dict.get_item("unit") { + // Some(e) => { + // let res: PyResult<&str> = e.extract(); + // match res.unwrap() { + // "Second" => TimeUnit::Second, + // "Millisecond" => TimeUnit::Millisecond, + // "Microsecond" => TimeUnit::Microsecond, + // "Nanosecond" => TimeUnit::Nanosecond, + // _ => TimeUnit::Nanosecond, + // } + // } + // // Default to Nanosecond which is common if not present + // None => TimeUnit::Nanosecond, + // }; + // (unit, tz) + // } + // // Default to Nanosecond and None for tz which is common if not present + // None => (TimeUnit::Nanosecond, None), + // }; + // DataType::Timestamp(unit, tz) + // } SqlTypeName::DECIMAL => { let (precision, scale) = match py_kwargs { Some(dict) => { From 2cff5e0f4154304a026e9e5369f85b4410ed1248 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 8 May 2023 20:58:53 -0400 Subject: [PATCH 02/44] Fix issues from previous merge introduced --- dask_planner/src/sql/logical/table_scan.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dask_planner/src/sql/logical/table_scan.rs b/dask_planner/src/sql/logical/table_scan.rs index 7b622d7ab..2948688e3 100644 --- a/dask_planner/src/sql/logical/table_scan.rs +++ b/dask_planner/src/sql/logical/table_scan.rs @@ -2,13 +2,13 @@ use std::sync::Arc; use datafusion_python::{ datafusion_common::{DFSchema, ScalarValue}, - datafusion_expr::{logical_plan::TableScan, LogicalPlan}, + datafusion_expr::{logical_plan::TableScan, Expr, LogicalPlan}, expr::PyExpr, }; use pyo3::prelude::*; use super::utils::py_expr_list; -use crate::sql::exceptions::py_type_err; +use crate::{error::DaskPlannerError, sql::exceptions::py_type_err}; #[pyclass(name = "TableScan", module = "dask_planner", subclass)] #[derive(Clone)] @@ -126,7 +126,7 @@ impl PyTableScan { /// that cannot be converted to correlating PyArrow IO calls will be returned as is and can be /// used in the Python logic to form Dask tasks for the graph to do computational filtering. pub fn _expand_dnf_filters( - input: &Arc, + _input: &Arc, filters: &[Expr], py: Python, ) -> PyFilteredResult { @@ -137,9 +137,7 @@ impl PyTableScan { .iter() .for_each(|f| match PyTableScan::_expand_dnf_filter(f, py) { Ok(mut expanded_dnf_filter) => filtered_exprs.append(&mut expanded_dnf_filter), - Err(_e) => { - unfiltered_exprs.push(PyExpr::from(f.clone(), Some(vec![input.clone()]))) - } + Err(_e) => unfiltered_exprs.push(PyExpr::from(f.clone())), }); PyFilteredResult { From 065471ed9f72a6c9e44ef989e99d1a026b04f6e1 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 8 May 2023 21:24:29 -0400 Subject: [PATCH 03/44] Uncomment section for time types --- dask_planner/src/sql/types.rs | 128 +++++++++++++++++----------------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/dask_planner/src/sql/types.rs b/dask_planner/src/sql/types.rs index 1453a5465..715820f8b 100644 --- a/dask_planner/src/sql/types.rs +++ b/dask_planner/src/sql/types.rs @@ -1,8 +1,10 @@ pub mod rel_data_type; pub mod rel_data_type_field; +use std::sync::Arc; + use datafusion_python::{ - datafusion::arrow::datatypes::{DataType, IntervalUnit}, + datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}, datafusion_sql::sqlparser::{ast::DataType as SQLType, parser::Parser, tokenizer::Tokenizer}, }; use pyo3::{prelude::*, types::PyDict}; @@ -51,68 +53,68 @@ impl DaskTypeMap { #[pyo3(signature = (sql_type, **py_kwargs))] fn new(sql_type: SqlTypeName, py_kwargs: Option<&PyDict>) -> PyResult { let d_type: DataType = match sql_type { - // SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { - // let (unit, tz) = match py_kwargs { - // Some(dict) => { - // let tz = match dict.get_item("tz") { - // Some(e) => { - // let res = e.extract().unwrap(); - // Some(Arc::new(res.to_owned())) - // } - // None => None, - // }; - // let unit: TimeUnit = match dict.get_item("unit") { - // Some(e) => { - // let res: PyResult<&str> = e.extract(); - // match res.unwrap() { - // "Second" => TimeUnit::Second, - // "Millisecond" => TimeUnit::Millisecond, - // "Microsecond" => TimeUnit::Microsecond, - // "Nanosecond" => TimeUnit::Nanosecond, - // _ => TimeUnit::Nanosecond, - // } - // } - // // Default to Nanosecond which is common if not present - // None => TimeUnit::Nanosecond, - // }; - // (unit, tz) - // } - // // Default to Nanosecond and None for tz which is common if not present - // None => (TimeUnit::Nanosecond, None), - // }; - // DataType::Timestamp(unit, tz) - // } - // SqlTypeName::TIMESTAMP => { - // let (unit, tz) = match py_kwargs { - // Some(dict) => { - // let tz = match dict.get_item("tz") { - // Some(e) => { - // let res: String = e.extract::().unwrap().to_owned(); - // Some(Arc::new(*res.as_str())) - // } - // None => None, - // }; - // let unit: TimeUnit = match dict.get_item("unit") { - // Some(e) => { - // let res: PyResult<&str> = e.extract(); - // match res.unwrap() { - // "Second" => TimeUnit::Second, - // "Millisecond" => TimeUnit::Millisecond, - // "Microsecond" => TimeUnit::Microsecond, - // "Nanosecond" => TimeUnit::Nanosecond, - // _ => TimeUnit::Nanosecond, - // } - // } - // // Default to Nanosecond which is common if not present - // None => TimeUnit::Nanosecond, - // }; - // (unit, tz) - // } - // // Default to Nanosecond and None for tz which is common if not present - // None => (TimeUnit::Nanosecond, None), - // }; - // DataType::Timestamp(unit, tz) - // } + SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { + let (unit, tz) = match py_kwargs { + Some(dict) => { + let tz = match dict.get_item("tz") { + Some(e) => { + let res: Option = e.extract().unwrap(); + Some(Arc::new(res.to_owned())) + } + None => None, + }; + let unit: TimeUnit = match dict.get_item("unit") { + Some(e) => { + let res: PyResult<&str> = e.extract(); + match res.unwrap() { + "Second" => TimeUnit::Second, + "Millisecond" => TimeUnit::Millisecond, + "Microsecond" => TimeUnit::Microsecond, + "Nanosecond" => TimeUnit::Nanosecond, + _ => TimeUnit::Nanosecond, + } + } + // Default to Nanosecond which is common if not present + None => TimeUnit::Nanosecond, + }; + (unit, tz) + } + // Default to Nanosecond and None for tz which is common if not present + None => (TimeUnit::Nanosecond, None), + }; + DataType::Timestamp(unit, tz) + } + SqlTypeName::TIMESTAMP => { + let (unit, tz) = match py_kwargs { + Some(dict) => { + let tz = match dict.get_item("tz") { + Some(e) => { + let res: Option = e.extract().unwrap(); + Some(Arc::new(res.to_owned())) + } + None => None, + }; + let unit: TimeUnit = match dict.get_item("unit") { + Some(e) => { + let res: PyResult<&str> = e.extract(); + match res.unwrap() { + "Second" => TimeUnit::Second, + "Millisecond" => TimeUnit::Millisecond, + "Microsecond" => TimeUnit::Microsecond, + "Nanosecond" => TimeUnit::Nanosecond, + _ => TimeUnit::Nanosecond, + } + } + // Default to Nanosecond which is common if not present + None => TimeUnit::Nanosecond, + }; + (unit, tz) + } + // Default to Nanosecond and None for tz which is common if not present + None => (TimeUnit::Nanosecond, None), + }; + DataType::Timestamp(unit, tz) + } SqlTypeName::DECIMAL => { let (precision, scale) = match py_kwargs { Some(dict) => { From 7a9906f0944fd726af9a4d30604179a6708da628 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 8 May 2023 21:47:53 -0400 Subject: [PATCH 04/44] Update to use Option> and us unsafe Arc::from_raw() method --- dask_planner/src/sql/types.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dask_planner/src/sql/types.rs b/dask_planner/src/sql/types.rs index 715820f8b..c7aa2e83c 100644 --- a/dask_planner/src/sql/types.rs +++ b/dask_planner/src/sql/types.rs @@ -58,8 +58,11 @@ impl DaskTypeMap { Some(dict) => { let tz = match dict.get_item("tz") { Some(e) => { - let res: Option = e.extract().unwrap(); - Some(Arc::new(res.to_owned())) + let f: String = e.extract().unwrap(); + unsafe { + let res: Option> = Some(Arc::from_raw(f.as_ref())); + res + } } None => None, }; @@ -89,8 +92,11 @@ impl DaskTypeMap { Some(dict) => { let tz = match dict.get_item("tz") { Some(e) => { - let res: Option = e.extract().unwrap(); - Some(Arc::new(res.to_owned())) + let f: String = e.extract().unwrap(); + unsafe { + let res: Option> = Some(Arc::from_raw(f.as_ref())); + res + } } None => None, }; From 3a7754da34d4e88598e62ba1c0d11af2fbf3f67d Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 8 May 2023 21:56:21 -0400 Subject: [PATCH 05/44] Use into() instead of unsafe Arc::from_raw() --- dask_planner/src/sql/types.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/dask_planner/src/sql/types.rs b/dask_planner/src/sql/types.rs index c7aa2e83c..2ab766727 100644 --- a/dask_planner/src/sql/types.rs +++ b/dask_planner/src/sql/types.rs @@ -1,8 +1,6 @@ pub mod rel_data_type; pub mod rel_data_type_field; -use std::sync::Arc; - use datafusion_python::{ datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}, datafusion_sql::sqlparser::{ast::DataType as SQLType, parser::Parser, tokenizer::Tokenizer}, @@ -58,11 +56,8 @@ impl DaskTypeMap { Some(dict) => { let tz = match dict.get_item("tz") { Some(e) => { - let f: String = e.extract().unwrap(); - unsafe { - let res: Option> = Some(Arc::from_raw(f.as_ref())); - res - } + let res: &str = e.extract().unwrap(); + Some(res.into()) } None => None, }; @@ -92,11 +87,8 @@ impl DaskTypeMap { Some(dict) => { let tz = match dict.get_item("tz") { Some(e) => { - let f: String = e.extract().unwrap(); - unsafe { - let res: Option> = Some(Arc::from_raw(f.as_ref())); - res - } + let res: &str = e.extract().unwrap(); + Some(res.into()) } None => None, }; From 435db425dcdcbc392020c058ac5d8904e84585ce Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 9 May 2023 19:52:03 -0400 Subject: [PATCH 06/44] partial refactoring checkpoint --- dask_planner/Cargo.lock | 42 +- dask_planner/Cargo.toml | 2 +- dask_planner/src/lib.rs | 23 +- dask_planner/src/parser.rs | 33 +- dask_planner/src/sql.rs | 22 +- dask_planner/src/sql/logical.rs | 419 ------------------ dask_planner/src/sql/logical/aggregate.rs | 3 +- dask_planner/src/sql/logical/alter_schema.rs | 6 +- dask_planner/src/sql/logical/alter_table.rs | 6 +- dask_planner/src/sql/logical/analyze_table.rs | 6 +- .../src/sql/logical/create_catalog_schema.rs | 8 +- .../src/sql/logical/create_experiment.rs | 14 +- .../src/sql/logical/create_memory_table.rs | 21 +- dask_planner/src/sql/logical/create_model.rs | 14 +- dask_planner/src/sql/logical/create_table.rs | 11 +- .../src/sql/logical/describe_model.rs | 8 +- dask_planner/src/sql/logical/drop_model.rs | 8 +- dask_planner/src/sql/logical/drop_schema.rs | 8 +- dask_planner/src/sql/logical/export_model.rs | 11 +- dask_planner/src/sql/logical/join.rs | 8 +- dask_planner/src/sql/logical/predict_model.rs | 10 +- dask_planner/src/sql/logical/projection.rs | 22 +- .../src/sql/logical/repartition_by.rs | 5 +- dask_planner/src/sql/logical/show_columns.rs | 6 +- dask_planner/src/sql/logical/show_models.rs | 3 +- dask_planner/src/sql/logical/show_schemas.rs | 6 +- dask_planner/src/sql/logical/show_tables.rs | 6 +- dask_planner/src/sql/logical/table_scan.rs | 3 +- dask_planner/src/sql/logical/use_schema.rs | 8 +- dask_planner/src/sql/logical/utils.rs | 196 +++++++- dask_planner/src/sql/logical/window.rs | 3 +- dask_planner/src/sql/table.rs | 24 +- dask_planner/src/sql/types.rs | 267 +---------- .../src/sql/types/rel_data_type_field.rs | 10 +- dask_sql/context.py | 8 +- dask_sql/input_utils/hive.py | 4 +- dask_sql/mappings.py | 147 +++--- dask_sql/physical/rel/base.py | 4 +- dask_sql/physical/rel/convert.py | 3 +- dask_sql/physical/rel/custom/show_schemas.py | 2 +- dask_sql/physical/rel/logical/aggregate.py | 9 +- dask_sql/physical/rel/logical/cross_join.py | 7 +- dask_sql/physical/rel/logical/filter.py | 3 +- dask_sql/physical/rel/logical/join.py | 9 +- dask_sql/physical/rel/logical/limit.py | 8 +- dask_sql/physical/rel/logical/project.py | 13 +- dask_sql/physical/rel/logical/sort.py | 3 +- dask_sql/physical/rel/logical/table_scan.py | 16 +- dask_sql/physical/rel/logical/union.py | 11 +- dask_sql/physical/rel/logical/values.py | 13 +- dask_sql/physical/rel/logical/window.py | 7 +- dask_sql/physical/rex/convert.py | 2 +- dask_sql/physical/rex/core/alias.py | 2 +- dask_sql/physical/rex/core/call.py | 49 +- dask_sql/physical/rex/core/input_ref.py | 5 +- dask_sql/physical/rex/core/literal.py | 274 +++++------- dask_sql/utils.py | 16 +- tests/integration/test_compatibility.py | 42 +- tests/unit/test_mapping.py | 10 +- 59 files changed, 677 insertions(+), 1232 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index a1692e52f..69ef2d0af 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -465,9 +465,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.12.1" +version = "3.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8" +checksum = "3c6ed94e98ecff0c12dd1b04c15ec0d7d9458ca8fe806cea6f12954efe74c63b" [[package]] name = "byteorder" @@ -891,7 +891,7 @@ dependencies = [ [[package]] name = "datafusion-python" version = "23.0.0" -source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=expr_mods#9b60d1a445efb9c9eddadd8936a54e726494b849" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=expr_datatypemap#22eed9c3eae94613d36888335aad4536d2306a4b" dependencies = [ "async-trait", "datafusion", @@ -2071,9 +2071,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.62" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "68c16e1bfd491478ab155fd8b4896b86f9ede344949b641e61501e07c2b8b4d5" dependencies = [ "wasm-bindgen", ] @@ -3787,9 +3787,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "5b6cb788c4e39112fbe1822277ef6fb3c55cd86b95cb3d3c4c1c9597e4ac74b4" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3797,24 +3797,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "35e522ed4105a9d626d885b35d62501b30d9666283a5c8be12c14a8bdafe7822" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.15", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "083abe15c5d88556b77bdf7aef403625be9e327ad37c62c4e4129af740168163" dependencies = [ "cfg-if", "js-sys", @@ -3824,9 +3824,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "358a79a0cb89d21db8120cbfb91392335913e4890665b1a7981d9e956903b434" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3834,22 +3834,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "4783ce29f09b9d93134d41297aded3a712b7b979e9c6f28c32cb88c973a94869" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.15", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "a901d592cafaa4d711bc324edfaff879ac700b19c3dfd60058d2b445be2691eb" [[package]] name = "wasm-streams" @@ -3866,9 +3866,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.61" +version = "0.3.62" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "16b5f940c7edfdc6d12126d98c9ef4d1b3d470011c47c76a6581df47ad9ba721" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index 1f8cd386c..aa447fe5f 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "expr_mods" } +datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "expr_datatypemap" } env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } diff --git a/dask_planner/src/lib.rs b/dask_planner/src/lib.rs index 828737b17..3c1d89a5d 100644 --- a/dask_planner/src/lib.rs +++ b/dask_planner/src/lib.rs @@ -1,3 +1,7 @@ +use datafusion_python::{ + common::data_type::{DataTypeMap, PyDataType, PythonType, SqlType}, + sql::logical::PyLogicalPlan, +}; use log::debug; use pyo3::prelude::*; @@ -18,7 +22,6 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { // Register the python classes m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -27,7 +30,23 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; + + // Re-export Arrow DataFusion Python types + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; // Python wrapper for Arrow DataType + m.add_class::()?; + m.add_class::()?; + + // Wrapped functions + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::get_current_node_type)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::plan_to_table)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::row_type)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::named_projects)) + .unwrap(); // Exceptions m.add( diff --git a/dask_planner/src/parser.rs b/dask_planner/src/parser.rs index 3147e6309..d422eef6f 100644 --- a/dask_planner/src/parser.rs +++ b/dask_planner/src/parser.rs @@ -4,17 +4,20 @@ use std::collections::VecDeque; -use datafusion_python::datafusion_sql::sqlparser::{ - ast::{Expr, Ident, SelectItem, Statement as SQLStatement, UnaryOperator, Value}, - dialect::{keywords::Keyword, Dialect}, - parser::{Parser, ParserError}, - tokenizer::{Token, TokenWithLocation, Tokenizer}, +use datafusion_python::{ + common::data_type::SqlType, + datafusion_sql::sqlparser::{ + ast::{Expr, Ident, SelectItem, Statement as SQLStatement, UnaryOperator, Value}, + dialect::{keywords::Keyword, Dialect}, + parser::{Parser, ParserError}, + tokenizer::{Token, TokenWithLocation, Tokenizer}, + }, }; use pyo3::prelude::*; use crate::{ dialect::DaskDialect, - sql::{exceptions::py_type_err, parser_utils::DaskParserUtils, types::SqlTypeName}, + sql::{exceptions::py_type_err, parser_utils::DaskParserUtils}, }; macro_rules! parser_err { @@ -106,27 +109,27 @@ impl PySqlArg { } #[pyo3(name = "getSqlType")] - pub fn get_sql_type(&self) -> PyResult { + pub fn get_sql_type(&self) -> PyResult { Ok(match &self.custom { Some(custom_expr) => match custom_expr { - CustomExpr::Map(_) => SqlTypeName::MAP, - CustomExpr::Multiset(_) => SqlTypeName::MULTISET, + CustomExpr::Map(_) => SqlType::MAP, + CustomExpr::Multiset(_) => SqlType::MULTISET, _ => return self.expected("Map or multiset"), }, None => match &self.expr { - Some(Expr::Array(_)) => SqlTypeName::ARRAY, - Some(Expr::Identifier(Ident { .. })) => SqlTypeName::VARCHAR, + Some(Expr::Array(_)) => SqlType::ARRAY, + Some(Expr::Identifier(Ident { .. })) => SqlType::VARCHAR, Some(Expr::Value(scalar)) => match scalar { - Value::Boolean(_) => SqlTypeName::BOOLEAN, - Value::Number(_, false) => SqlTypeName::BIGINT, - Value::SingleQuotedString(_) => SqlTypeName::VARCHAR, + Value::Boolean(_) => SqlType::BOOLEAN, + Value::Number(_, false) => SqlType::BIGINT, + Value::SingleQuotedString(_) => SqlType::VARCHAR, _ => return self.expected("Boolean, integer, float, or single-quoted string"), }, Some(Expr::UnaryOp { op: UnaryOperator::Minus, expr, }) => match &**expr { - Expr::Value(Value::Number(_, false)) => SqlTypeName::BIGINT, + Expr::Value(Value::Number(_, false)) => SqlType::BIGINT, _ => return self.expected("Integer or float"), }, Some(_) => return self.expected("Array, identifier, or scalar"), diff --git a/dask_planner/src/sql.rs b/dask_planner/src/sql.rs index 22f6d01ac..e6cd0ec44 100644 --- a/dask_planner/src/sql.rs +++ b/dask_planner/src/sql.rs @@ -39,6 +39,7 @@ use datafusion_python::{ ResolvedTableReference, TableReference, }, + sql::logical::PyLogicalPlan, }; use log::{debug, warn}; use pyo3::prelude::*; @@ -68,7 +69,6 @@ use crate::{ show_models::ShowModelsPlanNode, show_schemas::ShowSchemasPlanNode, show_tables::ShowTablesPlanNode, - PyLogicalPlan, }, }, }; @@ -519,12 +519,9 @@ impl DaskSQLContext { pub fn logical_relational_algebra( &self, statement: statement::PyStatement, - ) -> PyResult { + ) -> PyResult { self._logical_relational_algebra(statement.statement) - .map(|e| PyLogicalPlan { - original_plan: e, - current_node: None, - }) + .map(PyLogicalPlan::new) .map_err(py_parsing_exp) } @@ -533,12 +530,12 @@ impl DaskSQLContext { /// `LogicalPlan` pub fn optimize_relational_algebra( &self, - existing_plan: logical::PyLogicalPlan, - ) -> PyResult { + existing_plan: PyLogicalPlan, + ) -> PyResult { // Certain queries cannot be optimized. Ex: `EXPLAIN SELECT * FROM test` simply return those plans as is let mut visitor = OptimizablePlanVisitor {}; - match existing_plan.original_plan.visit(&mut visitor) { + match existing_plan.plan().visit(&mut visitor) { Ok(valid) => { match valid { VisitRecursion::Stop => { @@ -547,11 +544,8 @@ impl DaskSQLContext { Ok(existing_plan) } _ => optimizer::DaskSqlOptimizer::new() - .optimize(existing_plan.original_plan) - .map(|k| PyLogicalPlan { - original_plan: k, - current_node: None, - }) + .optimize((*existing_plan.plan()).clone()) + .map(PyLogicalPlan::new) .map_err(py_optimization_exp), } } diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index aa7baa544..f633e9ee9 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -1,8 +1,3 @@ -use crate::sql::{ - table, - types::{rel_data_type::RelDataType, rel_data_type_field::RelDataTypeField}, -}; - pub mod aggregate; pub mod alter_schema; pub mod alter_table; @@ -35,417 +30,3 @@ pub mod table_scan; pub mod use_schema; pub mod utils; pub mod window; - -use datafusion_python::{ - datafusion_common::{DFSchemaRef, DataFusionError}, - datafusion_expr::LogicalPlan, -}; -use pyo3::prelude::*; - -use self::{ - alter_schema::AlterSchemaPlanNode, - alter_table::AlterTablePlanNode, - analyze_table::AnalyzeTablePlanNode, - create_catalog_schema::CreateCatalogSchemaPlanNode, - create_experiment::CreateExperimentPlanNode, - create_model::CreateModelPlanNode, - create_table::CreateTablePlanNode, - describe_model::DescribeModelPlanNode, - drop_model::DropModelPlanNode, - drop_schema::DropSchemaPlanNode, - export_model::ExportModelPlanNode, - predict_model::PredictModelPlanNode, - show_columns::ShowColumnsPlanNode, - show_models::ShowModelsPlanNode, - show_schemas::ShowSchemasPlanNode, - show_tables::ShowTablesPlanNode, - use_schema::UseSchemaPlanNode, -}; -use crate::{error::Result, sql::exceptions::py_type_err}; - -#[pyclass(name = "LogicalPlan", module = "dask_planner", subclass)] -#[derive(Debug, Clone)] -pub struct PyLogicalPlan { - /// The original LogicalPlan that was parsed by DataFusion from the input SQL - pub(crate) original_plan: LogicalPlan, - /// The original_plan is traversed. current_node stores the current node of this traversal - pub(crate) current_node: Option, -} - -/// Unfortunately PyO3 forces us to do this as placing these methods in the #[pymethods] version -/// of `impl PyLogicalPlan` causes issues with types not properly being mapped to Python from Rust -impl PyLogicalPlan { - /// Getter method for the LogicalPlan, if current_node is None return original_plan. - pub(crate) fn current_node(&mut self) -> LogicalPlan { - match &self.current_node { - Some(current) => current.clone(), - None => { - self.current_node = Some(self.original_plan.clone()); - self.current_node.clone().unwrap() - } - } - } -} - -/// Convert a LogicalPlan to a Python equivalent type -fn to_py_plan>( - current_node: Option<&LogicalPlan>, -) -> PyResult { - match current_node { - Some(plan) => plan.clone().try_into(), - _ => Err(py_type_err("current_node was None")), - } -} - -#[pymethods] -impl PyLogicalPlan { - /// LogicalPlan::Aggregate as PyAggregate - pub fn aggregate(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::EmptyRelation as PyEmptyRelation - pub fn empty_relation(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Explain as PyExplain - pub fn explain(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Filter as PyFilter - pub fn filter(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Join as PyJoin - pub fn join(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Limit as PyLimit - pub fn limit(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Projection as PyProjection - pub fn projection(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Sort as PySort - pub fn sort(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::SubqueryAlias as PySubqueryAlias - pub fn subquery_alias(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Window as PyWindow - pub fn window(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::TableScan as PyTableScan - pub fn table_scan(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::CreateMemoryTable as PyCreateMemoryTable - pub fn create_memory_table(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::CreateModel as PyCreateModel - pub fn create_model(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::CreateExperiment as PyCreateExperiment - pub fn create_experiment(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::DropTable as DropTable - pub fn drop_table(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::DropModel as DropModel - pub fn drop_model(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::ShowSchemas as PyShowSchemas - pub fn show_schemas(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Repartition as PyRepartitionBy - pub fn repartition_by(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::ShowTables as PyShowTables - pub fn show_tables(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::CreateTable as PyCreateTable - pub fn create_table(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::PredictModel as PyPredictModel - pub fn predict_model(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::DescribeModel as PyDescribeModel - pub fn describe_model(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::ExportModel as PyExportModel - pub fn export_model(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::ShowColumns as PyShowColumns - pub fn show_columns(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - pub fn show_models(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::ShowColumns as PyShowColumns - pub fn analyze_table(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::CreateCatalogSchema as PyCreateCatalogSchema - pub fn create_catalog_schema(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::DropSchema as PyDropSchema - pub fn drop_schema(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::UseSchema as PyUseSchema - pub fn use_schema(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::AlterTable as PyAlterTable - pub fn alter_table(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// LogicalPlan::Extension::AlterSchema as PyAlterSchema - pub fn alter_schema(&self) -> PyResult { - to_py_plan(self.current_node.as_ref()) - } - - /// Gets the "input" for the current LogicalPlan - pub fn get_inputs(&mut self) -> PyResult> { - let mut py_inputs: Vec = Vec::new(); - for input in self.current_node().inputs() { - py_inputs.push(input.clone().into()); - } - Ok(py_inputs) - } - - /// If the LogicalPlan represents access to a Table that instance is returned - /// otherwise None is returned - #[pyo3(name = "getTable")] - pub fn table(&mut self) -> PyResult { - match table::table_from_logical_plan(&self.current_node())? { - Some(table) => Ok(table), - None => Err(py_type_err( - "Unable to compute DaskTable from DataFusion LogicalPlan", - )), - } - } - - #[pyo3(name = "getCurrentNodeSchemaName")] - pub fn get_current_node_schema_name(&self) -> PyResult<&str> { - match &self.current_node { - Some(e) => { - let _sch: &DFSchemaRef = e.schema(); - //TODO: Where can I actually get this in the context of the running query? - Ok("root") - } - None => Err(py_type_err(DataFusionError::Plan(format!( - "Current schema not found. Defaulting to {:?}", - "root" - )))), - } - } - - #[pyo3(name = "getCurrentNodeTableName")] - pub fn get_current_node_table_name(&mut self) -> PyResult { - match self.table() { - Ok(dask_table) => Ok(dask_table.table_name), - Err(_e) => Err(py_type_err("Unable to determine current node table name")), - } - } - - /// Gets the Relation "type" of the current node. Ex: Projection, TableScan, etc - pub fn get_current_node_type(&mut self) -> PyResult<&str> { - Ok(match self.current_node() { - LogicalPlan::Dml(_) => "DataManipulationLanguage", - LogicalPlan::DescribeTable(_) => "DescribeTable", - LogicalPlan::Prepare(_) => "Prepare", - LogicalPlan::Distinct(_) => "Distinct", - LogicalPlan::Projection(_projection) => "Projection", - LogicalPlan::Filter(_filter) => "Filter", - LogicalPlan::Window(_window) => "Window", - LogicalPlan::Aggregate(_aggregate) => "Aggregate", - LogicalPlan::Sort(_sort) => "Sort", - LogicalPlan::Join(_join) => "Join", - LogicalPlan::CrossJoin(_cross_join) => "CrossJoin", - LogicalPlan::Repartition(_repartition) => "Repartition", - LogicalPlan::Union(_union) => "Union", - LogicalPlan::TableScan(_table_scan) => "TableScan", - LogicalPlan::EmptyRelation(_empty_relation) => "EmptyRelation", - LogicalPlan::Limit(_limit) => "Limit", - LogicalPlan::CreateExternalTable(_create_external_table) => "CreateExternalTable", - LogicalPlan::CreateMemoryTable(_create_memory_table) => "CreateMemoryTable", - LogicalPlan::DropTable(_drop_table) => "DropTable", - LogicalPlan::DropView(_drop_view) => "DropView", - LogicalPlan::Values(_values) => "Values", - LogicalPlan::Explain(_explain) => "Explain", - LogicalPlan::Analyze(_analyze) => "Analyze", - LogicalPlan::Subquery(_sub_query) => "Subquery", - LogicalPlan::SubqueryAlias(_sqalias) => "SubqueryAlias", - LogicalPlan::CreateCatalogSchema(_create) => "CreateCatalogSchema", - LogicalPlan::CreateCatalog(_create_catalog) => "CreateCatalog", - LogicalPlan::CreateView(_create_view) => "CreateView", - LogicalPlan::Statement(_) => "Statement", - // Further examine and return the name that is a possible Dask-SQL Extension type - LogicalPlan::Extension(extension) => { - let node = extension.node.as_any(); - if node.downcast_ref::().is_some() { - "CreateModel" - } else if node.downcast_ref::().is_some() { - "CreateExperiment" - } else if node.downcast_ref::().is_some() { - "CreateCatalogSchema" - } else if node.downcast_ref::().is_some() { - "CreateTable" - } else if node.downcast_ref::().is_some() { - "DropModel" - } else if node.downcast_ref::().is_some() { - "PredictModel" - } else if node.downcast_ref::().is_some() { - "ExportModel" - } else if node.downcast_ref::().is_some() { - "DescribeModel" - } else if node.downcast_ref::().is_some() { - "ShowSchemas" - } else if node.downcast_ref::().is_some() { - "ShowTables" - } else if node.downcast_ref::().is_some() { - "ShowColumns" - } else if node.downcast_ref::().is_some() { - "ShowModels" - } else if node.downcast_ref::().is_some() { - "DropSchema" - } else if node.downcast_ref::().is_some() { - "UseSchema" - } else if node.downcast_ref::().is_some() { - "AnalyzeTable" - } else if node.downcast_ref::().is_some() { - "AlterTable" - } else if node.downcast_ref::().is_some() { - "AlterSchema" - } else { - // Default to generic `Extension` - "Extension" - } - } - LogicalPlan::Unnest(_unnest) => "Unnest", - }) - } - - /// Explain plan for the full and original LogicalPlan - pub fn explain_original(&self) -> PyResult { - Ok(format!("{}", self.original_plan.display_indent())) - } - - /// Explain plan from the current node onward - pub fn explain_current(&mut self) -> PyResult { - Ok(format!("{}", self.current_node().display_indent())) - } - - #[pyo3(name = "getRowType")] - pub fn row_type(&self) -> PyResult { - match &self.original_plan { - LogicalPlan::Join(join) => { - let mut lhs_fields: Vec = join - .left - .schema() - .fields() - .iter() - .map(|f| RelDataTypeField::from(f, join.left.schema().as_ref())) - .collect::>>() - .map_err(py_type_err)?; - - let mut rhs_fields: Vec = join - .right - .schema() - .fields() - .iter() - .map(|f| RelDataTypeField::from(f, join.right.schema().as_ref())) - .collect::>>() - .map_err(py_type_err)?; - - lhs_fields.append(&mut rhs_fields); - Ok(RelDataType::new(false, lhs_fields)) - } - LogicalPlan::Distinct(distinct) => { - let schema = distinct.input.schema(); - let rel_fields: Vec = schema - .fields() - .iter() - .map(|f| RelDataTypeField::from(f, schema.as_ref())) - .collect::>>() - .map_err(py_type_err)?; - Ok(RelDataType::new(false, rel_fields)) - } - _ => { - let schema = self.original_plan.schema(); - let rel_fields: Vec = schema - .fields() - .iter() - .map(|f| RelDataTypeField::from(f, schema.as_ref())) - .collect::>>() - .map_err(py_type_err)?; - - Ok(RelDataType::new(false, rel_fields)) - } - } - } -} - -impl From for LogicalPlan { - fn from(logical_plan: PyLogicalPlan) -> LogicalPlan { - logical_plan.original_plan - } -} - -impl From for PyLogicalPlan { - fn from(logical_plan: LogicalPlan) -> PyLogicalPlan { - PyLogicalPlan { - original_plan: logical_plan, - current_node: None, - } - } -} diff --git a/dask_planner/src/sql/logical/aggregate.rs b/dask_planner/src/sql/logical/aggregate.rs index cdc65f854..04be77126 100644 --- a/dask_planner/src/sql/logical/aggregate.rs +++ b/dask_planner/src/sql/logical/aggregate.rs @@ -5,11 +5,12 @@ use datafusion_python::{ Expr, LogicalPlan, }, + errors::py_type_err, expr::PyExpr, }; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical::utils::py_expr_list}; +use super::utils::py_expr_list; #[pyclass(name = "Aggregate", module = "dask_planner", subclass)] #[derive(Clone)] diff --git a/dask_planner/src/sql/logical/alter_schema.rs b/dask_planner/src/sql/logical/alter_schema.rs index 742ae513f..fe852118b 100644 --- a/dask_planner/src/sql/logical/alter_schema.rs +++ b/dask_planner/src/sql/logical/alter_schema.rs @@ -16,7 +16,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct AlterSchemaPlanNode { @@ -114,10 +114,10 @@ impl PyAlterSchema { } } -impl TryFrom for PyAlterSchema { +impl TryFrom for PyAlterSchema { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { LogicalPlan::Extension(Extension { node }) if node diff --git a/dask_planner/src/sql/logical/alter_table.rs b/dask_planner/src/sql/logical/alter_table.rs index 7f51a15c3..4f39d096a 100644 --- a/dask_planner/src/sql/logical/alter_table.rs +++ b/dask_planner/src/sql/logical/alter_table.rs @@ -16,7 +16,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct AlterTablePlanNode { @@ -130,10 +130,10 @@ impl PyAlterTable { } } -impl TryFrom for PyAlterTable { +impl TryFrom for PyAlterTable { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { LogicalPlan::Extension(Extension { node }) if node.as_any().downcast_ref::().is_some() => diff --git a/dask_planner/src/sql/logical/analyze_table.rs b/dask_planner/src/sql/logical/analyze_table.rs index 9fa7fb219..7bfa0aea4 100644 --- a/dask_planner/src/sql/logical/analyze_table.rs +++ b/dask_planner/src/sql/logical/analyze_table.rs @@ -16,7 +16,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct AnalyzeTablePlanNode { @@ -122,10 +122,10 @@ impl PyAnalyzeTable { } } -impl TryFrom for PyAnalyzeTable { +impl TryFrom for PyAnalyzeTable { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { LogicalPlan::Extension(Extension { node }) if node diff --git a/dask_planner/src/sql/logical/create_catalog_schema.rs b/dask_planner/src/sql/logical/create_catalog_schema.rs index bc89b02ce..bce197457 100644 --- a/dask_planner/src/sql/logical/create_catalog_schema.rs +++ b/dask_planner/src/sql/logical/create_catalog_schema.rs @@ -12,7 +12,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct CreateCatalogSchemaPlanNode { @@ -118,12 +118,12 @@ impl PyCreateCatalogSchema { } } -impl TryFrom for PyCreateCatalogSchema { +impl TryFrom for PyCreateCatalogSchema { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension .node .as_any() diff --git a/dask_planner/src/sql/logical/create_experiment.rs b/dask_planner/src/sql/logical/create_experiment.rs index 313357d75..9f22dc964 100644 --- a/dask_planner/src/sql/logical/create_experiment.rs +++ b/dask_planner/src/sql/logical/create_experiment.rs @@ -8,14 +8,12 @@ use std::{ use datafusion_python::{ datafusion_common::DFSchemaRef, datafusion_expr::{logical_plan::UserDefinedLogicalNode, Expr, LogicalPlan}, + sql::logical::PyLogicalPlan, }; use fmt::Debug; use pyo3::prelude::*; -use crate::{ - parser::PySqlArg, - sql::{exceptions::py_type_err, logical}, -}; +use crate::{parser::PySqlArg, sql::exceptions::py_type_err}; #[derive(Clone, PartialEq)] pub struct CreateExperimentPlanNode { @@ -116,7 +114,7 @@ impl PyCreateExperiment { /// statement to be used to gather the dataset which should be used for the /// experiment. This function returns that portion of the statement. #[pyo3(name = "getSelectQuery")] - fn get_select_query(&self) -> PyResult { + fn get_select_query(&self) -> PyResult { Ok(self.create_experiment.input.clone().into()) } @@ -146,12 +144,12 @@ impl PyCreateExperiment { } } -impl TryFrom for PyCreateExperiment { +impl TryFrom for PyCreateExperiment { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension .node .as_any() diff --git a/dask_planner/src/sql/logical/create_memory_table.rs b/dask_planner/src/sql/logical/create_memory_table.rs index 668295e0f..b36b0b6bb 100644 --- a/dask_planner/src/sql/logical/create_memory_table.rs +++ b/dask_planner/src/sql/logical/create_memory_table.rs @@ -1,10 +1,13 @@ -use datafusion_python::datafusion_expr::{ - logical_plan::{CreateMemoryTable, CreateView}, - LogicalPlan, +use datafusion_python::{ + datafusion_expr::{ + logical_plan::{CreateMemoryTable, CreateView}, + LogicalPlan, + }, + sql::logical::PyLogicalPlan, }; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical::PyLogicalPlan}; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "CreateMemoryTable", module = "dask_planner", subclass)] #[derive(Clone)] @@ -33,15 +36,9 @@ impl PyCreateMemoryTable { #[pyo3(name = "getInput")] pub fn get_input(&self) -> PyResult { Ok(match &self.create_memory_table { - Some(create_memory_table) => PyLogicalPlan { - original_plan: (*create_memory_table.input).clone(), - current_node: None, - }, + Some(create_memory_table) => PyLogicalPlan::new((*create_memory_table.input).clone()), None => match &self.create_view { - Some(create_view) => PyLogicalPlan { - original_plan: (*create_view.input).clone(), - current_node: None, - }, + Some(create_view) => PyLogicalPlan::new((*create_view.input).clone()), None => { return Err(py_type_err( "Encountered a non CreateMemoryTable/CreateView type in get_input", diff --git a/dask_planner/src/sql/logical/create_model.rs b/dask_planner/src/sql/logical/create_model.rs index 782fe3325..f96584796 100644 --- a/dask_planner/src/sql/logical/create_model.rs +++ b/dask_planner/src/sql/logical/create_model.rs @@ -8,14 +8,12 @@ use std::{ use datafusion_python::{ datafusion_common::DFSchemaRef, datafusion_expr::{logical_plan::UserDefinedLogicalNode, Expr, LogicalPlan}, + sql::logical::PyLogicalPlan, }; use fmt::Debug; use pyo3::prelude::*; -use crate::{ - parser::PySqlArg, - sql::{exceptions::py_type_err, logical}, -}; +use crate::{parser::PySqlArg, sql::exceptions::py_type_err}; #[derive(Clone, PartialEq)] pub struct CreateModelPlanNode { @@ -112,7 +110,7 @@ impl PyCreateModel { /// statement to be used to gather the dataset which should be used for the /// model. This function returns that portion of the statement. #[pyo3(name = "getSelectQuery")] - fn get_select_query(&self) -> PyResult { + fn get_select_query(&self) -> PyResult { Ok(self.create_model.input.clone().into()) } @@ -142,12 +140,12 @@ impl PyCreateModel { } } -impl TryFrom for PyCreateModel { +impl TryFrom for PyCreateModel { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension .node .as_any() diff --git a/dask_planner/src/sql/logical/create_table.rs b/dask_planner/src/sql/logical/create_table.rs index 9271130c7..9f5817f3d 100644 --- a/dask_planner/src/sql/logical/create_table.rs +++ b/dask_planner/src/sql/logical/create_table.rs @@ -12,10 +12,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::{ - parser::PySqlArg, - sql::{exceptions::py_type_err, logical}, -}; +use crate::{parser::PySqlArg, sql::exceptions::py_type_err}; #[derive(Clone, PartialEq)] pub struct CreateTablePlanNode { @@ -133,12 +130,12 @@ impl PyCreateTable { } } -impl TryFrom for PyCreateTable { +impl TryFrom for PyCreateTable { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension .node .as_any() diff --git a/dask_planner/src/sql/logical/describe_model.rs b/dask_planner/src/sql/logical/describe_model.rs index cb2087376..91c03a7ca 100644 --- a/dask_planner/src/sql/logical/describe_model.rs +++ b/dask_planner/src/sql/logical/describe_model.rs @@ -12,7 +12,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct DescribeModelPlanNode { @@ -107,12 +107,12 @@ impl PyDescribeModel { } } -impl TryFrom for PyDescribeModel { +impl TryFrom for PyDescribeModel { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension .node .as_any() diff --git a/dask_planner/src/sql/logical/drop_model.rs b/dask_planner/src/sql/logical/drop_model.rs index 71074905d..290a31550 100644 --- a/dask_planner/src/sql/logical/drop_model.rs +++ b/dask_planner/src/sql/logical/drop_model.rs @@ -12,7 +12,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct DropModelPlanNode { @@ -115,12 +115,12 @@ impl PyDropModel { } } -impl TryFrom for PyDropModel { +impl TryFrom for PyDropModel { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension.node.as_any().downcast_ref::() { Ok(PyDropModel { drop_model: ext.clone(), diff --git a/dask_planner/src/sql/logical/drop_schema.rs b/dask_planner/src/sql/logical/drop_schema.rs index 2022a61c9..1c7d9b49b 100644 --- a/dask_planner/src/sql/logical/drop_schema.rs +++ b/dask_planner/src/sql/logical/drop_schema.rs @@ -12,7 +12,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct DropSchemaPlanNode { @@ -106,12 +106,12 @@ impl PyDropSchema { } } -impl TryFrom for PyDropSchema { +impl TryFrom for PyDropSchema { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension.node.as_any().downcast_ref::() { Ok(PyDropSchema { drop_schema: ext.clone(), diff --git a/dask_planner/src/sql/logical/export_model.rs b/dask_planner/src/sql/logical/export_model.rs index e38551b58..f92b09149 100644 --- a/dask_planner/src/sql/logical/export_model.rs +++ b/dask_planner/src/sql/logical/export_model.rs @@ -12,10 +12,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::{ - parser::PySqlArg, - sql::{exceptions::py_type_err, logical}, -}; +use crate::{parser::PySqlArg, sql::exceptions::py_type_err}; #[derive(Clone, PartialEq)] pub struct ExportModelPlanNode { @@ -118,12 +115,12 @@ impl PyExportModel { } } -impl TryFrom for PyExportModel { +impl TryFrom for PyExportModel { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension .node .as_any() diff --git a/dask_planner/src/sql/logical/join.rs b/dask_planner/src/sql/logical/join.rs index e7744f212..d24a4e715 100644 --- a/dask_planner/src/sql/logical/join.rs +++ b/dask_planner/src/sql/logical/join.rs @@ -7,11 +7,11 @@ use datafusion_python::{ Expr, Operator, }, - expr::PyExpr, + expr::{column::PyColumn, PyExpr}, }; use pyo3::prelude::*; -use crate::sql::{column, exceptions::py_type_err}; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "Join", module = "dask_planner", subclass)] #[derive(Clone)] @@ -66,7 +66,7 @@ impl PyJoin { } #[pyo3(name = "getJoinConditions")] - pub fn join_conditions(&mut self) -> PyResult> { + pub fn join_conditions(&mut self) -> PyResult> { // let lhs_table_name = match &*self.join.left { // LogicalPlan::TableScan(scan) => scan.table_name.clone(), // _ => { @@ -85,7 +85,7 @@ impl PyJoin { // } // }; - let mut join_conditions: Vec<(column::PyColumn, column::PyColumn)> = Vec::new(); + let mut join_conditions: Vec<(PyColumn, PyColumn)> = Vec::new(); for (lhs, rhs) in self.join.on.clone() { match (lhs, rhs) { (Expr::Column(lhs), Expr::Column(rhs)) => { diff --git a/dask_planner/src/sql/logical/predict_model.rs b/dask_planner/src/sql/logical/predict_model.rs index e8d723d2c..c0f4841e1 100644 --- a/dask_planner/src/sql/logical/predict_model.rs +++ b/dask_planner/src/sql/logical/predict_model.rs @@ -8,12 +8,12 @@ use std::{ use datafusion_python::{ datafusion_common::DFSchemaRef, datafusion_expr::{logical_plan::UserDefinedLogicalNode, Expr, LogicalPlan}, + sql::logical::PyLogicalPlan, }; use fmt::Debug; use pyo3::prelude::*; -use super::PyLogicalPlan; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct PredictModelPlanNode { @@ -112,12 +112,12 @@ impl PyPredictModel { } } -impl TryFrom for PyPredictModel { +impl TryFrom for PyPredictModel { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension .node .as_any() diff --git a/dask_planner/src/sql/logical/projection.rs b/dask_planner/src/sql/logical/projection.rs index f8d3a3ffc..e530f4d10 100644 --- a/dask_planner/src/sql/logical/projection.rs +++ b/dask_planner/src/sql/logical/projection.rs @@ -1,11 +1,10 @@ use datafusion_python::{ datafusion_expr::{logical_plan::Projection, Expr, LogicalPlan}, - expr::PyExpr, + expr::{projection::PyProjection as ADPPyProjection, PyExpr}, }; use pyo3::prelude::*; -use super::utils::column_name; -use crate::sql::exceptions::py_type_err; +use crate::sql::{exceptions::py_type_err, logical::utils::column_name}; #[pyclass(name = "Projection", module = "dask_planner", subclass)] #[derive(Clone)] @@ -13,21 +12,6 @@ pub struct PyProjection { pub(crate) projection: Projection, } -impl PyProjection { - /// Projection: Gets the names of the fields that should be projected - fn projected_expressions(local_expr: &PyExpr) -> Vec { - let mut projs: Vec = Vec::new(); - match &local_expr.expr { - Expr::Alias(expr, _name) => { - let py_expr: PyExpr = PyExpr::from(*expr.clone()); - projs.extend_from_slice(PyProjection::projected_expressions(&py_expr).as_slice()); - } - _ => projs.push(local_expr.clone()), - } - projs - } -} - #[pymethods] impl PyProjection { #[pyo3(name = "getNamedProjects")] @@ -35,7 +19,7 @@ impl PyProjection { let mut named: Vec<(String, PyExpr)> = Vec::new(); for expression in self.projection.expr.clone() { let py_expr: PyExpr = PyExpr::from(expression); - for expr in PyProjection::projected_expressions(&py_expr) { + for expr in ADPPyProjection::projected_expressions(&py_expr) { match expr.expr { Expr::Alias(ex, name) => named.push((name.to_string(), PyExpr::from(*ex))), _ => { diff --git a/dask_planner/src/sql/logical/repartition_by.rs b/dask_planner/src/sql/logical/repartition_by.rs index 55e8de234..feee4ad0b 100644 --- a/dask_planner/src/sql/logical/repartition_by.rs +++ b/dask_planner/src/sql/logical/repartition_by.rs @@ -5,10 +5,11 @@ use datafusion_python::{ LogicalPlan, }, expr::PyExpr, + sql::logical::PyLogicalPlan, }; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[pyclass(name = "RepartitionBy", module = "dask_planner", subclass)] pub struct PyRepartitionBy { @@ -18,7 +19,7 @@ pub struct PyRepartitionBy { #[pymethods] impl PyRepartitionBy { #[pyo3(name = "getSelectQuery")] - fn get_select_query(&self) -> PyResult { + fn get_select_query(&self) -> PyResult { let log_plan = &*(self.repartition.input).clone(); Ok(log_plan.clone().into()) } diff --git a/dask_planner/src/sql/logical/show_columns.rs b/dask_planner/src/sql/logical/show_columns.rs index adfb584ef..19c411902 100644 --- a/dask_planner/src/sql/logical/show_columns.rs +++ b/dask_planner/src/sql/logical/show_columns.rs @@ -16,7 +16,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct ShowColumnsPlanNode { @@ -110,10 +110,10 @@ impl PyShowColumns { } } -impl TryFrom for PyShowColumns { +impl TryFrom for PyShowColumns { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { LogicalPlan::Extension(Extension { node }) if node diff --git a/dask_planner/src/sql/logical/show_models.rs b/dask_planner/src/sql/logical/show_models.rs index 026a179a5..7c792273d 100644 --- a/dask_planner/src/sql/logical/show_models.rs +++ b/dask_planner/src/sql/logical/show_models.rs @@ -8,12 +8,11 @@ use std::{ use datafusion_python::{ datafusion_common::{DFSchema, DFSchemaRef}, datafusion_expr::{logical_plan::UserDefinedLogicalNode, Expr, LogicalPlan}, + errors::py_type_err, }; use fmt::Debug; use pyo3::prelude::*; -use crate::sql::logical::py_type_err; - #[derive(Clone, PartialEq)] pub struct ShowModelsPlanNode { pub schema: DFSchemaRef, diff --git a/dask_planner/src/sql/logical/show_schemas.rs b/dask_planner/src/sql/logical/show_schemas.rs index 3e3ed4783..9126c50e0 100644 --- a/dask_planner/src/sql/logical/show_schemas.rs +++ b/dask_planner/src/sql/logical/show_schemas.rs @@ -16,7 +16,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct ShowSchemasPlanNode { @@ -109,10 +109,10 @@ impl PyShowSchema { } } -impl TryFrom for PyShowSchema { +impl TryFrom for PyShowSchema { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { LogicalPlan::Extension(Extension { node }) if node diff --git a/dask_planner/src/sql/logical/show_tables.rs b/dask_planner/src/sql/logical/show_tables.rs index 987f2546e..e644505c8 100644 --- a/dask_planner/src/sql/logical/show_tables.rs +++ b/dask_planner/src/sql/logical/show_tables.rs @@ -16,7 +16,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct ShowTablesPlanNode { @@ -113,10 +113,10 @@ impl PyShowTables { } } -impl TryFrom for PyShowTables { +impl TryFrom for PyShowTables { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { LogicalPlan::Extension(Extension { node }) if node.as_any().downcast_ref::().is_some() => diff --git a/dask_planner/src/sql/logical/table_scan.rs b/dask_planner/src/sql/logical/table_scan.rs index 2948688e3..0985f7107 100644 --- a/dask_planner/src/sql/logical/table_scan.rs +++ b/dask_planner/src/sql/logical/table_scan.rs @@ -3,12 +3,13 @@ use std::sync::Arc; use datafusion_python::{ datafusion_common::{DFSchema, ScalarValue}, datafusion_expr::{logical_plan::TableScan, Expr, LogicalPlan}, + errors::py_type_err, expr::PyExpr, }; use pyo3::prelude::*; use super::utils::py_expr_list; -use crate::{error::DaskPlannerError, sql::exceptions::py_type_err}; +use crate::error::DaskPlannerError; #[pyclass(name = "TableScan", module = "dask_planner", subclass)] #[derive(Clone)] diff --git a/dask_planner/src/sql/logical/use_schema.rs b/dask_planner/src/sql/logical/use_schema.rs index 7c2206310..e12b2888f 100644 --- a/dask_planner/src/sql/logical/use_schema.rs +++ b/dask_planner/src/sql/logical/use_schema.rs @@ -12,7 +12,7 @@ use datafusion_python::{ use fmt::Debug; use pyo3::prelude::*; -use crate::sql::{exceptions::py_type_err, logical}; +use crate::sql::exceptions::py_type_err; #[derive(Clone, PartialEq)] pub struct UseSchemaPlanNode { @@ -98,12 +98,12 @@ impl PyUseSchema { } } -impl TryFrom for PyUseSchema { +impl TryFrom for PyUseSchema { type Error = PyErr; - fn try_from(logical_plan: logical::LogicalPlan) -> Result { + fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - logical::LogicalPlan::Extension(extension) => { + LogicalPlan::Extension(extension) => { if let Some(ext) = extension.node.as_any().downcast_ref::() { Ok(PyUseSchema { use_schema: ext.clone(), diff --git a/dask_planner/src/sql/logical/utils.rs b/dask_planner/src/sql/logical/utils.rs index c98324095..1b3637db5 100644 --- a/dask_planner/src/sql/logical/utils.rs +++ b/dask_planner/src/sql/logical/utils.rs @@ -3,11 +3,38 @@ use std::sync::Arc; use datafusion_python::{ datafusion_common::DFField, datafusion_expr::{expr::Sort, utils::exprlist_to_fields, Expr, LogicalPlan}, - expr::PyExpr, + expr::{projection::PyProjection, PyExpr}, + sql::logical::PyLogicalPlan, }; -use pyo3::PyResult; +use pyo3::{pyfunction, PyResult}; -use crate::error::DaskPlannerError; +use super::{ + alter_schema::AlterSchemaPlanNode, + alter_table::AlterTablePlanNode, + analyze_table::AnalyzeTablePlanNode, + create_catalog_schema::CreateCatalogSchemaPlanNode, + create_experiment::CreateExperimentPlanNode, + create_model::CreateModelPlanNode, + create_table::CreateTablePlanNode, + describe_model::DescribeModelPlanNode, + drop_model::DropModelPlanNode, + drop_schema::DropSchemaPlanNode, + export_model::ExportModelPlanNode, + predict_model::PredictModelPlanNode, + show_columns::ShowColumnsPlanNode, + show_models::ShowModelsPlanNode, + show_schemas::ShowSchemasPlanNode, + show_tables::ShowTablesPlanNode, + use_schema::UseSchemaPlanNode, +}; +use crate::{ + error::{DaskPlannerError, Result}, + sql::{ + exceptions::py_type_err, + table::{table_from_logical_plan, DaskTable}, + types::{rel_data_type::RelDataType, rel_data_type_field::RelDataTypeField}, + }, +}; /// Convert a list of DataFusion Expr to PyExpr pub fn py_expr_list(_input: &Arc, expr: &[Expr]) -> PyResult> { @@ -15,13 +42,13 @@ pub fn py_expr_list(_input: &Arc, expr: &[Expr]) -> PyResult Result { +pub fn column_name(expr: &Expr, plan: &LogicalPlan) -> Result { let field = expr_to_field(expr, plan)?; Ok(field.qualified_column().flat_name()) } /// Create a [DFField] representing an [Expr], given an input [LogicalPlan] to resolve against -pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { +pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { match expr { Expr::Sort(Sort { expr, .. }) => { // DataFusion does not support create_name for sort expressions (since they never @@ -35,3 +62,162 @@ pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result Result { + Ok(match &*plan.plan() { + LogicalPlan::Dml(_) => "DataManipulationLanguage".to_string(), + LogicalPlan::DescribeTable(_) => "DescribeTable".to_string(), + LogicalPlan::Prepare(_) => "Prepare".to_string(), + LogicalPlan::Distinct(_) => "Distinct".to_string(), + LogicalPlan::Projection(_projection) => "Projection".to_string(), + LogicalPlan::Filter(_filter) => "Filter".to_string(), + LogicalPlan::Window(_window) => "Window".to_string(), + LogicalPlan::Aggregate(_aggregate) => "Aggregate".to_string(), + LogicalPlan::Sort(_sort) => "Sort".to_string(), + LogicalPlan::Join(_join) => "Join".to_string(), + LogicalPlan::CrossJoin(_cross_join) => "CrossJoin".to_string(), + LogicalPlan::Repartition(_repartition) => "Repartition".to_string(), + LogicalPlan::Union(_union) => "Union".to_string(), + LogicalPlan::TableScan(_table_scan) => "TableScan".to_string(), + LogicalPlan::EmptyRelation(_empty_relation) => "EmptyRelation".to_string(), + LogicalPlan::Limit(_limit) => "Limit".to_string(), + LogicalPlan::CreateExternalTable(_create_external_table) => { + "CreateExternalTable".to_string() + } + LogicalPlan::CreateMemoryTable(_create_memory_table) => "CreateMemoryTable".to_string(), + LogicalPlan::DropTable(_drop_table) => "DropTable".to_string(), + LogicalPlan::DropView(_drop_view) => "DropView".to_string(), + LogicalPlan::Values(_values) => "Values".to_string(), + LogicalPlan::Explain(_explain) => "Explain".to_string(), + LogicalPlan::Analyze(_analyze) => "Analyze".to_string(), + LogicalPlan::Subquery(_sub_query) => "Subquery".to_string(), + LogicalPlan::SubqueryAlias(_sqalias) => "SubqueryAlias".to_string(), + LogicalPlan::CreateCatalogSchema(_create) => "CreateCatalogSchema".to_string(), + LogicalPlan::CreateCatalog(_create_catalog) => "CreateCatalog".to_string(), + LogicalPlan::CreateView(_create_view) => "CreateView".to_string(), + LogicalPlan::Statement(_) => "Statement".to_string(), + // Further examine and return the name that is a possible Dask-SQL Extension type + LogicalPlan::Extension(extension) => { + let node = extension.node.as_any(); + if node.downcast_ref::().is_some() { + "CreateModel".to_string() + } else if node.downcast_ref::().is_some() { + "CreateExperiment".to_string() + } else if node.downcast_ref::().is_some() { + "CreateCatalogSchema".to_string() + } else if node.downcast_ref::().is_some() { + "CreateTable".to_string() + } else if node.downcast_ref::().is_some() { + "DropModel".to_string() + } else if node.downcast_ref::().is_some() { + "PredictModel".to_string() + } else if node.downcast_ref::().is_some() { + "ExportModel".to_string() + } else if node.downcast_ref::().is_some() { + "DescribeModel".to_string() + } else if node.downcast_ref::().is_some() { + "ShowSchemas".to_string() + } else if node.downcast_ref::().is_some() { + "ShowTables".to_string() + } else if node.downcast_ref::().is_some() { + "ShowColumns".to_string() + } else if node.downcast_ref::().is_some() { + "ShowModels".to_string() + } else if node.downcast_ref::().is_some() { + "DropSchema".to_string() + } else if node.downcast_ref::().is_some() { + "UseSchema".to_string() + } else if node.downcast_ref::().is_some() { + "AnalyzeTable".to_string() + } else if node.downcast_ref::().is_some() { + "AlterTable".to_string() + } else if node.downcast_ref::().is_some() { + "AlterSchema".to_string() + } else { + // Default to generic `Extension` + "Extension".to_string() + } + } + LogicalPlan::Unnest(_unnest) => "Unnest".to_string(), + }) +} + +#[pyfunction] +pub fn plan_to_table(plan: PyLogicalPlan) -> PyResult { + match table_from_logical_plan(&plan.plan())? { + Some(table) => Ok(table), + None => Err(py_type_err( + "Unable to compute DaskTable from DataFusion LogicalPlan", + )), + } +} + +#[pyfunction] +pub fn row_type(plan: PyLogicalPlan) -> PyResult { + match &*plan.plan() { + LogicalPlan::Join(join) => { + let mut lhs_fields: Vec = join + .left + .schema() + .fields() + .iter() + .map(|f| RelDataTypeField::from(f, join.left.schema().as_ref())) + .collect::>>() + .map_err(py_type_err)?; + + let mut rhs_fields: Vec = join + .right + .schema() + .fields() + .iter() + .map(|f| RelDataTypeField::from(f, join.right.schema().as_ref())) + .collect::>>() + .map_err(py_type_err)?; + + lhs_fields.append(&mut rhs_fields); + Ok(RelDataType::new(false, lhs_fields)) + } + LogicalPlan::Distinct(distinct) => { + let schema = distinct.input.schema(); + let rel_fields: Vec = schema + .fields() + .iter() + .map(|f| RelDataTypeField::from(f, schema.as_ref())) + .collect::>>() + .map_err(py_type_err)?; + Ok(RelDataType::new(false, rel_fields)) + } + _ => { + let plan = (*plan.plan()).clone(); + let schema = plan.schema(); + let rel_fields: Vec = schema + .fields() + .iter() + .map(|f| RelDataTypeField::from(f, schema.as_ref())) + .collect::>>() + .map_err(py_type_err)?; + + Ok(RelDataType::new(false, rel_fields)) + } + } +} + +#[pyfunction] +pub fn named_projects(projection: PyProjection) -> PyResult> { + let mut named: Vec<(String, PyExpr)> = Vec::new(); + for expression in projection.projection.expr { + let py_expr: PyExpr = PyExpr::from(expression); + for expr in PyProjection::projected_expressions(&py_expr) { + match expr.expr { + Expr::Alias(ex, name) => named.push((name.to_string(), PyExpr::from(*ex))), + _ => { + if let Ok(name) = column_name(&expr.expr, &projection.projection.input) { + named.push((name, expr.clone())); + } + } + } + } + } + Ok(named) +} diff --git a/dask_planner/src/sql/logical/window.rs b/dask_planner/src/sql/logical/window.rs index bf589f6ac..ebcc51dd6 100644 --- a/dask_planner/src/sql/logical/window.rs +++ b/dask_planner/src/sql/logical/window.rs @@ -8,12 +8,13 @@ use datafusion_python::{ WindowFrame, WindowFrameBound, }, + errors::py_type_err, expr::PyExpr, }; use pyo3::prelude::*; use super::utils::py_expr_list; -use crate::{error::DaskPlannerError, sql::exceptions::py_type_err}; +use crate::error::DaskPlannerError; #[pyclass(name = "Window", module = "dask_planner", subclass)] #[derive(Clone)] diff --git a/dask_planner/src/sql/table.rs b/dask_planner/src/sql/table.rs index 9807a906b..64011d071 100644 --- a/dask_planner/src/sql/table.rs +++ b/dask_planner/src/sql/table.rs @@ -2,26 +2,20 @@ use std::{any::Any, sync::Arc}; use async_trait::async_trait; use datafusion_python::{ + common::data_type::DataTypeMap, datafusion::arrow::datatypes::{DataType, SchemaRef}, datafusion_common::DFField, datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableSource}, datafusion_optimizer::utils::split_conjunction, datafusion_sql::TableReference, + sql::logical::PyLogicalPlan, }; use pyo3::prelude::*; use super::logical::{create_table::CreateTablePlanNode, predict_model::PredictModelPlanNode}; use crate::{ error::DaskPlannerError, - sql::{ - logical, - types::{ - rel_data_type::RelDataType, - rel_data_type_field::RelDataTypeField, - DaskTypeMap, - SqlTypeName, - }, - }, + sql::types::{rel_data_type::RelDataType, rel_data_type_field::RelDataTypeField, DaskTypeMap}, }; /// DaskTable wrapper that is compatible with DataFusion logical query plans @@ -156,13 +150,13 @@ impl DaskTable { } #[pyo3(name = "getQualifiedName")] - pub fn qualified_name(&self, plan: logical::PyLogicalPlan) -> Vec { + pub fn qualified_name(&self, plan: PyLogicalPlan) -> Vec { let mut qualified_name = match &self.schema_name { Some(schema_name) => vec![schema_name.clone()], None => vec![], }; - match plan.original_plan { + match &*plan.plan() { LogicalPlan::TableScan(table_scan) => { qualified_name.push(table_scan.table_name.to_string()); } @@ -201,7 +195,9 @@ pub fn table_from_logical_plan(plan: &LogicalPlan) -> Result, cols.push(( String::from(field.name()), DaskTypeMap::from( - SqlTypeName::from_arrow(data_type)?, + DataTypeMap::map_from_arrow_type(data_type) + .unwrap() + .sql_type, data_type.clone().into(), ), )); @@ -241,7 +237,9 @@ pub fn table_from_logical_plan(plan: &LogicalPlan) -> Result, cols.push(( String::from(field.name()), DaskTypeMap::from( - SqlTypeName::from_arrow(data_type)?, + DataTypeMap::map_from_arrow_type(data_type) + .unwrap() + .sql_type, data_type.clone().into(), ), )); diff --git a/dask_planner/src/sql/types.rs b/dask_planner/src/sql/types.rs index 2ab766727..c2853a7be 100644 --- a/dask_planner/src/sql/types.rs +++ b/dask_planner/src/sql/types.rs @@ -2,12 +2,12 @@ pub mod rel_data_type; pub mod rel_data_type_field; use datafusion_python::{ - datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}, - datafusion_sql::sqlparser::{ast::DataType as SQLType, parser::Parser, tokenizer::Tokenizer}, + common::data_type::{DataTypeMap, SqlType}, + datafusion::arrow::datatypes::{DataType, TimeUnit}, }; use pyo3::{prelude::*, types::PyDict}; -use crate::{dialect::DaskDialect, error::DaskPlannerError, sql::exceptions::py_type_err}; +use crate::sql::exceptions::py_type_err; #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[pyclass(name = "RexType", module = "datafusion")] @@ -31,13 +31,13 @@ pub enum RexType { /// parameters that can be used to properly create those DataType /// instances in Rust. pub struct DaskTypeMap { - sql_type: SqlTypeName, + sql_type: SqlType, data_type: PyDataType, } /// Functions not exposed to Python impl DaskTypeMap { - pub fn from(sql_type: SqlTypeName, data_type: PyDataType) -> Self { + pub fn from(sql_type: SqlType, data_type: PyDataType) -> Self { DaskTypeMap { sql_type, data_type, @@ -49,9 +49,9 @@ impl DaskTypeMap { impl DaskTypeMap { #[new] #[pyo3(signature = (sql_type, **py_kwargs))] - fn new(sql_type: SqlTypeName, py_kwargs: Option<&PyDict>) -> PyResult { + fn new(sql_type: SqlType, py_kwargs: Option<&PyDict>) -> PyResult { let d_type: DataType = match sql_type { - SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { + SqlType::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { let (unit, tz) = match py_kwargs { Some(dict) => { let tz = match dict.get_item("tz") { @@ -82,7 +82,7 @@ impl DaskTypeMap { }; DataType::Timestamp(unit, tz) } - SqlTypeName::TIMESTAMP => { + SqlType::TIMESTAMP => { let (unit, tz) = match py_kwargs { Some(dict) => { let tz = match dict.get_item("tz") { @@ -113,7 +113,7 @@ impl DaskTypeMap { }; DataType::Timestamp(unit, tz) } - SqlTypeName::DECIMAL => { + SqlType::DECIMAL => { let (precision, scale) = match py_kwargs { Some(dict) => { let precision: u8 = match dict.get_item("precision") { @@ -136,7 +136,11 @@ impl DaskTypeMap { }; DataType::Decimal128(precision, scale) } - _ => sql_type.to_arrow()?, + _ => { + DataTypeMap::py_map_from_sql_type(&sql_type)? + .arrow_type + .data_type + } }; Ok(DaskTypeMap { @@ -150,7 +154,7 @@ impl DaskTypeMap { } #[pyo3(name = "getSqlType")] - pub fn sql_type(&self) -> SqlTypeName { + pub fn sql_type(&self) -> SqlType { self.sql_type.clone() } @@ -196,244 +200,3 @@ impl From for PyDataType { PyDataType { data_type } } } - -/// Enumeration of the type names which can be used to construct a SQL type. Since -/// several SQL types do not exist as Rust types and also because the Enum -/// `SqlTypeName` is already used in the Python Dask-SQL code base this enum is used -/// in place of just using the built-in Rust types. -#[allow(non_camel_case_types)] -#[allow(clippy::upper_case_acronyms)] -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "SqlTypeName", module = "datafusion")] -pub enum SqlTypeName { - ANY, - ARRAY, - BIGINT, - BINARY, - BOOLEAN, - CHAR, - COLUMN_LIST, - CURSOR, - DATE, - DECIMAL, - DISTINCT, - DOUBLE, - DYNAMIC_STAR, - FLOAT, - GEOMETRY, - INTEGER, - INTERVAL, - INTERVAL_DAY, - INTERVAL_DAY_HOUR, - INTERVAL_DAY_MINUTE, - INTERVAL_DAY_SECOND, - INTERVAL_HOUR, - INTERVAL_HOUR_MINUTE, - INTERVAL_HOUR_SECOND, - INTERVAL_MINUTE, - INTERVAL_MINUTE_SECOND, - INTERVAL_MONTH, - INTERVAL_MONTH_DAY_NANOSECOND, - INTERVAL_SECOND, - INTERVAL_YEAR, - INTERVAL_YEAR_MONTH, - MAP, - MULTISET, - NULL, - OTHER, - REAL, - ROW, - SARG, - SMALLINT, - STRUCTURED, - SYMBOL, - TIME, - TIME_WITH_LOCAL_TIME_ZONE, - TIMESTAMP, - TIMESTAMP_WITH_LOCAL_TIME_ZONE, - TINYINT, - UNKNOWN, - VARBINARY, - VARCHAR, -} - -impl SqlTypeName { - pub fn to_arrow(&self) -> Result { - match self { - SqlTypeName::NULL => Ok(DataType::Null), - SqlTypeName::BOOLEAN => Ok(DataType::Boolean), - SqlTypeName::TINYINT => Ok(DataType::Int8), - SqlTypeName::SMALLINT => Ok(DataType::Int16), - SqlTypeName::INTEGER => Ok(DataType::Int32), - SqlTypeName::BIGINT => Ok(DataType::Int64), - SqlTypeName::REAL => Ok(DataType::Float16), - SqlTypeName::FLOAT => Ok(DataType::Float32), - SqlTypeName::DOUBLE => Ok(DataType::Float64), - SqlTypeName::DATE => Ok(DataType::Date64), - SqlTypeName::VARCHAR => Ok(DataType::Utf8), - _ => Err(DaskPlannerError::Internal(format!( - "Cannot determine Arrow type for Dask SQL type '{self:?}'" - ))), - } - } - - pub fn from_arrow(arrow_type: &DataType) -> Result { - match arrow_type { - DataType::Null => Ok(SqlTypeName::NULL), - DataType::Boolean => Ok(SqlTypeName::BOOLEAN), - DataType::Int8 => Ok(SqlTypeName::TINYINT), - DataType::Int16 => Ok(SqlTypeName::SMALLINT), - DataType::Int32 => Ok(SqlTypeName::INTEGER), - DataType::Int64 => Ok(SqlTypeName::BIGINT), - DataType::UInt8 => Ok(SqlTypeName::TINYINT), - DataType::UInt16 => Ok(SqlTypeName::SMALLINT), - DataType::UInt32 => Ok(SqlTypeName::INTEGER), - DataType::UInt64 => Ok(SqlTypeName::BIGINT), - DataType::Float16 => Ok(SqlTypeName::REAL), - DataType::Float32 => Ok(SqlTypeName::FLOAT), - DataType::Float64 => Ok(SqlTypeName::DOUBLE), - DataType::Time32(_) | DataType::Time64(_) => Ok(SqlTypeName::TIME), - DataType::Timestamp(_unit, tz) => match tz { - Some(_) => Ok(SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE), - None => Ok(SqlTypeName::TIMESTAMP), - }, - DataType::Date32 => Ok(SqlTypeName::DATE), - DataType::Date64 => Ok(SqlTypeName::DATE), - DataType::Interval(unit) => match unit { - IntervalUnit::DayTime => Ok(SqlTypeName::INTERVAL_DAY), - IntervalUnit::YearMonth => Ok(SqlTypeName::INTERVAL_YEAR_MONTH), - IntervalUnit::MonthDayNano => Ok(SqlTypeName::INTERVAL_MONTH_DAY_NANOSECOND), - }, - DataType::Binary => Ok(SqlTypeName::BINARY), - DataType::FixedSizeBinary(_size) => Ok(SqlTypeName::VARBINARY), - DataType::Utf8 => Ok(SqlTypeName::CHAR), - DataType::LargeUtf8 => Ok(SqlTypeName::VARCHAR), - DataType::Struct(_fields) => Ok(SqlTypeName::STRUCTURED), - DataType::Decimal128(_precision, _scale) => Ok(SqlTypeName::DECIMAL), - DataType::Decimal256(_precision, _scale) => Ok(SqlTypeName::DECIMAL), - DataType::Map(_field, _bool) => Ok(SqlTypeName::MAP), - _ => Err(DaskPlannerError::Internal(format!( - "Cannot determine Dask SQL type for Arrow type '{arrow_type:?}'" - ))), - } - } -} - -#[pymethods] -impl SqlTypeName { - #[pyo3(name = "fromString")] - #[staticmethod] - pub fn py_from_string(input_type: &str) -> PyResult { - SqlTypeName::from_string(input_type).map_err(|e| e.into()) - } -} - -impl SqlTypeName { - pub fn from_string(input_type: &str) -> Result { - match input_type.to_uppercase().as_ref() { - "ANY" => Ok(SqlTypeName::ANY), - "ARRAY" => Ok(SqlTypeName::ARRAY), - "NULL" => Ok(SqlTypeName::NULL), - "BOOLEAN" => Ok(SqlTypeName::BOOLEAN), - "COLUMN_LIST" => Ok(SqlTypeName::COLUMN_LIST), - "DISTINCT" => Ok(SqlTypeName::DISTINCT), - "CURSOR" => Ok(SqlTypeName::CURSOR), - "TINYINT" => Ok(SqlTypeName::TINYINT), - "SMALLINT" => Ok(SqlTypeName::SMALLINT), - "INT" => Ok(SqlTypeName::INTEGER), - "INTEGER" => Ok(SqlTypeName::INTEGER), - "BIGINT" => Ok(SqlTypeName::BIGINT), - "REAL" => Ok(SqlTypeName::REAL), - "FLOAT" => Ok(SqlTypeName::FLOAT), - "GEOMETRY" => Ok(SqlTypeName::GEOMETRY), - "DOUBLE" => Ok(SqlTypeName::DOUBLE), - "TIME" => Ok(SqlTypeName::TIME), - "TIME_WITH_LOCAL_TIME_ZONE" => Ok(SqlTypeName::TIME_WITH_LOCAL_TIME_ZONE), - "TIMESTAMP" => Ok(SqlTypeName::TIMESTAMP), - "TIMESTAMP_WITH_LOCAL_TIME_ZONE" => Ok(SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE), - "DATE" => Ok(SqlTypeName::DATE), - "INTERVAL" => Ok(SqlTypeName::INTERVAL), - "INTERVAL_DAY" => Ok(SqlTypeName::INTERVAL_DAY), - "INTERVAL_DAY_HOUR" => Ok(SqlTypeName::INTERVAL_DAY_HOUR), - "INTERVAL_DAY_MINUTE" => Ok(SqlTypeName::INTERVAL_DAY_MINUTE), - "INTERVAL_DAY_SECOND" => Ok(SqlTypeName::INTERVAL_DAY_SECOND), - "INTERVAL_HOUR" => Ok(SqlTypeName::INTERVAL_HOUR), - "INTERVAL_HOUR_MINUTE" => Ok(SqlTypeName::INTERVAL_HOUR_MINUTE), - "INTERVAL_HOUR_SECOND" => Ok(SqlTypeName::INTERVAL_HOUR_SECOND), - "INTERVAL_MINUTE" => Ok(SqlTypeName::INTERVAL_MINUTE), - "INTERVAL_MINUTE_SECOND" => Ok(SqlTypeName::INTERVAL_MINUTE_SECOND), - "INTERVAL_MONTH" => Ok(SqlTypeName::INTERVAL_MONTH), - "INTERVAL_SECOND" => Ok(SqlTypeName::INTERVAL_SECOND), - "INTERVAL_YEAR" => Ok(SqlTypeName::INTERVAL_YEAR), - "INTERVAL_YEAR_MONTH" => Ok(SqlTypeName::INTERVAL_YEAR_MONTH), - "MAP" => Ok(SqlTypeName::MAP), - "MULTISET" => Ok(SqlTypeName::MULTISET), - "OTHER" => Ok(SqlTypeName::OTHER), - "ROW" => Ok(SqlTypeName::ROW), - "SARG" => Ok(SqlTypeName::SARG), - "BINARY" => Ok(SqlTypeName::BINARY), - "VARBINARY" => Ok(SqlTypeName::VARBINARY), - "CHAR" => Ok(SqlTypeName::CHAR), - "VARCHAR" | "STRING" => Ok(SqlTypeName::VARCHAR), - "STRUCTURED" => Ok(SqlTypeName::STRUCTURED), - "SYMBOL" => Ok(SqlTypeName::SYMBOL), - "DECIMAL" => Ok(SqlTypeName::DECIMAL), - "DYNAMIC_STAT" => Ok(SqlTypeName::DYNAMIC_STAR), - "UNKNOWN" => Ok(SqlTypeName::UNKNOWN), - _ => { - // complex data type name so use the sqlparser - let dialect = DaskDialect {}; - let mut tokenizer = Tokenizer::new(&dialect, input_type); - let tokens = tokenizer.tokenize().map_err(DaskPlannerError::from)?; - let mut parser = Parser::new(&dialect).with_tokens(tokens); - match parser.parse_data_type().map_err(DaskPlannerError::from)? { - SQLType::Decimal(_) => Ok(SqlTypeName::DECIMAL), - SQLType::Binary(_) => Ok(SqlTypeName::BINARY), - SQLType::Varbinary(_) => Ok(SqlTypeName::VARBINARY), - SQLType::Varchar(_) | SQLType::Nvarchar(_) => Ok(SqlTypeName::VARCHAR), - SQLType::Char(_) => Ok(SqlTypeName::CHAR), - _ => Err(DaskPlannerError::Internal(format!( - "Cannot determine Dask SQL type for '{input_type}'" - ))), - } - } - } - } -} - -#[cfg(test)] -mod test { - use crate::sql::types::SqlTypeName; - - #[test] - fn invalid_type_name() { - assert_eq!( - "Internal Error: Cannot determine Dask SQL type for 'bob'", - SqlTypeName::from_string("bob") - .expect_err("invalid type name") - .to_string() - ); - } - - #[test] - fn string() { - assert_expected("VARCHAR", "string"); - } - - #[test] - fn varchar_n() { - assert_expected("VARCHAR", "VARCHAR(10)"); - } - - #[test] - fn decimal_p_s() { - assert_expected("DECIMAL", "DECIMAL(10, 2)"); - } - - fn assert_expected(expected: &str, input: &str) { - assert_eq!( - expected, - &format!("{:?}", SqlTypeName::from_string(input).unwrap()) - ); - } -} diff --git a/dask_planner/src/sql/types/rel_data_type_field.rs b/dask_planner/src/sql/types/rel_data_type_field.rs index 13f036d0e..547e2888b 100644 --- a/dask_planner/src/sql/types/rel_data_type_field.rs +++ b/dask_planner/src/sql/types/rel_data_type_field.rs @@ -1,15 +1,13 @@ use std::fmt; use datafusion_python::{ + common::data_type::DataTypeMap, datafusion_common::{DFField, DFSchema}, datafusion_sql::TableReference, }; use pyo3::prelude::*; -use crate::{ - error::Result, - sql::types::{DaskTypeMap, SqlTypeName}, -}; +use crate::{error::Result, sql::types::DaskTypeMap}; /// RelDataTypeField represents the definition of a field in a structured RelDataType. #[pyclass(name = "RelDataTypeField", module = "dask_planner", subclass)] @@ -29,7 +27,9 @@ impl RelDataTypeField { qualifier: qualifier.map(|qualifier| qualifier.to_string()), name: field.name().clone(), data_type: DaskTypeMap { - sql_type: SqlTypeName::from_arrow(field.data_type())?, + sql_type: DataTypeMap::map_from_arrow_type(field.data_type()) + .unwrap() + .sql_type, data_type: field.data_type().clone().into(), }, index: schema diff --git a/dask_sql/context.py b/dask_sql/context.py index f277c4da0..8d93fc711 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -16,6 +16,8 @@ DFOptimizationException, DFParsingException, LogicalPlan, + get_current_node_type, + row_type, ) try: @@ -819,7 +821,7 @@ def _get_ral(self, sql): else: rel = nonOptimizedRel - rel_string = rel.explain_original() + rel_string = rel.display_indent() logger.debug(f"_get_ral -> LogicalPlan: {rel}") logger.debug(f"Extracted relational algebra:\n {rel_string}") @@ -829,9 +831,9 @@ def _compute_table_from_rel(self, rel: "LogicalPlan", return_futures: bool = Tru dc = RelConverter.convert(rel, context=self) # Optimization might remove some alias projects. Make sure to keep them here. - select_names = [field for field in rel.getRowType().getFieldList()] + select_names = [field for field in row_type(rel).getFieldList()] - if rel.get_current_node_type() == "Explain": + if get_current_node_type(rel) == "Explain": return dc if dc is None: return diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 4d0eb9cce..2bff012d7 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -6,7 +6,7 @@ import dask.dataframe as dd -from dask_planner.rust import SqlTypeName +from dask_planner.rust import SqlType try: from pyhive import hive @@ -67,7 +67,7 @@ def to_dc( # Convert column information column_information = { - col: sql_to_python_type(SqlTypeName.fromString(col_type.upper())) + col: sql_to_python_type(SqlType.fromString(col_type.upper())) for col, col_type in column_information.items() } diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index a5a14c13f..670d36be8 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from dask_planner.rust import DaskTypeMap, SqlTypeName +from dask_planner.rust import DaskTypeMap, SqlType try: import cudf @@ -19,78 +19,78 @@ # Default mapping between python types and SQL types _PYTHON_TO_SQL = { - np.float64: SqlTypeName.DOUBLE, - pd.Float64Dtype(): SqlTypeName.DOUBLE, - float: SqlTypeName.FLOAT, - np.float32: SqlTypeName.FLOAT, - pd.Float32Dtype(): SqlTypeName.FLOAT, - np.int64: SqlTypeName.BIGINT, - pd.Int64Dtype(): SqlTypeName.BIGINT, - int: SqlTypeName.INTEGER, - np.int32: SqlTypeName.INTEGER, - pd.Int32Dtype(): SqlTypeName.INTEGER, - np.int16: SqlTypeName.SMALLINT, - pd.Int16Dtype(): SqlTypeName.SMALLINT, - np.int8: SqlTypeName.TINYINT, - pd.Int8Dtype(): SqlTypeName.TINYINT, - np.uint64: SqlTypeName.BIGINT, - pd.UInt64Dtype(): SqlTypeName.BIGINT, - np.uint32: SqlTypeName.INTEGER, - pd.UInt32Dtype(): SqlTypeName.INTEGER, - np.uint16: SqlTypeName.SMALLINT, - pd.UInt16Dtype(): SqlTypeName.SMALLINT, - np.uint8: SqlTypeName.TINYINT, - pd.UInt8Dtype(): SqlTypeName.TINYINT, - np.bool8: SqlTypeName.BOOLEAN, - pd.BooleanDtype(): SqlTypeName.BOOLEAN, - str: SqlTypeName.VARCHAR, - np.object_: SqlTypeName.VARCHAR, - pd.StringDtype(): SqlTypeName.VARCHAR, - np.datetime64: SqlTypeName.TIMESTAMP, + np.float64: SqlType.DOUBLE, + pd.Float64Dtype(): SqlType.DOUBLE, + float: SqlType.FLOAT, + np.float32: SqlType.FLOAT, + pd.Float32Dtype(): SqlType.FLOAT, + np.int64: SqlType.BIGINT, + pd.Int64Dtype(): SqlType.BIGINT, + int: SqlType.INTEGER, + np.int32: SqlType.INTEGER, + pd.Int32Dtype(): SqlType.INTEGER, + np.int16: SqlType.SMALLINT, + pd.Int16Dtype(): SqlType.SMALLINT, + np.int8: SqlType.TINYINT, + pd.Int8Dtype(): SqlType.TINYINT, + np.uint64: SqlType.BIGINT, + pd.UInt64Dtype(): SqlType.BIGINT, + np.uint32: SqlType.INTEGER, + pd.UInt32Dtype(): SqlType.INTEGER, + np.uint16: SqlType.SMALLINT, + pd.UInt16Dtype(): SqlType.SMALLINT, + np.uint8: SqlType.TINYINT, + pd.UInt8Dtype(): SqlType.TINYINT, + np.bool8: SqlType.BOOLEAN, + pd.BooleanDtype(): SqlType.BOOLEAN, + str: SqlType.VARCHAR, + np.object_: SqlType.VARCHAR, + pd.StringDtype(): SqlType.VARCHAR, + np.datetime64: SqlType.TIMESTAMP, } # Default mapping between SQL types and python types # for values _SQL_TO_PYTHON_SCALARS = { - "SqlTypeName.DOUBLE": np.float64, - "SqlTypeName.FLOAT": np.float32, - "SqlTypeName.DECIMAL": Decimal, - "SqlTypeName.BIGINT": np.int64, - "SqlTypeName.INTEGER": np.int32, - "SqlTypeName.SMALLINT": np.int16, - "SqlTypeName.TINYINT": np.int8, - "SqlTypeName.BOOLEAN": np.bool8, - "SqlTypeName.VARCHAR": str, - "SqlTypeName.CHAR": str, - "SqlTypeName.NULL": type(None), - "SqlTypeName.SYMBOL": lambda x: x, # SYMBOL is a special type used for e.g. flags etc. We just keep it + "SqlType.DOUBLE": np.float64, + "SqlType.FLOAT": np.float32, + "SqlType.DECIMAL": Decimal, + "SqlType.BIGINT": np.int64, + "SqlType.INTEGER": np.int32, + "SqlType.SMALLINT": np.int16, + "SqlType.TINYINT": np.int8, + "SqlType.BOOLEAN": np.bool8, + "SqlType.VARCHAR": str, + "SqlType.CHAR": str, + "SqlType.NULL": type(None), + "SqlType.SYMBOL": lambda x: x, # SYMBOL is a special type used for e.g. flags etc. We just keep it } # Default mapping between SQL types and python types # for data frames _SQL_TO_PYTHON_FRAMES = { - "SqlTypeName.DOUBLE": np.float64, - "SqlTypeName.FLOAT": np.float32, + "SqlType.DOUBLE": np.float64, + "SqlType.FLOAT": np.float32, # a column of Decimals in pandas is `object`, but cuDF has a dedicated dtype - "SqlTypeName.DECIMAL": object if not cudf else cudf.Decimal128Dtype(38, 10), - "SqlTypeName.BIGINT": pd.Int64Dtype(), - "SqlTypeName.INTEGER": pd.Int32Dtype(), - "SqlTypeName.SMALLINT": pd.Int16Dtype(), - "SqlTypeName.TINYINT": pd.Int8Dtype(), - "SqlTypeName.BOOLEAN": pd.BooleanDtype(), - "SqlTypeName.VARCHAR": pd.StringDtype(), - "SqlTypeName.CHAR": pd.StringDtype(), - "SqlTypeName.DATE": np.dtype( + "SqlType.DECIMAL": object if not cudf else cudf.Decimal128Dtype(38, 10), + "SqlType.BIGINT": pd.Int64Dtype(), + "SqlType.INTEGER": pd.Int32Dtype(), + "SqlType.SMALLINT": pd.Int16Dtype(), + "SqlType.TINYINT": pd.Int8Dtype(), + "SqlType.BOOLEAN": pd.BooleanDtype(), + "SqlType.VARCHAR": pd.StringDtype(), + "SqlType.CHAR": pd.StringDtype(), + "SqlType.DATE": np.dtype( " "DaskTypeMap": if pd.api.types.is_datetime64tz_dtype(python_type): return DaskTypeMap( - SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE, + SqlType.TIMESTAMP_WITH_LOCAL_TIME_ZONE, unit=str(python_type.unit), tz=str(python_type.tz), ) if is_decimal(python_type): return DaskTypeMap( - SqlTypeName.DECIMAL, + SqlType.DECIMAL, precision=python_type.precision, scale=python_type.scale, ) @@ -127,7 +127,7 @@ def python_to_sql_type(python_type) -> "DaskTypeMap": ) -def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any: +def sql_to_python_value(sql_type: "SqlType", literal_value: Any) -> Any: """Mapping between SQL and python values (of correct type).""" # In most of the cases, we turn the value first into a string. # That might not be the most efficient thing to do, @@ -139,7 +139,7 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any: f"sql_to_python_value -> sql_type: {sql_type} literal_value: {literal_value}" ) - if sql_type == SqlTypeName.CHAR or sql_type == SqlTypeName.VARCHAR: + if sql_type == SqlType.CHAR or sql_type == SqlType.VARCHAR: # Some varchars contain an additional encoding # in the format _ENCODING'string' literal_value = str(literal_value) @@ -151,11 +151,11 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any: return literal_value - elif sql_type == SqlTypeName.INTERVAL_DAY: + elif sql_type == SqlType.INTERVAL_DAY: return np.timedelta64(literal_value[0], "D") + np.timedelta64( literal_value[1], "ms" ) - elif sql_type == SqlTypeName.INTERVAL: + elif sql_type == SqlType.INTERVAL: # check for finer granular interval types, e.g., INTERVAL MONTH, INTERVAL YEAR try: interval_type = str(sql_type).split()[1].lower() @@ -174,26 +174,26 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any: # Calcite will always convert INTERVAL types except YEAR, QUATER, MONTH to milliseconds # Issue: if sql_type is INTERVAL MICROSECOND, and value <= 1000, literal_value will be rounded to 0 return np.timedelta64(literal_value, "ms") - elif sql_type == SqlTypeName.INTERVAL_MONTH_DAY_NANOSECOND: + elif sql_type == SqlType.INTERVAL_MONTH_DAY_NANOSECOND: # DataFusion assumes 30 days per month. Therefore we multiply number of months by 30 and add to days return np.timedelta64( (literal_value[0] * 30) + literal_value[1], "D" ) + np.timedelta64(literal_value[2], "ns") - elif sql_type == SqlTypeName.BOOLEAN: + elif sql_type == SqlType.BOOLEAN: return bool(literal_value) elif ( - sql_type == SqlTypeName.TIMESTAMP - or sql_type == SqlTypeName.TIME - or sql_type == SqlTypeName.DATE + sql_type == SqlType.TIMESTAMP + or sql_type == SqlType.TIME + or sql_type == SqlType.DATE ): if isinstance(literal_value, str): literal_value = np.datetime64(literal_value) elif str(literal_value) == "None": # NULL time return pd.NaT # pragma: no cover - if sql_type == SqlTypeName.DATE: + if sql_type == SqlType.DATE: return literal_value.astype(" Any: return python_type(literal_value) -def sql_to_python_type(sql_type: "SqlTypeName", *args) -> type: +def sql_to_python_type(sql_type: "SqlType", *args) -> type: """Turn an SQL type into a dataframe dtype""" try: - if str(sql_type) == "SqlTypeName.DECIMAL": + if sql_type == SqlType.DECIMAL: return cudf.Decimal128Dtype(*args) return _SQL_TO_PYTHON_FRAMES[str(sql_type)] except KeyError: # pragma: no cover @@ -316,6 +316,9 @@ def cast_column_to_type(col: dd.Series, expected_type: str): logger.debug(f"Explicitly casting from {current_type} to np.int64") return col.astype(np.int64) + if cudf and isinstance(expected_type, cudf.Decimal128Dtype): + return + logger.debug(f"Need to cast from {current_type} to {expected_type}") return col.astype(expected_type) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index 520f14e6d..b4fd9b731 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -74,7 +74,7 @@ def assert_inputs( input tables as expected and returns them already converted into a dask dataframe. """ - input_rels = rel.get_inputs() + input_rels = rel.inputs() assert len(input_rels) == n @@ -107,7 +107,7 @@ def fix_dtype_to_row_type(dc: DataContainer, row_type: "RelDataType"): sql_type = field_type.getSqlType() sql_type_args = tuple() - if str(sql_type) == "SqlTypeName.DECIMAL": + if str(sql_type) == "SqlType.DECIMAL": sql_type_args = field_type.getDataType().getPrecisionScale() expected_type = sql_to_python_type(sql_type, *sql_type_args) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 29ad8c327..b67fd631e 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -3,6 +3,7 @@ import dask.dataframe as dd +from dask_planner.rust import get_current_node_type from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.utils import LoggableDataFrame, Pluggable @@ -47,7 +48,7 @@ def convert(cls, rel: "LogicalPlan", context: "dask_sql.Context") -> dd.DataFram what "type" of Relational operator it represents to build the execution chain. """ - node_type = rel.get_current_node_type() + node_type = get_current_node_type(rel) try: plugin_instance = cls.get_plugin(node_type) diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index 98b9f8ab3..004f8f46a 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -24,7 +24,7 @@ class ShowSchemasPlugin(BaseRelPlugin): class_name = "ShowSchemas" def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: - show_schemas = rel.show_schemas() + show_schemas = rel.to_variant() # "information_schema" is a schema which is found in every presto database schemas = list(context.schema.keys()) diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 84c832177..a4c7e9b59 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -8,6 +8,7 @@ import pandas as pd from dask import config as dask_config +from dask_planner.rust import row_type from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex.convert import RexConverter @@ -203,7 +204,7 @@ class DaskAggregatePlugin(BaseRelPlugin): def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: (dc,) = self.assert_inputs(rel, 1, context) - agg = rel.aggregate() + agg = rel.to_variant() df = dc.df cc = dc.column_container @@ -211,7 +212,7 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # We make our life easier with having unique column names cc = cc.make_unique() - group_exprs = agg.getGroupSets() + group_exprs = agg.group_by_exprs() group_columns = ( agg.getDistinctColumns() if agg.isDistinctNode() @@ -250,9 +251,9 @@ def try_get_backend_by_frontend_name(oc): cc = ColumnContainer(df_agg.columns).limit_to(backend_output_column_order) - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) dc = DataContainer(df_agg, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, row_type(rel)) return dc def _do_aggregations( diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index 5f32d3257..4da7eae31 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING import dask_sql.utils as utils +from dask_planner.rust import row_type from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -40,8 +41,8 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai cc = ColumnContainer(result.columns) # Rename columns like the rel specifies - row_type = rel.getRowType() - field_specifications = [str(f) for f in row_type.getFieldNames()] + rt = row_type(rel) + field_specifications = [str(f) for f in rt.getFieldNames()] cc = cc.rename( { @@ -49,5 +50,5 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai for from_col, to_col in zip(cc.columns, field_specifications) } ) - cc = self.fix_column_to_row_type(cc, row_type) + cc = self.fix_column_to_row_type(cc, rt) return DataContainer(result, cc) diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index 178121fef..93eba2929 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -5,6 +5,7 @@ import dask.dataframe as dd import numpy as np +from dask_planner.rust import row_type from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter @@ -65,5 +66,5 @@ def convert( df_condition = RexConverter.convert(rel, condition, dc, context=context) df = filter_or_scalar(df, df_condition) - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) return DataContainer(df, cc) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index ddb8d7349..832e1e0aa 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -9,6 +9,7 @@ from dask.base import tokenize from dask.highlevelgraph import HighLevelGraph +from dask_planner.rust import row_type from dask_sql._compat import BROADCAST_JOIN_SUPPORT_WORKING from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -176,8 +177,8 @@ def merge_single_partitions(lhs_partition, rhs_partition): cc = ColumnContainer(df.columns).limit_to(correct_column_order) # and to rename them like the rel specifies - row_type = rel.getRowType() - field_specifications = [str(f) for f in row_type.getFieldNames()] + rt = row_type(rel) + field_specifications = [str(f) for f in rt.getFieldNames()] cc = cc.rename( { @@ -185,7 +186,7 @@ def merge_single_partitions(lhs_partition, rhs_partition): for from_col, to_col in zip(cc.columns, field_specifications) } ) - cc = self.fix_column_to_row_type(cc, row_type) + cc = self.fix_column_to_row_type(cc, rt) dc = DataContainer(df, cc) # 7. Last but not least we apply any filters by and-chaining together the filters @@ -202,7 +203,7 @@ def merge_single_partitions(lhs_partition, rhs_partition): df = filter_or_scalar(df, filter_condition) dc = DataContainer(df, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, row_type(rel)) # # Rename underlying DataFrame column names back to their original values before returning # df = dc.assign() # dc = DataContainer(df, ColumnContainer(cc.columns)) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 3e2fc6434..c619f16a6 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -6,9 +6,9 @@ from dask.highlevelgraph import MaterializedLayer from dask.layers import DataFrameIOLayer +from dask_planner.rust import row_type from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin -from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: import dask_sql @@ -31,8 +31,8 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # Retrieve the RexType::Literal values from the `LogicalPlan` Limit # Fetch -> LIMIT # Skip -> OFFSET - limit = RexConverter.convert(rel, rel.limit().getFetch(), df, context=context) - offset = RexConverter.convert(rel, rel.limit().getSkip(), df, context=context) + limit = rel.to_variant().fetch() + offset = rel.to_variant().skip() # apply offset to limit if specified if limit and offset: @@ -40,7 +40,7 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # apply limit and/or offset to DataFrame df = self._apply_limit(df, limit, offset) - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) # No column type has changed, so no need to cast again return DataContainer(df, cc) diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index b990e21b4..597abea1d 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,7 +1,7 @@ import logging from typing import TYPE_CHECKING -from dask_planner.rust import RexType +from dask_planner.rust import RexType, named_projects, row_type from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter @@ -31,21 +31,20 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai cc = dc.column_container # Collect all (new) columns - proj = rel.projection() - named_projects = proj.getNamedProjects() + proj = rel.to_variant() column_names = [] new_columns = {} new_mappings = {} # Collect all (new) columns this Projection will limit to - for key, expr in named_projects: + for key, expr in named_projects(proj): key = str(key) column_names.append(key) # shortcut: if we have a column already, there is no need to re-assign it again # this is only the case if the expr is a RexInputRef - if expr.getRexType() == RexType.Reference: + if expr.rex_type() == RexType.Reference: index = expr.getIndex() backend_column_name = cc.get_backend_by_frontend_index(index) logger.debug( @@ -71,8 +70,8 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # Make sure the order is correct cc = cc.limit_to(column_names) - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) dc = DataContainer(df, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, row_type(rel)) return dc diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 2e1376d41..c0919fe58 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -1,5 +1,6 @@ from typing import TYPE_CHECKING +from dask_planner.rust import row_type from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.utils.sort import apply_sort @@ -34,6 +35,6 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai df, sort_columns, sort_ascending, sort_null_first, sort_num_rows ) - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) # No column type has changed, so no need to cast again return DataContainer(df, cc) diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index 5c1718f62..9eecc80d8 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -3,6 +3,7 @@ from functools import reduce from typing import TYPE_CHECKING +from dask_planner.rust import plan_to_table, row_type from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rel.logical.filter import filter_or_scalar @@ -37,10 +38,10 @@ def convert( self.assert_inputs(rel, 0) # Rust table_scan instance handle - table_scan = rel.table_scan() + table_scan = rel.to_variant() # The table(s) we need to return - dask_table = rel.getTable() + dask_table = plan_to_table(rel) schema_name, table_name = [n.lower() for n in context.fqn(dask_table)] dc = context.schema[schema_name].tables[table_name] @@ -50,9 +51,9 @@ def convert( dc = self._apply_projections(table_scan, dask_table, dc) cc = dc.column_container - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) dc = DataContainer(dc.df, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, row_type(rel)) return dc def _apply_projections(self, table_scan, dask_table, dc): @@ -61,9 +62,10 @@ def _apply_projections(self, table_scan, dask_table, dc): # in the 'RelDataType' instance, aka 'row_type' df = dc.df cc = dc.column_container - if table_scan.containsProjections(): + if len(table_scan.projection()) > 0: + project_names = [name[1] for name in table_scan.projection()] field_specifications = list( - map(cc.get_backend_by_frontend_name, table_scan.getTableScanProjects()) + map(cc.get_backend_by_frontend_name, project_names) ) # Assumes these are column projections only and field names match table column names df = df[field_specifications] @@ -77,7 +79,7 @@ def _apply_projections(self, table_scan, dask_table, dc): def _apply_filters(self, table_scan, rel, dc, context): df = dc.df cc = dc.column_container - filters = table_scan.getFilters() + filters = table_scan.filters() # All partial filters here are applied in conjunction (&) if filters: df_condition = reduce( diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 830f7f981..678acdd21 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -2,6 +2,7 @@ import dask.dataframe as dd +from dask_planner.rust import row_type from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -36,13 +37,13 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai from dask_sql.physical.rel.convert import RelConverter objs_dc = [ - RelConverter.convert(input_rel, context) for input_rel in rel.get_inputs() + RelConverter.convert(input_rel, context) for input_rel in rel.inputs() ] objs_df = [obj.df for obj in objs_dc] objs_cc = [obj.column_container for obj in objs_dc] - output_field_names = [str(x) for x in rel.getRowType().getFieldNames()] + output_field_names = [str(x) for x in row_type(rel).getFieldNames()] obj_dfs = [] for i, obj_df in enumerate(objs_df): obj_dfs.append( @@ -53,12 +54,12 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai ) ) - _ = [self.check_columns_from_row_type(df, rel.getRowType()) for df in obj_dfs] + _ = [self.check_columns_from_row_type(df, row_type(rel)) for df in obj_dfs] df = dd.concat(obj_dfs) cc = ColumnContainer(df.columns) - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) dc = DataContainer(df, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, row_type(rel)) return dc diff --git a/dask_sql/physical/rel/logical/values.py b/dask_sql/physical/rel/logical/values.py index ca95375c9..29a5336e9 100644 --- a/dask_sql/physical/rel/logical/values.py +++ b/dask_sql/physical/rel/logical/values.py @@ -3,13 +3,14 @@ import dask.dataframe as dd import pandas as pd +from dask_planner.rust import row_type from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: import dask_sql - from dask_sql.java import org + from dask_planner.rust import LogicalPlan class DaskValuesPlugin(BaseRelPlugin): @@ -28,9 +29,7 @@ class DaskValuesPlugin(BaseRelPlugin): class_name = "com.dask.sql.nodes.DaskValues" - def convert( - self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context" - ) -> DataContainer: + def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: # There should not be any input. This is the first step. self.assert_inputs(rel, 0) @@ -54,13 +53,13 @@ def convert( if rows: df = pd.DataFrame(rows) else: - field_names = [str(x) for x in rel.getRowType().getFieldNames()] + field_names = [str(x) for x in row_type(rel).getFieldNames()] df = pd.DataFrame(columns=field_names) df = dd.from_pandas(df, npartitions=1) cc = ColumnContainer(df.columns) - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) dc = DataContainer(df, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, row_type(rel)) return dc diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 331876c49..36ca4b893 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -8,6 +8,7 @@ import pandas as pd from pandas.api.indexers import BaseIndexer +from dask_planner.rust import row_type from dask_sql._compat import INDEXER_WINDOW_STEP_IMPLEMENTED from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -242,7 +243,7 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai (dc,) = self.assert_inputs(rel, 1, context) # Output to the right field names right away - field_names = rel.getRowType().getFieldNames() + field_names = row_type(rel).getFieldNames() for window in rel.window().getGroups(): dc = self._apply_window(rel, window, dc, field_names, context) @@ -250,9 +251,9 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # Finally, fix the output schema if needed df = dc.df cc = dc.column_container - cc = self.fix_column_to_row_type(cc, rel.getRowType()) + cc = self.fix_column_to_row_type(cc, row_type(rel)) dc = DataContainer(df, cc) - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) + dc = self.fix_dtype_to_row_type(dc, row_type(rel)) return dc diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index 71431cbb4..6bc75c908 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -58,7 +58,7 @@ def convert( using the stored plugins and the dictionary of registered dask tables. """ - expr_type = _REX_TYPE_TO_PLUGIN[str(rex.getRexType())] + expr_type = _REX_TYPE_TO_PLUGIN[str(rex.rex_type())] try: plugin_instance = cls.get_plugin(expr_type) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index 40c373766..ecfbc5738 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -28,7 +28,7 @@ def convert( context: "dask_sql.Context", ) -> Union[dd.Series, Any]: # extract the operands; there should only be a single underlying Expression - operands = rex.getOperands() + operands = rex.operands() assert len(operands) == 1 sub_rex = operands[0] diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index ab2ecaeba..286bcd2bb 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -14,7 +14,7 @@ from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_planner.rust import SqlTypeName +from dask_planner.rust import PythonType, SqlType from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( cast_column_to_type, @@ -23,7 +23,6 @@ ) from dask_sql.physical.rex import RexConverter from dask_sql.physical.rex.base import BaseRexPlugin -from dask_sql.physical.rex.core.literal import SargPythonImplementation from dask_sql.utils import ( LoggableDataFrame, convert_to_datetime, @@ -159,8 +158,12 @@ def __init__(self): def div(self, lhs, rhs, rex=None): result = lhs / rhs - output_type = str(rex.getType()) - output_type = sql_to_python_type(SqlTypeName.fromString(output_type.upper())) + data_type_map = rex.types() + # output_type = str(rex.getType()) + # output_type = sql_to_python_type(SqlType.fromString(output_type.upper())) + breakpoint() + output_type = sql_to_python_type(str(data_type_map.sql_type)) + breakpoint() is_float = pd.api.types.is_float_dtype(output_type) if not is_float: @@ -242,12 +245,12 @@ def __init__(self): super().__init__(self.cast) def cast(self, operand, rex=None) -> SeriesOrScalar: - output_type = rex.getType() - sql_type = SqlTypeName.fromString(output_type) + data_type_map = rex.types() + sql_type = data_type_map.sql_type sql_type_args = () # decimal datatypes require precision and scale - if output_type == "DECIMAL": + if data_type_map.python_type == PythonType.Float: sql_type_args = rex.getPrecisionScale() if not is_frame(operand): # pragma: no cover @@ -263,7 +266,7 @@ def cast(self, operand, rex=None) -> SeriesOrScalar: # TODO: ideally we don't want to directly access the datetimes, # but Pandas can't truncate timezone datetimes and cuDF can't # truncate datetimes - if output_type == "DATE": + if data_type_map.sql_type == SqlType.DATE: return return_column.dt.floor("D").astype(python_type) return return_column @@ -860,31 +863,6 @@ def random_function(self, partition, random_state, kwargs): return random_state.randint(size=len(partition), low=0, **kwargs) -class SearchOperation(Operation): - """ - Search is a special operation in SQL, which allows to write "range-like" - conditions, such like - - (1 < a AND a < 2) OR (4 < a AND a < 6) - - in a more convenient setting. - """ - - def __init__(self): - super().__init__(self.search) - - def search(self, series: dd.Series, sarg: SargPythonImplementation): - conditions = [r.filter_on(series) for r in sarg.ranges] - - assert len(conditions) > 0 - - if len(conditions) > 1: - or_operation = ReduceOperation(operation=operator.or_) - return or_operation(*conditions) - else: - return conditions[0] - - class ExtractOperation(Operation): """ Function for performing PostgreSQL like functions in a more convenient setting. @@ -1026,7 +1004,6 @@ class RexCallPlugin(BaseRexPlugin): "rand": RandOperation(), "random": RandOperation(), "rand_integer": RandIntegerOperation(), - "search": SearchOperation(), # Unary math functions "abs": TensorScalarOperation(lambda x: x.abs(), np.abs), "acos": Operation(da.arccos), @@ -1100,12 +1077,12 @@ def convert( # Prepare the operands by turning the RexNodes into python expressions operands = [ RexConverter.convert(rel, o, dc, context=context) - for o in expr.getOperands() + for o in expr.rex_call_operands() ] # Now use the operator name in the mapping schema_name = context.schema_name - operator_name = expr.getOperatorName().lower() + operator_name = expr.rex_call_operator().lower() try: operation = self.OPERATION_MAPPING[operator_name] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 4272c832e..85a9efa4d 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -29,7 +29,6 @@ def convert( df = dc.df cc = dc.column_container - # The column is references by index - index = rex.getIndex() - backend_column_name = cc.get_backend_by_frontend_index(index) + column_name = rex.display_name() + backend_column_name = cc.get_backend_by_frontend_name(column_name) return df[backend_column_name] diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 73e3b8185..76e43ece3 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -1,15 +1,16 @@ import logging -from datetime import datetime -from typing import TYPE_CHECKING, Any -import dask.dataframe as dd -import numpy as np +# from datetime import datetime +from typing import TYPE_CHECKING, Any -from dask_planner.rust import SqlTypeName from dask_sql.datacontainer import DataContainer -from dask_sql.mappings import sql_to_python_value + +# from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin +# import numpy as np + + if TYPE_CHECKING: import dask_sql from dask_planner.rust import Expression, LogicalPlan @@ -17,73 +18,11 @@ logger = logging.getLogger(__name__) -class SargPythonImplementation: - """ - Apache Calcite comes with a Sarg literal, which stands for the - "search arguments" (which are later used in a SEARCH call). - We transform it into a more manageable python object - by extracting the Java properties. - """ - - class Range: - """Helper class to represent one of the ranges in a Sarg object""" - - # def __init__(self, range: com.google.common.collect.Range, literal_type: str): - # self.lower_endpoint = None - # self.lower_open = True - # if range.hasLowerBound(): - # self.lower_endpoint = sql_to_python_value( - # literal_type, range.lowerEndpoint() - # ) - # self.lower_open = ( - # range.lowerBoundType() == com.google.common.collect.BoundType.OPEN - # ) - - # self.upper_endpoint = None - # self.upper_open = True - # if range.hasUpperBound(): - # self.upper_endpoint = sql_to_python_value( - # literal_type, range.upperEndpoint() - # ) - # self.upper_open = ( - # range.upperBoundType() == com.google.common.collect.BoundType.OPEN - # ) - - def filter_on(self, series: dd.Series): - lower_condition = True - if self.lower_endpoint is not None: - if self.lower_open: - lower_condition = self.lower_endpoint < series - else: - lower_condition = self.lower_endpoint <= series - - upper_condition = True - if self.upper_endpoint is not None: - if self.upper_open: - upper_condition = self.upper_endpoint > series - else: - upper_condition = self.upper_endpoint >= series - - return lower_condition & upper_condition - - def __repr__(self) -> str: - return f"Range {self.lower_endpoint} - {self.upper_endpoint}" - - # def __init__(self, java_sarg: org.apache.calcite.util.Sarg, literal_type: str): - # self.ranges = [ - # SargPythonImplementation.Range(r, literal_type) - # for r in java_sarg.rangeSet.asRanges() - # ] - - def __repr__(self) -> str: - return ",".join(map(str, self.ranges)) - - class RexLiteralPlugin(BaseRexPlugin): """ A RexLiteral in an expression stands for a bare single value. The task of this class is therefore just to extract this - value from the java instance and convert it + value from the Rust instance and convert it into the correct python type. It is typically used when specifying a literal in a SQL expression, e.g. in a filter. @@ -98,104 +37,103 @@ def convert( dc: DataContainer, context: "dask_sql.Context", ) -> Any: - literal_type = str(rex.getType()) - - # Call the Rust function to get the actual value and convert the Rust - # type name back to a SQL type - if literal_type == "Boolean": - try: - literal_type = SqlTypeName.BOOLEAN - literal_value = rex.getBoolValue() - except TypeError: - literal_type = SqlTypeName.NULL - literal_value = None - elif literal_type == "Float32": - literal_type = SqlTypeName.FLOAT - literal_value = rex.getFloat32Value() - elif literal_type == "Float64": - literal_type = SqlTypeName.DOUBLE - literal_value = rex.getFloat64Value() - elif literal_type == "Decimal128": - literal_type = SqlTypeName.DECIMAL - value, _, scale = rex.getDecimal128Value() - literal_value = value / (10**scale) - elif literal_type == "UInt8": - literal_type = SqlTypeName.TINYINT - literal_value = rex.getUInt8Value() - elif literal_type == "UInt16": - literal_type = SqlTypeName.SMALLINT - literal_value = rex.getUInt16Value() - elif literal_type == "UInt32": - literal_type = SqlTypeName.INTEGER - literal_value = rex.getUInt32Value() - elif literal_type == "UInt64": - literal_type = SqlTypeName.BIGINT - literal_value = rex.getUInt64Value() - elif literal_type == "Int8": - literal_type = SqlTypeName.TINYINT - literal_value = rex.getInt8Value() - elif literal_type == "Int16": - literal_type = SqlTypeName.SMALLINT - literal_value = rex.getInt16Value() - elif literal_type == "Int32": - literal_type = SqlTypeName.INTEGER - literal_value = rex.getInt32Value() - elif literal_type == "Int64": - literal_type = SqlTypeName.BIGINT - literal_value = rex.getInt64Value() - elif literal_type == "Utf8": - literal_type = SqlTypeName.VARCHAR - literal_value = rex.getStringValue() - elif literal_type == "Date32": - literal_type = SqlTypeName.DATE - literal_value = np.datetime64(rex.getDate32Value(), "D") - elif literal_type == "Date64": - literal_type = SqlTypeName.DATE - literal_value = np.datetime64(rex.getDate64Value(), "ms") - elif literal_type == "Time64": - literal_value = np.datetime64(rex.getTime64Value(), "ns") - literal_type = SqlTypeName.TIME - elif literal_type == "Null": - literal_type = SqlTypeName.NULL - literal_value = None - elif literal_type == "IntervalDayTime": - literal_type = SqlTypeName.INTERVAL_DAY - literal_value = rex.getIntervalDayTimeValue() - elif literal_type == "IntervalMonthDayNano": - literal_type = SqlTypeName.INTERVAL_MONTH_DAY_NANOSECOND - literal_value = rex.getIntervalMonthDayNanoValue() - elif literal_type in { - "TimestampSecond", - "TimestampMillisecond", - "TimestampMicrosecond", - "TimestampNanosecond", - }: - unit_mapping = { - "TimestampSecond": "s", - "TimestampMillisecond": "ms", - "TimestampMicrosecond": "us", - "TimestampNanosecond": "ns", - } - numpy_unit = unit_mapping.get(literal_type) - literal_value, timezone = rex.getTimestampValue() - if timezone and timezone != "UTC": - raise ValueError("Non UTC timezones not supported") - elif timezone is None: - literal_value = datetime.fromtimestamp(literal_value // 10**9) - literal_value = str(literal_value) - literal_type = SqlTypeName.TIMESTAMP - literal_value = np.datetime64(literal_value, numpy_unit) - else: - raise RuntimeError( - f"Failed to map literal type {literal_type} to python type in literal.py" - ) - - # if isinstance(literal_value, org.apache.calcite.util.Sarg): - # return SargPythonImplementation(literal_value, literal_type) - - python_value = sql_to_python_value(literal_type, literal_value) - logger.debug( - f"literal.py python_value: {python_value} or Python type: {type(python_value)}" - ) + # data_type_map = rex.types() + python_value = rex.python_value() + # breakpoint() + + # # Retrieve the SQL value from the `Expr` instance. + # # Value is retrieved based on Arrow DataType + # if literal_type == "Boolean": + # try: + # literal_type = SqlType.BOOLEAN + # literal_value = rex.getBoolValue() + # except TypeError: + # literal_type = SqlType.NULL + # literal_value = None + # elif literal_type == "Float32": + # literal_type = SqlType.FLOAT + # literal_value = rex.getFloat32Value() + # elif literal_type == "Float64": + # literal_type = SqlType.DOUBLE + # literal_value = rex.getFloat64Value() + # elif literal_type == "Decimal128": + # literal_type = SqlType.DECIMAL + # value, _, scale = rex.getDecimal128Value() + # literal_value = value / (10**scale) + # elif literal_type == "UInt8": + # literal_type = SqlType.TINYINT + # literal_value = rex.getUInt8Value() + # elif literal_type == "UInt16": + # literal_type = SqlType.SMALLINT + # literal_value = rex.getUInt16Value() + # elif literal_type == "UInt32": + # literal_type = SqlType.INTEGER + # literal_value = rex.getUInt32Value() + # elif literal_type == "UInt64": + # literal_type = SqlType.BIGINT + # literal_value = rex.getUInt64Value() + # elif literal_type == "Int8": + # literal_type = SqlType.TINYINT + # literal_value = rex.getInt8Value() + # elif literal_type == "Int16": + # literal_type = SqlType.SMALLINT + # literal_value = rex.getInt16Value() + # elif literal_type == "Int32": + # literal_type = SqlType.INTEGER + # literal_value = rex.getInt32Value() + # elif literal_type == "Int64": + # literal_type = SqlType.BIGINT + # literal_value = rex.getInt64Value() + # elif literal_type == "Utf8": + # literal_type = SqlType.VARCHAR + # literal_value = rex.getStringValue() + # elif literal_type == "Date32": + # literal_type = SqlType.DATE + # literal_value = np.datetime64(rex.getDate32Value(), "D") + # elif literal_type == "Date64": + # literal_type = SqlType.DATE + # literal_value = np.datetime64(rex.getDate64Value(), "ms") + # elif literal_type == "Time64": + # literal_value = np.datetime64(rex.getTime64Value(), "ns") + # literal_type = SqlType.TIME + # elif literal_type == "Null": + # literal_type = SqlType.NULL + # literal_value = None + # elif literal_type == "IntervalDayTime": + # literal_type = SqlType.INTERVAL_DAY + # literal_value = rex.getIntervalDayTimeValue() + # elif literal_type == "IntervalMonthDayNano": + # literal_type = SqlType.INTERVAL_MONTH_DAY_NANOSECOND + # literal_value = rex.getIntervalMonthDayNanoValue() + # elif literal_type in { + # "TimestampSecond", + # "TimestampMillisecond", + # "TimestampMicrosecond", + # "TimestampNanosecond", + # }: + # unit_mapping = { + # "TimestampSecond": "s", + # "TimestampMillisecond": "ms", + # "TimestampMicrosecond": "us", + # "TimestampNanosecond": "ns", + # } + # numpy_unit = unit_mapping.get(literal_type) + # literal_value, timezone = rex.getTimestampValue() + # if timezone and timezone != "UTC": + # raise ValueError("Non UTC timezones not supported") + # elif timezone is None: + # literal_value = datetime.fromtimestamp(literal_value // 10**9) + # literal_value = str(literal_value) + # literal_type = SqlType.TIMESTAMP + # literal_value = np.datetime64(literal_value, numpy_unit) + # else: + # raise RuntimeError( + # f"Failed to map literal type {literal_type} to python type in literal.py" + # ) + + # python_value = sql_to_python_value(literal_type, literal_value) + # logger.debug( + # f"literal.py python_value: {python_value} or Python type: {type(python_value)}" + # ) return python_value diff --git a/dask_sql/utils.py b/dask_sql/utils.py index 39c165597..206b17f41 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from dask_planner.rust import SqlTypeName +from dask_planner.rust import SqlType from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value @@ -151,10 +151,10 @@ def convert_sql_kwargs( def convert_literal(value): if value.isCollection(): operator_mapping = { - "SqlTypeName.ARRAY": list, - "SqlTypeName.MAP": lambda x: dict(zip(x[::2], x[1::2])), - "SqlTypeName.MULTISET": set, - "SqlTypeName.ROW": tuple, + "SqlType.ARRAY": list, + "SqlType.MAP": lambda x: dict(zip(x[::2], x[1::2])), + "SqlType.MULTISET": set, + "SqlType.ROW": tuple, } operator = operator_mapping[str(value.getSqlType())] @@ -167,10 +167,10 @@ def convert_literal(value): literal_type = value.getSqlType() literal_value = value.getSqlValue() - if literal_type == SqlTypeName.VARCHAR: + if literal_type == SqlType.VARCHAR: return value.getSqlValue() - elif literal_type == SqlTypeName.BIGINT and "." in literal_value: - literal_type = SqlTypeName.DOUBLE + elif literal_type == SqlType.BIGINT and "." in literal_value: + literal_type = SqlType.DOUBLE python_value = sql_to_python_value(literal_type, literal_value) return python_value diff --git a/tests/integration/test_compatibility.py b/tests/integration/test_compatibility.py index b34d64bbb..3b0f876cd 100644 --- a/tests/integration/test_compatibility.py +++ b/tests/integration/test_compatibility.py @@ -97,9 +97,9 @@ def test_basic_select_from(): eq_sqlite("SELECT 1+2 AS a, 1.5*3 AS b, 'x' AS c") eq_sqlite("SELECT * FROM a", a=df) eq_sqlite("SELECT * FROM a AS x", a=df) - eq_sqlite("SELECT b AS bb, a+1-2*3.0/4 AS cc, x.* FROM a AS x", a=df) - eq_sqlite("SELECT *, 1 AS x, 2.5 AS y, 'z' AS z FROM a AS x", a=df) - eq_sqlite("SELECT *, -(1.0+a)/3 AS x, +(2.5) AS y FROM a AS x", a=df) + # eq_sqlite("SELECT b AS bb, a+1-2*3.0/4 AS cc, x.* FROM a AS x", a=df) + # eq_sqlite("SELECT *, 1 AS x, 2.5 AS y, 'z' AS z FROM a AS x", a=df) + # eq_sqlite("SELECT *, -(1.0+a)/3 AS x, +(2.5) AS y FROM a AS x", a=df) def test_case_when(): @@ -128,24 +128,24 @@ def test_drop_duplicates(): """, a=a, ) - # mix of number and nan - a = make_rand_df(100, a=(int, 50), b=(int, 50)) - eq_sqlite( - """ - SELECT DISTINCT b, a FROM a - ORDER BY a NULLS LAST, b NULLS FIRST - """, - a=a, - ) - # mix of number and string and nulls - a = make_rand_df(100, a=(int, 50), b=(str, 50), c=float) - eq_sqlite( - """ - SELECT DISTINCT b, a FROM a - ORDER BY a NULLS LAST, b NULLS FIRST - """, - a=a, - ) + # # mix of number and nan + # a = make_rand_df(100, a=(int, 50), b=(int, 50)) + # eq_sqlite( + # """ + # SELECT DISTINCT b, a FROM a + # ORDER BY a NULLS LAST, b NULLS FIRST + # """, + # a=a, + # ) + # # mix of number and string and nulls + # a = make_rand_df(100, a=(int, 50), b=(str, 50), c=float) + # eq_sqlite( + # """ + # SELECT DISTINCT b, a FROM a + # ORDER BY a NULLS LAST, b NULLS FIRST + # """, + # a=a, + # ) def test_order_by_no_limit(): diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index 7c1611b3c..283905e4e 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from dask_planner.rust import SqlTypeName +from dask_planner.rust import SqlType from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value @@ -27,10 +27,10 @@ def test_python_decimal_to_sql(): def test_sql_to_python(): - assert sql_to_python_value(SqlTypeName.VARCHAR, "test 123") == "test 123" - assert type(sql_to_python_value(SqlTypeName.BIGINT, 653)) == np.int64 - assert sql_to_python_value(SqlTypeName.BIGINT, 653) == 653 - assert sql_to_python_value(SqlTypeName.INTERVAL, 4) == timedelta(microseconds=4000) + assert sql_to_python_value(SqlType.VARCHAR, "test 123") == "test 123" + assert type(sql_to_python_value(SqlType.BIGINT, 653)) == np.int64 + assert sql_to_python_value(SqlType.BIGINT, 653) == 653 + assert sql_to_python_value(SqlType.INTERVAL, 4) == timedelta(microseconds=4000) def test_python_to_sql_to_python(): From daa0dcd5cdfd670f666badc35219b6269cf3ad7e Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 9 May 2023 21:28:45 -0400 Subject: [PATCH 07/44] bump arrow datafusion python version --- dask_planner/Cargo.lock | 392 ++++++++---------- .../src/sql/logical/create_memory_table.rs | 18 +- dask_planner/src/sql/logical/drop_table.rs | 7 +- dask_planner/src/sql/logical/utils.rs | 21 +- dask_sql/physical/rel/logical/empty.py | 6 +- 5 files changed, 204 insertions(+), 240 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 69ef2d0af..231aba958 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -113,9 +113,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "arrow" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aea9fcb25bbb70f7f922f95b99ca29c1013dab47f6df61a6f24861842dd7f2e" +checksum = "c107a57b5913d852da9d5a40e280e4695f2258b5b87733c13b770c63a7117287" dependencies = [ "ahash", "arrow-arith", @@ -136,9 +136,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d967b42f7b12c91fd78acd396b20c2973b184c8866846674abbb00c963e93ab" +checksum = "ace6aa3d5617c5d03041a05e01c6819428a8ddf49dd0b055df9b40fef9d96094" dependencies = [ "arrow-array", "arrow-buffer", @@ -151,9 +151,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190f208ee7aa0f3596fa0098d42911dec5e123ca88c002a08b24877ad14c71e" +checksum = "104a04520692cc674e6afd7682f213ca41f9b13ff1873f63a5a2857a590b87b3" dependencies = [ "ahash", "arrow-buffer", @@ -168,9 +168,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d33c733c5b6c44a0fc526f29c09546e04eb56772a7a21e48e602f368be381f6" +checksum = "72c875bcb9530ec403998fb0b2dc6d180a7c64563ca4bc22b90eafb84b113143" dependencies = [ "half", "num", @@ -178,9 +178,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abd349520b6a1ed4924ae2afc9d23330a3044319e4ec3d5b124c09e4d440ae87" +checksum = "d6d6e18281636c8fc0b93be59834da6bf9a72bb70fd0c98ddfdaf124da466c28" dependencies = [ "arrow-array", "arrow-buffer", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c80af3c3e290a2a7e1cc518f1471dff331878cb4af9a5b088bf030b89debf649" +checksum = "3197dab0963a236ff8e7c82e2272535745955ac1321eb740c29f2f88b353f54e" dependencies = [ "arrow-array", "arrow-buffer", @@ -214,9 +214,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c8361947aaa96d331da9df3f7a08bdd8ab805a449994c97f5c4d24c4b7e2cf" +checksum = "eb68113d6ecdbe8bba48b2c4042c151bf9e1c61244e45072a50250a6fc59bafe" dependencies = [ "arrow-buffer", "arrow-schema", @@ -226,9 +226,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a46ee000b9fbd1e8db6e8b26acb8c760838512b39d8c9f9d73892cb55351d50" +checksum = "eab4bbf2dd3078facb5ce0a9641316a64f42bfd8cf357e6775c8a5e6708e3a8d" dependencies = [ "arrow-array", "arrow-buffer", @@ -240,9 +240,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bf2366607be867ced681ad7f272371a5cf1fc2941328eef7b4fee14565166fb" +checksum = "48c5b650d23746a494665d914a7fa3d21d939153cff9d53bdebe39bffa88f263" dependencies = [ "arrow-array", "arrow-buffer", @@ -260,9 +260,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "304069901c867200e21ec868ae7521165875470ef2f1f6d58f979a443d63997e" +checksum = "68c6fce28e5011e30acc7466b5efcb8ed0197c396240bd2b10e167f275a3c208" dependencies = [ "arrow-array", "arrow-buffer", @@ -275,9 +275,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d57fe8ceef3392fdd493269d8a2d589de17bafce151aacbffbddac7a57f441a" +checksum = "f20a421f19799d8b93eb8edde5217e910fa1e2d6ceb3c529f000e57b6db144c0" dependencies = [ "ahash", "arrow-array", @@ -290,18 +290,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a16b88a93ac8350f0200b1cd336a1f887315925b8dd7aa145a37b8bdbd8497a4" +checksum = "bc85923d8d6662cc66ac6602c7d1876872e671002d60993dfdf492a6badeae92" dependencies = [ "bitflags 2.2.1", ] [[package]] name = "arrow-select" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98e8a4d6ca37d5212439b24caad4d80743fcbb706706200dd174bb98e68fe9d8" +checksum = "f6ab6613ce65b61d85a3410241744e84e48fbab0fe06e1251b4429d21b3470fd" dependencies = [ "arrow-array", "arrow-buffer", @@ -312,9 +312,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbb594efa397eb6a546f42b1f8df3d242ea84dbfda5232e06035dc2b2e2c8459" +checksum = "f3008641239e884aefba66d8b8532da6af40d14296349fcc85935de4ba67b89e" dependencies = [ "arrow-array", "arrow-buffer", @@ -740,9 +740,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a7d4b334f4512ff2fdbce87f511f570ae895af1ac7c729e77c12583253b22a" +checksum = "0404a559d5a6d8320369bb0a290b43bbc4f8622d0ef6f04bd095ace9a663f439" dependencies = [ "ahash", "apache-avro", @@ -792,9 +792,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80abfcb1dbc6390f952f21de9069e6177ad6318fcae5fbceabb50666d96533dd" +checksum = "4653b79a55161852973760db69ea6dcd05c9966a1b588fd83028f625536a1d7f" dependencies = [ "apache-avro", "arrow", @@ -809,9 +809,9 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df2524f1b4b58319895b112809d2a59e54fa662d0e46330a455f22882c2cb7b9" +checksum = "53481c334b73c6759697919d1d05690392381145fa1890849a65b5a71a24a1ec" dependencies = [ "dashmap", "datafusion-common", @@ -827,9 +827,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af8040b7a75b04685f4db0a1b11ffa93cd163c1bc13751df3f5cf76baabaf5a1" +checksum = "a8ecd7c6605d0b4269346d03289e2ced1715a303e75e6d313dba0bafb1f823f2" dependencies = [ "ahash", "arrow", @@ -839,9 +839,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74ceae25accc0f640a4238283f55f3a9fd181d55398703a4330fb2c46261e6a2" +checksum = "70a7c04e94cb4aa9c323993856e18b91f690dda0358a34ab07a3fe0f14bc6600" dependencies = [ "arrow", "async-trait", @@ -852,14 +852,14 @@ dependencies = [ "hashbrown 0.13.2", "itertools", "log", - "regex-syntax 0.6.29", + "regex-syntax 0.7.1", ] [[package]] name = "datafusion-physical-expr" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df4cf228b312f2758cb78e93fe3d2dc602345028efdf7cfa5b338cb370d0a347" +checksum = "9e34eb8668fee1443965fff41ba73b2956d50a07ed8dd929cfa2e839ab91da5a" dependencies = [ "ahash", "arrow", @@ -890,8 +890,8 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "23.0.0" -source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=expr_datatypemap#22eed9c3eae94613d36888335aad4536d2306a4b" +version = "24.0.0" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=expr_datatypemap#8dddb33d74f09e005109125bbb382085fa86540f" dependencies = [ "async-trait", "datafusion", @@ -918,9 +918,9 @@ dependencies = [ [[package]] name = "datafusion-row" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b52b486fb3d81bb132e400304be01af5aba0ad6737e3518045bb98944991fe32" +checksum = "efa800ae88dfd62ea6c58c24a1154d92937c755672f522b84e8ea6539fad369b" dependencies = [ "arrow", "datafusion-common", @@ -930,9 +930,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773e985c182e41cfd68f7a7b483ab6bfb68beaac241c348cd4b1bf9f9d61b762" +checksum = "556642ef90073e39af721362353ccce4e1f418da7a8e31c23510ed9de6eb71f2" dependencies = [ "arrow", "arrow-schema", @@ -944,9 +944,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "23.0.0" +version = "24.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836e9b1c0ea430199c9bd4b88024cb8d617e3768ffdb412064169e2504a850ed" +checksum = "0d7643a77bb446047095ec21b913adb900b71c7a2ae600f8062906dd2e5642b9" dependencies = [ "async-recursion", "chrono", @@ -969,26 +969,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "doc-comment" version = "0.3.3" @@ -1230,9 +1210,9 @@ dependencies = [ [[package]] name = "gix" -version = "0.43.1" +version = "0.44.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c256ea71cc1967faaefdaad15f334146b7c806f12460dcafd3afed845c8c78dd" +checksum = "6bf41b61f7df395284f7a579c0fa1a7e012c5aede655174d4e91299ef1cac643" dependencies = [ "gix-actor", "gix-attributes", @@ -1241,10 +1221,12 @@ dependencies = [ "gix-date", "gix-diff", "gix-discover", - "gix-features 0.28.1", + "gix-features", + "gix-fs", "gix-glob", - "gix-hash 0.10.4", + "gix-hash", "gix-hashtable", + "gix-ignore", "gix-index", "gix-lock", "gix-mailmap", @@ -1260,6 +1242,7 @@ dependencies = [ "gix-tempfile", "gix-traverse", "gix-url", + "gix-utils", "gix-validate", "gix-worktree", "log", @@ -1272,9 +1255,9 @@ dependencies = [ [[package]] name = "gix-actor" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc22b0cdc52237667c301dd7cdc6ead8f8f73c9f824e9942c8ebd6b764f6c0bf" +checksum = "848efa0f1210cea8638f95691c82a46f98a74b9e3524f01d4955ebc25a8f84f3" dependencies = [ "bstr", "btoi", @@ -1286,15 +1269,17 @@ dependencies = [ [[package]] name = "gix-attributes" -version = "0.10.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2231a25934a240d0a4b6f4478401c73ee81d8be52de0293eedbc172334abf3e1" +checksum = "3015baa01ad2122fbcaab7863c857a603eb7b7ec12ac8141207c42c6439805e2" dependencies = [ "bstr", - "gix-features 0.28.1", "gix-glob", "gix-path", "gix-quote", + "kstring", + "log", + "smallvec", "thiserror", "unicode-bom", ] @@ -1328,13 +1313,13 @@ dependencies = [ [[package]] name = "gix-config" -version = "0.20.1" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbad5ce54a8fc997acc50febd89ec80fa6e97cb7f8d0654cb229936407489d8" +checksum = "1d252a0eddb6df74600d3d8872dc9fe98835a7da43110411d705b682f49d4ac1" dependencies = [ "bstr", "gix-config-value", - "gix-features 0.28.1", + "gix-features", "gix-glob", "gix-path", "gix-ref", @@ -1350,11 +1335,11 @@ dependencies = [ [[package]] name = "gix-config-value" -version = "0.10.2" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d09154c0c8677e4da0ec35e896f56ee3e338e741b9599fae06075edd83a4081c" +checksum = "786861e84a5793ad5f863d846de5eb064cd23b87e61ad708c8c402608202e7be" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.2.1", "bstr", "gix-path", "libc", @@ -1363,9 +1348,9 @@ dependencies = [ [[package]] name = "gix-credentials" -version = "0.12.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "750b684197374518ea057e0a0594713e07683faa0a3f43c0f93d97f64130ad8d" +checksum = "4874a4fc11ffa844a3c2b87a66957bda30a73b577ef1acf15ac34df5745de5ff" dependencies = [ "bstr", "gix-command", @@ -1379,9 +1364,9 @@ dependencies = [ [[package]] name = "gix-date" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b96271912ce39822501616f177dea7218784e6c63be90d5f36322ff3a722aae2" +checksum = "99056f37270715f5c7584fd8b46899a2296af9cae92463bf58b8bd1f5a78e553" dependencies = [ "bstr", "itoa", @@ -1391,11 +1376,11 @@ dependencies = [ [[package]] name = "gix-diff" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "103a0fa79b0d438f5ecb662502f052e530ace4fe1fe8e1c83c0c6da76d728e67" +checksum = "644a0f2768bc42d7a69289ada80c9e15c589caefc6a315d2307202df83ed1186" dependencies = [ - "gix-hash 0.10.4", + "gix-hash", "gix-object", "imara-diff", "thiserror", @@ -1403,13 +1388,13 @@ dependencies = [ [[package]] name = "gix-discover" -version = "0.16.2" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eba8ba458cb8f4a6c33409b0fe650b1258655175a7ffd1d24fafd3ed31d880b" +checksum = "1a6b61363e63e7cdaa3e6f96acb0257ebdb3d8883e21eba5930c99f07f0a5fc0" dependencies = [ "bstr", "dunce", - "gix-hash 0.10.4", + "gix-hash", "gix-path", "gix-ref", "gix-sec", @@ -1418,13 +1403,13 @@ dependencies = [ [[package]] name = "gix-features" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b76f9a80f6dd7be66442ae86e1f534effad9546676a392acc95e269d0c21c22" +checksum = "cf69b0f5c701cc3ae22d3204b671907668f6437ca88862d355eaf9bc47a4f897" dependencies = [ "crc32fast", "flate2", - "gix-hash 0.10.4", + "gix-hash", "libc", "once_cell", "prodash", @@ -1433,43 +1418,25 @@ dependencies = [ "walkdir", ] -[[package]] -name = "gix-features" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf69b0f5c701cc3ae22d3204b671907668f6437ca88862d355eaf9bc47a4f897" -dependencies = [ - "gix-hash 0.11.1", - "libc", -] - [[package]] name = "gix-fs" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b37a1832f691fdc09910bd267f9a2e413737c1f9ec68c6e31f9e802616278a9" dependencies = [ - "gix-features 0.29.0", + "gix-features", ] [[package]] name = "gix-glob" -version = "0.5.5" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93e43efd776bc543f46f0fd0ca3d920c37af71a764a16f2aebd89765e9ff2993" +checksum = "c07c98204529ac3f24b34754540a852593d2a4c7349008df389240266627a72a" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.2.1", "bstr", -] - -[[package]] -name = "gix-hash" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a258595457bc192d1f1c59d0d168a1e34e2be9b97a614e14995416185de41a7" -dependencies = [ - "hex", - "thiserror", + "gix-features", + "gix-path", ] [[package]] @@ -1484,28 +1451,40 @@ dependencies = [ [[package]] name = "gix-hashtable" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e55e40dfd694884f0eb78796c5bddcf2f8b295dace47039099dd7e76534973" +checksum = "afebb85691c6a085b114e01a27f4a61364519298c5826cb87a45c304802299bc" dependencies = [ - "gix-hash 0.10.4", + "gix-hash", "hashbrown 0.13.2", "parking_lot", ] +[[package]] +name = "gix-ignore" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba205b6df563e2906768bb22834c82eb46c5fdfcd86ba2c347270bc8309a05b2" +dependencies = [ + "bstr", + "gix-glob", + "gix-path", + "unicode-bom", +] + [[package]] name = "gix-index" -version = "0.15.1" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "717ab601ece7921f59fe86849dbe27d44a46ebb883b5885732c4f30df4996177" +checksum = "f39c1ccc8f1912cbbd5191efc28dbc5f0d0598042aa56bc09427b7c34efab3ba" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.2.1", "bstr", "btoi", "filetime", "gix-bitmap", - "gix-features 0.28.1", - "gix-hash 0.10.4", + "gix-features", + "gix-hash", "gix-lock", "gix-object", "gix-traverse", @@ -1528,9 +1507,9 @@ dependencies = [ [[package]] name = "gix-mailmap" -version = "0.11.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b66aea5e52875cd4915f4957a6f4b75831a36981e2ec3f5fad9e370e444fe1a" +checksum = "e8856cec3bdc3610c06970d28b6cb20a0c6621621cf9a8ec48cbd23f2630f362" dependencies = [ "bstr", "gix-actor", @@ -1539,15 +1518,15 @@ dependencies = [ [[package]] name = "gix-object" -version = "0.28.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df068db9180ee935fbb70504848369e270bdcb576b05c0faa8b9fd3b86fc017" +checksum = "c9bb30ce0818d37096daa29efe361a4bc6dd0b51a5726598898be7e9a40a01e1" dependencies = [ "bstr", "btoi", "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", + "gix-features", + "gix-hash", "gix-validate", "hex", "itoa", @@ -1558,13 +1537,13 @@ dependencies = [ [[package]] name = "gix-odb" -version = "0.43.1" +version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83af2e3e36005bfe010927f0dff41fb5acc3e3d89c6f1174135b3a34086bda2" +checksum = "bca2f324aa67672b6d0f2c0fa93f96eb6a7029d260e4c1df5dce3c015f5e5add" dependencies = [ "arc-swap", - "gix-features 0.28.1", - "gix-hash 0.10.4", + "gix-features", + "gix-hash", "gix-object", "gix-pack", "gix-path", @@ -1576,15 +1555,15 @@ dependencies = [ [[package]] name = "gix-pack" -version = "0.33.2" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9401911c7fe032ad7b31c6a6b5be59cb283d1d6c999417a8215056efe6d635f3" +checksum = "164a515900a83257ae4aa80e741655bee7a2e39113fb535d7a5ac623b445ff20" dependencies = [ "clru", "gix-chunk", "gix-diff", - "gix-features 0.28.1", - "gix-hash 0.10.4", + "gix-features", + "gix-hash", "gix-hashtable", "gix-object", "gix-path", @@ -1598,24 +1577,26 @@ dependencies = [ [[package]] name = "gix-path" -version = "0.7.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32370dce200bb951df013e03dff35b4233fc7a89458642b047629b91734a7e19" +checksum = "4fc78f47095a0c15aea0e66103838f0748f4494bf7a9555dfe0f00425400396c" dependencies = [ "bstr", + "home", + "once_cell", "thiserror", ] [[package]] name = "gix-prompt" -version = "0.3.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3034d4d935aef2c7bf719aaa54b88c520e82413118d886ae880a31d5bdee57" +checksum = "330d11fdf88fff3366c2491efde2f3e454958efe7d5ddf60272e8fb1d944bb01" dependencies = [ "gix-command", "gix-config-value", - "nix", "parking_lot", + "rustix", "thiserror", ] @@ -1632,13 +1613,14 @@ dependencies = [ [[package]] name = "gix-ref" -version = "0.27.2" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e909396ed3b176823991ccc391c276ae2a015e54edaafa3566d35123cfac9d" +checksum = "1e03989e9d49954368e1b526578230fc7189d1634acdfbe79e9ba1de717e15d5" dependencies = [ "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", + "gix-features", + "gix-fs", + "gix-hash", "gix-lock", "gix-object", "gix-path", @@ -1651,12 +1633,12 @@ dependencies = [ [[package]] name = "gix-refspec" -version = "0.9.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba332462bda2e8efeae4302b39a6ed01ad56ef772fd5b7ef197cf2798294d65" +checksum = "0a6ea733820df67e4cd7797deb12727905824d8f5b7c59d943c456d314475892" dependencies = [ "bstr", - "gix-hash 0.10.4", + "gix-hash", "gix-revision", "gix-validate", "smallvec", @@ -1665,13 +1647,13 @@ dependencies = [ [[package]] name = "gix-revision" -version = "0.12.2" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6f6ff53f888858afc24bf12628446a14279ceec148df6194481f306f553ad2" +checksum = "810f35e9afeccca999d5d348b239f9c162353127d2e13ff3240e31b919e35476" dependencies = [ "bstr", "gix-date", - "gix-hash 0.10.4", + "gix-hash", "gix-hashtable", "gix-object", "thiserror", @@ -1679,15 +1661,14 @@ dependencies = [ [[package]] name = "gix-sec" -version = "0.6.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8ffa5bf0772f9b01de501c035b6b084cf9b8bb07dec41e3afc6a17336a65f47" +checksum = "794520043d5a024dfeac335c6e520cb616f6963e30dab995892382e998c12897" dependencies = [ - "bitflags 1.3.2", - "dirs", + "bitflags 2.2.1", "gix-path", "libc", - "windows 0.43.0", + "windows", ] [[package]] @@ -1707,11 +1688,11 @@ dependencies = [ [[package]] name = "gix-traverse" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9a4a07bb22168dc79c60e1a6a41919d198187ca83d8a5940ad8d7122a45df3" +checksum = "a5be1e807f288c33bb005075111886cceb43ed8a167b3182a0f62c186e2a0dd1" dependencies = [ - "gix-hash 0.10.4", + "gix-hash", "gix-hashtable", "gix-object", "thiserror", @@ -1719,12 +1700,12 @@ dependencies = [ [[package]] name = "gix-url" -version = "0.16.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6a22b4b32ad14d68f7b7fb6458fa58d44b01797d94c1b8f4db2d9c7b3c366b5" +checksum = "dfc77f89054297cc81491e31f1bab4027e554b5ef742a44bd7035db9a0f78b76" dependencies = [ "bstr", - "gix-features 0.28.1", + "gix-features", "gix-path", "home", "thiserror", @@ -1752,15 +1733,18 @@ dependencies = [ [[package]] name = "gix-worktree" -version = "0.15.2" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54ec9a000b4f24af706c3cc680c7cda235656cbe3216336522f5692773b8a301" +checksum = "a69eaff0ae973a9d37c40f02ae5ae50fa726c8fc2fd3ab79d0a19eb61975aafa" dependencies = [ "bstr", + "filetime", "gix-attributes", - "gix-features 0.28.1", + "gix-features", + "gix-fs", "gix-glob", - "gix-hash 0.10.4", + "gix-hash", + "gix-ignore", "gix-index", "gix-object", "gix-path", @@ -1942,7 +1926,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows 0.48.0", + "windows", ] [[package]] @@ -2078,6 +2062,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kstring" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3066350882a1cd6d950d055997f379ac37fd39f81cd4d8ed186032eb3c5747" +dependencies = [ + "static_assertions", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -2336,18 +2329,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "nix" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" -dependencies = [ - "bitflags 1.3.2", - "cfg-if", - "libc", - "static_assertions", -] - [[package]] name = "nom" version = "7.1.3" @@ -2522,9 +2503,9 @@ dependencies = [ [[package]] name = "parquet" -version = "37.0.0" +version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5022d98333271f4ca3e87bab760498e61726bf5a6ca919123c80517e20ded29" +checksum = "4cbd51311f8d9ff3d2697b1522b18a588782e097d313a1a278b0faf2ccf2d3f6" dependencies = [ "ahash", "arrow-array", @@ -2544,6 +2525,7 @@ dependencies = [ "lz4", "num", "num-bigint", + "object_store", "paste", "seq-macro", "snap", @@ -2873,17 +2855,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - [[package]] name = "regex" version = "1.8.1" @@ -3335,9 +3306,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.7.5" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ae64fb7ad0670c7d6d53d57b1b91beb2212afc30e164cc8edb02d6b2cff32a" +checksum = "54cd43d44620f716d55d46b998b3cf1baab2935aaa8adc14e3d3d9a465ddae15" dependencies = [ "gix", "heck", @@ -3683,9 +3654,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-bom" -version = "1.1.4" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63ec69f541d875b783ca40184d655f2927c95f0bffd486faa83cd3ac3529ec32" +checksum = "98e90c70c9f0d4d1ee6d0a7d04aa06cb9bbd53d8cfbdd62a0269a7c2eb640552" [[package]] name = "unicode-ident" @@ -3935,21 +3906,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.43.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04662ed0e3e5630dfa9b26e4cb823b817f1a9addda855d973a9458c236556244" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows" version = "0.48.0" diff --git a/dask_planner/src/sql/logical/create_memory_table.rs b/dask_planner/src/sql/logical/create_memory_table.rs index b36b0b6bb..6a2b7c6f2 100644 --- a/dask_planner/src/sql/logical/create_memory_table.rs +++ b/dask_planner/src/sql/logical/create_memory_table.rs @@ -1,6 +1,7 @@ use datafusion_python::{ datafusion_expr::{ logical_plan::{CreateMemoryTable, CreateView}, + DdlStatement, LogicalPlan, }, sql::logical::PyLogicalPlan, @@ -82,13 +83,16 @@ impl TryFrom for PyCreateMemoryTable { fn try_from(logical_plan: LogicalPlan) -> Result { Ok(match logical_plan { - LogicalPlan::CreateMemoryTable(create_memory_table) => PyCreateMemoryTable { - create_memory_table: Some(create_memory_table), - create_view: None, - }, - LogicalPlan::CreateView(create_view) => PyCreateMemoryTable { - create_memory_table: None, - create_view: Some(create_view), + LogicalPlan::Ddl(ddl) => match ddl { + DdlStatement::CreateMemoryTable(create_memory_table) => PyCreateMemoryTable { + create_memory_table: Some(create_memory_table), + create_view: None, + }, + DdlStatement::CreateView(create_view) => PyCreateMemoryTable { + create_memory_table: None, + create_view: Some(create_view), + }, + _ => return Err(py_type_err("unexpected plan")), }, _ => return Err(py_type_err("unexpected plan")), }) diff --git a/dask_planner/src/sql/logical/drop_table.rs b/dask_planner/src/sql/logical/drop_table.rs index 7d58e8a47..f91baf28a 100644 --- a/dask_planner/src/sql/logical/drop_table.rs +++ b/dask_planner/src/sql/logical/drop_table.rs @@ -1,4 +1,7 @@ -use datafusion_python::datafusion_expr::logical_plan::{DropTable, LogicalPlan}; +use datafusion_python::datafusion_expr::{ + logical_plan::{DropTable, LogicalPlan}, + DdlStatement, +}; use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; @@ -27,7 +30,7 @@ impl TryFrom for PyDropTable { fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - LogicalPlan::DropTable(drop_table) => Ok(PyDropTable { drop_table }), + LogicalPlan::Ddl(DdlStatement::DropTable(drop_table)) => Ok(PyDropTable { drop_table }), _ => Err(py_type_err("unexpected plan")), } } diff --git a/dask_planner/src/sql/logical/utils.rs b/dask_planner/src/sql/logical/utils.rs index 1b3637db5..a0b31faf5 100644 --- a/dask_planner/src/sql/logical/utils.rs +++ b/dask_planner/src/sql/logical/utils.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use datafusion_python::{ datafusion_common::DFField, - datafusion_expr::{expr::Sort, utils::exprlist_to_fields, Expr, LogicalPlan}, + datafusion_expr::{expr::Sort, utils::exprlist_to_fields, DdlStatement, Expr, LogicalPlan}, expr::{projection::PyProjection, PyExpr}, sql::logical::PyLogicalPlan, }; @@ -82,20 +82,21 @@ pub fn get_current_node_type(plan: PyLogicalPlan) -> Result { LogicalPlan::TableScan(_table_scan) => "TableScan".to_string(), LogicalPlan::EmptyRelation(_empty_relation) => "EmptyRelation".to_string(), LogicalPlan::Limit(_limit) => "Limit".to_string(), - LogicalPlan::CreateExternalTable(_create_external_table) => { - "CreateExternalTable".to_string() - } - LogicalPlan::CreateMemoryTable(_create_memory_table) => "CreateMemoryTable".to_string(), - LogicalPlan::DropTable(_drop_table) => "DropTable".to_string(), - LogicalPlan::DropView(_drop_view) => "DropView".to_string(), + LogicalPlan::Ddl(ddl) => match ddl { + DdlStatement::CreateExternalTable(_) => "CreateExternalTable".to_string(), + DdlStatement::CreateCatalog(_) => "CreateCatalog".to_string(), + DdlStatement::CreateCatalogSchema(_) => "CreateCatalogSchema".to_string(), + DdlStatement::CreateMemoryTable(_) => "CreateMemoryTable".to_string(), + DdlStatement::CreateView(_) => "CreateView".to_string(), + DdlStatement::DropCatalogSchema(_) => "DropCatalogSchema".to_string(), + DdlStatement::DropTable(_) => "DropTable".to_string(), + DdlStatement::DropView(_) => "DropView".to_string(), + }, LogicalPlan::Values(_values) => "Values".to_string(), LogicalPlan::Explain(_explain) => "Explain".to_string(), LogicalPlan::Analyze(_analyze) => "Analyze".to_string(), LogicalPlan::Subquery(_sub_query) => "Subquery".to_string(), LogicalPlan::SubqueryAlias(_sqalias) => "SubqueryAlias".to_string(), - LogicalPlan::CreateCatalogSchema(_create) => "CreateCatalogSchema".to_string(), - LogicalPlan::CreateCatalog(_create_catalog) => "CreateCatalog".to_string(), - LogicalPlan::CreateView(_create_view) => "CreateView".to_string(), LogicalPlan::Statement(_) => "Statement".to_string(), // Further examine and return the name that is a possible Dask-SQL Extension type LogicalPlan::Extension(extension) => { diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index 23f8d1cd3..be93fa979 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -24,11 +24,11 @@ class DaskEmptyRelationPlugin(BaseRelPlugin): def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: col_names = ( - rel.empty_relation().emptyColumnNames() - if len(rel.empty_relation().emptyColumnNames()) > 0 + rel.to_variant().schema().field_names() + if len(rel.to_variant().schema().field_names()) > 0 else ["_empty"] ) - data = None if len(rel.empty_relation().emptyColumnNames()) > 0 else [0] + data = None if len(rel.to_variant().schema().field_names()) > 0 else [0] return DataContainer( dd.from_pandas(pd.DataFrame(data, columns=col_names), npartitions=1), ColumnContainer(col_names), From 98938d9191c75350e850d4b1507e20dc6ced8b1a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 10 May 2023 23:12:46 -0400 Subject: [PATCH 08/44] Checkpoint, aggregations working --- dask_planner/Cargo.lock | 8 +- dask_planner/Cargo.toml | 2 +- dask_planner/src/lib.rs | 11 + dask_planner/src/sql.rs | 33 +-- dask_planner/src/sql/logical.rs | 201 ++++++++++++++++++ dask_planner/src/sql/logical/aggregate.rs | 4 +- dask_planner/src/sql/logical/analyze_table.rs | 2 +- dask_planner/src/sql/logical/utils.rs | 86 +++++++- dask_planner/src/sql/optimizer.rs | 12 +- dask_sql/physical/rel/convert.py | 6 +- dask_sql/physical/rel/custom/analyze_table.py | 9 +- dask_sql/physical/rel/logical/aggregate.py | 46 ++-- dask_sql/physical/rel/logical/sort.py | 18 +- .../physical/rel/logical/subquery_alias.py | 2 +- 14 files changed, 375 insertions(+), 65 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 231aba958..c5115f430 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -891,7 +891,7 @@ dependencies = [ [[package]] name = "datafusion-python" version = "24.0.0" -source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=expr_datatypemap#8dddb33d74f09e005109125bbb382085fa86540f" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#c511238469b7578686af57140f7aff4051dd6e02" dependencies = [ "async-trait", "datafusion", @@ -909,7 +909,7 @@ dependencies = [ "pyo3", "pyo3-build-config", "rand", - "regex-syntax 0.6.29", + "regex-syntax 0.7.1", "syn 2.0.15", "tokio", "url", @@ -3468,9 +3468,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.28.0" +version = "1.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" +checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105" dependencies = [ "autocfg", "bytes", diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index aa447fe5f..d01f7de44 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "expr_datatypemap" } +datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "logical_extension" } env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } diff --git a/dask_planner/src/lib.rs b/dask_planner/src/lib.rs index 3c1d89a5d..934b28ac1 100644 --- a/dask_planner/src/lib.rs +++ b/dask_planner/src/lib.rs @@ -22,6 +22,7 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { // Register the python classes m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -47,6 +48,16 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(sql::logical::utils::named_projects)) .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::py_column_name)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::distinct_agg)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::sort_ascending)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::sort_nulls_first)) + .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::get_filter_expr)) + .unwrap(); // Exceptions m.add( diff --git a/dask_planner/src/sql.rs b/dask_planner/src/sql.rs index e6cd0ec44..04c7c9a5c 100644 --- a/dask_planner/src/sql.rs +++ b/dask_planner/src/sql.rs @@ -39,7 +39,6 @@ use datafusion_python::{ ResolvedTableReference, TableReference, }, - sql::logical::PyLogicalPlan, }; use log::{debug, warn}; use pyo3::prelude::*; @@ -48,6 +47,7 @@ use self::logical::{ create_catalog_schema::CreateCatalogSchemaPlanNode, drop_schema::DropSchemaPlanNode, use_schema::UseSchemaPlanNode, + DaskLogicalPlan, }; use crate::{ dialect::DaskDialect, @@ -519,9 +519,8 @@ impl DaskSQLContext { pub fn logical_relational_algebra( &self, statement: statement::PyStatement, - ) -> PyResult { + ) -> PyResult { self._logical_relational_algebra(statement.statement) - .map(PyLogicalPlan::new) .map_err(py_parsing_exp) } @@ -530,12 +529,12 @@ impl DaskSQLContext { /// `LogicalPlan` pub fn optimize_relational_algebra( &self, - existing_plan: PyLogicalPlan, - ) -> PyResult { + existing_plan: DaskLogicalPlan, + ) -> PyResult { // Certain queries cannot be optimized. Ex: `EXPLAIN SELECT * FROM test` simply return those plans as is let mut visitor = OptimizablePlanVisitor {}; - match existing_plan.plan().visit(&mut visitor) { + match (*existing_plan.plan()).visit(&mut visitor) { Ok(valid) => { match valid { VisitRecursion::Stop => { @@ -545,7 +544,6 @@ impl DaskSQLContext { } _ => optimizer::DaskSqlOptimizer::new() .optimize((*existing_plan.plan()).clone()) - .map(PyLogicalPlan::new) .map_err(py_optimization_exp), } } @@ -560,17 +558,19 @@ impl DaskSQLContext { pub fn _logical_relational_algebra( &self, dask_statement: DaskStatement, - ) -> Result { - match dask_statement { + ) -> Result { + let inner_plan = match dask_statement { DaskStatement::Statement(statement) => { let planner = SqlToRel::new(self); - planner.statement_to_plan(DFStatement::Statement(statement)) + Ok::( + planner.statement_to_plan(DFStatement::Statement(statement))?, + ) } DaskStatement::CreateModel(create_model) => Ok(LogicalPlan::Extension(Extension { node: Arc::new(CreateModelPlanNode { schema_name: create_model.schema_name, model_name: create_model.model_name, - input: self._logical_relational_algebra(create_model.select)?, + input: (*self._logical_relational_algebra(create_model.select)?.plan).clone(), if_not_exists: create_model.if_not_exists, or_replace: create_model.or_replace, with_options: create_model.with_options, @@ -581,7 +581,10 @@ impl DaskSQLContext { node: Arc::new(CreateExperimentPlanNode { schema_name: create_experiment.schema_name, experiment_name: create_experiment.experiment_name, - input: self._logical_relational_algebra(create_experiment.select)?, + input: (*self + ._logical_relational_algebra(create_experiment.select)? + .plan) + .clone(), if_not_exists: create_experiment.if_not_exists, or_replace: create_experiment.or_replace, with_options: create_experiment.with_options, @@ -592,7 +595,7 @@ impl DaskSQLContext { node: Arc::new(PredictModelPlanNode { schema_name: predict_model.schema_name, model_name: predict_model.model_name, - input: self._logical_relational_algebra(predict_model.select)?, + input: (*self._logical_relational_algebra(predict_model.select)?.plan).clone(), }), })), DaskStatement::DescribeModel(describe_model) => Ok(LogicalPlan::Extension(Extension { @@ -702,7 +705,9 @@ impl DaskSQLContext { new_schema_name: alter_schema.new_schema_name, }), })), - } + }; + + Ok(DaskLogicalPlan::new(inner_plan?)) } } diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index f633e9ee9..01e0491df 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -1,3 +1,34 @@ +use std::sync::Arc; + +use datafusion_python::{ + datafusion_expr::LogicalPlan, + errors::py_unsupported_variant_err, + sql::logical::PyLogicalPlan, +}; +use pyo3::{pyclass, pymethods, IntoPy, PyObject, PyResult, Python}; + +use self::{ + aggregate::PyAggregate as DaskAggregate, + alter_schema::{AlterSchemaPlanNode, PyAlterSchema}, + alter_table::{AlterTablePlanNode, PyAlterTable}, + analyze_table::{AnalyzeTablePlanNode, PyAnalyzeTable}, + create_catalog_schema::{CreateCatalogSchemaPlanNode, PyCreateCatalogSchema}, + create_experiment::{CreateExperimentPlanNode, PyCreateExperiment}, + create_model::{CreateModelPlanNode, PyCreateModel}, + create_table::{CreateTablePlanNode, PyCreateTable}, + describe_model::{DescribeModelPlanNode, PyDescribeModel}, + drop_model::{DropModelPlanNode, PyDropModel}, + drop_schema::{DropSchemaPlanNode, PyDropSchema}, + export_model::{ExportModelPlanNode, PyExportModel}, + predict_model::{PredictModelPlanNode, PyPredictModel}, + show_columns::{PyShowColumns, ShowColumnsPlanNode}, + show_models::{PyShowModels, ShowModelsPlanNode}, + show_schemas::{PyShowSchema, ShowSchemasPlanNode}, + show_tables::{PyShowTables, ShowTablesPlanNode}, + sort::PySort, + use_schema::{PyUseSchema, UseSchemaPlanNode}, +}; + pub mod aggregate; pub mod alter_schema; pub mod alter_table; @@ -30,3 +61,173 @@ pub mod table_scan; pub mod use_schema; pub mod utils; pub mod window; + +#[derive(Debug, Clone)] +#[pyclass(name = "DaskLogicalPlan", module = "dask_planner", subclass)] +pub struct DaskLogicalPlan { + pub plan: Arc, +} + +impl DaskLogicalPlan { + pub fn new(plan: LogicalPlan) -> Self { + DaskLogicalPlan { + plan: Arc::new(plan), + } + } + + pub fn plan(&self) -> Arc { + self.plan.clone() + } +} + +#[pymethods] +impl DaskLogicalPlan { + /// Return the specific logical operator + fn to_variant(&self, py: Python) -> PyResult { + Python::with_gil(|_| match self.plan.as_ref() { + // We first check for custom LogicalNodes. These are nodes that are not part of ANSI SQL + // and therefore cannot handled by Arrow DataFusion Python since they are unique to + // dask-sql. Here we check for the existence of those nodes and parse them locally if + // they exists. If the node is not a custom node then the processing is delegated + // to Arrow DataFusion Python. + LogicalPlan::Extension(extension) => { + let node = extension.node.as_any(); + if node.downcast_ref::().is_some() { + Ok(PyCreateModel::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyCreateExperiment::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyCreateCatalogSchema::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyCreateTable::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyDropModel::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyPredictModel::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyExportModel::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyDescribeModel::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyShowSchema::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyShowTables::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyShowColumns::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyShowModels::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyDropSchema::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyUseSchema::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyAnalyzeTable::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyAlterTable::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else if node.downcast_ref::().is_some() { + Ok(PyAlterSchema::try_from((*self.plan).clone()) + .unwrap() + .into_py(py)) + } else { + Err(py_unsupported_variant_err(format!( + "Cannot convert this plan to a LogicalNode: {:?}", + *self.plan + ))) + } + } + + // We handle Aggregate and Distinct a little differently than ADP. Enough of a difference + // that we choose to custom handle those here. + LogicalPlan::Aggregate(_) | LogicalPlan::Distinct(_) => { + Ok(DaskAggregate::try_from((*self.plan).clone())?.into_py(py)) + } + + // Sort logic should remain here for the time being + LogicalPlan::Sort(_) => Ok(PySort::try_from((*self.plan).clone())?.into_py(py)), + + // Delegate processing to Arrow DataFusion Python + other => PyLogicalPlan::new((*other).clone()).to_variant(py), + }) + } + + /// Get the inputs to this plan + fn inputs(&self) -> Vec { + let mut inputs = vec![]; + for input in self.plan.inputs() { + inputs.push(input.to_owned().into()); + } + inputs + } + + /// Consumes the current DaskLogicalPlan instance + /// into a native datafusion `LogicalPlan` + fn datafusion_plan(&self) -> PyLogicalPlan { + Into::::into((*self.plan).clone()) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("{:?}", self.plan)) + } + + fn display(&self) -> String { + format!("{}", self.plan.display()) + } + + fn display_indent(&self) -> String { + format!("{}", self.plan.display_indent()) + } + + fn display_indent_schema(&self) -> String { + format!("{}", self.plan.display_indent_schema()) + } + + fn display_graphviz(&self) -> String { + format!("{}", self.plan.display_graphviz()) + } +} + +impl From for LogicalPlan { + fn from(logical_plan: DaskLogicalPlan) -> LogicalPlan { + logical_plan.plan.as_ref().clone() + } +} + +impl From for DaskLogicalPlan { + fn from(logical_plan: LogicalPlan) -> DaskLogicalPlan { + DaskLogicalPlan { + plan: Arc::new(logical_plan), + } + } +} diff --git a/dask_planner/src/sql/logical/aggregate.rs b/dask_planner/src/sql/logical/aggregate.rs index 04be77126..12fc03608 100644 --- a/dask_planner/src/sql/logical/aggregate.rs +++ b/dask_planner/src/sql/logical/aggregate.rs @@ -15,8 +15,8 @@ use super::utils::py_expr_list; #[pyclass(name = "Aggregate", module = "dask_planner", subclass)] #[derive(Clone)] pub struct PyAggregate { - aggregate: Option, - distinct: Option, + pub aggregate: Option, + pub distinct: Option, } #[pymethods] diff --git a/dask_planner/src/sql/logical/analyze_table.rs b/dask_planner/src/sql/logical/analyze_table.rs index 7bfa0aea4..fb7936d56 100644 --- a/dask_planner/src/sql/logical/analyze_table.rs +++ b/dask_planner/src/sql/logical/analyze_table.rs @@ -101,7 +101,7 @@ impl UserDefinedLogicalNode for AnalyzeTablePlanNode { #[pyclass(name = "AnalyzeTable", module = "dask_planner", subclass)] pub struct PyAnalyzeTable { - pub(crate) analyze_table: AnalyzeTablePlanNode, + pub analyze_table: AnalyzeTablePlanNode, } #[pymethods] diff --git a/dask_planner/src/sql/logical/utils.rs b/dask_planner/src/sql/logical/utils.rs index a0b31faf5..f37e2e75e 100644 --- a/dask_planner/src/sql/logical/utils.rs +++ b/dask_planner/src/sql/logical/utils.rs @@ -4,7 +4,6 @@ use datafusion_python::{ datafusion_common::DFField, datafusion_expr::{expr::Sort, utils::exprlist_to_fields, DdlStatement, Expr, LogicalPlan}, expr::{projection::PyProjection, PyExpr}, - sql::logical::PyLogicalPlan, }; use pyo3::{pyfunction, PyResult}; @@ -33,6 +32,7 @@ use crate::{ exceptions::py_type_err, table::{table_from_logical_plan, DaskTable}, types::{rel_data_type::RelDataType, rel_data_type_field::RelDataTypeField}, + DaskLogicalPlan, }, }; @@ -64,7 +64,12 @@ pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { } #[pyfunction] -pub fn get_current_node_type(plan: PyLogicalPlan) -> Result { +pub fn py_column_name(expr: PyExpr, plan: DaskLogicalPlan) -> Result { + column_name(&expr.expr, &(*plan.plan()).clone()) +} + +#[pyfunction] +pub fn get_current_node_type(plan: DaskLogicalPlan) -> Result { Ok(match &*plan.plan() { LogicalPlan::Dml(_) => "DataManipulationLanguage".to_string(), LogicalPlan::DescribeTable(_) => "DescribeTable".to_string(), @@ -145,7 +150,7 @@ pub fn get_current_node_type(plan: PyLogicalPlan) -> Result { } #[pyfunction] -pub fn plan_to_table(plan: PyLogicalPlan) -> PyResult { +pub fn plan_to_table(plan: DaskLogicalPlan) -> PyResult { match table_from_logical_plan(&plan.plan())? { Some(table) => Ok(table), None => Err(py_type_err( @@ -155,7 +160,7 @@ pub fn plan_to_table(plan: PyLogicalPlan) -> PyResult { } #[pyfunction] -pub fn row_type(plan: PyLogicalPlan) -> PyResult { +pub fn row_type(plan: DaskLogicalPlan) -> PyResult { match &*plan.plan() { LogicalPlan::Join(join) => { let mut lhs_fields: Vec = join @@ -222,3 +227,76 @@ pub fn named_projects(projection: PyProjection) -> PyResult PyResult { + match expr.expr { + Expr::AggregateFunction(funct) => Ok(funct.distinct), + Expr::AggregateUDF { .. } => Ok(false), + Expr::Alias(expr, _) => match expr.as_ref() { + Expr::AggregateFunction(funct) => Ok(funct.distinct), + Expr::AggregateUDF { .. } => Ok(false), + _ => Err(py_type_err( + "isDistinctAgg() - Non-aggregate expression encountered", + )), + }, + _ => Err(py_type_err( + "getFilterExpr() - Non-aggregate expression encountered", + )), + } +} + +/// Returns if a sort expressions is an ascending sort +#[pyfunction] +pub fn sort_ascending(expr: PyExpr) -> PyResult { + match expr.expr { + Expr::Sort(Sort { asc, .. }) => Ok(asc), + _ => Err(py_type_err(format!( + "Provided Expr {:?} is not a sort type", + &expr.expr + ))), + } +} + +/// Returns if nulls should be placed first in a sort expression +#[pyfunction] +pub fn sort_nulls_first(expr: PyExpr) -> PyResult { + match expr.expr { + Expr::Sort(Sort { nulls_first, .. }) => Ok(nulls_first), + _ => Err(py_type_err(format!( + "Provided Expr {:?} is not a sort type", + &expr.expr + ))), + } +} + +#[pyfunction] +pub fn get_filter_expr(expr: PyExpr) -> PyResult> { + // TODO refactor to avoid duplication + match &expr.expr { + Expr::Alias(expr, _) => match expr.as_ref() { + Expr::AggregateFunction(agg_function) => match &agg_function.filter { + Some(filter) => Ok(Some(PyExpr::from(*filter.clone()))), + None => Ok(None), + }, + Expr::AggregateUDF { filter, .. } => match filter { + Some(filter) => Ok(Some(PyExpr::from(*filter.clone()))), + None => Ok(None), + }, + _ => Err(py_type_err( + "get_filter_expr() - Non-aggregate expression encountered", + )), + }, + Expr::AggregateFunction(agg_function) => match &agg_function.filter { + Some(filter) => Ok(Some(PyExpr::from(*filter.clone()))), + None => Ok(None), + }, + Expr::AggregateUDF { filter, .. } => match filter { + Some(filter) => Ok(Some(PyExpr::from(*filter.clone()))), + None => Ok(None), + }, + _ => Err(py_type_err( + "get_filter_expr() - Non-aggregate expression encountered", + )), + } +} diff --git a/dask_planner/src/sql/optimizer.rs b/dask_planner/src/sql/optimizer.rs index 68577cf2c..f245308a7 100644 --- a/dask_planner/src/sql/optimizer.rs +++ b/dask_planner/src/sql/optimizer.rs @@ -27,6 +27,8 @@ use log::{debug, trace}; mod join_reorder; use join_reorder::JoinReorder; +use super::logical::DaskLogicalPlan; + /// Houses the optimization logic for Dask-SQL. This optimization controls the optimizations /// and their ordering in regards to their impact on the underlying `LogicalPlan` instance pub struct DaskSqlOptimizer { @@ -88,9 +90,13 @@ impl DaskSqlOptimizer { /// Iterates through the configured `OptimizerRule`(s) to transform the input `LogicalPlan` /// to its final optimized form - pub(crate) fn optimize(&self, plan: LogicalPlan) -> Result { + pub(crate) fn optimize(&self, plan: LogicalPlan) -> Result { let config = OptimizerContext::new(); - self.optimizer.optimize(&plan, &config, Self::observe) + Ok(DaskLogicalPlan::new(self.optimizer.optimize( + &plan, + &config, + Self::observe, + )?)) } fn observe(optimized_plan: &LogicalPlan, optimization: &dyn OptimizerRule) { @@ -156,7 +162,7 @@ mod tests { // optimize the logical plan let optimizer = DaskSqlOptimizer::new(); - optimizer.optimize(plan) + Ok((*optimizer.optimize(plan)?.plan).clone()) } struct MySchemaProvider { diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index b67fd631e..5cc8ebeec 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_planner.rust import DaskLogicalPlan logger = logging.getLogger(__name__) @@ -37,7 +37,9 @@ def add_plugin_class(cls, plugin_class: BaseRelPlugin, replace=True): cls.add_plugin(plugin_class.class_name, plugin_class(), replace=replace) @classmethod - def convert(cls, rel: "LogicalPlan", context: "dask_sql.Context") -> dd.DataFrame: + def convert( + cls, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> dd.DataFrame: """ Convert SQL AST tree node(s) into a python expression (a dask dataframe) diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index 14011ccef..1ad936ed6 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_planner.rust import DaskLogicalPlan class AnalyzeTablePlugin(BaseRelPlugin): @@ -30,8 +30,11 @@ class AnalyzeTablePlugin(BaseRelPlugin): class_name = "AnalyzeTable" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: - analyze_table = rel.analyze_table() + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: + # AnalyzeTable is of type `LogicalPlan::Extension`. Therefore we cannot use `.to_variant()` + analyze_table = rel.to_variant() schema_name = analyze_table.getSchemaName() or context.schema_name table_name = analyze_table.getTableName() diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index a4c7e9b59..9c9690c13 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -8,7 +8,7 @@ import pandas as pd from dask import config as dask_config -from dask_planner.rust import row_type +from dask_planner.rust import distinct_agg, get_filter_expr, row_type from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex.convert import RexConverter @@ -17,7 +17,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_planner.rust import DaskLogicalPlan logger = logging.getLogger(__name__) @@ -201,7 +201,9 @@ class DaskAggregatePlugin(BaseRelPlugin): ), } - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: (dc,) = self.assert_inputs(rel, 1, context) agg = rel.to_variant() @@ -212,7 +214,7 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # We make our life easier with having unique column names cc = cc.make_unique() - group_exprs = agg.group_by_exprs() + group_exprs = agg.getGroupSets() group_columns = ( agg.getDistinctColumns() if agg.isDistinctNode() @@ -258,7 +260,7 @@ def try_get_backend_by_frontend_name(oc): def _do_aggregations( self, - rel: "LogicalPlan", + rel: "DaskLogicalPlan", dc: DataContainer, group_columns: List[str], context: "dask_sql.Context", @@ -347,7 +349,7 @@ def _do_aggregations( def _collect_aggregations( self, - rel: "LogicalPlan", + rel: "DaskLogicalPlan", df: dd.DataFrame, cc: ColumnContainer, context: "dask_sql.Context", @@ -364,31 +366,31 @@ def _collect_aggregations( where the aggregations are in the form (input_col, output_col, aggregation function (or string)) """ dc = DataContainer(df, cc) - agg = rel.aggregate() + agg = rel.to_variant() - input_rel = rel.get_inputs()[0] + input_rel = rel.inputs()[0] collected_aggregations = defaultdict(list) # convert and assign any input/filter columns that don't currently exist new_columns = {} for expr in agg.getNamedAggCalls(): - assert expr.getExprType() in { + assert expr.variant_name() in { "Alias", "AggregateFunction", "AggregateUDF", }, "Do not know how to handle this case!" for input_expr in agg.getArgs(expr): - input_col = input_expr.column_name(input_rel) + input_col = input_expr.column_name(input_rel.datafusion_plan()) if input_col not in cc._frontend_backend_mapping: random_name = new_temporary_column(df) new_columns[random_name] = RexConverter.convert( input_rel, input_expr, dc, context=context ) cc = cc.add(input_col, random_name) - filter_expr = expr.getFilterExpr() + filter_expr = get_filter_expr(expr) if filter_expr is not None: - filter_col = filter_expr.column_name(input_rel) + filter_col = filter_expr.column_name(input_rel.datafusion_plan()) if filter_col not in cc._frontend_backend_mapping: random_name = new_temporary_column(df) new_columns[random_name] = RexConverter.convert( @@ -412,16 +414,16 @@ def _collect_aggregations( # calcite some times gives one input/col to regr_count and # another col has filter column col1 = cc.get_backend_by_frontend_name( - inputs[0].column_name(input_rel) + inputs[0].column_name(input_rel.datafusion_plan()) ) df = df.assign(**{two_columns_proxy: (~is_null(df[col1]))}) else: col1 = cc.get_backend_by_frontend_name( - inputs[0].column_name(input_rel) + inputs[0].column_name(input_rel.datafusion_plan()) ) col2 = cc.get_backend_by_frontend_name( - inputs[1].column_name(input_rel) + inputs[1].column_name(input_rel.datafusion_plan()) ) # both cols should be not null df = df.assign( @@ -433,20 +435,20 @@ def _collect_aggregations( ) input_col = two_columns_proxy elif aggregation_name == "regr_syy": - input_col = inputs[0].column_name(input_rel) + input_col = inputs[0].column_name(input_rel.datafusion_plan()) elif aggregation_name == "regr_sxx": - input_col = inputs[1].column_name(input_rel) + input_col = inputs[1].column_name(input_rel.datafusion_plan()) elif len(inputs) == 1: - input_col = inputs[0].column_name(input_rel) + input_col = inputs[0].column_name(input_rel.datafusion_plan()) elif len(inputs) == 0: input_col = additional_column_name else: raise NotImplementedError("Can not cope with more than one input") - filter_expr = expr.getFilterExpr() + filter_expr = get_filter_expr(expr) if filter_expr is not None: filter_backend_col = cc.get_backend_by_frontend_name( - filter_expr.column_name(input_rel) + filter_expr.column_name(input_rel.datafusion_plan()) ) else: filter_backend_col = None @@ -480,11 +482,11 @@ def _collect_aggregations( ) # Finally, extract the output column name - output_col = expr.toString() + output_col = expr.column_name(input_rel.datafusion_plan()) # Store the aggregation collected_aggregations[ - (filter_backend_col, backend_name if expr.isDistinctAgg() else None) + (filter_backend_col, backend_name if distinct_agg(expr) else None) ].append((input_col, output_col, aggregation_function)) output_column_order.append(output_col) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index c0919fe58..afa6099b1 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -1,13 +1,13 @@ from typing import TYPE_CHECKING -from dask_planner.rust import row_type +from dask_planner.rust import py_column_name, row_type, sort_ascending, sort_nulls_first from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_planner.rust import DaskLogicalPlan class DaskSortPlugin(BaseRelPlugin): @@ -17,22 +17,24 @@ class DaskSortPlugin(BaseRelPlugin): class_name = "Sort" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: (dc,) = self.assert_inputs(rel, 1, context) df = dc.df cc = dc.column_container - sort_plan = rel.sort() + sort_plan = rel.to_variant() sort_expressions = sort_plan.getCollation() sort_columns = [ - cc.get_backend_by_frontend_name(expr.column_name(rel)) + cc.get_backend_by_frontend_name(py_column_name(expr, rel)) for expr in sort_expressions ] - sort_ascending = [expr.isSortAscending() for expr in sort_expressions] - sort_null_first = [expr.isSortNullsFirst() for expr in sort_expressions] + sort_ascending_exprs = [sort_ascending(expr) for expr in sort_expressions] + sort_null_first = [sort_nulls_first(expr) for expr in sort_expressions] sort_num_rows = sort_plan.getNumRows() df = apply_sort( - df, sort_columns, sort_ascending, sort_null_first, sort_num_rows + df, sort_columns, sort_ascending_exprs, sort_null_first, sort_num_rows ) cc = self.fix_column_to_row_type(cc, row_type(rel)) diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index 2473167d7..0f00e6ada 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -20,7 +20,7 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"): cc = dc.column_container - alias = rel.subquery_alias().getAlias() + alias = rel.to_variant().alias() return DataContainer( dc.df, From 48fbc58cc68fac9d05282b9138f18383155aadce Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 10 May 2023 23:46:08 -0400 Subject: [PATCH 09/44] Checkpoint, show and sort working --- dask_planner/src/sql/logical.rs | 8 ++++++++ dask_sql/physical/rel/custom/show_columns.py | 8 +++++--- dask_sql/physical/rel/custom/show_tables.py | 8 +++++--- dask_sql/physical/rel/logical/aggregate.py | 5 ++++- dask_sql/physical/rel/logical/join.py | 20 +++++++++++--------- dask_sql/physical/rex/core/call.py | 4 ---- dask_sql/physical/rex/core/input_ref.py | 6 ++++-- 7 files changed, 37 insertions(+), 22 deletions(-) diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index 01e0491df..ddbb42914 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -20,6 +20,8 @@ use self::{ drop_model::{DropModelPlanNode, PyDropModel}, drop_schema::{DropSchemaPlanNode, PyDropSchema}, export_model::{ExportModelPlanNode, PyExportModel}, + filter::PyFilter, + join::PyJoin, predict_model::{PredictModelPlanNode, PyPredictModel}, show_columns::{PyShowColumns, ShowColumnsPlanNode}, show_models::{PyShowModels, ShowModelsPlanNode}, @@ -177,6 +179,12 @@ impl DaskLogicalPlan { // Sort logic should remain here for the time being LogicalPlan::Sort(_) => Ok(PySort::try_from((*self.plan).clone())?.into_py(py)), + // Join logic + LogicalPlan::Join(_) => Ok(PyJoin::try_from((*self.plan).clone())?.into_py(py)), + + // Filter logic + LogicalPlan::Filter(_) => Ok(PyFilter::try_from((*self.plan).clone())?.into_py(py)), + // Delegate processing to Arrow DataFusion Python other => PyLogicalPlan::new((*other).clone()).to_variant(py), }) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index 6b0b94fe9..a3a25bc37 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + from dask_planner import DaskLogicalPlan class ShowColumnsPlugin(BaseRelPlugin): @@ -24,8 +24,10 @@ class ShowColumnsPlugin(BaseRelPlugin): class_name = "ShowColumns" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: - show_columns = rel.show_columns() + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: + show_columns = rel.to_variant() schema_name = show_columns.getSchemaName() or context.schema_name table_name = show_columns.getTableName() diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index d79b4052b..52cebfaac 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -8,7 +8,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + from dask_planner import DaskLogicalPlan class ShowTablesPlugin(BaseRelPlugin): @@ -26,8 +26,10 @@ class ShowTablesPlugin(BaseRelPlugin): class_name = "ShowTables" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: - show_tables = rel.show_tables() + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: + show_tables = rel.to_variant() # currently catalogs other than the default `dask_sql` are not supported catalog_name = show_tables.getCatalogName() or context.catalog_name diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 9c9690c13..7ee4bcbab 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -218,7 +218,10 @@ def convert( group_columns = ( agg.getDistinctColumns() if agg.isDistinctNode() - else [group_expr.column_name(rel) for group_expr in group_exprs] + else [ + group_expr.column_name(rel.datafusion_plan()) + for group_expr in group_exprs + ] ) dc = DataContainer(df, cc) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index 832e1e0aa..ed2599c43 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -9,7 +9,7 @@ from dask.base import tokenize from dask.highlevelgraph import HighLevelGraph -from dask_planner.rust import row_type +from dask_planner.rust import RexType, row_type from dask_sql._compat import BROADCAST_JOIN_SUPPORT_WORKING from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -18,7 +18,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_planner.rust import DaskLogicalPlan, Expression logger = logging.getLogger(__name__) @@ -49,10 +49,12 @@ class DaskJoinPlugin(BaseRelPlugin): "LEFTSEMI": "inner", # TODO: Need research here! This is likely not a true inner join } - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: # Joining is a bit more complicated, so lets do it in steps: - join = rel.join() + join = rel.to_variant() # 1. We now have two inputs (from left and right), so we fetch them both dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context) @@ -269,9 +271,9 @@ def _join_on_columns( def _split_join_condition( self, join_condition: "Expression" ) -> Tuple[List[str], List[str], List["Expression"]]: - if str(join_condition.getRexType()) in ["RexType.Literal", "RexType.Reference"]: + if str(join_condition.rex_type()) in [RexType.Literal, RexType.Reference]: return [], [], [join_condition] - elif not str(join_condition.getRexType()) == "RexType.Call": + elif not str(join_condition.rex_type()) == RexType.Call: raise NotImplementedError("Can not understand join condition.") lhs_on = [] @@ -291,7 +293,7 @@ def _split_join_condition( return [], [], [join_condition] def _extract_lhs_rhs(self, rex): - assert str(rex.getRexType()) == "RexType.Call" + assert str(rex.rex_type()) == RexType.Call operator_name = str(rex.getOperatorName()) assert operator_name in ["=", "AND"] @@ -305,8 +307,8 @@ def _extract_lhs_rhs(self, rex): operand_rhs = operands[1] if ( - str(operand_lhs.getRexType()) == "RexType.Reference" - and str(operand_rhs.getRexType()) == "RexType.Reference" + str(operand_lhs.rex_type()) == RexType.Reference + and str(operand_rhs.rex_type()) == RexType.Reference ): lhs_index = operand_lhs.getIndex() rhs_index = operand_rhs.getIndex() diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 286bcd2bb..9671adfc4 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -159,11 +159,7 @@ def div(self, lhs, rhs, rex=None): result = lhs / rhs data_type_map = rex.types() - # output_type = str(rex.getType()) - # output_type = sql_to_python_type(SqlType.fromString(output_type.upper())) - breakpoint() output_type = sql_to_python_type(str(data_type_map.sql_type)) - breakpoint() is_float = pd.api.types.is_float_dtype(output_type) if not is_float: diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 85a9efa4d..51c853f44 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import Expression, LogicalPlan + from dask_planner.rust import DaskLogicalPlan, Expression class RexInputRefPlugin(BaseRexPlugin): @@ -21,7 +21,7 @@ class RexInputRefPlugin(BaseRexPlugin): def convert( self, - rel: "LogicalPlan", + rel: "DaskLogicalPlan", rex: "Expression", dc: DataContainer, context: "dask_sql.Context", @@ -30,5 +30,7 @@ def convert( cc = dc.column_container column_name = rex.display_name() + column_name = column_name.split(".") + column_name = column_name[len(column_name) - 1] backend_column_name = cc.get_backend_by_frontend_name(column_name) return df[backend_column_name] From 1464bf64b3c52d05ade62177b854896e0d86bb94 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Thu, 11 May 2023 16:44:04 -0400 Subject: [PATCH 10/44] Checkpoint, distributeby working and partial join --- dask_planner/Cargo.lock | 14 +++---- dask_planner/src/lib.rs | 2 + dask_planner/src/sql.rs | 2 +- dask_planner/src/sql/logical.rs | 30 ++++++++++++++- dask_planner/src/sql/logical/utils.rs | 32 +++++++++++++++- dask_planner/src/sql/optimizer.rs | 2 +- dask_sql/context.py | 10 ++++- dask_sql/datacontainer.py | 4 ++ dask_sql/mappings.py | 14 +++---- dask_sql/physical/rel/convert.py | 6 ++- .../rel/custom/create_memory_table.py | 2 +- dask_sql/physical/rel/custom/create_table.py | 2 +- dask_sql/physical/rel/custom/distributeby.py | 8 ++-- dask_sql/physical/rel/custom/drop_table.py | 8 ++-- dask_sql/physical/rel/logical/join.py | 4 +- dask_sql/physical/rex/core/call.py | 6 ++- dask_sql/physical/rex/core/input_ref.py | 2 - dask_sql/physical/rex/core/literal.py | 38 +++++++++---------- 18 files changed, 128 insertions(+), 58 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index c5115f430..6b1b3be70 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -891,7 +891,7 @@ dependencies = [ [[package]] name = "datafusion-python" version = "24.0.0" -source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#c511238469b7578686af57140f7aff4051dd6e02" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#de42813a2d8689ac73ada6605b12636efc8b281b" dependencies = [ "async-trait", "datafusion", @@ -1518,9 +1518,9 @@ dependencies = [ [[package]] name = "gix-object" -version = "0.29.1" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9bb30ce0818d37096daa29efe361a4bc6dd0b51a5726598898be7e9a40a01e1" +checksum = "2d96bd620fd08accdd37f70b2183cfa0b001b4f1c6ade8b7f6e15cb3d9e261ce" dependencies = [ "bstr", "btoi", @@ -3081,18 +3081,18 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.162" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71b2f6e1ab5c2b98c05f0f35b236b22e8df7ead6ffbf51d7808da7f8817e7ab6" +checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.162" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2a0814352fd64b58489904a44ea8d90cb1a91dcb6b4f5ebabc32c8318e93cb6" +checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" dependencies = [ "proc-macro2", "quote", diff --git a/dask_planner/src/lib.rs b/dask_planner/src/lib.rs index 934b28ac1..8dfc24b56 100644 --- a/dask_planner/src/lib.rs +++ b/dask_planner/src/lib.rs @@ -58,6 +58,8 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(sql::logical::utils::get_filter_expr)) .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::get_precision_scale)) + .unwrap(); // Exceptions m.add( diff --git a/dask_planner/src/sql.rs b/dask_planner/src/sql.rs index 04c7c9a5c..f1484f4d9 100644 --- a/dask_planner/src/sql.rs +++ b/dask_planner/src/sql.rs @@ -707,7 +707,7 @@ impl DaskSQLContext { })), }; - Ok(DaskLogicalPlan::new(inner_plan?)) + Ok(DaskLogicalPlan::_new(inner_plan?)) } } diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index ddbb42914..efeaea295 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use datafusion_python::{ - datafusion_expr::LogicalPlan, + datafusion_expr::{DdlStatement, LogicalPlan}, errors::py_unsupported_variant_err, sql::logical::PyLogicalPlan, }; @@ -14,15 +14,18 @@ use self::{ analyze_table::{AnalyzeTablePlanNode, PyAnalyzeTable}, create_catalog_schema::{CreateCatalogSchemaPlanNode, PyCreateCatalogSchema}, create_experiment::{CreateExperimentPlanNode, PyCreateExperiment}, + create_memory_table::PyCreateMemoryTable, create_model::{CreateModelPlanNode, PyCreateModel}, create_table::{CreateTablePlanNode, PyCreateTable}, describe_model::{DescribeModelPlanNode, PyDescribeModel}, drop_model::{DropModelPlanNode, PyDropModel}, drop_schema::{DropSchemaPlanNode, PyDropSchema}, + drop_table::PyDropTable, export_model::{ExportModelPlanNode, PyExportModel}, filter::PyFilter, join::PyJoin, predict_model::{PredictModelPlanNode, PyPredictModel}, + repartition_by::PyRepartitionBy, show_columns::{PyShowColumns, ShowColumnsPlanNode}, show_models::{PyShowModels, ShowModelsPlanNode}, show_schemas::{PyShowSchema, ShowSchemasPlanNode}, @@ -71,7 +74,7 @@ pub struct DaskLogicalPlan { } impl DaskLogicalPlan { - pub fn new(plan: LogicalPlan) -> Self { + pub fn _new(plan: LogicalPlan) -> Self { DaskLogicalPlan { plan: Arc::new(plan), } @@ -84,6 +87,11 @@ impl DaskLogicalPlan { #[pymethods] impl DaskLogicalPlan { + #[new] + pub fn new(plan: PyLogicalPlan) -> Self { + DaskLogicalPlan { plan: plan.plan() } + } + /// Return the specific logical operator fn to_variant(&self, py: Python) -> PyResult { Python::with_gil(|_| match self.plan.as_ref() { @@ -185,6 +193,24 @@ impl DaskLogicalPlan { // Filter logic LogicalPlan::Filter(_) => Ok(PyFilter::try_from((*self.plan).clone())?.into_py(py)), + // Existing DistributeBy/RepartitionBy logic + LogicalPlan::Repartition(_) => { + Ok(PyRepartitionBy::try_from((*self.plan).clone())?.into_py(py)) + } + + // Drop Table logic + LogicalPlan::Ddl(DdlStatement::DropTable(_)) => { + Ok(PyDropTable::try_from((*self.plan).clone())?.into_py(py)) + } + + LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(_)) => { + Ok(PyCreateMemoryTable::try_from((*self.plan).clone())?.into_py(py)) + } + + LogicalPlan::Ddl(DdlStatement::CreateView(_)) => { + Ok(PyCreateMemoryTable::try_from((*self.plan).clone())?.into_py(py)) + } + // Delegate processing to Arrow DataFusion Python other => PyLogicalPlan::new((*other).clone()).to_variant(py), }) diff --git a/dask_planner/src/sql/logical/utils.rs b/dask_planner/src/sql/logical/utils.rs index f37e2e75e..2a564dc55 100644 --- a/dask_planner/src/sql/logical/utils.rs +++ b/dask_planner/src/sql/logical/utils.rs @@ -1,8 +1,16 @@ use std::sync::Arc; use datafusion_python::{ + datafusion::arrow::datatypes::DataType, datafusion_common::DFField, - datafusion_expr::{expr::Sort, utils::exprlist_to_fields, DdlStatement, Expr, LogicalPlan}, + datafusion_expr::{ + expr::Sort, + utils::exprlist_to_fields, + Cast, + DdlStatement, + Expr, + LogicalPlan, + }, expr::{projection::PyProjection, PyExpr}, }; use pyo3::{pyfunction, PyResult}; @@ -300,3 +308,25 @@ pub fn get_filter_expr(expr: PyExpr) -> PyResult> { )), } } + +#[pyfunction] +pub fn get_precision_scale(expr: PyExpr) -> PyResult<(u8, i8)> { + Ok(match &expr.expr { + Expr::Cast(Cast { expr: _, data_type }) => match data_type { + DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { + (*precision, *scale) + } + _ => { + return Err(py_type_err(format!( + "Catch all triggered for Cast in get_precision_scale; {data_type:?}" + ))) + } + }, + _ => { + return Err(py_type_err(format!( + "Catch all triggered in get_precision_scale; {:?}", + &expr.expr + ))) + } + }) +} diff --git a/dask_planner/src/sql/optimizer.rs b/dask_planner/src/sql/optimizer.rs index f245308a7..ff74a93ce 100644 --- a/dask_planner/src/sql/optimizer.rs +++ b/dask_planner/src/sql/optimizer.rs @@ -92,7 +92,7 @@ impl DaskSqlOptimizer { /// to its final optimized form pub(crate) fn optimize(&self, plan: LogicalPlan) -> Result { let config = OptimizerContext::new(); - Ok(DaskLogicalPlan::new(self.optimizer.optimize( + Ok(DaskLogicalPlan::_new(self.optimizer.optimize( &plan, &config, Self::observe, diff --git a/dask_sql/context.py b/dask_sql/context.py index 8d93fc711..c6706a13d 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -10,6 +10,7 @@ from dask.base import optimize from dask_planner.rust import ( + DaskLogicalPlan, DaskSchema, DaskSQLContext, DaskTable, @@ -495,7 +496,7 @@ def sql( if isinstance(sql, str): rel, _ = self._get_ral(sql) - elif isinstance(sql, LogicalPlan): + elif isinstance(sql, DaskLogicalPlan) or isinstance(sql, LogicalPlan): rel = sql else: raise RuntimeError( @@ -827,9 +828,14 @@ def _get_ral(self, sql): return rel, rel_string - def _compute_table_from_rel(self, rel: "LogicalPlan", return_futures: bool = True): + def _compute_table_from_rel( + self, rel: "DaskLogicalPlan", return_futures: bool = True + ): dc = RelConverter.convert(rel, context=self) + if not isinstance(rel, DaskLogicalPlan): + rel = DaskLogicalPlan(rel) + # Optimization might remove some alias projects. Make sure to keep them here. select_names = [field for field in row_type(rel).getFieldList()] diff --git a/dask_sql/datacontainer.py b/dask_sql/datacontainer.py index e4c93a8f5..023a1ac3f 100644 --- a/dask_sql/datacontainer.py +++ b/dask_sql/datacontainer.py @@ -156,6 +156,10 @@ def get_backend_by_frontend_name(self, column: str) -> str: try: return self._frontend_backend_mapping[column] except KeyError: + # Attempt to lookup by simple name if full name fails + if "." in column: + column = column.split(".") + column = self.get_backend_by_frontend_name(column[len(column) - 1]) return column def make_unique(self, prefix="col"): diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 32bf2c24a..75d35c41d 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -133,7 +133,6 @@ def sql_to_python_value(sql_type: "SqlType", literal_value: Any) -> Any: logger.debug( f"sql_to_python_value -> sql_type: {sql_type} literal_value: {literal_value}" ) - if sql_type == SqlType.CHAR or sql_type == SqlType.VARCHAR: # Some varchars contain an additional encoding # in the format _ENCODING'string' @@ -177,11 +176,11 @@ def sql_to_python_value(sql_type: "SqlType", literal_value: Any) -> Any: # Calcite will always convert INTERVAL types except YEAR, QUATER, MONTH to milliseconds # Issue: if sql_type is INTERVAL MICROSECOND, and value <= 1000, literal_value will be rounded to 0 return np.timedelta64(literal_value, "ms") - elif sql_type == SqlType.INTERVAL_MONTH_DAY_NANOSECOND: - # DataFusion assumes 30 days per month. Therefore we multiply number of months by 30 and add to days - return np.timedelta64( - (literal_value[0] * 30) + literal_value[1], "D" - ) + np.timedelta64(literal_value[2], "ns") + # elif sql_type == SqlType.INTERVAL_MONTH_DAY_NANOSECOND: + # # DataFusion assumes 30 days per month. Therefore we multiply number of months by 30 and add to days + # return np.timedelta64( + # (literal_value[0] * 30) + literal_value[1], "D" + # ) + np.timedelta64(literal_value[2], "ns") elif sql_type == SqlType.BOOLEAN: return bool(literal_value) @@ -197,7 +196,8 @@ def sql_to_python_value(sql_type: "SqlType", literal_value: Any) -> Any: # NULL time return pd.NaT # pragma: no cover if sql_type == SqlType.DATE: - return literal_value.astype(" DataContainer: # Rust create_memory_table instance handle - create_memory_table = rel.create_memory_table() + create_memory_table = rel.to_variant() qualified_table_name = create_memory_table.getQualifiedName() *schema_name, table_name = qualified_table_name.split(".") diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 36b165230..53e94bb1f 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -39,7 +39,7 @@ class CreateTablePlugin(BaseRelPlugin): class_name = "CreateTable" def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: - create_table = rel.create_table() + create_table = rel.to_variant() schema_name = create_table.getSchemaName() or context.schema_name table_name = create_table.getTableName() diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index c7ce70610..b3f3eeb07 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_planner.rust import DaskLogicalPlan logger = logging.getLogger(__name__) @@ -23,8 +23,10 @@ class DistributeByPlugin(BaseRelPlugin): # DataFusion provides the phrase `Repartition` in the LogicalPlan instead of `Distribute By`, it is the same thing class_name = "Repartition" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: - distribute = rel.repartition_by() + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: + distribute = rel.to_variant() select = distribute.getSelectQuery() distribute_list = distribute.getDistributionColumns() diff --git a/dask_sql/physical/rel/custom/drop_table.py b/dask_sql/physical/rel/custom/drop_table.py index 9e74a32e7..d71c34d61 100644 --- a/dask_sql/physical/rel/custom/drop_table.py +++ b/dask_sql/physical/rel/custom/drop_table.py @@ -1,12 +1,12 @@ import logging from typing import TYPE_CHECKING +from dask_planner.rust import DaskLogicalPlan from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: import dask_sql - from dask_sql.rust import LogicalPlan logger = logging.getLogger(__name__) @@ -21,9 +21,11 @@ class DropTablePlugin(BaseRelPlugin): class_name = "DropTable" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: + def convert( + self, rel: "DaskLogicalPlan", context: "dask_sql.Context" + ) -> DataContainer: # Rust create_memory_table instance handle - drop_table = rel.drop_table() + drop_table = rel.to_variant() qualified_table_name = drop_table.getQualifiedName() *schema_name, table_name = qualified_table_name.split(".") diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index ed2599c43..3ce2213a2 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -271,9 +271,9 @@ def _join_on_columns( def _split_join_condition( self, join_condition: "Expression" ) -> Tuple[List[str], List[str], List["Expression"]]: - if str(join_condition.rex_type()) in [RexType.Literal, RexType.Reference]: + if str(join_condition.rex_type()) in ["RexType.Literal", "RexType.Reference"]: return [], [], [join_condition] - elif not str(join_condition.rex_type()) == RexType.Call: + elif not str(join_condition.rex_type()) == "RexType.Call": raise NotImplementedError("Can not understand join condition.") lhs_on = [] diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 9671adfc4..0b1d85ded 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -14,7 +14,7 @@ from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_planner.rust import PythonType, SqlType +from dask_planner.rust import PythonType, SqlType, get_precision_scale from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( cast_column_to_type, @@ -42,6 +42,8 @@ def as_timelike(op): if isinstance(op, np.int64): return np.timedelta64(op, "D") + elif isinstance(op, int): + return np.datetime64(op, "D") elif isinstance(op, str): return np.datetime64(op) elif pd.api.types.is_datetime64_dtype(op) or isinstance(op, np.timedelta64): @@ -247,7 +249,7 @@ def cast(self, operand, rex=None) -> SeriesOrScalar: # decimal datatypes require precision and scale if data_type_map.python_type == PythonType.Float: - sql_type_args = rex.getPrecisionScale() + sql_type_args = get_precision_scale(rex) if not is_frame(operand): # pragma: no cover return sql_to_python_value(sql_type, operand) diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 51c853f44..53b9a2c24 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -30,7 +30,5 @@ def convert( cc = dc.column_container column_name = rex.display_name() - column_name = column_name.split(".") - column_name = column_name[len(column_name) - 1] backend_column_name = cc.get_backend_by_frontend_name(column_name) return df[backend_column_name] diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 76e43ece3..9d01778e7 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -3,14 +3,12 @@ # from datetime import datetime from typing import TYPE_CHECKING, Any -from dask_sql.datacontainer import DataContainer +import numpy as np -# from dask_sql.mappings import sql_to_python_value +from dask_sql.datacontainer import DataContainer +from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin -# import numpy as np - - if TYPE_CHECKING: import dask_sql from dask_planner.rust import Expression, LogicalPlan @@ -37,9 +35,16 @@ def convert( dc: DataContainer, context: "dask_sql.Context", ) -> Any: - # data_type_map = rex.types() - python_value = rex.python_value() - # breakpoint() + data_type_map = rex.types() + literal_type = data_type_map.friendly_arrow_type_name() + literal_value = rex.python_value() + + if literal_type == "Date32": + literal_value = np.datetime64(literal_value, "D") + elif literal_type == "Date64": + literal_value = np.datetime64(literal_value, "ms") + elif literal_type == "Time64": + literal_value = np.datetime64(literal_value, "ns") # # Retrieve the SQL value from the `Expr` instance. # # Value is retrieved based on Arrow DataType @@ -87,15 +92,6 @@ def convert( # elif literal_type == "Utf8": # literal_type = SqlType.VARCHAR # literal_value = rex.getStringValue() - # elif literal_type == "Date32": - # literal_type = SqlType.DATE - # literal_value = np.datetime64(rex.getDate32Value(), "D") - # elif literal_type == "Date64": - # literal_type = SqlType.DATE - # literal_value = np.datetime64(rex.getDate64Value(), "ms") - # elif literal_type == "Time64": - # literal_value = np.datetime64(rex.getTime64Value(), "ns") - # literal_type = SqlType.TIME # elif literal_type == "Null": # literal_type = SqlType.NULL # literal_value = None @@ -131,9 +127,9 @@ def convert( # f"Failed to map literal type {literal_type} to python type in literal.py" # ) - # python_value = sql_to_python_value(literal_type, literal_value) - # logger.debug( - # f"literal.py python_value: {python_value} or Python type: {type(python_value)}" - # ) + python_value = sql_to_python_value(data_type_map.sql_type, literal_value) + logger.debug( + f"literal.py python_value: {python_value} or Python type: {type(python_value)}" + ) return python_value From 1e94c01a4ff23e9b7a6afb2d18d5868b0ce26b40 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Thu, 11 May 2023 17:14:56 -0400 Subject: [PATCH 11/44] Checkpoint, window logic and pytest passing --- dask_planner/src/sql/logical.rs | 4 ++++ dask_sql/physical/rel/logical/window.py | 30 +++++++++++++------------ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index efeaea295..80aa818e5 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -32,6 +32,7 @@ use self::{ show_tables::{PyShowTables, ShowTablesPlanNode}, sort::PySort, use_schema::{PyUseSchema, UseSchemaPlanNode}, + window::PyWindow, }; pub mod aggregate; @@ -198,6 +199,9 @@ impl DaskLogicalPlan { Ok(PyRepartitionBy::try_from((*self.plan).clone())?.into_py(py)) } + // Window logic + LogicalPlan::Window(_) => Ok(PyWindow::try_from((*self.plan).clone())?.into_py(py)), + // Drop Table logic LogicalPlan::Ddl(DdlStatement::DropTable(_)) => { Ok(PyDropTable::try_from((*self.plan).clone())?.into_py(py)) diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 36ca4b893..83762be2b 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -8,7 +8,7 @@ import pandas as pd from pandas.api.indexers import BaseIndexer -from dask_planner.rust import row_type +from dask_planner.rust import row_type, sort_ascending, sort_nulls_first from dask_sql._compat import INDEXER_WINDOW_STEP_IMPLEMENTED from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -245,7 +245,7 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai # Output to the right field names right away field_names = row_type(rel).getFieldNames() - for window in rel.window().getGroups(): + for window in rel.to_variant().getGroups(): dc = self._apply_window(rel, window, dc, field_names, context) # Finally, fix the output schema if needed @@ -295,7 +295,7 @@ def _apply_window( # Default window bounds when not specified as unbound preceding and current row (if no order by) # unbounded preceding and unbounded following if there's an order by - if not rel.window().getWindowFrame(window): + if not rel.to_variant().getWindowFrame(window): lower_bound = BoundDescription( is_unbounded=True, is_preceding=True, @@ -322,10 +322,10 @@ def _apply_window( ) else: lower_bound = to_bound_description( - rel.window().getWindowFrame(window).getLowerBound(), + rel.to_variant().getWindowFrame(window).getLowerBound(), ) upper_bound = to_bound_description( - rel.window().getWindowFrame(window).getUpperBound(), + rel.to_variant().getWindowFrame(window).getUpperBound(), ) # Apply the windowing operation @@ -369,10 +369,12 @@ def _extract_groupby( context: "dask_sql.Context", ) -> Tuple[dd.DataFrame, str]: """Prepare grouping columns we can later use while applying the main function""" - partition_keys = rel.window().getPartitionExprs(window) + partition_keys = rel.to_variant().getPartitionExprs(window) if partition_keys: group_columns = [ - dc.column_container.get_backend_by_frontend_name(o.column_name(rel)) + dc.column_container.get_backend_by_frontend_name( + o.column_name(rel.datafusion_plan()) + ) for o in partition_keys ] temporary_columns = [] @@ -392,14 +394,14 @@ def _extract_ordering( "Error is about to be encountered, FIX me when bindings are available in subsequent PR" ) # TODO: This was commented out for flake8 CI passing and needs to be handled - sort_expressions = rel.window().getSortExprs(window) + sort_expressions = rel.to_variant().getSortExprs(window) sort_columns = [ - cc.get_backend_by_frontend_name(expr.column_name(rel)) + cc.get_backend_by_frontend_name(expr.column_name(rel.datafusion_plan())) for expr in sort_expressions ] - sort_ascending = [expr.isSortAscending() for expr in sort_expressions] - sort_null_first = [expr.isSortNullsFirst() for expr in sort_expressions] - return sort_columns, sort_ascending, sort_null_first + py_sort_ascending = [sort_ascending(expr) for expr in sort_expressions] + sort_null_first = [sort_nulls_first(expr) for expr in sort_expressions] + return sort_columns, py_sort_ascending, sort_null_first def _extract_operations( self, @@ -414,7 +416,7 @@ def _extract_operations( # TODO: datafusion returns only window func expression per window # This can be optimized in the physical plan to collect all aggs for a given window - operator_name = rel.window().getWindowFuncName(window).lower() + operator_name = rel.to_variant().getWindowFuncName(window).lower() try: operation = self.OPERATION_MAPPING[operator_name] @@ -429,7 +431,7 @@ def _extract_operations( # TODO: can be optimized by re-using already present columns temporary_operand_columns = { new_temporary_column(df): RexConverter.convert(rel, o, dc, context=context) - for o in rel.window().getArgs(window) + for o in rel.to_variant().getArgs(window) } df = df.assign(**temporary_operand_columns) temporary_operand_columns = list(temporary_operand_columns.keys()) From 25bc867104cb33b1653bd9a29a964043aae1c626 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Thu, 11 May 2023 17:20:05 -0400 Subject: [PATCH 12/44] checkpoint, test_schema.py tests all passing --- dask_sql/physical/rel/custom/create_catalog_schema.py | 6 +++--- dask_sql/physical/rel/custom/drop_schema.py | 2 +- dask_sql/physical/rel/custom/use_schema.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 52ed37b55..8439616f3 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: import dask_sql - from dask_planner.rust import LogicalPlan + from dask_planner.rust import DaskLogicalPlan logger = logging.getLogger(__name__) @@ -28,8 +28,8 @@ class CreateCatalogSchemaPlugin(BaseRelPlugin): class_name = "CreateCatalogSchema" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"): - create_schema = rel.create_catalog_schema() + def convert(self, rel: "DaskLogicalPlan", context: "dask_sql.Context"): + create_schema = rel.to_variant() schema_name = create_schema.getSchemaName() if schema_name in context.schema: diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 444662e2b..38ac53461 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -21,7 +21,7 @@ class DropSchemaPlugin(BaseRelPlugin): class_name = "DropSchema" def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"): - drop_schema = rel.drop_schema() + drop_schema = rel.to_variant() schema_name = drop_schema.getSchemaName() if schema_name not in context.schema: diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 889dd2b1c..b35d45a57 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -21,7 +21,7 @@ class UseSchemaPlugin(BaseRelPlugin): class_name = "UseSchema" def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: - schema_name = rel.use_schema().getSchemaName() + schema_name = rel.to_variant().getSchemaName() if schema_name in context.schema: context.schema_name = schema_name From 6637b842e53e2ada3fdc6d43d0c574a1d7680f96 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 14 Jun 2023 12:59:17 -0400 Subject: [PATCH 13/44] Bump ADP -> 26.0.0 --- dask_planner/Cargo.lock | 1483 ++++------------- dask_planner/Cargo.toml | 2 +- dask_planner/src/dialect.rs | 5 + dask_planner/src/expression.rs | 46 +- dask_planner/src/sql/logical.rs | 17 +- dask_planner/src/sql/logical/aggregate.rs | 6 +- .../src/sql/logical/create_memory_table.rs | 9 +- dask_planner/src/sql/logical/drop_table.rs | 7 +- .../src/sql/logical/subquery_alias.rs | 2 +- dask_planner/src/sql/logical/table_scan.rs | 6 +- dask_planner/src/sql/optimizer.rs | 4 - dask_planner/src/sql/table.rs | 4 +- dask_planner/src/sql/types.rs | 14 +- 13 files changed, 366 insertions(+), 1239 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 3f501e47d..615d90ab5 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -29,9 +29,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] @@ -51,6 +51,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -107,15 +113,15 @@ checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" [[package]] name = "arrayvec" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" +checksum = "8868f09ff8cea88b079da74ae569d9b8c62a23c68c746240b704ee6f7525c89c" [[package]] name = "arrow" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990dfa1a9328504aa135820da1c95066537b69ad94c04881b785f64328e0fa6b" +checksum = "6619cab21a0cdd8c9b9f1d9e09bfaa9b1974e5ef809a6566aef0b998caf38ace" dependencies = [ "ahash", "arrow-arith", @@ -136,9 +142,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b2e52de0ab54173f9b08232b7184c26af82ee7ab4ac77c83396633c90199fa" +checksum = "e0dc95485623a76e00929bda8caa40c1f838190952365c4f43a7b9ae86d03e94" dependencies = [ "arrow-array", "arrow-buffer", @@ -151,9 +157,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10849b60c17dbabb334be1f4ef7550701aa58082b71335ce1ed586601b2f423" +checksum = "3267847f53d3042473cfd2c769afd8d74a6d7d201fc3a34f5cb84c0282ef47a7" dependencies = [ "ahash", "arrow-buffer", @@ -168,9 +174,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0746ae991b186be39933147117f8339eb1c4bbbea1c8ad37e7bf5851a1a06ba" +checksum = "c5f66553e66e120ac4b21570368ee9ebf35ff3f5399f872b0667699e145678f5" dependencies = [ "half", "num", @@ -178,9 +184,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b88897802515d7b193e38b27ddd9d9e43923d410a9e46307582d756959ee9595" +checksum = "65e6f3579dbf0d97c683d451b2550062b0f0e62a3169bf74238b5f59f44ad6d8" dependencies = [ "arrow-array", "arrow-buffer", @@ -195,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c8220d9741fc37961262710ceebd8451a5b393de57c464f0267ffdda1775c0a" +checksum = "373579c4c1a8f5307d3125b7a89c700fcf8caf85821c77eb4baab3855ae0aba5" dependencies = [ "arrow-array", "arrow-buffer", @@ -214,9 +220,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533f937efa1aaad9dc86f6a0e382c2fa736a4943e2090c946138079bdf060cef" +checksum = "61bc8df9912cca6642665fdf989d6fa0de2570f18a7f709bcf59d29de96d2097" dependencies = [ "arrow-buffer", "arrow-schema", @@ -226,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18b75296ff01833f602552dff26a423fc213db8e5049b540ca4a00b1c957e41c" +checksum = "0105dcf5f91daa7182d87b713ee0b32b3bfc88e0c48e7dc3e9d6f1277a07d1ae" dependencies = [ "arrow-array", "arrow-buffer", @@ -240,9 +246,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e501d3de4d612c90677594896ca6c0fa075665a7ff980dc4189bb531c17e19f6" +checksum = "e73134fb5b5ec8770f8cbb214c2c487b2d350081e403ca4eeeb6f8f5e19846ac" dependencies = [ "arrow-array", "arrow-buffer", @@ -254,14 +260,15 @@ dependencies = [ "indexmap", "lexical-core", "num", + "serde", "serde_json", ] [[package]] name = "arrow-ord" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33d2671eb3793f9410230ac3efb0e6d36307be8a2dac5fad58ac9abde8e9f01e" +checksum = "89f25bc66e18d4c2aa1fe2f9bb03e2269da60e636213210385ae41a107f9965a" dependencies = [ "arrow-array", "arrow-buffer", @@ -274,9 +281,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc11fa039338cebbf4e29cf709c8ac1d6a65c7540063d4a25f991ab255ca85c8" +checksum = "1095ff85ea4f5ff02d17b30b089de31b51a50be01c6b674f0a0509ab771232f1" dependencies = [ "ahash", "arrow-array", @@ -289,18 +296,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d04f17f7b86ded0b5baf98fe6123391c4343e031acc3ccc5fa604cc180bff220" +checksum = "25187bbef474151a2e4ddec67b9e34bda5cbfba292dc571392fa3a1f71ff5a82" dependencies = [ - "bitflags 2.2.1", + "bitflags 2.3.2", ] [[package]] name = "arrow-select" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "163e35de698098ff5f5f672ada9dc1f82533f10407c7a11e2cd09f3bcf31d18a" +checksum = "fd0d4ee884aec3aa05e41478e3cd312bf609de9babb5d187a43fb45931da4da4" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,9 +318,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdfbed1b10209f0dc68e6aa4c43dc76079af65880965c7c3b73f641f23d4aba" +checksum = "d6d71c3ffe4c07e66ce8fdc6aed5b00e0e60c5144911879b10546f5b72d8fa1c" dependencies = [ "arrow-array", "arrow-buffer", @@ -321,14 +328,14 @@ dependencies = [ "arrow-schema", "arrow-select", "regex", - "regex-syntax 0.6.29", + "regex-syntax", ] [[package]] name = "async-compression" -version = "0.3.15" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a" +checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" dependencies = [ "bzip2", "flate2", @@ -338,8 +345,8 @@ dependencies = [ "pin-project-lite", "tokio", "xz2", - "zstd 0.11.2+zstd.1.5.2", - "zstd-safe 5.0.2+zstd.1.5.2", + "zstd", + "zstd-safe", ] [[package]] @@ -350,7 +357,7 @@ checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -361,7 +368,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -372,9 +379,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" -version = "0.21.0" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" [[package]] name = "bitflags" @@ -384,9 +391,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.2.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a6904aef64d73cf10ab17ebace7befb918b82164785cb89907993be7f83813" +checksum = "6dbe3c979c178231552ecba20214a8272df4e09f232a87aef4320cf06539aded" [[package]] name = "blake2" @@ -399,9 +406,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.3.3" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" +checksum = "729b71f35bd3fa1a4c86b85d32c8b9069ea7fe14f7a53cfabb65f62d4265b888" dependencies = [ "arrayref", "arrayvec", @@ -441,32 +448,11 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "bstr" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" -dependencies = [ - "memchr", - "once_cell", - "regex-automata", - "serde", -] - -[[package]] -name = "btoi" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd6407f73a9b8b6162d8a2ef999fe6afd7cc15902ebf42c5cd296addf17e0ad" -dependencies = [ - "num-traits", -] - [[package]] name = "bumpalo" -version = "3.12.1" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "byteorder" @@ -518,17 +504,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.24" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" dependencies = [ + "android-tzdata", "iana-time-zone", - "js-sys", - "num-integer", "num-traits", "serde", - "time 0.1.45", - "wasm-bindgen", "winapi", ] @@ -554,27 +537,11 @@ dependencies = [ "phf_codegen", ] -[[package]] -name = "clru" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8191fa7302e03607ff0e237d4246cc043ff5b3cb9409d995172ba3bea16b807" - -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "comfy-table" -version = "6.1.4" +version = "6.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "7e959d788268e3bf9d35ace83e81b124190378e4c91c9067524675e33394b8ba" dependencies = [ "strum", "strum_macros", @@ -605,9 +572,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13418e745008f7349ec7e449155f419a61b92b58a99cc3616942b926825ec76b" +checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6" [[package]] name = "core-foundation-sys" @@ -651,9 +618,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b015497079b9a9d69c02ad25de6c0a6edef051ea6360a327d0bd05802ef64ad" +checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" dependencies = [ "csv-core", "itoa", @@ -670,50 +637,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "cxx" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn 2.0.15", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.15", -] - [[package]] name = "dashmap" version = "5.4.0" @@ -742,13 +665,15 @@ dependencies = [ [[package]] name = "datafusion" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bdb93fee4f30368f1f71bfd5cd28882ec9fab0183db7924827b76129d33227c" +checksum = "9992c267436551d40b52d65289b144712e7b0ebdc62c8c859fd1574e5f73efbb" dependencies = [ "ahash", "apache-avro", "arrow", + "arrow-array", + "arrow-schema", "async-compression", "async-trait", "bytes", @@ -787,14 +712,14 @@ dependencies = [ "url", "uuid", "xz2", - "zstd 0.12.3+zstd.1.5.2", + "zstd", ] [[package]] name = "datafusion-common" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82401ce129e601d406012b6d718f8978ba84c386e1c342fa155877120d68824" +checksum = "c3be97f7a7c720cdbb71e9eeabf814fa6ad8102b9022390f6cac74d3b4af6392" dependencies = [ "apache-avro", "arrow", @@ -809,9 +734,9 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08b2078aed21a27239cd93f3015e492a58b0d50ebeeaf8d2236cf108ef583ce" +checksum = "c77c4b14b809b0e4c5bb101b6834504f06cdbb0d3c643400c61d0d844b33264e" dependencies = [ "dashmap", "datafusion-common", @@ -827,21 +752,24 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b5b977ce9695fb4c67614266ec57f384fc11e9a9f9b3e6d0e62b9c5a9f2c1f" +checksum = "e6ec7409bd45cf4fae6395d7d1024c8a97e543cadc88363e405d2aad5330e5e7" dependencies = [ "ahash", "arrow", "datafusion-common", + "lazy_static", "sqlparser", + "strum", + "strum_macros", ] [[package]] name = "datafusion-optimizer" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0b2bb9e73ed778d1bc5af63a270f0154bf6eab5099c77668a6362296888e46b" +checksum = "64b537c93f87989c212db92a448a0f5eb4f0995e27199bb7687ae94f8b64a7a8" dependencies = [ "arrow", "async-trait", @@ -852,14 +780,14 @@ dependencies = [ "hashbrown 0.13.2", "itertools", "log", - "regex-syntax 0.6.29", + "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80cd8ea5ab0a07b1b2a3e17d5909f1b1035bd129ffeeb5c66842a32e682f8f79" +checksum = "f60ee3f53340fdef36ee54d9e12d446ae2718b1d0196ac581f791d34808ec876" dependencies = [ "ahash", "arrow", @@ -877,6 +805,7 @@ dependencies = [ "indexmap", "itertools", "lazy_static", + "libc", "md-5", "paste", "petgraph", @@ -889,8 +818,8 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "22.0.0" -source = "git+https://github.com/apache/arrow-datafusion-python.git?rev=9493638#94936380e58a266f5dd5de6b70a06d3aa36fbe22" +version = "26.0.0" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#ce83d8a837ac2bd8473cbe3432cebd3eb90a6e1c" dependencies = [ "async-trait", "datafusion", @@ -903,11 +832,13 @@ dependencies = [ "mimalloc", "object_store", "parking_lot", + "prost", + "prost-types", "pyo3", "pyo3-build-config 0.18.3", "rand", - "regex-syntax 0.6.29", - "syn 2.0.15", + "regex-syntax", + "syn 2.0.18", "tokio", "url", "uuid", @@ -915,9 +846,9 @@ dependencies = [ [[package]] name = "datafusion-row" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a95d6badab19fd6e9195fdc5209ac0a7e5ce9bcdedc67767b9ffc1b4e645760" +checksum = "d58fc64058aa3bcb00077a0d19474a0d584d31dec8c7ac3406868f485f659af9" dependencies = [ "arrow", "datafusion-common", @@ -927,9 +858,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37a78f8fc67123c4357e63bc0c87622a2a663d26f074958d749a633d0ecde90f" +checksum = "1531f0314151a34bf6c0a83c7261525688b7c729876f53e7896b8f4ca8f57d07" dependencies = [ "arrow", "arrow-schema", @@ -941,9 +872,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae6ed64a2005f0d78f2b1b3ec3f8148183f4523d5d364e5367115f8d8a82b7df" +checksum = "079d5be5ec59580777bfa16d79187fea99b6498e3e8e07eb36d504a5fe708f13" dependencies = [ "async-recursion", "chrono", @@ -951,53 +882,28 @@ dependencies = [ "itertools", "object_store", "prost", + "prost-types", "substrait", "tokio", ] [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", "subtle", ] -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "doc-comment" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" -[[package]] -name = "dunce" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b" - [[package]] name = "dyn-clone" version = "1.0.11" @@ -1040,7 +946,7 @@ checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -1062,18 +968,6 @@ dependencies = [ "instant", ] -[[package]] -name = "filetime" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.2.16", - "windows-sys 0.48.0", -] - [[package]] name = "fixedbitset" version = "0.4.2" @@ -1082,9 +976,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "23.1.21" +version = "23.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77f5399c2c9c50ae9418e522842ad362f61ee48b346ac106807bd355a8a7c619" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1108,9 +1002,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" dependencies = [ "percent-encoding", ] @@ -1171,7 +1065,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -1216,553 +1110,26 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", -] - -[[package]] -name = "gix" -version = "0.43.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c256ea71cc1967faaefdaad15f334146b7c806f12460dcafd3afed845c8c78dd" -dependencies = [ - "gix-actor", - "gix-attributes", - "gix-config", - "gix-credentials", - "gix-date", - "gix-diff", - "gix-discover", - "gix-features 0.28.1", - "gix-glob", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-index", - "gix-lock", - "gix-mailmap", - "gix-object", - "gix-odb", - "gix-pack", - "gix-path", - "gix-prompt", - "gix-ref", - "gix-refspec", - "gix-revision", - "gix-sec", - "gix-tempfile", - "gix-traverse", - "gix-url", - "gix-validate", - "gix-worktree", - "log", - "once_cell", - "signal-hook", - "smallvec", - "thiserror", - "unicode-normalization", -] - -[[package]] -name = "gix-actor" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc22b0cdc52237667c301dd7cdc6ead8f8f73c9f824e9942c8ebd6b764f6c0bf" -dependencies = [ - "bstr", - "btoi", - "gix-date", - "itoa", - "nom", - "thiserror", -] - -[[package]] -name = "gix-attributes" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2231a25934a240d0a4b6f4478401c73ee81d8be52de0293eedbc172334abf3e1" -dependencies = [ - "bstr", - "gix-features 0.28.1", - "gix-glob", - "gix-path", - "gix-quote", - "thiserror", - "unicode-bom", -] - -[[package]] -name = "gix-bitmap" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55a95f4942360766c3880bdb2b4b57f1ef73b190fc424755e7fdf480430af618" -dependencies = [ - "thiserror", -] - -[[package]] -name = "gix-chunk" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0d39583cab06464b8bf73b3f1707458270f0e7383cb24c3c9c1a16e6f792978" -dependencies = [ - "thiserror", -] - -[[package]] -name = "gix-command" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2c6f75c1e0f924de39e750880a6e21307194bb1ab773efe3c7d2d787277f8ab" -dependencies = [ - "bstr", -] - -[[package]] -name = "gix-config" -version = "0.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbad5ce54a8fc997acc50febd89ec80fa6e97cb7f8d0654cb229936407489d8" -dependencies = [ - "bstr", - "gix-config-value", - "gix-features 0.28.1", - "gix-glob", - "gix-path", - "gix-ref", - "gix-sec", - "log", - "memchr", - "nom", - "once_cell", - "smallvec", - "thiserror", - "unicode-bom", -] - -[[package]] -name = "gix-config-value" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d09154c0c8677e4da0ec35e896f56ee3e338e741b9599fae06075edd83a4081c" -dependencies = [ - "bitflags 1.3.2", - "bstr", - "gix-path", - "libc", - "thiserror", -] - -[[package]] -name = "gix-credentials" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "750b684197374518ea057e0a0594713e07683faa0a3f43c0f93d97f64130ad8d" -dependencies = [ - "bstr", - "gix-command", - "gix-config-value", - "gix-path", - "gix-prompt", - "gix-sec", - "gix-url", - "thiserror", -] - -[[package]] -name = "gix-date" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b96271912ce39822501616f177dea7218784e6c63be90d5f36322ff3a722aae2" -dependencies = [ - "bstr", - "itoa", - "thiserror", - "time 0.3.20", -] - -[[package]] -name = "gix-diff" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "103a0fa79b0d438f5ecb662502f052e530ace4fe1fe8e1c83c0c6da76d728e67" -dependencies = [ - "gix-hash 0.10.4", - "gix-object", - "imara-diff", - "thiserror", -] - -[[package]] -name = "gix-discover" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eba8ba458cb8f4a6c33409b0fe650b1258655175a7ffd1d24fafd3ed31d880b" -dependencies = [ - "bstr", - "dunce", - "gix-hash 0.10.4", - "gix-path", - "gix-ref", - "gix-sec", - "thiserror", -] - -[[package]] -name = "gix-features" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b76f9a80f6dd7be66442ae86e1f534effad9546676a392acc95e269d0c21c22" -dependencies = [ - "crc32fast", - "flate2", - "gix-hash 0.10.4", - "libc", - "once_cell", - "prodash", - "sha1_smol", - "thiserror", - "walkdir", -] - -[[package]] -name = "gix-features" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf69b0f5c701cc3ae22d3204b671907668f6437ca88862d355eaf9bc47a4f897" -dependencies = [ - "gix-hash 0.11.1", - "libc", + "wasi", ] [[package]] -name = "gix-fs" -version = "0.1.1" +name = "git2" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b37a1832f691fdc09910bd267f9a2e413737c1f9ec68c6e31f9e802616278a9" -dependencies = [ - "gix-features 0.29.0", -] - -[[package]] -name = "gix-glob" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93e43efd776bc543f46f0fd0ca3d920c37af71a764a16f2aebd89765e9ff2993" +checksum = "7b989d6a7ca95a362cf2cfc5ad688b3a467be1f87e480b8dad07fee8c79b0044" dependencies = [ "bitflags 1.3.2", - "bstr", -] - -[[package]] -name = "gix-hash" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a258595457bc192d1f1c59d0d168a1e34e2be9b97a614e14995416185de41a7" -dependencies = [ - "hex", - "thiserror", -] - -[[package]] -name = "gix-hash" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "078eec3ac2808cc03f0bddd2704cb661da5c5dc33b41a9d7947b141d499c7c42" -dependencies = [ - "hex", - "thiserror", -] - -[[package]] -name = "gix-hashtable" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e55e40dfd694884f0eb78796c5bddcf2f8b295dace47039099dd7e76534973" -dependencies = [ - "gix-hash 0.10.4", - "hashbrown 0.13.2", - "parking_lot", -] - -[[package]] -name = "gix-index" -version = "0.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "717ab601ece7921f59fe86849dbe27d44a46ebb883b5885732c4f30df4996177" -dependencies = [ - "bitflags 1.3.2", - "bstr", - "btoi", - "filetime", - "gix-bitmap", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-lock", - "gix-object", - "gix-traverse", - "itoa", - "memmap2", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-lock" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c693d7f05730fa74a7c467150adc7cea393518410c65f0672f80226b8111555" -dependencies = [ - "gix-tempfile", - "gix-utils", - "thiserror", -] - -[[package]] -name = "gix-mailmap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b66aea5e52875cd4915f4957a6f4b75831a36981e2ec3f5fad9e370e444fe1a" -dependencies = [ - "bstr", - "gix-actor", - "thiserror", -] - -[[package]] -name = "gix-object" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df068db9180ee935fbb70504848369e270bdcb576b05c0faa8b9fd3b86fc017" -dependencies = [ - "bstr", - "btoi", - "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-validate", - "hex", - "itoa", - "nom", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-odb" -version = "0.43.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83af2e3e36005bfe010927f0dff41fb5acc3e3d89c6f1174135b3a34086bda2" -dependencies = [ - "arc-swap", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-object", - "gix-pack", - "gix-path", - "gix-quote", - "parking_lot", - "tempfile", - "thiserror", -] - -[[package]] -name = "gix-pack" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9401911c7fe032ad7b31c6a6b5be59cb283d1d6c999417a8215056efe6d635f3" -dependencies = [ - "clru", - "gix-chunk", - "gix-diff", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "gix-path", - "gix-tempfile", - "gix-traverse", - "memmap2", - "parking_lot", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-path" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32370dce200bb951df013e03dff35b4233fc7a89458642b047629b91734a7e19" -dependencies = [ - "bstr", - "thiserror", -] - -[[package]] -name = "gix-prompt" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3034d4d935aef2c7bf719aaa54b88c520e82413118d886ae880a31d5bdee57" -dependencies = [ - "gix-command", - "gix-config-value", - "nix", - "parking_lot", - "thiserror", -] - -[[package]] -name = "gix-quote" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a282f5a8d9ee0b09ec47390ac727350c48f2f5c76d803cd8da6b3e7ad56e0bcb" -dependencies = [ - "bstr", - "btoi", - "thiserror", -] - -[[package]] -name = "gix-ref" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e909396ed3b176823991ccc391c276ae2a015e54edaafa3566d35123cfac9d" -dependencies = [ - "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-lock", - "gix-object", - "gix-path", - "gix-tempfile", - "gix-validate", - "memmap2", - "nom", - "thiserror", -] - -[[package]] -name = "gix-refspec" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba332462bda2e8efeae4302b39a6ed01ad56ef772fd5b7ef197cf2798294d65" -dependencies = [ - "bstr", - "gix-hash 0.10.4", - "gix-revision", - "gix-validate", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-revision" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6f6ff53f888858afc24bf12628446a14279ceec148df6194481f306f553ad2" -dependencies = [ - "bstr", - "gix-date", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "thiserror", -] - -[[package]] -name = "gix-sec" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8ffa5bf0772f9b01de501c035b6b084cf9b8bb07dec41e3afc6a17336a65f47" -dependencies = [ - "bitflags 1.3.2", - "dirs", - "gix-path", - "libc", - "windows 0.43.0", -] - -[[package]] -name = "gix-tempfile" -version = "5.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71a0d32f34e71e86586124225caefd78dabc605d0486de580d717653addf182" -dependencies = [ - "gix-fs", "libc", - "once_cell", - "parking_lot", - "signal-hook", - "signal-hook-registry", - "tempfile", -] - -[[package]] -name = "gix-traverse" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9a4a07bb22168dc79c60e1a6a41919d198187ca83d8a5940ad8d7122a45df3" -dependencies = [ - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "thiserror", -] - -[[package]] -name = "gix-url" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6a22b4b32ad14d68f7b7fb6458fa58d44b01797d94c1b8f4db2d9c7b3c366b5" -dependencies = [ - "bstr", - "gix-features 0.28.1", - "gix-path", - "home", - "thiserror", - "url", -] - -[[package]] -name = "gix-utils" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c10b69beac219acb8df673187a1f07dde2d74092f974fb3f9eb385aeb667c909" -dependencies = [ - "fastrand", -] - -[[package]] -name = "gix-validate" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd629d3680773e1785e585d76fd4295b740b559cad9141517300d99a0c8c049" -dependencies = [ - "bstr", - "thiserror", -] - -[[package]] -name = "gix-worktree" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54ec9a000b4f24af706c3cc680c7cda235656cbe3216336522f5692773b8a301" -dependencies = [ - "bstr", - "gix-attributes", - "gix-features 0.28.1", - "gix-glob", - "gix-hash 0.10.4", - "gix-index", - "gix-object", - "gix-path", - "io-close", - "thiserror", + "libgit2-sys", + "log", + "url", ] [[package]] @@ -1773,9 +1140,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" +checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" dependencies = [ "bytes", "fnv", @@ -1836,21 +1203,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "home" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" -dependencies = [ - "windows-sys 0.48.0", -] - [[package]] name = "http" version = "0.2.9" @@ -1917,9 +1269,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.23.2" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http", "hyper", @@ -1930,48 +1282,37 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.56" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows 0.48.0", + "windows", ] [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] name = "idna" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" dependencies = [ "unicode-bidi", "unicode-normalization", ] -[[package]] -name = "imara-diff" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e98c1d0ad70fc91b8b9654b1f33db55e59579d3b3de2bffdced0fdb810570cb8" -dependencies = [ - "ahash", - "hashbrown 0.12.3", -] - [[package]] name = "indexmap" version = "1.9.3" @@ -2003,25 +1344,15 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-close" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cadcf447f06744f8ce713d2d6239bb5bde2c357a452397a9ed90c625da390bc" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "io-lifetimes" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ "hermit-abi 0.3.1", "libc", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -2039,7 +1370,7 @@ dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", "rustix", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -2068,9 +1399,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -2147,15 +1478,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.142" +version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" +checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" [[package]] name = "libflate" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97822bf791bd4d5b403713886a5fbe8bf49520fe78e323b0dc480ca1a03e50b0" +checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" dependencies = [ "adler32", "crc32fast", @@ -2171,11 +1502,23 @@ dependencies = [ "rle-decode-fast", ] +[[package]] +name = "libgit2-sys" +version = "0.15.2+1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a80df2e11fb4a61f4ba2ab42dbe7f74468da143f1a75c74e11dee7c813f694fa" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + [[package]] name = "libm" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" [[package]] name = "libmimalloc-sys" @@ -2188,25 +1531,28 @@ dependencies = [ ] [[package]] -name = "link-cplusplus" -version = "1.0.8" +name = "libz-sys" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +checksum = "56ee889ecc9568871456d42f603d6a0ce59ff328d291063a45cbdf0036baf6db" dependencies = [ "cc", + "libc", + "pkg-config", + "vcpkg", ] [[package]] name = "linux-raw-sys" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", @@ -2264,15 +1610,6 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = "memmap2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" -dependencies = [ - "libc", -] - [[package]] name = "memoffset" version = "0.8.0" @@ -2297,12 +1634,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - [[package]] name = "miniz_oxide" version = "0.7.1" @@ -2314,14 +1645,13 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", - "log", - "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.45.0", + "wasi", + "windows-sys", ] [[package]] @@ -2330,28 +1660,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "nix" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" -dependencies = [ - "bitflags 1.3.2", - "cfg-if", - "libc", - "static_assertions", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - [[package]] name = "num" version = "0.4.0" @@ -2439,15 +1747,6 @@ dependencies = [ "libc", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" -dependencies = [ - "libc", -] - [[package]] name = "object_store" version = "0.5.6" @@ -2478,9 +1777,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.1" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ordered-float" @@ -2503,22 +1802,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.7" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.16", + "redox_syscall", "smallvec", - "windows-sys 0.45.0", + "windows-targets", ] [[package]] name = "parquet" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "321a15f8332645759f29875b07f8233d16ed8ec1b3582223de81625a9f8506b7" +checksum = "d6a656fcc17e641657c955742c689732684e096f790ff30865d9f8dcc39f7c4a" dependencies = [ "ahash", "arrow-array", @@ -2538,13 +1837,14 @@ dependencies = [ "lz4", "num", "num-bigint", + "object_store", "paste", "seq-macro", "snap", "thrift", "tokio", "twox-hash", - "zstd 0.12.3+zstd.1.5.2", + "zstd", ] [[package]] @@ -2564,9 +1864,9 @@ checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" [[package]] name = "percent-encoding" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "petgraph" @@ -2642,12 +1942,12 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -2658,19 +1958,13 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.56" +version = "1.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" dependencies = [ "unicode-ident", ] -[[package]] -name = "prodash" -version = "23.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9516b775656bc3e8985e19cd4b8c0c0de045095074e453d2c0a513b5f978392d" - [[package]] name = "prost" version = "0.11.9" @@ -2822,9 +2116,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.26" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" dependencies = [ "proc-macro2", ] @@ -2859,15 +2153,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.3.5" @@ -2877,51 +2162,28 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - [[package]] name = "regex" -version = "1.8.1" +version = "1.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.1", + "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - [[package]] name = "regex-syntax" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" [[package]] name = "regress" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d995d590bd8ec096d1893f414bf3f5e8b0ee4c9eed9a5642b9766ef2c8e2e8e9" +checksum = "82a9ecfa0cb04d0b04dddb99b8ccf4f66bc8dfd23df694b398570bd8ae3a50fb" dependencies = [ "hashbrown 0.13.2", "memchr", @@ -2929,9 +2191,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.17" +version = "0.11.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13293b639a097af28fc8a90f22add145a9c954e49d77da06263d58cf44d5fb91" +checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" dependencies = [ "base64", "bytes", @@ -3000,28 +2262,28 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.19" +version = "0.37.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" +checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" dependencies = [ "bitflags 1.3.2", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] name = "rustls" -version = "0.20.8" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" +checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" dependencies = [ "log", "ring", + "rustls-webpki", "sct", - "webpki", ] [[package]] @@ -3033,6 +2295,16 @@ dependencies = [ "base64", ] +[[package]] +name = "rustls-webpki" +version = "0.100.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -3084,12 +2356,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "scratch" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" - [[package]] name = "sct" version = "0.7.0" @@ -3114,22 +2380,22 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.160" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -3156,13 +2422,14 @@ dependencies = [ [[package]] name = "serde_tokenstream" -version = "0.1.7" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797ba1d80299b264f3aac68ab5d12e5825a561749db4df7cd7c8083900c5d4e9" +checksum = "8a00ffd23fd882d096f09fcaae2a9de8329a328628e86027e049ee051dc1621f" dependencies = [ "proc-macro2", + "quote", "serde", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] @@ -3190,12 +2457,6 @@ dependencies = [ "unsafe-libyaml", ] -[[package]] -name = "sha1_smol" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" - [[package]] name = "sha2" version = "0.10.6" @@ -3207,25 +2468,6 @@ dependencies = [ "digest", ] -[[package]] -name = "signal-hook" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" -dependencies = [ - "libc", - "signal-hook-registry", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" -dependencies = [ - "libc", -] - [[package]] name = "siphasher" version = "0.3.10" @@ -3293,9 +2535,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "sqlparser" -version = "0.32.0" +version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0366f270dbabb5cc2e4c88427dc4c08bba144f81e32fbd459a013f26a4d16aa0" +checksum = "37d3706eefb17039056234df6b566b0014f303f867f2656108334a55b8096f59" dependencies = [ "log", "sqlparser_derive", @@ -3323,6 +2565,9 @@ name = "strum" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +dependencies = [ + "strum_macros", +] [[package]] name = "strum_macros" @@ -3339,11 +2584,11 @@ dependencies = [ [[package]] name = "substrait" -version = "0.7.5" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ae64fb7ad0670c7d6d53d57b1b91beb2212afc30e164cc8edb02d6b2cff32a" +checksum = "9df5d9e071804204172dc77e707c363f187e7f6566f9c78e5100c9a8f5ea434e" dependencies = [ - "gix", + "git2", "heck", "prettyplease", "prost", @@ -3354,16 +2599,16 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.15", + "syn 2.0.18", "typify", "walkdir", ] [[package]] name = "subtle" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" @@ -3378,9 +2623,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.15" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" dependencies = [ "proc-macro2", "quote", @@ -3395,15 +2640,16 @@ checksum = "fd1ba337640d60c3e96bc6f0638a939b9c9a7f2c316a1598c279828b3d1dc8c5" [[package]] name = "tempfile" -version = "3.5.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" dependencies = [ + "autocfg", "cfg-if", "fastrand", - "redox_syscall 0.3.5", + "redox_syscall", "rustix", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -3432,7 +2678,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -3446,46 +2692,6 @@ dependencies = [ "ordered-float", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "time" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" -dependencies = [ - "itoa", - "libc", - "num_threads", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" - -[[package]] -name = "time-macros" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" -dependencies = [ - "time-core", -] - [[package]] name = "tiny-keccak" version = "2.0.2" @@ -3512,9 +2718,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.28.0" +version = "1.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" +checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" dependencies = [ "autocfg", "bytes", @@ -3525,7 +2731,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -3536,18 +2742,17 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] name = "tokio-rustls" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ "rustls", "tokio", - "webpki", ] [[package]] @@ -3601,14 +2806,14 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", ] @@ -3648,9 +2853,9 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "typify" -version = "0.0.11" +version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bfde96849e25d7feef1bbf652e9cfc51deb63203fdc07b115b8bc3bcfe20b9" +checksum = "a6658d09e71bfe59e7987dc95ee7f71809fdb5793ab0cdc1503cc0073990484d" dependencies = [ "typify-impl", "typify-macro", @@ -3658,9 +2863,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.11" +version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95d27d749378ceab6ec22188ed7ad102205c89ddb92ab662371c850ffc71aa1a" +checksum = "34d3bb47587b13edf526d6ed02bf360ecefe083ab47a4ef29fc43112828b2bef" dependencies = [ "heck", "log", @@ -3669,16 +2874,16 @@ dependencies = [ "regress", "schemars", "serde_json", - "syn 1.0.109", + "syn 2.0.18", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.0.11" +version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35db6fc2bd9220ecdac6eeb88158824b83610de3dda0c6d0f2142b49efd858b0" +checksum = "d3f7e627c18be12d53bc1f261830b9c2763437b6a86ac57293b9085af2d32ffe" dependencies = [ "proc-macro2", "quote", @@ -3686,7 +2891,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 1.0.109", + "syn 2.0.18", "typify-impl", ] @@ -3696,17 +2901,11 @@ version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" -[[package]] -name = "unicode-bom" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63ec69f541d875b783ca40184d655f2927c95f0bffd486faa83cd3ac3529ec32" - [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-normalization" @@ -3749,9 +2948,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.3.1" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" dependencies = [ "form_urlencoded", "idna", @@ -3760,14 +2959,20 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.2" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dad5567ad0cf5b760e5665964bec1b47dfd077ba8a2544b513f3556d3d239a2" +checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81" dependencies = [ "getrandom", "serde", ] +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -3794,12 +2999,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -3808,9 +3007,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3818,24 +3017,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" dependencies = [ "cfg-if", "js-sys", @@ -3845,9 +3044,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3855,22 +3054,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-streams" @@ -3887,9 +3086,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" dependencies = [ "js-sys", "wasm-bindgen", @@ -3956,37 +3155,13 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.43.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04662ed0e3e5630dfa9b26e4cb823b817f1a9addda855d973a9458c236556244" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", + "windows-targets", ] [[package]] @@ -3995,22 +3170,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets", ] [[package]] @@ -4019,93 +3179,51 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - [[package]] name = "windows_x86_64_msvc" version = "0.48.0" @@ -4151,32 +3269,13 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe 5.0.2+zstd.1.5.2", -] - [[package]] name = "zstd" version = "0.12.3+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" dependencies = [ - "zstd-safe 6.0.5+zstd.1.5.4", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index eb12bff27..3afada895 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", rev = "9493638" } +datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "logical_extension" } env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } diff --git a/dask_planner/src/dialect.rs b/dask_planner/src/dialect.rs index 9fe013f3d..4876c8097 100644 --- a/dask_planner/src/dialect.rs +++ b/dask_planner/src/dialect.rs @@ -77,6 +77,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "floor" => { @@ -108,6 +109,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "timestampadd" => { @@ -136,6 +138,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "timestampdiff" => { @@ -163,6 +166,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "to_timestamp" => { @@ -192,6 +196,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } _ => Ok(None), diff --git a/dask_planner/src/expression.rs b/dask_planner/src/expression.rs index aa1a60a9b..80e7b88fa 100644 --- a/dask_planner/src/expression.rs +++ b/dask_planner/src/expression.rs @@ -4,7 +4,20 @@ use datafusion_python::{ datafusion::arrow::datatypes::DataType, datafusion_common::{Column, DFField, DFSchema, ScalarValue}, datafusion_expr::{ - expr::{AggregateFunction, BinaryExpr, Cast, Sort, TryCast, WindowFunction}, + expr::{ + AggregateFunction, + AggregateUDF, + BinaryExpr, + Cast, + Exists, + InList, + InSubquery, + ScalarFunction, + ScalarUDF, + Sort, + TryCast, + WindowFunction, + }, lit, utils::exprlist_to_fields, Between, @@ -330,15 +343,15 @@ impl PyExpr { | Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) | Expr::Sort(Sort { expr, .. }) - | Expr::InSubquery { expr, .. } => { + | Expr::InSubquery(InSubquery { expr, .. }) => { Ok(vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]) } // Expr variants containing a collection of Expr(s) for operands Expr::AggregateFunction(AggregateFunction { args, .. }) - | Expr::AggregateUDF { args, .. } - | Expr::ScalarFunction { args, .. } - | Expr::ScalarUDF { args, .. } + | Expr::AggregateUDF(AggregateUDF { args, .. }) + | Expr::ScalarFunction(ScalarFunction { args, .. }) + | Expr::ScalarUDF(ScalarUDF { args, .. }) | Expr::WindowFunction(WindowFunction { args, .. }) => Ok(args .iter() .map(|arg| PyExpr::from(arg.clone(), self.input_plan.clone())) @@ -377,7 +390,7 @@ impl PyExpr { Ok(operands) } - Expr::InList { expr, list, .. } => { + Expr::InList(InList { expr, list, .. }) => { let mut operands: Vec = vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]; for list_elem in list { @@ -435,8 +448,8 @@ impl PyExpr { op, right: _, }) => format!("{op}"), - Expr::ScalarFunction { fun, args: _ } => format!("{fun}"), - Expr::ScalarUDF { fun, .. } => fun.name.clone(), + Expr::ScalarFunction(ScalarFunction { fun, args: _ }) => format!("{fun}"), + Expr::ScalarUDF(ScalarUDF { fun, .. }) => fun.name.clone(), Expr::Cast { .. } => "cast".to_string(), Expr::Between { .. } => "between".to_string(), Expr::Case { .. } => "case".to_string(), @@ -557,7 +570,7 @@ impl PyExpr { ScalarValue::Struct(..) => "Struct", ScalarValue::FixedSizeBinary(_, _) => "FixedSizeBinary", }, - Expr::ScalarFunction { fun, args: _ } => match fun { + Expr::ScalarFunction(ScalarFunction { fun, args: _ }) => match fun { BuiltinScalarFunction::Abs => "Abs", BuiltinScalarFunction::DatePart => "DatePart", _ => { @@ -639,7 +652,7 @@ impl PyExpr { match &self.expr { Expr::Alias(expr, _) => match expr.as_ref() { Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { + | Expr::AggregateUDF(AggregateUDF { filter, .. }) => match filter { Some(filter) => { Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))) } @@ -650,7 +663,7 @@ impl PyExpr { )), }, Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { + | Expr::AggregateUDF(AggregateUDF { filter, .. }) => match filter { Some(filter) => Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))), None => Ok(None), }, @@ -739,7 +752,10 @@ impl PyExpr { ScalarValue::TimestampNanosecond(iv, tz) | ScalarValue::TimestampMicrosecond(iv, tz) | ScalarValue::TimestampMillisecond(iv, tz) - | ScalarValue::TimestampSecond(iv, tz) => Ok((*iv, tz.clone())), + | ScalarValue::TimestampSecond(iv, tz) => match tz { + Some(time_zone) => Ok((*iv, Some(time_zone.to_string()))), + None => Ok((*iv, None)), + }, other => Err(unexpected_literal_value(other)), } } @@ -790,9 +806,9 @@ impl PyExpr { pub fn is_negated(&self) -> PyResult { match &self.expr { Expr::Between(Between { negated, .. }) - | Expr::Exists { negated, .. } - | Expr::InList { negated, .. } - | Expr::InSubquery { negated, .. } => Ok(*negated), + | Expr::Exists(Exists { negated, .. }) + | Expr::InList(InList { negated, .. }) + | Expr::InSubquery(InSubquery { negated, .. }) => Ok(*negated), _ => Err(py_type_err(format!( "unknown Expr type {:?} encountered", &self.expr diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index d2096ba9b..890f9aacb 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -37,7 +37,7 @@ pub mod window; use datafusion_python::{ datafusion_common::{DFSchemaRef, DataFusionError}, - datafusion_expr::LogicalPlan, + datafusion_expr::{DdlStatement, LogicalPlan}, }; use pyo3::prelude::*; @@ -315,18 +315,19 @@ impl PyLogicalPlan { LogicalPlan::TableScan(_table_scan) => "TableScan", LogicalPlan::EmptyRelation(_empty_relation) => "EmptyRelation", LogicalPlan::Limit(_limit) => "Limit", - LogicalPlan::CreateExternalTable(_create_external_table) => "CreateExternalTable", - LogicalPlan::CreateMemoryTable(_create_memory_table) => "CreateMemoryTable", - LogicalPlan::DropTable(_drop_table) => "DropTable", - LogicalPlan::DropView(_drop_view) => "DropView", + LogicalPlan::Ddl(DdlStatement::CreateExternalTable { .. }) => "CreateExternalTable", + LogicalPlan::Ddl(DdlStatement::CreateMemoryTable { .. }) => "CreateMemoryTable", + LogicalPlan::Ddl(DdlStatement::DropTable { .. }) => "DropTable", + LogicalPlan::Ddl(DdlStatement::DropView { .. }) => "DropView", LogicalPlan::Values(_values) => "Values", LogicalPlan::Explain(_explain) => "Explain", LogicalPlan::Analyze(_analyze) => "Analyze", LogicalPlan::Subquery(_sub_query) => "Subquery", LogicalPlan::SubqueryAlias(_sqalias) => "SubqueryAlias", - LogicalPlan::CreateCatalogSchema(_create) => "CreateCatalogSchema", - LogicalPlan::CreateCatalog(_create_catalog) => "CreateCatalog", - LogicalPlan::CreateView(_create_view) => "CreateView", + LogicalPlan::Ddl(DdlStatement::CreateCatalogSchema { .. }) => "CreateCatalogSchema", + LogicalPlan::Ddl(DdlStatement::DropCatalogSchema { .. }) => "DropCatalogSchema", + LogicalPlan::Ddl(DdlStatement::CreateCatalog { .. }) => "CreateCatalog", + LogicalPlan::Ddl(DdlStatement::CreateView { .. }) => "CreateView", LogicalPlan::Statement(_) => "Statement", // Further examine and return the name that is a possible Dask-SQL Extension type LogicalPlan::Extension(extension) => { diff --git a/dask_planner/src/sql/logical/aggregate.rs b/dask_planner/src/sql/logical/aggregate.rs index 0acc8b86e..870d8d7ab 100644 --- a/dask_planner/src/sql/logical/aggregate.rs +++ b/dask_planner/src/sql/logical/aggregate.rs @@ -1,5 +1,5 @@ use datafusion_python::datafusion_expr::{ - expr::AggregateFunction, + expr::{AggregateFunction, AggregateUDF}, logical_plan::{Aggregate, Distinct}, Expr, LogicalPlan, @@ -75,7 +75,7 @@ impl PyAggregate { match expr { Expr::Alias(expr, _) => self._aggregation_arguments(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun: _, args, .. }) - | Expr::AggregateUDF { fun: _, args, .. } => match &self.aggregate { + | Expr::AggregateUDF(AggregateUDF { fun: _, args, .. }) => match &self.aggregate { Some(e) => py_expr_list(&e.input, args), None => Ok(vec![]), }, @@ -90,7 +90,7 @@ fn _agg_func_name(expr: &Expr) -> PyResult { match expr { Expr::Alias(expr, _) => _agg_func_name(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun, .. }) => Ok(fun.to_string()), - Expr::AggregateUDF { fun, .. } => Ok(fun.name.clone()), + Expr::AggregateUDF(AggregateUDF { fun, .. }) => Ok(fun.name.clone()), _ => Err(py_type_err( "Encountered a non Aggregate type in agg_func_name", )), diff --git a/dask_planner/src/sql/logical/create_memory_table.rs b/dask_planner/src/sql/logical/create_memory_table.rs index 668295e0f..dd3d0753d 100644 --- a/dask_planner/src/sql/logical/create_memory_table.rs +++ b/dask_planner/src/sql/logical/create_memory_table.rs @@ -1,5 +1,6 @@ use datafusion_python::datafusion_expr::{ logical_plan::{CreateMemoryTable, CreateView}, + DdlStatement, LogicalPlan, }; use pyo3::prelude::*; @@ -85,13 +86,13 @@ impl TryFrom for PyCreateMemoryTable { fn try_from(logical_plan: LogicalPlan) -> Result { Ok(match logical_plan { - LogicalPlan::CreateMemoryTable(create_memory_table) => PyCreateMemoryTable { - create_memory_table: Some(create_memory_table), + LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(cmt)) => PyCreateMemoryTable { + create_memory_table: Some(cmt), create_view: None, }, - LogicalPlan::CreateView(create_view) => PyCreateMemoryTable { + LogicalPlan::Ddl(DdlStatement::CreateView(cv)) => PyCreateMemoryTable { create_memory_table: None, - create_view: Some(create_view), + create_view: Some(cv), }, _ => return Err(py_type_err("unexpected plan")), }) diff --git a/dask_planner/src/sql/logical/drop_table.rs b/dask_planner/src/sql/logical/drop_table.rs index 7d58e8a47..f91baf28a 100644 --- a/dask_planner/src/sql/logical/drop_table.rs +++ b/dask_planner/src/sql/logical/drop_table.rs @@ -1,4 +1,7 @@ -use datafusion_python::datafusion_expr::logical_plan::{DropTable, LogicalPlan}; +use datafusion_python::datafusion_expr::{ + logical_plan::{DropTable, LogicalPlan}, + DdlStatement, +}; use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; @@ -27,7 +30,7 @@ impl TryFrom for PyDropTable { fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - LogicalPlan::DropTable(drop_table) => Ok(PyDropTable { drop_table }), + LogicalPlan::Ddl(DdlStatement::DropTable(drop_table)) => Ok(PyDropTable { drop_table }), _ => Err(py_type_err("unexpected plan")), } } diff --git a/dask_planner/src/sql/logical/subquery_alias.rs b/dask_planner/src/sql/logical/subquery_alias.rs index 1b23e5dc4..003e02045 100644 --- a/dask_planner/src/sql/logical/subquery_alias.rs +++ b/dask_planner/src/sql/logical/subquery_alias.rs @@ -14,7 +14,7 @@ impl PySubqueryAlias { /// Returns a Vec of the sort expressions #[pyo3(name = "getAlias")] pub fn alias(&self) -> PyResult { - Ok(self.subquery_alias.alias.clone()) + Ok(self.subquery_alias.alias.clone().to_string()) } } diff --git a/dask_planner/src/sql/logical/table_scan.rs b/dask_planner/src/sql/logical/table_scan.rs index 679d24c49..c54b53556 100644 --- a/dask_planner/src/sql/logical/table_scan.rs +++ b/dask_planner/src/sql/logical/table_scan.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use datafusion_python::{ datafusion_common::{DFSchema, ScalarValue}, - datafusion_expr::{logical_plan::TableScan, Expr, LogicalPlan}, + datafusion_expr::{expr::InList, logical_plan::TableScan, Expr, LogicalPlan}, }; use pyo3::prelude::*; @@ -50,11 +50,11 @@ impl PyTableScan { let mut filter_tuple: Vec<(String, String, Vec)> = Vec::new(); match filter { - Expr::InList { + Expr::InList(InList { expr, list, negated, - } => { + }) => { // Only handle simple Expr(s) for InList operations for now if PyTableScan::_valid_expr_type(list) { // While ANSI SQL would not allow for anything other than a Column or Literal diff --git a/dask_planner/src/sql/optimizer.rs b/dask_planner/src/sql/optimizer.rs index 68577cf2c..cc86d2387 100644 --- a/dask_planner/src/sql/optimizer.rs +++ b/dask_planner/src/sql/optimizer.rs @@ -4,8 +4,6 @@ use datafusion_python::{ datafusion_common::DataFusionError, datafusion_expr::LogicalPlan, datafusion_optimizer::{ - decorrelate_where_exists::DecorrelateWhereExists, - decorrelate_where_in::DecorrelateWhereIn, eliminate_cross_join::EliminateCrossJoin, eliminate_limit::EliminateLimit, eliminate_outer_join::EliminateOuterJoin, @@ -43,8 +41,6 @@ impl DaskSqlOptimizer { Arc::new(SimplifyExpressions::new()), Arc::new(UnwrapCastInComparison::new()), // Arc::new(ReplaceDistinctWithAggregate::new()), - Arc::new(DecorrelateWhereExists::new()), - Arc::new(DecorrelateWhereIn::new()), Arc::new(ScalarSubqueryToJoin::new()), //Arc::new(ExtractEquijoinPredicate::new()), diff --git a/dask_planner/src/sql/table.rs b/dask_planner/src/sql/table.rs index f25f891ec..10fe97b1a 100644 --- a/dask_planner/src/sql/table.rs +++ b/dask_planner/src/sql/table.rs @@ -2,7 +2,7 @@ use std::{any::Any, sync::Arc}; use async_trait::async_trait; use datafusion_python::{ - datafusion::arrow::datatypes::{DataType, Field, SchemaRef}, + datafusion::arrow::datatypes::{DataType, Fields, SchemaRef}, datafusion_common::DFField, datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableSource}, datafusion_optimizer::utils::split_conjunction, @@ -195,7 +195,7 @@ pub(crate) fn table_from_logical_plan( // Get the TableProvider for this Table instance let tbl_provider: Arc = table_scan.source.clone(); let tbl_schema: SchemaRef = tbl_provider.schema(); - let fields: &Vec = tbl_schema.fields(); + let fields: &Fields = tbl_schema.fields(); let mut cols: Vec<(String, DaskTypeMap)> = Vec::new(); for field in fields { diff --git a/dask_planner/src/sql/types.rs b/dask_planner/src/sql/types.rs index ceff904a6..4642a4eb0 100644 --- a/dask_planner/src/sql/types.rs +++ b/dask_planner/src/sql/types.rs @@ -1,6 +1,8 @@ pub mod rel_data_type; pub mod rel_data_type_field; +use std::sync::Arc; + use datafusion_python::{ datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}, datafusion_sql::sqlparser::{ast::DataType as SQLType, parser::Parser, tokenizer::Tokenizer}, @@ -54,10 +56,12 @@ impl DaskTypeMap { SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { let (unit, tz) = match py_kwargs { Some(dict) => { - let tz: Option = match dict.get_item("tz") { + let tz: Option> = match dict.get_item("tz") { Some(e) => { let res: PyResult = e.extract(); - Some(res.unwrap()) + Some(Arc::from(>::as_ref( + &res.unwrap(), + ))) } None => None, }; @@ -85,10 +89,12 @@ impl DaskTypeMap { SqlTypeName::TIMESTAMP => { let (unit, tz) = match py_kwargs { Some(dict) => { - let tz: Option = match dict.get_item("tz") { + let tz: Option> = match dict.get_item("tz") { Some(e) => { let res: PyResult = e.extract(); - Some(res.unwrap()) + Some(Arc::from(>::as_ref( + &res.unwrap(), + ))) } None => None, }; From c59cdbd838ce868051aa88f9d275197c31547cfe Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 14 Jun 2023 14:52:02 -0400 Subject: [PATCH 14/44] warn on optimization failure instead of erroring and exiting --- dask_sql/context.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dask_sql/context.py b/dask_sql/context.py index 837f7cd1c..d2247b1ac 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -42,7 +42,7 @@ from dask_sql.mappings import python_to_sql_type from dask_sql.physical.rel import RelConverter, custom, logical from dask_sql.physical.rex import RexConverter, core -from dask_sql.utils import OptimizationException, ParsingException +from dask_sql.utils import ParsingException logger = logging.getLogger(__name__) @@ -824,8 +824,9 @@ def _get_ral(self, sql): try: rel = self.context.optimize_relational_algebra(nonOptimizedRel) except DFOptimizationException as oe: + # Use original plan and warn about inability to optimize plan rel = nonOptimizedRel - raise OptimizationException(str(oe)) from None + logger.warn(str(oe)) else: rel = nonOptimizedRel From 5c02c5aafb46bbb377ae227e6448980a350008a9 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:17:45 -0700 Subject: [PATCH 15/44] Resolve initial build errors --- dask_planner/src/dialect.rs | 1 + .../src/sql/optimizer/dynamic_partition_pruning.rs | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dask_planner/src/dialect.rs b/dask_planner/src/dialect.rs index e1067d3f0..da4e213e1 100644 --- a/dask_planner/src/dialect.rs +++ b/dask_planner/src/dialect.rs @@ -226,6 +226,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } _ => Ok(None), diff --git a/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs b/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs index 0ff48a682..ac931b560 100644 --- a/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs +++ b/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs @@ -22,6 +22,7 @@ use datafusion_python::{ }, datafusion_common::{Column, Result, ScalarValue}, datafusion_expr::{ + expr::InList, logical_plan::LogicalPlan, utils::from_plan, Expr, @@ -433,13 +434,13 @@ fn gather_aliases(plan: &LogicalPlan) -> HashMap { if let LogicalPlan::SubqueryAlias(ref s) = current_plan { match *s.input { LogicalPlan::TableScan(ref t) => { - aliases.insert(s.alias.clone(), t.table_name.to_string().clone()); + aliases.insert(s.alias.to_string(), t.table_name.to_string().clone()); } // Sometimes a TableScan is immediately followed by a Projection, so we can // still use the alias for the table LogicalPlan::Projection(ref p) => { if let LogicalPlan::TableScan(ref t) = *p.input { - aliases.insert(s.alias.clone(), t.table_name.to_string().clone()); + aliases.insert(s.alias.to_string(), t.table_name.to_string().clone()); } } _ => (), @@ -1053,11 +1054,11 @@ fn format_inlist_expr( if list.is_empty() { None } else { - Some(Expr::InList { + Some(Expr::InList(InList { expr, list, negated: false, - }) + })) } } From 515dae6a9ffdbc7f2e240bcaf94f7154e90c65c5 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 7 Jul 2023 10:45:38 -0700 Subject: [PATCH 16/44] Switch to crates release, add zlib to host/build deps --- continuous_integration/recipe/meta.yaml | 1 + dask_planner/Cargo.lock | 3 ++- dask_planner/Cargo.toml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 5152cfc4e..d8ee1e0d1 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,6 +32,7 @@ requirements: - python - setuptools-rust - libprotobuf + - zlib run: - python - dask >=2022.3.0 diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 615d90ab5..073939f3f 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -819,7 +819,8 @@ dependencies = [ [[package]] name = "datafusion-python" version = "26.0.0" -source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#ce83d8a837ac2bd8473cbe3432cebd3eb90a6e1c" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d406c7f116547044c2039be6d055c19c680a4ab8b1a550f0403c0ae276dff3c5" dependencies = [ "async-trait", "datafusion", diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index 3afada895..f688956dc 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "logical_extension" } +datafusion-python = "26.0.0" env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } From ef399e897fd7278ed55c8a644f78fa160561ae83 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:59:01 -0700 Subject: [PATCH 17/44] Add zlib to aarch build deps --- continuous_integration/recipe/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index d8ee1e0d1..02e58d1fb 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -25,6 +25,7 @@ requirements: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] + - zlib # [build_platform != target_platform] - {{ compiler('c') }} - {{ compiler('rust') }} host: From 68585782059ba8f3acd03df54a9b42e4a99c5b3f Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 11 Jul 2023 21:31:05 -0400 Subject: [PATCH 18/44] Bump to ADP 27 and introduce support for wildcard expressions, a wildcard expression name will be subbed with the first column in the incoming schema plan --- dask_planner/Cargo.lock | 267 +++++++++++---------- dask_planner/Cargo.toml | 4 +- dask_planner/src/expression.rs | 112 +++++---- dask_planner/src/sql.rs | 32 ++- dask_sql/physical/rel/logical/aggregate.py | 36 ++- dask_sql/physical/rex/core/call.py | 4 + tests/integration/test_join.py | 4 +- tests/integration/test_select.py | 12 + 8 files changed, 287 insertions(+), 184 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 8fdffdab2..1a75a215d 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -51,6 +51,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -91,8 +97,8 @@ dependencies = [ "serde", "serde_json", "snap", - "strum", - "strum_macros", + "strum 0.24.1", + "strum_macros 0.24.3", "thiserror", "typed-builder", "uuid", @@ -119,9 +125,9 @@ checksum = "8868f09ff8cea88b079da74ae569d9b8c62a23c68c746240b704ee6f7525c89c" [[package]] name = "arrow" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6619cab21a0cdd8c9b9f1d9e09bfaa9b1974e5ef809a6566aef0b998caf38ace" +checksum = "773d18d72cd290f3f9e2149a714c8ac404b6c3fd614c684f0015449940fca899" dependencies = [ "ahash", "arrow-arith", @@ -142,9 +148,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0dc95485623a76e00929bda8caa40c1f838190952365c4f43a7b9ae86d03e94" +checksum = "93bc0da4b22ba63807fa2a74998e21209179c93c67856ae65d9218b81f3ef918" dependencies = [ "arrow-array", "arrow-buffer", @@ -157,9 +163,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3267847f53d3042473cfd2c769afd8d74a6d7d201fc3a34f5cb84c0282ef47a7" +checksum = "ea9a0fd21121304cad96f307c938d861cb1e7f0c151b93047462cd9817d760fb" dependencies = [ "ahash", "arrow-buffer", @@ -168,15 +174,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "num", ] [[package]] name = "arrow-buffer" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f66553e66e120ac4b21570368ee9ebf35ff3f5399f872b0667699e145678f5" +checksum = "30ce342ecf5971004e23cef8b5fb3bacd2bbc48a381464144925074e1472e9eb" dependencies = [ "half", "num", @@ -184,9 +190,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e6f3579dbf0d97c683d451b2550062b0f0e62a3169bf74238b5f59f44ad6d8" +checksum = "4b94a0ce7d27abbb02e2ee4db770f593127610f57b32625b0bc6a1a90d65f085" dependencies = [ "arrow-array", "arrow-buffer", @@ -195,15 +201,16 @@ dependencies = [ "arrow-select", "chrono", "comfy-table", + "half", "lexical-core", "num", ] [[package]] name = "arrow-csv" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "373579c4c1a8f5307d3125b7a89c700fcf8caf85821c77eb4baab3855ae0aba5" +checksum = "0f3be10a00a43c4bf0d243c070754ebdde17c5d576b4928d9c3efbe3005a3853" dependencies = [ "arrow-array", "arrow-buffer", @@ -220,9 +227,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61bc8df9912cca6642665fdf989d6fa0de2570f18a7f709bcf59d29de96d2097" +checksum = "1d9a83dad6a53d6907765106d3bc61d6d9d313cfe1751701b3ef0948e7283dc2" dependencies = [ "arrow-buffer", "arrow-schema", @@ -232,9 +239,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0105dcf5f91daa7182d87b713ee0b32b3bfc88e0c48e7dc3e9d6f1277a07d1ae" +checksum = "a46da5e438a854e0386b38774da88a98782c0973c6dbc5c949ca4e02faf9b016" dependencies = [ "arrow-array", "arrow-buffer", @@ -246,9 +253,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e73134fb5b5ec8770f8cbb214c2c487b2d350081e403ca4eeeb6f8f5e19846ac" +checksum = "d5f27a1fbc76553ad92dc1a9583e56b7058d8c418c4089b0b689f5b87e2da5e1" dependencies = [ "arrow-array", "arrow-buffer", @@ -266,9 +273,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89f25bc66e18d4c2aa1fe2f9bb03e2269da60e636213210385ae41a107f9965a" +checksum = "f2373661f6c2233e18f6fa69c40999a9440231d1e8899be8bbbe73c7e24aa3b4" dependencies = [ "arrow-array", "arrow-buffer", @@ -281,9 +288,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1095ff85ea4f5ff02d17b30b089de31b51a50be01c6b674f0a0509ab771232f1" +checksum = "377cd5158b7de4034a175e296726c40c3236e65d71d90a5dab2fb4fab526a8f4" dependencies = [ "ahash", "arrow-array", @@ -291,23 +298,23 @@ dependencies = [ "arrow-data", "arrow-schema", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", ] [[package]] name = "arrow-schema" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25187bbef474151a2e4ddec67b9e34bda5cbfba292dc571392fa3a1f71ff5a82" +checksum = "ba9ed245bd2d7d97ad1457cb281d4296e8b593588758b8fec6d67b2b2b0f2265" dependencies = [ "bitflags 2.3.2", ] [[package]] name = "arrow-select" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0d4ee884aec3aa05e41478e3cd312bf609de9babb5d187a43fb45931da4da4" +checksum = "0dc9bd6aebc565b1d04bae64a0f4dda3abc677190eb7d960471b1b20e1cebed0" dependencies = [ "arrow-array", "arrow-buffer", @@ -318,9 +325,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6d71c3ffe4c07e66ce8fdc6aed5b00e0e60c5144911879b10546f5b72d8fa1c" +checksum = "23cf2baea2ef53787332050decf7d71aca836a352e188c8ad062892405955d2b" dependencies = [ "arrow-array", "arrow-buffer", @@ -539,12 +546,12 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.2.0" +version = "7.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e959d788268e3bf9d35ace83e81b124190378e4c91c9067524675e33394b8ba" +checksum = "9ab77dbd8adecaf3f0db40581631b995f312a8a5ae3aa9993188bb8f23d83a5b" dependencies = [ - "strum", - "strum_macros", + "strum 0.24.1", + "strum_macros 0.24.3", "unicode-width", ] @@ -659,15 +666,15 @@ dependencies = [ "env_logger", "log", "pyo3", - "pyo3-build-config 0.19.1", + "pyo3-build-config", "pyo3-log", ] [[package]] name = "datafusion" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9992c267436551d40b52d65289b144712e7b0ebdc62c8c859fd1574e5f73efbb" +checksum = "e96f6e4eb10bd3e6b709686858246466983e8c5354a928ff77ee34919aa60d00" dependencies = [ "ahash", "apache-avro", @@ -690,9 +697,9 @@ dependencies = [ "flate2", "futures", "glob", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "indexmap", - "itertools", + "itertools 0.11.0", "lazy_static", "log", "num-traits", @@ -707,7 +714,6 @@ dependencies = [ "sqlparser", "tempfile", "tokio", - "tokio-stream", "tokio-util", "url", "uuid", @@ -717,9 +723,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3be97f7a7c720cdbb71e9eeabf814fa6ad8102b9022390f6cac74d3b4af6392" +checksum = "00e5fddcc0dd49bbe199e43aa406f39c46c790bb2a43c7b36a478e5f3f971235" dependencies = [ "apache-avro", "arrow", @@ -734,14 +740,14 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c77c4b14b809b0e4c5bb101b6834504f06cdbb0d3c643400c61d0d844b33264e" +checksum = "cfd50b6cb17acc78d2473c0d28014b8fd4e2e0a2c067c07645d6547b33b0aeeb" dependencies = [ "dashmap", "datafusion-common", "datafusion-expr", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "log", "object_store", "parking_lot", @@ -752,24 +758,24 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ec7409bd45cf4fae6395d7d1024c8a97e543cadc88363e405d2aad5330e5e7" +checksum = "e1a35dc2cd9eac18063d636f7ddf4f090fe1f34284d80192ac7ade38cc3c6991" dependencies = [ "ahash", "arrow", "datafusion-common", "lazy_static", "sqlparser", - "strum", - "strum_macros", + "strum 0.25.0", + "strum_macros 0.25.1", ] [[package]] name = "datafusion-optimizer" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64b537c93f87989c212db92a448a0f5eb4f0995e27199bb7687ae94f8b64a7a8" +checksum = "5f5043afeb45ec1c0f45519e1eed6a477f2d30732e8f975d9cf9a75fba0ca716" dependencies = [ "arrow", "async-trait", @@ -777,17 +783,17 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.13.2", - "itertools", + "hashbrown 0.14.0", + "itertools 0.11.0", "log", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60ee3f53340fdef36ee54d9e12d446ae2718b1d0196ac581f791d34808ec876" +checksum = "6cc892a24f4b829ee7718ad3950884c0346dbdf1517f3df153af4bcf54d8ca4d" dependencies = [ "ahash", "arrow", @@ -801,9 +807,9 @@ dependencies = [ "datafusion-expr", "datafusion-row", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "indexmap", - "itertools", + "itertools 0.11.0", "lazy_static", "libc", "md-5", @@ -818,9 +824,9 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d406c7f116547044c2039be6d055c19c680a4ab8b1a550f0403c0ae276dff3c5" +checksum = "3311b157d1afe2a363d37a5ccb675384aa76e6033572ef9246f8af1579e6f0b2" dependencies = [ "async-trait", "datafusion", @@ -836,7 +842,7 @@ dependencies = [ "prost", "prost-types", "pyo3", - "pyo3-build-config 0.18.3", + "pyo3-build-config", "rand", "regex-syntax", "syn 2.0.23", @@ -847,9 +853,9 @@ dependencies = [ [[package]] name = "datafusion-row" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58fc64058aa3bcb00077a0d19474a0d584d31dec8c7ac3406868f485f659af9" +checksum = "ce75c660bbddfdd254109e668e5b5bd69df31ea26e3768e15cef0c68015e650e" dependencies = [ "arrow", "datafusion-common", @@ -859,9 +865,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1531f0314151a34bf6c0a83c7261525688b7c729876f53e7896b8f4ca8f57d07" +checksum = "49cab87e4933a452e0b7b3f0cbd0e760daf7d33fb54d09d70d3ffba229eaa652" dependencies = [ "arrow", "arrow-schema", @@ -873,14 +879,14 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "079d5be5ec59580777bfa16d79187fea99b6498e3e8e07eb36d504a5fe708f13" +checksum = "ba77d22232053f6cdd98bd6f5328940850844450253f25b8c50bfc5199c505d4" dependencies = [ "async-recursion", "chrono", "datafusion", - "itertools", + "itertools 0.11.0", "object_store", "prost", "prost-types", @@ -1183,6 +1189,16 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +dependencies = [ + "ahash", + "allocator-api2", +] + [[package]] name = "heck" version = "0.4.1" @@ -1383,6 +1399,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -1613,9 +1638,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -1750,16 +1775,18 @@ dependencies = [ [[package]] name = "object_store" -version = "0.5.6" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec9cd6ca25e796a49fa242876d1c4de36a24a6da5258e9f0bc062dbf5e81c53b" +checksum = "27c776db4f332b571958444982ff641d2531417a326ca368995073b639205d58" dependencies = [ "async-trait", "base64", "bytes", "chrono", "futures", - "itertools", + "humantime", + "hyper", + "itertools 0.10.5", "parking_lot", "percent-encoding", "quick-xml", @@ -1816,9 +1843,9 @@ dependencies = [ [[package]] name = "parquet" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6a656fcc17e641657c955742c689732684e096f790ff30865d9f8dcc39f7c4a" +checksum = "baab9c36b1c8300b81b4d577d306a0a733f9d34021363098d3548e37757ed6c8" dependencies = [ "ahash", "arrow-array", @@ -1834,7 +1861,7 @@ dependencies = [ "chrono", "flate2", "futures", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "lz4", "num", "num-bigint", @@ -1984,7 +2011,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck", - "itertools", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -2003,7 +2030,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", "syn 1.0.109", @@ -2020,31 +2047,21 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b1ac5b3731ba34fdaa9785f8d74d17448cd18f30cf19e0c7e7b1fdb5272109" +checksum = "ffb88ae05f306b4bfcde40ac4a51dc0b05936a9207a4b75b798c7729c4258a59" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", "parking_lot", - "pyo3-build-config 0.18.3", + "pyo3-build-config", "pyo3-ffi", "pyo3-macros", "unindent", ] -[[package]] -name = "pyo3-build-config" -version = "0.18.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cb946f5ac61bb61a5014924910d936ebd2b23b705f7a4a3c40b05c720b079a3" -dependencies = [ - "once_cell", - "target-lexicon", -] - [[package]] name = "pyo3-build-config" version = "0.19.1" @@ -2057,12 +2074,12 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd4d7c5337821916ea2a1d21d1092e8443cf34879e53a0ac653fbb98f44ff65c" +checksum = "922ede8759e8600ad4da3195ae41259654b9c55da4f7eec84a0ccc7d067a70a4" dependencies = [ "libc", - "pyo3-build-config 0.18.3", + "pyo3-build-config", ] [[package]] @@ -2078,9 +2095,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d39c55dab3fc5a4b25bbd1ac10a2da452c4aca13bb450f22818a002e29648d" +checksum = "8a5caec6a1dd355964a841fcbeeb1b89fe4146c87295573f94228911af3cc5a2" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2090,9 +2107,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97daff08a4c48320587b5224cc98d609e3c27b6d437315bd40b605c98eeb5918" +checksum = "e0b78ccbb160db1556cdb6fd96c50334c5d4ec44dc5e0a968d0a1208fa0efa8b" dependencies = [ "proc-macro2", "quote", @@ -2536,9 +2553,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "sqlparser" -version = "0.34.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3706eefb17039056234df6b566b0014f303f867f2656108334a55b8096f59" +checksum = "ca597d77c98894be1f965f2e4e2d2a61575d4998088e655476c73715c54b2b43" dependencies = [ "log", "sqlparser_derive", @@ -2566,8 +2583,14 @@ name = "strum" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" dependencies = [ - "strum_macros", + "strum_macros 0.25.1", ] [[package]] @@ -2583,11 +2606,24 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum_macros" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6069ca09d878a33f883cc06aaa9718ede171841d3832450354410b718b097232" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.23", +] + [[package]] name = "substrait" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9df5d9e071804204172dc77e707c363f187e7f6566f9c78e5100c9a8f5ea434e" +checksum = "7d3b77ddddd080d1bb5ebfe6b62d1c4e2f33c9f6a4586d5eac5306a08f3d4585" dependencies = [ "git2", "heck", @@ -2756,17 +2792,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-stream" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-util" version = "0.7.8" @@ -2854,9 +2879,9 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "typify" -version = "0.0.12" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6658d09e71bfe59e7987dc95ee7f71809fdb5793ab0cdc1503cc0073990484d" +checksum = "be9bb640c0eece20cac2028ebbc2ca1a3d17e3b1ddd98540309c309ed178d158" dependencies = [ "typify-impl", "typify-macro", @@ -2864,9 +2889,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.12" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34d3bb47587b13edf526d6ed02bf360ecefe083ab47a4ef29fc43112828b2bef" +checksum = "5c8d9ecedde2fd77e975c38eeb9ca40b34ad0247b2259c6e6bbd2a8d6cc2444f" dependencies = [ "heck", "log", @@ -2882,9 +2907,9 @@ dependencies = [ [[package]] name = "typify-macro" -version = "0.0.12" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3f7e627c18be12d53bc1f261830b9c2763437b6a86ac57293b9085af2d32ffe" +checksum = "c08942cd65d458d2da15777a649cb6400cb545f17964f1ca965583f22e9cc3a9" dependencies = [ "proc-macro2", "quote", diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index 55d3e0882..8a849628b 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,10 +10,10 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.71" -datafusion-python = "26.0.0" +datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" -pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" [build-dependencies] diff --git a/dask_planner/src/expression.rs b/dask_planner/src/expression.rs index 80e7b88fa..d13f66e89 100644 --- a/dask_planner/src/expression.rs +++ b/dask_planner/src/expression.rs @@ -104,9 +104,10 @@ impl PyExpr { fn _rex_type(&self, expr: &Expr) -> RexType { match expr { Expr::Alias(..) => RexType::Alias, - Expr::Column(..) | Expr::QualifiedWildcard { .. } | Expr::GetIndexedField { .. } => { - RexType::Reference - } + Expr::Column(..) + | Expr::QualifiedWildcard { .. } + | Expr::GetIndexedField { .. } + | Expr::Wildcard => RexType::Reference, Expr::ScalarVariable(..) | Expr::Literal(..) => RexType::Literal, Expr::BinaryExpr { .. } | Expr::Not(..) @@ -126,7 +127,6 @@ impl PyExpr { | Expr::WindowFunction { .. } | Expr::AggregateUDF { .. } | Expr::InList { .. } - | Expr::Wildcard | Expr::ScalarUDF { .. } | Expr::Exists { .. } | Expr::InSubquery { .. } @@ -197,49 +197,54 @@ impl PyExpr { schema.merge(plan.schema().as_ref()); } let name = get_expr_name(&self.expr).map_err(py_runtime_err)?; - schema - .index_of_column(&Column::from_qualified_name(name.clone())) - .or_else(|_| { - // Handles cases when from_qualified_name doesn't format the Column correctly. - // "name" will always contain the name of the column. Anything in addition to - // that will be separated by a '.' and should be further referenced. - let parts = name.split('.').collect::>(); - let tbl_reference = match parts.len() { - // Single element means name contains just the column name so no TableReference - 1 => None, - // Tablename.column_name - 2 => Some( - TableReference::Bare { - table: Cow::Borrowed(parts[0]), - } - .to_owned_reference(), - ), - // Schema_name.table_name.column_name - 3 => Some( - TableReference::Partial { - schema: Cow::Borrowed(parts[0]), - table: Cow::Borrowed(parts[1]), - } - .to_owned_reference(), - ), - // catalog_name.schema_name.table_name.column_name - 4 => Some( - TableReference::Full { - catalog: Cow::Borrowed(parts[0]), - schema: Cow::Borrowed(parts[1]), - table: Cow::Borrowed(parts[2]), - } - .to_owned_reference(), - ), - _ => None, - }; - - let col = Column { - relation: tbl_reference.clone(), - name: parts[parts.len() - 1].to_string(), - }; - schema.index_of_column(&col).map_err(py_runtime_err) - }) + if name != "*" { + schema + .index_of_column(&Column::from_qualified_name(name.clone())) + .or_else(|_| { + // Handles cases when from_qualified_name doesn't format the Column correctly. + // "name" will always contain the name of the column. Anything in addition to + // that will be separated by a '.' and should be further referenced. + let parts = name.split('.').collect::>(); + let tbl_reference = match parts.len() { + // Single element means name contains just the column name so no TableReference + 1 => None, + // Tablename.column_name + 2 => Some( + TableReference::Bare { + table: Cow::Borrowed(parts[0]), + } + .to_owned_reference(), + ), + // Schema_name.table_name.column_name + 3 => Some( + TableReference::Partial { + schema: Cow::Borrowed(parts[0]), + table: Cow::Borrowed(parts[1]), + } + .to_owned_reference(), + ), + // catalog_name.schema_name.table_name.column_name + 4 => Some( + TableReference::Full { + catalog: Cow::Borrowed(parts[0]), + schema: Cow::Borrowed(parts[1]), + table: Cow::Borrowed(parts[2]), + } + .to_owned_reference(), + ), + _ => None, + }; + + let col = Column { + relation: tbl_reference.clone(), + name: parts[parts.len() - 1].to_string(), + }; + schema.index_of_column(&col).map_err(py_runtime_err) + }) + } else { + // Since this is wildcard any Column will do, just use first one + Ok(0) + } } _ => Err(py_runtime_err( "We need a valid LogicalPlan instance to get the Expr's index in the schema", @@ -425,11 +430,14 @@ impl PyExpr { PyExpr::from(*low.clone(), self.input_plan.clone()), PyExpr::from(*high.clone(), self.input_plan.clone()), ]), + Expr::Wildcard => Ok(vec![PyExpr::from( + self.expr.clone(), + self.input_plan.clone(), + )]), // Currently un-support/implemented Expr types for Rex Call operations Expr::GroupingSet(..) | Expr::OuterReferenceColumn(_, _) - | Expr::Wildcard | Expr::QualifiedWildcard { .. } | Expr::ScalarSubquery(..) | Expr::Placeholder { .. } @@ -894,6 +902,10 @@ fn unexpected_literal_value(value: &ScalarValue) -> PyErr { fn get_expr_name(expr: &Expr) -> Result { match expr { Expr::Alias(expr, _) => get_expr_name(expr), + Expr::Wildcard => { + // 'Wildcard' means any and all columns. We get the first valid column name here + Ok("*".to_owned()) + } _ => Ok(expr.canonical_name()), } } @@ -906,6 +918,10 @@ pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { // appear in projections) so we just delegate to the contained expression instead expr_to_field(expr, input_plan) } + Expr::Wildcard => { + // Any column will do. We use the first column to keep things consistent + Ok(input_plan.schema().field(0).clone()) + } _ => { let fields = exprlist_to_fields(&[expr.clone()], input_plan).map_err(DaskPlannerError::from)?; diff --git a/dask_planner/src/sql.rs b/dask_planner/src/sql.rs index a0e238727..39d4614d4 100644 --- a/dask_planner/src/sql.rs +++ b/dask_planner/src/sql.rs @@ -21,7 +21,7 @@ use datafusion_python::{ }, datafusion_expr::{ logical_plan::Extension, - AccumulatorFunctionImplementation, + AccumulatorFactoryFunction, AggregateUDF, LogicalPlan, ReturnTypeFunction, @@ -385,7 +385,7 @@ impl ContextProvider for DaskSQLContext { } fn get_aggregate_meta(&self, name: &str) -> Option> { - let acc: AccumulatorFunctionImplementation = + let acc: AccumulatorFactoryFunction = Arc::new(|_return_type| Err(DataFusionError::NotImplemented("".to_string()))); let st: StateTypeFunction = @@ -478,6 +478,13 @@ impl ContextProvider for DaskSQLContext { fn options(&self) -> &ConfigOptions { &self.options } + + fn get_window_meta( + &self, + _name: &str, + ) -> Option> { + unimplemented!("RUST: get_window_meta is not yet implemented for DaskSQLContext") + } } #[pymethods] @@ -592,14 +599,19 @@ impl DaskSQLContext { current_node: None, }) .map_err(py_optimization_exp); - if self.dynamic_partition_pruning { - optimizer::DaskSqlOptimizer::dynamic_partition_pruner() - .optimize_once(optimized_plan.unwrap().original_plan) - .map(|k| PyLogicalPlan { - original_plan: k, - current_node: None, - }) - .map_err(py_optimization_exp) + + if let Ok(optimized_plan) = optimized_plan { + if self.dynamic_partition_pruning { + optimizer::DaskSqlOptimizer::dynamic_partition_pruner() + .optimize_once(optimized_plan.original_plan) + .map(|k| PyLogicalPlan { + original_plan: k, + current_node: None, + }) + .map_err(py_optimization_exp) + } else { + Ok(optimized_plan) + } } else { optimized_plan } diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 84c832177..e6b6ed30b 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -127,6 +127,7 @@ class DaskAggregatePlugin(BaseRelPlugin): "avg": AggregationSpecification("mean", AggregationOnPandas("mean")), "stddev": AggregationSpecification("std", AggregationOnPandas("std")), "stddevsamp": AggregationSpecification("std", AggregationOnPandas("std")), + "stddev_samp": AggregationSpecification("std", AggregationOnPandas("std")), "stddevpop": AggregationSpecification( dd.Aggregation( "stddevpop", @@ -142,6 +143,21 @@ class DaskAggregatePlugin(BaseRelPlugin): ** (1 / 2), ) ), + "stddev_pop": AggregationSpecification( + dd.Aggregation( + "stddev_pop", + lambda s: (s.count(), s.sum(), s.agg(lambda x: (x**2).sum())), + lambda count, sum, sum_of_squares: ( + count.sum(), + sum.sum(), + sum_of_squares.sum(), + ), + lambda count, sum, sum_of_squares: ( + (sum_of_squares / count) - (sum / count) ** 2 + ) + ** (1 / 2), + ) + ), "bit_and": AggregationSpecification( ReduceAggregation("bit_and", operator.and_) ), @@ -198,6 +214,20 @@ class DaskAggregatePlugin(BaseRelPlugin): ), ) ), + "variance_pop": AggregationSpecification( + dd.Aggregation( + "variance_pop", + lambda s: (s.count(), s.sum(), s.agg(lambda x: (x**2).sum())), + lambda count, sum, sum_of_squares: ( + count.sum(), + sum.sum(), + sum_of_squares.sum(), + ), + lambda count, sum, sum_of_squares: ( + (sum_of_squares / count) - (sum / count) ** 2 + ), + ) + ), } def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: @@ -378,7 +408,11 @@ def _collect_aggregations( "AggregateUDF", }, "Do not know how to handle this case!" for input_expr in agg.getArgs(expr): - input_col = input_expr.column_name(input_rel) + # Wildcard expr + if input_expr.toString() != "*": + input_col = input_expr.column_name(input_rel) + else: + input_col = None if input_col not in cc._frontend_backend_mapping: random_name = new_temporary_column(df) new_columns[random_name] = RexConverter.convert( diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 85d083d78..5ef1d7fb8 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -1077,6 +1077,9 @@ class RexCallPlugin(BaseRexPlugin): "characterlength": TensorScalarOperation( lambda x: x.str.len(), lambda x: len(x) ), + "character_length": TensorScalarOperation( + lambda x: x.str.len(), lambda x: len(x) + ), "upper": TensorScalarOperation(lambda x: x.str.upper(), lambda x: x.upper()), "lower": TensorScalarOperation(lambda x: x.str.lower(), lambda x: x.lower()), "position": PositionOperation(), @@ -1104,6 +1107,7 @@ class RexCallPlugin(BaseRexPlugin): "dsql_totimestamp": ToTimestampOperation(), # Temporary UDF functions that need to be moved after this POC "datepart": ExtractOperation(), + "date_part": ExtractOperation(), "year": YearOperation(), "timestampadd": TimeStampAddOperation(), "timestampceil": CeilFloorOperation("ceil"), diff --git a/tests/integration/test_join.py b/tests/integration/test_join.py index c46cec101..3f19a3211 100644 --- a/tests/integration/test_join.py +++ b/tests/integration/test_join.py @@ -377,7 +377,7 @@ def test_intersect(c): limit 100 """ ) - assert actual_df["COUNT(UInt8(1))"].compute()[0] == 3 + assert actual_df["COUNT(*)"].compute()[0] == 3 # Join df_simple against itself, and then that result against df_wide. Nothing should match so therefore result should be 0 actual_df = c.sql( @@ -392,7 +392,7 @@ def test_intersect(c): limit 100 """ ) - assert len(actual_df["COUNT(UInt8(1))"]) == 0 + assert len(actual_df["COUNT(*)"]) == 0 actual_df = c.sql( """ diff --git a/tests/integration/test_select.py b/tests/integration/test_select.py index 9c4331d77..53ebdc224 100644 --- a/tests/integration/test_select.py +++ b/tests/integration/test_select.py @@ -272,3 +272,15 @@ def test_multiple_column_projection(c, parquet_ddf, input_cols): "read-parquet", ).columns ) == sorted(input_cols) + + +def test_wildcard_select(c): + result_df = c.sql("SELECT COUNT(*) FROM df") + + expected_df = pd.DataFrame( + { + "COUNT(*)": [700], + } + ) + + assert_eq(result_df, expected_df) From 24e0f90c584478fc6ef936020c4851387a4f784a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 11 Jul 2023 21:40:38 -0400 Subject: [PATCH 19/44] remove bit of logic that is no longer needed to manually check the wildcard 'name' as a '*' --- dask_sql/physical/rel/logical/aggregate.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index e6b6ed30b..a14900f99 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -408,11 +408,7 @@ def _collect_aggregations( "AggregateUDF", }, "Do not know how to handle this case!" for input_expr in agg.getArgs(expr): - # Wildcard expr - if input_expr.toString() != "*": - input_col = input_expr.column_name(input_rel) - else: - input_col = None + input_col = input_expr.column_name(input_rel) if input_col not in cc._frontend_backend_mapping: random_name = new_temporary_column(df) new_columns[random_name] = RexConverter.convert( From d776229053e2a3f902ef2786cbdfc6f6ab9825ca Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 12 Jul 2023 08:30:57 -0400 Subject: [PATCH 20/44] experiment with removing zlib, hoping that fixes os x build --- continuous_integration/recipe/meta.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 02e58d1fb..5152cfc4e 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -25,7 +25,6 @@ requirements: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] - - zlib # [build_platform != target_platform] - {{ compiler('c') }} - {{ compiler('rust') }} host: @@ -33,7 +32,6 @@ requirements: - python - setuptools-rust - libprotobuf - - zlib run: - python - dask >=2022.3.0 From 99ec8010592a2854de21d58e0fc6b3172246f9a2 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 12 Jul 2023 08:47:59 -0400 Subject: [PATCH 21/44] Change expected_df result to 1.5 from 1. 3/2 is in fact 1.5 and not 1 --- tests/integration/test_rex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index b49a687d2..3f720e6d0 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -419,7 +419,7 @@ def test_coalesce(c, gpu): "c2": [np.nan], "c3": ["hi"], "c4": ["bye"], - "c5": ["1"], + "c5": ["1.5"], "c6": ["why"], "c7": [2.0], } From 8997f7f739c45f1b74ea024729799a56b1678bed Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 12 Jul 2023 09:03:39 -0400 Subject: [PATCH 22/44] Fix cargo test --- continuous_integration/recipe/meta.yaml | 2 ++ dask_planner/src/sql/optimizer.rs | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 5152cfc4e..02e58d1fb 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -25,6 +25,7 @@ requirements: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] + - zlib # [build_platform != target_platform] - {{ compiler('c') }} - {{ compiler('rust') }} host: @@ -32,6 +33,7 @@ requirements: - python - setuptools-rust - libprotobuf + - zlib run: - python - dask >=2022.3.0 diff --git a/dask_planner/src/sql/optimizer.rs b/dask_planner/src/sql/optimizer.rs index 8e8bc9235..a5957ac98 100644 --- a/dask_planner/src/sql/optimizer.rs +++ b/dask_planner/src/sql/optimizer.rs @@ -230,6 +230,13 @@ mod tests { fn get_variable_type(&self, _variable_names: &[String]) -> Option { None } + + fn get_window_meta( + &self, + _name: &str, + ) -> Option> { + None + } } struct MyTableSource { From ec1d2bf51005a6ffff5e8b4ef5e7b51a03740170 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Thu, 13 Jul 2023 13:46:29 -0400 Subject: [PATCH 23/44] add .cargo/config.toml in hopes of fixing linker build issues on osx --- .cargo/config.toml | 11 + dask_planner/Cargo.lock | 2 +- dask_planner/src/lib.rs | 2 + dask_planner/src/sql.rs | 1 + dask_planner/src/sql/logical.rs | 1 - dask_planner/src/sql/logical/table_scan.rs | 228 -------------------- dask_planner/src/sql/logical/utils.rs | 167 +++++++++++++- dask_sql/context.py | 3 - dask_sql/physical/rel/logical/table_scan.py | 7 +- 9 files changed, 182 insertions(+), 240 deletions(-) create mode 100644 .cargo/config.toml delete mode 100644 dask_planner/src/sql/logical/table_scan.rs diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 000000000..d47f983e4 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,11 @@ +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index c20d485e3..35b2b94dc 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -849,7 +849,7 @@ dependencies = [ [[package]] name = "datafusion-python" version = "27.0.0" -source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#4e421dbbf44e0093a0866f460790cfe52d3e6f2d" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#f3f27fe579062a470e76bfc62ece0188bdb574a4" dependencies = [ "async-trait", "datafusion", diff --git a/dask_planner/src/lib.rs b/dask_planner/src/lib.rs index 8dfc24b56..cd65240de 100644 --- a/dask_planner/src/lib.rs +++ b/dask_planner/src/lib.rs @@ -60,6 +60,8 @@ fn rust(py: Python, m: &PyModule) -> PyResult<()> { .unwrap(); m.add_wrapped(wrap_pyfunction!(sql::logical::utils::get_precision_scale)) .unwrap(); + m.add_wrapped(wrap_pyfunction!(sql::logical::utils::get_table_scan_dnf_filters)) + .unwrap(); // Exceptions m.add( diff --git a/dask_planner/src/sql.rs b/dask_planner/src/sql.rs index 80e238240..32dc03fc4 100644 --- a/dask_planner/src/sql.rs +++ b/dask_planner/src/sql.rs @@ -623,6 +623,7 @@ impl DaskSQLContext { let inner_plan = match dask_statement { DaskStatement::Statement(statement) => { let planner = SqlToRel::new(self); + println!("Invoking generic DaskStatement::Statement logic ...."); Ok::( planner.statement_to_plan(DFStatement::Statement(statement))?, ) diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index 80aa818e5..6b6336fff 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -63,7 +63,6 @@ pub mod show_schemas; pub mod show_tables; pub mod sort; pub mod subquery_alias; -pub mod table_scan; pub mod use_schema; pub mod utils; pub mod window; diff --git a/dask_planner/src/sql/logical/table_scan.rs b/dask_planner/src/sql/logical/table_scan.rs deleted file mode 100644 index 83898f63d..000000000 --- a/dask_planner/src/sql/logical/table_scan.rs +++ /dev/null @@ -1,228 +0,0 @@ -use std::{sync::Arc, vec}; - -use datafusion_python::{ - datafusion_common::{DFSchema, ScalarValue}, - datafusion_expr::{expr::InList, logical_plan::TableScan, Expr, LogicalPlan}, - errors::py_type_err, - expr::PyExpr, -}; -use pyo3::prelude::*; - -use super::utils::py_expr_list; -use crate::error::DaskPlannerError; - -#[pyclass(name = "TableScan", module = "dask_planner", subclass)] -#[derive(Clone)] -pub struct PyTableScan { - pub(crate) table_scan: TableScan, - input: Arc, -} - -type FilterTuple = (String, String, Option>); -#[pyclass(name = "FilteredResult", module = "dask_planner", subclass)] -#[derive(Debug, Clone)] -pub struct PyFilteredResult { - // Certain Expr(s) do not have supporting logic in pyarrow for IO filtering - // at read time. Those Expr(s) cannot be ignored however. This field stores - // those Expr(s) so that they can be used on the Python side to create - // Dask operations that handle that filtering as an extra task in the graph. - #[pyo3(get)] - pub io_unfilterable_exprs: Vec, - // Expr(s) that can have their filtering logic performed in the pyarrow IO logic - // are stored here in a DNF format that is expected by pyarrow. - #[pyo3(get)] - pub filtered_exprs: Vec<(PyExpr, FilterTuple)>, -} - -impl PyTableScan { - /// Ensures that a valid Expr variant type is present - fn _valid_expr_type(expr: &[Expr]) -> bool { - expr.iter() - .all(|f| matches!(f, Expr::Column(_) | Expr::Literal(_))) - } - - /// Transform the singular Expr instance into its DNF form serialized in a Vec instance. Possibly recursively expanding - /// it as well if needed. - pub fn _expand_dnf_filter( - filter: &Expr, - py: Python, - ) -> Result, DaskPlannerError> { - let mut filter_tuple: Vec<(PyExpr, FilterTuple)> = Vec::new(); - - match filter { - Expr::InList(InList { - expr, - list, - negated, - }) => { - // Only handle simple Expr(s) for InList operations for now - if PyTableScan::_valid_expr_type(list) { - // While ANSI SQL would not allow for anything other than a Column or Literal - // value in this "identifying" `expr` we explicitly check that here just to be sure. - // IF it is something else it is returned to Dask to handle - let ident = match *expr.clone() { - Expr::Column(col) => Ok(col.name), - Expr::Alias(_, name) => Ok(name), - Expr::Literal(val) => Ok(format!("{}", val)), - _ => Err(DaskPlannerError::InvalidIOFilter(format!( - "Invalid InList Expr type `{}`. using in Dask instead", - filter - ))), - }; - - let op = if *negated { "not in" } else { "in" }; - let il: Result, DaskPlannerError> = list - .iter() - .map(|f| match f { - Expr::Column(col) => Ok(col.name.clone().into_py(py)), - Expr::Alias(_, name) => Ok(name.clone().into_py(py)), - Expr::Literal(val) => match val { - ScalarValue::Boolean(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::Float32(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::Float64(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::Int8(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::Int16(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::Int32(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::Int64(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::UInt8(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::UInt16(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::UInt32(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::UInt64(val) => Ok(val.unwrap().into_py(py)), - ScalarValue::Utf8(val) => Ok(val.clone().unwrap().into_py(py)), - ScalarValue::LargeUtf8(val) => Ok(val.clone().unwrap().into_py(py)), - _ => Err(DaskPlannerError::InvalidIOFilter(format!( - "Unsupported ScalarValue `{}` encountered. using in Dask instead", - filter - ))), - }, - _ => Ok(f.canonical_name().into_py(py)), - }) - .collect(); - - filter_tuple.push(( - PyExpr::from(filter.clone()), - ( - ident.unwrap_or(expr.canonical_name()), - op.to_string(), - Some(il?), - ), - )); - Ok(filter_tuple) - } else { - let er = DaskPlannerError::InvalidIOFilter(format!( - "Invalid identifying column Expr instance `{}`. using in Dask instead", - filter - )); - Err::, DaskPlannerError>(er) - } - } - Expr::IsNotNull(expr) => { - // Only handle simple Expr(s) for IsNotNull operations for now - let ident = match *expr.clone() { - Expr::Column(col) => Ok(col.name), - _ => Err(DaskPlannerError::InvalidIOFilter(format!( - "Invalid IsNotNull Expr type `{}`. using in Dask instead", - filter - ))), - }; - - filter_tuple.push(( - PyExpr::from(filter.clone()), - ( - ident.unwrap_or(expr.canonical_name()), - "is not".to_string(), - None, - ), - )); - Ok(filter_tuple) - } - _ => { - let er = DaskPlannerError::InvalidIOFilter(format!( - "Unable to apply filter: `{}` to IO reader, using in Dask instead", - filter - )); - Err::, DaskPlannerError>(er) - } - } - } - - /// Consume the `TableScan` filters (Expr(s)) and convert them into a PyArrow understandable - /// DNF format that can be directly passed to PyArrow IO readers for Predicate Pushdown. Expr(s) - /// that cannot be converted to correlating PyArrow IO calls will be returned as is and can be - /// used in the Python logic to form Dask tasks for the graph to do computational filtering. - pub fn _expand_dnf_filters(filters: &[Expr], py: Python) -> PyFilteredResult { - let mut filtered_exprs: Vec<(PyExpr, FilterTuple)> = Vec::new(); - let mut unfiltered_exprs: Vec = Vec::new(); - - filters - .iter() - .for_each(|f| match PyTableScan::_expand_dnf_filter(f, py) { - Ok(mut expanded_dnf_filter) => filtered_exprs.append(&mut expanded_dnf_filter), - Err(_e) => unfiltered_exprs.push(PyExpr::from(f.clone())), - }); - - PyFilteredResult { - io_unfilterable_exprs: unfiltered_exprs, - filtered_exprs, - } - } -} - -#[pymethods] -impl PyTableScan { - #[pyo3(name = "getTableScanProjects")] - fn scan_projects(&mut self) -> PyResult> { - match &self.table_scan.projection { - Some(indices) => { - let schema = self.table_scan.source.schema(); - Ok(indices - .iter() - .map(|i| schema.field(*i).name().to_string()) - .collect()) - } - None => Ok(vec![]), - } - } - - /// If the 'TableScan' contains columns that should be projected during the - /// read return True, otherwise return False - #[pyo3(name = "containsProjections")] - fn contains_projections(&self) -> bool { - self.table_scan.projection.is_some() - } - - #[pyo3(name = "getFilters")] - fn scan_filters(&self) -> PyResult> { - py_expr_list(&self.input, &self.table_scan.filters) - } - - #[pyo3(name = "getDNFFilters")] - fn dnf_io_filters(&self, py: Python) -> PyResult { - let results = PyTableScan::_expand_dnf_filters(&self.table_scan.filters, py); - Ok(results) - } -} - -impl TryFrom for PyTableScan { - type Error = PyErr; - - fn try_from(logical_plan: LogicalPlan) -> Result { - match logical_plan { - LogicalPlan::TableScan(table_scan) => { - // Create an input logical plan that's identical to the table scan with schema from the table source - let mut input = table_scan.clone(); - input.projected_schema = DFSchema::try_from_qualified_schema( - &table_scan.table_name, - &table_scan.source.schema(), - ) - .map_or(input.projected_schema, Arc::new); - - Ok(PyTableScan { - table_scan, - input: Arc::new(LogicalPlan::TableScan(input)), - }) - } - _ => Err(py_type_err("unexpected plan")), - } - } -} diff --git a/dask_planner/src/sql/logical/utils.rs b/dask_planner/src/sql/logical/utils.rs index 4152ba8ee..1ad02be22 100644 --- a/dask_planner/src/sql/logical/utils.rs +++ b/dask_planner/src/sql/logical/utils.rs @@ -1,19 +1,21 @@ use std::sync::Arc; +use pyo3::prelude::*; + use datafusion_python::{ datafusion::arrow::datatypes::DataType, - datafusion_common::DFField, + datafusion_common::{DFField, ScalarValue}, datafusion_expr::{ - expr::Sort, + expr::{Sort, InList}, utils::exprlist_to_fields, Cast, DdlStatement, Expr, LogicalPlan, }, - expr::{projection::PyProjection, PyExpr}, + expr::{projection::PyProjection, PyExpr, table_scan::PyTableScan}, }; -use pyo3::{pyfunction, PyResult}; +use pyo3::{pyfunction, PyResult, PyObject}; use super::{ alter_schema::AlterSchemaPlanNode, @@ -330,3 +332,160 @@ pub fn get_precision_scale(expr: PyExpr) -> PyResult<(u8, i8)> { } }) } + + +type FilterTuple = (String, String, Option>); +#[pyclass(name = "FilteredResult", module = "dask_planner", subclass)] +#[derive(Debug, Clone)] +pub struct PyFilteredResult { + // Certain Expr(s) do not have supporting logic in pyarrow for IO filtering + // at read time. Those Expr(s) cannot be ignored however. This field stores + // those Expr(s) so that they can be used on the Python side to create + // Dask operations that handle that filtering as an extra task in the graph. + #[pyo3(get)] + pub io_unfilterable_exprs: Vec, + // Expr(s) that can have their filtering logic performed in the pyarrow IO logic + // are stored here in a DNF format that is expected by pyarrow. + #[pyo3(get)] + pub filtered_exprs: Vec<(PyExpr, FilterTuple)>, +} + + +#[pyfunction] +pub fn get_table_scan_dnf_filters(table_scan: PyTableScan, py: Python) -> PyResult { + let results = self::_expand_dnf_filters(&table_scan.table_scan.filters, py); + Ok(results) +} + + +/// Ensures that a valid Expr variant type is present +fn _valid_expr_type(expr: &[Expr]) -> bool { + expr.iter() + .all(|f| matches!(f, Expr::Column(_) | Expr::Literal(_))) +} + +/// Transform the singular Expr instance into its DNF form serialized in a Vec instance. Possibly recursively expanding +/// it as well if needed. +pub fn _expand_dnf_filter( + filter: &Expr, + py: Python, +) -> Result> { + let mut filter_tuple: Vec<(PyExpr, FilterTuple)> = Vec::new(); + + match filter { + Expr::InList(InList { + expr, + list, + negated, + }) => { + // Only handle simple Expr(s) for InList operations for now + if self::_valid_expr_type(list) { + // While ANSI SQL would not allow for anything other than a Column or Literal + // value in this "identifying" `expr` we explicitly check that here just to be sure. + // IF it is something else it is returned to Dask to handle + let ident = match *expr.clone() { + Expr::Column(col) => Ok(col.name), + Expr::Alias(_, name) => Ok(name), + Expr::Literal(val) => Ok(format!("{}", val)), + _ => Err(DaskPlannerError::InvalidIOFilter(format!( + "Invalid InList Expr type `{}`. using in Dask instead", + filter + ))), + }; + + let op = if *negated { "not in" } else { "in" }; + let il: Result> = list + .iter() + .map(|f| match f { + Expr::Column(col) => Ok(col.name.clone().into_py(py)), + Expr::Alias(_, name) => Ok(name.clone().into_py(py)), + Expr::Literal(val) => match val { + ScalarValue::Boolean(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::Float32(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::Float64(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::Int8(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::Int16(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::Int32(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::Int64(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::UInt8(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::UInt16(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::UInt32(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::UInt64(val) => Ok(val.unwrap().into_py(py)), + ScalarValue::Utf8(val) => Ok(val.clone().unwrap().into_py(py)), + ScalarValue::LargeUtf8(val) => Ok(val.clone().unwrap().into_py(py)), + _ => Err(DaskPlannerError::InvalidIOFilter(format!( + "Unsupported ScalarValue `{}` encountered. using in Dask instead", + filter + ))), + }, + _ => Ok(f.canonical_name().into_py(py)), + }) + .collect(); + + filter_tuple.push(( + PyExpr::from(filter.clone()), + ( + ident.unwrap_or(expr.canonical_name()), + op.to_string(), + Some(il?), + ), + )); + Ok(filter_tuple) + } else { + let er = DaskPlannerError::InvalidIOFilter(format!( + "Invalid identifying column Expr instance `{}`. using in Dask instead", + filter + )); + Err::, DaskPlannerError>(er) + } + } + Expr::IsNotNull(expr) => { + // Only handle simple Expr(s) for IsNotNull operations for now + let ident = match *expr.clone() { + Expr::Column(col) => Ok(col.name), + _ => Err(DaskPlannerError::InvalidIOFilter(format!( + "Invalid IsNotNull Expr type `{}`. using in Dask instead", + filter + ))), + }; + + filter_tuple.push(( + PyExpr::from(filter.clone()), + ( + ident.unwrap_or(expr.canonical_name()), + "is not".to_string(), + None, + ), + )); + Ok(filter_tuple) + } + _ => { + let er = DaskPlannerError::InvalidIOFilter(format!( + "Unable to apply filter: `{}` to IO reader, using in Dask instead", + filter + )); + Err::, DaskPlannerError>(er) + } + } +} + +/// Consume the `TableScan` filters (Expr(s)) and convert them into a PyArrow understandable +/// DNF format that can be directly passed to PyArrow IO readers for Predicate Pushdown. Expr(s) +/// that cannot be converted to correlating PyArrow IO calls will be returned as is and can be +/// used in the Python logic to form Dask tasks for the graph to do computational filtering. +pub fn _expand_dnf_filters(filters: &[Expr], py: Python) -> PyFilteredResult { + let mut filtered_exprs: Vec<(PyExpr, FilterTuple)> = Vec::new(); + let mut unfiltered_exprs: Vec = Vec::new(); + + filters + .iter() + .for_each(|f| match self::_expand_dnf_filter(f, py) { + Ok(mut expanded_dnf_filter) => filtered_exprs.append(&mut expanded_dnf_filter), + Err(_e) => unfiltered_exprs.push(PyExpr::from(f.clone())), + }); + + PyFilteredResult { + io_unfilterable_exprs: unfiltered_exprs, + filtered_exprs, + } +} diff --git a/dask_sql/context.py b/dask_sql/context.py index 261c62d74..94e0fccd9 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -862,9 +862,6 @@ def _compute_table_from_rel( if dc is None: return - # Optimization might remove some alias projects. Make sure to keep them here. - select_names = [field for field in rel.getRowType().getFieldList()] - if select_names: cc = dc.column_container diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index 3ec8c4873..cf0f24644 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -5,7 +5,7 @@ from dask.utils_test import hlg_layer -from dask_planner.rust import plan_to_table, row_type +from dask_planner.rust import get_table_scan_dnf_filters, plan_to_table, row_type from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rel.logical.filter import filter_or_scalar @@ -84,8 +84,9 @@ def _apply_filters(self, table_scan, rel, dc, context): # All partial filters here are applied in conjunction (&) all_filters = table_scan.filters() - conjunctive_dnf_filters = table_scan.getDNFFilters().filtered_exprs - non_dnf_filters = table_scan.getDNFFilters().io_unfilterable_exprs + all_dnf_filters = get_table_scan_dnf_filters(table_scan) + conjunctive_dnf_filters = all_dnf_filters.filtered_exprs + non_dnf_filters = all_dnf_filters.io_unfilterable_exprs if conjunctive_dnf_filters: # Extract the PyExprs from the conjunctive DNF filters From 379a97877744488ec3c4f91cdcfc05ea9b0c338c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Thu, 13 Jul 2023 13:47:19 -0400 Subject: [PATCH 24/44] add .cargo/config.toml in hopes of fixing linker build issues on osx --- .cargo/config.toml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 000000000..d47f983e4 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,11 @@ +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] From e030befbedaacddd030a560ff295094e601a0a52 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 13 Jul 2023 17:28:58 -0400 Subject: [PATCH 25/44] Remove extra config.toml --- .cargo/config.toml | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index d47f983e4..000000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,11 +0,0 @@ -[target.x86_64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - -[target.aarch64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] From b2e85dfc5ac3f1a42c30adab2583fee89aec3c86 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jul 2023 11:57:01 -0400 Subject: [PATCH 26/44] Try overriding runner-installed toolchain --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index be2d98126..c37e14244 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,6 +55,11 @@ jobs: distributed: true steps: - uses: actions/checkout@v3 + - name: Set up Rust toolchain + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true - name: Set up Python uses: conda-incubator/setup-miniconda@v2.2.0 with: From d01088d6383b0fdd2f787a07e0d2c4a9353833ea Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jul 2023 12:32:29 -0400 Subject: [PATCH 27/44] Revert "Try overriding runner-installed toolchain" This reverts commit b2e85dfc5ac3f1a42c30adab2583fee89aec3c86. --- .github/workflows/test.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c37e14244..be2d98126 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,11 +55,6 @@ jobs: distributed: true steps: - uses: actions/checkout@v3 - - name: Set up Rust toolchain - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - name: Set up Python uses: conda-incubator/setup-miniconda@v2.2.0 with: From ca70f0f76ca0ab34591c3679385acf52d6cd6136 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 09:51:03 -0400 Subject: [PATCH 28/44] Initial migration to maturin build system --- {dask_planner/.cargo => .cargo}/config.toml | 0 .gitignore | 10 +-- .pre-commit-config.yaml | 6 +- dask_planner/Cargo.lock => Cargo.lock | 2 +- dask_planner/Cargo.toml => Cargo.toml | 14 ++-- .../scripts}/update-dependencies.sh | 0 dask_planner/.classpath | 55 -------------- dask_planner/.gitignore | 72 ------------------- .../org.eclipse.core.resources.prefs | 5 -- .../.settings/org.eclipse.jdt.apt.core.prefs | 2 - .../.settings/org.eclipse.jdt.core.prefs | 9 --- .../.settings/org.eclipse.m2e.core.prefs | 4 -- dask_planner/MANIFEST.in | 2 - dask_planner/README.md | 0 dask_planner/pyproject.toml | 11 --- dask_sql/context.py | 3 +- dask_sql/input_utils/hive.py | 3 +- dask_sql/mappings.py | 3 +- dask_sql/physical/rel/base.py | 3 +- dask_sql/physical/rel/convert.py | 3 +- dask_sql/physical/rel/custom/alter.py | 3 +- dask_sql/physical/rel/custom/analyze_table.py | 3 +- .../rel/custom/create_catalog_schema.py | 3 +- dask_sql/physical/rel/custom/create_table.py | 3 +- .../physical/rel/custom/describe_model.py | 3 +- dask_sql/physical/rel/custom/distributeby.py | 3 +- dask_sql/physical/rel/custom/drop_schema.py | 3 +- dask_sql/physical/rel/custom/export_model.py | 3 +- dask_sql/physical/rel/custom/predict_model.py | 3 +- dask_sql/physical/rel/custom/show_models.py | 3 +- dask_sql/physical/rel/custom/use_schema.py | 3 +- dask_sql/physical/rel/logical/aggregate.py | 3 +- dask_sql/physical/rel/logical/cross_join.py | 3 +- dask_sql/physical/rel/logical/empty.py | 3 +- dask_sql/physical/rel/logical/explain.py | 3 +- dask_sql/physical/rel/logical/filter.py | 3 +- dask_sql/physical/rel/logical/join.py | 3 +- dask_sql/physical/rel/logical/limit.py | 3 +- dask_sql/physical/rel/logical/project.py | 6 +- dask_sql/physical/rel/logical/sort.py | 3 +- .../physical/rel/logical/subquery_alias.py | 3 +- dask_sql/physical/rel/logical/table_scan.py | 3 +- dask_sql/physical/rel/logical/union.py | 3 +- dask_sql/physical/rel/logical/window.py | 3 +- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 3 +- dask_sql/physical/rex/core/alias.py | 3 +- dask_sql/physical/rex/core/call.py | 5 +- dask_sql/physical/rex/core/input_ref.py | 3 +- dask_sql/physical/rex/core/literal.py | 5 +- dask_sql/physical/rex/core/subquery.py | 3 +- dask_sql/utils.py | 2 +- pyproject.toml | 11 +-- setup.py | 9 --- {dask_planner/src => src}/dialect.rs | 0 {dask_planner/src => src}/error.rs | 0 {dask_planner/src => src}/expression.rs | 0 {dask_planner/src => src}/lib.rs | 2 +- {dask_planner/src => src}/parser.rs | 0 {dask_planner/src => src}/sql.rs | 0 {dask_planner/src => src}/sql/column.rs | 0 {dask_planner/src => src}/sql/exceptions.rs | 0 {dask_planner/src => src}/sql/function.rs | 0 {dask_planner/src => src}/sql/logical.rs | 0 .../src => src}/sql/logical/aggregate.rs | 0 .../src => src}/sql/logical/alter_schema.rs | 0 .../src => src}/sql/logical/alter_table.rs | 0 .../src => src}/sql/logical/analyze_table.rs | 0 .../sql/logical/create_catalog_schema.rs | 0 .../sql/logical/create_experiment.rs | 0 .../sql/logical/create_memory_table.rs | 0 .../src => src}/sql/logical/create_model.rs | 0 .../src => src}/sql/logical/create_table.rs | 0 .../src => src}/sql/logical/describe_model.rs | 0 .../src => src}/sql/logical/drop_model.rs | 0 .../src => src}/sql/logical/drop_schema.rs | 0 .../src => src}/sql/logical/drop_table.rs | 0 .../src => src}/sql/logical/empty_relation.rs | 0 .../src => src}/sql/logical/explain.rs | 0 .../src => src}/sql/logical/export_model.rs | 0 .../src => src}/sql/logical/filter.rs | 0 {dask_planner/src => src}/sql/logical/join.rs | 0 .../src => src}/sql/logical/limit.rs | 0 .../src => src}/sql/logical/predict_model.rs | 0 .../src => src}/sql/logical/projection.rs | 0 .../src => src}/sql/logical/repartition_by.rs | 0 .../src => src}/sql/logical/show_columns.rs | 0 .../src => src}/sql/logical/show_models.rs | 0 .../src => src}/sql/logical/show_schemas.rs | 0 .../src => src}/sql/logical/show_tables.rs | 0 {dask_planner/src => src}/sql/logical/sort.rs | 0 .../src => src}/sql/logical/subquery_alias.rs | 0 .../src => src}/sql/logical/table_scan.rs | 0 .../src => src}/sql/logical/use_schema.rs | 0 .../src => src}/sql/logical/window.rs | 0 {dask_planner/src => src}/sql/optimizer.rs | 0 .../optimizer/dynamic_partition_pruning.rs | 0 .../src => src}/sql/optimizer/join_reorder.rs | 0 {dask_planner/src => src}/sql/parser_utils.rs | 0 {dask_planner/src => src}/sql/schema.rs | 0 {dask_planner/src => src}/sql/statement.rs | 0 {dask_planner/src => src}/sql/table.rs | 0 {dask_planner/src => src}/sql/types.rs | 0 .../src => src}/sql/types/rel_data_type.rs | 0 .../sql/types/rel_data_type_field.rs | 0 tests/unit/test_mapping.py | 2 +- 106 files changed, 95 insertions(+), 237 deletions(-) rename {dask_planner/.cargo => .cargo}/config.toml (100%) rename dask_planner/Cargo.lock => Cargo.lock (99%) rename dask_planner/Cargo.toml => Cargo.toml (83%) rename {dask_planner => continuous_integration/scripts}/update-dependencies.sh (100%) delete mode 100644 dask_planner/.classpath delete mode 100644 dask_planner/.gitignore delete mode 100644 dask_planner/.settings/org.eclipse.core.resources.prefs delete mode 100644 dask_planner/.settings/org.eclipse.jdt.apt.core.prefs delete mode 100644 dask_planner/.settings/org.eclipse.jdt.core.prefs delete mode 100644 dask_planner/.settings/org.eclipse.m2e.core.prefs delete mode 100644 dask_planner/MANIFEST.in delete mode 100644 dask_planner/README.md delete mode 100644 dask_planner/pyproject.toml rename {dask_planner/src => src}/dialect.rs (100%) rename {dask_planner/src => src}/error.rs (100%) rename {dask_planner/src => src}/expression.rs (100%) rename {dask_planner/src => src}/lib.rs (97%) rename {dask_planner/src => src}/parser.rs (100%) rename {dask_planner/src => src}/sql.rs (100%) rename {dask_planner/src => src}/sql/column.rs (100%) rename {dask_planner/src => src}/sql/exceptions.rs (100%) rename {dask_planner/src => src}/sql/function.rs (100%) rename {dask_planner/src => src}/sql/logical.rs (100%) rename {dask_planner/src => src}/sql/logical/aggregate.rs (100%) rename {dask_planner/src => src}/sql/logical/alter_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/alter_table.rs (100%) rename {dask_planner/src => src}/sql/logical/analyze_table.rs (100%) rename {dask_planner/src => src}/sql/logical/create_catalog_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/create_experiment.rs (100%) rename {dask_planner/src => src}/sql/logical/create_memory_table.rs (100%) rename {dask_planner/src => src}/sql/logical/create_model.rs (100%) rename {dask_planner/src => src}/sql/logical/create_table.rs (100%) rename {dask_planner/src => src}/sql/logical/describe_model.rs (100%) rename {dask_planner/src => src}/sql/logical/drop_model.rs (100%) rename {dask_planner/src => src}/sql/logical/drop_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/drop_table.rs (100%) rename {dask_planner/src => src}/sql/logical/empty_relation.rs (100%) rename {dask_planner/src => src}/sql/logical/explain.rs (100%) rename {dask_planner/src => src}/sql/logical/export_model.rs (100%) rename {dask_planner/src => src}/sql/logical/filter.rs (100%) rename {dask_planner/src => src}/sql/logical/join.rs (100%) rename {dask_planner/src => src}/sql/logical/limit.rs (100%) rename {dask_planner/src => src}/sql/logical/predict_model.rs (100%) rename {dask_planner/src => src}/sql/logical/projection.rs (100%) rename {dask_planner/src => src}/sql/logical/repartition_by.rs (100%) rename {dask_planner/src => src}/sql/logical/show_columns.rs (100%) rename {dask_planner/src => src}/sql/logical/show_models.rs (100%) rename {dask_planner/src => src}/sql/logical/show_schemas.rs (100%) rename {dask_planner/src => src}/sql/logical/show_tables.rs (100%) rename {dask_planner/src => src}/sql/logical/sort.rs (100%) rename {dask_planner/src => src}/sql/logical/subquery_alias.rs (100%) rename {dask_planner/src => src}/sql/logical/table_scan.rs (100%) rename {dask_planner/src => src}/sql/logical/use_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/window.rs (100%) rename {dask_planner/src => src}/sql/optimizer.rs (100%) rename {dask_planner/src => src}/sql/optimizer/dynamic_partition_pruning.rs (100%) rename {dask_planner/src => src}/sql/optimizer/join_reorder.rs (100%) rename {dask_planner/src => src}/sql/parser_utils.rs (100%) rename {dask_planner/src => src}/sql/schema.rs (100%) rename {dask_planner/src => src}/sql/statement.rs (100%) rename {dask_planner/src => src}/sql/table.rs (100%) rename {dask_planner/src => src}/sql/types.rs (100%) rename {dask_planner/src => src}/sql/types/rel_data_type.rs (100%) rename {dask_planner/src => src}/sql/types/rel_data_type_field.rs (100%) diff --git a/dask_planner/.cargo/config.toml b/.cargo/config.toml similarity index 100% rename from dask_planner/.cargo/config.toml rename to .cargo/config.toml diff --git a/.gitignore b/.gitignore index 245817fc1..d41df8a68 100644 --- a/.gitignore +++ b/.gitignore @@ -46,23 +46,15 @@ venv # IDE .idea .vscode -planner/.classpath -planner/.project -planner/.settings/ -planner/.idea -planner/*.iml *.swp # project specific -planner/dependency-reduced-pom.xml -planner/target/ -dask_sql/jar -.next/ dask-worker-space/ node_modules/ docs/source/_build/ tests/unit/queries tests/unit/data +target/* # Ignore development specific local testing files dev_tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ed701014a..094c4ada1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,9 +20,9 @@ repos: rev: v1.0 hooks: - id: cargo-check - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--'] - id: clippy - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--', '-D', 'warnings'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--', '-D', 'warnings'] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.2.0 hooks: @@ -39,4 +39,4 @@ repos: entry: cargo +nightly fmt language: system types: [rust] - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--'] diff --git a/dask_planner/Cargo.lock b/Cargo.lock similarity index 99% rename from dask_planner/Cargo.lock rename to Cargo.lock index 1a75a215d..accc91264 100644 --- a/dask_planner/Cargo.lock +++ b/Cargo.lock @@ -658,7 +658,7 @@ dependencies = [ ] [[package]] -name = "dask_planner" +name = "dask-planner" version = "0.1.0" dependencies = [ "async-trait", diff --git a/dask_planner/Cargo.toml b/Cargo.toml similarity index 83% rename from dask_planner/Cargo.toml rename to Cargo.toml index 8a849628b..eefc51a32 100644 --- a/dask_planner/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dask_planner" +name = "dask-planner" repository = "https://github.com/dask-contrib/dask-sql" version = "0.1.0" description = "Bindings for DataFusion used by Dask-SQL" @@ -8,16 +8,20 @@ license = "Apache-2.0" edition = "2021" rust-version = "1.65" +[lib] +name = "dask_planner" +crate-type = ["cdylib"] + [dependencies] async-trait = "0.1.71" datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" -pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" +[dependencies.pyo3] +version = "0.19.0" +features = ["abi3-py38"] + [build-dependencies] pyo3-build-config = "0.19.1" - -[lib] -crate-type = ["cdylib"] diff --git a/dask_planner/update-dependencies.sh b/continuous_integration/scripts/update-dependencies.sh similarity index 100% rename from dask_planner/update-dependencies.sh rename to continuous_integration/scripts/update-dependencies.sh diff --git a/dask_planner/.classpath b/dask_planner/.classpath deleted file mode 100644 index b14b13a76..000000000 --- a/dask_planner/.classpath +++ /dev/null @@ -1,55 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dask_planner/.gitignore b/dask_planner/.gitignore deleted file mode 100644 index c8f044299..000000000 --- a/dask_planner/.gitignore +++ /dev/null @@ -1,72 +0,0 @@ -/target - -# Byte-compiled / optimized / DLL files -__pycache__/ -.pytest_cache/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -.Python -.venv/ -env/ -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -include/ -man/ -venv/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt -pip-selfcheck.json - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.cache -nosetests.xml -coverage.xml - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject - -# Rope -.ropeproject - -# Django stuff: -*.log -*.pot - -.DS_Store - -# Sphinx documentation -docs/_build/ - -# PyCharm -.idea/ - -# VSCode -.vscode/ - -# Pyenv -.python-version diff --git a/dask_planner/.settings/org.eclipse.core.resources.prefs b/dask_planner/.settings/org.eclipse.core.resources.prefs deleted file mode 100644 index 92920805e..000000000 --- a/dask_planner/.settings/org.eclipse.core.resources.prefs +++ /dev/null @@ -1,5 +0,0 @@ -eclipse.preferences.version=1 -encoding//src/main/java=UTF-8 -encoding//src/main/resources=UTF-8 -encoding//target/generated-sources/annotations=UTF-8 -encoding/=UTF-8 diff --git a/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs b/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs deleted file mode 100644 index d4313d4b2..000000000 --- a/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs +++ /dev/null @@ -1,2 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.apt.aptEnabled=false diff --git a/dask_planner/.settings/org.eclipse.jdt.core.prefs b/dask_planner/.settings/org.eclipse.jdt.core.prefs deleted file mode 100644 index 1b6e1ef22..000000000 --- a/dask_planner/.settings/org.eclipse.jdt.core.prefs +++ /dev/null @@ -1,9 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 -org.eclipse.jdt.core.compiler.compliance=1.8 -org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled -org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning -org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore -org.eclipse.jdt.core.compiler.processAnnotations=disabled -org.eclipse.jdt.core.compiler.release=disabled -org.eclipse.jdt.core.compiler.source=1.8 diff --git a/dask_planner/.settings/org.eclipse.m2e.core.prefs b/dask_planner/.settings/org.eclipse.m2e.core.prefs deleted file mode 100644 index f897a7f1c..000000000 --- a/dask_planner/.settings/org.eclipse.m2e.core.prefs +++ /dev/null @@ -1,4 +0,0 @@ -activeProfiles= -eclipse.preferences.version=1 -resolveWorkspaceProjects=true -version=1 diff --git a/dask_planner/MANIFEST.in b/dask_planner/MANIFEST.in deleted file mode 100644 index 7c68298bd..000000000 --- a/dask_planner/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include Cargo.toml -recursive-include src * diff --git a/dask_planner/README.md b/dask_planner/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/dask_planner/pyproject.toml b/dask_planner/pyproject.toml deleted file mode 100644 index f153e3f5a..000000000 --- a/dask_planner/pyproject.toml +++ /dev/null @@ -1,11 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel", "setuptools-rust"] - -[project] -name = "datafusion_planner" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] diff --git a/dask_sql/context.py b/dask_sql/context.py index 81e0a38a1..03947fa76 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,8 +9,7 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer - -from dask_planner.rust import ( +from dask_planner import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 4d0eb9cce..5d500180d 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,8 +5,7 @@ from typing import Any, Union import dask.dataframe as dd - -from dask_planner.rust import SqlTypeName +from dask_planner import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 9ba22f797..3d39ee392 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,8 +7,7 @@ import dask.dataframe as dd import numpy as np import pandas as pd - -from dask_planner.rust import DaskTypeMap, SqlTypeName +from dask_planner import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index a1f378197..f4463fe62 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,8 +7,9 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: + from dask_planner import LogicalPlan, RelDataType + import dask_sql - from dask_planner.rust import LogicalPlan, RelDataType logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 29ad8c327..24b06c337 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,8 +7,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index 9c8a159b0..16ed9e9bb 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,8 +6,9 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class AlterSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index 69f734a54..77edfff4b 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,8 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class AnalyzeTablePlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 52ed37b55..74f964621 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 36b165230..526ec9728 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,8 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index d915a6b0b..8b2e144ff 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,8 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class DescribeModelPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index c7ce70610..6b6dba0b8 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,8 +6,9 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 444662e2b..455b27fa4 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index 07cf9979e..c96d19786 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,8 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index 917d712c3..c0339b1d7 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,8 +9,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index 3f879dd38..ecc81e82a 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,8 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class ShowModelsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 889dd2b1c..563415c2d 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class UseSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index a14900f99..27f5c102c 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,8 +15,9 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index 5f32d3257..dfa8cdf3c 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,8 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index 23f8d1cd3..b50699b79 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,8 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index 69d20fca3..abf1d814c 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,8 +3,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class ExplainPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index d3c3f5fd3..a37e390ec 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,8 +11,9 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index c1c904af6..cec7df4d9 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,8 +17,9 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 3e2fc6434..00ba37fa2 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,8 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class DaskLimitPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index b990e21b4..4630b5d6b 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,15 +1,17 @@ import logging from typing import TYPE_CHECKING -from dask_planner.rust import RexType +from dask_planner import RexType + from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 2e1376d41..6dc57211c 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,8 +5,9 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class DaskSortPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index 2473167d7..e82d9b105 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class SubqueryAlias(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index b4025ec97..b3b5cab0a 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,8 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 830f7f981..1fbc5b5ae 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,8 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan def _extract_df(obj_cc, obj_df, output_field_names): diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 331876c49..bbcdae740 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,8 +16,9 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 5724a4536..7f97a70d9 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from dask_planner.rust import Expression, LogicalPlan + from dask_planner import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index 71431cbb4..fce64be30 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,8 +8,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index 40c373766..d6ae20698 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,8 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan class RexAliasPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 5ef1d7fb8..56d01d006 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,8 +13,8 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( @@ -34,8 +34,9 @@ ) if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 4272c832e..01bf871c7 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,8 +6,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan class RexInputRefPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 73e3b8185..7fe59b383 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,15 +4,16 @@ import dask.dataframe as dd import numpy as np +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 5e0a33098..1253f257d 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,8 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan class RexScalarSubqueryPlugin(BaseRexPlugin): diff --git a/dask_sql/utils.py b/dask_sql/utils.py index 39c165597..c2cfe45ab 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,8 +8,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value diff --git a/pyproject.toml b/pyproject.toml index dfed2ba50..17392d3b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,14 @@ [build-system] -requires = ["setuptools", "wheel", "setuptools-rust"] - -[tool.isort] -profile = "black" +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" [tool.maturin] +features = ["pyo3/extension-module"] include = [ { path = "Cargo.lock", format = "sdist" } ] exclude = [".github/**", "ci/**", ".asf.yaml"] -# Require Cargo.lock is up to date locked = true + +[tool.isort] +profile = "black" diff --git a/setup.py b/setup.py index d149ac5f0..02693d0d6 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ import sys from setuptools import find_packages, setup -from setuptools_rust import Binding, RustExtension import versioneer @@ -31,14 +30,6 @@ include=["dask_sql", "dask_sql.*", "dask_planner", "dask_planner.*"] ), package_data={"dask_sql": ["sql*.yaml"]}, - rust_extensions=[ - RustExtension( - "dask_planner.rust", - binding=Binding.PyO3, - path="dask_planner/Cargo.toml", - debug=debug_build, - ) - ], python_requires=">=3.8", setup_requires=sphinx_requirements, install_requires=[ diff --git a/dask_planner/src/dialect.rs b/src/dialect.rs similarity index 100% rename from dask_planner/src/dialect.rs rename to src/dialect.rs diff --git a/dask_planner/src/error.rs b/src/error.rs similarity index 100% rename from dask_planner/src/error.rs rename to src/error.rs diff --git a/dask_planner/src/expression.rs b/src/expression.rs similarity index 100% rename from dask_planner/src/expression.rs rename to src/expression.rs diff --git a/dask_planner/src/lib.rs b/src/lib.rs similarity index 97% rename from dask_planner/src/lib.rs rename to src/lib.rs index f5305d900..9f446b7e2 100644 --- a/dask_planner/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -#[pyo3(name = "rust")] +#[pyo3(name = "dask_planner")] fn rust(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); diff --git a/dask_planner/src/parser.rs b/src/parser.rs similarity index 100% rename from dask_planner/src/parser.rs rename to src/parser.rs diff --git a/dask_planner/src/sql.rs b/src/sql.rs similarity index 100% rename from dask_planner/src/sql.rs rename to src/sql.rs diff --git a/dask_planner/src/sql/column.rs b/src/sql/column.rs similarity index 100% rename from dask_planner/src/sql/column.rs rename to src/sql/column.rs diff --git a/dask_planner/src/sql/exceptions.rs b/src/sql/exceptions.rs similarity index 100% rename from dask_planner/src/sql/exceptions.rs rename to src/sql/exceptions.rs diff --git a/dask_planner/src/sql/function.rs b/src/sql/function.rs similarity index 100% rename from dask_planner/src/sql/function.rs rename to src/sql/function.rs diff --git a/dask_planner/src/sql/logical.rs b/src/sql/logical.rs similarity index 100% rename from dask_planner/src/sql/logical.rs rename to src/sql/logical.rs diff --git a/dask_planner/src/sql/logical/aggregate.rs b/src/sql/logical/aggregate.rs similarity index 100% rename from dask_planner/src/sql/logical/aggregate.rs rename to src/sql/logical/aggregate.rs diff --git a/dask_planner/src/sql/logical/alter_schema.rs b/src/sql/logical/alter_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/alter_schema.rs rename to src/sql/logical/alter_schema.rs diff --git a/dask_planner/src/sql/logical/alter_table.rs b/src/sql/logical/alter_table.rs similarity index 100% rename from dask_planner/src/sql/logical/alter_table.rs rename to src/sql/logical/alter_table.rs diff --git a/dask_planner/src/sql/logical/analyze_table.rs b/src/sql/logical/analyze_table.rs similarity index 100% rename from dask_planner/src/sql/logical/analyze_table.rs rename to src/sql/logical/analyze_table.rs diff --git a/dask_planner/src/sql/logical/create_catalog_schema.rs b/src/sql/logical/create_catalog_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/create_catalog_schema.rs rename to src/sql/logical/create_catalog_schema.rs diff --git a/dask_planner/src/sql/logical/create_experiment.rs b/src/sql/logical/create_experiment.rs similarity index 100% rename from dask_planner/src/sql/logical/create_experiment.rs rename to src/sql/logical/create_experiment.rs diff --git a/dask_planner/src/sql/logical/create_memory_table.rs b/src/sql/logical/create_memory_table.rs similarity index 100% rename from dask_planner/src/sql/logical/create_memory_table.rs rename to src/sql/logical/create_memory_table.rs diff --git a/dask_planner/src/sql/logical/create_model.rs b/src/sql/logical/create_model.rs similarity index 100% rename from dask_planner/src/sql/logical/create_model.rs rename to src/sql/logical/create_model.rs diff --git a/dask_planner/src/sql/logical/create_table.rs b/src/sql/logical/create_table.rs similarity index 100% rename from dask_planner/src/sql/logical/create_table.rs rename to src/sql/logical/create_table.rs diff --git a/dask_planner/src/sql/logical/describe_model.rs b/src/sql/logical/describe_model.rs similarity index 100% rename from dask_planner/src/sql/logical/describe_model.rs rename to src/sql/logical/describe_model.rs diff --git a/dask_planner/src/sql/logical/drop_model.rs b/src/sql/logical/drop_model.rs similarity index 100% rename from dask_planner/src/sql/logical/drop_model.rs rename to src/sql/logical/drop_model.rs diff --git a/dask_planner/src/sql/logical/drop_schema.rs b/src/sql/logical/drop_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/drop_schema.rs rename to src/sql/logical/drop_schema.rs diff --git a/dask_planner/src/sql/logical/drop_table.rs b/src/sql/logical/drop_table.rs similarity index 100% rename from dask_planner/src/sql/logical/drop_table.rs rename to src/sql/logical/drop_table.rs diff --git a/dask_planner/src/sql/logical/empty_relation.rs b/src/sql/logical/empty_relation.rs similarity index 100% rename from dask_planner/src/sql/logical/empty_relation.rs rename to src/sql/logical/empty_relation.rs diff --git a/dask_planner/src/sql/logical/explain.rs b/src/sql/logical/explain.rs similarity index 100% rename from dask_planner/src/sql/logical/explain.rs rename to src/sql/logical/explain.rs diff --git a/dask_planner/src/sql/logical/export_model.rs b/src/sql/logical/export_model.rs similarity index 100% rename from dask_planner/src/sql/logical/export_model.rs rename to src/sql/logical/export_model.rs diff --git a/dask_planner/src/sql/logical/filter.rs b/src/sql/logical/filter.rs similarity index 100% rename from dask_planner/src/sql/logical/filter.rs rename to src/sql/logical/filter.rs diff --git a/dask_planner/src/sql/logical/join.rs b/src/sql/logical/join.rs similarity index 100% rename from dask_planner/src/sql/logical/join.rs rename to src/sql/logical/join.rs diff --git a/dask_planner/src/sql/logical/limit.rs b/src/sql/logical/limit.rs similarity index 100% rename from dask_planner/src/sql/logical/limit.rs rename to src/sql/logical/limit.rs diff --git a/dask_planner/src/sql/logical/predict_model.rs b/src/sql/logical/predict_model.rs similarity index 100% rename from dask_planner/src/sql/logical/predict_model.rs rename to src/sql/logical/predict_model.rs diff --git a/dask_planner/src/sql/logical/projection.rs b/src/sql/logical/projection.rs similarity index 100% rename from dask_planner/src/sql/logical/projection.rs rename to src/sql/logical/projection.rs diff --git a/dask_planner/src/sql/logical/repartition_by.rs b/src/sql/logical/repartition_by.rs similarity index 100% rename from dask_planner/src/sql/logical/repartition_by.rs rename to src/sql/logical/repartition_by.rs diff --git a/dask_planner/src/sql/logical/show_columns.rs b/src/sql/logical/show_columns.rs similarity index 100% rename from dask_planner/src/sql/logical/show_columns.rs rename to src/sql/logical/show_columns.rs diff --git a/dask_planner/src/sql/logical/show_models.rs b/src/sql/logical/show_models.rs similarity index 100% rename from dask_planner/src/sql/logical/show_models.rs rename to src/sql/logical/show_models.rs diff --git a/dask_planner/src/sql/logical/show_schemas.rs b/src/sql/logical/show_schemas.rs similarity index 100% rename from dask_planner/src/sql/logical/show_schemas.rs rename to src/sql/logical/show_schemas.rs diff --git a/dask_planner/src/sql/logical/show_tables.rs b/src/sql/logical/show_tables.rs similarity index 100% rename from dask_planner/src/sql/logical/show_tables.rs rename to src/sql/logical/show_tables.rs diff --git a/dask_planner/src/sql/logical/sort.rs b/src/sql/logical/sort.rs similarity index 100% rename from dask_planner/src/sql/logical/sort.rs rename to src/sql/logical/sort.rs diff --git a/dask_planner/src/sql/logical/subquery_alias.rs b/src/sql/logical/subquery_alias.rs similarity index 100% rename from dask_planner/src/sql/logical/subquery_alias.rs rename to src/sql/logical/subquery_alias.rs diff --git a/dask_planner/src/sql/logical/table_scan.rs b/src/sql/logical/table_scan.rs similarity index 100% rename from dask_planner/src/sql/logical/table_scan.rs rename to src/sql/logical/table_scan.rs diff --git a/dask_planner/src/sql/logical/use_schema.rs b/src/sql/logical/use_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/use_schema.rs rename to src/sql/logical/use_schema.rs diff --git a/dask_planner/src/sql/logical/window.rs b/src/sql/logical/window.rs similarity index 100% rename from dask_planner/src/sql/logical/window.rs rename to src/sql/logical/window.rs diff --git a/dask_planner/src/sql/optimizer.rs b/src/sql/optimizer.rs similarity index 100% rename from dask_planner/src/sql/optimizer.rs rename to src/sql/optimizer.rs diff --git a/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs b/src/sql/optimizer/dynamic_partition_pruning.rs similarity index 100% rename from dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs rename to src/sql/optimizer/dynamic_partition_pruning.rs diff --git a/dask_planner/src/sql/optimizer/join_reorder.rs b/src/sql/optimizer/join_reorder.rs similarity index 100% rename from dask_planner/src/sql/optimizer/join_reorder.rs rename to src/sql/optimizer/join_reorder.rs diff --git a/dask_planner/src/sql/parser_utils.rs b/src/sql/parser_utils.rs similarity index 100% rename from dask_planner/src/sql/parser_utils.rs rename to src/sql/parser_utils.rs diff --git a/dask_planner/src/sql/schema.rs b/src/sql/schema.rs similarity index 100% rename from dask_planner/src/sql/schema.rs rename to src/sql/schema.rs diff --git a/dask_planner/src/sql/statement.rs b/src/sql/statement.rs similarity index 100% rename from dask_planner/src/sql/statement.rs rename to src/sql/statement.rs diff --git a/dask_planner/src/sql/table.rs b/src/sql/table.rs similarity index 100% rename from dask_planner/src/sql/table.rs rename to src/sql/table.rs diff --git a/dask_planner/src/sql/types.rs b/src/sql/types.rs similarity index 100% rename from dask_planner/src/sql/types.rs rename to src/sql/types.rs diff --git a/dask_planner/src/sql/types/rel_data_type.rs b/src/sql/types/rel_data_type.rs similarity index 100% rename from dask_planner/src/sql/types/rel_data_type.rs rename to src/sql/types/rel_data_type.rs diff --git a/dask_planner/src/sql/types/rel_data_type_field.rs b/src/sql/types/rel_data_type_field.rs similarity index 100% rename from dask_planner/src/sql/types/rel_data_type_field.rs rename to src/sql/types/rel_data_type_field.rs diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index b49ed1aae..952bcb10e 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd import pytest +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value From d900f0ed4a54fe2091b93fc905ae3c6e29dac8f4 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 12:27:29 -0400 Subject: [PATCH 29/44] Make some modifications to Rust package name --- Cargo.lock | 4 +- Cargo.toml | 13 ++-- dask_sql/context.py | 3 +- dask_sql/input_utils/hive.py | 3 +- dask_sql/mappings.py | 3 +- dask_sql/physical/rel/base.py | 4 +- dask_sql/physical/rel/convert.py | 4 +- dask_sql/physical/rel/custom/alter.py | 4 +- dask_sql/physical/rel/custom/analyze_table.py | 4 +- .../rel/custom/create_catalog_schema.py | 4 +- .../rel/custom/create_memory_table.py | 3 +- dask_sql/physical/rel/custom/create_table.py | 4 +- .../physical/rel/custom/describe_model.py | 4 +- dask_sql/physical/rel/custom/distributeby.py | 4 +- dask_sql/physical/rel/custom/drop_schema.py | 4 +- dask_sql/physical/rel/custom/export_model.py | 4 +- dask_sql/physical/rel/custom/predict_model.py | 4 +- dask_sql/physical/rel/custom/show_columns.py | 3 +- dask_sql/physical/rel/custom/show_models.py | 4 +- dask_sql/physical/rel/custom/show_schemas.py | 3 +- dask_sql/physical/rel/custom/show_tables.py | 3 +- dask_sql/physical/rel/custom/use_schema.py | 4 +- dask_sql/physical/rel/logical/aggregate.py | 4 +- dask_sql/physical/rel/logical/cross_join.py | 4 +- dask_sql/physical/rel/logical/empty.py | 4 +- dask_sql/physical/rel/logical/explain.py | 4 +- dask_sql/physical/rel/logical/filter.py | 4 +- dask_sql/physical/rel/logical/join.py | 4 +- dask_sql/physical/rel/logical/limit.py | 4 +- dask_sql/physical/rel/logical/project.py | 8 +- dask_sql/physical/rel/logical/sort.py | 4 +- .../physical/rel/logical/subquery_alias.py | 4 +- dask_sql/physical/rel/logical/table_scan.py | 4 +- dask_sql/physical/rel/logical/union.py | 4 +- dask_sql/physical/rel/logical/window.py | 4 +- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 4 +- dask_sql/physical/rex/core/alias.py | 4 +- dask_sql/physical/rex/core/call.py | 7 +- dask_sql/physical/rex/core/input_ref.py | 4 +- dask_sql/physical/rex/core/literal.py | 7 +- dask_sql/physical/rex/core/subquery.py | 4 +- dask_sql/utils.py | 3 +- pyproject.toml | 74 ++++++++++++++++++- setup.py | 74 +------------------ src/lib.rs | 3 +- src/sql.rs | 2 +- src/sql/column.rs | 2 +- src/sql/function.rs | 2 +- src/sql/logical.rs | 2 +- src/sql/logical/aggregate.rs | 2 +- src/sql/logical/alter_schema.rs | 2 +- src/sql/logical/alter_table.rs | 2 +- src/sql/logical/analyze_table.rs | 2 +- src/sql/logical/create_catalog_schema.rs | 2 +- src/sql/logical/create_experiment.rs | 2 +- src/sql/logical/create_memory_table.rs | 2 +- src/sql/logical/create_model.rs | 2 +- src/sql/logical/create_table.rs | 2 +- src/sql/logical/describe_model.rs | 2 +- src/sql/logical/drop_model.rs | 2 +- src/sql/logical/drop_schema.rs | 2 +- src/sql/logical/drop_table.rs | 2 +- src/sql/logical/empty_relation.rs | 2 +- src/sql/logical/explain.rs | 2 +- src/sql/logical/export_model.rs | 2 +- src/sql/logical/filter.rs | 2 +- src/sql/logical/join.rs | 2 +- src/sql/logical/limit.rs | 2 +- src/sql/logical/predict_model.rs | 2 +- src/sql/logical/projection.rs | 2 +- src/sql/logical/repartition_by.rs | 2 +- src/sql/logical/show_columns.rs | 2 +- src/sql/logical/show_models.rs | 2 +- src/sql/logical/show_schemas.rs | 2 +- src/sql/logical/show_tables.rs | 2 +- src/sql/logical/sort.rs | 2 +- src/sql/logical/subquery_alias.rs | 2 +- src/sql/logical/table_scan.rs | 4 +- src/sql/logical/use_schema.rs | 2 +- src/sql/logical/window.rs | 6 +- src/sql/schema.rs | 2 +- src/sql/statement.rs | 2 +- src/sql/table.rs | 4 +- src/sql/types/rel_data_type.rs | 2 +- src/sql/types/rel_data_type_field.rs | 2 +- tests/unit/test_mapping.py | 3 +- 87 files changed, 214 insertions(+), 209 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index accc91264..cb035a053 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -658,8 +658,8 @@ dependencies = [ ] [[package]] -name = "dask-planner" -version = "0.1.0" +name = "dask-sql" +version = "2023.6.0" dependencies = [ "async-trait", "datafusion-python", diff --git a/Cargo.toml b/Cargo.toml index eefc51a32..11444b09c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] -name = "dask-planner" +name = "dask-sql" repository = "https://github.com/dask-contrib/dask-sql" -version = "0.1.0" +version = "2023.6.0" description = "Bindings for DataFusion used by Dask-SQL" readme = "README.md" license = "Apache-2.0" @@ -9,19 +9,16 @@ edition = "2021" rust-version = "1.65" [lib] -name = "dask_planner" -crate-type = ["cdylib"] +name = "dask_sql" +crate-type = ["cdylib", "rlib"] [dependencies] async-trait = "0.1.71" datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" +pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" -[dependencies.pyo3] -version = "0.19.0" -features = ["abi3-py38"] - [build-dependencies] pyo3-build-config = "0.19.1" diff --git a/dask_sql/context.py b/dask_sql/context.py index 03947fa76..fb97ad47c 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,7 +9,8 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer -from dask_planner import ( + +from ._internal import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 5d500180d..fb1117289 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,7 +5,8 @@ from typing import Any, Union import dask.dataframe as dd -from dask_planner import SqlTypeName + +from ._internal import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 3d39ee392..f0d9c74d1 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,7 +7,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import DaskTypeMap, SqlTypeName + +from ._internal import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index f4463fe62..ce28aeb28 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,10 +7,10 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: - from dask_planner import LogicalPlan, RelDataType - import dask_sql + from ._internal import LogicalPlan, RelDataType + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 24b06c337..6a17ac94d 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,10 +7,10 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index 16ed9e9bb..f8e92671d 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,10 +6,10 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class AlterSchemaPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index 77edfff4b..e42c0c229 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,10 +8,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class AnalyzeTablePlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 74f964621..1a28edd8c 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_memory_table.py b/dask_sql/physical/rel/custom/create_memory_table.py index 760857563..32d4d1d8b 100644 --- a/dask_sql/physical/rel/custom/create_memory_table.py +++ b/dask_sql/physical/rel/custom/create_memory_table.py @@ -6,7 +6,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 526ec9728..0c4807d91 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,10 +6,10 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index 8b2e144ff..931930b1b 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,10 +7,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class DescribeModelPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index 6b6dba0b8..e45623038 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,10 +6,10 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 455b27fa4..9df844398 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index c96d19786..e3743406b 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,10 +6,10 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index c0339b1d7..e5866948b 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,10 +9,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index 6b0b94fe9..8a2ee0306 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -9,7 +9,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan class ShowColumnsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index ecc81e82a..64c656ad8 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,10 +7,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class ShowModelsPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index 98b9f8ab3..0f3bfdf7e 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -8,7 +8,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan class ShowSchemasPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index d79b4052b..05b899949 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -8,7 +8,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan class ShowTablesPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 563415c2d..9186049f9 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class UseSchemaPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 27f5c102c..f228bd16c 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,10 +15,10 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index dfa8cdf3c..94690e0bc 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,10 +6,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index b50699b79..202743a7b 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,10 +8,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index abf1d814c..4afd6870b 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,10 +3,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class ExplainPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index a37e390ec..58704ae5a 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,10 +11,10 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index cec7df4d9..ea5cfd4c2 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,10 +17,10 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 00ba37fa2..efb07a073 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,10 +11,10 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class DaskLimitPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index 4630b5d6b..d4b41c046 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,18 +1,18 @@ import logging from typing import TYPE_CHECKING -from dask_planner import RexType - from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column -if TYPE_CHECKING: - from dask_planner import LogicalPlan +from ._internal import RexType +if TYPE_CHECKING: import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 6dc57211c..453c8895a 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,10 +5,10 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class DaskSortPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index e82d9b105..ba82391f0 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class SubqueryAlias(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index b3b5cab0a..fa0e6b5bd 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,10 +11,10 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 1fbc5b5ae..04ca0d150 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,10 +6,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + def _extract_df(obj_cc, obj_df, output_field_names): # For concatenating, they should have exactly the same fields diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index bbcdae740..793b71903 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,10 +16,10 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 7f97a70d9..97692284b 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan + from ._internal import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index fce64be30..6cba4db8c 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,10 +8,10 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) _REX_TYPE_TO_PLUGIN = { diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index d6ae20698..7821e8d74 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,10 +7,10 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + class RexAliasPlugin(BaseRexPlugin): """ diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 56d01d006..e06050823 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,7 +13,6 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_planner import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 from dask_sql.datacontainer import DataContainer @@ -33,11 +32,13 @@ is_frame, ) -if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan +from ._internal import SqlTypeName +if TYPE_CHECKING: import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 01bf871c7..57cb1bd1d 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,10 +6,10 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + class RexInputRefPlugin(BaseRexPlugin): """ diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 7fe59b383..952c157aa 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,17 +4,18 @@ import dask.dataframe as dd import numpy as np -from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin -if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan +from ._internal import SqlTypeName +if TYPE_CHECKING: import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 1253f257d..7afb74c3e 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,10 +7,10 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + class RexScalarSubqueryPlugin(BaseRexPlugin): """ diff --git a/dask_sql/utils.py b/dask_sql/utils.py index c2cfe45ab..6eed1ed29 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,11 +8,12 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value +from ._internal import SqlTypeName + logger = logging.getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index 17392d3b8..236b63350 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,81 @@ [build-system] -requires = ["maturin>=1.0,<2.0"] +requires = ["maturin>=0.15,<0.16"] build-backend = "maturin" +[project] +name = "dask_sql" +description = "SQL query layer for Dask" +maintainers = [{name = "Nils Braun", email = "nilslennartbraun@gmail.com"}] +license = {text = "MIT"} +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Rust", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering", + "Topic :: System :: Distributed Computing", +] +readme = "README.md" +urls = {Homepage = "https://github.com/dask-contrib/dask-sql/"} +requires-python = ">=3.8" +dependencies = [ + "dask[dataframe]>=2022.3.0", + "distributed>=2022.3.0", + "pandas>=1.4.0", + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0", + "uvicorn>=0.13.4", + "tzlocal>=2.1", + "prompt_toolkit>=3.0.8", + "pygments>=2.7.1", + "tabulate", +] +dynamic = ["version"] + +[project.optional-dependencies] +dev = [ + "pytest>=6.0.1", + "pytest-cov>=2.10.1", + "mock>=4.0.3", + "sphinx>=3.2.1", + "pyarrow>=6.0.1", + "scikit-learn>=1.0.0", + "intake>=0.6.0", + "pre-commit", + "black==22.10.0", + "isort==5.12.0", +] +fugue = ["fugue>=0.7.3"] + +[project.entry-points."fugue.plugins"] +dasksql = "dask_sql.integrations.fugue:_register_engines[fugue]" + +[project.scripts] +dask-sql = "dask_sql.cmd:main" +dask-sql-server = "dask_sql.server.app:main" + +[tool.setuptools] +include-package-data = true +zip-safe = false +license-files = ["LICENSE.txt"] + +[tool.setuptools.packages] +find = {namespaces = false} + [tool.maturin] -features = ["pyo3/extension-module"] +module-name = "dask_sql" include = [ { path = "Cargo.lock", format = "sdist" } ] -exclude = [".github/**", "ci/**", ".asf.yaml"] +exclude = [".github/**", "continuous_integration/**"] locked = true [tool.isort] diff --git a/setup.py b/setup.py index 02693d0d6..fcbb31faf 100644 --- a/setup.py +++ b/setup.py @@ -1,78 +1,8 @@ -import os -import sys - -from setuptools import find_packages, setup +from setuptools import setup import versioneer -long_description = "" -if os.path.exists("README.md"): - with open("README.md") as f: - long_description = f.read() - -needs_sphinx = "build_sphinx" in sys.argv -sphinx_requirements = ["sphinx>=3.2.1", "sphinx_rtd_theme"] if needs_sphinx else [] -debug_build = "debug" in sys.argv - -cmdclass = versioneer.get_cmdclass() - setup( - name="dask_sql", version=versioneer.get_version(), - description="SQL query layer for Dask", - url="https://github.com/dask-contrib/dask-sql/", - maintainer="Nils Braun", - maintainer_email="nilslennartbraun@gmail.com", - license="MIT", - long_description=long_description, - long_description_content_type="text/markdown", - packages=find_packages( - include=["dask_sql", "dask_sql.*", "dask_planner", "dask_planner.*"] - ), - package_data={"dask_sql": ["sql*.yaml"]}, - python_requires=">=3.8", - setup_requires=sphinx_requirements, - install_requires=[ - "dask[dataframe]>=2022.3.0", - "distributed>=2022.3.0", - "pandas>=1.4.0", - # FIXME: handling is needed for httpx-based fastapi>=0.87.0 - "fastapi>=0.69.0,<0.87.0", - "uvicorn>=0.13.4", - "tzlocal>=2.1", - "prompt_toolkit>=3.0.8", - "pygments>=2.7.1", - "tabulate", - ], - extras_require={ - "dev": [ - "pytest>=6.0.1", - "pytest-cov>=2.10.1", - "mock>=4.0.3", - "sphinx>=3.2.1", - "pyarrow>=6.0.1", - "scikit-learn>=1.0.0", - "intake>=0.6.0", - "pre-commit", - "black==22.10.0", - "isort==5.12.0", - ], - "fugue": ["fugue>=0.7.3"], - }, - entry_points={ - "console_scripts": [ - "dask-sql-server = dask_sql.server.app:main", - "dask-sql = dask_sql.cmd:main", - ], - "fugue.plugins": [ - "dasksql = dask_sql.integrations.fugue:_register_engines[fugue]" - ], - }, - zip_safe=False, - cmdclass=cmdclass, - command_options={ - "build_sphinx": { - "source_dir": ("setup.py", "docs"), - } - }, + cmdclass=versioneer.get_cmdclass(), ) diff --git a/src/lib.rs b/src/lib.rs index 9f446b7e2..1ced3e9d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,8 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -#[pyo3(name = "dask_planner")] -fn rust(py: Python, m: &PyModule) -> PyResult<()> { +fn _internal(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); diff --git a/src/sql.rs b/src/sql.rs index 39d4614d4..585fcad4d 100644 --- a/src/sql.rs +++ b/src/sql.rs @@ -92,7 +92,7 @@ use crate::{ /// # Ok(()) /// # } /// ``` -#[pyclass(name = "DaskSQLContext", module = "dask_planner", subclass)] +#[pyclass(name = "DaskSQLContext", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSQLContext { current_catalog: String, diff --git a/src/sql/column.rs b/src/sql/column.rs index 63f043901..32250c382 100644 --- a/src/sql/column.rs +++ b/src/sql/column.rs @@ -1,7 +1,7 @@ use datafusion_python::datafusion_common::Column; use pyo3::prelude::*; -#[pyclass(name = "Column", module = "dask_planner", subclass)] +#[pyclass(name = "Column", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyColumn { /// Original Column instance diff --git a/src/sql/function.rs b/src/sql/function.rs index 39fa7635e..4169d386c 100644 --- a/src/sql/function.rs +++ b/src/sql/function.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; use super::types::PyDataType; -#[pyclass(name = "DaskFunction", module = "dask_planner", subclass)] +#[pyclass(name = "DaskFunction", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskFunction { #[pyo3(get, set)] diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 890f9aacb..e8f5f9f6f 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -62,7 +62,7 @@ use self::{ }; use crate::{error::Result, sql::exceptions::py_type_err}; -#[pyclass(name = "LogicalPlan", module = "dask_planner", subclass)] +#[pyclass(name = "LogicalPlan", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyLogicalPlan { /// The original LogicalPlan that was parsed by DataFusion from the input SQL diff --git a/src/sql/logical/aggregate.rs b/src/sql/logical/aggregate.rs index 870d8d7ab..a36750dba 100644 --- a/src/sql/logical/aggregate.rs +++ b/src/sql/logical/aggregate.rs @@ -11,7 +11,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Aggregate", module = "dask_planner", subclass)] +#[pyclass(name = "Aggregate", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyAggregate { aggregate: Option, diff --git a/src/sql/logical/alter_schema.rs b/src/sql/logical/alter_schema.rs index 742ae513f..a7a8696b8 100644 --- a/src/sql/logical/alter_schema.rs +++ b/src/sql/logical/alter_schema.rs @@ -96,7 +96,7 @@ impl UserDefinedLogicalNode for AlterSchemaPlanNode { } } -#[pyclass(name = "AlterSchema", module = "dask_planner", subclass)] +#[pyclass(name = "AlterSchema", module = "dask_sql", subclass)] pub struct PyAlterSchema { pub(crate) alter_schema: AlterSchemaPlanNode, } diff --git a/src/sql/logical/alter_table.rs b/src/sql/logical/alter_table.rs index 7f51a15c3..d6b49315b 100644 --- a/src/sql/logical/alter_table.rs +++ b/src/sql/logical/alter_table.rs @@ -102,7 +102,7 @@ impl UserDefinedLogicalNode for AlterTablePlanNode { } } -#[pyclass(name = "AlterTable", module = "dask_planner", subclass)] +#[pyclass(name = "AlterTable", module = "dask_sql", subclass)] pub struct PyAlterTable { pub(crate) alter_table: AlterTablePlanNode, } diff --git a/src/sql/logical/analyze_table.rs b/src/sql/logical/analyze_table.rs index 9fa7fb219..6876c3704 100644 --- a/src/sql/logical/analyze_table.rs +++ b/src/sql/logical/analyze_table.rs @@ -99,7 +99,7 @@ impl UserDefinedLogicalNode for AnalyzeTablePlanNode { } } -#[pyclass(name = "AnalyzeTable", module = "dask_planner", subclass)] +#[pyclass(name = "AnalyzeTable", module = "dask_sql", subclass)] pub struct PyAnalyzeTable { pub(crate) analyze_table: AnalyzeTablePlanNode, } diff --git a/src/sql/logical/create_catalog_schema.rs b/src/sql/logical/create_catalog_schema.rs index bc89b02ce..82a1426af 100644 --- a/src/sql/logical/create_catalog_schema.rs +++ b/src/sql/logical/create_catalog_schema.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for CreateCatalogSchemaPlanNode { } } -#[pyclass(name = "CreateCatalogSchema", module = "dask_planner", subclass)] +#[pyclass(name = "CreateCatalogSchema", module = "dask_sql", subclass)] pub struct PyCreateCatalogSchema { pub(crate) create_catalog_schema: CreateCatalogSchemaPlanNode, } diff --git a/src/sql/logical/create_experiment.rs b/src/sql/logical/create_experiment.rs index 313357d75..06fe9d856 100644 --- a/src/sql/logical/create_experiment.rs +++ b/src/sql/logical/create_experiment.rs @@ -105,7 +105,7 @@ impl UserDefinedLogicalNode for CreateExperimentPlanNode { } } -#[pyclass(name = "CreateExperiment", module = "dask_planner", subclass)] +#[pyclass(name = "CreateExperiment", module = "dask_sql", subclass)] pub struct PyCreateExperiment { pub(crate) create_experiment: CreateExperimentPlanNode, } diff --git a/src/sql/logical/create_memory_table.rs b/src/sql/logical/create_memory_table.rs index dd3d0753d..53ff9432e 100644 --- a/src/sql/logical/create_memory_table.rs +++ b/src/sql/logical/create_memory_table.rs @@ -7,7 +7,7 @@ use pyo3::prelude::*; use crate::sql::{exceptions::py_type_err, logical::PyLogicalPlan}; -#[pyclass(name = "CreateMemoryTable", module = "dask_planner", subclass)] +#[pyclass(name = "CreateMemoryTable", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyCreateMemoryTable { create_memory_table: Option, diff --git a/src/sql/logical/create_model.rs b/src/sql/logical/create_model.rs index 782fe3325..7dbcdff95 100644 --- a/src/sql/logical/create_model.rs +++ b/src/sql/logical/create_model.rs @@ -101,7 +101,7 @@ impl UserDefinedLogicalNode for CreateModelPlanNode { } } -#[pyclass(name = "CreateModel", module = "dask_planner", subclass)] +#[pyclass(name = "CreateModel", module = "dask_sql", subclass)] pub struct PyCreateModel { pub(crate) create_model: CreateModelPlanNode, } diff --git a/src/sql/logical/create_table.rs b/src/sql/logical/create_table.rs index 9271130c7..1c423415f 100644 --- a/src/sql/logical/create_table.rs +++ b/src/sql/logical/create_table.rs @@ -100,7 +100,7 @@ impl UserDefinedLogicalNode for CreateTablePlanNode { } } -#[pyclass(name = "CreateTable", module = "dask_planner", subclass)] +#[pyclass(name = "CreateTable", module = "dask_sql", subclass)] pub struct PyCreateTable { pub(crate) create_table: CreateTablePlanNode, } diff --git a/src/sql/logical/describe_model.rs b/src/sql/logical/describe_model.rs index cb2087376..3e3563fe1 100644 --- a/src/sql/logical/describe_model.rs +++ b/src/sql/logical/describe_model.rs @@ -89,7 +89,7 @@ impl UserDefinedLogicalNode for DescribeModelPlanNode { } } -#[pyclass(name = "DescribeModel", module = "dask_planner", subclass)] +#[pyclass(name = "DescribeModel", module = "dask_sql", subclass)] pub struct PyDescribeModel { pub(crate) describe_model: DescribeModelPlanNode, } diff --git a/src/sql/logical/drop_model.rs b/src/sql/logical/drop_model.rs index 71074905d..2715cb067 100644 --- a/src/sql/logical/drop_model.rs +++ b/src/sql/logical/drop_model.rs @@ -92,7 +92,7 @@ impl UserDefinedLogicalNode for DropModelPlanNode { } } -#[pyclass(name = "DropModel", module = "dask_planner", subclass)] +#[pyclass(name = "DropModel", module = "dask_sql", subclass)] pub struct PyDropModel { pub(crate) drop_model: DropModelPlanNode, } diff --git a/src/sql/logical/drop_schema.rs b/src/sql/logical/drop_schema.rs index 2022a61c9..78d252d11 100644 --- a/src/sql/logical/drop_schema.rs +++ b/src/sql/logical/drop_schema.rs @@ -88,7 +88,7 @@ impl UserDefinedLogicalNode for DropSchemaPlanNode { } } -#[pyclass(name = "DropSchema", module = "dask_planner", subclass)] +#[pyclass(name = "DropSchema", module = "dask_sql", subclass)] pub struct PyDropSchema { pub(crate) drop_schema: DropSchemaPlanNode, } diff --git a/src/sql/logical/drop_table.rs b/src/sql/logical/drop_table.rs index f91baf28a..504a104c1 100644 --- a/src/sql/logical/drop_table.rs +++ b/src/sql/logical/drop_table.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "DropTable", module = "dask_planner", subclass)] +#[pyclass(name = "DropTable", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyDropTable { drop_table: DropTable, diff --git a/src/sql/logical/empty_relation.rs b/src/sql/logical/empty_relation.rs index 5bd6659ce..6356f9c85 100644 --- a/src/sql/logical/empty_relation.rs +++ b/src/sql/logical/empty_relation.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "EmptyRelation", module = "dask_planner", subclass)] +#[pyclass(name = "EmptyRelation", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyEmptyRelation { empty_relation: EmptyRelation, diff --git a/src/sql/logical/explain.rs b/src/sql/logical/explain.rs index 17f1e4ee2..839a731d8 100644 --- a/src/sql/logical/explain.rs +++ b/src/sql/logical/explain.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "Explain", module = "dask_planner", subclass)] +#[pyclass(name = "Explain", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyExplain { explain: Explain, diff --git a/src/sql/logical/export_model.rs b/src/sql/logical/export_model.rs index e38551b58..58b5f7fad 100644 --- a/src/sql/logical/export_model.rs +++ b/src/sql/logical/export_model.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for ExportModelPlanNode { } } -#[pyclass(name = "ExportModel", module = "dask_planner", subclass)] +#[pyclass(name = "ExportModel", module = "dask_sql", subclass)] pub struct PyExportModel { pub(crate) export_model: ExportModelPlanNode, } diff --git a/src/sql/logical/filter.rs b/src/sql/logical/filter.rs index a50d508ff..f2dc2e702 100644 --- a/src/sql/logical/filter.rs +++ b/src/sql/logical/filter.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Filter", module = "dask_planner", subclass)] +#[pyclass(name = "Filter", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyFilter { filter: Filter, diff --git a/src/sql/logical/join.rs b/src/sql/logical/join.rs index d6c31b55b..3261e9217 100644 --- a/src/sql/logical/join.rs +++ b/src/sql/logical/join.rs @@ -15,7 +15,7 @@ use crate::{ sql::{column, exceptions::py_type_err}, }; -#[pyclass(name = "Join", module = "dask_planner", subclass)] +#[pyclass(name = "Join", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyJoin { join: Join, diff --git a/src/sql/logical/limit.rs b/src/sql/logical/limit.rs index 189fdeea0..04d783fdd 100644 --- a/src/sql/logical/limit.rs +++ b/src/sql/logical/limit.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Limit", module = "dask_planner", subclass)] +#[pyclass(name = "Limit", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyLimit { limit: Limit, diff --git a/src/sql/logical/predict_model.rs b/src/sql/logical/predict_model.rs index e8d723d2c..3f68ffdb4 100644 --- a/src/sql/logical/predict_model.rs +++ b/src/sql/logical/predict_model.rs @@ -89,7 +89,7 @@ impl UserDefinedLogicalNode for PredictModelPlanNode { } } -#[pyclass(name = "PredictModel", module = "dask_planner", subclass)] +#[pyclass(name = "PredictModel", module = "dask_sql", subclass)] pub struct PyPredictModel { pub(crate) predict_model: PredictModelPlanNode, } diff --git a/src/sql/logical/projection.rs b/src/sql/logical/projection.rs index 99ed0d684..b954d3b71 100644 --- a/src/sql/logical/projection.rs +++ b/src/sql/logical/projection.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Projection", module = "dask_planner", subclass)] +#[pyclass(name = "Projection", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyProjection { pub(crate) projection: Projection, diff --git a/src/sql/logical/repartition_by.rs b/src/sql/logical/repartition_by.rs index e931b88e7..687958571 100644 --- a/src/sql/logical/repartition_by.rs +++ b/src/sql/logical/repartition_by.rs @@ -10,7 +10,7 @@ use crate::{ sql::{exceptions::py_type_err, logical}, }; -#[pyclass(name = "RepartitionBy", module = "dask_planner", subclass)] +#[pyclass(name = "RepartitionBy", module = "dask_sql", subclass)] pub struct PyRepartitionBy { pub(crate) repartition: Repartition, } diff --git a/src/sql/logical/show_columns.rs b/src/sql/logical/show_columns.rs index adfb584ef..cdd844127 100644 --- a/src/sql/logical/show_columns.rs +++ b/src/sql/logical/show_columns.rs @@ -92,7 +92,7 @@ impl UserDefinedLogicalNode for ShowColumnsPlanNode { } } -#[pyclass(name = "ShowColumns", module = "dask_planner", subclass)] +#[pyclass(name = "ShowColumns", module = "dask_sql", subclass)] pub struct PyShowColumns { pub(crate) show_columns: ShowColumnsPlanNode, } diff --git a/src/sql/logical/show_models.rs b/src/sql/logical/show_models.rs index 026a179a5..a228769de 100644 --- a/src/sql/logical/show_models.rs +++ b/src/sql/logical/show_models.rs @@ -85,7 +85,7 @@ impl UserDefinedLogicalNode for ShowModelsPlanNode { } } -#[pyclass(name = "ShowModels", module = "dask_planner", subclass)] +#[pyclass(name = "ShowModels", module = "dask_sql", subclass)] pub struct PyShowModels { pub(crate) show_models: ShowModelsPlanNode, } diff --git a/src/sql/logical/show_schemas.rs b/src/sql/logical/show_schemas.rs index 3e3ed4783..454afb51d 100644 --- a/src/sql/logical/show_schemas.rs +++ b/src/sql/logical/show_schemas.rs @@ -91,7 +91,7 @@ impl UserDefinedLogicalNode for ShowSchemasPlanNode { } } -#[pyclass(name = "ShowSchema", module = "dask_planner", subclass)] +#[pyclass(name = "ShowSchema", module = "dask_sql", subclass)] pub struct PyShowSchema { pub(crate) show_schema: ShowSchemasPlanNode, } diff --git a/src/sql/logical/show_tables.rs b/src/sql/logical/show_tables.rs index 987f2546e..c01022828 100644 --- a/src/sql/logical/show_tables.rs +++ b/src/sql/logical/show_tables.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for ShowTablesPlanNode { } } -#[pyclass(name = "ShowTables", module = "dask_planner", subclass)] +#[pyclass(name = "ShowTables", module = "dask_sql", subclass)] pub struct PyShowTables { pub(crate) show_tables: ShowTablesPlanNode, } diff --git a/src/sql/logical/sort.rs b/src/sql/logical/sort.rs index 9abcd3906..5a1f862a1 100644 --- a/src/sql/logical/sort.rs +++ b/src/sql/logical/sort.rs @@ -6,7 +6,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Sort", module = "dask_planner", subclass)] +#[pyclass(name = "Sort", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PySort { sort: Sort, diff --git a/src/sql/logical/subquery_alias.rs b/src/sql/logical/subquery_alias.rs index 003e02045..e98c78203 100644 --- a/src/sql/logical/subquery_alias.rs +++ b/src/sql/logical/subquery_alias.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "SubqueryAlias", module = "dask_planner", subclass)] +#[pyclass(name = "SubqueryAlias", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PySubqueryAlias { subquery_alias: SubqueryAlias, diff --git a/src/sql/logical/table_scan.rs b/src/sql/logical/table_scan.rs index 171e10400..1303f6474 100644 --- a/src/sql/logical/table_scan.rs +++ b/src/sql/logical/table_scan.rs @@ -12,7 +12,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "TableScan", module = "dask_planner", subclass)] +#[pyclass(name = "TableScan", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyTableScan { pub(crate) table_scan: TableScan, @@ -20,7 +20,7 @@ pub struct PyTableScan { } type FilterTuple = (String, String, Option>); -#[pyclass(name = "FilteredResult", module = "dask_planner", subclass)] +#[pyclass(name = "FilteredResult", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyFilteredResult { // Certain Expr(s) do not have supporting logic in pyarrow for IO filtering diff --git a/src/sql/logical/use_schema.rs b/src/sql/logical/use_schema.rs index 7c2206310..0f804ce7a 100644 --- a/src/sql/logical/use_schema.rs +++ b/src/sql/logical/use_schema.rs @@ -85,7 +85,7 @@ impl UserDefinedLogicalNode for UseSchemaPlanNode { } } -#[pyclass(name = "UseSchema", module = "dask_planner", subclass)] +#[pyclass(name = "UseSchema", module = "dask_sql", subclass)] pub struct PyUseSchema { pub(crate) use_schema: UseSchemaPlanNode, } diff --git a/src/sql/logical/window.rs b/src/sql/logical/window.rs index e104ccdb3..3dd9d8c0d 100644 --- a/src/sql/logical/window.rs +++ b/src/sql/logical/window.rs @@ -17,19 +17,19 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Window", module = "dask_planner", subclass)] +#[pyclass(name = "Window", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindow { window: Window, } -#[pyclass(name = "WindowFrame", module = "dask_planner", subclass)] +#[pyclass(name = "WindowFrame", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindowFrame { window_frame: WindowFrame, } -#[pyclass(name = "WindowFrameBound", module = "dask_planner", subclass)] +#[pyclass(name = "WindowFrameBound", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindowFrameBound { frame_bound: WindowFrameBound, diff --git a/src/sql/schema.rs b/src/sql/schema.rs index 0975391f4..804db700f 100644 --- a/src/sql/schema.rs +++ b/src/sql/schema.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use super::types::PyDataType; use crate::sql::{function::DaskFunction, table}; -#[pyclass(name = "DaskSchema", module = "dask_planner", subclass)] +#[pyclass(name = "DaskSchema", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSchema { #[pyo3(get, set)] diff --git a/src/sql/statement.rs b/src/sql/statement.rs index f8fabc109..40fc9f268 100644 --- a/src/sql/statement.rs +++ b/src/sql/statement.rs @@ -2,7 +2,7 @@ use pyo3::prelude::*; use crate::parser::DaskStatement; -#[pyclass(name = "Statement", module = "dask_planner", subclass)] +#[pyclass(name = "Statement", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyStatement { pub statement: DaskStatement, diff --git a/src/sql/table.rs b/src/sql/table.rs index 47d1b6403..1c2585bef 100644 --- a/src/sql/table.rs +++ b/src/sql/table.rs @@ -90,7 +90,7 @@ fn is_supported_push_down_expr(_expr: &Expr) -> bool { true } -#[pyclass(name = "DaskStatistics", module = "dask_planner", subclass)] +#[pyclass(name = "DaskStatistics", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskStatistics { row_count: f64, @@ -109,7 +109,7 @@ impl DaskStatistics { } } -#[pyclass(name = "DaskTable", module = "dask_planner", subclass)] +#[pyclass(name = "DaskTable", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskTable { pub(crate) schema_name: Option, diff --git a/src/sql/types/rel_data_type.rs b/src/sql/types/rel_data_type.rs index 1ae3646b0..59cb0fb7c 100644 --- a/src/sql/types/rel_data_type.rs +++ b/src/sql/types/rel_data_type.rs @@ -8,7 +8,7 @@ const PRECISION_NOT_SPECIFIED: i32 = i32::MIN; const SCALE_NOT_SPECIFIED: i32 = -1; /// RelDataType represents the type of a scalar expression or entire row returned from a relational expression. -#[pyclass(name = "RelDataType", module = "dask_planner", subclass)] +#[pyclass(name = "RelDataType", module = "dask_sql", subclass)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct RelDataType { nullable: bool, diff --git a/src/sql/types/rel_data_type_field.rs b/src/sql/types/rel_data_type_field.rs index 13f036d0e..3694d0bce 100644 --- a/src/sql/types/rel_data_type_field.rs +++ b/src/sql/types/rel_data_type_field.rs @@ -12,7 +12,7 @@ use crate::{ }; /// RelDataTypeField represents the definition of a field in a structured RelDataType. -#[pyclass(name = "RelDataTypeField", module = "dask_planner", subclass)] +#[pyclass(name = "RelDataTypeField", module = "dask_sql", subclass)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct RelDataTypeField { qualifier: Option, diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index 952bcb10e..8cb155db7 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,10 +3,11 @@ import numpy as np import pandas as pd import pytest -from dask_planner import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value +from ._internal import SqlTypeName + def test_python_to_sql(): assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER" From 7d1be9218263ea67d31068b6e9877cebf06f9270 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 17 Jul 2023 13:43:24 -0400 Subject: [PATCH 30/44] Adjust native library name from _.internal to dask_planner --- Cargo.toml | 1 + dask_sql/context.py | 3 +-- dask_sql/input_utils/hive.py | 3 +-- dask_sql/mappings.py | 3 +-- dask_sql/physical/rel/base.py | 4 ++-- dask_sql/physical/rel/convert.py | 4 ++-- dask_sql/physical/rel/custom/alter.py | 4 ++-- dask_sql/physical/rel/custom/analyze_table.py | 4 ++-- dask_sql/physical/rel/custom/create_catalog_schema.py | 4 ++-- dask_sql/physical/rel/custom/create_memory_table.py | 4 ++-- dask_sql/physical/rel/custom/create_table.py | 4 ++-- dask_sql/physical/rel/custom/describe_model.py | 4 ++-- dask_sql/physical/rel/custom/distributeby.py | 4 ++-- dask_sql/physical/rel/custom/drop_schema.py | 4 ++-- dask_sql/physical/rel/custom/export_model.py | 4 ++-- dask_sql/physical/rel/custom/predict_model.py | 4 ++-- dask_sql/physical/rel/custom/show_columns.py | 4 ++-- dask_sql/physical/rel/custom/show_models.py | 4 ++-- dask_sql/physical/rel/custom/show_schemas.py | 4 ++-- dask_sql/physical/rel/custom/show_tables.py | 4 ++-- dask_sql/physical/rel/custom/use_schema.py | 4 ++-- dask_sql/physical/rel/logical/aggregate.py | 4 ++-- dask_sql/physical/rel/logical/cross_join.py | 4 ++-- dask_sql/physical/rel/logical/empty.py | 4 ++-- dask_sql/physical/rel/logical/explain.py | 4 ++-- dask_sql/physical/rel/logical/filter.py | 4 ++-- dask_sql/physical/rel/logical/join.py | 4 ++-- dask_sql/physical/rel/logical/limit.py | 4 ++-- dask_sql/physical/rel/logical/project.py | 8 ++++---- dask_sql/physical/rel/logical/sort.py | 4 ++-- dask_sql/physical/rel/logical/subquery_alias.py | 4 ++-- dask_sql/physical/rel/logical/table_scan.py | 4 ++-- dask_sql/physical/rel/logical/union.py | 4 ++-- dask_sql/physical/rel/logical/window.py | 4 ++-- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 4 ++-- dask_sql/physical/rex/core/alias.py | 4 ++-- dask_sql/physical/rex/core/call.py | 7 +++---- dask_sql/physical/rex/core/input_ref.py | 4 ++-- dask_sql/physical/rex/core/literal.py | 7 +++---- dask_sql/physical/rex/core/subquery.py | 4 ++-- dask_sql/utils.py | 3 +-- pyproject.toml | 2 +- src/lib.rs | 4 ++-- tests/unit/test_mapping.py | 3 +-- 45 files changed, 86 insertions(+), 92 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 11444b09c..465472c11 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ readme = "README.md" license = "Apache-2.0" edition = "2021" rust-version = "1.65" +include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] [lib] name = "dask_sql" diff --git a/dask_sql/context.py b/dask_sql/context.py index fb97ad47c..03947fa76 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,8 +9,7 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer - -from ._internal import ( +from dask_planner import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index fb1117289..5d500180d 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,8 +5,7 @@ from typing import Any, Union import dask.dataframe as dd - -from ._internal import SqlTypeName +from dask_planner import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index f0d9c74d1..3d39ee392 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,8 +7,7 @@ import dask.dataframe as dd import numpy as np import pandas as pd - -from ._internal import DaskTypeMap, SqlTypeName +from dask_planner import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index ce28aeb28..f4463fe62 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,9 +7,9 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan, RelDataType - from ._internal import LogicalPlan, RelDataType + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 6a17ac94d..24b06c337 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,9 +7,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index f8e92671d..16ed9e9bb 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,9 +6,9 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class AlterSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index e42c0c229..77edfff4b 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,9 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class AnalyzeTablePlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 1a28edd8c..74f964621 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_memory_table.py b/dask_sql/physical/rel/custom/create_memory_table.py index 32d4d1d8b..8c8c945ff 100644 --- a/dask_sql/physical/rel/custom/create_memory_table.py +++ b/dask_sql/physical/rel/custom/create_memory_table.py @@ -5,9 +5,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 0c4807d91..526ec9728 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,9 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index 931930b1b..8b2e144ff 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class DescribeModelPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index e45623038..6b6dba0b8 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,9 +6,9 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 9df844398..455b27fa4 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index e3743406b..c96d19786 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,9 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index e5866948b..c0339b1d7 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,9 +9,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index 8a2ee0306..a11d05c94 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -8,9 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowColumnsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index 64c656ad8..ecc81e82a 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowModelsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index 0f3bfdf7e..d49d3708b 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowSchemasPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index 05b899949..85dc3687d 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowTablesPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 9186049f9..563415c2d 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class UseSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index f228bd16c..27f5c102c 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,9 +15,9 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index 94690e0bc..dfa8cdf3c 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,9 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index 202743a7b..b50699b79 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,9 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index 4afd6870b..abf1d814c 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,9 +3,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ExplainPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index 58704ae5a..a37e390ec 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,9 +11,9 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index ea5cfd4c2..cec7df4d9 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,9 +17,9 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index efb07a073..00ba37fa2 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,9 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class DaskLimitPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index d4b41c046..4630b5d6b 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,17 +1,17 @@ import logging from typing import TYPE_CHECKING +from dask_planner import RexType + from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column -from ._internal import RexType - if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 453c8895a..6dc57211c 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,9 +5,9 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class DaskSortPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index ba82391f0..e82d9b105 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class SubqueryAlias(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index fa0e6b5bd..b3b5cab0a 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,9 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 04ca0d150..1fbc5b5ae 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,9 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql def _extract_df(obj_cc, obj_df, output_field_names): diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 793b71903..bbcdae740 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,9 +16,9 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 97692284b..7f97a70d9 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from ._internal import Expression, LogicalPlan + from dask_planner import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index 6cba4db8c..fce64be30 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,9 +8,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index 7821e8d74..d6ae20698 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,9 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql class RexAliasPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index e06050823..56d01d006 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,6 +13,7 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data +from dask_planner import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 from dask_sql.datacontainer import DataContainer @@ -32,12 +33,10 @@ is_frame, ) -from ._internal import SqlTypeName - if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 57cb1bd1d..01bf871c7 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,9 +6,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql class RexInputRefPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 952c157aa..7fe59b383 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,17 +4,16 @@ import dask.dataframe as dd import numpy as np +from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin -from ._internal import SqlTypeName - if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 7afb74c3e..1253f257d 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,9 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql class RexScalarSubqueryPlugin(BaseRexPlugin): diff --git a/dask_sql/utils.py b/dask_sql/utils.py index 6eed1ed29..c2cfe45ab 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,12 +8,11 @@ import dask.dataframe as dd import numpy as np import pandas as pd +from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value -from ._internal import SqlTypeName - logger = logging.getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index 236b63350..0e7ff0578 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ license-files = ["LICENSE.txt"] find = {namespaces = false} [tool.maturin] -module-name = "dask_sql" +module-name = "dask_planner" include = [ { path = "Cargo.lock", format = "sdist" } ] diff --git a/src/lib.rs b/src/lib.rs index 1ced3e9d7..63879e2fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -fn _internal(py: Python, m: &PyModule) -> PyResult<()> { +fn dask_planner(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); @@ -40,7 +40,7 @@ fn _internal(py: Python, m: &PyModule) -> PyResult<()> { py.get_type::(), )?; - debug!("dask_planner Python module loaded"); + debug!("dask_sql native library loaded"); Ok(()) } diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index 8cb155db7..952bcb10e 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,11 +3,10 @@ import numpy as np import pandas as pd import pytest +from dask_planner import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value -from ._internal import SqlTypeName - def test_python_to_sql(): assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER" From 83fb5c39fe9cdc51d589b109ae06bdf9311118ad Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:25:57 -0700 Subject: [PATCH 31/44] Resolve initial conda build issues --- continuous_integration/recipe/conda_build_config.yaml | 4 ++-- continuous_integration/recipe/meta.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/continuous_integration/recipe/conda_build_config.yaml b/continuous_integration/recipe/conda_build_config.yaml index b1c3c40cc..142300f28 100644 --- a/continuous_integration/recipe/conda_build_config.yaml +++ b/continuous_integration/recipe/conda_build_config.yaml @@ -4,5 +4,5 @@ rust_compiler_version: - 1.69 libprotobuf: - 3 -setuptools_rust: - - 1.5.2 +maturin: + - 0.15.3 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 02e58d1fb..954825e1e 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -31,7 +31,7 @@ requirements: host: - pip - python - - setuptools-rust + - maturin - libprotobuf - zlib run: From c7bbbd7a62c51c8321b25a24d95ff7ec96d5cbf1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:29:47 -0700 Subject: [PATCH 32/44] Replace setuptools-rust with maturin in CI --- .github/workflows/release.yml | 6 +++--- .github/workflows/test-upstream.yml | 1 - .github/workflows/test.yml | 1 - docker/conda.txt | 2 +- docker/main.dockerfile | 2 +- docs/environment.yml | 1 - docs/requirements-docs.txt | 2 +- 7 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1ee1e6397..0cb3fccb8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -60,14 +60,14 @@ jobs: CARGO_NET_GIT_FETCH_WITH_CLI="true" PATH="$HOME/.cargo/bin:$HOME/.local/bin:$PATH" CIBW_ENVIRONMENT_WINDOWS: 'PATH="$UserProfile\.cargo\bin;$PATH"' - CIBW_BEFORE_BUILD: 'pip install -U setuptools-rust' + CIBW_BEFORE_BUILD: 'pip install -U "maturin>=0.15,<0.16"' CIBW_BEFORE_BUILD_LINUX: > ARCH=$([ $(uname -m) == x86_64 ] && echo x86_64 || echo aarch_64) && DOWNLOAD_URL=$(curl --retry 6 --retry-delay 10 -s https://api.github.com/repos/protocolbuffers/protobuf/releases/latest | grep -o '"browser_download_url": "[^"]*' | cut -d'"' -f4 | grep "\linux-${ARCH}.zip$") && curl --retry 6 --retry-delay 10 -LO $DOWNLOAD_URL && unzip protoc-*-linux-$ARCH.zip -d $HOME/.local && protoc --version && - pip install -U setuptools-rust && + pip install -U "maturin>=0.15,<0.16" && pip list && curl --retry 6 --retry-delay 10 https://sh.rustup.rs -sSf | sh -s -- --default-toolchain=stable --profile=minimal -y && rustup show @@ -127,7 +127,7 @@ jobs: channel-priority: strict - name: Build source distribution run: | - mamba install setuptools-rust twine + mamba install "maturin>=0.15,<0.16" twine python setup.py sdist - name: Check dist files diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index ff0296b15..10eb032ad 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -126,7 +126,6 @@ jobs: bash update-dependencies.sh - name: Install dependencies and nothing else run: | - mamba install setuptools-rust pip install -e . -vv which python diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index be2d98126..2bd043b34 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -116,7 +116,6 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Install dependencies and nothing else run: | - mamba install "setuptools-rust>=1.5.2" pip install -e . -vv which python diff --git a/docker/conda.txt b/docker/conda.txt index d24d217aa..c0f185948 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -21,4 +21,4 @@ intake>=0.6.0 pre-commit>=2.11.1 black=22.10.0 isort=5.12.0 -setuptools-rust>=1.5.2 +maturin>=0.15,<0.16 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index da965a53c..ee0ab8c30 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -14,7 +14,7 @@ ENV PATH="/root/.cargo/bin:${PATH}" COPY docker/conda.txt /opt/dask_sql/ RUN mamba install -y \ # build requirements - "setuptools-rust>=1.5.2" \ + "maturin>=0.15,<0.16" \ # core dependencies "dask>=2022.3.0" \ "pandas>=1.4.0" \ diff --git a/docs/environment.yml b/docs/environment.yml index 96a727465..8d6f0714f 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -17,6 +17,5 @@ dependencies: - prompt_toolkit>=3.0.8 - pygments>=2.7.1 - tabulate - - setuptools-rust>=1.5.2 - ucx-proc=*=cpu - rust>=1.65.0 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index c9d8c6b0e..6ddeb3028 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -11,4 +11,4 @@ tzlocal>=2.1 prompt_toolkit>=3.0.8 pygments>=2.7.1 tabulate -setuptools-rust>=1.5.2 +maturin>=0.15,<0.16 From 6dc634758da45a701cc84836881e3a593de935e6 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:40:58 -0700 Subject: [PATCH 33/44] Constrain maturin, remove setuptools-rust from CI envs --- continuous_integration/environment-3.10-dev.yaml | 3 +-- continuous_integration/environment-3.8-dev.yaml | 3 +-- continuous_integration/environment-3.9-dev.yaml | 3 +-- continuous_integration/gpuci/environment-3.10.yaml | 3 +-- continuous_integration/gpuci/environment-3.9.yaml | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index a867996d1..cf35db316 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -11,7 +11,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -28,7 +28,6 @@ dependencies: - pytest - python=3.10 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 18b478472..4d737591b 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -10,7 +10,7 @@ dependencies: - intake=0.6.0 - jsonschema - lightgbm -- maturin=0.12.8 +- maturin=0.15 - mlflow - mock - numpy=1.21.6 @@ -27,7 +27,6 @@ dependencies: - pytest - python=3.8 - scikit-learn=1.0.0 -- setuptools-rust=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 7424529d6..ace64cb75 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -11,7 +11,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -28,7 +28,6 @@ dependencies: - pytest - python=3.9 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 2467e144a..b0332dc4e 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -14,7 +14,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -31,7 +31,6 @@ dependencies: - pytest - python=3.10 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 917892f24..7b12c8cbe 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -14,7 +14,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -31,7 +31,6 @@ dependencies: - pytest - python=3.9 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 From 6dcf5e055cd0919f6ce3128d32cd4369062d43fe Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:44:41 -0700 Subject: [PATCH 34/44] Update docs and Rust CI --- .github/CODEOWNERS | 5 ++++- .github/workflows/conda.yml | 7 +++---- .github/workflows/release.yml | 2 +- .github/workflows/test-upstream.yml | 2 -- CONTRIBUTING.md | 20 ++++++++++---------- README.md | 2 +- docs/source/how_does_it_work.rst | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 527d01fa2..1ff63a673 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,4 +2,7 @@ * @ayushdg @charlesbluca @galipremsagar # rust codeowners -dask_planner/ @ayushdg @charlesbluca @galipremsagar @jdye64 +.cargo/ @ayushdg @charlesbluca @galipremsagar @jdye64 +src/ @ayushdg @charlesbluca @galipremsagar @jdye64 +Cargo.toml @ayushdg @charlesbluca @galipremsagar @jdye64 +Cargo.lock @ayushdg @charlesbluca @galipremsagar @jdye64 diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 78253db6b..d67798646 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -6,10 +6,9 @@ on: pull_request: paths: - setup.py - - dask_planner/Cargo.toml - - dask_planner/Cargo.lock - - dask_planner/pyproject.toml - - dask_planner/rust-toolchain.toml + - Cargo.toml + - Cargo.lock + - pyproject.toml - continuous_integration/recipe/** - .github/workflows/conda.yml schedule: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0cb3fccb8..7a837af3b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -74,7 +74,7 @@ jobs: with: package-dir: . output-dir: dist - config-file: "dask_planner/pyproject.toml" + config-file: "pyproject.toml" - name: Set up Python uses: conda-incubator/setup-miniconda@v2.2.0 with: diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index 10eb032ad..df361bb49 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -68,7 +68,6 @@ jobs: - name: Optionally update upstream cargo dependencies if: env.which_upstream == 'DataFusion' run: | - cd dask_planner bash update-dependencies.sh - name: Build the Rust DataFusion bindings run: | @@ -122,7 +121,6 @@ jobs: env: UPDATE_ALL_CARGO_DEPS: false run: | - cd dask_planner bash update-dependencies.sh - name: Install dependencies and nothing else run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ab31230f..a6cd56c59 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,7 +49,7 @@ Note that while `setuptools-rust` is used by CI and should be used during your d Building Dask-SQL is straightforward with Python. To build run ```python setup.py install```. This will build both the Rust and Python codebase and install it into your locally activated conda environment. While not required, if you have updated dependencies for Rust you might prefer a clean build. To clean your setup run ```python setup.py clean``` and then run ```python setup.py install``` #### DataFusion Modules -DataFusion is broken down into a few modules. We consume those modules in our [Cargo.toml](dask_planner/Cargo.toml). The modules that we use currently are +DataFusion is broken down into a few modules. We consume those modules in our [Cargo.toml](Cargo.toml). The modules that we use currently are - `datafusion-common` - Datastructures and core logic - `datafusion-expr` - Expression based logic and operators @@ -57,7 +57,7 @@ DataFusion is broken down into a few modules. We consume those modules in our [C - `datafusion-optimizer` - Optimization logic and datastructures for modifying current plans into more efficient ones. #### Retrieving Upstream Dependencies -During development you might find yourself needing some upstream DataFusion changes not present in the projects current version. Luckily this can easily be achieved by updating [Cargo.toml](dask_planner/Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules. +During development you might find yourself needing some upstream DataFusion changes not present in the projects current version. Luckily this can easily be achieved by updating [Cargo.toml](Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules. After updating the `Cargo.toml` file the codebase can be re-built to reflect those changes by running `python setup.py install` @@ -72,40 +72,40 @@ Sometimes when building against the latest Github commits for DataFusion you may ### Datastructures While working in the Rust codebase there are a few datastructures that you should make yourself familiar with. This section does not aim to verbosely list out all of the datastructure with in the project but rather just the key datastructures that you are likely to encounter while working on almost any feature/issue. The aim is to give you a better overview of the codebase without having to manually dig through the all the source code. -- [`PyLogicalPlan`](dask_planner/src/sql/logical.rs) -> [DataFusion LogicalPlan](https://docs.rs/datafusion/latest/datafusion/logical_plan/enum.LogicalPlan.html) +- [`PyLogicalPlan`](src/sql/logical.rs) -> [DataFusion LogicalPlan](https://docs.rs/datafusion/latest/datafusion/logical_plan/enum.LogicalPlan.html) - Often encountered in Python code with variable name `rel` - Python serializable umbrella representation of the entire LogicalPlan that was generated by DataFusion - Provides access to `DaskTable` instances and type information for each table - Access to individual nodes in the logical plan tree. Ex: `TableScan` -- [`DaskSQLContext`](dask_planner/src/sql.rs) +- [`DaskSQLContext`](src/sql.rs) - Analogous to Python `Context` - Contains metadata about the tables, schemas, functions, operators, and configurations that are persent within the current execution context - When adding custom functions/UDFs this is the location that you would register them - Entry point for parsing SQL strings to sql node trees. This is the location Python will begin its interactions with Rust -- [`PyExpr`](dask_planner/src/expression.rs) -> [DataFusion Expr](https://docs.rs/datafusion/latest/datafusion/prelude/enum.Expr.html) +- [`PyExpr`](src/expression.rs) -> [DataFusion Expr](https://docs.rs/datafusion/latest/datafusion/prelude/enum.Expr.html) - Arguably where most of your time will be spent - Represents a single node in sql tree. Ex: `avg(age)` from `SELECT avg(age) FROM people` - Is associate with a single `RexType` - Can contain literal values or represent function calls, `avg()` for example - The expressions "index" in the tree can be retrieved by calling `PyExpr.index()` on an instance. This is useful when mapping frontend column names in Dask code to backend Dataframe columns - Certain `PyExpr`s contain operands. Ex: `2 + 2` would contain 3 operands. 1) A literal `PyExpr` instance with value 2 2) Another literal `PyExpr` instance with a value of 2. 3) A `+` `PyExpr` representing the addition of the 2 literals. -- [`DaskSqlOptimizer`](dask_planner/src/sql/optimizer.rs) +- [`DaskSqlOptimizer`](src/sql/optimizer.rs) - Registering location for all Dask-SQL specific logical plan optimizations - Optimizations that are written either custom or use from another source, DataFusion, are registered here in the order they are wished to be executed - Represents functions that modify/convert an original `PyLogicalPlan` into another `PyLogicalPlan` that would be more efficient when running in the underlying Dask framework -- [`RelDataType`](dask_planner/src/sql/types/rel_data_type.rs) +- [`RelDataType`](src/sql/types/rel_data_type.rs) - Not a fan of this name, was chosen to match existing Calcite logic - Represents a "row" in a table - Contains a list of "columns" that are present in that row - - [RelDataTypeField](dask_planner/src/sql/types/rel_data_type_field.rs) -- [RelDataTypeField](dask_planner/src/sql/types/rel_data_type_field.rs) + - [RelDataTypeField](src/sql/types/rel_data_type_field.rs) +- [RelDataTypeField](src/sql/types/rel_data_type_field.rs) - Represents an individual column in a table - Contains: - `qualifier` - schema the field belongs to - `name` - name of the column/field - `data_type` - `DaskTypeMap` instance containing information about the SQL type and underlying Arrow DataType - `index` - location of the field in the LogicalPlan -- [DaskTypeMap](dask_planner/src/sql/types.rs) +- [DaskTypeMap](src/sql/types.rs) - Maps a conventional SQL type to an underlying Arrow DataType diff --git a/README.md b/README.md index e978fadf8..ac27aea33 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ After that, you can install the package in development mode pip install -e ".[dev]" The Rust DataFusion bindings are built as part of the `pip install`. -If changes are made to the Rust source in `dask_planner/`, another build/install must be run to recompile the bindings: +If changes are made to the Rust source in `src/`, another build/install must be run to recompile the bindings: python setup.py build install diff --git a/docs/source/how_does_it_work.rst b/docs/source/how_does_it_work.rst index 32c736431..67d2eab01 100644 --- a/docs/source/how_does_it_work.rst +++ b/docs/source/how_does_it_work.rst @@ -22,7 +22,7 @@ No matter of via the Python API (:ref:`api`), the command line client (:ref:`cmd This function will first give the SQL string to the dask_planner Rust crate via the ``PyO3`` library. Inside this crate, Apache Arrow DataFusion is used to first parse the SQL string and then turn it into a relational algebra. For this, DataFusion uses the SQL language description specified in the `sqlparser-rs library `_ -We also include `SQL extensions specific to Dask-SQL `_. They specify custom language features, such as the ``CREATE MODEL`` statement. +We also include `SQL extensions specific to Dask-SQL `_. They specify custom language features, such as the ``CREATE MODEL`` statement. 3. SQL is (maybe) optimized --------------------------- From b7c02c91333dc02c7e8d6becafd72b30e4fa1e02 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:53:47 -0700 Subject: [PATCH 35/44] Remove more dask_planner appearances --- .github/workflows/rust.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7e983172b..a9eeab1ab 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -51,7 +51,6 @@ jobs: - name: Optionally update upstream dependencies if: needs.detect-ci-trigger.outputs.triggered == 'true' run: | - cd dask_planner bash update-dependencies.sh - name: Install Protoc uses: arduino/setup-protoc@v1 @@ -60,11 +59,9 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Check workspace in debug mode run: | - cd dask_planner cargo check - name: Check workspace in release mode run: | - cd dask_planner cargo check --release # test the crate @@ -84,7 +81,6 @@ jobs: - name: Optionally update upstream dependencies if: needs.detect-ci-trigger.outputs.triggered == 'true' run: | - cd dask_planner bash update-dependencies.sh - name: Install Protoc uses: arduino/setup-protoc@v1 @@ -93,5 +89,4 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Run tests run: | - cd dask_planner cargo test From a3e1a6838b71ecf2b1e9b317cf725c6aeb8ae748 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 12:33:07 -0700 Subject: [PATCH 36/44] Bump pyarrow min version to resolve 3.8 conflicts --- continuous_integration/environment-3.10-dev.yaml | 2 +- continuous_integration/environment-3.8-dev.yaml | 2 +- continuous_integration/environment-3.9-dev.yaml | 2 +- continuous_integration/gpuci/environment-3.10.yaml | 2 +- continuous_integration/gpuci/environment-3.9.yaml | 2 +- continuous_integration/recipe/meta.yaml | 1 + docker/conda.txt | 4 ++-- docker/main.dockerfile | 2 +- pyproject.toml | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index cf35db316..8d0710ec2 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -19,7 +19,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 4d737591b..2fd4ddad3 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -18,7 +18,7 @@ dependencies: - pre-commit - prompt_toolkit=3.0.8 - psycopg2 -- pyarrow=6.0.1 +- pyarrow=6.0.2 - pygments=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index ace64cb75..67cf0277d 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -19,7 +19,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index b0332dc4e..297c7572a 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -22,7 +22,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 7b12c8cbe..c8600fcfb 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -22,7 +22,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 954825e1e..625a071c4 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -24,6 +24,7 @@ requirements: build: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] + - maturin # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] - zlib # [build_platform != target_platform] - {{ compiler('c') }} diff --git a/docker/conda.txt b/docker/conda.txt index c0f185948..7f0e8d91a 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -4,7 +4,7 @@ pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 maven>=3.6.0 -pytest>=6.0.1 +pytest>=6.0.2 pytest-cov>=2.10.1 pytest-xdist mock>=4.0.3 @@ -13,7 +13,7 @@ tzlocal>=2.1 # FIXME: handling is needed for httpx-based fastapi>=0.87.0 fastapi>=0.69.0,<0.87.0 uvicorn>=0.13.4 -pyarrow>=6.0.1 +pyarrow>=6.0.2 prompt_toolkit>=3.0.8 pygments>=2.7.1 scikit-learn>=1.0.0 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index ee0ab8c30..2a252e1f5 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -26,7 +26,7 @@ RUN mamba install -y \ "pygments>=2.7.1" \ tabulate \ # additional dependencies - "pyarrow>=6.0.1" \ + "pyarrow>=6.0.2" \ "scikit-learn>=1.0.0" \ "intake>=0.6.0" \ && conda clean -ay diff --git a/pyproject.toml b/pyproject.toml index 0e7ff0578..464c61585 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dev = [ "pytest-cov>=2.10.1", "mock>=4.0.3", "sphinx>=3.2.1", - "pyarrow>=6.0.1", + "pyarrow>=6.0.2", "scikit-learn>=1.0.0", "intake>=0.6.0", "pre-commit", From 3ff8240f59bae3a1c844a8af9df125951cbb87cc Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 17 Jul 2023 17:28:25 -0400 Subject: [PATCH 37/44] test commit seeing how CI will respond without cmd_loop import --- dask_sql/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_sql/__init__.py b/dask_sql/__init__.py index 756486b74..5752c3d6d 100644 --- a/dask_sql/__init__.py +++ b/dask_sql/__init__.py @@ -1,9 +1,11 @@ from . import _version, config -from .cmd import cmd_loop + +# from .cmd import cmd_loop from .context import Context from .datacontainer import Statistics from .server.app import run_server __version__ = _version.get_versions()["version"] -__all__ = [__version__, cmd_loop, Context, run_server, Statistics] +# __all__ = [__version__, cmd_loop, Context, run_server, Statistics] +__all__ = [__version__, Context, run_server, Statistics] From ae7a3d6d9fedb20a466ed6db66eab5aaf419948d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 14:42:13 -0700 Subject: [PATCH 38/44] Rename module to _datafusion_lib --- dask_sql/__init__.py | 6 ++---- dask_sql/context.py | 3 ++- dask_sql/input_utils/hive.py | 3 ++- dask_sql/mappings.py | 3 ++- dask_sql/physical/rel/base.py | 3 +-- dask_sql/physical/rel/convert.py | 3 +-- dask_sql/physical/rel/custom/alter.py | 3 +-- dask_sql/physical/rel/custom/analyze_table.py | 3 +-- dask_sql/physical/rel/custom/create_catalog_schema.py | 3 +-- dask_sql/physical/rel/custom/create_memory_table.py | 3 +-- dask_sql/physical/rel/custom/create_table.py | 3 +-- dask_sql/physical/rel/custom/describe_model.py | 3 +-- dask_sql/physical/rel/custom/distributeby.py | 3 +-- dask_sql/physical/rel/custom/drop_schema.py | 3 +-- dask_sql/physical/rel/custom/export_model.py | 3 +-- dask_sql/physical/rel/custom/predict_model.py | 3 +-- dask_sql/physical/rel/custom/show_columns.py | 3 +-- dask_sql/physical/rel/custom/show_models.py | 3 +-- dask_sql/physical/rel/custom/show_schemas.py | 3 +-- dask_sql/physical/rel/custom/show_tables.py | 3 +-- dask_sql/physical/rel/custom/use_schema.py | 3 +-- dask_sql/physical/rel/logical/aggregate.py | 3 +-- dask_sql/physical/rel/logical/cross_join.py | 3 +-- dask_sql/physical/rel/logical/empty.py | 3 +-- dask_sql/physical/rel/logical/explain.py | 3 +-- dask_sql/physical/rel/logical/filter.py | 3 +-- dask_sql/physical/rel/logical/join.py | 3 +-- dask_sql/physical/rel/logical/limit.py | 3 +-- dask_sql/physical/rel/logical/project.py | 6 ++---- dask_sql/physical/rel/logical/sort.py | 3 +-- dask_sql/physical/rel/logical/subquery_alias.py | 3 +-- dask_sql/physical/rel/logical/table_scan.py | 3 +-- dask_sql/physical/rel/logical/union.py | 3 +-- dask_sql/physical/rel/logical/window.py | 3 +-- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 3 +-- dask_sql/physical/rex/core/alias.py | 3 +-- dask_sql/physical/rex/core/call.py | 5 ++--- dask_sql/physical/rex/core/input_ref.py | 3 +-- dask_sql/physical/rex/core/literal.py | 5 ++--- dask_sql/physical/rex/core/subquery.py | 3 +-- dask_sql/utils.py | 2 +- pyproject.toml | 2 +- src/lib.rs | 2 +- tests/unit/test_mapping.py | 2 +- 45 files changed, 52 insertions(+), 88 deletions(-) diff --git a/dask_sql/__init__.py b/dask_sql/__init__.py index 5752c3d6d..756486b74 100644 --- a/dask_sql/__init__.py +++ b/dask_sql/__init__.py @@ -1,11 +1,9 @@ from . import _version, config - -# from .cmd import cmd_loop +from .cmd import cmd_loop from .context import Context from .datacontainer import Statistics from .server.app import run_server __version__ = _version.get_versions()["version"] -# __all__ = [__version__, cmd_loop, Context, run_server, Statistics] -__all__ = [__version__, Context, run_server, Statistics] +__all__ = [__version__, cmd_loop, Context, run_server, Statistics] diff --git a/dask_sql/context.py b/dask_sql/context.py index 03947fa76..ab0c2ae71 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,7 +9,8 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer -from dask_planner import ( + +from dask_sql._datafusion_lib import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 5d500180d..14bc547f0 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,7 +5,8 @@ from typing import Any, Union import dask.dataframe as dd -from dask_planner import SqlTypeName + +from dask_sql._datafusion_lib import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 3d39ee392..ca0e23691 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,7 +7,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import DaskTypeMap, SqlTypeName + +from dask_sql._datafusion_lib import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index f4463fe62..5f70cde4e 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,9 +7,8 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: - from dask_planner import LogicalPlan, RelDataType - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan, RelDataType logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 24b06c337..6d2beceff 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,9 +7,8 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index 16ed9e9bb..b29eb7737 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,9 +6,8 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class AlterSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index 77edfff4b..49308cf3a 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,9 +8,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class AnalyzeTablePlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 74f964621..e55d31a90 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_memory_table.py b/dask_sql/physical/rel/custom/create_memory_table.py index 8c8c945ff..3c829fb42 100644 --- a/dask_sql/physical/rel/custom/create_memory_table.py +++ b/dask_sql/physical/rel/custom/create_memory_table.py @@ -5,9 +5,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 526ec9728..cbe61abf7 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,9 +6,8 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index 8b2e144ff..422ac7c3b 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class DescribeModelPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index 6b6dba0b8..71ac114f2 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,9 +6,8 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 455b27fa4..5491fcaa4 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index c96d19786..08446c43c 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,9 +6,8 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index c0339b1d7..0bb5c79b4 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,9 +9,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index a11d05c94..2da4f4535 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -8,9 +8,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowColumnsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index ecc81e82a..28e495810 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowModelsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index d49d3708b..fb69c5359 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowSchemasPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index 85dc3687d..05fb8a66c 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowTablesPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 563415c2d..f5fc65b7d 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class UseSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 27f5c102c..dd2f9f41d 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,9 +15,8 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index dfa8cdf3c..d1c74c8cc 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,9 +6,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index b50699b79..453f63de5 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,9 +8,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index abf1d814c..0e4875d0c 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,9 +3,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ExplainPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index a37e390ec..af3685a11 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,9 +11,8 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index cec7df4d9..1657d2bf4 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,9 +17,8 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 00ba37fa2..9bd2be562 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,9 +11,8 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class DaskLimitPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index 4630b5d6b..0a7637f59 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,17 +1,15 @@ import logging from typing import TYPE_CHECKING -from dask_planner import RexType - +from dask_sql._datafusion_lib import RexType from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 6dc57211c..9dfccdc49 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,9 +5,8 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class DaskSortPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index e82d9b105..14be8928f 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class SubqueryAlias(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index b3b5cab0a..53e1d29be 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,9 +11,8 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 1fbc5b5ae..f31ced797 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,9 +6,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan def _extract_df(obj_cc, obj_df, output_field_names): diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index bbcdae740..aba788bc3 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,9 +16,8 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 7f97a70d9..d74ad6309 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index fce64be30..1713e496d 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,9 +8,8 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index d6ae20698..7486bc9c5 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,9 +7,8 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexAliasPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 56d01d006..e513556d0 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,9 +13,9 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_planner import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( cast_column_to_type, @@ -34,9 +34,8 @@ ) if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 01bf871c7..4d2c0f929 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,9 +6,8 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexInputRefPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 7fe59b383..da0eeb128 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,16 +4,15 @@ import dask.dataframe as dd import numpy as np -from dask_planner import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 1253f257d..60a07c0b9 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,9 +7,8 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexScalarSubqueryPlugin(BaseRexPlugin): diff --git a/dask_sql/utils.py b/dask_sql/utils.py index c2cfe45ab..454eecb7f 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,8 +8,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value diff --git a/pyproject.toml b/pyproject.toml index 464c61585..75404e3e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ license-files = ["LICENSE.txt"] find = {namespaces = false} [tool.maturin] -module-name = "dask_planner" +module-name = "dask_sql._datafusion_lib" include = [ { path = "Cargo.lock", format = "sdist" } ] diff --git a/src/lib.rs b/src/lib.rs index 63879e2fb..921478973 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -fn dask_planner(py: Python, m: &PyModule) -> PyResult<()> { +fn _datafusion_lib(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index 952bcb10e..98f065bf8 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd import pytest -from dask_planner import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value From 0c2908ce8fc2f453eb993196f378fea584e7bf2e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 15:10:10 -0700 Subject: [PATCH 39/44] Switch to maturin develop for CI installs --- .github/workflows/test-upstream.yml | 2 +- .github/workflows/test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index df361bb49..7c231c929 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -71,7 +71,7 @@ jobs: bash update-dependencies.sh - name: Build the Rust DataFusion bindings run: | - python setup.py build install + maturin develop - name: Install hive testing dependencies if: matrix.os == 'ubuntu-latest' run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2bd043b34..b3ec34a76 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,7 +72,7 @@ jobs: shared-key: test - name: Build the Rust DataFusion bindings run: | - python setup.py build install + maturin develop - name: Install hive testing dependencies if: matrix.os == 'ubuntu-latest' run: | From 849dc42a4e7382fff6aa253355dafcf138afb350 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 10:46:27 -0400 Subject: [PATCH 40/44] Fix failing cargo tests, changed output, from datafusion version bump --- .cargo/config.toml | 5 +++++ Cargo.toml | 14 +++++++++----- src/parser.rs | 27 +++++---------------------- src/sql/optimizer.rs | 13 ++----------- 4 files changed, 21 insertions(+), 38 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index d47f983e4..3bbaccf35 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -9,3 +9,8 @@ rustflags = [ "-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup", ] + +[target.x86_64-unknown-linux-gnu] +rustflags = [ + "-C", "link-arg=-undefined" +] diff --git a/Cargo.toml b/Cargo.toml index 465472c11..d80f261bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,17 +9,21 @@ edition = "2021" rust-version = "1.65" include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] -[lib] -name = "dask_sql" -crate-type = ["cdylib", "rlib"] - [dependencies] async-trait = "0.1.71" datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" -pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.19.1", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" [build-dependencies] pyo3-build-config = "0.19.1" + +[lib] +name = "dask_sql" +crate-type = ["cdylib", "rlib"] + +[profile.release] +lto = true +codegen-units = 1 diff --git a/src/parser.rs b/src/parser.rs index 3147e6309..a051454bb 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1374,14 +1374,7 @@ mod test { let statements = DaskParser::parse_sql(sql).unwrap(); assert_eq!(1, statements.len()); let actual = format!("{:?}", statements[0]); - let expected = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"timestampadd\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Value(SingleQuotedString(\"YEAR\")))), \ - Unnamed(Expr(Value(Number(\"2\", false)))), \ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None })))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"timestampadd\", quote_style: None }]), args: [Unnamed(Expr(Value(SingleQuotedString(\"YEAR\")))), Unnamed(Expr(Value(Number(\"2\", false)))), Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None })))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; assert!(actual.contains(expected)); } @@ -1391,26 +1384,16 @@ mod test { let statements1 = DaskParser::parse_sql(sql1).unwrap(); assert_eq!(1, statements1.len()); let actual1 = format!("{:?}", statements1[0]); - let expected1 = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), \ - Unnamed(Expr(Value(SingleQuotedString(\"%Y-%m-%d %H:%M:%S\"))))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected1 = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), args: [Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), Unnamed(Expr(Value(SingleQuotedString(\"%Y-%m-%d %H:%M:%S\"))))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; + assert!(actual1.contains(expected1)); let sql2 = "SELECT TO_TIMESTAMP(d, \"%d/%m/%Y\") FROM t"; let statements2 = DaskParser::parse_sql(sql2).unwrap(); assert_eq!(1, statements2.len()); let actual2 = format!("{:?}", statements2[0]); - let expected2 = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), \ - Unnamed(Expr(Value(SingleQuotedString(\"\\\"%d/%m/%Y\\\"\"))))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected2 = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), args: [Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), Unnamed(Expr(Value(SingleQuotedString(\"\\\"%d/%m/%Y\\\"\"))))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; + assert!(actual2.contains(expected2)); } diff --git a/src/sql/optimizer.rs b/src/sql/optimizer.rs index a5957ac98..484ee7dd6 100644 --- a/src/sql/optimizer.rs +++ b/src/sql/optimizer.rs @@ -147,17 +147,8 @@ mod tests { AND (cast('2002-05-08' as date) + interval '5 days')\ )"; let plan = test_sql(sql)?; - let expected = r#"Projection: test.col_int32 - Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.__value - CrossJoin: - TableScan: test projection=[col_int32] - SubqueryAlias: __scalar_sq_1 - Projection: AVG(test.col_int32) AS __value - Aggregate: groupBy=[[]], aggr=[[AVG(test.col_int32)]] - Projection: test.col_int32 - Filter: test.col_utf8 >= Utf8("2002-05-08") AND test.col_utf8 <= Utf8("2002-05-13") - TableScan: test projection=[col_int32, col_utf8]"#; - assert_eq!(expected, format!("{:?}", plan)); + + assert!(expected.contains(r#"<= Date32("11820")"#)); Ok(()) } From 1f73b567311528a986d43df0bbfb3c2000342d98 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 11:19:37 -0400 Subject: [PATCH 41/44] Fix cargo test syntax issue --- src/sql/optimizer.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/sql/optimizer.rs b/src/sql/optimizer.rs index 484ee7dd6..b9fdaca06 100644 --- a/src/sql/optimizer.rs +++ b/src/sql/optimizer.rs @@ -147,8 +147,7 @@ mod tests { AND (cast('2002-05-08' as date) + interval '5 days')\ )"; let plan = test_sql(sql)?; - - assert!(expected.contains(r#"<= Date32("11820")"#)); + assert!(format!("{:?}", plan).contains(r#"<= Date32("11820")"#)); Ok(()) } From 79b6eacc600f255ed0c513d2fda0d73e8cc2378c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 09:49:10 -0700 Subject: [PATCH 42/44] Fix failing Rust tests --- src/expression.rs | 4 ++-- src/parser.rs | 2 +- src/sql.rs | 14 -------------- src/sql/types.rs | 8 ++++---- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/src/expression.rs b/src/expression.rs index d13f66e89..53d4f1c84 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -43,7 +43,7 @@ use crate::{ }; /// An PyExpr that can be used on a DataFrame -#[pyclass(name = "Expression", module = "datafusion", subclass)] +#[pyclass(name = "Expression", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, @@ -57,7 +57,7 @@ impl From for Expr { } } -#[pyclass(name = "ScalarValue", module = "datafusion", subclass)] +#[pyclass(name = "ScalarValue", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyScalarValue { pub scalar_value: ScalarValue, diff --git a/src/parser.rs b/src/parser.rs index a051454bb..100f9c137 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -30,7 +30,7 @@ pub enum CustomExpr { Nested(Vec<(String, PySqlArg)>), } -#[pyclass(name = "SqlArg", module = "datafusion")] +#[pyclass(name = "SqlArg", module = "dask_sql")] #[derive(Debug, Clone, PartialEq, Eq)] pub struct PySqlArg { expr: Option, diff --git a/src/sql.rs b/src/sql.rs index 585fcad4d..c9a600225 100644 --- a/src/sql.rs +++ b/src/sql.rs @@ -78,20 +78,6 @@ use crate::{ /// /// The following example demonstrates how to generate an optimized LogicalPlan /// from SQL using DaskSQLContext. -/// -/// ``` -/// use datafusion_python::datafusion::prelude::*; -/// -/// # use datafusion_python::datafusion_common::Result; -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// let mut ctx = DaskSQLContext::new(); -/// let parsed_sql = ctx.parse_sql("SELECT COUNT(*) FROM test_table"); -/// let nonOptimizedRelAlgebra = ctx.logical_relational_algebra(parsed_sql); -/// let optmizedRelAlg = ctx.optimizeRelationalAlgebra(nonOptimizedRelAlgebra); -/// # Ok(()) -/// # } -/// ``` #[pyclass(name = "DaskSQLContext", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSQLContext { diff --git a/src/sql/types.rs b/src/sql/types.rs index 4642a4eb0..34af22342 100644 --- a/src/sql/types.rs +++ b/src/sql/types.rs @@ -12,7 +12,7 @@ use pyo3::{prelude::*, types::PyDict}; use crate::{dialect::DaskDialect, error::DaskPlannerError, sql::exceptions::py_type_err}; #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "RexType", module = "datafusion")] +#[pyclass(name = "RexType", module = "dask_sql")] pub enum RexType { Alias, Literal, @@ -23,7 +23,7 @@ pub enum RexType { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "DaskTypeMap", module = "datafusion", subclass)] +#[pyclass(name = "DaskTypeMap", module = "dask_sql", subclass)] /// Represents a Python Data Type. This is needed instead of simple /// Enum instances because PyO3 can only support unit variants as /// of version 0.16 which means Enums like `DataType::TIMESTAMP_WITH_LOCAL_TIME_ZONE` @@ -167,7 +167,7 @@ impl DaskTypeMap { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "PyDataType", module = "datafusion", subclass)] +#[pyclass(name = "PyDataType", module = "dask_sql", subclass)] pub struct PyDataType { data_type: DataType, } @@ -210,7 +210,7 @@ impl From for PyDataType { #[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "SqlTypeName", module = "datafusion")] +#[pyclass(name = "SqlTypeName", module = "dask_sql")] pub enum SqlTypeName { ANY, ARRAY, From 405470f168b453cd4e615972e9f85f4e6823fe1c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 14:47:28 -0400 Subject: [PATCH 43/44] Remove linux config.toml options --- .cargo/config.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 3bbaccf35..d47f983e4 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -9,8 +9,3 @@ rustflags = [ "-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup", ] - -[target.x86_64-unknown-linux-gnu] -rustflags = [ - "-C", "link-arg=-undefined" -] From bdfad0407c1073c153df8cdad2fb9017777f636a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Thu, 20 Jul 2023 00:07:46 -0400 Subject: [PATCH 44/44] Checkpoint commit --- dask_sql/mappings.py | 20 +++++++++++++++++++ dask_sql/physical/rel/custom/alter.py | 10 +++++----- dask_sql/physical/rel/logical/filter.py | 6 +++--- src/sql.rs | 18 ++++++++++++++--- .../optimizer/dynamic_partition_pruning.rs | 2 ++ tests/unit/test_mapping.py | 1 + 6 files changed, 46 insertions(+), 11 deletions(-) diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 1346b3a08..4b46bb0f6 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -174,6 +174,26 @@ def sql_to_python_value(sql_type: "SqlType", literal_value: Any) -> Any: python_type = Decimal + elif sql_type == SqlType.INTERVAL: + # check for finer granular interval types, e.g., INTERVAL MONTH, INTERVAL YEAR + try: + interval_type = str(sql_type).split()[1].lower() + + if interval_type in {"year", "quarter", "month"}: + # if sql_type is INTERVAL YEAR, Calcite will covert to months + delta = pd.tseries.offsets.DateOffset(months=float(str(literal_value))) + return delta + except IndexError: # pragma: no cover + # no finer granular interval type specified + pass + except TypeError: # pragma: no cover + # interval type is not recognized, fall back to default case + pass + + # Calcite will always convert INTERVAL types except YEAR, QUATER, MONTH to milliseconds + # Issue: if sql_type is INTERVAL MICROSECOND, and value <= 1000, literal_value will be rounded to 0 + return np.timedelta64(literal_value, "ms") + elif sql_type == SqlType.INTERVAL_DAY: return np.timedelta64(literal_value[0], "D") + np.timedelta64( literal_value[1], "ms" diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index b29eb7737..8685b8b92 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: import dask_sql - from dask_sql._datafusion_lib import LogicalPlan + from dask_sql._datafusion_lib import DaskLogicalPlan class AlterSchemaPlugin(BaseRelPlugin): @@ -26,8 +26,8 @@ class AlterSchemaPlugin(BaseRelPlugin): class_name = "AlterSchema" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"): - alter_schema = rel.alter_schema() + def convert(self, rel: "DaskLogicalPlan", context: "dask_sql.Context"): + alter_schema = rel.to_variant() old_schema_name = alter_schema.getOldSchemaName() new_schema_name = alter_schema.getNewSchemaName() @@ -60,8 +60,8 @@ class AlterTablePlugin(BaseRelPlugin): class_name = "AlterTable" - def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"): - alter_table = rel.alter_table() + def convert(self, rel: "DaskLogicalPlan", context: "dask_sql.Context"): + alter_table = rel.to_variant() old_table_name = alter_table.getOldTableName() new_table_name = alter_table.getNewTableName() diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index edf2d095b..5de533b2c 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: import dask_sql - from dask_sql._datafusion_lib import LogicalPlan + from dask_sql._datafusion_lib import DaskLogicalPlan logger = logging.getLogger(__name__) @@ -55,14 +55,14 @@ class DaskFilterPlugin(BaseRelPlugin): def convert( self, - rel: "LogicalPlan", + rel: "DaskLogicalPlan", context: "dask_sql.Context", ) -> DataContainer: (dc,) = self.assert_inputs(rel, 1, context) df = dc.df cc = dc.column_container - filter = rel.filter() + filter = rel.to_variant() # Every logic is handled in the RexConverter # we just need to apply it here diff --git a/src/sql.rs b/src/sql.rs index d9ab0bb9a..13a932baa 100644 --- a/src/sql.rs +++ b/src/sql.rs @@ -179,6 +179,8 @@ impl ContextProvider for DaskSQLContext { DataType::Float16, DataType::Float32, DataType::Float64, + DataType::Decimal128(1, 1), + DataType::Decimal256(1, 1), ]; match name { @@ -333,6 +335,7 @@ impl ContextProvider for DaskSQLContext { // Loop through all of the user defined functions for schema in self.schemas.values() { for (fun_name, func_mutex) in &schema.functions { + println!("**** Function Name: {:?}", fun_name); if fun_name.eq(name) { let function = func_mutex.lock().unwrap(); if function.aggregation.eq(&true) { @@ -609,9 +612,16 @@ impl DaskSQLContext { let inner_plan = match dask_statement { DaskStatement::Statement(statement) => { let planner = SqlToRel::new(self); - Ok::( - planner.statement_to_plan(DFStatement::Statement(statement))?, - ) + println!("> _logical_relational_algebra"); + println!("> state"); + let state = DFStatement::Statement(statement); + println!("< state 1"); + println!("> plan"); + let plan = planner.statement_to_plan(state); + println!("< plan: {:?}", plan); + let resp = Ok::(plan?); + println!("< _logical_relational_algebra"); + resp } DaskStatement::CreateModel(create_model) => Ok(LogicalPlan::Extension(Extension { node: Arc::new(CreateModelPlanNode { @@ -754,6 +764,8 @@ impl DaskSQLContext { })), }; + println!("SHOW THYSELF SATAN!!!"); + Ok(DaskLogicalPlan::_new(inner_plan?)) } } diff --git a/src/sql/optimizer/dynamic_partition_pruning.rs b/src/sql/optimizer/dynamic_partition_pruning.rs index ac931b560..f946fcd12 100644 --- a/src/sql/optimizer/dynamic_partition_pruning.rs +++ b/src/sql/optimizer/dynamic_partition_pruning.rs @@ -782,6 +782,8 @@ fn satisfies_int64(long_value: Option, filter: Expr) -> bool { Expr::Literal(ScalarValue::Int32(i)) => i64::from(i.unwrap()), Expr::Literal(ScalarValue::Float64(i)) => i.unwrap() as i64, Expr::Literal(ScalarValue::TimestampNanosecond(i, None)) => i.unwrap(), + Expr::Literal(ScalarValue::Date32(i)) => i64::from(i.unwrap()), + Expr::Literal(ScalarValue::Date64(i)) => i.unwrap(), _ => { panic!("Unknown ScalarValue type {filter_value}"); } diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index ff98ba4dc..7efef1292 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +from dask_sql._datafusion_lib import SqlType from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value