fix: correct LZ0 to LZO in compression options (#995)

kosiew · web-flow · commit 39fec53ca118 · 2025-01-14T07:01:10.000-05:00
* fix: correct LZ0 to LZO in compression options

* fix: disable LZO compression option and update tests to reflect its unavailability

* fix: ruff format expected string in test_execution_plan

* fix: update test for execution plan and add validation for invalid LZO compression

* fix: remove LZO compression option and related test cases

* ruff autoformat

* fix: remove TODO comment regarding LZO compression implementation
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -57,7 +57,9 @@ class Compression(Enum):
     GZIP = "gzip"
     BROTLI = "brotli"
     LZ4 = "lz4"
-    LZ0 = "lz0"
+    # lzo is not implemented yet
+    # https://github.com/apache/arrow-rs/issues/6970
+    # LZO = "lzo"
     ZSTD = "zstd"
     LZ4_RAW = "lz4_raw"
 
@@ -696,10 +698,10 @@ def write_parquet(
                 - "snappy": Snappy compression.
                 - "gzip": Gzip compression.
                 - "brotli": Brotli compression.
-                - "lz0": LZ0 compression.
                 - "lz4": LZ4 compression.
                 - "lz4_raw": LZ4_RAW compression.
                 - "zstd": Zstandard compression.
+            Note: LZO is not yet implemented in arrow-rs and is therefore excluded.
             compression_level: Compression level to use. For ZSTD, the
                 recommended range is 1 to 22, with the default being 4. Higher levels
                 provide better compression but slower speed.
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -1115,6 +1115,8 @@ def test_write_compressed_parquet_invalid_compression(df, tmp_path, compression)
         df.write_parquet(str(path), compression=compression)
 
 
+# not testing lzo because it is not implemented yet
+# https://github.com/apache/arrow-rs/issues/6970
 @pytest.mark.parametrize("compression", ["zstd", "brotli", "gzip"])
 def test_write_compressed_parquet_default_compression_level(df, tmp_path, compression):
     # Test write_parquet with zstd, brotli, gzip default compression level,
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
@@ -790,9 +790,9 @@ def test_hash_functions(df):
     )
     assert result.column(2) == pa.array(
         [
-            b("185F8DB32271FE25F561A6FC938B2E26" "4306EC304EDA518007D1764826381969"),
-            b("78AE647DC5544D227130A0682A51E30B" "C7777FBB6D8A8F17007463A3ECD1D524"),
-            b("BB7208BC9B5D7C04F1236A82A0093A5E" "33F40423D5BA8D4266F7092C3BA43B62"),
+            b("185F8DB32271FE25F561A6FC938B2E264306EC304EDA518007D1764826381969"),
+            b("78AE647DC5544D227130A0682A51E30BC7777FBB6D8A8F17007463A3ECD1D524"),
+            b("BB7208BC9B5D7C04F1236A82A0093A5E33F40423D5BA8D4266F7092C3BA43B62"),
         ]
     )
     assert result.column(3) == pa.array(
@@ -838,16 +838,16 @@ def test_hash_functions(df):
     )
     assert result.column(5) == pa.array(
         [
-            b("F73A5FBF881F89B814871F46E26AD3FA" "37CB2921C5E8561618639015B3CCBB71"),
-            b("B792A0383FB9E7A189EC150686579532" "854E44B71AC394831DAED169BA85CCC5"),
-            b("27988A0E51812297C77A433F63523334" "6AEE29A829DCF4F46E0F58F402C6CFCB"),
+            b("F73A5FBF881F89B814871F46E26AD3FA37CB2921C5E8561618639015B3CCBB71"),
+            b("B792A0383FB9E7A189EC150686579532854E44B71AC394831DAED169BA85CCC5"),
+            b("27988A0E51812297C77A433F635233346AEE29A829DCF4F46E0F58F402C6CFCB"),
         ]
     )
     assert result.column(6) == pa.array(
         [
-            b("FBC2B0516EE8744D293B980779178A35" "08850FDCFE965985782C39601B65794F"),
-            b("BF73D18575A736E4037D45F9E316085B" "86C19BE6363DE6AA789E13DEAACC1C4E"),
-            b("C8D11B9F7237E4034ADBCD2005735F9B" "C4C597C75AD89F4492BEC8F77D15F7EB"),
+            b("FBC2B0516EE8744D293B980779178A3508850FDCFE965985782C39601B65794F"),
+            b("BF73D18575A736E4037D45F9E316085B86C19BE6363DE6AA789E13DEAACC1C4E"),
+            b("C8D11B9F7237E4034ADBCD2005735F9BC4C597C75AD89F4492BEC8F77D15F7EB"),
         ]
     )
     assert result.column(7) == result.column(1)  # SHA-224
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -491,7 +491,7 @@ impl PyDataFrame {
                 ZstdLevel::try_new(verify_compression_level(compression_level)? as i32)
                     .map_err(|e| PyValueError::new_err(format!("{e}")))?,
             ),
-            "lz0" => Compression::LZO,
+            "lzo" => Compression::LZO,
             "lz4" => Compression::LZ4,
             "lz4_raw" => Compression::LZ4_RAW,
             "uncompressed" => Compression::UNCOMPRESSED,

Original file line number	Diff line number	Diff line change
`@@ -790,9 +790,9 @@ def test_hash_functions(df):`
`790`	`790`	`)`
`791`	`791`	`assert result.column(2) == pa.array(`
`792`	`792`	`[`
`793`		`- b("185F8DB32271FE25F561A6FC938B2E26" "4306EC304EDA518007D1764826381969"),`
`794`		`- b("78AE647DC5544D227130A0682A51E30B" "C7777FBB6D8A8F17007463A3ECD1D524"),`
`795`		`- b("BB7208BC9B5D7C04F1236A82A0093A5E" "33F40423D5BA8D4266F7092C3BA43B62"),`
	`793`	`+ b("185F8DB32271FE25F561A6FC938B2E264306EC304EDA518007D1764826381969"),`
	`794`	`+ b("78AE647DC5544D227130A0682A51E30BC7777FBB6D8A8F17007463A3ECD1D524"),`
	`795`	`+ b("BB7208BC9B5D7C04F1236A82A0093A5E33F40423D5BA8D4266F7092C3BA43B62"),`
`796`	`796`	`]`
`797`	`797`	`)`
`798`	`798`	`assert result.column(3) == pa.array(`
`@@ -838,16 +838,16 @@ def test_hash_functions(df):`
`838`	`838`	`)`
`839`	`839`	`assert result.column(5) == pa.array(`
`840`	`840`	`[`
`841`		`- b("F73A5FBF881F89B814871F46E26AD3FA" "37CB2921C5E8561618639015B3CCBB71"),`
`842`		`- b("B792A0383FB9E7A189EC150686579532" "854E44B71AC394831DAED169BA85CCC5"),`
`843`		`- b("27988A0E51812297C77A433F63523334" "6AEE29A829DCF4F46E0F58F402C6CFCB"),`
	`841`	`+ b("F73A5FBF881F89B814871F46E26AD3FA37CB2921C5E8561618639015B3CCBB71"),`
	`842`	`+ b("B792A0383FB9E7A189EC150686579532854E44B71AC394831DAED169BA85CCC5"),`
	`843`	`+ b("27988A0E51812297C77A433F635233346AEE29A829DCF4F46E0F58F402C6CFCB"),`
`844`	`844`	`]`
`845`	`845`	`)`
`846`	`846`	`assert result.column(6) == pa.array(`
`847`	`847`	`[`
`848`		`- b("FBC2B0516EE8744D293B980779178A35" "08850FDCFE965985782C39601B65794F"),`
`849`		`- b("BF73D18575A736E4037D45F9E316085B" "86C19BE6363DE6AA789E13DEAACC1C4E"),`
`850`		`- b("C8D11B9F7237E4034ADBCD2005735F9B" "C4C597C75AD89F4492BEC8F77D15F7EB"),`
	`848`	`+ b("FBC2B0516EE8744D293B980779178A3508850FDCFE965985782C39601B65794F"),`
	`849`	`+ b("BF73D18575A736E4037D45F9E316085B86C19BE6363DE6AA789E13DEAACC1C4E"),`
	`850`	`+ b("C8D11B9F7237E4034ADBCD2005735F9BC4C597C75AD89F4492BEC8F77D15F7EB"),`
`851`	`851`	`]`
`852`	`852`	`)`
`853`	`853`	`assert result.column(7) == result.column(1) # SHA-224`