Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions paimon-python/pypaimon/common/options/core_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,10 @@ class CoreOptions:
# Statistics-collection mode for file metadata. The accepted spellings are
# listed in the description text below; the default is "truncate(16)".
# The builder chain sets the default and the description exactly once:
# default_value() on the built option is a zero-argument getter, so a
# second .default_value(...) call in the chain cannot be used to override.
METADATA_STATS_MODE: ConfigOption[str] = (
    ConfigOptions.key("metadata.stats-mode")
    .string_type()
    .default_value("truncate(16)")
    .with_description("The mode of metadata stats. Available modes: "
                      "'none' (no stats), 'counts' (null counts only), "
                      "'full' (exact min/max), 'truncate(length)' (truncated min/max).")
)

BLOB_AS_DESCRIPTOR: ConfigOption[bool] = (
Expand Down Expand Up @@ -475,7 +477,32 @@ def file_block_size(self, default=None):
return self.options.get(CoreOptions.FILE_BLOCK_SIZE, default)

def metadata_stats_enabled(self, default=None):
    """Return True when the configured stats mode collects any statistics.

    The raw ``metadata.stats-mode`` value is read from ``self.options``
    (``default`` is forwarded to that lookup) and normalised through
    ``CoreOptions.parse_metadata_stats_mode``. Every recognised mode other
    than ``"NONE"`` counts as enabled.

    Raises:
        ValueError: propagated from ``parse_metadata_stats_mode`` when the
            configured value is not a supported mode.
    """
    configured = self.options.get(CoreOptions.METADATA_STATS_MODE, default)
    # Only the canonical mode name matters here; the truncate length is ignored.
    mode, _ = CoreOptions.parse_metadata_stats_mode(configured)
    return mode != "NONE"

def metadata_stats_mode(self, default=None):
    """Return the configured ``metadata.stats-mode`` string, validated and stripped.

    The value is returned with surrounding whitespace removed but otherwise
    as written (its case is not changed).

    Raises:
        ValueError: propagated from ``parse_metadata_stats_mode`` when the
            configured value is not a supported mode.
    """
    mode = self.options.get(CoreOptions.METADATA_STATS_MODE, default)
    if mode is None:
        # parse_metadata_stats_mode falls back to the option default for
        # None; mirror that here so the strip() below cannot fail on None.
        mode = CoreOptions.METADATA_STATS_MODE.default_value()
    # Called for validation only: raises on an unsupported value.
    CoreOptions.parse_metadata_stats_mode(mode)
    return mode.strip()

@staticmethod
def parse_metadata_stats_mode(mode: str):
    """Parse a ``metadata.stats-mode`` value into ``(name, length)``.

    Matching ignores case and surrounding whitespace. ``None`` is replaced
    by the default of ``CoreOptions.METADATA_STATS_MODE``.

    Returns:
        ``("NONE" | "COUNTS" | "FULL", None)`` for the plain modes, or
        ``("TRUNCATE", n)`` for ``truncate(n)`` where ``n`` is a positive
        integer written with ASCII digits only.

    Raises:
        ValueError: if the value is not one of the supported spellings, or
            the truncate length is not greater than zero.
    """
    raw = CoreOptions.METADATA_STATS_MODE.default_value() if mode is None else mode
    canonical = raw.strip().upper()

    if canonical == "NONE" or canonical == "COUNTS" or canonical == "FULL":
        return canonical, None

    prefix, suffix = "TRUNCATE(", ")"
    if not (canonical.startswith(prefix) and canonical.endswith(suffix)):
        raise ValueError(f"Unsupported metadata.stats-mode: {raw}")

    # Whatever sits between the parentheses must be a non-empty run of 0-9;
    # signs, spaces and decimal points are rejected.
    digits = canonical[len(prefix):-len(suffix)]
    if digits == "" or any(ch not in "0123456789" for ch in digits):
        raise ValueError(f"Unsupported metadata.stats-mode: {raw}")

    truncate_length = int(digits)
    if truncate_length <= 0:
        raise ValueError(f"Truncate length must be > 0, got: {raw}")
    return "TRUNCATE", truncate_length

def blob_as_descriptor(self, default=None):
    """Look up the ``BLOB_AS_DESCRIPTOR`` option in ``self.options``.

    ``default`` is forwarded unchanged to the options lookup.
    """
    lookup = self.options.get
    return lookup(CoreOptions.BLOB_AS_DESCRIPTOR, default)
Expand Down
9 changes: 9 additions & 0 deletions paimon-python/pypaimon/tests/predicates_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,15 @@ def test_is_null(self):
)
self.assertTrue(pred.test_by_simple_stats(stat_positive, 10))

def test_missing_minmax_keeps_file_for_value_predicate(self):
    """An ``equal`` predicate must pass stats whose min and max are both None."""
    # Single-column stats with no min/max bound recorded and zero nulls.
    no_bounds_stats = SimpleStats(
        min_values=GenericRow([None], []),
        max_values=GenericRow([None], []),
        null_counts=[0],
    )
    equals_target = Predicate(method="equal", index=0, field="c", literals=["target"])
    kept = equals_target.test_by_simple_stats(no_bounds_stats, 10)
    self.assertTrue(kept)

def test_filter_with_null_and_or(self):
p_gt = Predicate(method='greaterThan', index=1, field='score', literals=[10])
p_null = Predicate(method='isNull', index=1, field='score', literals=[])
Expand Down
34 changes: 11 additions & 23 deletions paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
"""
import logging
import time
import random
from datetime import date
from decimal import Decimal
from unittest.mock import Mock
Expand Down Expand Up @@ -144,9 +143,7 @@ def test_full_data_types(self):
('f10', pa.decimal128(10, 2)),
('f11', pa.date32()),
])
stats_enabled = random.random() < 0.5
options = {'metadata.stats-mode': 'full'} if stats_enabled else {}
schema = Schema.from_pyarrow_schema(simple_pa_schema, options=options)
schema = Schema.from_pyarrow_schema(simple_pa_schema)
self.rest_catalog.create_table('default.test_full_data_types', schema, False)
table = self.rest_catalog.get_table('default.test_full_data_types')

Expand Down Expand Up @@ -186,25 +183,16 @@ def test_full_data_types(self):
manifest_files[0].file_name,
lambda row: table_scan.file_scanner._filter_manifest_entry(row),
drop_stats=False)
# Python write does not produce value stats
if stats_enabled:
self.assertEqual(manifest_entries[0].file.value_stats_cols, None)
min_value_stats = GenericRowDeserializer.from_bytes(manifest_entries[0].file.value_stats.min_values.data,
table.fields).values
max_value_stats = GenericRowDeserializer.from_bytes(manifest_entries[0].file.value_stats.max_values.data,
table.fields).values
expected_min_values = [col[0].as_py() for col in expect_data]
expected_max_values = [col[1].as_py() for col in expect_data]
self.assertEqual(min_value_stats, expected_min_values)
self.assertEqual(max_value_stats, expected_max_values)
else:
self.assertEqual(manifest_entries[0].file.value_stats_cols, [])
min_value_stats = GenericRowDeserializer.from_bytes(manifest_entries[0].file.value_stats.min_values.data,
[]).values
max_value_stats = GenericRowDeserializer.from_bytes(manifest_entries[0].file.value_stats.max_values.data,
[]).values
self.assertEqual(min_value_stats, [])
self.assertEqual(max_value_stats, [])
# Both 'full' and default 'truncate(16)' modes produce value stats
self.assertEqual(manifest_entries[0].file.value_stats_cols, None)
min_value_stats = GenericRowDeserializer.from_bytes(manifest_entries[0].file.value_stats.min_values.data,
table.fields).values
max_value_stats = GenericRowDeserializer.from_bytes(manifest_entries[0].file.value_stats.max_values.data,
table.fields).values
expected_min_values = [col[0].as_py() for col in expect_data]
expected_max_values = [col[1].as_py() for col in expect_data]
self.assertEqual(min_value_stats, expected_min_values)
self.assertEqual(max_value_stats, expected_max_values)

def test_mixed_add_and_delete_entries_same_partition(self):
"""Test record_count calculation with mixed ADD/DELETE entries in same partition."""
Expand Down
Loading
Loading