11from collections .abc import Callable
22from dataclasses import dataclass
33from typing import Any , Generator
4+ from collections .abc import Container
45import duckdb
56import pyarrow as pa
67import pyarrow .compute as pc
@@ -16,33 +17,36 @@ class BaseFieldInfo:
1617 """
1718
1819 has_nulls : bool
20+ """Whether the field has null values in this file."""
21+
1922 has_non_nulls : bool
23+ """Whether the field has non-null values in this file."""
2024
2125
2226@dataclass
2327class RangeFieldInfo (BaseFieldInfo ):
2428 """
2529 Information about a field that has a min and max value.
30+ This is used for range-based filtering in scan planning.
2631 """
2732
28- min_value : pa .Scalar
29- max_value : pa .Scalar
33+ min_value : pa .Scalar | None
34+ """Minimum value in the field, can be None if the field is empty."""
35+
36+ max_value : pa .Scalar | None
37+ """Maximum value in the field, can be None if the field is empty."""
3038
3139
3240@dataclass
3341class SetFieldInfo (BaseFieldInfo ):
3442 """
3543 Information about a field where the set of values are known.
36- The information about what values that are contained can produce
37- false positives.
3844 """
3945
40- values : set [
46+ values : Container [
4147 pa .Scalar
4248 ] # Values known to be present in the field; false positives are okay.
43-
44-
45- AnyFieldInfo = SetFieldInfo | RangeFieldInfo
49+ """A container of values that are known to be present in the field in this file."""
4650
4751
4852def _scalar_value_op (
@@ -101,9 +105,7 @@ def _sv_eq(a: pa.Scalar, b: pa.Scalar) -> bool:
101105 return _scalar_value_op (a , b , lambda x , y : x == y )
102106
103107
104- FileFieldInfo = dict [str , AnyFieldInfo ]
105-
106- # When bailing out we should know why we bailed out if we couldn't evaluate the expression.
108+ FileFieldInfo = dict [str , SetFieldInfo | RangeFieldInfo ]
107109
108110
109111class Planner :
@@ -113,13 +115,12 @@ class Planner:
113115
114116 def __init__ (self , files : list [tuple [str , FileFieldInfo ]]):
115117 """
116- Initialize with list of (filename, min_value, max_value) tuples.
117-
118- Args:
119- file_ranges: List of tuples containing (filename, min_val, max_val)
118+ Initialize with a list of (filename, FileFieldInfo) tuples.
120119 """
121- self .files = files
122- self .connection = duckdb .connect (":memory:" )
120+ self ._files = files
121+ """The list of files with their field information."""
122+ self ._connection = duckdb .connect (":memory:" )
123+ """DuckDB connection for evaluating scalar values."""
123124
124125 def _eval_predicate (
125126 self ,
@@ -162,7 +163,7 @@ def _eval_predicate(
162163
163164 # The thing on the right side should be something that can be evaluated against a range.
164165 # Ideally, it is a constant expression that evaluates to a single value (it is evaluated with DuckDB below).
165- value_result = self .connection .execute (
166+ value_result = self ._connection .execute (
166167 f"select { node .right .sql ('duckdb' )} "
167168 ).arrow ()
168169 assert value_result .num_rows == 1 , (
@@ -497,31 +498,29 @@ def _evaluate_sql_node(
497498 f"Supported types: Connector, Predicate, Not, Boolean, Case, Null"
498499 )
499500
500- def get_matching_files (
501- self , exp : sqlglot .expressions .Expression | str , * , dialect : str = "duckdb"
501+ def files (
502+ self ,
503+ expression : sqlglot .expressions .Expression | str ,
504+ * ,
505+ dialect : str = "duckdb" ,
502506 ) -> Generator [str , None , None ]:
503507 """
504508 Yield the names of the files that may match the given SQL expression.
505- Args:
506- expression: The SQL expression to evaluate.
507- dialect: The SQL dialect to use for parsing the expression.
508- Returns:
509- A set of filenames that match the expression.
510509 """
511- if isinstance (exp , str ):
510+ if isinstance (expression , str ):
512511 # Parse the expression if it is a string.
513- expression = sqlglot .parse_one (exp , dialect = dialect )
512+ exp = sqlglot .parse_one (expression , dialect = dialect )
514513 else :
515- expression = exp
514+ exp = expression
516515
517- if not isinstance (expression , sqlglot .expressions .Expression ):
518- raise ValueError (f"Expected a sqlglot expression, got { type (expression )} " )
516+ if not isinstance (exp , sqlglot .expressions .Expression ):
517+ raise ValueError (f"Expected a sqlglot expression, got { type (exp )} " )
519518
520519 # Simplify the parsed expression, move all of the literals to the right side
521- expression = sqlglot .optimizer .optimize (expression )
520+ exp = sqlglot .optimizer .optimize (exp )
522521
523- for filename , file_info in self .files :
524- eval_result = self ._evaluate_sql_node (expression , file_info )
522+ for filename , file_info in self ._files :
523+ eval_result = self ._evaluate_sql_node (exp , file_info )
525524 if eval_result is None or eval_result is True :
526525 # If the expression evaluates to True or cannot be evaluated, add the file
527526 # to the result set since the caller will be able to filter the rows further.
0 commit comments