From 486630b719921d09d2c72975e51539368d587f9e Mon Sep 17 00:00:00 2001
From: Chang chen <changchen@microsoft.com>
Date: Tue, 3 Mar 2026 09:07:14 +0000
Subject: [PATCH 1/4] Point Velox to PR2 branch with parquet-thrift fix

Replace OAP commit [15173][15343] (INT narrowing) with upstream Velox
PR #15173 (fix reading array of row) to fix parquet-thrift compatibility.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ep/build-velox/src/get-velox.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/ep/build-velox/src/get-velox.sh b/ep/build-velox/src/get-velox.sh
index 558f988e5c93..acca3d6e61b5 100755
--- a/ep/build-velox/src/get-velox.sh
+++ b/ep/build-velox/src/get-velox.sh
@@ -17,8 +17,8 @@
 set -exu
 
 CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
-VELOX_REPO=https://github.com/IBM/velox.git
-VELOX_BRANCH=dft-2026_03_08-iceberg
+VELOX_REPO=https://github.com/baibaichen/velox.git
+VELOX_BRANCH=pr2/fix-parquet-thrift-spark18108-new
 VELOX_ENHANCED_BRANCH=ibm-2026_03_08
 VELOX_HOME=""
 RUN_SETUP_SCRIPT=ON

From eafcc0cd4672dd23b845b3c79e590070e8162875 Mon Sep 17 00:00:00 2001
From: Chang chen <changchen@microsoft.com>
Date: Tue, 3 Mar 2026 07:39:55 +0000
Subject: [PATCH 2/4] Fix SPARK-18108: exclude partition columns from
 HiveTableHandle dataColumns

When Gluten created a HiveTableHandle, it passed all columns (including
partition columns) as dataColumns. This caused Velox's convertType() to
validate partition column types against the Parquet file's physical types,
failing when they differed (e.g., LongType in the file vs. IntegerType from
partition inference).

Fix: build dataColumns excluding partition columns (ColumnType::kPartitionKey).
Partition column values come from the partition path, not from the file.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 cpp/velox/substrait/SubstraitToVeloxPlan.cc | 27 ++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc b/cpp/velox/substrait/SubstraitToVeloxPlan.cc
index 834127e20cc1..f754f4ae0bbf 100644
--- a/cpp/velox/substrait/SubstraitToVeloxPlan.cc
+++ b/cpp/velox/substrait/SubstraitToVeloxPlan.cc
@@ -1495,6 +1495,31 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
   // The columns present in the table, if not available default to the baseSchema.
   auto tableSchema = splitInfo->tableSchema ? splitInfo->tableSchema : baseSchema;
 
+  // Build dataColumns from tableSchema, excluding partition columns.
+  // HiveTableHandle::dataColumns() is used as fileSchema for the reader.
+  // Partition columns should not be validated against the file's physical types
+  // (their values come from the partition path, not from the file).
+  std::unordered_set<std::string> partitionColNames;
+  for (int idx = 0; idx < colNameList.size(); idx++) {
+    if (columnTypes[idx] == ColumnType::kPartitionKey) {
+      partitionColNames.insert(colNameList[idx]);
+    }
+  }
+  RowTypePtr dataColumns;
+  if (partitionColNames.empty()) {
+    dataColumns = tableSchema;
+  } else {
+    std::vector<std::string> dataColNames;
+    std::vector<TypePtr> dataColTypes;
+    for (int idx = 0; idx < tableSchema->size(); idx++) {
+      if (partitionColNames.find(tableSchema->nameOf(idx)) == partitionColNames.end()) {
+        dataColNames.push_back(tableSchema->nameOf(idx));
+        dataColTypes.push_back(tableSchema->childAt(idx));
+      }
+    }
+    dataColumns = ROW(std::move(dataColNames), std::move(dataColTypes));
+  }
+
   connector::ConnectorTableHandlePtr tableHandle;
   auto remainingFilter = readRel.has_filter() ? exprConverter_->toVeloxExpr(readRel.filter(), baseSchema) : nullptr;
   auto connectorId = kHiveConnectorId;
@@ -1506,7 +1531,7 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
   }
   common::SubfieldFilters subfieldFilters;
   tableHandle = std::make_shared<connector::hive::HiveTableHandle>(
-      connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, tableSchema);
+      connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, dataColumns);
 
   // Get assignments and out names.
   std::vector<std::string> outNames;

From 95027b708b4132860e0e10c161b9bf25c02c6b57 Mon Sep 17 00:00:00 2001
From: Chang chen <changchen@microsoft.com>
Date: Tue, 3 Mar 2026 11:14:13 +0000
Subject: [PATCH 3/4] Update VeloxTestSettings for Velox PR2

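With the OAP INT narrowing commit replaced by upstream Velox PR #15173:
- Remove 2 excludes that now pass:
  "unsupported parquet conversion LongType -> IntegerType" and
  "unsupported parquet conversion LongType -> DateType".
- Add 2 excludes for new failures (OAP commit removed):
  "parquet widening conversion IntegerType -> ShortType" and
  "parquet decimal type change IntegerType -> ShortType overflows".

Excluded tests: 63 (net unchanged: -2 +2). Test results: 21 pass / 63 ignored.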

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala     | 4 ++--
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 4f7c67daaad6..828e22eb88c7 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -363,11 +363,9 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)")
     .exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)")
     .exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)")
-    .exclude("unsupported parquet conversion LongType -> DateType")
     .exclude("unsupported parquet conversion LongType -> DecimalType(10,0)")
     .exclude("unsupported parquet conversion LongType -> DecimalType(19,0)")
     .exclude("unsupported parquet conversion LongType -> DecimalType(20,1)")
-    .exclude("unsupported parquet conversion LongType -> IntegerType")
     .exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)")
     .exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)")
     .exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)")
@@ -379,6 +377,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("parquet widening conversion IntegerType -> DecimalType(20,0)")
     .exclude("parquet widening conversion IntegerType -> DecimalType(38,0)")
     .exclude("parquet widening conversion IntegerType -> DoubleType")
+    .exclude("parquet widening conversion IntegerType -> ShortType")
     .exclude("parquet widening conversion LongType -> DecimalType(20,0)")
     .exclude("parquet widening conversion LongType -> DecimalType(21,1)")
     .exclude("parquet widening conversion LongType -> DecimalType(38,0)")
@@ -386,6 +385,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("parquet widening conversion ShortType -> DecimalType(20,0)")
     .exclude("parquet widening conversion ShortType -> DecimalType(38,0)")
     .exclude("parquet widening conversion ShortType -> DoubleType")
+    .exclude("parquet decimal type change IntegerType -> ShortType overflows")
   enableSuite[GlutenParquetVariantShreddingSuite]
   // Generated suites for org.apache.spark.sql.execution.datasources.text
   // TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite]  // 1 failure
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 0dadfa1d0bd8..7fc3704a6271 100644
--- a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -374,11 +374,9 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)")
     .exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)")
     .exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)")
-    .exclude("unsupported parquet conversion LongType -> DateType")
     .exclude("unsupported parquet conversion LongType -> DecimalType(10,0)")
     .exclude("unsupported parquet conversion LongType -> DecimalType(19,0)")
     .exclude("unsupported parquet conversion LongType -> DecimalType(20,1)")
-    .exclude("unsupported parquet conversion LongType -> IntegerType")
     .exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)")
     .exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)")
     .exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)")
@@ -390,6 +388,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("parquet widening conversion IntegerType -> DecimalType(20,0)")
     .exclude("parquet widening conversion IntegerType -> DecimalType(38,0)")
     .exclude("parquet widening conversion IntegerType -> DoubleType")
+    .exclude("parquet widening conversion IntegerType -> ShortType")
     .exclude("parquet widening conversion LongType -> DecimalType(20,0)")
     .exclude("parquet widening conversion LongType -> DecimalType(21,1)")
     .exclude("parquet widening conversion LongType -> DecimalType(38,0)")
@@ -397,6 +396,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("parquet widening conversion ShortType -> DecimalType(20,0)")
     .exclude("parquet widening conversion ShortType -> DecimalType(38,0)")
     .exclude("parquet widening conversion ShortType -> DoubleType")
+    .exclude("parquet decimal type change IntegerType -> ShortType overflows")
   // TODO: 4.x enableSuite[GlutenParquetVariantShreddingSuite]  // 1 failure
   // Generated suites for org.apache.spark.sql.execution.datasources.text
   // TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite]  // 1 failure

From 4bef8b1d2f2bab000f4db7195206fd19b4109114 Mon Sep 17 00:00:00 2001
From: Chang chen <changchen@microsoft.com>
Date: Tue, 10 Mar 2026 06:09:54 +0000
Subject: [PATCH 4/4] Exclude SPARK-16632 and SPARK-34817 tests pending PR3
 widening fix

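These tests regress after skipping OAP commit 8c2bd0849 (Allow reading
integers into smaller-range types). They will be re-enabled in PR3 when
the Velox widening commits are applied.

Also switch VELOX_BRANCH in get-velox.sh from
pr2/fix-parquet-thrift-spark18108-new to pr2/fix-parquet-thrift-spark18108.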

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ep/build-velox/src/get-velox.sh                                | 2 +-
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala      | 3 +++
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala      | 3 +++
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala      | 3 +++
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala      | 3 +++
 .../org/apache/gluten/utils/velox/VeloxTestSettings.scala      | 3 +++
 6 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/ep/build-velox/src/get-velox.sh b/ep/build-velox/src/get-velox.sh
index acca3d6e61b5..bed30a716a8d 100755
--- a/ep/build-velox/src/get-velox.sh
+++ b/ep/build-velox/src/get-velox.sh
@@ -18,7 +18,7 @@ set -exu
 
 CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
 VELOX_REPO=https://github.com/baibaichen/velox.git
-VELOX_BRANCH=pr2/fix-parquet-thrift-spark18108-new
+VELOX_BRANCH=pr2/fix-parquet-thrift-spark18108
 VELOX_ENHANCED_BRANCH=ibm-2026_03_08
 VELOX_HOME=""
 RUN_SETUP_SCRIPT=ON
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 4a850690d336..b8613fc3e94f 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -449,6 +449,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
     // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
+    .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
   enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -463,6 +464,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV2QuerySuite]
     // Unsupport spark.sql.files.ignoreCorruptFiles.
     .exclude("Enabling/disabling ignoreCorruptFiles")
@@ -471,6 +473,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV1SchemaPruningSuite]
   enableSuite[GlutenParquetV2SchemaPruningSuite]
   enableSuite[GlutenParquetRebaseDatetimeV1Suite]
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 87d085ec510d..fddb01b27f53 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -453,6 +453,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
     // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
+    .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
   enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -468,6 +469,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV2QuerySuite]
     .exclude("row group skipping doesn't overflow when reading into larger type")
     // Unsupport spark.sql.files.ignoreCorruptFiles.
@@ -477,6 +479,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV1SchemaPruningSuite]
   enableSuite[GlutenParquetV2SchemaPruningSuite]
   enableSuite[GlutenParquetRebaseDatetimeV1Suite]
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 1207121da708..2be2882776dc 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -396,6 +396,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-35640: int as long should throw schema incompatible error")
     // Velox parquet reader not allow offset zero.
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
+    .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
   enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -411,6 +412,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV2QuerySuite]
     .exclude("row group skipping doesn't overflow when reading into larger type")
     // Unsupport spark.sql.files.ignoreCorruptFiles.
@@ -420,6 +422,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV1SchemaPruningSuite]
   enableSuite[GlutenParquetV2SchemaPruningSuite]
   enableSuite[GlutenParquetRebaseDatetimeV1Suite]
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 828e22eb88c7..70c0827e4407 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -578,6 +578,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
     // TODO: fix in Spark-4.0
     .exclude("explode nested lists crossing a rowgroup boundary")
+    .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
   enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -593,6 +594,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV2QuerySuite]
     .exclude("row group skipping doesn't overflow when reading into larger type")
     // Unsupport spark.sql.files.ignoreCorruptFiles.
@@ -602,6 +604,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV1SchemaPruningSuite]
   enableSuite[GlutenParquetV2SchemaPruningSuite]
   enableSuite[GlutenParquetRebaseDatetimeV1Suite]
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 7fc3704a6271..079cb66fc964 100644
--- a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -539,6 +539,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .excludeByPrefix("SPARK-53535") // see https://issues.apache.org/jira/browse/SPARK-53535
     .excludeByPrefix("vectorized reader: missing all struct fields")
     .excludeByPrefix("SPARK-54220") // https://issues.apache.org/jira/browse/SPARK-54220
+    .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
   enableSuite[GlutenParquetV1PartitionDiscoverySuite]
   enableSuite[GlutenParquetV2PartitionDiscoverySuite]
   enableSuite[GlutenParquetProtobufCompatibilitySuite]
@@ -554,6 +555,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV2QuerySuite]
     .exclude("row group skipping doesn't overflow when reading into larger type")
     // Unsupport spark.sql.files.ignoreCorruptFiles.
@@ -563,6 +565,7 @@ class VeloxTestSettings extends BackendTestSettings {
     // Rewrite because the filter after datasource is not needed.
     .exclude(
       "SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
+    .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
   enableSuite[GlutenParquetV1SchemaPruningSuite]
   enableSuite[GlutenParquetV2SchemaPruningSuite]
   enableSuite[GlutenParquetRebaseDatetimeV1Suite]