diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc b/cpp/velox/substrait/SubstraitToVeloxPlan.cc index 834127e20cc1..f754f4ae0bbf 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlan.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlan.cc @@ -1495,6 +1495,31 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: // The columns present in the table, if not available default to the baseSchema. auto tableSchema = splitInfo->tableSchema ? splitInfo->tableSchema : baseSchema; + // Build dataColumns from tableSchema, excluding partition columns. + // HiveTableHandle::dataColumns() is used as fileSchema for the reader. + // Partition columns should not be validated against the file's physical types + // (their values come from the partition path, not from the file). + std::unordered_set<std::string> partitionColNames; + for (int idx = 0; idx < colNameList.size(); idx++) { + if (columnTypes[idx] == ColumnType::kPartitionKey) { + partitionColNames.insert(colNameList[idx]); + } + } + RowTypePtr dataColumns; + if (partitionColNames.empty()) { + dataColumns = tableSchema; + } else { + std::vector<std::string> dataColNames; + std::vector<TypePtr> dataColTypes; + for (int idx = 0; idx < tableSchema->size(); idx++) { + if (partitionColNames.find(tableSchema->nameOf(idx)) == partitionColNames.end()) { + dataColNames.push_back(tableSchema->nameOf(idx)); + dataColTypes.push_back(tableSchema->childAt(idx)); + } + } + dataColumns = ROW(std::move(dataColNames), std::move(dataColTypes)); + } + connector::ConnectorTableHandlePtr tableHandle; auto remainingFilter = readRel.has_filter() ?
exprConverter_->toVeloxExpr(readRel.filter(), baseSchema) : nullptr; auto connectorId = kHiveConnectorId; @@ -1506,7 +1531,7 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: } common::SubfieldFilters subfieldFilters; tableHandle = std::make_shared( - connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, tableSchema); + connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, dataColumns); // Get assignments and out names. std::vector outNames; diff --git a/ep/build-velox/src/get-velox.sh b/ep/build-velox/src/get-velox.sh index 558f988e5c93..bed30a716a8d 100755 --- a/ep/build-velox/src/get-velox.sh +++ b/ep/build-velox/src/get-velox.sh @@ -17,8 +17,8 @@ set -exu CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) -VELOX_REPO=https://github.com/IBM/velox.git -VELOX_BRANCH=dft-2026_03_08-iceberg +VELOX_REPO=https://github.com/baibaichen/velox.git +VELOX_BRANCH=pr2/fix-parquet-thrift-spark18108 VELOX_ENHANCED_BRANCH=ibm-2026_03_08 VELOX_HOME="" RUN_SETUP_SCRIPT=ON diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 4a850690d336..b8613fc3e94f 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -449,6 +449,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-35640: int as long should throw schema incompatible error") // Velox parquet reader not allow offset zero. 
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings") + .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet") enableSuite[GlutenParquetV1PartitionDiscoverySuite] enableSuite[GlutenParquetV2PartitionDiscoverySuite] enableSuite[GlutenParquetProtobufCompatibilitySuite] @@ -463,6 +464,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV2QuerySuite] // Unsupport spark.sql.files.ignoreCorruptFiles. .exclude("Enabling/disabling ignoreCorruptFiles") @@ -471,6 +473,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV1SchemaPruningSuite] enableSuite[GlutenParquetV2SchemaPruningSuite] enableSuite[GlutenParquetRebaseDatetimeV1Suite] diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 87d085ec510d..fddb01b27f53 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -453,6 +453,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-35640: int as long should throw schema incompatible error") // Velox parquet reader not allow offset zero. 
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings") + .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet") enableSuite[GlutenParquetV1PartitionDiscoverySuite] enableSuite[GlutenParquetV2PartitionDiscoverySuite] enableSuite[GlutenParquetProtobufCompatibilitySuite] @@ -468,6 +469,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV2QuerySuite] .exclude("row group skipping doesn't overflow when reading into larger type") // Unsupport spark.sql.files.ignoreCorruptFiles. @@ -477,6 +479,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV1SchemaPruningSuite] enableSuite[GlutenParquetV2SchemaPruningSuite] enableSuite[GlutenParquetRebaseDatetimeV1Suite] diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 1207121da708..2be2882776dc 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -396,6 +396,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-35640: int as long should throw schema incompatible error") // Velox parquet reader not allow offset zero. 
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings") + .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet") enableSuite[GlutenParquetV1PartitionDiscoverySuite] enableSuite[GlutenParquetV2PartitionDiscoverySuite] enableSuite[GlutenParquetProtobufCompatibilitySuite] @@ -411,6 +412,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV2QuerySuite] .exclude("row group skipping doesn't overflow when reading into larger type") // Unsupport spark.sql.files.ignoreCorruptFiles. @@ -420,6 +422,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV1SchemaPruningSuite] enableSuite[GlutenParquetV2SchemaPruningSuite] enableSuite[GlutenParquetRebaseDatetimeV1Suite] diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 4f7c67daaad6..70c0827e4407 100644 --- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -363,11 +363,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)") .exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)") .exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)") - .exclude("unsupported parquet conversion LongType -> DateType") 
.exclude("unsupported parquet conversion LongType -> DecimalType(10,0)") .exclude("unsupported parquet conversion LongType -> DecimalType(19,0)") .exclude("unsupported parquet conversion LongType -> DecimalType(20,1)") - .exclude("unsupported parquet conversion LongType -> IntegerType") .exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)") .exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)") .exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)") @@ -379,6 +377,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("parquet widening conversion IntegerType -> DecimalType(20,0)") .exclude("parquet widening conversion IntegerType -> DecimalType(38,0)") .exclude("parquet widening conversion IntegerType -> DoubleType") + .exclude("parquet widening conversion IntegerType -> ShortType") .exclude("parquet widening conversion LongType -> DecimalType(20,0)") .exclude("parquet widening conversion LongType -> DecimalType(21,1)") .exclude("parquet widening conversion LongType -> DecimalType(38,0)") @@ -386,6 +385,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("parquet widening conversion ShortType -> DecimalType(20,0)") .exclude("parquet widening conversion ShortType -> DecimalType(38,0)") .exclude("parquet widening conversion ShortType -> DoubleType") + .exclude("parquet decimal type change IntegerType -> ShortType overflows") enableSuite[GlutenParquetVariantShreddingSuite] // Generated suites for org.apache.spark.sql.execution.datasources.text // TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite] // 1 failure @@ -578,6 +578,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings") // TODO: fix in Spark-4.0 .exclude("explode nested lists crossing a rowgroup boundary") + .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet") enableSuite[GlutenParquetV1PartitionDiscoverySuite] 
enableSuite[GlutenParquetV2PartitionDiscoverySuite] enableSuite[GlutenParquetProtobufCompatibilitySuite] @@ -593,6 +594,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV2QuerySuite] .exclude("row group skipping doesn't overflow when reading into larger type") // Unsupport spark.sql.files.ignoreCorruptFiles. @@ -602,6 +604,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV1SchemaPruningSuite] enableSuite[GlutenParquetV2SchemaPruningSuite] enableSuite[GlutenParquetRebaseDatetimeV1Suite] diff --git a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 0dadfa1d0bd8..079cb66fc964 100644 --- a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -374,11 +374,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)") .exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)") .exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)") - .exclude("unsupported parquet conversion LongType -> DateType") .exclude("unsupported parquet conversion LongType -> DecimalType(10,0)") .exclude("unsupported parquet conversion LongType -> DecimalType(19,0)") .exclude("unsupported parquet conversion 
LongType -> DecimalType(20,1)") - .exclude("unsupported parquet conversion LongType -> IntegerType") .exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)") .exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)") .exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)") @@ -390,6 +388,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("parquet widening conversion IntegerType -> DecimalType(20,0)") .exclude("parquet widening conversion IntegerType -> DecimalType(38,0)") .exclude("parquet widening conversion IntegerType -> DoubleType") + .exclude("parquet widening conversion IntegerType -> ShortType") .exclude("parquet widening conversion LongType -> DecimalType(20,0)") .exclude("parquet widening conversion LongType -> DecimalType(21,1)") .exclude("parquet widening conversion LongType -> DecimalType(38,0)") @@ -397,6 +396,7 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("parquet widening conversion ShortType -> DecimalType(20,0)") .exclude("parquet widening conversion ShortType -> DecimalType(38,0)") .exclude("parquet widening conversion ShortType -> DoubleType") + .exclude("parquet decimal type change IntegerType -> ShortType overflows") // TODO: 4.x enableSuite[GlutenParquetVariantShreddingSuite] // 1 failure // Generated suites for org.apache.spark.sql.execution.datasources.text // TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite] // 1 failure @@ -539,6 +539,7 @@ class VeloxTestSettings extends BackendTestSettings { .excludeByPrefix("SPARK-53535") // see https://issues.apache.org/jira/browse/SPARK-53535 .excludeByPrefix("vectorized reader: missing all struct fields") .excludeByPrefix("SPARK-54220") // https://issues.apache.org/jira/browse/SPARK-54220 + .exclude("SPARK-34817: Read UINT_64 as Decimal from parquet") enableSuite[GlutenParquetV1PartitionDiscoverySuite] enableSuite[GlutenParquetV2PartitionDiscoverySuite] enableSuite[GlutenParquetProtobufCompatibilitySuite] @@ -554,6 
+555,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV2QuerySuite] .exclude("row group skipping doesn't overflow when reading into larger type") // Unsupport spark.sql.files.ignoreCorruptFiles. @@ -563,6 +565,7 @@ class VeloxTestSettings extends BackendTestSettings { // Rewrite because the filter after datasource is not needed. .exclude( "SPARK-26677: negated null-safe equality comparison should not filter matched row groups") + .exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType") enableSuite[GlutenParquetV1SchemaPruningSuite] enableSuite[GlutenParquetV2SchemaPruningSuite] enableSuite[GlutenParquetRebaseDatetimeV1Suite]