apache · Indhumathi27 · Oct 3, 2025 · deniskuzZ · Oct 6, 2025 · Indhumathi27
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -4501,6 +4501,12 @@ public static ImmutablePair<VectorAggregationDesc,String> getVectorAggregationDe
       vecAggrClasses = new Class[] {
           VectorUDAFComputeDsKllSketchDouble.class, VectorUDAFComputeDsKllSketchFinal.class
       };
+    } else if (VECTORIZABLE_UDAF.COUNT.toString().equalsIgnoreCase(aggregationName) && parameterList.size() > 1) {
+      // Reject COUNT with more than one argument (multi-argument COUNT DISTINCT), constants included
+      String issue = "Unsupported COUNT DISTINCT with multiple columns: "
+              + aggregationName + "(" + parameterList + "). "
+              + "Hive only supports COUNT(DISTINCT col) in vectorized execution. ";
+      return new ImmutablePair<>(null, issue);
     } else {
       VectorizedUDAFs annotation =
           AnnotationUtils.getAnnotation(evaluator.getClass(), VectorizedUDAFs.class);

diff --git a/ql/src/test/queries/clientpositive/vector_count_distinct_multiarg.q b/ql/src/test/queries/clientpositive/vector_count_distinct_multiarg.q
@@ -0,0 +1,35 @@
+drop table if exists test_vector;
+create external table test_vector(id string, pid bigint) PARTITIONED BY (full_date int);
+insert into test_vector (pid, full_date, id) values (1, '20240305', '6150');
+
+--------------------------------------------------------------------------------
+-- 1. Basic COUNT cases (valid in vectorization)
+--------------------------------------------------------------------------------
+SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const, COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
+FROM test_vector WHERE full_date=20240305;
+EXPLAIN VECTORIZATION EXPRESSION
+SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const,COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
+FROM test_vector WHERE full_date=20240305;
+
+--------------------------------------------------------------------------------
+-- 2. COUNT(DISTINCT col, constant): more than one argument, so not vectorized
+--------------------------------------------------------------------------------
+SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;
+EXPLAIN VECTORIZATION EXPRESSION
+SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;
+
+--------------------------------------------------------------------------------
+-- 3. COUNT(DISTINCT pid, full_date): multi-column DISTINCT, not vectorized (query is expected to succeed)
+--------------------------------------------------------------------------------
+SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;
+EXPLAIN VECTORIZATION EXPRESSION
+SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;
+
+--------------------------------------------------------------------------------
+-- 4. COUNT(DISTINCT pid, full_date, id): multi-column DISTINCT, not vectorized (query is expected to succeed)
+--------------------------------------------------------------------------------
+SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;
 throw new UDFArgumentException("DISTINCT keyword must be specified"); 
 throw new UDFArgumentException("DISTINCT keyword must be specified"); 
+EXPLAIN VECTORIZATION EXPRESSION
+SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;
+
+DROP TABLE test_vector;
diff --git a/ql/src/test/results/clientpositive/llap/vector_count.q.out b/ql/src/test/results/clientpositive/llap/vector_count.q.out
@@ -212,7 +212,7 @@ STAGE PLANS:
                 enabled: true
                 enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
                 inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
-                notVectorizedReason: GROUPBY operator: Aggregations with > 1 parameter are not supported unless all the extra parameters are constants count([Column[a], Column[b]])
+                notVectorizedReason: GROUPBY operator: Unsupported COUNT DISTINCT with multiple columns: count([Column[a], Column[b]]). Hive only supports COUNT(DISTINCT col) in vectorized execution. 
                 vectorized: false
         Reducer 2 
             Execution mode: llap