Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4501,6 +4501,12 @@ public static ImmutablePair<VectorAggregationDesc,String> getVectorAggregationDe
vecAggrClasses = new Class[] {
VectorUDAFComputeDsKllSketchDouble.class, VectorUDAFComputeDsKllSketchFinal.class
};
} else if (VECTORIZABLE_UDAF.COUNT.toString().equalsIgnoreCase(aggregationName) && parameterList.size() > 1) {
// Handle unsupported multi-column COUNT DISTINCT
String issue = "Unsupported COUNT DISTINCT with multiple columns: "
+ aggregationName + "(" + parameterList + "). "
+ "Hive only supports COUNT(DISTINCT col) in vectorized execution. ";
return new ImmutablePair<>(null, issue);
} else {
VectorizedUDAFs annotation =
AnnotationUtils.getAnnotation(evaluator.getClass(), VectorizedUDAFs.class);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- Exercises COUNT and COUNT(DISTINCT ...) against a table partitioned by
-- full_date, with every query filtering on that partition column.
-- Each case runs the query once for its result and then repeats it under
-- EXPLAIN VECTORIZATION EXPRESSION to expose the vectorization details.
drop table if exists test_vector;
create external table test_vector(id string, pid bigint) PARTITIONED BY (full_date int);
-- NOTE(review): full_date is declared int but the value is supplied as a quoted
-- string literal, so this relies on implicit conversion. Confirm it is intentional.
insert into test_vector (pid, full_date, id) values (1, '20240305', '6150');

--------------------------------------------------------------------------------
-- 1. Basic COUNT cases (valid in vectorization)
--    Every COUNT here takes a single argument: a column, *, or a constant.
--------------------------------------------------------------------------------
SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const, COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
FROM test_vector WHERE full_date=20240305;
EXPLAIN VECTORIZATION EXPRESSION
SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const,COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
FROM test_vector WHERE full_date=20240305;

--------------------------------------------------------------------------------
-- 2. COUNT with DISTINCT column + constant (INVALID in vectorization)
--    COUNT receives two arguments: the pid column and a literal.
--    NOTE(review): presumably the plan should report a notVectorizedReason
--    for this case. TODO confirm against the golden output.
--------------------------------------------------------------------------------
SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;
EXPLAIN VECTORIZATION EXPRESSION
SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;

--------------------------------------------------------------------------------
-- 3. COUNT(DISTINCT pid, full_date) (multi-col distinct → FAIL)
--    Second argument is the partition column used in the WHERE filter.
--    NOTE(review): FAIL presumably means the plan is not vectorized, not that
--    the query itself errors. Confirm against the golden output.
--------------------------------------------------------------------------------
SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;
EXPLAIN VECTORIZATION EXPRESSION
SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;

--------------------------------------------------------------------------------
-- 4. COUNT(DISTINCT pid, full_date, id) (multi-col distinct → FAIL)
--    Three arguments: a bigint column, the int partition column, and the
--    string column id.
--    NOTE(review): FAIL presumably means the plan is not vectorized, not that
--    the query itself errors. Confirm against the golden output.
--------------------------------------------------------------------------------
SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting that it works for you — I’m getting an exception unless I wrap the distinct columns in parentheses.

 org.apache.hadoop.hive.ql.exec.UDFArgumentException: DISTINCT keyword must be specified
	at org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount.getEvaluator(GenericUDAFCount.java:73)

Copy link
Contributor Author

@Indhumathi27 Indhumathi27 Oct 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The COUNT UDAF expects DISTINCT to be specified when there is more than one parameter.

throw new UDFArgumentException("DISTINCT keyword must be specified");

Copy link
Member

@deniskuzZ deniskuzZ Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try it

SET hive.vectorized.execution.enabled=true;

create external table test_vector(id string, pid bigint, full_date int);
insert into test_vector (pid, full_date, id) values (1, '20240305', '6150');

SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;

exception

 org.apache.hadoop.hive.ql.exec.UDFArgumentException: DISTINCT keyword must be specified
	at org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount.getEvaluator(GenericUDAFCount.java:73)

-- Vectorization details for the three-argument COUNT(DISTINCT ...) query.
EXPLAIN VECTORIZATION EXPRESSION
SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;

-- Remove the test table so later tests start from a clean state.
DROP TABLE test_vector;
2 changes: 1 addition & 1 deletion ql/src/test/results/clientpositive/llap/vector_count.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ STAGE PLANS:
enabled: true
enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
notVectorizedReason: GROUPBY operator: Aggregations with > 1 parameter are not supported unless all the extra parameters are constants count([Column[a], Column[b]])
notVectorizedReason: GROUPBY operator: Unsupported COUNT DISTINCT with multiple columns: count([Column[a], Column[b]]). Hive only supports COUNT(DISTINCT col) in vectorized execution.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was the original message not good enough?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Before, it covered only some cases, like count(distinct col1, col2), but not cases like count(distinct col1, constant), count(distinct col1, col2, constant), etc.

Copy link
Member

@deniskuzZ deniskuzZ Oct 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before, we supported multi-column aggregations with constant expressions, and now we don't? At least that's what the message was saying:

Aggregations with > 1 parameter are not supported unless all the extra parameters are constants count([Column[a], Column[b]])

I don't get why we are changing the message. If the issue was related to a filter on a partition column, it shouldn't change the behavior for non-partitioned tables.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

create external table test_vector(id string, pid bigint, full_date int);
insert into test_vector (pid, full_date, id) values (1, '20240305', '6150');

EXPLAIN VECTORIZATION EXPRESSION
SELECT COUNT(DISTINCT(pid, full_date)) AS CNT FROM test_vector WHERE full_date=20240305;

vectorized: true

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@deniskuzZ The message was not changed for other cases. I added a new message for the COUNT UDF with more than one parameter. Now both partitioned and non-partitioned tables will have the same behavior.

Copy link
Member

@deniskuzZ deniskuzZ Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SELECT COUNT(DISTINCT(pid, full_date)) AS CNT FROM test_vector WHERE full_date=20240305;

This works fine with a partitioned table as well.
I am not sure Hive properly handles DISTINCT with missing parentheses inside COUNT.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the expectation that count(distinct pid, full_date) == count(distinct(pid, full_date))?

vectorized: false
Reducer 2
Execution mode: llap
Expand Down
Loading