Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 166 additions & 12 deletions src/backend/commands/analyze.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include "utils/sortsupport.h"
#include "utils/syscache.h"
#include "utils/timestamp.h"
#include "utils/typcache.h"


/* Per-index data for ANALYZE */
Expand Down Expand Up @@ -1888,6 +1889,70 @@ static int analyze_mcv_list(int *mcv_counts,
int samplerows,
double totalrows);

/*
 * Minimum size of the track[] array (i.e., statistics target derived
 * track_max) at which compute_distinct_stats() switches from linear
 * search of track[] to a hash table keyed by sample value.
 */
#define ANALYZE_HASH_THRESHOLD 200

/* One tracked sample value, as stored in the DistinctHash table. */
typedef struct DistinctHashEntry
{
	Datum		value;			/* the tracked datum; also the hash key */
	int			index;			/* current position of this value in track[] */
	uint32		hash;			/* cached hash code of "value" */
	char		status;			/* slot status; required by simplehash.h */
} DistinctHashEntry;

/*
 * Per-table context passed (via simplehash's private_data) to the hash
 * and equality callbacks below.
 */
typedef struct DistinctHashContext
{
	FmgrInfo   *cmpfunc;		/* equality function for the column type */
	FmgrInfo   *hashfunc;		/* default hash function for the column type */
	Oid			collation;		/* collation passed to both functions */
} DistinctHashContext;

/*
 * Instantiate a simplehash.h hash table named "DistinctHash", mapping a
 * Datum key to a DistinctHashEntry.  Hashing and equality are delegated
 * to the column type's support functions via the callbacks declared here.
 */
typedef struct DistinctHash_hash DistinctHash_hash;

/* Forward declarations: SH_HASH_KEY / SH_EQUAL below expand to these. */
static uint32 distinct_hash_hash(DistinctHash_hash *tab, Datum key);
static bool distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1);

#define SH_PREFIX DistinctHash
#define SH_ELEMENT_TYPE DistinctHashEntry
#define SH_KEY_TYPE Datum
#define SH_KEY value
#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key)
#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
#define SH_SCOPE static inline
#define SH_STORE_HASH				/* cache hash codes in entries */
#define SH_GET_HASH(tab, ent) ((ent)->hash)
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"

/*
 * Hash support callback for the DistinctHash table.
 *
 * Computes the hash code of "key" by invoking the column type's default
 * hash function (stashed in the table's private context) under the
 * column's collation.
 */
static uint32
distinct_hash_hash(DistinctHash_hash *tab, Datum key)
{
	DistinctHashContext *ctx = (DistinctHashContext *) tab->private_data;

	return DatumGetUInt32(FunctionCall1Coll(ctx->hashfunc,
											ctx->collation,
											key));
}

/*
 * Equality support callback for the DistinctHash table.
 *
 * Compares two keys using the column type's equality function (stashed
 * in the table's private context) under the column's collation.
 */
static bool
distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1)
{
	DistinctHashContext *ctx = (DistinctHashContext *) tab->private_data;

	return DatumGetBool(FunctionCall2Coll(ctx->cmpfunc,
										  ctx->collation,
										  key0, key1));
}

/*
 * Record the new track[] position of "value" in its hash-table entry.
 *
 * The caller must pass the value's cached hash code (value_hash), and the
 * value is expected to already exist in the table: callers only invoke
 * this after moving an already-tracked value within track[].  A failed
 * lookup therefore indicates the hash table has gotten out of sync with
 * track[]; Assert on that in debug builds (matching the convention of the
 * deletion path in compute_distinct_stats()), while still degrading
 * gracefully in production builds by leaving the table unchanged.
 */
static inline void
distinct_hash_set_index(DistinctHash_hash *hash, Datum value, uint32 value_hash,
						int index)
{
	DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);

	Assert(entry != NULL);
	if (entry != NULL)
		entry->index = index;
}

/*
* std_typanalyze -- the default type-specific typanalyze function
Expand Down Expand Up @@ -2076,15 +2141,21 @@ compute_distinct_stats(VacAttrStatsP stats,
bool is_varwidth = (!stats->attrtype->typbyval &&
stats->attrtype->typlen < 0);
FmgrInfo f_cmpeq;
TypeCacheEntry *typentry;
typedef struct
{
Datum value;
int count;
uint32 hash;
} TrackItem;
TrackItem *track;
int track_cnt,
track_max;
int num_mcv = stats->attstattarget;
int firstcount1 = 0;
bool use_hash;
DistinctHashContext hash_context;
DistinctHash_hash *track_hash = NULL;
StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;

/*
Expand All @@ -2097,14 +2168,34 @@ compute_distinct_stats(VacAttrStatsP stats,
track_cnt = 0;

fmgr_info(mystats->eqfunc, &f_cmpeq);
typentry = lookup_type_cache(stats->attrtypid,
TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR);

/*
* For sufficiently large statistics targets, use a hash table to avoid
* repeated linear searches of the track[] array, but only when we can use
* the type's default hash support that matches the equality operator.
*/
use_hash = (track_max >= ANALYZE_HASH_THRESHOLD &&
OidIsValid(mystats->eqfunc) &&
mystats->eqopr == typentry->eq_opr &&
OidIsValid(typentry->hash_proc));
if (use_hash)
{
hash_context.cmpfunc = &f_cmpeq;
hash_context.hashfunc = &typentry->hash_proc_finfo;
hash_context.collation = stats->attrcollid;
track_hash = DistinctHash_create(CurrentMemoryContext,
track_max, &hash_context);
}

for (i = 0; i < samplerows; i++)
{
Datum value;
bool isnull;
bool match;
int firstcount1,
j;
int j = 0;
uint32 value_hash = 0;

vacuum_delay_point(true);

Expand Down Expand Up @@ -2151,19 +2242,35 @@ compute_distinct_stats(VacAttrStatsP stats,
/*
* See if the value matches anything we're already tracking.
*/
match = false;
firstcount1 = track_cnt;
for (j = 0; j < track_cnt; j++)
if (use_hash)
{
DistinctHashEntry *entry;

value_hash = distinct_hash_hash(track_hash, value);
entry = DistinctHash_lookup_hash(track_hash, value, value_hash);
match = (entry != NULL);
if (match)
j = entry->index;
}
else
{
if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
stats->attrcollid,
value, track[j].value)))
int firstcount1_local = track_cnt;

match = false;
for (j = 0; j < track_cnt; j++)
{
match = true;
break;
if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
stats->attrcollid,
value, track[j].value)))
{
match = true;
break;
}
if (j < firstcount1_local && track[j].count == 1)
firstcount1_local = j;
}
if (j < firstcount1 && track[j].count == 1)
firstcount1 = j;

firstcount1 = firstcount1_local;
}

if (match)
Expand All @@ -2175,23 +2282,70 @@ compute_distinct_stats(VacAttrStatsP stats,
{
swapDatum(track[j].value, track[j - 1].value);
swapInt(track[j].count, track[j - 1].count);
if (use_hash)
{
uint32 tmp;

tmp = track[j].hash;
track[j].hash = track[j - 1].hash;
track[j - 1].hash = tmp;
distinct_hash_set_index(track_hash, track[j].value,
track[j].hash, j);
distinct_hash_set_index(track_hash, track[j - 1].value,
track[j - 1].hash, j - 1);
}
j--;
}
while (use_hash && firstcount1 < track_cnt &&
track[firstcount1].count > 1)
firstcount1++;
}
else
{
/* No match. Insert at head of count-1 list */
if (track_cnt < track_max)
track_cnt++;
else if (use_hash && firstcount1 >= track_cnt)
continue;
else if (use_hash)
{
DistinctHashEntry *delentry;

delentry = DistinctHash_lookup_hash(track_hash,
track[track_cnt - 1].value,
track[track_cnt - 1].hash);
Assert(delentry != NULL);
if (delentry != NULL)
DistinctHash_delete_item(track_hash, delentry);
else
DistinctHash_delete(track_hash, track[track_cnt - 1].value);
}
for (j = track_cnt - 1; j > firstcount1; j--)
{
track[j].value = track[j - 1].value;
track[j].count = track[j - 1].count;
if (use_hash)
{
track[j].hash = track[j - 1].hash;
distinct_hash_set_index(track_hash, track[j].value,
track[j].hash, j);
}
}
if (firstcount1 < track_cnt)
{
track[firstcount1].value = value;
track[firstcount1].count = 1;
if (use_hash)
{
bool found_hash;
DistinctHashEntry *entry;

track[firstcount1].hash = value_hash;
entry = DistinctHash_insert_hash(track_hash, value, value_hash,
&found_hash);
Assert(!found_hash);
entry->index = firstcount1;
}
}
}
}
Expand Down
55 changes: 55 additions & 0 deletions src/test/regress/expected/analyze_distinct_hash.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
--
-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
--
SET client_min_messages TO WARNING;
--
-- Case 1: all values are distinct. This forces the track[] array to fill
-- and then exercise the "drop tail item" path repeatedly.
--
DROP TABLE IF EXISTS analyze_distinct_hash_unique;
CREATE TABLE analyze_distinct_hash_unique (x xid);
ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_unique
SELECT i::text::xid FROM generate_series(1, 300) i;
ANALYZE analyze_distinct_hash_unique;
WITH m AS MATERIALIZED (
SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_unique'
AND attname = 'x'
)
SELECT array_length(mcv, 1) AS mcv_len,
mcv[1] AS mcv_first,
mcv[100] AS mcv_100th
FROM m;
mcv_len | mcv_first | mcv_100th
---------+-----------+-----------
100 | 300 | 201
(1 row)

--
-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
-- hashed indexes in sync.
--
DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
CREATE TABLE analyze_distinct_hash_bubble (x xid);
ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_bubble
SELECT i::text::xid FROM generate_series(1, 10) i;
INSERT INTO analyze_distinct_hash_bubble
SELECT '1'::xid FROM generate_series(1, 20);
ANALYZE analyze_distinct_hash_bubble;
SELECT most_common_vals::text
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_bubble'
AND attname = 'x';
most_common_vals
------------------------
{1,10,9,8,7,6,5,4,3,2}
(1 row)

DROP TABLE analyze_distinct_hash_unique;
DROP TABLE analyze_distinct_hash_bubble;
RESET client_min_messages;
2 changes: 1 addition & 1 deletion src/test/regress/parallel_schedule
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
# geometry depends on point, lseg, line, box, path, polygon, circle
# horology depends on date, time, timetz, timestamp, timestamptz, interval
# ----------
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import analyze_distinct_hash pg_ndistinct pg_dependencies

# ----------
# Load huge amounts of data
Expand Down
52 changes: 52 additions & 0 deletions src/test/regress/sql/analyze_distinct_hash.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
--
-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
--

SET client_min_messages TO WARNING;

--
-- Case 1: all values are distinct. This forces the track[] array to fill
-- and then exercise the "drop tail item" path repeatedly.
--
DROP TABLE IF EXISTS analyze_distinct_hash_unique;
CREATE TABLE analyze_distinct_hash_unique (x xid);
ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_unique
SELECT i::text::xid FROM generate_series(1, 300) i;
ANALYZE analyze_distinct_hash_unique;

WITH m AS MATERIALIZED (
SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_unique'
AND attname = 'x'
)
SELECT array_length(mcv, 1) AS mcv_len,
mcv[1] AS mcv_first,
mcv[100] AS mcv_100th
FROM m;

--
-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
-- hashed indexes in sync.
--
DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
CREATE TABLE analyze_distinct_hash_bubble (x xid);
ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_bubble
SELECT i::text::xid FROM generate_series(1, 10) i;
INSERT INTO analyze_distinct_hash_bubble
SELECT '1'::xid FROM generate_series(1, 20);
ANALYZE analyze_distinct_hash_bubble;

SELECT most_common_vals::text
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_bubble'
AND attname = 'x';

DROP TABLE analyze_distinct_hash_unique;
DROP TABLE analyze_distinct_hash_bubble;

RESET client_min_messages;