diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 5e2a7a8234ec8..44090a5923cb4 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -54,6 +54,7 @@
 #include "utils/sortsupport.h"
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
+#include "utils/typcache.h"
 
 
 /* Per-index data for ANALYZE */
@@ -1888,6 +1889,70 @@ static int	analyze_mcv_list(int *mcv_counts,
 							 int samplerows,
 							 double totalrows);
 
+#define ANALYZE_HASH_THRESHOLD 200	/* minimum track_max for hashed lookups in compute_distinct_stats() */
+/* Hash table entry tying one tracked value to its current slot in track[]. */
+typedef struct DistinctHashEntry
+{
+	Datum		value;			/* hash key (SH_KEY) */
+	int			index;			/* slot in track[] currently holding value */
+	uint32		hash;			/* hash of value, returned by SH_GET_HASH */
+	char		status;			/* not accessed by the code in this patch; NOTE(review): presumably the slot-state member lib/simplehash.h requires - confirm */
+} DistinctHashEntry;
+/* Per-table state handed to DistinctHash_create() and read back through private_data by the callbacks. */
+typedef struct DistinctHashContext
+{
+	FmgrInfo   *cmpfunc;		/* two-argument equality function */
+	FmgrInfo   *hashfunc;		/* one-argument hash function */
+	Oid			collation;		/* collation passed to both calls */
+} DistinctHashContext;
+/* Forward declaration so the callbacks can be prototyped before simplehash.h is included. */
+typedef struct DistinctHash_hash DistinctHash_hash;
+/* Callbacks plugged into SH_HASH_KEY and SH_EQUAL below. */
+static uint32 distinct_hash_hash(DistinctHash_hash *tab, Datum key);
+static bool distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1);
+/* Generate the DistinctHash_* table: Datum keys held in DistinctHashEntry, hash stored per entry. */
+#define SH_PREFIX DistinctHash
+#define SH_ELEMENT_TYPE DistinctHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_KEY value
+#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key)
+#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
+#define SH_SCOPE static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(tab, ent) ((ent)->hash)
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+/* Hash callback: hash key with the context hash function under the context collation. */
+static uint32
+distinct_hash_hash(DistinctHash_hash *tab, Datum key)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+	Datum		result;
+
+	result = FunctionCall1Coll(context->hashfunc, context->collation, key);
+	return DatumGetUInt32(result);
+}
+/* Equality callback: compare key0 and key1 with the context equality function under the context collation. */
+static bool
+distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+	Datum		result;
+
+	result = FunctionCall2Coll(context->cmpfunc, context->collation, key0, key1);
+	return DatumGetBool(result);
+}
+/* Store index as the track[] slot of value, located through its precomputed value_hash; does nothing if value is not in the table. */
+static inline void
+distinct_hash_set_index(DistinctHash_hash *hash, Datum value, uint32 value_hash,
+						int index)
+{
+	DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);
+
+	if (entry != NULL)			/* NOTE(review): a missing entry is silently ignored; callers appear to pass only values already inserted, so an Assert may be preferable */
+		entry->index = index;
+}
 
 /*
  * std_typanalyze -- the default type-specific typanalyze function
@@ -2076,15 +2141,21 @@ compute_distinct_stats(VacAttrStatsP stats,
 	bool		is_varwidth = (!stats->attrtype->typbyval &&
 							   stats->attrtype->typlen < 0);
 	FmgrInfo	f_cmpeq;
+	TypeCacheEntry *typentry;
 	typedef struct
 	{
 		Datum		value;
 		int			count;
+		uint32		hash;
 	} TrackItem;
 	TrackItem  *track;
 	int			track_cnt,
 				track_max;
 	int			num_mcv = stats->attstattarget;
+	int			firstcount1 = 0;
+	bool		use_hash;
+	DistinctHashContext hash_context;
+	DistinctHash_hash *track_hash = NULL;
 	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
 
 	/*
@@ -2097,14 +2168,34 @@ compute_distinct_stats(VacAttrStatsP stats,
 	track_cnt = 0;
 
 	fmgr_info(mystats->eqfunc, &f_cmpeq);
+	typentry = lookup_type_cache(stats->attrtypid,
+								 TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR);
+
+	/*
+	 * For sufficiently large statistics targets, use a hash table to avoid
+	 * repeated linear searches of the track[] array, but only when we can use
+	 * the type's default hash support that matches the equality operator.
+	 */
+	use_hash = (track_max >= ANALYZE_HASH_THRESHOLD &&
+				OidIsValid(mystats->eqfunc) &&
+				mystats->eqopr == typentry->eq_opr &&	/* stats equality operator is the one the type cache reports */
+				OidIsValid(typentry->hash_proc));	/* and the type cache found a hash function */
+	if (use_hash)
+	{
+		hash_context.cmpfunc = &f_cmpeq;
+		hash_context.hashfunc = &typentry->hash_proc_finfo;
+		hash_context.collation = stats->attrcollid;
+		track_hash = DistinctHash_create(CurrentMemoryContext,
+										 track_max, &hash_context);	/* NOTE(review): no matching destroy call is visible in this patch; presumably released with the memory context - confirm */
+	}
 
 	for (i = 0; i < samplerows; i++)
 	{
 		Datum		value;
 		bool		isnull;
 		bool		match;
-		int			firstcount1,
-					j;
+		int			j = 0;	/* track[] index; initialized because the hashed lookup assigns it only on a match */
+		uint32		value_hash = 0;	/* hash of value; assigned only when use_hash is true */
 
 		vacuum_delay_point(true);
 
@@ -2151,19 +2242,35 @@ compute_distinct_stats(VacAttrStatsP stats,
 		/*
 		 * See if the value matches anything we're already tracking.
 		 */
-		match = false;
-		firstcount1 = track_cnt;
-		for (j = 0; j < track_cnt; j++)
+		if (use_hash)
+		{
+			DistinctHashEntry *entry;
+			/* Hash the value once; the same hash is reused if the value gets inserted below. */
+			value_hash = distinct_hash_hash(track_hash, value);
+			entry = DistinctHash_lookup_hash(track_hash, value, value_hash);
+			match = (entry != NULL);
+			if (match)
+				j = entry->index;	/* slot of the match in track[] */
+		}
+		else
 		{
-			if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
-											   stats->attrcollid,
-											   value, track[j].value)))
+			int			firstcount1_local = track_cnt;
+			/* Linear scan of track[]; remembers the first count-1 slot seen before any match. */
+			match = false;
+			for (j = 0; j < track_cnt; j++)
 			{
-				match = true;
-				break;
+				if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
+												   stats->attrcollid,
+												   value, track[j].value)))
+				{
+					match = true;
+					break;
+				}
+				if (j < firstcount1_local && track[j].count == 1)
+					firstcount1_local = j;
 			}
-			if (j < firstcount1 && track[j].count == 1)
-				firstcount1 = j;
+			/* Hand the scan result to the no-match branch, the only reader of firstcount1 when use_hash is false. */
+			firstcount1 = firstcount1_local;
 		}
 
 		if (match)
@@ -2175,23 +2282,70 @@ compute_distinct_stats(VacAttrStatsP stats,
 			{
 				swapDatum(track[j].value, track[j - 1].value);
 				swapInt(track[j].count, track[j - 1].count);
+				if (use_hash)
+				{
+					uint32		tmp;
+					/* Swap the cached hashes too, then repoint both hash entries at their new slots. */
+					tmp = track[j].hash;
+					track[j].hash = track[j - 1].hash;
+					track[j - 1].hash = tmp;
+					distinct_hash_set_index(track_hash, track[j].value,
+											track[j].hash, j);
+					distinct_hash_set_index(track_hash, track[j - 1].value,
+											track[j - 1].hash, j - 1);
+				}
 				j--;
 			}
+			while (use_hash && firstcount1 < track_cnt &&
+				   track[firstcount1].count > 1)
+				firstcount1++;	/* hashed mode keeps firstcount1 across rows: step past slots whose count is now above 1 */
 		}
 		else
 		{
 			/* No match.  Insert at head of count-1 list */
 			if (track_cnt < track_max)
 				track_cnt++;
+			else if (use_hash && firstcount1 >= track_cnt)
+				continue;		/* list is full and firstcount1 is past its end: nothing to replace, skip this value */
+			else if (use_hash)
+			{
+				DistinctHashEntry *delentry;
+				/* List is full: the last track[] item is about to be overwritten, so drop its hash entry first. */
+				delentry = DistinctHash_lookup_hash(track_hash,
+													track[track_cnt - 1].value,
+													track[track_cnt - 1].hash);
+				Assert(delentry != NULL);
+				if (delentry != NULL)
+					DistinctHash_delete_item(track_hash, delentry);
+				else
+					DistinctHash_delete(track_hash, track[track_cnt - 1].value);	/* NOTE(review): only reached if the lookup above failed; looks redundant with the Assert - confirm and consider removing */
+			}
 			for (j = track_cnt - 1; j > firstcount1; j--)
 			{
 				track[j].value = track[j - 1].value;
 				track[j].count = track[j - 1].count;
+				if (use_hash)
+				{
+					track[j].hash = track[j - 1].hash;	/* carry the cached hash along with the item */
+					distinct_hash_set_index(track_hash, track[j].value,
+											track[j].hash, j);	/* NOTE(review): one hash lookup per shifted item - may be worth measuring when most values are distinct */
+				}
 			}
 			if (firstcount1 < track_cnt)
 			{
 				track[firstcount1].value = value;
 				track[firstcount1].count = 1;
+				if (use_hash)
+				{
+					bool		found_hash;
+					DistinctHashEntry *entry;
+					/* Register the new value at slot firstcount1, reusing the hash computed during the lookup. */
+					track[firstcount1].hash = value_hash;
+					entry = DistinctHash_insert_hash(track_hash, value, value_hash,
+													 &found_hash);
+					Assert(!found_hash);	/* the lookup earlier in this iteration found no match */
+					entry->index = firstcount1;
+				}
 			}
 		}
 	}
diff --git a/src/test/regress/expected/analyze_distinct_hash.out b/src/test/regress/expected/analyze_distinct_hash.out
new file mode 100644
index 0000000000000..9f92083f6d6e7
--- /dev/null
+++ b/src/test/regress/expected/analyze_distinct_hash.out
@@ -0,0 +1,55 @@
+--
+-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
+--
+SET client_min_messages TO WARNING;
+--
+-- Case 1: all values are distinct. This forces the track[] array to fill
+-- and then exercise the "drop tail item" path repeatedly.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_unique;
+CREATE TABLE analyze_distinct_hash_unique (x xid);
+ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_unique
+SELECT i::text::xid FROM generate_series(1, 300) i;
+ANALYZE analyze_distinct_hash_unique;
+WITH m AS MATERIALIZED (
+    SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
+    FROM pg_stats
+    WHERE schemaname = 'public'
+      AND tablename = 'analyze_distinct_hash_unique'
+      AND attname = 'x'
+)
+SELECT array_length(mcv, 1) AS mcv_len,
+       mcv[1] AS mcv_first,
+       mcv[100] AS mcv_100th
+FROM m;
+ mcv_len | mcv_first | mcv_100th 
+---------+-----------+-----------
+     100 | 300       | 201
+(1 row)
+
+--
+-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
+-- hashed indexes in sync.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
+CREATE TABLE analyze_distinct_hash_bubble (x xid);
+ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT i::text::xid FROM generate_series(1, 10) i;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT '1'::xid FROM generate_series(1, 20);
+ANALYZE analyze_distinct_hash_bubble;
+SELECT most_common_vals::text
+FROM pg_stats
+WHERE schemaname = 'public'
+  AND tablename = 'analyze_distinct_hash_bubble'
+  AND attname = 'x';
+    most_common_vals    
+------------------------
+ {1,10,9,8,7,6,5,4,3,2}
+(1 row)
+
+DROP TABLE analyze_distinct_hash_unique;
+DROP TABLE analyze_distinct_hash_bubble;
+RESET client_min_messages;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 905f9bca95987..d7655b1ea8507 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
 # geometry depends on point, lseg, line, box, path, polygon, circle
 # horology depends on date, time, timetz, timestamp, timestamptz, interval
 # ----------
-test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies
+test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import analyze_distinct_hash pg_ndistinct pg_dependencies
 
 # ----------
 # Load huge amounts of data
diff --git a/src/test/regress/sql/analyze_distinct_hash.sql b/src/test/regress/sql/analyze_distinct_hash.sql
new file mode 100644
index 0000000000000..06e2e273a271d
--- /dev/null
+++ b/src/test/regress/sql/analyze_distinct_hash.sql
@@ -0,0 +1,52 @@
+--
+-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
+--
+
+SET client_min_messages TO WARNING;
+
+--
+-- Case 1: all values are distinct. This forces the track[] array to fill
+-- and then exercise the "drop tail item" path repeatedly.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_unique;
+CREATE TABLE analyze_distinct_hash_unique (x xid);
+ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_unique
+SELECT i::text::xid FROM generate_series(1, 300) i;
+ANALYZE analyze_distinct_hash_unique;
+
+WITH m AS MATERIALIZED (
+    SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
+    FROM pg_stats
+    WHERE schemaname = 'public'
+      AND tablename = 'analyze_distinct_hash_unique'
+      AND attname = 'x'
+)
+SELECT array_length(mcv, 1) AS mcv_len,
+       mcv[1] AS mcv_first,
+       mcv[100] AS mcv_100th
+FROM m;
+
+--
+-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
+-- hashed indexes in sync.
+--
+DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
+CREATE TABLE analyze_distinct_hash_bubble (x xid);
+ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT i::text::xid FROM generate_series(1, 10) i;
+INSERT INTO analyze_distinct_hash_bubble
+SELECT '1'::xid FROM generate_series(1, 20);
+ANALYZE analyze_distinct_hash_bubble;
+
+SELECT most_common_vals::text
+FROM pg_stats
+WHERE schemaname = 'public'
+  AND tablename = 'analyze_distinct_hash_bubble'
+  AND attname = 'x';
+
+DROP TABLE analyze_distinct_hash_unique;
+DROP TABLE analyze_distinct_hash_bubble;
+
+RESET client_min_messages;