Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 166 additions & 12 deletions src/backend/commands/analyze.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include "utils/sortsupport.h"
#include "utils/syscache.h"
#include "utils/timestamp.h"
#include "utils/typcache.h"


/* Per-index data for ANALYZE */
Expand Down Expand Up @@ -1888,6 +1889,70 @@ static int analyze_mcv_list(int *mcv_counts,
int samplerows,
double totalrows);

/*
 * Minimum size of the track[] array (i.e., statistics target derived
 * track_max) at which compute_distinct_stats() switches from linear
 * search of track[] to a hash table keyed by sample value.
 */
#define ANALYZE_HASH_THRESHOLD 200

/* One tracked sample value, as stored in the DistinctHash table. */
typedef struct DistinctHashEntry
{
	Datum		value;			/* the tracked datum; also the hash key */
	int			index;			/* current position of this value in track[] */
	uint32		hash;			/* cached hash code of "value" */
	char		status;			/* slot status; required by simplehash.h */
} DistinctHashEntry;

/*
 * Per-table context passed (via simplehash's private_data) to the hash
 * and equality callbacks below.
 */
typedef struct DistinctHashContext
{
	FmgrInfo   *cmpfunc;		/* equality function for the column type */
	FmgrInfo   *hashfunc;		/* default hash function for the column type */
	Oid			collation;		/* collation passed to both functions */
} DistinctHashContext;

/*
 * Instantiate a simplehash.h hash table named "DistinctHash", mapping a
 * Datum key to a DistinctHashEntry.  Hashing and equality are delegated
 * to the column type's support functions via the callbacks declared here.
 */
typedef struct DistinctHash_hash DistinctHash_hash;

/* Forward declarations: SH_HASH_KEY / SH_EQUAL below expand to these. */
static uint32 distinct_hash_hash(DistinctHash_hash *tab, Datum key);
static bool distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1);

#define SH_PREFIX DistinctHash
#define SH_ELEMENT_TYPE DistinctHashEntry
#define SH_KEY_TYPE Datum
#define SH_KEY value
#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key)
#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
#define SH_SCOPE static inline
#define SH_STORE_HASH				/* cache hash codes in entries */
#define SH_GET_HASH(tab, ent) ((ent)->hash)
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"

/*
 * Hash support callback for the DistinctHash table.
 *
 * Computes the hash code of "key" by invoking the column type's default
 * hash function (stashed in the table's private context) under the
 * column's collation.
 */
static uint32
distinct_hash_hash(DistinctHash_hash *tab, Datum key)
{
	DistinctHashContext *ctx = (DistinctHashContext *) tab->private_data;

	return DatumGetUInt32(FunctionCall1Coll(ctx->hashfunc,
											ctx->collation,
											key));
}

/*
 * Equality support callback for the DistinctHash table.
 *
 * Compares two keys using the column type's equality function (stashed
 * in the table's private context) under the column's collation.
 */
static bool
distinct_hash_equal(DistinctHash_hash *tab, Datum key0, Datum key1)
{
	DistinctHashContext *ctx = (DistinctHashContext *) tab->private_data;

	return DatumGetBool(FunctionCall2Coll(ctx->cmpfunc,
										  ctx->collation,
										  key0, key1));
}

/*
 * Record the new track[] position of "value" in its hash-table entry.
 *
 * The caller must pass the value's cached hash code (value_hash), and the
 * value is expected to already exist in the table: callers only invoke
 * this after moving an already-tracked value within track[].  A failed
 * lookup therefore indicates the hash table has gotten out of sync with
 * track[]; Assert on that in debug builds (matching the convention of the
 * deletion path in compute_distinct_stats()), while still degrading
 * gracefully in production builds by leaving the table unchanged.
 */
static inline void
distinct_hash_set_index(DistinctHash_hash *hash, Datum value, uint32 value_hash,
						int index)
{
	DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);

	Assert(entry != NULL);
	if (entry != NULL)
		entry->index = index;
}

/*
* std_typanalyze -- the default type-specific typanalyze function
Expand Down Expand Up @@ -2076,15 +2141,21 @@ compute_distinct_stats(VacAttrStatsP stats,
bool is_varwidth = (!stats->attrtype->typbyval &&
stats->attrtype->typlen < 0);
FmgrInfo f_cmpeq;
TypeCacheEntry *typentry;
typedef struct
{
Datum value;
int count;
uint32 hash;
} TrackItem;
TrackItem *track;
int track_cnt,
track_max;
int num_mcv = stats->attstattarget;
int firstcount1 = 0;
bool use_hash;
DistinctHashContext hash_context;
DistinctHash_hash *track_hash = NULL;
StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;

/*
Expand All @@ -2097,14 +2168,34 @@ compute_distinct_stats(VacAttrStatsP stats,
track_cnt = 0;

fmgr_info(mystats->eqfunc, &f_cmpeq);
typentry = lookup_type_cache(stats->attrtypid,
TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR);

/*
* For sufficiently large statistics targets, use a hash table to avoid
* repeated linear searches of the track[] array, but only when we can use
* the type's default hash support that matches the equality operator.
*/
use_hash = (track_max >= ANALYZE_HASH_THRESHOLD &&
OidIsValid(mystats->eqfunc) &&
mystats->eqopr == typentry->eq_opr &&
OidIsValid(typentry->hash_proc));
if (use_hash)
{
hash_context.cmpfunc = &f_cmpeq;
hash_context.hashfunc = &typentry->hash_proc_finfo;
hash_context.collation = stats->attrcollid;
track_hash = DistinctHash_create(CurrentMemoryContext,
track_max, &hash_context);
}

for (i = 0; i < samplerows; i++)
{
Datum value;
bool isnull;
bool match;
int firstcount1,
j;
int j = 0;
uint32 value_hash = 0;

vacuum_delay_point(true);

Expand Down Expand Up @@ -2151,19 +2242,35 @@ compute_distinct_stats(VacAttrStatsP stats,
/*
* See if the value matches anything we're already tracking.
*/
match = false;
firstcount1 = track_cnt;
for (j = 0; j < track_cnt; j++)
if (use_hash)
{
DistinctHashEntry *entry;

value_hash = distinct_hash_hash(track_hash, value);
entry = DistinctHash_lookup_hash(track_hash, value, value_hash);
match = (entry != NULL);
if (match)
j = entry->index;
}
else
{
if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
stats->attrcollid,
value, track[j].value)))
int firstcount1_local = track_cnt;

match = false;
for (j = 0; j < track_cnt; j++)
{
match = true;
break;
if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
stats->attrcollid,
value, track[j].value)))
{
match = true;
break;
}
if (j < firstcount1_local && track[j].count == 1)
firstcount1_local = j;
}
if (j < firstcount1 && track[j].count == 1)
firstcount1 = j;

firstcount1 = firstcount1_local;
}

if (match)
Expand All @@ -2175,23 +2282,70 @@ compute_distinct_stats(VacAttrStatsP stats,
{
swapDatum(track[j].value, track[j - 1].value);
swapInt(track[j].count, track[j - 1].count);
if (use_hash)
{
uint32 tmp;

tmp = track[j].hash;
track[j].hash = track[j - 1].hash;
track[j - 1].hash = tmp;
distinct_hash_set_index(track_hash, track[j].value,
track[j].hash, j);
distinct_hash_set_index(track_hash, track[j - 1].value,
track[j - 1].hash, j - 1);
}
j--;
}
while (use_hash && firstcount1 < track_cnt &&
track[firstcount1].count > 1)
firstcount1++;
}
else
{
/* No match. Insert at head of count-1 list */
if (track_cnt < track_max)
track_cnt++;
else if (use_hash && firstcount1 >= track_cnt)
continue;
else if (use_hash)
{
DistinctHashEntry *delentry;

delentry = DistinctHash_lookup_hash(track_hash,
track[track_cnt - 1].value,
track[track_cnt - 1].hash);
Assert(delentry != NULL);
if (delentry != NULL)
DistinctHash_delete_item(track_hash, delentry);
else
DistinctHash_delete(track_hash, track[track_cnt - 1].value);
}
for (j = track_cnt - 1; j > firstcount1; j--)
{
track[j].value = track[j - 1].value;
track[j].count = track[j - 1].count;
if (use_hash)
{
track[j].hash = track[j - 1].hash;
distinct_hash_set_index(track_hash, track[j].value,
track[j].hash, j);
}
}
if (firstcount1 < track_cnt)
{
track[firstcount1].value = value;
track[firstcount1].count = 1;
if (use_hash)
{
bool found_hash;
DistinctHashEntry *entry;

track[firstcount1].hash = value_hash;
entry = DistinctHash_insert_hash(track_hash, value, value_hash,
&found_hash);
Assert(!found_hash);
entry->index = firstcount1;
}
}
}
}
Expand Down
55 changes: 55 additions & 0 deletions src/test/regress/expected/analyze_distinct_hash.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
--
-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
--
SET client_min_messages TO WARNING;
--
-- Case 1: all values are distinct. This forces the track[] array to fill
-- and then exercise the "drop tail item" path repeatedly.
--
DROP TABLE IF EXISTS analyze_distinct_hash_unique;
CREATE TABLE analyze_distinct_hash_unique (x xid);
ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_unique
SELECT i::text::xid FROM generate_series(1, 300) i;
ANALYZE analyze_distinct_hash_unique;
WITH m AS MATERIALIZED (
SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_unique'
AND attname = 'x'
)
SELECT array_length(mcv, 1) AS mcv_len,
mcv[1] AS mcv_first,
mcv[100] AS mcv_100th
FROM m;
mcv_len | mcv_first | mcv_100th
---------+-----------+-----------
100 | 300 | 201
(1 row)

--
-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
-- hashed indexes in sync.
--
DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
CREATE TABLE analyze_distinct_hash_bubble (x xid);
ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_bubble
SELECT i::text::xid FROM generate_series(1, 10) i;
INSERT INTO analyze_distinct_hash_bubble
SELECT '1'::xid FROM generate_series(1, 20);
ANALYZE analyze_distinct_hash_bubble;
SELECT most_common_vals::text
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_bubble'
AND attname = 'x';
most_common_vals
------------------------
{1,10,9,8,7,6,5,4,3,2}
(1 row)

DROP TABLE analyze_distinct_hash_unique;
DROP TABLE analyze_distinct_hash_bubble;
RESET client_min_messages;
2 changes: 1 addition & 1 deletion src/test/regress/parallel_schedule
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
# geometry depends on point, lseg, line, box, path, polygon, circle
# horology depends on date, time, timetz, timestamp, timestamptz, interval
# ----------
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies
test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import analyze_distinct_hash pg_ndistinct pg_dependencies

# ----------
# Load huge amounts of data
Expand Down
52 changes: 52 additions & 0 deletions src/test/regress/sql/analyze_distinct_hash.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
--
-- Exercise compute_distinct_stats() when hashable types allow hashed lookups.
--

SET client_min_messages TO WARNING;

--
-- Case 1: all values are distinct. This forces the track[] array to fill
-- and then exercise the "drop tail item" path repeatedly.
--
DROP TABLE IF EXISTS analyze_distinct_hash_unique;
CREATE TABLE analyze_distinct_hash_unique (x xid);
ALTER TABLE analyze_distinct_hash_unique ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_unique
SELECT i::text::xid FROM generate_series(1, 300) i;
ANALYZE analyze_distinct_hash_unique;

WITH m AS MATERIALIZED (
SELECT string_to_array(trim(both '{}' from most_common_vals::text), ',') AS mcv
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_unique'
AND attname = 'x'
)
SELECT array_length(mcv, 1) AS mcv_len,
mcv[1] AS mcv_first,
mcv[100] AS mcv_100th
FROM m;

--
-- Case 2: bubble-up during repeated matches, exercising swaps while keeping
-- hashed indexes in sync.
--
DROP TABLE IF EXISTS analyze_distinct_hash_bubble;
CREATE TABLE analyze_distinct_hash_bubble (x xid);
ALTER TABLE analyze_distinct_hash_bubble ALTER COLUMN x SET STATISTICS 100;
INSERT INTO analyze_distinct_hash_bubble
SELECT i::text::xid FROM generate_series(1, 10) i;
INSERT INTO analyze_distinct_hash_bubble
SELECT '1'::xid FROM generate_series(1, 20);
ANALYZE analyze_distinct_hash_bubble;

SELECT most_common_vals::text
FROM pg_stats
WHERE schemaname = 'public'
AND tablename = 'analyze_distinct_hash_bubble'
AND attname = 'x';

DROP TABLE analyze_distinct_hash_unique;
DROP TABLE analyze_distinct_hash_bubble;

RESET client_min_messages;