From 5514e9d764ade19aba2819b2f80b5e38cb308a04 Mon Sep 17 00:00:00 2001 From: Xingbo Wang Date: Wed, 25 Mar 2026 18:50:20 -0700 Subject: [PATCH 1/6] db_stress: constrain batched prefix scans Batched snapshot mode does not verify the full DB state against the expected-state oracle. Instead, it relies on local invariants over the 10-key family written atomically by BatchedOpsStressTest::TestPut(). For a base key K and value body V, TestPut() writes: "0" + K -> V + "0" ... "9" + K -> V + "9" in one write batch. TestPrefixScan() then picks one base key, derives 10 seek prefixes 0+P .. 9+P where P is the first FLAGS_prefix_size - 1 bytes of K, and opens 10 iterators on the same snapshot. The verifier advances those iterators in lockstep and checks that: * each iterator stays in its own seek prefix; * no iterator exhausts its prefix before the others; * at position n, all values match after stripping the trailing digit; * the trailing digit still matches the corresponding prefix stream; and * value() and columns() remain mutually consistent for wide-column / entity reads. That lockstep oracle assumes every iterator is explicitly bounded to the seek prefix before we assert same-length, same-index progress across all 10 streams. Before this change, the batched test only installed an iterate_upper_bound to the next prefix half of the time. In the other half, Seek(prefix) could legitimately continue into the next prefix, while the verifier still assumed the iterator would stop at the prefix boundary. That made the verifier depend on behavior it had not actually requested from RocksDB. Set ReadOptions::prefix_same_as_start = true for every iterator used by this batched prefix verifier. This matches the semantics the verifier needs: RocksDB records the seek prefix and invalidates the iterator once Next() / Prev() leave that prefix, and the memtable path also switches to dynamic prefix iteration under the same condition. Relevant option interactions for this verifier: * test_batches_snapshots requires prefix_size > 0. * db_stress installs NewFixedPrefixTransform(prefix_size) whenever prefix_size >= 0, so the semantic precondition here is the configured prefix_extractor, not specifically memtablerep=prefix_hash. * prefixpercent only controls how often this verifier runs. * auto_refresh_iterator_with_snapshot is irrelevant here because the batched verifier pins one explicit snapshot for all 10 iterators. * When a next-prefix upper bound is installed, the test still exercises iterate_upper_bound and SQFC range-query filtering. * allow_unprepared_value is still handled via PrepareValue(). * use_put_entity_one_in, use_attribute_group, use_timed_put_one_in, and use_merge only change how the values/entities are encoded; the verifier still checks trailing-digit agreement and value()/columns() consistency. * memtablerep=prefix_hash, memtable_prefix_bloom_size_ratio, memtable_whole_key_filtering, and enable_memtable_insert_with_hint_prefix_extractor affect prefix-aware filtering/performance paths but do not change the verifier's lockstep invariant. * user_timestamp_size is out of scope because batched snapshot mode does not support timestamps. This intentionally makes the batched prefix-scan oracle validate explicitly prefix-bounded iteration every time, while the non-batched stress path continues to provide separate randomized coverage of iterate_upper_bound-only behavior. --- db_stress_tool/batched_ops_stress.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db_stress_tool/batched_ops_stress.cc b/db_stress_tool/batched_ops_stress.cc index 454eecb7fdb6..46f2bdbae6c0 100644 --- a/db_stress_tool/batched_ops_stress.cc +++ b/db_stress_tool/batched_ops_stress.cc @@ -585,6 +585,10 @@ class BatchedOpsStressTest : public StressTest { ro_copies[i] = readoptions; ro_copies[i].snapshot = snapshot; + // This verifier compares 10 prefix scans entry-by-entry, so each + // iterator must be constrained to the seek prefix before we assert + // lockstep progress across them. + ro_copies[i].prefix_same_as_start = true; if (thread->rand.OneIn(2) && GetNextPrefix(prefix_slices[i], &(upper_bounds[i]))) { // For half of the time, set the upper bound to the next prefix From 79e5b6ccca9a8c06a7c451f944ceb5acdcba55f7 Mon Sep 17 00:00:00 2001 From: Xingbo Wang Date: Wed, 25 Mar 2026 15:31:22 -0700 Subject: [PATCH 2/6] db_crashtest: disable BlobDB in best-efforts recovery --- tools/db_crashtest.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 4cfb67b1789b..5e51b532e314 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -880,9 +880,15 @@ def finalize_and_sanitize(src_params): dest_params["prefix_size"] = 1 # BER disables WAL and tests unsynced data loss which - # does not work with inplace_update_support. + # does not work with inplace_update_support. Integrated BlobDB is also + # incompatible, so force blob-related toggles off even if they came from + # command-line overrides or another preset. if dest_params.get("best_efforts_recovery") == 1: dest_params["inplace_update_support"] = 0 + dest_params["enable_blob_files"] = 0 + dest_params["enable_blob_garbage_collection"] = 0 + dest_params["allow_setting_blob_options_dynamically"] = 0 + dest_params["enable_blob_direct_write"] = 0 # Remote Compaction Incompatible Tests and Features if dest_params.get("remote_compaction_worker_threads", 0) > 0: From e33d0f5addac33747eb48c6e7fd95fcd1ab7e58a Mon Sep 17 00:00:00 2001 From: Xingbo Wang Date: Fri, 27 Mar 2026 17:44:50 -0700 Subject: [PATCH 3/6] db_crashtest: preserve expected-state dirs across restarts --- Makefile | 5 +++ tools/db_crashtest.py | 21 +++++++--- tools/db_crashtest_test.py | 83 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 6 deletions(-) create mode 100644 tools/db_crashtest_test.py diff --git a/Makefile b/Makefile index c16e696ef989..87abf665643b 100644 --- a/Makefile +++ b/Makefile @@ -1048,6 +1048,7 @@ ifneq ($(PLATFORM), OS_AIX) $(PYTHON) tools/check_all_python.py ifndef ASSERT_STATUS_CHECKED # not yet working with these tests $(PYTHON) tools/ldb_test.py + $(PYTHON) tools/db_crashtest_test.py sh tools/rocksdb_dump_test.sh endif endif @@ -1065,6 +1066,10 @@ check_some: $(ROCKSDBTESTS_SUBSET) ldb_tests: ldb $(PYTHON) tools/ldb_test.py +.PHONY: db_crashtest_tests +db_crashtest_tests: + $(PYTHON) tools/db_crashtest_test.py + include crash_test.mk asan_check: clean diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 5e51b532e314..a96359e5bc9b 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -520,12 +520,20 @@ def setup_expected_values_dir(): else: # if tmpdir is specified, store the expected_values_dir under that dir expected_values_dir = test_exp_tmpdir + "/rocksdb_crashtest_expected" - if os.path.exists(expected_values_dir): - shutil.rmtree(expected_values_dir) - os.mkdir(expected_values_dir) + os.makedirs(expected_values_dir, exist_ok=True) return expected_values_dir +def prepare_expected_values_dir(expected_dir, destroy_db_initially): + if expected_dir is None or expected_dir == "": + return + + if destroy_db_initially and os.path.exists(expected_dir): + shutil.rmtree(expected_dir, True) + + os.makedirs(expected_dir, exist_ok=True) + + multiops_txn_key_spaces_file = None @@ -1369,6 +1377,10 @@ def gen_cmd_params(args): def gen_cmd(params, unknown_params): finalzied_params = finalize_and_sanitize(params) + prepare_expected_values_dir( + finalzied_params.get("expected_values_dir"), + finalzied_params.get("destroy_db_initially", 0), + ) cmd = ( [stress_cmd] + [ @@ -1746,9 +1758,6 @@ def whitebox_crash_main(args, unknown_args): if time.time() > half_time: # Set next iteration to destroy DB (works for remote DB) cmd_params["destroy_db_initially"] = 1 - if expected_values_dir is not None: - shutil.rmtree(expected_values_dir, True) - os.mkdir(expected_values_dir) check_mode = (check_mode + 1) % total_check_mode time.sleep(1) # time to stabilize after a kill diff --git a/tools/db_crashtest_test.py b/tools/db_crashtest_test.py new file mode 100644 index 000000000000..514ef0eacbff --- /dev/null +++ b/tools/db_crashtest_test.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import importlib.util +import os +import shutil +import sys +import tempfile +import unittest + + +_DB_CRASHTEST_PATH = os.path.join(os.path.dirname(__file__), "db_crashtest.py") +_TEST_DIR_ENV_VAR = "TEST_TMPDIR" +_TEST_EXPECTED_DIR_ENV_VAR = "TEST_TMPDIR_EXPECTED" + + +def load_db_crashtest_module(): + spec = importlib.util.spec_from_file_location( + "db_crashtest_under_test", _DB_CRASHTEST_PATH + ) + module = importlib.util.module_from_spec(spec) + old_argv = sys.argv[:] + try: + sys.argv = [_DB_CRASHTEST_PATH] + spec.loader.exec_module(module) + finally: + sys.argv = old_argv + return module + + +class DBCrashTestTest(unittest.TestCase): + def setUp(self): + self.test_tmpdir = tempfile.mkdtemp(prefix="db_crashtest_test_") + self.expected_dir = os.path.join( + self.test_tmpdir, "rocksdb_crashtest_expected" + ) + self.old_test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) + self.old_test_expected_tmpdir = os.environ.get(_TEST_EXPECTED_DIR_ENV_VAR) + os.environ[_TEST_DIR_ENV_VAR] = self.test_tmpdir + os.environ.pop(_TEST_EXPECTED_DIR_ENV_VAR, None) + + def tearDown(self): + if self.old_test_tmpdir is None: + os.environ.pop(_TEST_DIR_ENV_VAR, None) + else: + os.environ[_TEST_DIR_ENV_VAR] = self.old_test_tmpdir + + if self.old_test_expected_tmpdir is None: + os.environ.pop(_TEST_EXPECTED_DIR_ENV_VAR, None) + else: + os.environ[_TEST_EXPECTED_DIR_ENV_VAR] = self.old_test_expected_tmpdir + + shutil.rmtree(self.test_tmpdir) + + def test_setup_expected_values_dir_preserves_existing_contents(self): + os.makedirs(self.expected_dir) + marker = os.path.join(self.expected_dir, "marker") + with open(marker, "w") as f: + f.write("keep") + + db_crashtest = load_db_crashtest_module() + + expected_dir = db_crashtest.setup_expected_values_dir() + + self.assertEqual(self.expected_dir, expected_dir) + self.assertTrue(os.path.exists(marker)) + + def test_prepare_expected_values_dir_resets_for_fresh_db(self): + os.makedirs(self.expected_dir) + marker = os.path.join(self.expected_dir, "marker") + with open(marker, "w") as f: + f.write("remove") + + db_crashtest = load_db_crashtest_module() + + db_crashtest.prepare_expected_values_dir(self.expected_dir, True) + + self.assertTrue(os.path.isdir(self.expected_dir)) + self.assertFalse(os.path.exists(marker)) + + +if __name__ == "__main__": + unittest.main() From 5f1605c9d844f480a594bea99536d622caa30f9d Mon Sep 17 00:00:00 2001 From: Xingbo Wang Date: Fri, 27 Mar 2026 18:22:35 -0700 Subject: [PATCH 4/6] Skip invalid cross-prefix iterator stress checks --- db_stress_tool/db_stress_test_base.cc | 15 +++++++++++++++ tools/db_crashtest.py | 7 +++++++ 2 files changed, 22 insertions(+) diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index ae57b1e33685..c91f2eec1289 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2211,6 +2211,21 @@ void StressTest::VerifyIterator( return; } + if (!ro.total_order_seek && options_.prefix_extractor != nullptr && + ro.iterate_lower_bound != nullptr) { + const SliceTransform* prefix_extractor = options_.prefix_extractor.get(); + if (!prefix_extractor->InDomain(seek_key) || + !prefix_extractor->InDomain(*ro.iterate_lower_bound) || + prefix_extractor->Transform(seek_key) != + prefix_extractor->Transform(*ro.iterate_lower_bound)) { + // ReadOptions requires the seek target and iterate_lower_bound to share + // a prefix when prefix iteration is enabled. Skip verification for this + // undefined configuration. + *diverged = true; + return; + } + } + const SliceTransform* pe = (ro.total_order_seek || ro.auto_prefix_mode) ? nullptr : options_.prefix_extractor.get(); diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index a96359e5bc9b..168b4754cd87 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -1045,6 +1045,13 @@ def finalize_and_sanitize(src_params): if dest_params.get("prefix_size") == -1: dest_params["readpercent"] += dest_params.get("prefixpercent", 20) dest_params["prefixpercent"] = 0 + elif dest_params.get("simple") and dest_params.get("test_type") == "blackbox": + # `db_stress` randomizes iterate_lower_bound independently of the seek + # target. With a configured prefix extractor this can violate + # ReadOptions' same-prefix requirement, so disable random iterator + # operations in simple blackbox mode. + dest_params["readpercent"] += dest_params.get("iterpercent", 10) + dest_params["iterpercent"] = 0 if ( dest_params.get("prefix_size") == -1 and dest_params.get("memtable_whole_key_filtering") == 0 From ed37508b72727c4d4d1bfae53491eb2ec8e6f8cc Mon Sep 17 00:00:00 2001 From: Xingbo Wang Date: Fri, 27 Mar 2026 18:22:42 -0700 Subject: [PATCH 5/6] Add trie UDI iterator regression coverage --- tools/db_crashtest_test.py | 4 ++ utilities/trie_index/trie_index_db_test.cc | 76 ++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/tools/db_crashtest_test.py b/tools/db_crashtest_test.py index 514ef0eacbff..aecad83e29e1 100644 --- a/tools/db_crashtest_test.py +++ b/tools/db_crashtest_test.py @@ -1,3 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This source code is licensed under both the GPLv2 (found in the COPYING file in the root directory) +# and the Apache 2.0 License (found in the LICENSE.Apache file in the root directory). + #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. diff --git a/utilities/trie_index/trie_index_db_test.cc b/utilities/trie_index/trie_index_db_test.cc index 1533e6fffa1e..619be18e3e23 100644 --- a/utilities/trie_index/trie_index_db_test.cc +++ b/utilities/trie_index/trie_index_db_test.cc @@ -51,6 +51,10 @@ static std::string MakeKeyBody(int k) { return key_body; } +static std::string MakeStressKey(int prefix, int middle, int suffix) { + return MakeKeyBody(prefix) + MakeKeyBody(middle) + MakeKeyBody(suffix); +} + class TrieIndexDBTest : public testing::Test { protected: void SetUp() override { @@ -1265,6 +1269,78 @@ TEST_F(TrieIndexDBTest, SameUserKeyManyVersionsSeekCorrectness) { } } +TEST_F(TrieIndexDBTest, + AutoPrefixBoundsSnapshotIteratorMatchesStandardIndexWithHiddenVersions) { + options_.compression = kNoCompression; + options_.disable_auto_compactions = true; + options_.max_sequential_skip_in_iterations = 2; + options_.prefix_extractor.reset(NewFixedPrefixTransform(8)); + ASSERT_OK(OpenDB(/*block_size=*/64)); + + auto large_value = [](char ch) { return std::string(96, ch); }; + + const std::string lower_bound = MakeStressKey(0x45, 0x12B, 0x00A3); + const std::string before = MakeStressKey(0xB3, 0x12B, 0x012E); + const std::string repeated = MakeStressKey(0xB3, 0x12B, 0x012F); + const std::string expected_1 = MakeStressKey(0xB3, 0x12B, 0x0131); + const std::string expected_2 = MakeStressKey(0xB3, 0x12B, 0x0132); + const std::string upper_bound = MakeStressKey(0x1D5, 0x12B, 0x0200); + + ASSERT_OK(db_->Put(WriteOptions(), before, large_value('a'))); + for (int version = 0; version < 8; ++version) { + ASSERT_OK(db_->Put(WriteOptions(), repeated, large_value('b' + version))); + } + ASSERT_OK(db_->Put(WriteOptions(), expected_1, large_value('m'))); + ASSERT_OK(db_->Put(WriteOptions(), expected_2, large_value('n'))); + ASSERT_OK(db_->Flush(FlushOptions())); + + const Snapshot* snapshot = db_->GetSnapshot(); + const Slice lower_bound_slice(lower_bound); + const Slice upper_bound_slice(upper_bound); + + const auto build_read_options = + [&](const UserDefinedIndexFactory* table_index_factory) { + ReadOptions ro; + ro.snapshot = snapshot; + ro.auto_prefix_mode = true; + ro.allow_unprepared_value = true; + ro.auto_refresh_iterator_with_snapshot = true; + ro.iterate_lower_bound = &lower_bound_slice; + ro.iterate_upper_bound = &upper_bound_slice; + ro.table_index_factory = table_index_factory; + return ro; + }; + + const std::vector> expected = { + {before, large_value('a')}, + {repeated, large_value('i')}, + {expected_1, large_value('m')}, + {expected_2, large_value('n')}, + }; + + const UserDefinedIndexFactory* table_index_factories[] = { + nullptr, trie_factory_.get()}; + for (const auto* table_index_factory : table_index_factories) { + SCOPED_TRACE(table_index_factory == nullptr ? "standard index" + : "trie index"); + const ReadOptions ro = build_read_options(table_index_factory); + ASSERT_EQ(ScanAllKeyValues(ro), expected); + + std::unique_ptr iter(db_->NewIterator(ro)); + iter->Seek(before); + for (const auto& [expected_key, expected_value] : expected) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), expected_key); + ASSERT_EQ(iter->value().ToString(), expected_value); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + db_->ReleaseSnapshot(snapshot); +} + // ============================================================================ // MultiGet test // ============================================================================ From f46dc8cd52bdaa674ef22ad23e5fdea6d975cf11 Mon Sep 17 00:00:00 2001 From: Xingbo Wang Date: Sat, 28 Mar 2026 07:56:51 -0700 Subject: [PATCH 6/6] Fix MSVC C4244 warning: cast char arithmetic to char in trie test --- utilities/trie_index/trie_index_db_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utilities/trie_index/trie_index_db_test.cc b/utilities/trie_index/trie_index_db_test.cc index 619be18e3e23..5f8b95ca2cf5 100644 --- a/utilities/trie_index/trie_index_db_test.cc +++ b/utilities/trie_index/trie_index_db_test.cc @@ -1288,7 +1288,8 @@ TEST_F(TrieIndexDBTest, ASSERT_OK(db_->Put(WriteOptions(), before, large_value('a'))); for (int version = 0; version < 8; ++version) { - ASSERT_OK(db_->Put(WriteOptions(), repeated, large_value('b' + version))); + ASSERT_OK(db_->Put(WriteOptions(), repeated, + large_value(static_cast('b' + version)))); } ASSERT_OK(db_->Put(WriteOptions(), expected_1, large_value('m'))); ASSERT_OK(db_->Put(WriteOptions(), expected_2, large_value('n')));