From b6834170b87cc336841c00b5d21ba467ed1704ad Mon Sep 17 00:00:00 2001 From: Keita Iwabuchi Date: Thu, 5 Mar 2026 11:42:32 -0800 Subject: [PATCH 1/3] Use multi-bucket sets in string_store --- include/string_table/string_store.hpp | 181 ++++++++++++++++++++---- test/string_table/test_string_store.cpp | 18 +++ 2 files changed, 173 insertions(+), 26 deletions(-) diff --git a/include/string_table/string_store.hpp b/include/string_table/string_store.hpp index cc70ca6..5dd029c 100644 --- a/include/string_table/string_store.hpp +++ b/include/string_table/string_store.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -15,11 +16,17 @@ #include #include +#include #include #include +#include #include "string_table/string_accessor.hpp" +#ifndef METALLDATA_STRING_TABLE_NUM_BUCKETS +#define METALLDATA_STRING_TABLE_NUM_BUCKETS 1024 +#endif + namespace compact_string { namespace csdtl { /// \brief Allocate a string and embed the length of the string in front of the @@ -81,6 +88,8 @@ class string_store { class str_holder { public: + using value_type = char; + str_holder() = default; /// \brief Construct a string holder from a pointer to the string data. @@ -100,6 +109,8 @@ class string_store { return std::to_address(&m_ptr[sizeof(size_type)]); } + const char *c_str() const { return str(); } + size_type length() const { // First entry is the length return reinterpret_cast(std::to_address(m_ptr))[0]; @@ -136,18 +147,24 @@ class string_store { } }; - // Hash function for basic_string_view - struct set_hasher { + template + struct str_hasher { using is_transparent = void; std::size_t operator()(const str_holder &str) const { - return boost::hash_range(str.str(), str.str() + str.length()); + return metall::mtlldetail::MurmurHash64A(str.c_str(), str.length(), + hash_seed); } std::size_t operator()(const std::string_view &str) const { - return boost::hash_range(str.data(), str.data() + str.length()); + return metall::mtlldetail::MurmurHash64A(str.data(), str.length(), + hash_seed); } }; + // Hash function for basic_string_view + static constexpr unsigned int k_string_set_hash_seed = 0xd1340ca; + using set_hasher = str_hasher; + struct set_equal { using is_transparent = void; bool operator()(const str_holder &left, const str_holder &right) const { @@ -170,13 +187,26 @@ class string_store { }; using set_allocator_type = other_scoped_allocator; - using set_type = boost::unordered_flat_set; + using string_set_type = + boost::unordered_flat_set; + + static constexpr size_t k_num_string_sets = + METALLDATA_STRING_TABLE_NUM_BUCKETS; + using string_set_vector_allocator_type = + other_scoped_allocator; + using string_sets_table_type = + std::vector; + static constexpr unsigned int k_bucket_hash_seed = 0xb47a63bc; + static_assert(k_bucket_hash_seed != k_string_set_hash_seed, + "k_bucket_hash_seed and k_set_hash_seed must be different"); + + class const_sets_iterator; public: /// \brief Get the length of the string - /// \param str A pointer to actual string data (not the address that points to - /// the length data). + /// \param str A pointer to actual string data (not the address that points + /// to the length data). static constexpr size_t str_length(const char *const str) { const auto *ptr = reinterpret_cast(str); return *(ptr - 1); @@ -184,7 +214,8 @@ class string_store { string_store() = default; - explicit string_store(allocator_type allocator) : m_set(allocator) {} + explicit string_store(allocator_type allocator) + : m_str_sets_table(k_num_string_sets, allocator) {} string_store(const string_store &) = delete; string_store(string_store &&) noexcept = default; @@ -195,14 +226,15 @@ class string_store { /// Return the pointer to the string data if the string is found in the store. const char *find_or_add(std::string_view str) { - auto itr = m_set.find(str); - if (itr != m_set.end()) { + auto &set = m_str_sets_table[priv_str_set_no(str)]; + auto itr = set.find(str); + if (itr != set.end()) { // Found in the store return std::to_address(itr->str()); } // Not found, add it char *len_str_buf = priv_allocate_string(str); - auto ret = m_set.emplace(len_str_buf); + auto ret = set.emplace(len_str_buf); assert(ret.second); // must be inserted const auto &str_holder = *(ret.first); assert(str_holder.length() == str.length()); @@ -212,31 +244,56 @@ class string_store { } const char *find(std::string_view str) const { - auto itr = m_set.find(str); - if (itr == m_set.end()) { + auto &set = m_str_sets_table[priv_str_set_no(str)]; + auto itr = set.find(str); + if (itr == set.end()) { return nullptr; } return std::to_address(itr->str()); } - std::size_t size() const { return m_set.size(); } + std::size_t size() const { + size_t size = 0; + for (const auto &set : m_str_sets_table) { + size += set.size(); + } + return size; + } - typename set_type::const_iterator begin() const { return m_set.begin(); } - typename set_type::const_iterator end() const { return m_set.end(); } + const_sets_iterator begin() const { + return const_sets_iterator(m_str_sets_table.begin(), m_str_sets_table.end(), + false); + } + const_sets_iterator end() const { + return const_sets_iterator(m_str_sets_table.end(), m_str_sets_table.end(), + true); + } void clear() { - for (auto &item : m_set) { - priv_deallocate_string(item); + for (auto &set : m_str_sets_table) { + for (auto &item : set) { + priv_deallocate_string(item); + } + set.clear(); } - m_set.clear(); } - allocator_type get_allocator() const { return m_set.get_allocator(); } + allocator_type get_allocator() const { + return m_str_sets_table.at(0).get_allocator(); + } private: + static int priv_str_set_no(const std::string_view &str) { + if constexpr (k_num_string_sets == 1) { + return 0; + } else { + return str_hasher{}(str) % k_num_string_sets; + } + } + char *priv_allocate_string(const std::string_view &str) { - return csdtl::allocate_string_embedding_length( - str, m_set.get_allocator()); + return csdtl::allocate_string_embedding_length(str, + get_allocator()); } void priv_deallocate_string(const std::string_view &str) { @@ -246,11 +303,83 @@ class string_store { "allocator_type::value_type must be the same as char"); std::allocator_traits::deallocate( - m_set.get_allocator(), const_cast(str.data()), + get_allocator(), const_cast(str.data()) - sizeof(size_type), sizeof(size_type) + str.size() + 1); } - set_type m_set; + void priv_deallocate_string(const str_holder &str) { + priv_deallocate_string(std::string_view(str.str(), str.length())); + } + + string_sets_table_type m_str_sets_table; +}; + +template +class string_store::const_sets_iterator { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = string_store::str_holder; + using difference_type = std::ptrdiff_t; + using pointer = const value_type *; + using reference = const value_type &; + + const_sets_iterator( + typename string_store::string_sets_table_type::const_iterator + set_itr, + typename string_store::string_sets_table_type::const_iterator + set_end_itr, + bool is_end) + : m_set_itr(set_itr), m_set_end_itr(set_end_itr) { + if (is_end) { + m_set_itr = m_set_end_itr; + } else { + // Move to the first non-empty set + while (m_set_itr != m_set_end_itr && (*m_set_itr).empty()) { + ++m_set_itr; + } + } + if (m_set_itr != m_set_end_itr) { + m_item_itr = (*m_set_itr).begin(); + } + } + + reference operator*() const { return *m_item_itr; } + + pointer operator->() const { return &operator*(); } + + const_sets_iterator &operator++() { + ++m_item_itr; + // If we reach the end of the current set, move to the next non-empty set + while (m_set_itr != m_set_end_itr && m_item_itr == (*m_set_itr).end()) { + ++m_set_itr; + if (m_set_itr != m_set_end_itr) { + m_item_itr = (*m_set_itr).begin(); + } + } + return *this; + } + + const_sets_iterator operator++(int) { + const_sets_iterator temp = *this; + ++(*this); + return temp; + } + + bool operator==(const const_sets_iterator &other) const { + return m_set_itr == other.m_set_itr && + (m_set_itr == m_set_end_itr || m_item_itr == other.m_item_itr); + } + + bool operator!=(const const_sets_iterator &other) const { + return !(*this == other); + } + + private: + typename string_store::string_sets_table_type::const_iterator + m_set_itr; + typename string_store::string_sets_table_type::const_iterator + m_set_end_itr; + typename string_store::string_set_type::const_iterator m_item_itr; }; /// \brief Helper function to add a string to the string store @@ -282,4 +411,4 @@ std::optional find_string(std::string_view sv, return {}; } -} // namespace compact_string +} // namespace compact_string \ No newline at end of file diff --git a/test/string_table/test_string_store.cpp b/test/string_table/test_string_store.cpp index 0a3e9e8..f68b162 100644 --- a/test/string_table/test_string_store.cpp +++ b/test/string_table/test_string_store.cpp @@ -5,6 +5,7 @@ #include +#include #include #include @@ -48,6 +49,23 @@ TEST(StringTableTest, Basic) { EXPECT_EQ(store->find("key1"), *ptr1); EXPECT_EQ(store->size(), size_t(2)); + + int count = 0; + bool found_key0 = false; + bool found_key1 = false; + for (const auto &item : *store) { + EXPECT_TRUE(std::strcmp(item.c_str(), "key0") == 0 || + std::strcmp(item.c_str(), "key1") == 0); + if (std::strcmp(item.c_str(), "key0") == 0) { + found_key0 = true; + } else if (std::strcmp(item.c_str(), "key1") == 0) { + found_key1 = true; + } + ++count; + } + EXPECT_EQ(count, 2); + EXPECT_TRUE(found_key0); + EXPECT_TRUE(found_key1); } } From aaa289941b54a4e80c3926573b0106792801df3d Mon Sep 17 00:00:00 2001 From: Keita Iwabuchi Date: Thu, 5 Mar 2026 11:49:26 -0800 Subject: [PATCH 2/3] Use boost::vector in string_store --- include/string_table/string_store.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/string_table/string_store.hpp b/include/string_table/string_store.hpp index 5dd029c..5772607 100644 --- a/include/string_table/string_store.hpp +++ b/include/string_table/string_store.hpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -196,7 +195,7 @@ class string_store { using string_set_vector_allocator_type = other_scoped_allocator; using string_sets_table_type = - std::vector; + boost::container::vector; static constexpr unsigned int k_bucket_hash_seed = 0xb47a63bc; static_assert(k_bucket_hash_seed != k_string_set_hash_seed, "k_bucket_hash_seed and k_set_hash_seed must be different"); From fdbfd2f5b6764cc8ffefd47d4336a79a8a36b8f2 Mon Sep 17 00:00:00 2001 From: Keita Iwabuchi Date: Thu, 5 Mar 2026 13:48:22 -0800 Subject: [PATCH 3/3] Bugfix in string_store --- include/string_table/string_store.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/string_table/string_store.hpp b/include/string_table/string_store.hpp index 5772607..5afec59 100644 --- a/include/string_table/string_store.hpp +++ b/include/string_table/string_store.hpp @@ -211,7 +211,7 @@ class string_store { return *(ptr - 1); } - string_store() = default; + string_store() : m_str_sets_table(k_num_string_sets) {} explicit string_store(allocator_type allocator) : m_str_sets_table(k_num_string_sets, allocator) {} @@ -225,7 +225,9 @@ class string_store { /// Return the pointer to the string data if the string is found in the store. const char *find_or_add(std::string_view str) { - auto &set = m_str_sets_table[priv_str_set_no(str)]; + const auto set_no = priv_str_set_no(str); + assert(m_str_sets_table.size() > set_no); + auto &set = m_str_sets_table[set_no]; auto itr = set.find(str); if (itr != set.end()) { // Found in the store