From e4899105ea5592f14e1763db8bd477588c8764d6 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Mon, 16 Mar 2026 17:04:23 +0800 Subject: [PATCH 01/24] fix default string support --- include/neug/utils/mmap_array.h | 35 ++++++++++++++++++------- include/neug/utils/property/column.h | 38 ++++++++++++++++++++++++++++ src/storages/graph/property_graph.cc | 31 ++++++++++++++++++----- tests/utils/test_table.cc | 31 +++++++++++++++++++++++ tools/python_bind/tests/test_ddl.py | 33 ++++++++++++++++++++++++ 5 files changed, 152 insertions(+), 16 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 969001a0..f0217e45 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -469,10 +469,17 @@ class mmap_array { }; struct string_item { - uint64_t offset : 48; - uint32_t length : 16; + uint64_t offset : 47; + uint64_t length : 16; + uint64_t inserted : 1; // indicates whether the item is inserted or empty + string_item() : offset(0), length(0), inserted(0) {} + string_item(uint64_t offset, uint64_t length, uint64_t inserted) + : offset(offset), length(length), inserted(inserted) {} }; +static_assert(sizeof(string_item) == sizeof(uint64_t), + "string_item must stay 64-bit wide"); + template <> class mmap_array { public: @@ -534,9 +541,10 @@ class mmap_array { size_t total_length = 0; size_t non_zero_count = 0; for (size_t i = 0; i < items_.size(); ++i) { - if (items_.get(i).length > 0) { + const auto item = items_.get(i); + if (item.inserted && item.length > 0) { ++non_zero_count; - total_length += items_.get(i).length; + total_length += item.length; } } return non_zero_count > 0 @@ -545,13 +553,19 @@ class mmap_array { } void set(size_t idx, size_t offset, const std::string_view& val) { - items_.set(idx, {offset, static_cast(val.size())}); + items_.set(idx, {static_cast(offset), + static_cast(val.size()), 1}); assert(data_.data() + offset + val.size() <= data_.data() + data_.size()); memcpy(data_.data() + offset, val.data(), val.size()); } + bool inserted(size_t idx) const { return items_.get(idx).inserted == 1; } + std::string_view get(size_t idx) const { const string_item& item = items_.get(idx); + if (!item.inserted) { + return std::string_view{"", 0}; + } return std::string_view(data_.data() + item.offset, item.length); } @@ -603,7 +617,7 @@ class mmap_array { limit_offset = std::max(limit_offset, entry.offset + entry.length); memcpy(dst, src, entry.length); items_.set(entry.index, - {static_cast(write_offset), entry.length}); + {static_cast(write_offset), entry.length, 1}); write_offset += entry.length; } assert(write_offset == plan.total_size); @@ -630,8 +644,11 @@ class mmap_array { plan.entries.reserve(items_.size()); for (size_t i = 0; i < items_.size(); ++i) { const string_item& item = items_.get(i); - plan.total_size += item.length; - plan.entries.push_back({i, item.offset, item.length}); + if (item.inserted) { + plan.total_size += item.length; + plan.entries.push_back( + {i, item.offset, static_cast(item.length)}); + } } return plan; } @@ -662,7 +679,7 @@ class mmap_array { } } items_.set(entry.index, - {static_cast(write_offset), entry.length}); + {static_cast(write_offset), entry.length, 1}); write_offset += entry.length; } assert(write_offset == plan.total_size); diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 1566b4d5..293a4036 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -406,6 +406,9 @@ class TypedColumn : public ColumnBase { void set_value_safe(size_t idx, const std::string_view& value); inline std::string_view get_view(size_t idx) const { + if (!buffer_.inserted(idx)) { + return default_value_; + } return buffer_.get(idx); } @@ -433,6 +436,8 @@ class TypedColumn : public ColumnBase { buffer_.ensure_writable(work_dir); } + std::string_view default_value() const { return default_value_; } + private: inline void init_pos(const std::string& file_path) { if (std::filesystem::exists(file_path)) { @@ -500,6 +505,39 @@ class TypedRefColumn : public RefColumnBase { size_t basic_size; }; +template <> +class TypedRefColumn : public RefColumnBase { + public: + using value_type = std::string_view; + + explicit TypedRefColumn(const StringColumn& column) + : basic_buffer(column.buffer()), + basic_size(column.buffer_size()), + default_value_(column.default_value()) {} + ~TypedRefColumn() {} + + inline std::string_view get_view(size_t index) const { + assert(index < basic_size); + if (!basic_buffer.inserted(index)) { + return default_value_; + } + return basic_buffer.get(index); + } + + Property get(size_t index) const override { + return PropUtils::to_prop(get_view(index)); + } + + DataTypeId type() const override { return DataTypeId::kVarchar; } + + ColType col_type() const override { return ColType::kInternal; } + + private: + const mmap_array& basic_buffer; + size_t basic_size; + std::string_view default_value_; +}; + // Create a reference column from a ColumnBase that contains a const reference // to the actual column storage, offering a column-based store interface for // vertex properties. diff --git a/src/storages/graph/property_graph.cc b/src/storages/graph/property_graph.cc index 242e56e0..04370f96 100644 --- a/src/storages/graph/property_graph.cc +++ b/src/storages/graph/property_graph.cc @@ -154,7 +154,6 @@ Status PropertyGraph::BatchAddEdges( return neug::Status::OK(); } -// TODO(zhanglei): support extra_type_info Status PropertyGraph::CreateVertexType( const std::string& vertex_type_name, const std::vector>& properties, @@ -260,7 +259,6 @@ Status PropertyGraph::CreateVertexType( return neug::Status::OK(); } -// TODO(zhanglei): support extra_type_info Status PropertyGraph::CreateEdgeType( const std::string& src_vertex_type, const std::string& dst_vertex_type, const std::string& edge_type_name, @@ -338,7 +336,6 @@ Status PropertyGraph::CreateEdgeType( return neug::Status::OK(); } -// TODO(zhanglei): Support extra_type_info Status PropertyGraph::AddVertexProperties( const std::string& vertex_type_name, const std::vector>& @@ -377,16 +374,27 @@ Status PropertyGraph::AddVertexProperties( } add_default_property_values.emplace_back(default_value); } + label_t v_label = schema_.get_vertex_label_id(vertex_type_name); + size_t old_prop_num = + schema_.get_vertex_schema(v_label)->property_names.size(); schema_.AddVertexProperties(vertex_type_name, add_property_names, add_property_types, add_property_storages, add_default_property_values); - label_t v_label = schema_.get_vertex_label_id(vertex_type_name); + + // Use the default property values in schema for the new properties if not + // provided in the function arguments, to ensure the default values are + // consistent with the vertex schema. + std::vector schema_new_default_values; + const auto& v_prop_default_values = + schema_.get_vertex_schema(v_label)->default_property_values; + for (size_t i = old_prop_num; i < v_prop_default_values.size(); i++) { + schema_new_default_values.emplace_back(v_prop_default_values[i]); + } vertex_tables_[v_label].AddProperties(add_property_names, add_property_types, - add_default_property_values); + schema_new_default_values); return neug::Status::OK(); } -// TODO(zhanglei): Support extra_type_info Status PropertyGraph::AddEdgeProperties( const std::string& src_type_name, const std::string& dst_type_name, const std::string& edge_type_name, @@ -423,6 +431,8 @@ Status PropertyGraph::AddEdgeProperties( label_t src_label = schema_.get_vertex_label_id(src_type_name); label_t dst_label = schema_.get_vertex_label_id(dst_type_name); label_t e_label = schema_.get_edge_label_id(edge_type_name); + size_t old_prop_num = schema_.get_edge_schema(src_label, dst_label, e_label) + ->property_names.size(); schema_.AddEdgeProperties(src_type_name, dst_type_name, edge_type_name, add_property_names, add_property_types, @@ -438,9 +448,16 @@ Status PropertyGraph::AddEdgeProperties( "] does not exist, cannot add properties."); } + std::vector schema_new_default_values; + const auto& e_prop_default_values = + schema_.get_edge_schema(src_label, dst_label, e_label) + ->default_property_values; + for (size_t i = old_prop_num; i < e_prop_default_values.size(); i++) { + schema_new_default_values.emplace_back(e_prop_default_values[i]); + } auto& edge_table = edge_tables_.at(index); edge_table.AddProperties(add_property_names, add_property_types, - add_default_property_values); + schema_new_default_values); return neug::Status::OK(); } diff --git a/tests/utils/test_table.cc b/tests/utils/test_table.cc index f02c843a..6d683db7 100644 --- a/tests/utils/test_table.cc +++ b/tests/utils/test_table.cc @@ -339,5 +339,36 @@ TEST(TableTest, TestTableBasic) { EXPECT_EQ(mem_table_ref.get_column_by_id(0)->type(), DataTypeId::kBoolean); } +TEST(TableTest, StringColumnDistinguishesUnsetFromEmptyString) { + if (std::filesystem::exists(TEST_DIR)) { + std::filesystem::remove_all(TEST_DIR); + } + std::filesystem::create_directories(TEST_DIR); + std::filesystem::create_directories(std::string(TEST_DIR) + "/checkpoint"); + std::filesystem::create_directories(std::string(TEST_DIR) + "/runtime/tmp"); + + Table table; + std::vector col_name = {"string_column"}; + std::vector property_types = {{DataTypeId::kVarchar}}; + std::vector default_values = { + Property::from_string_view("default_value")}; + std::vector mem_strategies(col_name.size(), + StorageStrategy::kMem); + + table.init("test_string_validity", TEST_DIR, col_name, property_types, + default_values, mem_strategies); + table.resize(2); + + auto string_column = std::dynamic_pointer_cast( + table.get_column("string_column")); + ASSERT_NE(string_column, nullptr); + + EXPECT_EQ(string_column->get_prop(0).as_string_view(), "default_value"); + + string_column->set_prop(1, Property::from_string_view("")); + EXPECT_TRUE(string_column->get_prop(1).as_string_view().empty()); + EXPECT_EQ(string_column->get_prop(1).type(), DataTypeId::kVarchar); +} + } // namespace test } // namespace neug \ No newline at end of file diff --git a/tools/python_bind/tests/test_ddl.py b/tools/python_bind/tests/test_ddl.py index 5b780d5f..631fe83e 100644 --- a/tools/python_bind/tests/test_ddl.py +++ b/tools/python_bind/tests/test_ddl.py @@ -360,3 +360,36 @@ def test_alter_varchar_type(): assert list(res) == [[1, "Alice"]] conn.close() db.close() + + +def test_get_varchar_default_value_1(): + db_dir = "/tmp/test_get_varchar_default_value" + shutil.rmtree(db_dir, ignore_errors=True) + db = Database(db_dir, "w") + conn = db.connect() + conn.execute( + "CREATE NODE TABLE TestNode(id INT64 PRIMARY KEY, name VARCHAR(20) DEFAULT 'default_name');" + ) + conn.execute("CREATE (:TestNode {id: 1});") + conn.execute("CREATE (:TestNode {id: 2});") + conn.execute("CREATE (:TestNode {id: 3});") + res = conn.execute("Match (n:TestNode) Return n.name;") + assert list(res) == [["default_name"], ["default_name"], ["default_name"]] + conn.close() + db.close() + + +def test_get_varchar_default_value_2(): + db_dir = "/tmp/test_get_varchar_default_value" + shutil.rmtree(db_dir, ignore_errors=True) + db = Database(db_dir, "w") + conn = db.connect() + conn.execute("CREATE NODE TABLE TestNode(id INT64 PRIMARY KEY);") + conn.execute("CREATE (:TestNode {id: 1});") + conn.execute("CREATE (:TestNode {id: 2});") + conn.execute("ALTER TABLE TestNode ADD name VARCHAR(20) DEFAULT 'default_name';") + conn.execute("CREATE (:TestNode {id: 3});") + res = conn.execute("Match (n:TestNode) Return n.name;") + assert list(res) == [["default_name"], ["default_name"], ["default_name"]] + conn.close() + db.close() From 038f315cad0d372e4c854e1e964e38eb9198d7f8 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Mon, 16 Mar 2026 17:10:26 +0800 Subject: [PATCH 02/24] minor fixes --- include/neug/utils/mmap_array.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index f0217e45..1ab8f362 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -472,9 +472,6 @@ struct string_item { uint64_t offset : 47; uint64_t length : 16; uint64_t inserted : 1; // indicates whether the item is inserted or empty - string_item() : offset(0), length(0), inserted(0) {} - string_item(uint64_t offset, uint64_t length, uint64_t inserted) - : offset(offset), length(length), inserted(inserted) {} }; static_assert(sizeof(string_item) == sizeof(uint64_t), @@ -541,10 +538,9 @@ class mmap_array { size_t total_length = 0; size_t non_zero_count = 0; for (size_t i = 0; i < items_.size(); ++i) { - const auto item = items_.get(i); - if (item.inserted && item.length > 0) { + if (items_.get(i).length > 0) { ++non_zero_count; - total_length += item.length; + total_length += items_.get(i).length; } } return non_zero_count > 0 From 69e6d6fe7edb3f888d15b9a861bdd1b7433c8be1 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Mon, 16 Mar 2026 18:53:43 +0800 Subject: [PATCH 03/24] use a separate file to store validility --- include/neug/utils/mmap_array.h | 92 +++++++++++++++++++++++----- include/neug/utils/property/column.h | 8 ++- tests/utils/test_table.cc | 2 +- 3 files changed, 84 insertions(+), 18 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 1ab8f362..4b0a8d06 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -469,27 +469,27 @@ class mmap_array { }; struct string_item { - uint64_t offset : 47; + uint64_t offset : 48; uint64_t length : 16; - uint64_t inserted : 1; // indicates whether the item is inserted or empty }; -static_assert(sizeof(string_item) == sizeof(uint64_t), - "string_item must stay 64-bit wide"); - template <> class mmap_array { public: + static constexpr size_t kMaterializedBitsPerWord = sizeof(uint64_t) * 8; + mmap_array() {} mmap_array(mmap_array&& rhs) : mmap_array() { swap(rhs); } ~mmap_array() {} void reset() { + materialized_map_.reset(); items_.reset(); data_.reset(); } void set_hugepage_prefered(bool val) { + materialized_map_.set_hugepage_prefered(val); items_.set_hugepage_prefered(val); data_.set_hugepage_prefered(val); } @@ -499,14 +499,17 @@ class mmap_array { is_writable_ = is_writable; items_.open(filename + ".items", sync_to_file, is_writable); data_.open(filename + ".data", sync_to_file, is_writable); + open_materialized_map(filename, sync_to_file, is_writable); } void open_with_hugepages(const std::string& filename) { items_.open_with_hugepages(filename + ".items"); data_.open_with_hugepages(filename + ".data"); + open_materialized_map_with_hugepages(filename); } void touch(const std::string& filename) { + materialized_map_.touch(filename + ".materialized"); items_.touch(filename + ".items"); data_.touch(filename + ".data"); } @@ -517,16 +520,19 @@ class mmap_array { bool should_stream = !data_.is_sync_to_file() && plan.total_size < data_.size(); if (should_stream) { - stream_compact_and_dump(plan, filename + ".data", filename + ".items"); + stream_compact_and_dump(plan, filename + ".data", filename + ".items", + filename + ".materialized"); return; } compact(); + materialized_map_.dump(filename + ".materialized"); items_.dump(filename + ".items"); data_.dump(filename + ".data"); } void resize(size_t size, size_t data_size) { + materialized_map_.resize(materialized_word_num(size)); items_.resize(size); data_.resize(data_size); } @@ -538,7 +544,7 @@ class mmap_array { size_t total_length = 0; size_t non_zero_count = 0; for (size_t i = 0; i < items_.size(); ++i) { - if (items_.get(i).length > 0) { + if (is_materialized(i) && items_.get(i).length > 0) { ++non_zero_count; total_length += items_.get(i).length; } @@ -550,18 +556,26 @@ class mmap_array { void set(size_t idx, size_t offset, const std::string_view& val) { items_.set(idx, {static_cast(offset), - static_cast(val.size()), 1}); + static_cast(val.size())}); + set_materialized(idx, true); assert(data_.data() + offset + val.size() <= data_.data() + data_.size()); memcpy(data_.data() + offset, val.data(), val.size()); } - bool inserted(size_t idx) const { return items_.get(idx).inserted == 1; } + bool is_materialized(size_t idx) const { + size_t word_idx = idx / kMaterializedBitsPerWord; + size_t bit_idx = idx % kMaterializedBitsPerWord; + if (word_idx >= materialized_map_.size()) { + return false; + } + return (materialized_map_.get(word_idx) >> bit_idx) & 1ULL; + } std::string_view get(size_t idx) const { - const string_item& item = items_.get(idx); - if (!item.inserted) { + if (!is_materialized(idx)) { return std::string_view{"", 0}; } + const string_item& item = items_.get(idx); return std::string_view(data_.data() + item.offset, item.length); } @@ -570,11 +584,13 @@ class mmap_array { size_t data_size() const { return data_.size(); } void swap(mmap_array& rhs) { + materialized_map_.swap(rhs.materialized_map_); items_.swap(rhs.items_); data_.swap(rhs.data_); } void set_writable(bool is_writable) { + materialized_map_.set_writable(is_writable); items_.set_writable(is_writable); data_.set_writable(is_writable); is_writable_ = is_writable; @@ -584,6 +600,7 @@ class mmap_array { if (is_writable_) { return; } + materialized_map_.ensure_writable(work_dir); items_.ensure_writable(work_dir); data_.ensure_writable(work_dir); is_writable_ = true; @@ -613,7 +630,7 @@ class mmap_array { limit_offset = std::max(limit_offset, entry.offset + entry.length); memcpy(dst, src, entry.length); items_.set(entry.index, - {static_cast(write_offset), entry.length, 1}); + {static_cast(write_offset), entry.length}); write_offset += entry.length; } assert(write_offset == plan.total_size); @@ -640,7 +657,7 @@ class mmap_array { plan.entries.reserve(items_.size()); for (size_t i = 0; i < items_.size(); ++i) { const string_item& item = items_.get(i); - if (item.inserted) { + if (is_materialized(i)) { plan.total_size += item.length; plan.entries.push_back( {i, item.offset, static_cast(item.length)}); @@ -651,7 +668,8 @@ class mmap_array { void stream_compact_and_dump(const CompactionPlan& plan, const std::string& data_filename, - const std::string& items_filename) { + const std::string& items_filename, + const std::string& materialized_filename) { size_t size_before_compact = data_.size(); FILE* fout = fopen(data_filename.c_str(), "wb"); if (fout == NULL) { @@ -675,7 +693,7 @@ class mmap_array { } } items_.set(entry.index, - {static_cast(write_offset), entry.length, 1}); + {static_cast(write_offset), entry.length}); write_offset += entry.length; } assert(write_offset == plan.total_size); @@ -725,9 +743,53 @@ class mmap_array { VLOG(1) << "Compaction completed. New data size: " << plan.total_size << ", old data size: " << size_before_compact; + materialized_map_.dump(materialized_filename); items_.dump(items_filename); } + void set_materialized(size_t idx, bool materialized) { + size_t word_idx = idx / kMaterializedBitsPerWord; + size_t bit_idx = idx % kMaterializedBitsPerWord; + uint64_t word = materialized_map_.get(word_idx); + if (materialized) { + word |= (1ULL << bit_idx); + } else { + word &= ~(1ULL << bit_idx); + } + materialized_map_.set(word_idx, word); + } + + static size_t materialized_word_num(size_t size) { + return (size + kMaterializedBitsPerWord - 1) / kMaterializedBitsPerWord; + } + + void open_materialized_map(const std::string& filename, bool sync_to_file, + bool is_writable) { + const auto materialized_file = filename + ".materialized"; + materialized_map_.open(materialized_file, sync_to_file, is_writable); + validate_materialized_map_size(materialized_file); + } + + void open_materialized_map_with_hugepages(const std::string& filename) { + const auto materialized_file = filename + ".materialized"; + materialized_map_.open_with_hugepages(materialized_file); + validate_materialized_map_size(materialized_file); + } + + void validate_materialized_map_size( + const std::string& materialized_file) const { + const auto expected_size = materialized_word_num(items_.size()); + if (materialized_map_.size() != expected_size) { + std::stringstream ss; + ss << "Invalid string materialized map file [ " << materialized_file + << " ], expected " << expected_size << " words, got " + << materialized_map_.size(); + LOG(ERROR) << ss.str(); + THROW_RUNTIME_ERROR(ss.str()); + } + } + + mmap_array materialized_map_; mmap_array items_; mmap_array data_; bool is_writable_ = true; diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 293a4036..9ef6bcc1 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -343,6 +343,7 @@ class TypedColumn : public ColumnBase { } copy_file(cur_path + ".data", tmp_path + ".data"); copy_file(cur_path + ".items", tmp_path + ".items"); + copy_file(cur_path + ".materialized", tmp_path + ".materialized"); copy_file(cur_path + ".pos", tmp_path + ".pos"); buffer_.reset(); @@ -406,7 +407,7 @@ class TypedColumn : public ColumnBase { void set_value_safe(size_t idx, const std::string_view& value); inline std::string_view get_view(size_t idx) const { - if (!buffer_.inserted(idx)) { + if (!buffer_.is_materialized(idx)) { return default_value_; } return buffer_.get(idx); @@ -518,7 +519,7 @@ class TypedRefColumn : public RefColumnBase { inline std::string_view get_view(size_t index) const { assert(index < basic_size); - if (!basic_buffer.inserted(index)) { + if (!basic_buffer.is_materialized(index)) { return default_value_; } return basic_buffer.get(index); @@ -535,6 +536,9 @@ class TypedRefColumn : public RefColumnBase { private: const mmap_array& basic_buffer; size_t basic_size; + // NOTE: default_value_ is a non-owning view. The pointed-to string data is + // owned by the schema's default_property_strings and must outlive this + // object. std::string_view default_value_; }; diff --git a/tests/utils/test_table.cc b/tests/utils/test_table.cc index 6d683db7..d896c160 100644 --- a/tests/utils/test_table.cc +++ b/tests/utils/test_table.cc @@ -371,4 +371,4 @@ TEST(TableTest, StringColumnDistinguishesUnsetFromEmptyString) { } } // namespace test -} // namespace neug \ No newline at end of file +} // namespace neug From e75485adb27bb843abc99bba7d5a3036b7d38527 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Mon, 16 Mar 2026 18:55:07 +0800 Subject: [PATCH 04/24] revert to uint32_t --- include/neug/utils/mmap_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 4b0a8d06..7da0fc72 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -470,7 +470,7 @@ class mmap_array { struct string_item { uint64_t offset : 48; - uint64_t length : 16; + uint32_t length : 16; }; template <> From c788bf993d295820737a63fd15561b8a92b63c7f Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Mon, 16 Mar 2026 19:09:37 +0800 Subject: [PATCH 05/24] fix --- include/neug/utils/mmap_array.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 7da0fc72..06696f28 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -556,7 +556,7 @@ class mmap_array { void set(size_t idx, size_t offset, const std::string_view& val) { items_.set(idx, {static_cast(offset), - static_cast(val.size())}); + static_cast(val.size())}); set_materialized(idx, true); assert(data_.data() + offset + val.size() <= data_.data() + data_.size()); memcpy(data_.data() + offset, val.data(), val.size()); @@ -629,8 +629,8 @@ class mmap_array { char* dst = temp_buf.data() + write_offset; limit_offset = std::max(limit_offset, entry.offset + entry.length); memcpy(dst, src, entry.length); - items_.set(entry.index, - {static_cast(write_offset), entry.length}); + items_.set(entry.index, {static_cast(write_offset), + static_cast(entry.length)}); write_offset += entry.length; } assert(write_offset == plan.total_size); @@ -692,8 +692,8 @@ class mmap_array { THROW_RUNTIME_ERROR(ss.str()); } } - items_.set(entry.index, - {static_cast(write_offset), entry.length}); + items_.set(entry.index, {static_cast(write_offset), + static_cast(entry.length)}); write_offset += entry.length; } assert(write_offset == plan.total_size); @@ -750,6 +750,10 @@ class mmap_array { void set_materialized(size_t idx, bool materialized) { size_t word_idx = idx / kMaterializedBitsPerWord; size_t bit_idx = idx % kMaterializedBitsPerWord; + if (word_idx >= materialized_map_.size()) { + THROW_RUNTIME_ERROR("Materialized map index out of range"); + return; + } uint64_t word = materialized_map_.get(word_idx); if (materialized) { word |= (1ULL << bit_idx); From 8212b34743b117e1467c001f4e974436da5d25cc Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Mon, 16 Mar 2026 19:38:59 +0800 Subject: [PATCH 06/24] ensure backward compatibility --- include/neug/utils/mmap_array.h | 21 +++++++++++++-------- include/neug/utils/property/column.h | 4 +++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 06696f28..bfd7a05b 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -780,16 +780,21 @@ class mmap_array { validate_materialized_map_size(materialized_file); } - void validate_materialized_map_size( - const std::string& materialized_file) const { + void validate_materialized_map_size(const std::string& materialized_file) { const auto expected_size = materialized_word_num(items_.size()); + // Is this backward compatibility logic necessary? if (materialized_map_.size() != expected_size) { - std::stringstream ss; - ss << "Invalid string materialized map file [ " << materialized_file - << " ], expected " << expected_size << " words, got " - << materialized_map_.size(); - LOG(ERROR) << ss.str(); - THROW_RUNTIME_ERROR(ss.str()); + LOG(WARNING) << "Invalid string materialized map file [ " + << materialized_file << " ], expected " << expected_size + << " words, got " << materialized_map_.size() + << ", try to adapt it"; + materialized_map_.resize(expected_size); + for (size_t i = 0; i < materialized_map_.size(); ++i) { + if (items_.get(i).length > 0) { + set_materialized(i, true); + } + } + return; } } diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 9ef6bcc1..3ff2d200 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -343,7 +343,9 @@ class TypedColumn : public ColumnBase { } copy_file(cur_path + ".data", tmp_path + ".data"); copy_file(cur_path + ".items", tmp_path + ".items"); - copy_file(cur_path + ".materialized", tmp_path + ".materialized"); + if (std::filesystem::exists(cur_path + ".materialized")) { + copy_file(cur_path + ".materialized", tmp_path + ".materialized"); + } copy_file(cur_path + ".pos", tmp_path + ".pos"); buffer_.reset(); From cef540c338dfdbba85283ae66b7e66ac6e97a8af Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Tue, 17 Mar 2026 08:53:19 +0800 Subject: [PATCH 07/24] Update include/neug/utils/mmap_array.h Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- include/neug/utils/mmap_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index bfd7a05b..4cc2b2c8 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -789,7 +789,7 @@ class mmap_array { << " words, got " << materialized_map_.size() << ", try to adapt it"; materialized_map_.resize(expected_size); - for (size_t i = 0; i < materialized_map_.size(); ++i) { + for (size_t i = 0; i < items_.size(); ++i) { if (items_.get(i).length > 0) { set_materialized(i, true); } From 316c9af14c1bef9e8c2a1a92056d8f96ca3186d9 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Tue, 17 Mar 2026 08:56:57 +0800 Subject: [PATCH 08/24] minor fix --- include/neug/utils/mmap_array.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index bfd7a05b..b630ee96 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -784,6 +784,11 @@ class mmap_array { const auto expected_size = materialized_word_num(items_.size()); // Is this backward compatibility logic necessary? if (materialized_map_.size() != expected_size) { + if (!is_writable_) { + LOG(WARNING) << "Materialized map size mismatch in read-only mode; " + "default values will be used for all unset entries."; + return; + } LOG(WARNING) << "Invalid string materialized map file [ " << materialized_file << " ], expected " << expected_size << " words, got " << materialized_map_.size() From ba0167ab4041cb2ac632e27aacdcc9ab442eae72 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Tue, 17 Mar 2026 10:08:50 +0800 Subject: [PATCH 09/24] use mmap_array to store materialized info --- include/neug/utils/mmap_array.h | 56 ++++++----------------------- tools/python_bind/tests/test_ddl.py | 4 +-- 2 files changed, 12 insertions(+), 48 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 54375aa4..31471171 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -476,8 +476,6 @@ struct string_item { template <> class mmap_array { public: - static constexpr size_t kMaterializedBitsPerWord = sizeof(uint64_t) * 8; - mmap_array() {} mmap_array(mmap_array&& rhs) : mmap_array() { swap(rhs); } ~mmap_array() {} @@ -503,6 +501,7 @@ class mmap_array { } void open_with_hugepages(const std::string& filename) { + is_writable_ = true; items_.open_with_hugepages(filename + ".items"); data_.open_with_hugepages(filename + ".data"); open_materialized_map_with_hugepages(filename); @@ -532,7 +531,7 @@ class mmap_array { } void resize(size_t size, size_t data_size) { - materialized_map_.resize(materialized_word_num(size)); + materialized_map_.resize(size); items_.resize(size); data_.resize(data_size); } @@ -557,18 +556,16 @@ class mmap_array { void set(size_t idx, size_t offset, const std::string_view& val) { items_.set(idx, {static_cast(offset), static_cast(val.size())}); - set_materialized(idx, true); assert(data_.data() + offset + val.size() <= data_.data() + data_.size()); memcpy(data_.data() + offset, val.data(), val.size()); + materialized_map_.set(idx, 1); } bool is_materialized(size_t idx) const { - size_t word_idx = idx / kMaterializedBitsPerWord; - size_t bit_idx = idx % kMaterializedBitsPerWord; - if (word_idx >= materialized_map_.size()) { + if (idx >= materialized_map_.size()) { return false; } - return (materialized_map_.get(word_idx) >> bit_idx) & 1ULL; + return materialized_map_.get(idx) != 0; } std::string_view get(size_t idx) const { @@ -747,26 +744,6 @@ class mmap_array { items_.dump(items_filename); } - void set_materialized(size_t idx, bool materialized) { - size_t word_idx = idx / kMaterializedBitsPerWord; - size_t bit_idx = idx % kMaterializedBitsPerWord; - if (word_idx >= materialized_map_.size()) { - THROW_RUNTIME_ERROR("Materialized map index out of range"); - return; - } - uint64_t word = materialized_map_.get(word_idx); - if (materialized) { - word |= (1ULL << bit_idx); - } else { - word &= ~(1ULL << bit_idx); - } - materialized_map_.set(word_idx, word); - } - - static size_t materialized_word_num(size_t size) { - return (size + kMaterializedBitsPerWord - 1) / kMaterializedBitsPerWord; - } - void open_materialized_map(const std::string& filename, bool sync_to_file, bool is_writable) { const auto materialized_file = filename + ".materialized"; @@ -781,29 +758,16 @@ class mmap_array { } void validate_materialized_map_size(const std::string& materialized_file) { - const auto expected_size = materialized_word_num(items_.size()); + const auto expected_size = items_.size(); // Is this backward compatibility logic necessary? if (materialized_map_.size() != expected_size) { - if (!is_writable_) { - LOG(WARNING) << "Materialized map size mismatch in read-only mode; " - "default values will be used for all unset entries."; - return; - } - LOG(WARNING) << "Invalid string materialized map file [ " - << materialized_file << " ], expected " << expected_size - << " words, got " << materialized_map_.size() - << ", try to adapt it"; - materialized_map_.resize(expected_size); - for (size_t i = 0; i < items_.size(); ++i) { - if (items_.get(i).length > 0) { - set_materialized(i, true); - } - } - return; + THROW_RUNTIME_ERROR( + "Materialized map size does not match items size for file: " + + materialized_file + ", try reloading the graph"); } } - mmap_array materialized_map_; + mmap_array materialized_map_; mmap_array items_; mmap_array data_; bool is_writable_ = true; diff --git a/tools/python_bind/tests/test_ddl.py b/tools/python_bind/tests/test_ddl.py index 631fe83e..def1e183 100644 --- a/tools/python_bind/tests/test_ddl.py +++ b/tools/python_bind/tests/test_ddl.py @@ -363,7 +363,7 @@ def test_alter_varchar_type(): def test_get_varchar_default_value_1(): - db_dir = "/tmp/test_get_varchar_default_value" + db_dir = "/tmp/test_get_varchar_default_value_1" shutil.rmtree(db_dir, ignore_errors=True) db = Database(db_dir, "w") conn = db.connect() @@ -380,7 +380,7 @@ def test_get_varchar_default_value_1(): def test_get_varchar_default_value_2(): - db_dir = "/tmp/test_get_varchar_default_value" + db_dir = "/tmp/test_get_varchar_default_value_2" shutil.rmtree(db_dir, ignore_errors=True) db = Database(db_dir, "w") conn = db.connect() From 00550ce4afe39b00d048644ab54876a4c8c2f7f8 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Tue, 17 Mar 2026 15:09:24 +0800 Subject: [PATCH 10/24] store default value in data_ and let multiple items point to it --- include/neug/utils/mmap_array.h | 249 ++++++++++++++++++++------- include/neug/utils/property/column.h | 50 +++--- 2 files changed, 206 insertions(+), 93 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 90de723b..8577f7f2 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include "glog/logging.h" #include "neug/storages/file_names.h" #include "neug/utils/exception/exception.h" +#include "neug/utils/file_utils.h" #ifdef __ia64__ #define ADDR (void*) (0x8000000000000000UL) @@ -475,19 +477,62 @@ struct string_item { template <> class mmap_array { + private: + struct CompactionPlan { + struct Entry { + size_t index; + uint64_t offset; + uint32_t length; + }; + std::vector entries; + size_t total_size = 0; + }; + public: - mmap_array() {} + mmap_array() : mmap_array("") {} + explicit mmap_array(const std::string_view& default_value) + : default_value_(default_value), default_item_{0, 0} {} mmap_array(mmap_array&& rhs) : mmap_array() { swap(rhs); } ~mmap_array() {} void reset() { - materialized_map_.reset(); + default_value_.clear(); + default_item_ = {0, 0}; items_.reset(); data_.reset(); + is_writable_ = true; + } + + std::string_view default_value() const { return default_value_; } + + bool has_default_item() const { + return default_item_.offset != 0 || default_item_.length != 0 || + default_value_.empty(); + } + + size_t ensure_default_item(size_t offset) { + if (has_default_item()) { + return static_cast(default_item_.offset + default_item_.length); + } + if (offset + default_value_.size() > data_.size()) { + THROW_RUNTIME_ERROR("Not enough space for varchar default value"); + } + if (!default_value_.empty()) { + memcpy(data_.data() + offset, default_value_.data(), + default_value_.size()); + } + default_item_ = {static_cast(offset), + static_cast(default_value_.size())}; + return offset + default_value_.size(); + } + + void fill_default_items(size_t begin, size_t end) { + for (size_t i = begin; i < end; ++i) { + items_.set(i, default_item_); + } } void set_hugepage_prefered(bool val) { - materialized_map_.set_hugepage_prefered(val); items_.set_hugepage_prefered(val); data_.set_hugepage_prefered(val); } @@ -497,20 +542,14 @@ class mmap_array { is_writable_ = is_writable; items_.open(filename + ".items", sync_to_file, is_writable); data_.open(filename + ".data", sync_to_file, is_writable); - open_materialized_map(filename, sync_to_file, is_writable); + load_meta(filename + ".meta"); } void open_with_hugepages(const std::string& filename) { is_writable_ = true; items_.open_with_hugepages(filename + ".items"); data_.open_with_hugepages(filename + ".data"); - open_materialized_map_with_hugepages(filename); - } - - void touch(const std::string& filename) { - materialized_map_.touch(filename + ".materialized"); - items_.touch(filename + ".items"); - data_.touch(filename + ".data"); + load_meta(filename + ".meta"); } void dump(const std::string& filename) { @@ -520,18 +559,17 @@ class mmap_array { !data_.is_sync_to_file() && plan.total_size < data_.size(); if (should_stream) { stream_compact_and_dump(plan, filename + ".data", filename + ".items", - filename + ".materialized"); + filename + ".meta"); return; } - compact(); - materialized_map_.dump(filename + ".materialized"); + compact(plan); items_.dump(filename + ".items"); data_.dump(filename + ".data"); + dump_meta(filename + ".meta"); } void resize(size_t size, size_t data_size) { - materialized_map_.resize(size); items_.resize(size); data_.resize(data_size); } @@ -543,7 +581,7 @@ class mmap_array { size_t total_length = 0; size_t non_zero_count = 0; for (size_t i = 0; i < items_.size(); ++i) { - if (is_materialized(i) && items_.get(i).length > 0) { + if (items_.get(i).length > 0) { ++non_zero_count; total_length += items_.get(i).length; } @@ -558,20 +596,9 @@ class mmap_array { static_cast(val.size())}); assert(data_.data() + offset + val.size() <= data_.data() + data_.size()); memcpy(data_.data() + offset, val.data(), val.size()); - materialized_map_.set(idx, 1); - } - - bool is_materialized(size_t idx) const { - if (idx >= materialized_map_.size()) { - return false; - } - return materialized_map_.get(idx) != 0; } std::string_view get(size_t idx) const { - if (!is_materialized(idx)) { - return std::string_view{"", 0}; - } const string_item& item = items_.get(idx); return std::string_view(data_.data() + item.offset, item.length); } @@ -581,13 +608,14 @@ class mmap_array { size_t data_size() const { return data_.size(); } void swap(mmap_array& rhs) { - materialized_map_.swap(rhs.materialized_map_); + std::swap(default_value_, rhs.default_value_); + std::swap(default_item_, rhs.default_item_); items_.swap(rhs.items_); data_.swap(rhs.data_); + std::swap(is_writable_, rhs.is_writable_); } void set_writable(bool is_writable) { - materialized_map_.set_writable(is_writable); items_.set_writable(is_writable); data_.set_writable(is_writable); is_writable_ = is_writable; @@ -597,7 +625,6 @@ class mmap_array { if (is_writable_) { return; } - materialized_map_.ensure_writable(work_dir); items_.ensure_writable(work_dir); data_.ensure_writable(work_dir); is_writable_ = true; @@ -608,8 +635,7 @@ class mmap_array { // Returns the compacted data size. Note that the reserved size of data buffer // is not changed, and new strings can still be appended after the compacted // data. - size_t compact() { - auto plan = prepare_compaction_plan(); + size_t compact(const CompactionPlan& plan) { if (items_.size() == 0) { return 0; } @@ -621,6 +647,18 @@ class mmap_array { std::vector temp_buf(plan.total_size); size_t write_offset = 0; size_t limit_offset = 0; + const auto old_default_item = default_item_; + const bool has_stored_default = has_default_item(); + if (has_stored_default) { + default_item_ = {static_cast(write_offset), + old_default_item.length}; + if (old_default_item.length > 0) { + const char* default_src = data_.data() + old_default_item.offset; + memcpy(temp_buf.data() + write_offset, default_src, + old_default_item.length); + } + write_offset += old_default_item.length; + } for (const auto& entry : plan.entries) { const char* src = data_.data() + entry.offset; char* dst = temp_buf.data() + write_offset; @@ -631,6 +669,15 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } + if (has_stored_default) { + for (size_t i = 0; i < items_.size(); ++i) { + const string_item& item = items_.get(i); + if (item.offset == old_default_item.offset && + item.length == old_default_item.length) { + items_.set(i, default_item_); + } + } + } assert(write_offset == plan.total_size); memcpy(data_.data(), temp_buf.data(), plan.total_size); @@ -640,26 +687,23 @@ class mmap_array { } private: - struct CompactionPlan { - struct Entry { - size_t index; - uint64_t offset; - uint32_t length; - }; - std::vector entries; - size_t total_size = 0; - }; - CompactionPlan prepare_compaction_plan() const { CompactionPlan plan; plan.entries.reserve(items_.size()); + const auto old_default_item = default_item_; + const bool has_stored_default = has_default_item(); + if (has_stored_default && old_default_item.length > 0) { + plan.total_size += old_default_item.length; + } for (size_t i = 0; i < items_.size(); ++i) { const string_item& item = items_.get(i); - if (is_materialized(i)) { - plan.total_size += item.length; - plan.entries.push_back( - {i, item.offset, static_cast(item.length)}); + if (has_stored_default && item.offset == old_default_item.offset && + item.length == old_default_item.length) { + continue; } + plan.total_size += item.length; + plan.entries.push_back( + {i, item.offset, static_cast(item.length)}); } return plan; } @@ -667,7 +711,7 @@ class mmap_array { void stream_compact_and_dump(const CompactionPlan& plan, const std::string& data_filename, const std::string& items_filename, - const std::string& materialized_filename) { + const std::string& meta_filename) { size_t size_before_compact = data_.size(); FILE* fout = fopen(data_filename.c_str(), "wb"); if (fout == NULL) { @@ -679,6 +723,24 @@ class mmap_array { } size_t write_offset = 0; + const auto old_default_item = default_item_; + const bool has_stored_default = has_default_item(); + if (has_stored_default) { + default_item_ = {static_cast(write_offset), + old_default_item.length}; + if (old_default_item.length > 0) { + const char* default_src = data_.data() + old_default_item.offset; + if (fwrite(default_src, 1, old_default_item.length, fout) != + old_default_item.length) { + std::stringstream ss; + ss << "Failed to fwrite file [ " << data_filename << " ], " + << strerror(errno); + LOG(ERROR) << ss.str(); + THROW_RUNTIME_ERROR(ss.str()); + } + } + write_offset += old_default_item.length; + } for (const auto& entry : plan.entries) { if (entry.length > 0) { const char* src = data_.data() + entry.offset; @@ -694,6 +756,15 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } + if (has_stored_default) { + for (size_t i = 0; i < items_.size(); ++i) { + const string_item& item = items_.get(i); + if (item.offset == old_default_item.offset && + item.length == old_default_item.length) { + items_.set(i, default_item_); + } + } + } assert(write_offset == plan.total_size); if (fflush(fout) != 0) { @@ -741,35 +812,81 @@ class mmap_array { VLOG(1) << "Compaction completed. New data size: " << plan.total_size << ", old data size: " << size_before_compact; - materialized_map_.dump(materialized_filename); + dump_meta(meta_filename); items_.dump(items_filename); } - void open_materialized_map(const std::string& filename, bool sync_to_file, - bool is_writable) { - const auto materialized_file = filename + ".materialized"; - materialized_map_.open(materialized_file, sync_to_file, is_writable); - validate_materialized_map_size(materialized_file); - } + void dump_meta(const std::string& meta_filename) const { + StringMetaHeader header{static_cast(default_item_.offset), + static_cast(default_item_.length), + static_cast(default_value_.size())}; + std::vector meta_buf(sizeof(header) + default_value_.size()); + memcpy(meta_buf.data(), &header, sizeof(header)); + if (!default_value_.empty()) { + memcpy(meta_buf.data() + sizeof(header), default_value_.data(), + default_value_.size()); + } + write_file(meta_filename, meta_buf.data(), meta_buf.size(), 1); - void open_materialized_map_with_hugepages(const std::string& filename) { - const auto materialized_file = filename + ".materialized"; - materialized_map_.open_with_hugepages(materialized_file); - validate_materialized_map_size(materialized_file); + std::filesystem::perms readPermission = std::filesystem::perms::owner_read; + std::error_code errorCode; + std::filesystem::permissions(meta_filename, readPermission, + std::filesystem::perm_options::add, errorCode); + if (errorCode) { + std::stringstream ss; + ss << "Failed to set read permission for file: " << meta_filename << " " + << errorCode.message() << std::endl; + LOG(ERROR) << ss.str(); + THROW_RUNTIME_ERROR(ss.str()); + } } - void validate_materialized_map_size(const std::string& materialized_file) { - const auto expected_size = items_.size(); - // Is this backward compatibility logic necessary? - if (materialized_map_.size() != expected_size) { - THROW_RUNTIME_ERROR( - "Materialized map size does not match items size for file: " + - materialized_file + ", try reloading the graph"); + void load_meta(const std::string& meta_filename) { + if (!std::filesystem::exists(meta_filename)) { + return; + } + default_value_.clear(); + default_item_ = {0, 0}; + const size_t meta_size = std::filesystem::file_size(meta_filename); + if (meta_size < sizeof(StringMetaHeader)) { + std::stringstream ss; + ss << "Invalid meta file [ " << meta_filename << " ], expected at least " + << sizeof(StringMetaHeader) << " bytes, got " << meta_size; + LOG(ERROR) << ss.str(); + THROW_RUNTIME_ERROR(ss.str()); + } + + std::vector meta_buf(meta_size); + read_file(meta_filename, meta_buf.data(), meta_size, 1); + + StringMetaHeader header{}; + memcpy(&header, meta_buf.data(), sizeof(header)); + default_item_ = {header.default_offset, header.default_length}; + const size_t expected_size = + sizeof(StringMetaHeader) + header.default_value_size; + if (meta_size != expected_size) { + std::stringstream ss; + ss << "Invalid meta file [ " << meta_filename << " ], expected " + << expected_size << " bytes, got " << meta_size; + LOG(ERROR) << ss.str(); + THROW_RUNTIME_ERROR(ss.str()); + } + default_value_.resize(header.default_value_size); + if (header.default_value_size > 0) { + memcpy(default_value_.data(), meta_buf.data() + sizeof(StringMetaHeader), + header.default_value_size); } } - mmap_array materialized_map_; + struct StringMetaHeader { + uint64_t default_offset; + uint32_t default_length; + uint32_t default_value_size; + }; + + std::string default_value_; mmap_array items_; + string_item default_item_; mmap_array data_; bool is_writable_ = true; }; diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 3ff2d200..4b542eed 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -265,24 +265,23 @@ class TypedColumn : public ColumnBase { StorageStrategy strategy_; }; -// No default value for StringColumn template <> class TypedColumn : public ColumnBase { public: TypedColumn(StorageStrategy strategy, uint16_t width, std::string_view default_value = "") - : size_(0), + : buffer_(truncate_utf8(default_value, width)), + size_(0), pos_(0), strategy_(strategy), width_(width), - default_value_(default_value), type_(DataTypeId::kVarchar) {} explicit TypedColumn(StorageStrategy strategy) - : size_(0), + : buffer_(""), + size_(0), pos_(0), strategy_(strategy), width_(STRING_DEFAULT_MAX_LENGTH), - default_value_(""), type_(DataTypeId::kVarchar) {} TypedColumn(TypedColumn&& rhs) { buffer_.swap(rhs.buffer_); @@ -290,7 +289,6 @@ class TypedColumn : public ColumnBase { pos_ = rhs.pos_.load(); strategy_ = rhs.strategy_; width_ = rhs.width_; - default_value_ = rhs.default_value_; type_ = rhs.type_; } @@ -337,14 +335,14 @@ class TypedColumn : public ColumnBase { void copy_to_tmp(const std::string& cur_path, const std::string& tmp_path) override { - mmap_array tmp; + mmap_array tmp(buffer_.default_value()); if (!std::filesystem::exists(cur_path + ".data")) { return; } copy_file(cur_path + ".data", tmp_path + ".data"); copy_file(cur_path + ".items", tmp_path + ".items"); - if (std::filesystem::exists(cur_path + ".materialized")) { - copy_file(cur_path + ".materialized", tmp_path + ".materialized"); + if (std::filesystem::exists(cur_path + ".meta")) { + copy_file(cur_path + ".meta", tmp_path + ".meta"); } copy_file(cur_path + ".pos", tmp_path + ".pos"); @@ -365,16 +363,28 @@ class TypedColumn : public ColumnBase { void resize(size_t size) override { std::unique_lock lock(rw_mutex_); + const size_t old_size = buffer_.size(); size_ = size; + size_t min_data_size = pos_.load(); + if (size_ > 0 && !buffer_.has_default_item()) { + min_data_size += buffer_.default_value().size(); + } if (buffer_.size() != 0) { size_t avg_width = buffer_.avg_size(); // calculate average width of existing strings buffer_.resize( size_, std::max(size_ * (avg_width > 0 ? avg_width : STRING_DEFAULT_MAX_LENGTH), - pos_.load())); + min_data_size)); } else { - buffer_.resize(size_, std::max(size_ * width_, pos_.load())); + buffer_.resize(size_, std::max(size_ * width_, min_data_size)); + } + if (size_ > 0) { + size_t next_pos = buffer_.ensure_default_item(pos_.load()); + pos_.store(std::max(pos_.load(), next_pos)); + } + if (size_ > old_size) { + buffer_.fill_default_items(old_size, size_); } } @@ -409,9 +419,6 @@ class TypedColumn : public ColumnBase { void set_value_safe(size_t idx, const std::string_view& value); inline std::string_view get_view(size_t idx) const { - if (!buffer_.is_materialized(idx)) { - return default_value_; - } return buffer_.get(idx); } @@ -439,8 +446,6 @@ class TypedColumn : public ColumnBase { buffer_.ensure_writable(work_dir); } - std::string_view default_value() const { return default_value_; } - private: inline void init_pos(const std::string& file_path) { if (std::filesystem::exists(file_path)) { @@ -451,13 +456,13 @@ class TypedColumn : public ColumnBase { pos_.store(0); } } + mmap_array buffer_; size_t size_; std::atomic pos_; StorageStrategy strategy_; std::shared_mutex rw_mutex_; uint16_t width_; - std::string_view default_value_; DataTypeId type_; }; @@ -514,16 +519,11 @@ class TypedRefColumn : public RefColumnBase { using value_type = std::string_view; explicit TypedRefColumn(const StringColumn& column) - : basic_buffer(column.buffer()), - basic_size(column.buffer_size()), - default_value_(column.default_value()) {} + : basic_buffer(column.buffer()), basic_size(column.buffer_size()) {} ~TypedRefColumn() {} inline std::string_view get_view(size_t index) const { assert(index < basic_size); - if (!basic_buffer.is_materialized(index)) { - return default_value_; - } return basic_buffer.get(index); } @@ -538,10 +538,6 @@ class TypedRefColumn : public RefColumnBase { private: const mmap_array& basic_buffer; size_t basic_size; - // NOTE: default_value_ is a non-owning view. The pointed-to string data is - // owned by the schema's default_property_strings and must outlive this - // object. - std::string_view default_value_; }; // Create a reference column from a ColumnBase that contains a const reference From 7dc36f16b47972257eaf821d57c51958c533032e Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Tue, 17 Mar 2026 16:01:58 +0800 Subject: [PATCH 11/24] fix --- include/neug/utils/mmap_array.h | 79 ++++++++++++++++++---------- include/neug/utils/property/column.h | 27 ---------- tools/python_bind/tests/test_ddl.py | 30 ++++++++++- 3 files changed, 79 insertions(+), 57 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 8577f7f2..92fc02a0 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -477,17 +477,6 @@ struct string_item { template <> class mmap_array { - private: - struct CompactionPlan { - struct Entry { - size_t index; - uint64_t offset; - uint32_t length; - }; - std::vector entries; - size_t total_size = 0; - }; - public: mmap_array() : mmap_array("") {} explicit mmap_array(const std::string_view& default_value) @@ -505,10 +494,9 @@ class mmap_array { std::string_view default_value() const { return default_value_; } - bool has_default_item() const { - return default_item_.offset != 0 || default_item_.length != 0 || - default_value_.empty(); - } + // The behavior of having no default value is same with having an empty string + // as default value. + bool has_default_item() const { return default_item_.length != 0; } size_t ensure_default_item(size_t offset) { if (has_default_item()) { @@ -527,6 +515,8 @@ class mmap_array { } void fill_default_items(size_t begin, size_t end) { + begin = std::min(begin, items_.size()); + end = std::min(end, items_.size()); for (size_t i = begin; i < end; ++i) { items_.set(i, default_item_); } @@ -563,7 +553,7 @@ class mmap_array { return; } - compact(plan); + compact(); items_.dump(filename + ".items"); data_.dump(filename + ".data"); dump_meta(filename + ".meta"); @@ -579,15 +569,17 @@ class mmap_array { return 0; } size_t total_length = 0; - size_t non_zero_count = 0; + size_t non_default_count = 0; for (size_t i = 0; i < items_.size(); ++i) { - if (items_.get(i).length > 0) { - ++non_zero_count; - total_length += items_.get(i).length; + const auto& item = items_.get(i); + if (item.length > 0 && !(item.offset == default_item_.offset && + item.length == default_item_.length)) { + ++non_default_count; + total_length += item.length; } } - return non_zero_count > 0 - ? (total_length + non_zero_count - 1) / non_zero_count + return non_default_count > 0 + ? (total_length + non_default_count - 1) / non_default_count : 0; } @@ -635,7 +627,8 @@ class mmap_array { // Returns the compacted data size. Note that the reserved size of data buffer // is not changed, and new strings can still be appended after the compacted // data. - size_t compact(const CompactionPlan& plan) { + size_t compact() { + auto plan = prepare_compaction_plan(); if (items_.size() == 0) { return 0; } @@ -687,6 +680,16 @@ class mmap_array { } private: + struct CompactionPlan { + struct Entry { + size_t index; + uint64_t offset; + uint32_t length; + }; + std::vector entries; + size_t total_size = 0; + }; + CompactionPlan prepare_compaction_plan() const { CompactionPlan plan; plan.entries.reserve(items_.size()); @@ -697,6 +700,11 @@ class mmap_array { } for (size_t i = 0; i < items_.size(); ++i) { const string_item& item = items_.get(i); + // Items pointing to old_default_item are default-initialized entries that + // share a single storage slot. They are safe to identify by {offset, + // length} because ensure_default_item() always writes the default at + // offset 0 before any explicit string is appended (pos_ >= + // default_value_.size() at all times after resize()). if (has_stored_default && item.offset == old_default_item.offset && item.length == old_default_item.length) { continue; @@ -732,6 +740,7 @@ class mmap_array { const char* default_src = data_.data() + old_default_item.offset; if (fwrite(default_src, 1, old_default_item.length, fout) != old_default_item.length) { + fclose(fout); std::stringstream ss; ss << "Failed to fwrite file [ " << data_filename << " ], " << strerror(errno); @@ -745,6 +754,7 @@ class mmap_array { if (entry.length > 0) { const char* src = data_.data() + entry.offset; if (fwrite(src, 1, entry.length, fout) != entry.length) { + fclose(fout); std::stringstream ss; ss << "Failed to fwrite file [ " << data_filename << " ], " << strerror(errno); @@ -768,6 +778,7 @@ class mmap_array { assert(write_offset == plan.total_size); if (fflush(fout) != 0) { + fclose(fout); std::stringstream ss; ss << "Failed to fflush file [ " << data_filename << " ], " << strerror(errno); @@ -776,6 +787,7 @@ class mmap_array { } int fd = fileno(fout); if (fd == -1) { + fclose(fout); std::stringstream ss; ss << "Failed to get file descriptor for [ " << data_filename << " ], " << strerror(errno); @@ -783,6 +795,7 @@ class mmap_array { THROW_RUNTIME_ERROR(ss.str()); } if (ftruncate(fd, static_cast(size_before_compact)) != 0) { + fclose(fout); std::stringstream ss; ss << "Failed to ftruncate file [ " << data_filename << " ], " << strerror(errno); @@ -845,8 +858,6 @@ class mmap_array { if (!std::filesystem::exists(meta_filename)) { return; } - default_value_.clear(); - default_item_ = {0, 0}; const size_t meta_size = std::filesystem::file_size(meta_filename); if (meta_size < sizeof(StringMetaHeader)) { std::stringstream ss; @@ -871,10 +882,22 @@ class mmap_array { LOG(ERROR) << ss.str(); THROW_RUNTIME_ERROR(ss.str()); } - default_value_.resize(header.default_value_size); if (header.default_value_size > 0) { - memcpy(default_value_.data(), meta_buf.data() + sizeof(StringMetaHeader), - header.default_value_size); + if (default_value_.size() > 0) { + if (header.default_value_size != default_value_.size() || + memcmp(default_value_.data(), + meta_buf.data() + sizeof(StringMetaHeader), + header.default_value_size) != 0) { + std::stringstream ss; + ss << "Default value in meta file [ " << meta_filename + << " ] does not match the one in memory"; + LOG(ERROR) << ss.str(); + THROW_RUNTIME_ERROR(ss.str()); + } + } else { + default_value_.assign(meta_buf.data() + sizeof(StringMetaHeader), + header.default_value_size); + } } } diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 4b542eed..ab5aebe9 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -513,33 +513,6 @@ class TypedRefColumn : public RefColumnBase { size_t basic_size; }; -template <> -class TypedRefColumn : public RefColumnBase { - public: - using value_type = std::string_view; - - explicit TypedRefColumn(const StringColumn& column) - : basic_buffer(column.buffer()), basic_size(column.buffer_size()) {} - ~TypedRefColumn() {} - - inline std::string_view get_view(size_t index) const { - assert(index < basic_size); - return basic_buffer.get(index); - } - - Property get(size_t index) const override { - return PropUtils::to_prop(get_view(index)); - } - - DataTypeId type() const override { return DataTypeId::kVarchar; } - - ColType col_type() const override { return ColType::kInternal; } - - private: - const mmap_array& basic_buffer; - size_t basic_size; -}; - // Create a reference column from a ColumnBase that contains a const reference // to the actual column storage, offering a column-based store interface for // vertex properties. diff --git a/tools/python_bind/tests/test_ddl.py b/tools/python_bind/tests/test_ddl.py index def1e183..9db55455 100644 --- a/tools/python_bind/tests/test_ddl.py +++ b/tools/python_bind/tests/test_ddl.py @@ -385,11 +385,37 @@ def test_get_varchar_default_value_2(): db = Database(db_dir, "w") conn = db.connect() conn.execute("CREATE NODE TABLE TestNode(id INT64 PRIMARY KEY);") + conn.execute("CREATE REL TABLE TestEdge(FROM TestNode TO TestNode);") conn.execute("CREATE (:TestNode {id: 1});") conn.execute("CREATE (:TestNode {id: 2});") - conn.execute("ALTER TABLE TestNode ADD name VARCHAR(20) DEFAULT 'default_name';") conn.execute("CREATE (:TestNode {id: 3});") + conn.execute( + "MATCH (a:TestNode {id: 1}), (b:TestNode {id: 2}) CREATE (a)-[:TestEdge]->(b);" + ) + conn.execute( + "MATCH (a:TestNode {id: 2}), (b:TestNode {id: 3}) CREATE (a)-[:TestEdge]->(b);" + ) + conn.execute("ALTER TABLE TestNode ADD name VARCHAR(20) DEFAULT 'default_name';") + conn.execute("CREATE (:TestNode {id: 4});") + conn.execute("CREATE (:TestNode {id: 5, name: 'custom_name'});") res = conn.execute("Match (n:TestNode) Return n.name;") - assert list(res) == [["default_name"], ["default_name"], ["default_name"]] + assert list(res) == [ + ["default_name"], + ["default_name"], + ["default_name"], + ["default_name"], + ["custom_name"], + ] + conn.execute("ALTER TABLE TestEdge ADD date INT64;") + conn.execute( + "MATCH (a:TestNode {id: 1})-[e:TestEdge]->(b:TestNode {id: 2}) SET e.date = 1234567890;" + ) + conn.execute( + "MATCH (a:TestNode {id: 1}), (b:TestNode { id: 3 }) CREATE (a)-[:TestEdge {date: 9876543210}]->(b);" + ) + res = conn.execute( + "MATCH (a:TestNode {id: 1})-[e:TestEdge]->(b:TestNode) RETURN e.date;" + ) + assert list(res) == [[1234567890], [9876543210]] conn.close() db.close() From f186997756f61548c2b9276de0b4bfaae2f167c0 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Tue, 17 Mar 2026 16:13:06 +0800 Subject: [PATCH 12/24] revert unneccessary changes --- src/storages/graph/property_graph.cc | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/src/storages/graph/property_graph.cc b/src/storages/graph/property_graph.cc index 04370f96..391a1cbe 100644 --- a/src/storages/graph/property_graph.cc +++ b/src/storages/graph/property_graph.cc @@ -374,24 +374,12 @@ Status PropertyGraph::AddVertexProperties( } add_default_property_values.emplace_back(default_value); } - label_t v_label = schema_.get_vertex_label_id(vertex_type_name); - size_t old_prop_num = - schema_.get_vertex_schema(v_label)->property_names.size(); schema_.AddVertexProperties(vertex_type_name, add_property_names, add_property_types, add_property_storages, add_default_property_values); - - // Use the default property values in schema for the new properties if not - // provided in the function arguments, to ensure the default values are - // consistent with the vertex schema. - std::vector schema_new_default_values; - const auto& v_prop_default_values = - schema_.get_vertex_schema(v_label)->default_property_values; - for (size_t i = old_prop_num; i < v_prop_default_values.size(); i++) { - schema_new_default_values.emplace_back(v_prop_default_values[i]); - } + label_t v_label = schema_.get_vertex_label_id(vertex_type_name); vertex_tables_[v_label].AddProperties(add_property_names, add_property_types, - schema_new_default_values); + add_default_property_values); return neug::Status::OK(); } @@ -431,8 +419,6 @@ Status PropertyGraph::AddEdgeProperties( label_t src_label = schema_.get_vertex_label_id(src_type_name); label_t dst_label = schema_.get_vertex_label_id(dst_type_name); label_t e_label = schema_.get_edge_label_id(edge_type_name); - size_t old_prop_num = schema_.get_edge_schema(src_label, dst_label, e_label) - ->property_names.size(); schema_.AddEdgeProperties(src_type_name, dst_type_name, edge_type_name, add_property_names, add_property_types, @@ -448,16 +434,9 @@ Status PropertyGraph::AddEdgeProperties( "] does not exist, cannot add properties."); } - std::vector schema_new_default_values; - const auto& e_prop_default_values = - schema_.get_edge_schema(src_label, dst_label, e_label) - ->default_property_values; - for (size_t i = old_prop_num; i < e_prop_default_values.size(); i++) { - schema_new_default_values.emplace_back(e_prop_default_values[i]); - } auto& edge_table = edge_tables_.at(index); edge_table.AddProperties(add_property_names, add_property_types, - schema_new_default_values); + add_default_property_values); return neug::Status::OK(); } From 23d7ff59a8ffbf5249551e42ab0819503b7cf0b7 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Tue, 17 Mar 2026 16:32:15 +0800 Subject: [PATCH 13/24] fix --- include/neug/utils/mmap_array.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 92fc02a0..4cb9d1f6 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -825,8 +825,8 @@ class mmap_array { VLOG(1) << "Compaction completed. New data size: " << plan.total_size << ", old data size: " << size_before_compact; - dump_meta(meta_filename); items_.dump(items_filename); + dump_meta(meta_filename); } void dump_meta(const std::string& meta_filename) const { @@ -872,7 +872,6 @@ class mmap_array { StringMetaHeader header{}; memcpy(&header, meta_buf.data(), sizeof(header)); - default_item_ = {header.default_offset, header.default_length}; const size_t expected_size = sizeof(StringMetaHeader) + header.default_value_size; if (meta_size != expected_size) { @@ -898,7 +897,14 @@ class mmap_array { default_value_.assign(meta_buf.data() + sizeof(StringMetaHeader), header.default_value_size); } + } else if (default_value_.size() > 0) { + std::stringstream ss; + ss << "Meta file [ " << meta_filename + << " ] does not contain default value, but memory has a default value"; + LOG(ERROR) << ss.str(); + THROW_RUNTIME_ERROR(ss.str()); } + default_item_ = {header.default_offset, header.default_length}; } struct StringMetaHeader { From 54dff3d65c38022783dfd8849bfa15f6b051cd6c Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Tue, 17 Mar 2026 16:54:06 +0800 Subject: [PATCH 14/24] Update include/neug/utils/mmap_array.h Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- include/neug/utils/mmap_array.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 4cb9d1f6..41c7c5c9 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -912,6 +912,9 @@ class mmap_array { uint32_t default_length; uint32_t default_value_size; }; + static_assert(sizeof(StringMetaHeader) == 16, + "StringMetaHeader layout has unexpected padding; on-disk " + "format would be broken."); std::string default_value_; mmap_array items_; From 47254f1439c3d2d6e40742e654c6154e3f36436a Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Tue, 17 Mar 2026 17:08:22 +0800 Subject: [PATCH 15/24] resolve greptile comments --- include/neug/utils/mmap_array.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 4cb9d1f6..256be8bd 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -480,7 +480,9 @@ class mmap_array { public: mmap_array() : mmap_array("") {} explicit mmap_array(const std::string_view& default_value) - : default_value_(default_value), default_item_{0, 0} {} + : default_value_(default_value), + default_item_{0, 0}, + has_default_item_(false) {} mmap_array(mmap_array&& rhs) : mmap_array() { swap(rhs); } ~mmap_array() {} @@ -490,13 +492,12 @@ class mmap_array { items_.reset(); data_.reset(); is_writable_ = true; + has_default_item_ = false; } std::string_view default_value() const { return default_value_; } - // The behavior of having no default value is same with having an empty string - // as default value. - bool has_default_item() const { return default_item_.length != 0; } + bool has_default_item() const { return has_default_item_; } size_t ensure_default_item(size_t offset) { if (has_default_item()) { @@ -511,10 +512,12 @@ class mmap_array { } default_item_ = {static_cast(offset), static_cast(default_value_.size())}; + has_default_item_ = true; return offset + default_value_.size(); } void fill_default_items(size_t begin, size_t end) { + assert(has_default_item_); begin = std::min(begin, items_.size()); end = std::min(end, items_.size()); for (size_t i = begin; i < end; ++i) { @@ -557,6 +560,7 @@ class mmap_array { items_.dump(filename + ".items"); data_.dump(filename + ".data"); dump_meta(filename + ".meta"); + reset(); } void resize(size_t size, size_t data_size) { @@ -605,6 +609,7 @@ class mmap_array { items_.swap(rhs.items_); data_.swap(rhs.data_); std::swap(is_writable_, rhs.is_writable_); + std::swap(has_default_item_, rhs.has_default_item_); } void set_writable(bool is_writable) { @@ -841,9 +846,10 @@ class mmap_array { } write_file(meta_filename, meta_buf.data(), meta_buf.size(), 1); - std::filesystem::perms readPermission = std::filesystem::perms::owner_read; + std::filesystem::perms rwPermissions = std::filesystem::perms::owner_read | + std::filesystem::perms::owner_write; std::error_code errorCode; - std::filesystem::permissions(meta_filename, readPermission, + std::filesystem::permissions(meta_filename, rwPermissions, std::filesystem::perm_options::add, errorCode); if (errorCode) { std::stringstream ss; @@ -905,6 +911,7 @@ class mmap_array { THROW_RUNTIME_ERROR(ss.str()); } default_item_ = {header.default_offset, header.default_length}; + has_default_item_ = true; } struct StringMetaHeader { @@ -918,6 +925,7 @@ class mmap_array { string_item default_item_; mmap_array data_; bool is_writable_ = true; + bool has_default_item_ = false; }; } // namespace neug From 46a6b20e62cb592e8b27fb291adbaf76ca658451 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 10:32:12 +0800 Subject: [PATCH 16/24] only keep default value in schema --- include/neug/utils/id_indexer.h | 10 +- include/neug/utils/mmap_array.h | 228 ++------------------------- include/neug/utils/property/column.h | 72 ++++++--- include/neug/utils/property/table.h | 12 +- src/storages/graph/edge_table.cc | 30 +--- src/storages/graph/vertex_table.cc | 5 +- src/utils/property/column.cc | 14 +- src/utils/property/table.cc | 53 +++---- tests/utils/test_table.cc | 29 +--- 9 files changed, 122 insertions(+), 331 deletions(-) diff --git a/include/neug/utils/id_indexer.h b/include/neug/utils/id_indexer.h index b31b37a1..a8acad2b 100644 --- a/include/neug/utils/id_indexer.h +++ b/include/neug/utils/id_indexer.h @@ -242,13 +242,11 @@ class LFIndexer { void init(const DataTypeId& type, std::shared_ptr extra_type_info = nullptr) { keys_ = nullptr; - auto default_value = get_default_value(type); switch (type) { -#define TYPE_DISPATCHER(enum_val, T) \ - case DataTypeId::enum_val: { \ - keys_ = std::make_shared>( \ - PropUtils::to_typed(default_value), StorageStrategy::kMem); \ - break; \ +#define TYPE_DISPATCHER(enum_val, T) \ + case DataTypeId::enum_val: { \ + keys_ = std::make_shared>(StorageStrategy::kMem); \ + break; \ } TYPE_DISPATCHER(kInt64, int64_t) TYPE_DISPATCHER(kInt32, int32_t) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 8421a51b..35f9a8d3 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -68,6 +68,9 @@ inline size_t hugepage_round_up(size_t size) { return ROUND_UP(size); } namespace neug { +template +class TypedColumn; + enum class MemoryStrategy { kSyncToFile, kMemoryOnly, @@ -478,51 +481,14 @@ struct string_item { template <> class mmap_array { public: - mmap_array() : mmap_array("") {} - explicit mmap_array(const std::string_view& default_value) - : default_value_(default_value), - default_item_{0, 0}, - has_default_item_(false) {} + friend class TypedColumn; + mmap_array() {} mmap_array(mmap_array&& rhs) : mmap_array() { swap(rhs); } ~mmap_array() {} void reset() { - default_value_.clear(); - default_item_ = {0, 0}; items_.reset(); data_.reset(); - is_writable_ = true; - has_default_item_ = false; - } - - std::string_view default_value() const { return default_value_; } - - bool has_default_item() const { return has_default_item_; } - - size_t ensure_default_item(size_t offset) { - if (has_default_item()) { - return static_cast(default_item_.offset + default_item_.length); - } - if (offset + default_value_.size() > data_.size()) { - THROW_RUNTIME_ERROR("Not enough space for varchar default value"); - } - if (!default_value_.empty()) { - memcpy(data_.data() + offset, default_value_.data(), - default_value_.size()); - } - default_item_ = {static_cast(offset), - static_cast(default_value_.size())}; - has_default_item_ = true; - return offset + default_value_.size(); - } - - void fill_default_items(size_t begin, size_t end) { - assert(has_default_item_); - begin = std::min(begin, items_.size()); - end = std::min(end, items_.size()); - for (size_t i = begin; i < end; ++i) { - items_.set(i, default_item_); - } } void set_hugepage_prefered(bool val) { @@ -535,14 +501,12 @@ class mmap_array { is_writable_ = is_writable; items_.open(filename + ".items", sync_to_file, is_writable); data_.open(filename + ".data", sync_to_file, is_writable); - load_meta(filename + ".meta"); } void open_with_hugepages(const std::string& filename) { is_writable_ = true; items_.open_with_hugepages(filename + ".items"); data_.open_with_hugepages(filename + ".data"); - load_meta(filename + ".meta"); } void dump(const std::string& filename) { @@ -551,15 +515,13 @@ class mmap_array { bool should_stream = !data_.is_sync_to_file() && plan.total_size < data_.size(); if (should_stream) { - stream_compact_and_dump(plan, filename + ".data", filename + ".items", - filename + ".meta"); + stream_compact_and_dump(plan, filename + ".data", filename + ".items"); return; } compact(); items_.dump(filename + ".items"); data_.dump(filename + ".data"); - dump_meta(filename + ".meta"); reset(); } @@ -573,17 +535,15 @@ class mmap_array { return 0; } size_t total_length = 0; - size_t non_default_count = 0; + size_t non_zero_count = 0; for (size_t i = 0; i < items_.size(); ++i) { - const auto& item = items_.get(i); - if (item.length > 0 && !(item.offset == default_item_.offset && - item.length == default_item_.length)) { - ++non_default_count; - total_length += item.length; + if (items_.get(i).length > 0) { + ++non_zero_count; + total_length += items_.get(i).length; } } - return non_default_count > 0 - ? (total_length + non_default_count - 1) / non_default_count + return non_zero_count > 0 + ? (total_length + non_zero_count - 1) / non_zero_count : 0; } @@ -604,12 +564,9 @@ class mmap_array { size_t data_size() const { return data_.size(); } void swap(mmap_array& rhs) { - std::swap(default_value_, rhs.default_value_); - std::swap(default_item_, rhs.default_item_); items_.swap(rhs.items_); data_.swap(rhs.data_); std::swap(is_writable_, rhs.is_writable_); - std::swap(has_default_item_, rhs.has_default_item_); } void set_writable(bool is_writable) { @@ -645,18 +602,6 @@ class mmap_array { std::vector temp_buf(plan.total_size); size_t write_offset = 0; size_t limit_offset = 0; - const auto old_default_item = default_item_; - const bool has_stored_default = has_default_item(); - if (has_stored_default) { - default_item_ = {static_cast(write_offset), - old_default_item.length}; - if (old_default_item.length > 0) { - const char* default_src = data_.data() + old_default_item.offset; - memcpy(temp_buf.data() + write_offset, default_src, - old_default_item.length); - } - write_offset += old_default_item.length; - } for (const auto& entry : plan.entries) { const char* src = data_.data() + entry.offset; char* dst = temp_buf.data() + write_offset; @@ -667,15 +612,6 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } - if (has_stored_default) { - for (size_t i = 0; i < items_.size(); ++i) { - const string_item& item = items_.get(i); - if (item.offset == old_default_item.offset && - item.length == old_default_item.length) { - items_.set(i, default_item_); - } - } - } assert(write_offset == plan.total_size); memcpy(data_.data(), temp_buf.data(), plan.total_size); @@ -698,22 +634,8 @@ class mmap_array { CompactionPlan prepare_compaction_plan() const { CompactionPlan plan; plan.entries.reserve(items_.size()); - const auto old_default_item = default_item_; - const bool has_stored_default = has_default_item(); - if (has_stored_default && old_default_item.length > 0) { - plan.total_size += old_default_item.length; - } for (size_t i = 0; i < items_.size(); ++i) { const string_item& item = items_.get(i); - // Items pointing to old_default_item are default-initialized entries that - // share a single storage slot. They are safe to identify by {offset, - // length} because ensure_default_item() always writes the default at - // offset 0 before any explicit string is appended (pos_ >= - // default_value_.size() at all times after resize()). - if (has_stored_default && item.offset == old_default_item.offset && - item.length == old_default_item.length) { - continue; - } plan.total_size += item.length; plan.entries.push_back( {i, item.offset, static_cast(item.length)}); @@ -723,8 +645,7 @@ class mmap_array { void stream_compact_and_dump(const CompactionPlan& plan, const std::string& data_filename, - const std::string& items_filename, - const std::string& meta_filename) { + const std::string& items_filename) { size_t size_before_compact = data_.size(); FILE* fout = fopen(data_filename.c_str(), "wb"); if (fout == NULL) { @@ -736,25 +657,6 @@ class mmap_array { } size_t write_offset = 0; - const auto old_default_item = default_item_; - const bool has_stored_default = has_default_item(); - if (has_stored_default) { - default_item_ = {static_cast(write_offset), - old_default_item.length}; - if (old_default_item.length > 0) { - const char* default_src = data_.data() + old_default_item.offset; - if (fwrite(default_src, 1, old_default_item.length, fout) != - old_default_item.length) { - fclose(fout); - std::stringstream ss; - ss << "Failed to fwrite file [ " << data_filename << " ], " - << strerror(errno); - LOG(ERROR) << ss.str(); - THROW_RUNTIME_ERROR(ss.str()); - } - } - write_offset += old_default_item.length; - } for (const auto& entry : plan.entries) { if (entry.length > 0) { const char* src = data_.data() + entry.offset; @@ -771,15 +673,6 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } - if (has_stored_default) { - for (size_t i = 0; i < items_.size(); ++i) { - const string_item& item = items_.get(i); - if (item.offset == old_default_item.offset && - item.length == old_default_item.length) { - items_.set(i, default_item_); - } - } - } assert(write_offset == plan.total_size); if (fflush(fout) != 0) { @@ -831,104 +724,17 @@ class mmap_array { VLOG(1) << "Compaction completed. New data size: " << plan.total_size << ", old data size: " << size_before_compact; items_.dump(items_filename); - dump_meta(meta_filename); - } - - void dump_meta(const std::string& meta_filename) const { - StringMetaHeader header{static_cast(default_item_.offset), - static_cast(default_item_.length), - static_cast(default_value_.size())}; - std::vector meta_buf(sizeof(header) + default_value_.size()); - memcpy(meta_buf.data(), &header, sizeof(header)); - if (!default_value_.empty()) { - memcpy(meta_buf.data() + sizeof(header), default_value_.data(), - default_value_.size()); - } - write_file(meta_filename, meta_buf.data(), meta_buf.size(), 1); - - std::filesystem::perms rwPermissions = std::filesystem::perms::owner_read | - std::filesystem::perms::owner_write; - std::error_code errorCode; - std::filesystem::permissions(meta_filename, rwPermissions, - std::filesystem::perm_options::add, errorCode); - if (errorCode) { - std::stringstream ss; - ss << "Failed to set read permission for file: " << meta_filename << " " - << errorCode.message() << std::endl; - LOG(ERROR) << ss.str(); - THROW_RUNTIME_ERROR(ss.str()); - } } - void load_meta(const std::string& meta_filename) { - if (!std::filesystem::exists(meta_filename)) { - return; - } - const size_t meta_size = std::filesystem::file_size(meta_filename); - if (meta_size < sizeof(StringMetaHeader)) { - std::stringstream ss; - ss << "Invalid meta file [ " << meta_filename << " ], expected at least " - << sizeof(StringMetaHeader) << " bytes, got " << meta_size; - LOG(ERROR) << ss.str(); - THROW_RUNTIME_ERROR(ss.str()); - } - - std::vector meta_buf(meta_size); - read_file(meta_filename, meta_buf.data(), meta_size, 1); - - StringMetaHeader header{}; - memcpy(&header, meta_buf.data(), sizeof(header)); - const size_t expected_size = - sizeof(StringMetaHeader) + header.default_value_size; - if (meta_size != expected_size) { - std::stringstream ss; - ss << "Invalid meta file [ " << meta_filename << " ], expected " - << expected_size << " bytes, got " << meta_size; - LOG(ERROR) << ss.str(); - THROW_RUNTIME_ERROR(ss.str()); - } - if (header.default_value_size > 0) { - if (default_value_.size() > 0) { - if (header.default_value_size != default_value_.size() || - memcmp(default_value_.data(), - meta_buf.data() + sizeof(StringMetaHeader), - header.default_value_size) != 0) { - std::stringstream ss; - ss << "Default value in meta file [ " << meta_filename - << " ] does not match the one in memory"; - LOG(ERROR) << ss.str(); - THROW_RUNTIME_ERROR(ss.str()); - } - } else { - default_value_.assign(meta_buf.data() + sizeof(StringMetaHeader), - header.default_value_size); - } - } else if (default_value_.size() > 0) { - std::stringstream ss; - ss << "Meta file [ " << meta_filename - << " ] does not contain default value, but memory has a default value"; - LOG(ERROR) << ss.str(); - THROW_RUNTIME_ERROR(ss.str()); - } - default_item_ = {header.default_offset, header.default_length}; - has_default_item_ = true; + // Should only be used internally when we are sure the idx is valid + string_item get_string_item(size_t idx) const { return items_.get(idx); } + void set_string_item(size_t idx, const string_item& item) { + items_.set(idx, item); } - struct StringMetaHeader { - uint64_t default_offset; - uint32_t default_length; - uint32_t default_value_size; - }; - static_assert(sizeof(StringMetaHeader) == 16, - "StringMetaHeader layout has unexpected padding; on-disk " - "format would be broken."); - - std::string default_value_; mmap_array items_; - string_item default_item_; mmap_array data_; bool is_writable_ = true; - bool has_default_item_ = false; }; } // namespace neug diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index ab5aebe9..5f7d17c6 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -62,6 +62,7 @@ class ColumnBase { virtual void copy_to_tmp(const std::string& cur_path, const std::string& tmp_path) = 0; virtual void resize(size_t size) = 0; + virtual void resize(size_t size, const Property& default_value) = 0; virtual DataTypeId type() const = 0; @@ -88,8 +89,8 @@ class ColumnBase { template class TypedColumn : public ColumnBase { public: - explicit TypedColumn(const T& default_value, StorageStrategy strategy) - : default_value_(default_value), size_(0), strategy_(strategy) {} + explicit TypedColumn(StorageStrategy strategy) + : size_(0), strategy_(strategy) {} ~TypedColumn() { close(); } void open(const std::string& name, const std::string& snapshot_dir, @@ -154,11 +155,20 @@ class TypedColumn : public ColumnBase { size_t size() const override { return size_; } void resize(size_t size) override { + size_ = size; + buffer_.resize(size_); + } + + // Assume it is safe to insert the default value even if it is reserving, + // since user could always override + void resize(size_t size, const Property& default_value) override { + assert(default_value.type() == type()); size_t old_size = size_; size_ = size; buffer_.resize(size_); + auto default_typed_value = PropUtils::to_typed(default_value); for (size_t i = old_size; i < size_; ++i) { - buffer_.set(i, default_value_); + set_value(i, default_typed_value); } } @@ -206,7 +216,6 @@ class TypedColumn : public ColumnBase { } private: - T default_value_; mmap_array buffer_; size_t size_; StorageStrategy strategy_; @@ -241,6 +250,7 @@ class TypedColumn : public ColumnBase { void close() override {} size_t size() const override { return 0; } void resize(size_t size) override {} + void resize(size_t size, const Property& default_value) override {} DataTypeId type() const override { return DataTypeId::kEmpty; } @@ -268,17 +278,14 @@ class TypedColumn : public ColumnBase { template <> class TypedColumn : public ColumnBase { public: - TypedColumn(StorageStrategy strategy, uint16_t width, - std::string_view default_value = "") - : buffer_(truncate_utf8(default_value, width)), - size_(0), + TypedColumn(StorageStrategy strategy, uint16_t width) + : size_(0), pos_(0), strategy_(strategy), width_(width), type_(DataTypeId::kVarchar) {} explicit TypedColumn(StorageStrategy strategy) - : buffer_(""), - size_(0), + : size_(0), pos_(0), strategy_(strategy), width_(STRING_DEFAULT_MAX_LENGTH), @@ -335,7 +342,7 @@ class TypedColumn : public ColumnBase { void copy_to_tmp(const std::string& cur_path, const std::string& tmp_path) override { - mmap_array tmp(buffer_.default_value()); + mmap_array tmp; if (!std::filesystem::exists(cur_path + ".data")) { return; } @@ -363,28 +370,46 @@ class TypedColumn : public ColumnBase { void resize(size_t size) override { std::unique_lock lock(rw_mutex_); - const size_t old_size = buffer_.size(); size_ = size; - size_t min_data_size = pos_.load(); - if (size_ > 0 && !buffer_.has_default_item()) { - min_data_size += buffer_.default_value().size(); + if (buffer_.size() != 0) { + size_t avg_width = + buffer_.avg_size(); // calculate average width of existing strings + buffer_.resize( + size_, std::max(size_ * (avg_width > 0 ? avg_width + : STRING_DEFAULT_MAX_LENGTH), + pos_.load())); + } else { + buffer_.resize(size_, std::max(size_ * width_, pos_.load())); } + } + + void resize(size_t size, const Property& default_value) override { + assert(default_value.type() == type()); + std::unique_lock lock(rw_mutex_); + size_t old_size = size_; + size_ = size; + auto default_str = PropUtils::to_typed(default_value); if (buffer_.size() != 0) { size_t avg_width = buffer_.avg_size(); // calculate average width of existing strings buffer_.resize( size_, std::max(size_ * (avg_width > 0 ? avg_width : STRING_DEFAULT_MAX_LENGTH), - min_data_size)); + pos_.load())); } else { - buffer_.resize(size_, std::max(size_ * width_, min_data_size)); + buffer_.resize(size_, std::max(size_ * width_, pos_.load())); } - if (size_ > 0) { - size_t next_pos = buffer_.ensure_default_item(pos_.load()); - pos_.store(std::max(pos_.load(), next_pos)); + if (default_str.size() <= 0) { + return; } - if (size_ > old_size) { - buffer_.fill_default_items(old_size, size_); + default_str = truncate_utf8(default_str, width_); + + if (old_size < size_) { + set_value(old_size, default_str); + auto string_item = buffer_.get_string_item(old_size); + for (size_t i = old_size + 1; i < size_; ++i) { + buffer_.set_string_item(i, string_item); + } } } @@ -469,8 +494,7 @@ class TypedColumn : public ColumnBase { using StringColumn = TypedColumn; std::shared_ptr CreateColumn( - DataType type, Property default_value, - StorageStrategy strategy = StorageStrategy::kMem); + DataType type, StorageStrategy strategy = StorageStrategy::kMem); /// Create RefColumn for ease of usage for hqps class RefColumnBase { diff --git a/include/neug/utils/property/table.h b/include/neug/utils/property/table.h index ed5fa119..c2c4af76 100644 --- a/include/neug/utils/property/table.h +++ b/include/neug/utils/property/table.h @@ -35,25 +35,21 @@ class Table { void init(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& types, - const std::vector& default_property_values, const std::vector& strategies_); void open(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_); void open_in_memory(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_); void open_with_hugepages(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_, bool force = false); @@ -108,6 +104,12 @@ class Table { bool insert_safe = false); void resize(size_t row_num); + /** + * @brief Resize the table to row_num, and fill the new rows with default + * values. Assume it is safe to insert the default value even if it is + * reserving, since user could always override. + */ + void resize(size_t row_num, const std::vector& default_values); inline Property at(size_t row_id, size_t col_id) const { return column_ptrs_[col_id]->get_prop(row_id); @@ -130,12 +132,10 @@ class Table { void buildColumnPtrs(); void initColumns(const std::vector& col_name, const std::vector& types, - const std::vector& default_property_values, const std::vector& strategies_); std::unordered_map col_id_map_; std::vector col_names_; - std::vector col_default_values_; std::vector> columns_; std::vector column_ptrs_; diff --git a/src/storages/graph/edge_table.cc b/src/storages/graph/edge_table.cc index 2c70764c..b68acd4c 100644 --- a/src/storages/graph/edge_table.cc +++ b/src/storages/graph/edge_table.cc @@ -514,7 +514,7 @@ void EdgeTable::Open(const std::string& work_dir) { table_->open(edata_prefix(meta_->src_label_name, meta_->dst_label_name, meta_->edge_label_name), work_dir, meta_->property_names, meta_->properties, - meta_->default_property_values, meta_->strategies); + meta_->strategies); assert(table_->col_num() > 0); size_t table_cap = table_->get_column_by_id(0)->size(); load_statistic_file(work_dir, meta_->src_label_name, meta_->dst_label_name, @@ -543,8 +543,7 @@ void EdgeTable::OpenInMemory(const std::string& work_dir) { table_->open_in_memory( edata_prefix(meta_->src_label_name, meta_->dst_label_name, meta_->edge_label_name), - work_dir_, meta_->property_names, meta_->properties, - meta_->default_property_values, meta_->strategies); + work_dir_, meta_->property_names, meta_->properties, meta_->strategies); assert(table_->col_num() > 0); size_t table_cap = table_->get_column_by_id(0)->size(); load_statistic_file(work_dir, meta_->src_label_name, meta_->dst_label_name, @@ -574,7 +573,7 @@ void EdgeTable::OpenWithHugepages(const std::string& work_dir) { edata_prefix(meta_->src_label_name, meta_->dst_label_name, meta_->edge_label_name), checkpoint_dir_path, meta_->property_names, meta_->properties, - meta_->default_property_values, meta_->strategies, (memory_level_ > 2)); + meta_->strategies, (memory_level_ > 2)); assert(table_->col_num() > 0); size_t table_cap = table_->get_column_by_id(0)->size(); load_statistic_file(work_dir, meta_->src_label_name, meta_->dst_label_name, @@ -711,7 +710,7 @@ void EdgeTable::EnsureCapacity(size_t capacity) { return; } capacity = std::max(capacity, 4096UL); - table_->resize(capacity); + table_->resize(capacity, meta_->default_property_values); capacity_.store(capacity); } } @@ -1041,8 +1040,7 @@ void EdgeTable::dropAndCreateNewUnbundledCSR(bool delete_property) { LOG(INFO) << "rebuild unbundled edge csr with edge properties: " << meta_->property_names.size(); table_->open_in_memory(next_table_prefix, work_dir_, meta_->property_names, - meta_->properties, meta_->default_property_values, - meta_->strategies); + meta_->properties, meta_->strategies); } std::shared_ptr prev_data_col = nullptr; @@ -1060,26 +1058,10 @@ void EdgeTable::dropAndCreateNewUnbundledCSR(bool delete_property) { auto edges = out_csr_->batch_export(prev_data_col); if (prev_data_col && prev_data_col->size() > 0) { - table_->resize(prev_data_col->size()); + table_->resize(prev_data_col->size(), meta_->default_property_values); table_idx_.store(prev_data_col->size()); EnsureCapacity(prev_data_col->size()); } - // Set default value for other columns - for (size_t col_id = 1; col_id < table_->col_num(); ++col_id) { - auto col = table_->get_column_by_id(col_id); - if (col->type() == DataTypeId::kVarchar) { - VLOG(10) << "Skip set default value for column " << col_id - << " of type StringView"; - continue; - } - auto default_value = meta_->default_property_values[col_id]; - VLOG(10) << "Set default value for column " << col_id << ": " - << default_value.to_string() - << ", type: " << std::to_string(default_value.type()); - for (size_t row = 0; row < col->size(); ++row) { - col->set_any(row, default_value); - } - } std::vector row_ids; for (size_t i = 0; i < std::get<0>(edges).size(); ++i) { row_ids.push_back(i); diff --git a/src/storages/graph/vertex_table.cc b/src/storages/graph/vertex_table.cc index dcc10d08..18de1fc5 100644 --- a/src/storages/graph/vertex_table.cc +++ b/src/storages/graph/vertex_table.cc @@ -34,7 +34,6 @@ void VertexTable::Open(const std::string& work_dir, int memory_level) { indexer_.open(indexer_filename, checkpoint_dir_path, work_dir_); table_->open(vertex_table_prefix(label_name), work_dir_, vertex_schema_->property_names, vertex_schema_->property_types, - vertex_schema_->default_property_values, vertex_schema_->storage_strategies); } else if (memory_level_ == 1) { @@ -42,7 +41,6 @@ void VertexTable::Open(const std::string& work_dir, int memory_level) { table_->open_in_memory(vertex_table_prefix(label_name), work_dir_, vertex_schema_->property_names, vertex_schema_->property_types, - vertex_schema_->default_property_values, vertex_schema_->storage_strategies); } else if (memory_level_ >= 2) { @@ -51,7 +49,6 @@ void VertexTable::Open(const std::string& work_dir, int memory_level) { table_->open_with_hugepages( vertex_table_prefix(label_name), work_dir_, vertex_schema_->property_names, vertex_schema_->property_types, - vertex_schema_->default_property_values, vertex_schema_->storage_strategies, (memory_level_ > 2)); } else { THROW_INTERNAL_EXCEPTION("Invalid memory level: " + @@ -188,7 +185,7 @@ size_t VertexTable::EnsureCapacity(size_t capacity) { indexer_.reserve(capacity); } if (table_ && table_->size() < capacity) { - table_->resize(capacity); + table_->resize(capacity, vertex_schema_->default_property_values); } v_ts_.Reserve(capacity); return indexer_.capacity(); diff --git a/src/utils/property/column.cc b/src/utils/property/column.cc index 01b3401c..2bbc7826 100644 --- a/src/utils/property/column.cc +++ b/src/utils/property/column.cc @@ -68,6 +68,7 @@ class TypedEmptyColumn : public ColumnBase { void close() override {} size_t size() const override { return 0; } void resize(size_t size) override {} + void resize(size_t size, const Property& default_value) override {} DataTypeId type() const override { return PropUtils::prop_type(); } @@ -108,6 +109,7 @@ class TypedEmptyColumn : public ColumnBase { void close() override {} size_t size() const override { return 0; } void resize(size_t size) override {} + void resize(size_t size, const Property& default_value) override {} DataTypeId type() const override { return DataTypeId::kVarchar; } @@ -132,7 +134,7 @@ class TypedEmptyColumn : public ColumnBase { void ensure_writable(const std::string& work_dir) override {} }; -std::shared_ptr CreateColumn(DataType type, Property default_value, +std::shared_ptr CreateColumn(DataType type, StorageStrategy strategy) { auto type_id = type.id(); auto extra_type_info = type.RawExtraTypeInfo(); @@ -151,10 +153,9 @@ std::shared_ptr CreateColumn(DataType type, Property default_value, } } else { switch (type_id) { -#define TYPE_DISPATCHER(enum_val, type) \ - case DataTypeId::enum_val: \ - return std::make_shared>( \ - PropUtils::to_typed(default_value), strategy); +#define TYPE_DISPATCHER(enum_val, type) \ + case DataTypeId::enum_val: \ + return std::make_shared>(strategy); FOR_EACH_DATA_TYPE_NO_STRING(TYPE_DISPATCHER) #undef TYPE_DISPATCHER case DataTypeId::kVarchar: { @@ -165,8 +166,7 @@ std::shared_ptr CreateColumn(DataType type, Property default_value, max_length = str_info->max_length; } } - return std::make_shared(strategy, max_length, - default_value.as_string_view()); + return std::make_shared(strategy, max_length); } case DataTypeId::kEmpty: { return std::make_shared>(strategy); diff --git a/src/utils/property/table.cc b/src/utils/property/table.cc index c3299b2b..623b3cb0 100644 --- a/src/utils/property/table.cc +++ b/src/utils/property/table.cc @@ -33,12 +33,10 @@ Table::~Table() { close(); } void Table::initColumns(const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { size_t col_num = col_name.size(); columns_.clear(); col_names_.clear(); - col_default_values_.clear(); col_id_map_.clear(); columns_.resize(col_num, nullptr); auto strategies = strategies_; @@ -49,11 +47,7 @@ void Table::initColumns(const std::vector& col_name, col_id_map_.insert({col_name[i], col_id}); col_names_.emplace_back(col_name[i]); assert(i < property_types.size()); - col_default_values_.emplace_back( - i < default_property_values.size() - ? default_property_values[i] - : get_default_value(property_types[i].id())); - columns_[col_id] = CreateColumn(property_types[i], col_default_values_[i]); + columns_[col_id] = CreateColumn(property_types[i]); } columns_.resize(col_id_map_.size()); } @@ -61,11 +55,10 @@ void Table::initColumns(const std::vector& col_name, void Table::init(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { name_ = name; work_dir_ = work_dir; - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open(name + ".col_" + std::to_string(i), "", work_dir); } @@ -76,12 +69,11 @@ void Table::init(const std::string& name, const std::string& work_dir, void Table::open(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { name_ = name; work_dir_ = work_dir; snapshot_dir_ = checkpoint_dir(work_dir_); - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open(name + ".col_" + std::to_string(i), snapshot_dir_, tmp_dir(work_dir)); @@ -93,12 +85,11 @@ void Table::open(const std::string& name, const std::string& work_dir, void Table::open_in_memory(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { name_ = name; work_dir_ = work_dir; snapshot_dir_ = checkpoint_dir(work_dir_); - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open_in_memory(snapshot_dir_ + "/" + name + ".col_" + std::to_string(i)); @@ -107,16 +98,16 @@ void Table::open_in_memory(const std::string& name, const std::string& work_dir, buildColumnPtrs(); } -void Table::open_with_hugepages( - const std::string& name, const std::string& work_dir, - const std::vector& col_name, - const std::vector& property_types, - const std::vector& default_property_values, - const std::vector& strategies_, bool force) { +void Table::open_with_hugepages(const std::string& name, + const std::string& work_dir, + const std::vector& col_name, + const std::vector& property_types, + const std::vector& strategies_, + bool force) { name_ = name; work_dir_ = work_dir; snapshot_dir_ = checkpoint_dir(work_dir); - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open_with_hugepages( snapshot_dir_ + "/" + name + ".col_" + std::to_string(i), force); @@ -159,7 +150,7 @@ void Table::reset_header(const std::vector& col_name) { void Table::add_columns(const std::vector& col_names, const std::vector& col_types, const std::vector& default_property_values, - size_t column_size, + size_t capacity, const std::vector& strategies_, int memory_level) { // When add_columns are called, the table is already initialized and col_files @@ -175,10 +166,9 @@ void Table::add_columns(const std::vector& col_names, int col_id = col_names_.size(); col_id_map_.insert({col_names[i], col_id}); col_names_.emplace_back(col_names[i]); - col_default_values_.emplace_back(default_property_values[i]); - columns_[col_id] = CreateColumn( - col_types[i], default_property_values[i], - i < strategies_.size() ? strategies_[i] : StorageStrategy::kMem); + columns_[col_id] = CreateColumn(col_types[i], i < strategies_.size() + ? strategies_[i] + : StorageStrategy::kMem); } for (size_t i = old_size; i < columns_.size(); ++i) { if (memory_level == 0) { @@ -190,7 +180,7 @@ void Table::add_columns(const std::vector& col_names, } else { THROW_NOT_IMPLEMENTED_EXCEPTION("Unsupported memory level"); } - columns_[i]->resize(column_size); + columns_[i]->resize(capacity, default_property_values[i - old_size]); } buildColumnPtrs(); } @@ -217,7 +207,6 @@ void Table::delete_column(const std::string& col_name) { columns_[col_id].reset(); columns_.erase(columns_.begin() + col_id); col_names_.erase(col_names_.begin() + col_id); - col_default_values_.erase(col_default_values_.begin() + col_id); for (size_t i = col_id; i < column_ptrs_.size() - 1; i++) { column_ptrs_[i] = column_ptrs_[i + 1]; } @@ -329,6 +318,16 @@ void Table::resize(size_t row_num) { } } +void Table::resize(size_t row_num, + const std::vector& default_values) { + assert(default_values.size() == columns_.size()); + CHECK_EQ(default_values.size(), columns_.size()); + for (size_t i = 0; i < columns_.size(); ++i) { + columns_[i]->ensure_writable(work_dir_); + columns_[i]->resize(row_num, default_values[i]); + } +} + void Table::ingest(uint32_t index, OutArchive& arc) { if (column_ptrs_.size() == 0) { return; diff --git a/tests/utils/test_table.cc b/tests/utils/test_table.cc index d896c160..b787f32e 100644 --- a/tests/utils/test_table.cc +++ b/tests/utils/test_table.cc @@ -80,19 +80,6 @@ TEST(TableTest, TestTableBasic) { {DataTypeId::kTimestampMs}, {DataTypeId::kInterval}, {DataTypeId::kVarchar}}; - std::vector default_values = { - Property::from_bool(false), - Property::from_int32(0), - Property::from_uint32(0), - Property::from_int64(0), - Property::from_uint64(0), - Property::from_float(0.0), - Property::from_double(0.0), - Property::from_date(Date(0)), - Property::from_datetime(DateTime(0)), - Property::from_interval(Interval(std::string(""))), - Property::from_string_view("")}; - std::vector disk_strategies(col_name.size(), StorageStrategy::kDisk); std::vector mem_strategies(col_name.size(), @@ -101,11 +88,11 @@ TEST(TableTest, TestTableBasic) { StorageStrategy::kNone); disk_table.init("test_dist", TEST_DIR, col_name, property_types, - default_values, disk_strategies); + disk_strategies); mem_table.init("test_dist", TEST_DIR, col_name, property_types, - default_values, mem_strategies); + mem_strategies); none_table.init("test_dist", TEST_DIR, col_name, property_types, - default_values, none_strategies); + none_strategies); disk_table.resize(10); mem_table.resize(10); @@ -316,7 +303,7 @@ TEST(TableTest, TestTableBasic) { mem_table.drop(); disk_table.open("disk_table", std::string(TEST_DIR), col_name, property_types, - default_values, disk_strategies); + disk_strategies); EXPECT_EQ(disk_table.col_num(), 11); EXPECT_EQ(disk_table.get_column_by_id(0)->size(), 10); disk_table.reset_header(col_name); @@ -330,7 +317,7 @@ TEST(TableTest, TestTableBasic) { disk_table.drop(); mem_table.open_in_memory("disk_table", std::string(TEST_DIR), col_name, - property_types, default_values, mem_strategies); + property_types, mem_strategies); EXPECT_EQ(mem_table.col_num(), 11); EXPECT_EQ(mem_table.get_column_by_id(0)->size(), 10); const Table& mem_table_ref = mem_table; @@ -350,14 +337,12 @@ TEST(TableTest, StringColumnDistinguishesUnsetFromEmptyString) { Table table; std::vector col_name = {"string_column"}; std::vector property_types = {{DataTypeId::kVarchar}}; - std::vector default_values = { - Property::from_string_view("default_value")}; std::vector mem_strategies(col_name.size(), StorageStrategy::kMem); table.init("test_string_validity", TEST_DIR, col_name, property_types, - default_values, mem_strategies); - table.resize(2); + mem_strategies); + table.resize(2, {Property::from_string_view("default_value")}); auto string_column = std::dynamic_pointer_cast( table.get_column("string_column")); From bc63913c86a5e5c211c2cb9cb6ef14a521018197 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 10:35:59 +0800 Subject: [PATCH 17/24] remove dummy code --- include/neug/utils/property/column.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 5f7d17c6..03fe1369 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -348,9 +348,6 @@ class TypedColumn : public ColumnBase { } copy_file(cur_path + ".data", tmp_path + ".data"); copy_file(cur_path + ".items", tmp_path + ".items"); - if (std::filesystem::exists(cur_path + ".meta")) { - copy_file(cur_path + ".meta", tmp_path + ".meta"); - } copy_file(cur_path + ".pos", tmp_path + ".pos"); buffer_.reset(); From 8cdb1840417b76378094f926248aab910f3d6478 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 11:03:41 +0800 Subject: [PATCH 18/24] minor fix --- include/neug/utils/mmap_array.h | 44 +++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 35f9a8d3..e8d21f12 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -584,6 +585,7 @@ class mmap_array { is_writable_ = true; } + private: // Compact the data buffer by removing unused space and updating offsets // This is an in-place operation that shifts valid string data forward // Returns the compacted data size. Note that the reserved size of data buffer @@ -598,16 +600,38 @@ class mmap_array { if (plan.total_size == size_before_compact) { return size_before_compact; } + // First round to check whether we need compact. + size_t reused_space = 0; + std::unordered_map seen_offsets; + for (const auto& entry : plan.entries) { + if (entry.length > 0) { + if (seen_offsets.find(entry.offset) != seen_offsets.end()) { + reused_space += entry.length; + continue; + } + seen_offsets.insert({entry.offset, entry.length}); + } + } + if (plan.total_size == reused_space + size_before_compact) { + return size_before_compact; + } std::vector temp_buf(plan.total_size); size_t write_offset = 0; size_t limit_offset = 0; + seen_offsets.clear(); for (const auto& entry : plan.entries) { - const char* src = data_.data() + entry.offset; - char* dst = temp_buf.data() + write_offset; - limit_offset = std::max(limit_offset, - static_cast(entry.offset + entry.length)); - memcpy(dst, src, entry.length); + if (entry.length > 0) { + if (seen_offsets.find(entry.offset) != seen_offsets.end()) { + continue; + } + seen_offsets.insert({entry.offset, entry.length}); + const char* src = data_.data() + entry.offset; + char* dst = temp_buf.data() + write_offset; + limit_offset = std::max( + limit_offset, static_cast(entry.offset + entry.length)); + memcpy(dst, src, entry.length); + } items_.set(entry.index, {static_cast(write_offset), static_cast(entry.length)}); write_offset += entry.length; @@ -620,7 +644,6 @@ class mmap_array { return plan.total_size; } - private: struct CompactionPlan { struct Entry { size_t index; @@ -657,8 +680,15 @@ class mmap_array { } size_t write_offset = 0; + std::unordered_map seen_offsets; + size_t reused_space = 0; for (const auto& entry : plan.entries) { if (entry.length > 0) { + if (seen_offsets.find(entry.offset) != seen_offsets.end()) { + reused_space += entry.length; + continue; + } + seen_offsets.insert({entry.offset, entry.length}); const char* src = data_.data() + entry.offset; if (fwrite(src, 1, entry.length, fout) != entry.length) { fclose(fout); @@ -673,7 +703,7 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } - assert(write_offset == plan.total_size); + assert(write_offset + reused_space == plan.total_size); if (fflush(fout) != 0) { fclose(fout); From 82951735c4d6a1fa6baf7418c3f2cff8b2634fe8 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 11:33:17 +0800 Subject: [PATCH 19/24] migrate to new offset --- include/neug/utils/mmap_array.h | 23 ++++++++++++++--------- tests/utils/test_table.cc | 15 +++++++++++++++ 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index e8d21f12..33094805 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -602,14 +603,14 @@ class mmap_array { } // First round to check whether we need compact. size_t reused_space = 0; - std::unordered_map seen_offsets; + std::unordered_set seen_offsets; for (const auto& entry : plan.entries) { if (entry.length > 0) { if (seen_offsets.find(entry.offset) != seen_offsets.end()) { reused_space += entry.length; continue; } - seen_offsets.insert({entry.offset, entry.length}); + seen_offsets.insert(entry.offset); } } if (plan.total_size == reused_space + size_before_compact) { @@ -619,13 +620,15 @@ class mmap_array { std::vector temp_buf(plan.total_size); size_t write_offset = 0; size_t limit_offset = 0; - seen_offsets.clear(); + std::unordered_map old_offset_to_new; for (const auto& entry : plan.entries) { if (entry.length > 0) { - if (seen_offsets.find(entry.offset) != seen_offsets.end()) { + auto it = old_offset_to_new.find(entry.offset); + if (it != old_offset_to_new.end()) { + items_.set(entry.index, {it->second, entry.length}); continue; } - seen_offsets.insert({entry.offset, entry.length}); + old_offset_to_new.insert({entry.offset, write_offset}); const char* src = data_.data() + entry.offset; char* dst = temp_buf.data() + write_offset; limit_offset = std::max( @@ -636,7 +639,7 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } - assert(write_offset == plan.total_size); + assert(write_offset + reused_space == plan.total_size); memcpy(data_.data(), temp_buf.data(), plan.total_size); VLOG(1) << "Compaction completed. New data size: " << plan.total_size @@ -680,15 +683,17 @@ class mmap_array { } size_t write_offset = 0; - std::unordered_map seen_offsets; + std::unordered_map old_offset_to_new; size_t reused_space = 0; for (const auto& entry : plan.entries) { if (entry.length > 0) { - if (seen_offsets.find(entry.offset) != seen_offsets.end()) { + auto it = old_offset_to_new.find(entry.offset); + if (it != old_offset_to_new.end()) { + items_.set(entry.index, {it->second, entry.length}); reused_space += entry.length; continue; } - seen_offsets.insert({entry.offset, entry.length}); + old_offset_to_new.insert({entry.offset, write_offset}); const char* src = data_.data() + entry.offset; if (fwrite(src, 1, entry.length, fout) != entry.length) { fclose(fout); diff --git a/tests/utils/test_table.cc b/tests/utils/test_table.cc index b787f32e..e9cdf972 100644 --- a/tests/utils/test_table.cc +++ b/tests/utils/test_table.cc @@ -353,6 +353,21 @@ TEST(TableTest, StringColumnDistinguishesUnsetFromEmptyString) { string_column->set_prop(1, Property::from_string_view("")); EXPECT_TRUE(string_column->get_prop(1).as_string_view().empty()); EXPECT_EQ(string_column->get_prop(1).type(), DataTypeId::kVarchar); + string_column->set_prop( + 1, Property::from_string_view("new value new value new value")); + EXPECT_EQ(string_column->get_prop(1).as_string_view(), + "new value new value new value"); + std::string path = std::string(TEST_DIR) + "/checkpoint/string_column.data"; + if (std::filesystem::exists(path)) { + std::filesystem::remove(path); + } + string_column->dump(path); + + StringColumn new_string_column(StorageStrategy::kMem); + new_string_column.open_in_memory(path); + EXPECT_EQ(new_string_column.get_prop(0).as_string_view(), "default_value"); + EXPECT_EQ(new_string_column.get_prop(1).as_string_view(), + "new value new value new value"); } } // namespace test From e8a5cf5ac42bca94ae6424575041554ab6ece7b7 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 12:06:45 +0800 Subject: [PATCH 20/24] fix --- include/neug/utils/mmap_array.h | 44 ++++++++++++---------------- include/neug/utils/property/column.h | 10 +++---- src/utils/property/table.cc | 1 - 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 33094805..24ddee75 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -514,8 +514,9 @@ class mmap_array { void dump(const std::string& filename) { // Compact before dumping to reclaim unused space auto plan = prepare_compaction_plan(); + size_t effective_size = plan.total_size - plan.reused_size; bool should_stream = - !data_.is_sync_to_file() && plan.total_size < data_.size(); + !data_.is_sync_to_file() && effective_size < data_.size(); if (should_stream) { stream_compact_and_dump(plan, filename + ".data", filename + ".items"); return; @@ -598,26 +599,12 @@ class mmap_array { return 0; } size_t size_before_compact = data_.size(); - if (plan.total_size == size_before_compact) { - return size_before_compact; - } - // First round to check whether we need compact. - size_t reused_space = 0; - std::unordered_set seen_offsets; - for (const auto& entry : plan.entries) { - if (entry.length > 0) { - if (seen_offsets.find(entry.offset) != seen_offsets.end()) { - reused_space += entry.length; - continue; - } - seen_offsets.insert(entry.offset); - } - } - if (plan.total_size == reused_space + size_before_compact) { + if (plan.total_size == plan.reused_size + size_before_compact) { return size_before_compact; } + size_t effective_size = plan.total_size - plan.reused_size; - std::vector temp_buf(plan.total_size); + std::vector temp_buf(effective_size); size_t write_offset = 0; size_t limit_offset = 0; std::unordered_map old_offset_to_new; @@ -639,12 +626,12 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } - assert(write_offset + reused_space == plan.total_size); - memcpy(data_.data(), temp_buf.data(), plan.total_size); + assert(write_offset + plan.reused_size == plan.total_size); + memcpy(data_.data(), temp_buf.data(), effective_size); - VLOG(1) << "Compaction completed. New data size: " << plan.total_size + VLOG(1) << "Compaction completed. New data size: " << effective_size << ", old data size: " << limit_offset; - return plan.total_size; + return effective_size; } struct CompactionPlan { @@ -655,16 +642,25 @@ class mmap_array { }; std::vector entries; size_t total_size = 0; + size_t reused_size = 0; }; CompactionPlan prepare_compaction_plan() const { CompactionPlan plan; plan.entries.reserve(items_.size()); + std::unordered_set seen_offsets; for (size_t i = 0; i < items_.size(); ++i) { const string_item& item = items_.get(i); plan.total_size += item.length; plan.entries.push_back( {i, item.offset, static_cast(item.length)}); + if (item.length > 0) { + if (seen_offsets.find(item.offset) != seen_offsets.end()) { + plan.reused_size += item.length; + } else { + seen_offsets.insert(item.offset); + } + } } return plan; } @@ -684,13 +680,11 @@ class mmap_array { size_t write_offset = 0; std::unordered_map old_offset_to_new; - size_t reused_space = 0; for (const auto& entry : plan.entries) { if (entry.length > 0) { auto it = old_offset_to_new.find(entry.offset); if (it != old_offset_to_new.end()) { items_.set(entry.index, {it->second, entry.length}); - reused_space += entry.length; continue; } old_offset_to_new.insert({entry.offset, write_offset}); @@ -708,7 +702,7 @@ class mmap_array { static_cast(entry.length)}); write_offset += entry.length; } - assert(write_offset + reused_space == plan.total_size); + assert(write_offset + plan.reused_size == plan.total_size); if (fflush(fout) != 0) { fclose(fout); diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 03fe1369..b5ac6fac 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -372,9 +372,8 @@ class TypedColumn : public ColumnBase { size_t avg_width = buffer_.avg_size(); // calculate average width of existing strings buffer_.resize( - size_, std::max(size_ * (avg_width > 0 ? avg_width - : STRING_DEFAULT_MAX_LENGTH), - pos_.load())); + size_, + std::max(size_ * (avg_width > 0 ? avg_width : width_), pos_.load())); } else { buffer_.resize(size_, std::max(size_ * width_, pos_.load())); } @@ -390,9 +389,8 @@ class TypedColumn : public ColumnBase { size_t avg_width = buffer_.avg_size(); // calculate average width of existing strings buffer_.resize( - size_, std::max(size_ * (avg_width > 0 ? avg_width - : STRING_DEFAULT_MAX_LENGTH), - pos_.load())); + size_, + std::max(size_ * (avg_width > 0 ? avg_width : width_), pos_.load())); } else { buffer_.resize(size_, std::max(size_ * width_, pos_.load())); } diff --git a/src/utils/property/table.cc b/src/utils/property/table.cc index 623b3cb0..02a00c3c 100644 --- a/src/utils/property/table.cc +++ b/src/utils/property/table.cc @@ -321,7 +321,6 @@ void Table::resize(size_t row_num) { void Table::resize(size_t row_num, const std::vector& default_values) { assert(default_values.size() == columns_.size()); - CHECK_EQ(default_values.size(), columns_.size()); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->ensure_writable(work_dir_); columns_[i]->resize(row_num, default_values[i]); From c34ccf88800c1f06ed256a1de58949c7cad68c2d Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 12:19:29 +0800 Subject: [PATCH 21/24] minor fix --- src/utils/property/table.cc | 6 +++++- tools/python_bind/tests/test_ddl.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/utils/property/table.cc b/src/utils/property/table.cc index 02a00c3c..2eeedcc1 100644 --- a/src/utils/property/table.cc +++ b/src/utils/property/table.cc @@ -320,7 +320,11 @@ void Table::resize(size_t row_num) { void Table::resize(size_t row_num, const std::vector& default_values) { - assert(default_values.size() == columns_.size()); + if (default_values.size() != columns_.size()) { + THROW_RUNTIME_ERROR("default_values size mismatch: expected " + + std::to_string(columns_.size()) + " but got " + + std::to_string(default_values.size())); + } for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->ensure_writable(work_dir_); columns_[i]->resize(row_num, default_values[i]); diff --git a/tools/python_bind/tests/test_ddl.py b/tools/python_bind/tests/test_ddl.py index 9db55455..8b9b63ea 100644 --- a/tools/python_bind/tests/test_ddl.py +++ b/tools/python_bind/tests/test_ddl.py @@ -398,13 +398,13 @@ def test_get_varchar_default_value_2(): conn.execute("ALTER TABLE TestNode ADD name VARCHAR(20) DEFAULT 'default_name';") conn.execute("CREATE (:TestNode {id: 4});") conn.execute("CREATE (:TestNode {id: 5, name: 'custom_name'});") - res = conn.execute("Match (n:TestNode) Return n.name;") + res = conn.execute("Match (n:TestNode) Return n.name ORDER BY n.name;") assert list(res) == [ + ["custom_name"], ["default_name"], ["default_name"], ["default_name"], ["default_name"], - ["custom_name"], ] conn.execute("ALTER TABLE TestEdge ADD date INT64;") conn.execute( From c9202f9a4955196ba0dc9adbbc9d821ee8674953 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 12:29:39 +0800 Subject: [PATCH 22/24] report error with throw --- include/neug/utils/mmap_array.h | 7 ++++++- include/neug/utils/property/column.h | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 24ddee75..026d9603 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -630,7 +630,8 @@ class mmap_array { memcpy(data_.data(), temp_buf.data(), effective_size); VLOG(1) << "Compaction completed. New data size: " << effective_size - << ", old data size: " << limit_offset; + << ", old data size: " << limit_offset + << ", effective size: " << effective_size; return effective_size; } @@ -758,6 +759,10 @@ class mmap_array { // Should only be used internally when we are sure the idx is valid string_item get_string_item(size_t idx) const { return items_.get(idx); } void set_string_item(size_t idx, const string_item& item) { + if (!is_writable_) { + THROW_RUNTIME_ERROR( + "Attempt to set_string_item on a read-only mmap_array"); + } items_.set(idx, item); } diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index b5ac6fac..3340ecc1 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -162,7 +162,9 @@ class TypedColumn : public ColumnBase { // Assume it is safe to insert the default value even if it is reserving, // since user could always override void resize(size_t size, const Property& default_value) override { - assert(default_value.type() == type()); + if (default_value.type() != type()) { + THROW_RUNTIME_ERROR("Default value type does not match column type"); + } size_t old_size = size_; size_ = size; buffer_.resize(size_); @@ -380,7 +382,9 @@ class TypedColumn : public ColumnBase { } void resize(size_t size, const Property& default_value) override { - assert(default_value.type() == type()); + if (default_value.type() != type()) { + THROW_RUNTIME_ERROR("Default value type does not match column type"); + } std::unique_lock lock(rw_mutex_); size_t old_size = size_; size_ = size; From 357c2935a153f65ad98388c50924e5c4b25c3add Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 13:09:40 +0800 Subject: [PATCH 23/24] minor fix --- include/neug/utils/mmap_array.h | 3 +-- include/neug/utils/property/column.h | 2 +- src/utils/property/table.cc | 5 +++++ tests/utils/test_table.cc | 5 +---- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index 026d9603..b65912ab 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -630,8 +630,7 @@ class mmap_array { memcpy(data_.data(), temp_buf.data(), effective_size); VLOG(1) << "Compaction completed. New data size: " << effective_size - << ", old data size: " << limit_offset - << ", effective size: " << effective_size; + << ", old data size: " << limit_offset; return effective_size; } diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 3340ecc1..05e288da 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -398,7 +398,7 @@ class TypedColumn : public ColumnBase { } else { buffer_.resize(size_, std::max(size_ * width_, pos_.load())); } - if (default_str.size() <= 0) { + if (default_str.size() == 0) { return; } default_str = truncate_utf8(default_str, width_); diff --git a/src/utils/property/table.cc b/src/utils/property/table.cc index 2eeedcc1..53f9ab39 100644 --- a/src/utils/property/table.cc +++ b/src/utils/property/table.cc @@ -153,6 +153,11 @@ void Table::add_columns(const std::vector& col_names, size_t capacity, const std::vector& strategies_, int memory_level) { + if (default_property_values.size() != col_names.size()) { + THROW_RUNTIME_ERROR("default_property_values size mismatch: expected " + + std::to_string(col_names.size()) + " but got " + + std::to_string(default_property_values.size())); + } // When add_columns are called, the table is already initialized and col_files // are opened. std::stringstream ss; diff --git a/tests/utils/test_table.cc b/tests/utils/test_table.cc index e9cdf972..4e0e74ee 100644 --- a/tests/utils/test_table.cc +++ b/tests/utils/test_table.cc @@ -357,10 +357,7 @@ TEST(TableTest, StringColumnDistinguishesUnsetFromEmptyString) { 1, Property::from_string_view("new value new value new value")); EXPECT_EQ(string_column->get_prop(1).as_string_view(), "new value new value new value"); - std::string path = std::string(TEST_DIR) + "/checkpoint/string_column.data"; - if (std::filesystem::exists(path)) { - std::filesystem::remove(path); - } + std::string path = std::string(TEST_DIR) + "/string_column"; string_column->dump(path); StringColumn new_string_column(StorageStrategy::kMem); From a25949c479403eefe1b4ca384fb003f89c8e8e83 Mon Sep 17 00:00:00 2001 From: "xiaolei.zl" Date: Wed, 18 Mar 2026 13:33:28 +0800 Subject: [PATCH 24/24] fix --- include/neug/utils/mmap_array.h | 1 - include/neug/utils/property/column.h | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index b65912ab..965ec30b 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 05e288da..9642b0d9 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -389,19 +389,19 @@ class TypedColumn : public ColumnBase { size_t old_size = size_; size_ = size; auto default_str = PropUtils::to_typed(default_value); + default_str = truncate_utf8(default_str, width_); if (buffer_.size() != 0) { size_t avg_width = buffer_.avg_size(); // calculate average width of existing strings - buffer_.resize( - size_, - std::max(size_ * (avg_width > 0 ? avg_width : width_), pos_.load())); + buffer_.resize(size_, + std::max(size_ * (avg_width > 0 ? avg_width : width_), + pos_.load() + width_)); } else { buffer_.resize(size_, std::max(size_ * width_, pos_.load())); } if (default_str.size() == 0) { return; } - default_str = truncate_utf8(default_str, width_); if (old_size < size_) { set_value(old_size, default_str);