diff --git a/include/neug/utils/id_indexer.h b/include/neug/utils/id_indexer.h index b31b37a1..a8acad2b 100644 --- a/include/neug/utils/id_indexer.h +++ b/include/neug/utils/id_indexer.h @@ -242,13 +242,11 @@ class LFIndexer { void init(const DataTypeId& type, std::shared_ptr extra_type_info = nullptr) { keys_ = nullptr; - auto default_value = get_default_value(type); switch (type) { -#define TYPE_DISPATCHER(enum_val, T) \ - case DataTypeId::enum_val: { \ - keys_ = std::make_shared>( \ - PropUtils::to_typed(default_value), StorageStrategy::kMem); \ - break; \ +#define TYPE_DISPATCHER(enum_val, T) \ + case DataTypeId::enum_val: { \ + keys_ = std::make_shared>(StorageStrategy::kMem); \ + break; \ } TYPE_DISPATCHER(kInt64, int64_t) TYPE_DISPATCHER(kInt32, int32_t) diff --git a/include/neug/utils/mmap_array.h b/include/neug/utils/mmap_array.h index b64eaa14..965ec30b 100644 --- a/include/neug/utils/mmap_array.h +++ b/include/neug/utils/mmap_array.h @@ -23,12 +23,15 @@ #include #include #include +#include +#include #include #include #include "glog/logging.h" #include "neug/storages/file_names.h" #include "neug/utils/exception/exception.h" +#include "neug/utils/file_utils.h" #ifdef __ia64__ #define ADDR (void*) (0x8000000000000000UL) @@ -66,6 +69,9 @@ inline size_t hugepage_round_up(size_t size) { return ROUND_UP(size); } namespace neug { +template +class TypedColumn; + enum class MemoryStrategy { kSyncToFile, kMemoryOnly, @@ -476,6 +482,7 @@ struct string_item { template <> class mmap_array { public: + friend class TypedColumn; mmap_array() {} mmap_array(mmap_array&& rhs) : mmap_array() { swap(rhs); } ~mmap_array() {} @@ -498,20 +505,17 @@ class mmap_array { } void open_with_hugepages(const std::string& filename) { + is_writable_ = true; items_.open_with_hugepages(filename + ".items"); data_.open_with_hugepages(filename + ".data"); } - void touch(const std::string& filename) { - items_.touch(filename + ".items"); - data_.touch(filename + ".data"); - } - void dump(const std::string& filename) { // Compact before dumping to reclaim unused space auto plan = prepare_compaction_plan(); + size_t effective_size = plan.total_size - plan.reused_size; bool should_stream = - !data_.is_sync_to_file() && plan.total_size < data_.size(); + !data_.is_sync_to_file() && effective_size < data_.size(); if (should_stream) { stream_compact_and_dump(plan, filename + ".data", filename + ".items"); return; @@ -520,6 +524,7 @@ class mmap_array { compact(); items_.dump(filename + ".items"); data_.dump(filename + ".data"); + reset(); } void resize(size_t size, size_t data_size) { @@ -545,7 +550,8 @@ class mmap_array { } void set(size_t idx, size_t offset, const std::string_view& val) { - items_.set(idx, {offset, static_cast(val.size())}); + items_.set(idx, {static_cast(offset), + static_cast(val.size())}); assert(data_.data() + offset + val.size() <= data_.data() + data_.size()); memcpy(data_.data() + offset, val.data(), val.size()); } @@ -562,6 +568,7 @@ class mmap_array { void swap(mmap_array& rhs) { items_.swap(rhs.items_); data_.swap(rhs.data_); + std::swap(is_writable_, rhs.is_writable_); } void set_writable(bool is_writable) { @@ -579,6 +586,7 @@ class mmap_array { is_writable_ = true; } + private: // Compact the data buffer by removing unused space and updating offsets // This is an in-place operation that shifts valid string data forward // Returns the compacted data size. Note that the reserved size of data buffer @@ -590,32 +598,41 @@ class mmap_array { return 0; } size_t size_before_compact = data_.size(); - if (plan.total_size == size_before_compact) { + if (plan.total_size == plan.reused_size + size_before_compact) { return size_before_compact; } + size_t effective_size = plan.total_size - plan.reused_size; - std::vector temp_buf(plan.total_size); + std::vector temp_buf(effective_size); size_t write_offset = 0; size_t limit_offset = 0; + std::unordered_map old_offset_to_new; for (const auto& entry : plan.entries) { - const char* src = data_.data() + entry.offset; - char* dst = temp_buf.data() + write_offset; - limit_offset = std::max(limit_offset, - static_cast(entry.offset + entry.length)); - memcpy(dst, src, entry.length); - items_.set(entry.index, - {static_cast(write_offset), entry.length}); + if (entry.length > 0) { + auto it = old_offset_to_new.find(entry.offset); + if (it != old_offset_to_new.end()) { + items_.set(entry.index, {it->second, entry.length}); + continue; + } + old_offset_to_new.insert({entry.offset, write_offset}); + const char* src = data_.data() + entry.offset; + char* dst = temp_buf.data() + write_offset; + limit_offset = std::max( + limit_offset, static_cast(entry.offset + entry.length)); + memcpy(dst, src, entry.length); + } + items_.set(entry.index, {static_cast(write_offset), + static_cast(entry.length)}); write_offset += entry.length; } - assert(write_offset == plan.total_size); - memcpy(data_.data(), temp_buf.data(), plan.total_size); + assert(write_offset + plan.reused_size == plan.total_size); + memcpy(data_.data(), temp_buf.data(), effective_size); - VLOG(1) << "Compaction completed. New data size: " << plan.total_size + VLOG(1) << "Compaction completed. New data size: " << effective_size << ", old data size: " << limit_offset; - return plan.total_size; + return effective_size; } - private: struct CompactionPlan { struct Entry { size_t index; @@ -624,15 +641,25 @@ class mmap_array { }; std::vector entries; size_t total_size = 0; + size_t reused_size = 0; }; CompactionPlan prepare_compaction_plan() const { CompactionPlan plan; plan.entries.reserve(items_.size()); + std::unordered_set seen_offsets; for (size_t i = 0; i < items_.size(); ++i) { const string_item& item = items_.get(i); plan.total_size += item.length; - plan.entries.push_back({i, item.offset, item.length}); + plan.entries.push_back( + {i, item.offset, static_cast(item.length)}); + if (item.length > 0) { + if (seen_offsets.find(item.offset) != seen_offsets.end()) { + plan.reused_size += item.length; + } else { + seen_offsets.insert(item.offset); + } + } } return plan; } @@ -651,10 +678,18 @@ class mmap_array { } size_t write_offset = 0; + std::unordered_map old_offset_to_new; for (const auto& entry : plan.entries) { if (entry.length > 0) { + auto it = old_offset_to_new.find(entry.offset); + if (it != old_offset_to_new.end()) { + items_.set(entry.index, {it->second, entry.length}); + continue; + } + old_offset_to_new.insert({entry.offset, write_offset}); const char* src = data_.data() + entry.offset; if (fwrite(src, 1, entry.length, fout) != entry.length) { + fclose(fout); std::stringstream ss; ss << "Failed to fwrite file [ " << data_filename << " ], " << strerror(errno); @@ -662,13 +697,14 @@ class mmap_array { THROW_RUNTIME_ERROR(ss.str()); } } - items_.set(entry.index, - {static_cast(write_offset), entry.length}); + items_.set(entry.index, {static_cast(write_offset), + static_cast(entry.length)}); write_offset += entry.length; } - assert(write_offset == plan.total_size); + assert(write_offset + plan.reused_size == plan.total_size); if (fflush(fout) != 0) { + fclose(fout); std::stringstream ss; ss << "Failed to fflush file [ " << data_filename << " ], " << strerror(errno); @@ -677,6 +713,7 @@ class mmap_array { } int fd = fileno(fout); if (fd == -1) { + fclose(fout); std::stringstream ss; ss << "Failed to get file descriptor for [ " << data_filename << " ], " << strerror(errno); @@ -684,6 +721,7 @@ class mmap_array { THROW_RUNTIME_ERROR(ss.str()); } if (ftruncate(fd, static_cast(size_before_compact)) != 0) { + fclose(fout); std::stringstream ss; ss << "Failed to ftruncate file [ " << data_filename << " ], " << strerror(errno); @@ -716,6 +754,16 @@ class mmap_array { items_.dump(items_filename); } + // Should only be used internally when we are sure the idx is valid + string_item get_string_item(size_t idx) const { return items_.get(idx); } + void set_string_item(size_t idx, const string_item& item) { + if (!is_writable_) { + THROW_RUNTIME_ERROR( + "Attempt to set_string_item on a read-only mmap_array"); + } + items_.set(idx, item); + } + mmap_array items_; mmap_array data_; bool is_writable_ = true; diff --git a/include/neug/utils/property/column.h b/include/neug/utils/property/column.h index 1566b4d5..9642b0d9 100644 --- a/include/neug/utils/property/column.h +++ b/include/neug/utils/property/column.h @@ -62,6 +62,7 @@ class ColumnBase { virtual void copy_to_tmp(const std::string& cur_path, const std::string& tmp_path) = 0; virtual void resize(size_t size) = 0; + virtual void resize(size_t size, const Property& default_value) = 0; virtual DataTypeId type() const = 0; @@ -88,8 +89,8 @@ class ColumnBase { template class TypedColumn : public ColumnBase { public: - explicit TypedColumn(const T& default_value, StorageStrategy strategy) - : default_value_(default_value), size_(0), strategy_(strategy) {} + explicit TypedColumn(StorageStrategy strategy) + : size_(0), strategy_(strategy) {} ~TypedColumn() { close(); } void open(const std::string& name, const std::string& snapshot_dir, @@ -154,11 +155,22 @@ class TypedColumn : public ColumnBase { size_t size() const override { return size_; } void resize(size_t size) override { + size_ = size; + buffer_.resize(size_); + } + + // Assume it is safe to insert the default value even if it is reserving, + // since user could always override + void resize(size_t size, const Property& default_value) override { + if (default_value.type() != type()) { + THROW_RUNTIME_ERROR("Default value type does not match column type"); + } size_t old_size = size_; size_ = size; buffer_.resize(size_); + auto default_typed_value = PropUtils::to_typed(default_value); for (size_t i = old_size; i < size_; ++i) { - buffer_.set(i, default_value_); + set_value(i, default_typed_value); } } @@ -206,7 +218,6 @@ class TypedColumn : public ColumnBase { } private: - T default_value_; mmap_array buffer_; size_t size_; StorageStrategy strategy_; @@ -241,6 +252,7 @@ class TypedColumn : public ColumnBase { void close() override {} size_t size() const override { return 0; } void resize(size_t size) override {} + void resize(size_t size, const Property& default_value) override {} DataTypeId type() const override { return DataTypeId::kEmpty; } @@ -265,24 +277,20 @@ class TypedColumn : public ColumnBase { StorageStrategy strategy_; }; -// No default value for StringColumn template <> class TypedColumn : public ColumnBase { public: - TypedColumn(StorageStrategy strategy, uint16_t width, - std::string_view default_value = "") + TypedColumn(StorageStrategy strategy, uint16_t width) : size_(0), pos_(0), strategy_(strategy), width_(width), - default_value_(default_value), type_(DataTypeId::kVarchar) {} explicit TypedColumn(StorageStrategy strategy) : size_(0), pos_(0), strategy_(strategy), width_(STRING_DEFAULT_MAX_LENGTH), - default_value_(""), type_(DataTypeId::kVarchar) {} TypedColumn(TypedColumn&& rhs) { buffer_.swap(rhs.buffer_); @@ -290,7 +298,6 @@ class TypedColumn : public ColumnBase { pos_ = rhs.pos_.load(); strategy_ = rhs.strategy_; width_ = rhs.width_; - default_value_ = rhs.default_value_; type_ = rhs.type_; } @@ -367,12 +374,42 @@ class TypedColumn : public ColumnBase { size_t avg_width = buffer_.avg_size(); // calculate average width of existing strings buffer_.resize( - size_, std::max(size_ * (avg_width > 0 ? avg_width - : STRING_DEFAULT_MAX_LENGTH), - pos_.load())); + size_, + std::max(size_ * (avg_width > 0 ? avg_width : width_), pos_.load())); + } else { + buffer_.resize(size_, std::max(size_ * width_, pos_.load())); + } + } + + void resize(size_t size, const Property& default_value) override { + if (default_value.type() != type()) { + THROW_RUNTIME_ERROR("Default value type does not match column type"); + } + std::unique_lock lock(rw_mutex_); + size_t old_size = size_; + size_ = size; + auto default_str = PropUtils::to_typed(default_value); + default_str = truncate_utf8(default_str, width_); + if (buffer_.size() != 0) { + size_t avg_width = + buffer_.avg_size(); // calculate average width of existing strings + buffer_.resize(size_, + std::max(size_ * (avg_width > 0 ? avg_width : width_), + pos_.load() + width_)); } else { buffer_.resize(size_, std::max(size_ * width_, pos_.load())); } + if (default_str.size() == 0) { + return; + } + + if (old_size < size_) { + set_value(old_size, default_str); + auto string_item = buffer_.get_string_item(old_size); + for (size_t i = old_size + 1; i < size_; ++i) { + buffer_.set_string_item(i, string_item); + } + } } DataTypeId type() const override { return type_; } @@ -443,21 +480,20 @@ class TypedColumn : public ColumnBase { pos_.store(0); } } + mmap_array buffer_; size_t size_; std::atomic pos_; StorageStrategy strategy_; std::shared_mutex rw_mutex_; uint16_t width_; - std::string_view default_value_; DataTypeId type_; }; using StringColumn = TypedColumn; std::shared_ptr CreateColumn( - DataType type, Property default_value, - StorageStrategy strategy = StorageStrategy::kMem); + DataType type, StorageStrategy strategy = StorageStrategy::kMem); /// Create RefColumn for ease of usage for hqps class RefColumnBase { diff --git a/include/neug/utils/property/table.h b/include/neug/utils/property/table.h index ed5fa119..c2c4af76 100644 --- a/include/neug/utils/property/table.h +++ b/include/neug/utils/property/table.h @@ -35,25 +35,21 @@ class Table { void init(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& types, - const std::vector& default_property_values, const std::vector& strategies_); void open(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_); void open_in_memory(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_); void open_with_hugepages(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_, bool force = false); @@ -108,6 +104,12 @@ class Table { bool insert_safe = false); void resize(size_t row_num); + /** + * @brief Resize the table to row_num, and fill the new rows with default + * values. Assume it is safe to insert the default value even if it is + * reserving, since user could always override. + */ + void resize(size_t row_num, const std::vector& default_values); inline Property at(size_t row_id, size_t col_id) const { return column_ptrs_[col_id]->get_prop(row_id); @@ -130,12 +132,10 @@ class Table { void buildColumnPtrs(); void initColumns(const std::vector& col_name, const std::vector& types, - const std::vector& default_property_values, const std::vector& strategies_); std::unordered_map col_id_map_; std::vector col_names_; - std::vector col_default_values_; std::vector> columns_; std::vector column_ptrs_; diff --git a/src/storages/graph/edge_table.cc b/src/storages/graph/edge_table.cc index 2c70764c..b68acd4c 100644 --- a/src/storages/graph/edge_table.cc +++ b/src/storages/graph/edge_table.cc @@ -514,7 +514,7 @@ void EdgeTable::Open(const std::string& work_dir) { table_->open(edata_prefix(meta_->src_label_name, meta_->dst_label_name, meta_->edge_label_name), work_dir, meta_->property_names, meta_->properties, - meta_->default_property_values, meta_->strategies); + meta_->strategies); assert(table_->col_num() > 0); size_t table_cap = table_->get_column_by_id(0)->size(); load_statistic_file(work_dir, meta_->src_label_name, meta_->dst_label_name, @@ -543,8 +543,7 @@ void EdgeTable::OpenInMemory(const std::string& work_dir) { table_->open_in_memory( edata_prefix(meta_->src_label_name, meta_->dst_label_name, meta_->edge_label_name), - work_dir_, meta_->property_names, meta_->properties, - meta_->default_property_values, meta_->strategies); + work_dir_, meta_->property_names, meta_->properties, meta_->strategies); assert(table_->col_num() > 0); size_t table_cap = table_->get_column_by_id(0)->size(); load_statistic_file(work_dir, meta_->src_label_name, meta_->dst_label_name, @@ -574,7 +573,7 @@ void EdgeTable::OpenWithHugepages(const std::string& work_dir) { edata_prefix(meta_->src_label_name, meta_->dst_label_name, meta_->edge_label_name), checkpoint_dir_path, meta_->property_names, meta_->properties, - meta_->default_property_values, meta_->strategies, (memory_level_ > 2)); + meta_->strategies, (memory_level_ > 2)); assert(table_->col_num() > 0); size_t table_cap = table_->get_column_by_id(0)->size(); load_statistic_file(work_dir, meta_->src_label_name, meta_->dst_label_name, @@ -711,7 +710,7 @@ void EdgeTable::EnsureCapacity(size_t capacity) { return; } capacity = std::max(capacity, 4096UL); - table_->resize(capacity); + table_->resize(capacity, meta_->default_property_values); capacity_.store(capacity); } } @@ -1041,8 +1040,7 @@ void EdgeTable::dropAndCreateNewUnbundledCSR(bool delete_property) { LOG(INFO) << "rebuild unbundled edge csr with edge properties: " << meta_->property_names.size(); table_->open_in_memory(next_table_prefix, work_dir_, meta_->property_names, - meta_->properties, meta_->default_property_values, - meta_->strategies); + meta_->properties, meta_->strategies); } std::shared_ptr prev_data_col = nullptr; @@ -1060,26 +1058,10 @@ void EdgeTable::dropAndCreateNewUnbundledCSR(bool delete_property) { auto edges = out_csr_->batch_export(prev_data_col); if (prev_data_col && prev_data_col->size() > 0) { - table_->resize(prev_data_col->size()); + table_->resize(prev_data_col->size(), meta_->default_property_values); table_idx_.store(prev_data_col->size()); EnsureCapacity(prev_data_col->size()); } - // Set default value for other columns - for (size_t col_id = 1; col_id < table_->col_num(); ++col_id) { - auto col = table_->get_column_by_id(col_id); - if (col->type() == DataTypeId::kVarchar) { - VLOG(10) << "Skip set default value for column " << col_id - << " of type StringView"; - continue; - } - auto default_value = meta_->default_property_values[col_id]; - VLOG(10) << "Set default value for column " << col_id << ": " - << default_value.to_string() - << ", type: " << std::to_string(default_value.type()); - for (size_t row = 0; row < col->size(); ++row) { - col->set_any(row, default_value); - } - } std::vector row_ids; for (size_t i = 0; i < std::get<0>(edges).size(); ++i) { row_ids.push_back(i); diff --git a/src/storages/graph/property_graph.cc b/src/storages/graph/property_graph.cc index 242e56e0..391a1cbe 100644 --- a/src/storages/graph/property_graph.cc +++ b/src/storages/graph/property_graph.cc @@ -154,7 +154,6 @@ Status PropertyGraph::BatchAddEdges( return neug::Status::OK(); } -// TODO(zhanglei): support extra_type_info Status PropertyGraph::CreateVertexType( const std::string& vertex_type_name, const std::vector>& properties, @@ -260,7 +259,6 @@ Status PropertyGraph::CreateVertexType( return neug::Status::OK(); } -// TODO(zhanglei): support extra_type_info Status PropertyGraph::CreateEdgeType( const std::string& src_vertex_type, const std::string& dst_vertex_type, const std::string& edge_type_name, @@ -338,7 +336,6 @@ Status PropertyGraph::CreateEdgeType( return neug::Status::OK(); } -// TODO(zhanglei): Support extra_type_info Status PropertyGraph::AddVertexProperties( const std::string& vertex_type_name, const std::vector>& @@ -386,7 +383,6 @@ Status PropertyGraph::AddVertexProperties( return neug::Status::OK(); } -// TODO(zhanglei): Support extra_type_info Status PropertyGraph::AddEdgeProperties( const std::string& src_type_name, const std::string& dst_type_name, const std::string& edge_type_name, diff --git a/src/storages/graph/vertex_table.cc b/src/storages/graph/vertex_table.cc index dcc10d08..18de1fc5 100644 --- a/src/storages/graph/vertex_table.cc +++ b/src/storages/graph/vertex_table.cc @@ -34,7 +34,6 @@ void VertexTable::Open(const std::string& work_dir, int memory_level) { indexer_.open(indexer_filename, checkpoint_dir_path, work_dir_); table_->open(vertex_table_prefix(label_name), work_dir_, vertex_schema_->property_names, vertex_schema_->property_types, - vertex_schema_->default_property_values, vertex_schema_->storage_strategies); } else if (memory_level_ == 1) { @@ -42,7 +41,6 @@ void VertexTable::Open(const std::string& work_dir, int memory_level) { table_->open_in_memory(vertex_table_prefix(label_name), work_dir_, vertex_schema_->property_names, vertex_schema_->property_types, - vertex_schema_->default_property_values, vertex_schema_->storage_strategies); } else if (memory_level_ >= 2) { @@ -51,7 +49,6 @@ void VertexTable::Open(const std::string& work_dir, int memory_level) { table_->open_with_hugepages( vertex_table_prefix(label_name), work_dir_, vertex_schema_->property_names, vertex_schema_->property_types, - vertex_schema_->default_property_values, vertex_schema_->storage_strategies, (memory_level_ > 2)); } else { THROW_INTERNAL_EXCEPTION("Invalid memory level: " + @@ -188,7 +185,7 @@ size_t VertexTable::EnsureCapacity(size_t capacity) { indexer_.reserve(capacity); } if (table_ && table_->size() < capacity) { - table_->resize(capacity); + table_->resize(capacity, vertex_schema_->default_property_values); } v_ts_.Reserve(capacity); return indexer_.capacity(); diff --git a/src/utils/property/column.cc b/src/utils/property/column.cc index 01b3401c..2bbc7826 100644 --- a/src/utils/property/column.cc +++ b/src/utils/property/column.cc @@ -68,6 +68,7 @@ class TypedEmptyColumn : public ColumnBase { void close() override {} size_t size() const override { return 0; } void resize(size_t size) override {} + void resize(size_t size, const Property& default_value) override {} DataTypeId type() const override { return PropUtils::prop_type(); } @@ -108,6 +109,7 @@ class TypedEmptyColumn : public ColumnBase { void close() override {} size_t size() const override { return 0; } void resize(size_t size) override {} + void resize(size_t size, const Property& default_value) override {} DataTypeId type() const override { return DataTypeId::kVarchar; } @@ -132,7 +134,7 @@ class TypedEmptyColumn : public ColumnBase { void ensure_writable(const std::string& work_dir) override {} }; -std::shared_ptr CreateColumn(DataType type, Property default_value, +std::shared_ptr CreateColumn(DataType type, StorageStrategy strategy) { auto type_id = type.id(); auto extra_type_info = type.RawExtraTypeInfo(); @@ -151,10 +153,9 @@ std::shared_ptr CreateColumn(DataType type, Property default_value, } } else { switch (type_id) { -#define TYPE_DISPATCHER(enum_val, type) \ - case DataTypeId::enum_val: \ - return std::make_shared>( \ - PropUtils::to_typed(default_value), strategy); +#define TYPE_DISPATCHER(enum_val, type) \ + case DataTypeId::enum_val: \ + return std::make_shared>(strategy); FOR_EACH_DATA_TYPE_NO_STRING(TYPE_DISPATCHER) #undef TYPE_DISPATCHER case DataTypeId::kVarchar: { @@ -165,8 +166,7 @@ std::shared_ptr CreateColumn(DataType type, Property default_value, max_length = str_info->max_length; } } - return std::make_shared(strategy, max_length, - default_value.as_string_view()); + return std::make_shared(strategy, max_length); } case DataTypeId::kEmpty: { return std::make_shared>(strategy); diff --git a/src/utils/property/table.cc b/src/utils/property/table.cc index c3299b2b..53f9ab39 100644 --- a/src/utils/property/table.cc +++ b/src/utils/property/table.cc @@ -33,12 +33,10 @@ Table::~Table() { close(); } void Table::initColumns(const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { size_t col_num = col_name.size(); columns_.clear(); col_names_.clear(); - col_default_values_.clear(); col_id_map_.clear(); columns_.resize(col_num, nullptr); auto strategies = strategies_; @@ -49,11 +47,7 @@ void Table::initColumns(const std::vector& col_name, col_id_map_.insert({col_name[i], col_id}); col_names_.emplace_back(col_name[i]); assert(i < property_types.size()); - col_default_values_.emplace_back( - i < default_property_values.size() - ? default_property_values[i] - : get_default_value(property_types[i].id())); - columns_[col_id] = CreateColumn(property_types[i], col_default_values_[i]); + columns_[col_id] = CreateColumn(property_types[i]); } columns_.resize(col_id_map_.size()); } @@ -61,11 +55,10 @@ void Table::initColumns(const std::vector& col_name, void Table::init(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { name_ = name; work_dir_ = work_dir; - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open(name + ".col_" + std::to_string(i), "", work_dir); } @@ -76,12 +69,11 @@ void Table::init(const std::string& name, const std::string& work_dir, void Table::open(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { name_ = name; work_dir_ = work_dir; snapshot_dir_ = checkpoint_dir(work_dir_); - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open(name + ".col_" + std::to_string(i), snapshot_dir_, tmp_dir(work_dir)); @@ -93,12 +85,11 @@ void Table::open(const std::string& name, const std::string& work_dir, void Table::open_in_memory(const std::string& name, const std::string& work_dir, const std::vector& col_name, const std::vector& property_types, - const std::vector& default_property_values, const std::vector& strategies_) { name_ = name; work_dir_ = work_dir; snapshot_dir_ = checkpoint_dir(work_dir_); - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open_in_memory(snapshot_dir_ + "/" + name + ".col_" + std::to_string(i)); @@ -107,16 +98,16 @@ void Table::open_in_memory(const std::string& name, const std::string& work_dir, buildColumnPtrs(); } -void Table::open_with_hugepages( - const std::string& name, const std::string& work_dir, - const std::vector& col_name, - const std::vector& property_types, - const std::vector& default_property_values, - const std::vector& strategies_, bool force) { +void Table::open_with_hugepages(const std::string& name, + const std::string& work_dir, + const std::vector& col_name, + const std::vector& property_types, + const std::vector& strategies_, + bool force) { name_ = name; work_dir_ = work_dir; snapshot_dir_ = checkpoint_dir(work_dir); - initColumns(col_name, property_types, default_property_values, strategies_); + initColumns(col_name, property_types, strategies_); for (size_t i = 0; i < columns_.size(); ++i) { columns_[i]->open_with_hugepages( snapshot_dir_ + "/" + name + ".col_" + std::to_string(i), force); @@ -159,9 +150,14 @@ void Table::reset_header(const std::vector& col_name) { void Table::add_columns(const std::vector& col_names, const std::vector& col_types, const std::vector& default_property_values, - size_t column_size, + size_t capacity, const std::vector& strategies_, int memory_level) { + if (default_property_values.size() != col_names.size()) { + THROW_RUNTIME_ERROR("default_property_values size mismatch: expected " + + std::to_string(col_names.size()) + " but got " + + std::to_string(default_property_values.size())); + } // When add_columns are called, the table is already initialized and col_files // are opened. std::stringstream ss; @@ -175,10 +171,9 @@ void Table::add_columns(const std::vector& col_names, int col_id = col_names_.size(); col_id_map_.insert({col_names[i], col_id}); col_names_.emplace_back(col_names[i]); - col_default_values_.emplace_back(default_property_values[i]); - columns_[col_id] = CreateColumn( - col_types[i], default_property_values[i], - i < strategies_.size() ? strategies_[i] : StorageStrategy::kMem); + columns_[col_id] = CreateColumn(col_types[i], i < strategies_.size() + ? strategies_[i] + : StorageStrategy::kMem); } for (size_t i = old_size; i < columns_.size(); ++i) { if (memory_level == 0) { @@ -190,7 +185,7 @@ void Table::add_columns(const std::vector& col_names, } else { THROW_NOT_IMPLEMENTED_EXCEPTION("Unsupported memory level"); } - columns_[i]->resize(column_size); + columns_[i]->resize(capacity, default_property_values[i - old_size]); } buildColumnPtrs(); } @@ -217,7 +212,6 @@ void Table::delete_column(const std::string& col_name) { columns_[col_id].reset(); columns_.erase(columns_.begin() + col_id); col_names_.erase(col_names_.begin() + col_id); - col_default_values_.erase(col_default_values_.begin() + col_id); for (size_t i = col_id; i < column_ptrs_.size() - 1; i++) { column_ptrs_[i] = column_ptrs_[i + 1]; } @@ -329,6 +323,19 @@ void Table::resize(size_t row_num) { } } +void Table::resize(size_t row_num, + const std::vector& default_values) { + if (default_values.size() != columns_.size()) { + THROW_RUNTIME_ERROR("default_values size mismatch: expected " + + std::to_string(columns_.size()) + " but got " + + std::to_string(default_values.size())); + } + for (size_t i = 0; i < columns_.size(); ++i) { + columns_[i]->ensure_writable(work_dir_); + columns_[i]->resize(row_num, default_values[i]); + } +} + void Table::ingest(uint32_t index, OutArchive& arc) { if (column_ptrs_.size() == 0) { return; diff --git a/tests/utils/test_table.cc b/tests/utils/test_table.cc index f02c843a..4e0e74ee 100644 --- a/tests/utils/test_table.cc +++ b/tests/utils/test_table.cc @@ -80,19 +80,6 @@ TEST(TableTest, TestTableBasic) { {DataTypeId::kTimestampMs}, {DataTypeId::kInterval}, {DataTypeId::kVarchar}}; - std::vector default_values = { - Property::from_bool(false), - Property::from_int32(0), - Property::from_uint32(0), - Property::from_int64(0), - Property::from_uint64(0), - Property::from_float(0.0), - Property::from_double(0.0), - Property::from_date(Date(0)), - Property::from_datetime(DateTime(0)), - Property::from_interval(Interval(std::string(""))), - Property::from_string_view("")}; - std::vector disk_strategies(col_name.size(), StorageStrategy::kDisk); std::vector mem_strategies(col_name.size(), @@ -101,11 +88,11 @@ TEST(TableTest, TestTableBasic) { StorageStrategy::kNone); disk_table.init("test_dist", TEST_DIR, col_name, property_types, - default_values, disk_strategies); + disk_strategies); mem_table.init("test_dist", TEST_DIR, col_name, property_types, - default_values, mem_strategies); + mem_strategies); none_table.init("test_dist", TEST_DIR, col_name, property_types, - default_values, none_strategies); + none_strategies); disk_table.resize(10); mem_table.resize(10); @@ -316,7 +303,7 @@ TEST(TableTest, TestTableBasic) { mem_table.drop(); disk_table.open("disk_table", std::string(TEST_DIR), col_name, property_types, - default_values, disk_strategies); + disk_strategies); EXPECT_EQ(disk_table.col_num(), 11); EXPECT_EQ(disk_table.get_column_by_id(0)->size(), 10); disk_table.reset_header(col_name); @@ -330,7 +317,7 @@ TEST(TableTest, TestTableBasic) { disk_table.drop(); mem_table.open_in_memory("disk_table", std::string(TEST_DIR), col_name, - property_types, default_values, mem_strategies); + property_types, mem_strategies); EXPECT_EQ(mem_table.col_num(), 11); EXPECT_EQ(mem_table.get_column_by_id(0)->size(), 10); const Table& mem_table_ref = mem_table; @@ -339,5 +326,46 @@ TEST(TableTest, TestTableBasic) { EXPECT_EQ(mem_table_ref.get_column_by_id(0)->type(), DataTypeId::kBoolean); } +TEST(TableTest, StringColumnDistinguishesUnsetFromEmptyString) { + if (std::filesystem::exists(TEST_DIR)) { + std::filesystem::remove_all(TEST_DIR); + } + std::filesystem::create_directories(TEST_DIR); + std::filesystem::create_directories(std::string(TEST_DIR) + "/checkpoint"); + std::filesystem::create_directories(std::string(TEST_DIR) + "/runtime/tmp"); + + Table table; + std::vector col_name = {"string_column"}; + std::vector property_types = {{DataTypeId::kVarchar}}; + std::vector mem_strategies(col_name.size(), + StorageStrategy::kMem); + + table.init("test_string_validity", TEST_DIR, col_name, property_types, + mem_strategies); + table.resize(2, {Property::from_string_view("default_value")}); + + auto string_column = std::dynamic_pointer_cast( + table.get_column("string_column")); + ASSERT_NE(string_column, nullptr); + + EXPECT_EQ(string_column->get_prop(0).as_string_view(), "default_value"); + + string_column->set_prop(1, Property::from_string_view("")); + EXPECT_TRUE(string_column->get_prop(1).as_string_view().empty()); + EXPECT_EQ(string_column->get_prop(1).type(), DataTypeId::kVarchar); + string_column->set_prop( + 1, Property::from_string_view("new value new value new value")); + EXPECT_EQ(string_column->get_prop(1).as_string_view(), + "new value new value new value"); + std::string path = std::string(TEST_DIR) + "/string_column"; + string_column->dump(path); + + StringColumn new_string_column(StorageStrategy::kMem); + new_string_column.open_in_memory(path); + EXPECT_EQ(new_string_column.get_prop(0).as_string_view(), "default_value"); + EXPECT_EQ(new_string_column.get_prop(1).as_string_view(), + "new value new value new value"); +} + } // namespace test -} // namespace neug \ No newline at end of file +} // namespace neug diff --git a/tools/python_bind/tests/test_ddl.py b/tools/python_bind/tests/test_ddl.py index 5b780d5f..8b9b63ea 100644 --- a/tools/python_bind/tests/test_ddl.py +++ b/tools/python_bind/tests/test_ddl.py @@ -360,3 +360,62 @@ def test_alter_varchar_type(): assert list(res) == [[1, "Alice"]] conn.close() db.close() + + +def test_get_varchar_default_value_1(): + db_dir = "/tmp/test_get_varchar_default_value_1" + shutil.rmtree(db_dir, ignore_errors=True) + db = Database(db_dir, "w") + conn = db.connect() + conn.execute( + "CREATE NODE TABLE TestNode(id INT64 PRIMARY KEY, name VARCHAR(20) DEFAULT 'default_name');" + ) + conn.execute("CREATE (:TestNode {id: 1});") + conn.execute("CREATE (:TestNode {id: 2});") + conn.execute("CREATE (:TestNode {id: 3});") + res = conn.execute("Match (n:TestNode) Return n.name;") + assert list(res) == [["default_name"], ["default_name"], ["default_name"]] + conn.close() + db.close() + + +def test_get_varchar_default_value_2(): + db_dir = "/tmp/test_get_varchar_default_value_2" + shutil.rmtree(db_dir, ignore_errors=True) + db = Database(db_dir, "w") + conn = db.connect() + conn.execute("CREATE NODE TABLE TestNode(id INT64 PRIMARY KEY);") + conn.execute("CREATE REL TABLE TestEdge(FROM TestNode TO TestNode);") + conn.execute("CREATE (:TestNode {id: 1});") + conn.execute("CREATE (:TestNode {id: 2});") + conn.execute("CREATE (:TestNode {id: 3});") + conn.execute( + "MATCH (a:TestNode {id: 1}), (b:TestNode {id: 2}) CREATE (a)-[:TestEdge]->(b);" + ) + conn.execute( + "MATCH (a:TestNode {id: 2}), (b:TestNode {id: 3}) CREATE (a)-[:TestEdge]->(b);" + ) + conn.execute("ALTER TABLE TestNode ADD name VARCHAR(20) DEFAULT 'default_name';") + conn.execute("CREATE (:TestNode {id: 4});") + conn.execute("CREATE (:TestNode {id: 5, name: 'custom_name'});") + res = conn.execute("Match (n:TestNode) Return n.name ORDER BY n.name;") + assert list(res) == [ + ["custom_name"], + ["default_name"], + ["default_name"], + ["default_name"], + ["default_name"], + ] + conn.execute("ALTER TABLE TestEdge ADD date INT64;") + conn.execute( + "MATCH (a:TestNode {id: 1})-[e:TestEdge]->(b:TestNode {id: 2}) SET e.date = 1234567890;" + ) + conn.execute( + "MATCH (a:TestNode {id: 1}), (b:TestNode { id: 3 }) CREATE (a)-[:TestEdge {date: 9876543210}]->(b);" + ) + res = conn.execute( + "MATCH (a:TestNode {id: 1})-[e:TestEdge]->(b:TestNode) RETURN e.date;" + ) + assert list(res) == [[1234567890], [9876543210]] + conn.close() + db.close()