forked from systemed/tilemaker
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsorted_way_store.cpp
More file actions
653 lines (532 loc) · 20.3 KB
/
sorted_way_store.cpp
File metadata and controls
653 lines (532 loc) · 20.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
#include <algorithm>
#include <bitset>
#include <cstring>
#include <iostream>
#include "external/libpopcnt.h"
#include "external/streamvbyte.h"
#include "external/streamvbyte_zigzag.h"
#include "sorted_way_store.h"
#include "node_store.h"
namespace SortedWayStoreTypes {
	// Ways are indexed hierarchically: a group holds up to GroupSize chunks,
	// and a chunk holds up to ChunkSize ways, so one group spans 64K way IDs.
	const uint16_t GroupSize = 256;
	const uint16_t ChunkSize = 256;
	// Encoded ways >= 256 bytes are stored at this alignment; their offsets
	// are stored scaled by it (see publishGroup/at).
	const size_t LargeWayAlignment = 64;

	// We encode some things in the length of a way's unused upper bits.
	const uint16_t CompressedWay = 1 << 15;    // low 31-bit values are delta/streamvbyte compressed
	const uint16_t ClosedWay = 1 << 14;        // last node == first node; stored only once
	const uint16_t UniformUpperBits = 1 << 13; // all nodes share the same upper bits

	// Per-thread, per-store scratch state used while buffering inserts.
	struct ThreadStorage {
		ThreadStorage():
			collectingOrphans(true),
			groupStart(-1),
			localWays(nullptr) {}

		bool collectingOrphans;   // still before the first full-group boundary?
		uint64_t groupStart;      // first way ID of the group currently buffered
		// Points into SortedWayStore::workerBuffers; owned by the store.
		std::vector<std::pair<WayID, std::vector<NodeID>>>* localWays;
		std::vector<uint8_t> encodedWay; // reusable encode buffer
	};

	// One ThreadStorage per (thread, store) pair. A deque keeps references
	// stable as entries are appended.
	thread_local std::deque<std::pair<const SortedWayStore*, ThreadStorage>> threadStorage;

	// Find (or lazily create) this thread's storage for store `who`.
	inline ThreadStorage& s(const SortedWayStore* who) {
		for (auto& entry : threadStorage)
			if (entry.first == who)
				return entry.second;

		threadStorage.push_back(std::make_pair(who, ThreadStorage()));

		auto& rv = threadStorage.back();
		return rv.second;
	}

	// C++ doesn't support variable length arrays declared on stack.
	// g++ and clang support it, but msvc doesn't. Rather than pay the
	// cost of a vector for every decode, we use a thread_local with room for at
	// least 2,000 nodes.
	//
	// Note: these are scratch buffers, so they remain as true thread-locals,
	// and aren't part of ThreadStorage.
	thread_local uint64_t highBytes[2000];
	thread_local uint32_t uint32Buffer[2000];
	thread_local int32_t int32Buffer[2000];
	thread_local uint8_t uint8Buffer[8192];
}
using namespace SortedWayStoreTypes;
// `compressWays` enables compression of node IDs when publishing (applied
// only to ways with >= 4 nodes; see publishGroup). `nodeStore` is used by
// at() to resolve node IDs into coordinates; it must outlive this store.
SortedWayStore::SortedWayStore(bool compressWays, const NodeStore& nodeStore): compressWays(compressWays), nodeStore(nodeStore) {
	s(this); // allocate our ThreadStorage before multi-threading
	reopen();
}
SortedWayStore::~SortedWayStore() {
	// Reset this store's slot in the thread-local registry so it no longer
	// points at freed worker buffers. NOTE(review): this only resets the
	// entry for the thread running the destructor.
	s(this) = ThreadStorage();
}
// Reset the store to an empty state, discarding all previously stored ways.
void SortedWayStore::reopen() {
	// Zero every statistic...
	totalWays = 0;
	totalNodes = 0;
	totalGroups = 0;
	totalChunks = 0;
	totalGroupSpace = 0;

	// ...and drop all buffered/allocated state.
	allocatedMemory.clear();
	orphanage.clear();
	workerBuffers.clear();
	groups.clear();

	// A group spans 64K way IDs, so 32K group slots cover 2^31 (~2 billion)
	// ways -- about twice the number used by OSM as of December 2023.
	groups.resize(32 * 1024);
}
// Is way `id` present in the store? `shard` is unused (this store is not
// sharded).
//
// Unlike at(), membership only needs the mask bits -- the original code
// also computed way offsets with popcnt and then discarded them; that dead
// work is removed here.
bool SortedWayStore::contains(size_t shard, WayID id) const {
	// Decompose the ID into group, chunk-within-group, and bit positions.
	const size_t groupIndex = id / (GroupSize * ChunkSize);
	const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize;
	const uint64_t chunkMaskByte = chunk / 8;
	const uint64_t chunkMaskBit = chunk % 8;

	const uint64_t wayMaskByte = (id % ChunkSize) / 8;
	const uint64_t wayMaskBit = id % 8;

	GroupInfo* groupPtr = groups[groupIndex];

	if (groupPtr == nullptr)
		return false;

	// Absent chunk => absent way.
	if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit)))
		return false;

	// The chunk's slot in chunkOffsets equals the number of chunk-mask bits
	// set before this chunk's bit: whole bytes first, then the partial byte.
	size_t chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte);
	{
		uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte];
		maskByte = maskByte & ((1 << chunkMaskBit) - 1);
		chunkOffset += popcnt(&maskByte, 1);
	}

	ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupPtr + groupPtr->chunkOffsets[chunkOffset]);

	// A way is stored either as a small way or a big way; check both masks.
	if (chunkPtr->smallWayMask[wayMaskByte] & (1 << wayMaskBit))
		return true;
	return (chunkPtr->bigWayMask[wayMaskByte] & (1 << wayMaskBit)) != 0;
}
// Look up way `id` and return the coordinates of each of its nodes
// (resolved via nodeStore). Throws std::out_of_range if the way is absent.
std::vector<LatpLon> SortedWayStore::at(WayID id) const {
	// Decompose the ID into group, chunk-within-group, and the byte/bit
	// positions that identify the chunk and way inside the bitmasks.
	const size_t groupIndex = id / (GroupSize * ChunkSize);
	const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize;
	const uint64_t chunkMaskByte = chunk / 8;
	const uint64_t chunkMaskBit = chunk % 8;

	const uint64_t wayMaskByte = (id % ChunkSize) / 8;
	const uint64_t wayMaskBit = id % 8;

	GroupInfo* groupPtr = groups[groupIndex];

	if (groupPtr == nullptr) {
		throw std::out_of_range("SortedWayStore::at(" + std::to_string(id) + ") uses non-existent group " + std::to_string(groupIndex));
	}

	// The chunk's slot in chunkOffsets equals the number of chunk-mask bits
	// set before this chunk's bit.
	size_t chunkOffset = 0;
	{
		// Whole mask bytes before ours...
		chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte);
		// ...plus the bits below ours within our own byte.
		uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte];
		maskByte = maskByte & ((1 << chunkMaskBit) - 1);
		chunkOffset += popcnt(&maskByte, 1);

		if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit)))
			throw std::out_of_range("SortedWayStore: way " + std::to_string(id) + " missing, no chunk");
	}

	// chunkOffsets holds byte offsets relative to the start of the group.
	ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupPtr + groupPtr->chunkOffsets[chunkOffset]);

	const size_t numWays = popcnt(chunkPtr->smallWayMask, 32) + popcnt(chunkPtr->bigWayMask, 32);

	// Encoded way data begins immediately after the wayOffsets array.
	uint8_t* const endOfWayOffsetPtr = (uint8_t*)(chunkPtr->wayOffsets + numWays);
	EncodedWay* wayPtr = nullptr;

	// Small ways (encoded size < 256 bytes) come first; their offsets are
	// unscaled byte offsets from endOfWayOffsetPtr.
	{
		size_t wayOffset = 0;
		wayOffset = popcnt(chunkPtr->smallWayMask, wayMaskByte);
		uint8_t maskByte = chunkPtr->smallWayMask[wayMaskByte];
		maskByte = maskByte & ((1 << wayMaskBit) - 1);
		wayOffset += popcnt(&maskByte, 1);
		if (chunkPtr->smallWayMask[wayMaskByte] & (1 << wayMaskBit)) {
			wayPtr = (EncodedWay*)(endOfWayOffsetPtr + chunkPtr->wayOffsets[wayOffset]);
		}
	}

	// If we didn't find it in small ways, look in big ways.
	if (wayPtr == nullptr) {
		// Big-way offset slots follow all of the small-way slots.
		size_t wayOffset = 0;
		wayOffset += popcnt(chunkPtr->smallWayMask, 32);
		wayOffset += popcnt(chunkPtr->bigWayMask, wayMaskByte);
		uint8_t maskByte = chunkPtr->bigWayMask[wayMaskByte];
		maskByte = maskByte & ((1 << wayMaskBit) - 1);
		wayOffset += popcnt(&maskByte, 1);
		if (!(chunkPtr->bigWayMask[wayMaskByte] & (1 << wayMaskBit)))
			throw std::out_of_range("SortedWayStore: way " + std::to_string(id) + " missing, no way");

		// Big-way offsets are stored scaled by LargeWayAlignment.
		wayPtr = (EncodedWay*)(endOfWayOffsetPtr + chunkPtr->wayOffsets[wayOffset] * LargeWayAlignment);
	}

	std::vector<NodeID> nodes = SortedWayStore::decodeWay(wayPtr->flags, wayPtr->data);

	std::vector<LatpLon> rv;
	for (const NodeID& node : nodes)
		rv.push_back(nodeStore.at(node));
	return rv;
}
// Unsupported: this store only accepts node-ID-based ways via insertNodes().
void SortedWayStore::insertLatpLons(std::vector<WayStore::ll_element_t> &newWays) {
	throw std::runtime_error("SortedWayStore does not support insertLatpLons");
}
void SortedWayStore::insertNodes(const std::vector<std::pair<WayID, std::vector<NodeID>>>& newWays) {
// pbf_processor can call with an empty array if the only ways it read were unable to
// be processed due to missing nodes, so be robust against empty way vector.
if (newWays.empty())
return;
ThreadStorage& tls = s(this);
if (tls.localWays == nullptr) {
std::lock_guard<std::mutex> lock(orphanageMutex);
if (workerBuffers.size() == 0)
workerBuffers.reserve(256);
else if (workerBuffers.size() == workerBuffers.capacity())
throw std::runtime_error("SortedWayStore doesn't support more than 256 cores");
workerBuffers.push_back(std::vector<std::pair<WayID, std::vector<NodeID>>>());
tls.localWays = &workerBuffers.back();
}
if (tls.groupStart == -1) {
// Mark where the first full group starts, so we know when to transition
// out of collecting orphans.
tls.groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
}
int i = 0;
while (tls.collectingOrphans && i < newWays.size()) {
const auto& el = newWays[i];
if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) {
tls.collectingOrphans = false;
// Calculate new groupStart, rounding to previous boundary.
tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
collectOrphans(*tls.localWays);
tls.localWays->clear();
}
tls.localWays->push_back(el);
i++;
}
while(i < newWays.size()) {
const auto& el = newWays[i];
if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) {
publishGroup(*tls.localWays);
tls.localWays->clear();
tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize);
}
tls.localWays->push_back(el);
i++;
}
}
// Discard all stored ways; equivalent to reopen().
void SortedWayStore::clear() {
	// TODO: why does this function exist in addition to reopen?
	reopen();
}
// Number of ways published into the index so far (totalWays is only
// incremented by publishGroup, so ways still buffered aren't counted).
std::size_t SortedWayStore::size() const {
	return totalWays.load();
}
// Flush all buffered state into the final index; call after all inserts
// are complete. `threadNum` is currently unused.
void SortedWayStore::finalize(unsigned int threadNum) {
	// Ways still sitting in worker buffers never completed a group;
	// route them through the orphanage first.
	for (const auto& buffer: workerBuffers) {
		if (buffer.size() > 0) {
			collectOrphans(buffer);
		}
	}
	workerBuffers.clear();

	// Empty the orphanage into the index.
	std::vector<std::pair<WayID, std::vector<NodeID>>> copy;
	for (const auto& entry: orphanage) {
		for (const auto& orphan: entry.second)
			copy.push_back(orphan);

		// Orphans may come from different workers, and thus be unsorted.
		std::sort(
			copy.begin(),
			copy.end(),
			[](auto const &a, auto const &b) { return a.first < b.first; }
		);
		// Each orphanage entry is one group's worth of ways.
		publishGroup(copy);
		copy.clear(); // reuse the buffer; clear() keeps capacity
	}
	orphanage.clear();

	std::cout << "SortedWayStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalWays.load() << " ways, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes" << std::endl;
}
// Signal the start of a new input batch on the calling thread: reset the
// orphan-collection state and flush any buffered ways to the orphanage.
void SortedWayStore::batchStart() {
	ThreadStorage& tls = s(this);
	tls.collectingOrphans = true;
	tls.groupStart = -1;
	// Nothing buffered yet on this thread? Then there's nothing to flush.
	if (tls.localWays == nullptr || tls.localWays->size() == 0)
		return;

	collectOrphans(*tls.localWays);
	tls.localWays->clear();
}
void SortedWayStore::collectOrphans(const std::vector<std::pair<WayID, std::vector<NodeID>>>& orphans) {
std::lock_guard<std::mutex> lock(orphanageMutex);
size_t groupIndex = orphans[0].first / (GroupSize * ChunkSize);
std::vector<std::pair<WayID, std::vector<NodeID>>>& vec = orphanage[groupIndex];
const size_t i = vec.size();
vec.resize(i + orphans.size());
std::copy(orphans.begin(), orphans.end(), vec.begin() + i);
}
// Decode an encoded way (see encodeWay) back into its node IDs.
//
// `flags` is encodeWay's return value: the low 11 bits are the stored node
// count; CompressedWay/ClosedWay/UniformUpperBits occupy the top bits.
// `input` points at the encoded payload.
//
// Uses the shared thread_local scratch buffers (highBytes et al.), so it is
// safe per-thread but not reentrant within a thread.
std::vector<NodeID> SortedWayStore::decodeWay(uint16_t flags, const uint8_t* input) {
	std::vector<NodeID> rv;

	bool isCompressed = flags & CompressedWay;
	bool isClosed = flags & ClosedWay;

	const uint16_t length = flags & 0b0000011111111111;

	if (!(flags & UniformUpperBits)) {
		// The nodes don't all share the same upper int; unpack which
		// bits are set on a per-node basis.
		// Each input byte holds two nodes' upper-4-bit values, low nibble
		// corresponding to the even-indexed node.
		for (int i = 0; i <= (length - 1) / 2; i++) {
			uint8_t byte = *input;
			for (int j = i * 2; j < std::min<int>(length, i * 2 + 2); j++) {
				uint64_t highByte = 0;
				highByte |= (byte & 0b00001111);
				byte = byte >> 4;
				// Upper bits sit above the 31 low bits of the node ID.
				highBytes[j] = (highByte << 31);
			}
			input++;
		}
	} else {
		// All nodes share one upper-bits value, stored as a single byte.
		uint8_t setBits = *(uint8_t*)input;
		input++;
		uint64_t highByte = setBits;
		highByte = highByte << 31;
		for (int i = 0; i < length; i++)
			highBytes[i] = highByte;
	}

	if (!isCompressed) {
		// Decode the low ints
		// Uncompressed layout: `length` raw 32-bit low values.
		uint32_t* lowIntData = (uint32_t*)input;
		for (int i = 0; i < length; i++)
			rv.push_back(highBytes[i] | lowIntData[i]);
	} else {
		// Compressed layout: 2-byte compressed size (skipped here),
		// 4-byte first value, then streamvbyte-encoded zigzag deltas
		// for the remaining length-1 values.
		input += 2;
		uint32_t firstInt = *(uint32_t*)(input);
		input += 4;
		rv.push_back(highBytes[0] | firstInt);

		streamvbyte_decode(input, uint32Buffer, length - 1);
		zigzag_delta_decode(uint32Buffer, int32Buffer, length - 1, firstInt);
		for (int i = 1; i < length; i++) {
			uint32_t tmp = int32Buffer[i - 1];
			rv.push_back(highBytes[i] | tmp);
		}
	}

	// Closed ways omit their final (repeated) point; restore it.
	if (isClosed)
		rv.push_back(rv[0]);
	return rv;
};
// Encode `way`'s node IDs into `output`; the returned flags word carries
// the stored node count in its low 11 bits plus the CompressedWay /
// ClosedWay / UniformUpperBits markers consumed by decodeWay.
//
// Throws for empty ways and for ways longer than 2,000 nodes (the scratch
// buffer capacity). Uses the thread_local scratch buffers when compressing.
uint16_t SortedWayStore::encodeWay(const std::vector<NodeID>& way, std::vector<uint8_t>& output, bool compress) {
	if (way.size() == 0)
		throw std::runtime_error("Cannot encode an empty way");

	if (way.size() > 2000)
		throw std::runtime_error("Way had more than 2,000 nodes");

	bool isClosed = way.size() > 1 && way[0] == way[way.size() - 1];
	output.clear();

	// When the way is closed, store that in a single bit and omit
	// the final point.
	const int max = isClosed ? way.size() - 1 : way.size();

	uint16_t rv = max;

	if (compress)
		rv |= CompressedWay;

	if (isClosed)
		rv |= ClosedWay;

	// Do all nodes share the same value in bits 31 and above?
	bool pushUpperBits = false;

	// zigzag encoding can only be done on ints, not uints, so we shift
	// 31 bits, not 32.
	uint32_t upperInt = way[0] >> 31;
	for (int i = 1; i < way.size(); i++) {
		if (way[i] >> 31 != upperInt) {
			pushUpperBits = true;
			break;
		}
	}

	if (pushUpperBits) {
		// Mixed upper bits: pack each node's upper 4 bits, two nodes per
		// byte, with the even-indexed node in the low nibble.
		for (int i = 0; i <= (max - 1) / 2; i++) {
			uint8_t byte = 0;

			bool first = true;
			// Walk the pair high-index-first so the low nibble ends up
			// holding the even-indexed node.
			for (int j = std::min(max, i * 2 + 2) - 1; j >= i * 2; j--) {
				if (!first)
					byte = byte << 4;
				first = false;
				uint8_t upper4Bits = way[j] >> 31;
				// Only 4 bits are available per node above bit 31.
				if (upper4Bits > 15)
					throw std::runtime_error("unexpectedly high node ID: " + std::to_string(way[j]));
				byte |= upper4Bits;
			}
			output.push_back(byte);
		}
	} else {
		// Uniform upper bits: store the shared value once, in one byte.
		if (upperInt > 15)
			throw std::runtime_error("unexpectedly high node ID");
		rv |= UniformUpperBits;
		output.push_back(upperInt);
	}

	// Push the low bytes.
	if (!compress) {
		// Uncompressed: `max` 32-bit values holding the low 31 bits of each ID.
		const size_t oldSize = output.size();
		output.resize(output.size() + max * 4);
		uint32_t* dataStart = (uint32_t*)(output.data() + oldSize);
		for (int i = 0; i < max; i++) {
			uint32_t lowBits = way[i];
			lowBits = lowBits & 0x7FFFFFFF;
			dataStart[i] = lowBits;
		}
	} else {
		// Compressed: zigzag-delta the low 31-bit values relative to the
		// first one, then streamvbyte-encode the deltas.
		for (int i = 0; i < max; i++) {
			uint32_t truncated = way[i];
			truncated = truncated & 0x7FFFFFFF;
			int32Buffer[i] = truncated;
		}
		zigzag_delta_encode(int32Buffer + 1, uint32Buffer, max - 1, int32Buffer[0]);
		size_t compressedSize = streamvbyte_encode(uint32Buffer, max - 1, uint8Buffer);

		// Layout: 2-byte compressed size, 4-byte first value, then the
		// compressed delta stream (matches decodeWay's expectations).
		const size_t oldSize = output.size();
		output.resize(output.size() + 2 /* compressed size */ + 4 /* first 32-bit value */ + compressedSize);
		*(uint16_t*)(output.data() + oldSize) = compressedSize;
		*(uint32_t*)(output.data() + oldSize + 2) = way[0];
		*(uint32_t*)(output.data() + oldSize + 2) &= 0x7FFFFFFF;
		memcpy(output.data() + oldSize + 2 + 4, uint8Buffer, compressedSize);
	}
	return rv;
}
// Build a 256-entry bitmap over the 32-byte array `mask`: for every value
// in `ids`, set bit (id % 8) of byte (id / 8); all other bits are cleared.
void populateMask(uint8_t* mask, const std::vector<uint8_t>& ids) {
	// mask should be a 32-byte array of uint8_t
	std::fill(mask, mask + 32, 0);
	for (const uint8_t id : ids)
		mask[id >> 3] |= (uint8_t)(1u << (id & 7));
}
// Lay out one complete group of ways (all with the same group index, sorted
// by ascending ID) into a single allocation and install it in `groups`.
// Throws if the group is empty, oversized, or already published.
void SortedWayStore::publishGroup(const std::vector<std::pair<WayID, std::vector<NodeID>>>& ways) {
	ThreadStorage& tls = s(this);
	totalWays += ways.size();
	if (ways.size() == 0) {
		throw std::runtime_error("SortedWayStore: group is empty");
	}
	size_t groupIndex = ways[0].first / (GroupSize * ChunkSize);

	if (groupIndex >= groups.size())
		throw std::runtime_error("SortedWayStore: unexpected groupIndex " + std::to_string(groupIndex));

	// A group can hold at most GroupSize * ChunkSize (64K) ways.
	if (ways.size() > ChunkSize * GroupSize) {
		std::cout << "groupIndex=" << groupIndex << ", first ID=" << ways[0].first << ", ways.size() = " << ways.size() << std::endl;
		throw std::runtime_error("SortedWayStore: group is too big");
	}
	totalGroups++;

	// Per-chunk staging data, built before the final layout is known.
	struct ChunkData {
		uint8_t id;                                // chunk index within the group
		std::vector<uint8_t> wayIds;               // way index within the chunk
		std::vector<uint16_t> wayFlags;            // flags from encodeWay
		std::deque<std::vector<uint8_t>> encodedWays;
	};
	std::deque<ChunkData> chunks;

	ChunkData* lastChunk = nullptr;

	// Encode the ways and group by chunk - don't allocate final memory yet.
	uint32_t seenNodes = 0;
	for (const auto& way : ways) {
		seenNodes += way.second.size();
		const uint8_t currentChunk = (way.first % (GroupSize * ChunkSize)) / ChunkSize;

		// Input is sorted, so a change in chunk index starts a new chunk.
		if (lastChunk == nullptr || lastChunk->id != currentChunk) {
			totalChunks++;
			chunks.push_back({});
			lastChunk = &chunks.back();
			lastChunk->id = currentChunk;
		}
		const WayID id = way.first;
		lastChunk->wayIds.push_back(id % ChunkSize);

		// Compression is only attempted for ways with at least 4 nodes.
		uint16_t flags = encodeWay(way.second, tls.encodedWay, compressWays && way.second.size() >= 4);
		lastChunk->wayFlags.push_back(flags);
		// Copy out of the reusable encode buffer.
		std::vector<uint8_t> encoded;
		encoded.resize(tls.encodedWay.size());
		memcpy(encoded.data(), tls.encodedWay.data(), tls.encodedWay.size());
		lastChunk->encodedWays.push_back(std::move(encoded));
	}
	totalNodes += seenNodes;

	// We now have the sizes of everything, so we can generate the final memory layout.

	// 1. compute the memory that is needed
	size_t groupSpace = sizeof(GroupInfo); // every group needs a GroupInfo
	groupSpace += chunks.size() * sizeof(uint32_t); // every chunk needs a 32-bit offset
	groupSpace += chunks.size() * sizeof(ChunkInfo); // every chunk needs a ChunkInfo
	for (const auto& chunk : chunks) {
		groupSpace += chunk.wayIds.size() * sizeof(uint16_t); // every way need a 16-bit offset

		// Ways that are < 256 bytes get stored in the small ways buffer with
		// no wasted space. Ways that are >= 256 bytes are stored in the large ways
		// buffer with some wasted space.
		size_t smallWaySize = 0;
		size_t largeWaySize = 0;
		for (int i = 0; i < chunk.wayIds.size(); i++) {
			size_t waySize = chunk.encodedWays[i].size() + sizeof(EncodedWay);
			if (waySize < 256) {
				smallWaySize += waySize;
			} else {
				// Round up to the next multiple of LargeWayAlignment.
				largeWaySize += (((waySize - 1) / LargeWayAlignment) + 1) * LargeWayAlignment;
			}
		}

		groupSpace += smallWaySize;
		// Pad so the large-way region starts on an alignment boundary.
		if (smallWaySize % LargeWayAlignment != 0)
			groupSpace += LargeWayAlignment - (smallWaySize % LargeWayAlignment);
		groupSpace += largeWaySize;
	}

	// During decoding, the library may read up to STREAMVBYTE_PADDING extra
	// bytes -- ensure that won't cause out-of-bounds reads.
	groupSpace += STREAMVBYTE_PADDING;

	totalGroupSpace += groupSpace;

	// 2. allocate and track the memory
	GroupInfo* groupInfo = nullptr;
	{
		groupInfo = (GroupInfo*)void_mmap_allocator::allocate(groupSpace);
		if (groupInfo == nullptr)
			throw std::runtime_error("SortedWayStore: failed to allocate space for group");
		// Record the allocation so it can be freed later.
		std::lock_guard<std::mutex> lock(orphanageMutex);
		allocatedMemory.push_back(std::make_pair((void*)groupInfo, groupSpace));
	}
	if (groups[groupIndex] != nullptr)
		throw std::runtime_error("SortedNodeStore: group already present");
	groups[groupIndex] = groupInfo;

	// 3. populate the masks and offsets
	std::vector<uint8_t> chunkIds;
	chunkIds.reserve(chunks.size());
	for (const auto& chunk : chunks)
		chunkIds.push_back(chunk.id);
	populateMask(groupInfo->chunkMask, chunkIds);

	// The first ChunkInfo lives immediately after the chunkOffsets array.
	ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupInfo->chunkOffsets + (sizeof(uint32_t) * chunks.size()));
	for (size_t chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) {
		// Chunk offsets are byte offsets relative to the start of the group.
		groupInfo->chunkOffsets[chunkIndex] = (char*)chunkPtr - (char*)groupInfo;

		// Populate: smallWayMask, bigWayMask, wayOffsets
		std::vector<uint8_t> smallWays;
		std::vector<uint8_t> bigWays;
		const ChunkData& chunk = chunks[chunkIndex];

		const size_t numWays = chunk.wayIds.size();
		for (int i = 0; i < numWays; i++) {
			const size_t waySize = chunk.encodedWays[i].size() + sizeof(EncodedWay);
			if (waySize < 256) {
				smallWays.push_back(chunk.wayIds[i]);
			} else {
				bigWays.push_back(chunk.wayIds[i]);
			}
		}

		populateMask(chunkPtr->smallWayMask, smallWays);
		populateMask(chunkPtr->bigWayMask, bigWays);

		// Publish the small ways
		// Small-way offsets are unscaled bytes from the end of wayOffsets.
		uint8_t* const endOfWayOffsetPtr = (uint8_t*)(chunkPtr->wayOffsets + numWays);
		uint8_t* wayStartPtr = endOfWayOffsetPtr;
		int offsetIndex = 0;
		for (int i = 0; i < numWays; i++) {
			const size_t waySize = chunk.encodedWays[i].size() + sizeof(EncodedWay);
			if (waySize < 256) {
				chunkPtr->wayOffsets[offsetIndex] = wayStartPtr - endOfWayOffsetPtr;
				EncodedWay* wayPtr = (EncodedWay*)wayStartPtr;
				wayPtr->flags = chunk.wayFlags[i];
				memcpy(wayPtr->data, chunk.encodedWays[i].data(), chunk.encodedWays[i].size());
				wayStartPtr += sizeof(EncodedWay) + chunk.encodedWays[i].size();
				offsetIndex++;
			}
		}

		// Publish the big ways
		// Offset is scaled for big ways, so make sure we're on a multiple of LargeWayAlignment
		if ((wayStartPtr - endOfWayOffsetPtr) % LargeWayAlignment != 0)
			wayStartPtr += LargeWayAlignment - ((wayStartPtr - endOfWayOffsetPtr) % LargeWayAlignment);
		for (int i = 0; i < numWays; i++) {
			const size_t waySize = chunk.encodedWays[i].size() + sizeof(EncodedWay);
			if (waySize >= 256) {
				uint32_t spaceNeeded = (((waySize - 1) / LargeWayAlignment) + 1) * LargeWayAlignment;
				uint32_t offset = wayStartPtr - endOfWayOffsetPtr;
				if (offset % LargeWayAlignment != 0)
					throw std::runtime_error("big way alignment error");
				// Store the offset divided by LargeWayAlignment; at()
				// multiplies it back when reading.
				chunkPtr->wayOffsets[offsetIndex] = offset / LargeWayAlignment;
				EncodedWay* wayPtr = (EncodedWay*)wayStartPtr;
				wayPtr->flags = chunk.wayFlags[i];
				memcpy(wayPtr->data, chunk.encodedWays[i].data(), chunk.encodedWays[i].size());
				wayStartPtr += spaceNeeded;
				offsetIndex++;
			}
		}

		// Update chunkPtr
		chunkPtr = (ChunkInfo*)wayStartPtr;
	}
}