Skip to content

Commit e03ad84

Browse files
Using validity bitmap as Arrow does (#14409)
1 parent 1b2abdf commit e03ad84

27 files changed

+422
-170
lines changed

std-bits/table/src/main/java/org/enso/table/data/column/builder/BoolBuilder.java

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@
1313
/** A builder for boolean columns. */
1414
final class BoolBuilder implements BuilderForBoolean, BuilderWithRetyping {
1515
private final BitSet vals;
16-
private final BitSet isNothing;
16+
private final BitSet validityMap;
1717
int size = 0;
1818

1919
/** Creates a new builder for boolean columns. Should be built via Builder.getForBoolean. */
2020
BoolBuilder(int capacity) {
2121
vals = new BitSet(capacity);
22-
isNothing = new BitSet(capacity);
22+
validityMap = new BitSet(capacity);
2323
}
2424

2525
@Override
@@ -31,6 +31,7 @@ public BoolBuilder append(Object o) {
3131
if (b) {
3232
vals.set(size);
3333
}
34+
validityMap.set(size);
3435
} else {
3536
throw new ValueTypeMismatchException(getType(), o);
3637
}
@@ -54,13 +55,14 @@ public BoolBuilder appendBoolean(boolean value) {
5455
if (value) {
5556
vals.set(size);
5657
}
58+
validityMap.set(size, true);
5759
size++;
5860
return this;
5961
}
6062

6163
@Override
6264
public BoolBuilder appendNulls(int count) {
63-
isNothing.set(size, size + count);
65+
validityMap.set(size, size + count, false);
6466
size += count;
6567
return this;
6668
}
@@ -71,7 +73,7 @@ public void appendBulkStorage(ColumnStorage<?> storage) {
7173
// We know this is valid for a BoolStorage.
7274
int toCopy = (int) boolStorage.getSize();
7375
BitSets.copy(boolStorage.getValues(), vals, size, toCopy);
74-
BitSets.copy(boolStorage.getIsNothingMap(), isNothing, size, toCopy);
76+
boolStorage.getValidityMap().copyTo(validityMap, size, toCopy);
7577
size += toCopy;
7678
} else if (storage instanceof ColumnBooleanStorage columnBooleanStorage) {
7779
for (long i = 0; i < columnBooleanStorage.getSize(); i++) {
@@ -90,7 +92,7 @@ public void appendBulkStorage(ColumnStorage<?> storage) {
9092

9193
@Override
9294
public ColumnStorage<Boolean> seal() {
93-
return new BoolStorage(vals, isNothing, size, false);
95+
return new BoolStorage(vals, validityMap, size, false);
9496
}
9597

9698
@Override
@@ -101,7 +103,7 @@ public long getCurrentSize() {
101103
@Override
102104
public void copyDataTo(Object[] items) {
103105
for (int i = 0; i < size; i++) {
104-
if (isNothing.get(i)) {
106+
if (!validityMap.get(i)) {
105107
items[i] = null;
106108
} else {
107109
items[i] = vals.get(i);

std-bits/table/src/main/java/org/enso/table/data/column/builder/Builder.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,12 @@ static ColumnStorage<?> fromRepeatedItem(Object item, long size) {
5454
// Create a single storage item based on the type of the item.
5555
return switch (item) {
5656
case null -> new NullBuilder().appendNulls(checkSize(size)).seal();
57-
case Boolean booleanValue ->
58-
new BoolStorage(new BitSet(), new BitSet(), checkSize(size), booleanValue);
57+
case Boolean booleanValue -> {
58+
var s = checkSize(size);
59+
var validity = new BitSet();
60+
validity.set(0, s, true);
61+
yield new BoolStorage(new BitSet(), validity, s, booleanValue);
62+
}
5963
default -> {
6064
var storageType = StorageType.forBoxedItem(item, PreciseTypeOptions.DEFAULT);
6165
var builder = Builder.getForType(storageType, size, BlackholeProblemAggregator.INSTANCE);

std-bits/table/src/main/java/org/enso/table/data/column/builder/DoubleBuilder.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
import org.enso.table.data.column.storage.type.StorageType;
1414
import org.enso.table.error.ValueTypeMismatchException;
1515
import org.enso.table.problems.ProblemAggregator;
16-
import org.enso.table.util.BitSets;
1716

1817
/** A builder for floating point columns. */
19-
class DoubleBuilder extends NumericBuilder implements BuilderForDouble {
18+
sealed class DoubleBuilder extends NumericBuilder implements BuilderForDouble
19+
permits InferredDoubleBuilder {
2020
protected final PrecisionLossAggregator precisionLossAggregator;
2121
protected double[] data;
2222

@@ -80,6 +80,7 @@ public DoubleBuilder append(Object o) {
8080
}
8181

8282
ensureSpaceToAppend();
83+
setValid(currentSize);
8384
data[currentSize++] = value;
8485
return this;
8586
}
@@ -91,7 +92,7 @@ public void appendBulkStorage(ColumnStorage<?> storage) {
9192
int n = (int) doubleStorage.getSize();
9293
ensureFreeSpaceFor(n);
9394
System.arraycopy(doubleStorage.getData(), 0, data, currentSize, n);
94-
BitSets.copy(doubleStorage.getIsNothingMap(), isNothing, currentSize, n);
95+
appendValidityMap(doubleStorage.getValidityMap(), n);
9596
currentSize += n;
9697
} else {
9798
var doubleStorage = floatType.asTypedStorage(storage);
@@ -147,8 +148,10 @@ public void appendBulkStorage(ColumnStorage<?> storage) {
147148
*
148149
* @param value the double to append
149150
*/
151+
@Override
150152
public DoubleBuilder appendDouble(double value) {
151153
ensureSpaceToAppend();
154+
setValid(currentSize);
152155
data[currentSize++] = value;
153156
return this;
154157
}
@@ -158,14 +161,15 @@ public DoubleBuilder appendDouble(double value) {
158161
*
159162
* <p>It ensures that any loss of precision is reported.
160163
*/
164+
@Override
161165
public DoubleBuilder appendLong(long value) {
162166
appendDouble(convertLongToDouble(value));
163167
return this;
164168
}
165169

166170
@Override
167171
public ColumnStorage<Double> seal() {
168-
return new DoubleStorage(data, currentSize, isNothing);
172+
return new DoubleStorage(data, currentSize, validityMap());
169173
}
170174

171175
/**

std-bits/table/src/main/java/org/enso/table/data/column/builder/InferredDoubleBuilder.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ static InferredDoubleBuilder retypeFromLongBuilder(
6666
public void copyDataTo(Object[] items) {
6767
int rawN = rawData == null ? 0 : rawData.length;
6868
for (int i = 0; i < currentSize; i++) {
69-
if (isNothing.get(i)) {
69+
if (!isValid(i)) {
7070
items[i] = null;
7171
} else {
7272
if (isLongCompactedAsDouble.get(i)) {
@@ -160,7 +160,7 @@ public Builder retypeTo(StorageType<?> type) {
160160
if (type instanceof BigDecimalType) {
161161
Builder res = Builder.getForBigDecimal(data.length);
162162
for (int i = 0; i < currentSize; i++) {
163-
if (isNothing.get(i)) {
163+
if (!isValid(i)) {
164164
res.appendNulls(1);
165165
} else {
166166
BigDecimal bigDecimal = BigDecimal.valueOf(data[i]);

std-bits/table/src/main/java/org/enso/table/data/column/builder/LongBuilder.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
import org.enso.table.data.column.storage.type.StorageType;
1414
import org.enso.table.error.ValueTypeMismatchException;
1515
import org.enso.table.problems.ProblemAggregator;
16-
import org.enso.table.util.BitSets;
1716

1817
/** A builder for integer columns. */
19-
class LongBuilder extends NumericBuilder implements BuilderForLong, BuilderWithRetyping {
18+
sealed class LongBuilder extends NumericBuilder implements BuilderForLong, BuilderWithRetyping
19+
permits BoundCheckedIntegerBuilder {
2020
protected final ProblemAggregator problemAggregator;
2121
protected long[] data;
2222

@@ -49,7 +49,7 @@ protected void resize(int desiredCapacity) {
4949
@Override
5050
public void copyDataTo(Object[] items) {
5151
for (int i = 0; i < currentSize; i++) {
52-
if (isNothing.get(i)) {
52+
if (!isValid(i)) {
5353
items[i] = null;
5454
} else {
5555
items[i] = data[i];
@@ -96,7 +96,7 @@ public void appendBulkStorage(ColumnStorage<?> storage) {
9696
int n = (int) longStorage.getSize();
9797
ensureFreeSpaceFor(n);
9898
System.arraycopy(longStorage.getData(), 0, data, currentSize, n);
99-
BitSets.copy(longStorage.getIsNothingMap(), isNothing, currentSize, n);
99+
appendValidityMap(longStorage.getValidityMap(), n);
100100
currentSize += n;
101101
} else {
102102
// No conversions needed, but we need to iterate over the items.
@@ -134,6 +134,7 @@ public void appendBulkStorage(ColumnStorage<?> storage) {
134134
*/
135135
public LongBuilder appendLong(long value) {
136136
ensureSpaceToAppend();
137+
this.setValid(currentSize);
137138
this.data[currentSize++] = value;
138139
return this;
139140
}
@@ -143,7 +144,7 @@ public boolean isNothing(long index) {
143144
if (index >= currentSize) {
144145
throw new IndexOutOfBoundsException();
145146
} else {
146-
return isNothing.get((int) index);
147+
return !isValid((int) index);
147148
}
148149
}
149150

@@ -185,6 +186,6 @@ public LongBuilder append(Object o) {
185186

186187
@Override
187188
public ColumnStorage<Long> seal() {
188-
return new LongStorage(data, currentSize, isNothing, getType());
189+
return new LongStorage(data, currentSize, validityMap(), getType());
189190
}
190191
}

std-bits/table/src/main/java/org/enso/table/data/column/builder/NumericBuilder.java

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,58 @@
11
package org.enso.table.data.column.builder;
22

33
import java.util.BitSet;
4+
import org.enso.table.util.ImmutableBitSet;
45

56
/** A common base for numeric builders. */
6-
abstract class NumericBuilder implements Builder {
7-
protected BitSet isNothing;
8-
protected int currentSize;
7+
abstract sealed class NumericBuilder implements Builder permits DoubleBuilder, LongBuilder {
8+
private BitSet validityMap;
9+
int currentSize;
910

10-
protected NumericBuilder() {
11-
this.isNothing = new BitSet();
12-
this.currentSize = 0;
11+
protected NumericBuilder() {}
12+
13+
private BitSet getValidityMap() {
14+
if (validityMap == null) {
15+
validityMap = new BitSet();
16+
validityMap.set(0, currentSize);
17+
}
18+
return validityMap;
1319
}
1420

15-
protected void doAppendNulls(int count) {
16-
isNothing.set(currentSize, currentSize + count);
21+
protected final void doAppendNulls(int count) {
22+
getValidityMap().set(currentSize, currentSize + count, false);
1723
currentSize += count;
1824
}
1925

26+
protected final boolean isValid(int i) {
27+
return validityMap == null || validityMap.get(i);
28+
}
29+
30+
protected final void setValid(int i) {
31+
if (validityMap != null) {
32+
validityMap.set(i);
33+
}
34+
}
35+
36+
protected final ImmutableBitSet validityMap() {
37+
if (validityMap == null) {
38+
return ImmutableBitSet.allTrue(currentSize);
39+
} else {
40+
return new ImmutableBitSet(validityMap, currentSize);
41+
}
42+
}
43+
44+
protected final void appendValidityMap(ImmutableBitSet validity, int n) {
45+
if (validity.cardinality() < n || validityMap != null) {
46+
validity.copyTo(getValidityMap(), currentSize, n);
47+
}
48+
}
49+
2050
@Override
2151
public long getCurrentSize() {
2252
return currentSize;
2353
}
2454

25-
protected void ensureFreeSpaceFor(int additionalSize) {
55+
protected final void ensureFreeSpaceFor(int additionalSize) {
2656
if (currentSize + additionalSize > getDataSize()) {
2757
resize(currentSize + additionalSize);
2858
}
@@ -35,7 +65,7 @@ protected void ensureFreeSpaceFor(int additionalSize) {
3565
* appends. It tries to keep the invariant that after calling `grow` the array has at least one
3666
* free slot.
3767
*/
38-
protected void ensureSpaceToAppend() {
68+
protected final void ensureSpaceToAppend() {
3969
int dataLength = getDataSize();
4070

4171
// Check current size. If there is space, we don't need to grow.

std-bits/table/src/main/java/org/enso/table/data/column/operation/IsInOperation.java

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import org.enso.table.data.column.storage.ColumnBooleanStorage;
1515
import org.enso.table.data.column.storage.ColumnStorage;
1616
import org.enso.table.data.column.storage.ColumnStorageWithInferredStorage;
17-
import org.enso.table.data.column.storage.ColumnStorageWithNothingMap;
17+
import org.enso.table.data.column.storage.ColumnStorageWithValidityMap;
1818
import org.enso.table.data.column.storage.type.AnyObjectType;
1919
import org.enso.table.data.column.storage.type.BigDecimalType;
2020
import org.enso.table.data.column.storage.type.BigIntegerType;
@@ -28,6 +28,7 @@
2828
import org.enso.table.data.column.storage.type.TimeOfDayType;
2929
import org.enso.table.data.table.Column;
3030
import org.enso.table.data.table.problems.MapOperationProblemAggregator;
31+
import org.enso.table.util.ImmutableBitSet;
3132

3233
/**
3334
* The IsInOperation class provides a way to check if a value is in a set of values. It checks if
@@ -265,8 +266,8 @@ private static ColumnStorage<?> applyBooleanIsIn(
265266

266267
// If we had both true and false, then return all true wherever the value is not Nothing
267268
if (flags.hadTrue && flags.hadFalse) {
268-
var isNothing = makeIsNothingMap(boolStorage, checkedSize);
269-
return new BoolStorage(new BitSet(), isNothing, checkedSize, true);
269+
var validityMap = makeValidityMap(boolStorage, checkedSize);
270+
return new BoolStorage(new BitSet(), validityMap, checkedSize, true);
270271
}
271272

272273
// Only have one of true or false
@@ -296,34 +297,41 @@ private static ColumnStorage<?> applyBooleanIsIn(
296297
private static ColumnStorage<?> applyBoolStorage(
297298
boolean keepValue, BoolStorage boolStorage, int checkedSize) {
298299
BitSet values = boolStorage.getValues();
299-
BitSet isNothing = boolStorage.getIsNothingMap();
300+
BitSet isNothing = boolStorage.getValidityMap().cloneBitSet();
301+
isNothing.flip(0, Math.toIntExact(boolStorage.getSize()));
300302

301303
if (keepValue) {
302304
var newIsNothing =
303-
boolStorage.isNegated() ? or(isNothing, values) : orNot(isNothing, values, checkedSize);
305+
boolStorage.isNegated()
306+
? or(isNothing, values, checkedSize)
307+
: orNot(isNothing, values, checkedSize);
308+
newIsNothing.flip(0, checkedSize);
304309
return new BoolStorage(values, newIsNothing, checkedSize, boolStorage.isNegated());
305310
} else {
306311
var newIsNothing =
307-
boolStorage.isNegated() ? orNot(isNothing, values, checkedSize) : or(isNothing, values);
312+
boolStorage.isNegated()
313+
? orNot(isNothing, values, checkedSize)
314+
: or(isNothing, values, checkedSize);
315+
newIsNothing.flip(0, checkedSize);
308316
return new BoolStorage(values, newIsNothing, checkedSize, !boolStorage.isNegated());
309317
}
310318
}
311319

312-
private static BitSet makeIsNothingMap(ColumnStorage<?> storage, int size) {
313-
if (storage instanceof ColumnStorageWithNothingMap withNothingMap) {
314-
return withNothingMap.getIsNothingMap();
320+
private static ImmutableBitSet makeValidityMap(ColumnStorage<?> storage, int size) {
321+
if (storage instanceof ColumnStorageWithValidityMap withNothingMap) {
322+
return withNothingMap.getValidityMap();
315323
}
316324

317-
BitSet isNothingMap = new BitSet(size);
325+
BitSet validityMap = new BitSet(size);
318326
for (int i = 0; i < size; i++) {
319-
if (storage.isNothing(i)) {
320-
isNothingMap.set(i);
327+
if (!storage.isNothing(i)) {
328+
validityMap.set(i);
321329
}
322330
}
323-
return isNothingMap;
331+
return new ImmutableBitSet(validityMap, size);
324332
}
325333

326-
private static BitSet or(BitSet left, BitSet right) {
334+
private static BitSet or(BitSet left, BitSet right, int sizeIsIgnored) {
327335
BitSet result = (BitSet) left.clone();
328336
result.or(right);
329337
return result;

0 commit comments

Comments
 (0)