diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 6a5536023821..fc3e323569eb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -252,6 +252,9 @@ Other
   Applications using SecurityManager now need to grant SerializablePermission("serialFilter")
   to the analysis-smartcn module. (Uwe Schindler, Isaac David)
 
+* GITHUB#15341: Align float vectors on disk to 64 bytes, for optimal performance on Arm Neoverse
+  machines. (Mike McCandless, Kaival Parikh)
+
 Build
 ---------------------
 * Upgrade forbiddenapis to version 3.10. (Uwe Schindler)
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
index 66e72bf11c3b..10b6818f2a43 100644
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
@@ -19,12 +19,16 @@
 import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.file.Files;
+import java.util.Random;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
 import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
 import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues;
+import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues;
 import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.store.Directory;
@@ -62,57 +66,143 @@
     value = 3,
     jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
 public class VectorScorerBenchmark {
+  // Absolute tolerance for the sanity check that each benchmark scores the expected value.
+  private static final float EPSILON = 1e-4f;
 
   @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
   public int size;
 
+  @Param({"0", "1", "4", "64"})
+  public int padBytes; // capture performance impact of byte alignment in the index
+
   Directory dir;
-  IndexInput in;
-  KnnVectorValues vectorValues;
+  IndexInput bytesIn;
+  IndexInput floatsIn;
+  KnnVectorValues byteVectorValues;
+  KnnVectorValues floatVectorValues;
   byte[] vec1, vec2;
-  UpdateableRandomVectorScorer scorer;
+  float[] floatsA, floatsB;
+  float expectedBytes, expectedFloats;
+  UpdateableRandomVectorScorer byteScorer;
+  UpdateableRandomVectorScorer floatScorer;
 
   @Setup(Level.Iteration)
   public void init() throws IOException {
+    Random random = ThreadLocalRandom.current();
+
     vec1 = new byte[size];
     vec2 = new byte[size];
-    ThreadLocalRandom.current().nextBytes(vec1);
-    ThreadLocalRandom.current().nextBytes(vec2);
+    random.nextBytes(vec1);
+    random.nextBytes(vec2);
+    expectedBytes = DOT_PRODUCT.compare(vec1, vec2);
+
+    // random float arrays for float methods
+    floatsA = new float[size];
+    floatsB = new float[size];
+    for (int i = 0; i < size; ++i) {
+      floatsA[i] = random.nextFloat();
+      floatsB[i] = random.nextFloat();
+    }
+    expectedFloats = DOT_PRODUCT.compare(floatsA, floatsB);
 
     dir = new MMapDirectory(Files.createTempDirectory("VectorScorerBenchmark"));
-    try (IndexOutput out = dir.createOutput("vector.data", IOContext.DEFAULT)) {
+    try (IndexOutput out = dir.createOutput("byteVector.data", IOContext.DEFAULT)) {
+      out.writeBytes(new byte[padBytes], 0, padBytes);
+
       out.writeBytes(vec1, 0, vec1.length);
       out.writeBytes(vec2, 0, vec2.length);
     }
-    in = dir.openInput("vector.data", IOContext.DEFAULT);
-    vectorValues = vectorValues(size, 2, in, DOT_PRODUCT);
-    scorer =
+    try (IndexOutput out = dir.createOutput("floatVector.data", IOContext.DEFAULT)) {
+      out.writeBytes(new byte[padBytes], 0, padBytes);
+
+      byte[] buffer = new byte[size * Float.BYTES];
+      ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floatsA);
+      out.writeBytes(buffer, 0, buffer.length);
+      ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floatsB);
+      out.writeBytes(buffer, 0, buffer.length);
+    }
+
+    bytesIn = dir.openInput("byteVector.data", IOContext.DEFAULT);
+    byteVectorValues = byteVectorValues(DOT_PRODUCT);
+    byteScorer =
+        FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
+            .getRandomVectorScorerSupplier(DOT_PRODUCT, byteVectorValues)
+            .scorer();
+    byteScorer.setScoringOrdinal(0);
+
+    floatsIn = dir.openInput("floatVector.data", IOContext.DEFAULT);
+    floatVectorValues = floatVectorValues(DOT_PRODUCT);
+    floatScorer =
         FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
-            .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues)
+            .getRandomVectorScorerSupplier(DOT_PRODUCT, floatVectorValues)
             .scorer();
-    scorer.setScoringOrdinal(0);
+    floatScorer.setScoringOrdinal(0);
   }
 
   @TearDown
   public void teardown() throws IOException {
-    IOUtils.close(dir, in);
+    IOUtils.close(dir, bytesIn, floatsIn);
   }
 
   @Benchmark
   public float binaryDotProductDefault() throws IOException {
-    return scorer.score(1);
+    float result = byteScorer.score(1);
+    if (Math.abs(result - expectedBytes) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedBytes + " but got " + result);
+    }
+    return result;
   }
 
   @Benchmark
   @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
   public float binaryDotProductMemSeg() throws IOException {
-    return scorer.score(1);
+    float result = byteScorer.score(1);
+    if (Math.abs(result - expectedBytes) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedBytes + " but got " + result);
+    }
+    return result;
+  }
+
+  @Benchmark
+  public float floatDotProductDefault() throws IOException {
+    float result = floatScorer.score(1);
+    if (Math.abs(result - expectedFloats) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedFloats + " but got " + result);
+    }
+    return result;
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public float floatDotProductMemSeg() throws IOException {
+    float result = floatScorer.score(1);
+    if (Math.abs(result - expectedFloats) > EPSILON) {
+      throw new RuntimeException("Expected " + expectedFloats + " but got " + result);
+    }
+    return result;
   }
 
-  static KnnVectorValues vectorValues(
-      int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException {
+  /** Wraps the two byte vectors in {@code bytesIn}, skipping the leading {@code padBytes}. */
+  KnnVectorValues byteVectorValues(VectorSimilarityFunction sim) throws IOException {
     return new OffHeapByteVectorValues.DenseOffHeapVectorValues(
-        dims, size, in.slice("test", 0, in.length()), dims, new ThrowingFlatVectorScorer(), sim);
+        size,
+        2,
+        bytesIn.slice("test", padBytes, size * 2L),
+        size,
+        new ThrowingFlatVectorScorer(),
+        sim);
+  }
+
+  /** Wraps the two float vectors in {@code floatsIn}, skipping the leading {@code padBytes}. */
+  KnnVectorValues floatVectorValues(VectorSimilarityFunction sim) throws IOException {
+    int byteSize = size * Float.BYTES;
+    return new OffHeapFloatVectorValues.DenseOffHeapVectorValues(
+        size,
+        2,
+        floatsIn.slice("test", padBytes, byteSize * 2L),
+        byteSize,
+        new ThrowingFlatVectorScorer(),
+        sim);
   }
 
   static final class ThrowingFlatVectorScorer implements FlatVectorsScorer {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
index c8ef2709db66..46be88836ca1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
@@ -63,6 +63,12 @@
  *       that only in sparse case
  * </ul>
  *
+ * <p>NOTE: Arm Neoverse machines have a performance overhead in reading data that is not aligned to
+ * 64 bytes, so this format aligns the .vec file to that size. There may be a
+ * performance penalty in searching of float vectors that do not have a dimension of a
+ * multiple of 16 (equivalent to 64 bytes), because the alignment will not hold for all vectors in
+ * the file.
+ *
  * @lucene.experimental
  */
 public final class Lucene99FlatVectorsFormat extends FlatVectorsFormat {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index 1432f5ea46b8..3416a131735d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -153,10 +153,19 @@ public long ramBytesUsed() {
     return total;
   }
 
+  private static long alignOutput(IndexOutput output, VectorEncoding encoding) throws IOException {
+    return output.alignFilePointer(
+        switch (encoding) {
+          case BYTE -> Float.BYTES;
+          case FLOAT32 -> 64; // optimal alignment for Arm Neoverse machines.
+        });
+  }
+
   private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException {
     // write vector values
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
-    switch (fieldData.fieldInfo.getVectorEncoding()) {
+    VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
+    switch (encoding) {
       case BYTE -> writeByteVectors(fieldData);
       case FLOAT32 -> writeFloat32Vectors(fieldData);
     }
@@ -190,19 +199,19 @@ private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocM
     mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);
 
     // write vector values
-    long vectorDataOffset =
-        switch (fieldData.fieldInfo.getVectorEncoding()) {
-          case BYTE -> writeSortedByteVectors(fieldData, ordMap);
-          case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
-        };
+    VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
+    switch (encoding) {
+      case BYTE -> writeSortedByteVectors(fieldData, ordMap);
+      case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
+    }
     long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
 
     writeMeta(fieldData.fieldInfo, maxDoc, vectorDataOffset, vectorDataLength, newDocsWithField);
   }
 
-  private long writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
+  private void writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
       throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
     final ByteBuffer buffer =
         ByteBuffer.allocate(fieldData.dim * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
     for (int ordinal : ordMap) {
@@ -210,26 +219,24 @@ private long writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
       buffer.asFloatBuffer().put(vector);
       vectorData.writeBytes(buffer.array(), buffer.array().length);
     }
-    return vectorDataOffset;
   }
 
-  private long writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+  private void writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) throws IOException {
     for (int ordinal : ordMap) {
       byte[] vector = (byte[]) fieldData.vectors.get(ordinal);
       vectorData.writeBytes(vector, vector.length);
     }
-    return vectorDataOffset;
   }
 
   @Override
   public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
     // Since we know we will not be searching for additional indexing, we can just write the
     // the vectors directly to the new segment.
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    VectorEncoding encoding = fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     // No need to use temporary file as we don't have to re-open for reading
     DocsWithFieldSet docsWithField =
-        switch (fieldInfo.getVectorEncoding()) {
+        switch (encoding) {
           case BYTE ->
               writeByteVectorData(
                   vectorData,
@@ -252,7 +259,8 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   @Override
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    VectorEncoding encoding = fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);
@@ -260,7 +268,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
     try {
       // write the vector data to a temporary file
       DocsWithFieldSet docsWithField =
-          switch (fieldInfo.getVectorEncoding()) {
+          switch (encoding) {
             case BYTE ->
                 writeByteVectorData(
                     tempVectorData,
@@ -298,7 +306,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       vectorDataInput = null;
 
       final RandomVectorScorerSupplier randomVectorScorerSupplier =
-          switch (fieldInfo.getVectorEncoding()) {
+          switch (encoding) {
             case BYTE ->
                 vectorsScorer.getRandomVectorScorerSupplier(
                     fieldInfo.getVectorSimilarityFunction(),