diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 6a5536023821..fc3e323569eb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -252,6 +252,9 @@ Other
   Applications using SecurityManager now need to grant SerializablePermission("serialFilter")
   to the analysis-smartcn module. (Uwe Schindler, Isaac David)
 
+* GITHUB#15341: Align float vectors on disk to 64 bytes, for optimal performance on Arm Neoverse
+  machines. (Mike McCandless, Kaival Parikh)
+
 Build
 ---------------------
 * Upgrade forbiddenapis to version 3.10.  (Uwe Schindler)
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
index 66e72bf11c3b..10b6818f2a43 100644
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
@@ -19,12 +19,16 @@
 import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.file.Files;
+import java.util.Random;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
 import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
 import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues;
+import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues;
 import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.store.Directory;
@@ -62,57 +66,140 @@
     value = 3,
     jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
 public class VectorScorerBenchmark {
+  private static final float EPSILON = 1e-4f;
 
   @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
   public int size;
 
+  @Param({"0", "1", "4", "64"})
+  public int padBytes; // capture performance impact of byte alignment in the index
+
   Directory dir;
-  IndexInput in;
-  KnnVectorValues vectorValues;
+  IndexInput bytesIn;
+  IndexInput floatsIn;
+  KnnVectorValues byteVectorValues;
+  KnnVectorValues floatVectorValues;
   byte[] vec1, vec2;
-  UpdateableRandomVectorScorer scorer;
+  float[] floatsA, floatsB;
+  float expectedBytes, expectedFloats;
+  UpdateableRandomVectorScorer byteScorer;
+  UpdateableRandomVectorScorer floatScorer;
 
   @Setup(Level.Iteration)
   public void init() throws IOException {
+    Random random = ThreadLocalRandom.current();
+
     vec1 = new byte[size];
     vec2 = new byte[size];
-    ThreadLocalRandom.current().nextBytes(vec1);
-    ThreadLocalRandom.current().nextBytes(vec2);
+    random.nextBytes(vec1);
+    random.nextBytes(vec2);
+    expectedBytes = DOT_PRODUCT.compare(vec1, vec2);
+
+    // random float arrays for float methods
+    floatsA = new float[size];
+    floatsB = new float[size];
+    for (int i = 0; i < size; ++i) {
+      floatsA[i] = random.nextFloat();
+      floatsB[i] = random.nextFloat();
+    }
+    expectedFloats = DOT_PRODUCT.compare(floatsA, floatsB);
 
     dir = new MMapDirectory(Files.createTempDirectory("VectorScorerBenchmark"));
-    try (IndexOutput out = dir.createOutput("vector.data", IOContext.DEFAULT)) {
+    try (IndexOutput out = dir.createOutput("byteVector.data", IOContext.DEFAULT)) {
+      out.writeBytes(new byte[padBytes], 0, padBytes);
+
       out.writeBytes(vec1, 0, vec1.length);
       out.writeBytes(vec2, 0, vec2.length);
     }
-    in = dir.openInput("vector.data", IOContext.DEFAULT);
-    vectorValues = vectorValues(size, 2, in, DOT_PRODUCT);
-    scorer =
+    try (IndexOutput out = dir.createOutput("floatVector.data", IOContext.DEFAULT)) {
+      out.writeBytes(new byte[padBytes], 0, padBytes);
+
+      byte[] buffer = new byte[size * Float.BYTES];
+      ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floatsA);
+      out.writeBytes(buffer, 0, buffer.length);
+      ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(floatsB);
+      out.writeBytes(buffer, 0, buffer.length);
+    }
+
+    bytesIn = dir.openInput("byteVector.data", IOContext.DEFAULT);
+    byteVectorValues = byteVectorValues(DOT_PRODUCT);
+    byteScorer =
+        FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
+            .getRandomVectorScorerSupplier(DOT_PRODUCT, byteVectorValues)
+            .scorer();
+    byteScorer.setScoringOrdinal(0);
+
+    floatsIn = dir.openInput("floatVector.data", IOContext.DEFAULT);
+    floatVectorValues = floatVectorValues(DOT_PRODUCT);
+    floatScorer =
         FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
-            .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues)
+            .getRandomVectorScorerSupplier(DOT_PRODUCT, floatVectorValues)
             .scorer();
-    scorer.setScoringOrdinal(0);
+    floatScorer.setScoringOrdinal(0);
   }
 
   @TearDown
   public void teardown() throws IOException {
-    IOUtils.close(dir, in);
+    IOUtils.close(dir, bytesIn, floatsIn); // release both inputs opened in init()
   }
 
   @Benchmark
   public float binaryDotProductDefault() throws IOException {
-    return scorer.score(1);
+    float result = byteScorer.score(1);
+    if (Math.abs(result - expectedBytes) > EPSILON) { // sanity-check the scorer output
+      throw new RuntimeException("Expected " + expectedBytes + " but got " + result);
+    }
+    return result;
   }
 
   @Benchmark
   @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
   public float binaryDotProductMemSeg() throws IOException {
-    return scorer.score(1);
+    float result = byteScorer.score(1);
+    if (Math.abs(result - expectedBytes) > EPSILON) { // sanity-check the scorer output
+      throw new RuntimeException("Expected " + expectedBytes + " but got " + result);
+    }
+    return result;
+  }
+
+  @Benchmark
+  public float floatDotProductDefault() throws IOException {
+    float result = floatScorer.score(1);
+    if (Math.abs(result - expectedFloats) > EPSILON) { // sanity-check the scorer output
+      throw new RuntimeException("Expected " + expectedFloats + " but got " + result);
+    }
+    return result;
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public float floatDotProductMemSeg() throws IOException {
+    float result = floatScorer.score(1);
+    if (Math.abs(result - expectedFloats) > EPSILON) { // sanity-check the scorer output
+      throw new RuntimeException("Expected " + expectedFloats + " but got " + result);
+    }
+    return result;
   }
 
-  static KnnVectorValues vectorValues(
-      int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException {
+  KnnVectorValues byteVectorValues(VectorSimilarityFunction sim) throws IOException {
     return new OffHeapByteVectorValues.DenseOffHeapVectorValues(
-        dims, size, in.slice("test", 0, in.length()), dims, new ThrowingFlatVectorScorer(), sim);
+        size,
+        2,
+        bytesIn.slice("test", padBytes, size * 2L),
+        size,
+        new ThrowingFlatVectorScorer(),
+        sim);
+  }
+
+  KnnVectorValues floatVectorValues(VectorSimilarityFunction sim) throws IOException {
+    int byteSize = size * Float.BYTES;
+    return new OffHeapFloatVectorValues.DenseOffHeapVectorValues(
+        size,
+        2,
+        floatsIn.slice("test", padBytes, byteSize * 2L),
+        byteSize,
+        new ThrowingFlatVectorScorer(),
+        sim);
   }
 
   static final class ThrowingFlatVectorScorer implements FlatVectorsScorer {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
index c8ef2709db66..46be88836ca1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
@@ -63,6 +63,12 @@
  *       that only in sparse case
  * </ul>
  *
+ * <p>NOTE: Arm Neoverse machines incur a performance overhead when reading data that is not
+ * aligned to 64 bytes, so this format aligns the start of each field's float vector data in the
+ * <code>.vec</code> file to that boundary. Searching float vectors whose dimension is <b>not</b> a
+ * multiple of 16 (that is, whose byte size is not a multiple of 64) may still be slower, because
+ * the alignment then does not hold for every vector in the file.
+ *
  * @lucene.experimental
  */
 public final class Lucene99FlatVectorsFormat extends FlatVectorsFormat {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index 1432f5ea46b8..3416a131735d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -153,10 +153,19 @@ public long ramBytesUsed() {
     return total;
   }
 
+  private static long alignOutput(IndexOutput output, VectorEncoding encoding) throws IOException {
+    return output.alignFilePointer(
+        switch (encoding) {
+          case BYTE -> Float.BYTES; // 4-byte alignment for byte vectors.
+          case FLOAT32 -> 64; // optimal alignment for Arm Neoverse machines.
+        });
+  }
+
   private void writeField(FieldWriter<?> fieldData, int maxDoc) throws IOException {
     // write vector values
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
-    switch (fieldData.fieldInfo.getVectorEncoding()) {
+    VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
+    switch (encoding) {
       case BYTE -> writeByteVectors(fieldData);
       case FLOAT32 -> writeFloat32Vectors(fieldData);
     }
@@ -190,19 +199,19 @@ private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocM
     mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);
 
     // write vector values
-    long vectorDataOffset =
-        switch (fieldData.fieldInfo.getVectorEncoding()) {
-          case BYTE -> writeSortedByteVectors(fieldData, ordMap);
-          case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
-        };
+    VectorEncoding encoding = fieldData.fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
+    switch (encoding) {
+      case BYTE -> writeSortedByteVectors(fieldData, ordMap);
+      case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
+    }
     long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
 
     writeMeta(fieldData.fieldInfo, maxDoc, vectorDataOffset, vectorDataLength, newDocsWithField);
   }
 
-  private long writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
+  private void writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
       throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
     final ByteBuffer buffer =
         ByteBuffer.allocate(fieldData.dim * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
     for (int ordinal : ordMap) {
@@ -210,26 +219,24 @@ private long writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] ordMap)
       buffer.asFloatBuffer().put(vector);
       vectorData.writeBytes(buffer.array(), buffer.array().length);
     }
-    return vectorDataOffset;
   }
 
-  private long writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+  private void writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) throws IOException {
     for (int ordinal : ordMap) {
       byte[] vector = (byte[]) fieldData.vectors.get(ordinal);
       vectorData.writeBytes(vector, vector.length);
     }
-    return vectorDataOffset;
   }
 
   @Override
   public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
     // Since we know we will not be searching for additional indexing, we can just write the
     // the vectors directly to the new segment.
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    VectorEncoding encoding = fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     // No need to use temporary file as we don't have to re-open for reading
     DocsWithFieldSet docsWithField =
-        switch (fieldInfo.getVectorEncoding()) {
+        switch (encoding) {
           case BYTE ->
               writeByteVectorData(
                   vectorData,
@@ -252,7 +259,8 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE
   @Override
   public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       FieldInfo fieldInfo, MergeState mergeState) throws IOException {
-    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    VectorEncoding encoding = fieldInfo.getVectorEncoding();
+    long vectorDataOffset = alignOutput(vectorData, encoding);
     IndexOutput tempVectorData =
         segmentWriteState.directory.createTempOutput(
             vectorData.getName(), "temp", segmentWriteState.context);
@@ -260,7 +268,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
     try {
       // write the vector data to a temporary file
       DocsWithFieldSet docsWithField =
-          switch (fieldInfo.getVectorEncoding()) {
+          switch (encoding) {
             case BYTE ->
                 writeByteVectorData(
                     tempVectorData,
@@ -298,7 +306,7 @@ public CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
       vectorDataInput = null;
 
       final RandomVectorScorerSupplier randomVectorScorerSupplier =
-          switch (fieldInfo.getVectorEncoding()) {
+          switch (encoding) {
             case BYTE ->
                 vectorsScorer.getRandomVectorScorerSupplier(
                     fieldInfo.getVectorSimilarityFunction(),