-
Notifications
You must be signed in to change notification settings - Fork 33
PECOBLR-1121 JMH benchmark for Arrow parsing. #1162
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
tejassp-db
wants to merge
14
commits into
PECOBLR-1121/arrow-patch/stack-3
from
PECOBLR-1121/arrow-patch/stack-4
Closed
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
c308183
PECOBLR-1121 JMH benchmark for Arrow parsing.
tejassp-db c1f75ca
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db 1fefb67
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db a4fff9c
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db 49aa025
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db 244a8be
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db d08f6eb
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db 5febf7c
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db 333c0eb
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db b54ce0c
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db d5e5be7
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db 7007abe
PECOBLR-1121 Benchmark with all Arrow types.
tejassp-db 24d850f
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db d026b2c
Merge branch 'PECOBLR-1121/arrow-patch/stack-3' into PECOBLR-1121/arr…
tejassp-db File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
161 changes: 161 additions & 0 deletions
161
src/test/java/org/apache/arrow/memory/ArrowParsingBenchmark.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,161 @@ | ||
| package org.apache.arrow.memory; | ||
|
|
||
| import static org.junit.jupiter.api.Assertions.assertNotNull; | ||
|
|
||
| import java.io.ByteArrayInputStream; | ||
| import java.io.IOException; | ||
| import java.io.InputStream; | ||
| import java.nio.file.Path; | ||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.concurrent.TimeUnit; | ||
| import java.util.stream.Collectors; | ||
| import net.jpountz.lz4.LZ4FrameInputStream; | ||
| import org.apache.arrow.vector.ValueVector; | ||
| import org.apache.arrow.vector.VectorSchemaRoot; | ||
| import org.apache.arrow.vector.ipc.ArrowStreamReader; | ||
| import org.apache.arrow.vector.util.TransferPair; | ||
| import org.openjdk.jmh.annotations.Benchmark; | ||
| import org.openjdk.jmh.annotations.BenchmarkMode; | ||
| import org.openjdk.jmh.annotations.Fork; | ||
| import org.openjdk.jmh.annotations.Level; | ||
| import org.openjdk.jmh.annotations.Measurement; | ||
| import org.openjdk.jmh.annotations.Mode; | ||
| import org.openjdk.jmh.annotations.OutputTimeUnit; | ||
| import org.openjdk.jmh.annotations.Scope; | ||
| import org.openjdk.jmh.annotations.Setup; | ||
| import org.openjdk.jmh.annotations.State; | ||
| import org.openjdk.jmh.annotations.Warmup; | ||
| import org.openjdk.jmh.runner.Runner; | ||
| import org.openjdk.jmh.runner.RunnerException; | ||
| import org.openjdk.jmh.runner.options.Options; | ||
| import org.openjdk.jmh.runner.options.OptionsBuilder; | ||
|
|
||
| /** | ||
|  * JMH benchmark that measures how long it takes to parse one Arrow stream chunk into a list of | ||
|  * per-row maps. | ||
|  * | ||
|  * <p>The same chunk, in plain and in LZ4-frame compressed form, is parsed once with {@link | ||
|  * RootAllocator} and once with {@link DatabricksBufferAllocator}, so that the two allocators can | ||
|  * be compared on identical input. | ||
|  */ | ||
| @State(Scope.Benchmark) | ||
| @BenchmarkMode(Mode.AverageTime) | ||
| @Fork(value = 1) | ||
| @Measurement(iterations = 20, time = 100, timeUnit = TimeUnit.MILLISECONDS) | ||
| @OutputTimeUnit(TimeUnit.MILLISECONDS) | ||
| @Warmup(iterations = 20, time = 100, timeUnit = TimeUnit.MILLISECONDS) | ||
| public class ArrowParsingBenchmark { | ||
|   /** Classpath location of the uncompressed Arrow chunk. */ | ||
|   private static final Path ARROW_CHUNK_PATH = Path.of("arrow", "chunk_all_types.arrow"); | ||
|
|
||
|   /** Classpath location of the LZ4 compressed Arrow chunk. */ | ||
|   private static final Path ARROW_CHUNK_COMPRESSED_PATH = | ||
|       Path.of("arrow", "chunk_all_types.arrow.lz4"); | ||
|
|
||
|   /** Contents of {@link #ARROW_CHUNK_PATH}; populated by {@link #setup()}. */ | ||
|   private byte[] arrowChunkBytes; | ||
|
|
||
|   /** Contents of {@link #ARROW_CHUNK_COMPRESSED_PATH}; populated by {@link #setup()}. */ | ||
|   private byte[] arrowChunkCompressedBytes; | ||
|
|
||
|   /** | ||
|    * Runs every benchmark declared in this class through the JMH runner. | ||
|    * | ||
|    * @param args ignored | ||
|    * @throws RunnerException if the JMH runner fails | ||
|    */ | ||
|   public static void main(String[] args) throws RunnerException { | ||
|     Options options = | ||
|         new OptionsBuilder().include(ArrowParsingBenchmark.class.getSimpleName()).build(); | ||
|     new Runner(options).run(); | ||
|   } | ||
|
|
||
|   /** | ||
|    * Loads both chunks into memory once per trial so that the benchmark methods only work on | ||
|    * in-memory byte arrays. | ||
|    * | ||
|    * @throws IOException if a chunk cannot be read | ||
|    */ | ||
|   @Setup(Level.Trial) | ||
|   public void setup() throws IOException { | ||
|     arrowChunkBytes = loadFileToMemory(ARROW_CHUNK_PATH); | ||
|     arrowChunkCompressedBytes = loadFileToMemory(ARROW_CHUNK_COMPRESSED_PATH); | ||
|   } | ||
|
|
||
|   /** | ||
|    * Reads the classpath resource identified by {@code filePath} fully into memory. | ||
|    * | ||
|    * @param filePath relative path of the resource on the classpath | ||
|    * @return the complete contents of the resource | ||
|    * @throws IOException if reading the resource fails | ||
|    */ | ||
|   private byte[] loadFileToMemory(Path filePath) throws IOException { | ||
|     String resourceName = toResourceName(filePath); | ||
|     try (InputStream stream = getClass().getClassLoader().getResourceAsStream(resourceName)) { | ||
|       assertNotNull(stream, filePath + " not found"); | ||
|       return stream.readAllBytes(); | ||
|     } | ||
|   } | ||
|
|
||
|   /** | ||
|    * Converts {@code filePath} to a class loader resource name. | ||
|    * | ||
|    * <p>Resource names always use {@code '/'} as separator, whereas {@link Path#toString()} uses | ||
|    * the separator of the underlying file system (a backslash on Windows), so the separator is | ||
|    * normalised here. | ||
|    */ | ||
|   private static String toResourceName(Path filePath) { | ||
|     return filePath.toString().replace(filePath.getFileSystem().getSeparator(), "/"); | ||
|   } | ||
|
|
||
|   /** Parses the uncompressed chunk with a {@link RootAllocator}. */ | ||
|   @Benchmark | ||
|   public List<Map<String, Object>> parseArrowChunk() throws IOException { | ||
|     try (BufferAllocator allocator = new RootAllocator()) { | ||
|       return parseArrowStream(arrowChunkBytes, false, allocator); | ||
|     } | ||
|   } | ||
|
|
||
|   /** Parses the LZ4 compressed chunk with a {@link RootAllocator}. */ | ||
|   @Benchmark | ||
|   public List<Map<String, Object>> parseArrowCompressedChunk() throws IOException { | ||
|     try (BufferAllocator allocator = new RootAllocator()) { | ||
|       return parseArrowStream(arrowChunkCompressedBytes, true, allocator); | ||
|     } | ||
|   } | ||
|
|
||
|   /** Parses the uncompressed chunk with a {@link DatabricksBufferAllocator}. */ | ||
|   @Benchmark | ||
|   public List<Map<String, Object>> parsePatchedArrowChunk() throws IOException { | ||
|     try (BufferAllocator allocator = new DatabricksBufferAllocator()) { | ||
|       return parseArrowStream(arrowChunkBytes, false, allocator); | ||
|     } | ||
|   } | ||
|
|
||
|   /** Parses the LZ4 compressed chunk with a {@link DatabricksBufferAllocator}. */ | ||
|   @Benchmark | ||
|   public List<Map<String, Object>> parsePatchedArrowCompressedChunk() throws IOException { | ||
|     try (BufferAllocator allocator = new DatabricksBufferAllocator()) { | ||
|       return parseArrowStream(arrowChunkCompressedBytes, true, allocator); | ||
|     } | ||
|   } | ||
|
|
||
|   /** | ||
|    * Parses the Arrow stream held in {@code arrowChunkBytes} and returns one map per row, keyed by | ||
|    * field name. | ||
|    * | ||
|    * @param arrowChunkBytes the Arrow stream, LZ4-frame compressed when {@code isCompressed} is set | ||
|    * @param isCompressed whether the bytes must be LZ4-frame decompressed before parsing | ||
|    * @param allocator allocator used by the reader and as the target of the vector transfers | ||
|    * @return the rows of all batches, in stream order | ||
|    * @throws IOException if decompressing or reading the stream fails | ||
|    */ | ||
|   private List<Map<String, Object>> parseArrowStream( | ||
|       byte[] arrowChunkBytes, boolean isCompressed, BufferAllocator allocator) throws IOException { | ||
|     List<Map<String, Object>> records = new ArrayList<>(); | ||
|
|
||
|     InputStream rawStream = new ByteArrayInputStream(arrowChunkBytes); | ||
|     InputStream arrowStream = isCompressed ? new LZ4FrameInputStream(rawStream) : rawStream; | ||
|
|
||
|     // Closing the reader also closes its read source, i.e. arrowStream. | ||
|     try (ArrowStreamReader reader = new ArrowStreamReader(arrowStream, allocator)) { | ||
|       while (reader.loadNextBatch()) { | ||
|         VectorSchemaRoot root = reader.getVectorSchemaRoot(); | ||
|         int rowCount = root.getRowCount(); | ||
|
|
||
|         List<TransferPair> transferPairs = | ||
|             root.getFieldVectors().stream() | ||
|                 .map(fieldVector -> fieldVector.getTransferPair(allocator)) | ||
|                 .collect(Collectors.toList()); | ||
|
|
||
|         // Targets are registered before their transfer runs so that the finally block releases | ||
|         // every vector that may own buffers, even when a transfer or a row read fails part-way. | ||
|         List<ValueVector> valueVectors = new ArrayList<>(transferPairs.size()); | ||
|         try { | ||
|           for (TransferPair transferPair : transferPairs) { | ||
|             valueVectors.add(transferPair.getTo()); | ||
|             transferPair.transfer(); | ||
|           } | ||
|
|
||
|           // Materialise each row of this batch as a field-name to value map. | ||
|           for (int recordIndex = 0; recordIndex < rowCount; recordIndex++) { | ||
|             Map<String, Object> record = new HashMap<>(); | ||
|             for (ValueVector valueVector : valueVectors) { | ||
|               record.put(valueVector.getField().getName(), valueVector.getObject(recordIndex)); | ||
|             } | ||
|             records.add(record); | ||
|           } | ||
|         } finally { | ||
|           // Close all transferred vectors to prevent a memory leak. | ||
|           valueVectors.forEach(ValueVector::close); | ||
|         } | ||
|       } | ||
|     } | ||
|
|
||
|     return records; | ||
|   } | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So, is the test data persisted in a local file?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's in stack-1.