From 2c17ad1cbe1fa24283581a5b9cc881d14ee0870e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 28 Jan 2026 23:15:52 +0100 Subject: [PATCH] Revert "Update Results for DataFusion 52.0.0" --- datafusion-partitioned/README.md | 40 ++++----- datafusion-partitioned/benchmark.sh | 31 +++---- .../results/c6a.2xlarge.json | 90 +++++++++---------- .../results/c6a.4xlarge.json | 90 +++++++++---------- .../results/c6a.xlarge.json | 86 +++++++++--------- .../results/c8g.4xlarge.json | 90 +++++++++---------- datafusion-partitioned/save-result.sh | 37 -------- datafusion/README.md | 38 ++++---- datafusion/benchmark.sh | 31 +++---- datafusion/results/c6a.2xlarge.json | 90 +++++++++---------- datafusion/results/c6a.4xlarge.json | 90 +++++++++---------- datafusion/results/c6a.xlarge.json | 86 +++++++++--------- datafusion/results/c8g.4xlarge.json | 90 +++++++++---------- datafusion/save-result.sh | 37 -------- 14 files changed, 415 insertions(+), 511 deletions(-) delete mode 100755 datafusion-partitioned/save-result.sh delete mode 100755 datafusion/save-result.sh diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md index a4052a523..503fa565d 100644 --- a/datafusion-partitioned/README.md +++ b/datafusion-partitioned/README.md @@ -1,46 +1,38 @@ # DataFusion -[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check - -[Apache DataFusion]: https://arrow.apache.org/datafusion/ -[Apache Arrow]: https://arrow.apache.org/ +DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check We use parquet file here and create an external table for it; and then execute the queries. -## Cookbook: Generate benchmark results +## Generate benchmark results The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2). -1. manually start a AWS EC2 instance, the following environments are included in this dir: - - | Instance Type | OS | Disk | Arch | - | :-----------: | :---------------------: | :----------------: | :---: | - | `c6a.xlarge` | `Ubuntu 24.04` or later | Root 500GB gp2 SSD | AMD64 | - | `c6a.2xlarge` | | | AMD64 | - | `c6a.4xlarge` | | | AMD64 | - | `c8g.4xlarge` | | | ARM64 | - - All with no EBS optimized, no instance store. For `c6a.xlarge` instance, its memory is not capable to compile datafusion. It's recommended to enable a 8GB swap with ```sudo fallocate -l 4G /swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile```. - -2. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` -3. `git clone https://github.com/ClickHouse/ClickBench` -4. `cd ClickBench/datafusion-partitioned` -5. `vi benchmark.sh` and modify following line to target Datafusion version +1. manually start a AWS EC2 instance + - `c6a.4xlarge` + - Ubuntu 22.04 or later + - Root 500GB gp2 SSD + - no EBS optimized + - no instance store +1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` +1. `git clone https://github.com/ClickHouse/ClickBench` +1. `cd ClickBench/datafusion` +1. `vi benchmark.sh` and modify following line to target Datafusion version ```bash git checkout 46.0.0 ``` -6. `bash benchmark.sh` -7. Update corresponding `.json` file under `results`, or run `./save-result.sh` with instance type like `./save-result.sh c6a.4xlarge` +1. `bash benchmark.sh` ### Know Issues 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) +3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 ## Generate full human readable results (for debugging) 1. install datafusion-cli -2. download the parquet ```seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'``` -3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh``` +2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet``` +3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh``` diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh index c25e7cadc..1c10a401d 100755 --- a/datafusion-partitioned/benchmark.sh +++ b/datafusion-partitioned/benchmark.sh @@ -1,25 +1,22 @@ #!/bin/bash +echo "Install Rust" +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh +bash rust-init.sh -y +export HOME=${HOME:=~} +source ~/.cargo/env + echo "Install Dependencies" sudo apt-get update -y +sudo apt-get install -y gcc -echo "Install Homebrew" -# This requires password input for sudo, which is not set by default. -# You may need to run the following command to set a password first: -# ``` -# sudo su -# passwd ubuntu -# exit -# ``` -/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" -echo >> /home/ubuntu/.bashrc -echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv bash)"' >> /home/ubuntu/.bashrc -eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv bash)" - -echo "Install datafusion-cli" -# or use `brew install datafusion@52` to install a specific version -brew install datafusion -datafusion-cli --version +echo "Install DataFusion main branch" +git clone https://github.com/apache/arrow-datafusion.git +cd arrow-datafusion/ +git checkout 47.0.0 +CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli +export PATH="`pwd`/target/release:$PATH" +cd .. echo "Download benchmark target data, partitioned" mkdir -p partitioned diff --git a/datafusion-partitioned/results/c6a.2xlarge.json b/datafusion-partitioned/results/c6a.2xlarge.json index 7cc3fe790..087386f3d 100644 --- a/datafusion-partitioned/results/c6a.2xlarge.json +++ b/datafusion-partitioned/results/c6a.2xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2026-01-15", + "date": "2025-07-10", "machine": "c6a.2xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.052, 0.002, 0.002], - [0.117, 0.040, 0.038], - [0.950, 0.116, 0.111], - [2.713, 0.100, 0.108], - [2.921, 1.162, 1.009], - [3.116, 1.176, 1.047], - [0.055, 0.002, 0.002], - [0.126, 0.041, 0.043], - [3.124, 1.198, 1.194], - [4.286, 1.531, 1.493], - [2.358, 0.276, 0.275], - [2.714, 0.312, 0.290], - [3.249, 1.089, 0.965], - [6.469, 1.600, 1.630], - [3.244, 1.031, 1.036], - [2.522, 1.228, 1.260], - [6.138, 2.155, 2.165], - [6.118, 2.022, 2.108], - [11.294, 4.265, 4.152], - [1.706, 0.091, 0.091], - [20.960, 1.253, 1.267], - [23.958, 1.558, 1.453], - [45.677, 2.494, 2.559], - [108.672, 95.195, 91.845], - [1.474, 0.157, 0.159], - [3.367, 0.327, 0.323], - [1.546, 0.156, 0.155], - [21.312, 1.754, 1.709], - [19.173, 15.870, 15.832], - [0.859, 0.756, 0.750], - [7.448, 0.959, 1.028], - [15.002, 1.040, 1.054], - [11.322, 3.872, 3.830], - [20.749, 4.133, 4.390], - [20.763, 4.043, 4.438], - [1.892, 1.689, 1.658], - [0.170, 0.049, 0.055], - [0.126, 0.037, 0.033], - [0.179, 0.058, 0.058], - [0.464, 0.076, 0.074], - [0.122, 0.020, 0.024], - [0.133, 0.017, 0.021], - [0.094, 0.020, 0.016] - ] + [0.068, 0.022, 0.021], + [0.167, 0.06, 0.059], + [0.362, 0.144, 0.147], + [0.523, 0.109, 0.113], + [1.644, 1.224, 1.334], + [1.719, 1.167, 1.174], + [0.13, 0.037, 0.038], + [0.181, 0.07, 0.065], + [1.803, 1.414, 1.398], + [2.079, 1.591, 1.617], + [0.875, 0.396, 0.381], + [1.016, 0.452, 0.44], + [1.702, 1.216, 1.197], + [3.255, 1.883, 1.93], + [1.629, 1.124, 1.237], + [1.816, 1.529, 1.51], + [3.179, 2.585, 2.593], + [2.891, 2.197, 2.287], + [6.073, 4.78, 4.877], + [0.597, 0.1, 0.101], + [9.674, 1.35, 1.344], + [11.432, 1.673, 1.652], + [22.163, 3.015, 3.05], + [55.44, 46.286, 43.371], + [2.831, 0.611, 0.604], + [1.025, 0.535, 0.558], + [2.845, 0.724, 0.724], + [9.733, 2.09, 2.088], + [19.263, 18.559, 18.21], + [0.953, 0.806, 0.774], + [2.548, 1.265, 1.166], + [6.191, 1.162, 1.161], + [5.003, 4.177, 4.193], + [10.349, 4.795, 4.817], + [10.307, 4.831, 4.884], + [2.14, 1.835, 1.843], + [0.352, 0.121, 0.111], + [0.217, 0.056, 0.058], + [0.328, 0.11, 0.109], + [0.47, 0.156, 0.157], + [0.201, 0.05, 0.046], + [0.186, 0.046, 0.046], + [0.174, 0.041, 0.044] +] } diff --git a/datafusion-partitioned/results/c6a.4xlarge.json b/datafusion-partitioned/results/c6a.4xlarge.json index 7f29c3b48..e6f6f87db 100644 --- a/datafusion-partitioned/results/c6a.4xlarge.json +++ b/datafusion-partitioned/results/c6a.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2026-01-15", + "date": "2025-07-10", "machine": "c6a.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.042, 0.002, 0.002], - [0.082, 0.024, 0.023], - [0.177, 0.068, 0.064], - [0.615, 0.076, 0.073], - [1.198, 0.703, 0.718], - [1.059, 0.727, 0.723], - [0.054, 0.002, 0.002], - [0.100, 0.025, 0.026], - [0.996, 0.824, 0.840], - [1.713, 0.942, 0.981], - [0.632, 0.193, 0.192], - [0.849, 0.228, 0.220], - [1.156, 0.736, 0.745], - [2.658, 1.245, 1.244], - [1.188, 0.753, 0.749], - [0.977, 0.810, 0.818], - [2.701, 1.527, 1.521], - [2.655, 1.522, 1.538], - [5.484, 3.126, 3.143], - [0.275, 0.070, 0.065], - [10.288, 0.958, 0.937], - [11.562, 1.139, 1.109], - [22.298, 2.243, 2.250], - [52.816, 8.052, 8.039], - [0.247, 0.115, 0.129], - [1.284, 0.206, 0.208], - [0.481, 0.121, 0.126], - [10.408, 1.285, 1.342], - [9.295, 8.614, 8.565], - [0.487, 0.401, 0.401], - [3.186, 0.721, 0.691], - [6.936, 0.867, 0.894], - [5.055, 3.304, 3.237], - [10.231, 3.302, 3.297], - [10.289, 3.304, 3.270], - [1.182, 1.097, 1.115], - [0.158, 0.058, 0.054], - [0.112, 0.033, 0.035], - [0.161, 0.057, 0.054], - [0.224, 0.088, 0.086], - [0.093, 0.021, 0.024], - [0.092, 0.018, 0.018], - [0.090, 0.016, 0.016] - ] + [0.058, 0.017, 0.015], + [0.116, 0.035, 0.037], + [0.2, 0.084, 0.088], + [0.43, 0.081, 0.084], + [1.086, 0.78, 0.799], + [0.977, 0.751, 0.756], + [0.086, 0.026, 0.026], + [0.125, 0.04, 0.037], + [1.011, 0.882, 0.862], + [1.349, 0.971, 0.983], + [0.565, 0.231, 0.24], + [0.677, 0.264, 0.265], + [1.062, 0.816, 0.82], + [2.769, 1.346, 1.201], + [1.135, 0.792, 0.78], + [1.021, 0.926, 0.916], + [2.638, 1.639, 1.63], + [2.585, 1.555, 1.592], + [5.159, 3.238, 3.24], + [0.26, 0.077, 0.077], + [10.045, 1.067, 1.082], + [11.424, 1.291, 1.269], + [22.117, 2.487, 2.511], + [55.492, 9.765, 9.851], + [2.825, 0.432, 0.423], + [0.853, 0.328, 0.33], + [2.837, 0.508, 0.504], + [9.744, 1.469, 1.478], + [9.444, 9.445, 9.475], + [0.515, 0.405, 0.415], + [2.433, 0.729, 0.735], + [6.158, 0.884, 0.891], + [4.608, 3.342, 3.281], + [10.221, 3.481, 3.455], + [10.145, 3.486, 3.46], + [1.261, 1.188, 1.168], + [0.309, 0.114, 0.114], + [0.175, 0.05, 0.048], + [0.313, 0.099, 0.117], + [0.451, 0.166, 0.192], + [0.183, 0.04, 0.043], + [0.171, 0.04, 0.041], + [0.143, 0.035, 0.037] +] } diff --git a/datafusion-partitioned/results/c6a.xlarge.json b/datafusion-partitioned/results/c6a.xlarge.json index 1358f4c75..b5fdbd81d 100644 --- a/datafusion-partitioned/results/c6a.xlarge.json +++ b/datafusion-partitioned/results/c6a.xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2026-01-15", + "date": "2025-07-11", "machine": "c6a.xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.050, 0.002, 0.002], - [0.155, 0.070, 0.069], - [0.916, 0.211, 0.210], - [2.559, 0.176, 0.177], - [3.135, 1.786, 1.855], - [3.332, 1.705, 1.709], - [0.053, 0.002, 0.002], - [0.165, 0.073, 0.073], - [3.476, 2.107, 2.106], - [4.591, 2.450, 2.461], - [2.405, 0.485, 0.461], - [2.598, 0.534, 0.576], - [3.340, 1.444, 1.455], - [6.839, 2.004, 2.061], - [3.427, 1.412, 1.403], - [2.831, 1.924, 1.911], - [6.857, 3.741, 3.456], - [6.659, 3.394, 3.398], + [0.075, 0.035, 0.034], + [0.209, 0.105, 0.107], + [0.558, 0.278, 0.281], + [0.681, 0.201, 0.209], + [3.153, 2.413, 2.399], + [2.628, 2.299, 2.034], + [0.155, 0.064, 0.065], + [0.244, 0.143, 0.137], + [3.546, 2.918, 2.963], + [4.135, 3.296, 3.367], + [1.376, 0.779, 0.817], + [1.548, 1.001, 0.951], + [2.942, 2.662, 2.272], + [4.581, 3.397, 3.699], + [2.802, 2.287, 2.28], + [3.964, 3.285, 3.753], + [5.96, 5.313, 5.198], + [4.913, 4.098, 4.001], [null, null, null], - [1.927, 0.151, 0.150], - [20.884, 2.116, 2.132], - [23.982, 2.600, 2.573], - [45.662, 34.659, 32.172], - [111.062, 100.587, 94.511], - [1.224, 0.198, 0.193], - [3.302, 0.520, 0.519], - [1.246, 0.218, 0.213], - [21.202, 2.943, 2.926], - [30.980, 29.455, 29.697], - [1.574, 1.418, 1.408], - [7.496, 1.788, 1.786], - [15.047, 1.533, 1.524], + [0.697, 0.169, 0.17], + [9.898, 2.361, 2.249], + [11.36, 3.659, 3.492], + [22.105, 17.643, 16.388], + [56.066, 49.612, 48.044], + [2.824, 1.274, 1.265], + [1.471, 1.07, 1.149], + [2.855, 1.477, 1.477], + [9.621, 4.491, 4.587], + [42.151, 40.396, 40.48], + [1.704, 1.498, 1.511], + [3.412, 2.41, 2.46], + [6.256, 2.544, 2.367], [null, null, null], - [null, null, null], - [null, null, null], - [2.984, 2.515, 2.492], - [0.169, 0.051, 0.063], - [0.116, 0.034, 0.034], - [0.170, 0.055, 0.054], - [0.591, 0.077, 0.102], - [0.132, 0.027, 0.022], - [0.144, 0.025, 0.025], - [0.097, 0.018, 0.018] - ] + [null, null, 22.127], + [21.955, null, null], + [4.232, 4.072, 3.842], + [0.329, 0.121, 0.134], + [0.201, 0.073, 0.076], + [0.321, 0.129, 0.128], + [0.479, 0.214, 0.185], + [0.183, 0.064, 0.065], + [0.18, 0.07, 0.067], + [0.159, 0.061, 0.059] +] } diff --git a/datafusion-partitioned/results/c8g.4xlarge.json b/datafusion-partitioned/results/c8g.4xlarge.json index e71c15353..7a1b85655 100644 --- a/datafusion-partitioned/results/c8g.4xlarge.json +++ b/datafusion-partitioned/results/c8g.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, partitioned)", - "date": "2026-01-15", + "date": "2025-07-12", "machine": "c8g.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14737666736, "result": [ - [0.038, 0.002, 0.002], - [0.066, 0.015, 0.018], - [0.151, 0.046, 0.047], - [0.824, 0.047, 0.042], - [1.210, 0.321, 0.315], - [0.994, 0.453, 0.461], - [0.044, 0.002, 0.002], - [0.072, 0.017, 0.018], - [0.839, 0.356, 0.448], - [1.648, 0.709, 0.688], - [0.619, 0.115, 0.119], - [1.110, 0.130, 0.131], - [1.365, 0.438, 0.450], - [2.495, 0.574, 0.581], - [1.077, 0.394, 0.396], - [0.639, 0.343, 0.356], - [2.458, 0.682, 0.646], - [2.440, 0.645, 0.644], - [4.804, 1.219, 1.233], - [0.277, 0.041, 0.040], - [10.332, 0.636, 0.638], - [11.491, 0.746, 0.687], - [22.285, 1.103, 1.076], - [53.877, 2.915, 2.906], - [0.239, 0.090, 0.097], - [1.400, 0.152, 0.147], - [0.575, 0.095, 0.095], - [10.501, 0.725, 0.743], - [8.956, 6.767, 6.827], - [0.431, 0.343, 0.346], - [3.119, 0.394, 0.388], - [6.932, 0.392, 0.413], - [4.933, 1.143, 0.994], - [9.930, 1.425, 1.553], - [9.941, 1.592, 1.396], - [0.669, 0.503, 0.588], - [0.143, 0.051, 0.051], - [0.099, 0.033, 0.032], - [0.143, 0.051, 0.052], - [0.218, 0.085, 0.087], - [0.079, 0.018, 0.016], - [0.077, 0.015, 0.015], - [0.077, 0.014, 0.014] - ] + [0.055, 0.011, 0.011], + [0.105, 0.02, 0.02], + [0.199, 0.051, 0.051], + [0.432, 0.052, 0.053], + [1.007, 0.434, 0.385], + [0.921, 0.421, 0.421], + [0.083, 0.019, 0.019], + [0.115, 0.021, 0.023], + [0.763, 0.406, 0.398], + [1.276, 0.622, 0.59], + [0.548, 0.156, 0.147], + [0.867, 0.166, 0.177], + [1.161, 0.473, 0.461], + [2.485, 0.625, 0.561], + [1.018, 0.435, 0.452], + [0.604, 0.441, 0.418], + [2.378, 0.734, 0.74], + [2.348, 0.67, 0.661], + [4.44, 1.37, 1.367], + [0.287, 0.043, 0.046], + [10.103, 0.613, 0.621], + [11.384, 0.693, 0.689], + [22.081, 1.158, 1.173], + [55.487, 3.748, 3.699], + [2.837, 0.248, 0.27], + [0.846, 0.212, 0.217], + [2.917, 0.276, 0.276], + [9.713, 0.81, 0.824], + [9.263, 7.386, 7.351], + [0.474, 0.363, 0.377], + [2.404, 0.377, 0.383], + [6.132, 0.381, 0.392], + [4.446, 1.121, 1.123], + [9.862, 1.529, 1.503], + [9.915, 1.818, 1.754], + [0.754, 0.559, 0.556], + [0.278, 0.097, 0.098], + [0.155, 0.04, 0.041], + [0.27, 0.098, 0.096], + [0.41, 0.15, 0.157], + [0.164, 0.031, 0.034], + [0.152, 0.03, 0.03], + [0.132, 0.026, 0.026] +] } diff --git a/datafusion-partitioned/save-result.sh b/datafusion-partitioned/save-result.sh deleted file mode 100755 index 9b38ce9f9..000000000 --- a/datafusion-partitioned/save-result.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# This scripts converts the raw results.csv data from `benchmark.sh` into a the -# final json format used by the benchmark dashboard. -# -# usage : ./save-result.sh -# -# example (save results/c6a.4xlarge.json) -# ./save-result.sh c6a.4xlarge - -MACHINE=$1 -OUTPUT_FILE="results/${MACHINE}.json" -SYSTEM_NAME="DataFusion (Parquet, single)" -DATE=$(date +%Y-%m-%d) - - -# Read the CSV and build the result array using sed -RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE -{ - "system": "$SYSTEM_NAME", - "date": "$DATE", - "machine": "$MACHINE", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "hardware": "cpu", - "tags": ["Rust","column-oriented","embedded","stateless"], - "load_time": 0, - "data_size": 14779976446, - "result": [ - $RESULT_ARRAY - ] -} -EOF \ No newline at end of file diff --git a/datafusion/README.md b/datafusion/README.md index f8f52ccd0..503fa565d 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -1,46 +1,38 @@ # DataFusion -[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check - -[Apache DataFusion]: https://arrow.apache.org/datafusion/ -[Apache Arrow]: https://arrow.apache.org/ +DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check We use parquet file here and create an external table for it; and then execute the queries. -## Cookbook: Generate benchmark results +## Generate benchmark results The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2). -1. manually start a AWS EC2 instance, the following environments are included in this dir: - - | Instance Type | OS | Disk | Arch | - | :-----------: | :---------------------: | :----------------: | :---: | - | `c6a.xlarge` | `Ubuntu 24.04` or later | Root 500GB gp2 SSD | AMD64 | - | `c6a.2xlarge` | | | AMD64 | - | `c6a.4xlarge` | | | AMD64 | - | `c8g.4xlarge` | | | ARM64 | - - All with no EBS optimized, no instance store. For `c6a.xlarge` instance, its memory is not capable to compile datafusion. It's recommended to enable a 8GB swap with ```sudo fallocate -l 4G /swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile```. - -2. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` -3. `git clone https://github.com/ClickHouse/ClickBench` -4. `cd ClickBench/datafusion` -5. `vi benchmark.sh` and modify following line to target Datafusion version +1. manually start a AWS EC2 instance + - `c6a.4xlarge` + - Ubuntu 22.04 or later + - Root 500GB gp2 SSD + - no EBS optimized + - no instance store +1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` +1. `git clone https://github.com/ClickHouse/ClickBench` +1. `cd ClickBench/datafusion` +1. `vi benchmark.sh` and modify following line to target Datafusion version ```bash git checkout 46.0.0 ``` -6. `bash benchmark.sh` -7. Update corresponding `.json` file under `results`, or run `./save-result.sh` with instance type like `./save-result.sh c6a.4xlarge` +1. `bash benchmark.sh` ### Know Issues 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) +3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 ## Generate full human readable results (for debugging) 1. install datafusion-cli 2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet``` -3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh``` +3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh``` diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index dde073414..c16368cd3 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -1,25 +1,22 @@ #!/bin/bash +echo "Install Rust" +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh +bash rust-init.sh -y +export HOME=${HOME:=~} +source ~/.cargo/env + echo "Install Dependencies" sudo apt-get update -y +sudo apt-get install -y gcc -echo "Install Homebrew" -# This requires password input for sudo, which is not set by default. -# You may need to run the following command to set a password first: -# ``` -# sudo su -# passwd ubuntu -# exit -# ``` -/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" -echo >> /home/ubuntu/.bashrc -echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv bash)"' >> /home/ubuntu/.bashrc -eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv bash)" - -echo "Install datafusion-cli" -# or use `brew install datafusion@52` to install a specific version -brew install datafusion -datafusion-cli --version +echo "Install DataFusion main branch" +git clone https://github.com/apache/arrow-datafusion.git +cd arrow-datafusion/ +git checkout 47.0.0 +CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli +export PATH="`pwd`/target/release:$PATH" +cd .. echo "Download benchmark target data, single file" wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet diff --git a/datafusion/results/c6a.2xlarge.json b/datafusion/results/c6a.2xlarge.json index 868ab65e7..9d6b4f1bf 100644 --- a/datafusion/results/c6a.2xlarge.json +++ b/datafusion/results/c6a.2xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2026-01-15", + "date": "2025-07-10", "machine": "c6a.2xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.050, 0.001, 0.001], - [0.149, 0.042, 0.041], - [0.269, 0.109, 0.110], - [2.133, 0.097, 0.095], - [2.676, 1.070, 1.062], - [2.654, 1.169, 1.165], - [0.057, 0.001, 0.001], - [0.128, 0.044, 0.043], - [2.534, 1.254, 1.249], - [3.522, 1.523, 1.541], - [1.896, 0.281, 0.261], - [2.310, 0.297, 0.304], - [2.892, 1.047, 1.044], - [5.834, 1.635, 1.658], - [2.803, 1.028, 1.010], - [2.094, 1.179, 1.165], - [5.568, 2.150, 2.158], - [5.536, 2.269, 2.132], - [10.088, 4.027, 3.934], - [1.363, 0.089, 0.088], - [20.710, 1.404, 1.413], - [23.415, 1.857, 1.786], - [45.164, 5.607, 5.581], - [112.225, 101.388, 95.682], - [6.284, 0.640, 0.660], - [2.539, 0.523, 0.514], - [6.259, 0.668, 0.672], - [20.368, 1.882, 1.852], - [20.198, 17.316, 17.465], - [0.847, 0.755, 0.775], - [6.528, 1.201, 1.193], - [13.633, 1.253, 1.172], - [10.277, 3.849, 3.866], - [20.523, 4.286, 4.279], - [20.545, 4.214, 4.205], - [1.729, 1.540, 1.556], - [0.265, 0.118, 0.110], - [0.190, 0.098, 0.095], - [0.289, 0.107, 0.107], - [0.461, 0.179, 0.189], - [0.159, 0.037, 0.038], - [0.143, 0.037, 0.036], - [0.141, 0.032, 0.030] - ] + [0.096, 0.048, 0.043], + [0.178, 0.089, 0.088], + [0.324, 0.172, 0.172], + [0.447, 0.143, 0.136], + [1.515, 1.336, 1.345], + [1.361, 1.151, 1.143], + [0.116, 0.057, 0.057], + [0.183, 0.099, 0.096], + [1.692, 1.451, 1.46], + [2.024, 1.665, 1.665], + [0.746, 0.43, 0.429], + [0.812, 0.495, 0.485], + [1.384, 1.185, 1.184], + [2.992, 1.937, 1.907], + [1.338, 1.144, 1.144], + [1.838, 1.6, 1.632], + [2.964, 2.522, 2.548], + [2.805, 2.233, 2.239], + [5.649, 4.744, 4.665], + [0.307, 0.126, 0.132], + [9.886, 1.33, 1.335], + [11.311, 1.772, 1.838], + [22.224, 3.518, 3.578], + [55.96, 46.554, 44.205], + [2.743, 0.668, 0.696], + [0.865, 0.585, 0.558], + [2.746, 0.77, 0.745], + [9.641, 2.196, 2.201], + [20.636, 19.789, 20.073], + [0.919, 0.806, 0.765], + [2.445, 1.223, 1.24], + [5.895, 1.209, 1.227], + [4.797, 4.257, 4.311], + [10.619, 4.901, 4.848], + [10.629, 4.829, 4.871], + [2.026, 1.864, 1.826], + [0.308, 0.139, 0.139], + [0.219, 0.09, 0.082], + [0.315, 0.151, 0.143], + [0.422, 0.204, 0.206], + [0.186, 0.073, 0.089], + [0.188, 0.078, 0.074], + [0.181, 0.071, 0.072] +] } diff --git a/datafusion/results/c6a.4xlarge.json b/datafusion/results/c6a.4xlarge.json index 45cc9abd0..cab20d01f 100644 --- a/datafusion/results/c6a.4xlarge.json +++ b/datafusion/results/c6a.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2026-01-15", + "date": "2025-07-10", "machine": "c6a.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.062, 0.001, 0.001], - [0.122, 0.038, 0.039], - [0.188, 0.065, 0.063], - [0.393, 0.072, 0.069], - [1.101, 0.737, 0.713], - [0.974, 0.800, 0.787], - [0.068, 0.001, 0.001], - [0.134, 0.041, 0.040], - [1.026, 0.867, 0.857], - [1.353, 0.962, 0.995], - [0.481, 0.206, 0.206], - [0.703, 0.215, 0.217], - [1.031, 0.820, 0.815], - [2.488, 1.180, 1.183], - [1.023, 0.797, 0.795], - [0.984, 0.828, 0.827], - [2.592, 1.576, 1.591], - [2.551, 1.567, 1.575], - [5.086, 3.134, 3.124], - [0.251, 0.080, 0.079], - [10.079, 0.985, 0.996], - [11.328, 1.197, 1.234], - [22.336, 3.132, 3.103], - [55.832, 9.891, 9.749], - [2.685, 0.439, 0.442], - [0.818, 0.340, 0.351], - [2.703, 0.444, 0.443], - [9.786, 1.215, 1.241], - [9.912, 9.087, 9.131], - [0.500, 0.396, 0.394], - [2.858, 0.795, 0.810], - [6.345, 0.925, 0.927], - [4.690, 3.265, 3.560], - [10.113, 3.361, 3.382], - [10.116, 3.342, 3.345], - [1.289, 1.149, 1.078], - [0.316, 0.122, 0.122], - [0.218, 0.123, 0.124], - [0.305, 0.120, 0.120], - [0.480, 0.191, 0.192], - [0.165, 0.051, 0.050], - [0.152, 0.048, 0.048], - [0.149, 0.043, 0.043] - ] + [0.098, 0.059, 0.062], + [0.149, 0.092, 0.091], + [0.224, 0.12, 0.126], + [0.409, 0.12, 0.117], + [1.089, 0.826, 0.857], + [0.947, 0.791, 0.776], + [0.114, 0.074, 0.061], + [0.173, 0.098, 0.096], + [1.072, 0.913, 0.897], + [1.306, 1.078, 1.033], + [0.54, 0.294, 0.29], + [0.643, 0.315, 0.305], + [0.963, 0.831, 0.84], + [2.786, 1.247, 1.399], + [1.047, 0.822, 0.809], + [1.081, 1.019, 0.978], + [2.588, 1.683, 1.68], + [2.585, 1.629, 1.625], + [5.078, 3.227, 3.265], + [0.282, 0.139, 0.13], + [9.925, 1.079, 1.078], + [11.375, 1.302, 1.324], + [22.24, 2.678, 2.725], + [55.848, 10.042, 10.348], + [2.701, 0.49, 0.485], + [0.859, 0.381, 0.393], + [2.701, 0.535, 0.552], + [9.697, 1.644, 1.66], + [10.333, 9.847, 9.703], + [0.537, 0.458, 0.427], + [2.354, 0.829, 0.816], + [5.894, 1.001, 0.991], + [4.426, 3.514, 3.5], + [10.153, 3.738, 3.713], + [10.134, 3.712, 3.729], + [1.347, 1.2, 1.248], + [0.362, 0.178, 0.18], + [0.23, 0.114, 0.133], + [0.357, 0.182, 0.199], + [0.511, 0.247, 0.246], + [0.212, 0.098, 0.101], + [0.215, 0.102, 0.113], + [0.203, 0.1, 0.093] +] } diff --git a/datafusion/results/c6a.xlarge.json b/datafusion/results/c6a.xlarge.json index 3f4cf4a29..c48aaf779 100644 --- a/datafusion/results/c6a.xlarge.json +++ b/datafusion/results/c6a.xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2026-01-15", + "date": "2025-07-11", "machine": "c6a.xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.061, 0.001, 0.001], - [0.177, 0.062, 0.064], - [0.416, 0.193, 0.194], - [2.082, 0.165, 0.165], - [2.824, 1.699, 1.691], - [2.937, 2.110, 2.101], - [0.050, 0.001, 0.001], - [0.154, 0.066, 0.067], - [2.857, 2.065, 2.047], - [3.543, 2.345, 2.343], - [1.938, 0.436, 0.432], - [2.109, 0.493, 0.492], - [2.954, 1.851, 1.844], - [6.119, 2.711, 2.659], - [3.002, 1.782, 1.787], - [2.437, 1.885, 1.873], - [6.339, 3.711, 3.693], - [6.328, 3.671, 3.670], + [0.091, 0.048, 0.047], + [0.228, 0.119, 0.121], + [0.502, 0.279, 0.279], + [0.597, 0.2, 0.199], + [2.614, 2.278, 2.321], + [2.436, 2.056, 2.039], + [0.15, 0.073, 0.072], + [0.255, 0.151, 0.151], + [3.156, 2.83, 2.87], + [3.723, 3.186, 3.08], + [1.225, 0.803, 0.804], + [1.361, 0.886, 0.914], + [2.662, 2.331, 2.319], + [4.128, 3.6, 3.609], + [2.618, 2.237, 2.208], + [3.459, 3.165, 3.153], + [5.596, 5.059, 5.119], + [4.571, 3.898, 3.928], + [10.933, null, 19.109], + [0.55, 0.18, 0.178], + [9.608, 2.416, 2.429], + [11.302, 3.716, 3.757], + [22.33, 15.919, 13.787], + [55.818, 47.55, 46.996], + [2.703, 1.31, 1.318], + [1.527, 1.115, 1.092], + [2.731, 1.51, 1.479], + [9.664, 4.516, 4.776], + [42.285, 41.141, 41.129], + [1.619, 1.472, 1.451], + [3.121, 2.416, 2.476], + [5.996, 2.345, 2.324], + [9.889, null, 20.018], [null, null, null], - [1.421, 0.147, 0.147], - [20.657, 2.639, 2.654], - [23.471, 3.422, 3.431], - [45.366, 35.012, 31.708], - [112.328, 102.982, 96.806], - [6.317, 1.188, 1.194], - [2.552, 0.954, 0.951], - [6.303, 1.200, 1.204], - [20.396, 3.341, 3.356], - [32.125, 30.914, 30.854], - [1.564, 1.403, 1.401], - [6.693, 1.972, 1.970], - [13.640, 1.992, 1.960], [null, null, null], - [null, null, null], - [null, null, null], - [2.686, 2.433, 2.431], - [0.295, 0.104, 0.105], - [0.209, 0.096, 0.092], - [0.294, 0.104, 0.109], - [0.454, 0.177, 0.175], - [0.149, 0.039, 0.036], - [0.142, 0.039, 0.039], - [0.132, 0.033, 0.033] - ] + [3.787, 3.49, 3.558], + [0.34, 0.148, 0.16], + [0.223, 0.097, 0.099], + [0.336, 0.141, 0.149], + [0.501, 0.219, 0.219], + [0.201, 0.074, 0.08], + [0.185, 0.079, 0.09], + [0.172, 0.073, 0.079] +] } diff --git a/datafusion/results/c8g.4xlarge.json b/datafusion/results/c8g.4xlarge.json index 04be35493..814cb7f77 100644 --- a/datafusion/results/c8g.4xlarge.json +++ b/datafusion/results/c8g.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DataFusion (Parquet, single)", - "date": "2026-01-15", + "date": "2025-07-12", "machine": "c8g.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,48 +10,48 @@ "load_time": 0, "data_size": 14779976446, "result": [ - [0.050, 0.001, 0.001], - [0.083, 0.017, 0.019], - [0.150, 0.046, 0.045], - [0.400, 0.043, 0.038], - [1.093, 0.296, 0.300], - [0.830, 0.417, 0.406], - [0.041, 0.001, 0.001], - [0.070, 0.019, 0.019], - [0.700, 0.401, 0.403], - [1.276, 0.678, 0.693], - [0.450, 0.118, 0.121], - [0.980, 0.135, 0.133], - [1.246, 0.399, 0.420], - [2.249, 0.473, 0.532], - [0.909, 0.392, 0.375], - [0.569, 0.360, 0.334], - [2.188, 0.654, 0.630], - [2.183, 0.617, 0.619], - [4.319, 1.204, 1.262], - [0.194, 0.042, 0.046], - [10.194, 0.644, 0.646], - [11.257, 0.706, 0.716], - [22.174, 1.279, 1.257], - [55.737, 3.583, 3.627], - [2.671, 0.210, 0.213], - [0.959, 0.172, 0.156], - [3.033, 0.219, 0.218], - [9.849, 0.751, 0.745], - [10.612, 7.221, 7.211], - [0.443, 0.342, 0.342], - [2.784, 0.384, 0.367], - [6.308, 0.379, 0.395], - [4.509, 1.099, 1.028], - [9.716, 1.482, 1.447], - [9.757, 1.453, 1.454], - [0.688, 0.584, 0.554], - [0.230, 0.095, 0.096], - [0.122, 0.046, 0.046], - [0.255, 0.097, 0.095], - [0.405, 0.159, 0.159], - [0.122, 0.030, 0.030], - [0.109, 0.031, 0.029], - [0.102, 0.025, 0.026] - ] + [0.08, 0.044, 0.042], + [0.113, 0.045, 0.052], + [0.167, 0.081, 0.071], + [0.382, 0.064, 0.068], + [1.014, 0.393, 0.413], + [0.838, 0.422, 0.42], + [0.087, 0.05, 0.05], + [0.115, 0.055, 0.055], + [0.751, 0.451, 0.427], + [1.209, 0.645, 0.665], + [0.488, 0.177, 0.171], + [0.858, 0.188, 0.188], + [1.096, 0.47, 0.455], + [2.412, 0.632, 0.656], + [0.968, 0.437, 0.433], + [0.633, 0.44, 0.453], + [2.255, 0.787, 0.802], + [2.221, 0.705, 0.712], + [4.306, 1.397, 1.411], + [0.247, 0.07, 0.068], + [10.071, 0.67, 0.687], + [11.304, 0.767, 0.794], + [22.184, 1.451, 1.388], + [55.753, 3.883, 4.14], + [2.671, 0.276, 0.268], + [0.817, 0.241, 0.242], + [2.775, 0.298, 0.303], + [9.652, 0.924, 0.871], + [10.242, 7.78, 7.539], + [0.477, 0.374, 0.381], + [2.269, 0.404, 0.43], + [5.833, 0.423, 0.397], + [4.217, 1.133, 1.132], + [9.806, 1.576, 1.613], + [9.73, 1.55, 1.618], + [0.729, 0.62, 0.579], + [0.285, 0.129, 0.129], + [0.177, 0.07, 0.072], + [0.282, 0.141, 0.131], + [0.411, 0.194, 0.198], + [0.159, 0.062, 0.057], + [0.148, 0.055, 0.055], + [0.144, 0.051, 0.06] +] } diff --git a/datafusion/save-result.sh b/datafusion/save-result.sh deleted file mode 100755 index 9b38ce9f9..000000000 --- a/datafusion/save-result.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# This scripts converts the raw results.csv data from `benchmark.sh` into a the -# final json format used by the benchmark dashboard. -# -# usage : ./save-result.sh -# -# example (save results/c6a.4xlarge.json) -# ./save-result.sh c6a.4xlarge - -MACHINE=$1 -OUTPUT_FILE="results/${MACHINE}.json" -SYSTEM_NAME="DataFusion (Parquet, single)" -DATE=$(date +%Y-%m-%d) - - -# Read the CSV and build the result array using sed -RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE -{ - "system": "$SYSTEM_NAME", - "date": "$DATE", - "machine": "$MACHINE", - "cluster_size": 1, - "proprietary": "no", - "tuned": "no", - "hardware": "cpu", - "tags": ["Rust","column-oriented","embedded","stateless"], - "load_time": 0, - "data_size": 14779976446, - "result": [ - $RESULT_ARRAY - ] -} -EOF \ No newline at end of file