From 3e4ced30df439e88dbe50daa9065bffb1ce14170 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 27 Feb 2026 11:57:13 -0500 Subject: [PATCH 1/2] Update DataFusion instructions / Enable swap on small machines --- datafusion-partitioned/README.md | 40 ++++++---------- datafusion-partitioned/benchmark.sh | 14 ++++++ datafusion-partitioned/make-json.sh | 37 +++++++++++++++ datafusion/README.md | 72 ++++++++++++++++++++--------- datafusion/benchmark.sh | 15 ++++++ datafusion/make-json.sh | 37 +++++++++++++++ 6 files changed, 166 insertions(+), 49 deletions(-) create mode 100755 datafusion-partitioned/make-json.sh create mode 100755 datafusion/make-json.sh diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md index 503fa565d..0832df0d7 100644 --- a/datafusion-partitioned/README.md +++ b/datafusion-partitioned/README.md @@ -1,38 +1,26 @@ # DataFusion -DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check +Partitioned (100-file) Parquet dataset -We use parquet file here and create an external table for it; and then execute the queries. +## Cookbook: Generate benchmark results -## Generate benchmark results +Follow instructions in the [datafusion](../datafusion/README.md) directory. -The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2). +### Known Issues -1. manually start a AWS EC2 instance - - `c6a.4xlarge` - - Ubuntu 22.04 or later - - Root 500GB gp2 SSD - - no EBS optimized - - no instance store -1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` -1. `git clone https://github.com/ClickHouse/ClickBench` -1. `cd ClickBench/datafusion` -1. `vi benchmark.sh` and modify following line to target Datafusion version +1. 
DataFusion follows the SQL standard with case-sensitive identifiers, so all column names in `queries.sql` use double-quoted literals (e.g. `EventTime` -> `"EventTime"`). - ```bash - git checkout 46.0.0 - ``` +2. You must set the `('binary_as_string' 'true')` option due to an incorrect logical type +annotation in the partitioned files. See [Issue#7](https://github.com/ClickHouse/ClickBench/issues/7) -1. `bash benchmark.sh` +## Generate full human-readable results (for debugging) -### Know Issues +1. Install/build `datafusion-cli`. -1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) -2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) -3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 +2. Download the parquet files: -## Generate full human readable results (for debugging) +``` +seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' +``` -1. install datafusion-cli -2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet``` -3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh``` +3. Run the queries: `datafusion-cli -f create.sql -f queries.sql` or `PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh`. 
diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh index 1c10a401d..4aa3a6867 100755 --- a/datafusion-partitioned/benchmark.sh +++ b/datafusion-partitioned/benchmark.sh @@ -6,6 +6,20 @@ bash rust-init.sh -y export HOME=${HOME:=~} source ~/.cargo/env +if [ $(free -g | awk '/^Mem:/{print $2}') -lt 12 ]; then + echo "LOW MEMORY MODE" + # Enable swap if not already enabled. This is needed both for rustc and until we have a better + # solution for low memory machines, see + # https://github.com/apache/datafusion/issues/18473 + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + echo "Enabling 8G swap" + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi +fi + echo "Install Dependencies" sudo apt-get update -y sudo apt-get install -y gcc diff --git a/datafusion-partitioned/make-json.sh b/datafusion-partitioned/make-json.sh new file mode 100755 index 000000000..702b636a5 --- /dev/null +++ b/datafusion-partitioned/make-json.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# This scripts converts the raw results.csv data from `benchmark.sh` into a the +# final json format used by the benchmark dashboard. 
+# +# usage : ./save-result.sh +# +# example (save results/c6a.4xlarge.json) +# ./save-result.sh c6a.4xlarge + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Parquet, partitioned)" +DATE=$(date +%Y-%m-%d) + + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + $RESULT_ARRAY + ] +} +EOF diff --git a/datafusion/README.md b/datafusion/README.md index 503fa565d..c28dff468 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -1,38 +1,64 @@ # DataFusion -DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check +Single (1 file) Parquet dataset -We use parquet file here and create an external table for it; and then execute the queries. +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. -## Generate benchmark results +[Apache DataFusion]: https://datafusion.apache.org/ +[Apache Arrow]: https://arrow.apache.org/ + +## Cookbook: Generate benchmark results The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2). -1. manually start a AWS EC2 instance - - `c6a.4xlarge` - - Ubuntu 22.04 or later - - Root 500GB gp2 SSD - - no EBS optimized - - no instance store -1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}` -1. `git clone https://github.com/ClickHouse/ClickBench` -1. `cd ClickBench/datafusion` -1. 
`vi benchmark.sh` and modify following line to target Datafusion version +1. Manually start an AWS EC2 instance. The following environments are included in this directory: + + | Instance Type | OS | Disk | Arch | + | :-----------: | :---------------------: | :----------------: | :---: | + | `c6a.xlarge` | `Ubuntu 24.04` or later | Root 500GB gp2 SSD | AMD64 | + | `c6a.2xlarge` | | | AMD64 | + | `c6a.4xlarge` | | | AMD64 | + | `c8g.4xlarge` | | | ARM64 | + +All with no EBS optimization and no instance store. + +2. Wait for the status checks to pass, then ssh to EC2: `ssh ubuntu@{ip}` +3. `git clone https://github.com/ClickHouse/ClickBench` +4. `cd ClickBench/datafusion` +5. `vi benchmark.sh` and modify the following line to target the DataFusion version ```bash - git checkout 46.0.0 + git checkout 47.0.0 ``` -1. `bash benchmark.sh` +6. `bash benchmark.sh` + +You can update/preview the results by running: +``` +./make-json.sh # Example. ./make-json.sh c6a.xlarge +``` + +### Known Issues + +1. DataFusion follows the SQL standard with case-sensitive identifiers, so all column names in `queries.sql` use double-quoted literals (e.g. `EventTime` -> `"EventTime"`). + +## Generate full human-readable results (for debugging) + +1. Install/build `datafusion-cli`. + +2. Download the parquet file: -### Know Issues +``` +wget --continue https://datasets.clickhouse.com/hits_compatible/hits.parquet +``` -1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) -2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) -3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 +3. 
Run the queries: -## Generate full human readable results (for debugging) +``` +datafusion-cli -f create.sql -f queries.sql +``` -1. install datafusion-cli -2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet``` -3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh``` +Or use the runner script: +``` +PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh +``` diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index c16368cd3..c1a7a6639 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -6,6 +6,21 @@ bash rust-init.sh -y export HOME=${HOME:=~} source ~/.cargo/env +if [ $(free -g | awk '/^Mem:/{print $2}') -lt 12 ]; then + echo "LOW MEMORY MODE" + # Enable swap if not already enabled. This is needed both for rustc and until we have a better + # solution for low memory machines, see + # https://github.com/apache/datafusion/issues/18473 + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + echo "Enabling 8G swap" + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi +fi + + echo "Install Dependencies" sudo apt-get update -y sudo apt-get install -y gcc diff --git a/datafusion/make-json.sh b/datafusion/make-json.sh new file mode 100755 index 000000000..6dd3f00f3 --- /dev/null +++ b/datafusion/make-json.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# This scripts converts the raw results.csv data from `benchmark.sh` into a the +# final json format used by the benchmark dashboard. 
+# +# usage : ./make-json.sh +# +# example ./make-json c6a.4xlarge # saves results/c6a.4xlarge.json +# + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Parquet, single)" +DATE=$(date +%Y-%m-%d) + + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + $RESULT_ARRAY + ] +} +EOF From 0cdec15a356757c8f17942fc78dc5e90a340663d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 27 Feb 2026 17:04:06 -0500 Subject: [PATCH 2/2] typos --- datafusion-partitioned/README.md | 2 +- datafusion-partitioned/make-json.sh | 6 +++--- datafusion/README.md | 2 +- datafusion/make-json.sh | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md index 0832df0d7..c7f4eca1c 100644 --- a/datafusion-partitioned/README.md +++ b/datafusion-partitioned/README.md @@ -23,4 +23,4 @@ annotation in the partitioned files. See [Issue#7](https://github.com/ClickHouse seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' ``` -3. Run the queries: `datafusion-cli -f create.sql -f queries.sql` or `PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh`. +3. Run the queries: `datafusion-cli -f create.sql -f queries.sql` or `PATH="$(pwd)/arrow-datafusion/target/release:$PATH" ./run.sh`. 
diff --git a/datafusion-partitioned/make-json.sh b/datafusion-partitioned/make-json.sh index 702b636a5..7973d394f 100755 --- a/datafusion-partitioned/make-json.sh +++ b/datafusion-partitioned/make-json.sh @@ -1,12 +1,12 @@ #!/bin/bash -# This scripts converts the raw results.csv data from `benchmark.sh` into a the +# This script converts the raw `result.csv` data from `benchmark.sh` into the # final json format used by the benchmark dashboard. # -# usage : ./save-result.sh +# usage : ./make-json.sh # # example (save results/c6a.4xlarge.json) -# ./save-result.sh c6a.4xlarge +# ./make-json.sh c6a.4xlarge MACHINE=$1 OUTPUT_FILE="results/${MACHINE}.json" diff --git a/datafusion/README.md b/datafusion/README.md index c28dff468..6b110ba16 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -60,5 +60,5 @@ datafusion-cli -f create.sql -f queries.sql Or use the runner script: ``` -PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh +PATH="$(pwd)/arrow-datafusion/target/release:$PATH" ./run.sh ``` diff --git a/datafusion/make-json.sh b/datafusion/make-json.sh index 6dd3f00f3..67afcee6c 100755 --- a/datafusion/make-json.sh +++ b/datafusion/make-json.sh @@ -1,11 +1,11 @@ #!/bin/bash -# This scripts converts the raw results.csv data from `benchmark.sh` into a the +# This script converts the raw `result.csv` data from `benchmark.sh` into the # final json format used by the benchmark dashboard. # # usage : ./make-json.sh # -# example ./make-json c6a.4xlarge # saves results/c6a.4xlarge.json +# example ./make-json.sh c6a.4xlarge # saves results/c6a.4xlarge.json # MACHINE=$1