From 3e4ced30df439e88dbe50daa9065bffb1ce14170 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 27 Feb 2026 11:57:13 -0500
Subject: [PATCH 1/2] Update DataFusion instructions / Enable swap on small
 machines

---
 datafusion-partitioned/README.md    | 40 ++++++----------
 datafusion-partitioned/benchmark.sh | 14 ++++++
 datafusion-partitioned/make-json.sh | 37 +++++++++++++++
 datafusion/README.md                | 72 ++++++++++++++++++++---------
 datafusion/benchmark.sh             | 15 ++++++
 datafusion/make-json.sh             | 37 +++++++++++++++
 6 files changed, 166 insertions(+), 49 deletions(-)
 create mode 100755 datafusion-partitioned/make-json.sh
 create mode 100755 datafusion/make-json.sh

diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md
index 503fa565d..0832df0d7 100644
--- a/datafusion-partitioned/README.md
+++ b/datafusion-partitioned/README.md
@@ -1,38 +1,26 @@
 # DataFusion
 
-DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check <https://arrow.apache.org/datafusion/user-guide/introduction.html>
+Partitioned (100-file) Parquet dataset
 
-We use parquet file here and create an external table for it; and then execute the queries.
+## Cookbook: Generate benchmark results
 
-## Generate benchmark results
+Follow instructions in the [datafusion](../datafusion/README.md) directory.
 
-The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2).
+### Known Issues
 
-1. manually start a AWS EC2 instance
-    - `c6a.4xlarge`
-    - Ubuntu 22.04 or later
-    - Root 500GB gp2 SSD
-    - no EBS optimized
-    - no instance store
-1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}`
-1. `git clone https://github.com/ClickHouse/ClickBench`
-1. `cd ClickBench/datafusion`
-1. `vi benchmark.sh` and modify following line to target Datafusion version
+1. DataFusion normalizes unquoted identifiers to lowercase, so the mixed-case column names in `queries.sql` are written as double-quoted identifiers (e.g. `EventTime` -> `"EventTime"`).
 
-    ```bash
-    git checkout 46.0.0
-    ```
+2. You must set the `('binary_as_string' 'true')` option due to an incorrect logical type
+annotation in the partitioned files. See [Issue#7](https://github.com/ClickHouse/ClickBench/issues/7).
 
-1. `bash benchmark.sh`
+## Generate full human-readable results (for debugging)
 
-### Know Issues
+1. Install/build `datafusion-cli`.
 
-1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`)
-2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`)
-3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050
+2. Download the parquet files:
 
-## Generate full human readable results (for debugging)
+```
+seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
+```
 
-1. install datafusion-cli
-2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet```
-3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh```
+3. Run the queries: `datafusion-cli -f create.sql -f queries.sql` or `PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh`.
diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh
index 1c10a401d..4aa3a6867 100755
--- a/datafusion-partitioned/benchmark.sh
+++ b/datafusion-partitioned/benchmark.sh
@@ -6,6 +6,20 @@ bash rust-init.sh -y
 export HOME=${HOME:=~}
 source ~/.cargo/env
 
+if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then
+  echo "LOW MEMORY MODE"
+  # Enable swap if not already enabled. It is needed for rustc, and also as a stopgap
+  # until there is a better solution for low memory machines; see
+  # https://github.com/apache/datafusion/issues/18473
+  if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then
+    echo "Enabling 8G swap"
+    sudo fallocate -l 8G /swapfile
+    sudo chmod 600 /swapfile
+    sudo mkswap /swapfile
+    sudo swapon /swapfile
+  fi
+fi
+
 echo "Install Dependencies"
 sudo apt-get update -y
 sudo apt-get install -y gcc
diff --git a/datafusion-partitioned/make-json.sh b/datafusion-partitioned/make-json.sh
new file mode 100755
index 000000000..702b636a5
--- /dev/null
+++ b/datafusion-partitioned/make-json.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# This scripts converts the raw results.csv data from `benchmark.sh` into a the
+# final json format used by the benchmark dashboard.
+#
+# usage : ./save-result.sh <machine>
+#
+# example (save results/c6a.4xlarge.json)
+#         ./save-result.sh c6a.4xlarge
+
+MACHINE=$1
+OUTPUT_FILE="results/${MACHINE}.json"
+SYSTEM_NAME="DataFusion (Parquet, partitioned)"
+DATE=$(date +%Y-%m-%d)
+
+
+# Read result.csv with awk: collect field 3 per key in field 1 (keys 1..N) into one [...] row each
+RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf "        ["arr[i]"]"; if (i<length(arr)) printf ",\n"}}' result.csv)
+
+# Write the final JSON document to $OUTPUT_FILE from the template below
+cat <<EOF > "$OUTPUT_FILE"
+{
+    "system": "$SYSTEM_NAME",
+    "date": "$DATE",
+    "machine": "$MACHINE",
+    "cluster_size": 1,
+    "proprietary": "no",
+    "tuned": "no",
+    "hardware": "cpu",
+    "tags": ["Rust","column-oriented","embedded","stateless"],
+    "load_time": 0,
+    "data_size": 14737666736,
+    "result": [
+        $RESULT_ARRAY
+    ]
+}
+EOF
diff --git a/datafusion/README.md b/datafusion/README.md
index 503fa565d..c28dff468 100644
--- a/datafusion/README.md
+++ b/datafusion/README.md
@@ -1,38 +1,64 @@
 # DataFusion
 
-DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check <https://arrow.apache.org/datafusion/user-guide/introduction.html>
+Single (1 file) Parquet dataset
 
-We use parquet file here and create an external table for it; and then execute the queries.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
-## Generate benchmark results
+[Apache DataFusion]: https://datafusion.apache.org/
+[Apache Arrow]: https://arrow.apache.org/
+
+## Cookbook: Generate benchmark results
 
 The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2).
 
-1. manually start a AWS EC2 instance
-    - `c6a.4xlarge`
-    - Ubuntu 22.04 or later
-    - Root 500GB gp2 SSD
-    - no EBS optimized
-    - no instance store
-1. wait for status check passed, then ssh to EC2 `ssh ubuntu@{ip}`
-1. `git clone https://github.com/ClickHouse/ClickBench`
-1. `cd ClickBench/datafusion`
-1. `vi benchmark.sh` and modify following line to target Datafusion version
+1. Manually start an AWS EC2 instance. The following environments are included in this directory:
+
+   | Instance Type |           OS            |        Disk        | Arch  |
+   | :-----------: | :---------------------: | :----------------: | :---: |
+   | `c6a.xlarge`  | `Ubuntu 24.04` or later | Root 500GB gp2 SSD | AMD64 |
+   | `c6a.2xlarge` |                         |                    | AMD64 |
+   | `c6a.4xlarge` |                         |                    | AMD64 |
+   | `c8g.4xlarge` |                         |                    | ARM64 |
+
+All with no EBS optimization and no instance store.
+
+2. Wait for the status checks to pass, then ssh to EC2: `ssh ubuntu@{ip}`
+3. `git clone https://github.com/ClickHouse/ClickBench`
+4. `cd ClickBench/datafusion`
+5. `vi benchmark.sh` and modify the following line to target the DataFusion version
 
     ```bash
-    git checkout 46.0.0
+    git checkout 47.0.0
     ```
 
-1. `bash benchmark.sh`
+6. `bash benchmark.sh`
+
+You can update/preview the results by running:
+```
+./make-json.sh <machine-name> # Example. ./make-json.sh c6a.xlarge
+```
+
+### Known Issues
+
+1. DataFusion normalizes unquoted identifiers to lowercase, so the mixed-case column names in `queries.sql` are written as double-quoted identifiers (e.g. `EventTime` -> `"EventTime"`).
+
+## Generate full human-readable results (for debugging)
+
+1. Install/build `datafusion-cli`.
+
+2. Download the parquet file:
 
-### Know Issues
+```
+wget --continue https://datasets.clickhouse.com/hits_compatible/hits.parquet
+```
 
-1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`)
-2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`)
-3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050
+3. Run the queries:
 
-## Generate full human readable results (for debugging)
+```
+datafusion-cli -f create.sql -f queries.sql
+```
 
-1. install datafusion-cli
-2. download the parquet ```wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet```
-3. execute it ```datafusion-cli -f create_single.sql queries.sql``` or ```bash run2.sh```
+Or use the runner script:
+```
+PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh
+```
diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh
index c16368cd3..c1a7a6639 100755
--- a/datafusion/benchmark.sh
+++ b/datafusion/benchmark.sh
@@ -6,6 +6,21 @@ bash rust-init.sh -y
 export HOME=${HOME:=~}
 source ~/.cargo/env
 
+if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then
+  echo "LOW MEMORY MODE"
+  # Enable swap if not already enabled. It is needed for rustc, and also as a stopgap
+  # until there is a better solution for low memory machines; see
+  # https://github.com/apache/datafusion/issues/18473
+  if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then
+    echo "Enabling 8G swap"
+    sudo fallocate -l 8G /swapfile
+    sudo chmod 600 /swapfile
+    sudo mkswap /swapfile
+    sudo swapon /swapfile
+  fi
+fi
+
+
 echo "Install Dependencies"
 sudo apt-get update -y
 sudo apt-get install -y gcc
diff --git a/datafusion/make-json.sh b/datafusion/make-json.sh
new file mode 100755
index 000000000..6dd3f00f3
--- /dev/null
+++ b/datafusion/make-json.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# This scripts converts the raw results.csv data from `benchmark.sh` into a the
+# final json format used by the benchmark dashboard.
+#
+# usage : ./make-json.sh <machine>
+#
+# example ./make-json c6a.4xlarge # saves results/c6a.4xlarge.json
+#
+
+MACHINE=$1
+OUTPUT_FILE="results/${MACHINE}.json"
+SYSTEM_NAME="DataFusion (Parquet, single)"
+DATE=$(date +%Y-%m-%d)
+
+
+# Read result.csv with awk: collect field 3 per key in field 1 (keys 1..N) into one [...] row each
+RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf "        ["arr[i]"]"; if (i<length(arr)) printf ",\n"}}' result.csv)
+
+# Write the final JSON document to $OUTPUT_FILE from the template below
+cat <<EOF > "$OUTPUT_FILE"
+{
+    "system": "$SYSTEM_NAME",
+    "date": "$DATE",
+    "machine": "$MACHINE",
+    "cluster_size": 1,
+    "proprietary": "no",
+    "tuned": "no",
+    "hardware": "cpu",
+    "tags": ["Rust","column-oriented","embedded","stateless"],
+    "load_time": 0,
+    "data_size": 14779976446,
+    "result": [
+        $RESULT_ARRAY
+    ]
+}
+EOF

From 0cdec15a356757c8f17942fc78dc5e90a340663d Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 27 Feb 2026 17:04:06 -0500
Subject: [PATCH 2/2] typos

---
 datafusion-partitioned/README.md    | 2 +-
 datafusion-partitioned/make-json.sh | 6 +++---
 datafusion/README.md                | 2 +-
 datafusion/make-json.sh             | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md
index 0832df0d7..c7f4eca1c 100644
--- a/datafusion-partitioned/README.md
+++ b/datafusion-partitioned/README.md
@@ -23,4 +23,4 @@ annotation in the partitioned files. See [Issue#7](https://github.com/ClickHouse
 seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
 ```
 
-3. Run the queries: `datafusion-cli -f create.sql -f queries.sql` or `PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh`.
+3. Run the queries: `datafusion-cli -f create.sql -f queries.sql` or `PATH="$(pwd)/arrow-datafusion/target/release:$PATH" ./run.sh`.
diff --git a/datafusion-partitioned/make-json.sh b/datafusion-partitioned/make-json.sh
index 702b636a5..7973d394f 100755
--- a/datafusion-partitioned/make-json.sh
+++ b/datafusion-partitioned/make-json.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 
-# This scripts converts the raw results.csv data from `benchmark.sh` into a the
+# This script converts the raw `result.csv` data from `benchmark.sh` into the
 # final json format used by the benchmark dashboard.
 #
-# usage : ./save-result.sh <machine>
+# usage : ./make-json.sh <machine>
 #
 # example (save results/c6a.4xlarge.json)
-#         ./save-result.sh c6a.4xlarge
+#         ./make-json.sh c6a.4xlarge
 
 MACHINE=$1
 OUTPUT_FILE="results/${MACHINE}.json"
diff --git a/datafusion/README.md b/datafusion/README.md
index c28dff468..6b110ba16 100644
--- a/datafusion/README.md
+++ b/datafusion/README.md
@@ -60,5 +60,5 @@ datafusion-cli -f create.sql -f queries.sql
 
 Or use the runner script:
 ```
-PATH="$(pwd)/datafusion/target/release:$PATH" ./run.sh
+PATH="$(pwd)/arrow-datafusion/target/release:$PATH" ./run.sh
 ```
diff --git a/datafusion/make-json.sh b/datafusion/make-json.sh
index 6dd3f00f3..67afcee6c 100755
--- a/datafusion/make-json.sh
+++ b/datafusion/make-json.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
 
-# This scripts converts the raw results.csv data from `benchmark.sh` into a the
+# This script converts the raw `result.csv` data from `benchmark.sh` into the
 # final json format used by the benchmark dashboard.
 #
 # usage : ./make-json.sh <machine>
 #
-# example ./make-json c6a.4xlarge # saves results/c6a.4xlarge.json
+# example ./make-json.sh c6a.4xlarge # saves results/c6a.4xlarge.json
 #
 
 MACHINE=$1