From 2451fc56e468e8715816e9ee041c2285086ac2a9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 7 May 2026 12:23:47 +0000 Subject: [PATCH 1/2] Add spark-gluten-clickhouse entry (Spark + Gluten with the CH backend) Adds a spark-gluten-clickhouse/ entry that runs the ClickBench query suite against Apache Spark with Apache Gluten configured to use the ClickHouse backend ('ch'), in which Gluten loads libch.so (a fork of ClickHouse v23.1) into the Spark executor JVM and runs the columnar plan natively through it. Compared with spark-gluten/ (which uses the Velox backend), this exercises a meaningfully different execution path: Catalyst -> Substrait -> ClickHouse engine, rather than Catalyst -> Substrait -> Velox. No pre-built bundle is published for the CH backend (the Apache Gluten release tarball ships only the Velox bundle), so benchmark.sh builds both libch.so and the Gluten Spark plugin from source. The build is memory-hungry; a 64 GB host (c6a.8xlarge or larger) is recommended. Queries use ClickHouse-style regex backreferences (\1) since the regex evaluation runs inside libch.so, as anticipated in the spark-gluten/ README. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- spark-gluten-clickhouse/README.md | 29 +++++ spark-gluten-clickhouse/benchmark.sh | 147 ++++++++++++++++++++++++++ spark-gluten-clickhouse/queries.sql | 43 ++++++++ spark-gluten-clickhouse/query.py | 68 ++++++++++++ spark-gluten-clickhouse/run.sh | 10 ++ spark-gluten-clickhouse/template.json | 13 +++ 6 files changed, 310 insertions(+) create mode 100644 spark-gluten-clickhouse/README.md create mode 100755 spark-gluten-clickhouse/benchmark.sh create mode 100644 spark-gluten-clickhouse/queries.sql create mode 100755 spark-gluten-clickhouse/query.py create mode 100755 spark-gluten-clickhouse/run.sh create mode 100644 spark-gluten-clickhouse/template.json diff --git a/spark-gluten-clickhouse/README.md b/spark-gluten-clickhouse/README.md new file mode 100644 index 0000000000..6dc439adac --- /dev/null +++ b/spark-gluten-clickhouse/README.md @@ -0,0 +1,29 @@ +This entry runs Apache Spark with the [Apache Gluten](https://gluten.apache.org/) plugin configured to use the **ClickHouse backend** ('ch'). Gluten loads `libch.so` (a fork of ClickHouse v23.1) into the Spark executor JVM and runs the columnar physical plan natively through it. See also [`spark-gluten/`](../spark-gluten/) (Velox backend) and the [accelerators README](../spark/README-accelerators.md). + +### Run + +`./benchmark.sh` builds everything from source (no pre-built bundle is published for the CH backend) and then runs all 43 queries. Optional first argument is the machine spec, e.g. `./benchmark.sh c6a.8xlarge`. + +## Notes + +### Build + +The CH backend is not part of Apache Gluten's release tarball — only the Velox bundle is published. As a result `benchmark.sh` builds two things from source: + +1. **`libch.so`** — built from [Kyligence/ClickHouse](https://github.com/Kyligence/ClickHouse) at the branch pinned in `gluten/cpp-ch/clickhouse.version`. The build uses Clang 18 / cmake / ninja. +2. 
**The Gluten Spark plugin** — built via Maven with `-P backends-clickhouse,spark-3.5`. JDK 8 is required at compile time (Gluten's POM); Spark itself runs under JDK 17. + +Building libch.so essentially compiles ClickHouse from source: it is **memory-hungry** (Gluten's docs note that 64 GB RAM is recommended). On a c6a.4xlarge (32 GB RAM) the compile may OOM; use c6a.8xlarge or larger for a clean run. + +### Configuration + +- `spark.gluten.sql.columnar.backend.lib=ch` selects the ClickHouse backend over Velox. +- `spark.gluten.sql.columnar.libpath=` points to the native library. The build location is `gluten/cpp-ch/build/utils/extern-local-engine/libch.so`; `benchmark.sh` symlinks it as `libch.so` in the entry directory. +- Memory is split 50/50 between Spark heap and Gluten off-heap, identical to the Velox entry — the CH backend also runs off-heap via JNI. +- Queries use ClickHouse-style regex backreferences (`\1`) rather than Spark's `$1`, since the regex evaluation happens inside libch.so. See the discussion in [`spark-gluten/README.md`](../spark-gluten/README.md) and [Gluten issue #7545](https://github.com/apache/incubator-gluten/issues/7545). + +### Links + +- [Gluten ClickHouse-backend getting started](https://gluten.apache.org/docs/get-started/ClickHouse/). +- [Gluten release page](https://gluten.apache.org/downloads/) (Velox bundles only). +- [Kyligence/ClickHouse fork](https://github.com/Kyligence/ClickHouse) (the source of libch.so). diff --git a/spark-gluten-clickhouse/benchmark.sh b/spark-gluten-clickhouse/benchmark.sh new file mode 100755 index 0000000000..8b19fb39cc --- /dev/null +++ b/spark-gluten-clickhouse/benchmark.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +# Spark + Apache Gluten with the ClickHouse backend ('ch'). Unlike the +# Velox backend, no pre-built bundle is published for the CH backend, so +# this script builds both libch.so (a ClickHouse fork) and the Gluten +# Spark plugin from source. 
+# +# Note: Keep in sync with spark-*/benchmark.sh (see README-accelerators.md for details) +# +# The ClickHouse compile is RAM-hungry; building on c6a.4xlarge (32 GB) +# may OOM. A larger machine (>= 64 GB RAM, c6a.8xlarge or above) is +# recommended. + +set -e + +GLUTEN_VERSION=v1.4.0 +SPARK_PROFILE=spark-3.5 + +# Install build prerequisites: +# - Java 8 to build Gluten via Maven (Gluten's pom requires JDK 8) +# - Java 17 to run Spark (auto-selected via JAVA_HOME below) +# - Clang 18, cmake, ninja, etc. to build libch.so +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv \ + openjdk-8-jdk-headless openjdk-17-jdk-headless \ + maven git cmake ccache ninja-build nasm yasm gawk \ + lsb-release wget software-properties-common gnupg + +# Install Clang 18 (required by libch.so build). +wget -O - https://apt.llvm.org/llvm.sh | sudo bash -s -- 18 + +export CC=clang-18 +export CXX=clang++-18 + +# pyspark venv +python3 -m venv myenv +source myenv/bin/activate +pip install pyspark==3.5.2 psutil + +# Load the data +../download-hits-parquet-single + +# Clone Gluten and the Kyligence ClickHouse fork that the CH backend wraps. +GLUTEN_DIR="$PWD/gluten" +if [ ! -d "$GLUTEN_DIR" ]; then + git clone --depth 1 --branch "$GLUTEN_VERSION" \ + https://github.com/apache/gluten.git "$GLUTEN_DIR" +fi + +CH_BRANCH=$(grep '^CH_BRANCH=' "$GLUTEN_DIR/cpp-ch/clickhouse.version" | cut -d= -f2) +CH_DIR="$GLUTEN_DIR/cpp-ch/ClickHouse" +if [ ! -d "$CH_DIR" ]; then + git clone --recursive --shallow-submodules \ + --branch "$CH_BRANCH" \ + https://github.com/Kyligence/ClickHouse.git "$CH_DIR" +fi + +# Build libch.so. The wrapper at cpp-ch/build_ch invokes the inner +# ClickHouse build, whose final artifact ends up at cpp-ch/build/. +LIBCH_SO="$GLUTEN_DIR/cpp-ch/build/utils/extern-local-engine/libch.so" +if [ ! -f "$LIBCH_SO" ]; then + bash "$GLUTEN_DIR/ep/build-clickhouse/src/build_clickhouse.sh" +fi + +# Build the Gluten Spark plugin against the CH backend. 
JDK 8 is required
+# at compile time per Gluten's pom; Spark itself runs under JDK 17 below.
+# pyspark wheels ship Scala 2.12 jars, so build with scala-2.12 to match.
+JAVA_HOME_8="/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture)"
+(
+    cd "$GLUTEN_DIR"
+    JAVA_HOME="$JAVA_HOME_8" PATH="$JAVA_HOME_8/bin:$PATH" \
+        mvn -B clean package \
+        -Pbackends-clickhouse -P"$SPARK_PROFILE" -Pscala-2.12 \
+        -DskipTests -Dcheckstyle.skip
+)
+
+# Symlink the produced uber jar (jar-with-dependencies) and libch.so into
+# the entry directory; query.py expects them as ./gluten.jar and ./libch.so.
+GLUTEN_JAR=$(ls "$GLUTEN_DIR"/backends-clickhouse/target/gluten-*-"$SPARK_PROFILE"-jar-with-dependencies.jar 2>/dev/null | head -n1)
+if [ -z "$GLUTEN_JAR" ]; then
+    echo "ERROR: could not locate built Gluten CH-backend jar" >&2
+    ls "$GLUTEN_DIR/backends-clickhouse/target/" >&2 || true
+    exit 1
+fi
+ln -sf "$GLUTEN_JAR" gluten.jar
+ln -sf "$LIBCH_SO" libch.so
+
+# Run Spark queries under JDK 17.
+export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/"
+export PATH="$JAVA_HOME/bin:$PATH"
+
+./run.sh 2>&1 | tee log.txt
+
+# Print results to stdout as required
+grep -P '^Time:\s+([\d\.]+)|Failure!' log.txt | sed -r -e 's/Time: //; s/^Failure!$/null/' |
+  awk '{ if (i % 3 == 0) { printf "[" }; printf "%s", $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }'
+
+DATA_SIZE=$(du -b hits.parquet | cut -f1)
+
+echo "Data size: $DATA_SIZE"
+echo "Load time: 0"
+
+# Save results as JSON
+MACHINE="${1:-c6a.8xlarge}"
+SPARK_VERSION=$(pip freeze | grep '^pyspark==' | cut -d '=' -f3)
+GLUTEN_TAG="${GLUTEN_VERSION#v}"
+
+mkdir -p results
+
+(
+cat << EOF
+{
+    "system": "Spark (Gluten-on-ClickHouse)",
+    "date": "$(date +%Y-%m-%d)",
+    "machine": "${MACHINE}",
+    "cluster_size": 1,
+    "proprietary": "no",
+    "tuned": "no",
+    "comment": "Apache Gluten ${GLUTEN_TAG} with the ClickHouse backend (libch.so), Spark ${SPARK_VERSION}",
+    "tags": ["Java", "C++", "column-oriented", "Spark derivative", "ClickHouse", "Parquet"],
+    "load_time": 0,
+    "data_size": ${DATA_SIZE},
+    "result": [
+EOF
+
+grep -P '^Time:\s+([\d\.]+)|Failure!' log.txt | sed -r -e 's/Time: //; s/^Failure!$/null/' |
+  awk -v total="$(grep -cP '^Time:\s+[\d\.]+|Failure!' log.txt)" '
+    {
+        if (i % 3 == 0) printf "\t\t[";
+        if ($1 == "null") printf "null";
+        else printf "%.3f", $1;
+        if (i % 3 != 2) printf ", ";
+        else {
+            if (i < total - 1) printf "],\n";
+            else printf "]";
+        }
+        i++;
+    }'
+
+cat << EOF
+
+    ]
+}
+EOF
+) > "results/${MACHINE}.json"
+
+echo "Results have been saved to results/${MACHINE}.json"
diff --git a/spark-gluten-clickhouse/queries.sql b/spark-gluten-clickhouse/queries.sql
new file mode 100644
index 0000000000..31f65fc898
--- /dev/null
+++ b/spark-gluten-clickhouse/queries.sql
@@ -0,0 +1,43 @@
+SELECT COUNT(*) FROM hits;
+SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;
+SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;
+SELECT AVG(UserID) FROM hits;
+SELECT COUNT(DISTINCT UserID) FROM hits;
+SELECT COUNT(DISTINCT SearchPhrase) FROM hits;
+SELECT MIN(EventDate), MAX(EventDate) FROM hits;
+SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
+SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
+SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
+SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT UserID, COUNT(*) FROM hits GROUP BY UserID 
ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), 
SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), 
SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM 
hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/spark-gluten-clickhouse/query.py b/spark-gluten-clickhouse/query.py new file mode 100755 index 0000000000..5c43a4efb8 --- /dev/null +++ b/spark-gluten-clickhouse/query.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +""" +Spark + Apache Gluten using the ClickHouse backend ('ch'). The CH backend +loads libch.so (a fork of ClickHouse v23.1) into the Spark executor JVM +and runs the columnar physical plan natively. + +Note: Keep in sync with spark-*/query.py (see README-accelerators.md for details). +""" + +import os +import sys +import timeit + +import psutil +from pyspark.sql import SparkSession +import pyspark.sql.functions as F + + +query = sys.stdin.read() +print(query) + +# Calculate available memory to configure SparkSession (in MB). +# The CH backend runs off-heap (via JNI into libch.so), so split available +# memory between Spark's JVM heap and the off-heap pool the same way the +# Velox backend does. 
+ram = int(round(psutil.virtual_memory().available / (1024 ** 2) * 0.7))
+heap = ram // 2
+off_heap = ram - heap
+print(f"SparkSession will use {heap} MB of heap and {off_heap} MB of off-heap memory (total {ram} MB)")
+
+builder = (
+    SparkSession
+    .builder
+    .appName("ClickBench")
+    .master("local[*]")
+    .config("spark.driver.memory", f"{heap}m")
+    .config("spark.sql.parquet.binaryAsString", True)
+
+    # Gluten + ClickHouse backend configuration
+    .config("spark.jars", "gluten.jar")
+    .config("spark.driver.extraClassPath", "gluten.jar")
+    .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
+    .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
+    .config("spark.gluten.sql.columnar.backend.lib", "ch")
+    .config("spark.gluten.sql.columnar.libpath", os.path.abspath("libch.so"))
+    .config("spark.memory.offHeap.enabled", "true")
+    .config("spark.memory.offHeap.size", f"{off_heap}m")
+    .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
+)
+
+spark = builder.getOrCreate()
+
+df = spark.read.parquet("hits.parquet")
+df = df.withColumn("EventTime", F.col("EventTime").cast("timestamp"))
+df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate")))
+df.createOrReplaceTempView("hits")
+
+for try_num in range(3):
+    try:
+        start = timeit.default_timer()
+        result = spark.sql(query)
+        result.show(100)
+        end = timeit.default_timer()
+        print("Time: ", end - start)
+    except Exception as e:
+        print(e)
+        print("Failure!")
diff --git a/spark-gluten-clickhouse/run.sh b/spark-gluten-clickhouse/run.sh
new file mode 100755
index 0000000000..8c9ca12890
--- /dev/null
+++ b/spark-gluten-clickhouse/run.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Note: Keep in sync with spark-*/run.sh (see README-accelerators.md for details)
+# read -r: hand each query to query.py verbatim (regex backslashes must survive).
+while IFS= read -r query; do
+    sync
+    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
+
+    ./query.py <<< "${query}"
+done < queries.sql
diff --git 
a/spark-gluten-clickhouse/template.json b/spark-gluten-clickhouse/template.json new file mode 100644 index 0000000000..6ae1ad2873 --- /dev/null +++ b/spark-gluten-clickhouse/template.json @@ -0,0 +1,13 @@ +{ + "system": "Spark (Gluten-on-ClickHouse)", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "Java", + "C++", + "column-oriented", + "Spark derivative", + "ClickHouse" + ] +} From fe4e87fe92b86ceed85a006fbfb6bbb3362e3aca Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 8 May 2026 22:34:43 +0000 Subject: [PATCH 2/2] spark-gluten-clickhouse: bump Clang to 19 for libch.so build The pinned Kyligence/ClickHouse fork now rejects Clang < 19 in cmake/tools.cmake, so installing Clang 18 fails the configure step. Co-Authored-By: Claude Opus 4.7 (1M context) --- spark-gluten-clickhouse/benchmark.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spark-gluten-clickhouse/benchmark.sh b/spark-gluten-clickhouse/benchmark.sh index 8b19fb39cc..558d4d85b3 100755 --- a/spark-gluten-clickhouse/benchmark.sh +++ b/spark-gluten-clickhouse/benchmark.sh @@ -19,18 +19,19 @@ SPARK_PROFILE=spark-3.5 # Install build prerequisites: # - Java 8 to build Gluten via Maven (Gluten's pom requires JDK 8) # - Java 17 to run Spark (auto-selected via JAVA_HOME below) -# - Clang 18, cmake, ninja, etc. to build libch.so +# - Clang 19, cmake, ninja, etc. to build libch.so sudo apt-get update -y sudo apt-get install -y python3-pip python3-venv \ openjdk-8-jdk-headless openjdk-17-jdk-headless \ maven git cmake ccache ninja-build nasm yasm gawk \ lsb-release wget software-properties-common gnupg -# Install Clang 18 (required by libch.so build). -wget -O - https://apt.llvm.org/llvm.sh | sudo bash -s -- 18 +# Install Clang 19 (required by libch.so build — the pinned Kyligence/ClickHouse +# fork's cmake/tools.cmake rejects Clang < 19). 
+wget -O - https://apt.llvm.org/llvm.sh | sudo bash -s -- 19 -export CC=clang-18 -export CXX=clang++-18 +export CC=clang-19 +export CXX=clang++-19 # pyspark venv python3 -m venv myenv