From 8a45dca6776b283e0ea0219e574badae8dfe5d64 Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 3 Dec 2025 13:40:52 +0100 Subject: [PATCH 1/9] feat: unify ANTLR setup and add pre_push make target --- .devcontainer/Dockerfile | 4 +-- CONTRIBUTING.md | 21 ++++++++++++- Makefile | 11 ++++++- gen_proto.sh => scripts/gen_proto.sh | 0 scripts/setup_antlr.sh | 34 ++++++++++++++++++++++ update_proto.sh => scripts/update_proto.sh | 0 6 files changed, 66 insertions(+), 4 deletions(-) rename gen_proto.sh => scripts/gen_proto.sh (100%) create mode 100644 scripts/setup_antlr.sh rename update_proto.sh => scripts/update_proto.sh (100%) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 8d3354b..dba8c2c 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -3,8 +3,8 @@ USER vscode RUN curl -s "https://get.sdkman.io" | bash RUN bash -c "source $HOME/.sdkman/bin/sdkman-init.sh && \ sdk install java 25-graalce" -RUN mkdir -p ~/lib && cd ~/lib && curl -L -O http://www.antlr.org/download/antlr-4.13.2-complete.jar -ENV ANTLR_JAR="~/lib/antlr-4.13.2-complete.jar" +COPY scripts/setup_antlr.sh /tmp/setup_antlr.sh +RUN bash /tmp/setup_antlr.sh && rm /tmp/setup_antlr.sh # protoc 29.5 is the last version with protobuf python v5 which is compatible with protoletariat v3 RUN cd ~ && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.5/protoc-29.5-linux-x86_64.zip && \ unzip protoc-29.5-linux-x86_64.zip -d ~/.local && \ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f2bb2ef..7928ee5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,7 +33,7 @@ uv run ./update_proto.sh ## Antlr grammar -Substrait uses antlr grammar to derive output types of extension functions. Make sure java is installed and ANTLR_JAR environment variable is set. Take a look at .devcontainer/Dockerfile for example setup. +Substrait uses antlr grammar to derive output types of extension functions. Make sure java is installed and antlr, by using running `make setup-antlr`. ``` make antlr @@ -62,3 +62,22 @@ Run tests in the project's root dir. uv sync --extra test uv run pytest ``` + +# Pre-Push Checklist + +Before pushing your changes, run the following command to ensure all requirements are met: + +``` +make pre_push +``` + +This command performs the following checks and updates: +1. Sets up ANTLR dependencies (`setup-antlr`) +2. Formats code with ruff (`format`) +3. Fixes linting issues with ruff (`lint_fix`) +4. Regenerates ANTLR grammar (`antlr`) +5. Regenerates extension stubs (`codegen-extensions`) +6. Syncs dependencies (`uv sync --extra test`) +7. Runs tests (`uv run pytest`) + +This ensures your code is properly formatted, linted, all generated files are up-to-date, and all tests pass. diff --git a/Makefile b/Makefile index 4e95eb6..ecffd04 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,10 @@ +setup-antlr: + @bash scripts/setup_antlr.sh > /dev/null + antlr: + @export ANTLR_JAR=$$(bash scripts/setup_antlr.sh); \ cd third_party/substrait/grammar \ - && java -jar ${ANTLR_JAR} -o ../../../src/substrait/gen/antlr -Dlanguage=Python3 SubstraitType.g4 \ + && java -jar ../../../$${ANTLR_JAR} -o ../../../src/substrait/gen/antlr -Dlanguage=Python3 SubstraitType.g4 \ && rm ../../../src/substrait/gen/antlr/*.tokens \ && rm ../../../src/substrait/gen/antlr/*.interp @@ -10,6 +14,7 @@ codegen-extensions: --input third_party/substrait/text/simple_extensions_schema.yaml \ --output src/substrait/gen/json/simple_extensions.py \ --output-model-type dataclasses.dataclass \ + --target-python-version 3.10 \ --disable-timestamp lint: @@ -20,3 +25,7 @@ lint_fix: format: uvx ruff@0.11.11 format + +pre_push: format lint_fix antlr codegen-extensions + uv sync --extra test + uv run pytest diff --git a/gen_proto.sh b/scripts/gen_proto.sh similarity index 100% rename from gen_proto.sh rename to scripts/gen_proto.sh diff --git a/scripts/setup_antlr.sh b/scripts/setup_antlr.sh new file mode 100644 index 0000000..a618f03 --- /dev/null +++ b/scripts/setup_antlr.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Setup ANTLR for Substrait Python + +set -e + +ANTLR_VERSION="4.13.2" +ANTLR_JAR_DIR="lib" +ANTLR_JAR="${ANTLR_JAR_DIR}/antlr-${ANTLR_VERSION}-complete.jar" +ANTLR_URL="http://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar" +VERSION_FILE="${ANTLR_JAR_DIR}/.antlr_version" + +echo "Setting up ANTLR ${ANTLR_VERSION}..." >&2 + +# Create directory if it doesn't exist +mkdir -p "${ANTLR_JAR_DIR}" + +# Check if installed version matches required version +INSTALLED_VERSION="" +if [ -f "${VERSION_FILE}" ]; then + INSTALLED_VERSION=$(cat "${VERSION_FILE}") +fi + +if [ "${INSTALLED_VERSION}" = "${ANTLR_VERSION}" ] && [ -f "${ANTLR_JAR}" ]; then + echo "ANTLR ${ANTLR_VERSION} is already installed" >&2 +else + echo "Downloading ANTLR ${ANTLR_VERSION}..." >&2 + rm -f "${ANTLR_JAR}" + curl -s -L -o "${ANTLR_JAR}" "${ANTLR_URL}" + echo "${ANTLR_VERSION}" > "${VERSION_FILE}" + echo "ANTLR ${ANTLR_VERSION} downloaded successfully" >&2 +fi + +# Output the path so it can be captured +echo "${ANTLR_JAR}" diff --git a/update_proto.sh b/scripts/update_proto.sh similarity index 100% rename from update_proto.sh rename to scripts/update_proto.sh From 9a2957aade9a3579baad83807e79a28c83501ebe Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 3 Dec 2025 13:45:28 +0100 Subject: [PATCH 2/9] fix: runned pre_push --- src/substrait/gen/json/simple_extensions.py | 26 ++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/substrait/gen/json/simple_extensions.py b/src/substrait/gen/json/simple_extensions.py index 2885bb4..765fbef 100644 --- a/src/substrait/gen/json/simple_extensions.py +++ b/src/substrait/gen/json/simple_extensions.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, TypeAlias, Union class Functions(Enum): @@ -13,7 +13,7 @@ class Functions(Enum): SEPARATE = 'SEPARATE' -Type = Union[str, Dict[str, Any]] +Type: TypeAlias = Union[str, Dict[str, Any]] class Type1(Enum): @@ -24,7 +24,7 @@ class Type1(Enum): string = 'string' -EnumOptions = List[str] +EnumOptions: TypeAlias = List[str] @dataclass @@ -49,7 +49,7 @@ class TypeArg: description: Optional[str] = None -Arguments = List[Union[EnumerationArg, ValueArg, TypeArg]] +Arguments: TypeAlias = List[Union[EnumerationArg, ValueArg, TypeArg]] @dataclass @@ -58,7 +58,7 @@ class Options1: description: Optional[str] = None -Options = Dict[str, Options1] +Options: TypeAlias = Dict[str, Options1] class ParameterConsistency(Enum): @@ -73,10 +73,10 @@ class VariadicBehavior: parameterConsistency: Optional[ParameterConsistency] = None -Deterministic = bool +Deterministic: TypeAlias = bool -SessionDependent = bool +SessionDependent: TypeAlias = bool class NullabilityHandling(Enum): @@ -85,13 +85,13 @@ class NullabilityHandling(Enum): DISCRETE = 'DISCRETE' -ReturnValue = Type +ReturnValue: TypeAlias = Type -Implementation = Dict[str, str] +Implementation: TypeAlias = Dict[str, str] -Intermediate = Type +Intermediate: TypeAlias = Type class Decomposable(Enum): @@ -100,10 +100,10 @@ class Decomposable(Enum): MANY = 'MANY' -Maxset = float +Maxset: TypeAlias = float -Ordered = bool +Ordered: TypeAlias = bool @dataclass @@ -196,7 +196,7 @@ class TypeParamDef: optional: Optional[bool] = None -TypeParamDefs = List[TypeParamDef] +TypeParamDefs: TypeAlias = List[TypeParamDef] @dataclass From 39d5569bc426df4b4c8b9e640062221b2b0e23dc Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 3 Dec 2025 14:09:57 +0100 Subject: [PATCH 3/9] fix: update ANTLR setup --- .devcontainer/Dockerfile | 1 + CONTRIBUTING.md | 2 +- Makefile | 5 ++--- scripts/setup_antlr.sh | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index dba8c2c..6c17e27 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -5,6 +5,7 @@ RUN bash -c "source $HOME/.sdkman/bin/sdkman-init.sh && \ sdk install java 25-graalce" COPY scripts/setup_antlr.sh /tmp/setup_antlr.sh RUN bash /tmp/setup_antlr.sh && rm /tmp/setup_antlr.sh +ENV ANTLR_JAR="lib/antlr-complete.jar" # protoc 29.5 is the last version with protobuf python v5 which is compatible with protoletariat v3 RUN cd ~ && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.5/protoc-29.5-linux-x86_64.zip && \ unzip protoc-29.5-linux-x86_64.zip -d ~/.local && \ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7928ee5..ef54b1c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,7 +28,7 @@ Run the upgrade script to upgrade the submodule and regenerate the protobuf stub ``` uv sync --extra gen_proto -uv run ./update_proto.sh +uv run scripts/update_proto.sh ``` ## Antlr grammar diff --git a/Makefile b/Makefile index ecffd04..e97989b 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,9 @@ setup-antlr: @bash scripts/setup_antlr.sh > /dev/null -antlr: - @export ANTLR_JAR=$$(bash scripts/setup_antlr.sh); \ +antlr: setup-antlr cd third_party/substrait/grammar \ - && java -jar ../../../$${ANTLR_JAR} -o ../../../src/substrait/gen/antlr -Dlanguage=Python3 SubstraitType.g4 \ + && java -jar ../../../lib/antlr-complete.jar -o ../../../src/substrait/gen/antlr -Dlanguage=Python3 SubstraitType.g4 \ && rm ../../../src/substrait/gen/antlr/*.tokens \ && rm ../../../src/substrait/gen/antlr/*.interp diff --git a/scripts/setup_antlr.sh b/scripts/setup_antlr.sh index a618f03..102d7ed 100644 --- a/scripts/setup_antlr.sh +++ b/scripts/setup_antlr.sh @@ -5,7 +5,7 @@ set -e ANTLR_VERSION="4.13.2" ANTLR_JAR_DIR="lib" -ANTLR_JAR="${ANTLR_JAR_DIR}/antlr-${ANTLR_VERSION}-complete.jar" +ANTLR_JAR="${ANTLR_JAR_DIR}/antlr-complete.jar" ANTLR_URL="http://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar" VERSION_FILE="${ANTLR_JAR_DIR}/.antlr_version" From abd34d99948389b3e692966ee3d87c9789f8966f Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 3 Dec 2025 14:49:41 +0100 Subject: [PATCH 4/9] fix: docker build --- scripts/setup_antlr.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/setup_antlr.sh b/scripts/setup_antlr.sh index 102d7ed..810fc39 100644 --- a/scripts/setup_antlr.sh +++ b/scripts/setup_antlr.sh @@ -6,7 +6,7 @@ set -e ANTLR_VERSION="4.13.2" ANTLR_JAR_DIR="lib" ANTLR_JAR="${ANTLR_JAR_DIR}/antlr-complete.jar" -ANTLR_URL="http://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar" +ANTLR_URL="https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar" VERSION_FILE="${ANTLR_JAR_DIR}/.antlr_version" echo "Setting up ANTLR ${ANTLR_VERSION}..." >&2 @@ -25,7 +25,10 @@ if [ "${INSTALLED_VERSION}" = "${ANTLR_VERSION}" ] && [ -f "${ANTLR_JAR}" ]; the else echo "Downloading ANTLR ${ANTLR_VERSION}..." >&2 rm -f "${ANTLR_JAR}" - curl -s -L -o "${ANTLR_JAR}" "${ANTLR_URL}" + if ! curl -s -L -f -o "${ANTLR_JAR}" "${ANTLR_URL}"; then + echo "Failed to download ANTLR from ${ANTLR_URL}" >&2 + exit 1 + fi echo "${ANTLR_VERSION}" > "${VERSION_FILE}" echo "ANTLR ${ANTLR_VERSION} downloaded successfully" >&2 fi From c6736fb2966a29de42cbb686f3be2b2b9d6afb11 Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 3 Dec 2025 14:59:38 +0100 Subject: [PATCH 5/9] fix: changed path handling --- scripts/setup_antlr.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/setup_antlr.sh b/scripts/setup_antlr.sh index 810fc39..c8efac2 100644 --- a/scripts/setup_antlr.sh +++ b/scripts/setup_antlr.sh @@ -4,7 +4,10 @@ set -e ANTLR_VERSION="4.13.2" -ANTLR_JAR_DIR="lib" +# Get the project root (parent of scripts directory) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")" +ANTLR_JAR_DIR="${PROJECT_ROOT}/lib" ANTLR_JAR="${ANTLR_JAR_DIR}/antlr-complete.jar" ANTLR_URL="https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar" VERSION_FILE="${ANTLR_JAR_DIR}/.antlr_version" From 8954df00411f24ab1c47f15273cc3c4f62f40dae Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 3 Dec 2025 15:04:58 +0100 Subject: [PATCH 6/9] fix: make ANTLR setup script work in Docker container --- .devcontainer/Dockerfile | 4 ++-- scripts/setup_antlr.sh | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 6c17e27..c91d65e 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -4,8 +4,8 @@ RUN curl -s "https://get.sdkman.io" | bash RUN bash -c "source $HOME/.sdkman/bin/sdkman-init.sh && \ sdk install java 25-graalce" COPY scripts/setup_antlr.sh /tmp/setup_antlr.sh -RUN bash /tmp/setup_antlr.sh && rm /tmp/setup_antlr.sh -ENV ANTLR_JAR="lib/antlr-complete.jar" +RUN bash /tmp/setup_antlr.sh ~/lib && rm /tmp/setup_antlr.sh +ENV ANTLR_JAR="~/lib/antlr-complete.jar" # protoc 29.5 is the last version with protobuf python v5 which is compatible with protoletariat v3 RUN cd ~ && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.5/protoc-29.5-linux-x86_64.zip && \ unzip protoc-29.5-linux-x86_64.zip -d ~/.local && \ diff --git a/scripts/setup_antlr.sh b/scripts/setup_antlr.sh index c8efac2..c8df094 100644 --- a/scripts/setup_antlr.sh +++ b/scripts/setup_antlr.sh @@ -1,13 +1,20 @@ #!/bin/bash # Setup ANTLR for Substrait Python +# Usage: setup_antlr.sh [ANTLR_JAR_DIR] +# If ANTLR_JAR_DIR is not provided, defaults to project root/lib set -e ANTLR_VERSION="4.13.2" -# Get the project root (parent of scripts directory) -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")" -ANTLR_JAR_DIR="${PROJECT_ROOT}/lib" + +# Determine ANTLR_JAR_DIR +if [ -n "$1" ]; then + # Use provided argument + ANTLR_JAR_DIR="$1" +else + ANTLR_JAR_DIR="lib" +fi + ANTLR_JAR="${ANTLR_JAR_DIR}/antlr-complete.jar" ANTLR_URL="https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar" VERSION_FILE="${ANTLR_JAR_DIR}/.antlr_version" From 3b633ef216512057082ff258755655f52f6c2f92 Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 3 Dec 2025 15:07:17 +0100 Subject: [PATCH 7/9] fix: removed rm operation --- .devcontainer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index c91d65e..a04e35f 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -4,7 +4,7 @@ RUN curl -s "https://get.sdkman.io" | bash RUN bash -c "source $HOME/.sdkman/bin/sdkman-init.sh && \ sdk install java 25-graalce" COPY scripts/setup_antlr.sh /tmp/setup_antlr.sh -RUN bash /tmp/setup_antlr.sh ~/lib && rm /tmp/setup_antlr.sh +RUN bash /tmp/setup_antlr.sh ~/lib ENV ANTLR_JAR="~/lib/antlr-complete.jar" # protoc 29.5 is the last version with protobuf python v5 which is compatible with protoletariat v3 RUN cd ~ && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.5/protoc-29.5-linux-x86_64.zip && \ From 968d93ab53c68dbde1cee3f842e1680e9f34d893 Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Tue, 9 Dec 2025 14:03:34 +0100 Subject: [PATCH 8/9] Update script path for code generation --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a867549..0d12557 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ codegen-version: && echo '"' >> src/substrait/gen/version.py codegen-proto: - ./gen_proto.sh + ./scripts/gen_proto.sh codegen-extensions: uv run --with datamodel-code-generator datamodel-codegen \ From b82d66f92b6dc87e2c859fe97df910b8f91da446 Mon Sep 17 00:00:00 2001 From: Giovanni Spadaccini Date: Wed, 17 Dec 2025 15:26:16 +0100 Subject: [PATCH 9/9] Fix codegen target dependencies in Makefile --- Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 0d12557..b3df363 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,6 @@ setup-antlr: @bash scripts/setup_antlr.sh > /dev/null -codegen: antlr codegen-proto codegen-extensions codegen-version - antlr: setup-antlr cd third_party/substrait/grammar \ @@ -37,6 +35,6 @@ lint_fix: format: uvx ruff@0.11.11 format -pre_push: format lint_fix antlr codegen-extensions +codegen: format lint_fix antlr codegen-proto codegen-extensions codegen-version uv sync --extra test uv run pytest