Commits (23)
- `241fc42` docs: add uv migration design spec (viraatc, Apr 3, 2026)
- `1967d19` docs: add uv migration implementation plan (viraatc, Apr 3, 2026)
- `d6a6d0b` feat: replace setuptools with uv_build backend (viraatc, Apr 3, 2026)
- `ac1e463` feat: add .python-version and generate uv.lock (viraatc, Apr 3, 2026)
- `52329e3` ci: migrate test workflow from pip to uv (viraatc, Apr 3, 2026)
- `e2f1ad0` ci: migrate pre-commit workflow from pip to uv (viraatc, Apr 3, 2026)
- `56c2652` ci: migrate Dockerfile.dev from pip to uv (viraatc, Apr 3, 2026)
- `f3d3b37` ci: add pre-commit hook to enforce uv.lock freshness (viraatc, Apr 3, 2026)
- `476a485` docs: update AGENTS.md with uv dev workflow (viraatc, Apr 3, 2026)
- `e91c2a6` docs: update README with uv + venv install paths, remove plan files (viraatc, Apr 3, 2026)
- `98a0875` docs: show venv activation in uv install path (viraatc, Apr 3, 2026)
- `7ae3f3e` docs: add pytest and CLI examples after venv activation (viraatc, Apr 3, 2026)
- `4d5dacd` docs: make uv the primary workflow, pip+venv as backward-compat (viraatc, Apr 10, 2026)
- `f36a723` fix: rename [tool.uv.build] to [tool.uv-build] and add uv run prefix … (viraatc, Apr 10, 2026)
- `aa29a45` docs: consolidate venv activation sections in README (viraatc, Apr 10, 2026)
- `7c84160` docs: add activated-venv usage comments to all README code blocks (viraatc, Apr 10, 2026)
- `62dba06` fix: address all PR review comments from Gemini and Copilot (viraatc, Apr 10, 2026)
- `f9679ca` fix: address review council findings (viraatc, Apr 10, 2026)
- `7a07da0` fix: address final PR review comments (viraatc, Apr 15, 2026)
- `fda34d9` fix: keep main's README/AGENTS.md structure, only add uv changes (viraatc, Apr 15, 2026)
- `19d0de9` docs: update all documentation for uv workflow (viraatc, Apr 15, 2026)
- `2a01a64` docs: update remaining docs for uv workflow (viraatc, Apr 15, 2026)
- `175d752` chore: update uv.lock for rebased pyproject.toml changes (viraatc, Apr 16, 2026)
13 changes: 5 additions & 8 deletions .github/workflows/pre-commit.yml

```diff
@@ -14,16 +14,13 @@ jobs:
         with:
           fetch-depth: 0

-      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      - name: Install uv
+        uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5
         with:
-          python-version: "3.12"
+          python-version-file: .python-version

       - name: Install dependencies
-        run: |
-          python -m pip install pip==26.0.1
-          pip install -e ".[dev]"
+        run: uv sync --frozen --extra dev

       - name: Run pre-commit
-        run: |
-          pre-commit run --all-files --show-diff-on-failure
+        run: uv run pre-commit run --all-files --show-diff-on-failure
```
73 changes: 38 additions & 35 deletions .github/workflows/test.yml

```diff
@@ -9,26 +9,19 @@ on:
 jobs:
   test:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]

     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      - name: Install uv
+        uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version-file: .python-version

       - name: Install dependencies
-        run: |
-          python -m pip install pip==26.0.1
-          pip install -e ".[test]"
+        run: uv sync --frozen --extra test

       - name: Run tests
-        run: |
-          pytest -xv -m "not slow and not performance" --cov=src --cov-report=xml --cov-report=html
+        run: uv run pytest -xv -m "not slow and not performance" --cov=src --cov-report=xml --cov-report=html

       - name: Upload coverage report
         uses: actions/upload-artifact@v4
@@ -38,21 +31,40 @@ jobs:
           coverage.xml
           htmlcov/

+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5
+        with:
+          python-version-file: .python-version
+
+      - name: Build wheel
+        run: uv build
+
+      - name: Install from wheel and smoke test
+        run: |
+          uv venv /tmp/smoke-test
+          uv pip install --python /tmp/smoke-test/bin/python dist/*.whl
+          /tmp/smoke-test/bin/inference-endpoint --help
+
   audit:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      - name: Install uv
+        uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5
         with:
-          python-version: "3.12"
+          python-version-file: .python-version

+      - name: Install dependencies
+        run: uv sync --frozen --extra dev --extra test --extra performance
+
       - name: Audit dependencies for known vulnerabilities
-        run: |
-          python -m pip install pip==26.0.1
-          pip install -e ".[dev,test,performance]"
-          pip-audit
+        run: uv run pip-audit

   schema-updated:
     runs-on: ubuntu-latest
@@ -62,6 +74,11 @@
         with:
           fetch-depth: 0

+      - name: Install uv
+        uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5
+        with:
+          python-version-file: .python-version
+
       - name: Check for schema changes
         id: schema
         run: |
@@ -73,24 +90,10 @@
             'src/inference_endpoint/config/templates/*.yaml')
           echo "changed=$([[ -n "$CHANGED" ]] && echo true || echo false)" >> "$GITHUB_OUTPUT"

-      - name: Set up Python 3.12
-        if: steps.schema.outputs.changed == 'true'
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
-        with:
-          python-version: "3.12"
-
-      - name: Install dependencies
-        if: steps.schema.outputs.changed == 'true'
-        run: |
-          python -m pip install --upgrade pip
-          pip install -e ".[test]"
-
       - name: Run schema fuzz tests
         if: steps.schema.outputs.changed == 'true'
-        run: |
-          pytest -xv -m schema_fuzz
+        run: uv run --frozen --extra test pytest -xv -m schema_fuzz

       - name: Check YAML templates are up to date
         if: steps.schema.outputs.changed == 'true'
-        run: |
-          python scripts/regenerate_templates.py --check
+        run: uv run --frozen python scripts/regenerate_templates.py --check
```
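The change-detection step in the `schema-updated` job uses a compact shell idiom to turn a possibly-empty file list into a `true`/`false` flag. A minimal sketch, with a hypothetical stand-in for the `git diff --name-only` output, behaves like this:

```shell
# Sketch of the change-detection idiom from the schema-updated job.
# CHANGED normally holds `git diff --name-only` output; here it is a
# hypothetical stand-in so the logic can be exercised directly.
CHANGED="src/inference_endpoint/config/schema.py"
flag_nonempty=$([ -n "$CHANGED" ] && echo true || echo false)

CHANGED=""
flag_empty=$([ -n "$CHANGED" ] && echo true || echo false)

echo "changed=$flag_nonempty"  # -> changed=true
echo "changed=$flag_empty"     # -> changed=false
```

In the workflow, the result is appended to `$GITHUB_OUTPUT`, so later steps can gate on `steps.schema.outputs.changed == 'true'` and skip the fuzz tests when no schema files changed.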
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml

```diff
@@ -62,3 +62,11 @@ repos:
         types: [python]
         pass_filenames: true
         exclude: ^(src/inference_endpoint/openai/openai_types_gen.py)$
+
+      - id: uv-lock-check
+        name: Check uv.lock is up-to-date
+        entry: uv lock --check
+        language: python
+        additional_dependencies: ["uv==0.7.6"]
+        pass_filenames: false
+        files: ^(pyproject\.toml|uv\.lock)$
```
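The same freshness check the new hook performs can be run by hand; a guarded sketch (so it degrades gracefully when `uv` is not installed or no project is present) looks like this:

```shell
# Sketch of the uv.lock freshness check the pre-commit hook runs.
# `uv lock --check` exits non-zero when uv.lock is out of date with
# pyproject.toml; run it from the repository root.
if command -v uv >/dev/null 2>&1; then
  if uv lock --check 2>/dev/null; then
    lock_state="fresh"
  else
    lock_state="stale-or-unavailable"
  fi
else
  lock_state="uv-not-installed"
fi
echo "uv.lock: $lock_state"
```

Because the hook's `files` pattern matches only `pyproject.toml` and `uv.lock`, it fires exactly when either file is staged, catching a forgotten `uv lock` before CI does.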
1 change: 1 addition & 0 deletions .python-version

```diff
@@ -0,0 +1 @@
+3.12.11
```
32 changes: 30 additions & 2 deletions AGENTS.md

````diff
@@ -10,11 +10,39 @@ High-performance benchmarking tool for LLM inference endpoints targeting 50k+ QPS

 ```bash
 # Development setup
+uv sync --extra dev --extra test
+uv run pre-commit install
+
+# Testing
+uv run pytest                                       # All tests (excludes slow/performance)
+uv run pytest -m unit                               # Unit tests only
+uv run pytest -m integration                        # Integration tests only
+uv run pytest --cov=src --cov-report=html           # With coverage
+uv run pytest -xvs tests/unit/path/to/test_file.py  # Single test file
+
+# Code quality (run before commits)
+uv run pre-commit run --all-files
+
+# Local testing with echo server
+uv run python -m inference_endpoint.testing.echo_server --port 8765
+uv run inference-endpoint probe --endpoints http://localhost:8765 --model test-model
+
+# CLI usage
+uv run inference-endpoint benchmark offline --endpoints URL --model NAME --dataset PATH
+uv run inference-endpoint benchmark online --endpoints URL --model NAME --dataset PATH --load-pattern poisson --target-qps 100
+uv run inference-endpoint benchmark from-config --config config.yaml
+```
+
+### Backward-compatible setup (pip + venv)
+
+Does not use `uv.lock` — dependency versions may differ from the lockfile.
+
+```bash
 python3.12 -m venv venv && source venv/bin/activate
 pip install -e ".[dev,test]"
 pre-commit install

 # Testing
+# After activating the venv, commands run without the `uv run` prefix:
 pytest                 # All tests (excludes slow/performance)
 pytest -m unit         # Unit tests only
 pytest -m integration  # Integration tests only
@@ -354,5 +382,5 @@ Known failure modes when AI tools generate code for this project.

 ### Dependency & Environment

-- **Adding new dependencies without justification**: AI may `pip install` or add imports for packages not in `pyproject.toml`. Any new dependency must be justified, added to the correct optional group, and pinned to an exact version (`==`). After adding a dependency, run `pip-audit` (included in `dev` extras) to verify it has no known vulnerabilities.
+- **Adding new dependencies without justification**: AI may `pip install` or add imports for packages not in `pyproject.toml`. Any new runtime, dev, or test dependency must be justified, added to the correct optional group, and pinned to an exact version (`==`). Use `uv add <package>==<version>` to update both `pyproject.toml` and `uv.lock` atomically, then run `uv run pip-audit` (included in `dev` extras) to verify the new pin has no known vulnerabilities. Note: `[build-system] requires` is also pinned to exact versions for reproducibility.
 - **Using `requests`/`aiohttp` for HTTP**: This project has its own HTTP client (`endpoint_client/http.py`) using `httptools`. AI defaults to `requests` or `aiohttp` — these should not appear in production code (test dependencies are fine).
````
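The exact-pin convention in the dependency guideline above is mechanical enough to check in shell. A minimal sketch — `example-pkg==1.2.3` is a placeholder requirement string, not a real project dependency — while the actual workflow is the documented `uv add <package>==<version>` followed by `uv run pip-audit`:

```shell
# Sketch: every dependency must carry an exact `==` pin, per the
# guideline above. `example-pkg==1.2.3` is a placeholder.
req="example-pkg==1.2.3"
case "$req" in
  *"=="*) pin_ok=yes ;;
  *)      pin_ok=no ;;
esac
echo "exact pin present: $pin_ok"  # -> exact pin present: yes
```

A loose specifier such as `example-pkg>=1.2` would take the second branch, which is exactly the drift the `uv.lock` freshness hook and the `==` rule are meant to prevent.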
43 changes: 27 additions & 16 deletions CONTRIBUTING.md

````diff
@@ -39,28 +39,39 @@ benchmarking tool for LLM inference endpoints targeting 50k+ QPS.
 git clone https://github.com/<your-username>/endpoints.git
 cd endpoints

-# Create virtual environment
-python3.12 -m venv venv
-source venv/bin/activate
-
 # Install with dev and test extras
-pip install -e ".[dev,test]"
+uv sync --extra dev --extra test

 # Install pre-commit hooks
-pre-commit install
+uv run pre-commit install

 # Verify your setup
-pytest -m unit -x --timeout=60
+uv run pytest -m unit -x --timeout=60
 ```

+<details>
+<summary>Using pip + venv instead (backward-compatible)</summary>
+
+> **Note:** Does not use `uv.lock` — dependency versions may differ from the lockfile.
+
+```bash
+python3.12 -m venv venv && source venv/bin/activate
+pip install -e ".[dev,test]"
+pre-commit install
+```
+
+After activating the venv, commands work without the `uv run` prefix.
+
+</details>
+
 ### Local Testing with Echo Server

 ```bash
 # Start a local echo server
-python -m inference_endpoint.testing.echo_server --port 8765
+uv run python -m inference_endpoint.testing.echo_server --port 8765

 # Run a quick probe
-inference-endpoint probe --endpoints http://localhost:8765 --model test-model
+uv run inference-endpoint probe --endpoints http://localhost:8765 --model test-model
 ```

 ## Code Style and Conventions
@@ -73,7 +84,7 @@ these automatically.

 ```bash
 # Run all checks manually
-pre-commit run --all-files
+uv run pre-commit run --all-files
 ```

 ### Key Conventions
@@ -106,19 +117,19 @@ is latency-critical. In these paths:

 ```bash
 # All tests (excludes slow/performance)
-pytest
+uv run pytest

 # Unit tests only
-pytest -m unit
+uv run pytest -m unit

 # Integration tests
-pytest -m integration
+uv run pytest -m integration

 # Single file
-pytest -xvs tests/unit/path/to/test_file.py
+uv run pytest -xvs tests/unit/path/to/test_file.py

 # With coverage
-pytest --cov=src --cov-report=html
+uv run pytest --cov=src --cov-report=html
 ```

 ### Test Markers
@@ -154,7 +165,7 @@ docs/short-description

 1. **Create a focused PR** — one logical change per PR
 2. **Fill out the PR template** — describe what, why, and how to test
-3. **Ensure CI passes** — `pre-commit run --all-files` and `pytest -m unit` locally before pushing
+3. **Ensure CI passes** — `uv run pre-commit run --all-files` and `uv run pytest -m unit` locally before pushing
 4. **Link related issues** — use `Closes #123` or `Relates to #123`
 5. **Expect review within 2-3 business days** — reviewers are auto-assigned based on changed files
````
13 changes: 13 additions & 0 deletions README.md

````diff
@@ -13,10 +13,23 @@ A high-performance benchmarking tool for LLM inference endpoints, targeting 50k+ QPS
 ```bash
 git clone https://github.com/mlcommons/endpoints.git
 cd endpoints
+uv sync
 ```

+<details>
+<summary>Using pip + venv instead (backward-compatible)</summary>
+
+> **Note:** Does not use `uv.lock` — dependency versions may differ from the lockfile.
+
+```bash
+python3.12 -m venv venv && source venv/bin/activate
+pip install .
+```
+
+After activating the venv, commands work without the `uv run` prefix.
+
+</details>
+
 ```bash
 # Test endpoint connectivity
 inference-endpoint probe \
````
24 changes: 12 additions & 12 deletions docs/CLIENT_PERFORMANCE_TUNING.md

````diff
@@ -54,8 +54,8 @@ Optimal worker count depends on your workload — prompt size, streaming mode,
 ### Full sweep

 ```bash
-python -m inference_endpoint.utils.benchmark_httpclient --full -d 5
-python -m inference_endpoint.utils.benchmark_httpclient --full -d 5 --stream
+uv run python -m inference_endpoint.utils.benchmark_httpclient --full -d 5
+uv run python -m inference_endpoint.utils.benchmark_httpclient --full -d 5 --stream
 ```

 Runs all common worker counts against a range of prompt lengths (CPU pinning is on by default). Produces a plot at `/tmp/sweep_*.png` showing send/recv rate per configuration, with shaded variation bands and a stall% overlay.

 With `--stream`, the full sweep also varies stream interval (0%, 50%, 100% of prompt length).

 ```bash
 # Sweep workers for a specific prompt length
-python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 -d 10
+uv run python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 -d 10

 # Sweep workers with explicit values
-python -m inference_endpoint.utils.benchmark_httpclient -w 1,2,4,8,12,16 -l 4096 -d 10
+uv run python -m inference_endpoint.utils.benchmark_httpclient -w 1,2,4,8,12,16 -l 4096 -d 10

 # Cartesian product: workers x prompt lengths
-python -m inference_endpoint.utils.benchmark_httpclient -w 1:16::8 -l 128,1024,8192 -d 5
+uv run python -m inference_endpoint.utils.benchmark_httpclient -w 1:16::8 -l 128,1024,8192 -d 5

 # Streaming: sweep workers with a fixed stream interval (chars per SSE event)
-python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 --stream --stream-interval 100 -d 5
+uv run python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 --stream --stream-interval 100 -d 5

 # Streaming: sweep stream intervals (total events = ceil(output_length / interval))
-python -m inference_endpoint.utils.benchmark_httpclient -w 8 --stream --stream-interval 1,50,500 -d 5
+uv run python -m inference_endpoint.utils.benchmark_httpclient -w 8 --stream --stream-interval 1,50,500 -d 5
 ```

 ### Reading the results
@@ -134,8 +134,8 @@ Two built-in servers for benchmarking without a real GPU endpoint.
 Returns identical pre-compiled responses instantly — zero compute, pure client roofline.

 ```bash
-python -m inference_endpoint.testing.max_throughput_server --port 12345 --stats
-python -m inference_endpoint.testing.max_throughput_server --stream --stream-interval 50 --stats
+uv run python -m inference_endpoint.testing.max_throughput_server --port 12345 --stats
+uv run python -m inference_endpoint.testing.max_throughput_server --stream --stream-interval 50 --stats
 ```

 | Flag | Default | Description |
@@ -156,15 +156,15 @@ Two mutually exclusive timing modes:

 ```bash
 # Non-streaming with response-rate control
-python -m inference_endpoint.testing.variable_throughput_server --stats \
+uv run python -m inference_endpoint.testing.variable_throughput_server --stats \
   --response-rate-mean 1000

 # Streaming with TPOT + TTFT
-python -m inference_endpoint.testing.variable_throughput_server --stream --stats \
+uv run python -m inference_endpoint.testing.variable_throughput_server --stream --stats \
   --inter-token-latency 15 --first-chunk-latency 1.5 --stream-interval 10

 # With jitter
-python -m inference_endpoint.testing.variable_throughput_server --stream --stats \
+uv run python -m inference_endpoint.testing.variable_throughput_server --stream --stats \
   --response-rate-mean 50 --response-rate-spread 0.2 \
   --first-chunk-latency 0.5 --first-chunk-spread 0.2
 ```
````
2 changes: 2 additions & 0 deletions docs/CLI_QUICK_REFERENCE.md

```diff
@@ -2,6 +2,8 @@

 Command-line reference for all `inference-endpoint` subcommands, flags, load patterns, and usage examples.

+> **Note:** Commands below assume an activated venv (`source .venv/bin/activate`). Without activation, prefix all commands with `uv run`.
+
 ## Commands

 ### Performance Benchmarking
```
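The activation note added above can be illustrated mechanically: activating a venv just prepends its `bin` directory to `PATH`, so console scripts like `inference-endpoint` resolve directly. A sketch (the `.venv` path is uv's default location and illustrative here):

```shell
# Sketch: why an activated venv removes the need for the `uv run` prefix.
# Activation prepends the venv's bin directory to PATH, so installed
# console scripts resolve without any wrapper.
venv_bin="$PWD/.venv/bin"
PATH="$venv_bin:$PATH"
case ":$PATH:" in
  *":$venv_bin:"*) prefix_needed=no ;;
  *)               prefix_needed=yes ;;
esac
echo "uv run prefix needed: $prefix_needed"  # -> uv run prefix needed: no
```

Without activation, `uv run` provides the same resolution per-command by launching the process inside the project environment.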