Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 21 additions & 23 deletions amplifier_app_cli/utils/source_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import json
import logging
import re
import subprocess
from dataclasses import dataclass
from dataclasses import field
Expand Down Expand Up @@ -543,43 +542,42 @@ def _cache_age_days_from_string(cached_at: str) -> int:
async def _get_github_commit_sha(
client: httpx.AsyncClient, repo_url: str, ref: str
) -> str:
"""Get SHA for ref using GitHub Atom feed.
"""Get SHA for ref using the GitHub REST API.

Uses public Atom feed: https://github.com/{owner}/{repo}/commits/{ref}.atom
Uses: https://api.github.com/repos/{owner}/{repo}/commits/{ref}

Uses auth headers when available (GITHUB_TOKEN or gh auth login) to
avoid GitHub's unauthenticated rate limit of 60 requests/hour.
Authenticated requests get 5,000 requests/hour.
The web atom feed at github.com/.../commits/{ref}.atom does not accept
Bearer auth. For SAML-protected private repos, GitHub silently returns
HTTP 200 with an empty body, causing SHA extraction to fail with an
opaque ValueError that bypasses the HTTPStatusError handler in
_check_all_cached_modules. The REST API honors PAT auth and returns
proper 401/403/404 status codes, letting the existing handler surface
the right "private repo or rate limited" hint.

Authenticated requests get 5,000 requests/hour
(vs. 60/hr unauthenticated).

Args:
client: Shared httpx client for all HTTP requests
"""
# Remove .git suffix properly (not with rstrip - it removes any char in '.git'!)
url_clean = repo_url[:-4] if repo_url.endswith(".git") else repo_url
parts = url_clean.split("github.com/")[-1].split("/")
if len(parts) < 2:
raise ValueError(f"Could not parse GitHub URL: {repo_url}")

owner, repo = parts[0], parts[1]

atom_url = f"https://github.com/{owner}/{repo}/commits/{ref}.atom"

# Use auth headers proactively to avoid GitHub's unauthenticated rate limit
# (60 req/hr). Authenticated requests get 5,000 req/hr. Falls back gracefully
# to no-auth when GITHUB_TOKEN is unset and gh auth login hasn't been run.
auth_headers = _get_github_auth_headers()
response = await client.get(atom_url, headers=auth_headers)
response.raise_for_status()

# Parse XML for first commit SHA
# Format: <id>tag:github.com,2008:Grit::Commit/{SHA}</id>
match = re.search(
r"<id>tag:github\.com,2008:Grit::Commit/([a-f0-9]{40})</id>", response.text
api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{ref}"
response = await client.get(
api_url,
headers={
"Accept": "application/vnd.github.v3+json",
**_get_github_auth_headers(),
},
)
if not match:
raise ValueError(f"Could not extract commit SHA from Atom feed: {atom_url}")
response.raise_for_status()

return match.group(1)
return response.json()["sha"]


async def _get_commit_details(
Expand Down
105 changes: 89 additions & 16 deletions tests/test_source_status_auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,15 @@ async def test_uses_auth_headers_on_first_request_when_available(self):
"""Auth headers should be sent on the first request, not only after a 404."""
from amplifier_app_cli.utils.source_status import _get_github_commit_sha

atom_content = """<?xml version="1.0"?>
<feed>
<entry>
<id>tag:github.com,2008:Grit::Commit/abcdef1234567890abcdef1234567890abcdef12</id>
</entry>
</feed>"""

captured_requests = []

async def mock_get(url, **kwargs):
captured_requests.append({"url": url, "headers": kwargs.get("headers", {})})
response = MagicMock(spec=httpx.Response)
response.status_code = 200
response.text = atom_content
response.json = MagicMock(
return_value={"sha": "abcdef1234567890abcdef1234567890abcdef12"}
)
response.raise_for_status = MagicMock()
return response

Expand Down Expand Up @@ -63,17 +58,12 @@ async def test_proceeds_without_auth_when_unavailable(self):
"""Requests without auth should still work when no token is configured."""
from amplifier_app_cli.utils.source_status import _get_github_commit_sha

atom_content = """<?xml version="1.0"?>
<feed>
<entry>
<id>tag:github.com,2008:Grit::Commit/deadbeefdeadbeefdeadbeefdeadbeefdeadbeef</id>
</entry>
</feed>"""

async def mock_get(url, **kwargs):
response = MagicMock(spec=httpx.Response)
response.status_code = 200
response.text = atom_content
response.json = MagicMock(
return_value={"sha": "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"}
)
response.raise_for_status = MagicMock()
return response

Expand Down Expand Up @@ -172,3 +162,86 @@ def test_docstring_does_not_claim_no_rate_limits(self):
"Docstring falsely claims 'no rate limits' — GitHub Atom feeds "
"DO have rate limits for unauthenticated requests"
)


class TestUsesRestApiNotAtomFeed:
"""Regression: must use the REST API, not the web atom feed.

The web atom feed (github.com/.../commits/{ref}.atom) does not accept
Bearer auth — for SAML-protected private repos GitHub silently returns
HTTP 200 with an empty body, causing SHA extraction to fail with an
opaque ValueError. The REST API honors PAT auth and returns proper
401/403/404 status codes for unauthorized requests.
"""

@pytest.mark.asyncio
async def test_calls_rest_api_endpoint(self):
"""The function must call api.github.com, not github.com/.../*.atom."""
from amplifier_app_cli.utils.source_status import _get_github_commit_sha

captured_urls = []

async def mock_get(url, **kwargs):
captured_urls.append(url)
response = MagicMock(spec=httpx.Response)
response.status_code = 200
response.json = MagicMock(return_value={"sha": "a" * 40})
response.raise_for_status = MagicMock()
return response

mock_client = MagicMock(spec=httpx.AsyncClient)
mock_client.get = AsyncMock(side_effect=mock_get)

with patch(
"amplifier_app_cli.utils.source_status._get_github_auth_headers",
return_value={"Authorization": "Bearer test-token"},
):
await _get_github_commit_sha(
mock_client,
"https://github.com/microsoft/amplifier",
"main",
)

assert len(captured_urls) == 1
assert captured_urls[0].startswith("https://api.github.com/"), (
f"Must use REST API, got: {captured_urls[0]}. "
"The web atom feed does not accept Bearer auth and silently "
"returns HTTP 200 with empty body for SAML-protected private repos."
)
assert ".atom" not in captured_urls[0]

@pytest.mark.asyncio
async def test_403_surfaces_as_http_status_error(self):
"""A 403 from the API must raise HTTPStatusError, not ValueError.

This is the contract the _check_all_cached_modules error handler
relies on to print "private repo or rate limited" instead of
"Unexpected error".
"""
from amplifier_app_cli.utils.source_status import _get_github_commit_sha

async def mock_get(url, **kwargs):
response = MagicMock(spec=httpx.Response)
response.status_code = 403
request = MagicMock()
response.raise_for_status = MagicMock(
side_effect=httpx.HTTPStatusError(
"403 Forbidden", request=request, response=response
)
)
return response

mock_client = MagicMock(spec=httpx.AsyncClient)
mock_client.get = AsyncMock(side_effect=mock_get)

with patch(
"amplifier_app_cli.utils.source_status._get_github_auth_headers",
return_value={"Authorization": "Bearer test-token"},
):
with pytest.raises(httpx.HTTPStatusError):
await _get_github_commit_sha(
mock_client,
"https://github.com/Aleph-Alpha/private-repo",
"main",
)