From 5f5f67bbb6ddedb80ed5a67efa10b60e083e9ace Mon Sep 17 00:00:00 2001 From: Hans Sperker Date: Fri, 8 May 2026 13:18:05 +0200 Subject: [PATCH] fix(source_status): use REST API for SHA lookup to support SAML-protected private repos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The web atom feed at github.com/.../commits/{ref}.atom does not accept Bearer auth. For SAML-protected private repos, GitHub silently returns HTTP 200 with an empty body — the SHA regex finds no match and raises ValueError, which bypasses the HTTPStatusError handler in _check_all_cached_modules and surfaces as 'Unexpected error checking {module}: ValueError: Could not extract commit SHA from Atom feed'. PR #100 added Bearer auth on the assumption GitHub would return 401/403/404 for unauthorized atom requests. It does not — the empty-200 response path was missed. Switch _get_github_commit_sha to the REST API (api.github.com/repos/{owner}/{repo}/commits/{ref}), which honors PAT auth and returns proper status codes. _get_commit_details in the same file already uses this endpoint, so no new auth or dependency surface. Rate limits are equivalent (5000/hr authenticated). --- amplifier_app_cli/utils/source_status.py | 44 +++++----- tests/test_source_status_auth.py | 105 +++++++++++++++++++---- 2 files changed, 110 insertions(+), 39 deletions(-) diff --git a/amplifier_app_cli/utils/source_status.py b/amplifier_app_cli/utils/source_status.py index 82d747c..e6efa9b 100644 --- a/amplifier_app_cli/utils/source_status.py +++ b/amplifier_app_cli/utils/source_status.py @@ -6,7 +6,6 @@ import json import logging -import re import subprocess from dataclasses import dataclass from dataclasses import field @@ -543,18 +542,24 @@ def _cache_age_days_from_string(cached_at: str) -> int: async def _get_github_commit_sha( client: httpx.AsyncClient, repo_url: str, ref: str ) -> str: - """Get SHA for ref using GitHub Atom feed. + """Get SHA for ref using the GitHub REST API. - Uses public Atom feed: https://github.com/{owner}/{repo}/commits/{ref}.atom + Uses: https://api.github.com/repos/{owner}/{repo}/commits/{ref} - Uses auth headers when available (GITHUB_TOKEN or gh auth login) to - avoid GitHub's unauthenticated rate limit of 60 requests/hour. - Authenticated requests get 5,000 requests/hour. + The web atom feed at github.com/.../commits/{ref}.atom does not accept + Bearer auth. For SAML-protected private repos, GitHub silently returns + HTTP 200 with an empty body, causing SHA extraction to fail with an + opaque ValueError that bypasses the HTTPStatusError handler in + _check_all_cached_modules. The REST API honors PAT auth and returns + proper 401/403/404 status codes, letting the existing handler surface + the right "private repo or rate limited" hint. + + Authenticated requests get 5,000 requests/hour + (vs. 60/hr unauthenticated). Args: client: Shared httpx client for all HTTP requests """ - # Remove .git suffix properly (not with rstrip - it removes any char in '.git'!) url_clean = repo_url[:-4] if repo_url.endswith(".git") else repo_url parts = url_clean.split("github.com/")[-1].split("/") if len(parts) < 2: @@ -562,24 +567,17 @@ async def _get_github_commit_sha( owner, repo = parts[0], parts[1] - atom_url = f"https://github.com/{owner}/{repo}/commits/{ref}.atom" - - # Use auth headers proactively to avoid GitHub's unauthenticated rate limit - # (60 req/hr). Authenticated requests get 5,000 req/hr. Falls back gracefully - # to no-auth when GITHUB_TOKEN is unset and gh auth login hasn't been run. - auth_headers = _get_github_auth_headers() - response = await client.get(atom_url, headers=auth_headers) - response.raise_for_status() - - # Parse XML for first commit SHA - # Format: tag:github.com,2008:Grit::Commit/{SHA} - match = re.search( - r"tag:github\.com,2008:Grit::Commit/([a-f0-9]{40})", response.text + api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{ref}" + response = await client.get( + api_url, + headers={ + "Accept": "application/vnd.github.v3+json", + **_get_github_auth_headers(), + }, ) - if not match: - raise ValueError(f"Could not extract commit SHA from Atom feed: {atom_url}") + response.raise_for_status() - return match.group(1) + return response.json()["sha"] async def _get_commit_details( diff --git a/tests/test_source_status_auth.py b/tests/test_source_status_auth.py index e3573fe..2fd842d 100644 --- a/tests/test_source_status_auth.py +++ b/tests/test_source_status_auth.py @@ -18,20 +18,15 @@ async def test_uses_auth_headers_on_first_request_when_available(self): """Auth headers should be sent on the first request, not only after a 404.""" from amplifier_app_cli.utils.source_status import _get_github_commit_sha - atom_content = """ - - - tag:github.com,2008:Grit::Commit/abcdef1234567890abcdef1234567890abcdef12 - -""" - captured_requests = [] async def mock_get(url, **kwargs): captured_requests.append({"url": url, "headers": kwargs.get("headers", {})}) response = MagicMock(spec=httpx.Response) response.status_code = 200 - response.text = atom_content + response.json = MagicMock( + return_value={"sha": "abcdef1234567890abcdef1234567890abcdef12"} + ) response.raise_for_status = MagicMock() return response @@ -63,17 +58,12 @@ async def test_proceeds_without_auth_when_unavailable(self): """Requests without auth should still work when no token is configured.""" from amplifier_app_cli.utils.source_status import _get_github_commit_sha - atom_content = """ - - - tag:github.com,2008:Grit::Commit/deadbeefdeadbeefdeadbeefdeadbeefdeadbeef - -""" - async def mock_get(url, **kwargs): response = MagicMock(spec=httpx.Response) response.status_code = 200 - response.text = atom_content + response.json = MagicMock( + return_value={"sha": "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef"} + ) response.raise_for_status = MagicMock() return response @@ -172,3 +162,86 @@ def test_docstring_does_not_claim_no_rate_limits(self): "Docstring falsely claims 'no rate limits' — GitHub Atom feeds " "DO have rate limits for unauthenticated requests" ) + + +class TestUsesRestApiNotAtomFeed: + """Regression: must use the REST API, not the web atom feed. + + The web atom feed (github.com/.../commits/{ref}.atom) does not accept + Bearer auth — for SAML-protected private repos GitHub silently returns + HTTP 200 with an empty body, causing SHA extraction to fail with an + opaque ValueError. The REST API honors PAT auth and returns proper + 401/403/404 status codes for unauthorized requests. + """ + + @pytest.mark.asyncio + async def test_calls_rest_api_endpoint(self): + """The function must call api.github.com, not github.com/.../*.atom.""" + from amplifier_app_cli.utils.source_status import _get_github_commit_sha + + captured_urls = [] + + async def mock_get(url, **kwargs): + captured_urls.append(url) + response = MagicMock(spec=httpx.Response) + response.status_code = 200 + response.json = MagicMock(return_value={"sha": "a" * 40}) + response.raise_for_status = MagicMock() + return response + + mock_client = MagicMock(spec=httpx.AsyncClient) + mock_client.get = AsyncMock(side_effect=mock_get) + + with patch( + "amplifier_app_cli.utils.source_status._get_github_auth_headers", + return_value={"Authorization": "Bearer test-token"}, + ): + await _get_github_commit_sha( + mock_client, + "https://github.com/microsoft/amplifier", + "main", + ) + + assert len(captured_urls) == 1 + assert captured_urls[0].startswith("https://api.github.com/"), ( + f"Must use REST API, got: {captured_urls[0]}. " + "The web atom feed does not accept Bearer auth and silently " + "returns HTTP 200 with empty body for SAML-protected private repos." + ) + assert ".atom" not in captured_urls[0] + + @pytest.mark.asyncio + async def test_403_surfaces_as_http_status_error(self): + """A 403 from the API must raise HTTPStatusError, not ValueError. + + This is the contract the _check_all_cached_modules error handler + relies on to print "private repo or rate limited" instead of + "Unexpected error". + """ + from amplifier_app_cli.utils.source_status import _get_github_commit_sha + + async def mock_get(url, **kwargs): + response = MagicMock(spec=httpx.Response) + response.status_code = 403 + request = MagicMock() + response.raise_for_status = MagicMock( + side_effect=httpx.HTTPStatusError( + "403 Forbidden", request=request, response=response + ) + ) + return response + + mock_client = MagicMock(spec=httpx.AsyncClient) + mock_client.get = AsyncMock(side_effect=mock_get) + + with patch( + "amplifier_app_cli.utils.source_status._get_github_auth_headers", + return_value={"Authorization": "Bearer test-token"}, + ): + with pytest.raises(httpx.HTTPStatusError): + await _get_github_commit_sha( + mock_client, + "https://github.com/Aleph-Alpha/private-repo", + "main", + ) +