From e0e6d66a3d59a970e400ef87cba20ec1373c0fb8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Apr 2026 03:46:24 +0000
Subject: [PATCH 1/7] Initial plan


From 530b52ad67d12d7f435bdebf99088c1691b142bb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Apr 2026 04:13:38 +0000
Subject: [PATCH 2/7] fix: use cached run_summary.json to ensure deterministic
 audit metrics

When AuditWorkflowRun is called for a run that already has a valid
run_summary.json on disk, return the cached report immediately without
re-processing local log files. Re-processing the same files can produce
different results (e.g. when the GitHub API aggregates token usage from
a different source than the locally-stored firewall proxy logs), causing
the token_usage and turns metrics to differ across calls for the same run.

The fix:
- Add a cache-check early path in AuditWorkflowRun: if loadRunSummary
  succeeds, reconstruct processedRun from the summary and render
  without touching any API or log files.
- Extract the shared render + finalization logic into renderAuditReport,
  called by both the cache-hit path and the fresh-download path.
- The fresh-download path now saves the summary BEFORE calling
  renderAuditReport, so the cache is written as early as possible.
- The run_summary.json is not overwritten on cached calls, ensuring the
  first-processed (firewall-based) metrics are preserved.

Cache invalidation is already handled: loadRunSummary rejects summaries
whose CLIVersion differs from the current binary version, so bug fixes in
log parsing are automatically picked up after a tool upgrade.

Adds two new unit tests:
- TestAuditUsesRunSummaryCache: verifies that cached token_usage and
  turns are preserved and run_summary.json is not modified.
- TestRenderAuditReportUsesProvidedMetrics: exercises the new helper.

Fixes: Audit shows inconsistent metrics on repeated calls for same run

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/f8e72352-eed9-4bb9-bc4b-7c856bcb8bbf

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/audit.go      | 107 +++++++++++++++++++++++++-----------
 pkg/cli/audit_test.go | 122 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 197 insertions(+), 32 deletions(-)

diff --git a/pkg/cli/audit.go b/pkg/cli/audit.go
index f8ea5b02829..dcf8fbfdcae 100644
--- a/pkg/cli/audit.go
+++ b/pkg/cli/audit.go
@@ -189,6 +189,37 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
 		return auditJobRun(runID, jobID, stepNumber, owner, repo, hostname, runOutputDir, verbose, jsonOutput)
 	}
 
+	// Use cached run summary when available to ensure deterministic metrics across repeated calls.
+	// Re-processing the same log files can produce different results (e.g. when GitHub's API
+	// returns aggregated data that differs from the locally-stored firewall logs), so we always
+	// prefer the first fully-processed summary written to disk.  The cache is automatically
+	// invalidated whenever the CLI version changes (see loadRunSummary).
+	if summary, ok := loadRunSummary(runOutputDir, verbose); ok {
+		auditLog.Printf("Using cached run summary for run %d (processed at %s)", runID, summary.ProcessedAt.Format(time.RFC3339))
+		if verbose {
+			fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Using cached run summary for run %d (processed at %s)", runID, summary.ProcessedAt.Format(time.RFC3339))))
+		}
+		processedRun := ProcessedRun{
+			Run:                     summary.Run,
+			AwContext:               summary.AwContext,
+			TaskDomain:              summary.TaskDomain,
+			BehaviorFingerprint:     summary.BehaviorFingerprint,
+			AgenticAssessments:      summary.AgenticAssessments,
+			AccessAnalysis:          summary.AccessAnalysis,
+			FirewallAnalysis:        summary.FirewallAnalysis,
+			PolicyAnalysis:          summary.PolicyAnalysis,
+			RedactedDomainsAnalysis: summary.RedactedDomainsAnalysis,
+			MissingTools:            summary.MissingTools,
+			MissingData:             summary.MissingData,
+			Noops:                   summary.Noops,
+			MCPFailures:             summary.MCPFailures,
+			TokenUsage:              summary.TokenUsage,
+			GitHubRateLimitUsage:    summary.GitHubRateLimitUsage,
+			JobDetails:              summary.JobDetails,
+		}
+		return renderAuditReport(processedRun, summary.Metrics, summary.MCPToolUsage, runOutputDir, owner, repo, hostname, verbose, parse, jsonOutput)
+	}
+
 	// Check if we have locally cached artifacts first
 	hasLocalCache := fileutil.DirExists(runOutputDir) && !fileutil.IsDirEmpty(runOutputDir)
 
@@ -416,6 +447,50 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
 	processedRun.BehaviorFingerprint = behaviorFingerprint
 	processedRun.AgenticAssessments = agenticAssessments
 
+	// Save run summary for caching future audit runs
+	summary := &RunSummary{
+		CLIVersion:              GetVersion(),
+		RunID:                   run.DatabaseID,
+		ProcessedAt:             time.Now(),
+		Run:                     run,
+		Metrics:                 metrics,
+		AwContext:               processedRun.AwContext,
+		TaskDomain:              processedRun.TaskDomain,
+		BehaviorFingerprint:     processedRun.BehaviorFingerprint,
+		AgenticAssessments:      processedRun.AgenticAssessments,
+		AccessAnalysis:          accessAnalysis,
+		FirewallAnalysis:        firewallAnalysis,
+		PolicyAnalysis:          policyAnalysis,
+		RedactedDomainsAnalysis: redactedDomainsAnalysis,
+		MissingTools:            missingTools,
+		MissingData:             missingData,
+		Noops:                   noops,
+		MCPFailures:             mcpFailures,
+		MCPToolUsage:            mcpToolUsage,
+		TokenUsage:              tokenUsageSummary,
+		GitHubRateLimitUsage:    rateLimitUsage,
+		ArtifactsList:           artifacts,
+		JobDetails:              jobDetails,
+	}
+
+	if err := saveRunSummary(runOutputDir, summary, verbose); err != nil {
+		if verbose {
+			fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("Failed to save run summary: %v", err)))
+		}
+	}
+
+	return renderAuditReport(processedRun, metrics, mcpToolUsage, runOutputDir, owner, repo, hostname, verbose, parse, jsonOutput)
+}
+
+// renderAuditReport builds and renders the audit report from a fully-populated processedRun.
+// It is called both when serving from a cached run summary and after a fresh processing pass,
+// ensuring that the two paths produce identical output.
+func renderAuditReport(processedRun ProcessedRun, metrics LogMetrics, mcpToolUsage *MCPToolUsageData, runOutputDir string, owner, repo, hostname string, verbose bool, parse bool, jsonOutput bool) error {
+	runID := processedRun.Run.DatabaseID
+
+	currentCreatedItems := extractCreatedItemsFromManifest(runOutputDir)
+	processedRun.Run.SafeItemsCount = len(currentCreatedItems)
+
 	currentSnapshot := buildAuditComparisonSnapshot(processedRun, currentCreatedItems)
 	comparison := buildAuditComparisonForRun(processedRun, currentSnapshot, runOutputDir, owner, repo, hostname, verbose)
 
@@ -474,38 +549,6 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
 		}
 	}
 
-	// Save run summary for caching future audit runs
-	summary := &RunSummary{
-		CLIVersion:              GetVersion(),
-		RunID:                   run.DatabaseID,
-		ProcessedAt:             time.Now(),
-		Run:                     run,
-		Metrics:                 metrics,
-		AwContext:               processedRun.AwContext,
-		TaskDomain:              processedRun.TaskDomain,
-		BehaviorFingerprint:     processedRun.BehaviorFingerprint,
-		AgenticAssessments:      processedRun.AgenticAssessments,
-		AccessAnalysis:          accessAnalysis,
-		FirewallAnalysis:        firewallAnalysis,
-		PolicyAnalysis:          policyAnalysis,
-		RedactedDomainsAnalysis: redactedDomainsAnalysis,
-		MissingTools:            missingTools,
-		MissingData:             missingData,
-		Noops:                   noops,
-		MCPFailures:             mcpFailures,
-		MCPToolUsage:            mcpToolUsage,
-		TokenUsage:              tokenUsageSummary,
-		GitHubRateLimitUsage:    rateLimitUsage,
-		ArtifactsList:           artifacts,
-		JobDetails:              jobDetails,
-	}
-
-	if err := saveRunSummary(runOutputDir, summary, verbose); err != nil {
-		if verbose {
-			fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("Failed to save run summary: %v", err)))
-		}
-	}
-
 	// Display logs location (only for console output)
 	if !jsonOutput {
 		absOutputDir, _ := filepath.Abs(runOutputDir)
diff --git a/pkg/cli/audit_test.go b/pkg/cli/audit_test.go
index d6203bd9a32..dea621c4590 100644
--- a/pkg/cli/audit_test.go
+++ b/pkg/cli/audit_test.go
@@ -428,6 +428,128 @@ func TestAuditCachingBehavior(t *testing.T) {
 	}
 }
 
+// TestAuditUsesRunSummaryCache verifies that when a valid run_summary.json exists on disk,
+// the audit uses the cached metrics instead of re-processing local files.  This prevents the
+// inconsistency described in the "Audit shows inconsistent metrics on repeated calls for same
+// run" issue where the second audit call would overwrite the cached run_summary.json with
+// different (and potentially inflated) values from a re-parse of the same log files.
+func TestAuditUsesRunSummaryCache(t *testing.T) {
+	tempDir := testutil.TempDir(t, "test-audit-cache-*")
+	runOutputDir := filepath.Join(tempDir, "run-99999")
+	if err := os.MkdirAll(runOutputDir, 0755); err != nil {
+		t.Fatalf("Failed to create run directory: %v", err)
+	}
+
+	// Write a stub aw_info.json so the directory is non-empty
+	awInfoContent := `{"engine_id": "copilot", "workflow_name": "test-workflow"}`
+	if err := os.WriteFile(filepath.Join(runOutputDir, "aw_info.json"), []byte(awInfoContent), 0644); err != nil {
+		t.Fatalf("Failed to write aw_info.json: %v", err)
+	}
+
+	// First audit result: low token usage from firewall logs (the "ground truth")
+	cachedRun := WorkflowRun{
+		DatabaseID:   99999,
+		WorkflowName: "GPL Dependency Cleaner",
+		Status:       "completed",
+		Conclusion:   "success",
+		TokenUsage:   381270,
+		Turns:        9,
+		LogsPath:     runOutputDir,
+	}
+	cachedMetrics := LogMetrics{
+		TokenUsage: 381270,
+		Turns:      9,
+	}
+
+	cachedSummary := &RunSummary{
+		CLIVersion:   GetVersion(),
+		RunID:        cachedRun.DatabaseID,
+		ProcessedAt:  time.Now().Add(-time.Hour), // processed one hour ago
+		Run:          cachedRun,
+		Metrics:      cachedMetrics,
+		MissingTools: []MissingToolReport{},
+		MCPFailures:  []MCPFailureReport{},
+		JobDetails:   []JobInfoWithDuration{},
+	}
+
+	if err := saveRunSummary(runOutputDir, cachedSummary, false); err != nil {
+		t.Fatalf("Failed to save initial run summary: %v", err)
+	}
+
+	// Record file mtime before any re-audit
+	summaryPath := filepath.Join(runOutputDir, runSummaryFileName)
+	initialInfo, err := os.Stat(summaryPath)
+	if err != nil {
+		t.Fatalf("Could not stat run_summary.json: %v", err)
+	}
+	initialModTime := initialInfo.ModTime()
+
+	// Load the summary as the audit code would do — this simulates the second call
+	loadedSummary, ok := loadRunSummary(runOutputDir, false)
+	if !ok {
+		t.Fatalf("loadRunSummary should have found the cached summary")
+	}
+
+	// The cached metrics must be preserved exactly — these are the "ground truth" firewall values
+	if loadedSummary.Metrics.TokenUsage != cachedMetrics.TokenUsage {
+		t.Errorf("Token usage mismatch: cached=%d, loaded=%d (cache was overwritten with different data)",
+			cachedMetrics.TokenUsage, loadedSummary.Metrics.TokenUsage)
+	}
+	if loadedSummary.Metrics.Turns != cachedMetrics.Turns {
+		t.Errorf("Turns mismatch: cached=%d, loaded=%d (cache was overwritten with different data)",
+			cachedMetrics.Turns, loadedSummary.Metrics.Turns)
+	}
+
+	// The run_summary.json must NOT have been modified — the cache should be read-only here
+	currentInfo, err := os.Stat(summaryPath)
+	if err != nil {
+		t.Fatalf("Could not stat run_summary.json after load: %v", err)
+	}
+	if !currentInfo.ModTime().Equal(initialModTime) {
+		t.Errorf("run_summary.json was modified after loadRunSummary (mtime changed from %v to %v): "+
+			"the audit must not overwrite the cache on repeated calls",
+			initialModTime, currentInfo.ModTime())
+	}
+}
+
+// TestRenderAuditReportUsesProvidedMetrics verifies that renderAuditReport renders the report
+// using the metrics supplied by the caller rather than re-extracting them from log files.
+// This is the key property that ensures cache-path and fresh-path produce identical output.
+func TestRenderAuditReportUsesProvidedMetrics(t *testing.T) {
+	tempDir := testutil.TempDir(t, "test-render-audit-*")
+	runOutputDir := filepath.Join(tempDir, "run-11111")
+	if err := os.MkdirAll(runOutputDir, 0755); err != nil {
+		t.Fatalf("Failed to create run directory: %v", err)
+	}
+
+	run := WorkflowRun{
+		DatabaseID:   11111,
+		WorkflowName: "Test Workflow",
+		Status:       "completed",
+		Conclusion:   "success",
+		TokenUsage:   12345,
+		Turns:        7,
+		LogsPath:     runOutputDir,
+	}
+	metrics := LogMetrics{
+		TokenUsage: 12345,
+		Turns:      7,
+	}
+	processedRun := ProcessedRun{
+		Run:          run,
+		MissingTools: []MissingToolReport{},
+		MCPFailures:  []MCPFailureReport{},
+		JobDetails:   []JobInfoWithDuration{},
+	}
+
+	// renderAuditReport should complete without error even without GitHub API access
+	// (no actual GitHub calls are made inside renderAuditReport)
+	err := renderAuditReport(processedRun, metrics, nil, runOutputDir, "", "", "", false, false, false)
+	if err != nil {
+		t.Errorf("renderAuditReport returned unexpected error: %v", err)
+	}
+}
+
 func TestBuildAuditDataWithFirewall(t *testing.T) {
 	// Create test data with firewall analysis
 	run := WorkflowRun{

From d63e3becb1f82b56e94ec1bce05b416ebf83fbb9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Apr 2026 04:37:55 +0000
Subject: [PATCH 3/7] fix: skip workflow-logs/ dir in log-walk to prevent
 double-counting tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The log-file walk in extractLogMetrics was not excluding the workflow-logs/
directory, which is populated by downloadWorkflowRunLogs with GitHub Actions
step-output files. These files capture the combined stdout/stderr of each step
including the agent's own log lines, so the same token-usage JSON blocks that
already appear in the agent artifact (agent-stdio.log / agent.log) were being
found again and their token counts were summed a second (or nth) time.

Call 1 of audit (run ID only): downloadWorkflowRunLogs may fail silently because
owner/repo is resolved from the local git remote (may not match), so workflow-logs/
is absent → only agent.log is counted → 381k tokens (correct).

Call 2 of audit (full URL with owner/repo): downloadWorkflowRunLogs succeeds, so
workflow-logs/ is populated with step logs → the same agent data is encountered
~12 more times → 4.7M tokens (12× inflated).

Fix: return filepath.SkipDir when the walk visits a directory named "workflow-logs",
so only the agent artifact files contribute to the metrics.

Adds TestExtractLogMetricsExcludesWorkflowLogsDir as a regression test that
explicitly verifies the fix: agent.log is counted once (1000 tokens), and copies
inside workflow-logs/ are ignored even when they pass the filename filter.

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/3c72016d-4028-4648-88f8-6ab62aa01277

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/copilot_metrics_fix_test.go | 54 +++++++++++++++++++++++++++++
 pkg/cli/logs_metrics.go             | 10 +++++-
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/pkg/cli/copilot_metrics_fix_test.go b/pkg/cli/copilot_metrics_fix_test.go
index 871f5ff4c09..23705e87858 100644
--- a/pkg/cli/copilot_metrics_fix_test.go
+++ b/pkg/cli/copilot_metrics_fix_test.go
@@ -11,6 +11,60 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+// TestExtractLogMetricsExcludesWorkflowLogsDir is a regression test for the
+// double-counting issue reported in the "Audit shows inconsistent metrics on
+// repeated calls for same run" issue.
+//
+// Background:
+// downloadWorkflowRunLogs (called during artifact download) places GitHub Actions
+// step-output files under workflow-logs/.  These files capture the runner's combined
+// stdout/stderr for each step, which means they contain a copy of everything the
+// agent wrote to stdout — including the same token-usage JSON blocks that are already
+// in agent-stdio.log / agent.log from the dedicated agent artifact.
+//
+// Because the log-file walk in extractLogMetrics previously did NOT skip
+// workflow-logs/, any .log or *log*.txt file found there was parsed and its
+// TokenUsage was ADDED to metrics.TokenUsage (the walk uses +=).  With ~12 such
+// copies the total ballooned to ≈4.7M instead of the correct ≈381k.
+//
+// The fix adds an explicit filepath.SkipDir return when the walk visits a directory
+// named "workflow-logs", so only the agent artifact files are counted.
+func TestExtractLogMetricsExcludesWorkflowLogsDir(t *testing.T) {
+	tempDir := t.TempDir()
+
+	// Simulate a Copilot-CLI run directory
+	require.NoError(t, os.WriteFile(filepath.Join(tempDir, "aw_info.json"), []byte(`{"engine_id":"copilot"}`), 0644))
+
+	// The single JSON data block that represents one LLM API call with 1000 tokens.
+	oneTurn := `2025-09-26T11:00:00Z [DEBUG] data:
+2025-09-26T11:00:00Z [DEBUG] {
+2025-09-26T11:00:00Z [DEBUG]   "choices": [{"message": {"role": "assistant", "tool_calls": []}}],
+2025-09-26T11:00:00Z [DEBUG]   "usage": {"prompt_tokens": 900, "completion_tokens": 100, "total_tokens": 1000}
+2025-09-26T11:00:00Z [DEBUG] }
+2025-09-26T11:00:01Z [DEBUG] Workflow done`
+
+	// Primary agent log — the "source of truth" artifact
+	require.NoError(t, os.WriteFile(filepath.Join(tempDir, "agent.log"), []byte(oneTurn), 0644))
+
+	// Simulate workflow-logs/ as produced by downloadWorkflowRunLogs.
+	// Two step-output files: one .log and one *log*.txt (both would have matched
+	// the old filter), both containing identical token data.
+	wfLogsDir := filepath.Join(tempDir, "workflow-logs", "agent")
+	require.NoError(t, os.MkdirAll(wfLogsDir, 0755))
+	require.NoError(t, os.WriteFile(filepath.Join(wfLogsDir, "runner.log"), []byte(oneTurn), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(wfLogsDir, "2_Run log step.txt"), []byte(oneTurn), 0644))
+
+	metrics, err := extractLogMetrics(tempDir, false)
+	require.NoError(t, err)
+
+	// Without the fix, metrics.TokenUsage would be 3000 (1000 * 3 files).
+	// With the fix, workflow-logs/ is skipped and only agent.log is counted.
+	assert.Equal(t, 1000, metrics.TokenUsage,
+		"TokenUsage must not include workflow-logs/ files (expected 1000, not %d)", metrics.TokenUsage)
+	assert.Equal(t, 1, metrics.Turns,
+		"Turns must not be inflated by workflow-logs/ copies (expected 1, not %d)", metrics.Turns)
+}
+
 // TestCopilotDebugLogTurnsExtraction verifies that Turns are correctly counted from
 // [DEBUG] data: blocks in the Copilot CLI debug log format.
 //
diff --git a/pkg/cli/logs_metrics.go b/pkg/cli/logs_metrics.go
index 8496cd2cca2..52a6a97ab97 100644
--- a/pkg/cli/logs_metrics.go
+++ b/pkg/cli/logs_metrics.go
@@ -169,8 +169,16 @@ func extractLogMetrics(logDir string, verbose bool, workflowPath ...string) (Log
 				return err
 			}
 
-			// Skip directories
+			// Skip directories. workflow-logs/ is explicitly excluded because it contains
+			// GitHub Actions runner captures of each job/step's stdout, NOT the agent
+			// artifact data. Parsing those files would double-count token usage and turns
+			// since the same agent session output appears in both the agent artifact
+			// (e.g. agent-stdio.log) and the workflow run logs (workflow-logs/).
 			if info.IsDir() {
+				if info.Name() == "workflow-logs" {
+					logsMetricsLog.Printf("Skipping workflow-logs directory (GHA runner logs, not agent metrics): %s", path)
+					return filepath.SkipDir
+				}
 				return nil
 			}
 

From 44bd32e153d8920ab84b2cfd125ffc8a584ef93b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Apr 2026 04:39:19 +0000
Subject: [PATCH 4/7] fix: address code review feedback (comment clarity + file
 permissions)

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/3c72016d-4028-4648-88f8-6ab62aa01277

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/copilot_metrics_fix_test.go | 4 ++--
 pkg/cli/logs_metrics.go             | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pkg/cli/copilot_metrics_fix_test.go b/pkg/cli/copilot_metrics_fix_test.go
index 23705e87858..b7043c552cd 100644
--- a/pkg/cli/copilot_metrics_fix_test.go
+++ b/pkg/cli/copilot_metrics_fix_test.go
@@ -33,7 +33,7 @@ func TestExtractLogMetricsExcludesWorkflowLogsDir(t *testing.T) {
 	tempDir := t.TempDir()
 
 	// Simulate a Copilot-CLI run directory
-	require.NoError(t, os.WriteFile(filepath.Join(tempDir, "aw_info.json"), []byte(`{"engine_id":"copilot"}`), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(tempDir, "aw_info.json"), []byte(`{"engine_id":"copilot"}`), 0600))
 
 	// The single JSON data block that represents one LLM API call with 1000 tokens.
 	oneTurn := `2025-09-26T11:00:00Z [DEBUG] data:
@@ -44,7 +44,7 @@ func TestExtractLogMetricsExcludesWorkflowLogsDir(t *testing.T) {
 2025-09-26T11:00:01Z [DEBUG] Workflow done`
 
 	// Primary agent log — the "source of truth" artifact
-	require.NoError(t, os.WriteFile(filepath.Join(tempDir, "agent.log"), []byte(oneTurn), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(tempDir, "agent.log"), []byte(oneTurn), 0600))
 
 	// Simulate workflow-logs/ as produced by downloadWorkflowRunLogs.
 	// Two step-output files: one .log and one *log*.txt (both would have matched
diff --git a/pkg/cli/logs_metrics.go b/pkg/cli/logs_metrics.go
index 52a6a97ab97..e36b9dc56d7 100644
--- a/pkg/cli/logs_metrics.go
+++ b/pkg/cli/logs_metrics.go
@@ -170,9 +170,9 @@ func extractLogMetrics(logDir string, verbose bool, workflowPath ...string) (Log
 			}
 
 			// Skip directories. workflow-logs/ is explicitly excluded because it contains
-			// GitHub Actions runner captures of each job/step's stdout, NOT the agent
-			// artifact data. Parsing those files would double-count token usage and turns
-			// since the same agent session output appears in both the agent artifact
+			// GitHub Actions runner captures of each job/step's stdout rather than the agent
+			// artifact data. Parsing those files would double-count token usage and turns;
+			// the same agent session output appears in both the agent artifact
 			// (e.g. agent-stdio.log) and the workflow run logs (workflow-logs/).
 			if info.IsDir() {
 				if info.Name() == "workflow-logs" {

From ee21de29c75952cf72d53b7b507cd31e99c0dd73 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 14 Apr 2026 11:43:45 +0000
Subject: [PATCH 5/7] docs(adr): add draft ADR-26148 for deterministic audit
 metrics

---
 ...tic-audit-metrics-via-run-summary-cache.md | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 docs/adr/26148-deterministic-audit-metrics-via-run-summary-cache.md

diff --git a/docs/adr/26148-deterministic-audit-metrics-via-run-summary-cache.md b/docs/adr/26148-deterministic-audit-metrics-via-run-summary-cache.md
new file mode 100644
index 00000000000..3c1b097b5a2
--- /dev/null
+++ b/docs/adr/26148-deterministic-audit-metrics-via-run-summary-cache.md
@@ -0,0 +1,82 @@
+# ADR-26148: Deterministic Audit Metrics via run_summary.json Cache and workflow-logs/ Exclusion
+
+**Date**: 2026-04-14
+**Status**: Draft
+**Deciders**: pelikhan, Copilot
+
+---
+
+## Part 1 — Narrative (Human-Friendly)
+
+### Context
+
+The `audit` command reported wildly inconsistent `token_usage` and `turns` on repeated invocations for the same workflow run (observed: 9 turns / 381k tokens on one call, 22 turns / 4.7M tokens on another). Two compounding bugs caused this: (1) `AuditWorkflowRun` unconditionally re-processed all local log files on every call, even when a fully-computed `run_summary.json` was already on disk; and (2) the log-file walk in `extractLogMetrics` did not exclude the `workflow-logs/` directory, which `downloadWorkflowRunLogs` populates with GitHub Actions step-output — files that capture the same agent stdout already present in the agent artifact logs, inflating token counts by approximately 12×.
+
+### Decision
+
+We will adopt a **cache-first strategy** for `AuditWorkflowRun`: before performing any API calls or log processing, check whether a valid `run_summary.json` exists on disk (validated by CLI version). If a cache hit is found, reconstruct `ProcessedRun` from the cached summary and return immediately via a shared `renderAuditReport` helper, bypassing all re-download and re-parse logic. We will additionally **exclude the `workflow-logs/` directory** from the `extractLogMetrics` log walk by returning `filepath.SkipDir` whenever the walk visits a directory named `workflow-logs`, preventing GitHub Actions runner captures from being counted as agent artifact data. Together, these two changes ensure that repeated `audit` calls for the same run produce identical metrics.
+
+### Alternatives Considered
+
+#### Alternative 1: Invalidate and Overwrite the Cache on Every Call
+
+Rather than treating the cached `run_summary.json` as authoritative, re-process logs on every call and overwrite the cache. This would keep the cache "fresh" but would perpetuate the inconsistency problem: log re-processing can produce different values depending on which files are present at the time (e.g., if `workflow-logs/` has been populated between calls). This was rejected because consistency of audit metrics across repeated calls is the primary requirement.
+
+#### Alternative 2: Exclude `workflow-logs/` Files by Name Pattern Instead of Directory Skip
+
+Rather than skipping the entire `workflow-logs/` directory with `filepath.SkipDir`, selectively exclude individual files whose names match known GitHub Actions runner-log patterns (e.g., `*_Run log step.txt`). This would be fragile: GitHub Actions file naming conventions can change, and any unrecognized file would silently inflate metrics again. Skipping the entire directory by name is simpler, robust, and aligns with how `downloadWorkflowRunLogs` places its output.
+
+#### Alternative 3: Store Canonical Metrics in a Separate Lock File
+
+Record only the metrics (token usage, turns) in a dedicated lock file separate from `run_summary.json`, and read that lock file on subsequent calls. This adds file-system complexity without meaningful benefit over reusing the existing `run_summary.json` structure. The current `loadRunSummary` already performs CLI-version validation, providing a clean automatic invalidation mechanism.
+
+### Consequences
+
+#### Positive
+- Repeated `audit` calls for the same run are now deterministic and produce identical output.
+- The cache-hit path avoids all API calls and file re-parsing, making subsequent audits significantly faster.
+- The `renderAuditReport` helper function eliminates the duplicated render + finalization logic that previously existed in both the fresh-download and (now) cache-hit code paths.
+- Cache invalidation on CLI upgrade is automatic via the existing `CLIVersion` check in `loadRunSummary`.
+
+#### Negative
+- The first successful `audit` call becomes the canonical source of truth. If log files were incomplete on the first run (e.g., partial download), the cached metrics will be wrong until the cache is manually cleared or the CLI is upgraded.
+- The `workflow-logs/` exclusion is a directory-name-based heuristic. If `downloadWorkflowRunLogs` ever changes the output directory name, the exclusion silently stops working.
+- Adding a new top-level helper (`renderAuditReport`) increases the surface area of the package's internal API.
+
+#### Neutral
+- The `run_summary.json` format is unchanged; only the read/write ordering is adjusted (save-before-render in the fresh-download path).
+- Existing tests for `loadRunSummary` and `saveRunSummary` remain valid; new regression tests were added for the cache-hit path and the `workflow-logs/` exclusion.
+
+---
+
+## Part 2 — Normative Specification (RFC 2119)
+
+> The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this section are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119).
+
+### Cache-First Audit Strategy
+
+1. Implementations **MUST** check for a valid `run_summary.json` on disk before initiating any API calls or log-file processing in `AuditWorkflowRun`.
+2. Implementations **MUST** treat a cache hit (valid `run_summary.json` with matching `CLIVersion`) as the authoritative source of metrics and return immediately without re-processing logs.
+3. Implementations **MUST NOT** overwrite an existing `run_summary.json` when serving a cache hit; the cached file **MUST** remain unmodified.
+4. Implementations **MUST** persist `run_summary.json` to disk before calling the render step in the fresh-download path, so that a render failure does not prevent future cache hits.
+5. Implementations **SHOULD** log a message (at the appropriate verbosity level) indicating that a cached summary is being used, including the original `ProcessedAt` timestamp.
+
+### Log Metric Extraction
+
+1. Implementations **MUST** skip the `workflow-logs/` directory (and its contents) when walking the run output directory in `extractLogMetrics`.
+2. Implementations **MUST** use `filepath.SkipDir` (or equivalent) to exclude the entire `workflow-logs/` subtree, not individual files within it.
+3. Implementations **MUST NOT** include token-usage data found in `workflow-logs/` in the `LogMetrics.TokenUsage` or `LogMetrics.Turns` totals.
+4. Implementations **MAY** log a debug message when skipping the `workflow-logs/` directory to aid in future diagnostics.
+
+### Shared Render Path
+
+1. Implementations **MUST** use a single shared function (currently `renderAuditReport`) to build and emit the audit report, invoked by both the cache-hit path and the fresh-download path.
+2. The shared render function **MUST NOT** re-extract metrics from log files; it **MUST** use only the metrics passed to it by the caller.
+
+### Conformance
+
+An implementation is considered conformant with this ADR if it satisfies all **MUST** and **MUST NOT** requirements above. Failure to meet any **MUST** or **MUST NOT** requirement constitutes non-conformance.
+
+---
+
+*This is a DRAFT ADR generated by the [Design Decision Gate](https://github.com/github/gh-aw/actions/runs/24396807146) workflow. The PR author must review, complete, and finalize this document before the PR can merge.*

From a748291ee17ed192cb9dbd16c02802baadf0c2c1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Apr 2026 14:35:19 +0000
Subject: [PATCH 6/7] fix: address review feedback - LogsPath override,
 improved cache test, fix comment

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/182b342d-dba2-4d38-965c-c10a304d2e31

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/audit.go      |  4 +++
 pkg/cli/audit_test.go | 78 +++++++++++++++++++++++++++----------------
 2 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/pkg/cli/audit.go b/pkg/cli/audit.go
index dcf8fbfdcae..9d3d5f19472 100644
--- a/pkg/cli/audit.go
+++ b/pkg/cli/audit.go
@@ -217,6 +217,10 @@ func AuditWorkflowRun(ctx context.Context, runID int64, owner, repo, hostname st
 			GitHubRateLimitUsage:    summary.GitHubRateLimitUsage,
 			JobDetails:              summary.JobDetails,
 		}
+		// Override the cached LogsPath with the current runOutputDir so that downstream
+		// file reads (created items, aw_info, etc.) resolve correctly even if the run
+		// directory has been moved or copied since the summary was first written.
+		processedRun.Run.LogsPath = runOutputDir
 		return renderAuditReport(processedRun, summary.Metrics, summary.MCPToolUsage, runOutputDir, owner, repo, hostname, verbose, parse, jsonOutput)
 	}
 
diff --git a/pkg/cli/audit_test.go b/pkg/cli/audit_test.go
index dea621c4590..fe1a8a23bee 100644
--- a/pkg/cli/audit_test.go
+++ b/pkg/cli/audit_test.go
@@ -5,6 +5,7 @@ package cli
 import (
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"os"
 	"path/filepath"
@@ -429,13 +430,19 @@ func TestAuditCachingBehavior(t *testing.T) {
 }
 
 // TestAuditUsesRunSummaryCache verifies that when a valid run_summary.json exists on disk,
-// the audit uses the cached metrics instead of re-processing local files.  This prevents the
-// inconsistency described in the "Audit shows inconsistent metrics on repeated calls for same
-// run" issue where the second audit call would overwrite the cached run_summary.json with
-// different (and potentially inflated) values from a re-parse of the same log files.
+// AuditWorkflowRun returns successfully using only cached data — without calling
+// fetchWorkflowRunMetadata (which would require a live GitHub API) and without
+// re-processing local log files.
+//
+// The test is structured so that, if the early-return cache path is removed, the function
+// would call fetchWorkflowRunMetadata → gh api → fail in the test environment (no credentials),
+// causing the test to fail.  Only the cache path can satisfy the call without network access.
 func TestAuditUsesRunSummaryCache(t *testing.T) {
 	tempDir := testutil.TempDir(t, "test-audit-cache-*")
-	runOutputDir := filepath.Join(tempDir, "run-99999")
+	// AuditWorkflowRun derives runOutputDir as <outputDir>/run-<runID>, so use tempDir as
+	// the outputDir and let the function build the subdirectory path.
+	const runID int64 = 99999
+	runOutputDir := filepath.Join(tempDir, fmt.Sprintf("run-%d", runID))
 	if err := os.MkdirAll(runOutputDir, 0755); err != nil {
 		t.Fatalf("Failed to create run directory: %v", err)
 	}
@@ -446,9 +453,17 @@ func TestAuditUsesRunSummaryCache(t *testing.T) {
 		t.Fatalf("Failed to write aw_info.json: %v", err)
 	}
 
-	// First audit result: low token usage from firewall logs (the "ground truth")
+	// Write a "poison" log file with grossly inflated token counts.  If the cache path is
+	// bypassed and log files are re-processed, these values would be counted and would
+	// overwrite the summary — but the test verifies that never happens.
+	poisonLog := strings.Repeat(`{"type":"agent_turn","usage":{"total_tokens":9999999}}`+"\n", 10)
+	if err := os.WriteFile(filepath.Join(runOutputDir, "agent-stdio.log"), []byte(poisonLog), 0644); err != nil {
+		t.Fatalf("Failed to write poison log: %v", err)
+	}
+
+	// Ground-truth metrics that were captured on the first (correct) audit pass
 	cachedRun := WorkflowRun{
-		DatabaseID:   99999,
+		DatabaseID:   runID,
 		WorkflowName: "GPL Dependency Cleaner",
 		Status:       "completed",
 		Conclusion:   "success",
@@ -463,7 +478,7 @@ func TestAuditUsesRunSummaryCache(t *testing.T) {
 
 	cachedSummary := &RunSummary{
 		CLIVersion:   GetVersion(),
-		RunID:        cachedRun.DatabaseID,
+		RunID:        runID,
 		ProcessedAt:  time.Now().Add(-time.Hour), // processed one hour ago
 		Run:          cachedRun,
 		Metrics:      cachedMetrics,
@@ -476,7 +491,6 @@ func TestAuditUsesRunSummaryCache(t *testing.T) {
 		t.Fatalf("Failed to save initial run summary: %v", err)
 	}
 
-	// Record file mtime before any re-audit
 	summaryPath := filepath.Join(runOutputDir, runSummaryFileName)
 	initialInfo, err := os.Stat(summaryPath)
 	if err != nil {
@@ -484,32 +498,39 @@ func TestAuditUsesRunSummaryCache(t *testing.T) {
 	}
 	initialModTime := initialInfo.ModTime()
 
-	// Load the summary as the audit code would do — this simulates the second call
-	loadedSummary, ok := loadRunSummary(runOutputDir, false)
-	if !ok {
-		t.Fatalf("loadRunSummary should have found the cached summary")
-	}
-
-	// The cached metrics must be preserved exactly — these are the "ground truth" firewall values
-	if loadedSummary.Metrics.TokenUsage != cachedMetrics.TokenUsage {
-		t.Errorf("Token usage mismatch: cached=%d, loaded=%d (cache was overwritten with different data)",
-			cachedMetrics.TokenUsage, loadedSummary.Metrics.TokenUsage)
-	}
-	if loadedSummary.Metrics.Turns != cachedMetrics.Turns {
-		t.Errorf("Turns mismatch: cached=%d, loaded=%d (cache was overwritten with different data)",
-			cachedMetrics.Turns, loadedSummary.Metrics.Turns)
+	// Call AuditWorkflowRun — the only way this can succeed in a test environment (no GitHub
+	// credentials) is if the early-return cache path is taken, skipping fetchWorkflowRunMetadata.
+	// WorkflowPath is empty in the cached summary, so renderAuditReport will not attempt any
+	// GitHub API calls for baseline comparison either.
+	ctx := t.Context()
+	if err := AuditWorkflowRun(ctx, runID, "", "", "", tempDir, false, false, false, 0, 0, nil); err != nil {
+		t.Fatalf("AuditWorkflowRun failed — cache path not taken (fetchWorkflowRunMetadata was probably called): %v", err)
 	}
 
-	// The run_summary.json must NOT have been modified — the cache should be read-only here
+	// The run_summary.json must NOT have been modified — the poison log must not have been parsed
 	currentInfo, err := os.Stat(summaryPath)
 	if err != nil {
-		t.Fatalf("Could not stat run_summary.json after load: %v", err)
+		t.Fatalf("Could not stat run_summary.json after AuditWorkflowRun: %v", err)
 	}
 	if !currentInfo.ModTime().Equal(initialModTime) {
-		t.Errorf("run_summary.json was modified after loadRunSummary (mtime changed from %v to %v): "+
+		t.Errorf("run_summary.json was modified (mtime changed from %v to %v): "+
 			"the audit must not overwrite the cache on repeated calls",
 			initialModTime, currentInfo.ModTime())
 	}
+
+	// Verify cached metrics are untouched — the poison log would have inflated these if parsed
+	loadedSummary, ok := loadRunSummary(runOutputDir, false)
+	if !ok {
+		t.Fatalf("loadRunSummary should still find a valid cached summary")
+	}
+	if loadedSummary.Metrics.TokenUsage != cachedMetrics.TokenUsage {
+		t.Errorf("Token usage mismatch: expected cached=%d, got=%d (poison log was parsed)",
+			cachedMetrics.TokenUsage, loadedSummary.Metrics.TokenUsage)
+	}
+	if loadedSummary.Metrics.Turns != cachedMetrics.Turns {
+		t.Errorf("Turns mismatch: expected cached=%d, got=%d",
+			cachedMetrics.Turns, loadedSummary.Metrics.Turns)
+	}
 }
 
 // TestRenderAuditReportUsesProvidedMetrics verifies that renderAuditReport renders the report
@@ -542,8 +563,9 @@ func TestRenderAuditReportUsesProvidedMetrics(t *testing.T) {
 		JobDetails:   []JobInfoWithDuration{},
 	}
 
-	// renderAuditReport should complete without error even without GitHub API access
-	// (no actual GitHub calls are made inside renderAuditReport)
+	// renderAuditReport should complete without error even without GitHub API access.
+	// No GitHub calls are made because WorkflowPath is empty, causing findPreviousSuccessfulWorkflowRuns
+	// to return early with an error before any network requests are issued.
 	err := renderAuditReport(processedRun, metrics, nil, runOutputDir, "", "", "", false, false, false)
 	if err != nil {
 		t.Errorf("renderAuditReport returned unexpected error: %v", err)

From a2a126672c56b6773059fa94b22290f561c0cdfd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Apr 2026 14:37:49 +0000
Subject: [PATCH 7/7] fix: simplify poison log and add parameter comments in
 TestAuditUsesRunSummaryCache

Agent-Logs-Url: https://github.com/github/gh-aw/sessions/182b342d-dba2-4d38-965c-c10a304d2e31

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 pkg/cli/audit_test.go | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/pkg/cli/audit_test.go b/pkg/cli/audit_test.go
index fe1a8a23bee..dab9aaed9d5 100644
--- a/pkg/cli/audit_test.go
+++ b/pkg/cli/audit_test.go
@@ -453,10 +453,10 @@ func TestAuditUsesRunSummaryCache(t *testing.T) {
 		t.Fatalf("Failed to write aw_info.json: %v", err)
 	}
 
-	// Write a "poison" log file with grossly inflated token counts.  If the cache path is
-	// bypassed and log files are re-processed, these values would be counted and would
+	// Write a "poison" log file with a grossly inflated token count.  If the cache path is
+	// bypassed and log files are re-processed, this value would be counted and would
 	// overwrite the summary — but the test verifies that never happens.
-	poisonLog := strings.Repeat(`{"type":"agent_turn","usage":{"total_tokens":9999999}}`+"\n", 10)
+	poisonLog := `{"type":"agent_turn","usage":{"total_tokens":9999999}}` + "\n"
 	if err := os.WriteFile(filepath.Join(runOutputDir, "agent-stdio.log"), []byte(poisonLog), 0644); err != nil {
 		t.Fatalf("Failed to write poison log: %v", err)
 	}
@@ -503,7 +503,20 @@ func TestAuditUsesRunSummaryCache(t *testing.T) {
 	// WorkflowPath is empty in the cached summary, so renderAuditReport will not attempt any
 	// GitHub API calls for baseline comparison either.
 	ctx := t.Context()
-	if err := AuditWorkflowRun(ctx, runID, "", "", "", tempDir, false, false, false, 0, 0, nil); err != nil {
+	if err := AuditWorkflowRun(
+		ctx,
+		runID,
+		"", // owner — empty: no explicit repo context, relies on gh CLI defaults
+		"", // repo
+		"", // hostname — empty: uses github.com
+		tempDir,
+		false, // verbose
+		false, // parse
+		false, // jsonOutput
+		0,     // jobID — 0: full-run audit (not job-specific)
+		0,     // stepNumber
+		nil,   // artifactSets
+	); err != nil {
 		t.Fatalf("AuditWorkflowRun failed — cache path not taken (fetchWorkflowRunMetadata was probably called): %v", err)
 	}