From d580e6e3ea92426323d72cea6dd9d62f5c389ab7 Mon Sep 17 00:00:00 2001 From: Brian Love Date: Sun, 17 May 2026 07:57:29 -0700 Subject: [PATCH] feat(posthog): schedule live telemetry quality checks --- .github/workflows/posthog-quality.yml | 48 +++++++ tools/posthog/README.md | 16 ++- tools/posthog/live-quality.spec.ts | 99 +++++++++++++- tools/posthog/live-quality.ts | 181 ++++++++++++++++++++++---- 4 files changed, 308 insertions(+), 36 deletions(-) create mode 100644 .github/workflows/posthog-quality.yml diff --git a/.github/workflows/posthog-quality.yml b/.github/workflows/posthog-quality.yml new file mode 100644 index 000000000..3adf2c96a --- /dev/null +++ b/.github/workflows/posthog-quality.yml @@ -0,0 +1,48 @@ +name: PostHog telemetry quality + +on: + workflow_dispatch: + inputs: + days: + description: Days of live events to inspect + required: false + default: '7' + limit_per_event: + description: Maximum events to sample per contracted event + required: false + default: '100' + schedule: + - cron: '23 14 * * *' + +concurrency: + group: posthog-telemetry-quality + cancel-in-progress: false + +permissions: + contents: read + +jobs: + live-quality: + name: Live telemetry contract and coverage + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6.0.2 + - uses: actions/setup-node@v6.3.0 + with: + node-version: 22 + cache: npm + - run: npm ci + - name: Run live telemetry quality check + env: + POSTHOG_PERSONAL_API_KEY: ${{ secrets.POSTHOG_PERSONAL_API_KEY }} + POSTHOG_HOST: https://us.i.posthog.com + POSTHOG_PROJECT_ID: ${{ secrets.POSTHOG_PROJECT_ID }} + QUALITY_DAYS: ${{ github.event.inputs.days || '7' }} + QUALITY_LIMIT_PER_EVENT: ${{ github.event.inputs.limit_per_event || '100' }} + run: | + set -euo pipefail + if [ -z "${POSTHOG_PERSONAL_API_KEY:-}" ] || [ -z "${POSTHOG_PROJECT_ID:-}" ]; then + echo "::error::POSTHOG_PERSONAL_API_KEY and POSTHOG_PROJECT_ID Actions secrets are required." 
+ exit 1 + fi + npx nx run posthog-tools:quality:live -- --days "$QUALITY_DAYS" --limit-per-event "$QUALITY_LIMIT_PER_EVENT" --require-critical-coverage diff --git a/tools/posthog/README.md b/tools/posthog/README.md index d79da2087..8ca1cd24a 100644 --- a/tools/posthog/README.md +++ b/tools/posthog/README.md @@ -52,11 +52,11 @@ Requires a **Personal API Key** with `dashboard:write`, `insight:write`, `cohort Env vars (see `.env.example` at repo root): -| Variable | Purpose | -|----------|---------| -| `POSTHOG_PERSONAL_API_KEY` | Personal API Key (Bearer) | -| `POSTHOG_HOST` | `https://us.i.posthog.com` (default) or your region | -| `POSTHOG_PROJECT_ID` | Numeric project id (visible in PostHog URL) | +| Variable | Purpose | +| -------------------------- | --------------------------------------------------- | +| `POSTHOG_PERSONAL_API_KEY` | Personal API Key (Bearer) | +| `POSTHOG_HOST` | `https://us.i.posthog.com` (default) or your region | +| `POSTHOG_PROJECT_ID` | Numeric project id (visible in PostHog URL) | **CI** uses the same key (write-scoped) for `--plan` only. **Production hardening TODO:** create a read-only Personal API Key for CI and add it as `POSTHOG_PERSONAL_API_KEY_READONLY` in GitHub Actions secrets. Local development continues using the write-scoped key for `--apply` and `--report`. @@ -65,8 +65,8 @@ Env vars (see `.env.example` at repo root): ```jsonc // tools/posthog/dashboards/developer-funnel.json { - "slug": "developer-funnel", // local id, stable across syncs - "posthog_id": null, // assigned on first sync; do not edit + "slug": "developer-funnel", // local id, stable across syncs + "posthog_id": null, // assigned on first sync; do not edit "name": "GTM · Developer funnel", "description": "Pageview → install → cockpit activation.", "tags": ["gtm", "developer-track"], @@ -99,6 +99,8 @@ Event names must match [`docs/gtm/taxonomy.md`](../../docs/gtm/taxonomy.md). 
The - `taxonomy.spec.ts` and `telemetry-contract.spec.ts` guard committed dashboard JSON against undocumented events, unsupported breakdowns, unsupported filters, runtime dashboard coverage drift, and forbidden sensitive runtime fields. - `npm run posthog:quality -- --days 7 --limit-per-event 25` samples recent live PostHog events and validates observed payloads against the same contract. It exits non-zero for missing required properties or forbidden sensitive properties, and prints warnings for non-contract fields. +- `npm run posthog:quality -- --days 7 --limit-per-event 100 --require-critical-coverage` also requires recent samples for critical install and runtime events. The scheduled `PostHog telemetry quality` workflow runs this thresholded check daily and supports manual dispatch. +- The live workflow requires Actions secrets named `POSTHOG_PERSONAL_API_KEY` and `POSTHOG_PROJECT_ID`. ## Sync semantics diff --git a/tools/posthog/live-quality.spec.ts b/tools/posthog/live-quality.spec.ts index bf604a89a..d1ffd2023 100644 --- a/tools/posthog/live-quality.spec.ts +++ b/tools/posthog/live-quality.spec.ts @@ -2,9 +2,11 @@ import { test } from 'node:test'; import assert from 'node:assert/strict'; import { analyzeTelemetryEvents, + analyzeTelemetryCoverage, fetchRecentContractEvents, formatLiveQualityReport, hasBlockingFindings, + type LiveCoverageRequirement, type LiveTelemetryEvent, } from './live-quality.js'; import { TELEMETRY_EVENT_CONTRACT } from './telemetry-contract.js'; @@ -38,7 +40,7 @@ test('analyzeTelemetryEvents flags missing required and forbidden properties', ( property: 'messages', kind: 'forbidden_property', }, - ], + ] ); assert.equal(hasBlockingFindings(findings), true); }); @@ -69,11 +71,29 @@ test('analyzeTelemetryEvents warns on non-contract properties but ignores PostHo property: 'accidental_extra', kind: 'unexpected_property', }, - ], + ] ); assert.equal(hasBlockingFindings(findings), false); }); +test('analyzeTelemetryEvents ignores PostHog 
attribution metadata', () => { + const findings = analyzeTelemetryEvents([ + { + event: 'marketing:cta_click', + timestamp: '2026-05-17T00:00:00Z', + properties: { + cta_id: 'hero_docs', + gclid: 'click-id', + fbclid: 'click-id', + utm_source: 'newsletter', + utm_campaign: 'launch', + }, + }, + ]); + + assert.deepEqual(findings, []); +}); + test('formatLiveQualityReport summarizes clean coverage and warnings', () => { const events: LiveTelemetryEvent[] = [ { @@ -98,6 +118,69 @@ test('formatLiveQualityReport summarizes clean coverage and warnings', () => { assert.match(report, /unexpected/); }); +test('analyzeTelemetryCoverage flags required events with no recent samples', () => { + const requirements: LiveCoverageRequirement[] = [ + { event: 'ngaf:runtime_request_created', minCount: 1 }, + { event: 'ngaf:stream_started', minCount: 2 }, + ]; + + const findings = analyzeTelemetryCoverage( + [ + { + event: 'ngaf:stream_started', + timestamp: '2026-05-17T00:00:00Z', + properties: { transport: 'langgraph' }, + }, + ], + requirements + ); + + assert.deepEqual( + findings.map((finding) => ({ + severity: finding.severity, + event: finding.event, + property: finding.property, + kind: finding.kind, + message: finding.message, + })), + [ + { + severity: 'error', + event: 'ngaf:runtime_request_created', + property: 'event_count', + kind: 'insufficient_event_coverage', + message: + 'ngaf:runtime_request_created has 0 recent events; expected at least 1', + }, + { + severity: 'error', + event: 'ngaf:stream_started', + property: 'event_count', + kind: 'insufficient_event_coverage', + message: 'ngaf:stream_started has 1 recent event; expected at least 2', + }, + ] + ); + assert.equal(hasBlockingFindings(findings), true); +}); + +test('formatLiveQualityReport includes coverage requirements', () => { + const report = formatLiveQualityReport({ + days: 7, + events: [], + findings: analyzeTelemetryCoverage( + [], + [{ event: 'ngaf:postinstall', minCount: 1 }] + ), + checkedEvents: 
['ngaf:postinstall'], + coverageRequirements: [{ event: 'ngaf:postinstall', minCount: 1 }], + }); + + assert.match(report, /\| Event \| Sampled events \| Required minimum \|/); + assert.match(report, /\| ngaf:postinstall \| 0 \| 1 \|/); + assert.match(report, /insufficient_event_coverage/); +}); + test('fetchRecentContractEvents requests each contract event with bounded limits', async () => { const calls: unknown[] = []; const client = { @@ -161,9 +244,17 @@ test('every contracted event can be analyzed without a bespoke case', () => { event, timestamp: '2026-05-17T00:00:00Z', properties: Object.fromEntries( - TELEMETRY_EVENT_CONTRACT[event].requiredProperties.map((property) => [property, 'x']), + TELEMETRY_EVENT_CONTRACT[event].requiredProperties.map((property) => [ + property, + 'x', + ]) ), })); - assert.equal(analyzeTelemetryEvents(events).some((finding) => finding.severity === 'error'), false); + assert.equal( + analyzeTelemetryEvents(events).some( + (finding) => finding.severity === 'error' + ), + false + ); }); diff --git a/tools/posthog/live-quality.ts b/tools/posthog/live-quality.ts index 8917ad06e..31145c838 100644 --- a/tools/posthog/live-quality.ts +++ b/tools/posthog/live-quality.ts @@ -13,7 +13,8 @@ export interface LiveTelemetryEvent { export type LiveQualityFindingKind = | 'missing_required_property' | 'forbidden_property' - | 'unexpected_property'; + | 'unexpected_property' + | 'insufficient_event_coverage'; export interface LiveQualityFinding { severity: 'error' | 'warning'; @@ -32,13 +33,55 @@ export interface FetchRecentContractEventsOptions { } export interface LiveTelemetryEventsClient { - GET(path: string, options: unknown): Promise<{ data?: { results?: LiveTelemetryEvent[] }; error?: unknown }>; + GET( + path: string, + options: unknown + ): Promise<{ data?: { results?: LiveTelemetryEvent[] }; error?: unknown }>; } +export interface LiveCoverageRequirement { + event: string; + minCount: number; +} + +export const 
DEFAULT_LIVE_COVERAGE_REQUIREMENTS: readonly LiveCoverageRequirement[] = + [ + { event: 'ngaf:postinstall', minCount: 1 }, + { event: 'ngaf:browser_chat_init', minCount: 1 }, + { event: 'ngaf:runtime_instance_created', minCount: 1 }, + { event: 'ngaf:runtime_request_created', minCount: 1 }, + { event: 'ngaf:stream_started', minCount: 1 }, + { event: 'ngaf:stream_ended', minCount: 1 }, + ]; + const INTERNAL_PROPERTY_NAMES = new Set([ + '_kx', + 'dclid', 'distinct_id', + 'epik', + 'fbclid', + 'gad_source', + 'gbraid', + 'gclid', + 'gclsrc', + 'igshid', + 'irclid', + 'li_fat_id', + 'mc_cid', + 'msclkid', + 'qclid', + 'rdt_cid', + 'sccid', + 'ttclid', + 'twclid', 'token', + 'utm_campaign', + 'utm_content', + 'utm_medium', + 'utm_source', + 'utm_term', 'uuid', + 'wbraid', ]); function isMissing(value: unknown): boolean { @@ -51,7 +94,7 @@ function isInternalProperty(property: string): boolean { export function analyzeTelemetryEvents( events: readonly LiveTelemetryEvent[], - contract = TELEMETRY_EVENT_CONTRACT, + contract = TELEMETRY_EVENT_CONTRACT ): LiveQualityFinding[] { const forbiddenProperties = new Set(TELEMETRY_FORBIDDEN_PROPERTIES); const findings: LiveQualityFinding[] = []; @@ -103,7 +146,40 @@ export function analyzeTelemetryEvents( return findings; } -export function hasBlockingFindings(findings: readonly LiveQualityFinding[]): boolean { +function pluralizeEvent(count: number): string { + return count === 1 ? 
'event' : 'events'; +} + +export function analyzeTelemetryCoverage( + events: readonly LiveTelemetryEvent[], + requirements: readonly LiveCoverageRequirement[] +): LiveQualityFinding[] { + const findings: LiveQualityFinding[] = []; + + for (const requirement of requirements) { + const count = events.filter( + (item) => item.event === requirement.event + ).length; + if (count < requirement.minCount) { + findings.push({ + severity: 'error', + kind: 'insufficient_event_coverage', + event: requirement.event, + property: 'event_count', + timestamp: 'n/a', + message: `${requirement.event} has ${count} recent ${pluralizeEvent( + count + )}; expected at least ${requirement.minCount}`, + }); + } + } + + return findings; +} + +export function hasBlockingFindings( + findings: readonly LiveQualityFinding[] +): boolean { return findings.some((finding) => finding.severity === 'error'); } @@ -127,7 +203,11 @@ export async function fetchRecentContractEvents({ }); if (response.error || response.data === undefined) { - throw new Error(`PostHog events query failed for ${event}: ${JSON.stringify(response.error)}`); + throw new Error( + `PostHog events query failed for ${event}: ${JSON.stringify( + response.error + )}` + ); } fetched.push(...(response.data.results ?? [])); } @@ -136,23 +216,44 @@ export async function fetchRecentContractEvents({ export function formatLiveQualityReport({ checkedEvents, + coverageRequirements = [], days, events, findings, }: { checkedEvents: readonly string[]; + coverageRequirements?: readonly LiveCoverageRequirement[]; days: number; events: readonly LiveTelemetryEvent[]; findings: readonly LiveQualityFinding[]; }): string { const lines: string[] = []; - lines.push(`Live telemetry quality — last ${days} ${days === 1 ? 'day' : 'days'}`); + const requiredMinimums = new Map( + coverageRequirements.map((requirement) => [ + requirement.event, + requirement.minCount, + ]) + ); + lines.push( + `Live telemetry quality — last ${days} ${days === 1 ? 
'day' : 'days'}` + ); lines.push(''); - lines.push('| Event | Sampled events |'); - lines.push('|-------|---------------:|'); + if (coverageRequirements.length > 0) { + lines.push('| Event | Sampled events | Required minimum |'); + lines.push('|-------|---------------:|-----------------:|'); + } else { + lines.push('| Event | Sampled events |'); + lines.push('|-------|---------------:|'); + } for (const event of checkedEvents) { const count = events.filter((item) => item.event === event).length; - lines.push(`| ${event} | ${count} |`); + if (coverageRequirements.length > 0) { + lines.push( + `| ${event} | ${count} | ${requiredMinimums.get(event) ?? ''} |` + ); + } else { + lines.push(`| ${event} | ${count} |`); + } } const errors = findings.filter((finding) => finding.severity === 'error'); @@ -165,7 +266,9 @@ export function formatLiveQualityReport({ lines.push(''); lines.push('## Errors'); for (const finding of errors) { - lines.push(`- ${finding.message} (${finding.timestamp})`); + lines.push( + `- ${finding.kind}: ${finding.message} (${finding.timestamp})` + ); } } @@ -173,16 +276,23 @@ export function formatLiveQualityReport({ lines.push(''); lines.push('## Warnings'); for (const finding of warnings) { - lines.push(`- ${finding.message} (${finding.timestamp})`); + lines.push( + `- ${finding.kind}: ${finding.message} (${finding.timestamp})` + ); } } return lines.join('\n'); } -function parseArgs(args: readonly string[]): { days: number; limitPerEvent: number } { +function parseArgs(args: readonly string[]): { + days: number; + limitPerEvent: number; + coverageRequirements: readonly LiveCoverageRequirement[]; +} { let days = 7; let limitPerEvent = 100; + let coverageRequirements: readonly LiveCoverageRequirement[] = []; for (let i = 0; i < args.length; i += 1) { const arg = args[i]; if (arg === '--days') { @@ -191,29 +301,40 @@ function parseArgs(args: readonly string[]): { days: number; limitPerEvent: numb } else if (arg === '--limit-per-event') { limitPerEvent 
= Number(args[i + 1]); i += 1; + } else if (arg === '--require-critical-coverage') { + coverageRequirements = DEFAULT_LIVE_COVERAGE_REQUIREMENTS; } else { throw new Error(`Unknown argument: ${arg}`); } } - if (!Number.isInteger(days) || days <= 0) throw new Error('--days must be a positive integer'); + if (!Number.isInteger(days) || days <= 0) + throw new Error('--days must be a positive integer'); if (!Number.isInteger(limitPerEvent) || limitPerEvent <= 0) { throw new Error('--limit-per-event must be a positive integer'); } - return { days, limitPerEvent }; + return { days, limitPerEvent, coverageRequirements }; } async function main(): Promise<number> { - let options: { days: number; limitPerEvent: number }; + let options: { + days: number; + limitPerEvent: number; + coverageRequirements: readonly LiveCoverageRequirement[]; + }; try { options = parseArgs(process.argv.slice(2)); } catch (err) { console.error(err instanceof Error ? err.message : err); - console.error('Usage: tsx tools/posthog/live-quality.ts [--days N] [--limit-per-event N]'); + console.error( + 'Usage: tsx tools/posthog/live-quality.ts [--days N] [--limit-per-event N] [--require-critical-coverage]' + ); return 1; } const checkedEvents = Object.keys(TELEMETRY_EVENT_CONTRACT).sort(); - const after = new Date(Date.now() - options.days * 24 * 60 * 60 * 1000).toISOString(); + const after = new Date( + Date.now() - options.days * 24 * 60 * 60 * 1000 + ).toISOString(); try { const events = await fetchRecentContractEvents({ client: ph() as LiveTelemetryEventsClient, @@ -221,16 +342,26 @@ async function main(): Promise<number> { after, limitPerEvent: options.limitPerEvent, }); - const findings = analyzeTelemetryEvents(events); - console.log(formatLiveQualityReport({ - checkedEvents, - days: options.days, - events, - findings, - })); + const findings = [ + ...analyzeTelemetryEvents(events), + ...analyzeTelemetryCoverage(events, options.coverageRequirements), + ]; + console.log( + formatLiveQualityReport({ + checkedEvents, 
+ coverageRequirements: options.coverageRequirements, + days: options.days, + events, + findings, + }) + ); return hasBlockingFindings(findings) ? 1 : 0; } catch (err) { - console.error(`Live telemetry quality check failed: ${err instanceof Error ? err.message : err}`); + console.error( + `Live telemetry quality check failed: ${ + err instanceof Error ? err.message : err + }` + ); return 1; } }