From 8566eacc26a5a618f800ee0fe0b68bc091a5640f Mon Sep 17 00:00:00 2001 From: Travis Rich Date: Wed, 15 Apr 2026 13:18:57 -0400 Subject: [PATCH 1/3] refactor ingestion and table --- client/containers/Pub/PubHeader/Download.tsx | 1 - infra/.env.test | 3 - pnpm-lock.yaml | 12 +- server/analytics/api.ts | 97 +++++++++++--- server/analytics/impactApi.ts | 84 ++++++++----- server/analytics/model.ts | 72 +++-------- server/analytics/summaryViews.ts | 64 +++++----- tools/migrateRedshift.ts | 126 ++++++++----------- utils/analytics/usePageOnce.ts | 2 - utils/api/schemas/analytics.ts | 34 ++--- 10 files changed, 263 insertions(+), 232 deletions(-) diff --git a/client/containers/Pub/PubHeader/Download.tsx b/client/containers/Pub/PubHeader/Download.tsx index 767cf6f2b5..e8752efbc0 100644 --- a/client/containers/Pub/PubHeader/Download.tsx +++ b/client/containers/Pub/PubHeader/Download.tsx @@ -71,7 +71,6 @@ const Download = (props: Props) => { communitySubdomain: communityData.subdomain, format: type.format, pubId: pubData.id, - isProd: locationData.isProd, }); if (type.format === 'formatted') { return download(formattedDownload.url); diff --git a/infra/.env.test b/infra/.env.test index 36ced00c38..6664540dd5 100644 --- a/infra/.env.test +++ b/infra/.env.test @@ -10,9 +10,6 @@ MAILCHIMP_API_KEY=xxx FIREBASE_SERVICE_ACCOUNT_BASE64=xxx CLOUDAMQP_URL=xxx -ALGOLIA_ID=ooo -ALGOLIA_KEY=ooo -ALGOLIA_SEARCH_KEY=ooo JWT_SIGNING_SECRET=shhhhhh BYPASS_CAPTCHA=true FIREBASE_TEST_DB_URL=http://localhost:9875?ns=pubpub-v6 diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8e96111a5f..69191cd279 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -6,7 +6,7 @@ settings: patchedDependencies: reakit: - hash: 31488077229fe3d19a71c66458b888c58eab18010632a29e3b5c485df77242a8 + hash: hpw5k5ors3jxzufoxbjqeo4iee path: patches/reakit.patch importers: @@ -525,7 +525,7 @@ importers: version: 1.0.9(react@16.14.0) reakit: specifier: 1.0.0-beta.14 - version: 1.0.0-beta.14(patch_hash=31488077229fe3d19a71c66458b888c58eab18010632a29e3b5c485df77242a8)(react-dom@16.14.0(react@16.14.0))(react@16.14.0) + version: 1.0.0-beta.14(patch_hash=hpw5k5ors3jxzufoxbjqeo4iee)(react-dom@16.14.0(react@16.14.0))(react@16.14.0) rebound: specifier: ^0.1.0 version: 0.1.0 @@ -23469,11 +23469,11 @@ snapshots: dependencies: picomatch: 2.3.1 - reakit-system@0.7.2(react-dom@16.14.0(react@16.14.0))(react@16.14.0)(reakit-utils@0.7.3(react-dom@16.14.0(react@16.14.0))(react@16.14.0))(reakit@1.0.0-beta.14(patch_hash=31488077229fe3d19a71c66458b888c58eab18010632a29e3b5c485df77242a8)(react-dom@16.14.0(react@16.14.0))(react@16.14.0)): + reakit-system@0.7.2(react-dom@16.14.0(react@16.14.0))(react@16.14.0)(reakit-utils@0.7.3(react-dom@16.14.0(react@16.14.0))(react@16.14.0))(reakit@1.0.0-beta.14(patch_hash=hpw5k5ors3jxzufoxbjqeo4iee)(react-dom@16.14.0(react@16.14.0))(react@16.14.0)): dependencies: react: 16.14.0 react-dom: 16.14.0(react@16.14.0) - reakit: 1.0.0-beta.14(patch_hash=31488077229fe3d19a71c66458b888c58eab18010632a29e3b5c485df77242a8)(react-dom@16.14.0(react@16.14.0))(react@16.14.0) + reakit: 1.0.0-beta.14(patch_hash=hpw5k5ors3jxzufoxbjqeo4iee)(react-dom@16.14.0(react@16.14.0))(react@16.14.0) reakit-utils: 0.7.3(react-dom@16.14.0(react@16.14.0))(react@16.14.0) reakit-utils@0.7.3(react-dom@16.14.0(react@16.14.0))(react@16.14.0): @@ -23481,13 +23481,13 @@ snapshots: react: 16.14.0 react-dom: 16.14.0(react@16.14.0) - reakit@1.0.0-beta.14(patch_hash=31488077229fe3d19a71c66458b888c58eab18010632a29e3b5c485df77242a8)(react-dom@16.14.0(react@16.14.0))(react@16.14.0): + reakit@1.0.0-beta.14(patch_hash=hpw5k5ors3jxzufoxbjqeo4iee)(react-dom@16.14.0(react@16.14.0))(react@16.14.0): dependencies: body-scroll-lock: 2.7.1 popper.js: 1.16.1 react: 16.14.0 react-dom: 16.14.0(react@16.14.0) - reakit-system: 0.7.2(react-dom@16.14.0(react@16.14.0))(react@16.14.0)(reakit-utils@0.7.3(react-dom@16.14.0(react@16.14.0))(react@16.14.0))(reakit@1.0.0-beta.14(patch_hash=31488077229fe3d19a71c66458b888c58eab18010632a29e3b5c485df77242a8)(react-dom@16.14.0(react@16.14.0))(react@16.14.0)) + reakit-system: 0.7.2(react-dom@16.14.0(react@16.14.0))(react@16.14.0)(reakit-utils@0.7.3(react-dom@16.14.0(react@16.14.0))(react@16.14.0))(reakit@1.0.0-beta.14(patch_hash=hpw5k5ors3jxzufoxbjqeo4iee)(react-dom@16.14.0(react@16.14.0))(react@16.14.0)) reakit-utils: 0.7.3(react-dom@16.14.0(react@16.14.0))(react@16.14.0) rebound@0.1.0: {} diff --git a/server/analytics/api.ts b/server/analytics/api.ts index d4dc8603df..44500c37fe 100644 --- a/server/analytics/api.ts +++ b/server/analytics/api.ts @@ -9,21 +9,82 @@ import { contract } from 'utils/api/contract'; import { enqueue } from './writeBuffer'; +// ─── Stitch dual-write (temporary for rollback safety) ────────────────────── + +/** Fire-and-forget POST to the old Stitch/Redshift webhook so we can rollback if needed. */ +function sendToStitch(payload: unknown) { + fetch(env.STITCH_WEBHOOK_URL, { + method: 'POST', + body: JSON.stringify(payload), + headers: { 'Content-Type': 'application/json' }, + }).catch(() => { + // Silently swallow — Stitch is best-effort during the transition period. + }); +} + const s = initServer(); -const toEventRecord = ( - payload: AnalyticsEventPayload, - enrichment: { country: string | null; countryCode: string | null }, -) => { +// ─── validation helpers ────────────────────────────────────────────────────── + +const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; + +/** Coerce a value to a valid UUID or null. Prevents bad UUIDs from killing the entire bulkCreate batch. */ +function sanitizeUuid(val: unknown): string | null { + return typeof val === 'string' && UUID_RE.test(val) ? val : null; +} + +const THIRTY_DAYS_MS = 30 * 24 * 60 * 60 * 1000; + +/** Returns true if the timestamp is within an acceptable range (not future, not >30 days old). */ +function isTimestampValid(ts: number): boolean { + const now = Date.now(); + return ts <= now && ts >= now - THIRTY_DAYS_MS; +} + +// ─── fields to strip (no longer stored) ────────────────────────────────────── + +const DROPPED_FIELDS = new Set([ + 'title', + 'country', + 'countryCode', + 'isProd', + 'communityName', + 'communitySubdomain', + 'pubTitle', + 'pubSlug', + 'collectionTitle', + 'collectionSlug', + 'collectionKind', + 'collectionIds', + 'primaryCollectionId', + 'pageTitle', + 'pageSlug', +]); + +// ─── transform payload → DB record ────────────────────────────────────────── + +const toEventRecord = (payload: AnalyticsEventPayload) => { const raw = payload as Record; - const { unique, collectionIds, timestamp, ...fields } = raw; + const { unique, timestamp, ...rest } = raw; + + // Strip fields that are no longer stored in the table + const fields: Record = {}; + for (const [key, value] of Object.entries(rest)) { + if (!DROPPED_FIELDS.has(key)) { + fields[key] = value; + } + } + + // Sanitize UUID fields to prevent a single bad value from failing the whole batch + fields.communityId = sanitizeUuid(fields.communityId); + fields.pubId = sanitizeUuid(fields.pubId); + fields.collectionId = sanitizeUuid(fields.collectionId); + fields.pageId = sanitizeUuid(fields.pageId); return { ...fields, - ...enrichment, - timestamp: new Date(timestamp as number), + createdAt: new Date(timestamp as number), isUnique: (unique as boolean | undefined) ?? null, - collectionIds: typeof collectionIds === 'string' ? collectionIds.split(',') : null, }; }; @@ -44,21 +105,23 @@ export const analyticsServer = s.router(contract.analytics, { }, ], handler: async ({ body: payload }) => { - // Only record events on the production deployment (or in tests) - // to avoid polluting analytics with localhost / staging page views. - // Checked server-side so it can't be spoofed by clients. - if (!env.PUBPUB_PRODUCTION && env.NODE_ENV !== 'test') { + // Dual-write to Stitch/Redshift for rollback safety (temporary). + // Sent unconditionally (before validation) to match old behavior exactly: + // { country, countryCode, ...payload } + const { timezone } = payload; + const { name: country = null, id: countryCode = null } = + getCountryForTimezone(timezone) || {}; + sendToStitch({ country, countryCode, ...payload }); + + // Reject events with unreasonable timestamps (future or >30 days old) + if (!isTimestampValid(payload.timestamp)) { return { status: 204, body: undefined, }; } - const { timezone } = payload; - const { name: country = null, id: countryCode = null } = - getCountryForTimezone(timezone) || {}; - - const record = toEventRecord(payload, { country, countryCode }); + const record = toEventRecord(payload); enqueue(record); diff --git a/server/analytics/impactApi.ts b/server/analytics/impactApi.ts index c1fc9a02f7..ca83564da1 100644 --- a/server/analytics/impactApi.ts +++ b/server/analytics/impactApi.ts @@ -5,6 +5,7 @@ * Mounted at /api/analytics-impact/... */ import { Router } from 'express'; +import { getCountryForTimezone } from 'countries-and-timezones'; import { QueryTypes } from 'sequelize'; import { sequelize } from 'server/sequelize'; @@ -49,6 +50,7 @@ function setCache(key: string, data: unknown) { type DailyRow = { date: string; pageViews: number; uniquePageViews: number }; type CountryRow = { country: string; countryCode: string; count: number }; +type TimezoneRow = { timezone: string; count: number }; type ReferrerRow = { referrer: string; count: number }; type CampaignRow = { campaign: string; count: number }; type TopPageRow = { pageTitle: string; path: string; count: number }; @@ -56,6 +58,28 @@ type TopPubRow = { pubTitle: string; pubId: string; views: number; downloads: nu type TopCollectionRow = { collectionTitle: string; collectionId: string; count: number }; type DeviceRow = { device_type: string; count: number }; +// ─── timezone → country mapping ────────────────────────────────────────────── + +/** Rolls up timezone-level rows into country-level totals using the npm package. */ +function rollUpTimezoneToCountries(rows: TimezoneRow[]): CountryRow[] { + const countryMap = new Map(); + for (const row of rows) { + const tz = getCountryForTimezone(row.timezone); + const key = tz ? tz.id : 'Unknown'; + const existing = countryMap.get(key); + if (existing) { + existing.count += Number(row.count); + } else { + countryMap.set(key, { + country: tz?.name ?? 'Unknown', + countryCode: tz?.id ?? '', + count: Number(row.count), + }); + } + } + return [...countryMap.values()].sort((a, b) => b.count - a.count); +} + // ─── scope filter builder ──────────────────────────────────────────────────── type Scope = { @@ -102,7 +126,7 @@ async function fetchSummary(scope: Scope, startDate: string, endDate: string) { return fetchSummaryFromRaw(scope, startDate, endDate); } - const [daily, countries, topPubs, topPages, topCollections, referrers, campaigns, devices] = + const [daily, timezoneRows, topPubs, topPages, topCollections, referrers, campaigns, devices] = await Promise.all([ // ── daily breakdown from matview sequelize.query( @@ -116,16 +140,15 @@ async function fetchSummary(scope: Scope, startDate: string, endDate: string) { ORDER BY date`, { replacements, type: QueryTypes.SELECT }, ), - // ── countries from matview - sequelize.query( + // ── countries from matview (timezone → country mapped in JS) + sequelize.query( `SELECT - country, - country_code AS "countryCode", + timezone, SUM(count)::int AS count - FROM analytics_daily_country + FROM analytics_daily_timezone WHERE ${mvWhere} - GROUP BY country, country_code - ORDER BY count DESC LIMIT 250`, + GROUP BY timezone + ORDER BY count DESC`, { replacements, type: QueryTypes.SELECT }, ), // ── top pubs from matview, JOIN Pubs for titles @@ -221,7 +244,7 @@ async function fetchSummary(scope: Scope, startDate: string, endDate: string) { totalUniqueVisits, totalDownloads, daily: dailyParsed, - countries: countries.map((c) => ({ ...c, count: Number(c.count) })), + countries: rollUpTimezoneToCountries(timezoneRows).slice(0, 250), topPubs: topPubs.map((p) => ({ ...p, views: Number(p.views), @@ -243,18 +266,18 @@ async function fetchSummaryFromRaw(scope: Scope, startDate: string, endDate: str const { clause: scopeClause, replacements: scopeReplacements } = scopeWhere(scope); const baseReplacements = { ...scopeReplacements, startDate, endDate }; - const dateFilter = `"timestamp" >= :startDate::date AND "timestamp" < (:endDate::date + interval '1 day')`; + const dateFilter = `"createdAt" >= :startDate::date AND "createdAt" < (:endDate::date + interval '1 day')`; const pageEvents = `event IN ('page','pub','collection','other')`; const baseWhere = `${scopeClause} AND ${dateFilter}`; const { clause: aeScopeClause } = scopeWhere(scope, 'ae'); - const aeDateFilter = `ae."timestamp" >= :startDate::date AND ae."timestamp" < (:endDate::date + interval '1 day')`; + const aeDateFilter = `ae."createdAt" >= :startDate::date AND ae."createdAt" < (:endDate::date + interval '1 day')`; const aeBaseWhere = `${aeScopeClause} AND ${aeDateFilter}`; const [ daily, [totalDlRow], - countries, + timezoneRows, topPubs, topPages, topCollections, @@ -264,7 +287,7 @@ async function fetchSummaryFromRaw(scope: Scope, startDate: string, endDate: str ] = await Promise.all([ sequelize.query( `SELECT - date_trunc('day', "timestamp")::date::text AS date, + date_trunc('day', "createdAt")::date::text AS date, COUNT(*) AS "pageViews", COUNT(*) FILTER (WHERE "isUnique" = true) AS "uniquePageViews" FROM "AnalyticsEvents" @@ -278,15 +301,14 @@ async function fetchSummaryFromRaw(scope: Scope, startDate: string, endDate: str WHERE ${baseWhere} AND event = 'download'`, { replacements: baseReplacements, type: QueryTypes.SELECT }, ), - sequelize.query( + sequelize.query( `SELECT - COALESCE(country, 'Unknown') AS country, - COALESCE("countryCode", '') AS "countryCode", + COALESCE(timezone, '') AS timezone, COUNT(*) AS count FROM "AnalyticsEvents" WHERE ${baseWhere} AND ${pageEvents} - GROUP BY country, "countryCode" - ORDER BY count DESC LIMIT 250`, + GROUP BY timezone + ORDER BY count DESC`, { replacements: baseReplacements, type: QueryTypes.SELECT }, ), sequelize.query( @@ -304,23 +326,27 @@ async function fetchSummaryFromRaw(scope: Scope, startDate: string, endDate: str ), sequelize.query( `SELECT - COALESCE("pageTitle", "pubTitle", "collectionTitle", path, '') AS "pageTitle", - COALESCE(path, '') AS path, + COALESCE(pg.title, p.title, c.title, ae.path, '') AS "pageTitle", + COALESCE(ae.path, '') AS path, COUNT(*) AS count - FROM "AnalyticsEvents" - WHERE ${baseWhere} AND ${pageEvents} - GROUP BY "pageTitle", "pubTitle", "collectionTitle", path + FROM "AnalyticsEvents" ae + LEFT JOIN "Pages" pg ON pg.id = ae."pageId" + LEFT JOIN "Pubs" p ON p.id = ae."pubId" + LEFT JOIN "Collections" c ON c.id = ae."collectionId" + WHERE ${aeBaseWhere} AND ae.event IN ('page','pub','collection','other') + GROUP BY pg.title, p.title, c.title, ae.path ORDER BY count DESC LIMIT 250`, { replacements: baseReplacements, type: QueryTypes.SELECT }, ), sequelize.query( `SELECT - COALESCE("collectionTitle", "collectionSlug", "collectionId"::text) AS "collectionTitle", - "collectionId"::text AS "collectionId", + COALESCE(c.title, ae."collectionId"::text) AS "collectionTitle", + ae."collectionId"::text AS "collectionId", COUNT(*) AS count - FROM "AnalyticsEvents" - WHERE ${baseWhere} AND "collectionId" IS NOT NULL AND event IN ('collection','pub') - GROUP BY "collectionTitle", "collectionSlug", "collectionId" + FROM "AnalyticsEvents" ae + LEFT JOIN "Collections" c ON c.id = ae."collectionId" + WHERE ${aeBaseWhere} AND ae."collectionId" IS NOT NULL AND ae.event IN ('collection','pub') + GROUP BY ae."collectionId", c.title ORDER BY count DESC LIMIT 250`, { replacements: baseReplacements, type: QueryTypes.SELECT }, ), @@ -376,7 +402,7 @@ async function fetchSummaryFromRaw(scope: Scope, startDate: string, endDate: str totalUniqueVisits, totalDownloads: parseInt(String((totalDlRow as any)?.totalDownloads ?? '0'), 10), daily: dailyParsed, - countries: countries.map((c) => ({ ...c, count: Number(c.count) })), + countries: rollUpTimezoneToCountries(timezoneRows).slice(0, 250), topPubs: topPubs.map((p) => ({ ...p, views: Number(p.views), diff --git a/server/analytics/model.ts b/server/analytics/model.ts index b03f5f5f6d..00338784cf 100644 --- a/server/analytics/model.ts +++ b/server/analytics/model.ts @@ -14,28 +14,30 @@ import { @Table({ updatedAt: false, + // Map Sequelize's auto-managed createdAt to our renamed column (was "timestamp") + createdAt: 'createdAt', indexes: [ { - name: 'analytics_events_community_event_ts', - fields: ['communityId', 'event', 'timestamp'], + name: 'analytics_events_community_event_created', + fields: ['communityId', 'event', 'createdAt'], }, - { name: 'analytics_events_pub_event_ts', fields: ['pubId', 'event', 'timestamp'] }, + { name: 'analytics_events_pub_event_created', fields: ['pubId', 'event', 'createdAt'] }, { - name: 'analytics_events_collection_event_ts', - fields: ['collectionId', 'event', 'timestamp'], + name: 'analytics_events_collection_event_created', + fields: ['collectionId', 'event', 'createdAt'], }, { - name: 'analytics_events_community_ts', - fields: ['communityId', 'timestamp'], + name: 'analytics_events_community_created', + fields: ['communityId', 'createdAt'], }, { name: 'analytics_events_community_pages', - fields: ['communityId', 'timestamp', 'isUnique'], + fields: ['communityId', 'createdAt', 'isUnique'], where: { event: { [Op.in]: ['page', 'pub', 'collection', 'other'] } }, }, { name: 'analytics_events_pub_views_dl', - fields: ['communityId', 'pubId', 'timestamp'], + fields: ['communityId', 'pubId', 'createdAt'], where: { pubId: { [Op.ne]: null }, event: { [Op.in]: ['pub', 'download'] }, @@ -60,9 +62,9 @@ export class AnalyticsEvent extends Model< @Column(DataType.TEXT) declare event: string; - @AllowNull(false) - @Column(DataType.DATE) - declare timestamp: Date; + // Sequelize auto-manages this via `createdAt: 'createdAt'` in Table options. + // For imported rows the value was preserved from the original Redshift "timestamp" column. + declare createdAt: CreationOptional; @Column(DataType.TEXT) declare referrer: string | null; @@ -107,28 +109,9 @@ export class AnalyticsEvent extends Model< @Column(DataType.UUID) declare communityId: string | null; - @Column(DataType.TEXT) - declare communitySubdomain: string | null; - - @Column(DataType.TEXT) - declare communityName: string | null; - - @AllowNull(false) - @Column(DataType.BOOLEAN) - declare isProd: boolean; - - @Column(DataType.TEXT) - declare country: string | null; - - @Column(DataType.TEXT) - declare countryCode: string | null; - @Column(DataType.TEXT) declare url: string | null; - @Column(DataType.TEXT) - declare title: string | null; - @Column(DataType.TEXT) declare hash: string | null; @@ -141,42 +124,15 @@ export class AnalyticsEvent extends Model< @Column(DataType.TEXT) declare path: string | null; - @Column(DataType.TEXT) - declare pageTitle: string | null; - @Column(DataType.UUID) declare pageId: string | null; - @Column(DataType.TEXT) - declare pageSlug: string | null; - - @Column(DataType.TEXT) - declare collectionTitle: string | null; - - @Column(DataType.TEXT) - declare collectionKind: string | null; - @Column(DataType.UUID) declare collectionId: string | null; - @Column(DataType.TEXT) - declare collectionSlug: string | null; - - @Column(DataType.TEXT) - declare pubTitle: string | null; - @Column(DataType.UUID) declare pubId: string | null; - @Column(DataType.TEXT) - declare pubSlug: string | null; - - @Column(DataType.ARRAY(DataType.TEXT)) - declare collectionIds: string[] | null; - - @Column(DataType.UUID) - declare primaryCollectionId: string | null; - @Column(DataType.TEXT) declare release: string | null; diff --git a/server/analytics/summaryViews.ts b/server/analytics/summaryViews.ts index 36945dfd68..c7e55bd7cd 100644 --- a/server/analytics/summaryViews.ts +++ b/server/analytics/summaryViews.ts @@ -21,32 +21,31 @@ const VIEWS: Array<{ name: string; createSql: string; indexSql: string }> = [ CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_summary AS SELECT "communityId", - date_trunc('day', "timestamp")::date AS date, + date_trunc('day', "createdAt")::date AS date, COUNT(*) FILTER (WHERE event IN ('page','pub','collection','other')) AS page_views, COUNT(*) FILTER (WHERE event IN ('page','pub','collection','other') AND "isUnique" = true) AS unique_page_views, COUNT(*) FILTER (WHERE event = 'download') AS downloads FROM "AnalyticsEvents" -GROUP BY "communityId", date_trunc('day', "timestamp")::date`, +GROUP BY "communityId", date_trunc('day', "createdAt")::date`, indexSql: ` CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_summary_uk ON analytics_daily_summary ("communityId", date)`, }, { - name: 'analytics_daily_country', + name: 'analytics_daily_timezone', createSql: ` -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_country AS +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_timezone AS SELECT "communityId", - date_trunc('day', "timestamp")::date AS date, - COALESCE(country, 'Unknown') AS country, - COALESCE("countryCode", '') AS country_code, + date_trunc('day', "createdAt")::date AS date, + COALESCE(timezone, '') AS timezone, COUNT(*) AS count FROM "AnalyticsEvents" WHERE event IN ('page','pub','collection','other') -GROUP BY "communityId", date_trunc('day', "timestamp")::date, country, "countryCode"`, +GROUP BY "communityId", date_trunc('day', "createdAt")::date, timezone`, indexSql: ` -CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_country_uk - ON analytics_daily_country ("communityId", date, country, country_code)`, +CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_timezone_uk + ON analytics_daily_timezone ("communityId", date, md5(timezone))`, }, { name: 'analytics_daily_pub', @@ -55,12 +54,12 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_pub AS SELECT "communityId", "pubId", - date_trunc('day', "timestamp")::date AS date, + date_trunc('day', "createdAt")::date AS date, COUNT(*) FILTER (WHERE event = 'pub') AS views, COUNT(*) FILTER (WHERE event = 'download') AS downloads FROM "AnalyticsEvents" WHERE "pubId" IS NOT NULL AND event IN ('pub','download') -GROUP BY "communityId", "pubId", date_trunc('day', "timestamp")::date`, +GROUP BY "communityId", "pubId", date_trunc('day', "createdAt")::date`, indexSql: ` CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_pub_uk ON analytics_daily_pub ("communityId", "pubId", date)`, @@ -72,11 +71,11 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_collection AS SELECT "communityId", "collectionId", - date_trunc('day', "timestamp")::date AS date, + date_trunc('day', "createdAt")::date AS date, COUNT(*) AS count FROM "AnalyticsEvents" WHERE "collectionId" IS NOT NULL AND event IN ('collection','pub') -GROUP BY "communityId", "collectionId", date_trunc('day', "timestamp")::date`, +GROUP BY "communityId", "collectionId", date_trunc('day', "createdAt")::date`, indexSql: ` CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_collection_uk ON analytics_daily_collection ("communityId", "collectionId", date)`, @@ -91,12 +90,12 @@ CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_collection_uk CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_referrer AS SELECT "communityId", - date_trunc('day', "timestamp")::date AS date, + date_trunc('day', "createdAt")::date AS date, COALESCE(LEFT(referrer, 500), 'Direct') AS referrer, COUNT(*) AS count FROM "AnalyticsEvents" WHERE event IN ('page','pub','collection','other') -GROUP BY "communityId", date_trunc('day', "timestamp")::date, COALESCE(LEFT(referrer, 500), 'Direct')`, +GROUP BY "communityId", date_trunc('day', "createdAt")::date, COALESCE(LEFT(referrer, 500), 'Direct')`, indexSql: ` CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_referrer_uk ON analytics_daily_referrer ("communityId", date, md5(referrer))`, @@ -107,12 +106,12 @@ CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_referrer_uk CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_campaign AS SELECT "communityId", - date_trunc('day', "timestamp")::date AS date, + date_trunc('day', "createdAt")::date AS date, "utmCampaign" AS campaign, COUNT(*) AS count FROM "AnalyticsEvents" WHERE "utmCampaign" IS NOT NULL AND "utmCampaign" != '' -GROUP BY "communityId", date_trunc('day', "timestamp")::date, "utmCampaign"`, +GROUP BY "communityId", date_trunc('day', "createdAt")::date, "utmCampaign"`, indexSql: ` CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_campaign_uk ON analytics_daily_campaign ("communityId", date, md5(campaign))`, @@ -122,16 +121,19 @@ CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_campaign_uk createSql: ` CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_page AS SELECT - "communityId", - date_trunc('day', "timestamp")::date AS date, - LEFT(COALESCE("pageTitle", "pubTitle", "collectionTitle", path, ''), 300) AS page_title, - LEFT(COALESCE(path, ''), 300) AS path, + ae."communityId", + date_trunc('day', ae."createdAt")::date AS date, + LEFT(COALESCE(pg.title, p.title, c.title, ae.path, ''), 300) AS page_title, + LEFT(COALESCE(ae.path, ''), 300) AS path, COUNT(*) AS count -FROM "AnalyticsEvents" -WHERE event IN ('page','pub','collection','other') -GROUP BY "communityId", date_trunc('day', "timestamp")::date, - LEFT(COALESCE("pageTitle", "pubTitle", "collectionTitle", path, ''), 300), - LEFT(COALESCE(path, ''), 300)`, +FROM "AnalyticsEvents" ae +LEFT JOIN "Pages" pg ON pg.id = ae."pageId" +LEFT JOIN "Pubs" p ON p.id = ae."pubId" +LEFT JOIN "Collections" c ON c.id = ae."collectionId" +WHERE ae.event IN ('page','pub','collection','other') +GROUP BY ae."communityId", date_trunc('day', ae."createdAt")::date, + LEFT(COALESCE(pg.title, p.title, c.title, ae.path, ''), 300), + LEFT(COALESCE(ae.path, ''), 300)`, indexSql: ` CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_page_uk ON analytics_daily_page ("communityId", date, md5(page_title || '|' || path))`, @@ -142,7 +144,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS analytics_daily_page_uk CREATE MATERIALIZED VIEW IF NOT EXISTS analytics_daily_device AS SELECT "communityId", - date_trunc('day', "timestamp")::date AS date, + date_trunc('day', "createdAt")::date AS date, CASE WHEN os IN ('iOS','Android') THEN 'Mobile' WHEN os IN ('Windows','MacOS','Linux','UNIX','ChromeOS') THEN 'Desktop' @@ -152,7 +154,7 @@ SELECT COUNT(*) AS count FROM "AnalyticsEvents" WHERE event IN ('page','pub','collection','other') -GROUP BY "communityId", date_trunc('day', "timestamp")::date, +GROUP BY "communityId", date_trunc('day', "createdAt")::date, CASE WHEN os IN ('iOS','Android') THEN 'Mobile' WHEN os IN ('Windows','MacOS','Linux','UNIX','ChromeOS') THEN 'Desktop' @@ -187,6 +189,9 @@ export async function createSummaryViews() { await sequelize.query(` CREATE INDEX IF NOT EXISTS analytics_daily_page_comm_date ON analytics_daily_page ("communityId", date)`); + await sequelize.query(` + CREATE INDEX IF NOT EXISTS analytics_daily_timezone_comm_date + ON analytics_daily_timezone ("communityId", date)`); } /** @@ -202,6 +207,7 @@ export async function createSummaryViews() { const CLUSTER_INDEX: Record = { analytics_daily_referrer: 'analytics_daily_referrer_comm_date', analytics_daily_page: 'analytics_daily_page_comm_date', + analytics_daily_timezone: 'analytics_daily_timezone_comm_date', }; export async function refreshSummaryViews() { diff --git a/tools/migrateRedshift.ts b/tools/migrateRedshift.ts index 5fbc23e57e..e64cf848e9 100644 --- a/tools/migrateRedshift.ts +++ b/tools/migrateRedshift.ts @@ -108,7 +108,7 @@ CREATE TABLE IF NOT EXISTS "AnalyticsEvents" ( id uuid PRIMARY KEY DEFAULT gen_random_uuid(), type text NOT NULL, event text NOT NULL, - "timestamp" timestamptz NOT NULL, + "createdAt" timestamptz NOT NULL DEFAULT now(), referrer text, "isUnique" boolean, search text, @@ -122,53 +122,37 @@ CREATE TABLE IF NOT EXISTS "AnalyticsEvents" ( "userAgent" text NOT NULL, os text NOT NULL, "communityId" uuid, - "communitySubdomain" text, - "communityName" text, - "isProd" boolean NOT NULL, - country text, - "countryCode" text, url text, - title text, hash text, height integer, width integer, path text, - "pageTitle" text, "pageId" uuid, - "pageSlug" text, - "collectionTitle" text, - "collectionKind" text, "collectionId" uuid, - "collectionSlug" text, - "pubTitle" text, "pubId" uuid, - "pubSlug" text, - "collectionIds" text[], - "primaryCollectionId" uuid, release text, - format text, - "createdAt" timestamptz NOT NULL DEFAULT now() + format text ); -CREATE INDEX IF NOT EXISTS "analytics_events_community_event_ts" - ON "AnalyticsEvents" ("communityId", event, "timestamp"); -CREATE INDEX IF NOT EXISTS "analytics_events_pub_event_ts" - ON "AnalyticsEvents" ("pubId", event, "timestamp"); -CREATE INDEX IF NOT EXISTS "analytics_events_collection_event_ts" - ON "AnalyticsEvents" ("collectionId", event, "timestamp"); +CREATE INDEX IF NOT EXISTS "analytics_events_community_event_created" + ON "AnalyticsEvents" ("communityId", event, "createdAt"); +CREATE INDEX IF NOT EXISTS "analytics_events_pub_event_created" + ON "AnalyticsEvents" ("pubId", event, "createdAt"); +CREATE INDEX IF NOT EXISTS "analytics_events_collection_event_created" + ON "AnalyticsEvents" ("collectionId", event, "createdAt"); -- Optimized index for dashboard queries: all filter by communityId + time range -CREATE INDEX IF NOT EXISTS "analytics_events_community_ts" - ON "AnalyticsEvents" ("communityId", "timestamp"); +CREATE INDEX IF NOT EXISTS "analytics_events_community_created" + ON "AnalyticsEvents" ("communityId", "createdAt"); -- Partial covering index for the common page-view aggregations CREATE INDEX IF NOT EXISTS "analytics_events_community_pages" - ON "AnalyticsEvents" ("communityId", "timestamp", "isUnique") + ON "AnalyticsEvents" ("communityId", "createdAt", "isUnique") WHERE event IN ('page','pub','collection','other'); -- Partial index for pub-scoped views + downloads CREATE INDEX IF NOT EXISTS "analytics_events_pub_views_dl" - ON "AnalyticsEvents" ("communityId", "pubId", "timestamp") + ON "AnalyticsEvents" ("communityId", "pubId", "createdAt") WHERE "pubId" IS NOT NULL AND event IN ('pub','download'); `; @@ -225,7 +209,17 @@ CREATE UNLOGGED TABLE analytics_staging ( ); `; -const TRANSFORM_SQL = ` +/** + * Build the TRANSFORM SQL with an optional cutoff timestamp. + * If cutoffTs is provided, Redshift rows with a timestamp >= cutoffTs are skipped + * to avoid importing duplicates of events already written directly to PG. + */ +function buildTransformSql(cutoffTs: string | null): string { + const cutoffClause = cutoffTs + ? `\n AND pg_temp.safe_ts(s."timestamp") < '${cutoffTs}'::timestamptz` + : ''; + + return ` CREATE OR REPLACE FUNCTION pg_temp.safe_uuid(val text) RETURNS uuid AS $$ SELECT CASE WHEN val ~ '^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$' @@ -247,7 +241,7 @@ $$ LANGUAGE sql IMMUTABLE; CREATE OR REPLACE FUNCTION pg_temp.is_spam(VARIADIC vals text[]) RETURNS boolean AS $$ SELECT EXISTS ( SELECT 1 FROM unnest(vals) v - WHERE v ~* '(= '2016-01-01'::timestamptz - AND pg_temp.safe_ts(s."timestamp") <= (now() + interval '1 day') + AND pg_temp.safe_ts(s."timestamp") <= (now() + interval '1 day')${cutoffClause} ON CONFLICT (id) DO NOTHING; `; +} // ─── main ──────────────────────────────────────────────────────────────────── @@ -365,8 +330,8 @@ DELETE FROM "AnalyticsEvents" WHERE referrer ~* '( (now() + interval '1 day'); + OR "createdAt" < '2016-01-01'::timestamptz + OR "createdAt" > (now() + interval '1 day'); `; async function main() { @@ -394,6 +359,23 @@ async function main() { log(` table already has ${existingCount} rows, duplicates will be skipped`); } + // Determine cutoff: if the PG table already has rows (from direct writes), + // skip any Redshift data with a timestamp >= the earliest PG row to avoid + // importing duplicates of events that were dual-written. + let cutoffTs: string | null = null; + if (existingCount > 0) { + const [minRow] = await sequelize.query<{ min_ts: string }>( + 'SELECT MIN("createdAt")::text AS min_ts FROM "AnalyticsEvents"', + { type: 'SELECT' as any }, + ); + cutoffTs = (minRow as any)?.min_ts ?? null; + if (cutoffTs) { + log(` cutoff: skipping Redshift rows with timestamp >= ${cutoffTs}`); + } + } + + const transformSql = buildTransformSql(cutoffTs); + // Steps 3-5: process each CSV file one at a time to keep peak disk usage // low. For each file: create staging → COPY in → transform to final table // → drop staging. This avoids needing disk space for ALL rows in staging @@ -429,7 +411,7 @@ async function main() { ); // Transform and insert into final table - await sequelize.query(TRANSFORM_SQL); + await sequelize.query(transformSql); // Drop staging to free disk space before next file await sequelize.query('DROP TABLE IF EXISTS analytics_staging'); diff --git a/utils/analytics/usePageOnce.ts b/utils/analytics/usePageOnce.ts index e7c12b7aeb..63433653e1 100644 --- a/utils/analytics/usePageOnce.ts +++ b/utils/analytics/usePageOnce.ts @@ -64,7 +64,6 @@ const determinePayload = ( communityId: pubData.communityId, communityName: communityData.title, communitySubdomain: communityData.subdomain, - isProd: locationData.isProd, release: pubData.isRelease && pubData.releaseNumber ? pubData.releaseNumber @@ -76,7 +75,6 @@ const determinePayload = ( communityId: communityData.id, communityName: communityData.title, communitySubdomain: communityData.subdomain, - isProd: locationData.isProd, }; const collection = scopeData?.elements?.activeCollection; diff --git a/utils/api/schemas/analytics.ts b/utils/api/schemas/analytics.ts index 907885282b..38b720f6aa 100644 --- a/utils/api/schemas/analytics.ts +++ b/utils/api/schemas/analytics.ts @@ -23,17 +23,18 @@ export const baseSchema = z.object({ /** Information that should always be included in any event payload */ export const sharedEventPayloadSchema = z.object({ communityId: z.string().uuid(), - // if it's null, it 'www.pubpub.org' - communitySubdomain: z.string(), - communityName: z.string(), - isProd: z.boolean(), + // Dropped columns — accepted for backward compat with cached clients but not stored + communitySubdomain: z.string().optional(), + communityName: z.string().optional(), + isProd: z.boolean().optional(), }); export const basePageViewSchema = baseSchema.merge( z.object({ type: z.literal('page'), url: z.string().url(), - title: z.string(), + // Dropped column — accepted for backward compat but not stored + title: z.string().optional(), hash: z.string().optional(), height: z.number().int(), width: z.number().int(), @@ -44,9 +45,9 @@ export const basePageViewSchema = baseSchema.merge( export const sharedPageViewPayloadSchema = sharedEventPayloadSchema.merge( z.object({ communityId: z.string().uuid().nullable(), - // if it's null, it 'www.pubpub.org' - communitySubdomain: z.string().nullable().default('www'), - communityName: z.string().nullable().default('pubpub'), + // Dropped columns — accepted for backward compat but not stored + communitySubdomain: z.string().nullable().optional(), + communityName: z.string().nullable().optional(), event: z.enum(['page', 'collection', 'pub', 'other']), }), ); @@ -61,19 +62,21 @@ export const otherPageViewPayloadSchema = sharedPageViewPayloadSchema.merge( export const pagePageViewPayloadSchema = sharedPageViewPayloadSchema.merge( z.object({ event: z.literal('page'), - pageTitle: z.string(), + // Dropped columns — accepted for backward compat but not stored + pageTitle: z.string().optional(), pageId: z.string(), - pageSlug: z.string(), + pageSlug: z.string().optional(), }), ); export const collectionPageViewPayloadSchema = sharedPageViewPayloadSchema.merge( z.object({ event: z.literal('collection'), - collectionTitle: z.string(), - collectionKind: z.enum(['issue', 'conference', 'book', 'tag']), + // Dropped columns — accepted for backward compat but not stored + collectionTitle: z.string().optional(), + collectionKind: z.enum(['issue', 'conference', 'book', 'tag']).optional(), collectionId: z.string().uuid(), - collectionSlug: z.string(), + collectionSlug: z.string().optional(), }), ); @@ -81,9 +84,10 @@ export const pubPageViewPayloadSchema = sharedPageViewPayloadSchema .merge( z.object({ event: z.literal('pub'), - pubTitle: z.string(), + // Dropped columns — accepted for backward compat but not stored + pubTitle: z.string().optional(), pubId: z.string().uuid(), - pubSlug: z.string(), + pubSlug: z.string().optional(), collectionIds: z .string() .regex(/^[a-f0-9-]+(,[a-f0-9-]+)*$/) From 62d51802b2dea29ea85659cee64057eed516ae71 Mon Sep 17 00:00:00 2001 From: Travis Rich Date: Wed, 15 Apr 2026 13:19:10 -0400 Subject: [PATCH 2/3] lint --- server/analytics/impactApi.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/analytics/impactApi.ts b/server/analytics/impactApi.ts index ca83564da1..cd9fd66040 100644 --- a/server/analytics/impactApi.ts +++ b/server/analytics/impactApi.ts @@ -4,8 +4,9 @@ * * Mounted at /api/analytics-impact/... */ -import { Router } from 'express'; + import { getCountryForTimezone } from 'countries-and-timezones'; +import { Router } from 'express'; import { QueryTypes } from 'sequelize'; import { sequelize } from 'server/sequelize'; From c9838e3109192b0dffb521a87cdf0c7160c469c7 Mon Sep 17 00:00:00 2001 From: Travis Rich Date: Wed, 15 Apr 2026 13:35:11 -0400 Subject: [PATCH 3/3] Rename table --- server/analyticsCloudflareCache/model.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/server/analyticsCloudflareCache/model.ts b/server/analyticsCloudflareCache/model.ts index 9a54c216eb..11f518488d 100644 --- a/server/analyticsCloudflareCache/model.ts +++ b/server/analyticsCloudflareCache/model.ts @@ -12,9 +12,8 @@ import { AllowNull, Column, DataType, Model, PrimaryKey, Table } from 'sequelize * Past days are cached permanently (expiresAt = null). * Today's partial data is cached with a short TTL (expiresAt = now + 1h). * - * Note: tableName kept as 'AnalyticsDailyCaches' to avoid a DB migration. */ -@Table({ tableName: 'AnalyticsDailyCaches', timestamps: false }) +@Table({ timestamps: false }) export class AnalyticsCloudflareCache extends Model< InferAttributes, InferCreationAttributes