From de0c62283f6ababf7de5bb1a882b0b282f293333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 21 Apr 2026 18:29:49 +0200 Subject: [PATCH 01/48] Install xml-js --- package-lock.json | 33 ++++++++++++++++----------------- package.json | 3 ++- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3e11f1cf9..2292b55ce 100644 --- a/package-lock.json +++ b/package-lock.json @@ -58,7 +58,8 @@ "swagger-ui-express": "^5.0.1", "turndown": "^7.2.1", "winston": "^3.17.0", - "winston-mail": "^2.0.0" + "winston-mail": "^2.0.0", + "xml-js": "^1.6.11" }, "bin": { "ota": "bin/ota.js" @@ -1273,7 +1274,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=20.19.0" }, @@ -1320,7 +1320,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=20.19.0" } @@ -1639,7 +1638,6 @@ "integrity": "sha512-jCs9ldd7NwzpgXDIf6P3+NrHh9/sD6CQdxHyjQI+h/6rDNo88ypBxxz45UDuZHz9r3tNz7N/VInSVoVdtXEI4A==", "devOptional": true, "license": "MIT", - "peer": true, "engines": { "node": "^14.21.3 || >=16" }, @@ -1793,7 +1791,6 @@ "resolved": "https://registry.npmjs.org/@octokit/core/-/core-7.0.4.tgz", "integrity": "sha512-jOT8V1Ba5BdC79sKrRWDdMT5l1R+XNHTPR6CPWzUP2EcfAcvIHZWF0eAbmRcpOOP5gVIwnqNg0C4nvh6Abc3OA==", "license": "MIT", - "peer": true, "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.1", @@ -2294,8 +2291,7 @@ "version": "20.7.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.7.0.tgz", "integrity": "sha512-zI22/pJW2wUZOVyguFaUL1HABdmSVxpXrzIqkjsHmyUjNhPoWM1CKfvVuXfetHhIok4RY573cqS0mZ1SJEnoTg==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/@types/triple-beam": { "version": "1.3.5", @@ -2386,7 +2382,6 @@ "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "license": "MIT", - "peer": true, "bin": { "acorn": 
"bin/acorn" }, @@ -3108,7 +3103,6 @@ "resolved": "https://registry.npmjs.org/chai/-/chai-6.0.1.tgz", "integrity": "sha512-/JOoU2//6p5vCXh00FpNgtlw0LjvhGttaWc+y7wpW9yjBm3ys0dI8tSKZxIOgNruz5J0RleccatSIC3uxEZP0g==", "license": "MIT", - "peer": true, "engines": { "node": ">=18" } @@ -3569,7 +3563,6 @@ "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", "integrity": "sha512-itvL5h8RETACmOTFc4UfIyB2RfEHi71Ax6E/PivVxq9NseKbOWpeyHEOIbmAw1rs8Ak0VursQNww7lf7YtUwzg==", "license": "MIT", - "peer": true, "dependencies": { "env-paths": "^2.2.1", "import-fresh": "^3.3.0", @@ -3989,8 +3982,7 @@ "version": "0.0.1495869", "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1495869.tgz", "integrity": "sha512-i+bkd9UYFis40RcnkW7XrOprCujXRAHg62IVh/Ah3G8MmNXpCGt1m0dTFhSdx/AVs8XEMbdOGRwdkR1Bcta8AA==", - "license": "BSD-3-Clause", - "peer": true + "license": "BSD-3-Clause" }, "node_modules/dezalgo": { "version": "1.0.4", @@ -4482,7 +4474,6 @@ "integrity": "sha512-ypowyDxpVSYpkXr9WPv2PAZCtNip1Mv5KTW0SCurXv/9iOpcrH9PaqUElksqEB6pChqHGDRCFTyrZlGhnLNGiA==", "deprecated": "This version is no longer supported. 
Please see https://eslint.org/version-support for other options.", "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.6.1", @@ -4624,7 +4615,6 @@ "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.32.0.tgz", "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==", "license": "MIT", - "peer": true, "dependencies": { "@rtsao/scc": "^1.1.0", "array-includes": "^3.1.9", @@ -8870,7 +8860,6 @@ "integrity": "sha512-QabGIvu7F0hAMiKGHZCIRHMb6UoH0QAJA2OaqxEU2tL5noXPrxUcotg2l3ttOA4p1PFnVIGkr6PXRAWlM2evVQ==", "hasInstallScript": true, "license": "Apache-2.0", - "peer": true, "dependencies": { "@puppeteer/browsers": "2.10.10", "chromium-bidi": "8.0.0", @@ -8926,7 +8915,6 @@ "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-3.3.6.tgz", "integrity": "sha512-rsLBE/6mMxAjlLd06LuGacrukP2bqbzKCLzV1vrhHFavqQE/taQ2UXv3H5P0Ls7nsrASa+6x3bDbXHpqMwq+7A==", "license": "MIT", - "peer": true, "dependencies": { "@types/debug": "^4.1.0", "debug": "^4.1.1", @@ -9887,7 +9875,6 @@ "resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz", "integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==", "license": "MIT", - "peer": true, "dependencies": { "ip-address": "^10.0.1", "smart-buffer": "^4.2.0" @@ -11334,6 +11321,18 @@ } } }, + "node_modules/xml-js": { + "version": "1.6.11", + "resolved": "https://registry.npmjs.org/xml-js/-/xml-js-1.6.11.tgz", + "integrity": "sha512-7rVi2KMfwfWFl+GpPg6m80IVMWXLRjO+PxTq7V2CDhoGak0wzYzFgUY2m4XJ47OGdXd8eLE8EmwfAmdjw7lC1g==", + "license": "MIT", + "dependencies": { + "sax": "^1.2.4" + }, + "bin": { + "xml-js": "bin/cli.js" + } + }, "node_modules/xml-name-validator": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", diff --git a/package.json b/package.json 
index f21c5f107..db0fbd146 100644 --- a/package.json +++ b/package.json @@ -100,7 +100,8 @@ "swagger-ui-express": "^5.0.1", "turndown": "^7.2.1", "winston": "^3.17.0", - "winston-mail": "^2.0.0" + "winston-mail": "^2.0.0", + "xml-js": "^1.6.11" }, "devDependencies": { "@commitlint/cli": "^19.8.1", From 78ff82e9f42b416c9a84619c0b38877f408bf9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 21 Apr 2026 18:30:03 +0200 Subject: [PATCH 02/48] Add findRecent method --- .../recorder/repositories/git/index.js | 20 +++ .../recorder/repositories/git/index.test.js | 142 ++++++++++++++++++ .../recorder/repositories/interface.js | 14 ++ .../recorder/repositories/mongo/index.js | 16 ++ .../recorder/repositories/mongo/index.test.js | 140 +++++++++++++++++ 5 files changed, 332 insertions(+) diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js index 5caf59948..32904cad3 100644 --- a/src/archivist/recorder/repositories/git/index.js +++ b/src/archivist/recorder/repositories/git/index.js @@ -92,6 +92,26 @@ export default class GitRepository extends RepositoryInterface { return Promise.all((await this.#getCommits()).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); } + async findRecent(limit, { serviceId, termsType } = {}) { + const commits = (await this.#getCommits()).reverse(); + const records = []; + + for (const commit of commits) { + if (records.length >= limit) break; + + const record = await this.#toDomain(commit, { deferContentLoading: true }); + + if (!record) continue; + + if (serviceId !== undefined && record.serviceId !== serviceId) continue; + if (termsType !== undefined && record.termsType !== termsType) continue; + + records.push(record); + } + + return records; + } + async count() { return (await this.git.log(Object.values(DataMapper.COMMIT_MESSAGE_PREFIXES).map(prefix => `--grep=${prefix}`))).length; } diff --git 
a/src/archivist/recorder/repositories/git/index.test.js b/src/archivist/recorder/repositories/git/index.test.js index 6c7e1dea0..ee8d8b6e6 100644 --- a/src/archivist/recorder/repositories/git/index.test.js +++ b/src/archivist/recorder/repositories/git/index.test.js @@ -584,6 +584,148 @@ describe('GitRepository', () => { }); }); + describe('#findRecent', () => { + const OTHER_SERVICE = 'other_service'; + const OTHER_TERMS = 'Privacy Policy'; + + before(async function () { + this.timeout(5000); + + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: OTHER_TERMS, + content: CONTENT, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: OTHER_SERVICE, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + }); + + after(() => subject.removeAll()); + + context('without filters', () => { + let records; + + before(async () => { + records = await subject.findRecent(10); + }); + + it('returns records in descending chronological order', () => { + const dates = records.map(record => record.fetchDate.getTime()); + + expect(dates).to.deep.equal([...dates].sort((a, b) => b - a)); + }); + + it('returns all matching records', () => { + expect(records).to.have.length(4); + }); + + it('does not load content eagerly', () => { + for (const record of records) { + expect(() => record.content).to.throw('Content not defined'); + } + }); + + it('exposes the metadata needed for feed entries', () => { + const [record] = records; + + expect(record.id).to.be.a('string'); + 
expect(record.serviceId).to.be.a('string'); + expect(record.termsType).to.be.a('string'); + expect(record.fetchDate).to.be.an.instanceof(Date); + expect(record.isFirstRecord).to.be.a('boolean'); + expect(record.isTechnicalUpgrade).to.be.a('boolean'); + }); + }); + + context('when limit is smaller than the number of matching records', () => { + let records; + + before(async () => { + records = await subject.findRecent(2); + }); + + it('returns at most limit records', () => { + expect(records).to.have.length(2); + }); + + it('returns the most recent records', () => { + for (const record of records) { + expect(record.fetchDate.getTime()).to.be.at.least(FETCH_DATE.getTime()); + } + }); + }); + + context('when a serviceId filter is given', () => { + let records; + + before(async () => { + records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID }); + }); + + it('returns only records for that service', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + it('returns all records that match', () => { + expect(records).to.have.length(3); + }); + }); + + context('when both serviceId and termsType filters are given', () => { + let records; + + before(async () => { + records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE }); + }); + + it('returns only records for that service and terms type', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + expect(record.termsType).to.equal(TERMS_TYPE); + } + }); + + it('returns all records that match', () => { + expect(records).to.have.length(2); + }); + }); + + context('when filters match no record', () => { + let records; + + before(async () => { + records = await subject.findRecent(10, { serviceId: 'unknown' }); + }); + + it('returns an empty array', () => { + expect(records).to.deep.equal([]); + }); + }); + }); + describe('#findLatest', () => { context('when there are 
records for the given service', () => { let lastSnapshotId; diff --git a/src/archivist/recorder/repositories/interface.js b/src/archivist/recorder/repositories/interface.js index 1d9270944..1c1cfd7d6 100644 --- a/src/archivist/recorder/repositories/interface.js +++ b/src/archivist/recorder/repositories/interface.js @@ -79,6 +79,20 @@ class RepositoryInterface { throw new Error(`#findAll method is not implemented in ${this.constructor.name}`); } + /** + * Find the most recent records in the repository, optionally filtered by service ID and terms type + * For performance reasons, the content of the records will not be loaded. Use #loadRecordContent to load the content of individual records + * @see RepositoryInterface#loadRecordContent + * @param {number} limit - Maximum number of records to return + * @param {object} [filters] - Optional filters + * @param {string} [filters.serviceId] - Restrict results to this service ID + * @param {string} [filters.termsType] - Restrict results to this terms type + * @returns {Promise>} Promise that will be resolved with an array of records in descending chronological order + */ + async findRecent(limit, filters) { + throw new Error(`#findRecent method is not implemented in ${this.constructor.name}`); + } + /** * Count the total number of records in the repository * For performance reasons, use this method rather than counting the number of entries returned by #findAll if you only need the size of a repository diff --git a/src/archivist/recorder/repositories/mongo/index.js b/src/archivist/recorder/repositories/mongo/index.js index 2a4abb18c..b9cf9437c 100644 --- a/src/archivist/recorder/repositories/mongo/index.js +++ b/src/archivist/recorder/repositories/mongo/index.js @@ -93,6 +93,22 @@ export default class MongoRepository extends RepositoryInterface { .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); } + async findRecent(limit, { serviceId, termsType } = {}) { + const query = {}; + + if 
(serviceId !== undefined) query.serviceId = serviceId; + if (termsType !== undefined) query.termsType = termsType; + + const mongoDocuments = await this.collection + .find(query) + .project({ content: 0 }) + .sort({ fetchDate: -1 }) + .limit(limit) + .toArray(); + + return Promise.all(mongoDocuments.map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); + } + count() { return this.collection.countDocuments(); } diff --git a/src/archivist/recorder/repositories/mongo/index.test.js b/src/archivist/recorder/repositories/mongo/index.test.js index 61ecfd1d0..880c9b2e3 100644 --- a/src/archivist/recorder/repositories/mongo/index.test.js +++ b/src/archivist/recorder/repositories/mongo/index.test.js @@ -671,6 +671,146 @@ describe('MongoRepository', () => { }); }); + describe('#findRecent', () => { + const OTHER_SERVICE = 'other_service'; + const OTHER_TERMS = 'Privacy Policy'; + + before(async () => { + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: OTHER_TERMS, + content: CONTENT, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: OTHER_SERVICE, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + }); + + after(() => subject.removeAll()); + + context('without filters', () => { + let records; + + before(async () => { + records = await subject.findRecent(10); + }); + + it('returns records in descending chronological order', () => { + const dates = records.map(record => record.fetchDate.getTime()); + + 
expect(dates).to.deep.equal([...dates].sort((a, b) => b - a)); + }); + + it('returns all matching records', () => { + expect(records).to.have.length(4); + }); + + it('does not load content eagerly', () => { + for (const record of records) { + expect(() => record.content).to.throw('Content not defined'); + } + }); + + it('exposes the metadata needed for feed entries', () => { + const [record] = records; + + expect(record.id).to.be.a('string'); + expect(record.serviceId).to.be.a('string'); + expect(record.termsType).to.be.a('string'); + expect(record.fetchDate).to.be.an.instanceof(Date); + expect(record.isFirstRecord).to.be.a('boolean'); + expect(record.isTechnicalUpgrade).to.be.a('boolean'); + }); + }); + + context('when limit is smaller than the number of matching records', () => { + let records; + + before(async () => { + records = await subject.findRecent(2); + }); + + it('returns at most limit records', () => { + expect(records).to.have.length(2); + }); + + it('returns the most recent records', () => { + for (const record of records) { + expect(record.fetchDate.getTime()).to.be.at.least(FETCH_DATE.getTime()); + } + }); + }); + + context('when a serviceId filter is given', () => { + let records; + + before(async () => { + records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID }); + }); + + it('returns only records for that service', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + it('returns all records that match', () => { + expect(records).to.have.length(3); + }); + }); + + context('when both serviceId and termsType filters are given', () => { + let records; + + before(async () => { + records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE }); + }); + + it('returns only records for that service and terms type', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + 
expect(record.termsType).to.equal(TERMS_TYPE); + } + }); + + it('returns all records that match', () => { + expect(records).to.have.length(2); + }); + }); + + context('when filters match no record', () => { + let records; + + before(async () => { + records = await subject.findRecent(10, { serviceId: 'unknown' }); + }); + + it('returns an empty array', () => { + expect(records).to.deep.equal([]); + }); + }); + }); + describe('#findLatest', () => { context('when there are records for the given service', () => { let lastSnapshotId; From 18a518161b0b3c08fd952690cf3a59d03426bb78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 22 Apr 2026 08:42:09 +0200 Subject: [PATCH 03/48] Add collection Atom feed endpoint --- src/collection-api/routes/feed.js | 149 +++++++++++ src/collection-api/routes/feed.test.js | 231 ++++++++++++++++++ src/collection-api/routes/index.js | 2 + src/collection-api/routes/versions.js | 6 +- .../routes/versionsRepository.js | 9 + 5 files changed, 393 insertions(+), 4 deletions(-) create mode 100644 src/collection-api/routes/feed.js create mode 100644 src/collection-api/routes/feed.test.js create mode 100644 src/collection-api/routes/versionsRepository.js diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js new file mode 100644 index 000000000..7758b473c --- /dev/null +++ b/src/collection-api/routes/feed.js @@ -0,0 +1,149 @@ +import express from 'express'; +import { js2xml } from 'xml-js'; + +import { getCollection } from '../../archivist/collection/index.js'; +import { COMMIT_MESSAGE_PREFIXES } from '../../archivist/recorder/repositories/git/dataMapper.js'; +import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; + +import versionsRepository, { storageConfig } from './versionsRepository.js'; + +const TAG_AUTHORITY = 'opentermsarchive.org,2026'; +const FEED_AUTHOR_NAME = 'OTA-Bot'; +const DEFAULT_LIMIT = 100; + +const RECORD_TYPES = { + firstRecord: 'First record', + 
technicalUpgrade: 'Technical upgrade', + change: 'Change', +}; + +const SCHEMES = { + service: `tag:${TAG_AUTHORITY}:scheme:service`, + termsType: `tag:${TAG_AUTHORITY}:scheme:terms-type`, + recordType: `tag:${TAG_AUTHORITY}:scheme:record-type`, +}; + +function buildAbsoluteBaseUrl(req) { + return `${req.protocol}://${req.get('host')}${req.baseUrl}`; +} + +function classifyRecordType(version) { + if (version.isFirstRecord) return RECORD_TYPES.firstRecord; + if (version.isTechnicalUpgrade) return RECORD_TYPES.technicalUpgrade; + + return RECORD_TYPES.change; +} + +function buildEntryTitle(version) { + let prefix = COMMIT_MESSAGE_PREFIXES.update; + + if (version.isFirstRecord) prefix = COMMIT_MESSAGE_PREFIXES.startTracking; + else if (version.isTechnicalUpgrade) prefix = COMMIT_MESSAGE_PREFIXES.technicalUpgrade; + + return `${prefix} ${version.serviceId} ${version.termsType}`; +} + +function buildVersionLink(baseUrl, version) { + const encodedDate = encodeURIComponent(toISODateWithoutMilliseconds(version.fetchDate)); + const encodedService = encodeURIComponent(version.serviceId); + const encodedTermsType = encodeURIComponent(version.termsType); + + return `${baseUrl}/version/${encodedService}/${encodedTermsType}/${encodedDate}`; +} + +function buildEntryId(collection, version) { + return `tag:${TAG_AUTHORITY}:version:${collection.metadata?.id}:${storageConfig.type}:${version.id}`; +} + +function buildEntry(collection, baseUrl, version) { + return { + id: { _text: buildEntryId(collection, version) }, + link: { _attributes: { + rel: 'alternate', + type: 'text/html', + href: buildVersionLink(baseUrl, version), + } }, + title: { _text: buildEntryTitle(version) }, + updated: { _text: version.fetchDate.toISOString() }, + category: [ + { _attributes: { term: version.serviceId, scheme: SCHEMES.service } }, + { _attributes: { term: version.termsType, scheme: SCHEMES.termsType } }, + { _attributes: { term: classifyRecordType(version), scheme: SCHEMES.recordType } }, + ], + }; 
+} + +function buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }) { + const latestFetchDate = versions.length > 0 ? versions[0].fetchDate : new Date(); + + const feed = { + _attributes: { xmlns: 'http://www.w3.org/2005/Atom' }, + title: { _text: collection.metadata?.name || '' }, + subtitle: { _text: collection.metadata?.tagline || '' }, + id: { _text: feedId }, + updated: { _text: latestFetchDate.toISOString() }, + link: { _attributes: { rel: 'self', href: selfHref } }, + author: { name: { _text: FEED_AUTHOR_NAME } }, + }; + + if (collection.metadata?.logo) { + feed.logo = { _text: collection.metadata.logo }; + } + + feed.entry = versions.map(version => buildEntry(collection, baseUrl, version)); + + return { + _declaration: { _attributes: { version: '1.0', encoding: 'utf-8' } }, + feed, + }; +} + +function sendAtom(res, xml) { + res.set('Content-Type', 'application/atom+xml; charset=utf-8'); + res.status(200).send(xml); +} + +function render(document) { + return js2xml(document, { compact: true, spaces: 2 }); +} + +/** + * @returns {express.Router} The router instance + * @swagger + * tags: + * name: Feeds + * description: Atom feeds of version changes + */ +export default function feedRouter() { + const router = express.Router(); + + /** + * @swagger + * /feed: + * get: + * summary: Atom feed of the latest version changes across the whole collection. + * tags: [Feeds] + * produces: + * - application/atom+xml + * responses: + * 200: + * description: An Atom 1.0 feed listing the latest version records, newest first. 
+ * content: + * application/atom+xml: + * schema: + * type: string + */ + router.get('/feed', async (req, res) => { + const collection = await getCollection(); + const baseUrl = buildAbsoluteBaseUrl(req); + const selfHref = `${baseUrl}/feed`; + const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}`; + + const versions = await versionsRepository.findRecent(DEFAULT_LIMIT); + const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); + + sendAtom(res, render(document)); + }); + + return router; +} diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js new file mode 100644 index 000000000..5f8e7325e --- /dev/null +++ b/src/collection-api/routes/feed.test.js @@ -0,0 +1,231 @@ +import { expect } from 'chai'; +import config from 'config'; +import supertest from 'supertest'; + +import { getCollection } from '../../archivist/collection/index.js'; +import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; +import Version from '../../archivist/recorder/version.js'; +import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; +import app from '../server.js'; + +const basePath = config.get('@opentermsarchive/engine.collection-api.basePath'); +const request = supertest(app); +const storageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); + +function extractTag(xml, tag) { + const match = xml.match(new RegExp(`<${tag}>([\\s\\S]*?)`)); + + return match ? 
match[1] : null; +} + +describe('Feed API', () => { + describe('GET /feed', () => { + let response; + let collection; + + before(async () => { + collection = await getCollection(); + response = await request.get(`${basePath}/v1/feed`); + }); + + it('responds with 200 status code', () => { + expect(response.status).to.equal(200); + }); + + it('responds with Content-Type application/atom+xml', () => { + expect(response.headers['content-type']).to.match(/^application\/atom\+xml/); + }); + + it('is a valid Atom feed root', () => { + expect(response.text).to.match(/^<\?xml version="1\.0"/); + expect(response.text).to.include(' { + it('has a title matching the collection name', () => { + expect(extractTag(response.text, 'title')).to.equal(collection.metadata.name); + }); + + it('has a subtitle matching the collection tagline', () => { + expect(extractTag(response.text, 'subtitle')).to.equal(collection.metadata.tagline); + }); + + it('has a tag URI id based on the collection id', () => { + expect(extractTag(response.text, 'id')).to.equal(`tag:opentermsarchive.org,2026:feed:${collection.metadata.id}`); + }); + + it('has an updated element with a valid ISO 8601 datetime', () => { + const updated = extractTag(response.text, 'updated'); + + expect(updated).to.be.a('string'); + expect(new Date(updated).toString()).to.not.equal('Invalid Date'); + }); + + it('has a self link pointing to the feed endpoint', () => { + const selfHrefMatch = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/); + + expect(selfHrefMatch).to.not.be.null; + expect(selfHrefMatch[1]).to.match(new RegExp(`${basePath}/v1/feed$`)); + }); + + it('has an author named OTA-Bot', () => { + expect(response.text).to.match(/[\s\S]*OTA-Bot<\/name>[\s\S]*<\/author>/); + }); + + it('has a logo matching the collection logo', () => { + expect(extractTag(response.text, 'logo')).to.equal(collection.metadata.logo); + }); + }); + }); + + describe('GET /feed — entries', () => { + const FETCH_DATE_FIRST = new 
Date('2023-01-01T12:00:00Z'); + const FETCH_DATE_CHANGE = new Date('2023-06-15T08:30:00Z'); + const FETCH_DATE_UPGRADE = new Date('2024-02-10T16:45:00Z'); + + let response; + let repository; + let savedVersions; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + + const firstRecord = await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'first content', + fetchDate: FETCH_DATE_FIRST, + snapshotIds: ['snapshot_1'], + })); + + const changeRecord = await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'changed content', + fetchDate: FETCH_DATE_CHANGE, + snapshotIds: ['snapshot_2'], + })); + + const upgradeRecord = await repository.save(new Version({ + serviceId: 'service-2', + termsType: 'Privacy Policy', + content: 'initial privacy', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['snapshot_3'], + })); + + const technicalUpgradeRecord = await repository.save(new Version({ + serviceId: 'service-2', + termsType: 'Privacy Policy', + content: 'upgraded privacy', + fetchDate: FETCH_DATE_UPGRADE, + snapshotIds: ['snapshot_4'], + isTechnicalUpgrade: true, + })); + + savedVersions = { firstRecord, changeRecord, upgradeRecord, technicalUpgradeRecord }; + response = await request.get(`${basePath}/v1/feed`); + }); + + after(() => repository.removeAll()); + + it('orders entries newest-first', () => { + const updates = [...response.text.matchAll(/[\s\S]*?([^<]+)<\/updated>[\s\S]*?<\/entry>/g)].map(match => match[1]); + + expect(updates).to.deep.equal([...updates].sort().reverse()); + }); + + describe('entry metadata', () => { + let firstEntry; + + before(() => { + firstEntry = response.text.match(/[\s\S]*?<\/entry>/)[0]; + }); + + it('has an id tag URI including storage type and record id', () => { + const collectionId = 'test'; + const expected = 
`tag:opentermsarchive.org,2026:version:${collectionId}:${storageConfig.type}:${savedVersions.technicalUpgradeRecord.id}`; + + expect(firstEntry).to.include(`${expected}`); + }); + + it('has an alternate link to the version API endpoint', () => { + const href = firstEntry.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + const expectedPathFragment = `/version/${encodeURIComponent('service-2')}/${encodeURIComponent('Privacy Policy')}/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE_UPGRADE))}`; + + expect(href).to.include(expectedPathFragment); + }); + + it('has a type="text/html" on the alternate link', () => { + expect(firstEntry).to.match(/]*rel="alternate"[^>]*type="text\/html"/); + }); + + it('has a title reconstructed from commit prefix + serviceId + termsType', () => { + const title = firstEntry.match(/]*>([\s\S]*?)<\/title>/)[1]; + + expect(title).to.include('Apply technical or declaration upgrade on'); + expect(title).to.include('service-2'); + expect(title).to.include('Privacy Policy'); + }); + + it('has an updated element matching the fetch date', () => { + const updated = firstEntry.match(/([^<]+)<\/updated>/)[1]; + + expect(new Date(updated).toISOString()).to.equal(FETCH_DATE_UPGRADE.toISOString()); + }); + + it('has three categories with the expected schemes', () => { + const categories = [...firstEntry.matchAll(//g)].map(match => match[1]); + + expect(categories).to.have.length(3); + + const schemes = categories.map(attrs => attrs.match(/scheme="([^"]+)"/)[1]); + + expect(schemes).to.include('tag:opentermsarchive.org,2026:scheme:service'); + expect(schemes).to.include('tag:opentermsarchive.org,2026:scheme:terms-type'); + expect(schemes).to.include('tag:opentermsarchive.org,2026:scheme:record-type'); + }); + + it('has category terms for service, terms type and record type', () => { + const categories = [...firstEntry.matchAll(//g)].map(match => match[1]); + const terms = categories.map(attrs => attrs.match(/term="([^"]+)"/)[1]); + + 
expect(terms).to.include('service-2'); + expect(terms).to.include('Privacy Policy'); + expect(terms).to.include('Technical upgrade'); + }); + }); + + describe('record-type classification', () => { + function findEntryById(xml, recordId) { + const match = [...xml.matchAll(/[\s\S]*?<\/entry>/g)].find(entry => entry[0].includes(`:${recordId}`)); + + return match && match[0]; + } + + it('classifies a first record as "First record"', () => { + const entry = findEntryById(response.text, savedVersions.upgradeRecord.id); + + expect(entry).to.not.be.undefined; + expect(entry).to.match(/term="First record"/); + }); + + it('classifies a content change as "Change"', () => { + const entry = findEntryById(response.text, savedVersions.changeRecord.id); + + expect(entry).to.not.be.undefined; + expect(entry).to.match(/term="Change"/); + }); + + it('classifies a technical upgrade as "Technical upgrade"', () => { + const entry = findEntryById(response.text, savedVersions.technicalUpgradeRecord.id); + + expect(entry).to.not.be.undefined; + expect(entry).to.match(/term="Technical upgrade"/); + }); + }); + }); +}); diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index b99636b90..a334b1691 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -5,6 +5,7 @@ import { getCollection } from '../../archivist/collection/index.js'; import * as Services from '../../archivist/services/index.js'; import docsRouter from './docs.js'; +import feedRouter from './feed.js'; import metadataRouter from './metadata.js'; import servicesRouter from './services.js'; import versionsRouter from './versions.js'; @@ -37,6 +38,7 @@ export default async function apiRouter(basePath) { router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); router.use(versionsRouter); + router.use(feedRouter()); return router; } diff --git a/src/collection-api/routes/versions.js b/src/collection-api/routes/versions.js index 
e420f8998..555f74c56 100644 --- a/src/collection-api/routes/versions.js +++ b/src/collection-api/routes/versions.js @@ -1,9 +1,9 @@ -import config from 'config'; import express from 'express'; -import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; +import versionsRepository from './versionsRepository.js'; + /** * @private * @swagger @@ -29,8 +29,6 @@ import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; */ const router = express.Router(); -const versionsRepository = await RepositoryFactory.create(config.get('@opentermsarchive/engine.recorder.versions.storage')).initialize(); - /** * @private * @swagger diff --git a/src/collection-api/routes/versionsRepository.js b/src/collection-api/routes/versionsRepository.js new file mode 100644 index 000000000..d76d06ce1 --- /dev/null +++ b/src/collection-api/routes/versionsRepository.js @@ -0,0 +1,9 @@ +import config from 'config'; + +import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; + +export const storageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); + +const versionsRepository = await RepositoryFactory.create(storageConfig).initialize(); + +export default versionsRepository; From e73d36222c4c86f34b2acd3722c728c9ceabdcf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 22 Apr 2026 08:57:52 +0200 Subject: [PATCH 04/48] Add service-scoped feed endpoint --- src/collection-api/routes/feed.js | 49 ++++++++++- src/collection-api/routes/feed.test.js | 116 +++++++++++++++++++++++++ src/collection-api/routes/index.js | 2 +- src/collection-api/routes/services.js | 5 +- src/collection-api/routes/utils.js | 5 ++ 5 files changed, 172 insertions(+), 5 deletions(-) create mode 100644 src/collection-api/routes/utils.js diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 7758b473c..fee9fdccd 
100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -6,6 +6,7 @@ import { COMMIT_MESSAGE_PREFIXES } from '../../archivist/recorder/repositories/g import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; import versionsRepository, { storageConfig } from './versionsRepository.js'; +import { findServiceCaseInsensitive } from './utils.js'; const TAG_AUTHORITY = 'opentermsarchive.org,2026'; const FEED_AUTHOR_NAME = 'OTA-Bot'; @@ -108,13 +109,14 @@ function render(document) { } /** - * @returns {express.Router} The router instance + * @param {object} services The services to be exposed by the API + * @returns {express.Router} The router instance * @swagger * tags: * name: Feeds * description: Atom feeds of version changes */ -export default function feedRouter() { +export default function feedRouter(services) { const router = express.Router(); /** @@ -145,5 +147,48 @@ export default function feedRouter() { sendAtom(res, render(document)); }); + /** + * @swagger + * /feed/{serviceId}: + * get: + * summary: Atom feed of the latest version changes scoped to a single service. + * tags: [Feeds] + * produces: + * - application/atom+xml + * parameters: + * - in: path + * name: serviceId + * description: The ID of the service. Case-insensitive. + * schema: + * type: string + * required: true + * responses: + * 200: + * description: An Atom 1.0 feed listing the latest version records for the given service, newest first. + * content: + * application/atom+xml: + * schema: + * type: string + * 404: + * description: No service matching the provided ID is found. 
+ */ + router.get('/feed/:serviceId', async (req, res) => { + const service = findServiceCaseInsensitive(services, req.params.serviceId); + + if (!service) { + return res.status(404).send('Service not found'); + } + + const collection = await getCollection(); + const baseUrl = buildAbsoluteBaseUrl(req); + const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; + const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}`; + + const versions = await versionsRepository.findRecent(DEFAULT_LIMIT, { serviceId: service.id }); + const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); + + return sendAtom(res, render(document)); + }); + return router; } diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 5f8e7325e..4a40f27f7 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -228,4 +228,120 @@ describe('Feed API', () => { }); }); }); + + describe('GET /feed/:serviceId', () => { + const SERVICE = 'service_without_history'; + const OTHER_SERVICE = 'service_with_history'; + const TERMS = 'Terms of Service'; + + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'c1', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'c2', + fetchDate: new Date('2024-02-01T00:00:00Z'), + snapshotIds: ['s2'], + })); + await repository.save(new Version({ + serviceId: OTHER_SERVICE, + termsType: TERMS, + content: 'c3', + fetchDate: new Date('2024-03-01T00:00:00Z'), + snapshotIds: ['s3'], + })); + }); + + after(() => repository.removeAll()); + + context('when the service exists and has versions', () => { + let response; + 
+ before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}`); + }); + + it('responds with 200', () => { + expect(response.status).to.equal(200); + }); + + it('responds with Content-Type application/atom+xml', () => { + expect(response.headers['content-type']).to.match(/^application\/atom\+xml/); + }); + + it('includes only entries for that service', () => { + const serviceTerms = [...response.text.matchAll(/scheme="tag:opentermsarchive.org,2026:scheme:service"[^/]*term="([^"]+)"/g)] + .concat([...response.text.matchAll(/term="([^"]+)"[^/]*scheme="tag:opentermsarchive.org,2026:scheme:service"/g)]) + .map(match => match[1]); + + expect(serviceTerms).to.not.be.empty; + + for (const term of serviceTerms) { + expect(term).to.equal(SERVICE); + } + }); + + it('has a feed id including the service id', () => { + expect(extractTag(response.text, 'id')).to.equal(`tag:opentermsarchive.org,2026:feed:test:${SERVICE}`); + }); + + it('has a self link pointing to the service-scoped feed endpoint', () => { + const href = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(new RegExp(`/feed/${SERVICE}$`)); + }); + }); + + context('when the service exists but has no versions', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent('service_with_filters_history')}`); + }); + + it('responds with 200', () => { + expect(response.status).to.equal(200); + }); + + it('returns an empty feed (no entries)', () => { + expect(response.text).to.not.include(''); + }); + }); + + context('when the service does not exist', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/DoesNotExist`); + }); + + it('responds with 404', () => { + expect(response.status).to.equal(404); + }); + }); + + context('when the serviceId uses different casing', () => { + let response; + + before(async () => { + response = await 
request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE.toUpperCase())}`); + }); + + it('still resolves to the service (case-insensitive)', () => { + expect(response.status).to.equal(200); + }); + }); + }); }); diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index a334b1691..f492593a4 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -38,7 +38,7 @@ export default async function apiRouter(basePath) { router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); router.use(versionsRouter); - router.use(feedRouter()); + router.use(feedRouter(services)); return router; } diff --git a/src/collection-api/routes/services.js b/src/collection-api/routes/services.js index f13879d2d..95febb6a2 100644 --- a/src/collection-api/routes/services.js +++ b/src/collection-api/routes/services.js @@ -1,5 +1,7 @@ import express from 'express'; +import { findServiceCaseInsensitive } from './utils.js'; + /** * @param {object} services The services to be exposed by the API * @returns {express.Router} The router instance @@ -130,8 +132,7 @@ export default function servicesRouter(services) { * description: No service matching the provided ID is found. 
*/ router.get('/service/:serviceId', (req, res) => { - const matchedServiceID = Object.keys(services).find(key => key.toLowerCase() === req.params.serviceId?.toLowerCase()); - const service = services[matchedServiceID]; + const service = findServiceCaseInsensitive(services, req.params.serviceId); if (!service) { res.status(404).send('Service not found'); diff --git a/src/collection-api/routes/utils.js b/src/collection-api/routes/utils.js new file mode 100644 index 000000000..18728e445 --- /dev/null +++ b/src/collection-api/routes/utils.js @@ -0,0 +1,5 @@ +export function findServiceCaseInsensitive(services, serviceId) { + const matched = Object.keys(services).find(key => key.toLowerCase() === serviceId?.toLowerCase()); + + return matched ? services[matched] : null; +} From 891151ab11b6e1bb85357ac46ce24ed97dd1d9cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 22 Apr 2026 09:09:16 +0200 Subject: [PATCH 05/48] Add service and terms type scoped feed endpoint --- src/collection-api/routes/feed.js | 55 ++++++++++ src/collection-api/routes/feed.test.js | 139 +++++++++++++++++++++++++ 2 files changed, 194 insertions(+) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index fee9fdccd..35490a74c 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -190,5 +190,60 @@ export default function feedRouter(services) { return sendAtom(res, render(document)); }); + /** + * @swagger + * /feed/{serviceId}/{termsType}: + * get: + * summary: Atom feed of the latest version changes scoped to a service and terms type. + * tags: [Feeds] + * produces: + * - application/atom+xml + * parameters: + * - in: path + * name: serviceId + * description: The ID of the service. Case-insensitive. + * schema: + * type: string + * required: true + * - in: path + * name: termsType + * description: The terms type declared by the service (e.g. "Terms of Service", "Privacy Policy"). 
+ * schema: + * type: string + * required: true + * responses: + * 200: + * description: An Atom 1.0 feed listing the latest version records for the given service and terms type, newest first. + * content: + * application/atom+xml: + * schema: + * type: string + * 404: + * description: Either the service ID does not match any service or the terms type is not declared by that service. + */ + router.get('/feed/:serviceId/:termsType', async (req, res) => { + const service = findServiceCaseInsensitive(services, req.params.serviceId); + + if (!service) { + return res.status(404).send('Service not found'); + } + + const { termsType } = req.params; + + if (!service.getTermsTypes().includes(termsType)) { + return res.status(404).send('Terms type not found for this service'); + } + + const collection = await getCollection(); + const baseUrl = buildAbsoluteBaseUrl(req); + const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; + const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; + + const versions = await versionsRepository.findRecent(DEFAULT_LIMIT, { serviceId: service.id, termsType }); + const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); + + return sendAtom(res, render(document)); + }); + return router; } diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 4a40f27f7..1d1d1ac95 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -344,4 +344,143 @@ describe('Feed API', () => { }); }); }); + + describe('XML escaping and URL encoding', () => { + const SERVICE = 'Service B!'; + const TERMS = 'Privacy Policy'; + const FETCH_DATE = new Date('2024-05-15T10:00:00Z'); + + let response; + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + + await 
repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'content with & and ', + fetchDate: FETCH_DATE, + snapshotIds: ['s_escape'], + })); + + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}/${encodeURIComponent(TERMS)}`); + }); + + after(() => repository.removeAll()); + + it('responds with 200', () => { + expect(response.status).to.equal(200); + }); + + it('URL-encodes spaces and special characters in the self link href', () => { + const href = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.include('Service%20B!'); + expect(href).to.include('Privacy%20Policy'); + expect(href).to.not.include('Service B!'); + }); + + it('URL-encodes spaces and special characters in entry alternate links', () => { + const href = response.text.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.include('Service%20B!'); + expect(href).to.include('Privacy%20Policy'); + }); + }); + + describe('GET /feed/:serviceId/:termsType', () => { + const SERVICE = 'service_without_history'; + const TERMS = 'Terms of Service'; + const UNKNOWN_TERMS = 'Imprint'; + + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'first', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'updated', + fetchDate: new Date('2024-02-01T00:00:00Z'), + snapshotIds: ['s2'], + })); + }); + + after(() => repository.removeAll()); + + context('when the service and terms type match', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}/${encodeURIComponent(TERMS)}`); + }); + + it('responds with 200', () => { + 
expect(response.status).to.equal(200); + }); + + it('includes entries for the combination', () => { + const entries = response.text.match(//g) || []; + + expect(entries.length).to.be.at.least(1); + }); + + it('entries only have the expected terms type', () => { + const termsTypeTerms = [...response.text.matchAll(/ match[1]); + + for (const term of termsTypeTerms) { + expect(term).to.equal(TERMS); + } + }); + + it('has a feed id that includes both service and terms type', () => { + expect(extractTag(response.text, 'id')).to.equal(`tag:opentermsarchive.org,2026:feed:test:${SERVICE}:${TERMS}`); + }); + + it('has a self link pointing to the combination endpoint', () => { + const href = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(new RegExp(`/feed/${SERVICE}/${encodeURIComponent(TERMS)}$`)); + }); + }); + + context('when the service exists but does not declare the terms type', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}/${encodeURIComponent(UNKNOWN_TERMS)}`); + }); + + it('responds with 404', () => { + expect(response.status).to.equal(404); + }); + }); + + context('when the service does not exist', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/DoesNotExist/${encodeURIComponent(TERMS)}`); + }); + + it('responds with 404', () => { + expect(response.status).to.equal(404); + }); + }); + }); }); From 54e0e1d54d01438142af6a30c852af1d5b9f71f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 22 Apr 2026 09:29:09 +0200 Subject: [PATCH 06/48] Cap feed entries with configurable limit --- config/default.json | 5 +++++ config/test.json | 5 ++++- src/collection-api/routes/feed.js | 17 +++++++++++++---- src/collection-api/routes/feed.test.js | 16 ++++++++++++++++ 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/config/default.json b/config/default.json index 
c044f2939..96309b6fb 100644 --- a/config/default.json +++ b/config/default.json @@ -47,6 +47,11 @@ }, "dataset": { "publishingSchedule": "30 8 * * MON" + }, + "collection-api": { + "feed": { + "limit": 100 + } } } } diff --git a/config/test.json b/config/test.json index cf14b8be3..050fd5b79 100644 --- a/config/test.json +++ b/config/test.json @@ -47,7 +47,10 @@ }, "collection-api": { "port": 3000, - "basePath": "/collection-api" + "basePath": "/collection-api", + "feed": { + "limit": 3 + } } } } diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 35490a74c..050bf163b 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -1,3 +1,4 @@ +import config from 'config'; import express from 'express'; import { js2xml } from 'xml-js'; @@ -12,6 +13,14 @@ const TAG_AUTHORITY = 'opentermsarchive.org,2026'; const FEED_AUTHOR_NAME = 'OTA-Bot'; const DEFAULT_LIMIT = 100; +function getFeedLimit() { + if (config.has('@opentermsarchive/engine.collection-api.feed.limit')) { + return config.get('@opentermsarchive/engine.collection-api.feed.limit'); + } + + return DEFAULT_LIMIT; +} + const RECORD_TYPES = { firstRecord: 'First record', technicalUpgrade: 'Technical upgrade', @@ -129,7 +138,7 @@ export default function feedRouter(services) { * - application/atom+xml * responses: * 200: - * description: An Atom 1.0 feed listing the latest version records, newest first. + * description: An Atom 1.0 feed listing the latest version records, newest first. The maximum number of entries is server-configured. 
* content: * application/atom+xml: * schema: @@ -141,7 +150,7 @@ export default function feedRouter(services) { const selfHref = `${baseUrl}/feed`; const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}`; - const versions = await versionsRepository.findRecent(DEFAULT_LIMIT); + const versions = await versionsRepository.findRecent(getFeedLimit()); const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); sendAtom(res, render(document)); @@ -184,7 +193,7 @@ export default function feedRouter(services) { const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}`; - const versions = await versionsRepository.findRecent(DEFAULT_LIMIT, { serviceId: service.id }); + const versions = await versionsRepository.findRecent(getFeedLimit(), { serviceId: service.id }); const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); return sendAtom(res, render(document)); @@ -239,7 +248,7 @@ export default function feedRouter(services) { const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; - const versions = await versionsRepository.findRecent(DEFAULT_LIMIT, { serviceId: service.id, termsType }); + const versions = await versionsRepository.findRecent(getFeedLimit(), { serviceId: service.id, termsType }); const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); return sendAtom(res, render(document)); diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 1d1d1ac95..d71278dd1 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -132,6 +132,13 @@ describe('Feed API', () => { after(() => repository.removeAll()); + it('lists one entry per saved version up to 
the configured limit', () => { + const limit = config.get('@opentermsarchive/engine.collection-api.feed.limit'); + const entries = response.text.match(//g) || []; + + expect(entries).to.have.length(Math.min(4, limit)); + }); + it('orders entries newest-first', () => { const updates = [...response.text.matchAll(/[\s\S]*?([^<]+)<\/updated>[\s\S]*?<\/entry>/g)].map(match => match[1]); @@ -227,6 +234,15 @@ describe('Feed API', () => { expect(entry).to.match(/term="Technical upgrade"/); }); }); + + describe('configurable limit', () => { + it('returns at most the configured number of entries', () => { + const limit = config.get('@opentermsarchive/engine.collection-api.feed.limit'); + const entries = response.text.match(//g) || []; + + expect(entries.length).to.be.at.most(limit); + }); + }); }); describe('GET /feed/:serviceId', () => { From eb24e391a1c5a04b0ec91388397e4ad10c70fbd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 22 Apr 2026 14:19:22 +0200 Subject: [PATCH 07/48] Link feed entries to GitHub commits --- src/collection-api/routes/feed.js | 15 ++++++++++----- src/collection-api/routes/feed.test.js | 13 ++++++++++--- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 050bf163b..5cd0692cf 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -66,13 +66,18 @@ function buildEntryId(collection, version) { } function buildEntry(collection, baseUrl, version) { + const apiLink = buildVersionLink(baseUrl, version); + const githubCommitLink = collection.metadata?.versions && `${collection.metadata.versions}/commit/${version.id}`; + + const links = [{ _attributes: { rel: 'alternate', type: 'text/html', href: githubCommitLink || apiLink } }]; + + if (githubCommitLink) { + links.push({ _attributes: { rel: 'related', type: 'text/html', href: apiLink } }); + } + return { id: { _text: buildEntryId(collection, version) }, - link: 
{ _attributes: { - rel: 'alternate', - type: 'text/html', - href: buildVersionLink(baseUrl, version), - } }, + link: links, title: { _text: buildEntryTitle(version) }, updated: { _text: version.fetchDate.toISOString() }, category: [ diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index d71278dd1..0118e1f70 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -159,8 +159,15 @@ describe('Feed API', () => { expect(firstEntry).to.include(`${expected}`); }); - it('has an alternate link to the version API endpoint', () => { + it('has an alternate link to the GitHub commit', async () => { + const collection = await getCollection(); const href = firstEntry.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.equal(`${collection.metadata.versions}/commit/${savedVersions.technicalUpgradeRecord.id}`); + }); + + it('has a related link to the version API endpoint', () => { + const href = firstEntry.match(/]*rel="related"[^>]*href="([^"]+)"/)[1]; const expectedPathFragment = `/version/${encodeURIComponent('service-2')}/${encodeURIComponent('Privacy Policy')}/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE_UPGRADE))}`; expect(href).to.include(expectedPathFragment); @@ -399,8 +406,8 @@ describe('Feed API', () => { expect(href).to.not.include('Service B!'); }); - it('URL-encodes spaces and special characters in entry alternate links', () => { - const href = response.text.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + it('URL-encodes spaces and special characters in entry related links', () => { + const href = response.text.match(/]*rel="related"[^>]*href="([^"]+)"/)[1]; expect(href).to.include('Service%20B!'); expect(href).to.include('Privacy%20Policy'); From c21bb463d458b14d3129e13cdb14aa52e7eb0487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 22 Apr 2026 15:07:59 +0200 Subject: [PATCH 08/48] Resolve serviceId 
case-insensitively --- src/collection-api/routes/index.js | 2 +- src/collection-api/routes/utils.test.js | 44 +++++++++++++++++++++ src/collection-api/routes/versions.js | 45 +++++++++++++--------- src/collection-api/routes/versions.test.js | 33 ++++++++++++++-- 4 files changed, 101 insertions(+), 23 deletions(-) create mode 100644 src/collection-api/routes/utils.test.js diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index f492593a4..e02829fa2 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -37,7 +37,7 @@ export default async function apiRouter(basePath) { router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); - router.use(versionsRouter); + router.use(versionsRouter(services)); router.use(feedRouter(services)); return router; diff --git a/src/collection-api/routes/utils.test.js b/src/collection-api/routes/utils.test.js new file mode 100644 index 000000000..b7bb137f0 --- /dev/null +++ b/src/collection-api/routes/utils.test.js @@ -0,0 +1,44 @@ +import { expect } from 'chai'; + +import { findServiceCaseInsensitive } from './utils.js'; + +describe('findServiceCaseInsensitive', () => { + const services = { + '42Corp': { id: '42Corp' }, + ACMEco: { id: 'ACMEco' }, + 'example.org': { id: 'example.org' }, + 'Foo Bar': { id: 'Foo Bar' }, + 'service-b': { id: 'service-b' }, + serviceĀ·A: { id: 'serviceĀ·A' }, + }; + + it('returns the service when the id matches exactly', () => { + expect(findServiceCaseInsensitive(services, '42Corp')).to.equal(services['42Corp']); + expect(findServiceCaseInsensitive(services, 'ACMEco')).to.equal(services.ACMEco); + expect(findServiceCaseInsensitive(services, 'example.org')).to.equal(services['example.org']); + expect(findServiceCaseInsensitive(services, 'Foo Bar')).to.equal(services['Foo Bar']); + expect(findServiceCaseInsensitive(services, 'service-b')).to.equal(services['service-b']); + 
expect(findServiceCaseInsensitive(services, 'serviceĀ·A')).to.equal(services['serviceĀ·A']); + }); + + it('returns the service when the id casing differs', () => { + expect(findServiceCaseInsensitive(services, '42CORP')).to.equal(services['42Corp']); + expect(findServiceCaseInsensitive(services, 'acmeco')).to.equal(services.ACMEco); + expect(findServiceCaseInsensitive(services, 'EXAMPLE.ORG')).to.equal(services['example.org']); + expect(findServiceCaseInsensitive(services, 'foo bar')).to.equal(services['Foo Bar']); + expect(findServiceCaseInsensitive(services, 'SERVICE-B')).to.equal(services['service-b']); + expect(findServiceCaseInsensitive(services, 'SERVICEĀ·A')).to.equal(services['serviceĀ·A']); + }); + + it('returns null when no service matches', () => { + expect(findServiceCaseInsensitive(services, 'Unknown')).to.be.null; + }); + + it('returns null when serviceId is undefined', () => { + expect(findServiceCaseInsensitive(services, undefined)).to.be.null; + }); + + it('returns null when services is empty', () => { + expect(findServiceCaseInsensitive({}, 'Foo Bar')).to.be.null; + }); +}); diff --git a/src/collection-api/routes/versions.js b/src/collection-api/routes/versions.js index 555f74c56..0cc412bd6 100644 --- a/src/collection-api/routes/versions.js +++ b/src/collection-api/routes/versions.js @@ -3,6 +3,7 @@ import express from 'express'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; import versionsRepository from './versionsRepository.js'; +import { findServiceCaseInsensitive } from './utils.js'; /** * @private @@ -27,9 +28,10 @@ import versionsRepository from './versionsRepository.js'; * type: string * description: The JSON-escaped Markdown content of the version */ -const router = express.Router(); +export default function versionsRouter(services) { + const router = express.Router(); -/** + /** * @private * @swagger * /version/{serviceId}/{termsType}/{date}: @@ -86,25 +88,32 @@ const router = express.Router(); * type: 
string * description: Error message indicating that the requested date is in the future. */ -router.get('/version/:serviceId/:termsType/:date', async (req, res) => { - const { serviceId, termsType, date } = req.params; - const requestedDate = new Date(date); + router.get('/version/:serviceId/:termsType/:date', async (req, res) => { + const { termsType, date } = req.params; + const requestedDate = new Date(date); + + if (requestedDate > new Date()) { + return res.status(416).json({ error: 'Requested version is in the future' }); + } + + const service = findServiceCaseInsensitive(services, req.params.serviceId); - if (requestedDate > new Date()) { - return res.status(416).json({ error: 'Requested version is in the future' }); - } + if (!service) { + return res.status(404).json({ error: 'Service not found' }); + } - const version = await versionsRepository.findByDate(serviceId, termsType, requestedDate); + const version = await versionsRepository.findByDate(service.id, termsType, requestedDate); - if (!version) { - return res.status(404).json({ error: `No version found for date ${date}` }); - } + if (!version) { + return res.status(404).json({ error: `No version found for date ${date}` }); + } - return res.status(200).json({ - id: version.id, - fetchDate: toISODateWithoutMilliseconds(version.fetchDate), - content: version.content, + return res.status(200).json({ + id: version.id, + fetchDate: toISODateWithoutMilliseconds(version.fetchDate), + content: version.content, + }); }); -}); -export default router; + return router; +} diff --git a/src/collection-api/routes/versions.test.js b/src/collection-api/routes/versions.test.js index aadcfe14b..1ec145854 100644 --- a/src/collection-api/routes/versions.test.js +++ b/src/collection-api/routes/versions.test.js @@ -17,7 +17,7 @@ describe('Versions API', () => { let versionsRepository; const FETCH_DATE = new Date('2023-01-01T12:00:00Z'); const VERSION_COMMON_ATTRIBUTES = { - serviceId: 'service-1', + serviceId: 'serviceĀ·A', 
termsType: 'Terms of Service', snapshotId: ['snapshot_id'], }; @@ -62,7 +62,7 @@ describe('Versions API', () => { context('when a version is found', () => { before(async () => { - response = await request.get(`${basePath}/v1/version/service-1/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); + response = await request.get(`${basePath}/v1/version/serviceĀ·A/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); }); it('responds with 200 status code', () => { @@ -80,7 +80,7 @@ describe('Versions API', () => { context('when the requested date is anterior to the first available version', () => { before(async () => { - response = await request.get(`${basePath}/v1/version/service-1/Terms%20of%20Service/2000-01-01T12:00:00Z`); + response = await request.get(`${basePath}/v1/version/serviceĀ·A/Terms%20of%20Service/2000-01-01T12:00:00Z`); }); it('responds with 404 status code', () => { @@ -96,11 +96,36 @@ describe('Versions API', () => { }); }); + context('when the serviceId uses different casing', () => { + before(async () => { + response = await request.get(`${basePath}/v1/version/SERVICEĀ·A/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); + }); + + it('still resolves to the service (case-insensitive)', () => { + expect(response.status).to.equal(200); + expect(response.body).to.deep.equal(expectedResult); + }); + }); + + context('when the service does not exist', () => { + before(async () => { + response = await request.get(`${basePath}/v1/version/DoesNotExist/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); + }); + + it('responds with 404 status code', () => { + expect(response.status).to.equal(404); + }); + + it('returns an error message', () => { + expect(response.body.error).to.equal('Service not found'); + }); + }); + context('when the requested date is in the future', () => { before(async () => { const dateInTheFuture = 
new Date(Date.now() + 60000); // 1 minute in the future - response = await request.get(`${basePath}/v1/version/service-1/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(dateInTheFuture))}`); + response = await request.get(`${basePath}/v1/version/serviceĀ·A/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(dateInTheFuture))}`); }); it('responds with 416 status code', () => { From d2ed924089e505fab11b1816a73227a35f753b71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 22 Apr 2026 09:29:16 +0200 Subject: [PATCH 09/48] Add changelog entry --- CHANGELOG.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad05dcd96..0c0c6adee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,21 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [minor] + +> Development of this release was supported by [Reset Tech](https://www.reset.tech). 
+ +### Added + +- Add `GET /feed` endpoint on the Collection API exposing an Atom feed of the latest version changes across the whole collection +- Add `GET /feed/:serviceId` endpoint on the Collection API exposing an Atom feed scoped to a single service +- Add `GET /feed/:serviceId/:termsType` endpoint on the Collection API exposing an Atom feed scoped to a single service and terms type +- Add [`@opentermsarchive/engine.collection-api.feed.limit`](https://docs.opentermsarchive.org/collections/reference/configuration/) configuration option controlling the maximum number of entries returned by feed endpoints (default: `100`) + +### Changed + +- Resolve `serviceId` path parameter case-insensitively on the `GET /version/:serviceId/:termsType/:date` endpoint, consistent with other endpoints + ## 11.0.2 - 2026-04-14 > Development of this release was supported by [Reset Tech](https://www.reset.tech). From a6a4723bb254c0d0bb37e7635bf1f92930fe4ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 29 Apr 2026 15:42:41 +0200 Subject: [PATCH 10/48] Enforce consistent brace style --- .eslintrc.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.eslintrc.yaml b/.eslintrc.yaml index 85702b255..3731975fa 100644 --- a/.eslintrc.yaml +++ b/.eslintrc.yaml @@ -37,6 +37,9 @@ rules: - error - always-multiline consistent-return: 0 + curly: + - error + - all function-paren-newline: - error - multiline From 2582940d84449ed874eab62cf0ee9b0815de3c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 29 Apr 2026 15:43:09 +0200 Subject: [PATCH 11/48] Lint --- scripts/reporter/duplicate/index.js | 2 +- src/archivist/collection/index.test.js | 2 +- .../recorder/repositories/git/index.js | 8 +- .../recorder/repositories/interface.js | 10 +- .../recorder/repositories/mongo/index.js | 4 +- src/archivist/services/index.js | 2 +- src/collection-api/routes/feed.js | 9 +- src/collection-api/routes/feed.test.js | 2 +- 
src/collection-api/routes/versions.js | 116 +++++++++--------- src/reporter/gitlab/index.js | 2 +- 10 files changed, 79 insertions(+), 78 deletions(-) diff --git a/scripts/reporter/duplicate/index.js b/scripts/reporter/duplicate/index.js index d2b508770..22e13b1a0 100644 --- a/scripts/reporter/duplicate/index.js +++ b/scripts/reporter/duplicate/index.js @@ -39,7 +39,7 @@ async function removeDuplicateIssues() { } for (const [ title, duplicateIssues ] of issuesByTitle) { - if (duplicateIssues.length === 1) continue; + if (duplicateIssues.length === 1) { continue; } const originalIssue = duplicateIssues.reduce((oldest, current) => (new Date(current.created_at) < new Date(oldest.created_at) ? current : oldest)); diff --git a/src/archivist/collection/index.test.js b/src/archivist/collection/index.test.js index f7689384d..3b817e615 100644 --- a/src/archivist/collection/index.test.js +++ b/src/archivist/collection/index.test.js @@ -18,7 +18,7 @@ describe('Collection', () => { try { metadataBackup = await fs.readFile(metadataPath, 'utf8'); } catch (error) { - if (error.code !== 'ENOENT') throw error; + if (error.code !== 'ENOENT') { throw error; } } }); diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js index 32904cad3..284a0340c 100644 --- a/src/archivist/recorder/repositories/git/index.js +++ b/src/archivist/recorder/repositories/git/index.js @@ -97,14 +97,14 @@ export default class GitRepository extends RepositoryInterface { const records = []; for (const commit of commits) { - if (records.length >= limit) break; + if (records.length >= limit) { break; } const record = await this.#toDomain(commit, { deferContentLoading: true }); - if (!record) continue; + if (!record) { continue; } - if (serviceId !== undefined && record.serviceId !== serviceId) continue; - if (termsType !== undefined && record.termsType !== termsType) continue; + if (serviceId !== undefined && record.serviceId !== serviceId) { continue; } + 
if (termsType !== undefined && record.termsType !== termsType) { continue; } records.push(record); } diff --git a/src/archivist/recorder/repositories/interface.js b/src/archivist/recorder/repositories/interface.js index 1c1cfd7d6..cf18e6a85 100644 --- a/src/archivist/recorder/repositories/interface.js +++ b/src/archivist/recorder/repositories/interface.js @@ -83,11 +83,11 @@ class RepositoryInterface { * Find the most recent records in the repository, optionally filtered by service ID and terms type * For performance reasons, the content of the records will not be loaded. Use #loadRecordContent to load the content of individual records * @see RepositoryInterface#loadRecordContent - * @param {number} limit - Maximum number of records to return - * @param {object} [filters] - Optional filters - * @param {string} [filters.serviceId] - Restrict results to this service ID - * @param {string} [filters.termsType] - Restrict results to this terms type - * @returns {Promise>} Promise that will be resolved with an array of records in descending chronological order + * @param {number} limit - Maximum number of records to return + * @param {object} [filters] - Optional filters + * @param {string} [filters.serviceId] - Restrict results to this service ID + * @param {string} [filters.termsType] - Restrict results to this terms type + * @returns {Promise>} Promise that will be resolved with an array of records in descending chronological order */ async findRecent(limit, filters) { throw new Error(`#findRecent method is not implemented in ${this.constructor.name}`); diff --git a/src/archivist/recorder/repositories/mongo/index.js b/src/archivist/recorder/repositories/mongo/index.js index b9cf9437c..cd64940f9 100644 --- a/src/archivist/recorder/repositories/mongo/index.js +++ b/src/archivist/recorder/repositories/mongo/index.js @@ -96,8 +96,8 @@ export default class MongoRepository extends RepositoryInterface { async findRecent(limit, { serviceId, termsType } = {}) { const query = 
{}; - if (serviceId !== undefined) query.serviceId = serviceId; - if (termsType !== undefined) query.termsType = termsType; + if (serviceId !== undefined) { query.serviceId = serviceId; } + if (termsType !== undefined) { query.termsType = termsType; } const mongoDocuments = await this.collection .find(query) diff --git a/src/archivist/services/index.js b/src/archivist/services/index.js index cdcc07bbf..980973379 100644 --- a/src/archivist/services/index.js +++ b/src/archivist/services/index.js @@ -281,7 +281,7 @@ function getHistoryFilePaths(serviceId) { } async function loadServiceHistory(historyFilePath) { - if (!(await fileExists(historyFilePath))) return {}; + if (!(await fileExists(historyFilePath))) { return {}; } try { return JSON.parse(await fs.readFile(historyFilePath)); diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 5cd0692cf..bd4885c93 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -6,8 +6,8 @@ import { getCollection } from '../../archivist/collection/index.js'; import { COMMIT_MESSAGE_PREFIXES } from '../../archivist/recorder/repositories/git/dataMapper.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; -import versionsRepository, { storageConfig } from './versionsRepository.js'; import { findServiceCaseInsensitive } from './utils.js'; +import versionsRepository, { storageConfig } from './versionsRepository.js'; const TAG_AUTHORITY = 'opentermsarchive.org,2026'; const FEED_AUTHOR_NAME = 'OTA-Bot'; @@ -38,8 +38,8 @@ function buildAbsoluteBaseUrl(req) { } function classifyRecordType(version) { - if (version.isFirstRecord) return RECORD_TYPES.firstRecord; - if (version.isTechnicalUpgrade) return RECORD_TYPES.technicalUpgrade; + if (version.isFirstRecord) { return RECORD_TYPES.firstRecord; } + if (version.isTechnicalUpgrade) { return RECORD_TYPES.technicalUpgrade; } return RECORD_TYPES.change; } @@ -47,8 +47,7 @@ function 
classifyRecordType(version) { function buildEntryTitle(version) { let prefix = COMMIT_MESSAGE_PREFIXES.update; - if (version.isFirstRecord) prefix = COMMIT_MESSAGE_PREFIXES.startTracking; - else if (version.isTechnicalUpgrade) prefix = COMMIT_MESSAGE_PREFIXES.technicalUpgrade; + if (version.isFirstRecord) { prefix = COMMIT_MESSAGE_PREFIXES.startTracking; } else if (version.isTechnicalUpgrade) { prefix = COMMIT_MESSAGE_PREFIXES.technicalUpgrade; } return `${prefix} ${version.serviceId} ${version.termsType}`; } diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 0118e1f70..b63d1fb04 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -149,7 +149,7 @@ describe('Feed API', () => { let firstEntry; before(() => { - firstEntry = response.text.match(/[\s\S]*?<\/entry>/)[0]; + [firstEntry] = response.text.match(/[\s\S]*?<\/entry>/); }); it('has an id tag URI including storage type and record id', () => { diff --git a/src/collection-api/routes/versions.js b/src/collection-api/routes/versions.js index 0cc412bd6..069cd7f7b 100644 --- a/src/collection-api/routes/versions.js +++ b/src/collection-api/routes/versions.js @@ -2,10 +2,12 @@ import express from 'express'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; -import versionsRepository from './versionsRepository.js'; import { findServiceCaseInsensitive } from './utils.js'; +import versionsRepository from './versionsRepository.js'; /** + * @param {object} services The services to be exposed by the API + * @returns {express.Router} The router instance * @private * @swagger * tags: @@ -32,62 +34,62 @@ export default function versionsRouter(services) { const router = express.Router(); /** - * @private - * @swagger - * /version/{serviceId}/{termsType}/{date}: - * get: - * summary: Get a specific version of some terms at a given date. 
- * tags: [Versions] - * produces: - * - application/json - * parameters: - * - in: path - * name: serviceId - * description: The ID of the service whose version will be returned. - * schema: - * type: string - * required: true - * - in: path - * name: termsType - * description: The type of terms whose version will be returned. - * schema: - * type: string - * required: true - * - in: path - * name: date - * description: The date and time for which the version is requested, in ISO 8601 format. - * schema: - * type: string - * format: date-time - * required: true - * responses: - * 200: - * description: A JSON object containing the version content and metadata. - * content: - * application/json: - * schema: - * $ref: '#/components/schemas/Version' - * 404: - * description: No version found for the specified combination of service ID, terms type and date. - * content: - * application/json: - * schema: - * type: object - * properties: - * error: - * type: string - * description: Error message indicating that no version is found. - * 416: - * description: The requested date is in the future. - * content: - * application/json: - * schema: - * type: object - * properties: - * error: - * type: string - * description: Error message indicating that the requested date is in the future. - */ + * @private + * @swagger + * /version/{serviceId}/{termsType}/{date}: + * get: + * summary: Get a specific version of some terms at a given date. + * tags: [Versions] + * produces: + * - application/json + * parameters: + * - in: path + * name: serviceId + * description: The ID of the service whose version will be returned. + * schema: + * type: string + * required: true + * - in: path + * name: termsType + * description: The type of terms whose version will be returned. + * schema: + * type: string + * required: true + * - in: path + * name: date + * description: The date and time for which the version is requested, in ISO 8601 format. 
+ * schema: + * type: string + * format: date-time + * required: true + * responses: + * 200: + * description: A JSON object containing the version content and metadata. + * content: + * application/json: + * schema: + * $ref: '#/components/schemas/Version' + * 404: + * description: No version found for the specified combination of service ID, terms type and date. + * content: + * application/json: + * schema: + * type: object + * properties: + * error: + * type: string + * description: Error message indicating that no version is found. + * 416: + * description: The requested date is in the future. + * content: + * application/json: + * schema: + * type: object + * properties: + * error: + * type: string + * description: Error message indicating that the requested date is in the future. + */ router.get('/version/:serviceId/:termsType/:date', async (req, res) => { const { termsType, date } = req.params; const requestedDate = new Date(date); diff --git a/src/reporter/gitlab/index.js b/src/reporter/gitlab/index.js index 431416768..55ea591f1 100644 --- a/src/reporter/gitlab/index.js +++ b/src/reporter/gitlab/index.js @@ -358,7 +358,7 @@ export default class GitLab { try { let apiUrl = `${this.apiBaseURL}/projects/${this.projectId}/issues?search=${encodeURIComponent(title)}&state=${searchParams.state}&per_page=100`; - if (searchParams.state == 'all') apiUrl = `${this.apiBaseURL}/projects/${this.projectId}/issues?search=${encodeURIComponent(title)}&per_page=100`; + if (searchParams.state == 'all') { apiUrl = `${this.apiBaseURL}/projects/${this.projectId}/issues?search=${encodeURIComponent(title)}&per_page=100`; } const options = GitLab.baseOptionsHttpReq(); From ef71d573ed10b896f35ee29b582cffdf9223be82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 29 Apr 2026 16:20:29 +0200 Subject: [PATCH 12/48] Instantiate versions repository in API router --- src/collection-api/routes/feed.js | 27 ++++++++++--------- src/collection-api/routes/index.js | 8 
++++-- src/collection-api/routes/versions.js | 8 +++--- .../routes/versionsRepository.js | 9 ------- 4 files changed, 24 insertions(+), 28 deletions(-) delete mode 100644 src/collection-api/routes/versionsRepository.js diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index bd4885c93..dc73f20a3 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -7,7 +7,6 @@ import { COMMIT_MESSAGE_PREFIXES } from '../../archivist/recorder/repositories/g import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; import { findServiceCaseInsensitive } from './utils.js'; -import versionsRepository, { storageConfig } from './versionsRepository.js'; const TAG_AUTHORITY = 'opentermsarchive.org,2026'; const FEED_AUTHOR_NAME = 'OTA-Bot'; @@ -60,11 +59,11 @@ function buildVersionLink(baseUrl, version) { return `${baseUrl}/version/${encodedService}/${encodedTermsType}/${encodedDate}`; } -function buildEntryId(collection, version) { - return `tag:${TAG_AUTHORITY}:version:${collection.metadata?.id}:${storageConfig.type}:${version.id}`; +function buildEntryId(collection, storageType, version) { + return `tag:${TAG_AUTHORITY}:version:${collection.metadata?.id}:${storageType}:${version.id}`; } -function buildEntry(collection, baseUrl, version) { +function buildEntry(collection, storageType, baseUrl, version) { const apiLink = buildVersionLink(baseUrl, version); const githubCommitLink = collection.metadata?.versions && `${collection.metadata.versions}/commit/${version.id}`; @@ -75,7 +74,7 @@ function buildEntry(collection, baseUrl, version) { } return { - id: { _text: buildEntryId(collection, version) }, + id: { _text: buildEntryId(collection, storageType, version) }, link: links, title: { _text: buildEntryTitle(version) }, updated: { _text: version.fetchDate.toISOString() }, @@ -87,7 +86,7 @@ function buildEntry(collection, baseUrl, version) { }; } -function buildFeedDocument({ collection, selfHref, feedId, 
versions, baseUrl }) { +function buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }) { const latestFetchDate = versions.length > 0 ? versions[0].fetchDate : new Date(); const feed = { @@ -104,7 +103,7 @@ function buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }) feed.logo = { _text: collection.metadata.logo }; } - feed.entry = versions.map(version => buildEntry(collection, baseUrl, version)); + feed.entry = versions.map(version => buildEntry(collection, storageType, baseUrl, version)); return { _declaration: { _attributes: { version: '1.0', encoding: 'utf-8' } }, @@ -122,14 +121,16 @@ function render(document) { } /** - * @param {object} services The services to be exposed by the API - * @returns {express.Router} The router instance + * @param {object} services The services to be exposed by the API + * @param {object} versionsRepository The versions repository instance + * @param {string} storageType The storage type identifier of the versions repository + * @returns {express.Router} The router instance * @swagger * tags: * name: Feeds * description: Atom feeds of version changes */ -export default function feedRouter(services) { +export default function feedRouter(services, versionsRepository, storageType) { const router = express.Router(); /** @@ -155,7 +156,7 @@ export default function feedRouter(services) { const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}`; const versions = await versionsRepository.findRecent(getFeedLimit()); - const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); + const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); sendAtom(res, render(document)); }); @@ -198,7 +199,7 @@ export default function feedRouter(services) { const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}`; const versions = await versionsRepository.findRecent(getFeedLimit(), { serviceId: 
service.id }); - const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); + const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); return sendAtom(res, render(document)); }); @@ -253,7 +254,7 @@ export default function feedRouter(services) { const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; const versions = await versionsRepository.findRecent(getFeedLimit(), { serviceId: service.id, termsType }); - const document = buildFeedDocument({ collection, selfHref, feedId, versions, baseUrl }); + const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); return sendAtom(res, render(document)); }); diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index e02829fa2..ee235f495 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -1,7 +1,9 @@ +import config from 'config'; import express from 'express'; import helmet from 'helmet'; import { getCollection } from '../../archivist/collection/index.js'; +import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; import * as Services from '../../archivist/services/index.js'; import docsRouter from './docs.js'; @@ -34,11 +36,13 @@ export default async function apiRouter(basePath) { const services = await Services.load(); const collection = await getCollection(); + const versionsStorageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); + const versionsRepository = await RepositoryFactory.create(versionsStorageConfig).initialize(); router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); - router.use(versionsRouter(services)); - router.use(feedRouter(services)); + router.use(versionsRouter(services, versionsRepository)); + router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type)); return 
router; } diff --git a/src/collection-api/routes/versions.js b/src/collection-api/routes/versions.js index 069cd7f7b..914790a9b 100644 --- a/src/collection-api/routes/versions.js +++ b/src/collection-api/routes/versions.js @@ -3,11 +3,11 @@ import express from 'express'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; import { findServiceCaseInsensitive } from './utils.js'; -import versionsRepository from './versionsRepository.js'; /** - * @param {object} services The services to be exposed by the API - * @returns {express.Router} The router instance + * @param {object} services The services to be exposed by the API + * @param {object} versionsRepository The versions repository instance + * @returns {express.Router} The router instance * @private * @swagger * tags: @@ -30,7 +30,7 @@ import versionsRepository from './versionsRepository.js'; * type: string * description: The JSON-escaped Markdown content of the version */ -export default function versionsRouter(services) { +export default function versionsRouter(services, versionsRepository) { const router = express.Router(); /** diff --git a/src/collection-api/routes/versionsRepository.js b/src/collection-api/routes/versionsRepository.js deleted file mode 100644 index d76d06ce1..000000000 --- a/src/collection-api/routes/versionsRepository.js +++ /dev/null @@ -1,9 +0,0 @@ -import config from 'config'; - -import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; - -export const storageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); - -const versionsRepository = await RepositoryFactory.create(storageConfig).initialize(); - -export default versionsRepository; From 693dc2865d37299667bcd3b85008e59bac7ad6a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 29 Apr 2026 17:34:06 +0200 Subject: [PATCH 13/48] Resolve serviceId case-sensitively #34335620 introduced case-insensitive matching on/service/:serviceId, 
contradicting the documented case-sensitive service ID format --- CHANGELOG.md | 4 +- src/collection-api/routes/feed.js | 10 ++--- src/collection-api/routes/feed.test.js | 6 +-- src/collection-api/routes/index.js | 2 +- src/collection-api/routes/services.js | 4 +- src/collection-api/routes/services.test.js | 45 ++-------------------- src/collection-api/routes/utils.js | 5 --- src/collection-api/routes/utils.test.js | 44 --------------------- src/collection-api/routes/versions.js | 15 ++------ src/collection-api/routes/versions.test.js | 25 ------------ 10 files changed, 18 insertions(+), 142 deletions(-) delete mode 100644 src/collection-api/routes/utils.js delete mode 100644 src/collection-api/routes/utils.test.js diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c0c6adee..2b9a99268 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased [minor] +## Unreleased [major] > Development of this release was supported by [Reset Tech](https://www.reset.tech). 
@@ -15,7 +15,7 @@ All changes that impact users of this module are documented in this file, in the ### Changed -- Resolve `serviceId` path parameter case-insensitively on the `GET /version/:serviceId/:termsType/:date` endpoint, consistent with other endpoints +- **Breaking:** Resolve `serviceId` path parameter case-sensitively on the `GET /service/:serviceId` endpoint, in line with the documented service ID format; clients relying on case-insensitive matching must now use the exact ID casing ## 11.0.2 - 2026-04-14 diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index dc73f20a3..5d63eb88f 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -6,8 +6,6 @@ import { getCollection } from '../../archivist/collection/index.js'; import { COMMIT_MESSAGE_PREFIXES } from '../../archivist/recorder/repositories/git/dataMapper.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; -import { findServiceCaseInsensitive } from './utils.js'; - const TAG_AUTHORITY = 'opentermsarchive.org,2026'; const FEED_AUTHOR_NAME = 'OTA-Bot'; const DEFAULT_LIMIT = 100; @@ -172,7 +170,7 @@ export default function feedRouter(services, versionsRepository, storageType) { * parameters: * - in: path * name: serviceId - * description: The ID of the service. Case-insensitive. + * description: The ID of the service. * schema: * type: string * required: true @@ -187,7 +185,7 @@ export default function feedRouter(services, versionsRepository, storageType) { * description: No service matching the provided ID is found. */ router.get('/feed/:serviceId', async (req, res) => { - const service = findServiceCaseInsensitive(services, req.params.serviceId); + const service = Object.hasOwn(services, req.params.serviceId) ? 
services[req.params.serviceId] : null; if (!service) { return res.status(404).send('Service not found'); @@ -215,7 +213,7 @@ export default function feedRouter(services, versionsRepository, storageType) { * parameters: * - in: path * name: serviceId - * description: The ID of the service. Case-insensitive. + * description: The ID of the service. * schema: * type: string * required: true @@ -236,7 +234,7 @@ export default function feedRouter(services, versionsRepository, storageType) { * description: Either the service ID does not match any service or the terms type is not declared by that service. */ router.get('/feed/:serviceId/:termsType', async (req, res) => { - const service = findServiceCaseInsensitive(services, req.params.serviceId); + const service = Object.hasOwn(services, req.params.serviceId) ? services[req.params.serviceId] : null; if (!service) { return res.status(404).send('Service not found'); diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index b63d1fb04..8c60f3493 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -355,15 +355,15 @@ describe('Feed API', () => { }); }); - context('when the serviceId uses different casing', () => { + context('when the serviceId casing does not match', () => { let response; before(async () => { response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE.toUpperCase())}`); }); - it('still resolves to the service (case-insensitive)', () => { - expect(response.status).to.equal(200); + it('responds with 404', () => { + expect(response.status).to.equal(404); }); }); }); diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index ee235f495..c24b8f5dd 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -41,7 +41,7 @@ export default async function apiRouter(basePath) { router.use(await metadataRouter(collection, services)); 
router.use(servicesRouter(services)); - router.use(versionsRouter(services, versionsRepository)); + router.use(versionsRouter(versionsRepository)); router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type)); return router; diff --git a/src/collection-api/routes/services.js b/src/collection-api/routes/services.js index 95febb6a2..9906b7152 100644 --- a/src/collection-api/routes/services.js +++ b/src/collection-api/routes/services.js @@ -1,7 +1,5 @@ import express from 'express'; -import { findServiceCaseInsensitive } from './utils.js'; - /** * @param {object} services The services to be exposed by the API * @returns {express.Router} The router instance @@ -132,7 +130,7 @@ export default function servicesRouter(services) { * description: No service matching the provided ID is found. */ router.get('/service/:serviceId', (req, res) => { - const service = findServiceCaseInsensitive(services, req.params.serviceId); + const service = Object.hasOwn(services, req.params.serviceId) ? 
services[req.params.serviceId] : null; if (!service) { res.status(404).send('Service not found'); diff --git a/src/collection-api/routes/services.test.js b/src/collection-api/routes/services.test.js index db6bdc16f..43dfed950 100644 --- a/src/collection-api/routes/services.test.js +++ b/src/collection-api/routes/services.test.js @@ -56,7 +56,6 @@ describe('Services API', () => { describe('GET /service/:serviceId', () => { let response; const SERVICE_ID = 'Service B!'; - const CASE_INSENSITIVE_SERVICE_ID = 'service b!'; before(async () => { response = await request(app).get(`${basePath}/v1/service/${encodeURI(SERVICE_ID)}`); @@ -106,49 +105,13 @@ describe('Services API', () => { }); }); - context('with a case-insensitive service ID parameter', () => { + context('when the service ID casing does not match', () => { before(async () => { - response = await request(app).get(`${basePath}/v1/service/${encodeURI(CASE_INSENSITIVE_SERVICE_ID)}`); + response = await request(app).get(`${basePath}/v1/service/${encodeURI(SERVICE_ID.toLowerCase())}`); }); - it('responds with 200 status code', () => { - expect(response.status).to.equal(200); - }); - - it('returns a service object with id', () => { - expect(response.body).to.have.property('id'); - }); - - it('returns the proper service object', () => { - expect(response.body.id).to.equal(SERVICE_ID); - }); - - it('returns a service object with name', () => { - expect(response.body).to.have.property('name'); - }); - - it('returns a service object with an array of terms', () => { - expect(response.body).to.have.property('terms').that.is.an('array'); - }); - - it('each terms should have a type property', () => { - response.body.terms.forEach(terms => { - expect(terms).to.have.property('type'); - }); - }); - - it('each terms should have an array of source documents', () => { - response.body.terms.forEach(terms => { - expect(terms).to.have.property('sourceDocuments').that.is.an('array'); - }); - }); - - it('each source document should 
have a location', () => { - response.body.terms.forEach(terms => { - terms.sourceDocuments.forEach(sourceDocument => { - expect(sourceDocument).to.have.property('location'); - }); - }); + it('responds with 404 status code', () => { + expect(response.status).to.equal(404); }); }); diff --git a/src/collection-api/routes/utils.js b/src/collection-api/routes/utils.js deleted file mode 100644 index 18728e445..000000000 --- a/src/collection-api/routes/utils.js +++ /dev/null @@ -1,5 +0,0 @@ -export function findServiceCaseInsensitive(services, serviceId) { - const matched = Object.keys(services).find(key => key.toLowerCase() === serviceId?.toLowerCase()); - - return matched ? services[matched] : null; -} diff --git a/src/collection-api/routes/utils.test.js b/src/collection-api/routes/utils.test.js deleted file mode 100644 index b7bb137f0..000000000 --- a/src/collection-api/routes/utils.test.js +++ /dev/null @@ -1,44 +0,0 @@ -import { expect } from 'chai'; - -import { findServiceCaseInsensitive } from './utils.js'; - -describe('findServiceCaseInsensitive', () => { - const services = { - '42Corp': { id: '42Corp' }, - ACMEco: { id: 'ACMEco' }, - 'example.org': { id: 'example.org' }, - 'Foo Bar': { id: 'Foo Bar' }, - 'service-b': { id: 'service-b' }, - serviceĀ·A: { id: 'serviceĀ·A' }, - }; - - it('returns the service when the id matches exactly', () => { - expect(findServiceCaseInsensitive(services, '42Corp')).to.equal(services['42Corp']); - expect(findServiceCaseInsensitive(services, 'ACMEco')).to.equal(services.ACMEco); - expect(findServiceCaseInsensitive(services, 'example.org')).to.equal(services['example.org']); - expect(findServiceCaseInsensitive(services, 'Foo Bar')).to.equal(services['Foo Bar']); - expect(findServiceCaseInsensitive(services, 'service-b')).to.equal(services['service-b']); - expect(findServiceCaseInsensitive(services, 'serviceĀ·A')).to.equal(services['serviceĀ·A']); - }); - - it('returns the service when the id casing differs', () => { - 
expect(findServiceCaseInsensitive(services, '42CORP')).to.equal(services['42Corp']); - expect(findServiceCaseInsensitive(services, 'acmeco')).to.equal(services.ACMEco); - expect(findServiceCaseInsensitive(services, 'EXAMPLE.ORG')).to.equal(services['example.org']); - expect(findServiceCaseInsensitive(services, 'foo bar')).to.equal(services['Foo Bar']); - expect(findServiceCaseInsensitive(services, 'SERVICE-B')).to.equal(services['service-b']); - expect(findServiceCaseInsensitive(services, 'SERVICEĀ·A')).to.equal(services['serviceĀ·A']); - }); - - it('returns null when no service matches', () => { - expect(findServiceCaseInsensitive(services, 'Unknown')).to.be.null; - }); - - it('returns null when serviceId is undefined', () => { - expect(findServiceCaseInsensitive(services, undefined)).to.be.null; - }); - - it('returns null when services is empty', () => { - expect(findServiceCaseInsensitive({}, 'Foo Bar')).to.be.null; - }); -}); diff --git a/src/collection-api/routes/versions.js b/src/collection-api/routes/versions.js index 914790a9b..176ba0c55 100644 --- a/src/collection-api/routes/versions.js +++ b/src/collection-api/routes/versions.js @@ -2,10 +2,7 @@ import express from 'express'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; -import { findServiceCaseInsensitive } from './utils.js'; - /** - * @param {object} services The services to be exposed by the API * @param {object} versionsRepository The versions repository instance * @returns {express.Router} The router instance * @private @@ -30,7 +27,7 @@ import { findServiceCaseInsensitive } from './utils.js'; * type: string * description: The JSON-escaped Markdown content of the version */ -export default function versionsRouter(services, versionsRepository) { +export default function versionsRouter(versionsRepository) { const router = express.Router(); /** @@ -91,20 +88,14 @@ export default function versionsRouter(services, versionsRepository) { * description: Error message 
indicating that the requested date is in the future. */ router.get('/version/:serviceId/:termsType/:date', async (req, res) => { - const { termsType, date } = req.params; + const { serviceId, termsType, date } = req.params; const requestedDate = new Date(date); if (requestedDate > new Date()) { return res.status(416).json({ error: 'Requested version is in the future' }); } - const service = findServiceCaseInsensitive(services, req.params.serviceId); - - if (!service) { - return res.status(404).json({ error: 'Service not found' }); - } - - const version = await versionsRepository.findByDate(service.id, termsType, requestedDate); + const version = await versionsRepository.findByDate(serviceId, termsType, requestedDate); if (!version) { return res.status(404).json({ error: `No version found for date ${date}` }); diff --git a/src/collection-api/routes/versions.test.js b/src/collection-api/routes/versions.test.js index 1ec145854..bfdff4e15 100644 --- a/src/collection-api/routes/versions.test.js +++ b/src/collection-api/routes/versions.test.js @@ -96,31 +96,6 @@ describe('Versions API', () => { }); }); - context('when the serviceId uses different casing', () => { - before(async () => { - response = await request.get(`${basePath}/v1/version/SERVICEĀ·A/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); - }); - - it('still resolves to the service (case-insensitive)', () => { - expect(response.status).to.equal(200); - expect(response.body).to.deep.equal(expectedResult); - }); - }); - - context('when the service does not exist', () => { - before(async () => { - response = await request.get(`${basePath}/v1/version/DoesNotExist/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); - }); - - it('responds with 404 status code', () => { - expect(response.status).to.equal(404); - }); - - it('returns an error message', () => { - expect(response.body.error).to.equal('Service not found'); - }); - }); - 
context('when the requested date is in the future', () => { before(async () => { const dateInTheFuture = new Date(Date.now() + 60000); // 1 minute in the future From 793c0abf73590bbdd40fefebd805eb9d7a479a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 29 Apr 2026 17:50:21 +0200 Subject: [PATCH 14/48] Restructure repository query API with pagination --- .../recorder/repositories/git/dataMapper.js | 5 + .../recorder/repositories/git/git.js | 8 +- .../recorder/repositories/git/index.js | 84 ++++-- .../recorder/repositories/git/index.test.js | 208 ++++++-------- .../recorder/repositories/interface.js | 53 ++-- .../recorder/repositories/mongo/index.js | 64 +++-- .../recorder/repositories/mongo/index.test.js | 255 +++++++++--------- src/collection-api/routes/feed.js | 6 +- 8 files changed, 382 insertions(+), 301 deletions(-) diff --git a/src/archivist/recorder/repositories/git/dataMapper.js b/src/archivist/recorder/repositories/git/dataMapper.js index c9dadd267..8fcd3fafb 100644 --- a/src/archivist/recorder/repositories/git/dataMapper.js +++ b/src/archivist/recorder/repositories/git/dataMapper.js @@ -91,6 +91,11 @@ function generateFileName(termsType, documentId, extension) { } export function generateFilePath(serviceId, termsType, documentId, mimeType) { + // If only serviceId is provided, return a pattern to match all files for that service + if (termsType === undefined) { + return `${serviceId}/*`; + } + const extension = mime.getExtension(mimeType) || '*'; // If mime type is undefined, an asterisk is set as an extension. 
Used to match all files for the given service ID, terms type and document ID when mime type is unknown return `${serviceId}/${generateFileName(termsType, documentId, extension)}`; // Do not use `path.join` as even for Windows, the path should be with `/` and not `\` diff --git a/src/archivist/recorder/repositories/git/git.js b/src/archivist/recorder/repositories/git/git.js index 791c39310..364fdc72b 100644 --- a/src/archivist/recorder/repositories/git/git.js +++ b/src/archivist/recorder/repositories/git/git.js @@ -68,8 +68,12 @@ export default class Git { return this.git.push(); } - listCommits(options = []) { - return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]); // Returns all commits in chronological order (`--reverse`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`) + listCommits(options = [], { reverse = true, skip, maxCount } = {}) { + const reverseOption = reverse ? ['--reverse'] : []; + const skipOption = skip !== undefined ? [`--skip=${skip}`] : []; + const maxCountOption = maxCount !== undefined ? 
[`--max-count=${maxCount}`] : []; + + return this.log([ ...reverseOption, '--author-date-order', '--no-merges', '--name-only', ...skipOption, ...maxCountOption, ...options ]); // Returns commits in chronological order with `--reverse` (oldest first) or reverse chronological without it (newest first), sorted by author date (`--author-date-order`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`), with optional pagination (`--skip`, `--max-count`) } async getCommit(options) { diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js index 284a0340c..50da196ba 100644 --- a/src/archivist/recorder/repositories/git/index.js +++ b/src/archivist/recorder/repositories/git/index.js @@ -88,36 +88,45 @@ export default class GitRepository extends RepositoryInterface { return this.#toDomain(commit); } - async findAll() { - return Promise.all((await this.#getCommits()).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + async findAll({ limit, offset } = {}) { + return Promise.all((await this.#getCommits({ limit, offset })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); } - async findRecent(limit, { serviceId, termsType } = {}) { - const commits = (await this.#getCommits()).reverse(); - const records = []; + async findByService(serviceId, { limit, offset } = {}) { + const pathPattern = DataMapper.generateFilePath(serviceId); - for (const commit of commits) { - if (records.length >= limit) { break; } + return Promise.all((await this.#getCommits({ pathFilter: pathPattern, limit, offset })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + } - const record = await this.#toDomain(commit, { deferContentLoading: true }); + async findByServiceAndTermsType(serviceId, termsType, { limit, offset } = {}) { + const pathPattern = DataMapper.generateFilePath(serviceId, termsType); - if (!record) { continue; } + return Promise.all((await 
this.#getCommits({ pathFilter: pathPattern, limit, offset })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + } - if (serviceId !== undefined && record.serviceId !== serviceId) { continue; } - if (termsType !== undefined && record.termsType !== termsType) { continue; } + async count(serviceId, termsType) { + const grepOptions = Object.values(DataMapper.COMMIT_MESSAGE_PREFIXES).map(prefix => `--grep=${prefix}`); + const pathOptions = []; - records.push(record); - } + if (serviceId && termsType) { + const pathPattern = DataMapper.generateFilePath(serviceId, termsType); - return records; - } + pathOptions.push('--', pathPattern); + } else if (serviceId) { + // Count all records for a service (all terms types) + const pathPattern = DataMapper.generateFilePath(serviceId); - async count() { - return (await this.git.log(Object.values(DataMapper.COMMIT_MESSAGE_PREFIXES).map(prefix => `--grep=${prefix}`))).length; + pathOptions.push('--', pathPattern); + } else { + // Count all records (exclude root directory files) + pathOptions.push('--', '*/*'); + } + + return (await this.git.log([ ...grepOptions, ...pathOptions ])).length; } async* iterate() { - const commits = await this.#getCommits(); + const commits = await this.#getCommits({ reverse: true }); for (const commit of commits) { yield this.#toDomain(commit); @@ -151,12 +160,39 @@ export default class GitRepository extends RepositoryInterface { record.content = pdfBuffer; } - async #getCommits() { - return (await this.git.listCommits()) - .filter(commit => // Skip non-record commits (e.g., README or LICENSE updates) - DataMapper.COMMIT_MESSAGE_PREFIXES_REGEXP.test(commit.message) // Commits generated by the engine have messages that match predefined prefixes - && path.dirname(commit.diff.files[0].file) !== '.') // Assumes one record per commit; records must be in a serviceId folder, not root - .sort((commitA, commitB) => new Date(commitA.date) - new Date(commitB.date)); // Make sure that the 
commits are sorted in ascending chronological order + async #getCommits({ pathFilter, reverse = false, limit, offset } = {}) { + const grepOptions = Object.values(DataMapper.COMMIT_MESSAGE_PREFIXES).flatMap(prefix => [ '--grep', prefix ]); + const pathOptions = pathFilter + ? [ '--', pathFilter ] + : [ '--', '*/*' ]; // Exclude root directory files by only matching files in subdirectories + + const options = [ ...grepOptions, ...pathOptions ]; + + // Use git-level pagination when available + // Note: --skip and --max-count work in topological order, not chronological order + // This means pagination may not be strictly chronological, but it's acceptable for performance + const paginationOptions = {}; + + if (offset !== undefined) { + paginationOptions.skip = offset; + } + + if (limit !== undefined) { + paginationOptions.maxCount = limit; + } + + const commits = await this.git.listCommits(options, { reverse: false, ...paginationOptions }); // Get commits without git's --reverse for better performance, filtered at git level + + // Sort by date in JavaScript for accuracy - git's date ordering may not be reliable with backdated commits + // Default order is descending (newest to oldest), reverse gives ascending (oldest to newest) + commits.sort((commitA, commitB) => { + const dateA = new Date(commitA.date); + const dateB = new Date(commitB.date); + + return reverse ? 
dateA - dateB : dateB - dateA; + }); + + return commits; } static async writeFile({ filePath, content }) { diff --git a/src/archivist/recorder/repositories/git/index.test.js b/src/archivist/recorder/repositories/git/index.test.js index ee8d8b6e6..6ef24175f 100644 --- a/src/archivist/recorder/repositories/git/index.test.js +++ b/src/archivist/recorder/repositories/git/index.test.js @@ -540,53 +540,92 @@ describe('GitRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); }); - describe('#count', () => { - let count; + describe('#findByServiceAndTermsType', () => { + const expectedIds = []; + let records; before(async function () { this.timeout(5000); - await subject.save(new Version({ + const { id: id1 } = await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: CONTENT, fetchDate: FETCH_DATE, snapshotIds: [SNAPSHOT_ID], })); - await subject.save(new Version({ + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: `${CONTENT} - updated`, fetchDate: FETCH_DATE_LATER, snapshotIds: [SNAPSHOT_ID], })); + + expectedIds.push(id2); + await subject.save(new Version({ - serviceId: SERVICE_PROVIDER_ID, - termsType: TERMS_TYPE, - content: `${CONTENT} - updated 2`, - isTechnicalUpgrade: true, - fetchDate: FETCH_DATE_EARLIER, + serviceId: 'other_service', + termsType: 'Privacy Policy', + content: `${CONTENT} - other`, + fetchDate: FETCH_DATE, snapshotIds: [SNAPSHOT_ID], })); - (count = await subject.count()); + (records = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE)); }); after(() => subject.removeAll()); - 
it('returns the proper count', () => { - expect(count).to.equal(3); + it('returns only matching records', () => { + expect(records.length).to.equal(2); + }); + + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); + + it('returns records with matching service ID', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + it('returns records with matching terms type', () => { + for (const record of records) { + expect(record.termsType).to.equal(TERMS_TYPE); + } + }); + + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); + + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); + + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByServiceAndTermsType('non_existent_service', 'Non Existent Terms'); + + expect(result).to.be.an('array').that.is.empty; + }); }); }); - describe('#findRecent', () => { - const OTHER_SERVICE = 'other_service'; - const OTHER_TERMS = 'Privacy Policy'; + describe('#count', () => { + let count; before(async function () { this.timeout(5000); @@ -595,133 +634,62 @@ describe('GitRepository', () => { serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: CONTENT, - fetchDate: FETCH_DATE_EARLIER, + fetchDate: FETCH_DATE, snapshotIds: [SNAPSHOT_ID], })); await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: `${CONTENT} - updated`, - fetchDate: FETCH_DATE, - snapshotIds: [SNAPSHOT_ID], - })); - await subject.save(new Version({ - serviceId: SERVICE_PROVIDER_ID, - termsType: OTHER_TERMS, - content: CONTENT, fetchDate: FETCH_DATE_LATER, snapshotIds: [SNAPSHOT_ID], })); await subject.save(new Version({ - serviceId: 
OTHER_SERVICE, + serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, - content: CONTENT, - fetchDate: FETCH_DATE_LATER, + content: `${CONTENT} - updated 2`, + isTechnicalUpgrade: true, + fetchDate: FETCH_DATE_EARLIER, snapshotIds: [SNAPSHOT_ID], })); - }); - - after(() => subject.removeAll()); - - context('without filters', () => { - let records; - - before(async () => { - records = await subject.findRecent(10); - }); - - it('returns records in descending chronological order', () => { - const dates = records.map(record => record.fetchDate.getTime()); - expect(dates).to.deep.equal([...dates].sort((a, b) => b - a)); - }); - - it('returns all matching records', () => { - expect(records).to.have.length(4); - }); - - it('does not load content eagerly', () => { - for (const record of records) { - expect(() => record.content).to.throw('Content not defined'); - } - }); - - it('exposes the metadata needed for feed entries', () => { - const [record] = records; - - expect(record.id).to.be.a('string'); - expect(record.serviceId).to.be.a('string'); - expect(record.termsType).to.be.a('string'); - expect(record.fetchDate).to.be.an.instanceof(Date); - expect(record.isFirstRecord).to.be.a('boolean'); - expect(record.isTechnicalUpgrade).to.be.a('boolean'); - }); - }); - - context('when limit is smaller than the number of matching records', () => { - let records; - - before(async () => { - records = await subject.findRecent(2); - }); - - it('returns at most limit records', () => { - expect(records).to.have.length(2); - }); - - it('returns the most recent records', () => { - for (const record of records) { - expect(record.fetchDate.getTime()).to.be.at.least(FETCH_DATE.getTime()); - } - }); + (count = await subject.count()); }); - context('when a serviceId filter is given', () => { - let records; - - before(async () => { - records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID }); - }); - - it('returns only records for that service', () => { - for (const record of 
records) { - expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); - } - }); + after(() => subject.removeAll()); - it('returns all records that match', () => { - expect(records).to.have.length(3); - }); + it('returns the proper count', () => { + expect(count).to.equal(3); }); - context('when both serviceId and termsType filters are given', () => { - let records; + context('with serviceId and termsType filters', () => { + it('returns count for specific service and terms type', async () => { + const filteredCount = await subject.count(SERVICE_PROVIDER_ID, TERMS_TYPE); - before(async () => { - records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE }); + expect(filteredCount).to.equal(3); }); - it('returns only records for that service and terms type', () => { - for (const record of records) { - expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); - expect(record.termsType).to.equal(TERMS_TYPE); - } - }); + it('returns zero for non-existent service', async () => { + const filteredCount = await subject.count('non-existent-service', TERMS_TYPE); - it('returns all records that match', () => { - expect(records).to.have.length(2); + expect(filteredCount).to.equal(0); }); }); - context('when filters match no record', () => { - let records; + context('with only serviceId filter', () => { + it('returns count for all terms types of a service', async () => { + // Add a version with different terms type + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: 'Different Terms', + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); - before(async () => { - records = await subject.findRecent(10, { serviceId: 'unknown' }); - }); + const filteredCount = await subject.count(SERVICE_PROVIDER_ID); - it('returns an empty array', () => { - expect(records).to.deep.equal([]); + expect(filteredCount).to.equal(4); // 3 from TERMS_TYPE + 1 from 'Different Terms' }); }); }); @@ -1243,8 +1211,8 @@ 
describe('GitRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); }); @@ -1604,8 +1572,8 @@ describe('GitRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal(expectedDates); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([...expectedDates].reverse()); }); }); diff --git a/src/archivist/recorder/repositories/interface.js b/src/archivist/recorder/repositories/interface.js index cf18e6a85..ae0ffafcc 100644 --- a/src/archivist/recorder/repositories/interface.js +++ b/src/archivist/recorder/repositories/interface.js @@ -70,35 +70,56 @@ class RepositoryInterface { } /** - * Find all records + * Find all records, in descending chronological order (newest first; opposite of #iterate) * For performance reasons, the content of the records will not be loaded by default. 
Use #loadRecordContent to load the content of individual records - * @see RepositoryInterface#loadRecordContent - * @returns {Promise>} Promise that will be resolved with an array of all records + * @see RepositoryInterface#loadRecordContent + * @see RepositoryInterface#iterate + * @param {object} [options] - Pagination options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @returns {Promise>} Promise that will be resolved with an array of records in descending chronological order */ - async findAll() { + async findAll(options = {}) { throw new Error(`#findAll method is not implemented in ${this.constructor.name}`); } /** - * Find the most recent records in the repository, optionally filtered by service ID and terms type - * For performance reasons, the content of the records will not be loaded. Use #loadRecordContent to load the content of individual records - * @see RepositoryInterface#loadRecordContent - * @param {number} limit - Maximum number of records to return - * @param {object} [filters] - Optional filters - * @param {string} [filters.serviceId] - Restrict results to this service ID - * @param {string} [filters.termsType] - Restrict results to this terms type - * @returns {Promise>} Promise that will be resolved with an array of records in descending chronological order + * Find all records for a specific service, in descending chronological order + * For performance reasons, the content of the records will not be loaded by default. 
Use #loadRecordContent to load the content of individual records + * @see RepositoryInterface#loadRecordContent + * @param {string} serviceId - Service ID of records to find + * @param {object} [options] - Pagination options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @returns {Promise>} Promise that will be resolved with an array of matching records in descending chronological order + */ + async findByService(serviceId, options = {}) { + throw new Error(`#findByService method is not implemented in ${this.constructor.name}`); + } + + /** + * Find all records for a specific service and terms type, in descending chronological order + * For performance reasons, the content of the records will not be loaded by default. Use #loadRecordContent to load the content of individual records + * @see RepositoryInterface#loadRecordContent + * @param {string} serviceId - Service ID of records to find + * @param {string} termsType - Terms type of records to find + * @param {object} [options] - Pagination options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @returns {Promise>} Promise that will be resolved with an array of matching records in descending chronological order */ - async findRecent(limit, filters) { - throw new Error(`#findRecent method is not implemented in ${this.constructor.name}`); + async findByServiceAndTermsType(serviceId, termsType, options = {}) { + throw new Error(`#findByServiceAndTermsType method is not implemented in ${this.constructor.name}`); } /** * Count the total number of records in the repository * For performance reasons, use this method rather than counting the number of entries returned by #findAll if you only need the size of a repository - * @returns {Promise} Promise that will be resolved with the total number of records + * @param {string} 
[serviceId] - Optional service ID to filter records + * @param {string} [termsType] - Optional terms type to filter records (requires serviceId) + * @returns {Promise} Promise that will be resolved with the total number of records */ - async count() { + async count(serviceId, termsType) { throw new Error(`#count method is not implemented in ${this.constructor.name}`); } diff --git a/src/archivist/recorder/repositories/mongo/index.js b/src/archivist/recorder/repositories/mongo/index.js index cd64940f9..fc1b860ca 100644 --- a/src/archivist/recorder/repositories/mongo/index.js +++ b/src/archivist/recorder/repositories/mongo/index.js @@ -88,29 +88,63 @@ export default class MongoRepository extends RepositoryInterface { return this.#toDomain(mongoDocument); } - async findAll() { - return Promise.all((await this.collection.find().project({ content: 0 }).sort({ fetchDate: 1 }).toArray()) + async findAll({ limit, offset } = {}) { + let query = this.collection.find().project({ content: 0 }).sort({ fetchDate: -1 }); + + if (offset !== undefined) { + query = query.skip(offset); + } + + if (limit !== undefined) { + query = query.limit(limit); + } + + return Promise.all((await query.toArray()) .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); } - async findRecent(limit, { serviceId, termsType } = {}) { - const query = {}; + async findByServiceAndTermsType(serviceId, termsType, { limit, offset } = {}) { + let query = this.collection.find({ serviceId, termsType }).project({ content: 0 }).sort({ fetchDate: -1 }); - if (serviceId !== undefined) { query.serviceId = serviceId; } - if (termsType !== undefined) { query.termsType = termsType; } + if (offset !== undefined) { + query = query.skip(offset); + } - const mongoDocuments = await this.collection - .find(query) - .project({ content: 0 }) - .sort({ fetchDate: -1 }) - .limit(limit) - .toArray(); + if (limit !== undefined) { + query = query.limit(limit); + } - return 
Promise.all(mongoDocuments.map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); + return Promise.all((await query.toArray()) + .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); } - count() { - return this.collection.countDocuments(); + async findByService(serviceId, { limit, offset } = {}) { + let query = this.collection.find({ serviceId }).project({ content: 0 }).sort({ fetchDate: -1 }); + + if (offset !== undefined) { + query = query.skip(offset); + } + + if (limit !== undefined) { + query = query.limit(limit); + } + + return Promise.all((await query.toArray()) + .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); + } + + count(serviceId, termsType) { + const filter = {}; + + if (serviceId) { + filter.serviceId = serviceId; + } + + if (termsType) { + filter.termsType = termsType; + } + + return this.collection.countDocuments(filter); } async* iterate() { diff --git a/src/archivist/recorder/repositories/mongo/index.test.js b/src/archivist/recorder/repositories/mongo/index.test.js index 880c9b2e3..e2123cdfd 100644 --- a/src/archivist/recorder/repositories/mongo/index.test.js +++ b/src/archivist/recorder/repositories/mongo/index.test.js @@ -629,184 +629,197 @@ describe('MongoRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); }); - describe('#count', () => { - let count; + describe('#findByServiceAndTermsType', () => { + const expectedIds = []; + let records; before(async () => { - await subject.save(new Version({ + const { id: id1 } = await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: CONTENT, fetchDate: 
FETCH_DATE, snapshotIds: [SNAPSHOT_ID], })); - await subject.save(new Version({ + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: `${CONTENT} - updated`, fetchDate: FETCH_DATE_LATER, snapshotIds: [SNAPSHOT_ID], })); - await subject.save(new Version({ - serviceId: SERVICE_PROVIDER_ID, - termsType: TERMS_TYPE, - content: `${CONTENT} - updated 2`, - isTechnicalUpgrade: true, - fetchDate: FETCH_DATE_EARLIER, - snapshotIds: [SNAPSHOT_ID], - })); - (count = await subject.count()); - }); - - after(() => subject.removeAll()); - - it('returns the proper count', () => { - expect(count).to.equal(3); - }); - }); - - describe('#findRecent', () => { - const OTHER_SERVICE = 'other_service'; - const OTHER_TERMS = 'Privacy Policy'; + expectedIds.push(id2); - before(async () => { await subject.save(new Version({ - serviceId: SERVICE_PROVIDER_ID, - termsType: TERMS_TYPE, - content: CONTENT, - fetchDate: FETCH_DATE_EARLIER, - snapshotIds: [SNAPSHOT_ID], - })); - await subject.save(new Version({ - serviceId: SERVICE_PROVIDER_ID, - termsType: TERMS_TYPE, - content: `${CONTENT} - updated`, + serviceId: 'other_service', + termsType: 'Privacy Policy', + content: `${CONTENT} - other`, fetchDate: FETCH_DATE, snapshotIds: [SNAPSHOT_ID], })); - await subject.save(new Version({ - serviceId: SERVICE_PROVIDER_ID, - termsType: OTHER_TERMS, - content: CONTENT, - fetchDate: FETCH_DATE_LATER, - snapshotIds: [SNAPSHOT_ID], - })); - await subject.save(new Version({ - serviceId: OTHER_SERVICE, - termsType: TERMS_TYPE, - content: CONTENT, - fetchDate: FETCH_DATE_LATER, - snapshotIds: [SNAPSHOT_ID], - })); + + (records = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE)); }); after(() => subject.removeAll()); - context('without filters', () => { - let records; + it('returns only matching records', () => { + expect(records.length).to.equal(2); + }); - before(async () => { - records = 
await subject.findRecent(10); - }); + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); - it('returns records in descending chronological order', () => { - const dates = records.map(record => record.fetchDate.getTime()); + it('returns records with matching service ID', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); - expect(dates).to.deep.equal([...dates].sort((a, b) => b - a)); - }); + it('returns records with matching terms type', () => { + for (const record of records) { + expect(record.termsType).to.equal(TERMS_TYPE); + } + }); - it('returns all matching records', () => { - expect(records).to.have.length(4); - }); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); - it('does not load content eagerly', () => { - for (const record of records) { - expect(() => record.content).to.throw('Content not defined'); - } - }); + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); - it('exposes the metadata needed for feed entries', () => { - const [record] = records; + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByServiceAndTermsType('non_existent_service', 'Non Existent Terms'); - expect(record.id).to.be.a('string'); - expect(record.serviceId).to.be.a('string'); - expect(record.termsType).to.be.a('string'); - expect(record.fetchDate).to.be.an.instanceof(Date); - expect(record.isFirstRecord).to.be.a('boolean'); - expect(record.isTechnicalUpgrade).to.be.a('boolean'); + expect(result).to.be.an('array').that.is.empty; }); }); + }); - context('when limit is smaller than the number of matching records', () => { - let records; + describe('#count', () => { + context('without 
filters', () => { + let count; before(async () => { - records = await subject.findRecent(2); - }); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated 2`, + isTechnicalUpgrade: true, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + })); - it('returns at most limit records', () => { - expect(records).to.have.length(2); + (count = await subject.count()); }); - it('returns the most recent records', () => { - for (const record of records) { - expect(record.fetchDate.getTime()).to.be.at.least(FETCH_DATE.getTime()); - } + after(() => subject.removeAll()); + + it('returns the proper count', () => { + expect(count).to.equal(3); }); }); - context('when a serviceId filter is given', () => { - let records; - + context('with serviceId and termsType filters', () => { before(async () => { - records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID }); - }); - - it('returns only records for that service', () => { - for (const record of records) { - expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); - } + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: 'other_service', + termsType: 'Privacy Policy', + content: 'Other content', + fetchDate: 
FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); }); - it('returns all records that match', () => { - expect(records).to.have.length(3); - }); - }); + after(() => subject.removeAll()); - context('when both serviceId and termsType filters are given', () => { - let records; + it('returns count for specific service and terms type', async () => { + const filteredCount = await subject.count(SERVICE_PROVIDER_ID, TERMS_TYPE); - before(async () => { - records = await subject.findRecent(10, { serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE }); + expect(filteredCount).to.equal(2); }); - it('returns only records for that service and terms type', () => { - for (const record of records) { - expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); - expect(record.termsType).to.equal(TERMS_TYPE); - } - }); + it('returns zero for non-existent service', async () => { + const filteredCount = await subject.count('non-existent-service', TERMS_TYPE); - it('returns all records that match', () => { - expect(records).to.have.length(2); + expect(filteredCount).to.equal(0); }); }); - context('when filters match no record', () => { - let records; - + context('with only serviceId filter', () => { before(async () => { - records = await subject.findRecent(10, { serviceId: 'unknown' }); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: 'Different Terms', + content: 'Different content', + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: 'other_service', + termsType: 'Privacy Policy', + content: 'Other content', + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); }); - it('returns an empty array', () => { - expect(records).to.deep.equal([]); + after(() => subject.removeAll()); + + it('returns count for all terms 
types of a service', async () => { + const filteredCount = await subject.count(SERVICE_PROVIDER_ID); + + expect(filteredCount).to.equal(2); }); }); }); @@ -1337,8 +1350,8 @@ describe('MongoRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); }); diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 5d63eb88f..97db22ebf 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -153,7 +153,7 @@ export default function feedRouter(services, versionsRepository, storageType) { const selfHref = `${baseUrl}/feed`; const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}`; - const versions = await versionsRepository.findRecent(getFeedLimit()); + const versions = await versionsRepository.findAll({ limit: getFeedLimit() }); const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); sendAtom(res, render(document)); @@ -196,7 +196,7 @@ export default function feedRouter(services, versionsRepository, storageType) { const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}`; - const versions = await versionsRepository.findRecent(getFeedLimit(), { serviceId: service.id }); + const versions = await versionsRepository.findByService(service.id, { limit: getFeedLimit() }); const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); return sendAtom(res, render(document)); @@ -251,7 +251,7 @@ export default function feedRouter(services, versionsRepository, storageType) { const selfHref = 
`${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; - const versions = await versionsRepository.findRecent(getFeedLimit(), { serviceId: service.id, termsType }); + const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: getFeedLimit() }); const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); return sendAtom(res, render(document)); From b0af4e6410933baa535fe2088d0ec2c343d97605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 4 May 2026 12:04:55 +0200 Subject: [PATCH 15/48] Improve code readability --- src/collection-api/routes/feed.js | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 97db22ebf..aaf184288 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -35,16 +35,29 @@ function buildAbsoluteBaseUrl(req) { } function classifyRecordType(version) { - if (version.isFirstRecord) { return RECORD_TYPES.firstRecord; } - if (version.isTechnicalUpgrade) { return RECORD_TYPES.technicalUpgrade; } - - return RECORD_TYPES.change; + switch (true) { + case version.isFirstRecord: + return RECORD_TYPES.firstRecord; + case version.isTechnicalUpgrade: + return RECORD_TYPES.technicalUpgrade; + default: + return RECORD_TYPES.change; + } } function buildEntryTitle(version) { - let prefix = COMMIT_MESSAGE_PREFIXES.update; - - if (version.isFirstRecord) { prefix = COMMIT_MESSAGE_PREFIXES.startTracking; } else if (version.isTechnicalUpgrade) { prefix = COMMIT_MESSAGE_PREFIXES.technicalUpgrade; } + let prefix; + + switch (true) { + case version.isFirstRecord: + prefix = COMMIT_MESSAGE_PREFIXES.startTracking; + break; + case version.isTechnicalUpgrade: + prefix =
COMMIT_MESSAGE_PREFIXES.technicalUpgrade; + break; + default: + prefix = COMMIT_MESSAGE_PREFIXES.update; + } return `${prefix} ${version.serviceId} ${version.termsType}`; } From 64aa0655d6c484545fe44922e3a7ec24b3d50add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 4 May 2026 12:06:18 +0200 Subject: [PATCH 16/48] Factorize feed response --- src/collection-api/routes/feed.js | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index aaf184288..907d5c30e 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -122,13 +122,11 @@ function buildFeedDocument({ collection, storageType, selfHref, feedId, versions }; } -function sendAtom(res, xml) { - res.set('Content-Type', 'application/atom+xml; charset=utf-8'); - res.status(200).send(xml); -} +function sendFeed(res, opts) { + const document = buildFeedDocument(opts); -function render(document) { - return js2xml(document, { compact: true, spaces: 2 }); + res.set('Content-Type', 'application/atom+xml; charset=utf-8'); + res.status(200).send(js2xml(document, { compact: true, spaces: 2 })); } /** @@ -167,9 +165,8 @@ export default function feedRouter(services, versionsRepository, storageType) { const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}`; const versions = await versionsRepository.findAll({ limit: getFeedLimit() }); - const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); - sendAtom(res, render(document)); + sendFeed(res, { collection, storageType, selfHref, feedId, versions, baseUrl }); }); /** @@ -210,9 +207,8 @@ export default function feedRouter(services, versionsRepository, storageType) { const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}`; const versions = await versionsRepository.findByService(service.id, { limit: getFeedLimit() }); - const document = 
buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); - return sendAtom(res, render(document)); + return sendFeed(res, { collection, storageType, selfHref, feedId, versions, baseUrl }); }); /** @@ -265,9 +261,8 @@ export default function feedRouter(services, versionsRepository, storageType) { const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: getFeedLimit() }); - const document = buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }); - return sendAtom(res, render(document)); + return sendFeed(res, { collection, storageType, selfHref, feedId, versions, baseUrl }); }); return router; From 1128a584187242b35eae345381a0a6e8d35e9073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 4 May 2026 13:54:23 +0200 Subject: [PATCH 17/48] Inject feed limit, author and tag authority --- config/default.json | 6 ++- src/collection-api/routes/feed.js | 62 ++++++++++---------------- src/collection-api/routes/feed.test.js | 6 ++- src/collection-api/routes/index.js | 3 +- 4 files changed, 34 insertions(+), 43 deletions(-) diff --git a/config/default.json b/config/default.json index 96309b6fb..c6cf33cad 100644 --- a/config/default.json +++ b/config/default.json @@ -50,7 +50,11 @@ }, "collection-api": { "feed": { - "limit": 100 + "limit": 100, + "author": { + "name": "Open Terms Archive Collection API" + }, + "tagAuthority": "opentermsarchive.org,2026" } } } diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 907d5c30e..6d9ce9b89 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -1,4 +1,3 @@ -import config from 'config'; import express from 'express'; import { js2xml } from 'xml-js'; @@ -6,30 +5,12 @@ import { getCollection } from '../../archivist/collection/index.js'; import { 
COMMIT_MESSAGE_PREFIXES } from '../../archivist/recorder/repositories/git/dataMapper.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; -const TAG_AUTHORITY = 'opentermsarchive.org,2026'; -const FEED_AUTHOR_NAME = 'OTA-Bot'; -const DEFAULT_LIMIT = 100; - -function getFeedLimit() { - if (config.has('@opentermsarchive/engine.collection-api.feed.limit')) { - return config.get('@opentermsarchive/engine.collection-api.feed.limit'); - } - - return DEFAULT_LIMIT; -} - const RECORD_TYPES = { firstRecord: 'First record', technicalUpgrade: 'Technical upgrade', change: 'Change', }; -const SCHEMES = { - service: `tag:${TAG_AUTHORITY}:scheme:service`, - termsType: `tag:${TAG_AUTHORITY}:scheme:terms-type`, - recordType: `tag:${TAG_AUTHORITY}:scheme:record-type`, -}; - function buildAbsoluteBaseUrl(req) { return `${req.protocol}://${req.get('host')}${req.baseUrl}`; } @@ -70,11 +51,11 @@ function buildVersionLink(baseUrl, version) { return `${baseUrl}/version/${encodedService}/${encodedTermsType}/${encodedDate}`; } -function buildEntryId(collection, storageType, version) { - return `tag:${TAG_AUTHORITY}:version:${collection.metadata?.id}:${storageType}:${version.id}`; +function buildEntryId(tagAuthority, storageType, collection, version) { + return `tag:${tagAuthority}:version:${collection.metadata?.id}:${storageType}:${version.id}`; } -function buildEntry(collection, storageType, baseUrl, version) { +function buildEntry(tagAuthority, storageType, baseUrl, collection, version) { const apiLink = buildVersionLink(baseUrl, version); const githubCommitLink = collection.metadata?.versions && `${collection.metadata.versions}/commit/${version.id}`; @@ -85,19 +66,19 @@ function buildEntry(collection, storageType, baseUrl, version) { } return { - id: { _text: buildEntryId(collection, storageType, version) }, + id: { _text: buildEntryId(tagAuthority, storageType, collection, version) }, link: links, title: { _text: buildEntryTitle(version) }, updated: { 
_text: version.fetchDate.toISOString() }, category: [ - { _attributes: { term: version.serviceId, scheme: SCHEMES.service } }, - { _attributes: { term: version.termsType, scheme: SCHEMES.termsType } }, - { _attributes: { term: classifyRecordType(version), scheme: SCHEMES.recordType } }, + { _attributes: { term: version.serviceId, scheme: `tag:${tagAuthority}:scheme:service` } }, + { _attributes: { term: version.termsType, scheme: `tag:${tagAuthority}:scheme:terms-type` } }, + { _attributes: { term: classifyRecordType(version), scheme: `tag:${tagAuthority}:scheme:record-type` } }, ], }; } -function buildFeedDocument({ collection, storageType, selfHref, feedId, versions, baseUrl }) { +function buildFeedDocument({ tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }) { const latestFetchDate = versions.length > 0 ? versions[0].fetchDate : new Date(); const feed = { @@ -107,14 +88,14 @@ function buildFeedDocument({ collection, storageType, selfHref, feedId, versions id: { _text: feedId }, updated: { _text: latestFetchDate.toISOString() }, link: { _attributes: { rel: 'self', href: selfHref } }, - author: { name: { _text: FEED_AUTHOR_NAME } }, + author: { name: { _text: feedAuthorName } }, }; if (collection.metadata?.logo) { feed.logo = { _text: collection.metadata.logo }; } - feed.entry = versions.map(version => buildEntry(collection, storageType, baseUrl, version)); + feed.entry = versions.map(version => buildEntry(tagAuthority, storageType, baseUrl, collection, version)); return { _declaration: { _attributes: { version: '1.0', encoding: 'utf-8' } }, @@ -133,13 +114,16 @@ function sendFeed(res, opts) { * @param {object} services The services to be exposed by the API * @param {object} versionsRepository The versions repository instance * @param {string} storageType The storage type identifier of the versions repository + * @param {number} feedLimit Maximum number of entries returned by feed endpoints + * @param {string} 
feedAuthorName Name used for the Atom feed-level author element + * @param {string} tagAuthority Tag URI authority used to mint feed and entry IDs (RFC 4151) * @returns {express.Router} The router instance * @swagger * tags: * name: Feeds * description: Atom feeds of version changes */ -export default function feedRouter(services, versionsRepository, storageType) { +export default function feedRouter(services, versionsRepository, storageType, feedLimit, feedAuthorName, tagAuthority) { const router = express.Router(); /** @@ -162,11 +146,11 @@ export default function feedRouter(services, versionsRepository, storageType) { const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed`; - const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}`; + const feedId = `tag:${tagAuthority}:feed:${collection.metadata?.id}`; - const versions = await versionsRepository.findAll({ limit: getFeedLimit() }); + const versions = await versionsRepository.findAll({ limit: feedLimit }); - sendFeed(res, { collection, storageType, selfHref, feedId, versions, baseUrl }); + sendFeed(res, { tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -204,11 +188,11 @@ export default function feedRouter(services, versionsRepository, storageType) { const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; - const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}`; + const feedId = `tag:${tagAuthority}:feed:${collection.metadata?.id}:${service.id}`; - const versions = await versionsRepository.findByService(service.id, { limit: getFeedLimit() }); + const versions = await versionsRepository.findByService(service.id, { limit: feedLimit }); - return sendFeed(res, { collection, storageType, selfHref, feedId, versions, baseUrl }); + return sendFeed(res, { 
tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -258,11 +242,11 @@ export default function feedRouter(services, versionsRepository, storageType) { const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; - const feedId = `tag:${TAG_AUTHORITY}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; + const feedId = `tag:${tagAuthority}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; - const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: getFeedLimit() }); + const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit }); - return sendFeed(res, { collection, storageType, selfHref, feedId, versions, baseUrl }); + return sendFeed(res, { tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }); }); return router; diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 8c60f3493..bd2e608b3 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -69,8 +69,10 @@ describe('Feed API', () => { expect(selfHrefMatch[1]).to.match(new RegExp(`${basePath}/v1/feed$`)); }); - it('has an author named OTA-Bot', () => { - expect(response.text).to.match(/[\s\S]*OTA-Bot<\/name>[\s\S]*<\/author>/); + it('has an author matching the configured feed author name', () => { + const expectedName = config.get('@opentermsarchive/engine.collection-api.feed.author.name'); + + expect(response.text).to.match(new RegExp(`[\\s\\S]*${expectedName}[\\s\\S]*`)); }); it('has a logo matching the collection logo', () => { diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index c24b8f5dd..b82274ae9 100644 --- a/src/collection-api/routes/index.js +++ 
b/src/collection-api/routes/index.js @@ -38,11 +38,12 @@ export default async function apiRouter(basePath) { const collection = await getCollection(); const versionsStorageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); const versionsRepository = await RepositoryFactory.create(versionsStorageConfig).initialize(); + const feedConfig = config.get('@opentermsarchive/engine.collection-api.feed'); router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); router.use(versionsRouter(versionsRepository)); - router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type)); + router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type, feedConfig.limit, feedConfig.author.name, feedConfig.tagAuthority)); return router; } From 6da0199464f25b647d1e45b5575bea76059d4296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 4 May 2026 14:02:54 +0200 Subject: [PATCH 18/48] Factor out buildFeedId helper --- src/collection-api/routes/feed.js | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 6d9ce9b89..4d9b2fac4 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -55,6 +55,10 @@ function buildEntryId(tagAuthority, storageType, collection, version) { return `tag:${tagAuthority}:version:${collection.metadata?.id}:${storageType}:${version.id}`; } +function buildFeedId(tagAuthority, collection, ...suffix) { + return [ `tag:${tagAuthority}:feed`, collection.metadata?.id, ...suffix ].join(':'); +} + function buildEntry(tagAuthority, storageType, baseUrl, collection, version) { const apiLink = buildVersionLink(baseUrl, version); const githubCommitLink = collection.metadata?.versions && `${collection.metadata.versions}/commit/${version.id}`; @@ -146,7 +150,7 @@ export default function feedRouter(services, versionsRepository, storageType, 
fe const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed`; - const feedId = `tag:${tagAuthority}:feed:${collection.metadata?.id}`; + const feedId = buildFeedId(tagAuthority, collection); const versions = await versionsRepository.findAll({ limit: feedLimit }); @@ -188,7 +192,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; - const feedId = `tag:${tagAuthority}:feed:${collection.metadata?.id}:${service.id}`; + const feedId = buildFeedId(tagAuthority, collection, service.id); const versions = await versionsRepository.findByService(service.id, { limit: feedLimit }); @@ -242,7 +246,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; - const feedId = `tag:${tagAuthority}:feed:${collection.metadata?.id}:${service.id}:${termsType}`; + const feedId = buildFeedId(tagAuthority, collection, service.id, termsType); const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit }); From 68cbcff3334c9a1d76dba29a6fe75082e7021004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 4 May 2026 14:04:35 +0200 Subject: [PATCH 19/48] Factor out buildSchemes helper --- src/collection-api/routes/feed.js | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 4d9b2fac4..46d19013c 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -59,9 +59,18 @@ function buildFeedId(tagAuthority, collection, ...suffix) { return 
[ `tag:${tagAuthority}:feed`, collection.metadata?.id, ...suffix ].join(':'); } +function buildSchemes(tagAuthority) { + return { + service: `tag:${tagAuthority}:scheme:service`, + termsType: `tag:${tagAuthority}:scheme:terms-type`, + recordType: `tag:${tagAuthority}:scheme:record-type`, + }; +} + function buildEntry(tagAuthority, storageType, baseUrl, collection, version) { const apiLink = buildVersionLink(baseUrl, version); const githubCommitLink = collection.metadata?.versions && `${collection.metadata.versions}/commit/${version.id}`; + const schemes = buildSchemes(tagAuthority); const links = [{ _attributes: { rel: 'alternate', type: 'text/html', href: githubCommitLink || apiLink } }]; @@ -75,9 +84,9 @@ function buildEntry(tagAuthority, storageType, baseUrl, collection, version) { title: { _text: buildEntryTitle(version) }, updated: { _text: version.fetchDate.toISOString() }, category: [ - { _attributes: { term: version.serviceId, scheme: `tag:${tagAuthority}:scheme:service` } }, - { _attributes: { term: version.termsType, scheme: `tag:${tagAuthority}:scheme:terms-type` } }, - { _attributes: { term: classifyRecordType(version), scheme: `tag:${tagAuthority}:scheme:record-type` } }, + { _attributes: { term: version.serviceId, scheme: schemes.service } }, + { _attributes: { term: version.termsType, scheme: schemes.termsType } }, + { _attributes: { term: classifyRecordType(version), scheme: schemes.recordType } }, ], }; } From 7ae4e8d454431268b76bc45b14f981eb916c505e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 4 May 2026 15:54:16 +0200 Subject: [PATCH 20/48] Hardcode feed tag authority and author name --- config/default.json | 6 +--- src/collection-api/routes/feed.js | 47 +++++++++++++------------- src/collection-api/routes/feed.test.js | 6 ++-- src/collection-api/routes/index.js | 2 +- 4 files changed, 28 insertions(+), 33 deletions(-) diff --git a/config/default.json b/config/default.json index c6cf33cad..96309b6fb 100644 --- 
a/config/default.json +++ b/config/default.json @@ -50,11 +50,7 @@ }, "collection-api": { "feed": { - "limit": 100, - "author": { - "name": "Open Terms Archive Collection API" - }, - "tagAuthority": "opentermsarchive.org,2026" + "limit": 100 } } } diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 46d19013c..eb3caa792 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -11,6 +11,9 @@ const RECORD_TYPES = { change: 'Change', }; +const TAG_AUTHORITY = 'opentermsarchive.org,2026'; +const FEED_AUTHOR_NAME = 'Open Terms Archive engine'; + function buildAbsoluteBaseUrl(req) { return `${req.protocol}://${req.get('host')}${req.baseUrl}`; } @@ -51,26 +54,26 @@ function buildVersionLink(baseUrl, version) { return `${baseUrl}/version/${encodedService}/${encodedTermsType}/${encodedDate}`; } -function buildEntryId(tagAuthority, storageType, collection, version) { - return `tag:${tagAuthority}:version:${collection.metadata?.id}:${storageType}:${version.id}`; +function buildEntryId(storageType, collection, version) { + return `tag:${TAG_AUTHORITY}:version:${collection.metadata?.id}:${storageType}:${version.id}`; } -function buildFeedId(tagAuthority, collection, ...suffix) { - return [ `tag:${tagAuthority}:feed`, collection.metadata?.id, ...suffix ].join(':'); +function buildFeedId(collection, ...suffix) { + return [ `tag:${TAG_AUTHORITY}:feed`, collection.metadata?.id, ...suffix ].join(':'); } -function buildSchemes(tagAuthority) { +function buildSchemes() { return { - service: `tag:${tagAuthority}:scheme:service`, - termsType: `tag:${tagAuthority}:scheme:terms-type`, - recordType: `tag:${tagAuthority}:scheme:record-type`, + service: `tag:${TAG_AUTHORITY}:scheme:service`, + termsType: `tag:${TAG_AUTHORITY}:scheme:terms-type`, + recordType: `tag:${TAG_AUTHORITY}:scheme:record-type`, }; } -function buildEntry(tagAuthority, storageType, baseUrl, collection, version) { +function buildEntry(storageType, 
baseUrl, collection, version) { const apiLink = buildVersionLink(baseUrl, version); const githubCommitLink = collection.metadata?.versions && `${collection.metadata.versions}/commit/${version.id}`; - const schemes = buildSchemes(tagAuthority); + const schemes = buildSchemes(); const links = [{ _attributes: { rel: 'alternate', type: 'text/html', href: githubCommitLink || apiLink } }]; @@ -79,7 +82,7 @@ function buildEntry(tagAuthority, storageType, baseUrl, collection, version) { } return { - id: { _text: buildEntryId(tagAuthority, storageType, collection, version) }, + id: { _text: buildEntryId(storageType, collection, version) }, link: links, title: { _text: buildEntryTitle(version) }, updated: { _text: version.fetchDate.toISOString() }, @@ -91,7 +94,7 @@ function buildEntry(tagAuthority, storageType, baseUrl, collection, version) { }; } -function buildFeedDocument({ tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }) { +function buildFeedDocument({ storageType, collection, selfHref, feedId, versions, baseUrl }) { const latestFetchDate = versions.length > 0 ? 
versions[0].fetchDate : new Date(); const feed = { @@ -101,14 +104,14 @@ function buildFeedDocument({ tagAuthority, storageType, feedAuthorName, collecti id: { _text: feedId }, updated: { _text: latestFetchDate.toISOString() }, link: { _attributes: { rel: 'self', href: selfHref } }, - author: { name: { _text: feedAuthorName } }, + author: { name: { _text: FEED_AUTHOR_NAME } }, }; if (collection.metadata?.logo) { feed.logo = { _text: collection.metadata.logo }; } - feed.entry = versions.map(version => buildEntry(tagAuthority, storageType, baseUrl, collection, version)); + feed.entry = versions.map(version => buildEntry(storageType, baseUrl, collection, version)); return { _declaration: { _attributes: { version: '1.0', encoding: 'utf-8' } }, @@ -128,15 +131,13 @@ function sendFeed(res, opts) { * @param {object} versionsRepository The versions repository instance * @param {string} storageType The storage type identifier of the versions repository * @param {number} feedLimit Maximum number of entries returned by feed endpoints - * @param {string} feedAuthorName Name used for the Atom feed-level author element - * @param {string} tagAuthority Tag URI authority used to mint feed and entry IDs (RFC 4151) * @returns {express.Router} The router instance * @swagger * tags: * name: Feeds * description: Atom feeds of version changes */ -export default function feedRouter(services, versionsRepository, storageType, feedLimit, feedAuthorName, tagAuthority) { +export default function feedRouter(services, versionsRepository, storageType, feedLimit) { const router = express.Router(); /** @@ -159,11 +160,11 @@ export default function feedRouter(services, versionsRepository, storageType, fe const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed`; - const feedId = buildFeedId(tagAuthority, collection); + const feedId = buildFeedId(collection); const versions = await versionsRepository.findAll({ limit: feedLimit }); - 
sendFeed(res, { tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }); + sendFeed(res, { storageType, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -201,11 +202,11 @@ export default function feedRouter(services, versionsRepository, storageType, fe const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; - const feedId = buildFeedId(tagAuthority, collection, service.id); + const feedId = buildFeedId(collection, service.id); const versions = await versionsRepository.findByService(service.id, { limit: feedLimit }); - return sendFeed(res, { tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }); + return sendFeed(res, { storageType, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -255,11 +256,11 @@ export default function feedRouter(services, versionsRepository, storageType, fe const collection = await getCollection(); const baseUrl = buildAbsoluteBaseUrl(req); const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; - const feedId = buildFeedId(tagAuthority, collection, service.id, termsType); + const feedId = buildFeedId(collection, service.id, termsType); const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit }); - return sendFeed(res, { tagAuthority, storageType, feedAuthorName, collection, selfHref, feedId, versions, baseUrl }); + return sendFeed(res, { storageType, collection, selfHref, feedId, versions, baseUrl }); }); return router; diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index bd2e608b3..753f5e2ad 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -69,10 +69,8 @@ describe('Feed API', () => { expect(selfHrefMatch[1]).to.match(new 
RegExp(`${basePath}/v1/feed$`)); }); - it('has an author matching the configured feed author name', () => { - const expectedName = config.get('@opentermsarchive/engine.collection-api.feed.author.name'); - - expect(response.text).to.match(new RegExp(`[\\s\\S]*${expectedName}[\\s\\S]*`)); + it('has an author matching the feed author name', () => { + expect(response.text).to.match(/[\s\S]*Open Terms Archive engine<\/name>[\s\S]*<\/author>/); }); it('has a logo matching the collection logo', () => { diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index b82274ae9..16df0bbb8 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -43,7 +43,7 @@ export default async function apiRouter(basePath) { router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); router.use(versionsRouter(versionsRepository)); - router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type, feedConfig.limit, feedConfig.author.name, feedConfig.tagAuthority)); + router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type, feedConfig.limit)); return router; } From e26b940cc25d2eb71948690a6f1c110c5b15b5d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 4 May 2026 17:21:26 +0200 Subject: [PATCH 21/48] Replace feed dual links with URL template --- src/collection-api/routes/feed.js | 36 +++++------- src/collection-api/routes/feed.test.js | 77 +++++++++++++++++++++++--- src/collection-api/routes/index.js | 2 +- 3 files changed, 84 insertions(+), 31 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index eb3caa792..e4ec32801 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -70,20 +70,13 @@ function buildSchemes() { }; } -function buildEntry(storageType, baseUrl, collection, version) { - const apiLink = buildVersionLink(baseUrl, version); - const 
githubCommitLink = collection.metadata?.versions && `${collection.metadata.versions}/commit/${version.id}`; +function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, version) { + const href = versionUrlTemplate?.replace('%VERSION_ID', version.id) ?? buildVersionLink(baseUrl, version); const schemes = buildSchemes(); - const links = [{ _attributes: { rel: 'alternate', type: 'text/html', href: githubCommitLink || apiLink } }]; - - if (githubCommitLink) { - links.push({ _attributes: { rel: 'related', type: 'text/html', href: apiLink } }); - } - return { id: { _text: buildEntryId(storageType, collection, version) }, - link: links, + link: { _attributes: { rel: 'alternate', type: 'text/html', href } }, title: { _text: buildEntryTitle(version) }, updated: { _text: version.fetchDate.toISOString() }, category: [ @@ -94,7 +87,7 @@ function buildEntry(storageType, baseUrl, collection, version) { }; } -function buildFeedDocument({ storageType, collection, selfHref, feedId, versions, baseUrl }) { +function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }) { const latestFetchDate = versions.length > 0 ? 
versions[0].fetchDate : new Date(); const feed = { @@ -111,7 +104,7 @@ function buildFeedDocument({ storageType, collection, selfHref, feedId, versions feed.logo = { _text: collection.metadata.logo }; } - feed.entry = versions.map(version => buildEntry(storageType, baseUrl, collection, version)); + feed.entry = versions.map(version => buildEntry(storageType, versionUrlTemplate, baseUrl, collection, version)); return { _declaration: { _attributes: { version: '1.0', encoding: 'utf-8' } }, @@ -127,17 +120,18 @@ function sendFeed(res, opts) { } /** - * @param {object} services The services to be exposed by the API - * @param {object} versionsRepository The versions repository instance - * @param {string} storageType The storage type identifier of the versions repository - * @param {number} feedLimit Maximum number of entries returned by feed endpoints - * @returns {express.Router} The router instance + * @param {object} services The services to be exposed by the API + * @param {object} versionsRepository The versions repository instance + * @param {string} storageType The storage type identifier of the versions repository + * @param {number} feedLimit Maximum number of entries returned by feed endpoints + * @param {string} [versionUrlTemplate] Optional URL template with %VERSION_ID placeholder; when set, replaces the API link as each entry's alternate href + * @returns {express.Router} The router instance * @swagger * tags: * name: Feeds * description: Atom feeds of version changes */ -export default function feedRouter(services, versionsRepository, storageType, feedLimit) { +export default function feedRouter(services, versionsRepository, storageType, feedLimit, versionUrlTemplate) { const router = express.Router(); /** @@ -164,7 +158,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const versions = await versionsRepository.findAll({ limit: feedLimit }); - sendFeed(res, { storageType, collection, selfHref, feedId, versions, baseUrl 
}); + sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -206,7 +200,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const versions = await versionsRepository.findByService(service.id, { limit: feedLimit }); - return sendFeed(res, { storageType, collection, selfHref, feedId, versions, baseUrl }); + return sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -260,7 +254,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit }); - return sendFeed(res, { storageType, collection, selfHref, feedId, versions, baseUrl }); + return sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); return router; diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 753f5e2ad..5722128bf 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -1,13 +1,17 @@ import { expect } from 'chai'; import config from 'config'; +import express from 'express'; import supertest from 'supertest'; import { getCollection } from '../../archivist/collection/index.js'; import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; +import * as Services from '../../archivist/services/index.js'; import Version from '../../archivist/recorder/version.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; import app from '../server.js'; +import feedRouter from './feed.js'; + const basePath = config.get('@opentermsarchive/engine.collection-api.basePath'); const request = supertest(app); const storageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); @@ -159,18 +163,17 @@ describe('Feed API', () => { 
expect(firstEntry).to.include(`${expected}`); }); - it('has an alternate link to the GitHub commit', async () => { - const collection = await getCollection(); + it('has an alternate link to the API version endpoint', () => { const href = firstEntry.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + const expectedPathFragment = `/version/${encodeURIComponent('service-2')}/${encodeURIComponent('Privacy Policy')}/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE_UPGRADE))}`; - expect(href).to.equal(`${collection.metadata.versions}/commit/${savedVersions.technicalUpgradeRecord.id}`); + expect(href).to.include(expectedPathFragment); }); - it('has a related link to the version API endpoint', () => { - const href = firstEntry.match(/]*rel="related"[^>]*href="([^"]+)"/)[1]; - const expectedPathFragment = `/version/${encodeURIComponent('service-2')}/${encodeURIComponent('Privacy Policy')}/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE_UPGRADE))}`; + it('has exactly one link per entry', () => { + const links = firstEntry.match(/]*\/>/g) || []; - expect(href).to.include(expectedPathFragment); + expect(links).to.have.length(1); }); it('has a type="text/html" on the alternate link', () => { @@ -406,8 +409,9 @@ describe('Feed API', () => { expect(href).to.not.include('Service B!'); }); - it('URL-encodes spaces and special characters in entry related links', () => { - const href = response.text.match(/]*rel="related"[^>]*href="([^"]+)"/)[1]; + it('URL-encodes spaces and special characters in entry alternate links', () => { + const entry = response.text.match(/[\s\S]*?<\/entry>/)[0]; + const href = entry.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; expect(href).to.include('Service%20B!'); expect(href).to.include('Privacy%20Policy'); @@ -506,4 +510,59 @@ describe('Feed API', () => { }); }); }); + + describe('entry links with versionUrlTemplate configured', () => { + const TEMPLATE = 'https://example.test/v/%VERSION_ID'; + + let response; + let 
repository; + let savedVersion; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + + savedVersion = await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'content', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + + const services = await Services.load(); + const templatedApp = express(); + + templatedApp.use(feedRouter(services, repository, storageConfig.type, 10, TEMPLATE)); + + response = await supertest(templatedApp).get('/feed'); + }); + + after(() => repository.removeAll()); + + it('interpolates %VERSION_ID into the alternate link', () => { + const href = response.text.match(/[\s\S]*?]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.equal(`https://example.test/v/${savedVersion.id}`); + }); + + it('does not point to the API for entry links', () => { + const entries = response.text.match(/[\s\S]*?<\/entry>/g) || []; + + for (const entry of entries) { + expect(entry).to.not.match(/]*href="[^"]*\/version\//); + } + }); + + it('still emits exactly one link per entry', () => { + const entries = response.text.match(/[\s\S]*?<\/entry>/g) || []; + + for (const entry of entries) { + const links = entry.match(/]*\/>/g) || []; + + expect(links).to.have.length(1); + } + }); + }); }); diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index 16df0bbb8..34e720470 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -43,7 +43,7 @@ export default async function apiRouter(basePath) { router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); router.use(versionsRouter(versionsRepository)); - router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type, feedConfig.limit)); + router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type, feedConfig.limit, 
feedConfig.versionUrlTemplate)); return router; } From f653d087a278f8b9babf5ba277172b949f388b9b Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:21:49 +0200 Subject: [PATCH 22/48] Lint --- src/collection-api/routes/feed.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 5722128bf..86b559caa 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -5,8 +5,8 @@ import supertest from 'supertest'; import { getCollection } from '../../archivist/collection/index.js'; import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; -import * as Services from '../../archivist/services/index.js'; import Version from '../../archivist/recorder/version.js'; +import * as Services from '../../archivist/services/index.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; import app from '../server.js'; From 7d27aa4b82d5f6548e54416863146a2cac2545ae Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:01:58 +0200 Subject: [PATCH 23/48] Add includeTechnicalUpgrades option in interface --- .../recorder/repositories/interface.js | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/archivist/recorder/repositories/interface.js b/src/archivist/recorder/repositories/interface.js index ae0ffafcc..00e8bfce5 100644 --- a/src/archivist/recorder/repositories/interface.js +++ b/src/archivist/recorder/repositories/interface.js @@ -74,10 +74,11 @@ class RepositoryInterface { * For performance reasons, the content of the records will not be loaded by default. 
Use #loadRecordContent to load the content of individual records * @see RepositoryInterface#loadRecordContent * @see RepositoryInterface#iterate - * @param {object} [options] - Pagination options - * @param {number} [options.limit] - Maximum number of records to return - * @param {number} [options.offset] - Number of records to skip - * @returns {Promise>} Promise that will be resolved with an array of records in descending chronological order + * @param {object} [options] - Query options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @param {boolean} [options.includeTechnicalUpgrades] - When false, exclude technical upgrade records (re-renders of existing snapshots) and only return records that represent actual content changes. Default: true + * @returns {Promise>} Promise that will be resolved with an array of records in descending chronological order */ async findAll(options = {}) { throw new Error(`#findAll method is not implemented in ${this.constructor.name}`); @@ -87,11 +88,12 @@ class RepositoryInterface { * Find all records for a specific service, in descending chronological order * For performance reasons, the content of the records will not be loaded by default. 
Use #loadRecordContent to load the content of individual records * @see RepositoryInterface#loadRecordContent - * @param {string} serviceId - Service ID of records to find - * @param {object} [options] - Pagination options - * @param {number} [options.limit] - Maximum number of records to return - * @param {number} [options.offset] - Number of records to skip - * @returns {Promise>} Promise that will be resolved with an array of matching records in descending chronological order + * @param {string} serviceId - Service ID of records to find + * @param {object} [options] - Query options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @param {boolean} [options.includeTechnicalUpgrades] - When false, exclude technical upgrade records (re-renders of existing snapshots) and only return records that represent actual content changes. Default: true + * @returns {Promise>} Promise that will be resolved with an array of matching records in descending chronological order */ async findByService(serviceId, options = {}) { throw new Error(`#findByService method is not implemented in ${this.constructor.name}`); @@ -101,12 +103,13 @@ class RepositoryInterface { * Find all records for a specific service and terms type, in descending chronological order * For performance reasons, the content of the records will not be loaded by default. 
Use #loadRecordContent to load the content of individual records * @see RepositoryInterface#loadRecordContent - * @param {string} serviceId - Service ID of records to find - * @param {string} termsType - Terms type of records to find - * @param {object} [options] - Pagination options - * @param {number} [options.limit] - Maximum number of records to return - * @param {number} [options.offset] - Number of records to skip - * @returns {Promise>} Promise that will be resolved with an array of matching records in descending chronological order + * @param {string} serviceId - Service ID of records to find + * @param {string} termsType - Terms type of records to find + * @param {object} [options] - Query options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @param {boolean} [options.includeTechnicalUpgrades] - When false, exclude technical upgrade records (re-renders of existing snapshots) and only return records that represent actual content changes. Default: true + * @returns {Promise>} Promise that will be resolved with an array of matching records in descending chronological order */ async findByServiceAndTermsType(serviceId, termsType, options = {}) { throw new Error(`#findByServiceAndTermsType method is not implemented in ${this.constructor.name}`); From a37d00ebcd3bfda1534ab59760abb8bdcb146b77 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:07:06 +0200 Subject: [PATCH 24/48] Implement technical upgrades filtering in Git Filter commits by their message prefix at git level using the REAL_CHANGE_COMMIT_MESSAGE_PREFIXES subset when the option is disabled. This avoids returning technical upgrade records (which are re-renders of existing snapshots) while keeping startTracking and update records. 
--- .../recorder/repositories/git/dataMapper.js | 8 +++ .../recorder/repositories/git/index.js | 19 ++++--- .../recorder/repositories/git/index.test.js | 50 +++++++++++++++++++ 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/src/archivist/recorder/repositories/git/dataMapper.js b/src/archivist/recorder/repositories/git/dataMapper.js index 8fcd3fafb..6d60ae9e9 100644 --- a/src/archivist/recorder/repositories/git/dataMapper.js +++ b/src/archivist/recorder/repositories/git/dataMapper.js @@ -14,6 +14,14 @@ export const COMMIT_MESSAGE_PREFIXES = { deprecated_update: 'Update', }; +// Subset of COMMIT_MESSAGE_PREFIXES that exclude technical upgrades (re-renders of existing snapshots with updated extraction rules) and only represent content changes detected at the service source +export const REAL_CHANGE_COMMIT_MESSAGE_PREFIXES = { + startTracking: COMMIT_MESSAGE_PREFIXES.startTracking, + update: COMMIT_MESSAGE_PREFIXES.update, + deprecated_startTracking: COMMIT_MESSAGE_PREFIXES.deprecated_startTracking, + deprecated_update: COMMIT_MESSAGE_PREFIXES.deprecated_update, +}; + export const TERMS_TYPE_AND_DOCUMENT_ID_SEPARATOR = ' #'; export const SNAPSHOT_ID_MARKER = '%SNAPSHOT_ID'; const SINGLE_SOURCE_DOCUMENT_PREFIX = 'This version was recorded after extracting from snapshot'; diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js index 50da196ba..f3721edb5 100644 --- a/src/archivist/recorder/repositories/git/index.js +++ b/src/archivist/recorder/repositories/git/index.js @@ -88,20 +88,20 @@ export default class GitRepository extends RepositoryInterface { return this.#toDomain(commit); } - async findAll({ limit, offset } = {}) { - return Promise.all((await this.#getCommits({ limit, offset })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + async findAll({ limit, offset, includeTechnicalUpgrades = true } = {}) { + return Promise.all((await this.#getCommits({ limit, offset, 
includeTechnicalUpgrades })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); } - async findByService(serviceId, { limit, offset } = {}) { + async findByService(serviceId, { limit, offset, includeTechnicalUpgrades = true } = {}) { const pathPattern = DataMapper.generateFilePath(serviceId); - return Promise.all((await this.#getCommits({ pathFilter: pathPattern, limit, offset })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + return Promise.all((await this.#getCommits({ pathFilter: pathPattern, limit, offset, includeTechnicalUpgrades })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); } - async findByServiceAndTermsType(serviceId, termsType, { limit, offset } = {}) { + async findByServiceAndTermsType(serviceId, termsType, { limit, offset, includeTechnicalUpgrades = true } = {}) { const pathPattern = DataMapper.generateFilePath(serviceId, termsType); - return Promise.all((await this.#getCommits({ pathFilter: pathPattern, limit, offset })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + return Promise.all((await this.#getCommits({ pathFilter: pathPattern, limit, offset, includeTechnicalUpgrades })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); } async count(serviceId, termsType) { @@ -160,8 +160,11 @@ export default class GitRepository extends RepositoryInterface { record.content = pdfBuffer; } - async #getCommits({ pathFilter, reverse = false, limit, offset } = {}) { - const grepOptions = Object.values(DataMapper.COMMIT_MESSAGE_PREFIXES).flatMap(prefix => [ '--grep', prefix ]); + async #getCommits({ pathFilter, reverse = false, limit, offset, includeTechnicalUpgrades = true } = {}) { + const prefixes = includeTechnicalUpgrades + ? DataMapper.COMMIT_MESSAGE_PREFIXES + : DataMapper.REAL_CHANGE_COMMIT_MESSAGE_PREFIXES; + const grepOptions = Object.values(prefixes).flatMap(prefix => [ '--grep', prefix ]); const pathOptions = pathFilter ? 
[ '--', pathFilter ] : [ '--', '*/*' ]; // Exclude root directory files by only matching files in subdirectories diff --git a/src/archivist/recorder/repositories/git/index.test.js b/src/archivist/recorder/repositories/git/index.test.js index 6ef24175f..9428c64d3 100644 --- a/src/archivist/recorder/repositories/git/index.test.js +++ b/src/archivist/recorder/repositories/git/index.test.js @@ -543,6 +543,28 @@ describe('GitRepository', () => { it('returns records in descending order', () => { expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + + before(async () => { + filteredRecords = await subject.findAll({ includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.length).to.equal(2); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + + it('returns the expected records in descending order', () => { + expect(filteredRecords.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); + }); }); describe('#findByServiceAndTermsType', () => { @@ -622,6 +644,34 @@ describe('GitRepository', () => { expect(result).to.be.an('array').that.is.empty; }); }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE, { includeTechnicalUpgrades: false }); + }); + + 
it('excludes technical upgrade records', () => { + expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + }); }); describe('#count', () => { From b132cd4e705b8c8fae90f18a3cf4310296e6c6e9 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:08:36 +0200 Subject: [PATCH 25/48] Implement technical upgrades filtering in Mongo Filter documents at query time using the existing isTechnicalUpgrade field when the option is disabled, in findAll, findByService and findByServiceAndTermsType. --- .../recorder/repositories/mongo/index.js | 25 +++++++--- .../recorder/repositories/mongo/index.test.js | 50 +++++++++++++++++++ 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/archivist/recorder/repositories/mongo/index.js b/src/archivist/recorder/repositories/mongo/index.js index fc1b860ca..649b22981 100644 --- a/src/archivist/recorder/repositories/mongo/index.js +++ b/src/archivist/recorder/repositories/mongo/index.js @@ -88,8 +88,9 @@ export default class MongoRepository extends RepositoryInterface { return this.#toDomain(mongoDocument); } - async findAll({ limit, offset } = {}) { - let query = this.collection.find().project({ content: 0 }).sort({ fetchDate: -1 }); + async findAll({ limit, offset, includeTechnicalUpgrades = true } = {}) { + const filter = includeTechnicalUpgrades ? 
{} : { isTechnicalUpgrade: { $ne: true } }; + let query = this.collection.find(filter).project({ content: 0 }).sort({ fetchDate: -1 }); if (offset !== undefined) { query = query.skip(offset); @@ -103,8 +104,14 @@ export default class MongoRepository extends RepositoryInterface { .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); } - async findByServiceAndTermsType(serviceId, termsType, { limit, offset } = {}) { - let query = this.collection.find({ serviceId, termsType }).project({ content: 0 }).sort({ fetchDate: -1 }); + async findByServiceAndTermsType(serviceId, termsType, { limit, offset, includeTechnicalUpgrades = true } = {}) { + const filter = { serviceId, termsType }; + + if (!includeTechnicalUpgrades) { + filter.isTechnicalUpgrade = { $ne: true }; + } + + let query = this.collection.find(filter).project({ content: 0 }).sort({ fetchDate: -1 }); if (offset !== undefined) { query = query.skip(offset); @@ -118,8 +125,14 @@ export default class MongoRepository extends RepositoryInterface { .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); } - async findByService(serviceId, { limit, offset } = {}) { - let query = this.collection.find({ serviceId }).project({ content: 0 }).sort({ fetchDate: -1 }); + async findByService(serviceId, { limit, offset, includeTechnicalUpgrades = true } = {}) { + const filter = { serviceId }; + + if (!includeTechnicalUpgrades) { + filter.isTechnicalUpgrade = { $ne: true }; + } + + let query = this.collection.find(filter).project({ content: 0 }).sort({ fetchDate: -1 }); if (offset !== undefined) { query = query.skip(offset); diff --git a/src/archivist/recorder/repositories/mongo/index.test.js b/src/archivist/recorder/repositories/mongo/index.test.js index e2123cdfd..c78b0cd4e 100644 --- a/src/archivist/recorder/repositories/mongo/index.test.js +++ b/src/archivist/recorder/repositories/mongo/index.test.js @@ -632,6 +632,28 @@ describe('MongoRepository', () => { 
it('returns records in descending order', () => { expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + + before(async () => { + filteredRecords = await subject.findAll({ includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.length).to.equal(2); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + + it('returns the expected records in descending order', () => { + expect(filteredRecords.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); + }); }); describe('#findByServiceAndTermsType', () => { @@ -709,6 +731,34 @@ describe('MongoRepository', () => { expect(result).to.be.an('array').that.is.empty; }); }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE, { includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + }); }); describe('#count', () => { From 9ea4e99e2f330a6450eff8d2573dd355d4bfe31b Mon Sep 17 00:00:00 2001 From: Nicolas Dupont 
Date: Tue, 12 May 2026 14:14:54 +0200 Subject: [PATCH 26/48] Exclude technical upgrades from feed endpoints Feed subscribers want to be notified of changes in the legal content of services, not of re-renderings of existing snapshots with updated extraction rules. Pass includeTechnicalUpgrades: false to all three versions repository queries used by feed routes. Drop the now-dead technical-upgrade branches in classifyRecordType and buildEntryTitle, and update tests accordingly. --- src/collection-api/routes/feed.js | 29 +++++--------------------- src/collection-api/routes/feed.test.js | 19 ++++++++--------- 2 files changed, 14 insertions(+), 34 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index e4ec32801..bab5294ee 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -7,7 +7,6 @@ import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; const RECORD_TYPES = { firstRecord: 'First record', - technicalUpgrade: 'Technical upgrade', change: 'Change', }; @@ -19,29 +18,11 @@ function buildAbsoluteBaseUrl(req) { } function classifyRecordType(version) { - switch (true) { - case version.isFirstRecord: - return RECORD_TYPES.firstRecord; - case version.isTechnicalUpgrade: - return RECORD_TYPES.technicalUpgrade; - default: - return RECORD_TYPES.change; - } + return version.isFirstRecord ? RECORD_TYPES.firstRecord : RECORD_TYPES.change; } function buildEntryTitle(version) { - let prefix; - - switch (true) { - case version.isFirstRecord: - prefix = COMMIT_MESSAGE_PREFIXES.startTracking; - break; - case version.isTechnicalUpgrade: - prefix = COMMIT_MESSAGE_PREFIXES.technicalUpgrade; - break; - default: - prefix = COMMIT_MESSAGE_PREFIXES.update; - } + const prefix = version.isFirstRecord ? 
COMMIT_MESSAGE_PREFIXES.startTracking : COMMIT_MESSAGE_PREFIXES.update; return `${prefix} ${version.serviceId} ${version.termsType}`; } @@ -156,7 +137,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const selfHref = `${baseUrl}/feed`; const feedId = buildFeedId(collection); - const versions = await versionsRepository.findAll({ limit: feedLimit }); + const versions = await versionsRepository.findAll({ limit: feedLimit, includeTechnicalUpgrades: false }); sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); @@ -198,7 +179,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; const feedId = buildFeedId(collection, service.id); - const versions = await versionsRepository.findByService(service.id, { limit: feedLimit }); + const versions = await versionsRepository.findByService(service.id, { limit: feedLimit, includeTechnicalUpgrades: false }); return sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); @@ -252,7 +233,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; const feedId = buildFeedId(collection, service.id, termsType); - const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit }); + const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit, includeTechnicalUpgrades: false }); return sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 86b559caa..5ac3831af 100644 --- a/src/collection-api/routes/feed.test.js +++ 
b/src/collection-api/routes/feed.test.js @@ -136,11 +136,11 @@ describe('Feed API', () => { after(() => repository.removeAll()); - it('lists one entry per saved version up to the configured limit', () => { + it('lists one entry per real-change version up to the configured limit', () => { const limit = config.get('@opentermsarchive/engine.collection-api.feed.limit'); const entries = response.text.match(//g) || []; - expect(entries).to.have.length(Math.min(4, limit)); + expect(entries).to.have.length(Math.min(3, limit)); }); it('orders entries newest-first', () => { @@ -158,14 +158,14 @@ describe('Feed API', () => { it('has an id tag URI including storage type and record id', () => { const collectionId = 'test'; - const expected = `tag:opentermsarchive.org,2026:version:${collectionId}:${storageConfig.type}:${savedVersions.technicalUpgradeRecord.id}`; + const expected = `tag:opentermsarchive.org,2026:version:${collectionId}:${storageConfig.type}:${savedVersions.upgradeRecord.id}`; expect(firstEntry).to.include(`${expected}`); }); it('has an alternate link to the API version endpoint', () => { const href = firstEntry.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; - const expectedPathFragment = `/version/${encodeURIComponent('service-2')}/${encodeURIComponent('Privacy Policy')}/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE_UPGRADE))}`; + const expectedPathFragment = `/version/${encodeURIComponent('service-2')}/${encodeURIComponent('Privacy Policy')}/${encodeURIComponent(toISODateWithoutMilliseconds(savedVersions.upgradeRecord.fetchDate))}`; expect(href).to.include(expectedPathFragment); }); @@ -183,7 +183,7 @@ describe('Feed API', () => { it('has a title reconstructed from commit prefix + serviceId + termsType', () => { const title = firstEntry.match(/]*>([\s\S]*?)<\/title>/)[1]; - expect(title).to.include('Apply technical or declaration upgrade on'); + expect(title).to.include('First record of'); expect(title).to.include('service-2'); 
expect(title).to.include('Privacy Policy'); }); @@ -191,7 +191,7 @@ describe('Feed API', () => { it('has an updated element matching the fetch date', () => { const updated = firstEntry.match(/([^<]+)<\/updated>/)[1]; - expect(new Date(updated).toISOString()).to.equal(FETCH_DATE_UPGRADE.toISOString()); + expect(new Date(updated).toISOString()).to.equal(savedVersions.upgradeRecord.fetchDate.toISOString()); }); it('has three categories with the expected schemes', () => { @@ -212,7 +212,7 @@ describe('Feed API', () => { expect(terms).to.include('service-2'); expect(terms).to.include('Privacy Policy'); - expect(terms).to.include('Technical upgrade'); + expect(terms).to.include('First record'); }); }); @@ -237,11 +237,10 @@ describe('Feed API', () => { expect(entry).to.match(/term="Change"/); }); - it('classifies a technical upgrade as "Technical upgrade"', () => { + it('excludes technical upgrade records from the feed', () => { const entry = findEntryById(response.text, savedVersions.technicalUpgradeRecord.id); - expect(entry).to.not.be.undefined; - expect(entry).to.match(/term="Technical upgrade"/); + expect(entry).to.be.undefined; }); }); From 30794e22fe0855e9c2f211a08f1315bab48fb661 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:31:11 +0200 Subject: [PATCH 27/48] Document versionUrlTemplate option in CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b9a99268..ba1b303bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ All changes that impact users of this module are documented in this file, in the - Add `GET /feed/:serviceId` endpoint on the Collection API exposing an Atom feed scoped to a single service - Add `GET /feed/:serviceId/:termsType` endpoint on the Collection API exposing an Atom feed scoped to a single service and terms type - Add [`@opentermsarchive/engine.collection-api.feed.limit`](https://docs.opentermsarchive.org/collections/reference/configuration/) 
configuration option controlling the maximum number of entries returned by feed endpoints (default: `100`) +- Add [`@opentermsarchive/engine.collection-api.feed.versionUrlTemplate`](https://docs.opentermsarchive.org/collections/reference/configuration/) configuration option to customize the `alternate` link of feed entries with a URL template (e.g. `https://github.com/openTermsArchive/demo-versions/commit/%VERSION_ID`); useful to point feed readers to a human-readable page instead of the default version API endpoint ### Changed From 26cfb4a7b80796f5288314bd2f3ddc961007f5cb Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:32:09 +0200 Subject: [PATCH 28/48] Ensure feed test isolation --- src/collection-api/routes/feed.test.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 5ac3831af..7bbc39083 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -96,7 +96,7 @@ describe('Feed API', () => { this.timeout(5000); repository = RepositoryFactory.create(storageConfig); await repository.initialize(); - + await repository.removeAll(); const firstRecord = await repository.save(new Version({ serviceId: 'service-1', termsType: 'Terms of Service', @@ -265,7 +265,7 @@ describe('Feed API', () => { this.timeout(5000); repository = RepositoryFactory.create(storageConfig); await repository.initialize(); - + await repository.removeAll(); await repository.save(new Version({ serviceId: SERVICE, termsType: TERMS, @@ -382,7 +382,7 @@ describe('Feed API', () => { this.timeout(5000); repository = RepositoryFactory.create(storageConfig); await repository.initialize(); - + await repository.removeAll(); await repository.save(new Version({ serviceId: SERVICE, termsType: TERMS, @@ -428,7 +428,7 @@ describe('Feed API', () => { this.timeout(5000); repository = RepositoryFactory.create(storageConfig); await 
repository.initialize(); - + await repository.removeAll(); await repository.save(new Version({ serviceId: SERVICE, termsType: TERMS, @@ -521,7 +521,7 @@ describe('Feed API', () => { this.timeout(5000); repository = RepositoryFactory.create(storageConfig); await repository.initialize(); - + await repository.removeAll(); savedVersion = await repository.save(new Version({ serviceId: 'service-1', termsType: 'Terms of Service', From 380af2455fc2874c7b4cddb5b6185cad78d4b07a Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:33:03 +0200 Subject: [PATCH 29/48] Fail fast if collection metadata id is missing Feed tag URIs (RFC 4151) and entry IDs are built by interpolating collection.metadata.id. When this value is missing, the resulting IDs contain double colons and become technically malformed URIs without any feedback to the operator. Throw explicitly at apiRouter setup so the misconfiguration is caught at boot rather than silently emitted in every feed response. --- src/collection-api/routes/index.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index 34e720470..a8a629695 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -40,6 +40,10 @@ export default async function apiRouter(basePath) { const versionsRepository = await RepositoryFactory.create(versionsStorageConfig).initialize(); const feedConfig = config.get('@opentermsarchive/engine.collection-api.feed'); + if (!collection.metadata?.id) { + throw new Error('Collection metadata "id" is required to expose feed endpoints, as it is used to build the tag URIs that uniquely identify the feed and its entries. 
Add an "id" field to the collection metadata file.'); + } + router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); router.use(versionsRouter(versionsRepository)); From 379fd43af60fe1336bdd1df7d4b6516c2f726f03 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:33:33 +0200 Subject: [PATCH 30/48] Document TAG_AUTHORITY year per RFC 4151 --- src/collection-api/routes/feed.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index bab5294ee..0f3bf1bd6 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -10,7 +10,7 @@ const RECORD_TYPES = { change: 'Change', }; -const TAG_AUTHORITY = 'opentermsarchive.org,2026'; +const TAG_AUTHORITY = 'opentermsarchive.org,2026'; // Tag URI authority (RFC 4151). The year fixes the scheme inception and must never change: it would invalidate every previously emitted feed and entry ID. const FEED_AUTHOR_NAME = 'Open Terms Archive engine'; function buildAbsoluteBaseUrl(req) { From ceb2b90cb6fb36eed57df91cce9f056cadc4ad35 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 14:40:22 +0200 Subject: [PATCH 31/48] Fix funder --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba1b303bd..1e0743735 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All changes that impact users of this module are documented in this file, in the ## Unreleased [major] -> Development of this release was supported by [Reset Tech](https://www.reset.tech). +> Development of this release was supported by the [NGI0 Commons Fund](https://nlnet.nl/project/Modular-OTA/), a fund established by [NLnet](https://nlnet.nl/) with financial support from the European Commission's [Next Generation Internet](https://www.ngi.eu) programme, under the aegis of DG CNECT under grant agreement N°101069594. 
### Added From 5c3da560029b0e6d4b1674c3a01cdb18455527c7 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 15:42:51 +0200 Subject: [PATCH 32/48] Add displayTitle getter on Record class --- src/archivist/recorder/record.js | 21 ++++++++++++++ src/archivist/recorder/version.test.js | 38 ++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/src/archivist/recorder/record.js b/src/archivist/recorder/record.js index 9b335a7e0..965cb9088 100644 --- a/src/archivist/recorder/record.js +++ b/src/archivist/recorder/record.js @@ -3,6 +3,13 @@ * @class Record * @private */ + +export const TITLE_PREFIXES = Object.freeze({ + firstRecord: 'First record of', + technicalUpgrade: 'Apply technical or declaration upgrade on', + update: 'Record new changes of', +}); + export default class Record { #content; @@ -32,6 +39,20 @@ export default class Record { this.#content = content; } + get displayTitle() { + let prefix; + + if (this.isFirstRecord) { + prefix = TITLE_PREFIXES.firstRecord; + } else if (this.isTechnicalUpgrade) { + prefix = TITLE_PREFIXES.technicalUpgrade; + } else { + prefix = TITLE_PREFIXES.update; + } + + return `${prefix} ${this.serviceId} ${this.termsType}`; + } + validate() { for (const requiredParam of this.constructor.REQUIRED_PARAMS) { if (requiredParam == 'content') { diff --git a/src/archivist/recorder/version.test.js b/src/archivist/recorder/version.test.js index 248d607dd..d03a2af3f 100644 --- a/src/archivist/recorder/version.test.js +++ b/src/archivist/recorder/version.test.js @@ -60,4 +60,42 @@ describe('Version', () => { }); }); }); + + describe('#displayTitle', () => { + const baseParams = { + serviceId: 'service-A', + termsType: 'Terms of Service', + fetchDate: new Date('2000-01-01T12:00:00.000Z'), + content: 'some content', + snapshotIds: ['dd263f270b3065e1c18201b49ab898474b357566'], + }; + + context('when the record is the first one for its service and terms type', () => { + it('starts with the first-record prefix', 
() => { + subject = new Version({ ...baseParams, isFirstRecord: true }); + expect(subject.displayTitle).to.equal('First record of service-A Terms of Service'); + }); + }); + + context('when the record is a technical upgrade', () => { + it('starts with the technical-upgrade prefix', () => { + subject = new Version({ ...baseParams, isTechnicalUpgrade: true }); + expect(subject.displayTitle).to.equal('Apply technical or declaration upgrade on service-A Terms of Service'); + }); + }); + + context('when the record is a regular content change', () => { + it('starts with the update prefix', () => { + subject = new Version(baseParams); + expect(subject.displayTitle).to.equal('Record new changes of service-A Terms of Service'); + }); + }); + + context('when the record is both a first record and a technical upgrade', () => { + it('prioritises the first-record prefix', () => { + subject = new Version({ ...baseParams, isFirstRecord: true, isTechnicalUpgrade: true }); + expect(subject.displayTitle).to.equal('First record of service-A Terms of Service'); + }); + }); + }); }); From ad3b5710ef5e11b5fc0e87282f9e5f65fb4ea241 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 15:44:52 +0200 Subject: [PATCH 33/48] Use displayTitle for git commit subjects --- .../recorder/repositories/git/dataMapper.js | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/archivist/recorder/repositories/git/dataMapper.js b/src/archivist/recorder/repositories/git/dataMapper.js index 6d60ae9e9..d9fb74fe0 100644 --- a/src/archivist/recorder/repositories/git/dataMapper.js +++ b/src/archivist/recorder/repositories/git/dataMapper.js @@ -2,13 +2,14 @@ import path from 'path'; import mime from 'mime'; +import { TITLE_PREFIXES } from '../../record.js'; import Snapshot from '../../snapshot.js'; import Version from '../../version.js'; export const COMMIT_MESSAGE_PREFIXES = { - startTracking: 'First record of', - technicalUpgrade: 'Apply technical or declaration 
upgrade on', - update: 'Record new changes of', + startTracking: TITLE_PREFIXES.firstRecord, + technicalUpgrade: TITLE_PREFIXES.technicalUpgrade, + update: TITLE_PREFIXES.update, deprecated_startTracking: 'Start tracking', deprecated_refilter: 'Refilter', deprecated_update: 'Update', @@ -30,13 +31,9 @@ const MULTIPLE_SOURCE_DOCUMENTS_PREFIX = 'This version was recorded after extrac export const COMMIT_MESSAGE_PREFIXES_REGEXP = new RegExp(`^(${Object.values(COMMIT_MESSAGE_PREFIXES).join('|')})`); export function toPersistence(record, snapshotIdentiferTemplate) { - const { serviceId, termsType, documentId, isTechnicalUpgrade, snapshotIds = [], mimeType, isFirstRecord, metadata } = record; + const { serviceId, termsType, documentId, snapshotIds = [], mimeType, metadata } = record; - let prefix = isTechnicalUpgrade ? COMMIT_MESSAGE_PREFIXES.technicalUpgrade : COMMIT_MESSAGE_PREFIXES.update; - - prefix = isFirstRecord ? COMMIT_MESSAGE_PREFIXES.startTracking : prefix; - - const subject = `${prefix} ${serviceId} ${termsType}`; + const subject = record.displayTitle; const documentIdMessage = `${documentId ? `Document ID ${documentId}\n\n` : ''}`; let snapshotIdsMessage; From 78865b5bcf858313453f8e9312d8e29052de68b5 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 15:45:55 +0200 Subject: [PATCH 34/48] Use displayTitle for feed entry titles The feed route no longer imports anything from the git repository implementation: titles come straight from the record. Atom-only labels (record-type categories) remain local to the route. 
--- src/collection-api/routes/feed.js | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 0f3bf1bd6..9a9c87022 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -2,7 +2,6 @@ import express from 'express'; import { js2xml } from 'xml-js'; import { getCollection } from '../../archivist/collection/index.js'; -import { COMMIT_MESSAGE_PREFIXES } from '../../archivist/recorder/repositories/git/dataMapper.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; const RECORD_TYPES = { @@ -21,12 +20,6 @@ function classifyRecordType(version) { return version.isFirstRecord ? RECORD_TYPES.firstRecord : RECORD_TYPES.change; } -function buildEntryTitle(version) { - const prefix = version.isFirstRecord ? COMMIT_MESSAGE_PREFIXES.startTracking : COMMIT_MESSAGE_PREFIXES.update; - - return `${prefix} ${version.serviceId} ${version.termsType}`; -} - function buildVersionLink(baseUrl, version) { const encodedDate = encodeURIComponent(toISODateWithoutMilliseconds(version.fetchDate)); const encodedService = encodeURIComponent(version.serviceId); @@ -58,7 +51,7 @@ function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, versio return { id: { _text: buildEntryId(storageType, collection, version) }, link: { _attributes: { rel: 'alternate', type: 'text/html', href } }, - title: { _text: buildEntryTitle(version) }, + title: { _text: version.displayTitle }, updated: { _text: version.fetchDate.toISOString() }, category: [ { _attributes: { term: version.serviceId, scheme: schemes.service } }, From 1f1b72a689e43d366b07a0389dd0128d99e90ef5 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 16:03:09 +0200 Subject: [PATCH 35/48] Honour reverse proxy headers in feed URLs --- src/collection-api/routes/feed.js | 4 ++- src/collection-api/routes/feed.test.js | 42 ++++++++++++++++++++++++++ 
src/collection-api/server.js | 2 ++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 9a9c87022..0af26051f 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -13,7 +13,9 @@ const TAG_AUTHORITY = 'opentermsarchive.org,2026'; // Tag URI authority (RFC 415 const FEED_AUTHOR_NAME = 'Open Terms Archive engine'; function buildAbsoluteBaseUrl(req) { - return `${req.protocol}://${req.get('host')}${req.baseUrl}`; + const host = req.get('X-Forwarded-Host') ?? req.get('host'); // Behind a trusted reverse proxy, the public host comes from X-Forwarded-Host. req.get('host') only sees the internal Host header, so we read the forwarded value explicitly and fall back to the direct host for non-proxied setups (dev, tests). + + return `${req.protocol}://${host}${req.baseUrl}`; } function classifyRecordType(version) { diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 7bbc39083..359849072 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -564,4 +564,46 @@ describe('Feed API', () => { } }); }); + + describe('behind a reverse proxy', () => { + let response; + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'content', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + + response = await request + .get(`${basePath}/v1/feed`) + .set('X-Forwarded-Proto', 'https') + .set('X-Forwarded-Host', 'api.example.com'); + }); + + after(() => repository.removeAll()); + + it('uses the forwarded protocol and host in the self link', () => { + const href = 
response.text.match(/]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(/^https:\/\/api\.example\.com\//); + }); + + it('uses the forwarded protocol and host in entry alternate links', () => { + const entry = response.text.match(/[\s\S]*?<\/entry>/); + + expect(entry).to.not.be.null; + + const href = entry[0].match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(/^https:\/\/api\.example\.com\//); + }); + }); }); diff --git a/src/collection-api/server.js b/src/collection-api/server.js index c1b0a7eb0..6a54758ce 100644 --- a/src/collection-api/server.js +++ b/src/collection-api/server.js @@ -8,6 +8,8 @@ import apiRouter from './routes/index.js'; const app = express(); +app.set('trust proxy', 'loopback'); // The API binds to 127.0.0.1 and is expected to run behind a reverse proxy. Honour X-Forwarded-* headers only when they come from a local proxy so absolute URLs emitted by routes (notably Atom feed links) reflect the URL seen by clients rather than the internal http://127.0.0.1 hop. + if (process.env.NODE_ENV !== 'test') { app.use(loggerMiddleware); } From 5b42fdf717f0615a176984b5d064f0d7ed83ebb3 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 16:39:01 +0200 Subject: [PATCH 36/48] Hoist feed schemes to a module-level constant --- src/collection-api/routes/feed.js | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 0af26051f..2c14b29cc 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -12,6 +12,12 @@ const RECORD_TYPES = { const TAG_AUTHORITY = 'opentermsarchive.org,2026'; // Tag URI authority (RFC 4151). The year fixes the scheme inception and must never change: it would invalidate every previously emitted feed and entry ID. 
const FEED_AUTHOR_NAME = 'Open Terms Archive engine'; +const SCHEMES = Object.freeze({ + service: `tag:${TAG_AUTHORITY}:scheme:service`, + termsType: `tag:${TAG_AUTHORITY}:scheme:terms-type`, + recordType: `tag:${TAG_AUTHORITY}:scheme:record-type`, +}); + function buildAbsoluteBaseUrl(req) { const host = req.get('X-Forwarded-Host') ?? req.get('host'); // Behind a trusted reverse proxy, the public host comes from X-Forwarded-Host. req.get('host') only sees the internal Host header, so we read the forwarded value explicitly and fall back to the direct host for non-proxied setups (dev, tests). @@ -38,17 +44,8 @@ function buildFeedId(collection, ...suffix) { return [ `tag:${TAG_AUTHORITY}:feed`, collection.metadata?.id, ...suffix ].join(':'); } -function buildSchemes() { - return { - service: `tag:${TAG_AUTHORITY}:scheme:service`, - termsType: `tag:${TAG_AUTHORITY}:scheme:terms-type`, - recordType: `tag:${TAG_AUTHORITY}:scheme:record-type`, - }; -} - function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, version) { const href = versionUrlTemplate?.replace('%VERSION_ID', version.id) ?? 
buildVersionLink(baseUrl, version); - const schemes = buildSchemes(); return { id: { _text: buildEntryId(storageType, collection, version) }, @@ -56,9 +53,9 @@ function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, versio title: { _text: version.displayTitle }, updated: { _text: version.fetchDate.toISOString() }, category: [ - { _attributes: { term: version.serviceId, scheme: schemes.service } }, - { _attributes: { term: version.termsType, scheme: schemes.termsType } }, - { _attributes: { term: classifyRecordType(version), scheme: schemes.recordType } }, + { _attributes: { term: version.serviceId, scheme: SCHEMES.service } }, + { _attributes: { term: version.termsType, scheme: SCHEMES.termsType } }, + { _attributes: { term: classifyRecordType(version), scheme: SCHEMES.recordType } }, ], }; } From c29a2c40c3220eebe71b881b99fa04d86b392d65 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 16:40:17 +0200 Subject: [PATCH 37/48] Require collection name to expose feed endpoints --- src/collection-api/routes/feed.js | 2 +- src/collection-api/routes/index.js | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 2c14b29cc..fa79fdccf 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -65,7 +65,7 @@ function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHr const feed = { _attributes: { xmlns: 'http://www.w3.org/2005/Atom' }, - title: { _text: collection.metadata?.name || '' }, + title: { _text: collection.metadata.name }, subtitle: { _text: collection.metadata?.tagline || '' }, id: { _text: feedId }, updated: { _text: latestFetchDate.toISOString() }, diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index a8a629695..c4ae5ec33 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -44,6 +44,10 @@ export default async 
function apiRouter(basePath) { throw new Error('Collection metadata "id" is required to expose feed endpoints, as it is used to build the tag URIs that uniquely identify the feed and its entries. Add an "id" field to the collection metadata file.'); } + if (!collection.metadata?.name) { + throw new Error('Collection metadata "name" is required to expose feed endpoints, as it is used as the Atom feed title which the Atom 1.0 specification requires to be non-empty. Add a "name" field to the collection metadata file.'); + } + router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); router.use(versionsRouter(versionsRepository)); From 186ead356c46f1c2827b75e15b08c81f12ba3a6e Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 16:40:52 +0200 Subject: [PATCH 38/48] Skip Atom subtitle when collection has no tagline --- src/collection-api/routes/feed.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index fa79fdccf..11659474e 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -66,13 +66,16 @@ function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHr const feed = { _attributes: { xmlns: 'http://www.w3.org/2005/Atom' }, title: { _text: collection.metadata.name }, - subtitle: { _text: collection.metadata?.tagline || '' }, id: { _text: feedId }, updated: { _text: latestFetchDate.toISOString() }, link: { _attributes: { rel: 'self', href: selfHref } }, author: { name: { _text: FEED_AUTHOR_NAME } }, }; + if (collection.metadata?.tagline) { + feed.subtitle = { _text: collection.metadata.tagline }; + } + if (collection.metadata?.logo) { feed.logo = { _text: collection.metadata.logo }; } From 260e00f23ba4db282c97182bda355ee20b05ca0d Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 16:42:16 +0200 Subject: [PATCH 39/48] Use accurate MIME types on feed links 
--- src/collection-api/routes/feed.js | 5 +++-- src/collection-api/routes/feed.test.js | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 11659474e..2045c3ad5 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -46,10 +46,11 @@ function buildFeedId(collection, ...suffix) { function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, version) { const href = versionUrlTemplate?.replace('%VERSION_ID', version.id) ?? buildVersionLink(baseUrl, version); + const type = versionUrlTemplate ? 'text/html' : 'application/json'; // The default link points to the JSON Version API; operators who configure a versionUrlTemplate typically target a human-readable page (e.g. a GitHub commit), which is HTML. return { id: { _text: buildEntryId(storageType, collection, version) }, - link: { _attributes: { rel: 'alternate', type: 'text/html', href } }, + link: { _attributes: { rel: 'alternate', type, href } }, title: { _text: version.displayTitle }, updated: { _text: version.fetchDate.toISOString() }, category: [ @@ -68,7 +69,7 @@ function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHr title: { _text: collection.metadata.name }, id: { _text: feedId }, updated: { _text: latestFetchDate.toISOString() }, - link: { _attributes: { rel: 'self', href: selfHref } }, + link: { _attributes: { rel: 'self', type: 'application/atom+xml', href: selfHref } }, author: { name: { _text: FEED_AUTHOR_NAME } }, }; diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 359849072..b81319bd4 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -73,6 +73,10 @@ describe('Feed API', () => { expect(selfHrefMatch[1]).to.match(new RegExp(`${basePath}/v1/feed$`)); }); + it('advertises an application/atom+xml type on the self link', () => { + 
expect(response.text).to.match(/]*rel="self"[^>]*type="application\/atom\+xml"/); + }); + it('has an author matching the feed author name', () => { expect(response.text).to.match(/[\s\S]*Open Terms Archive engine<\/name>[\s\S]*<\/author>/); }); @@ -176,8 +180,8 @@ describe('Feed API', () => { expect(links).to.have.length(1); }); - it('has a type="text/html" on the alternate link', () => { - expect(firstEntry).to.match(/]*rel="alternate"[^>]*type="text\/html"/); + it('has a type matching the default Version API JSON response on the alternate link', () => { + expect(firstEntry).to.match(/]*rel="alternate"[^>]*type="application\/json"/); }); it('has a title reconstructed from commit prefix + serviceId + termsType', () => { @@ -546,6 +550,12 @@ describe('Feed API', () => { expect(href).to.equal(`https://example.test/v/${savedVersion.id}`); }); + it('advertises a text/html type on the alternate link', () => { + const entry = response.text.match(/[\s\S]*?<\/entry>/)[0]; + + expect(entry).to.match(/]*rel="alternate"[^>]*type="text\/html"/); + }); + it('does not point to the API for entry links', () => { const entries = response.text.match(/[\s\S]*?<\/entry>/g) || []; From dd0ddd8332b82e4c479799f11fb7dac32ef09f7f Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 16:43:08 +0200 Subject: [PATCH 40/48] Use a stable updated date for empty feeds Falling back to new Date() when no entries exist made the feed-level updated value change on every request, which breaks conditional GET: readers would never see a 304 even when nothing changed. Use the Unix epoch as a stable, RFC-valid placeholder until the first entry lands. 
--- src/collection-api/routes/feed.js | 2 +- src/collection-api/routes/feed.test.js | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 2045c3ad5..932bc0471 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -62,7 +62,7 @@ function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, versio } function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }) { - const latestFetchDate = versions.length > 0 ? versions[0].fetchDate : new Date(); + const latestFetchDate = versions.length > 0 ? versions[0].fetchDate : new Date(0); // Atom 1.0 requires a feed-level . When no entry exists yet, fall back to the Unix epoch so the value is stable across requests, emitting `new Date()` would defeat conditional GET caching for empty feeds. const feed = { _attributes: { xmlns: 'http://www.w3.org/2005/Atom' }, diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index b81319bd4..1b81c3e4b 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -347,6 +347,10 @@ describe('Feed API', () => { it('returns an empty feed (no entries)', () => { expect(response.text).to.not.include(''); }); + + it('uses a stable updated date so conditional GET keeps working', () => { + expect(extractTag(response.text, 'updated')).to.equal(new Date(0).toISOString()); + }); }); context('when the service does not exist', () => { From a2b86a8b1b1009e4b9547108894762a96f061984 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 17:07:09 +0200 Subject: [PATCH 41/48] Enable conditional GET on feeds via Last-Modified Atom feeds are typically polled every few minutes by RSS readers. 
Setting Last-Modified from the latest entry's fetch date lets Express honour If-Modified-Since and return 304 when nothing changed, saving the cost of building and shipping the XML body for the (frequent) unchanged case. --- src/collection-api/routes/feed.js | 26 +++++++++----- src/collection-api/routes/feed.test.js | 47 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 932bc0471..dfc520f77 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -61,9 +61,11 @@ function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, versio }; } -function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }) { - const latestFetchDate = versions.length > 0 ? versions[0].fetchDate : new Date(0); // Atom 1.0 requires a feed-level . When no entry exists yet, fall back to the Unix epoch so the value is stable across requests, emitting `new Date()` would defeat conditional GET caching for empty feeds. +function computeLatestFetchDate(versions) { + return versions.length > 0 ? versions[0].fetchDate : new Date(0); // Atom 1.0 requires a feed-level . When no entry exists yet, fall back to the Unix epoch so the value is stable across requests, emitting `new Date()` would defeat conditional GET caching for empty feeds. 
+} +function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl, latestFetchDate }) { const feed = { _attributes: { xmlns: 'http://www.w3.org/2005/Atom' }, title: { _text: collection.metadata.name }, @@ -89,11 +91,19 @@ function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHr }; } -function sendFeed(res, opts) { - const document = buildFeedDocument(opts); +function sendFeed(req, res, opts) { + const latestFetchDate = computeLatestFetchDate(opts.versions); + + res.set('Last-Modified', latestFetchDate.toUTCString()); // Setting Last-Modified before checking req.fresh enables Express to compare it with If-Modified-Since and return 304 when nothing changed since the reader's last fetch; the headline optimisation for Atom feeds, which are typically polled every few minutes. + + if (req.fresh) { + return res.status(304).end(); + } res.set('Content-Type', 'application/atom+xml; charset=utf-8'); - res.status(200).send(js2xml(document, { compact: true, spaces: 2 })); + const document = buildFeedDocument({ ...opts, latestFetchDate }); + + return res.status(200).send(js2xml(document, { compact: true, spaces: 2 })); } /** @@ -135,7 +145,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const versions = await versionsRepository.findAll({ limit: feedLimit, includeTechnicalUpgrades: false }); - sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); + sendFeed(req, res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -177,7 +187,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const versions = await versionsRepository.findByService(service.id, { limit: feedLimit, includeTechnicalUpgrades: false }); - return sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); + return sendFeed(req, res, { 
storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); /** @@ -231,7 +241,7 @@ export default function feedRouter(services, versionsRepository, storageType, fe const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit, includeTechnicalUpgrades: false }); - return sendFeed(res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); + return sendFeed(req, res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); }); return router; diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 1b81c3e4b..0117d3eab 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -579,6 +579,53 @@ describe('Feed API', () => { }); }); + describe('conditional GET via Last-Modified', () => { + const FETCH_DATE = new Date('2024-05-15T10:00:00Z'); + + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'content', + fetchDate: FETCH_DATE, + snapshotIds: ['s1'], + })); + }); + + after(() => repository.removeAll()); + + it('exposes a Last-Modified header matching the latest entry fetch date', async () => { + const response = await request.get(`${basePath}/v1/feed`); + + expect(response.headers['last-modified']).to.equal(FETCH_DATE.toUTCString()); + }); + + it('returns 304 with no body when If-Modified-Since is at or after the latest entry', async () => { + const response = await request + .get(`${basePath}/v1/feed`) + .set('If-Modified-Since', FETCH_DATE.toUTCString()); + + expect(response.status).to.equal(304); + expect(response.text).to.be.empty; + }); + + it('returns 200 with body when If-Modified-Since is before 
the latest entry', async () => { + const earlier = new Date(FETCH_DATE.getTime() - 24 * 60 * 60 * 1000); + const response = await request + .get(`${basePath}/v1/feed`) + .set('If-Modified-Since', earlier.toUTCString()); + + expect(response.status).to.equal(200); + expect(response.text).to.include(' { let response; let repository; From b6ed98fb8ee5bd6f734a96005fb629a6ea8407ef Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 12 May 2026 16:47:57 +0200 Subject: [PATCH 42/48] Escape XML special characters in feed attributes Xml-js does not escape attribute values --- src/collection-api/routes/feed.js | 13 ++++++++-- src/collection-api/routes/feed.test.js | 36 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index dfc520f77..3fdc9ce17 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -28,6 +28,15 @@ function classifyRecordType(version) { return version.isFirstRecord ? RECORD_TYPES.firstRecord : RECORD_TYPES.change; } +// xml-js does not escape attribute values: it assumes the caller passes them already escaped. Apply the XML attribute-value escapes manually wherever an attribute may carry user-provided text (notably category term), otherwise a serviceId like "AT&T Mobile" would emit malformed XML that strict feed readers (libxml2-based) reject. 
+function escapeXmlAttribute(value) { + return String(value) + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"'); +} + function buildVersionLink(baseUrl, version) { const encodedDate = encodeURIComponent(toISODateWithoutMilliseconds(version.fetchDate)); const encodedService = encodeURIComponent(version.serviceId); @@ -54,8 +63,8 @@ function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, versio title: { _text: version.displayTitle }, updated: { _text: version.fetchDate.toISOString() }, category: [ - { _attributes: { term: version.serviceId, scheme: SCHEMES.service } }, - { _attributes: { term: version.termsType, scheme: SCHEMES.termsType } }, + { _attributes: { term: escapeXmlAttribute(version.serviceId), scheme: SCHEMES.service } }, + { _attributes: { term: escapeXmlAttribute(version.termsType), scheme: SCHEMES.termsType } }, { _attributes: { term: classifyRecordType(version), scheme: SCHEMES.recordType } }, ], }; diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 0117d3eab..0d8f3c277 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -579,6 +579,42 @@ describe('Feed API', () => { }); }); + describe('XML escape of special characters', () => { + const SERVICE = 'AT&T Mobile'; + const TERMS = 'Terms of Service'; + + let response; + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'content', + fetchDate: new Date('2024-05-15T10:00:00Z'), + snapshotIds: ['s_xmlesc'], + })); + + response = await request.get(`${basePath}/v1/feed`); + }); + + after(() => repository.removeAll()); + + it('escapes ampersands in the entry title text', () => { + expect(response.text).to.match(/[^<]*AT&T Mobile[^<]*<\/title>/); + 
expect(response.text).to.not.match(/<title>[^<]*AT&T Mobile/); + }); + + it('escapes ampersands in the category term attribute', () => { + expect(response.text).to.match(/<category[^/]*term="AT&T Mobile"/); + expect(response.text).to.not.match(/<category[^/]*term="AT&T Mobile"/); + }); + }); + describe('conditional GET via Last-Modified', () => { const FETCH_DATE = new Date('2024-05-15T10:00:00Z'); From 9f371fbd0060882edb9354975f3ecc5b37be5bde Mon Sep 17 00:00:00 2001 From: Nicolas Dupont <npg.dupont@gmail.com> Date: Tue, 12 May 2026 16:59:32 +0200 Subject: [PATCH 43/48] Apply XML attribute escape via js2xml hook Pass escapeXmlAttribute through xml-js's attributeValueFn rather than wrapping each attribute manually. Escaping now happens at the serialization boundary for every attribute, including those we did not explicitly wrap before --- src/collection-api/routes/feed.js | 8 +++---- src/collection-api/routes/feed.test.js | 33 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 3fdc9ce17..97061a512 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -28,7 +28,7 @@ function classifyRecordType(version) { return version.isFirstRecord ? RECORD_TYPES.firstRecord : RECORD_TYPES.change; } -// xml-js does not escape attribute values: it assumes the caller passes them already escaped. Apply the XML attribute-value escapes manually wherever an attribute may carry user-provided text (notably category term), otherwise a serviceId like "AT&T Mobile" would emit malformed XML that strict feed readers (libxml2-based) reject. +// xml-js does not escape attribute values by default — callers are expected to pre-escape. We wire this helper to js2xml's attributeValueFn so every emitted attribute goes through it, regardless of where it's built. 
Without this, a serviceId like "AT&T Mobile" would yield malformed XML rejected by strict feed readers (libxml2-based). function escapeXmlAttribute(value) { return String(value) .replace(/&/g, '&') @@ -63,8 +63,8 @@ function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, versio title: { _text: version.displayTitle }, updated: { _text: version.fetchDate.toISOString() }, category: [ - { _attributes: { term: escapeXmlAttribute(version.serviceId), scheme: SCHEMES.service } }, - { _attributes: { term: escapeXmlAttribute(version.termsType), scheme: SCHEMES.termsType } }, + { _attributes: { term: version.serviceId, scheme: SCHEMES.service } }, + { _attributes: { term: version.termsType, scheme: SCHEMES.termsType } }, { _attributes: { term: classifyRecordType(version), scheme: SCHEMES.recordType } }, ], }; @@ -112,7 +112,7 @@ function sendFeed(req, res, opts) { res.set('Content-Type', 'application/atom+xml; charset=utf-8'); const document = buildFeedDocument({ ...opts, latestFetchDate }); - return res.status(200).send(js2xml(document, { compact: true, spaces: 2 })); + return res.status(200).send(js2xml(document, { compact: true, spaces: 2, attributeValueFn: escapeXmlAttribute })); } /** diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js index 0d8f3c277..ed95e4964 100644 --- a/src/collection-api/routes/feed.test.js +++ b/src/collection-api/routes/feed.test.js @@ -613,6 +613,39 @@ describe('Feed API', () => { expect(response.text).to.match(/<category[^/]*term="AT&T Mobile"/); expect(response.text).to.not.match(/<category[^/]*term="AT&T Mobile"/); }); + + context('with a versionUrlTemplate that contains XML-special characters', () => { + const TEMPLATE = 'https://example.test/v?ref=main&id=%VERSION_ID'; + + let templatedResponse; + let templatedRepository; + + before(async function () { + this.timeout(5000); + templatedRepository = RepositoryFactory.create(storageConfig); + await templatedRepository.initialize(); + 
await templatedRepository.removeAll(); + await templatedRepository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'content', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s_tpl_escape'], + })); + + const services = await Services.load(); + const templatedApp = express(); + + templatedApp.use(feedRouter(services, templatedRepository, storageConfig.type, 10, TEMPLATE)); + templatedResponse = await supertest(templatedApp).get('/feed'); + }); + + after(() => templatedRepository.removeAll()); + + it('escapes the ampersand in the alternate link href', () => { + expect(templatedResponse.text).to.match(/<link[^>]*rel="alternate"[^>]*href="https:\/\/example\.test\/v\?ref=main&id=[^"]+"/); + }); + }); }); describe('conditional GET via Last-Modified', () => { From f2eea3a4dbfa44e29b2d8e503081feea42f1fa42 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont <npg.dupont@gmail.com> Date: Tue, 12 May 2026 17:20:30 +0200 Subject: [PATCH 44/48] Annotate listCommits git log options inline --- src/archivist/recorder/repositories/git/git.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/archivist/recorder/repositories/git/git.js b/src/archivist/recorder/repositories/git/git.js index 364fdc72b..9298d0270 100644 --- a/src/archivist/recorder/repositories/git/git.js +++ b/src/archivist/recorder/repositories/git/git.js @@ -73,7 +73,15 @@ export default class Git { const skipOption = skip !== undefined ? [`--skip=${skip}`] : []; const maxCountOption = maxCount !== undefined ? 
[`--max-count=${maxCount}`] : []; - return this.log([ ...reverseOption, '--author-date-order', '--no-merges', '--name-only', ...skipOption, ...maxCountOption, ...options ]); // Returns commits in chronological order with `--reverse` (oldest first) or reverse chronological without it (newest first), sorted by author date (`--author-date-order`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`), with optional pagination (`--skip`, `--max-count`) + return this.log([ + ...reverseOption, // When `reverse` is true, lists commits oldest-first; otherwise the default newest-first applies + '--author-date-order', // Best-effort author-date ordering: with --max-count, git applies the cap topologically, so the page can miss strictly-newer commits that #getCommits' JS resort cannot recover + '--no-merges', // Exclude merge commits — records are stored as regular commits, never as merges + '--name-only', // Append the modified file names below each commit, used by `toDomain` to extract the record's file path + ...skipOption, // Optional `--skip=N`: drop the first N matching commits (pagination offset) + ...maxCountOption, // Optional `--max-count=N`: cap the result to N commits (pagination limit) + ...options, // Caller-supplied options: typically grep filters on commit messages and a path filter (`-- pathspec`) + ]); } async getCommit(options) { From 58637aafe800ee5d0d78ac8564f597cbec6af070 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont <npg.dupont@gmail.com> Date: Tue, 12 May 2026 18:01:49 +0200 Subject: [PATCH 45/48] Express commit prefix groups as disjoint sets --- .../recorder/repositories/git/dataMapper.js | 18 +++++++++--------- .../recorder/repositories/git/index.js | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/archivist/recorder/repositories/git/dataMapper.js b/src/archivist/recorder/repositories/git/dataMapper.js index d9fb74fe0..fa9ba90d6 100644 --- a/src/archivist/recorder/repositories/git/dataMapper.js 
+++ b/src/archivist/recorder/repositories/git/dataMapper.js @@ -6,23 +6,23 @@ import { TITLE_PREFIXES } from '../../record.js'; import Snapshot from '../../snapshot.js'; import Version from '../../version.js'; -export const COMMIT_MESSAGE_PREFIXES = { +// Prefixes for commits that represent an actual content change detected at the service source +const CHANGE_PREFIXES = { startTracking: TITLE_PREFIXES.firstRecord, - technicalUpgrade: TITLE_PREFIXES.technicalUpgrade, update: TITLE_PREFIXES.update, deprecated_startTracking: 'Start tracking', - deprecated_refilter: 'Refilter', deprecated_update: 'Update', }; -// Subset of COMMIT_MESSAGE_PREFIXES that exclude technical upgrades (re-renders of existing snapshots with updated extraction rules) and only represent content changes detected at the service source -export const REAL_CHANGE_COMMIT_MESSAGE_PREFIXES = { - startTracking: COMMIT_MESSAGE_PREFIXES.startTracking, - update: COMMIT_MESSAGE_PREFIXES.update, - deprecated_startTracking: COMMIT_MESSAGE_PREFIXES.deprecated_startTracking, - deprecated_update: COMMIT_MESSAGE_PREFIXES.deprecated_update, +// Prefixes for commits that re-render an existing snapshot (e.g. 
with updated extraction rules) without any change at the service source +const TECHNICAL_UPGRADE_PREFIXES = { + technicalUpgrade: TITLE_PREFIXES.technicalUpgrade, + deprecated_refilter: 'Refilter', }; +export const CHANGE_COMMIT_MESSAGE_PREFIXES = CHANGE_PREFIXES; +export const COMMIT_MESSAGE_PREFIXES = { ...CHANGE_PREFIXES, ...TECHNICAL_UPGRADE_PREFIXES }; + export const TERMS_TYPE_AND_DOCUMENT_ID_SEPARATOR = ' #'; export const SNAPSHOT_ID_MARKER = '%SNAPSHOT_ID'; const SINGLE_SOURCE_DOCUMENT_PREFIX = 'This version was recorded after extracting from snapshot'; diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js index f3721edb5..6f51efa90 100644 --- a/src/archivist/recorder/repositories/git/index.js +++ b/src/archivist/recorder/repositories/git/index.js @@ -163,7 +163,7 @@ export default class GitRepository extends RepositoryInterface { async #getCommits({ pathFilter, reverse = false, limit, offset, includeTechnicalUpgrades = true } = {}) { const prefixes = includeTechnicalUpgrades ? DataMapper.COMMIT_MESSAGE_PREFIXES - : DataMapper.REAL_CHANGE_COMMIT_MESSAGE_PREFIXES; + : DataMapper.CHANGE_COMMIT_MESSAGE_PREFIXES; const grepOptions = Object.values(prefixes).flatMap(prefix => [ '--grep', prefix ]); const pathOptions = pathFilter ? 
[ '--', pathFilter ] From f8a90d2f37a69e0e999c74a0272f6f8e4799779c Mon Sep 17 00:00:00 2001 From: Nicolas Dupont <npg.dupont@gmail.com> Date: Wed, 13 May 2026 10:21:49 +0200 Subject: [PATCH 46/48] Improve comments --- .../recorder/repositories/git/dataMapper.js | 3 +-- src/archivist/recorder/repositories/git/index.js | 13 +++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/archivist/recorder/repositories/git/dataMapper.js b/src/archivist/recorder/repositories/git/dataMapper.js index fa9ba90d6..9f2757f45 100644 --- a/src/archivist/recorder/repositories/git/dataMapper.js +++ b/src/archivist/recorder/repositories/git/dataMapper.js @@ -96,9 +96,8 @@ function generateFileName(termsType, documentId, extension) { } export function generateFilePath(serviceId, termsType, documentId, mimeType) { - // If only serviceId is provided, return a pattern to match all files for that service if (termsType === undefined) { - return `${serviceId}/*`; + return `${serviceId}/*`; // If only serviceId is provided, return a pattern to match all files for that service } const extension = mime.getExtension(mimeType) || '*'; // If mime type is undefined, an asterisk is set as an extension. 
Used to match all files for the given service ID, terms type and document ID when mime type is unknown diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js index 6f51efa90..1e54fb4bd 100644 --- a/src/archivist/recorder/repositories/git/index.js +++ b/src/archivist/recorder/repositories/git/index.js @@ -113,13 +113,11 @@ export default class GitRepository extends RepositoryInterface { pathOptions.push('--', pathPattern); } else if (serviceId) { - // Count all records for a service (all terms types) const pathPattern = DataMapper.generateFilePath(serviceId); pathOptions.push('--', pathPattern); } else { - // Count all records (exclude root directory files) - pathOptions.push('--', '*/*'); + pathOptions.push('--', '*/*'); // Count all records (exclude root directory files) } return (await this.git.log([ ...grepOptions, ...pathOptions ])).length; @@ -171,9 +169,10 @@ export default class GitRepository extends RepositoryInterface { const options = [ ...grepOptions, ...pathOptions ]; - // Use git-level pagination when available - // Note: --skip and --max-count work in topological order, not chronological order - // This means pagination may not be strictly chronological, but it's acceptable for performance + // Use git-level pagination for performance: `--skip` and `--max-count` count in topological order, not strictly chronological. + // In records history, the only commits whose author date is out of step with their topological position are technical upgrades. + // The only caller currently relying on pagination is the feed endpoint, which already filters technical upgrades out via `includeTechnicalUpgrades: false`, so the paginated set has no chronological/topological divergence in practice. + // If a future caller needs paginated access that includes technical upgrades, switch to the approach proposed in https://github.com/OpenTermsArchive/engine/issues/1243. 
const paginationOptions = {}; if (offset !== undefined) { @@ -186,8 +185,6 @@ export default class GitRepository extends RepositoryInterface { const commits = await this.git.listCommits(options, { reverse: false, ...paginationOptions }); // Get commits without git's --reverse for better performance, filtered at git level - // Sort by date in JavaScript for accuracy - git's date ordering may not be reliable with backdated commits - // Default order is descending (newest to oldest), reverse gives ascending (oldest to newest) commits.sort((commitA, commitB) => { const dateA = new Date(commitA.date); const dateB = new Date(commitB.date); From 6c566cbe2691f63517cea24c77d3f532adf8c959 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont <npg.dupont@gmail.com> Date: Wed, 13 May 2026 10:34:31 +0200 Subject: [PATCH 47/48] Factor feed rendering into a renderFeed helper --- src/collection-api/routes/feed.js | 32 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js index 97061a512..02de64b1e 100644 --- a/src/collection-api/routes/feed.js +++ b/src/collection-api/routes/feed.js @@ -130,6 +130,14 @@ function sendFeed(req, res, opts) { export default function feedRouter(services, versionsRepository, storageType, feedLimit, versionUrlTemplate) { const router = express.Router(); + async function renderFeed(req, res, { selfHref, suffix = [], versions }) { + const collection = await getCollection(); + const baseUrl = buildAbsoluteBaseUrl(req); + const feedId = buildFeedId(collection, ...suffix); + + return sendFeed(req, res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); + } + /** * @swagger * /feed: @@ -147,14 +155,10 @@ export default function feedRouter(services, versionsRepository, storageType, fe * type: string */ router.get('/feed', async (req, res) => { - const collection = await getCollection(); - const baseUrl = 
buildAbsoluteBaseUrl(req); - const selfHref = `${baseUrl}/feed`; - const feedId = buildFeedId(collection); - const versions = await versionsRepository.findAll({ limit: feedLimit, includeTechnicalUpgrades: false }); + const selfHref = `${buildAbsoluteBaseUrl(req)}/feed`; - sendFeed(req, res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); + return renderFeed(req, res, { selfHref, versions }); }); /** @@ -189,14 +193,10 @@ export default function feedRouter(services, versionsRepository, storageType, fe return res.status(404).send('Service not found'); } - const collection = await getCollection(); - const baseUrl = buildAbsoluteBaseUrl(req); - const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}`; - const feedId = buildFeedId(collection, service.id); - const versions = await versionsRepository.findByService(service.id, { limit: feedLimit, includeTechnicalUpgrades: false }); + const selfHref = `${buildAbsoluteBaseUrl(req)}/feed/${encodeURIComponent(service.id)}`; - return sendFeed(req, res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); + return renderFeed(req, res, { selfHref, suffix: [service.id], versions }); }); /** @@ -243,14 +243,10 @@ export default function feedRouter(services, versionsRepository, storageType, fe return res.status(404).send('Terms type not found for this service'); } - const collection = await getCollection(); - const baseUrl = buildAbsoluteBaseUrl(req); - const selfHref = `${baseUrl}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; - const feedId = buildFeedId(collection, service.id, termsType); - const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit, includeTechnicalUpgrades: false }); + const selfHref = `${buildAbsoluteBaseUrl(req)}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; - return sendFeed(req, res, { storageType, versionUrlTemplate, 
collection, selfHref, feedId, versions, baseUrl }); + return renderFeed(req, res, { selfHref, suffix: [ service.id, termsType ], versions }); }); return router; From c857bc2d398fbe5b5d0ea73a27a7489176bdb216 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont <npg.dupont@gmail.com> Date: Wed, 13 May 2026 17:16:54 +0200 Subject: [PATCH 48/48] Add findByService tests --- .../recorder/repositories/git/index.test.js | 116 ++++++++++++++++++ .../recorder/repositories/mongo/index.test.js | 114 +++++++++++++++++ 2 files changed, 230 insertions(+) diff --git a/src/archivist/recorder/repositories/git/index.test.js b/src/archivist/recorder/repositories/git/index.test.js index 9428c64d3..2ae2dfdae 100644 --- a/src/archivist/recorder/repositories/git/index.test.js +++ b/src/archivist/recorder/repositories/git/index.test.js @@ -674,6 +674,122 @@ describe('GitRepository', () => { }); }); + describe('#findByService', () => { + const OTHER_TERMS_TYPE = 'Privacy Policy'; + const expectedIds = []; + let records; + + before(async function () { + this.timeout(5000); + + const { id: id1 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: OTHER_TERMS_TYPE, + content: `${CONTENT} - other terms type`, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id3); + + await subject.save(new Version({ + serviceId: 'other_service', + termsType: TERMS_TYPE, + content: `${CONTENT} - other service`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + (records = 
await subject.findByService(SERVICE_PROVIDER_ID)); + }); + + after(() => subject.removeAll()); + + it('returns only matching records', () => { + expect(records.length).to.equal(3); + }); + + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); + + it('returns records with matching service ID', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + it('returns records across all terms types of the service', () => { + expect(new Set(records.map(record => record.termsType))).to.deep.equal(new Set([ TERMS_TYPE, OTHER_TERMS_TYPE ])); + }); + + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); + }); + + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); + + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByService('non_existent_service'); + + expect(result).to.be.an('array').that.is.empty; + }); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: new Date('2000-01-03T12:00:00.000Z'), + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByService(SERVICE_PROVIDER_ID, { includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of 
filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + }); + }); + describe('#count', () => { let count; diff --git a/src/archivist/recorder/repositories/mongo/index.test.js b/src/archivist/recorder/repositories/mongo/index.test.js index c78b0cd4e..72b3f3b3d 100644 --- a/src/archivist/recorder/repositories/mongo/index.test.js +++ b/src/archivist/recorder/repositories/mongo/index.test.js @@ -761,6 +761,120 @@ describe('MongoRepository', () => { }); }); + describe('#findByService', () => { + const OTHER_TERMS_TYPE = 'Privacy Policy'; + const expectedIds = []; + let records; + + before(async () => { + const { id: id1 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: OTHER_TERMS_TYPE, + content: `${CONTENT} - other terms type`, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id3); + + await subject.save(new Version({ + serviceId: 'other_service', + termsType: TERMS_TYPE, + content: `${CONTENT} - other service`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + (records = await subject.findByService(SERVICE_PROVIDER_ID)); + }); + + after(() => subject.removeAll()); + + it('returns only matching records', () => { + expect(records.length).to.equal(3); + }); + + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); + + it('returns records with matching service ID', () => { + for (const record of records) { + 
expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + it('returns records across all terms types of the service', () => { + expect(new Set(records.map(record => record.termsType))).to.deep.equal(new Set([ TERMS_TYPE, OTHER_TERMS_TYPE ])); + }); + + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); + }); + + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); + + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByService('non_existent_service'); + + expect(result).to.be.an('array').that.is.empty; + }); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: new Date('2000-01-03T12:00:00.000Z'), + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByService(SERVICE_PROVIDER_ID, { includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + }); + }); + describe('#count', () => { context('without filters', () => { let count;