diff --git a/.eslintrc.yaml b/.eslintrc.yaml index 85702b255..3731975fa 100644 --- a/.eslintrc.yaml +++ b/.eslintrc.yaml @@ -37,6 +37,9 @@ rules: - error - always-multiline consistent-return: 0 + curly: + - error + - all function-paren-newline: - error - multiline diff --git a/CHANGELOG.md b/CHANGELOG.md index ad05dcd96..1e0743735 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,22 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [major] + +> Development of this release was supported by the [NGI0 Commons Fund](https://nlnet.nl/project/Modular-OTA/), a fund established by [NLnet](https://nlnet.nl/) with financial support from the European Commission's [Next Generation Internet](https://www.ngi.eu) programme, under the aegis of DG CNECT under grant agreement N°101069594. + +### Added + +- Add `GET /feed` endpoint on the Collection API exposing an Atom feed of the latest version changes across the whole collection +- Add `GET /feed/:serviceId` endpoint on the Collection API exposing an Atom feed scoped to a single service +- Add `GET /feed/:serviceId/:termsType` endpoint on the Collection API exposing an Atom feed scoped to a single service and terms type +- Add [`@opentermsarchive/engine.collection-api.feed.limit`](https://docs.opentermsarchive.org/collections/reference/configuration/) configuration option controlling the maximum number of entries returned by feed endpoints (default: `100`) +- Add [`@opentermsarchive/engine.collection-api.feed.versionUrlTemplate`](https://docs.opentermsarchive.org/collections/reference/configuration/) configuration option to customize the `alternate` link of feed entries with a URL template (e.g. 
`https://github.com/openTermsArchive/demo-versions/commit/%VERSION_ID`); useful to point feed readers to a human-readable page instead of the default version API endpoint + +### Changed + +- **Breaking:** Resolve `serviceId` path parameter case-sensitively on the `GET /service/:serviceId` endpoint, in line with the documented service ID format; clients relying on case-insensitive matching must now use the exact ID casing + ## 11.0.2 - 2026-04-14 > Development of this release was supported by [Reset Tech](https://www.reset.tech). diff --git a/config/default.json b/config/default.json index c044f2939..96309b6fb 100644 --- a/config/default.json +++ b/config/default.json @@ -47,6 +47,11 @@ }, "dataset": { "publishingSchedule": "30 8 * * MON" + }, + "collection-api": { + "feed": { + "limit": 100 + } } } } diff --git a/config/test.json b/config/test.json index cf14b8be3..050fd5b79 100644 --- a/config/test.json +++ b/config/test.json @@ -47,7 +47,10 @@ }, "collection-api": { "port": 3000, - "basePath": "/collection-api" + "basePath": "/collection-api", + "feed": { + "limit": 3 + } } } } diff --git a/package-lock.json b/package-lock.json index 3e11f1cf9..2292b55ce 100644 --- a/package-lock.json +++ b/package-lock.json @@ -58,7 +58,8 @@ "swagger-ui-express": "^5.0.1", "turndown": "^7.2.1", "winston": "^3.17.0", - "winston-mail": "^2.0.0" + "winston-mail": "^2.0.0", + "xml-js": "^1.6.11" }, "bin": { "ota": "bin/ota.js" @@ -1273,7 +1274,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=20.19.0" }, @@ -1320,7 +1320,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=20.19.0" } @@ -1639,7 +1638,6 @@ "integrity": "sha512-jCs9ldd7NwzpgXDIf6P3+NrHh9/sD6CQdxHyjQI+h/6rDNo88ypBxxz45UDuZHz9r3tNz7N/VInSVoVdtXEI4A==", "devOptional": true, "license": "MIT", - "peer": true, "engines": { "node": "^14.21.3 || >=16" }, @@ -1793,7 +1791,6 @@ "resolved": "https://registry.npmjs.org/@octokit/core/-/core-7.0.4.tgz", "integrity": 
"sha512-jOT8V1Ba5BdC79sKrRWDdMT5l1R+XNHTPR6CPWzUP2EcfAcvIHZWF0eAbmRcpOOP5gVIwnqNg0C4nvh6Abc3OA==", "license": "MIT", - "peer": true, "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.1", @@ -2294,8 +2291,7 @@ "version": "20.7.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.7.0.tgz", "integrity": "sha512-zI22/pJW2wUZOVyguFaUL1HABdmSVxpXrzIqkjsHmyUjNhPoWM1CKfvVuXfetHhIok4RY573cqS0mZ1SJEnoTg==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/@types/triple-beam": { "version": "1.3.5", @@ -2386,7 +2382,6 @@ "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -3108,7 +3103,6 @@ "resolved": "https://registry.npmjs.org/chai/-/chai-6.0.1.tgz", "integrity": "sha512-/JOoU2//6p5vCXh00FpNgtlw0LjvhGttaWc+y7wpW9yjBm3ys0dI8tSKZxIOgNruz5J0RleccatSIC3uxEZP0g==", "license": "MIT", - "peer": true, "engines": { "node": ">=18" } @@ -3569,7 +3563,6 @@ "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", "integrity": "sha512-itvL5h8RETACmOTFc4UfIyB2RfEHi71Ax6E/PivVxq9NseKbOWpeyHEOIbmAw1rs8Ak0VursQNww7lf7YtUwzg==", "license": "MIT", - "peer": true, "dependencies": { "env-paths": "^2.2.1", "import-fresh": "^3.3.0", @@ -3989,8 +3982,7 @@ "version": "0.0.1495869", "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1495869.tgz", "integrity": "sha512-i+bkd9UYFis40RcnkW7XrOprCujXRAHg62IVh/Ah3G8MmNXpCGt1m0dTFhSdx/AVs8XEMbdOGRwdkR1Bcta8AA==", - "license": "BSD-3-Clause", - "peer": true + "license": "BSD-3-Clause" }, "node_modules/dezalgo": { "version": "1.0.4", @@ -4482,7 +4474,6 @@ "integrity": "sha512-ypowyDxpVSYpkXr9WPv2PAZCtNip1Mv5KTW0SCurXv/9iOpcrH9PaqUElksqEB6pChqHGDRCFTyrZlGhnLNGiA==", "deprecated": "This version is no longer supported. 
Please see https://eslint.org/version-support for other options.", "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.6.1", @@ -4624,7 +4615,6 @@ "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.32.0.tgz", "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==", "license": "MIT", - "peer": true, "dependencies": { "@rtsao/scc": "^1.1.0", "array-includes": "^3.1.9", @@ -8870,7 +8860,6 @@ "integrity": "sha512-QabGIvu7F0hAMiKGHZCIRHMb6UoH0QAJA2OaqxEU2tL5noXPrxUcotg2l3ttOA4p1PFnVIGkr6PXRAWlM2evVQ==", "hasInstallScript": true, "license": "Apache-2.0", - "peer": true, "dependencies": { "@puppeteer/browsers": "2.10.10", "chromium-bidi": "8.0.0", @@ -8926,7 +8915,6 @@ "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-3.3.6.tgz", "integrity": "sha512-rsLBE/6mMxAjlLd06LuGacrukP2bqbzKCLzV1vrhHFavqQE/taQ2UXv3H5P0Ls7nsrASa+6x3bDbXHpqMwq+7A==", "license": "MIT", - "peer": true, "dependencies": { "@types/debug": "^4.1.0", "debug": "^4.1.1", @@ -9887,7 +9875,6 @@ "resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz", "integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==", "license": "MIT", - "peer": true, "dependencies": { "ip-address": "^10.0.1", "smart-buffer": "^4.2.0" @@ -11334,6 +11321,18 @@ } } }, + "node_modules/xml-js": { + "version": "1.6.11", + "resolved": "https://registry.npmjs.org/xml-js/-/xml-js-1.6.11.tgz", + "integrity": "sha512-7rVi2KMfwfWFl+GpPg6m80IVMWXLRjO+PxTq7V2CDhoGak0wzYzFgUY2m4XJ47OGdXd8eLE8EmwfAmdjw7lC1g==", + "license": "MIT", + "dependencies": { + "sax": "^1.2.4" + }, + "bin": { + "xml-js": "bin/cli.js" + } + }, "node_modules/xml-name-validator": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", diff --git a/package.json b/package.json 
index f21c5f107..db0fbd146 100644 --- a/package.json +++ b/package.json @@ -100,7 +100,8 @@ "swagger-ui-express": "^5.0.1", "turndown": "^7.2.1", "winston": "^3.17.0", - "winston-mail": "^2.0.0" + "winston-mail": "^2.0.0", + "xml-js": "^1.6.11" }, "devDependencies": { "@commitlint/cli": "^19.8.1", diff --git a/scripts/reporter/duplicate/index.js b/scripts/reporter/duplicate/index.js index d2b508770..22e13b1a0 100644 --- a/scripts/reporter/duplicate/index.js +++ b/scripts/reporter/duplicate/index.js @@ -39,7 +39,7 @@ async function removeDuplicateIssues() { } for (const [ title, duplicateIssues ] of issuesByTitle) { - if (duplicateIssues.length === 1) continue; + if (duplicateIssues.length === 1) { continue; } const originalIssue = duplicateIssues.reduce((oldest, current) => (new Date(current.created_at) < new Date(oldest.created_at) ? current : oldest)); diff --git a/src/archivist/collection/index.test.js b/src/archivist/collection/index.test.js index f7689384d..3b817e615 100644 --- a/src/archivist/collection/index.test.js +++ b/src/archivist/collection/index.test.js @@ -18,7 +18,7 @@ describe('Collection', () => { try { metadataBackup = await fs.readFile(metadataPath, 'utf8'); } catch (error) { - if (error.code !== 'ENOENT') throw error; + if (error.code !== 'ENOENT') { throw error; } } }); diff --git a/src/archivist/recorder/record.js b/src/archivist/recorder/record.js index 9b335a7e0..965cb9088 100644 --- a/src/archivist/recorder/record.js +++ b/src/archivist/recorder/record.js @@ -3,6 +3,13 @@ * @class Record * @private */ + +export const TITLE_PREFIXES = Object.freeze({ + firstRecord: 'First record of', + technicalUpgrade: 'Apply technical or declaration upgrade on', + update: 'Record new changes of', +}); + export default class Record { #content; @@ -32,6 +39,20 @@ export default class Record { this.#content = content; } + get displayTitle() { + let prefix; + + if (this.isFirstRecord) { + prefix = TITLE_PREFIXES.firstRecord; + } else if 
(this.isTechnicalUpgrade) { + prefix = TITLE_PREFIXES.technicalUpgrade; + } else { + prefix = TITLE_PREFIXES.update; + } + + return `${prefix} ${this.serviceId} ${this.termsType}`; + } + validate() { for (const requiredParam of this.constructor.REQUIRED_PARAMS) { if (requiredParam == 'content') { diff --git a/src/archivist/recorder/repositories/git/dataMapper.js b/src/archivist/recorder/repositories/git/dataMapper.js index c9dadd267..9f2757f45 100644 --- a/src/archivist/recorder/repositories/git/dataMapper.js +++ b/src/archivist/recorder/repositories/git/dataMapper.js @@ -2,18 +2,27 @@ import path from 'path'; import mime from 'mime'; +import { TITLE_PREFIXES } from '../../record.js'; import Snapshot from '../../snapshot.js'; import Version from '../../version.js'; -export const COMMIT_MESSAGE_PREFIXES = { - startTracking: 'First record of', - technicalUpgrade: 'Apply technical or declaration upgrade on', - update: 'Record new changes of', +// Prefixes for commits that represent an actual content change detected at the service source +const CHANGE_PREFIXES = { + startTracking: TITLE_PREFIXES.firstRecord, + update: TITLE_PREFIXES.update, deprecated_startTracking: 'Start tracking', - deprecated_refilter: 'Refilter', deprecated_update: 'Update', }; +// Prefixes for commits that re-render an existing snapshot (e.g. 
with updated extraction rules) without any change at the service source +const TECHNICAL_UPGRADE_PREFIXES = { + technicalUpgrade: TITLE_PREFIXES.technicalUpgrade, + deprecated_refilter: 'Refilter', +}; + +export const CHANGE_COMMIT_MESSAGE_PREFIXES = CHANGE_PREFIXES; +export const COMMIT_MESSAGE_PREFIXES = { ...CHANGE_PREFIXES, ...TECHNICAL_UPGRADE_PREFIXES }; + export const TERMS_TYPE_AND_DOCUMENT_ID_SEPARATOR = ' #'; export const SNAPSHOT_ID_MARKER = '%SNAPSHOT_ID'; const SINGLE_SOURCE_DOCUMENT_PREFIX = 'This version was recorded after extracting from snapshot'; @@ -22,13 +31,9 @@ const MULTIPLE_SOURCE_DOCUMENTS_PREFIX = 'This version was recorded after extrac export const COMMIT_MESSAGE_PREFIXES_REGEXP = new RegExp(`^(${Object.values(COMMIT_MESSAGE_PREFIXES).join('|')})`); export function toPersistence(record, snapshotIdentiferTemplate) { - const { serviceId, termsType, documentId, isTechnicalUpgrade, snapshotIds = [], mimeType, isFirstRecord, metadata } = record; + const { serviceId, termsType, documentId, snapshotIds = [], mimeType, metadata } = record; - let prefix = isTechnicalUpgrade ? COMMIT_MESSAGE_PREFIXES.technicalUpgrade : COMMIT_MESSAGE_PREFIXES.update; - - prefix = isFirstRecord ? COMMIT_MESSAGE_PREFIXES.startTracking : prefix; - - const subject = `${prefix} ${serviceId} ${termsType}`; + const subject = record.displayTitle; const documentIdMessage = `${documentId ? `Document ID ${documentId}\n\n` : ''}`; let snapshotIdsMessage; @@ -91,6 +96,10 @@ function generateFileName(termsType, documentId, extension) { } export function generateFilePath(serviceId, termsType, documentId, mimeType) { + if (termsType === undefined) { + return `${serviceId}/*`; // If only serviceId is provided, return a pattern to match all files for that service + } + const extension = mime.getExtension(mimeType) || '*'; // If mime type is undefined, an asterisk is set as an extension. 
Used to match all files for the given service ID, terms type and document ID when mime type is unknown return `${serviceId}/${generateFileName(termsType, documentId, extension)}`; // Do not use `path.join` as even for Windows, the path should be with `/` and not `\` diff --git a/src/archivist/recorder/repositories/git/git.js b/src/archivist/recorder/repositories/git/git.js index 791c39310..9298d0270 100644 --- a/src/archivist/recorder/repositories/git/git.js +++ b/src/archivist/recorder/repositories/git/git.js @@ -68,8 +68,20 @@ export default class Git { return this.git.push(); } - listCommits(options = []) { - return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]); // Returns all commits in chronological order (`--reverse`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`) + listCommits(options = [], { reverse = true, skip, maxCount } = {}) { + const reverseOption = reverse ? ['--reverse'] : []; + const skipOption = skip !== undefined ? [`--skip=${skip}`] : []; + const maxCountOption = maxCount !== undefined ? 
[`--max-count=${maxCount}`] : []; + + return this.log([ + ...reverseOption, // When `reverse` is true, lists commits oldest-first; otherwise the default newest-first applies + '--author-date-order', // Best-effort author-date ordering: with --max-count, git applies the cap topologically, so the page can miss strictly-newer commits that #getCommits' JS resort cannot recover + '--no-merges', // Exclude merge commits — records are stored as regular commits, never as merges + '--name-only', // Append the modified file names below each commit, used by `toDomain` to extract the record's file path + ...skipOption, // Optional `--skip=N`: drop the first N matching commits (pagination offset) + ...maxCountOption, // Optional `--max-count=N`: cap the result to N commits (pagination limit) + ...options, // Caller-supplied options: typically grep filters on commit messages and a path filter (`-- pathspec`) + ]); } async getCommit(options) { diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js index 5caf59948..1e54fb4bd 100644 --- a/src/archivist/recorder/repositories/git/index.js +++ b/src/archivist/recorder/repositories/git/index.js @@ -88,16 +88,43 @@ export default class GitRepository extends RepositoryInterface { return this.#toDomain(commit); } - async findAll() { - return Promise.all((await this.#getCommits()).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + async findAll({ limit, offset, includeTechnicalUpgrades = true } = {}) { + return Promise.all((await this.#getCommits({ limit, offset, includeTechnicalUpgrades })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); } - async count() { - return (await this.git.log(Object.values(DataMapper.COMMIT_MESSAGE_PREFIXES).map(prefix => `--grep=${prefix}`))).length; + async findByService(serviceId, { limit, offset, includeTechnicalUpgrades = true } = {}) { + const pathPattern = DataMapper.generateFilePath(serviceId); + + 
return Promise.all((await this.#getCommits({ pathFilter: pathPattern, limit, offset, includeTechnicalUpgrades })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + } + + async findByServiceAndTermsType(serviceId, termsType, { limit, offset, includeTechnicalUpgrades = true } = {}) { + const pathPattern = DataMapper.generateFilePath(serviceId, termsType); + + return Promise.all((await this.#getCommits({ pathFilter: pathPattern, limit, offset, includeTechnicalUpgrades })).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + } + + async count(serviceId, termsType) { + const grepOptions = Object.values(DataMapper.COMMIT_MESSAGE_PREFIXES).map(prefix => `--grep=${prefix}`); + const pathOptions = []; + + if (serviceId && termsType) { + const pathPattern = DataMapper.generateFilePath(serviceId, termsType); + + pathOptions.push('--', pathPattern); + } else if (serviceId) { + const pathPattern = DataMapper.generateFilePath(serviceId); + + pathOptions.push('--', pathPattern); + } else { + pathOptions.push('--', '*/*'); // Count all records (exclude root directory files) + } + + return (await this.git.log([ ...grepOptions, ...pathOptions ])).length; } async* iterate() { - const commits = await this.#getCommits(); + const commits = await this.#getCommits({ reverse: true }); for (const commit of commits) { yield this.#toDomain(commit); @@ -131,12 +158,41 @@ export default class GitRepository extends RepositoryInterface { record.content = pdfBuffer; } - async #getCommits() { - return (await this.git.listCommits()) - .filter(commit => // Skip non-record commits (e.g., README or LICENSE updates) - DataMapper.COMMIT_MESSAGE_PREFIXES_REGEXP.test(commit.message) // Commits generated by the engine have messages that match predefined prefixes - && path.dirname(commit.diff.files[0].file) !== '.') // Assumes one record per commit; records must be in a serviceId folder, not root - .sort((commitA, commitB) => new Date(commitA.date) - new 
Date(commitB.date)); // Make sure that the commits are sorted in ascending chronological order + async #getCommits({ pathFilter, reverse = false, limit, offset, includeTechnicalUpgrades = true } = {}) { + const prefixes = includeTechnicalUpgrades + ? DataMapper.COMMIT_MESSAGE_PREFIXES + : DataMapper.CHANGE_COMMIT_MESSAGE_PREFIXES; + const grepOptions = Object.values(prefixes).flatMap(prefix => [ '--grep', prefix ]); + const pathOptions = pathFilter + ? [ '--', pathFilter ] + : [ '--', '*/*' ]; // Exclude root directory files by only matching files in subdirectories + + const options = [ ...grepOptions, ...pathOptions ]; + + // Use git-level pagination for performance: `--skip` and `--max-count` count in topological order, not strictly chronological. + // In records history, the only commits whose author date is out of step with their topological position are technical upgrades. + // The only caller currently relying on pagination is the feed endpoint, which already filters technical upgrades out via `includeTechnicalUpgrades: false`, so the paginated set has no chronological/topological divergence in practice. + // If a future caller needs paginated access that includes technical upgrades, switch to the approach proposed in https://github.com/OpenTermsArchive/engine/issues/1243. + const paginationOptions = {}; + + if (offset !== undefined) { + paginationOptions.skip = offset; + } + + if (limit !== undefined) { + paginationOptions.maxCount = limit; + } + + const commits = await this.git.listCommits(options, { reverse: false, ...paginationOptions }); // Get commits without git's --reverse for better performance, filtered at git level + + commits.sort((commitA, commitB) => { + const dateA = new Date(commitA.date); + const dateB = new Date(commitB.date); + + return reverse ? 
dateA - dateB : dateB - dateA; + }); + + return commits; } static async writeFile({ filePath, content }) { diff --git a/src/archivist/recorder/repositories/git/index.test.js b/src/archivist/recorder/repositories/git/index.test.js index 6c7e1dea0..2ae2dfdae 100644 --- a/src/archivist/recorder/repositories/git/index.test.js +++ b/src/archivist/recorder/repositories/git/index.test.js @@ -540,8 +540,253 @@ describe('GitRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + + before(async () => { + filteredRecords = await subject.findAll({ includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.length).to.equal(2); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + + it('returns the expected records in descending order', () => { + expect(filteredRecords.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); + }); + }); + + describe('#findByServiceAndTermsType', () => { + const expectedIds = []; + let records; + + before(async function () { + this.timeout(5000); + + const { id: id1 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: 
FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id2); + + await subject.save(new Version({ + serviceId: 'other_service', + termsType: 'Privacy Policy', + content: `${CONTENT} - other`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + (records = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE)); + }); + + after(() => subject.removeAll()); + + it('returns only matching records', () => { + expect(records.length).to.equal(2); + }); + + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); + + it('returns records with matching service ID', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + it('returns records with matching terms type', () => { + for (const record of records) { + expect(record.termsType).to.equal(TERMS_TYPE); + } + }); + + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); + + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); + + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByServiceAndTermsType('non_existent_service', 'Non Existent Terms'); + + expect(result).to.be.an('array').that.is.empty; + }); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE, 
{ includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + }); + }); + + describe('#findByService', () => { + const OTHER_TERMS_TYPE = 'Privacy Policy'; + const expectedIds = []; + let records; + + before(async function () { + this.timeout(5000); + + const { id: id1 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: OTHER_TERMS_TYPE, + content: `${CONTENT} - other terms type`, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id3); + + await subject.save(new Version({ + serviceId: 'other_service', + termsType: TERMS_TYPE, + content: `${CONTENT} - other service`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + (records = await subject.findByService(SERVICE_PROVIDER_ID)); + }); + + after(() => subject.removeAll()); + + it('returns only matching records', () => { + expect(records.length).to.equal(3); + }); + + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); + + it('returns records with matching service ID', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + 
it('returns records across all terms types of the service', () => { + expect(new Set(records.map(record => record.termsType))).to.deep.equal(new Set([ TERMS_TYPE, OTHER_TERMS_TYPE ])); + }); + + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); + }); + + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); + + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByService('non_existent_service'); + + expect(result).to.be.an('array').that.is.empty; + }); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: new Date('2000-01-03T12:00:00.000Z'), + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByService(SERVICE_PROVIDER_ID, { includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); }); }); @@ -582,6 +827,37 @@ describe('GitRepository', () => { it('returns the proper count', () => { expect(count).to.equal(3); }); + + context('with serviceId and termsType filters', () => { + it('returns count for specific service and terms type', async () => { + const filteredCount = await subject.count(SERVICE_PROVIDER_ID, TERMS_TYPE); + + expect(filteredCount).to.equal(3); + }); + + it('returns 
zero for non-existent service', async () => { + const filteredCount = await subject.count('non-existent-service', TERMS_TYPE); + + expect(filteredCount).to.equal(0); + }); + }); + + context('with only serviceId filter', () => { + it('returns count for all terms types of a service', async () => { + // Add a version with different terms type + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: 'Different Terms', + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + const filteredCount = await subject.count(SERVICE_PROVIDER_ID); + + expect(filteredCount).to.equal(4); // 3 from TERMS_TYPE + 1 from 'Different Terms' + }); + }); }); describe('#findLatest', () => { @@ -1101,8 +1377,8 @@ describe('GitRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); }); @@ -1462,8 +1738,8 @@ describe('GitRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal(expectedDates); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([...expectedDates].reverse()); }); }); diff --git a/src/archivist/recorder/repositories/interface.js b/src/archivist/recorder/repositories/interface.js index 1d9270944..00e8bfce5 100644 --- a/src/archivist/recorder/repositories/interface.js +++ b/src/archivist/recorder/repositories/interface.js @@ -70,21 +70,59 @@ class RepositoryInterface { } /** - * Find all records + * Find all records, in descending chronological order (newest first; opposite of #iterate) * For performance reasons, the content of the records will not be loaded by default. 
Use #loadRecordContent to load the content of individual records - * @see RepositoryInterface#loadRecordContent - * @returns {Promise<Array<Record>>} Promise that will be resolved with an array of all records + * @see RepositoryInterface#loadRecordContent + * @see RepositoryInterface#iterate + * @param {object} [options] - Query options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @param {boolean} [options.includeTechnicalUpgrades] - When false, exclude technical upgrade records (re-renders of existing snapshots) and only return records that represent actual content changes. Default: true + * @returns {Promise<Array<Record>>} Promise that will be resolved with an array of records in descending chronological order */ - async findAll() { + async findAll(options = {}) { throw new Error(`#findAll method is not implemented in ${this.constructor.name}`); } + /** + * Find all records for a specific service, in descending chronological order + * For performance reasons, the content of the records will not be loaded by default. Use #loadRecordContent to load the content of individual records + * @see RepositoryInterface#loadRecordContent + * @param {string} serviceId - Service ID of records to find + * @param {object} [options] - Query options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @param {boolean} [options.includeTechnicalUpgrades] - When false, exclude technical upgrade records (re-renders of existing snapshots) and only return records that represent actual content changes. 
Default: true + * @returns {Promise<Array<Record>>} Promise that will be resolved with an array of matching records in descending chronological order + */ + async findByService(serviceId, options = {}) { + throw new Error(`#findByService method is not implemented in ${this.constructor.name}`); + } + + /** + * Find all records for a specific service and terms type, in descending chronological order + * For performance reasons, the content of the records will not be loaded by default. Use #loadRecordContent to load the content of individual records + * @see RepositoryInterface#loadRecordContent + * @param {string} serviceId - Service ID of records to find + * @param {string} termsType - Terms type of records to find + * @param {object} [options] - Query options + * @param {number} [options.limit] - Maximum number of records to return + * @param {number} [options.offset] - Number of records to skip + * @param {boolean} [options.includeTechnicalUpgrades] - When false, exclude technical upgrade records (re-renders of existing snapshots) and only return records that represent actual content changes. 
Default: true + * @returns {Promise<Array<Record>>} Promise that will be resolved with an array of matching records in descending chronological order + */ + async findByServiceAndTermsType(serviceId, termsType, options = {}) { + throw new Error(`#findByServiceAndTermsType method is not implemented in ${this.constructor.name}`); + } + /** * Count the total number of records in the repository * For performance reasons, use this method rather than counting the number of entries returned by #findAll if you only need the size of a repository - * @returns {Promise<number>} Promise that will be resolved with the total number of records + * @param {string} [serviceId] - Optional service ID to filter records + * @param {string} [termsType] - Optional terms type to filter records (requires serviceId) + * @returns {Promise<number>} Promise that will be resolved with the total number of records */ - async count() { + async count(serviceId, termsType) { throw new Error(`#count method is not implemented in ${this.constructor.name}`); } diff --git a/src/archivist/recorder/repositories/mongo/index.js b/src/archivist/recorder/repositories/mongo/index.js index 2a4abb18c..649b22981 100644 --- a/src/archivist/recorder/repositories/mongo/index.js +++ b/src/archivist/recorder/repositories/mongo/index.js @@ -88,13 +88,76 @@ export default class MongoRepository extends RepositoryInterface { return this.#toDomain(mongoDocument); } - async findAll() { - return Promise.all((await this.collection.find().project({ content: 0 }).sort({ fetchDate: 1 }).toArray()) + async findAll({ limit, offset, includeTechnicalUpgrades = true } = {}) { + const filter = includeTechnicalUpgrades ? 
{} : { isTechnicalUpgrade: { $ne: true } }; + let query = this.collection.find(filter).project({ content: 0 }).sort({ fetchDate: -1 }); + + if (offset !== undefined) { + query = query.skip(offset); + } + + if (limit !== undefined) { + query = query.limit(limit); + } + + return Promise.all((await query.toArray()) + .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); + } + + async findByServiceAndTermsType(serviceId, termsType, { limit, offset, includeTechnicalUpgrades = true } = {}) { + const filter = { serviceId, termsType }; + + if (!includeTechnicalUpgrades) { + filter.isTechnicalUpgrade = { $ne: true }; + } + + let query = this.collection.find(filter).project({ content: 0 }).sort({ fetchDate: -1 }); + + if (offset !== undefined) { + query = query.skip(offset); + } + + if (limit !== undefined) { + query = query.limit(limit); + } + + return Promise.all((await query.toArray()) .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); } - count() { - return this.collection.countDocuments(); + async findByService(serviceId, { limit, offset, includeTechnicalUpgrades = true } = {}) { + const filter = { serviceId }; + + if (!includeTechnicalUpgrades) { + filter.isTechnicalUpgrade = { $ne: true }; + } + + let query = this.collection.find(filter).project({ content: 0 }).sort({ fetchDate: -1 }); + + if (offset !== undefined) { + query = query.skip(offset); + } + + if (limit !== undefined) { + query = query.limit(limit); + } + + return Promise.all((await query.toArray()) + .map(mongoDocument => this.#toDomain(mongoDocument, { deferContentLoading: true }))); + } + + count(serviceId, termsType) { + const filter = {}; + + if (serviceId) { + filter.serviceId = serviceId; + } + + if (termsType) { + filter.termsType = termsType; + } + + return this.collection.countDocuments(filter); } async* iterate() { diff --git a/src/archivist/recorder/repositories/mongo/index.test.js 
b/src/archivist/recorder/repositories/mongo/index.test.js index 61ecfd1d0..72b3f3b3d 100644 --- a/src/archivist/recorder/repositories/mongo/index.test.js +++ b/src/archivist/recorder/repositories/mongo/index.test.js @@ -629,45 +629,362 @@ describe('MongoRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + + before(async () => { + filteredRecords = await subject.findAll({ includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.length).to.equal(2); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + + it('returns the expected records in descending order', () => { + expect(filteredRecords.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); }); }); - describe('#count', () => { - let count; + describe('#findByServiceAndTermsType', () => { + const expectedIds = []; + let records; before(async () => { - await subject.save(new Version({ + const { id: id1 } = await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: CONTENT, fetchDate: FETCH_DATE, snapshotIds: [SNAPSHOT_ID], })); - await subject.save(new Version({ + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, content: `${CONTENT} - updated`, fetchDate: FETCH_DATE_LATER, snapshotIds: [SNAPSHOT_ID], })); + + expectedIds.push(id2); + await subject.save(new Version({ 
+ serviceId: 'other_service', + termsType: 'Privacy Policy', + content: `${CONTENT} - other`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + (records = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE)); + }); + + after(() => subject.removeAll()); + + it('returns only matching records', () => { + expect(records.length).to.equal(2); + }); + + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); + + it('returns records with matching service ID', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + } + }); + + it('returns records with matching terms type', () => { + for (const record of records) { + expect(record.termsType).to.equal(TERMS_TYPE); + } + }); + + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE ]); + }); + + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); + + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByServiceAndTermsType('non_existent_service', 'Non Existent Terms'); + + expect(result).to.be.an('array').that.is.empty; + }); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByServiceAndTermsType(SERVICE_PROVIDER_ID, TERMS_TYPE, { includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + 
expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + }); + }); + + describe('#findByService', () => { + const OTHER_TERMS_TYPE = 'Privacy Policy'; + const expectedIds = []; + let records; + + before(async () => { + const { id: id1 } = await subject.save(new Version({ serviceId: SERVICE_PROVIDER_ID, termsType: TERMS_TYPE, - content: `${CONTENT} - updated 2`, - isTechnicalUpgrade: true, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: OTHER_TERMS_TYPE, + content: `${CONTENT} - other terms type`, fetchDate: FETCH_DATE_EARLIER, snapshotIds: [SNAPSHOT_ID], })); - (count = await subject.count()); + expectedIds.push(id3); + + await subject.save(new Version({ + serviceId: 'other_service', + termsType: TERMS_TYPE, + content: `${CONTENT} - other service`, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + + (records = await subject.findByService(SERVICE_PROVIDER_ID)); }); after(() => subject.removeAll()); - it('returns the proper count', () => { - expect(count).to.equal(3); + it('returns only matching records', () => { + expect(records.length).to.equal(3); + }); + + it('returns Version objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Version); + } + }); + + it('returns records with matching service ID', () => { + for (const record of records) { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + 
} + }); + + it('returns records across all terms types of the service', () => { + expect(new Set(records.map(record => record.termsType))).to.deep.equal(new Set([ TERMS_TYPE, OTHER_TERMS_TYPE ])); + }); + + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); + }); + + it('returns records with correct IDs', () => { + expect(records.map(record => record.id)).to.have.members(expectedIds); + }); + + context('when no matching records exist', () => { + it('returns an empty array', async () => { + const result = await subject.findByService('non_existent_service'); + + expect(result).to.be.an('array').that.is.empty; + }); + }); + + context('with includeTechnicalUpgrades: false', () => { + let filteredRecords; + let technicalUpgradeId; + + before(async () => { + ({ id: technicalUpgradeId } = await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - technical upgrade`, + fetchDate: new Date('2000-01-03T12:00:00.000Z'), + snapshotIds: [SNAPSHOT_ID], + isTechnicalUpgrade: true, + }))); + + filteredRecords = await subject.findByService(SERVICE_PROVIDER_ID, { includeTechnicalUpgrades: false }); + }); + + it('excludes technical upgrade records', () => { + expect(filteredRecords.map(record => record.id)).to.not.include(technicalUpgradeId); + }); + + it('only returns records that represent actual content changes', () => { + for (const record of filteredRecords) { + expect(record.isTechnicalUpgrade).to.not.be.true; + } + }); + }); + }); + + describe('#count', () => { + context('without filters', () => { + let count; + + before(async () => { + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + 
content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated 2`, + isTechnicalUpgrade: true, + fetchDate: FETCH_DATE_EARLIER, + snapshotIds: [SNAPSHOT_ID], + })); + + (count = await subject.count()); + }); + + after(() => subject.removeAll()); + + it('returns the proper count', () => { + expect(count).to.equal(3); + }); + }); + + context('with serviceId and termsType filters', () => { + before(async () => { + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: 'other_service', + termsType: 'Privacy Policy', + content: 'Other content', + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + }); + + after(() => subject.removeAll()); + + it('returns count for specific service and terms type', async () => { + const filteredCount = await subject.count(SERVICE_PROVIDER_ID, TERMS_TYPE); + + expect(filteredCount).to.equal(2); + }); + + it('returns zero for non-existent service', async () => { + const filteredCount = await subject.count('non-existent-service', TERMS_TYPE); + + expect(filteredCount).to.equal(0); + }); + }); + + context('with only serviceId filter', () => { + before(async () => { + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: TERMS_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: SERVICE_PROVIDER_ID, + termsType: 'Different Terms', + content: 'Different content', + fetchDate: FETCH_DATE_LATER, + snapshotIds: 
[SNAPSHOT_ID], + })); + await subject.save(new Version({ + serviceId: 'other_service', + termsType: 'Privacy Policy', + content: 'Other content', + fetchDate: FETCH_DATE, + snapshotIds: [SNAPSHOT_ID], + })); + }); + + after(() => subject.removeAll()); + + it('returns count for all terms types of a service', async () => { + const filteredCount = await subject.count(SERVICE_PROVIDER_ID); + + expect(filteredCount).to.equal(2); + }); }); }); @@ -1197,8 +1514,8 @@ describe('MongoRepository', () => { } }); - it('returns records in ascending order', () => { - expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + it('returns records in descending order', () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_LATER, FETCH_DATE, FETCH_DATE_EARLIER ]); }); }); diff --git a/src/archivist/recorder/version.test.js b/src/archivist/recorder/version.test.js index 248d607dd..d03a2af3f 100644 --- a/src/archivist/recorder/version.test.js +++ b/src/archivist/recorder/version.test.js @@ -60,4 +60,42 @@ describe('Version', () => { }); }); }); + + describe('#displayTitle', () => { + const baseParams = { + serviceId: 'service-A', + termsType: 'Terms of Service', + fetchDate: new Date('2000-01-01T12:00:00.000Z'), + content: 'some content', + snapshotIds: ['dd263f270b3065e1c18201b49ab898474b357566'], + }; + + context('when the record is the first one for its service and terms type', () => { + it('starts with the first-record prefix', () => { + subject = new Version({ ...baseParams, isFirstRecord: true }); + expect(subject.displayTitle).to.equal('First record of service-A Terms of Service'); + }); + }); + + context('when the record is a technical upgrade', () => { + it('starts with the technical-upgrade prefix', () => { + subject = new Version({ ...baseParams, isTechnicalUpgrade: true }); + expect(subject.displayTitle).to.equal('Apply technical or declaration upgrade on service-A Terms of Service'); 
+ }); + }); + + context('when the record is a regular content change', () => { + it('starts with the update prefix', () => { + subject = new Version(baseParams); + expect(subject.displayTitle).to.equal('Record new changes of service-A Terms of Service'); + }); + }); + + context('when the record is both a first record and a technical upgrade', () => { + it('prioritises the first-record prefix', () => { + subject = new Version({ ...baseParams, isFirstRecord: true, isTechnicalUpgrade: true }); + expect(subject.displayTitle).to.equal('First record of service-A Terms of Service'); + }); + }); + }); }); diff --git a/src/archivist/services/index.js b/src/archivist/services/index.js index cdcc07bbf..980973379 100644 --- a/src/archivist/services/index.js +++ b/src/archivist/services/index.js @@ -281,7 +281,7 @@ function getHistoryFilePaths(serviceId) { } async function loadServiceHistory(historyFilePath) { - if (!(await fileExists(historyFilePath))) return {}; + if (!(await fileExists(historyFilePath))) { return {}; } try { return JSON.parse(await fs.readFile(historyFilePath)); diff --git a/src/collection-api/routes/feed.js b/src/collection-api/routes/feed.js new file mode 100644 index 000000000..02de64b1e --- /dev/null +++ b/src/collection-api/routes/feed.js @@ -0,0 +1,253 @@ +import express from 'express'; +import { js2xml } from 'xml-js'; + +import { getCollection } from '../../archivist/collection/index.js'; +import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; + +const RECORD_TYPES = { + firstRecord: 'First record', + change: 'Change', +}; + +const TAG_AUTHORITY = 'opentermsarchive.org,2026'; // Tag URI authority (RFC 4151). The year fixes the scheme inception and must never change: it would invalidate every previously emitted feed and entry ID. 
+const FEED_AUTHOR_NAME = 'Open Terms Archive engine'; + +const SCHEMES = Object.freeze({ + service: `tag:${TAG_AUTHORITY}:scheme:service`, + termsType: `tag:${TAG_AUTHORITY}:scheme:terms-type`, + recordType: `tag:${TAG_AUTHORITY}:scheme:record-type`, +}); + +function buildAbsoluteBaseUrl(req) { + const host = req.get('X-Forwarded-Host') ?? req.get('host'); // Behind a trusted reverse proxy, the public host comes from X-Forwarded-Host. req.get('host') only sees the internal Host header, so we read the forwarded value explicitly and fall back to the direct host for non-proxied setups (dev, tests). + + return `${req.protocol}://${host}${req.baseUrl}`; +} + +function classifyRecordType(version) { + return version.isFirstRecord ? RECORD_TYPES.firstRecord : RECORD_TYPES.change; +} + +// xml-js does not escape attribute values by default — callers are expected to pre-escape. We wire this helper to js2xml's attributeValueFn so every emitted attribute goes through it, regardless of where it's built. Without this, a serviceId like "AT&T Mobile" would yield malformed XML rejected by strict feed readers (libxml2-based). 
+function escapeXmlAttribute(value) { + return String(value) + .replace(/&/g, '&amp;') + .replace(/</g, '&lt;') + .replace(/>/g, '&gt;') + .replace(/"/g, '&quot;'); +} + +function buildVersionLink(baseUrl, version) { + const encodedDate = encodeURIComponent(toISODateWithoutMilliseconds(version.fetchDate)); + const encodedService = encodeURIComponent(version.serviceId); + const encodedTermsType = encodeURIComponent(version.termsType); + + return `${baseUrl}/version/${encodedService}/${encodedTermsType}/${encodedDate}`; +} + +function buildEntryId(storageType, collection, version) { + return `tag:${TAG_AUTHORITY}:version:${collection.metadata?.id}:${storageType}:${version.id}`; +} + +function buildFeedId(collection, ...suffix) { + return [ `tag:${TAG_AUTHORITY}:feed`, collection.metadata?.id, ...suffix ].join(':'); +} + +function buildEntry(storageType, versionUrlTemplate, baseUrl, collection, version) { + const href = versionUrlTemplate?.replace('%VERSION_ID', version.id) ?? buildVersionLink(baseUrl, version); + const type = versionUrlTemplate ? 'text/html' : 'application/json'; // The default link points to the JSON Version API; operators who configure a versionUrlTemplate typically target a human-readable page (e.g. a GitHub commit), which is HTML. + + return { + id: { _text: buildEntryId(storageType, collection, version) }, + link: { _attributes: { rel: 'alternate', type, href } }, + title: { _text: version.displayTitle }, + updated: { _text: version.fetchDate.toISOString() }, + category: [ + { _attributes: { term: version.serviceId, scheme: SCHEMES.service } }, + { _attributes: { term: version.termsType, scheme: SCHEMES.termsType } }, + { _attributes: { term: classifyRecordType(version), scheme: SCHEMES.recordType } }, + ], + }; +} + +function computeLatestFetchDate(versions) { + return versions.length > 0 ? versions[0].fetchDate : new Date(0); // Atom 1.0 requires a feed-level <updated>.
When no entry exists yet, fall back to the Unix epoch so the value is stable across requests, emitting `new Date()` would defeat conditional GET caching for empty feeds. +} + +function buildFeedDocument({ storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl, latestFetchDate }) { + const feed = { + _attributes: { xmlns: 'http://www.w3.org/2005/Atom' }, + title: { _text: collection.metadata.name }, + id: { _text: feedId }, + updated: { _text: latestFetchDate.toISOString() }, + link: { _attributes: { rel: 'self', type: 'application/atom+xml', href: selfHref } }, + author: { name: { _text: FEED_AUTHOR_NAME } }, + }; + + if (collection.metadata?.tagline) { + feed.subtitle = { _text: collection.metadata.tagline }; + } + + if (collection.metadata?.logo) { + feed.logo = { _text: collection.metadata.logo }; + } + + feed.entry = versions.map(version => buildEntry(storageType, versionUrlTemplate, baseUrl, collection, version)); + + return { + _declaration: { _attributes: { version: '1.0', encoding: 'utf-8' } }, + feed, + }; +} + +function sendFeed(req, res, opts) { + const latestFetchDate = computeLatestFetchDate(opts.versions); + + res.set('Last-Modified', latestFetchDate.toUTCString()); // Setting Last-Modified before checking req.fresh enables Express to compare it with If-Modified-Since and return 304 when nothing changed since the reader's last fetch; the headline optimisation for Atom feeds, which are typically polled every few minutes. 
+ + if (req.fresh) { + return res.status(304).end(); + } + + res.set('Content-Type', 'application/atom+xml; charset=utf-8'); + const document = buildFeedDocument({ ...opts, latestFetchDate }); + + return res.status(200).send(js2xml(document, { compact: true, spaces: 2, attributeValueFn: escapeXmlAttribute })); +} + +/** + * @param {object} services The services to be exposed by the API + * @param {object} versionsRepository The versions repository instance + * @param {string} storageType The storage type identifier of the versions repository + * @param {number} feedLimit Maximum number of entries returned by feed endpoints + * @param {string} [versionUrlTemplate] Optional URL template with %VERSION_ID placeholder; when set, replaces the API link as each entry's alternate href + * @returns {express.Router} The router instance + * @swagger + * tags: + * name: Feeds + * description: Atom feeds of version changes + */ +export default function feedRouter(services, versionsRepository, storageType, feedLimit, versionUrlTemplate) { + const router = express.Router(); + + async function renderFeed(req, res, { selfHref, suffix = [], versions }) { + const collection = await getCollection(); + const baseUrl = buildAbsoluteBaseUrl(req); + const feedId = buildFeedId(collection, ...suffix); + + return sendFeed(req, res, { storageType, versionUrlTemplate, collection, selfHref, feedId, versions, baseUrl }); + } + + /** + * @swagger + * /feed: + * get: + * summary: Atom feed of the latest version changes across the whole collection. + * tags: [Feeds] + * produces: + * - application/atom+xml + * responses: + * 200: + * description: An Atom 1.0 feed listing the latest version records, newest first. The maximum number of entries is server-configured. 
+ * content: + * application/atom+xml: + * schema: + * type: string + */ + router.get('/feed', async (req, res) => { + const versions = await versionsRepository.findAll({ limit: feedLimit, includeTechnicalUpgrades: false }); + const selfHref = `${buildAbsoluteBaseUrl(req)}/feed`; + + return renderFeed(req, res, { selfHref, versions }); + }); + + /** + * @swagger + * /feed/{serviceId}: + * get: + * summary: Atom feed of the latest version changes scoped to a single service. + * tags: [Feeds] + * produces: + * - application/atom+xml + * parameters: + * - in: path + * name: serviceId + * description: The ID of the service. + * schema: + * type: string + * required: true + * responses: + * 200: + * description: An Atom 1.0 feed listing the latest version records for the given service, newest first. + * content: + * application/atom+xml: + * schema: + * type: string + * 404: + * description: No service matching the provided ID is found. + */ + router.get('/feed/:serviceId', async (req, res) => { + const service = Object.hasOwn(services, req.params.serviceId) ? services[req.params.serviceId] : null; + + if (!service) { + return res.status(404).send('Service not found'); + } + + const versions = await versionsRepository.findByService(service.id, { limit: feedLimit, includeTechnicalUpgrades: false }); + const selfHref = `${buildAbsoluteBaseUrl(req)}/feed/${encodeURIComponent(service.id)}`; + + return renderFeed(req, res, { selfHref, suffix: [service.id], versions }); + }); + + /** + * @swagger + * /feed/{serviceId}/{termsType}: + * get: + * summary: Atom feed of the latest version changes scoped to a service and terms type. + * tags: [Feeds] + * produces: + * - application/atom+xml + * parameters: + * - in: path + * name: serviceId + * description: The ID of the service. + * schema: + * type: string + * required: true + * - in: path + * name: termsType + * description: The terms type declared by the service (e.g. "Terms of Service", "Privacy Policy"). 
+ * schema: + * type: string + * required: true + * responses: + * 200: + * description: An Atom 1.0 feed listing the latest version records for the given service and terms type, newest first. + * content: + * application/atom+xml: + * schema: + * type: string + * 404: + * description: Either the service ID does not match any service or the terms type is not declared by that service. + */ + router.get('/feed/:serviceId/:termsType', async (req, res) => { + const service = Object.hasOwn(services, req.params.serviceId) ? services[req.params.serviceId] : null; + + if (!service) { + return res.status(404).send('Service not found'); + } + + const { termsType } = req.params; + + if (!service.getTermsTypes().includes(termsType)) { + return res.status(404).send('Terms type not found for this service'); + } + + const versions = await versionsRepository.findByServiceAndTermsType(service.id, termsType, { limit: feedLimit, includeTechnicalUpgrades: false }); + const selfHref = `${buildAbsoluteBaseUrl(req)}/feed/${encodeURIComponent(service.id)}/${encodeURIComponent(termsType)}`; + + return renderFeed(req, res, { selfHref, suffix: [ service.id, termsType ], versions }); + }); + + return router; +} diff --git a/src/collection-api/routes/feed.test.js b/src/collection-api/routes/feed.test.js new file mode 100644 index 000000000..ed95e4964 --- /dev/null +++ b/src/collection-api/routes/feed.test.js @@ -0,0 +1,739 @@ +import { expect } from 'chai'; +import config from 'config'; +import express from 'express'; +import supertest from 'supertest'; + +import { getCollection } from '../../archivist/collection/index.js'; +import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; +import Version from '../../archivist/recorder/version.js'; +import * as Services from '../../archivist/services/index.js'; +import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; +import app from '../server.js'; + +import feedRouter from './feed.js'; + +const basePath 
= config.get('@opentermsarchive/engine.collection-api.basePath'); +const request = supertest(app); +const storageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); + +function extractTag(xml, tag) { + const match = xml.match(new RegExp(`<${tag}>([\\s\\S]*?)`)); + + return match ? match[1] : null; +} + +describe('Feed API', () => { + describe('GET /feed', () => { + let response; + let collection; + + before(async () => { + collection = await getCollection(); + response = await request.get(`${basePath}/v1/feed`); + }); + + it('responds with 200 status code', () => { + expect(response.status).to.equal(200); + }); + + it('responds with Content-Type application/atom+xml', () => { + expect(response.headers['content-type']).to.match(/^application\/atom\+xml/); + }); + + it('is a valid Atom feed root', () => { + expect(response.text).to.match(/^<\?xml version="1\.0"/); + expect(response.text).to.include(' { + it('has a title matching the collection name', () => { + expect(extractTag(response.text, 'title')).to.equal(collection.metadata.name); + }); + + it('has a subtitle matching the collection tagline', () => { + expect(extractTag(response.text, 'subtitle')).to.equal(collection.metadata.tagline); + }); + + it('has a tag URI id based on the collection id', () => { + expect(extractTag(response.text, 'id')).to.equal(`tag:opentermsarchive.org,2026:feed:${collection.metadata.id}`); + }); + + it('has an updated element with a valid ISO 8601 datetime', () => { + const updated = extractTag(response.text, 'updated'); + + expect(updated).to.be.a('string'); + expect(new Date(updated).toString()).to.not.equal('Invalid Date'); + }); + + it('has a self link pointing to the feed endpoint', () => { + const selfHrefMatch = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/); + + expect(selfHrefMatch).to.not.be.null; + expect(selfHrefMatch[1]).to.match(new RegExp(`${basePath}/v1/feed$`)); + }); + + it('advertises an application/atom+xml type on the self link', () 
=> { + expect(response.text).to.match(/]*rel="self"[^>]*type="application\/atom\+xml"/); + }); + + it('has an author matching the feed author name', () => { + expect(response.text).to.match(/[\s\S]*Open Terms Archive engine<\/name>[\s\S]*<\/author>/); + }); + + it('has a logo matching the collection logo', () => { + expect(extractTag(response.text, 'logo')).to.equal(collection.metadata.logo); + }); + }); + }); + + describe('GET /feed — entries', () => { + const FETCH_DATE_FIRST = new Date('2023-01-01T12:00:00Z'); + const FETCH_DATE_CHANGE = new Date('2023-06-15T08:30:00Z'); + const FETCH_DATE_UPGRADE = new Date('2024-02-10T16:45:00Z'); + + let response; + let repository; + let savedVersions; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + const firstRecord = await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'first content', + fetchDate: FETCH_DATE_FIRST, + snapshotIds: ['snapshot_1'], + })); + + const changeRecord = await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'changed content', + fetchDate: FETCH_DATE_CHANGE, + snapshotIds: ['snapshot_2'], + })); + + const upgradeRecord = await repository.save(new Version({ + serviceId: 'service-2', + termsType: 'Privacy Policy', + content: 'initial privacy', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['snapshot_3'], + })); + + const technicalUpgradeRecord = await repository.save(new Version({ + serviceId: 'service-2', + termsType: 'Privacy Policy', + content: 'upgraded privacy', + fetchDate: FETCH_DATE_UPGRADE, + snapshotIds: ['snapshot_4'], + isTechnicalUpgrade: true, + })); + + savedVersions = { firstRecord, changeRecord, upgradeRecord, technicalUpgradeRecord }; + response = await request.get(`${basePath}/v1/feed`); + }); + + after(() => repository.removeAll()); + + 
// Specs for the unscoped feed built from the four versions saved in the enclosing `before` hook.
// The newest version that is not a technical upgrade is `savedVersions.upgradeRecord` (despite its
// name, it is the first record of service-2 / Privacy Policy), so it is the entry expected first.

it('lists one entry per real-change version up to the configured limit', () => {
  const limit = config.get('@opentermsarchive/engine.collection-api.feed.limit');
  const entries = response.text.match(/<entry>/g) || [];

  // Three of the four saved versions are real changes; the technical upgrade must not be listed.
  expect(entries).to.have.length(Math.min(3, limit));
});

it('orders entries newest-first', () => {
  const updates = [...response.text.matchAll(/<entry>[\s\S]*?<updated>([^<]+)<\/updated>[\s\S]*?<\/entry>/g)].map(match => match[1]);

  // Relies on every <updated> value sharing one timestamp format, so that string order equals chronological order.
  expect(updates).to.deep.equal([...updates].sort().reverse());
});

describe('entry metadata', () => {
  let firstEntry;

  /**
   * Collects the raw attribute string of every <category/> element of the first entry.
   * @returns {string[]} One attribute string per category element
   */
  function extractCategoryAttributes() {
    return [...firstEntry.matchAll(/<category([^/]*)\/>/g)].map(match => match[1]);
  }

  before(() => {
    const match = response.text.match(/<entry>[\s\S]*?<\/entry>/);

    // Fail with an explicit message instead of a TypeError on destructuring when the feed has no entry.
    expect(match, 'feed should contain at least one entry').to.not.be.null;

    [firstEntry] = match;
  });

  it('has an id tag URI including storage type and record id', () => {
    const collectionId = 'test';
    const expected = `tag:opentermsarchive.org,2026:version:${collectionId}:${storageConfig.type}:${savedVersions.upgradeRecord.id}`;

    expect(firstEntry).to.include(`<id>${expected}</id>`);
  });

  it('has an alternate link to the API version endpoint', () => {
    const href = firstEntry.match(/<link[^>]*rel="alternate"[^>]*href="([^"]+)"/)[1];
    const expectedPathFragment = `/version/${encodeURIComponent('service-2')}/${encodeURIComponent('Privacy Policy')}/${encodeURIComponent(toISODateWithoutMilliseconds(savedVersions.upgradeRecord.fetchDate))}`;

    expect(href).to.include(expectedPathFragment);
  });

  it('has exactly one link per entry', () => {
    const links = firstEntry.match(/<link[^>]*\/>/g) || [];

    expect(links).to.have.length(1);
  });

  it('has a type matching the default Version API JSON response on the alternate link', () => {
    expect(firstEntry).to.match(/<link[^>]*rel="alternate"[^>]*type="application\/json"/);
  });

  it('has a title reconstructed from commit prefix + serviceId + termsType', () => {
    const title = firstEntry.match(/<title[^>]*>([\s\S]*?)<\/title>/)[1];

    expect(title).to.include('First record of');
    expect(title).to.include('service-2');
    expect(title).to.include('Privacy Policy');
  });

  it('has an updated element matching the fetch date', () => {
    const updated = firstEntry.match(/<updated>([^<]+)<\/updated>/)[1];

    // Compare through Date so that differences in timestamp precision or formatting do not matter.
    expect(new Date(updated).toISOString()).to.equal(savedVersions.upgradeRecord.fetchDate.toISOString());
  });

  it('has three categories with the expected schemes', () => {
    const categories = extractCategoryAttributes();

    expect(categories).to.have.length(3);

    const schemes = categories.map(attrs => attrs.match(/scheme="([^"]+)"/)[1]);

    expect(schemes).to.include('tag:opentermsarchive.org,2026:scheme:service');
    expect(schemes).to.include('tag:opentermsarchive.org,2026:scheme:terms-type');
    expect(schemes).to.include('tag:opentermsarchive.org,2026:scheme:record-type');
  });

  it('has category terms for service, terms type and record type', () => {
    const terms = extractCategoryAttributes().map(attrs => attrs.match(/term="([^"]+)"/)[1]);

    expect(terms).to.include('service-2');
    expect(terms).to.include('Privacy Policy');
    expect(terms).to.include('First record');
  });
});

describe('record-type classification', () => {
  /**
   * Finds the feed entry whose markup references the given record id.
   * @param {string} xml The Atom feed document
   * @param {string} recordId The id of the saved version to look for
   * @returns {string|undefined} The matching <entry> markup, or undefined when no entry references the id
   */
  function findEntryById(xml, recordId) {
    const match = [...xml.matchAll(/<entry>[\s\S]*?<\/entry>/g)].find(entry => entry[0].includes(`:${recordId}`));

    return match?.[0];
  }

  it('classifies a first record as "First record"', () => {
    const entry = findEntryById(response.text, savedVersions.upgradeRecord.id);

    expect(entry).to.not.be.undefined;
    expect(entry).to.match(/term="First record"/);
  });

  it('classifies a content change as "Change"', () => {
    const entry = findEntryById(response.text, savedVersions.changeRecord.id);

    expect(entry).to.not.be.undefined;
    expect(entry).to.match(/term="Change"/);
  });

  it('excludes technical upgrade records from the feed', () => {
    const entry = findEntryById(response.text, savedVersions.technicalUpgradeRecord.id);

    expect(entry).to.be.undefined;
  });
});

describe('configurable limit', () => { + it('returns at most the configured number of entries', () => { + const limit = config.get('@opentermsarchive/engine.collection-api.feed.limit'); + const entries = response.text.match(//g) || []; + + expect(entries.length).to.be.at.most(limit); + }); + }); + }); + + describe('GET /feed/:serviceId', () => { + const SERVICE = 'service_without_history'; + const OTHER_SERVICE = 'service_with_history'; + const TERMS = 'Terms of Service'; + + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'c1', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'c2', + fetchDate: new Date('2024-02-01T00:00:00Z'), + snapshotIds: ['s2'], + })); + await repository.save(new Version({ + serviceId: OTHER_SERVICE, + termsType: TERMS, + content: 'c3', + fetchDate: new Date('2024-03-01T00:00:00Z'), + snapshotIds: ['s3'], + })); + }); + + after(() => repository.removeAll()); + + context('when the service exists and has versions', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}`); + }); + + it('responds with 200', () => { + expect(response.status).to.equal(200); + }); + + it('responds with Content-Type application/atom+xml', () => { + expect(response.headers['content-type']).to.match(/^application\/atom\+xml/); + }); + + it('includes only entries for that service', () => { + const serviceTerms = [...response.text.matchAll(/scheme="tag:opentermsarchive.org,2026:scheme:service"[^/]*term="([^"]+)"/g)] + .concat([...response.text.matchAll(/term="([^"]+)"[^/]*scheme="tag:opentermsarchive.org,2026:scheme:service"/g)]) + .map(match => 
match[1]); + + expect(serviceTerms).to.not.be.empty; + + for (const term of serviceTerms) { + expect(term).to.equal(SERVICE); + } + }); + + it('has a feed id including the service id', () => { + expect(extractTag(response.text, 'id')).to.equal(`tag:opentermsarchive.org,2026:feed:test:${SERVICE}`); + }); + + it('has a self link pointing to the service-scoped feed endpoint', () => { + const href = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(new RegExp(`/feed/${SERVICE}$`)); + }); + }); + + context('when the service exists but has no versions', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent('service_with_filters_history')}`); + }); + + it('responds with 200', () => { + expect(response.status).to.equal(200); + }); + + it('returns an empty feed (no entries)', () => { + expect(response.text).to.not.include(''); + }); + + it('uses a stable updated date so conditional GET keeps working', () => { + expect(extractTag(response.text, 'updated')).to.equal(new Date(0).toISOString()); + }); + }); + + context('when the service does not exist', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/DoesNotExist`); + }); + + it('responds with 404', () => { + expect(response.status).to.equal(404); + }); + }); + + context('when the serviceId casing does not match', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE.toUpperCase())}`); + }); + + it('responds with 404', () => { + expect(response.status).to.equal(404); + }); + }); + }); + + describe('XML escaping and URL encoding', () => { + const SERVICE = 'Service B!'; + const TERMS = 'Privacy Policy'; + const FETCH_DATE = new Date('2024-05-15T10:00:00Z'); + + let response; + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + 
await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'content with & and ', + fetchDate: FETCH_DATE, + snapshotIds: ['s_escape'], + })); + + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}/${encodeURIComponent(TERMS)}`); + }); + + after(() => repository.removeAll()); + + it('responds with 200', () => { + expect(response.status).to.equal(200); + }); + + it('URL-encodes spaces and special characters in the self link href', () => { + const href = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.include('Service%20B!'); + expect(href).to.include('Privacy%20Policy'); + expect(href).to.not.include('Service B!'); + }); + + it('URL-encodes spaces and special characters in entry alternate links', () => { + const entry = response.text.match(/[\s\S]*?<\/entry>/)[0]; + const href = entry.match(/]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.include('Service%20B!'); + expect(href).to.include('Privacy%20Policy'); + }); + }); + + describe('GET /feed/:serviceId/:termsType', () => { + const SERVICE = 'service_without_history'; + const TERMS = 'Terms of Service'; + const UNKNOWN_TERMS = 'Imprint'; + + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'first', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'updated', + fetchDate: new Date('2024-02-01T00:00:00Z'), + snapshotIds: ['s2'], + })); + }); + + after(() => repository.removeAll()); + + context('when the service and terms type match', () => { + let response; + + before(async () => { + response = await 
request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}/${encodeURIComponent(TERMS)}`); + }); + + it('responds with 200', () => { + expect(response.status).to.equal(200); + }); + + it('includes entries for the combination', () => { + const entries = response.text.match(//g) || []; + + expect(entries.length).to.be.at.least(1); + }); + + it('entries only have the expected terms type', () => { + const termsTypeTerms = [...response.text.matchAll(/ match[1]); + + for (const term of termsTypeTerms) { + expect(term).to.equal(TERMS); + } + }); + + it('has a feed id that includes both service and terms type', () => { + expect(extractTag(response.text, 'id')).to.equal(`tag:opentermsarchive.org,2026:feed:test:${SERVICE}:${TERMS}`); + }); + + it('has a self link pointing to the combination endpoint', () => { + const href = response.text.match(/]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(new RegExp(`/feed/${SERVICE}/${encodeURIComponent(TERMS)}$`)); + }); + }); + + context('when the service exists but does not declare the terms type', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/${encodeURIComponent(SERVICE)}/${encodeURIComponent(UNKNOWN_TERMS)}`); + }); + + it('responds with 404', () => { + expect(response.status).to.equal(404); + }); + }); + + context('when the service does not exist', () => { + let response; + + before(async () => { + response = await request.get(`${basePath}/v1/feed/DoesNotExist/${encodeURIComponent(TERMS)}`); + }); + + it('responds with 404', () => { + expect(response.status).to.equal(404); + }); + }); + }); + + describe('entry links with versionUrlTemplate configured', () => { + const TEMPLATE = 'https://example.test/v/%VERSION_ID'; + + let response; + let repository; + let savedVersion; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + savedVersion 
= await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'content', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + + const services = await Services.load(); + const templatedApp = express(); + + templatedApp.use(feedRouter(services, repository, storageConfig.type, 10, TEMPLATE)); + + response = await supertest(templatedApp).get('/feed'); + }); + + after(() => repository.removeAll()); + + it('interpolates %VERSION_ID into the alternate link', () => { + const href = response.text.match(/[\s\S]*?]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.equal(`https://example.test/v/${savedVersion.id}`); + }); + + it('advertises a text/html type on the alternate link', () => { + const entry = response.text.match(/[\s\S]*?<\/entry>/)[0]; + + expect(entry).to.match(/]*rel="alternate"[^>]*type="text\/html"/); + }); + + it('does not point to the API for entry links', () => { + const entries = response.text.match(/[\s\S]*?<\/entry>/g) || []; + + for (const entry of entries) { + expect(entry).to.not.match(/]*href="[^"]*\/version\//); + } + }); + + it('still emits exactly one link per entry', () => { + const entries = response.text.match(/[\s\S]*?<\/entry>/g) || []; + + for (const entry of entries) { + const links = entry.match(/]*\/>/g) || []; + + expect(links).to.have.length(1); + } + }); + }); + + describe('XML escape of special characters', () => { + const SERVICE = 'AT&T Mobile'; + const TERMS = 'Terms of Service'; + + let response; + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: SERVICE, + termsType: TERMS, + content: 'content', + fetchDate: new Date('2024-05-15T10:00:00Z'), + snapshotIds: ['s_xmlesc'], + })); + + response = await request.get(`${basePath}/v1/feed`); + }); + + after(() => 
repository.removeAll()); + + it('escapes ampersands in the entry title text', () => { + expect(response.text).to.match(/[^<]*AT&T Mobile[^<]*<\/title>/); + expect(response.text).to.not.match(/<title>[^<]*AT&T Mobile/); + }); + + it('escapes ampersands in the category term attribute', () => { + expect(response.text).to.match(/<category[^/]*term="AT&T Mobile"/); + expect(response.text).to.not.match(/<category[^/]*term="AT&T Mobile"/); + }); + + context('with a versionUrlTemplate that contains XML-special characters', () => { + const TEMPLATE = 'https://example.test/v?ref=main&id=%VERSION_ID'; + + let templatedResponse; + let templatedRepository; + + before(async function () { + this.timeout(5000); + templatedRepository = RepositoryFactory.create(storageConfig); + await templatedRepository.initialize(); + await templatedRepository.removeAll(); + await templatedRepository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'content', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s_tpl_escape'], + })); + + const services = await Services.load(); + const templatedApp = express(); + + templatedApp.use(feedRouter(services, templatedRepository, storageConfig.type, 10, TEMPLATE)); + templatedResponse = await supertest(templatedApp).get('/feed'); + }); + + after(() => templatedRepository.removeAll()); + + it('escapes the ampersand in the alternate link href', () => { + expect(templatedResponse.text).to.match(/<link[^>]*rel="alternate"[^>]*href="https:\/\/example\.test\/v\?ref=main&id=[^"]+"/); + }); + }); + }); + + describe('conditional GET via Last-Modified', () => { + const FETCH_DATE = new Date('2024-05-15T10:00:00Z'); + + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 
'content', + fetchDate: FETCH_DATE, + snapshotIds: ['s1'], + })); + }); + + after(() => repository.removeAll()); + + it('exposes a Last-Modified header matching the latest entry fetch date', async () => { + const response = await request.get(`${basePath}/v1/feed`); + + expect(response.headers['last-modified']).to.equal(FETCH_DATE.toUTCString()); + }); + + it('returns 304 with no body when If-Modified-Since is at or after the latest entry', async () => { + const response = await request + .get(`${basePath}/v1/feed`) + .set('If-Modified-Since', FETCH_DATE.toUTCString()); + + expect(response.status).to.equal(304); + expect(response.text).to.be.empty; + }); + + it('returns 200 with body when If-Modified-Since is before the latest entry', async () => { + const earlier = new Date(FETCH_DATE.getTime() - 24 * 60 * 60 * 1000); + const response = await request + .get(`${basePath}/v1/feed`) + .set('If-Modified-Since', earlier.toUTCString()); + + expect(response.status).to.equal(200); + expect(response.text).to.include('<feed'); + }); + }); + + describe('behind a reverse proxy', () => { + let response; + let repository; + + before(async function () { + this.timeout(5000); + repository = RepositoryFactory.create(storageConfig); + await repository.initialize(); + await repository.removeAll(); + await repository.save(new Version({ + serviceId: 'service-1', + termsType: 'Terms of Service', + content: 'content', + fetchDate: new Date('2024-01-01T00:00:00Z'), + snapshotIds: ['s1'], + })); + + response = await request + .get(`${basePath}/v1/feed`) + .set('X-Forwarded-Proto', 'https') + .set('X-Forwarded-Host', 'api.example.com'); + }); + + after(() => repository.removeAll()); + + it('uses the forwarded protocol and host in the self link', () => { + const href = response.text.match(/<link[^>]*rel="self"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(/^https:\/\/api\.example\.com\//); + }); + + it('uses the forwarded protocol and host in entry alternate links', () => { + const 
entry = response.text.match(/<entry>[\s\S]*?<\/entry>/); + + expect(entry).to.not.be.null; + + const href = entry[0].match(/<link[^>]*rel="alternate"[^>]*href="([^"]+)"/)[1]; + + expect(href).to.match(/^https:\/\/api\.example\.com\//); + }); + }); +}); diff --git a/src/collection-api/routes/index.js b/src/collection-api/routes/index.js index b99636b90..c4ae5ec33 100644 --- a/src/collection-api/routes/index.js +++ b/src/collection-api/routes/index.js @@ -1,10 +1,13 @@ +import config from 'config'; import express from 'express'; import helmet from 'helmet'; import { getCollection } from '../../archivist/collection/index.js'; +import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; import * as Services from '../../archivist/services/index.js'; import docsRouter from './docs.js'; +import feedRouter from './feed.js'; import metadataRouter from './metadata.js'; import servicesRouter from './services.js'; import versionsRouter from './versions.js'; @@ -33,10 +36,22 @@ export default async function apiRouter(basePath) { const services = await Services.load(); const collection = await getCollection(); + const versionsStorageConfig = config.get('@opentermsarchive/engine.recorder.versions.storage'); + const versionsRepository = await RepositoryFactory.create(versionsStorageConfig).initialize(); + const feedConfig = config.get('@opentermsarchive/engine.collection-api.feed'); + + if (!collection.metadata?.id) { + throw new Error('Collection metadata "id" is required to expose feed endpoints, as it is used to build the tag URIs that uniquely identify the feed and its entries. Add an "id" field to the collection metadata file.'); + } + + if (!collection.metadata?.name) { + throw new Error('Collection metadata "name" is required to expose feed endpoints, as it is used as the Atom feed title which the Atom 1.0 specification requires to be non-empty. 
Add a "name" field to the collection metadata file.'); + } router.use(await metadataRouter(collection, services)); router.use(servicesRouter(services)); - router.use(versionsRouter); + router.use(versionsRouter(versionsRepository)); + router.use(feedRouter(services, versionsRepository, versionsStorageConfig.type, feedConfig.limit, feedConfig.versionUrlTemplate)); return router; } diff --git a/src/collection-api/routes/services.js b/src/collection-api/routes/services.js index f13879d2d..9906b7152 100644 --- a/src/collection-api/routes/services.js +++ b/src/collection-api/routes/services.js @@ -130,8 +130,7 @@ export default function servicesRouter(services) { * description: No service matching the provided ID is found. */ router.get('/service/:serviceId', (req, res) => { - const matchedServiceID = Object.keys(services).find(key => key.toLowerCase() === req.params.serviceId?.toLowerCase()); - const service = services[matchedServiceID]; + const service = Object.hasOwn(services, req.params.serviceId) ? 
services[req.params.serviceId] : null; if (!service) { res.status(404).send('Service not found'); diff --git a/src/collection-api/routes/services.test.js b/src/collection-api/routes/services.test.js index db6bdc16f..43dfed950 100644 --- a/src/collection-api/routes/services.test.js +++ b/src/collection-api/routes/services.test.js @@ -56,7 +56,6 @@ describe('Services API', () => { describe('GET /service/:serviceId', () => { let response; const SERVICE_ID = 'Service B!'; - const CASE_INSENSITIVE_SERVICE_ID = 'service b!'; before(async () => { response = await request(app).get(`${basePath}/v1/service/${encodeURI(SERVICE_ID)}`); @@ -106,49 +105,13 @@ describe('Services API', () => { }); }); - context('with a case-insensitive service ID parameter', () => { + context('when the service ID casing does not match', () => { before(async () => { - response = await request(app).get(`${basePath}/v1/service/${encodeURI(CASE_INSENSITIVE_SERVICE_ID)}`); + response = await request(app).get(`${basePath}/v1/service/${encodeURI(SERVICE_ID.toLowerCase())}`); }); - it('responds with 200 status code', () => { - expect(response.status).to.equal(200); - }); - - it('returns a service object with id', () => { - expect(response.body).to.have.property('id'); - }); - - it('returns the proper service object', () => { - expect(response.body.id).to.equal(SERVICE_ID); - }); - - it('returns a service object with name', () => { - expect(response.body).to.have.property('name'); - }); - - it('returns a service object with an array of terms', () => { - expect(response.body).to.have.property('terms').that.is.an('array'); - }); - - it('each terms should have a type property', () => { - response.body.terms.forEach(terms => { - expect(terms).to.have.property('type'); - }); - }); - - it('each terms should have an array of source documents', () => { - response.body.terms.forEach(terms => { - expect(terms).to.have.property('sourceDocuments').that.is.an('array'); - }); - }); - - it('each source document should 
have a location', () => { - response.body.terms.forEach(terms => { - terms.sourceDocuments.forEach(sourceDocument => { - expect(sourceDocument).to.have.property('location'); - }); - }); + it('responds with 404 status code', () => { + expect(response.status).to.equal(404); }); }); diff --git a/src/collection-api/routes/versions.js b/src/collection-api/routes/versions.js index e420f8998..176ba0c55 100644 --- a/src/collection-api/routes/versions.js +++ b/src/collection-api/routes/versions.js @@ -1,10 +1,10 @@ -import config from 'config'; import express from 'express'; -import RepositoryFactory from '../../archivist/recorder/repositories/factory.js'; import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; /** + * @param {object} versionsRepository The versions repository instance + * @returns {express.Router} The router instance * @private * @swagger * tags: @@ -27,86 +27,86 @@ import { toISODateWithoutMilliseconds } from '../../archivist/utils/date.js'; * type: string * description: The JSON-escaped Markdown content of the version */ -const router = express.Router(); +export default function versionsRouter(versionsRepository) { + const router = express.Router(); -const versionsRepository = await RepositoryFactory.create(config.get('@opentermsarchive/engine.recorder.versions.storage')).initialize(); + /** + * @private + * @swagger + * /version/{serviceId}/{termsType}/{date}: + * get: + * summary: Get a specific version of some terms at a given date. + * tags: [Versions] + * produces: + * - application/json + * parameters: + * - in: path + * name: serviceId + * description: The ID of the service whose version will be returned. + * schema: + * type: string + * required: true + * - in: path + * name: termsType + * description: The type of terms whose version will be returned. + * schema: + * type: string + * required: true + * - in: path + * name: date + * description: The date and time for which the version is requested, in ISO 8601 format. 
+ * schema: + * type: string + * format: date-time + * required: true + * responses: + * 200: + * description: A JSON object containing the version content and metadata. + * content: + * application/json: + * schema: + * $ref: '#/components/schemas/Version' + * 404: + * description: No version found for the specified combination of service ID, terms type and date. + * content: + * application/json: + * schema: + * type: object + * properties: + * error: + * type: string + * description: Error message indicating that no version is found. + * 416: + * description: The requested date is in the future. + * content: + * application/json: + * schema: + * type: object + * properties: + * error: + * type: string + * description: Error message indicating that the requested date is in the future. + */ + router.get('/version/:serviceId/:termsType/:date', async (req, res) => { + const { serviceId, termsType, date } = req.params; + const requestedDate = new Date(date); -/** - * @private - * @swagger - * /version/{serviceId}/{termsType}/{date}: - * get: - * summary: Get a specific version of some terms at a given date. - * tags: [Versions] - * produces: - * - application/json - * parameters: - * - in: path - * name: serviceId - * description: The ID of the service whose version will be returned. - * schema: - * type: string - * required: true - * - in: path - * name: termsType - * description: The type of terms whose version will be returned. - * schema: - * type: string - * required: true - * - in: path - * name: date - * description: The date and time for which the version is requested, in ISO 8601 format. - * schema: - * type: string - * format: date-time - * required: true - * responses: - * 200: - * description: A JSON object containing the version content and metadata. - * content: - * application/json: - * schema: - * $ref: '#/components/schemas/Version' - * 404: - * description: No version found for the specified combination of service ID, terms type and date. 
- * content: - * application/json: - * schema: - * type: object - * properties: - * error: - * type: string - * description: Error message indicating that no version is found. - * 416: - * description: The requested date is in the future. - * content: - * application/json: - * schema: - * type: object - * properties: - * error: - * type: string - * description: Error message indicating that the requested date is in the future. - */ -router.get('/version/:serviceId/:termsType/:date', async (req, res) => { - const { serviceId, termsType, date } = req.params; - const requestedDate = new Date(date); - - if (requestedDate > new Date()) { - return res.status(416).json({ error: 'Requested version is in the future' }); - } + if (requestedDate > new Date()) { + return res.status(416).json({ error: 'Requested version is in the future' }); + } - const version = await versionsRepository.findByDate(serviceId, termsType, requestedDate); + const version = await versionsRepository.findByDate(serviceId, termsType, requestedDate); - if (!version) { - return res.status(404).json({ error: `No version found for date ${date}` }); - } + if (!version) { + return res.status(404).json({ error: `No version found for date ${date}` }); + } - return res.status(200).json({ - id: version.id, - fetchDate: toISODateWithoutMilliseconds(version.fetchDate), - content: version.content, + return res.status(200).json({ + id: version.id, + fetchDate: toISODateWithoutMilliseconds(version.fetchDate), + content: version.content, + }); }); -}); -export default router; + return router; +} diff --git a/src/collection-api/routes/versions.test.js b/src/collection-api/routes/versions.test.js index aadcfe14b..bfdff4e15 100644 --- a/src/collection-api/routes/versions.test.js +++ b/src/collection-api/routes/versions.test.js @@ -17,7 +17,7 @@ describe('Versions API', () => { let versionsRepository; const FETCH_DATE = new Date('2023-01-01T12:00:00Z'); const VERSION_COMMON_ATTRIBUTES = { - serviceId: 'service-1', + 
serviceId: 'service·A', termsType: 'Terms of Service', snapshotId: ['snapshot_id'], }; @@ -62,7 +62,7 @@ describe('Versions API', () => { context('when a version is found', () => { before(async () => { - response = await request.get(`${basePath}/v1/version/service-1/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); + response = await request.get(`${basePath}/v1/version/service·A/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(FETCH_DATE))}`); }); it('responds with 200 status code', () => { @@ -80,7 +80,7 @@ describe('Versions API', () => { context('when the requested date is anterior to the first available version', () => { before(async () => { - response = await request.get(`${basePath}/v1/version/service-1/Terms%20of%20Service/2000-01-01T12:00:00Z`); + response = await request.get(`${basePath}/v1/version/service·A/Terms%20of%20Service/2000-01-01T12:00:00Z`); }); it('responds with 404 status code', () => { @@ -100,7 +100,7 @@ describe('Versions API', () => { before(async () => { const dateInTheFuture = new Date(Date.now() + 60000); // 1 minute in the future - response = await request.get(`${basePath}/v1/version/service-1/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(dateInTheFuture))}`); + response = await request.get(`${basePath}/v1/version/service·A/Terms%20of%20Service/${encodeURIComponent(toISODateWithoutMilliseconds(dateInTheFuture))}`); }); it('responds with 416 status code', () => { diff --git a/src/collection-api/server.js b/src/collection-api/server.js index c1b0a7eb0..6a54758ce 100644 --- a/src/collection-api/server.js +++ b/src/collection-api/server.js @@ -8,6 +8,8 @@ import apiRouter from './routes/index.js'; const app = express(); +app.set('trust proxy', 'loopback'); // The API binds to 127.0.0.1 and is expected to run behind a reverse proxy. 
Honour X-Forwarded-* headers only when they come from a local proxy so absolute URLs emitted by routes (notably Atom feed links) reflect the URL seen by clients rather than the internal http://127.0.0.1 hop. + if (process.env.NODE_ENV !== 'test') { app.use(loggerMiddleware); } diff --git a/src/reporter/gitlab/index.js b/src/reporter/gitlab/index.js index 431416768..55ea591f1 100644 --- a/src/reporter/gitlab/index.js +++ b/src/reporter/gitlab/index.js @@ -358,7 +358,7 @@ export default class GitLab { try { let apiUrl = `${this.apiBaseURL}/projects/${this.projectId}/issues?search=${encodeURIComponent(title)}&state=${searchParams.state}&per_page=100`; - if (searchParams.state == 'all') apiUrl = `${this.apiBaseURL}/projects/${this.projectId}/issues?search=${encodeURIComponent(title)}&per_page=100`; + if (searchParams.state == 'all') { apiUrl = `${this.apiBaseURL}/projects/${this.projectId}/issues?search=${encodeURIComponent(title)}&per_page=100`; } const options = GitLab.baseOptionsHttpReq();