Skip to content
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
OTA_ENGINE_GITHUB_TOKEN=your_github_token_here
OTA_ENGINE_GITLAB_TOKEN=your_gitlab_token_here
OTA_ENGINE_GITLAB_RELEASES_TOKEN=your_gitlab_releases_token_here
OTA_ENGINE_DATAGOUV_API_KEY=your_datagouv_api_key_here
OTA_ENGINE_SENDINBLUE_API_KEY=your_sendinblue_api_key_here
OTA_ENGINE_SMTP_PASSWORD=your_smtp_password_here

Expand Down
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased [minor]

> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.

### Added

- Add support for publishing datasets to data.gouv.fr; configure `dataset.datagouv.datasetId` or `dataset.datagouv.organizationIdOrSlug` in configuration file and set `OTA_ENGINE_DATAGOUV_API_KEY` environment variable
- Add ability to publish datasets to multiple platforms simultaneously; datasets can now be published to GitHub (or GitLab) and data.gouv.fr in parallel

## 10.0.1 - 2025-11-24

_Full changeset and discussions: [#1208](https://github.com/OpenTermsArchive/engine/pull/1208)._
Expand Down
4 changes: 2 additions & 2 deletions bin/ota-dataset.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ import logger from '../src/logger/index.js';

program
.name('ota dataset')
.description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases')
.description('Export the versions dataset into a ZIP file and optionally publish it to GitHub releases, GitLab releases, or data.gouv.fr')
.option('-f, --file <filename>', 'file name of the generated dataset')
.option('-p, --publish', 'publish dataset to GitHub releases on versions repository. Mandatory authentication to GitHub is provided through the `OTA_ENGINE_GITHUB_TOKEN` environment variable')
.option('-p, --publish', 'publish dataset. Supports GitHub releases (OTA_ENGINE_GITHUB_TOKEN), GitLab releases (OTA_ENGINE_GITLAB_TOKEN), or data.gouv.fr (OTA_ENGINE_DATAGOUV_API_KEY + config)')
.option('-r, --remove-local-copy', 'remove local copy of dataset after publishing. Works only in combination with --publish option')
.option('--schedule', 'schedule automatic dataset generation');

Expand Down
2 changes: 1 addition & 1 deletion config/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
}
},
"dataset": {
"title": "sandbox",
"title": "Sandbox collection dataset",
"versionsRepositoryURL": "https://github.com/OpenTermsArchive/sandbox-declarations",
"publishingSchedule": "30 8 * * MON"
}
Expand Down
2 changes: 1 addition & 1 deletion scripts/dataset/assets/README.template.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export function title({ releaseDate }) {

const title = config.get('@opentermsarchive/engine.dataset.title');

return `${title} — ${releaseDate} dataset`;
return `${title} — ${releaseDate}`;
}

export function body({ servicesCount, firstVersionDate, lastVersionDate }) {
Expand Down
2 changes: 1 addition & 1 deletion scripts/dataset/export/test/fixtures/dataset/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Open Terms Archive — sandbox — January 1, 2022 dataset
# Open Terms Archive — sandbox — January 1, 2022

This dataset consolidates the contractual documents of 2 service providers, in all their versions that were accessible online between January 1, 2021 and January 6, 2022.

Expand Down
11 changes: 8 additions & 3 deletions scripts/dataset/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import publishRelease from './publish/index.js';

export async function release({ shouldPublish, shouldRemoveLocalCopy, fileName }) {
const releaseDate = new Date();
const archiveName = fileName || `dataset-${config.get('@opentermsarchive/engine.dataset.title')}-${releaseDate.toISOString().replace(/T.*/, '')}`;
const archiveName = fileName || `${config.get('@opentermsarchive/engine.dataset.title').toLowerCase().replace(/[^a-zA-Z0-9.\-_]/g, '-')}-${releaseDate.toISOString().replace(/T.*/, '')}`;
const archivePath = `${path.basename(archiveName, '.zip')}.zip`; // allow to pass filename or filename.zip as the archive name and have filename.zip as the result name

logger.info('Start exporting dataset…');
Expand All @@ -24,13 +24,18 @@ export async function release({ shouldPublish, shouldRemoveLocalCopy, fileName }

logger.info('Start publishing dataset…');

const releaseUrl = await publishRelease({
const results = await publishRelease({
archivePath,
releaseDate,
stats,
});

logger.info(`Dataset published to ${releaseUrl}`);
if (results.length > 0) {
logger.info('Dataset published to following platforms:');
results.forEach(result => {
logger.info(` - ${result.platform}: ${result.url}`);
});
}

if (!shouldRemoveLocalCopy) {
return;
Expand Down
28 changes: 25 additions & 3 deletions scripts/dataset/logger/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,35 @@ const { combine, timestamp, printf, colorize } = winston.format;
logger.format = combine(
colorize(),
timestamp({ format: 'YYYY-MM-DDTHH:mm:ssZ' }),
printf(({ level, message, counter, hash, timestamp }) => {
const prefix = counter && hash ? `${counter.toString().padEnd(6)} ${hash.padEnd(40)}` : '';
printf(({ level, message, counter, hash, timestamp, module }) => {
let prefix = counter && hash ? `${counter.toString().padEnd(6)} ${hash.padEnd(40)}` : '';

const timestampPrefix = config.get('@opentermsarchive/engine.logger.timestampPrefix') ? `${timestamp} ` : '';

return `${timestampPrefix}${level.padEnd(15)} ${prefix.padEnd(50)} ${message}`;
prefix = module ? `${module} ${prefix}` : prefix;

const levelStr = level.padEnd(15);
let coloredLevel = levelStr;
let coloredMessage = message;

if (level.includes('warn')) {
coloredLevel = `\x1b[33m${levelStr}\x1b[0m`;
coloredMessage = `\x1b[33m${message}\x1b[0m`;
} else if (level.includes('error')) {
coloredLevel = `\x1b[31m${levelStr}\x1b[0m`;
coloredMessage = `\x1b[31m${message}\x1b[0m`;
}

return `${timestampPrefix} ${coloredLevel} ${prefix.padEnd(50)} ${coloredMessage}`;
}),
);

/**
 * Wrap the shared winston logger so every entry is tagged with a module name.
 * The name is passed as `module` metadata and rendered as a prefix by the
 * printf format above.
 * @param {string} moduleName - Label attached to each log entry.
 * @returns {{info: Function, warn: Function, error: Function}} Module-scoped logging facade.
 */
export function createModuleLogger(moduleName) {
  const delegate = level => message => logger[level](message, { module: moduleName });

  return {
    info: delegate('info'),
    warn: delegate('warn'),
    error: delegate('error'),
  };
}

export default logger;
234 changes: 234 additions & 0 deletions scripts/dataset/publish/datagouv/dataset.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
import fsApi from 'fs';
import path from 'path';

import FormData from 'form-data';
import nodeFetch from 'node-fetch';

import { createModuleLogger } from '../../logger/index.js';

const logger = createModuleLogger('datagouv');

// License applied to every published dataset (Open Database License).
const DATASET_LICENSE = 'odc-odbl';
// Description attached to the uploaded archive resource.
const DEFAULT_RESOURCE_DESCRIPTION = 'See README.md inside the archive for dataset structure and usage information.';

// URL builders for every data.gouv.fr API endpoint used by this module.
// All dataset/organization routes keep the API's trailing slash.
const routes = {
  dataset(apiBaseUrl, datasetId) {
    return `${apiBaseUrl}/datasets/${datasetId}/`;
  },
  datasets(apiBaseUrl) {
    return `${apiBaseUrl}/datasets/`;
  },
  datasetUpload(apiBaseUrl, datasetId) {
    return `${apiBaseUrl}/datasets/${datasetId}/upload/`;
  },
  resource(apiBaseUrl, datasetId, resourceId) {
    return `${apiBaseUrl}/datasets/${datasetId}/resources/${resourceId}/`;
  },
  resourceUpload(apiBaseUrl, datasetId, resourceId) {
    return `${apiBaseUrl}/datasets/${datasetId}/resources/${resourceId}/upload/`;
  },
  organization(apiBaseUrl, organizationIdOrSlug) {
    return `${apiBaseUrl}/organizations/${organizationIdOrSlug}/`;
  },
  // NOTE(review): only requests the first page of up to 100 datasets; confirm
  // this covers organizations with larger catalogs.
  organizationDatasets(apiBaseUrl, organizationId) {
    return `${apiBaseUrl}/organizations/${organizationId}/datasets/?page_size=100`;
  },
};

/**
 * Retrieve an organization from the data.gouv.fr API by its ID or slug.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers (authentication etc.).
 * @param {string} params.organizationIdOrSlug - Organization identifier or slug.
 * @returns {Promise<Object>} The organization payload returned by the API.
 * @throws {Error} When the API responds with a non-2xx status.
 */
export async function getOrganization({ apiBaseUrl, headers, organizationIdOrSlug }) {
  logger.info(`Fetching organization: ${organizationIdOrSlug}…`);

  const response = await nodeFetch(routes.organization(apiBaseUrl, organizationIdOrSlug), { headers });

  if (!response.ok) {
    const body = await response.text();

    throw new Error(`Failed to retrieve organization: ${response.status} ${response.statusText} - ${body}`);
  }

  const organization = await response.json();

  logger.info(`Found organization: ${organization.name} (ID: ${organization.id})`);

  return organization;
}

/**
 * Fetch a dataset by its ID.
 * On failure the thrown error carries the HTTP status as `statusCode`,
 * letting callers distinguish e.g. a 404 from other failures.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers.
 * @param {string} params.datasetId - Identifier of the dataset to fetch.
 * @returns {Promise<Object>} The dataset payload returned by the API.
 * @throws {Error} With a `statusCode` property on non-2xx responses.
 */
export async function getDataset({ apiBaseUrl, headers, datasetId }) {
  const response = await nodeFetch(routes.dataset(apiBaseUrl, datasetId), { headers });

  if (response.ok) {
    return response.json();
  }

  const body = await response.text();
  const error = new Error(`Failed to retrieve dataset: ${response.status} ${response.statusText} - ${body}`);

  error.statusCode = response.status;
  throw error;
}

/**
 * Look for a dataset with an exact title match among an organization's datasets.
 * NOTE(review): only the first page of results is inspected (up to 100 datasets,
 * see `routes.organizationDatasets`) — confirm this suffices for large organizations.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers.
 * @param {string} params.organizationId - Organization whose datasets are searched.
 * @param {string} params.title - Exact dataset title to match.
 * @returns {Promise<Object|null>} The matching dataset, or null when none matches.
 * @throws {Error} When the listing request fails.
 */
export async function findDatasetByTitle({ apiBaseUrl, headers, organizationId, title }) {
  logger.info(`Searching for dataset with title "${title}" in organization…`);

  const response = await nodeFetch(routes.organizationDatasets(apiBaseUrl, organizationId), { headers });

  if (!response.ok) {
    const body = await response.text();

    throw new Error(`Failed to search for datasets: ${response.status} ${response.statusText} - ${body}`);
  }

  const { data } = await response.json();
  const dataset = data.find(candidate => candidate.title === title);

  if (!dataset) {
    logger.info('No existing dataset found with this title');

    return null;
  }

  logger.info(`Found existing dataset: ${dataset.title} (ID: ${dataset.id})`);

  return dataset;
}

/**
 * Create a new dataset owned by the given organization.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers.
 * @param {string} params.organizationId - Owning organization's ID.
 * @param {string} params.title - Dataset title.
 * @param {string} params.description - Dataset description.
 * @param {string} params.license - License identifier.
 * @param {string} params.frequency - Update frequency identifier.
 * @returns {Promise<Object>} The created dataset payload.
 * @throws {Error} When the creation request fails.
 */
export async function createDataset({ apiBaseUrl, headers, organizationId, title, description, license, frequency }) {
  logger.info(`Creating new dataset: ${title}…`);

  const payload = {
    title,
    description,
    organization: organizationId,
    license,
    frequency,
  };

  const response = await nodeFetch(routes.datasets(apiBaseUrl), {
    method: 'POST',
    headers: { ...headers, 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const body = await response.text();

    throw new Error(`Failed to create dataset: ${response.status} ${response.statusText} - ${body}`);
  }

  const dataset = await response.json();

  logger.info(`Dataset created successfully: ${dataset.title} (ID: ${dataset.id})`);

  return dataset;
}

/**
 * Update a dataset's metadata (title, description, license, frequency and,
 * when available, temporal coverage derived from the dataset stats).
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers.
 * @param {string} params.datasetId - Identifier of the dataset to update.
 * @param {string} params.title - New dataset title.
 * @param {string} params.description - New dataset description.
 * @param {Object} [params.stats] - Optional stats; `firstVersionDate`/`lastVersionDate` are Dates.
 * @param {string} params.frequency - Update frequency identifier.
 * @throws {Error} With a `statusCode` property on non-2xx responses.
 */
export async function updateDatasetMetadata({ apiBaseUrl, headers, datasetId, title, description, stats, frequency }) {
  const payload = {
    title,
    description,
    license: DATASET_LICENSE,
    frequency,
  };

  // Temporal coverage is only declared when both bounds are known.
  if (stats?.firstVersionDate && stats?.lastVersionDate) {
    payload.temporal_coverage = {
      start: stats.firstVersionDate.toISOString(),
      end: stats.lastVersionDate.toISOString(),
    };
  }

  const response = await nodeFetch(routes.dataset(apiBaseUrl, datasetId), {
    method: 'PUT',
    headers: { ...headers, 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const body = await response.text();
    const error = new Error(`Failed to update dataset metadata: ${response.status} ${response.statusText} - ${body}`);

    error.statusCode = response.status;
    throw error;
  }

  logger.info('Dataset metadata updated successfully');
}

/**
 * Upload the dataset archive as a brand new resource on the dataset.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers.
 * @param {string} params.datasetId - Dataset receiving the resource.
 * @param {string} params.archivePath - Path to the ZIP archive on disk.
 * @returns {Promise<{resourceId: string, fileName: string}>} ID of the created resource and uploaded file name.
 * @throws {Error} When the upload request fails.
 */
export async function uploadResource({ apiBaseUrl, headers, datasetId, archivePath }) {
  logger.info('Uploading dataset archive…');

  const { formData, fileName } = createFormDataForFile(archivePath);

  // Caller headers are spread last so they take precedence over multipart headers.
  const response = await nodeFetch(routes.datasetUpload(apiBaseUrl, datasetId), {
    method: 'POST',
    headers: { ...formData.getHeaders(), ...headers },
    body: formData,
  });

  if (!response.ok) {
    const body = await response.text();

    throw new Error(`Failed to upload dataset file: ${response.status} ${response.statusText} - ${body}`);
  }

  const uploadResult = await response.json();

  logger.info(`Dataset file uploaded successfully with resource ID: ${uploadResult.id}`);

  return { resourceId: uploadResult.id, fileName };
}

/**
 * Replace the file of an existing dataset resource with a new archive.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers.
 * @param {string} params.datasetId - Dataset owning the resource.
 * @param {string} params.resourceId - Resource whose file is replaced.
 * @param {string} params.archivePath - Path to the ZIP archive on disk.
 * @returns {Promise<{resourceId: string, fileName: string}>} Resource ID reported by the API and uploaded file name.
 * @throws {Error} When the replacement request fails.
 */
export async function replaceResourceFile({ apiBaseUrl, headers, datasetId, resourceId, archivePath }) {
  logger.info(`Replacing file for existing resource ID: ${resourceId}…`);

  const { formData, fileName } = createFormDataForFile(archivePath);

  // Caller headers are spread last so they take precedence over multipart headers.
  const response = await nodeFetch(routes.resourceUpload(apiBaseUrl, datasetId, resourceId), {
    method: 'POST',
    headers: { ...formData.getHeaders(), ...headers },
    body: formData,
  });

  if (!response.ok) {
    const body = await response.text();

    throw new Error(`Failed to replace resource file: ${response.status} ${response.statusText} - ${body}`);
  }

  const uploadResult = await response.json();

  logger.info('Resource file replaced successfully');

  return { resourceId: uploadResult.id, fileName };
}

/**
 * Set the title, description and file metadata of an uploaded resource.
 * @param {Object} params
 * @param {string} params.apiBaseUrl - Base URL of the data.gouv.fr API.
 * @param {Object} params.headers - Caller-supplied HTTP headers.
 * @param {string} params.datasetId - Dataset owning the resource.
 * @param {string} params.resourceId - Resource to update.
 * @param {string} params.fileName - File name used as the resource title.
 * @throws {Error} When the update request fails.
 */
export async function updateResourceMetadata({ apiBaseUrl, headers, datasetId, resourceId, fileName }) {
  logger.info('Updating resource metadata…');

  const payload = {
    title: fileName,
    description: DEFAULT_RESOURCE_DESCRIPTION,
    filetype: 'file',
    format: 'zip',
    mime: 'application/zip',
  };

  const response = await nodeFetch(routes.resource(apiBaseUrl, datasetId, resourceId), {
    method: 'PUT',
    headers: { ...headers, 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const body = await response.text();

    throw new Error(`Failed to update resource metadata: ${response.status} ${response.statusText} - ${body}`);
  }

  logger.info('Resource metadata updated successfully');
}

function createFormDataForFile(archivePath) {
const formData = new FormData();
const fileName = path.basename(archivePath);
const fileStats = fsApi.statSync(archivePath);

formData.append('file', fsApi.createReadStream(archivePath), {
filename: fileName,
contentType: 'application/zip',
knownLength: fileStats.size,
});

return { formData, fileName };
}
Loading
Loading