Module is now pure ESM and cannot be required from CommonJS module. Read about ESM modules and how to migrate from CommonJS to ESM
If you're using request object to customize headers, query params, encoding, etc. - it may require some changes (howewer most of options remain the same). Please follow this migration guide to got.
If you're using scrape.defaults now you need to receive them with
import defaultOptions from 'website-scraper/defaultOptions';If you're using scrape.plugins now you need to receive them with
import * as plugins from 'website-scraper/plugins';Create plugin class which adds saveResource action
// before
scrape({
resourceSaver: class MyResourceSaver {
saveResource (resource) {/* code to save file where you need */}
errorCleanup (err) {/* code to remove all previously saved files in case of error */}
}
})
// after
class CustomSaveResourcePlugin {
apply(registerAction) {
registerAction('saveResource', ({resource}) => {/* code to save file where you need */})
}
}
scrape({
plugins: [ new CustomSaveResourcePlugin() ]
})Create plugin class which adds getReference action
// before
scrape({
updateSources: false
})
// after
class MyGetReferencePlugin {
apply(registerAction) {
registerAction('getReference', () => ({ reference: null }))
}
}
scrape({
plugins: [ new MyGetReferencePlugin() ]
})Create plugin class which adds getReference action
// before
scrape({
updateMissingSources: true
})
// after
class MyGetReferencePlugin {
apply(registerAction) {
registerAction('getReference', ({resource, parentResource, originalReference}) => {
if (!resource) {
return { reference: getAbsoluteUrl(parentResource, originalReference) }
}
return getRelativePath(parentResource.getFilename(), resource.getFilename());
})
}
}
scrape({
plugins: [ new MyGetReferencePlugin() ]
})For functions only, if you use string byType or byStructure - you don't need to do anything.
Create plugin class which adds generateFilename action
// before
scrape({
filenameGenerator: (resource, options, occupiedFileNames) => {
return crypto.randomBytes(20).toString('hex');
}
})
// after
class MyGenerateFilenamePlugin {
apply(registerAction) {
registerAction('generateFilename', ({resource}) => {
return {filename: crypto.randomBytes(20).toString('hex')};
});
}
}
scrape({
plugins: [ new MyGenerateFilenamePlugin() ]
})Create plugin class which adds afterResponse action
// before
scrape({
httpResponseHandler: (response) => {
if (response.statusCode === 404) {
return Promise.reject(new Error('status is 404'));
} else {
return Promise.resolve(response.body);
}
}
})
// after
class MyAfterResponsePlugin {
apply(registerAction) {
registerAction('afterResponse', ({response}) => {
if (response.statusCode === 404) {
return null;
} else {
return response.body;
}
});
}
}
scrape({
plugins: [ new MyAfterResponsePlugin() ]
})For functions only, if you use static request object - you don't need to do anything.
Create plugin class which adds beforeRequest action
// before
scrape({
request: resource => ({qs: {myParam: 123}})
})
// after
class MyBeforeRequestPlugin {
apply(registerAction) {
registerAction('beforeRequest', ({resource, requestOptions}) => {
return {requestOptions: {qs: {myParam: 123}}};
});
}
}
scrape({
plugins: [ new MyBeforeRequestPlugin() ]
})Create plugin class which adds onResourceSaved and onResourceError actions
// before
scrape({
onResourceSaved: (resource) => {
console.log(`Resource ${resource} was saved to fs`);
},
onResourceError: (resource, err) => {
console.log(`Resource ${resource} was not saved because of ${err}`);
}
})
// after
class MyPlugin {
apply(registerAction) {
registerAction('onResourceSaved', ({resource}) => console.log(`Resource ${resource.url} saved!`));
registerAction('onResourceError', ({resource, error}) => console.log(`Resource ${resource.url} has error ${error}`));
}
}
scrape({
plugins: [ new MyPlugin() ]
})