This guide will help you get up and running with CrawlX quickly.
Install it from npm:

```bash
npm install crawlx
```

The simplest way to use CrawlX is with the `quickCrawl` function:

```typescript
import { quickCrawl } from 'crawlx';

const result = await quickCrawl('https://example.com', {
  title: 'title',
  description: 'meta[name="description"]@content'
});

console.log(result.parsed);
// Output: { title: "Example Domain", description: "..." }
```

For more control, create a crawler instance:
```typescript
import { CrawlX } from 'crawlx';

const crawler = new CrawlX({
  concurrency: 5,
  timeout: 10000,
  userAgent: 'MyBot/1.0'
});

const result = await crawler.crawl('https://example.com');
console.log(result.response.statusCode); // 200

await crawler.destroy(); // Clean up resources
```

CrawlX uses CSS selectors for data extraction:
```typescript
const parseRule = {
  title: 'title',                 // Text content
  links: '[a@href]',              // Attribute values
  images: ['img@src'],            // Arrays
  price: '.price | trim | number' // With filters
};

const result = await crawler.crawl('https://shop.example.com', {
  parse: parseRule
});
```

Rules can be nested, with `_scope` narrowing extraction to a set of matching elements:

```typescript
const parseRule = {
  products: {
    _scope: '.product', // Scope to product elements
    name: '.name',
    price: '.price | trim | number',
    image: 'img@src',
    details: {
      _scope: '.details',
      description: '.desc',
      specs: ['.spec']
    }
  }
};
```

Rules can also be functions, which is useful for computed values:

```typescript
const parseRule = {
  title: 'title',
  url: () => window.location.href,
  timestamp: () => new Date().toISOString(),
  productCount: ($) => $('.product').length
};
```

CrawlX provides factory functions for common use cases:
```typescript
import { createLightweightCrawler } from 'crawlx';

const crawler = createLightweightCrawler({
  concurrency: 2,
  timeout: 5000
});
```

To scrape structured data, use `createScraper`:

```typescript
import { createScraper } from 'crawlx';

const scraper = createScraper();

const result = await scraper.crawl('https://example.com', {
  parse: {
    title: 'title',
    content: '.content'
  }
});
```

To crawl whole sites by following links, use `createSpider`:

```typescript
import { createSpider } from 'crawlx';

const spider = createSpider({
  plugins: {
    follow: {
      maxDepth: 3,
      sameDomainOnly: true
    }
  }
});

const results = await spider.crawlMany(['https://example.com'], {
  parse: { title: 'title' },
  follow: '[a@href]' // Follow all links
});
```
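Once `crawlMany` resolves you can iterate the collected results; a short follow-up sketch, assuming each entry carries the same `parsed` and `response` fields shown in the single-crawl examples above:

```typescript
// Each entry mirrors a single crawl() result.
for (const result of results) {
  console.log(result.response.url, result.parsed?.title);
}
```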
Crawler behavior is configured through constructor options:

```typescript
const crawler = new CrawlX({
  mode: 'high-performance',
  concurrency: 10,
  timeout: 30000,
  maxRetries: 3,
  userAgent: 'MyBot/1.0',
  headers: {
    'Accept': 'text/html,application/xhtml+xml'
  }
});
```

Built-in plugins are configured under the `plugins` key:

```typescript
const crawler = new CrawlX({
  plugins: {
    delay: {
      enabled: true,
      defaultDelay: 1000,
      randomDelay: true
    },
    rateLimit: {
      enabled: true,
      globalLimit: { requests: 100, window: 60000 }
    },
    retry: {
      enabled: true,
      maxRetries: 3,
      exponentialBackoff: true
    }
  }
});
```

Configuration can also be supplied through environment variables:

```bash
CRAWLX_MODE=high-performance
CRAWLX_CONCURRENCY=10
CRAWLX_TIMEOUT=30000
CRAWLX_PLUGINS_DELAY_ENABLED=true
CRAWLX_PLUGINS_DELAY_DEFAULT_DELAY=1000
```
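If you prefer explicit wiring over automatic pickup, the same values can be read with Node's standard `process.env`; a minimal sketch (the fallback values are illustrative, not CrawlX defaults):

```typescript
import { CrawlX } from 'crawlx';

// Hand-map CRAWLX_* variables onto constructor options.
// Number(...) converts the string-typed environment values.
const crawler = new CrawlX({
  concurrency: Number(process.env.CRAWLX_CONCURRENCY ?? 10),
  timeout: Number(process.env.CRAWLX_TIMEOUT ?? 30000)
});
```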
You can also start from a configuration preset:

```typescript
import { ConfigPresets } from 'crawlx';

// Development preset
const devCrawler = ConfigPresets.development();

// Production preset
const prodCrawler = ConfigPresets.production();

// Testing preset
const testCrawler = ConfigPresets.testing();
```

Crawler instances emit events you can subscribe to:

```typescript
const crawler = new CrawlX();

crawler.on('task-start', (task) => {
  console.log(`Starting: ${task.url}`);
});

crawler.on('task-complete', (result) => {
  console.log(`Completed: ${result.response.url}`);
});

crawler.on('data-extracted', (data, url) => {
  console.log(`Data from ${url}:`, data);
});

crawler.on('task-error', (error, task) => {
  console.error(`Failed ${task.url}:`, error.message);
});
```

CrawlX exposes typed errors for fine-grained handling:

```typescript
import { CrawlXError, NetworkError, TimeoutError } from 'crawlx';

try {
  const result = await crawler.crawl('https://example.com');
} catch (error) {
  if (error instanceof NetworkError) {
    console.log('Network error:', error.statusCode);
  } else if (error instanceof TimeoutError) {
    console.log('Timeout after:', error.timeout);
  } else if (error instanceof CrawlXError) {
    console.log('CrawlX error:', error.code, error.context);
  }
}
```
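When crawling many URLs with `crawl()` directly, you may want failures isolated per page instead of aborting the whole batch; a sketch using the standard `Promise.allSettled` (the URL list is illustrative):

```typescript
const urls = ['https://example.com', 'https://example.org'];

// allSettled never rejects, so one failed page cannot sink the others.
const outcomes = await Promise.allSettled(urls.map((url) => crawler.crawl(url)));

outcomes.forEach((outcome, i) => {
  if (outcome.status === 'fulfilled') {
    console.log(`${urls[i]}: ${outcome.value.response.statusCode}`);
  } else {
    console.error(`${urls[i]} failed:`, outcome.reason);
  }
});
```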
You can extend CrawlX with custom plugins:

```typescript
class CustomPlugin {
  name = 'custom';
  version = '1.0.0';
  priority = 100;

  async onTaskComplete(result) {
    // Add custom processing
    result.customData = {
      processedAt: new Date().toISOString(),
      urlLength: result.response.url.length
    };
    return result;
  }
}

const crawler = new CrawlX();
crawler.addPlugin(new CustomPlugin());
```

Always clean up resources:
```typescript
const crawler = new CrawlX();

try {
  const result = await crawler.crawl('https://example.com');
  // Process result
} finally {
  await crawler.destroy();
}
```
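The same cleanup can be tied to process shutdown; a minimal sketch using Node's standard signal handling (the exit code is an arbitrary choice):

```typescript
// Release sockets and queues on Ctrl+C as well.
process.on('SIGINT', async () => {
  await crawler.destroy();
  process.exit(0);
});
```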
Handle errors gracefully:

```typescript
const crawler = new CrawlX({
  maxRetries: 3,
  timeout: 10000
});

crawler.on('task-error', (error, task) => {
  console.error(`Failed to crawl ${task.url}:`, error.message);
});
```

Be respectful to target websites:
```typescript
const crawler = new CrawlX({
  plugins: {
    delay: {
      enabled: true,
      defaultDelay: 1000 // 1 second between requests
    },
    rateLimit: {
      enabled: true,
      perDomainLimit: { requests: 10, window: 60000 }
    }
  }
});
```

For large-scale crawling:
```typescript
const crawler = new CrawlX({
  mode: 'high-performance',
  concurrency: 20,
  scheduler: {
    maxQueueSize: 1000,
    resourceLimits: {
      maxMemoryUsage: 1073741824 // 1 GB
    }
  }
});
```
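At this scale it can help to feed URLs in bounded batches rather than all at once; a sketch assuming `crawlMany` is available on crawler instances as it is on spiders (the URL list and batch size are illustrative):

```typescript
const urls = Array.from({ length: 5000 }, (_, i) => `https://example.com/page/${i}`);
const batchSize = 500;

for (let i = 0; i < urls.length; i += batchSize) {
  // Finish each batch before queueing the next, keeping the queue under
  // maxQueueSize and memory under the configured resource limit.
  const results = await crawler.crawlMany(urls.slice(i, i + batchSize));
  console.log(`Batch ${i / batchSize + 1}: ${results.length} pages`);
}
```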
Next steps:

- Read the API Documentation
- Explore Advanced Examples
- Learn about Plugin Development
- Check out Performance Tuning