FirstData/firstdata/sources/sectors/J-information-communication/common-crawl.json at main · firstdata-dev/FirstData · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
{
  "id": "common-crawl",
  "name": {
    "en": "Common Crawl",
    "zh": "Common Crawl 网络爬取数据"
  },
  "description": {
    "en": "Common Crawl is a 501(c)(3) non-profit organization that maintains a free, open repository of web crawl data that can be used by anyone. The corpus contains petabytes of raw web page data, metadata extracts, and text extracts, regularly collected since 2008. With over 300 billion pages spanning 18 years, Common Crawl adds 3-5 billion new pages each month and has been cited in over 10,000 research papers. The data is stored on Amazon Web Services' Public Data Sets and on multiple academic cloud platforms across the world, making wholesale extraction, transformation and analysis of open web data accessible to researchers, companies and individuals at no cost.",
    "zh": "Common Crawl 是一个 501(c)(3) 非营利组织，维护着一个免费、开放的网络爬取数据仓库，任何人都可以使用。该语料库包含PB级的原始网页数据、元数据提取和文本提取，自2008年以来定期收集。拥有超过3000亿个网页，跨越18年，Common Crawl每月新增30-50亿个网页，已被超过10,000篇研究论文引用。数据存储在亚马逊网络服务的公共数据集和全球多个学术云平台上，使研究人员、公司和个人能够免费访问、提取、转换和分析开放的网络数据。"
  },
  "website": "https://commoncrawl.org",
  "data_url": "https://commoncrawl.org",
  "api_url": "https://index.commoncrawl.org",
  "country": null,
  "domains": [
    "web crawling",
    "natural language processing",
    "machine learning",
    "data science",
    "information retrieval",
    "web analytics",
    "artificial intelligence",
    "research",
    "large language models"
  ],
  "geographic_scope": "global",
  "update_frequency": "monthly",
  "tags": [
    "web-crawling",
    "natural-language-processing",
    "machine-learning",
    "large-language-models",
    "web-data",
    "text-corpus",
    "open-data",
    "research",
    "big-data",
    "artificial-intelligence",
    "data-science",
    "web-mining",
    "nlp",
    "internet-archive"
  ],
  "data_content": {
    "en": [
      "Raw Web Page Data - HTML, CSS, JavaScript and other web content in WARC format",
      "Metadata Extracts - HTTP headers, response codes, content types (WAT format)",
      "Text Extracts - Plain text extracted from web pages (WET format)",
      "URL Index - Searchable index of all crawled URLs in CDX format",
      "Web Graphs - Link structure and relationships between web pages",
      "Crawl Statistics - Monthly statistics on crawl coverage and scale",
      "Historical Data - Web data archives from 2008 to present",
      "Multilingual Content - Web pages in hundreds of languages",
      "Domain Coverage - Pages from millions of domains worldwide"
    ],
    "zh": [
      "原始网页数据 - WARC 格式的 HTML、CSS、JavaScript 和其他网络内容",
      "元数据提取 - HTTP 标头、响应代码、内容类型（WAT 格式）",
      "文本提取 - 从网页提取的纯文本（WET 格式）",
      "URL 索引 - 所有爬取 URL 的可搜索索引（CDX 格式）",
      "网络图谱 - 网页之间的链接结构和关系",
      "爬取统计 - 每月爬取覆盖范围和规模统计",
      "历史数据 - 从 2008 年至今的网络数据档案",
      "多语言内容 - 数百种语言的网页",
      "域名覆盖 - 来自全球数百万个域名的网页"
    ]
  },
  "authority_level": "research"
}