-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathuniversal_extractor.py
More file actions
58 lines (47 loc) · 2.5 KB
/
universal_extractor.py
File metadata and controls
58 lines (47 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Axiom - universal_extractor.py
# Copyright (C) 2025 The Axiom Contributors
# This program is licensed under the Peer Production License (PPL).
# See the LICENSE file for full details.
from googlesearch import search
import trafilatura
# A curated list of domains the system trusts for high-quality information.
TRUSTED_DOMAINS = [
'wikipedia.org', 'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
'wsj.com', 'britannica.com', '.gov', '.edu', 'forbes.com', 'nature.com'
]
def find_and_extract(topic, max_sources=3):
"""
Performs a web search for a topic, filters for trusted domains,
and extracts the primary text content from the top results.
Returns a list of dictionaries, each containing a source URL and its content.
"""
print(f"\n--- [Pathfinder] Seeking sources for '{topic}'...")
# Formulate a search query designed to find factual, historical information.
query = f'"{topic}" official information history facts filetype:html'
try:
# Perform the search and filter the results.
# --- THIS IS THE FINAL CORRECTED LINE ---
# All unsupported arguments ('num', 'stop', 'pause') have been removed.
# The 'num_results' argument is used to control the number of search results.
all_urls = search(query, num_results=10)
# The search function returns a generator. We convert it to a list to work with it.
urls_list = list(all_urls)
urls = [url for url in urls_list if any(domain in url for domain in TRUSTED_DOMAINS)]
if not urls:
print(f"[Pathfinder] No trusted sources found for '{topic}'.")
return []
print(f"[Universal Extractor] Found {len(urls)} potential sources. Extracting content...")
extracted_content = []
# Process the top N sources.
for url in urls[:max_sources]:
print(f" -> Extracting from: {url}")
# trafilatura downloads the page and intelligently extracts only the main article body.
downloaded = trafilatura.fetch_url(url)
main_text = trafilatura.extract(downloaded, include_comments=False, include_tables=False, include_images=False)
if main_text:
extracted_content.append({'source_url': url, 'content': main_text})
return extracted_content
except Exception as e:
# Handle potential search or network errors.
print(f"[Pathfinder/Extractor] ERROR: An exception occurred. {e}")
return []