|
| 1 | +import re |
| 2 | +import urllib.request |
| 3 | +from urllib.parse import urlparse |
| 4 | +from .heuristics import ( |
| 5 | + FAKE_NEWS_DOMAINS, |
| 6 | + SENSATIONALIST_KEYWORDS, |
| 7 | + CLICKBAIT_PATTERNS, |
| 8 | + HEURISTIC_WEIGHTS |
| 9 | +) |
| 10 | + |
def _is_known_fake_domain(domain):
    """Return True if *domain* is listed in FAKE_NEWS_DOMAINS, either as an
    exact match or as a subdomain of a listed entry (so 'www.fakesite.com'
    matches a listed 'fakesite.com')."""
    if domain in FAKE_NEWS_DOMAINS:
        return True
    return any(domain.endswith('.' + known) for known in FAKE_NEWS_DOMAINS)


def _score_text_content(text_content):
    """
    Score lowercased, tag-stripped page text against the content heuristics.

    Parameters:
        text_content (str): Lowercased plain text of the fetched page.

    Returns:
        tuple[float, list[str]]: (score delta, human-readable indicator messages).
    """
    delta = 0.0
    indicators = []

    # Each sensationalist keyword found contributes its weight once,
    # regardless of how many times it appears on the page.
    for keyword in SENSATIONALIST_KEYWORDS:
        if keyword in text_content:
            delta += HEURISTIC_WEIGHTS.get("SENSATIONALIST_KEYWORD", 1.0)
            indicators.append(f"Found sensationalist keyword: '{keyword}'")

    # Clickbait patterns are regexes; likewise counted at most once each.
    for pattern in CLICKBAIT_PATTERNS:
        if re.search(pattern, text_content, re.IGNORECASE):
            delta += HEURISTIC_WEIGHTS.get("CLICKBAIT_PATTERN", 1.5)
            indicators.append(f"Found clickbait pattern: '{pattern}'")

    return delta, indicators


def analyze_url_for_fake_news(url):
    """
    Analyze a URL for indicators of fake news.

    Parameters:
        url (str): The URL to analyze. 'http://' is prepended if no scheme
            is present.

    Returns:
        dict: On success, {"url": str, "score": float, "indicators_found":
            list[str]}. On failure, {"error": str}.
    """
    # Guard against None/empty input, which would otherwise crash below.
    if not url:
        return {"error": "An error occurred: empty URL"}

    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url

    domain = urlparse(url).netloc.lower()

    score = 0.0
    indicators_found = []

    # 1. Check against known fake news domains (including subdomains).
    #    A known-bad domain is treated as conclusive: skip fetching the page.
    if _is_known_fake_domain(domain):
        score += HEURISTIC_WEIGHTS.get("KNOWN_FAKE_NEWS_DOMAIN", 5.0)
        indicators_found.append(f"Domain '{domain}' is a known source of fake news.")
        return {
            "url": url,
            "score": round(score, 2),
            "indicators_found": indicators_found
        }

    # 2. Fetch the page. Keep the try body minimal so heuristic bugs in the
    #    scoring step are not silently swallowed as fetch errors.
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=10) as response:
            # urlopen raises HTTPError for most non-2xx codes; this check is
            # a defensive fallback for any success-path status other than 200.
            if response.status != 200:
                return {"error": f"Failed to fetch URL: HTTP status code {response.status}"}
            html_content = response.read().decode('utf-8', errors='ignore')
    # URLError (incl. HTTPError) and socket timeouts are OSError subclasses;
    # ValueError covers malformed/unsupported URLs passed to Request.
    except (OSError, ValueError) as e:
        return {"error": f"An error occurred: {e}"}

    # 3. Strip HTML tags crudely and score the remaining text.
    text_content = re.sub(r'<[^>]+>', '', html_content).lower()
    delta, indicators = _score_text_content(text_content)
    score += delta
    indicators_found.extend(indicators)

    return {
        "url": url,
        "score": round(score, 2),
        "indicators_found": indicators_found
    }
0 commit comments