|
| 1 | +import re |
| 2 | +import urllib.request |
| 3 | +from urllib.parse import urlparse |
| 4 | +from .heuristics import ( |
| 5 | + FAKE_NEWS_DOMAINS, |
| 6 | + SENSATIONALIST_KEYWORDS, |
| 7 | + CLICKBAIT_PATTERNS, |
| 8 | + HEURISTIC_WEIGHTS |
| 9 | +) |
| 10 | + |
def _is_known_fake_domain(domain):
    """Return True if *domain* is listed in FAKE_NEWS_DOMAINS, either as an
    exact match or as a subdomain of a listed entry (so 'www.fakesite.com'
    matches a listed 'fakesite.com')."""
    if domain in FAKE_NEWS_DOMAINS:
        return True
    return any(domain.endswith('.' + known) for known in FAKE_NEWS_DOMAINS)


def _score_text_content(text_content):
    """
    Score lowercased, tag-stripped page text against the content heuristics.

    Parameters:
        text_content (str): Lowercased plain text of the fetched page.

    Returns:
        tuple[float, list[str]]: (score delta, human-readable indicator messages).
    """
    delta = 0.0
    indicators = []

    # Each sensationalist keyword found contributes its weight once,
    # regardless of how many times it appears on the page.
    for keyword in SENSATIONALIST_KEYWORDS:
        if keyword in text_content:
            delta += HEURISTIC_WEIGHTS.get("SENSATIONALIST_KEYWORD", 1.0)
            indicators.append(f"Found sensationalist keyword: '{keyword}'")

    # Clickbait patterns are regexes; likewise counted at most once each.
    for pattern in CLICKBAIT_PATTERNS:
        if re.search(pattern, text_content, re.IGNORECASE):
            delta += HEURISTIC_WEIGHTS.get("CLICKBAIT_PATTERN", 1.5)
            indicators.append(f"Found clickbait pattern: '{pattern}'")

    return delta, indicators


def analyze_url_for_fake_news(url):
    """
    Analyze a URL for indicators of fake news.

    Parameters:
        url (str): The URL to analyze. 'http://' is prepended if no scheme
            is present.

    Returns:
        dict: On success, {"url": str, "score": float, "indicators_found":
            list[str]}. On failure, {"error": str}.
    """
    # Guard against None/empty input, which would otherwise crash below.
    if not url:
        return {"error": "An error occurred: empty URL"}

    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url

    domain = urlparse(url).netloc.lower()

    score = 0.0
    indicators_found = []

    # 1. Check against known fake news domains (including subdomains).
    #    A known-bad domain is treated as conclusive: skip fetching the page.
    if _is_known_fake_domain(domain):
        score += HEURISTIC_WEIGHTS.get("KNOWN_FAKE_NEWS_DOMAIN", 5.0)
        indicators_found.append(f"Domain '{domain}' is a known source of fake news.")
        return {
            "url": url,
            "score": round(score, 2),
            "indicators_found": indicators_found
        }

    # 2. Fetch the page. Keep the try body minimal so heuristic bugs in the
    #    scoring step are not silently swallowed as fetch errors.
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=10) as response:
            # urlopen raises HTTPError for most non-2xx codes; this check is
            # a defensive fallback for any success-path status other than 200.
            if response.status != 200:
                return {"error": f"Failed to fetch URL: HTTP status code {response.status}"}
            html_content = response.read().decode('utf-8', errors='ignore')
    # URLError (incl. HTTPError) and socket timeouts are OSError subclasses;
    # ValueError covers malformed/unsupported URLs passed to Request.
    except (OSError, ValueError) as e:
        return {"error": f"An error occurred: {e}"}

    # 3. Strip HTML tags crudely and score the remaining text.
    text_content = re.sub(r'<[^>]+>', '', html_content).lower()
    delta, indicators = _score_text_content(text_content)
    score += delta
    indicators_found.extend(indicators)

    return {
        "url": url,
        "score": round(score, 2),
        "indicators_found": indicators_found
    }
0 commit comments