Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions src/govbr_scraper/scrapers/ebc_scrape_manager.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import hashlib
import logging
import os
from collections import OrderedDict
from datetime import date, datetime
from typing import Any, Dict, List

from govbr_scraper.scrapers.ebc_webscraper import EBCWebScraper
from govbr_scraper.scrapers.yaml_config import load_urls_from_yaml
from govbr_scraper.scrapers.yaml_config import get_config_dir, load_urls_from_yaml

# Set up logging configuration
logging.basicConfig(
Expand Down Expand Up @@ -39,10 +38,6 @@ def __init__(self, storage: Any):
"""
self.dataset_manager = storage # Keep attribute name for compatibility

def _get_config_dir(self) -> str:
"""Get the config directory path."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "config")

def run_scraper(
self,
min_date: str,
Expand All @@ -68,7 +63,7 @@ def run_scraper(

try:
agency_urls = {}
config_dir = self._get_config_dir()
config_dir = get_config_dir(__file__)
# Load URLs for each agency in the list
if agencies:
for agency in agencies:
Expand Down
54 changes: 25 additions & 29 deletions src/govbr_scraper/scrapers/scrape_manager.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import hashlib
import logging
import os
from collections import OrderedDict
from datetime import date
from typing import Any, Dict, List

from govbr_scraper.scrapers.webscraper import ScrapingError, WebScraper
from govbr_scraper.scrapers.yaml_config import load_urls_from_yaml
from govbr_scraper.scrapers.yaml_config import get_config_dir, load_urls_from_yaml

# Set up logging configuration
logging.basicConfig(
Expand Down Expand Up @@ -39,10 +38,6 @@ def __init__(self, storage: Any):
"""
self.dataset_manager = storage # Keep attribute name for compatibility

def _get_config_dir(self) -> str:
"""Get the config directory path."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "config")

def run_scraper(
self,
agencies: List[str],
Expand All @@ -68,64 +63,65 @@ def run_scraper(
errors = []

try:
all_urls = []
config_dir = self._get_config_dir()
agency_urls = {}
config_dir = get_config_dir(__file__)
# Load URLs for each agency in the list
if agencies:
for agency in agencies:
try:
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml", agency)
all_urls.extend(agency_urls.values())
loaded = load_urls_from_yaml(config_dir, "site_urls.yaml", agency)
agency_urls.update(loaded)
except ValueError as e:
errors.append({"agency": agency, "error": str(e)})
logging.warning(f"Skipping agency '{agency}': {e}")
else:
# Load all agency URLs if agencies list is None or empty
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml")
all_urls = list(agency_urls.values())

# Create list of (agency_name, scraper) tuples
webscrapers = [
WebScraper(min_date, url, max_date=max_date) for url in all_urls
(agency_name, WebScraper(min_date, url, max_date=max_date))
for agency_name, url in agency_urls.items()
]

if sequential:
for scraper in webscrapers:
for agency_name, scraper in webscrapers:
try:
scraped_data = scraper.scrape_news()
if scraped_data:
logging.info(
f"Appending news for {scraper.agency} to HF dataset."
f"Appending news for {agency_name} to HF dataset."
)
articles_scraped += len(scraped_data)
saved = self._process_and_upload_data(scraped_data, allow_update) or 0
articles_saved += saved
agencies_processed.append(scraper.agency)
agencies_processed.append(agency_name)
else:
logging.info(f"No news found for {scraper.agency}.")
agencies_processed.append(scraper.agency)
logging.info(f"No news found for {agency_name}.")
agencies_processed.append(agency_name)
except ScrapingError as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Scraping failed for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Scraping failed for {agency_name}: {e}")
except Exception as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Unexpected error for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Unexpected error for {agency_name}: {e}")
else:
all_news_data = []
for scraper in webscrapers:
for agency_name, scraper in webscrapers:
try:
scraped_data = scraper.scrape_news()
if scraped_data:
all_news_data.extend(scraped_data)
agencies_processed.append(scraper.agency)
agencies_processed.append(agency_name)
else:
logging.info(f"No news found for {scraper.agency}.")
agencies_processed.append(scraper.agency)
logging.info(f"No news found for {agency_name}.")
agencies_processed.append(agency_name)
except ScrapingError as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Scraping failed for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Scraping failed for {agency_name}: {e}")
except Exception as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Unexpected error for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Unexpected error for {agency_name}: {e}")

if all_news_data:
logging.info("Appending all collected news to HF dataset.")
Expand Down
10 changes: 10 additions & 0 deletions src/govbr_scraper/scrapers/yaml_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
import yaml


def get_config_dir(module_file: str) -> str:
    """
    Resolve the ``config`` directory that lives beside a module.

    :param module_file: The ``__file__`` of the calling module.
    :return: Absolute path to that module's ``config`` subdirectory.
    """
    module_dir = os.path.dirname(os.path.abspath(module_file))
    return os.path.join(module_dir, "config")


def load_urls_from_yaml(
config_dir: str, file_name: str, agency: str = None
) -> Dict[str, str]:
Expand Down
7 changes: 0 additions & 7 deletions tests/unit/test_scrape_manager.py

This file was deleted.

57 changes: 35 additions & 22 deletions tests/unit/test_yaml_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,36 @@
import os
import pytest
from govbr_scraper.scrapers.yaml_config import (
get_config_dir,
load_urls_from_yaml,
extract_url,
is_agency_inactive,
)

# Path to the scrapers module, used to resolve config dir in tests
_SCRAPERS_MODULE = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"..",
"..",
"src",
"govbr_scraper",
"scrapers",
"scrape_manager.py",
)


class TestGetConfigDir:
"""Tests for get_config_dir function."""

def test_returns_config_subdir(self):
"""get_config_dir should return a path ending in 'config'."""
config_dir = get_config_dir(_SCRAPERS_MODULE)
assert config_dir.endswith("config")

def get_config_dir():
"""Get the config directory path for tests."""
return os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"..",
"..",
"src",
"govbr_scraper",
"scrapers",
"config",
)
def test_config_dir_exists(self):
"""get_config_dir should return an existing directory."""
config_dir = get_config_dir(_SCRAPERS_MODULE)
assert os.path.isdir(config_dir)


class TestExtractUrl:
Expand Down Expand Up @@ -75,36 +88,36 @@ class TestLoadUrlsFromYamlGovBr:

def test_load_urls_returns_dict(self):
"""load_urls_from_yaml should return a dict mapping agency names to URLs."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml")
assert isinstance(agency_urls, dict)
assert len(agency_urls) > 0

def test_load_urls_filters_inactive(self):
"""Inactive agencies should not be in the returned dict."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml")
# cisc uses the generic gov.br/pt-br/noticias URL and is inactive
for agency_name, url in agency_urls.items():
assert url != "https://www.gov.br/pt-br/noticias"

def test_load_specific_active_agency(self):
"""Loading a specific active agency should work."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml", agency="mec")
assert len(agency_urls) == 1
assert "mec" in agency_urls
assert "mec" in agency_urls["mec"]

def test_load_specific_inactive_agency_raises(self):
"""Loading a specific inactive agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="inactive"):
load_urls_from_yaml(config_dir, "site_urls.yaml", agency="cisc")

def test_load_nonexistent_agency_raises(self):
"""Loading a nonexistent agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="not found"):
load_urls_from_yaml(config_dir, "site_urls.yaml", agency="nonexistent_agency_xyz")

Expand All @@ -114,43 +127,43 @@ class TestLoadUrlsFromYamlEBC:

def test_load_urls_returns_dict(self):
"""load_urls_from_yaml should return a dict mapping agency names to URLs."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml")
assert isinstance(agency_urls, dict)
assert len(agency_urls) > 0

def test_load_urls_filters_inactive(self):
"""Inactive agencies should not be in the returned dict."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml")
# memoria-ebc is inactive, so its URL should not be present
for agency_name, url in agency_urls.items():
assert "memoria.ebc.com.br" not in url

def test_load_urls_includes_active_agencies(self):
"""Active agencies should be in the returned dict."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml")
urls_str = " ".join(agency_urls.values())
# agencia_brasil and tvbrasil are active
assert "agenciabrasil.ebc.com.br" in urls_str or "tvbrasil.ebc.com.br" in urls_str

def test_load_specific_active_agency(self):
"""Loading a specific active agency should work."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml", agency="agencia_brasil")
assert len(agency_urls) == 1
assert "agencia_brasil" in agency_urls
assert "agenciabrasil.ebc.com.br" in agency_urls["agencia_brasil"]

def test_load_specific_inactive_agency_raises(self):
"""Loading a specific inactive agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="inactive"):
load_urls_from_yaml(config_dir, "ebc_urls.yaml", agency="memoria-ebc")

def test_load_nonexistent_agency_raises(self):
"""Loading a nonexistent agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="not found"):
load_urls_from_yaml(config_dir, "ebc_urls.yaml", agency="nonexistent_agency_xyz")