Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions src/govbr_scraper/scrapers/ebc_scrape_manager.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import hashlib
import logging
import os
from collections import OrderedDict
from datetime import date, datetime
from typing import Any, Dict, List

from govbr_scraper.scrapers.ebc_webscraper import EBCWebScraper
from govbr_scraper.scrapers.yaml_config import load_urls_from_yaml
from govbr_scraper.scrapers.yaml_config import get_config_dir, load_urls_from_yaml

# Set up logging configuration
logging.basicConfig(
Expand Down Expand Up @@ -39,10 +38,6 @@ def __init__(self, storage: Any):
"""
self.dataset_manager = storage # Keep attribute name for compatibility

def _get_config_dir(self) -> str:
"""Get the config directory path."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "config")

def run_scraper(
self,
min_date: str,
Expand All @@ -68,7 +63,7 @@ def run_scraper(

try:
agency_urls = {}
config_dir = self._get_config_dir()
config_dir = get_config_dir(__file__)
# Load URLs for each agency in the list
if agencies:
for agency in agencies:
Expand Down
54 changes: 25 additions & 29 deletions src/govbr_scraper/scrapers/scrape_manager.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import hashlib
import logging
import os
from collections import OrderedDict
from datetime import date
from typing import Any, Dict, List

from govbr_scraper.scrapers.webscraper import ScrapingError, WebScraper
from govbr_scraper.scrapers.yaml_config import load_urls_from_yaml
from govbr_scraper.scrapers.yaml_config import get_config_dir, load_urls_from_yaml

# Set up logging configuration
logging.basicConfig(
Expand Down Expand Up @@ -39,10 +38,6 @@ def __init__(self, storage: Any):
"""
self.dataset_manager = storage # Keep attribute name for compatibility

def _get_config_dir(self) -> str:
"""Get the config directory path."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "config")

def run_scraper(
self,
agencies: List[str],
Expand All @@ -68,64 +63,65 @@ def run_scraper(
errors = []

try:
all_urls = []
config_dir = self._get_config_dir()
agency_urls = {}
config_dir = get_config_dir(__file__)
# Load URLs for each agency in the list
if agencies:
for agency in agencies:
try:
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml", agency)
all_urls.extend(agency_urls.values())
loaded = load_urls_from_yaml(config_dir, "site_urls.yaml", agency)
agency_urls.update(loaded)
except ValueError as e:
errors.append({"agency": agency, "error": str(e)})
logging.warning(f"Skipping agency '{agency}': {e}")
else:
# Load all agency URLs if agencies list is None or empty
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml")
all_urls = list(agency_urls.values())

# Create list of (agency_name, scraper) tuples
webscrapers = [
WebScraper(min_date, url, max_date=max_date) for url in all_urls
(agency_name, WebScraper(min_date, url, max_date=max_date))
for agency_name, url in agency_urls.items()
]

if sequential:
for scraper in webscrapers:
for agency_name, scraper in webscrapers:
try:
scraped_data = scraper.scrape_news()
if scraped_data:
logging.info(
f"Appending news for {scraper.agency} to HF dataset."
f"Appending news for {agency_name} to HF dataset."
)
articles_scraped += len(scraped_data)
saved = self._process_and_upload_data(scraped_data, allow_update) or 0
articles_saved += saved
agencies_processed.append(scraper.agency)
agencies_processed.append(agency_name)
else:
logging.info(f"No news found for {scraper.agency}.")
agencies_processed.append(scraper.agency)
logging.info(f"No news found for {agency_name}.")
agencies_processed.append(agency_name)
except ScrapingError as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Scraping failed for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Scraping failed for {agency_name}: {e}")
except Exception as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Unexpected error for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Unexpected error for {agency_name}: {e}")
else:
all_news_data = []
for scraper in webscrapers:
for agency_name, scraper in webscrapers:
try:
scraped_data = scraper.scrape_news()
if scraped_data:
all_news_data.extend(scraped_data)
agencies_processed.append(scraper.agency)
agencies_processed.append(agency_name)
else:
logging.info(f"No news found for {scraper.agency}.")
agencies_processed.append(scraper.agency)
logging.info(f"No news found for {agency_name}.")
agencies_processed.append(agency_name)
except ScrapingError as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Scraping failed for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Scraping failed for {agency_name}: {e}")
except Exception as e:
errors.append({"agency": scraper.agency, "error": str(e)})
logging.error(f"Unexpected error for {scraper.agency}: {e}")
errors.append({"agency": agency_name, "error": str(e)})
logging.error(f"Unexpected error for {agency_name}: {e}")

if all_news_data:
logging.info("Appending all collected news to HF dataset.")
Expand Down
10 changes: 10 additions & 0 deletions src/govbr_scraper/scrapers/yaml_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
import yaml


def get_config_dir(module_file: str) -> str:
    """
    Resolve the ``config`` directory that lives beside a module.

    :param module_file: The ``__file__`` of the calling module.
    :return: Absolute path to that module's ``config`` subdirectory.
    """
    module_dir = os.path.dirname(os.path.abspath(module_file))
    return os.path.join(module_dir, "config")


def load_urls_from_yaml(
config_dir: str, file_name: str, agency: str = None
) -> Dict[str, str]:
Expand Down
7 changes: 0 additions & 7 deletions tests/unit/test_scrape_manager.py

This file was deleted.

57 changes: 35 additions & 22 deletions tests/unit/test_yaml_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,36 @@
import os
import pytest
from govbr_scraper.scrapers.yaml_config import (
get_config_dir,
load_urls_from_yaml,
extract_url,
is_agency_inactive,
)

# Path to the scrapers module, used to resolve config dir in tests
_SCRAPERS_MODULE = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"..",
"..",
"src",
"govbr_scraper",
"scrapers",
"scrape_manager.py",
)


class TestGetConfigDir:
"""Tests for get_config_dir function."""

def test_returns_config_subdir(self):
"""get_config_dir should return a path ending in 'config'."""
config_dir = get_config_dir(_SCRAPERS_MODULE)
assert config_dir.endswith("config")

def get_config_dir():
"""Get the config directory path for tests."""
return os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"..",
"..",
"src",
"govbr_scraper",
"scrapers",
"config",
)
def test_config_dir_exists(self):
"""get_config_dir should return an existing directory."""
config_dir = get_config_dir(_SCRAPERS_MODULE)
assert os.path.isdir(config_dir)


class TestExtractUrl:
Expand Down Expand Up @@ -75,36 +88,36 @@ class TestLoadUrlsFromYamlGovBr:

def test_load_urls_returns_dict(self):
"""load_urls_from_yaml should return a dict mapping agency names to URLs."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml")
assert isinstance(agency_urls, dict)
assert len(agency_urls) > 0

def test_load_urls_filters_inactive(self):
"""Inactive agencies should not be in the returned dict."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml")
# cisc uses the generic gov.br/pt-br/noticias URL and is inactive
for agency_name, url in agency_urls.items():
assert url != "https://www.gov.br/pt-br/noticias"

def test_load_specific_active_agency(self):
"""Loading a specific active agency should work."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "site_urls.yaml", agency="mec")
assert len(agency_urls) == 1
assert "mec" in agency_urls
assert "mec" in agency_urls["mec"]

def test_load_specific_inactive_agency_raises(self):
"""Loading a specific inactive agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="inactive"):
load_urls_from_yaml(config_dir, "site_urls.yaml", agency="cisc")

def test_load_nonexistent_agency_raises(self):
"""Loading a nonexistent agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="not found"):
load_urls_from_yaml(config_dir, "site_urls.yaml", agency="nonexistent_agency_xyz")

Expand All @@ -114,43 +127,43 @@ class TestLoadUrlsFromYamlEBC:

def test_load_urls_returns_dict(self):
"""load_urls_from_yaml should return a dict mapping agency names to URLs."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml")
assert isinstance(agency_urls, dict)
assert len(agency_urls) > 0

def test_load_urls_filters_inactive(self):
"""Inactive agencies should not be in the returned dict."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml")
# memoria-ebc is inactive, so its URL should not be present
for agency_name, url in agency_urls.items():
assert "memoria.ebc.com.br" not in url

def test_load_urls_includes_active_agencies(self):
"""Active agencies should be in the returned dict."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml")
urls_str = " ".join(agency_urls.values())
# agencia_brasil and tvbrasil are active
assert "agenciabrasil.ebc.com.br" in urls_str or "tvbrasil.ebc.com.br" in urls_str

def test_load_specific_active_agency(self):
"""Loading a specific active agency should work."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
agency_urls = load_urls_from_yaml(config_dir, "ebc_urls.yaml", agency="agencia_brasil")
assert len(agency_urls) == 1
assert "agencia_brasil" in agency_urls
assert "agenciabrasil.ebc.com.br" in agency_urls["agencia_brasil"]

def test_load_specific_inactive_agency_raises(self):
"""Loading a specific inactive agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="inactive"):
load_urls_from_yaml(config_dir, "ebc_urls.yaml", agency="memoria-ebc")

def test_load_nonexistent_agency_raises(self):
"""Loading a nonexistent agency should raise ValueError."""
config_dir = get_config_dir()
config_dir = get_config_dir(_SCRAPERS_MODULE)
with pytest.raises(ValueError, match="not found"):
load_urls_from_yaml(config_dir, "ebc_urls.yaml", agency="nonexistent_agency_xyz")