Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
655 changes: 492 additions & 163 deletions dags/config/site_urls.yaml

Large diffs are not rendered by default.

23 changes: 21 additions & 2 deletions dags/scrape_agencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,29 @@


def _load_agencies_config() -> dict:
    """Load the active-agency configuration from ``config/site_urls.yaml``.

    Each entry under the top-level ``agencies`` key may be either:
      - a plain string URL (legacy format) — always treated as active, or
      - a mapping with fields:
          url: str (required)
          active: bool (optional, default: True)
          disabled_reason: str (optional)
          disabled_date: str (optional)

    Returns:
        dict: Mapping {agency_key: url} containing only active agencies
        that have a non-empty URL.
    """
    config_path = os.path.join(os.path.dirname(__file__), "config", "site_urls.yaml")
    with open(config_path) as f:
        agencies = yaml.safe_load(f)["agencies"]

    # Keep only active agencies and reduce each entry to its URL.
    active_agencies = {}
    for key, data in agencies.items():
        # Legacy format: the entry is the URL itself; always active.
        if isinstance(data, str):
            active_agencies[key] = data
            continue
        if data.get("active", True):
            url = data.get("url")
            # Skip malformed active entries with no URL instead of
            # mapping the key to None (url is required by the schema).
            if url:
                active_agencies[key] = url

    return active_agencies


def create_scraper_dag(agency_key: str, agency_url: str):
Expand Down
11 changes: 8 additions & 3 deletions src/govbr_scraper/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class ScrapeAgenciesRequest(BaseModel):
class ScrapeEBCRequest(BaseModel):
    # Request payload for the EBC scraping endpoint.
    # First day of the scrape window, inclusive (string date).
    start_date: str
    # Last day of the window; when omitted, the endpoint falls back to
    # start_date (single-day scrape).
    end_date: str | None = None
    # Subset of EBC agency keys to scrape; None means all agencies.
    agencies: list[str] | None = None
    # When True, already-stored articles may be overwritten.
    allow_update: bool = False
    # When True, agencies are processed one at a time rather than in parallel.
    sequential: bool = True

Expand Down Expand Up @@ -108,7 +109,7 @@ def scrape_ebc(req: ScrapeEBCRequest):
from govbr_scraper.scrapers.ebc_scrape_manager import EBCScrapeManager

end = req.end_date or req.start_date
logger.info(f"Scraping EBC from {req.start_date} to {end}")
logger.info(f"Scraping EBC agencies: {req.agencies or 'ALL'} from {req.start_date} to {end}")

try:
storage = StorageAdapter()
Expand All @@ -118,15 +119,19 @@ def scrape_ebc(req: ScrapeEBCRequest):
max_date=end,
sequential=req.sequential,
allow_update=req.allow_update,
agencies=req.agencies,
)
except Exception as e:
logger.error(f"EBC scraping failed: {e}")
raise HTTPException(status_code=500, detail=str(e))

errors = [AgencyError(**e) for e in metrics.get("errors", [])]
if errors:
if errors and not metrics["agencies_processed"]:
status = "failed"
message = f"EBC scraping failed: {errors[0].error}"
elif errors:
status = "partial"
message = f"Completed with {len(errors)} error(s)"
else:
status = "completed"
message = "EBC scraping completed"
Expand All @@ -137,7 +142,7 @@ def scrape_ebc(req: ScrapeEBCRequest):
end_date=end,
articles_scraped=metrics["articles_scraped"],
articles_saved=metrics["articles_saved"],
agencies_processed=["ebc"] if not errors else [],
agencies_processed=metrics["agencies_processed"],
errors=errors,
message=message,
)
14 changes: 14 additions & 0 deletions src/govbr_scraper/scrapers/config/ebc_urls.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# EBC news sources consumed by the EBC scrape manager.
# Schema per agency entry:
#   url: str (required) — listing page to crawl
#   active: bool (optional, default: true) — set to false to skip the source
#   disabled_reason / disabled_date: optional audit fields recording why and
#   when a source was turned off
agencies:
  memoria-ebc:
    url: https://memoria.ebc.com.br/noticias
    active: false
    disabled_reason: "Site fora do ar (502 Bad Gateway) - issue #50"
    disabled_date: "2026-02-12"

  agencia_brasil:
    url: https://agenciabrasil.ebc.com.br/ultimas
    active: true

  tvbrasil:
    url: https://tvbrasil.ebc.com.br/ultimas
    active: true
Loading