Commit 74b052d

feat: add relevance-based sorting for CSV export and fix tests
1 parent ac47498 commit 74b052d

9 files changed

Lines changed: 216 additions & 5 deletions

.env.example

Lines changed: 2 additions & 0 deletions
```diff
@@ -28,3 +28,5 @@ CACHE_TTL_DAYS=7
 # --- OUTPUT ---
 # Directory where CSV and Markdown reports will be saved
 OUTPUT_DIR=data
+# Export sorting configuration: "false" to keep original order, "true" to sort by score (desc) and domain (asc)
+EXPORT_SORT_BY_RELEVANCE=false
```

README.md

Lines changed: 25 additions & 0 deletions
@@ -64,6 +64,31 @@ poetry run python -m src.main --input data/seeds.csv --rerun-failed
- **Works with any input:** A plain domain list or a complex CSV — the system finds failed domains via the database
- **Time savings:** Re-scrapes only domains with `status=error`; successful ones are pulled from cache

### 📊 Export Sorting by Relevance (Smart Export Order)
- **Configurable sorting:** set the `.env` parameter `EXPORT_SORT_BY_RELEVANCE=true` to sort the CSV by score
- **Two-level sorting:** first by score (100→0), then alphabetically by domain for ties
- **Preserves original order:** defaults to `false` — domains appear in the CSV in the same order as in the input file
- **NULL-safe:** domains without a score (failed scraping) are automatically moved to the end of the list

**Example `.env` configuration:**
```bash
# Sort CSV by relevance (High Priority → Low Priority)
EXPORT_SORT_BY_RELEVANCE=true

# Or preserve the original order (default)
EXPORT_SORT_BY_RELEVANCE=false
```

**Output when `EXPORT_SORT_BY_RELEVANCE=true`:**
```
domain,score,priority
apple.com,100,High        ← highest score
wikipedia.org,100,High    ← same score → alphabetical order
amazon.com,85,High
httpbin.org,40,Low
fake-domain.com,0,Low     ← failed domains at the end
```
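The two-level, NULL-safe ordering described above can be sketched in a standalone pandas snippet (this is an illustration of the technique, not the project's exporter itself):

```python
import pandas as pd

rows = [
    {"domain": "wikipedia.org", "score": 100},
    {"domain": "amazon.com", "score": 85},
    {"domain": "fake-domain.com", "score": None},
    {"domain": "apple.com", "score": 100},
]
df = pd.DataFrame(rows)

# Coerce score to numeric; missing scores become -1 so they sink to the bottom
sort_score = pd.to_numeric(df["score"], errors="coerce").fillna(-1)
df = (
    df.assign(_sort_score=sort_score)
    .sort_values(by=["_sort_score", "domain"], ascending=[False, True])
    .drop(columns=["_sort_score"])
)
print(df["domain"].tolist())
# → ['apple.com', 'wikipedia.org', 'amazon.com', 'fake-domain.com']
```

pandas' `sort_values` also offers `na_position="last"`, but the `-1` sentinel keeps the score ordering and the alphabetical tie-break on `domain` in a single multi-key sort.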
### ⚡ Async I/O + Connection Pooling
- **5 workers process 100 domains in ~20 seconds** (vs. 100 seconds in the synchronous variant)
- **Configurable parallelism:** `--workers 10` for a fast VPS or `--workers 2` for resource-constrained environments

docs/README_UA.md

Lines changed: 25 additions & 0 deletions
@@ -63,6 +63,31 @@ poetry run python -m src.main --input data/seeds.csv --rerun-failed
- **Works with any input:** a plain domain list or a complex CSV — the system finds failed domains via the DB
- **Time savings:** re-scrapes only domains with `status=error`; successful ones are taken from the cache

### 📊 Export Sorting by Relevance (Smart Export Order)
- **Configurable sorting:** the `.env` parameter `EXPORT_SORT_BY_RELEVANCE=true` sorts the CSV by score
- **Two-level sorting:** first by score (100→0), then alphabetically for ties
- **Preserves original order:** defaults to `false` — domains appear in the CSV in the same order as in the input file
- **NULL-safe:** domains without a score (failed scraping) are automatically moved to the end of the list

**Example `.env` configuration:**
```bash
# Sort CSV by relevance (High Priority → Low Priority)
EXPORT_SORT_BY_RELEVANCE=true

# Or keep the original order (default)
EXPORT_SORT_BY_RELEVANCE=false
```

**Output when `EXPORT_SORT_BY_RELEVANCE=true`:**
```
domain,score,priority
apple.com,100,High        ← highest score
wikipedia.org,100,High    ← same score → alphabetical order
amazon.com,85,High
httpbin.org,40,Low
fake-domain.com,0,Low     ← failed domains at the end
```

### ⚡ Async I/O + Connection Pooling
- **5 workers process 100 domains in ~20 seconds** (vs. 100 seconds in the sync variant)
- **Configurable parallelism:** `--workers 10` for a fast VPS or `--workers 2` for constrained resources

docs/sort_output_20260508_003121.csv

Lines changed: 101 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
# Domain Triaging Executive Summary
Generated on: 2026-05-08 00:31:21
Input source: `output_20260508_003121.csv`

## 📊 Processing Statistics
| Metric | Value |
| :--- | :--- |
| **Total Domains Processed** | 100 |
| **Successful Scrapes** | 59 |
| **Failed / Inaccessible** | 41 |
| **Fallback API (Serper) Used** | 48 |
| **Total Serper Credits Consumed** | 38 |

## 🎯 Triage Results (Prioritization)
- **🔴 High Priority (Manual Review):** 54
- **🟡 Medium Priority (Monitor):** 1
- **🟢 Low Priority (Discard/Archive):** 45

## 🔍 Top Interesting Domains
| Domain | Score | Next Action |
| :--- | :--- | :--- |
| cal.com | 100 | Manual Review |
| discord.com | 100 | Manual Review |
| miro.com | 100 | Manual Review |
| calendly.com | 100 | Manual Review |
| cloudflare.com | 100 | Manual Review |

---
*Full data available in the associated CSV file: output_20260508_003121.csv*

src/config.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,9 @@ class Config:
```python
    # I/O Directories
    OUTPUT_DIR: str = "data"
    EXPORT_SORT_BY_RELEVANCE: bool = (
        os.getenv("EXPORT_SORT_BY_RELEVANCE", "false").lower() == "true"
    )


config = Config()
```
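The behavior of this parsing pattern is worth noting: only the literal string `"true"` (in any capitalization) enables the flag, while values like `"1"` or `"yes"` stay `False`. A minimal sketch, using a hypothetical `parse_bool_env` helper that mirrors the line above:

```python
import os

def parse_bool_env(name: str, default: str = "false") -> bool:
    # Same pattern as Config.EXPORT_SORT_BY_RELEVANCE: only the literal
    # string "true" (any capitalization) enables the flag.
    return os.getenv(name, default).lower() == "true"

os.environ["EXPORT_SORT_BY_RELEVANCE"] = "True"
print(parse_bool_env("EXPORT_SORT_BY_RELEVANCE"))  # → True

os.environ["EXPORT_SORT_BY_RELEVANCE"] = "1"
print(parse_bool_env("EXPORT_SORT_BY_RELEVANCE"))  # → False ("1" is not "true")
```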

src/exporter.py

Lines changed: 9 additions & 0 deletions
@@ -93,6 +93,15 @@ def export_to_csv(results: list[dict[str, Any]], output_path: str | None = None)
```python
    # Discarding unnecessary columns (e.g., 'fallback_used') and aligning the order
    df = df[expected_columns]

    if config.EXPORT_SORT_BY_RELEVANCE:
        # Create a temporary sorting column to handle None/NaN safely
        sort_score: Any = pd.to_numeric(df["score"], errors="coerce").fillna(-1)  # type: ignore
        df = (
            df.assign(_sort_score=sort_score)
            .sort_values(by=["_sort_score", "domain"], ascending=[False, True])
            .drop(columns=["_sort_score"])
        )

    try:
        # Writing with utf-8-sig (BOM) for correct import
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
```
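The `utf-8-sig` encoding used by the exporter prepends a byte-order mark, which is what Excel checks to detect UTF-8 CSVs. A self-contained stdlib sketch of that effect (independent of the project code):

```python
import csv
import io

buf = io.BytesIO()
text = io.TextIOWrapper(buf, encoding="utf-8-sig", newline="")
writer = csv.writer(text)
writer.writerow(["domain", "score"])
writer.writerow(["münchen.de", 85])  # non-ASCII survives the round trip
text.flush()

data = buf.getvalue()
print(data[:3])  # → b'\xef\xbb\xbf' (the UTF-8 BOM Excel looks for)
```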

tests/test_exporter.py

Lines changed: 19 additions & 0 deletions
@@ -71,3 +71,22 @@ def test_export_to_csv_success(tmp_path: Path) -> None:
```python
    assert df.loc[0, "scrape_method"] == "bs4"
    assert df.loc[1, "scrape_method"] == "serper"
    assert "Fallback to Serper.dev" in str(df.loc[1, "notes"])


def test_export_to_csv_sorting(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """Verifies sorting by relevance (score) and alphabetically by domain."""
    new_config = replace(config, EXPORT_SORT_BY_RELEVANCE=True)
    monkeypatch.setattr("src.exporter.config", new_config)

    output_file = tmp_path / "test_sort.csv"
    mock_results = [
        {"domain": "b.com", "score": 50, "status": "success"},
        {"domain": "c.com", "score": 100, "status": "success"},
        {"domain": "a.com", "score": 50, "status": "success"},
        {"domain": "d.com", "score": None, "status": "error"},
    ]
    export_to_csv(mock_results, output_path=str(output_file))

    df = pd.read_csv(output_file)
    domains = df["domain"].tolist()
    assert domains == ["c.com", "a.com", "b.com", "d.com"]
```
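The test's `replace(config, ...)` idiom builds a modified copy of a dataclass instance without mutating the shared original, which pairs well with `monkeypatch`. A minimal sketch with a hypothetical `DemoConfig` standing in for the project's `Config`:

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class DemoConfig:
    # Hypothetical stand-in for the project's Config
    OUTPUT_DIR: str = "data"
    EXPORT_SORT_BY_RELEVANCE: bool = False

base = DemoConfig()
override = replace(base, EXPORT_SORT_BY_RELEVANCE=True)

print(override.EXPORT_SORT_BY_RELEVANCE)  # → True
print(base.EXPORT_SORT_BY_RELEVANCE)      # → False (original untouched)
```

Because `replace` returns a new instance, other tests that read the module-level `config` are unaffected once `monkeypatch` restores the attribute.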

tests/test_scorer.py

Lines changed: 2 additions & 5 deletions
```diff
@@ -46,18 +46,15 @@ def test_assign_priority_medium() -> None:
     mock_data = {
         "domain": "average-site.net",
         "ssl_valid": True,
-        "domain_age_days": 40,
-        "has_live_content": False,
+        "domain_age_days": 400,
+        "has_live_content": True,
         "word_count": 60,
         "error": None,
         "status_code": 200,
     }
 
     result = calculate_score(mock_data)
 
-    mock_data["domain_age_days"] = 400
-    result = calculate_score(mock_data)
-
     assert result["priority"] == "Medium"
     assert result["next_action"] == "Monitor"
```