do-web-doc-resolver/scripts/resolve.py at main · d-oit/do-web-doc-resolver · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env python3
"""
Web Doc Resolver - Resolve queries or URLs into compact, LLM-ready markdown.
Main orchestrator. CLI entrypoint moved to scripts/cli.py.
"""

import concurrent.futures
import logging
import os
from typing import Any

import scripts._query_resolve
import scripts._url_resolve
import scripts.cache_negative
import scripts.circuit_breaker
import scripts.providers_impl
import scripts.quality
import scripts.routing
import scripts.routing_memory
import scripts.semantic_cache
import scripts.synthesis
import scripts.utils
from scripts.models import (
    ErrorType,
    Profile,
    ProviderType,
    ResolvedResult,
    ValidationResult,
)
from scripts.providers_impl import (
    _is_rate_limited,
    _rate_limits,
    _set_rate_limit,
    resolve_with_duckduckgo,
    resolve_with_exa,
    resolve_with_exa_mcp,
    resolve_with_firecrawl,
    resolve_with_jina,
    resolve_with_mistral_browser,
    resolve_with_mistral_websearch,
    resolve_with_tavily,
)
from scripts.semantic_cache import get_semantic_cache
from scripts.utils import (
    _cache_key,
    _detect_error_type,
    _get_cache,
    _get_from_cache,
    _save_to_cache,
    fetch_llms_txt,
    fetch_url_content,
    get_cache,
    get_session,
    is_url,
    validate_links,
    validate_url,
)

# Tunables read once at import time from the environment, falling back to the
# literal defaults below. int() raises ValueError during import if a variable
# is set to a non-integer string.
# MAX_CHARS is the default ``max_chars`` of the resolve* functions in this
# module. MIN_CHARS and DEFAULT_TIMEOUT are only defined and exported here
# (presumably a minimum content length and a timeout in seconds - TODO confirm
# against their consumers).
MAX_CHARS = int(os.getenv("WEB_RESOLVER_MAX_CHARS", "8000"))
MIN_CHARS = int(os.getenv("WEB_RESOLVER_MIN_CHARS", "200"))
DEFAULT_TIMEOUT = int(os.getenv("WEB_RESOLVER_TIMEOUT", "30"))

logger = logging.getLogger(__name__)

# Module-wide shared state. The circuit-breaker registry and routing memory
# are created eagerly at import; ``_executor`` is filled in lazily by
# ``_get_executor``. ``_cache`` and ``_semantic_cache`` are not assigned
# anywhere else in the code visible here (``_cache`` is listed in ``__all__``).
_circuit_breakers = scripts.circuit_breaker.CircuitBreakerRegistry()
_routing_memory = scripts.routing_memory.RoutingMemory()
_cache = None
_semantic_cache = None
_executor = None


def _get_executor(max_workers: int = 10) -> concurrent.futures.ThreadPoolExecutor:
    """Return the module-wide thread pool, building it on first use.

    Args:
        max_workers: Pool size applied only when the pool is first created.
            Once a pool exists, later calls return it unchanged and this
            value is ignored.

    Returns:
        The shared ``ThreadPoolExecutor`` stored in the module-level
        ``_executor``; its threads use the ``resolver`` name prefix.
    """
    global _executor
    # Fast path: a pool was already created by an earlier call.
    if _executor is not None:
        return _executor
    pool = concurrent.futures.ThreadPoolExecutor(
        thread_name_prefix="resolver",
        max_workers=max_workers,
    )
    _executor = pool
    return pool


# Keep facade and extracted submodules on the same shared state so callers,
# tests, and future monkeypatches still observe one resolver runtime.
# Both submodules get the very same registry and routing-memory objects that
# were created in this module, set as attributes on the submodule objects.
scripts._query_resolve._circuit_breakers = _circuit_breakers
scripts._query_resolve._routing_memory = _routing_memory
scripts._url_resolve._circuit_breakers = _circuit_breakers
scripts._url_resolve._routing_memory = _routing_memory

# Underscore-free aliases for the rate-limit helpers imported from
# scripts.providers_impl; they refer to the same function objects.
is_rate_limited = _is_rate_limited
set_rate_limit = _set_rate_limit


def _get_semantic_cache():
    """Return the semantic cache by delegating to ``get_semantic_cache()``.

    Thin wrapper around the function imported from ``scripts.semantic_cache``;
    any creation or reuse of the cache object is decided by that function,
    not here. The module-level ``_semantic_cache`` variable is not touched.
    """
    return get_semantic_cache()


def _check_semantic_cache(query_or_url: str) -> dict[str, Any] | None:
    """Look up ``query_or_url`` in the semantic cache via the sub-modules.

    The query resolver's lookup is consulted first. The URL resolver's lookup
    runs only when the first one returns a falsy value.

    Args:
        query_or_url: The raw query text or URL used as the lookup key.

    Returns:
        The first truthy hit, otherwise whatever the URL resolver's lookup
        returns.
    """
    query_hit = scripts._query_resolve._check_semantic_cache(query_or_url)
    return query_hit or scripts._url_resolve._check_semantic_cache(query_or_url)


def _store_in_semantic_cache(query_or_url: str, result: dict) -> bool:
    """Store ``result`` in the semantic cache via the sub-modules.

    The query resolver's store function is tried first. Only when it reports
    a falsy outcome is the URL resolver's store function called.

    Args:
        query_or_url: The raw query text or URL the result belongs to.
        result: The result dictionary to cache.

    Returns:
        ``True`` when the query resolver stored the entry, otherwise the
        return value of the URL resolver's store function.
    """
    stored_by_query_resolver = scripts._query_resolve._store_in_semantic_cache(
        query_or_url, result
    )
    if not stored_by_query_resolver:
        return scripts._url_resolve._store_in_semantic_cache(query_or_url, result)
    return True


# Names exported by ``from scripts.resolve import *``. Besides the resolver
# entry points, models and constants, the list deliberately includes
# underscore-prefixed helpers and state re-exported from scripts.utils and
# scripts.providers_impl.
# NOTE(review): ``Profile``, ``resolve_url_stream``, ``resolve_query_stream``,
# ``synthesize_results``, ``is_rate_limited`` and ``set_rate_limit`` are
# available in this module but are not listed here - confirm whether that is
# intentional.
__all__ = [
    "resolve",
    "resolve_url",
    "resolve_query",
    "resolve_direct",
    "resolve_with_order",
    "resolve_url_with_order",
    "resolve_query_with_order",
    "ResolvedResult",
    "ValidationResult",
    "ErrorType",
    "ProviderType",
    "is_url",
    "validate_url",
    "validate_links",
    "fetch_url_content",
    "fetch_llms_txt",
    "MAX_CHARS",
    "MIN_CHARS",
    "DEFAULT_TIMEOUT",
    "_detect_error_type",
    "_is_rate_limited",
    "_set_rate_limit",
    "get_session",
    "_get_from_cache",
    "_save_to_cache",
    "_cache_key",
    "_get_cache",
    "get_cache",
    "_rate_limits",
    "_cache",
    "_check_semantic_cache",
    "_store_in_semantic_cache",
]


# Re-export the URL and query resolvers and their ``*_stream`` counterparts
# from the extracted sub-modules. These are plain name bindings captured at
# import time: rebinding the attribute on the sub-module afterwards does not
# change what these facade names point to.
resolve_url = scripts._url_resolve.resolve_url
resolve_url_stream = scripts._url_resolve.resolve_url_stream
resolve_query = scripts._query_resolve.resolve_query
resolve_query_stream = scripts._query_resolve.resolve_query_stream


def synthesize_results(query: str, results: list[ResolvedResult], api_key: str, model: str) -> str:
    """Delegate to ``scripts.synthesis.synthesize_results``.

    All four arguments are forwarded positionally and unchanged, and the
    delegate's return value is returned as-is.

    Args:
        query: The query the results were gathered for.
        results: Resolved results to be synthesized.
        api_key: Credential passed through to the synthesis implementation.
        model: Model identifier passed through to the synthesis implementation.

    Returns:
        The string produced by the synthesis implementation.
    """
    return scripts.synthesis.synthesize_results(query, results, api_key, model)


def resolve(
    input_str: str,
    max_chars: int = MAX_CHARS,
    skip_providers: set[str] | None = None,
    profile: Profile | str = Profile.BALANCED,
) -> dict[str, Any]:
    """Resolve a URL or a free-text query, dispatching on ``is_url``.

    Args:
        input_str: Either a URL or query text; ``is_url`` decides which.
        max_chars: Forwarded to the chosen resolver.
        skip_providers: Forwarded to ``resolve_query`` only.
        profile: A ``Profile`` member, or a string that is lower-cased and
            converted with ``Profile(...)``.

    Returns:
        The dictionary returned by ``resolve_url`` or ``resolve_query``.

    Raises:
        Whatever ``Profile(...)`` raises for a string that does not name a
        profile (presumably ``ValueError`` if ``Profile`` is a standard Enum).
    """
    chosen_profile = Profile(profile.lower()) if isinstance(profile, str) else profile

    if not is_url(input_str):
        return resolve_query(input_str, max_chars, skip_providers, profile=chosen_profile)
    # NOTE(review): skip_providers is not forwarded on the URL path - confirm
    # against resolve_url's signature whether that is intentional.
    return resolve_url(input_str, max_chars, profile=chosen_profile)


def resolve_direct(
    input_str: str, provider: ProviderType, max_chars: int = MAX_CHARS
) -> dict[str, Any]:
    """Run exactly one provider against ``input_str``.

    Args:
        input_str: URL or query handed to the provider as its first argument.
        provider: Which provider to run.
        max_chars: Passed to the provider as its second positional argument.

    Returns:
        ``result.to_dict()`` when the provider returns a truthy result,
        ``{"source": "none", "error": "Provider failed"}`` when it returns a
        falsy one, and ``{"source": "none", "error": "Unknown provider"}``
        when ``provider`` has no entry in the dispatch table.
    """
    # Built on every call, so the provider callables are looked up in the
    # module globals at call time rather than frozen at import time.
    dispatch: dict[ProviderType, Any] = {
        ProviderType.JINA: resolve_with_jina,
        ProviderType.EXA_MCP: resolve_with_exa_mcp,
        ProviderType.EXA: resolve_with_exa,
        ProviderType.TAVILY: resolve_with_tavily,
        ProviderType.DUCKDUCKGO: resolve_with_duckduckgo,
        ProviderType.FIRECRAWL: resolve_with_firecrawl,
        ProviderType.MISTRAL_BROWSER: resolve_with_mistral_browser,
        ProviderType.MISTRAL_WEBSEARCH: resolve_with_mistral_websearch,
        ProviderType.DIRECT_FETCH: fetch_url_content,
    }
    handler = dispatch.get(provider)
    if handler is None:
        return {"source": "none", "error": "Unknown provider"}

    outcome = handler(input_str, max_chars)
    if not outcome:
        return {"source": "none", "error": "Provider failed"}
    return outcome.to_dict()


def resolve_with_order(
    input_str: str, providers_order: list[ProviderType], max_chars: int = MAX_CHARS
) -> dict[str, Any]:
    """Try providers one after another and return the first usable result.

    Args:
        input_str: URL or query passed to every provider attempt.
        providers_order: Providers to try, in priority order.
        max_chars: Forwarded to ``resolve_direct`` for each attempt.

    Returns:
        The first ``resolve_direct`` result whose ``"source"`` is not
        ``"none"``; otherwise
        ``{"source": "none", "error": "All providers failed"}`` (also the
        result for an empty ``providers_order``).
    """
    for candidate in providers_order:
        outcome = resolve_direct(input_str, candidate, max_chars)
        if not (outcome.get("source") != "none"):
            # This provider failed or is unknown; fall through to the next.
            continue
        return outcome
    return {"source": "none", "error": "All providers failed"}


def resolve_url_with_order(
    url: str, order: list[ProviderType], max_chars: int = MAX_CHARS
) -> dict[str, Any]:
    """Resolve ``url`` by trying the providers in ``order``.

    URL-named alias of ``resolve_with_order``: the arguments are forwarded
    unchanged and no URL validation is performed here.

    Args:
        url: The URL to resolve.
        order: Providers to try, in priority order.
        max_chars: Forwarded to ``resolve_with_order``.

    Returns:
        The dictionary returned by ``resolve_with_order``.
    """
    return resolve_with_order(url, order, max_chars)


def resolve_query_with_order(
    query: str, order: list[ProviderType], max_chars: int = MAX_CHARS
) -> dict[str, Any]:
    """Resolve ``query`` by trying the providers in ``order``.

    Query-named alias of ``resolve_with_order``: the arguments are forwarded
    unchanged.

    Args:
        query: The free-text query to resolve.
        order: Providers to try, in priority order.
        max_chars: Forwarded to ``resolve_with_order``.

    Returns:
        The dictionary returned by ``resolve_with_order``.
    """
    return resolve_with_order(query, order, max_chars)