diff --git a/docs/byok_guide.md b/docs/byok_guide.md index 5213a8d15..e1d353919 100644 --- a/docs/byok_guide.md +++ b/docs/byok_guide.md @@ -79,12 +79,18 @@ Both modes rely on: Inline RAG additionally supports: - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content. +- **Relevance cutoff score** (`relevance_cutoff_score` in `byok_rag`): Minimum raw similarity score for a chunk to be returned from that BYOK vector store. Chunks below the threshold are dropped before results are merged and ranked with other sources. Configure per knowledge source (each `byok_rag` entry has its own value). The default when omitted is `0.3` (see `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py`). This value is passed to Llama Stack as the vector search `score_threshold` for that store. > [!NOTE] > OKP and BYOK scores are not directly comparable (different scoring systems), so > `score_multiplier` does not apply to OKP results. To control the amount of retrieved > context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py` > (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10). + +> [!NOTE] +> `relevance_cutoff_score` applies to Inline RAG only. When the model uses Tool RAG (`file_search`), +> Lightspeed Stack does not send this setting; retrieval uses Llama Stack’s default ranking for that path. +> Use Inline RAG if you need per-store cutoff behavior from configuration. 
--- @@ -288,11 +294,16 @@ registered_resources: > embedding_dimension: 768 > vector_db_id: your-index-id # Llama Stack vector store ID (from index generation) > db_path: /path/to/vector_db/faiss_store.db -> score_multiplier: 1.0 # Optional: weight results when mixing multiple sources +> score_multiplier: 1.0 # Optional: weight results when mixing multiple BYOK sources (Inline RAG) +> relevance_cutoff_score: 0.3 # Optional: min raw similarity per chunk for this store (Inline RAG only; default 0.3) > ``` > > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of > each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it. +> +> `relevance_cutoff_score` is interpreted in the native score space of that store's vector +> backend. It is not comparable across different vector stores or OKP; tune each `byok_rag` entry +> using retrieval quality on that corpus. ### Step 5: Configure RAG Strategy @@ -319,10 +330,10 @@ okp: Both modes can be enabled simultaneously. 
Choose based on your latency and control preferences: -| Mode | When context is fetched | Tool call needed | score_multiplier | -|------|------------------------|------------------|-----------------| -| Inline RAG | With every query | No | Yes (BYOK only) | -| Tool RAG | On LLM demand | Yes | No | +| Mode | When context is fetched | Tool call needed | score_multiplier | relevance_cutoff_score | +|------|------------------------|------------------|----------------|------------------------| +| Inline RAG | With every query | No | Yes (BYOK only) | Yes (BYOK only) | +| Tool RAG | On LLM demand | Yes | No | No | > [!TIP] > A ready-to-use example combining BYOK and OKP is available at @@ -572,4 +583,4 @@ For additional support and advanced configurations, refer to: - [Llama Stack Documentation](https://llama-stack.readthedocs.io/) - [rag-content Tool Repository](https://github.com/lightspeed-core/rag-content) -Remember to regularly update your knowledge sources and monitor system performance to maintain optimal BYOK functionality. \ No newline at end of file +Remember to regularly update your knowledge sources and monitor system performance to maintain optimal BYOK functionality. diff --git a/docs/openapi.json b/docs/openapi.json index 117a15621..f5926690f 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -11806,6 +11806,13 @@ "title": "Score multiplier", "description": "Multiplier applied to relevance scores from this vector store. Used to weight results when querying multiple knowledge sources. Values > 1 boost this store's results; values < 1 reduce them.", "default": 1.0 + }, + "relevance_cutoff_score": { + "type": "number", + "exclusiveMinimum": 0.0, + "title": "Relevance cutoff score", + "description": "Minimum raw similarity score to consider a result relevant. 
Results with a similarity score below this threshold are not returned.", + "default": 0.3 } }, "additionalProperties": false, diff --git a/src/constants.py b/src/constants.py index cbc3cdd74..ebe016311 100644 --- a/src/constants.py +++ b/src/constants.py @@ -193,6 +193,9 @@ BYOK_RAG_MAX_CHUNKS: Final[int] = 10 # retrieved from BYOK RAG OKP_RAG_MAX_CHUNKS: Final[int] = 5 # retrieved from OKP RAG +# Default minimum raw similarity per BYOK store +DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE: Final[float] = 0.3 + # Solr OKP constants SOLR_VECTOR_SEARCH_DEFAULT_K: Final[int] = 5 SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD: Final[float] = 0.3 diff --git a/src/models/config.py b/src/models/config.py index c0724c7ff..901933bdd 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -1718,6 +1718,14 @@ class ByokRag(ConfigurationBase): "Values > 1 boost this store's results; values < 1 reduce them.", ) + relevance_cutoff_score: float = Field( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + gt=0, + title="Relevance cutoff score", + description="Minimum raw similarity score to consider a result relevant. " + "Results with a similarity score below this threshold are not returned.", + ) + class QuotaLimiterConfiguration(ConfigurationBase): """Configuration for one quota limiter. diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index 9d901271e..9ecaf0d07 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -26,6 +26,24 @@ logger = get_logger(__name__) +def _relevance_cutoff_for_vector_store(vector_store_id: str) -> float: + """Return configured relevance cutoff for a Llama Stack vector store ID. + + Args: + vector_store_id: Llama Stack vector store identifier (``vector_db_id``) + used to find a matching BYOK RAG entry in configuration. 
+ + Returns: + ``brag.relevance_cutoff_score`` from the ``ByokRag`` whose ``vector_db_id`` + matches ``vector_store_id``, or ``constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE`` + when no BYOK entry matches. + """ + for brag in configuration.configuration.byok_rag: + if brag.vector_db_id == vector_store_id: + return brag.relevance_cutoff_score + return constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + + def _get_okp_base_url() -> AnyUrl: """Return OKP document base URL from configuration (rhokp_url), or default if unset. @@ -180,6 +198,7 @@ async def _query_store_for_byok_rag( vector_store_id: str, query: str, weight: float, + score_threshold: float, ) -> list[dict[str, Any]]: """Query a single vector store for BYOK RAG. @@ -188,6 +207,7 @@ async def _query_store_for_byok_rag( vector_store_id: ID of the vector store to query query: Search query string weight: Score multiplier to apply + score_threshold: Minimum raw similarity score (``relevance_cutoff_score``) Returns: List of weighted result dictionaries, or empty list on error @@ -199,6 +219,7 @@ async def _query_store_for_byok_rag( params={ "max_chunks": constants.BYOK_RAG_MAX_CHUNKS, "mode": "vector", + "score_threshold": score_threshold, }, ) return _extract_byok_rag_chunks(search_response, vector_store_id, weight) @@ -410,6 +431,7 @@ async def _fetch_byok_rag( vector_store_id, query, score_multiplier_mapping.get(vector_store_id, 1.0), + _relevance_cutoff_for_vector_store(vector_store_id), ) for vector_store_id in vector_store_ids_to_query ] diff --git a/tests/unit/models/config/test_byok_rag.py b/tests/unit/models/config/test_byok_rag.py index e80e749c3..547459dd6 100644 --- a/tests/unit/models/config/test_byok_rag.py +++ b/tests/unit/models/config/test_byok_rag.py @@ -4,6 +4,7 @@ from pydantic import ValidationError from constants import ( + DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, DEFAULT_EMBEDDING_DIMENSION, DEFAULT_EMBEDDING_MODEL, DEFAULT_RAG_TYPE, @@ -35,6 +36,7 @@ def 
test_byok_rag_configuration_default_values() -> None: assert byok_rag.vector_db_id == "vector_db_id" assert byok_rag.db_path == "tests/configuration/rag.txt" assert byok_rag.score_multiplier == DEFAULT_SCORE_MULTIPLIER + assert byok_rag.relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE def test_byok_rag_configuration_nondefault_values() -> None: @@ -54,6 +56,7 @@ def test_byok_rag_configuration_nondefault_values() -> None: vector_db_id="vector_db_id", db_path="tests/configuration/rag.txt", score_multiplier=1.0, + relevance_cutoff_score=0.72, ) assert byok_rag is not None assert byok_rag.rag_id == "rag_id" @@ -62,6 +65,7 @@ def test_byok_rag_configuration_nondefault_values() -> None: assert byok_rag.embedding_dimension == 1024 assert byok_rag.vector_db_id == "vector_db_id" assert byok_rag.db_path == "tests/configuration/rag.txt" + assert byok_rag.relevance_cutoff_score == 0.72 def test_byok_rag_configuration_wrong_dimension() -> None: @@ -199,3 +203,18 @@ def test_byok_rag_configuration_score_multiplier_must_be_positive() -> None: db_path="tests/configuration/rag.txt", score_multiplier=0.0, ) + + +def test_byok_rag_configuration_relevance_cutoff_must_be_positive() -> None: + """Test that relevance_cutoff_score must be greater than 0.""" + with pytest.raises(ValidationError, match="greater than 0"): + _ = ByokRag( + rag_id="rag_id", + rag_type="rag_type", + vector_db_id="vector_db_id", + embedding_model="embedding_model", + embedding_dimension=1024, + db_path="tests/configuration/rag.txt", + score_multiplier=1.0, + relevance_cutoff_score=0.0, + ) diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py index 14ee69bbb..1cba5a2bf 100644 --- a/tests/unit/models/config/test_dump_configuration.py +++ b/tests/unit/models/config/test_dump_configuration.py @@ -8,6 +8,7 @@ from pydantic import SecretStr +from constants import DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE from models.config import ( 
ByokRag, Configuration, @@ -1024,6 +1025,7 @@ def test_dump_configuration_byok(tmp_path: Path) -> None: "rag_type": "inline::faiss", "vector_db_id": "vector_db_id", "score_multiplier": 1.0, + "relevance_cutoff_score": DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, }, ], "quota_handlers": { diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py index 0945bb236..8fd6868a3 100644 --- a/tests/unit/utils/test_vector_search.py +++ b/tests/unit/utils/test_vector_search.py @@ -1,5 +1,8 @@ """Unit tests for vector search utilities.""" +from collections.abc import Awaitable, Callable +from typing import Any + import pytest from pydantic import AnyUrl from pytest_mock import MockerFixture @@ -20,10 +23,37 @@ _get_okp_base_url, _get_solr_vector_store_ids, _is_solr_enabled, + _query_store_for_byok_rag, build_rag_context, ) +def _vector_io_query_stub_like_backend( + chunk_score_pairs: list[tuple[Any, float]], mocker: MockerFixture +) -> Callable[..., Awaitable[Any]]: + """Build an async ``vector_io.query`` stand-in that honors ``score_threshold``. + + Production code forwards ``relevance_cutoff_score`` as ``params['score_threshold']``; + Llama Stack filters hits server-side. The stub keeps pairs whose raw score is at or + above that minimum (``>=``), matching the docstring on ``_query_store_for_byok_rag``. 
+ """ + + async def _query(**kwargs: Any) -> Any: + threshold = float(kwargs["params"]["score_threshold"]) + chunks_out: list[Any] = [] + scores_out: list[float] = [] + for chunk, raw_score in chunk_score_pairs: + if raw_score >= threshold: + chunks_out.append(chunk) + scores_out.append(raw_score) + out = mocker.Mock() + out.chunks = chunks_out + out.scores = scores_out + return out + + return _query + + class TestIsSolrEnabled: """Tests for _is_solr_enabled function.""" @@ -427,6 +457,9 @@ async def test_byok_enabled_success(self, mocker: MockerFixture) -> None: byok_rag_mock = mocker.Mock() byok_rag_mock.rag_id = "rag_1" byok_rag_mock.vector_db_id = "vs_1" + byok_rag_mock.relevance_cutoff_score = ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) config_mock.configuration.rag.inline = ["rag_1"] config_mock.configuration.byok_rag = [byok_rag_mock] config_mock.score_multiplier_mapping = {"vs_1": 1.5} @@ -466,6 +499,9 @@ async def test_user_facing_ids_translated_to_internal_ids( byok_rag_mock = mocker.Mock() byok_rag_mock.rag_id = "my-kb" byok_rag_mock.vector_db_id = "vs-internal-001" + byok_rag_mock.relevance_cutoff_score = ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) config_mock.configuration.byok_rag = [byok_rag_mock] config_mock.configuration.rag.inline = ["my-kb"] config_mock.score_multiplier_mapping = {"vs-internal-001": 1.0} @@ -491,7 +527,11 @@ async def test_user_facing_ids_translated_to_internal_ids( client_mock.vector_io.query.assert_called_once_with( vector_store_id="vs-internal-001", query="test query", - params={"max_chunks": constants.BYOK_RAG_MAX_CHUNKS, "mode": "vector"}, + params={ + "max_chunks": constants.BYOK_RAG_MAX_CHUNKS, + "mode": "vector", + "score_threshold": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + }, ) @pytest.mark.asyncio @@ -503,9 +543,15 @@ async def test_multiple_user_facing_ids_each_translated( byok_rag_1 = mocker.Mock() byok_rag_1.rag_id = "kb-part1" byok_rag_1.vector_db_id = "vs-aaa-111" + 
byok_rag_1.relevance_cutoff_score = ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) byok_rag_2 = mocker.Mock() byok_rag_2.rag_id = "kb-part2" byok_rag_2.vector_db_id = "vs-bbb-222" + byok_rag_2.relevance_cutoff_score = ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) config_mock.configuration.byok_rag = [byok_rag_1, byok_rag_2] config_mock.configuration.rag.inline = ["kb-part1", "kb-part2"] config_mock.score_multiplier_mapping = {"vs-aaa-111": 1.0, "vs-bbb-222": 1.0} @@ -542,6 +588,146 @@ async def test_multiple_user_facing_ids_each_translated( assert "kb-part1" not in call_args assert "kb-part2" not in call_args + @pytest.mark.asyncio + async def test_byok_passes_configured_relevance_cutoff_to_vector_io( + self, mocker: MockerFixture + ) -> None: + """Configured ``relevance_cutoff_score`` is sent as ``score_threshold``.""" + config_mock = mocker.Mock(spec=AppConfig) + byok_rag_mock = mocker.Mock() + byok_rag_mock.rag_id = "my-kb" + byok_rag_mock.vector_db_id = "vs-internal-001" + byok_rag_mock.relevance_cutoff_score = 0.55 + config_mock.configuration.byok_rag = [byok_rag_mock] + config_mock.configuration.rag.inline = ["my-kb"] + config_mock.score_multiplier_mapping = {"vs-internal-001": 1.0} + config_mock.rag_id_mapping = {"vs-internal-001": "my-kb"} + mocker.patch("utils.vector_search.configuration", config_mock) + + chunk_mock = mocker.Mock() + chunk_mock.content = "Test content" + chunk_mock.chunk_id = "chunk_1" + chunk_mock.metadata = {"document_id": "doc_1"} + + search_response = mocker.Mock() + search_response.chunks = [chunk_mock] + search_response.scores = [0.9] + + client_mock = mocker.AsyncMock() + client_mock.vector_io.query.return_value = search_response + + await _fetch_byok_rag(client_mock, "test query", vector_store_ids=["my-kb"]) + + client_mock.vector_io.query.assert_called_once_with( + vector_store_id="vs-internal-001", + query="test query", + params={ + "max_chunks": constants.BYOK_RAG_MAX_CHUNKS, + "mode": "vector", + 
"score_threshold": 0.55, + }, + ) + + @pytest.mark.asyncio + async def test_query_store_for_byok_rag_forwards_score_threshold( + self, mocker: MockerFixture + ) -> None: + """Cutoff is applied by vector backends; this layer forwards it on ``vector_io.query``. + + ``_query_store_for_byok_rag`` is the code that maps ``relevance_cutoff_score`` (via + callers) into ``params["score_threshold"]``. It does not re-rank or drop hits by + score—whatever ``vector_io.query`` returns is passed to ``_extract_byok_rag_chunks``. + """ + score_threshold = 0.37 + chunk = mocker.Mock() + chunk.content = "chunk text" + chunk.chunk_id = "chunk-1" + chunk.metadata = {"document_id": "doc-1"} + + search_response = mocker.Mock() + search_response.chunks = [chunk] + search_response.scores = [0.91] + + client = mocker.AsyncMock() + client.vector_io.query.return_value = search_response + + result = await _query_store_for_byok_rag( + client, + vector_store_id="vs-test", + query="q", + weight=2.0, + score_threshold=score_threshold, + ) + + client.vector_io.query.assert_awaited_once_with( + vector_store_id="vs-test", + query="q", + params={ + "max_chunks": constants.BYOK_RAG_MAX_CHUNKS, + "mode": "vector", + "score_threshold": score_threshold, + }, + ) + assert len(result) == 1 + assert result[0]["content"] == "chunk text" + assert result[0]["score"] == 0.91 + assert result[0]["weighted_score"] == pytest.approx(1.82) + + @pytest.mark.asyncio + async def test_fetch_byok_rag_omits_chunks_below_vector_io_score_threshold( + self, mocker: MockerFixture + ) -> None: + """Sub-threshold hits never become ``RAGChunk`` rows when vector_io enforces the cutoff. + + The full path resolves ``relevance_cutoff_score``, calls ``vector_io.query`` with + ``score_threshold``, then maps the response. The stub models backend filtering so + scores strictly below the cutoff are absent from the mocked response. 
+ """ + cutoff = 0.5 + config_mock = mocker.Mock(spec=AppConfig) + byok_rag_mock = mocker.Mock() + byok_rag_mock.rag_id = "kb" + byok_rag_mock.vector_db_id = "vs-cutoff" + byok_rag_mock.relevance_cutoff_score = cutoff + config_mock.configuration.byok_rag = [byok_rag_mock] + config_mock.configuration.rag.inline = ["kb"] + config_mock.score_multiplier_mapping = {"vs-cutoff": 1.0} + config_mock.rag_id_mapping = {"vs-cutoff": "kb"} + mocker.patch("utils.vector_search.configuration", config_mock) + + def chunk(content: str, cid: str) -> Any: + ch = mocker.Mock() + ch.content = content + ch.chunk_id = cid + ch.metadata = {"document_id": cid} + return ch + + chunk_score_pairs: list[tuple[Any, float]] = [ + (chunk("below_cutoff", "c_low"), 0.3), + (chunk("at_cutoff", "c_edge"), cutoff), + (chunk("above_cutoff", "c_high"), 0.85), + ] + + client = mocker.AsyncMock() + client.vector_io.query.side_effect = _vector_io_query_stub_like_backend( + chunk_score_pairs, mocker + ) + + rag_chunks, _referenced = await _fetch_byok_rag( + client, "test query", vector_store_ids=["kb"] + ) + + assert ( + client.vector_io.query.await_args.kwargs["params"]["score_threshold"] + == cutoff + ) + contents = {c.content for c in rag_chunks} + assert "below_cutoff" not in contents + assert contents == {"at_cutoff", "above_cutoff"} + for ch in rag_chunks: + assert ch.score is not None + assert ch.score >= cutoff + @pytest.mark.asyncio async def test_no_inline_rag_configured_skips_byok( self, mocker: MockerFixture @@ -693,6 +879,9 @@ async def test_byok_enabled_only(self, mocker: MockerFixture) -> None: byok_rag_mock = mocker.Mock() byok_rag_mock.rag_id = "rag_1" byok_rag_mock.vector_db_id = "vs_1" + byok_rag_mock.relevance_cutoff_score = ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) config_mock.configuration.rag.inline = ["rag_1"] config_mock.configuration.byok_rag = [byok_rag_mock] config_mock.inline_solr_enabled = False