From 3deff52e461facd764c3e2c7022d2aa41a1c0ca8 Mon Sep 17 00:00:00 2001
From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com>
Date: Mon, 19 Jan 2026 12:09:54 +0530
Subject: [PATCH] BUG fixes #192 #189 (#211)

* updated docker compose for EC2
* integrate streaming endpoint with test production connection page
* formatted response with markdown
* FE logic for the encryption
* vault secret update after fixing issues
* fixed formatting issue
* integration with BE
* update cron manager vault script
* tested integration of vault security update
* fix security issues
* fixed issue where references were not sent with streaming tokens
* complete #192 and #206 bug fixes
* change production inference display logic
* remove obsolete Vite configuration files and associated plugins
* added new model

---------

Co-authored-by: Thiru Dinesh <56014038+Thirunayan22@users.noreply.github.com>
Co-authored-by: Thiru Dinesh
Co-authored-by: erangi-ar
---
 .../rag-search-script-v1-llm-connections.sql |   1 +
 README.md                                    |   2 +-
 src/llm_orchestration_service.py             | 166 ++++++++++++------
 .../config/llm_config.yaml                   |   7 +
 .../llm_ochestrator_constants.py             |  60 +++++--
 src/prompt_refine_manager/prompt_refiner.py  |  13 +-
 src/response_generator/response_generate.py  |  17 +-
 src/utils/language_detector.py               | 116 ++++++++++++
 8 files changed, 305 insertions(+), 77 deletions(-)
 create mode 100644 src/utils/language_detector.py

diff --git a/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql b/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql
index 6367462..24ce356 100644
--- a/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql
+++ b/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql
@@ -102,6 +102,7 @@ INSERT INTO llm_models (platform_id, model_key, model_name) VALUES
 -- Azure models
 ((SELECT id FROM llm_platforms WHERE platform_key = 'azure'), 'gpt-4o-mini', 'GPT-4o-mini'),
 ((SELECT id FROM llm_platforms WHERE platform_key = 'azure'), 'gpt-4o', 'GPT-4o'),
+((SELECT id FROM llm_platforms WHERE platform_key = 'azure'), 'gpt-4.1', 'GPT-4.1'),
 -- AWS models
 ((SELECT id FROM llm_platforms WHERE platform_key = 'aws'), 'anthropic-claude-3.5-sonnet', 'Anthropic Claude 3.5 Sonnet'),
 ((SELECT id FROM llm_platforms WHERE platform_key = 'aws'), 'anthropic-claude-3.7-sonnet', 'Anthropic Claude 3.7 Sonnet');
diff --git a/README.md b/README.md
index ad5edce..9e7dd82 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The **BYK-RAG Module** is part of the Burokratt ecosystem, designed to provide *
   - Models searchable via dropdown with cache-enabled indicators.

 - **Enhanced Security with RSA Encryption**
-  - LLM credentials encrypted with RSA-2048 asymmetric encryption before storage. 
+  - LLM credentials encrypted with RSA-2048 asymmetric encryption before storage.
   - GUI encrypts using public key; CronManager decrypts with private key.
   - Additional security layer beyond HashiCorp Vault's encryption.
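On the RSA flow the README describes: a minimal round-trip sketch with the `cryptography` package, assuming RSA-OAEP with SHA-256 padding (the README does not specify padding). Key generation is inlined here for brevity; the real GUI and CronManager would load persisted PEM keys, and all names and values are illustrative.

```python
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding, rsa

# Assumed padding scheme; the deployed code may use a different one.
oaep = padding.OAEP(
    mgf=padding.MGF1(algorithm=hashes.SHA256()),
    algorithm=hashes.SHA256(),
    label=None,
)

private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
public_key = private_key.public_key()

# GUI side: encrypt the credential with the public key before storage.
ciphertext = public_key.encrypt(b"azure-api-key-value", oaep)

# CronManager side: decrypt with the private key when the secret is needed.
plaintext = private_key.decrypt(ciphertext, oaep)
assert plaintext == b"azure-api-key-value"
```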
diff --git a/src/llm_orchestration_service.py b/src/llm_orchestration_service.py
index a6cc98e..49b307d 100644
--- a/src/llm_orchestration_service.py
+++ b/src/llm_orchestration_service.py
@@ -27,9 +27,14 @@
 from src.response_generator.response_generate import stream_response_native
 from src.llm_orchestrator_config.llm_ochestrator_constants import (
     OUT_OF_SCOPE_MESSAGE,
+    OUT_OF_SCOPE_MESSAGES,
     TECHNICAL_ISSUE_MESSAGE,
+    TECHNICAL_ISSUE_MESSAGES,
     INPUT_GUARDRAIL_VIOLATION_MESSAGE,
+    INPUT_GUARDRAIL_VIOLATION_MESSAGES,
     OUTPUT_GUARDRAIL_VIOLATION_MESSAGE,
+    OUTPUT_GUARDRAIL_VIOLATION_MESSAGES,
+    get_localized_message,
     GUARDRAILS_BLOCKED_PHRASES,
     TEST_DEPLOYMENT_ENVIRONMENT,
     STREAM_TOKEN_LIMIT_MESSAGE,
@@ -43,6 +48,7 @@
 from src.utils.time_tracker import log_step_timings
 from src.utils.budget_tracker import get_budget_tracker
 from src.utils.production_store import get_production_store
+from src.utils.language_detector import detect_language, get_language_name
 from src.guardrails import NeMoRailsAdapter, GuardrailCheckResult
 from src.contextual_retrieval import ContextualRetriever
 from src.llm_orchestrator_config.exceptions import (
@@ -127,6 +133,16 @@ def process_orchestration_request(
         f"authorId: {request.authorId}, environment: {request.environment}"
     )

+    # STEP 0: Detect language from user message
+    detected_language = detect_language(request.message)
+    language_name = get_language_name(detected_language)
+    logger.info(
+        f"[{request.chatId}] Detected language: {language_name} ({detected_language})"
+    )
+
+    # Store detected language in request for use throughout pipeline
+    request._detected_language = detected_language
+
     # Initialize all service components
     components = self._initialize_service_components(request)

@@ -245,6 +261,16 @@ async def stream_orchestration_response(
     timing_dict: Dict[str, float] = {}
     streaming_start_time = datetime.now()

+    # STEP 0: Detect language from user message
+    detected_language = detect_language(request.message)
+    language_name = get_language_name(detected_language)
+    logger.info(
+        f"[{request.chatId}] Streaming request - Detected language: {language_name} ({detected_language})"
+    )
+
+    # Store detected language in request for use throughout pipeline
+    request._detected_language = detected_language
+
     # Use StreamManager for centralized tracking and guaranteed cleanup
     async with stream_manager.managed_stream(
         chat_id=request.chatId, author_id=request.authorId
@@ -339,7 +365,11 @@ async def stream_orchestration_response(
                 logger.info(
                     f"[{request.chatId}] [{stream_ctx.stream_id}] No relevant chunks - out of scope"
                 )
-                yield self._format_sse(request.chatId, OUT_OF_SCOPE_MESSAGE)
+                detected_lang = getattr(request, "_detected_language", "en")
+                localized_msg = get_localized_message(
+                    OUT_OF_SCOPE_MESSAGES, detected_lang
+                )
+                yield self._format_sse(request.chatId, localized_msg)
                 yield self._format_sse(request.chatId, "END")
                 self._log_costs(costs_dict)
                 log_step_timings(timing_dict, request.chatId)
@@ -369,7 +399,11 @@ async def stream_orchestration_response(
                 logger.info(
                     f"[{request.chatId}] [{stream_ctx.stream_id}] Question out of scope"
                 )
-                yield self._format_sse(request.chatId, OUT_OF_SCOPE_MESSAGE)
+                detected_lang = getattr(request, "_detected_language", "en")
+                localized_msg = get_localized_message(
+                    OUT_OF_SCOPE_MESSAGES, detected_lang
+                )
+                yield self._format_sse(request.chatId, localized_msg)
                 yield self._format_sse(request.chatId, "END")
                 self._log_costs(costs_dict)
                 log_step_timings(timing_dict, request.chatId)
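`_format_sse` itself is not part of this diff. Assuming it wraps each payload in a single SSE `data:` frame, the localized out-of-scope path above produces something like the following; the helper is a hypothetical stand-in, not the service's actual implementation.

```python
import json


def format_sse(chat_id: str, content: str) -> str:
    # Hypothetical mirror of self._format_sse: one payload per SSE "data:"
    # frame, blank line as the frame delimiter.
    payload = {"chatId": chat_id, "content": content}
    return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"


# Localized fallback message, then the "END" sentinel the client watches for:
print(format_sse("chat-1", "Vabandust, kuid mul pole piisavalt konteksti ..."), end="")
print(format_sse("chat-1", "END"), end="")
```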
@@ -535,21 +569,12 @@ async def bot_response_generator() -> AsyncIterator[str]:
                     logger.info(
                         f"[{request.chatId}] [{stream_ctx.stream_id}] Sending {len(doc_references)} document references before END"
                     )
-                    references_data = [
-                        ref.model_dump() for ref in doc_references
-                    ]
-                    references_message = {
-                        "chatId": request.chatId,
-                        "payload": {
-                            "type": "references",
-                            "references": references_data,
-                        },
-                        "timestamp": str(
-                            int(datetime.now().timestamp() * 1000)
-                        ),
-                        "sentTo": [],
-                    }
-                    yield f"data: {json_module.dumps(references_message)}\n\n"
+                    # Format references as markdown text
+                    refs_text = "\n\n**References:**\n" + "\n".join(
+                        f"{i + 1}. [{ref.document_url}]({ref.document_url})"
+                        for i, ref in enumerate(doc_references)
+                    )
+                    yield self._format_sse(request.chatId, refs_text)

                 yield self._format_sse(request.chatId, "END")

@@ -594,21 +619,12 @@ async def bot_response_generator() -> AsyncIterator[str]:
                     logger.info(
                         f"[{request.chatId}] [{stream_ctx.stream_id}] Sending {len(doc_references)} document references before END"
                     )
-                    references_data = [
-                        ref.model_dump() for ref in doc_references
-                    ]
-                    references_message = {
-                        "chatId": request.chatId,
-                        "payload": {
-                            "type": "references",
-                            "references": references_data,
-                        },
-                        "timestamp": str(
-                            int(datetime.now().timestamp() * 1000)
-                        ),
-                        "sentTo": [],
-                    }
-                    yield f"data: {json_module.dumps(references_message)}\n\n"
+                    # Format references as markdown text
+                    refs_text = "\n\n**References:**\n" + "\n".join(
+                        f"{i + 1}. [{ref.document_url}]({ref.document_url})"
+                        for i, ref in enumerate(doc_references)
+                    )
+                    yield self._format_sse(request.chatId, refs_text)

                 yield self._format_sse(request.chatId, "END")

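The structured JSON references event is replaced by plain markdown appended to the token stream. A standalone sketch of the exact formatting expression used in both hunks; the real code iterates reference objects and reads `ref.document_url`, while plain strings and invented URLs are used here.

```python
# Invented URLs, for illustration only.
doc_references = [
    "https://example.org/benefits/guide",
    "https://example.org/benefits/faq",
]

refs_text = "\n\n**References:**\n" + "\n".join(
    f"{i + 1}. [{url}]({url})" for i, url in enumerate(doc_references)
)
print(refs_text)
# **References:**
# 1. [https://example.org/benefits/guide](https://example.org/benefits/guide)
# 2. [https://example.org/benefits/faq](https://example.org/benefits/faq)
```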
@@ -1048,6 +1064,13 @@ def handle_input_guardrails(
         if not input_check_result.allowed:
             logger.warning(f"Input blocked by guardrails: {input_check_result.reason}")
+
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                INPUT_GUARDRAIL_VIOLATION_MESSAGES, detected_lang
+            )
+
             if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning input guardrail violation message."
@@ -1056,7 +1079,7 @@ def handle_input_guardrails(
                     llmServiceActive=True,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=True,
-                    content=INPUT_GUARDRAIL_VIOLATION_MESSAGE,
+                    content=localized_msg,
                     chunks=None,
                 )
             else:
@@ -1065,7 +1088,7 @@ def handle_input_guardrails(
                     llmServiceActive=True,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=True,
-                    content=INPUT_GUARDRAIL_VIOLATION_MESSAGE,
+                    content=localized_msg,
                 )

         logger.info("Input guardrails check passed")
@@ -1172,12 +1195,18 @@ def handle_output_guardrails(
             logger.warning(
                 f"Output blocked by guardrails: {output_check_result.reason}"
             )
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                OUTPUT_GUARDRAIL_VIOLATION_MESSAGES, detected_lang
+            )
+
             return OrchestrationResponse(
                 chatId=request.chatId,
                 llmServiceActive=True,
                 questionOutOfLLMScope=False,
                 inputGuardFailed=False,
-                content=OUTPUT_GUARDRAIL_VIOLATION_MESSAGE,
+                content=localized_msg,
             )

         logger.info("Output guardrails check passed")
@@ -1190,25 +1219,35 @@ def handle_output_guardrails(
     def _create_error_response(
         self, request: OrchestrationRequest
     ) -> OrchestrationResponse:
-        """Create standardized error response."""
+        """Create standardized error response with localized message."""
+        # Get language from request (set during language detection)
+        detected_lang = getattr(request, "_detected_language", "en")
+        localized_message = get_localized_message(
+            TECHNICAL_ISSUE_MESSAGES, detected_lang
+        )
+
         return OrchestrationResponse(
             chatId=request.chatId,
             llmServiceActive=False,
             questionOutOfLLMScope=False,
             inputGuardFailed=False,
-            content=TECHNICAL_ISSUE_MESSAGE,
+            content=localized_message,
         )

     def _create_out_of_scope_response(
         self, request: OrchestrationRequest
     ) -> OrchestrationResponse:
-        """Create standardized out-of-scope response."""
+        """Create standardized out-of-scope response with localized message."""
+        # Get language from request (set during language detection)
+        detected_lang = getattr(request, "_detected_language", "en")
+        localized_message = get_localized_message(OUT_OF_SCOPE_MESSAGES, detected_lang)
+
         return OrchestrationResponse(
             chatId=request.chatId,
             llmServiceActive=True,
             questionOutOfLLMScope=True,
             inputGuardFailed=False,
-            content=OUT_OF_SCOPE_MESSAGE,
+            content=localized_message,
         )

     def _store_production_inference_data(
@@ -2080,6 +2119,13 @@ def _generate_rag_response(
             logger.warning(
                 "Response generator unavailable – returning technical issue message."
             )
+
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                TECHNICAL_ISSUE_MESSAGES, detected_lang
+            )
+
             if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning technical issue message."
@@ -2088,8 +2134,8 @@ def _generate_rag_response(
                     llmServiceActive=False,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=False,
-                    content=TECHNICAL_ISSUE_MESSAGE,
-                    chunks=self._format_chunks_for_test_response(relevant_chunks),
+                    content=localized_msg,
+                    chunks=None,  # No chunks for technical failures
                 )
             else:
                 return OrchestrationResponse(
@@ -2147,20 +2193,18 @@ def _generate_rag_response(
                 output=answer,
             )
         if question_out_of_scope:
-            logger.info("Question determined out-of-scope – sending fixed message.")
-
-            # Extract document references even for out-of-scope
-            doc_references = self._extract_document_references(relevant_chunks)
+            logger.info(
+                "Question determined out-of-scope – sending fixed message without references."
+            )

-            # Append references to content
-            content_with_refs = OUT_OF_SCOPE_MESSAGE
-            if doc_references:
-                refs_text = "\n\n**References:**\n" + "\n".join(
-                    f"{i + 1}. {ref.document_url}"
-                    for i, ref in enumerate(doc_references)
-                )
-                content_with_refs += refs_text
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                OUT_OF_SCOPE_MESSAGES, detected_lang
+            )

+            # Do NOT include references when question is out of scope
+            # (data did not provide sufficient context to answer)
             if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning out-of-scope message."
@@ -2169,8 +2213,8 @@ def _generate_rag_response(
                     llmServiceActive=True,  # service OK; insufficient context
                     questionOutOfLLMScope=True,
                     inputGuardFailed=False,
-                    content=content_with_refs,
-                    chunks=self._format_chunks_for_test_response(relevant_chunks),
+                    content=localized_msg,
+                    chunks=None,  # No chunks when question is out of scope
                 )
             else:
                 return OrchestrationResponse(
@@ -2178,7 +2222,7 @@ def _generate_rag_response(
                     llmServiceActive=True,  # service OK; insufficient context
                     questionOutOfLLMScope=True,
                     inputGuardFailed=False,
-                    content=content_with_refs,
+                    content=localized_msg,
                 )

         # In-scope: return the answer as-is (NO citations)
@@ -2233,6 +2277,12 @@ def _generate_rag_response(
                 }
             )
             # Standardized technical issue; no second LLM call, no citations
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                TECHNICAL_ISSUE_MESSAGES, detected_lang
+            )
+
            if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning technical issue message."
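For reviewers checking the FE contract: after these hunks, an out-of-scope turn in the test environment carries neither chunks nor a reference list. Roughly the payload shape, with field names taken from `OrchestrationResponse` in this diff and all values invented:

```python
# Illustrative only; the real object is an OrchestrationResponse, not a dict.
out_of_scope_test_response = {
    "chatId": "chat-1",
    "llmServiceActive": True,   # service OK; context was insufficient
    "questionOutOfLLMScope": True,
    "inputGuardFailed": False,
    "content": "Vabandust, kuid mul pole piisavalt konteksti ...",
    "chunks": None,             # was _format_chunks_for_test_response(...)
}
```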
@@ -2241,8 +2291,8 @@ def _generate_rag_response(
                     llmServiceActive=False,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=False,
-                    content=TECHNICAL_ISSUE_MESSAGE,
-                    chunks=self._format_chunks_for_test_response(relevant_chunks),
+                    content=localized_msg,
+                    chunks=None,  # No chunks for technical failures
                 )
             else:
                 return OrchestrationResponse(
diff --git a/src/llm_orchestrator_config/config/llm_config.yaml b/src/llm_orchestrator_config/config/llm_config.yaml
index f7248a1..736a6e8 100644
--- a/src/llm_orchestrator_config/config/llm_config.yaml
+++ b/src/llm_orchestrator_config/config/llm_config.yaml
@@ -32,6 +32,13 @@ llm:
       temperature: 0.5
       deployment_name: "gpt-4o-deployment"

+    gpt-4.1:
+      model_type: "chat"
+      max_tokens: 13107
+      temperature: 0.6
+      deployment_name: "gpt-4.1"
+
+
   # AWS Bedrock Configuration
   aws_bedrock:
     cache: true  # Keep caching enabled (DSPY default)
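A quick parse check for the new model entry. The `llm` -> `azure` nesting is assumed from the hunk context (only `llm:` and the sibling `gpt-4o` block are visible here), so adjust the keys to the file's real structure:

```python
import yaml

with open("src/llm_orchestrator_config/config/llm_config.yaml") as fh:
    config = yaml.safe_load(fh)

# Nesting assumed, not shown in full in this hunk.
gpt41 = config["llm"]["azure"]["gpt-4.1"]
assert gpt41["deployment_name"] == "gpt-4.1"
assert gpt41["max_tokens"] == 13107
```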
diff --git a/src/llm_orchestrator_config/llm_ochestrator_constants.py b/src/llm_orchestrator_config/llm_ochestrator_constants.py
index b53b3d7..61af696 100644
--- a/src/llm_orchestrator_config/llm_ochestrator_constants.py
+++ b/src/llm_orchestrator_config/llm_ochestrator_constants.py
@@ -1,20 +1,36 @@
-OUT_OF_SCOPE_MESSAGE = (
-    "I apologize, but I’m unable to provide a complete response because the available "
-    "context does not sufficiently cover your request. Please try rephrasing or providing more details."
-)
-
-TECHNICAL_ISSUE_MESSAGE = (
-    "Technical issue with response generation\n"
-    "I apologize, but I’m currently unable to generate a response due to a temporary technical issue. "
-    "Please try again in a moment."
-)
+# Multilingual message dictionaries
+OUT_OF_SCOPE_MESSAGES = {
+    "et": "Vabandust, kuid mul pole piisavalt konteksti, et teie küsimusele vastata. Palun püüdke ümber sõnastada või lisage rohkem üksikasju.",
+    "ru": "Извините, но у меня недостаточно контекста для ответа на ваш вопрос. Пожалуйста, попробуйте переформулировать или предоставить больше деталей.",
+    "en": "I apologize, but I'm unable to provide a complete response because the available context does not sufficiently cover your request. Please try rephrasing or providing more details.",
+}
+
+TECHNICAL_ISSUE_MESSAGES = {
+    "et": "Tehniline probleem vastuse genereerimisel\nVabandust, kuid ma ei saa praegu vastust genereerida ajutise tehnilise probleemi tõttu. Palun proovige mõne hetke pärast uuesti.",
+    "ru": "Техническая проблема при генерации ответа\nИзвините, в настоящее время я не могу сгенерировать ответ из-за временной технической проблемы. Пожалуйста, попробуйте еще раз через мгновение.",
+    "en": "Technical issue with response generation\nI apologize, but I'm currently unable to generate a response due to a temporary technical issue. Please try again in a moment.",
+}
+
+INPUT_GUARDRAIL_VIOLATION_MESSAGES = {
+    "et": "Vabandust, kuid ma ei saa selle taotlusega aidata, kuna see rikub meie kasutustingimusi.",
+    "ru": "Извините, но я не могу помочь с этим запросом, так как он нарушает нашу политику использования.",
+    "en": "I apologize, but I'm unable to assist with that request as it violates our usage policies.",
+}
+
+OUTPUT_GUARDRAIL_VIOLATION_MESSAGES = {
+    "et": "Vabandust, kuid ma ei saa vastust anda, kuna see võib rikkuda meie kasutustingimusi.",
+    "ru": "Извините, но я не могу предоставить ответ, так как он может нарушить нашу политику использования.",
+    "en": "I apologize, but I'm unable to provide a response as it may violate our usage policies.",
+}
+
+# Legacy constants for backward compatibility (English defaults)
+OUT_OF_SCOPE_MESSAGE = OUT_OF_SCOPE_MESSAGES["en"]
+TECHNICAL_ISSUE_MESSAGE = TECHNICAL_ISSUE_MESSAGES["en"]
+INPUT_GUARDRAIL_VIOLATION_MESSAGE = INPUT_GUARDRAIL_VIOLATION_MESSAGES["en"]
+OUTPUT_GUARDRAIL_VIOLATION_MESSAGE = OUTPUT_GUARDRAIL_VIOLATION_MESSAGES["en"]

 UNKNOWN_SOURCE = "Unknown source"

-INPUT_GUARDRAIL_VIOLATION_MESSAGE = "I apologize, but I'm unable to assist with that request as it violates our usage policies."
-
-OUTPUT_GUARDRAIL_VIOLATION_MESSAGE = "I apologize, but I'm unable to provide a response as it may violate our usage policies."
-
 GUARDRAILS_BLOCKED_PHRASES = [
     "i'm sorry, i can't respond to that",
     "i cannot respond to that",
@@ -88,6 +104,22 @@
 VALIDATION_GENERIC_ERROR = "I apologize, but I couldn't process your request. Please check your input and try again."

+
+# Helper function to get localized messages
+def get_localized_message(message_dict: dict, language_code: str = "en") -> str:
+    """
+    Get message in the specified language, falling back to English.
+
+    Args:
+        message_dict: Dictionary with language codes as keys
+        language_code: Language code ('et', 'ru', 'en')
+
+    Returns:
+        Localized message string
+    """
+    return message_dict.get(language_code, message_dict.get("en", ""))
+
+
 # Service endpoints
 RAG_SEARCH_RESQL = "http://resql:8082/rag-search"
 RAG_SEARCH_RUUTER_PUBLIC = "http://ruuter-public:8086/rag-search"
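The helper's fallback chain (requested language -> English -> empty string) can be exercised directly. A minimal check, assuming the module imports cleanly outside the service:

```python
from src.llm_orchestrator_config.llm_ochestrator_constants import (
    OUT_OF_SCOPE_MESSAGES,
    get_localized_message,
)

print(get_localized_message(OUT_OF_SCOPE_MESSAGES, "ru"))  # Russian variant
print(get_localized_message(OUT_OF_SCOPE_MESSAGES, "de"))  # unsupported code -> English
print(get_localized_message({}, "et"))                     # no "en" key either -> ""
```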
+    rewrites: list[str] = dspy.OutputField(
+        desc="Exactly N refined variations of the question in THE SAME LANGUAGE as input, each a single sentence."
     )
diff --git a/src/prompt_refine_manager/prompt_refiner.py b/src/prompt_refine_manager/prompt_refiner.py
index 6ca42d2..b24c275 100644
--- a/src/prompt_refine_manager/prompt_refiner.py
+++ b/src/prompt_refine_manager/prompt_refiner.py
@@ -27,6 +27,13 @@ class ConversationHistory(BaseModel):
 class PromptRefiner(dspy.Signature):
     """Produce N distinct, concise rewrites of the user's question using chat history.

+    CRITICAL LANGUAGE RULE:
+    - The rewrites MUST be in the SAME language as the input question
+    - Estonian question → Estonian rewrites
+    - Russian question → Russian rewrites
+    - English question → English rewrites
+    - Preserve the natural language of the original question
+
     Constraints:
     - Preserve the original intent; don't inject unsupported constraints.
     - Resolve pronouns with context when safe; avoid changing semantics.
@@ -36,11 +43,13 @@ class PromptRefiner(dspy.Signature):
     """

     history: str = dspy.InputField(desc="Recent conversation history (turns).")
-    question: str = dspy.InputField(desc="The user's latest question to refine.")
+    question: str = dspy.InputField(
+        desc="The user's latest question to refine. Preserve its language in rewrites."
+    )
     n: int = dspy.InputField(desc="Number of rewrites to produce (N).")
     rewrites: list[str] = dspy.OutputField(
-        desc="Exactly N refined variations of the question, each a single sentence."
+        desc="Exactly N refined variations of the question in THE SAME LANGUAGE as input, each a single sentence."
     )
diff --git a/src/response_generator/response_generate.py b/src/response_generator/response_generate.py
index 34d27d4..23aa744 100644
--- a/src/response_generator/response_generate.py
+++ b/src/response_generator/response_generate.py
@@ -22,16 +22,27 @@
 class ResponseGenerator(dspy.Signature):
     """Produce a grounded answer from the provided context ONLY.

+    CRITICAL LANGUAGE RULE:
+    - The answer MUST be in the SAME language as the input question
+    - Estonian question → Estonian answer
+    - Russian question → Russian answer
+    - English question → English answer
+    - Maintain the natural language flow and grammar of the detected language
+
     Rules:
     - Use ONLY the provided context blocks; do not invent facts.
     - If the context is insufficient, set questionOutOfLLMScope=true and say so briefly.
     - Do not include citations in the 'answer' field.
     """

-    question: str = dspy.InputField()
+    question: str = dspy.InputField(
+        desc="User's question. Answer in the SAME language as this question."
+    )
     context_blocks: List[str] = dspy.InputField()
     citations: List[str] = dspy.InputField()
-    answer: str = dspy.OutputField(desc="Human-friendly answer without citations")
+    answer: str = dspy.OutputField(
+        desc="Human-friendly answer in THE SAME LANGUAGE as the question, without citations"
+    )
     questionOutOfLLMScope: bool = dspy.OutputField(
         desc="True if context is insufficient to answer"
     )
@@ -40,6 +51,8 @@
 class ScopeChecker(dspy.Signature):
     """Quick check if question can be answered from context.

+    LANGUAGE NOTE: This is an internal check; language doesn't matter for scope determination.
+
     Rules:
     - Return True ONLY if context is completely insufficient
     - Return False if context has ANY relevant information
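Both signatures are plain dspy Signatures, so the language rule can be spot-checked with a one-off `dspy.Predict` call. A sketch that assumes an LM has already been configured; the model string is illustrative only:

```python
import dspy

from src.prompt_refine_manager.prompt_refiner import PromptRefiner

# Assumed setup; the service wires its LM elsewhere.
dspy.configure(lm=dspy.LM("azure/gpt-4.1"))

refine = dspy.Predict(PromptRefiner)
result = refine(
    history="Kasutaja küsis varem peretoetuste kohta.",
    question="Mis on sünnitoetus?",
    n=3,
)
print(result.rewrites)  # expect three Estonian single-sentence rewrites
```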
diff --git a/src/utils/language_detector.py b/src/utils/language_detector.py
new file mode 100644
index 0000000..ba289c9
--- /dev/null
+++ b/src/utils/language_detector.py
@@ -0,0 +1,116 @@
+"""Language detection utility for multilingual support.
+
+Detects Estonian, Russian, and English based on character patterns and common words.
+"""
+
+import re
+from typing import Literal
+from loguru import logger
+
+LanguageCode = Literal["et", "ru", "en"]
+
+
+def detect_language(text: str) -> LanguageCode:
+    """
+    Detect language from input text.
+
+    Detection Strategy:
+    1. Check for Cyrillic characters (Russian)
+    2. Check for Estonian-specific characters
+    3. Check for Estonian common words
+    4. Default to English
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        Language code: 'et' (Estonian), 'ru' (Russian), 'en' (English)
+
+    Examples:
+        >>> detect_language("Mis on sünnitoetus?")
+        'et'
+        >>> detect_language("Что такое пособие?")
+        'ru'
+        >>> detect_language("What is the benefit?")
+        'en'
+    """
+    if not text or not text.strip():
+        logger.warning(
+            "Empty text provided for language detection, defaulting to English"
+        )
+        return "en"
+
+    text_sample = text.strip()[:500]  # Use first 500 chars for detection
+
+    # Check for Cyrillic characters (Russian) - use percentage-based detection
+    cyrillic_count = len(re.findall(r"[а-яА-ЯёЁ]", text_sample))
+    total_alpha = len(re.findall(r"[a-zA-Zа-яА-ЯёЁõäöüšžÕÄÖÜŠŽ]", text_sample))
+
+    if (
+        total_alpha > 0 and cyrillic_count / total_alpha > 0.25
+    ):  # 25% Cyrillic threshold
+        logger.debug(
+            f"Detected Russian (Cyrillic: {cyrillic_count}/{total_alpha} = {cyrillic_count / total_alpha:.1%})"
+        )
+        return "ru"
+
+    # Check for Estonian-specific characters (õ, ä, ö, ü, š, ž)
+    estonian_chars = re.findall(r"[õäöüšž]", text_sample, re.IGNORECASE)
+    if len(estonian_chars) > 0:
+        logger.debug(f"Detected Estonian (special chars: {len(estonian_chars)})")
+        return "et"
+
+    # Check for Estonian common words - use distinctive markers to avoid English false positives
+    estonian_markers = [
+        "kuidas",
+        "miks",
+        "kus",
+        "millal",
+        "kes",
+        "võib",
+        "olen",
+        "oled",
+        "see",
+        "seda",
+        "jah",
+        "või",
+        "ning",
+        "siis",
+        "veel",
+        "aga",
+        "kuid",
+        "nii",
+        "nagu",
+        "oli",
+        "mis",
+    ]
+
+    # Tokenize and check for Estonian markers
+    words = re.findall(r"\b\w+\b", text_sample.lower())
+    estonian_word_count = sum(1 for word in words if word in estonian_markers)
+
+    # Scale threshold based on text length for better accuracy
+    threshold = 1 if len(words) < 10 else 2
+    if estonian_word_count >= threshold:
+        logger.debug(
+            f"Detected Estonian (marker words: {estonian_word_count}/{len(words)}, threshold: {threshold})"
+        )
+        return "et"
+
+    # Default to English
+    logger.debug("Detected English (default)")
+    return "en"
+
+
+def get_language_name(language_code: LanguageCode) -> str:
+    """
+    Get human-readable language name from code.
+
+    Args:
+        language_code: ISO 639-1 language code
+
+    Returns:
+        Full language name
+    """
+    language_names = {"et": "Estonian", "ru": "Russian", "en": "English"}
+    return language_names.get(language_code, "Unknown")
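A small smoke test of the heuristic. Note the last case: a short Estonian phrase with no special characters and no marker words falls through to the English default, which is the built-in trade-off of this word-list approach:

```python
from src.utils.language_detector import detect_language, get_language_name

for text in [
    "Mis on sünnitoetus?",   # 'et' (ü triggers the special-character rule)
    "Что такое пособие?",    # 'ru' (Cyrillic ratio well above 25%)
    "What is the benefit?",  # 'en' (default)
    "Tere, anna palun abi",  # 'en' as well: no special chars, no marker words
]:
    code = detect_language(text)
    print(f"{text!r} -> {code} ({get_language_name(code)})")
```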