From 3deff52e461facd764c3e2c7022d2aa41a1c0ca8 Mon Sep 17 00:00:00 2001
From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com>
Date: Mon, 19 Jan 2026 12:09:54 +0530
Subject: [PATCH] BUG fixes #192 #189 (#211)

* updated docker compose for EC2
* integrate streaming endpoint with test production connection page
* formatted response with markdown
* FE logic for the encryption
* vault secret update after fixing issues
* fixed formatting issue
* integration with BE
* update cron manager vault script
* tested integration of vault security update
* fix security issues
* fixed issue where references were not sent with streaming tokens
* complete #192 and #206 bug fixes
* change production inference display logic
* remove obsolete Vite configuration files and associated plugins
* added new model

---------

Co-authored-by: Thiru Dinesh <56014038+Thirunayan22@users.noreply.github.com>
Co-authored-by: Thiru Dinesh
Co-authored-by: erangi-ar
---
 .../rag-search-script-v1-llm-connections.sql |   1 +
 README.md                                    |   2 +-
 src/llm_orchestration_service.py             | 166 ++++++++++++------
 .../config/llm_config.yaml                   |   7 +
 .../llm_ochestrator_constants.py             |  60 +++++--
 src/prompt_refine_manager/prompt_refiner.py  |  13 +-
 src/response_generator/response_generate.py  |  17 +-
 src/utils/language_detector.py               | 116 ++++++++++++
 8 files changed, 305 insertions(+), 77 deletions(-)
 create mode 100644 src/utils/language_detector.py

diff --git a/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql b/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql
index 6367462..24ce356 100644
--- a/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql
+++ b/DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql
@@ -102,6 +102,7 @@ INSERT INTO llm_models (platform_id, model_key, model_name) VALUES
 -- Azure models
 ((SELECT id FROM llm_platforms WHERE platform_key = 'azure'), 'gpt-4o-mini', 'GPT-4o-mini'),
 ((SELECT id FROM llm_platforms WHERE platform_key = 'azure'), 'gpt-4o', 'GPT-4o'),
+((SELECT id FROM llm_platforms WHERE platform_key = 'azure'), 'gpt-4.1', 'GPT-4.1'),
 -- AWS models
 ((SELECT id FROM llm_platforms WHERE platform_key = 'aws'), 'anthropic-claude-3.5-sonnet', 'Anthropic Claude 3.5 Sonnet'),
 ((SELECT id FROM llm_platforms WHERE platform_key = 'aws'), 'anthropic-claude-3.7-sonnet', 'Anthropic Claude 3.7 Sonnet');
diff --git a/README.md b/README.md
index ad5edce..9e7dd82 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The **BYK-RAG Module** is part of the Burokratt ecosystem, designed to provide *
   - Models searchable via dropdown with cache-enabled indicators.

 - **Enhanced Security with RSA Encryption**
-  - LLM credentials encrypted with RSA-2048 asymmetric encryption before storage. 
+  - LLM credentials encrypted with RSA-2048 asymmetric encryption before storage.
   - GUI encrypts using public key; CronManager decrypts with private key.
   - Additional security layer beyond HashiCorp Vault's encryption.
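On the RSA flow the README describes: a minimal round-trip sketch with the `cryptography` package, assuming RSA-OAEP with SHA-256 padding (the README does not specify padding). Key generation is inlined here for brevity; the real GUI and CronManager would load persisted PEM keys, and all names and values are illustrative.

```python
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding, rsa

# Assumed padding scheme; the deployed code may use a different one.
oaep = padding.OAEP(
    mgf=padding.MGF1(algorithm=hashes.SHA256()),
    algorithm=hashes.SHA256(),
    label=None,
)

private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
public_key = private_key.public_key()

# GUI side: encrypt the credential with the public key before storage.
ciphertext = public_key.encrypt(b"azure-api-key-value", oaep)

# CronManager side: decrypt with the private key when the secret is needed.
plaintext = private_key.decrypt(ciphertext, oaep)
assert plaintext == b"azure-api-key-value"
```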
diff --git a/src/llm_orchestration_service.py b/src/llm_orchestration_service.py
index a6cc98e..49b307d 100644
--- a/src/llm_orchestration_service.py
+++ b/src/llm_orchestration_service.py
@@ -27,9 +27,14 @@
 from src.response_generator.response_generate import stream_response_native
 from src.llm_orchestrator_config.llm_ochestrator_constants import (
     OUT_OF_SCOPE_MESSAGE,
+    OUT_OF_SCOPE_MESSAGES,
     TECHNICAL_ISSUE_MESSAGE,
+    TECHNICAL_ISSUE_MESSAGES,
     INPUT_GUARDRAIL_VIOLATION_MESSAGE,
+    INPUT_GUARDRAIL_VIOLATION_MESSAGES,
     OUTPUT_GUARDRAIL_VIOLATION_MESSAGE,
+    OUTPUT_GUARDRAIL_VIOLATION_MESSAGES,
+    get_localized_message,
     GUARDRAILS_BLOCKED_PHRASES,
     TEST_DEPLOYMENT_ENVIRONMENT,
     STREAM_TOKEN_LIMIT_MESSAGE,
@@ -43,6 +48,7 @@
 from src.utils.time_tracker import log_step_timings
 from src.utils.budget_tracker import get_budget_tracker
 from src.utils.production_store import get_production_store
+from src.utils.language_detector import detect_language, get_language_name
 from src.guardrails import NeMoRailsAdapter, GuardrailCheckResult
 from src.contextual_retrieval import ContextualRetriever
 from src.llm_orchestrator_config.exceptions import (
@@ -127,6 +133,16 @@ def process_orchestration_request(
         f"authorId: {request.authorId}, environment: {request.environment}"
     )

+    # STEP 0: Detect language from user message
+    detected_language = detect_language(request.message)
+    language_name = get_language_name(detected_language)
+    logger.info(
+        f"[{request.chatId}] Detected language: {language_name} ({detected_language})"
+    )
+
+    # Store detected language in request for use throughout pipeline
+    request._detected_language = detected_language
+
     # Initialize all service components
     components = self._initialize_service_components(request)

@@ -245,6 +261,16 @@ async def stream_orchestration_response(
     timing_dict: Dict[str, float] = {}
     streaming_start_time = datetime.now()

+    # STEP 0: Detect language from user message
+    detected_language = detect_language(request.message)
+    language_name = get_language_name(detected_language)
+    logger.info(
+        f"[{request.chatId}] Streaming request - Detected language: {language_name} ({detected_language})"
+    )
+
+    # Store detected language in request for use throughout pipeline
+    request._detected_language = detected_language
+
     # Use StreamManager for centralized tracking and guaranteed cleanup
     async with stream_manager.managed_stream(
         chat_id=request.chatId, author_id=request.authorId
@@ -339,7 +365,11 @@ async def stream_orchestration_response(
                 logger.info(
                     f"[{request.chatId}] [{stream_ctx.stream_id}] No relevant chunks - out of scope"
                 )
-                yield self._format_sse(request.chatId, OUT_OF_SCOPE_MESSAGE)
+                detected_lang = getattr(request, "_detected_language", "en")
+                localized_msg = get_localized_message(
+                    OUT_OF_SCOPE_MESSAGES, detected_lang
+                )
+                yield self._format_sse(request.chatId, localized_msg)
                 yield self._format_sse(request.chatId, "END")
                 self._log_costs(costs_dict)
                 log_step_timings(timing_dict, request.chatId)
@@ -369,7 +399,11 @@ async def stream_orchestration_response(
                 logger.info(
                     f"[{request.chatId}] [{stream_ctx.stream_id}] Question out of scope"
                 )
-                yield self._format_sse(request.chatId, OUT_OF_SCOPE_MESSAGE)
+                detected_lang = getattr(request, "_detected_language", "en")
+                localized_msg = get_localized_message(
+                    OUT_OF_SCOPE_MESSAGES, detected_lang
+                )
+                yield self._format_sse(request.chatId, localized_msg)
                 yield self._format_sse(request.chatId, "END")
                 self._log_costs(costs_dict)
                 log_step_timings(timing_dict, request.chatId)
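`_format_sse` itself is not part of this diff. Assuming it wraps each payload in a single SSE `data:` frame, the localized out-of-scope path above produces something like the following; the helper is a hypothetical stand-in, not the service's actual implementation.

```python
import json


def format_sse(chat_id: str, content: str) -> str:
    # Hypothetical mirror of self._format_sse: one payload per SSE "data:"
    # frame, blank line as the frame delimiter.
    payload = {"chatId": chat_id, "content": content}
    return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"


# Localized fallback message, then the "END" sentinel the client watches for:
print(format_sse("chat-1", "Vabandust, kuid mul pole piisavalt konteksti ..."), end="")
print(format_sse("chat-1", "END"), end="")
```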
@@ -535,21 +569,12 @@ async def bot_response_generator() -> AsyncIterator[str]:
                     logger.info(
                         f"[{request.chatId}] [{stream_ctx.stream_id}] Sending {len(doc_references)} document references before END"
                     )
-                    references_data = [
-                        ref.model_dump() for ref in doc_references
-                    ]
-                    references_message = {
-                        "chatId": request.chatId,
-                        "payload": {
-                            "type": "references",
-                            "references": references_data,
-                        },
-                        "timestamp": str(
-                            int(datetime.now().timestamp() * 1000)
-                        ),
-                        "sentTo": [],
-                    }
-                    yield f"data: {json_module.dumps(references_message)}\n\n"
+                    # Format references as markdown text
+                    refs_text = "\n\n**References:**\n" + "\n".join(
+                        f"{i + 1}. [{ref.document_url}]({ref.document_url})"
+                        for i, ref in enumerate(doc_references)
+                    )
+                    yield self._format_sse(request.chatId, refs_text)

                 yield self._format_sse(request.chatId, "END")

@@ -594,21 +619,12 @@ async def bot_response_generator() -> AsyncIterator[str]:
                     logger.info(
                         f"[{request.chatId}] [{stream_ctx.stream_id}] Sending {len(doc_references)} document references before END"
                     )
-                    references_data = [
-                        ref.model_dump() for ref in doc_references
-                    ]
-                    references_message = {
-                        "chatId": request.chatId,
-                        "payload": {
-                            "type": "references",
-                            "references": references_data,
-                        },
-                        "timestamp": str(
-                            int(datetime.now().timestamp() * 1000)
-                        ),
-                        "sentTo": [],
-                    }
-                    yield f"data: {json_module.dumps(references_message)}\n\n"
+                    # Format references as markdown text
+                    refs_text = "\n\n**References:**\n" + "\n".join(
+                        f"{i + 1}. [{ref.document_url}]({ref.document_url})"
+                        for i, ref in enumerate(doc_references)
+                    )
+                    yield self._format_sse(request.chatId, refs_text)

                 yield self._format_sse(request.chatId, "END")

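The structured JSON references event is replaced by plain markdown appended to the token stream. A standalone sketch of the exact formatting expression used in both hunks; the real code iterates reference objects and reads `ref.document_url`, while plain strings and invented URLs are used here.

```python
# Invented URLs, for illustration only.
doc_references = [
    "https://example.org/benefits/guide",
    "https://example.org/benefits/faq",
]

refs_text = "\n\n**References:**\n" + "\n".join(
    f"{i + 1}. [{url}]({url})" for i, url in enumerate(doc_references)
)
print(refs_text)
# **References:**
# 1. [https://example.org/benefits/guide](https://example.org/benefits/guide)
# 2. [https://example.org/benefits/faq](https://example.org/benefits/faq)
```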
@@ -1048,6 +1064,13 @@ def handle_input_guardrails(
         if not input_check_result.allowed:
             logger.warning(f"Input blocked by guardrails: {input_check_result.reason}")
+
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                INPUT_GUARDRAIL_VIOLATION_MESSAGES, detected_lang
+            )
+
             if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning input guardrail violation message."
@@ -1056,7 +1079,7 @@ def handle_input_guardrails(
                     llmServiceActive=True,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=True,
-                    content=INPUT_GUARDRAIL_VIOLATION_MESSAGE,
+                    content=localized_msg,
                     chunks=None,
                 )
             else:
@@ -1065,7 +1088,7 @@ def handle_input_guardrails(
                     llmServiceActive=True,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=True,
-                    content=INPUT_GUARDRAIL_VIOLATION_MESSAGE,
+                    content=localized_msg,
                 )

         logger.info("Input guardrails check passed")
@@ -1172,12 +1195,18 @@ def handle_output_guardrails(
             logger.warning(
                 f"Output blocked by guardrails: {output_check_result.reason}"
             )
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                OUTPUT_GUARDRAIL_VIOLATION_MESSAGES, detected_lang
+            )
+
             return OrchestrationResponse(
                 chatId=request.chatId,
                 llmServiceActive=True,
                 questionOutOfLLMScope=False,
                 inputGuardFailed=False,
-                content=OUTPUT_GUARDRAIL_VIOLATION_MESSAGE,
+                content=localized_msg,
             )

         logger.info("Output guardrails check passed")
@@ -1190,25 +1219,35 @@ def handle_output_guardrails(
     def _create_error_response(
         self, request: OrchestrationRequest
     ) -> OrchestrationResponse:
-        """Create standardized error response."""
+        """Create standardized error response with localized message."""
+        # Get language from request (set during language detection)
+        detected_lang = getattr(request, "_detected_language", "en")
+        localized_message = get_localized_message(
+            TECHNICAL_ISSUE_MESSAGES, detected_lang
+        )
+
         return OrchestrationResponse(
             chatId=request.chatId,
             llmServiceActive=False,
             questionOutOfLLMScope=False,
             inputGuardFailed=False,
-            content=TECHNICAL_ISSUE_MESSAGE,
+            content=localized_message,
         )

     def _create_out_of_scope_response(
         self, request: OrchestrationRequest
     ) -> OrchestrationResponse:
-        """Create standardized out-of-scope response."""
+        """Create standardized out-of-scope response with localized message."""
+        # Get language from request (set during language detection)
+        detected_lang = getattr(request, "_detected_language", "en")
+        localized_message = get_localized_message(OUT_OF_SCOPE_MESSAGES, detected_lang)
+
         return OrchestrationResponse(
             chatId=request.chatId,
             llmServiceActive=True,
             questionOutOfLLMScope=True,
             inputGuardFailed=False,
-            content=OUT_OF_SCOPE_MESSAGE,
+            content=localized_message,
         )

     def _store_production_inference_data(
@@ -2080,6 +2119,13 @@ def _generate_rag_response(
             logger.warning(
                 "Response generator unavailable – returning technical issue message."
             )
+
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                TECHNICAL_ISSUE_MESSAGES, detected_lang
+            )
+
             if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning technical issue message."
@@ -2088,8 +2134,8 @@ def _generate_rag_response(
                     llmServiceActive=False,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=False,
-                    content=TECHNICAL_ISSUE_MESSAGE,
-                    chunks=self._format_chunks_for_test_response(relevant_chunks),
+                    content=localized_msg,
+                    chunks=None,  # No chunks for technical failures
                 )
             else:
                 return OrchestrationResponse(
@@ -2147,20 +2193,18 @@ def _generate_rag_response(
                 output=answer,
             )
         if question_out_of_scope:
-            logger.info("Question determined out-of-scope – sending fixed message.")
-
-            # Extract document references even for out-of-scope
-            doc_references = self._extract_document_references(relevant_chunks)
+            logger.info(
+                "Question determined out-of-scope – sending fixed message without references."
+            )

-            # Append references to content
-            content_with_refs = OUT_OF_SCOPE_MESSAGE
-            if doc_references:
-                refs_text = "\n\n**References:**\n" + "\n".join(
-                    f"{i + 1}. {ref.document_url}"
-                    for i, ref in enumerate(doc_references)
-                )
-                content_with_refs += refs_text
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                OUT_OF_SCOPE_MESSAGES, detected_lang
+            )

+            # Do NOT include references when question is out of scope
+            # (data did not provide sufficient context to answer)
             if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning out-of-scope message."
@@ -2169,8 +2213,8 @@ def _generate_rag_response(
                     llmServiceActive=True,  # service OK; insufficient context
                     questionOutOfLLMScope=True,
                     inputGuardFailed=False,
-                    content=content_with_refs,
-                    chunks=self._format_chunks_for_test_response(relevant_chunks),
+                    content=localized_msg,
+                    chunks=None,  # No chunks when question is out of scope
                 )
             else:
                 return OrchestrationResponse(
@@ -2178,7 +2222,7 @@ def _generate_rag_response(
                     llmServiceActive=True,  # service OK; insufficient context
                     questionOutOfLLMScope=True,
                     inputGuardFailed=False,
-                    content=content_with_refs,
+                    content=localized_msg,
                 )

         # In-scope: return the answer as-is (NO citations)
@@ -2233,6 +2277,12 @@ def _generate_rag_response(
                 }
             )
             # Standardized technical issue; no second LLM call, no citations
+            # Get localized message based on detected language
+            detected_lang = getattr(request, "_detected_language", "en")
+            localized_msg = get_localized_message(
+                TECHNICAL_ISSUE_MESSAGES, detected_lang
+            )
+
            if request.environment == TEST_DEPLOYMENT_ENVIRONMENT:
                 logger.info(
                     "Test environment detected – returning technical issue message."
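For reviewers checking the FE contract: after these hunks, an out-of-scope turn in the test environment carries neither chunks nor a reference list. Roughly the payload shape, with field names taken from `OrchestrationResponse` in this diff and all values invented:

```python
# Illustrative only; the real object is an OrchestrationResponse, not a dict.
out_of_scope_test_response = {
    "chatId": "chat-1",
    "llmServiceActive": True,   # service OK; context was insufficient
    "questionOutOfLLMScope": True,
    "inputGuardFailed": False,
    "content": "Vabandust, kuid mul pole piisavalt konteksti ...",
    "chunks": None,             # was _format_chunks_for_test_response(...)
}
```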
@@ -2241,8 +2291,8 @@ def _generate_rag_response(
                     llmServiceActive=False,
                     questionOutOfLLMScope=False,
                     inputGuardFailed=False,
-                    content=TECHNICAL_ISSUE_MESSAGE,
-                    chunks=self._format_chunks_for_test_response(relevant_chunks),
+                    content=localized_msg,
+                    chunks=None,  # No chunks for technical failures
                 )
             else:
                 return OrchestrationResponse(
diff --git a/src/llm_orchestrator_config/config/llm_config.yaml b/src/llm_orchestrator_config/config/llm_config.yaml
index f7248a1..736a6e8 100644
--- a/src/llm_orchestrator_config/config/llm_config.yaml
+++ b/src/llm_orchestrator_config/config/llm_config.yaml
@@ -32,6 +32,13 @@ llm:
       temperature: 0.5
       deployment_name: "gpt-4o-deployment"

+    gpt-4.1:
+      model_type: "chat"
+      max_tokens: 13107
+      temperature: 0.6
+      deployment_name: "gpt-4.1"
+
+
   # AWS Bedrock Configuration
   aws_bedrock:
     cache: true  # Keep caching enabled (DSPY default)
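A quick parse check for the new model entry. The `llm` -> `azure` nesting is assumed from the hunk context (only `llm:` and the sibling `gpt-4o` block are visible here), so adjust the keys to the file's real structure:

```python
import yaml

with open("src/llm_orchestrator_config/config/llm_config.yaml") as fh:
    config = yaml.safe_load(fh)

# Nesting assumed, not shown in full in this hunk.
gpt41 = config["llm"]["azure"]["gpt-4.1"]
assert gpt41["deployment_name"] == "gpt-4.1"
assert gpt41["max_tokens"] == 13107
```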
diff --git a/src/llm_orchestrator_config/llm_ochestrator_constants.py b/src/llm_orchestrator_config/llm_ochestrator_constants.py
index b53b3d7..61af696 100644
--- a/src/llm_orchestrator_config/llm_ochestrator_constants.py
+++ b/src/llm_orchestrator_config/llm_ochestrator_constants.py
@@ -1,20 +1,36 @@
-OUT_OF_SCOPE_MESSAGE = (
-    "I apologize, but I’m unable to provide a complete response because the available "
-    "context does not sufficiently cover your request. Please try rephrasing or providing more details."
-)
-
-TECHNICAL_ISSUE_MESSAGE = (
-    "Technical issue with response generation\n"
-    "I apologize, but I’m currently unable to generate a response due to a temporary technical issue. "
-    "Please try again in a moment."
-)
+# Multilingual message dictionaries
+OUT_OF_SCOPE_MESSAGES = {
+    "et": "Vabandust, kuid mul pole piisavalt konteksti, et teie küsimusele vastata. Palun püüdke ümber sõnastada või lisage rohkem üksikasju.",
+    "ru": "Извините, но у меня недостаточно контекста для ответа на ваш вопрос. Пожалуйста, попробуйте переформулировать или предоставить больше деталей.",
+    "en": "I apologize, but I'm unable to provide a complete response because the available context does not sufficiently cover your request. Please try rephrasing or providing more details.",
+}
+
+TECHNICAL_ISSUE_MESSAGES = {
+    "et": "Tehniline probleem vastuse genereerimisel\nVabandust, kuid ma ei saa praegu vastust genereerida ajutise tehnilise probleemi tõttu. Palun proovige mõne hetke pärast uuesti.",
+    "ru": "Техническая проблема при генерации ответа\nИзвините, в настоящее время я не могу сгенерировать ответ из-за временной технической проблемы. Пожалуйста, попробуйте еще раз через мгновение.",
+    "en": "Technical issue with response generation\nI apologize, but I'm currently unable to generate a response due to a temporary technical issue. Please try again in a moment.",
+}
+
+INPUT_GUARDRAIL_VIOLATION_MESSAGES = {
+    "et": "Vabandust, kuid ma ei saa selle taotlusega aidata, kuna see rikub meie kasutustingimusi.",
+    "ru": "Извините, но я не могу помочь с этим запросом, так как он нарушает нашу политику использования.",
+    "en": "I apologize, but I'm unable to assist with that request as it violates our usage policies.",
+}
+
+OUTPUT_GUARDRAIL_VIOLATION_MESSAGES = {
+    "et": "Vabandust, kuid ma ei saa vastust anda, kuna see võib rikkuda meie kasutustingimusi.",
+    "ru": "Извините, но я не могу предоставить ответ, так как он может нарушить нашу политику использования.",
+    "en": "I apologize, but I'm unable to provide a response as it may violate our usage policies.",
+}
+
+# Legacy constants for backward compatibility (English defaults)
+OUT_OF_SCOPE_MESSAGE = OUT_OF_SCOPE_MESSAGES["en"]
+TECHNICAL_ISSUE_MESSAGE = TECHNICAL_ISSUE_MESSAGES["en"]
+INPUT_GUARDRAIL_VIOLATION_MESSAGE = INPUT_GUARDRAIL_VIOLATION_MESSAGES["en"]
+OUTPUT_GUARDRAIL_VIOLATION_MESSAGE = OUTPUT_GUARDRAIL_VIOLATION_MESSAGES["en"]

 UNKNOWN_SOURCE = "Unknown source"

-INPUT_GUARDRAIL_VIOLATION_MESSAGE = "I apologize, but I'm unable to assist with that request as it violates our usage policies."
-
-OUTPUT_GUARDRAIL_VIOLATION_MESSAGE = "I apologize, but I'm unable to provide a response as it may violate our usage policies."
-
 GUARDRAILS_BLOCKED_PHRASES = [
     "i'm sorry, i can't respond to that",
     "i cannot respond to that",
@@ -88,6 +104,22 @@
 VALIDATION_GENERIC_ERROR = "I apologize, but I couldn't process your request. Please check your input and try again."

+
+# Helper function to get localized messages
+def get_localized_message(message_dict: dict, language_code: str = "en") -> str:
+    """
+    Get message in the specified language, falling back to English.
+
+    Args:
+        message_dict: Dictionary with language codes as keys
+        language_code: Language code ('et', 'ru', 'en')
+
+    Returns:
+        Localized message string
+    """
+    return message_dict.get(language_code, message_dict.get("en", ""))
+
+
 # Service endpoints
 RAG_SEARCH_RESQL = "http://resql:8082/rag-search"
 RAG_SEARCH_RUUTER_PUBLIC = "http://ruuter-public:8086/rag-search"
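The helper's fallback chain (requested language -> English -> empty string) can be exercised directly. A minimal check, assuming the module imports cleanly outside the service:

```python
from src.llm_orchestrator_config.llm_ochestrator_constants import (
    OUT_OF_SCOPE_MESSAGES,
    get_localized_message,
)

print(get_localized_message(OUT_OF_SCOPE_MESSAGES, "ru"))  # Russian variant
print(get_localized_message(OUT_OF_SCOPE_MESSAGES, "de"))  # unsupported code -> English
print(get_localized_message({}, "et"))                     # no "en" key either -> ""
```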
+    rewrites: list[str] = dspy.OutputField(
+        desc="Exactly N refined variations of the question in THE SAME LANGUAGE as input, each a single sentence."
     )
diff --git a/src/prompt_refine_manager/prompt_refiner.py b/src/prompt_refine_manager/prompt_refiner.py
index 6ca42d2..b24c275 100644
--- a/src/prompt_refine_manager/prompt_refiner.py
+++ b/src/prompt_refine_manager/prompt_refiner.py
@@ -27,6 +27,13 @@ class ConversationHistory(BaseModel):
 class PromptRefiner(dspy.Signature):
     """Produce N distinct, concise rewrites of the user's question using chat history.

+    CRITICAL LANGUAGE RULE:
+    - The rewrites MUST be in the SAME language as the input question
+    - Estonian question → Estonian rewrites
+    - Russian question → Russian rewrites
+    - English question → English rewrites
+    - Preserve the natural language of the original question
+
     Constraints:
     - Preserve the original intent; don't inject unsupported constraints.
     - Resolve pronouns with context when safe; avoid changing semantics.
@@ -36,11 +43,13 @@ class PromptRefiner(dspy.Signature):
     """

     history: str = dspy.InputField(desc="Recent conversation history (turns).")
-    question: str = dspy.InputField(desc="The user's latest question to refine.")
+    question: str = dspy.InputField(
+        desc="The user's latest question to refine. Preserve its language in rewrites."
+    )
     n: int = dspy.InputField(desc="Number of rewrites to produce (N).")
     rewrites: list[str] = dspy.OutputField(
-        desc="Exactly N refined variations of the question, each a single sentence."
+        desc="Exactly N refined variations of the question in THE SAME LANGUAGE as input, each a single sentence."
     )
diff --git a/src/response_generator/response_generate.py b/src/response_generator/response_generate.py
index 34d27d4..23aa744 100644
--- a/src/response_generator/response_generate.py
+++ b/src/response_generator/response_generate.py
@@ -22,16 +22,27 @@
 class ResponseGenerator(dspy.Signature):
     """Produce a grounded answer from the provided context ONLY.

+    CRITICAL LANGUAGE RULE:
+    - The answer MUST be in the SAME language as the input question
+    - Estonian question → Estonian answer
+    - Russian question → Russian answer
+    - English question → English answer
+    - Maintain the natural language flow and grammar of the detected language
+
     Rules:
     - Use ONLY the provided context blocks; do not invent facts.
     - If the context is insufficient, set questionOutOfLLMScope=true and say so briefly.
     - Do not include citations in the 'answer' field.
     """

-    question: str = dspy.InputField()
+    question: str = dspy.InputField(
+        desc="User's question. Answer in the SAME language as this question."
+    )
     context_blocks: List[str] = dspy.InputField()
     citations: List[str] = dspy.InputField()
-    answer: str = dspy.OutputField(desc="Human-friendly answer without citations")
+    answer: str = dspy.OutputField(
+        desc="Human-friendly answer in THE SAME LANGUAGE as the question, without citations"
+    )
     questionOutOfLLMScope: bool = dspy.OutputField(
         desc="True if context is insufficient to answer"
     )
@@ -40,6 +51,8 @@
 class ScopeChecker(dspy.Signature):
     """Quick check if question can be answered from context.

+    LANGUAGE NOTE: This is an internal check; language doesn't matter for scope determination.
+
     Rules:
     - Return True ONLY if context is completely insufficient
     - Return False if context has ANY relevant information
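Both signatures are plain dspy Signatures, so the language rule can be spot-checked with a one-off `dspy.Predict` call. A sketch that assumes an LM has already been configured; the model string is illustrative only:

```python
import dspy

from src.prompt_refine_manager.prompt_refiner import PromptRefiner

# Assumed setup; the service wires its LM elsewhere.
dspy.configure(lm=dspy.LM("azure/gpt-4.1"))

refine = dspy.Predict(PromptRefiner)
result = refine(
    history="Kasutaja küsis varem peretoetuste kohta.",
    question="Mis on sünnitoetus?",
    n=3,
)
print(result.rewrites)  # expect three Estonian single-sentence rewrites
```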
diff --git a/src/utils/language_detector.py b/src/utils/language_detector.py
new file mode 100644
index 0000000..ba289c9
--- /dev/null
+++ b/src/utils/language_detector.py
@@ -0,0 +1,116 @@
+"""Language detection utility for multilingual support.
+
+Detects Estonian, Russian, and English based on character patterns and common words.
+"""
+
+import re
+from typing import Literal
+from loguru import logger
+
+LanguageCode = Literal["et", "ru", "en"]
+
+
+def detect_language(text: str) -> LanguageCode:
+    """
+    Detect language from input text.
+
+    Detection Strategy:
+    1. Check for Cyrillic characters (Russian)
+    2. Check for Estonian-specific characters
+    3. Check for Estonian common words
+    4. Default to English
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        Language code: 'et' (Estonian), 'ru' (Russian), 'en' (English)
+
+    Examples:
+        >>> detect_language("Mis on sünnitoetus?")
+        'et'
+        >>> detect_language("Что такое пособие?")
+        'ru'
+        >>> detect_language("What is the benefit?")
+        'en'
+    """
+    if not text or not text.strip():
+        logger.warning(
+            "Empty text provided for language detection, defaulting to English"
+        )
+        return "en"
+
+    text_sample = text.strip()[:500]  # Use first 500 chars for detection
+
+    # Check for Cyrillic characters (Russian) - use percentage-based detection
+    cyrillic_count = len(re.findall(r"[а-яА-ЯёЁ]", text_sample))
+    total_alpha = len(re.findall(r"[a-zA-Zа-яА-ЯёЁõäöüšžÕÄÖÜŠŽ]", text_sample))
+
+    if (
+        total_alpha > 0 and cyrillic_count / total_alpha > 0.25
+    ):  # 25% Cyrillic threshold
+        logger.debug(
+            f"Detected Russian (Cyrillic: {cyrillic_count}/{total_alpha} = {cyrillic_count / total_alpha:.1%})"
+        )
+        return "ru"
+
+    # Check for Estonian-specific characters (õ, ä, ö, ü, š, ž)
+    estonian_chars = re.findall(r"[õäöüšž]", text_sample, re.IGNORECASE)
+    if len(estonian_chars) > 0:
+        logger.debug(f"Detected Estonian (special chars: {len(estonian_chars)})")
+        return "et"
+
+    # Check for Estonian common words - use distinctive markers to avoid English false positives
+    estonian_markers = [
+        "kuidas",
+        "miks",
+        "kus",
+        "millal",
+        "kes",
+        "võib",
+        "olen",
+        "oled",
+        "see",
+        "seda",
+        "jah",
+        "või",
+        "ning",
+        "siis",
+        "veel",
+        "aga",
+        "kuid",
+        "nii",
+        "nagu",
+        "oli",
+        "mis",
+    ]
+
+    # Tokenize and check for Estonian markers
+    words = re.findall(r"\b\w+\b", text_sample.lower())
+    estonian_word_count = sum(1 for word in words if word in estonian_markers)
+
+    # Scale threshold based on text length for better accuracy
+    threshold = 1 if len(words) < 10 else 2
+    if estonian_word_count >= threshold:
+        logger.debug(
+            f"Detected Estonian (marker words: {estonian_word_count}/{len(words)}, threshold: {threshold})"
+        )
+        return "et"
+
+    # Default to English
+    logger.debug("Detected English (default)")
+    return "en"
+
+
+def get_language_name(language_code: LanguageCode) -> str:
+    """
+    Get human-readable language name from code.
+
+    Args:
+        language_code: ISO 639-1 language code
+
+    Returns:
+        Full language name
+    """
+    language_names = {"et": "Estonian", "ru": "Russian", "en": "English"}
+    return language_names.get(language_code, "Unknown")
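A small smoke test of the heuristic. Note the last case: a short Estonian phrase with no special characters and no marker words falls through to the English default, which is the built-in trade-off of this word-list approach:

```python
from src.utils.language_detector import detect_language, get_language_name

for text in [
    "Mis on sünnitoetus?",   # 'et' (ü triggers the special-character rule)
    "Что такое пособие?",    # 'ru' (Cyrillic ratio well above 25%)
    "What is the benefit?",  # 'en' (default)
    "Tere, anna palun abi",  # 'en' as well: no special chars, no marker words
]:
    code = detect_language(text)
    print(f"{text!r} -> {code} ({get_language_name(code)})")
```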