From 6f9bba69df50db411a648f28ecc91581b9270dfe Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Wed, 17 Dec 2025 17:29:15 -0500
Subject: [PATCH 01/14] investigating separating out documents from the rest
 of the message history and instructions.
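
Instead of pasting retrieved page content into the prompt text, the retrieved
chunks now travel through graph state as a structured `documents` list and are
handed to the model at generation time through the Bedrock Converse
passthrough fields, with the Cohere-specific `/citations` field requested
back. A minimal sketch of the call this sets up, assuming a langchain-aws
ChatBedrockConverse model backed by a Cohere Command model (the "title" and
"snippet" keys below are illustrative, not a fixed schema; Cohere treats
document fields as free-form strings):

    # sketch only: document field names here are examples
    response = model.invoke(
        messages,
        additional_model_request_fields={
            "documents": [
                {"id": "doc_1", "title": "Interview, tape 2", "snippet": "..."},
            ]
        },
        additional_model_response_field_paths=["/citations"],
    )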
---
 willa/chatbot/graph_manager.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 13dea78..d3347c3 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -22,6 +22,7 @@ class WillaChatbotState(TypedDict):
     docs_context: NotRequired[str]
     search_query: NotRequired[str]
     tind_metadata: NotRequired[str]
+    documents: NotRequired[list[Any]]
     context: NotRequired[dict[str, Any]]


@@ -87,17 +88,25 @@ def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str]:
         vector_store = self._vector_store

         if not search_query or not vector_store:
-            return {"docs_context": "", "tind_metadata": ""}
+            return {"docs_context": "", "tind_metadata": "", "documents": []}

         # Search for relevant documents
         retriever = vector_store.as_retriever(search_kwargs={"k": int(CONFIG['K_VALUE'])})
         matching_docs = retriever.invoke(search_query)
+        formatted_documents = [
+            {
+                "page_content": doc.page_content,
+                "start_index": str(doc.metadata.get('start_index')) if doc.metadata.get('start_index') else '',
+                "total_pages": str(doc.metadata.get('total_pages')) if doc.metadata.get('total_pages') else '',
+            }
+            for doc in matching_docs
+        ]

         # Format context and metadata
         docs_context = '\n\n'.join(doc.page_content for doc in matching_docs)
         tind_metadata = format_tind_context.get_tind_context(matching_docs)

-        return {"docs_context": docs_context, "tind_metadata": tind_metadata}
+        return {"docs_context": docs_context, "tind_metadata": tind_metadata, "documents": formatted_documents}

     # This should be refactored probably. Very bulky
     def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMessage]]:
@@ -107,6 +116,7 @@ def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMess
         docs_context = state.get("docs_context", "")
         tind_metadata = state.get("tind_metadata", "")
         model = self._model
+        documents = state.get("documents", [])

         if not model:
             return {"messages": [AIMessage(content="Model not available.")]}
@@ -121,16 +131,20 @@ def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMess
             return {"messages": [AIMessage(content="I'm sorry, I didn't receive a question.")]}

         prompt = get_langfuse_prompt()
-        system_messages = prompt.invoke({'context': docs_context,
-                                         'question': latest_message.content})
+        system_messages = prompt.invoke({})
+
         if hasattr(system_messages, "messages"):
             all_messages = summarized_conversation + system_messages.messages
         else:
             all_messages = summarized_conversation + [system_messages]

         # Get response from model
-        response = model.invoke(all_messages)
-
+        response = model.invoke(
+            all_messages,
+            additional_model_request_fields={"documents": documents},
+            additional_model_response_field_paths=["/citations"]
+        )
+        # print(response.response_metadata)
         # Create clean response content
         response_content = str(response.content) if hasattr(response, 'content') else str(response)
         response_messages: list[AnyMessage] = [AIMessage(content=response_content),

From 3f81a6c9380250e160914076ab4ac8711161c8f9 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Fri, 19 Dec 2025 11:27:10 -0500
Subject: [PATCH 02/14] preserving cohere response citations - this gets
 cohere specific response field that includes citations for the response text

---
 willa/chatbot/graph_manager.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index d3347c3..298d973 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -23,6 +23,7 @@ class WillaChatbotState(TypedDict):
     search_query: NotRequired[str]
     tind_metadata: NotRequired[str]
     documents: NotRequired[list[Any]]
+    citations: NotRequired[list[dict[str, Any]]]
     context: NotRequired[dict[str, Any]]


@@ -144,7 +145,12 @@ def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMess
             additional_model_request_fields={"documents": documents},
             additional_model_response_field_paths=["/citations"]
         )
-        # print(response.response_metadata)
+        citations = response.response_metadata.get('additionalModelResponseFields').get('citations') if response.response_metadata else None
+
+        # add citations to graph state
+        if citations:
+            state['citations'] = citations
+
         # Create clean response content
         response_content = str(response.content) if hasattr(response, 'content') else str(response)
         response_messages: list[AnyMessage] = [AIMessage(content=response_content),

From 8869188bbc021dde3fd38a0cb73bcb375a814334 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 22 Dec 2025 14:51:07 -0500
Subject: [PATCH 03/14] add prepare generation node - temporarily add raw
 citations to response.
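
The prompt assembly moves into its own prepare_for_generation node so that
generate_response only talks to the model. One caveat worth recording: the
`state['citations'] = citations` assignment mutates the node's local copy of
the state, and LangGraph only applies updates that a node returns, so keeping
citations in graph state would need something like

    return {"messages": response_messages, "citations": citations or []}

(a sketch, not what this patch does). For now the raw citations are appended
to the response text so the end-to-end plumbing can be inspected while a real
output format is worked out.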
---
 willa/chatbot/graph_manager.py | 55 ++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 298d973..bd4ae5e 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -53,13 +53,15 @@ def _create_workflow(self) -> CompiledStateGraph:
         workflow.add_node("summarize", summarization_node)
         workflow.add_node("prepare_search", self._prepare_search_query)
         workflow.add_node("retrieve_context", self._retrieve_context)
+        workflow.add_node("prepare_for_generation", self._prepare_for_generation)
         workflow.add_node("generate_response", self._generate_response)

         # Define edges
         workflow.add_edge("filter_messages", "summarize")
         workflow.add_edge("summarize", "prepare_search")
         workflow.add_edge("prepare_search", "retrieve_context")
-        workflow.add_edge("retrieve_context", "generate_response")
+        workflow.add_edge("retrieve_context", "prepare_for_generation")
+        workflow.add_edge("prepare_for_generation", "generate_response")

         workflow.set_entry_point("filter_messages")
         workflow.set_finish_point("generate_response")
@@ -109,50 +111,51 @@ def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str]:

         return {"docs_context": docs_context, "tind_metadata": tind_metadata, "documents": formatted_documents}

-    # This should be refactored probably. Very bulky
-    def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMessage]]:
-        """Generate response using the model."""
+    def _prepare_for_generation(self, state: WillaChatbotState) -> dict[str, list[AnyMessage]]:
+        """Prepare the current and past messages for response generation."""
         messages = state["messages"]
         summarized_conversation = state.get("summarized_messages", messages)
-        docs_context = state.get("docs_context", "")
-        tind_metadata = state.get("tind_metadata", "")
-        model = self._model
-        documents = state.get("documents", [])
-
-        if not model:
-            return {"messages": [AIMessage(content="Model not available.")]}
-
-        # Get the latest human message
-        latest_message = next(
-            (msg for msg in reversed(messages) if isinstance(msg, HumanMessage)),
-            None
-        )
-
-        if not latest_message:
+
+        if not any(isinstance(msg, HumanMessage) for msg in messages):
             return {"messages": [AIMessage(content="I'm sorry, I didn't receive a question.")]}
-
+
         prompt = get_langfuse_prompt()
         system_messages = prompt.invoke({})
-
+
         if hasattr(system_messages, "messages"):
             all_messages = summarized_conversation + system_messages.messages
         else:
             all_messages = summarized_conversation + [system_messages]
+
+        return {"messages": all_messages}
+
+    def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMessage]]:
+        """Generate response using the model."""
+        tind_metadata = state.get("tind_metadata", "")
+        model = self._model
+        documents = state.get("documents", [])
+        messages = state["messages"]
+
+        if not model:
+            return {"messages": [AIMessage(content="Model not available.")]}

         # Get response from model
         response = model.invoke(
-            all_messages,
+            messages,
             additional_model_request_fields={"documents": documents},
             additional_model_response_field_paths=["/citations"]
         )
         citations = response.response_metadata.get('additionalModelResponseFields').get('citations') if response.response_metadata else None

-        # add citations to graph state
-        if citations:
-            state['citations'] = citations
-
         # Create clean response content
         response_content = str(response.content) if hasattr(response, 'content') else str(response)
+
+        if citations:
+            state['citations'] = citations
+            response_content += "\n\nCitations:\n"
+            for citation in citations:
+                response_content += f"- {citation.get('text', '')} (docs: {citation.get('document_ids', [])})\n"
+
         response_messages: list[AnyMessage] = [AIMessage(content=response_content),
                                                ChatMessage(content=tind_metadata, role='TIND',
                                                            response_metadata={'tind': True})]

From ea8e7fb9c316a3c8bb49c6b0b12e6a428e4905f6 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Tue, 23 Dec 2025 16:16:04 -0500
Subject: [PATCH 04/14] improving citation output prep
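
Chunk ids now embed the TIND record id, and the per-chunk suffix is stripped
and de-duplicated before the citation line is rendered. dict.fromkeys keeps
first-seen order, so record ids come out in retrieval order. For example
(hypothetical ids):

    >>> import re
    >>> ids = ['990012_1', '990012_2', '990034_3']
    >>> list(dict.fromkeys(re.sub(r'_\d*$', '', d) for d in ids))
    ['990012', '990034']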
---
 willa/chatbot/graph_manager.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index bd4ae5e..8c948d3 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -1,4 +1,5 @@
 """Manages the shared state and workflow for Willa chatbots."""
+import re
 from typing import Any, Optional, Annotated, NotRequired

 from typing_extensions import TypedDict
@@ -19,12 +20,10 @@ class WillaChatbotState(TypedDict):
     messages: Annotated[list[AnyMessage], add_messages]
     filtered_messages: NotRequired[list[AnyMessage]]
     summarized_messages: NotRequired[list[AnyMessage]]
-    docs_context: NotRequired[str]
     search_query: NotRequired[str]
     tind_metadata: NotRequired[str]
     documents: NotRequired[list[Any]]
     citations: NotRequired[list[dict[str, Any]]]
-    context: NotRequired[dict[str, Any]]


 class GraphManager:  # pylint: disable=too-few-public-methods
@@ -91,25 +90,27 @@ def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str]:
         vector_store = self._vector_store

         if not search_query or not vector_store:
-            return {"docs_context": "", "tind_metadata": "", "documents": []}
+            return {"tind_metadata": "", "documents": []}

         # Search for relevant documents
         retriever = vector_store.as_retriever(search_kwargs={"k": int(CONFIG['K_VALUE'])})
         matching_docs = retriever.invoke(search_query)
         formatted_documents = [
             {
+                "id": f"{doc.metadata.get('tind_metadata', {}).get('tind_id', [''])[0]}_{i}",
                 "page_content": doc.page_content,
-                "start_index": str(doc.metadata.get('start_index')) if doc.metadata.get('start_index') else '',
-                "total_pages": str(doc.metadata.get('total_pages')) if doc.metadata.get('total_pages') else '',
+                "title": doc.metadata.get('tind_metadata', {}).get('title', [''])[0],
+                "project": doc.metadata.get('tind_metadata', {}).get('isPartOf', [''])[0],
+                "tind_link": format_tind_context.get_tind_url(
+                    doc.metadata.get('tind_metadata', {}).get('tind_id', [''])[0])
             }
-            for doc in matching_docs
+            for i, doc in enumerate(matching_docs, 1)
         ]

-        # Format context and metadata
-        docs_context = '\n\n'.join(doc.page_content for doc in matching_docs)
+        # Format tind metadata
         tind_metadata = format_tind_context.get_tind_context(matching_docs)

-        return {"docs_context": docs_context, "tind_metadata": tind_metadata, "documents": formatted_documents}
+        return {"tind_metadata": tind_metadata, "documents": formatted_documents}

     def _prepare_for_generation(self, state: WillaChatbotState) -> dict[str, list[AnyMessage]]:
         """Prepare the current and past messages for response generation."""
@@ -154,7 +155,9 @@ def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMess
             state['citations'] = citations
             response_content += "\n\nCitations:\n"
             for citation in citations:
-                response_content += f"- {citation.get('text', '')} (docs: {citation.get('document_ids', [])})\n"
+                doc_ids = list(dict.fromkeys([re.sub(r'_\d*$', '', doc_id)
+                                              for doc_id in citation.get('document_ids', [])]))
+                response_content += f"- {citation.get('text', '')} ({', '.join(doc_ids)})\n"

         response_messages: list[AnyMessage] = [AIMessage(content=response_content),
                                                ChatMessage(content=tind_metadata, role='TIND',

From 595169cc75e08463f9bc413dc4e6af800618aaa9 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 5 Jan 2026 15:19:33 -0500
Subject: [PATCH 05/14] strip citations
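
Backs out the inline citation rendering, along with the citations state key
and the `/citations` response-field request, since nothing consumes them once
the rendering is gone. The document-grounded request itself stays; the
intent, presumably, is to bring citations back once there is a proper place
to surface them instead of text appended to the answer.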
---
 willa/chatbot/graph_manager.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 8c948d3..3f05b8e 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -1,5 +1,4 @@
 """Manages the shared state and workflow for Willa chatbots."""
-import re
 from typing import Any, Optional, Annotated, NotRequired

 from typing_extensions import TypedDict
@@ -23,8 +22,6 @@ class WillaChatbotState(TypedDict):
     search_query: NotRequired[str]
     tind_metadata: NotRequired[str]
     documents: NotRequired[list[Any]]
-    citations: NotRequired[list[dict[str, Any]]]
-

 class GraphManager:  # pylint: disable=too-few-public-methods
     """Manages the shared LangGraph workflow for all chatbot instances."""
@@ -143,22 +140,12 @@ def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMess
         # Get response from model
         response = model.invoke(
             messages,
-            additional_model_request_fields={"documents": documents},
-            additional_model_response_field_paths=["/citations"]
+            additional_model_request_fields={"documents": documents}
         )
-        citations = response.response_metadata.get('additionalModelResponseFields').get('citations') if response.response_metadata else None

         # Create clean response content
         response_content = str(response.content) if hasattr(response, 'content') else str(response)

-        if citations:
-            state['citations'] = citations
-            response_content += "\n\nCitations:\n"
-            for citation in citations:
-                doc_ids = list(dict.fromkeys([re.sub(r'_\d*$', '', doc_id)
-                                              for doc_id in citation.get('document_ids', [])]))
-                response_content += f"- {citation.get('text', '')} ({', '.join(doc_ids)})\n"
-
         response_messages: list[AnyMessage] = [AIMessage(content=response_content),
                                                ChatMessage(content=tind_metadata, role='TIND',
                                                            response_metadata={'tind': True})]

From 51c264e89432a289e30a3f7cf09f8355a860e5d0 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 5 Jan 2026 15:36:40 -0500
Subject: [PATCH 06/14] fix return type for retrieve docs

---
 willa/chatbot/graph_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 3f05b8e..8ae1556 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -81,7 +81,7 @@ def _prepare_search_query(self, state: WillaChatbotState) -> dict[str, str]:
         search_query = '\n'.join(str(msg.content) for msg in messages if hasattr(msg, 'content'))
         return {"search_query": search_query}

-    def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str]:
+    def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[Any]]:
         """Retrieve relevant context from vector store."""
         search_query = state.get("search_query", "")
         vector_store = self._vector_store

From 140ccc009032aa73fe8c35bbd01d5616eb99dd22 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 5 Jan 2026 15:38:47 -0500
Subject: [PATCH 07/14] pylint fixes
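
Note for reviewers: several of the hunks below look like no-ops because the
removed and added lines render identically; the removed blank lines almost
certainly carry trailing whitespace, which is what pylint was flagging, and
the replacements are genuinely empty.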
---
 willa/chatbot/graph_manager.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 8ae1556..26597fe 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -113,18 +113,18 @@ def _prepare_for_generation(self, state: WillaChatbotState) -> dict[str, list[An
         """Prepare the current and past messages for response generation."""
         messages = state["messages"]
         summarized_conversation = state.get("summarized_messages", messages)
-    
+
         if not any(isinstance(msg, HumanMessage) for msg in messages):
             return {"messages": [AIMessage(content="I'm sorry, I didn't receive a question.")]}
-    
+
         prompt = get_langfuse_prompt()
         system_messages = prompt.invoke({})
-    
+
         if hasattr(system_messages, "messages"):
             all_messages = summarized_conversation + system_messages.messages
         else:
             all_messages = summarized_conversation + [system_messages]
-    
+
         return {"messages": all_messages}

     def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMessage]]:
@@ -145,7 +145,7 @@ def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMess

         # Create clean response content
         response_content = str(response.content) if hasattr(response, 'content') else str(response)
-    
+
         response_messages: list[AnyMessage] = [AIMessage(content=response_content),
                                                ChatMessage(content=tind_metadata, role='TIND',
                                                            response_metadata={'tind': True})]

From a8579b8786bd9e377e27d1818de91579aee2fde9 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 5 Jan 2026 15:59:34 -0500
Subject: [PATCH 08/14] truncate search query if needed ... suggestions from
 @awilfox
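
The joined message history doubles as the retriever query, so a failed or
skipped summarization could let it grow without bound. Slicing from the end
keeps the newest text, since the messages are joined oldest to newest:

    >>> 'abcdefgh'[-4:]
    'efgh'

The 2048-character cap is a conservative bound chosen for the embedding
query, not a documented provider limit.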
---
 willa/chatbot/graph_manager.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 26597fe..19a8e6d 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -23,6 +23,7 @@ class WillaChatbotState(TypedDict):
     tind_metadata: NotRequired[str]
     documents: NotRequired[list[Any]]

+
 class GraphManager:  # pylint: disable=too-few-public-methods
     """Manages the shared LangGraph workflow for all chatbot instances."""

@@ -79,6 +80,11 @@ def _prepare_search_query(self, state: WillaChatbotState) -> dict[str, str]:

         # summarization may include a system message as well as any human or ai messages
         search_query = '\n'.join(str(msg.content) for msg in messages if hasattr(msg, 'content'))
+
+        # if summarization fails or some other issue, truncate to the last 2048 characters
+        if len(search_query) > 2048:
+            search_query = search_query[-2048:]
+
         return {"search_query": search_query}

     def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[Any]]:
@@ -94,7 +100,7 @@ def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[An
         matching_docs = retriever.invoke(search_query)
         formatted_documents = [
             {
-                "id": f"{doc.metadata.get('tind_metadata', {}).get('tind_id', [''])[0]}_{i}",
+                "id": f"{i}_{doc.metadata.get('tind_metadata', {}).get('tind_id', [''])[0]}",
                 "page_content": doc.page_content,
                 "title": doc.metadata.get('tind_metadata', {}).get('title', [''])[0],
                 "project": doc.metadata.get('tind_metadata', {}).get('isPartOf', [''])[0],
                 "tind_link": format_tind_context.get_tind_url(

From 21904f6e75d7d8a5e0cd08c7d7a9e7e5581a07fc Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 5 Jan 2026 16:03:10 -0500
Subject: [PATCH 09/14] pylint fix. aaarrrrgghghghghg

---
 willa/chatbot/graph_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 19a8e6d..f74ea9e 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -80,11 +80,11 @@ def _prepare_search_query(self, state: WillaChatbotState) -> dict[str, str]:

         # summarization may include a system message as well as any human or ai messages
         search_query = '\n'.join(str(msg.content) for msg in messages if hasattr(msg, 'content'))
-    
+
         # if summarization fails or some other issue, truncate to the last 2048 characters
         if len(search_query) > 2048:
             search_query = search_query[-2048:]
-    
+
         return {"search_query": search_query}

     def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[Any]]:

From 53ea41894c30b8061073cdc956fc8f55cd2b4ce8 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 5 Jan 2026 16:09:58 -0500
Subject: [PATCH 10/14] possible summarization fix

---
 willa/chatbot/graph_manager.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index f74ea9e..3dcb4b1 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -41,6 +41,7 @@ def _create_workflow(self) -> CompiledStateGraph:
         summarization_node = SummarizationNode(
             max_tokens=int(CONFIG['SUMMARIZATION_MAX_TOKENS']),
             model=self._model,
+            token_counter=self._model.get_num_tokens_from_messages,
             input_messages_key="filtered_messages",
             output_messages_key="summarized_messages"
         )

From 1843a05d062cbf9900ea1c2b7e4e77e7af5e0159 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Mon, 5 Jan 2026 17:39:57 -0500
Subject: [PATCH 11/14] fixed summarization issues
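
The assembled prompt was being returned under the `messages` key, and since
`messages` is an add_messages channel, every turn appended the system prompt
back into the persisted conversation. Writing the assembled list to a
dedicated key keeps prompt assembly out of the stored history:

    return {"messages_for_generation": all_messages}

_filter_messages now also drops system messages before summarization, and the
token_counter override from the previous commit is removed again.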
---
 willa/chatbot/graph_manager.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index 3dcb4b1..ae2f8a8 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -19,6 +19,7 @@ class WillaChatbotState(TypedDict):
     messages: Annotated[list[AnyMessage], add_messages]
     filtered_messages: NotRequired[list[AnyMessage]]
    summarized_messages: NotRequired[list[AnyMessage]]
+    messages_for_generation: NotRequired[list[AnyMessage]]
     search_query: NotRequired[str]
     tind_metadata: NotRequired[str]
     documents: NotRequired[list[Any]]
@@ -41,7 +42,6 @@ def _create_workflow(self) -> CompiledStateGraph:
         summarization_node = SummarizationNode(
             max_tokens=int(CONFIG['SUMMARIZATION_MAX_TOKENS']),
             model=self._model,
-            token_counter=self._model.get_num_tokens_from_messages,
             input_messages_key="filtered_messages",
             output_messages_key="summarized_messages"
         )
@@ -70,7 +70,10 @@ def _filter_messages(self, state: WillaChatbotState) -> dict[str, list[AnyMessag
         """Filter out TIND messages from the conversation history."""
         messages = state["messages"]

-        filtered = [msg for msg in messages if 'tind' not in msg.response_metadata]
+        filtered = [
+            msg for msg in messages
+            if 'tind' not in msg.response_metadata and msg.type != "system"
+        ]
         return {"filtered_messages": filtered}

     def _prepare_search_query(self, state: WillaChatbotState) -> dict[str, str]:
@@ -132,14 +135,14 @@ def _prepare_for_generation(self, state: WillaChatbotState) -> dict[str, list[An
         else:
             all_messages = summarized_conversation + [system_messages]

-        return {"messages": all_messages}
+        return {"messages_for_generation": all_messages}

     def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMessage]]:
         """Generate response using the model."""
         tind_metadata = state.get("tind_metadata", "")
         model = self._model
         documents = state.get("documents", [])
-        messages = state["messages"]
+        messages = state["messages_for_generation"]

         if not model:
             return {"messages": [AIMessage(content="Model not available.")]}

From 20fc9f9f6cf736e07828821713a1cf57902b3b99 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Wed, 7 Jan 2026 10:40:14 -0500
Subject: [PATCH 12/14] typing

---
 willa/chatbot/graph_manager.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py
index ae2f8a8..7bfbcb6 100644
--- a/willa/chatbot/graph_manager.py
+++ b/willa/chatbot/graph_manager.py
@@ -1,5 +1,5 @@
 """Manages the shared state and workflow for Willa chatbots."""
-from typing import Any, Optional, Annotated, NotRequired
+from typing import Optional, Annotated, NotRequired

 from typing_extensions import TypedDict
@@ -22,7 +22,7 @@ class WillaChatbotState(TypedDict):
     messages_for_generation: NotRequired[list[AnyMessage]]
     search_query: NotRequired[str]
     tind_metadata: NotRequired[str]
-    documents: NotRequired[list[Any]]
+    documents: NotRequired[list[dict[str, str]]]


 class GraphManager:  # pylint: disable=too-few-public-methods
@@ -91,7 +91,7 @@ def _prepare_search_query(self, state: WillaChatbotState) -> dict[str, str]:

         return {"search_query": search_query}

-    def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[Any]]:
+    def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[dict[str, str]]]:
         """Retrieve relevant context from vector store."""
         search_query = state.get("search_query", "")
         vector_store = self._vector_store

From aa1c5b2c6f568714d9cb39e7b95ab483af03f304 Mon Sep 17 00:00:00 2001
From: Jason Raitz
Date: Wed, 7 Jan 2026 11:28:57 -0500
Subject: [PATCH 13/14] clean up refactor and addressing some @awilfox and
 copilot suggestions
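
The inline comprehension moves into a _format_retrieved_documents helper,
response_metadata access is made defensive with getattr in case a message
type lacks the attribute, and _generate_response falls back to the raw
message history when messages_for_generation is absent (for example if the
node is ever exercised in isolation). Each formatted document comes out
shaped like this (values illustrative):

    {
        "id": "1_990012",
        "page_content": "...",
        "title": "Interview with ...",
        "project": "Oral History Project",
        "tind_link": "https://<tind-instance>/record/990012"
    }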
+ "page_content": doc.page_content, + "title": tind_metadata.get('title', [''])[0], + "project": tind_metadata.get('isPartOf', [''])[0], + "tind_link": format_tind_context.get_tind_url(tind_id) + }) + return formatted_documents + def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[dict[str, str]]]: """Retrieve relevant context from vector store.""" search_query = state.get("search_query", "") @@ -102,17 +118,7 @@ def _retrieve_context(self, state: WillaChatbotState) -> dict[str, str | list[di # Search for relevant documents retriever = vector_store.as_retriever(search_kwargs={"k": int(CONFIG['K_VALUE'])}) matching_docs = retriever.invoke(search_query) - formatted_documents = [ - { - "id": f"{i}_{doc.metadata.get('tind_metadata', {}).get('tind_id', [''])[0]}", - "page_content": doc.page_content, - "title": doc.metadata.get('tind_metadata', {}).get('title', [''])[0], - "project": doc.metadata.get('tind_metadata', {}).get('isPartOf', [''])[0], - "tind_link": format_tind_context.get_tind_url( - doc.metadata.get('tind_metadata', {}).get('tind_id', [''])[0]) - } - for i, doc in enumerate(matching_docs, 1) - ] + formatted_documents = self._format_retrieved_documents(matching_docs) # Format tind metadata tind_metadata = format_tind_context.get_tind_context(matching_docs) @@ -142,7 +148,7 @@ def _generate_response(self, state: WillaChatbotState) -> dict[str, list[AnyMess tind_metadata = state.get("tind_metadata", "") model = self._model documents = state.get("documents", []) - messages = state["messages_for_generation"] + messages = state.get("messages_for_generation") or state.get("messages", []) if not model: return {"messages": [AIMessage(content="Model not available.")]} From cb09da1841df51bdf7e3c47af41fd40da7893233 Mon Sep 17 00:00:00 2001 From: Jason Raitz Date: Wed, 7 Jan 2026 11:37:11 -0500 Subject: [PATCH 14/14] mypy type error --- willa/chatbot/graph_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/willa/chatbot/graph_manager.py b/willa/chatbot/graph_manager.py index 2e71fe2..b4b5809 100644 --- a/willa/chatbot/graph_manager.py +++ b/willa/chatbot/graph_manager.py @@ -71,7 +71,7 @@ def _filter_messages(self, state: WillaChatbotState) -> dict[str, list[AnyMessag """Filter out TIND messages from the conversation history.""" messages = state["messages"] - filtered = [ + filtered: list[AnyMessage] = [ msg for msg in messages if "tind" not in getattr(msg, "response_metadata", {}) and msg.type != "system" ]