From 4394403bebfa18f5061160f2bb0210ce092ffe65 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:07:31 -0700 Subject: [PATCH] Fix: release pooled DB connection during AI LLM/embedding calls (#12983) --- src/paperless_ai/ai_classifier.py | 36 +++++++++++++++++-------------- src/paperless_ai/chat.py | 21 ++++++++++++------ src/paperless_ai/db.py | 30 ++++++++++++++++++++++++++ src/paperless_ai/indexing.py | 7 +++++- 4 files changed, 70 insertions(+), 24 deletions(-) create mode 100644 src/paperless_ai/db.py diff --git a/src/paperless_ai/ai_classifier.py b/src/paperless_ai/ai_classifier.py index 5420812eb..f9c2f1e06 100644 --- a/src/paperless_ai/ai_classifier.py +++ b/src/paperless_ai/ai_classifier.py @@ -8,6 +8,7 @@ from documents.models import Document from documents.permissions import get_objects_for_user_owner_aware from paperless.config import AIConfig from paperless_ai.client import AIClient +from paperless_ai.db import db_connection_released from paperless_ai.indexing import query_similar_documents from paperless_ai.indexing import truncate_content @@ -146,20 +147,23 @@ def get_ai_document_classification( ) client = AIClient() - result = client.run_llm_query(prompt) - suggestions = parse_ai_response(result) - if output_language: - localized = client.run_llm_query( - build_localization_prompt(suggestions, output_language), - ) - localized_suggestions = parse_ai_response(localized) - suggestions = { - **suggestions, - "title": localized_suggestions["title"] or suggestions["title"], - "tags": localized_suggestions["tags"] or suggestions["tags"], - "document_types": localized_suggestions["document_types"] - or suggestions["document_types"], - "storage_paths": localized_suggestions["storage_paths"] - or suggestions["storage_paths"], - } + # Hand the pooled DB connection back while the (slow) LLM query runs so it + # is not pinned for the call's duration; see paperless_ai.db and #12976. + with db_connection_released(): + result = client.run_llm_query(prompt) + suggestions = parse_ai_response(result) + if output_language: + localized = client.run_llm_query( + build_localization_prompt(suggestions, output_language), + ) + localized_suggestions = parse_ai_response(localized) + suggestions = { + **suggestions, + "title": localized_suggestions["title"] or suggestions["title"], + "tags": localized_suggestions["tags"] or suggestions["tags"], + "document_types": localized_suggestions["document_types"] + or suggestions["document_types"], + "storage_paths": localized_suggestions["storage_paths"] + or suggestions["storage_paths"], + } return suggestions diff --git a/src/paperless_ai/chat.py b/src/paperless_ai/chat.py index 123771c50..6102d94e4 100644 --- a/src/paperless_ai/chat.py +++ b/src/paperless_ai/chat.py @@ -5,6 +5,7 @@ import sys from documents.models import Document from paperless.config import AIConfig from paperless_ai.client import AIClient +from paperless_ai.db import db_connection_released from paperless_ai.indexing import _document_id_filters from paperless_ai.indexing import get_rag_prompt_helper from paperless_ai.indexing import load_or_build_index @@ -105,7 +106,10 @@ def _stream_chat_with_documents(query_str: str, documents: list[Document]): filters=filters, ) - top_nodes = retriever.retrieve(query_str) + # Slow query-embedding + vector search; no Django ORM access happens during + # it, so release the pooled DB connection for its duration. See #12976. + with db_connection_released(): + top_nodes = retriever.retrieve(query_str) if not top_nodes: logger.warning("No nodes found for the given documents.") yield CHAT_NO_CONTENT_MESSAGE @@ -133,10 +137,13 @@ def _stream_chat_with_documents(query_str: str, documents: list[Document]): ) logger.debug("Document chat query: %s", query_str) - response_stream = query_engine.query(query_str) - for chunk in response_stream.response_gen: - yield chunk - sys.stdout.flush() + # Release the pooled DB connection for the slow streaming LLM response so it + # is not pinned for the whole stream; see paperless_ai.db and #12976. + with db_connection_released(): + response_stream = query_engine.query(query_str) + for chunk in response_stream.response_gen: + yield chunk + sys.stdout.flush() - if references: - yield _format_chat_metadata_trailer(references) + if references: + yield _format_chat_metadata_trailer(references) diff --git a/src/paperless_ai/db.py b/src/paperless_ai/db.py new file mode 100644 index 000000000..066689acf --- /dev/null +++ b/src/paperless_ai/db.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from contextlib import contextmanager + +from django.db import connections + + +@contextmanager +def db_connection_released(): + """ + Return any checked-out DB connections to the pool for the duration of the + wrapped block. + + The AI endpoints run inside a synchronous web request (``ai_suggestions``) + or a streaming response (``chat``). Django keeps the request's database + connection checked out for the entire request/response, so a blocking LLM + call - which can take many seconds - pins a pooled connection the whole + time. With connection pooling enabled, enough concurrent AI requests check + out every slot and all other requests then fail with + ``psycopg_pool.PoolTimeout`` (see issue #12976). + + No Django ORM access happens during the LLM call, so we hand the connection + back to the pool first; Django transparently re-checks-out a connection on + the next ORM use after the block. + """ + connections.close_all() + try: + yield + finally: + connections.close_all() diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py index dd96106a6..5153c0baa 100644 --- a/src/paperless_ai/indexing.py +++ b/src/paperless_ai/indexing.py @@ -13,6 +13,7 @@ from documents.models import PaperlessTask from documents.utils import IterWrapper from documents.utils import identity from paperless.config import AIConfig +from paperless_ai.db import db_connection_released from paperless_ai.embedding import build_llm_index_text from paperless_ai.embedding import get_configured_model_name from paperless_ai.embedding import get_embedding_model @@ -385,7 +386,11 @@ def query_similar_documents( chunk_size=config.llm_embedding_chunk_size, context_size=config.llm_context_size, ) - results = retriever.retrieve(query_text) + # The retrieve() call generates a query embedding (a slow external request) + # and searches the vector store; no Django ORM access happens during it, so + # release the pooled DB connection for its duration. See #12976. + with db_connection_released(): + results = retriever.retrieve(query_text) retrieved_document_ids: list[int] = [] for node in results: