Fix: release pooled DB connection during AI LLM/embedding calls (#12983)

This commit is contained in:
Trenton H
2026-06-11 13:07:31 -07:00
committed by GitHub
parent f188d308eb
commit 4394403beb
4 changed files with 70 additions and 24 deletions
+20 -16
View File
@@ -8,6 +8,7 @@ from documents.models import Document
from documents.permissions import get_objects_for_user_owner_aware
from paperless.config import AIConfig
from paperless_ai.client import AIClient
from paperless_ai.db import db_connection_released
from paperless_ai.indexing import query_similar_documents
from paperless_ai.indexing import truncate_content
@@ -146,20 +147,23 @@ def get_ai_document_classification(
)
client = AIClient()
result = client.run_llm_query(prompt)
suggestions = parse_ai_response(result)
if output_language:
localized = client.run_llm_query(
build_localization_prompt(suggestions, output_language),
)
localized_suggestions = parse_ai_response(localized)
suggestions = {
**suggestions,
"title": localized_suggestions["title"] or suggestions["title"],
"tags": localized_suggestions["tags"] or suggestions["tags"],
"document_types": localized_suggestions["document_types"]
or suggestions["document_types"],
"storage_paths": localized_suggestions["storage_paths"]
or suggestions["storage_paths"],
}
# Hand the pooled DB connection back while the (slow) LLM query runs so it
# is not pinned for the call's duration; see paperless_ai.db and #12976.
with db_connection_released():
result = client.run_llm_query(prompt)
suggestions = parse_ai_response(result)
if output_language:
localized = client.run_llm_query(
build_localization_prompt(suggestions, output_language),
)
localized_suggestions = parse_ai_response(localized)
suggestions = {
**suggestions,
"title": localized_suggestions["title"] or suggestions["title"],
"tags": localized_suggestions["tags"] or suggestions["tags"],
"document_types": localized_suggestions["document_types"]
or suggestions["document_types"],
"storage_paths": localized_suggestions["storage_paths"]
or suggestions["storage_paths"],
}
return suggestions
+14 -7
View File
@@ -5,6 +5,7 @@ import sys
from documents.models import Document
from paperless.config import AIConfig
from paperless_ai.client import AIClient
from paperless_ai.db import db_connection_released
from paperless_ai.indexing import _document_id_filters
from paperless_ai.indexing import get_rag_prompt_helper
from paperless_ai.indexing import load_or_build_index
@@ -105,7 +106,10 @@ def _stream_chat_with_documents(query_str: str, documents: list[Document]):
filters=filters,
)
top_nodes = retriever.retrieve(query_str)
# Slow query-embedding + vector search; no Django ORM access happens during
# it, so release the pooled DB connection for its duration. See #12976.
with db_connection_released():
top_nodes = retriever.retrieve(query_str)
if not top_nodes:
logger.warning("No nodes found for the given documents.")
yield CHAT_NO_CONTENT_MESSAGE
@@ -133,10 +137,13 @@ def _stream_chat_with_documents(query_str: str, documents: list[Document]):
)
logger.debug("Document chat query: %s", query_str)
response_stream = query_engine.query(query_str)
for chunk in response_stream.response_gen:
yield chunk
sys.stdout.flush()
# Release the pooled DB connection for the slow streaming LLM response so it
# is not pinned for the whole stream; see paperless_ai.db and #12976.
with db_connection_released():
response_stream = query_engine.query(query_str)
for chunk in response_stream.response_gen:
yield chunk
sys.stdout.flush()
if references:
yield _format_chat_metadata_trailer(references)
if references:
yield _format_chat_metadata_trailer(references)
+30
View File
@@ -0,0 +1,30 @@
from __future__ import annotations
from contextlib import contextmanager
from django.db import connections
def _close_releasable_connections():
    """Close each initialized DB connection that is not inside an atomic block."""
    for conn in connections.all(initialized_only=True):
        # Closing a connection in the middle of ``transaction.atomic()`` does
        # not merely release it: Django discards pending ``on_commit`` hooks
        # and flags the transaction as needing rollback, so the caller's next
        # query would raise ``TransactionManagementError``. A connection that
        # carries an open transaction therefore has to stay checked out.
        if not conn.in_atomic_block:
            conn.close()


@contextmanager
def db_connection_released():
    """
    Return any checked-out DB connections to the pool for the duration of the
    wrapped block.

    The AI endpoints run inside a synchronous web request (``ai_suggestions``)
    or a streaming response (``chat``). Django keeps the request's database
    connection checked out for the entire request/response, so a blocking LLM
    call - which can take many seconds - pins a pooled connection the whole
    time. With connection pooling enabled, enough concurrent AI requests check
    out every slot and all other requests then fail with
    ``psycopg_pool.PoolTimeout`` (see issue #12976).

    No Django ORM access happens during the LLM call, so we hand the connection
    back to the pool first; Django transparently re-checks-out a connection on
    the next ORM use after the block.

    Connections that are currently inside ``transaction.atomic()`` are left
    open: closing them would break the surrounding transaction instead of
    just releasing the connection, so in that case the connection stays
    pinned for the duration of the block.

    Usage::

        with db_connection_released():
            result = slow_external_call()
    """
    _close_releasable_connections()
    try:
        yield
    finally:
        # Close again on the way out so that a connection opened inside the
        # block (intentionally or not) does not stay checked out either.
        _close_releasable_connections()
+6 -1
View File
@@ -13,6 +13,7 @@ from documents.models import PaperlessTask
from documents.utils import IterWrapper
from documents.utils import identity
from paperless.config import AIConfig
from paperless_ai.db import db_connection_released
from paperless_ai.embedding import build_llm_index_text
from paperless_ai.embedding import get_configured_model_name
from paperless_ai.embedding import get_embedding_model
@@ -385,7 +386,11 @@ def query_similar_documents(
chunk_size=config.llm_embedding_chunk_size,
context_size=config.llm_context_size,
)
results = retriever.retrieve(query_text)
# The retrieve() call generates a query embedding (a slow external request)
# and searches the vector store; no Django ORM access happens during it, so
# release the pooled DB connection for its duration. See #12976.
with db_connection_released():
results = retriever.retrieve(query_text)
retrieved_document_ids: list[int] = []
for node in results: