Enhancement (beta): support LLM timeout config (#13002)

2026-06-20 20:34:20 +00:00 · 2026-06-18 08:35:11 -07:00
parent b8f10269a7
commit 262183e848
14 changed files with 184 additions and 36 deletions
@@ -2068,6 +2068,13 @@ context by default.

    Defaults to 8192.

+#### [`PAPERLESS_AI_LLM_REQUEST_TIMEOUT=<int>`](#PAPERLESS_AI_LLM_REQUEST_TIMEOUT) {#PAPERLESS_AI_LLM_REQUEST_TIMEOUT}
+
+: The timeout, in seconds, for requests to the configured AI backend. Increase this when using
+local or slow inference servers that need more time to generate responses.
+
+    Defaults to 120. Must be at least 1.
+
 #### [`PAPERLESS_AI_LLM_BACKEND=<str>`](#PAPERLESS_AI_LLM_BACKEND) {#PAPERLESS_AI_LLM_BACKEND}

 : The AI backend to use. This can be either "openai-like" or "ollama". If set to "ollama", the AI
@@ -360,6 +360,14 @@ export const PaperlessConfigOptions: ConfigOption[] = [
    category: ConfigCategory.AI,
    note: $localize`Language to use for generated AI suggestions. When unset, AI suggestions use the user's display language if explicitly set.`,
  },
+  {
+    key: 'llm_request_timeout',
+    title: $localize`LLM Request Timeout`,
+    type: ConfigOptionType.Number,
+    config_key: 'PAPERLESS_AI_LLM_REQUEST_TIMEOUT',
+    category: ConfigCategory.AI,
+    note: $localize`Timeout in seconds for LLM requests.`,
+  },
 ]

 export interface PaperlessConfig extends ObjectWithId {
@@ -401,4 +409,5 @@ export interface PaperlessConfig extends ObjectWithId {
  llm_api_key: string
  llm_endpoint: string
  llm_output_language: string
+  llm_request_timeout: number
 }
@@ -82,6 +82,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
                "llm_api_key": None,
                "llm_endpoint": None,
                "llm_output_language": None,
+                "llm_request_timeout": None,
            },
        )

@@ -30,6 +30,7 @@ from documents.signals.handlers import update_llm_suggestions_cache
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import read_streaming_response
 from paperless.models import ApplicationConfiguration
+from paperless_ai.exceptions import LLMTimeoutError


 class TestViews(DirectoriesMixin, TestCase):
@@ -476,6 +477,33 @@ class TestAISuggestions(DirectoriesMixin, TestCase):
            get_llm_suggestion_cache(self.document.pk, backend="openai-like"),
        )

+    @patch("documents.views.get_ai_document_classification")
+    @override_settings(
+        AI_ENABLED=True,
+        LLM_BACKEND="openai-like",
+    )
+    def test_ai_suggestions_with_llm_timeout(
+        self,
+        mock_get_ai_classification,
+    ) -> None:
+        """
+        GIVEN:
+            - AI is enabled with the "openai-like" backend
+            - The AI classification call raises LLMTimeoutError
+        WHEN:
+            - The ai_suggestions endpoint is requested for a document
+        THEN:
+            - The response is HTTP 503 with a timeout error message
+            - No suggestions are cached for the document
+        """
+        mock_get_ai_classification.side_effect = LLMTimeoutError()
+
+        self.client.force_login(user=self.user)
+        response = self.client.get(
+            f"/api/documents/{self.document.pk}/ai_suggestions/",
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_503_SERVICE_UNAVAILABLE)
+        self.assertEqual(
+            response.json(),
+            {
+                "ai": ["AI backend request timed out."],
+            },
+        )
+        self.assertIsNone(
+            get_llm_suggestion_cache(self.document.pk, backend="openai-like"),
+        )
+
    def test_invalidate_suggestions_cache(self) -> None:
        self.client.force_login(user=self.user)
        suggestions = {
@@ -241,6 +241,7 @@ from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
 from paperless_ai.ai_classifier import get_ai_document_classification
 from paperless_ai.chat import stream_chat_with_documents
+from paperless_ai.exceptions import LLMTimeoutError
 from paperless_ai.matching import extract_unmatched_names
 from paperless_ai.matching import match_correspondents_by_name
 from paperless_ai.matching import match_document_types_by_name
@@ -1510,6 +1511,17 @@ class DocumentViewSet(
                exc_info=True,
            )
            raise ValidationError({"ai": [_("Invalid AI configuration.")]}) from exc
+        except LLMTimeoutError as exc:
+            logger.exception(
+                "AI backend timed out while generating suggestions for document %s: %s",
+                doc.pk,
+                exc,
+                exc_info=True,
+            )
+            return Response(
+                {"ai": [_("AI backend request timed out.")]},
+                status=status.HTTP_503_SERVICE_UNAVAILABLE,
+            )

        matched_tags = match_tags_by_name(
            llm_suggestions.get("tags", []),
@@ -197,6 +197,7 @@ class AIConfig(BaseConfig):
    llm_embedding_endpoint: str = dataclasses.field(init=False)
    llm_embedding_chunk_size: int = dataclasses.field(init=False)
    llm_context_size: int = dataclasses.field(init=False)
+    llm_request_timeout: int = dataclasses.field(init=False)
    llm_backend: str = dataclasses.field(init=False)
    llm_model: str = dataclasses.field(init=False)
    llm_api_key: str = dataclasses.field(init=False)
@@ -221,6 +222,9 @@ class AIConfig(BaseConfig):
            app_config.llm_embedding_chunk_size or settings.LLM_EMBEDDING_CHUNK_SIZE
        )
        self.llm_context_size = app_config.llm_context_size or settings.LLM_CONTEXT_SIZE
+        self.llm_request_timeout = (
+            app_config.llm_request_timeout or settings.LLM_REQUEST_TIMEOUT
+        )
        self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
        self.llm_model = app_config.llm_model or settings.LLM_MODEL
        self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
@@ -0,0 +1,23 @@
+# Generated by Django 5.2.14 on 2026-06-14 14:22
+
+import django.core.validators
+from django.db import migrations
+from django.db import models
+
+
+class Migration(migrations.Migration):
+    # Adds the nullable ``llm_request_timeout`` field to the application
+    # configuration model; when a value is set it is validated to be >= 1.
+    dependencies = [
+        ("paperless", "0012_applicationconfiguration_llm_output_language"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_request_timeout",
+            field=models.PositiveSmallIntegerField(
+                null=True,
+                validators=[django.core.validators.MinValueValidator(1)],
+                verbose_name="Sets the LLM request timeout in seconds",
+            ),
+        ),
+    ]
@@ -366,6 +366,12 @@ class ApplicationConfiguration(AbstractSingletonModel):
        max_length=32,
    )

+    llm_request_timeout = models.PositiveSmallIntegerField(
+        verbose_name=_("Sets the LLM timeout in seconds"),
+        null=True,
+        validators=[MinValueValidator(1)],
+    )
+
    class Meta:
        verbose_name = _("paperless application settings")
        permissions = [
@@ -1206,6 +1206,9 @@ if LLM_EMBEDDING_CHUNK_SIZE < 1:
 LLM_CONTEXT_SIZE = get_int_from_env("PAPERLESS_AI_LLM_CONTEXT_SIZE", 8192)
 if LLM_CONTEXT_SIZE < 1:
    raise ImproperlyConfigured("PAPERLESS_AI_LLM_CONTEXT_SIZE must be >= 1")
+LLM_REQUEST_TIMEOUT = get_int_from_env("PAPERLESS_AI_LLM_REQUEST_TIMEOUT", 120)
+if LLM_REQUEST_TIMEOUT < 1:
+    raise ImproperlyConfigured("PAPERLESS_AI_LLM_REQUEST_TIMEOUT must be >= 1")
 LLM_BACKEND = get_choice_from_env(
    "PAPERLESS_AI_LLM_BACKEND",
    {"ollama", "openai-like"},
@@ -1,11 +1,14 @@
 import json
 import logging
+from collections.abc import Iterator
+from contextlib import contextmanager
 from typing import TYPE_CHECKING

+import httpx
+
 from paperless.models import LLMBackend

 if TYPE_CHECKING:
-    from llama_index.core.llms import ChatMessage
    from llama_index.llms.ollama import Ollama
    from llama_index.llms.openai_like import OpenAILike

@@ -16,6 +19,7 @@ from paperless.network import create_pinned_async_httpx_client
 from paperless.network import create_pinned_httpx_client
 from paperless.network import validate_outbound_http_url
 from paperless_ai.base_model import DocumentClassifierSchema
+from paperless_ai.exceptions import LLMTimeoutError

 logger = logging.getLogger("paperless_ai.client")

@@ -61,16 +65,16 @@ class AIClient:
                model=self.settings.llm_model or "llama3.1",
                base_url=endpoint,
                context_window=self.settings.llm_context_size,
-                request_timeout=120,
+                request_timeout=self.settings.llm_request_timeout,
                system_prompt=LLM_SYSTEM_PROMPT,
                client=Client(
                    host=endpoint,
-                    timeout=120,
+                    timeout=self.settings.llm_request_timeout,
                    transport=transport,
                ),
                async_client=AsyncClient(
                    host=endpoint,
-                    timeout=120,
+                    timeout=self.settings.llm_request_timeout,
                    transport=async_transport,
                ),
            )
@@ -84,15 +88,18 @@ class AIClient:
                http_client = create_pinned_httpx_client(
                    endpoint,
                    allow_internal=self.settings.llm_allow_internal_endpoints,
+                    timeout=self.settings.llm_request_timeout,
                )
                async_http_client = create_pinned_async_httpx_client(
                    endpoint,
                    allow_internal=self.settings.llm_allow_internal_endpoints,
+                    timeout=self.settings.llm_request_timeout,
                )
            return OpenAILike(
                model=self.settings.llm_model or "gpt-3.5-turbo",
                api_base=endpoint,
                api_key=self.settings.llm_api_key,
+                timeout=self.settings.llm_request_timeout,
                is_chat_model=True,
                is_function_calling_model=True,
                system_prompt=LLM_SYSTEM_PROMPT,
@@ -113,11 +120,12 @@ class AIClient:

        user_msg = ChatMessage(role="user", content=prompt)
        if self.settings.llm_backend == LLMBackend.OLLAMA:
-            result = self.llm.chat(
-                [user_msg],
-                format=DocumentClassifierSchema.model_json_schema(),
-                think=False,
-            )
+            with self._normalize_timeouts():
+                result = self.llm.chat(
+                    [user_msg],
+                    format=DocumentClassifierSchema.model_json_schema(),
+                    think=False,
+                )
            logger.debug("LLM query result: %s", result)
            parsed = DocumentClassifierSchema(**json.loads(result.message.content))
            return parsed.model_dump()
@@ -125,27 +133,39 @@ class AIClient:
        from llama_index.core.program.function_program import get_function_tool

        tool = get_function_tool(DocumentClassifierSchema)
-        result = self.llm.chat_with_tools(
-            tools=[tool],
-            user_msg=user_msg,
-            chat_history=[],
-            allow_parallel_tool_calls=True,
-            tool_required=True,
-        )
-        tool_calls = self.llm.get_tool_calls_from_response(
-            result,
-            error_on_no_tool_call=True,
-        )
+        with self._normalize_timeouts():
+            result = self.llm.chat_with_tools(
+                tools=[tool],
+                user_msg=user_msg,
+                chat_history=[],
+                allow_parallel_tool_calls=True,
+                tool_required=True,
+            )
+            tool_calls = self.llm.get_tool_calls_from_response(
+                result,
+                error_on_no_tool_call=True,
+            )
        logger.debug("LLM query result: %s", tool_calls)
        parsed = DocumentClassifierSchema(**tool_calls[0].tool_kwargs)
        return parsed.model_dump()

-    def run_chat(self, messages: list["ChatMessage"]) -> str:
-        logger.debug(
-            "Running chat query against %s with model %s",
-            self.settings.llm_backend,
-            self.settings.llm_model,
-        )
-        result = self.llm.chat(messages)
-        logger.debug("Chat result: %s", result)
-        return result
+    @contextmanager
+    def _normalize_timeouts(self) -> Iterator[None]:
+        """
+        Translate backend-specific timeout errors into LLMTimeoutError.
+
+        Any httpx.TimeoutException raised inside the ``with`` body, and any
+        exception recognised by ``_is_openai_timeout``, is re-raised as
+        LLMTimeoutError chained to the original exception. Every other
+        exception propagates unchanged.
+        """
+        try:
+            yield
+        except httpx.TimeoutException as exc:
+            raise LLMTimeoutError from exc
+        except Exception as exc:
+            if self._is_openai_timeout(exc):
+                raise LLMTimeoutError from exc
+            raise
+
+    def _is_openai_timeout(self, exc: Exception) -> bool:
+        """
+        Return True if ``exc`` is an openai.APITimeoutError and the configured
+        backend is the OpenAI-like one; always False for any other backend.
+        """
+        if self.settings.llm_backend != LLMBackend.OPENAI_LIKE:
+            return False
+
+        # Keep OpenAI imports out of module import paths and only load the SDK
+        # when translating an error from an OpenAI-backed request.
+        from openai import APITimeoutError
+
+        return isinstance(exc, APITimeoutError)
@@ -32,15 +32,18 @@ def get_embedding_model(config: AIConfig) -> "BaseEmbedding":
                http_client = create_pinned_httpx_client(
                    endpoint,
                    allow_internal=config.llm_allow_internal_endpoints,
+                    timeout=config.llm_request_timeout,
                )
                async_http_client = create_pinned_async_httpx_client(
                    endpoint,
                    allow_internal=config.llm_allow_internal_endpoints,
+                    timeout=config.llm_request_timeout,
                )
            return OpenAILikeEmbedding(
                model_name=config.llm_embedding_model or "text-embedding-3-small",
                api_key=config.llm_api_key,
                api_base=endpoint,
+                timeout=config.llm_request_timeout,
                http_client=http_client,
                async_http_client=async_http_client,
            )
@@ -73,12 +76,14 @@ def get_embedding_model(config: AIConfig) -> "BaseEmbedding":
            )
            embedding._client = Client(
                host=endpoint,
+                timeout=config.llm_request_timeout,
                transport=PinnedHostHTTPTransport(
                    allow_internal=config.llm_allow_internal_endpoints,
                ),
            )
            embedding._async_client = AsyncClient(
                host=endpoint,
+                timeout=config.llm_request_timeout,
                transport=PinnedHostAsyncHTTPTransport(
                    allow_internal=config.llm_allow_internal_endpoints,
                ),
@@ -0,0 +1,2 @@
+class LLMTimeoutError(Exception):
+    """Raised when a request to the configured LLM backend times out."""
+
+    pass
@@ -3,12 +3,14 @@ from unittest.mock import ANY
 from unittest.mock import MagicMock
 from unittest.mock import patch

+import httpx
+import openai
 import pytest
-from llama_index.core.llms import ChatMessage
 from llama_index.core.llms.llm import ToolSelection

 from paperless_ai.client import LLM_SYSTEM_PROMPT
 from paperless_ai.client import AIClient
+from paperless_ai.exceptions import LLMTimeoutError


@pytest.fixture
@@ -17,6 +19,7 @@ def mock_ai_config():
        mock_config = MagicMock()
        mock_config.llm_allow_internal_endpoints = True
        mock_config.llm_context_size = 8192
+        mock_config.llm_request_timeout = 120
        MockAIConfig.return_value = mock_config
        yield mock_config

@@ -64,6 +67,7 @@ def test_get_llm_openai(mock_ai_config, mock_openai_llm):
        model="test_model",
        api_base="http://test-url",
        api_key="test_api_key",
+        timeout=120,
        is_chat_model=True,
        is_function_calling_model=True,
        system_prompt=LLM_SYSTEM_PROMPT,
@@ -151,17 +155,38 @@ def test_run_llm_query_openai_uses_tools(mock_ai_config, mock_openai_llm):
    mock_llm_instance.chat_with_tools.assert_called_once()


-def test_run_chat(mock_ai_config, mock_ollama_llm):
+def test_run_llm_query_openai_timeout_raises_local_error(
+    mock_ai_config,
+    mock_openai_llm,
+):
+    """An openai.APITimeoutError from chat_with_tools surfaces as LLMTimeoutError."""
+    mock_ai_config.llm_backend = "openai-like"
+    mock_ai_config.llm_model = "test_model"
+    mock_ai_config.llm_api_key = "test_api_key"
+    mock_ai_config.llm_endpoint = "http://test-url"
+
+    request = httpx.Request("POST", "http://test-url/v1/chat/completions")
+    mock_openai_llm.return_value.chat_with_tools.side_effect = openai.APITimeoutError(
+        request,
+    )
+
+    client = AIClient()
+
+    with pytest.raises(LLMTimeoutError):
+        client.run_llm_query("test_prompt")
+
+
+def test_run_llm_query_httpx_timeout_raises_local_error(
+    mock_ai_config,
+    mock_ollama_llm,
+):
    mock_ai_config.llm_backend = "ollama"
    mock_ai_config.llm_model = "test_model"
    mock_ai_config.llm_endpoint = "http://test-url"

    mock_llm_instance = mock_ollama_llm.return_value
-    mock_llm_instance.chat.return_value = "test_chat_result"
+    mock_llm_instance.chat.side_effect = httpx.ReadTimeout("timed out")

    client = AIClient()
-    messages = [ChatMessage(role="user", content="Hello")]
-    result = client.run_chat(messages)

-    mock_llm_instance.chat.assert_called_once_with(messages)
-    assert result == "test_chat_result"
+    with pytest.raises(LLMTimeoutError):
+        client.run_llm_query("test_prompt")
@@ -19,6 +19,7 @@ def mock_ai_config():
        MockAIConfig.return_value.llm_embedding_endpoint = None
        MockAIConfig.return_value.llm_allow_internal_endpoints = True
        MockAIConfig.return_value.llm_context_size = 8192
+        MockAIConfig.return_value.llm_request_timeout = 120
        yield MockAIConfig


@@ -71,6 +72,7 @@ def test_get_embedding_model_openai(mock_ai_config):
            model_name="text-embedding-3-small",
            api_key="test_api_key",
            api_base="http://test-url",
+            timeout=120,
            http_client=ANY,
            async_http_client=ANY,
        )
@@ -92,6 +94,7 @@ def test_get_embedding_model_openai_prefers_embedding_endpoint(mock_ai_config):
            model_name="text-embedding-3-small",
            api_key="test_api_key",
            api_base="http://embedding-url",
+            timeout=120,
            http_client=ANY,
            async_http_client=ANY,
        )