mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-20 20:34:20 +00:00
Enhancement (beta): support LLM timeout config (#13002)
This commit is contained in:
@@ -2068,6 +2068,13 @@ context by default.
|
||||
|
||||
Defaults to 8192.
|
||||
|
||||
#### [`PAPERLESS_AI_LLM_REQUEST_TIMEOUT=<int>`](#PAPERLESS_AI_LLM_REQUEST_TIMEOUT) {#PAPERLESS_AI_LLM_REQUEST_TIMEOUT}
|
||||
|
||||
: The timeout, in seconds, for requests to the configured AI backend. Increase this when using
|
||||
local or slow inference servers that need more time to generate responses.
|
||||
|
||||
Defaults to 120.
|
||||
|
||||
#### [`PAPERLESS_AI_LLM_BACKEND=<str>`](#PAPERLESS_AI_LLM_BACKEND) {#PAPERLESS_AI_LLM_BACKEND}
|
||||
|
||||
: The AI backend to use. This can be either "openai-like" or "ollama". If set to "ollama", the AI
|
||||
|
||||
@@ -360,6 +360,14 @@ export const PaperlessConfigOptions: ConfigOption[] = [
|
||||
category: ConfigCategory.AI,
|
||||
note: $localize`Language to use for generated AI suggestions. When unset, AI suggestions use the user's display language if explicitly set.`,
|
||||
},
|
||||
{
|
||||
key: 'llm_request_timeout',
|
||||
title: $localize`LLM Request Timeout`,
|
||||
type: ConfigOptionType.Number,
|
||||
config_key: 'PAPERLESS_AI_LLM_REQUEST_TIMEOUT',
|
||||
category: ConfigCategory.AI,
|
||||
note: $localize`Timeout in seconds for LLM requests.`,
|
||||
},
|
||||
]
|
||||
|
||||
export interface PaperlessConfig extends ObjectWithId {
|
||||
@@ -401,4 +409,5 @@ export interface PaperlessConfig extends ObjectWithId {
|
||||
llm_api_key: string
|
||||
llm_endpoint: string
|
||||
llm_output_language: string
|
||||
llm_request_timeout: number
|
||||
}
|
||||
|
||||
@@ -82,6 +82,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
|
||||
"llm_api_key": None,
|
||||
"llm_endpoint": None,
|
||||
"llm_output_language": None,
|
||||
"llm_request_timeout": None,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ from documents.signals.handlers import update_llm_suggestions_cache
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import read_streaming_response
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless_ai.exceptions import LLMTimeoutError
|
||||
|
||||
|
||||
class TestViews(DirectoriesMixin, TestCase):
|
||||
@@ -476,6 +477,33 @@ class TestAISuggestions(DirectoriesMixin, TestCase):
|
||||
get_llm_suggestion_cache(self.document.pk, backend="openai-like"),
|
||||
)
|
||||
|
||||
@patch("documents.views.get_ai_document_classification")
|
||||
@override_settings(
|
||||
AI_ENABLED=True,
|
||||
LLM_BACKEND="openai-like",
|
||||
)
|
||||
def test_ai_suggestions_with_llm_timeout(
|
||||
self,
|
||||
mock_get_ai_classification,
|
||||
) -> None:
|
||||
mock_get_ai_classification.side_effect = LLMTimeoutError()
|
||||
|
||||
self.client.force_login(user=self.user)
|
||||
response = self.client.get(
|
||||
f"/api/documents/{self.document.pk}/ai_suggestions/",
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_503_SERVICE_UNAVAILABLE)
|
||||
self.assertEqual(
|
||||
response.json(),
|
||||
{
|
||||
"ai": ["AI backend request timed out."],
|
||||
},
|
||||
)
|
||||
self.assertIsNone(
|
||||
get_llm_suggestion_cache(self.document.pk, backend="openai-like"),
|
||||
)
|
||||
|
||||
def test_invalidate_suggestions_cache(self) -> None:
|
||||
self.client.force_login(user=self.user)
|
||||
suggestions = {
|
||||
|
||||
@@ -241,6 +241,7 @@ from paperless.serialisers import UserSerializer
|
||||
from paperless.views import StandardPagination
|
||||
from paperless_ai.ai_classifier import get_ai_document_classification
|
||||
from paperless_ai.chat import stream_chat_with_documents
|
||||
from paperless_ai.exceptions import LLMTimeoutError
|
||||
from paperless_ai.matching import extract_unmatched_names
|
||||
from paperless_ai.matching import match_correspondents_by_name
|
||||
from paperless_ai.matching import match_document_types_by_name
|
||||
@@ -1510,6 +1511,17 @@ class DocumentViewSet(
|
||||
exc_info=True,
|
||||
)
|
||||
raise ValidationError({"ai": [_("Invalid AI configuration.")]}) from exc
|
||||
except LLMTimeoutError as exc:
|
||||
logger.exception(
|
||||
"AI backend timed out while generating suggestions for document %s: %s",
|
||||
doc.pk,
|
||||
exc,
|
||||
exc_info=True,
|
||||
)
|
||||
return Response(
|
||||
{"ai": [_("AI backend request timed out.")]},
|
||||
status=status.HTTP_503_SERVICE_UNAVAILABLE,
|
||||
)
|
||||
|
||||
matched_tags = match_tags_by_name(
|
||||
llm_suggestions.get("tags", []),
|
||||
|
||||
@@ -197,6 +197,7 @@ class AIConfig(BaseConfig):
|
||||
llm_embedding_endpoint: str = dataclasses.field(init=False)
|
||||
llm_embedding_chunk_size: int = dataclasses.field(init=False)
|
||||
llm_context_size: int = dataclasses.field(init=False)
|
||||
llm_request_timeout: int = dataclasses.field(init=False)
|
||||
llm_backend: str = dataclasses.field(init=False)
|
||||
llm_model: str = dataclasses.field(init=False)
|
||||
llm_api_key: str = dataclasses.field(init=False)
|
||||
@@ -221,6 +222,9 @@ class AIConfig(BaseConfig):
|
||||
app_config.llm_embedding_chunk_size or settings.LLM_EMBEDDING_CHUNK_SIZE
|
||||
)
|
||||
self.llm_context_size = app_config.llm_context_size or settings.LLM_CONTEXT_SIZE
|
||||
self.llm_request_timeout = (
|
||||
app_config.llm_request_timeout or settings.LLM_REQUEST_TIMEOUT
|
||||
)
|
||||
self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
|
||||
self.llm_model = app_config.llm_model or settings.LLM_MODEL
|
||||
self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
# Generated by Django 5.2.14 on 2026-06-14 14:22
|
||||
|
||||
import django.core.validators
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("paperless", "0012_applicationconfiguration_llm_output_language"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name="applicationconfiguration",
|
||||
name="llm_request_timeout",
|
||||
field=models.PositiveSmallIntegerField(
|
||||
null=True,
|
||||
validators=[django.core.validators.MinValueValidator(1)],
|
||||
verbose_name="Sets the LLM request timeout in seconds",
|
||||
),
|
||||
),
|
||||
]
|
||||
@@ -366,6 +366,12 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
||||
max_length=32,
|
||||
)
|
||||
|
||||
llm_request_timeout = models.PositiveSmallIntegerField(
|
||||
verbose_name=_("Sets the LLM timeout in seconds"),
|
||||
null=True,
|
||||
validators=[MinValueValidator(1)],
|
||||
)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _("paperless application settings")
|
||||
permissions = [
|
||||
|
||||
@@ -1206,6 +1206,9 @@ if LLM_EMBEDDING_CHUNK_SIZE < 1:
|
||||
LLM_CONTEXT_SIZE = get_int_from_env("PAPERLESS_AI_LLM_CONTEXT_SIZE", 8192)
|
||||
if LLM_CONTEXT_SIZE < 1:
|
||||
raise ImproperlyConfigured("PAPERLESS_AI_LLM_CONTEXT_SIZE must be >= 1")
|
||||
LLM_REQUEST_TIMEOUT = get_int_from_env("PAPERLESS_AI_LLM_REQUEST_TIMEOUT", 120)
|
||||
if LLM_REQUEST_TIMEOUT < 1:
|
||||
raise ImproperlyConfigured("PAPERLESS_AI_LLM_REQUEST_TIMEOUT must be >= 1")
|
||||
LLM_BACKEND = get_choice_from_env(
|
||||
"PAPERLESS_AI_LLM_BACKEND",
|
||||
{"ollama", "openai-like"},
|
||||
|
||||
+49
-29
@@ -1,11 +1,14 @@
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Iterator
|
||||
from contextlib import contextmanager
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import httpx
|
||||
|
||||
from paperless.models import LLMBackend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from llama_index.core.llms import ChatMessage
|
||||
from llama_index.llms.ollama import Ollama
|
||||
from llama_index.llms.openai_like import OpenAILike
|
||||
|
||||
@@ -16,6 +19,7 @@ from paperless.network import create_pinned_async_httpx_client
|
||||
from paperless.network import create_pinned_httpx_client
|
||||
from paperless.network import validate_outbound_http_url
|
||||
from paperless_ai.base_model import DocumentClassifierSchema
|
||||
from paperless_ai.exceptions import LLMTimeoutError
|
||||
|
||||
logger = logging.getLogger("paperless_ai.client")
|
||||
|
||||
@@ -61,16 +65,16 @@ class AIClient:
|
||||
model=self.settings.llm_model or "llama3.1",
|
||||
base_url=endpoint,
|
||||
context_window=self.settings.llm_context_size,
|
||||
request_timeout=120,
|
||||
request_timeout=self.settings.llm_request_timeout,
|
||||
system_prompt=LLM_SYSTEM_PROMPT,
|
||||
client=Client(
|
||||
host=endpoint,
|
||||
timeout=120,
|
||||
timeout=self.settings.llm_request_timeout,
|
||||
transport=transport,
|
||||
),
|
||||
async_client=AsyncClient(
|
||||
host=endpoint,
|
||||
timeout=120,
|
||||
timeout=self.settings.llm_request_timeout,
|
||||
transport=async_transport,
|
||||
),
|
||||
)
|
||||
@@ -84,15 +88,18 @@ class AIClient:
|
||||
http_client = create_pinned_httpx_client(
|
||||
endpoint,
|
||||
allow_internal=self.settings.llm_allow_internal_endpoints,
|
||||
timeout=self.settings.llm_request_timeout,
|
||||
)
|
||||
async_http_client = create_pinned_async_httpx_client(
|
||||
endpoint,
|
||||
allow_internal=self.settings.llm_allow_internal_endpoints,
|
||||
timeout=self.settings.llm_request_timeout,
|
||||
)
|
||||
return OpenAILike(
|
||||
model=self.settings.llm_model or "gpt-3.5-turbo",
|
||||
api_base=endpoint,
|
||||
api_key=self.settings.llm_api_key,
|
||||
timeout=self.settings.llm_request_timeout,
|
||||
is_chat_model=True,
|
||||
is_function_calling_model=True,
|
||||
system_prompt=LLM_SYSTEM_PROMPT,
|
||||
@@ -113,11 +120,12 @@ class AIClient:
|
||||
|
||||
user_msg = ChatMessage(role="user", content=prompt)
|
||||
if self.settings.llm_backend == LLMBackend.OLLAMA:
|
||||
result = self.llm.chat(
|
||||
[user_msg],
|
||||
format=DocumentClassifierSchema.model_json_schema(),
|
||||
think=False,
|
||||
)
|
||||
with self._normalize_timeouts():
|
||||
result = self.llm.chat(
|
||||
[user_msg],
|
||||
format=DocumentClassifierSchema.model_json_schema(),
|
||||
think=False,
|
||||
)
|
||||
logger.debug("LLM query result: %s", result)
|
||||
parsed = DocumentClassifierSchema(**json.loads(result.message.content))
|
||||
return parsed.model_dump()
|
||||
@@ -125,27 +133,39 @@ class AIClient:
|
||||
from llama_index.core.program.function_program import get_function_tool
|
||||
|
||||
tool = get_function_tool(DocumentClassifierSchema)
|
||||
result = self.llm.chat_with_tools(
|
||||
tools=[tool],
|
||||
user_msg=user_msg,
|
||||
chat_history=[],
|
||||
allow_parallel_tool_calls=True,
|
||||
tool_required=True,
|
||||
)
|
||||
tool_calls = self.llm.get_tool_calls_from_response(
|
||||
result,
|
||||
error_on_no_tool_call=True,
|
||||
)
|
||||
with self._normalize_timeouts():
|
||||
result = self.llm.chat_with_tools(
|
||||
tools=[tool],
|
||||
user_msg=user_msg,
|
||||
chat_history=[],
|
||||
allow_parallel_tool_calls=True,
|
||||
tool_required=True,
|
||||
)
|
||||
tool_calls = self.llm.get_tool_calls_from_response(
|
||||
result,
|
||||
error_on_no_tool_call=True,
|
||||
)
|
||||
logger.debug("LLM query result: %s", tool_calls)
|
||||
parsed = DocumentClassifierSchema(**tool_calls[0].tool_kwargs)
|
||||
return parsed.model_dump()
|
||||
|
||||
def run_chat(self, messages: list["ChatMessage"]) -> str:
|
||||
logger.debug(
|
||||
"Running chat query against %s with model %s",
|
||||
self.settings.llm_backend,
|
||||
self.settings.llm_model,
|
||||
)
|
||||
result = self.llm.chat(messages)
|
||||
logger.debug("Chat result: %s", result)
|
||||
return result
|
||||
@contextmanager
|
||||
def _normalize_timeouts(self) -> Iterator[None]:
|
||||
try:
|
||||
yield
|
||||
except httpx.TimeoutException as exc:
|
||||
raise LLMTimeoutError from exc
|
||||
except Exception as exc:
|
||||
if self._is_openai_timeout(exc):
|
||||
raise LLMTimeoutError from exc
|
||||
raise
|
||||
|
||||
def _is_openai_timeout(self, exc: Exception) -> bool:
|
||||
if self.settings.llm_backend != LLMBackend.OPENAI_LIKE:
|
||||
return False
|
||||
|
||||
# Keep OpenAI imports out of module import paths and only load the SDK
|
||||
# when translating an error from an OpenAI-backed request.
|
||||
from openai import APITimeoutError
|
||||
|
||||
return isinstance(exc, APITimeoutError)
|
||||
|
||||
@@ -32,15 +32,18 @@ def get_embedding_model(config: AIConfig) -> "BaseEmbedding":
|
||||
http_client = create_pinned_httpx_client(
|
||||
endpoint,
|
||||
allow_internal=config.llm_allow_internal_endpoints,
|
||||
timeout=config.llm_request_timeout,
|
||||
)
|
||||
async_http_client = create_pinned_async_httpx_client(
|
||||
endpoint,
|
||||
allow_internal=config.llm_allow_internal_endpoints,
|
||||
timeout=config.llm_request_timeout,
|
||||
)
|
||||
return OpenAILikeEmbedding(
|
||||
model_name=config.llm_embedding_model or "text-embedding-3-small",
|
||||
api_key=config.llm_api_key,
|
||||
api_base=endpoint,
|
||||
timeout=config.llm_request_timeout,
|
||||
http_client=http_client,
|
||||
async_http_client=async_http_client,
|
||||
)
|
||||
@@ -73,12 +76,14 @@ def get_embedding_model(config: AIConfig) -> "BaseEmbedding":
|
||||
)
|
||||
embedding._client = Client(
|
||||
host=endpoint,
|
||||
timeout=config.llm_request_timeout,
|
||||
transport=PinnedHostHTTPTransport(
|
||||
allow_internal=config.llm_allow_internal_endpoints,
|
||||
),
|
||||
)
|
||||
embedding._async_client = AsyncClient(
|
||||
host=endpoint,
|
||||
timeout=config.llm_request_timeout,
|
||||
transport=PinnedHostAsyncHTTPTransport(
|
||||
allow_internal=config.llm_allow_internal_endpoints,
|
||||
),
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
class LLMTimeoutError(Exception):
|
||||
pass
|
||||
@@ -3,12 +3,14 @@ from unittest.mock import ANY
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import httpx
|
||||
import openai
|
||||
import pytest
|
||||
from llama_index.core.llms import ChatMessage
|
||||
from llama_index.core.llms.llm import ToolSelection
|
||||
|
||||
from paperless_ai.client import LLM_SYSTEM_PROMPT
|
||||
from paperless_ai.client import AIClient
|
||||
from paperless_ai.exceptions import LLMTimeoutError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -17,6 +19,7 @@ def mock_ai_config():
|
||||
mock_config = MagicMock()
|
||||
mock_config.llm_allow_internal_endpoints = True
|
||||
mock_config.llm_context_size = 8192
|
||||
mock_config.llm_request_timeout = 120
|
||||
MockAIConfig.return_value = mock_config
|
||||
yield mock_config
|
||||
|
||||
@@ -64,6 +67,7 @@ def test_get_llm_openai(mock_ai_config, mock_openai_llm):
|
||||
model="test_model",
|
||||
api_base="http://test-url",
|
||||
api_key="test_api_key",
|
||||
timeout=120,
|
||||
is_chat_model=True,
|
||||
is_function_calling_model=True,
|
||||
system_prompt=LLM_SYSTEM_PROMPT,
|
||||
@@ -151,17 +155,38 @@ def test_run_llm_query_openai_uses_tools(mock_ai_config, mock_openai_llm):
|
||||
mock_llm_instance.chat_with_tools.assert_called_once()
|
||||
|
||||
|
||||
def test_run_chat(mock_ai_config, mock_ollama_llm):
|
||||
def test_run_llm_query_openai_timeout_raises_local_error(
|
||||
mock_ai_config,
|
||||
mock_openai_llm,
|
||||
):
|
||||
mock_ai_config.llm_backend = "openai-like"
|
||||
mock_ai_config.llm_model = "test_model"
|
||||
mock_ai_config.llm_api_key = "test_api_key"
|
||||
mock_ai_config.llm_endpoint = "http://test-url"
|
||||
|
||||
request = httpx.Request("POST", "http://test-url/v1/chat/completions")
|
||||
mock_openai_llm.return_value.chat_with_tools.side_effect = openai.APITimeoutError(
|
||||
request,
|
||||
)
|
||||
|
||||
client = AIClient()
|
||||
|
||||
with pytest.raises(LLMTimeoutError):
|
||||
client.run_llm_query("test_prompt")
|
||||
|
||||
|
||||
def test_run_llm_query_httpx_timeout_raises_local_error(
|
||||
mock_ai_config,
|
||||
mock_ollama_llm,
|
||||
):
|
||||
mock_ai_config.llm_backend = "ollama"
|
||||
mock_ai_config.llm_model = "test_model"
|
||||
mock_ai_config.llm_endpoint = "http://test-url"
|
||||
|
||||
mock_llm_instance = mock_ollama_llm.return_value
|
||||
mock_llm_instance.chat.return_value = "test_chat_result"
|
||||
mock_llm_instance.chat.side_effect = httpx.ReadTimeout("timed out")
|
||||
|
||||
client = AIClient()
|
||||
messages = [ChatMessage(role="user", content="Hello")]
|
||||
result = client.run_chat(messages)
|
||||
|
||||
mock_llm_instance.chat.assert_called_once_with(messages)
|
||||
assert result == "test_chat_result"
|
||||
with pytest.raises(LLMTimeoutError):
|
||||
client.run_llm_query("test_prompt")
|
||||
|
||||
@@ -19,6 +19,7 @@ def mock_ai_config():
|
||||
MockAIConfig.return_value.llm_embedding_endpoint = None
|
||||
MockAIConfig.return_value.llm_allow_internal_endpoints = True
|
||||
MockAIConfig.return_value.llm_context_size = 8192
|
||||
MockAIConfig.return_value.llm_request_timeout = 120
|
||||
yield MockAIConfig
|
||||
|
||||
|
||||
@@ -71,6 +72,7 @@ def test_get_embedding_model_openai(mock_ai_config):
|
||||
model_name="text-embedding-3-small",
|
||||
api_key="test_api_key",
|
||||
api_base="http://test-url",
|
||||
timeout=120,
|
||||
http_client=ANY,
|
||||
async_http_client=ANY,
|
||||
)
|
||||
@@ -92,6 +94,7 @@ def test_get_embedding_model_openai_prefers_embedding_endpoint(mock_ai_config):
|
||||
model_name="text-embedding-3-small",
|
||||
api_key="test_api_key",
|
||||
api_base="http://embedding-url",
|
||||
timeout=120,
|
||||
http_client=ANY,
|
||||
async_http_client=ANY,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user