mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-28 16:24:19 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b2e4cbd980 | |||
| 7632b49e90 | |||
| 1a5c370ed5 |
+3
-1
@@ -104,6 +104,8 @@ ARG JBIG2ENC_VERSION=0.30
|
||||
# Set Python environment variables
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
# Ignore warning from Whitenoise about async iterators
|
||||
PYTHONWARNINGS="ignore:::django.http.response:517" \
|
||||
PNGX_CONTAINERIZED=1 \
|
||||
# https://docs.astral.sh/uv/reference/settings/#link-mode
|
||||
UV_LINK_MODE=copy
|
||||
@@ -237,7 +239,7 @@ RUN set -eux \
|
||||
&& echo "Making fontconfig cache writable for arbitrary container UIDs" \
|
||||
&& chmod 1777 /var/cache/fontconfig \
|
||||
&& echo "Collecting static files" \
|
||||
&& PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input \
|
||||
&& PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input --link \
|
||||
&& PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py compilemessages \
|
||||
&& /usr/local/bin/deduplicate.py --verbose /usr/src/paperless/static/
|
||||
|
||||
|
||||
@@ -8,13 +8,6 @@ export GRANIAN_HOST=${GRANIAN_HOST:-${PAPERLESS_BIND_ADDR:-"::"}}
|
||||
export GRANIAN_PORT=${GRANIAN_PORT:-${PAPERLESS_PORT:-8000}}
|
||||
export GRANIAN_WORKERS=${GRANIAN_WORKERS:-${PAPERLESS_WEBSERVER_WORKERS:-1}}
|
||||
|
||||
# Static file serving: Granian matches against the raw URI path (before any
|
||||
# SCRIPT_NAME stripping), so the route must include the subpath prefix.
|
||||
_static_dir="${PAPERLESS_STATICDIR:-/usr/src/paperless/static}"
|
||||
_static_route="${PAPERLESS_FORCE_SCRIPT_NAME}/static"
|
||||
export GRANIAN_STATIC_PATH_MOUNT=${GRANIAN_STATIC_PATH_MOUNT:-${_static_dir}}
|
||||
export GRANIAN_STATIC_PATH_ROUTE=${GRANIAN_STATIC_PATH_ROUTE:-${_static_route:-/static}}
|
||||
|
||||
# Only set GRANIAN_URL_PATH_PREFIX if PAPERLESS_FORCE_SCRIPT_NAME is set
|
||||
if [[ -n "${PAPERLESS_FORCE_SCRIPT_NAME}" ]]; then
|
||||
export GRANIAN_URL_PATH_PREFIX=${PAPERLESS_FORCE_SCRIPT_NAME}
|
||||
|
||||
+1
-1
@@ -16,7 +16,7 @@ classifiers = [
|
||||
dependencies = [
|
||||
"azure-ai-documentintelligence>=1.0.2",
|
||||
"babel>=2.17",
|
||||
"bleach~=6.3.0",
|
||||
"bleach[css]~=6.3.0",
|
||||
"celery[redis]~=5.6.2",
|
||||
"channels~=4.2",
|
||||
"channels-redis~=4.2",
|
||||
|
||||
@@ -23,10 +23,6 @@ ExecStart=/bin/sh -c '\
|
||||
[ -n "$PAPERLESS_WEBSERVER_WORKERS" ] && export GRANIAN_WORKERS=$PAPERLESS_WEBSERVER_WORKERS; \
|
||||
# URL path prefix: only set if PAPERLESS_FORCE_SCRIPT_NAME exists \
|
||||
[ -n "$PAPERLESS_FORCE_SCRIPT_NAME" ] && export GRANIAN_URL_PATH_PREFIX=$PAPERLESS_FORCE_SCRIPT_NAME; \
|
||||
# Static file serving: Granian matches the raw URI path (before SCRIPT_NAME stripping), \
|
||||
# so the route must include any subpath prefix. \
|
||||
[ -z "$GRANIAN_STATIC_PATH_MOUNT" ] && export GRANIAN_STATIC_PATH_MOUNT=${PAPERLESS_STATICDIR:-/opt/paperless/static}; \
|
||||
[ -z "$GRANIAN_STATIC_PATH_ROUTE" ] && export GRANIAN_STATIC_PATH_ROUTE="${PAPERLESS_FORCE_SCRIPT_NAME}/static"; \
|
||||
exec granian --interface asginl --ws --loop uvloop "paperless.asgi:application"'
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
from typing import Self
|
||||
from typing import TypedDict
|
||||
from typing import TypeVar
|
||||
@@ -46,11 +43,6 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger("paperless.search")
|
||||
|
||||
_LOCK_TIMEOUT_SECONDS: Final[float] = 10.0 # per-attempt acquire timeout
|
||||
_LOCK_RETRY_ATTEMPTS: Final[int] = 4 # total attempts (1 initial + 3 retries)
|
||||
_LOCK_BACKOFF_BASE: Final[float] = 1.0 # seconds
|
||||
_LOCK_BACKOFF_CAP: Final[float] = 10.0 # seconds
|
||||
|
||||
_WORD_RE = regex.compile(r"\w+")
|
||||
_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted content
|
||||
|
||||
@@ -191,27 +183,12 @@ class WriteBatch:
|
||||
if self._backend._path is not None:
|
||||
lock_path = self._backend._path / ".tantivy.lock"
|
||||
self._lock = filelock.FileLock(str(lock_path))
|
||||
for attempt in range(_LOCK_RETRY_ATTEMPTS):
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout)
|
||||
break
|
||||
except filelock.Timeout:
|
||||
if attempt == _LOCK_RETRY_ATTEMPTS - 1:
|
||||
raise SearchIndexLockError(
|
||||
f"Could not acquire index lock after {_LOCK_RETRY_ATTEMPTS} "
|
||||
f"attempts (timeout={self._lock_timeout}s each)",
|
||||
)
|
||||
sleep_s = random.uniform(
|
||||
0,
|
||||
min(_LOCK_BACKOFF_CAP, _LOCK_BACKOFF_BASE * (2**attempt)),
|
||||
)
|
||||
logger.debug(
|
||||
"Index lock contention; retrying in %.2fs (attempt %d/%d)",
|
||||
sleep_s,
|
||||
attempt + 1,
|
||||
_LOCK_RETRY_ATTEMPTS,
|
||||
)
|
||||
time.sleep(sleep_s)
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout)
|
||||
except filelock.Timeout as e: # pragma: no cover
|
||||
raise SearchIndexLockError(
|
||||
f"Could not acquire index lock within {self._lock_timeout}s",
|
||||
) from e
|
||||
|
||||
self._raw_writer = self._backend._index.writer()
|
||||
return self
|
||||
@@ -513,28 +490,13 @@ class TantivyBackend:
|
||||
Convenience method for single-document updates. For bulk operations,
|
||||
use batch_update() context manager for better performance.
|
||||
|
||||
On lock exhaustion after all retry attempts, schedules a deferred
|
||||
index_document Celery task and returns normally. Callers will NOT
|
||||
receive a SearchIndexLockError; the index write is deferred silently.
|
||||
|
||||
Args:
|
||||
document: Django Document instance to index
|
||||
effective_content: Override document.content for indexing
|
||||
"""
|
||||
self._ensure_open()
|
||||
try:
|
||||
with self.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
|
||||
batch.add_or_update(document, effective_content)
|
||||
except SearchIndexLockError:
|
||||
logger.error(
|
||||
"Search index lock exhausted for document %d after %d attempts; "
|
||||
"scheduling deferred index write",
|
||||
document.pk,
|
||||
_LOCK_RETRY_ATTEMPTS,
|
||||
)
|
||||
from documents.tasks import index_document
|
||||
|
||||
index_document.apply_async(args=[document.pk], countdown=60)
|
||||
with self.batch_update(lock_timeout=5.0) as batch:
|
||||
batch.add_or_update(document, effective_content)
|
||||
|
||||
def remove(self, doc_id: int) -> None:
|
||||
"""
|
||||
@@ -543,27 +505,12 @@ class TantivyBackend:
|
||||
Convenience method for single-document removal. For bulk operations,
|
||||
use batch_update() context manager for better performance.
|
||||
|
||||
On lock exhaustion after all retry attempts, schedules a deferred
|
||||
remove_document_from_index Celery task and returns normally.
|
||||
Callers will NOT receive a SearchIndexLockError.
|
||||
|
||||
Args:
|
||||
doc_id: Primary key of the document to remove
|
||||
"""
|
||||
self._ensure_open()
|
||||
try:
|
||||
with self.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
|
||||
batch.remove(doc_id)
|
||||
except SearchIndexLockError:
|
||||
logger.error(
|
||||
"Search index lock exhausted for doc_id %d after %d attempts; "
|
||||
"scheduling deferred index removal",
|
||||
doc_id,
|
||||
_LOCK_RETRY_ATTEMPTS,
|
||||
)
|
||||
from documents.tasks import remove_document_from_index
|
||||
|
||||
remove_document_from_index.apply_async(args=[doc_id], countdown=60)
|
||||
with self.batch_update(lock_timeout=5.0) as batch:
|
||||
batch.remove(doc_id)
|
||||
|
||||
def highlight_hits(
|
||||
self,
|
||||
|
||||
@@ -56,7 +56,6 @@ from documents.plugins.base import StopConsumeTaskError
|
||||
from documents.plugins.helpers import ProgressManager
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.sanity_checker import SanityCheckFailedException
|
||||
from documents.search._backend import SearchIndexLockError
|
||||
from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
@@ -85,63 +84,6 @@ def index_optimize() -> None:
|
||||
)
|
||||
|
||||
|
||||
@shared_task(
|
||||
bind=True,
|
||||
ignore_result=True,
|
||||
autoretry_for=(SearchIndexLockError,),
|
||||
max_retries=5,
|
||||
retry_backoff=60,
|
||||
retry_jitter=True,
|
||||
)
|
||||
def index_document(self, document_id: int) -> None:
|
||||
"""
|
||||
Deferred single-document index write.
|
||||
|
||||
Used as a self-healing fallback when add_or_update() exhausts its lock retry
|
||||
budget during high-concurrency consumption. Runs via batch_update() directly
|
||||
to avoid re-entering the deferred scheduling path in add_or_update().
|
||||
|
||||
If the document was deleted before this task runs, it exits cleanly.
|
||||
"""
|
||||
from documents.search import get_backend
|
||||
|
||||
try:
|
||||
document = Document.objects.get(pk=document_id)
|
||||
except Document.DoesNotExist:
|
||||
logger.info(
|
||||
"index_document: document %d no longer exists; skipping",
|
||||
document_id,
|
||||
)
|
||||
return
|
||||
with get_backend().batch_update() as batch:
|
||||
batch.add_or_update(
|
||||
document,
|
||||
effective_content=document.get_effective_content(),
|
||||
)
|
||||
|
||||
|
||||
@shared_task(
|
||||
bind=True,
|
||||
ignore_result=True,
|
||||
autoretry_for=(SearchIndexLockError,),
|
||||
max_retries=5,
|
||||
retry_backoff=60,
|
||||
retry_jitter=True,
|
||||
)
|
||||
def remove_document_from_index(self, doc_id: int) -> None:
|
||||
"""
|
||||
Deferred single-document index removal.
|
||||
|
||||
Used as a self-healing fallback when remove() exhausts its lock retry budget.
|
||||
Operates only on the Tantivy index; no database lookup required.
|
||||
If the document has already been removed, the term-query delete is a no-op.
|
||||
"""
|
||||
from documents.search import get_backend
|
||||
|
||||
with get_backend().batch_update() as batch:
|
||||
batch.remove(doc_id)
|
||||
|
||||
|
||||
@shared_task
|
||||
def train_classifier(
|
||||
*,
|
||||
|
||||
@@ -1,248 +0,0 @@
|
||||
"""Tests for search index lock backoff, retry logic, and self-healing deferred tasks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import filelock
|
||||
import pytest
|
||||
|
||||
from documents.search._backend import _LOCK_BACKOFF_CAP
|
||||
from documents.search._backend import _LOCK_RETRY_ATTEMPTS
|
||||
from documents.search._backend import _LOCK_TIMEOUT_SECONDS
|
||||
from documents.search._backend import SearchIndexLockError
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.tasks import index_document
|
||||
from documents.tasks import remove_document_from_index
|
||||
from documents.tests.factories import DocumentFactory
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
pytestmark = pytest.mark.search
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def disk_backend(tmp_path: Path) -> Generator[TantivyBackend, None, None]:
|
||||
"""On-disk TantivyBackend so the file-lock code path is exercised."""
|
||||
b = TantivyBackend(path=tmp_path)
|
||||
b.open()
|
||||
try:
|
||||
yield b
|
||||
finally:
|
||||
b.close()
|
||||
|
||||
|
||||
class TestWriteBatchLockRetry:
|
||||
"""Test WriteBatch retry loop with backoff + full jitter."""
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_lock_retries_then_succeeds(
|
||||
self,
|
||||
disk_backend: TantivyBackend,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""Timeout on first 3 attempts then success on 4th — document must be indexed."""
|
||||
doc = DocumentFactory()
|
||||
|
||||
acquire_calls = 0
|
||||
|
||||
def flaky_acquire(timeout: float) -> None:
|
||||
nonlocal acquire_calls
|
||||
acquire_calls += 1
|
||||
# Raise Timeout for first _LOCK_RETRY_ATTEMPTS - 1 calls, succeed on last
|
||||
if acquire_calls < _LOCK_RETRY_ATTEMPTS:
|
||||
raise filelock.Timeout("")
|
||||
|
||||
sleep_values: list[float] = []
|
||||
|
||||
mocker.patch(
|
||||
"documents.search._backend.filelock.FileLock.acquire",
|
||||
side_effect=flaky_acquire,
|
||||
)
|
||||
mock_sleep = mocker.patch(
|
||||
"documents.search._backend.time.sleep",
|
||||
side_effect=lambda s: sleep_values.append(s),
|
||||
)
|
||||
|
||||
# Should not raise — 4th attempt succeeds
|
||||
with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
|
||||
batch.add_or_update(doc)
|
||||
|
||||
# sleep called exactly _LOCK_RETRY_ATTEMPTS - 1 times (once per failed attempt)
|
||||
assert mock_sleep.call_count == _LOCK_RETRY_ATTEMPTS - 1
|
||||
|
||||
# All sleep values must be in [0, _LOCK_BACKOFF_CAP]
|
||||
for s in sleep_values:
|
||||
assert 0 <= s <= _LOCK_BACKOFF_CAP, (
|
||||
f"Sleep value {s} outside [0, {_LOCK_BACKOFF_CAP}]"
|
||||
)
|
||||
|
||||
def test_lock_exhaustion_raises_search_index_lock_error(
|
||||
self,
|
||||
disk_backend: TantivyBackend,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""All acquire attempts raise Timeout — WriteBatch must raise SearchIndexLockError."""
|
||||
mocker.patch(
|
||||
"documents.search._backend.filelock.FileLock.acquire",
|
||||
side_effect=filelock.Timeout(""),
|
||||
)
|
||||
mocker.patch("documents.search._backend.time.sleep")
|
||||
|
||||
with pytest.raises(SearchIndexLockError):
|
||||
with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS):
|
||||
pass
|
||||
|
||||
def test_jitter_values_in_range(
|
||||
self,
|
||||
disk_backend: TantivyBackend,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""Sleep values must always lie in [0, _LOCK_BACKOFF_CAP] across many samples."""
|
||||
mocker.patch(
|
||||
"documents.search._backend.filelock.FileLock.acquire",
|
||||
side_effect=filelock.Timeout(""),
|
||||
)
|
||||
sleep_values: list[float] = []
|
||||
mocker.patch(
|
||||
"documents.search._backend.time.sleep",
|
||||
side_effect=lambda s: sleep_values.append(s),
|
||||
)
|
||||
for _ in range(50):
|
||||
sleep_values.clear()
|
||||
with pytest.raises(SearchIndexLockError):
|
||||
with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS):
|
||||
pass
|
||||
|
||||
for s in sleep_values:
|
||||
assert 0 <= s <= _LOCK_BACKOFF_CAP, (
|
||||
f"Jitter {s} exceeds cap {_LOCK_BACKOFF_CAP}"
|
||||
)
|
||||
|
||||
|
||||
class TestAddOrUpdateDeferredScheduling:
|
||||
"""Test that add_or_update() and remove() defer to Celery on lock exhaustion."""
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_lock_exhaustion_schedules_deferred_task(
|
||||
self,
|
||||
disk_backend: TantivyBackend,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""Lock exhaustion in add_or_update must schedule index_document task, not raise."""
|
||||
doc = DocumentFactory()
|
||||
|
||||
mocker.patch(
|
||||
"documents.search._backend.filelock.FileLock.acquire",
|
||||
side_effect=filelock.Timeout(""),
|
||||
)
|
||||
mocker.patch("documents.search._backend.time.sleep")
|
||||
mock_apply = mocker.patch("documents.tasks.index_document.apply_async")
|
||||
|
||||
# Must NOT raise
|
||||
disk_backend.add_or_update(doc)
|
||||
|
||||
mock_apply.assert_called_once_with(args=[doc.pk], countdown=60)
|
||||
|
||||
def test_remove_exhaustion_schedules_deferred_task(
|
||||
self,
|
||||
disk_backend: TantivyBackend,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""Lock exhaustion in remove() must schedule remove_document_from_index task, not raise."""
|
||||
doc_id = 503
|
||||
|
||||
mocker.patch(
|
||||
"documents.search._backend.filelock.FileLock.acquire",
|
||||
side_effect=filelock.Timeout(""),
|
||||
)
|
||||
mocker.patch("documents.search._backend.time.sleep")
|
||||
mock_apply = mocker.patch(
|
||||
"documents.tasks.remove_document_from_index.apply_async",
|
||||
)
|
||||
|
||||
# Must NOT raise
|
||||
disk_backend.remove(doc_id)
|
||||
|
||||
mock_apply.assert_called_once_with(args=[doc_id], countdown=60)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestIndexDocumentTask:
|
||||
"""Test the deferred index_document and remove_document_from_index Celery tasks."""
|
||||
|
||||
def test_index_document_task_skips_deleted_document(
|
||||
self,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
"""index_document with a non-existent doc_id must return cleanly and log INFO."""
|
||||
nonexistent_id = 999999
|
||||
|
||||
with caplog.at_level(logging.INFO, logger="paperless.tasks"):
|
||||
index_document(nonexistent_id)
|
||||
|
||||
assert any("no longer exists" in record.message for record in caplog.records), (
|
||||
"Expected INFO log about missing document"
|
||||
)
|
||||
|
||||
def test_index_document_task_indexes_existing_document(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""index_document task must add the document to the index via batch_update."""
|
||||
doc = DocumentFactory(content="via deferred task")
|
||||
|
||||
# get_backend is imported lazily inside the task: `from documents.search import get_backend`
|
||||
mocker.patch(
|
||||
"documents.search.get_backend",
|
||||
return_value=backend,
|
||||
)
|
||||
index_document(doc.pk)
|
||||
|
||||
ids = backend.search_ids("deferred task", user=None)
|
||||
assert doc.pk in ids
|
||||
|
||||
def test_remove_document_from_index_task_removes_existing_document(
|
||||
self,
|
||||
backend: TantivyBackend,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""remove_document_from_index task must remove the document from the index."""
|
||||
doc = DocumentFactory(content="will be removed by deferred task")
|
||||
backend.add_or_update(doc)
|
||||
assert doc.pk in backend.search_ids("removed", user=None)
|
||||
|
||||
mocker.patch("documents.search.get_backend", return_value=backend)
|
||||
remove_document_from_index(doc.pk)
|
||||
|
||||
assert doc.pk not in backend.search_ids("removed", user=None)
|
||||
|
||||
def test_task_does_not_swallow_lock_error(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""Verifies the task body propagates SearchIndexLockError so Celery's
|
||||
autoretry_for can catch it (rather than the task swallowing the error
|
||||
and silently succeeding)."""
|
||||
doc = DocumentFactory()
|
||||
|
||||
mock_batch = mocker.MagicMock()
|
||||
mock_batch.__enter__ = mocker.MagicMock(
|
||||
side_effect=SearchIndexLockError("exhausted"),
|
||||
)
|
||||
mock_batch.__exit__ = mocker.MagicMock(return_value=False)
|
||||
|
||||
mock_backend = mocker.MagicMock()
|
||||
mock_backend.batch_update.return_value = mock_batch
|
||||
|
||||
# get_backend is imported lazily inside the task: `from documents.search import get_backend`
|
||||
mocker.patch("documents.search.get_backend", return_value=mock_backend)
|
||||
|
||||
with pytest.raises(SearchIndexLockError):
|
||||
index_document(doc.pk)
|
||||
+254
-29
@@ -24,6 +24,7 @@ from typing import Self
|
||||
|
||||
from bleach import clean
|
||||
from bleach import linkify
|
||||
from bleach.css_sanitizer import CSSSanitizer
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.utils.timezone import is_naive
|
||||
@@ -38,6 +39,10 @@ from humanize import naturalsize
|
||||
from imap_tools import MailAttachment
|
||||
from imap_tools import MailMessage
|
||||
from tika_client import TikaClient
|
||||
from tinycss2 import parse_declaration_list
|
||||
from tinycss2 import parse_rule_list
|
||||
from tinycss2 import parse_stylesheet
|
||||
from tinycss2 import serialize
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
@@ -58,6 +63,238 @@ _SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"message/rfc822": ".eml",
|
||||
}
|
||||
|
||||
# Allow-lists used when sanitizing email HTML before it is rendered.

# Element names passed as ``tags=`` to bleach's ``clean``.
_EMAIL_HTML_TAGS = {
    "a", "abbr", "acronym", "address",
    "b", "blockquote", "br",
    "caption", "code",
    "dd", "del", "div", "dl", "dt",
    "em",
    "h1", "h2", "h3", "h4", "h5", "h6", "hr",
    "i", "img",
    "li",
    "ol",
    "p", "pre",
    "s", "small", "span", "style", "strong", "sub", "sup",
    "table", "tbody", "td", "tfoot", "th", "thead", "tr",
    "u", "ul",
}

# URL schemes passed as ``protocols=`` to bleach's ``clean``.
_EMAIL_HTML_PROTOCOLS = {"cid", "http", "https", "mailto"}

# Attribute names accepted on every allowed tag.
_EMAIL_HTML_GLOBAL_ATTRIBUTES = {
    "abbr", "align", "alt", "height", "style", "title", "width",
}

# Extra attribute names accepted only on the tag used as key.
_EMAIL_HTML_TAG_ATTRIBUTES = {
    "a": {"href", "name", "title"},
    "img": {"alt", "height", "src", "title", "width"},
    "ol": {"start", "type"},
    "td": {"colspan", "headers", "rowspan", "scope"},
    "th": {"colspan", "headers", "rowspan", "scope"},
    "ul": {"type"},
}

# CSS property names kept by the email CSS sanitizer.
_EMAIL_CSS_PROPERTIES = {
    "background-color",
    "border", "border-bottom", "border-collapse", "border-color",
    "border-left", "border-right", "border-spacing", "border-style",
    "border-top", "border-width",
    "color",
    "display",
    "font", "font-family", "font-size", "font-style", "font-weight",
    "height",
    "line-height",
    "margin", "margin-bottom", "margin-left", "margin-right", "margin-top",
    "max-width", "min-width",
    "padding", "padding-bottom", "padding-left", "padding-right",
    "padding-top",
    "text-align", "text-decoration",
    "vertical-align",
    "white-space", "width",
}
|
||||
|
||||
|
||||
def _has_unsafe_css_value(tokens: list) -> bool:
    """Report whether a CSS component-value list references a URL or ``expression()``.

    The check is recursive: nested blocks and nested function calls are
    searched too, so a ``url(...)`` wrapped in another function is still
    detected.

    Args:
        tokens: Parsed CSS component values; each item exposes ``type`` and,
            for function nodes, ``lower_name``.

    Returns:
        True if any token, at any nesting depth, is a URL token or a call to
        ``url()`` / ``expression()``; False otherwise (including for an empty
        list).
    """
    for token in tokens:
        if token.type == "url":
            return True
        if token.type == "function" and token.lower_name in {"expression", "url"}:
            return True
        # Bracketed blocks keep their children in ``content``; function nodes
        # keep theirs in ``arguments``. Look at both so nothing nested is
        # skipped, and tolerate a missing or ``None`` container.
        for child_attr in ("content", "arguments"):
            children = getattr(token, child_attr, None)
            if children and _has_unsafe_css_value(children):
                return True
    return False
|
||||
|
||||
|
||||
class EmailCSSSanitizer(CSSSanitizer):
    """CSS sanitizer that also rejects declarations with unsafe values."""

    def sanitize_css(self, style: str) -> str:
        """Return ``style`` reduced to its allowed, safe declarations.

        A declaration is kept only when its property name is in
        ``allowed_css_properties`` and ``_has_unsafe_css_value`` finds nothing
        objectionable in its value. Non-declaration nodes are dropped.
        """
        kept = []
        parsed = parse_declaration_list(
            style,
            skip_comments=True,
            skip_whitespace=True,
        )
        for node in parsed:
            if node.type != "declaration":
                continue
            if node.lower_name not in self.allowed_css_properties:
                continue
            if _has_unsafe_css_value(node.value):
                continue
            kept.append(node)
        return serialize(kept).strip()


# Shared sanitizer instance restricted to the email CSS property allow-list.
_EMAIL_CSS_SANITIZER = EmailCSSSanitizer(
    allowed_css_properties=_EMAIL_CSS_PROPERTIES,
)
|
||||
|
||||
|
||||
def _linkify_text_as_html(text: object) -> str:
    """Turn plain text into safe HTML with clickable URLs and email addresses.

    Lists are joined with newlines (each element stringified) and any other
    non-string value is stringified. The text is escaped first, then
    linkified, and finally every newline becomes ``<br>``.
    """
    if isinstance(text, list):
        plain = "\n".join([str(item) for item in text])
    elif isinstance(text, str):
        plain = text
    else:
        plain = str(text)

    linked = linkify(escape(plain), parse_email=True)
    return linked.replace("\n", "<br>")
|
||||
|
||||
|
||||
def _allow_email_html_attribute(tag: str, name: str, value: str) -> bool:
    """Decide whether attribute ``name`` with ``value`` may stay on ``tag``.

    The attribute must be in the global allow-list or in the allow-list for
    this tag. On top of that, ``<img src>`` must be a ``cid:`` reference and
    ``<a href>`` must start with ``http://``, ``https://`` or ``mailto:``
    (both compared case-insensitively).
    """
    permitted = _EMAIL_HTML_GLOBAL_ATTRIBUTES | _EMAIL_HTML_TAG_ATTRIBUTES.get(
        tag,
        set(),
    )
    if name not in permitted:
        return False

    target = (tag, name)
    if target == ("img", "src"):
        return value.lower().startswith("cid:")
    if target == ("a", "href"):
        return value.lower().startswith(("http://", "https://", "mailto:"))
    return True
|
||||
|
||||
|
||||
def _sanitize_email_css_rules(rules: list) -> str:
    """Serialize the safe subset of a parsed CSS rule list.

    Qualified rules are kept when both the selector and the sanitized
    declaration body are non-empty. ``@media`` rules with content are
    processed recursively and kept when the media query and the nested
    output are non-empty. Every other rule is dropped.
    """
    pieces = []

    for rule in rules:
        kind = rule.type

        if kind == "qualified-rule":
            selector = serialize(rule.prelude).strip()
            body = _EMAIL_CSS_SANITIZER.sanitize_css(
                serialize(rule.content),
            )
            if selector and body:
                pieces.append(f"{selector}{{{body}}}")
            continue

        # Only non-empty @media at-rules are considered from here on.
        if kind != "at-rule" or rule.lower_at_keyword != "media" or not rule.content:
            continue

        media_query = serialize(rule.prelude).strip()
        inner_rules = parse_rule_list(
            rule.content,
            skip_comments=True,
            skip_whitespace=True,
        )
        nested_css = _sanitize_email_css_rules(inner_rules)
        if media_query and nested_css:
            pieces.append(f"@media {media_query}{{{nested_css}}}")

    return "".join(pieces)
|
||||
|
||||
|
||||
def _sanitize_email_css_stylesheet(css: str) -> str:
    """Parse a stylesheet string and return its sanitized serialization."""
    top_level_rules = parse_stylesheet(
        css,
        skip_comments=True,
        skip_whitespace=True,
    )
    return _sanitize_email_css_rules(top_level_rules)
|
||||
|
||||
|
||||
def _clean_email_html(text: str) -> str:
    """Sanitize email HTML before rendering it with Chromium.

    Steps, in order:
    1. Remove complete ``<script>`` elements.
    2. Replace each complete ``<style>`` element by a placeholder and keep a
       sanitized copy of its stylesheet aside.
    3. Remove any remaining stray ``<script>`` / ``<style>`` tags.
    4. Put the sanitized style blocks back in place of their placeholders.
    5. Run the result through ``clean`` with the email allow-lists, then
       ``linkify`` it.
    """
    safe_styles = []

    def stash_style(match: re.Match[str]) -> str:
        # The placeholder index is the position the block is about to take.
        placeholder = f"__PAPERLESS_SANITIZED_STYLE_{len(safe_styles)}__"
        safe_styles.append(
            f"<style>{_sanitize_email_css_stylesheet(match.group(1))}</style>",
        )
        return placeholder

    without_scripts = re.sub(r"(?is)<script\b[^>]*>.*?</script\s*>", "", text)
    with_placeholders = re.sub(
        r"(?is)<style\b[^>]*>(.*?)</style\s*>",
        stash_style,
        without_scripts,
    )
    # Unbalanced leftovers are removed while the good blocks are parked.
    html = re.sub(r"(?is)</?(script|style)\b[^>]*>", "", with_placeholders)

    for position, block_html in enumerate(safe_styles):
        html = html.replace(
            f"__PAPERLESS_SANITIZED_STYLE_{position}__",
            block_html,
        )

    cleaned = clean(
        html,
        tags=_EMAIL_HTML_TAGS,
        attributes=_allow_email_html_attribute,
        protocols=_EMAIL_HTML_PROTOCOLS,
        css_sanitizer=_EMAIL_CSS_SANITIZER,
        strip=True,
        strip_comments=True,
    )
    return linkify(cleaned, parse_email=True)
|
||||
|
||||
|
||||
class MailDocumentParser:
|
||||
"""Parse .eml email files for Paperless-ngx.
|
||||
@@ -619,33 +856,29 @@ class MailDocumentParser:
|
||||
Path to the rendered HTML file inside the temporary directory.
|
||||
"""
|
||||
|
||||
def clean_html(text: str) -> str:
|
||||
"""Attempt to clean, escape, and linkify the given HTML string."""
|
||||
if isinstance(text, list):
|
||||
text = "\n".join([str(e) for e in text])
|
||||
if not isinstance(text, str):
|
||||
text = str(text)
|
||||
text = escape(text)
|
||||
text = clean(text)
|
||||
text = linkify(text, parse_email=True)
|
||||
text = text.replace("\n", "<br>")
|
||||
return text
|
||||
|
||||
data = {}
|
||||
|
||||
data["subject"] = clean_html(mail.subject)
|
||||
data["subject"] = _linkify_text_as_html(mail.subject)
|
||||
if data["subject"]:
|
||||
data["subject_label"] = "Subject"
|
||||
data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
|
||||
data["from"] = _linkify_text_as_html(
|
||||
mail.from_values.full if mail.from_values else "",
|
||||
)
|
||||
if data["from"]:
|
||||
data["from_label"] = "From"
|
||||
data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
|
||||
data["to"] = _linkify_text_as_html(
|
||||
", ".join(address.full for address in mail.to_values),
|
||||
)
|
||||
if data["to"]:
|
||||
data["to_label"] = "To"
|
||||
data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
|
||||
data["cc"] = _linkify_text_as_html(
|
||||
", ".join(address.full for address in mail.cc_values),
|
||||
)
|
||||
if data["cc"]:
|
||||
data["cc_label"] = "CC"
|
||||
data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
|
||||
data["bcc"] = _linkify_text_as_html(
|
||||
", ".join(address.full for address in mail.bcc_values),
|
||||
)
|
||||
if data["bcc"]:
|
||||
data["bcc_label"] = "BCC"
|
||||
|
||||
@@ -654,14 +887,14 @@ class MailDocumentParser:
|
||||
att.append(
|
||||
f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
|
||||
)
|
||||
data["attachments"] = clean_html(", ".join(att))
|
||||
data["attachments"] = _linkify_text_as_html(", ".join(att))
|
||||
if data["attachments"]:
|
||||
data["attachments_label"] = "Attachments"
|
||||
|
||||
data["date"] = clean_html(
|
||||
data["date"] = _linkify_text_as_html(
|
||||
timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
|
||||
)
|
||||
data["content"] = clean_html(mail.text.strip())
|
||||
data["content"] = _linkify_text_as_html(mail.text.strip())
|
||||
|
||||
from django.template.loader import render_to_string
|
||||
|
||||
@@ -761,19 +994,11 @@ class MailDocumentParser:
|
||||
If Gotenberg returns an error.
|
||||
"""
|
||||
|
||||
def clean_html_script(text: str) -> str:
|
||||
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
|
||||
text = compiled_open.sub("<div hidden ", text)
|
||||
|
||||
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
|
||||
text = compiled_close.sub("</div", text)
|
||||
return text
|
||||
|
||||
logger.info("Converting message html to PDF")
|
||||
|
||||
tempdir = Path(self._tempdir)
|
||||
|
||||
html_clean = clean_html_script(orig_html)
|
||||
html_clean = _clean_email_html(orig_html)
|
||||
html_clean_file = tempdir / "index.html"
|
||||
html_clean_file.write_text(html_clean)
|
||||
|
||||
|
||||
@@ -118,6 +118,7 @@ SCRATCH_DIR = get_path_from_env(
|
||||
env_apps = get_list_from_env("PAPERLESS_APPS")
|
||||
|
||||
INSTALLED_APPS = [
|
||||
"whitenoise.runserver_nostatic",
|
||||
"django.contrib.auth",
|
||||
"django.contrib.contenttypes",
|
||||
"django.contrib.sessions",
|
||||
@@ -172,6 +173,7 @@ if DEBUG:
|
||||
|
||||
MIDDLEWARE = [
|
||||
"django.middleware.security.SecurityMiddleware",
|
||||
"whitenoise.middleware.WhiteNoiseMiddleware",
|
||||
"django.contrib.sessions.middleware.SessionMiddleware",
|
||||
"corsheaders.middleware.CorsMiddleware",
|
||||
"django.middleware.locale.LocaleMiddleware",
|
||||
@@ -230,6 +232,7 @@ WSGI_APPLICATION = "paperless.wsgi.application"
|
||||
ASGI_APPLICATION = "paperless.asgi.application"
|
||||
|
||||
STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", BASE_URL + "static/")
|
||||
WHITENOISE_STATIC_PREFIX = "/static/"
|
||||
|
||||
STORAGES = {
|
||||
"staticfiles": {
|
||||
|
||||
@@ -15,6 +15,8 @@ from documents.parsers import ParseError
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.mail import _clean_email_html
|
||||
from paperless.parsers.mail import _linkify_text_as_html
|
||||
|
||||
|
||||
class TestMailParserProtocol:
|
||||
@@ -72,6 +74,75 @@ class TestMailParserProtocol:
|
||||
assert count > 0
|
||||
|
||||
|
||||
class TestMailHtmlCleaning:
|
||||
def test_text_fields_are_escaped_before_linkifying(self) -> None:
    """Plain-text fields are escaped, linkified, and get ``<br>`` line breaks."""
    result = _linkify_text_as_html(
        "Hello <b>bold</b>\nhttps://example.com?a=1&b=2",
    )

    # The helper escapes before linkifying, so markup from the input must
    # come back as entities and never as a live tag.
    assert "&lt;b&gt;bold&lt;/b&gt;" in result
    assert "<b>bold</b>" not in result
    assert "<br>" in result
    # The ampersand in the query string is escaped inside the href as well.
    assert '<a href="https://example.com?a=1&amp;b=2"' in result
|
||||
|
||||
def test_email_html_preserves_safe_structure(self) -> None:
    """Allowed tags, attributes, and CSS survive sanitization; bare URLs get linked."""
    source_html = """
<style>
.invoice { margin: 0; padding: 8px; color: #333; }
@media screen { .invoice { width: 100%; } }
</style>
<div style="margin: 0; padding: 8px; color: #333;">
<p>Hello <strong>there</strong></p>
</div>
<table style="width: 100%; border-collapse: collapse;">
<tr><td colspan="2" style="text-align: right;">Total</td></tr>
</table>
<img src="cid:logo" width="100" alt="Logo" style="display: block;">
Visit https://example.com
"""
    result = _clean_email_html(source_html)

    # Each fragment must appear verbatim in the sanitized output.
    expected_fragments = (
        "<style>.invoice{margin: 0;padding: 8px;color: #333;}",
        "@media screen{.invoice{width: 100%;}}</style>",
        'style="margin: 0;padding: 8px;color: #333;"',
        "<p>Hello <strong>there</strong></p>",
        'style="width: 100%;border-collapse: collapse;"',
        '<td colspan="2" style="text-align: right;">Total</td>',
        'style="display: block;"',
        '<img src="cid:logo" width="100" alt="Logo"',
        '<a href="https://example.com"',
    )
    for fragment in expected_fragments:
        assert fragment in result
|
||||
|
||||
def test_email_html_removes_executable_content(self) -> None:
|
||||
result = _clean_email_html(
|
||||
"""
|
||||
<div onclick="alert('x')">Message</div>
|
||||
<script>alert('script')</script>
|
||||
<style>
|
||||
@import url("https://example.com/x.css");
|
||||
body { color: url("https://example.com/x"); position: fixed; }
|
||||
@media screen { body { background-image: url("https://example.com/x"); } }
|
||||
</style>
|
||||
<a href="javascript:alert('x')">bad link</a>
|
||||
<a href="cid:logo">bad cid link</a>
|
||||
<img src="https://example.com/logo.png" onerror="alert('x')" alt="Logo"
|
||||
style="background-image: url('https://example.com/logo.png'); position: fixed;">
|
||||
""",
|
||||
)
|
||||
|
||||
assert "Message" in result
|
||||
assert "script" not in result
|
||||
assert "background" not in result
|
||||
assert "onclick" not in result
|
||||
assert "onerror" not in result
|
||||
assert "javascript:" not in result
|
||||
assert "background-image" not in result
|
||||
assert "position" not in result
|
||||
assert "@import" not in result
|
||||
assert "<a>bad link</a>" in result
|
||||
assert "<a>bad cid link</a>" in result
|
||||
assert '<img alt="Logo" style="">' in result
|
||||
|
||||
|
||||
class TestEmailFileParsing:
|
||||
"""
|
||||
Tests around reading a file and parsing it into a
|
||||
|
||||
@@ -313,6 +313,11 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
css = [
|
||||
{ name = "tinycss2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "1.2.0"
|
||||
@@ -2888,7 +2893,7 @@ source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "bleach", extra = ["css"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -3038,7 +3043,7 @@ typing = [
|
||||
requires-dist = [
|
||||
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
|
||||
{ name = "babel", specifier = ">=2.17" },
|
||||
{ name = "bleach", specifier = "~=6.3.0" },
|
||||
{ name = "bleach", extras = ["css"], specifier = "~=6.3.0" },
|
||||
{ name = "celery", extras = ["redis"], specifier = "~=5.6.2" },
|
||||
{ name = "channels", specifier = "~=4.2" },
|
||||
{ name = "channels-redis", specifier = "~=4.2" },
|
||||
@@ -4892,6 +4897,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinycss2"
|
||||
version = "1.4.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "webencodings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7a/fd/7a5ee21fd08ff70d3d33a5781c255cbe779659bd03278feb98b19ee550f4/tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7", size = 87085, upload-time = "2024-10-24T14:58:29.895Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289", size = 26610, upload-time = "2024-10-24T14:58:28.029Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytag"
|
||||
version = "2.2.1"
|
||||
|
||||
Reference in New Issue
Block a user