Compare commits

..

2 Commits

Author SHA1 Message Date
stumpylog 0bef44e108 Experiments with Granian static file serving 2026-05-29 08:14:41 -07:00
Trenton H 525b986e23 Fix: Handle tantivy index lock contention (#12856)
Implements and tests a retry with backoff + jitter for acquiring the index update lock. If we still can't get it, dispatch a Celery task to handle it later instead (also with retry).

Signed-off-by: stumpylog <797416+stumpylog@users.noreply.github.com>
2026-05-27 09:47:13 -07:00
11 changed files with 413 additions and 361 deletions
+1 -3
View File
@@ -104,8 +104,6 @@ ARG JBIG2ENC_VERSION=0.30
# Set Python environment variables # Set Python environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \ ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \ PYTHONUNBUFFERED=1 \
# Ignore warning from Whitenoise about async iterators
PYTHONWARNINGS="ignore:::django.http.response:517" \
PNGX_CONTAINERIZED=1 \ PNGX_CONTAINERIZED=1 \
# https://docs.astral.sh/uv/reference/settings/#link-mode # https://docs.astral.sh/uv/reference/settings/#link-mode
UV_LINK_MODE=copy UV_LINK_MODE=copy
@@ -239,7 +237,7 @@ RUN set -eux \
&& echo "Making fontconfig cache writable for arbitrary container UIDs" \ && echo "Making fontconfig cache writable for arbitrary container UIDs" \
&& chmod 1777 /var/cache/fontconfig \ && chmod 1777 /var/cache/fontconfig \
&& echo "Collecting static files" \ && echo "Collecting static files" \
&& PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input --link \ && PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input \
&& PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py compilemessages \ && PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py compilemessages \
&& /usr/local/bin/deduplicate.py --verbose /usr/src/paperless/static/ && /usr/local/bin/deduplicate.py --verbose /usr/src/paperless/static/
@@ -8,6 +8,13 @@ export GRANIAN_HOST=${GRANIAN_HOST:-${PAPERLESS_BIND_ADDR:-"::"}}
export GRANIAN_PORT=${GRANIAN_PORT:-${PAPERLESS_PORT:-8000}} export GRANIAN_PORT=${GRANIAN_PORT:-${PAPERLESS_PORT:-8000}}
export GRANIAN_WORKERS=${GRANIAN_WORKERS:-${PAPERLESS_WEBSERVER_WORKERS:-1}} export GRANIAN_WORKERS=${GRANIAN_WORKERS:-${PAPERLESS_WEBSERVER_WORKERS:-1}}
# Static file serving: Granian matches against the raw URI path (before any
# SCRIPT_NAME stripping), so the route must include the subpath prefix.
_static_dir="${PAPERLESS_STATICDIR:-/usr/src/paperless/static}"
_static_route="${PAPERLESS_FORCE_SCRIPT_NAME}/static"
export GRANIAN_STATIC_PATH_MOUNT=${GRANIAN_STATIC_PATH_MOUNT:-${_static_dir}}
export GRANIAN_STATIC_PATH_ROUTE=${GRANIAN_STATIC_PATH_ROUTE:-${_static_route:-/static}}
# Only set GRANIAN_URL_PATH_PREFIX if PAPERLESS_FORCE_SCRIPT_NAME is set # Only set GRANIAN_URL_PATH_PREFIX if PAPERLESS_FORCE_SCRIPT_NAME is set
if [[ -n "${PAPERLESS_FORCE_SCRIPT_NAME}" ]]; then if [[ -n "${PAPERLESS_FORCE_SCRIPT_NAME}" ]]; then
export GRANIAN_URL_PATH_PREFIX=${PAPERLESS_FORCE_SCRIPT_NAME} export GRANIAN_URL_PATH_PREFIX=${PAPERLESS_FORCE_SCRIPT_NAME}
+1 -1
View File
@@ -16,7 +16,7 @@ classifiers = [
dependencies = [ dependencies = [
"azure-ai-documentintelligence>=1.0.2", "azure-ai-documentintelligence>=1.0.2",
"babel>=2.17", "babel>=2.17",
"bleach[css]~=6.3.0", "bleach~=6.3.0",
"celery[redis]~=5.6.2", "celery[redis]~=5.6.2",
"channels~=4.2", "channels~=4.2",
"channels-redis~=4.2", "channels-redis~=4.2",
+4
View File
@@ -23,6 +23,10 @@ ExecStart=/bin/sh -c '\
[ -n "$PAPERLESS_WEBSERVER_WORKERS" ] && export GRANIAN_WORKERS=$PAPERLESS_WEBSERVER_WORKERS; \ [ -n "$PAPERLESS_WEBSERVER_WORKERS" ] && export GRANIAN_WORKERS=$PAPERLESS_WEBSERVER_WORKERS; \
# URL path prefix: only set if PAPERLESS_FORCE_SCRIPT_NAME exists \ # URL path prefix: only set if PAPERLESS_FORCE_SCRIPT_NAME exists \
[ -n "$PAPERLESS_FORCE_SCRIPT_NAME" ] && export GRANIAN_URL_PATH_PREFIX=$PAPERLESS_FORCE_SCRIPT_NAME; \ [ -n "$PAPERLESS_FORCE_SCRIPT_NAME" ] && export GRANIAN_URL_PATH_PREFIX=$PAPERLESS_FORCE_SCRIPT_NAME; \
# Static file serving: Granian matches the raw URI path (before SCRIPT_NAME stripping), \
# so the route must include any subpath prefix. \
[ -z "$GRANIAN_STATIC_PATH_MOUNT" ] && export GRANIAN_STATIC_PATH_MOUNT=${PAPERLESS_STATICDIR:-/opt/paperless/static}; \
[ -z "$GRANIAN_STATIC_PATH_ROUTE" ] && export GRANIAN_STATIC_PATH_ROUTE="${PAPERLESS_FORCE_SCRIPT_NAME}/static"; \
exec granian --interface asginl --ws --loop uvloop "paperless.asgi:application"' exec granian --interface asginl --ws --loop uvloop "paperless.asgi:application"'
[Install] [Install]
+63 -10
View File
@@ -1,12 +1,15 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
import random
import re import re
import threading import threading
import time
from datetime import UTC from datetime import UTC
from datetime import datetime from datetime import datetime
from enum import StrEnum from enum import StrEnum
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from typing import Final
from typing import Self from typing import Self
from typing import TypedDict from typing import TypedDict
from typing import TypeVar from typing import TypeVar
@@ -43,6 +46,11 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.search") logger = logging.getLogger("paperless.search")
_LOCK_TIMEOUT_SECONDS: Final[float] = 10.0 # per-attempt acquire timeout
_LOCK_RETRY_ATTEMPTS: Final[int] = 4 # total attempts (1 initial + 3 retries)
_LOCK_BACKOFF_BASE: Final[float] = 1.0 # seconds
_LOCK_BACKOFF_CAP: Final[float] = 10.0 # seconds
_WORD_RE = regex.compile(r"\w+") _WORD_RE = regex.compile(r"\w+")
_AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted content _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted content
@@ -183,12 +191,27 @@ class WriteBatch:
if self._backend._path is not None: if self._backend._path is not None:
lock_path = self._backend._path / ".tantivy.lock" lock_path = self._backend._path / ".tantivy.lock"
self._lock = filelock.FileLock(str(lock_path)) self._lock = filelock.FileLock(str(lock_path))
try: for attempt in range(_LOCK_RETRY_ATTEMPTS):
self._lock.acquire(timeout=self._lock_timeout) try:
except filelock.Timeout as e: # pragma: no cover self._lock.acquire(timeout=self._lock_timeout)
raise SearchIndexLockError( break
f"Could not acquire index lock within {self._lock_timeout}s", except filelock.Timeout:
) from e if attempt == _LOCK_RETRY_ATTEMPTS - 1:
raise SearchIndexLockError(
f"Could not acquire index lock after {_LOCK_RETRY_ATTEMPTS} "
f"attempts (timeout={self._lock_timeout}s each)",
)
sleep_s = random.uniform(
0,
min(_LOCK_BACKOFF_CAP, _LOCK_BACKOFF_BASE * (2**attempt)),
)
logger.debug(
"Index lock contention; retrying in %.2fs (attempt %d/%d)",
sleep_s,
attempt + 1,
_LOCK_RETRY_ATTEMPTS,
)
time.sleep(sleep_s)
self._raw_writer = self._backend._index.writer() self._raw_writer = self._backend._index.writer()
return self return self
@@ -490,13 +513,28 @@ class TantivyBackend:
Convenience method for single-document updates. For bulk operations, Convenience method for single-document updates. For bulk operations,
use batch_update() context manager for better performance. use batch_update() context manager for better performance.
On lock exhaustion after all retry attempts, schedules a deferred
index_document Celery task and returns normally. Callers will NOT
receive a SearchIndexLockError; the index write is deferred silently.
Args: Args:
document: Django Document instance to index document: Django Document instance to index
effective_content: Override document.content for indexing effective_content: Override document.content for indexing
""" """
self._ensure_open() self._ensure_open()
with self.batch_update(lock_timeout=5.0) as batch: try:
batch.add_or_update(document, effective_content) with self.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
batch.add_or_update(document, effective_content)
except SearchIndexLockError:
logger.error(
"Search index lock exhausted for document %d after %d attempts; "
"scheduling deferred index write",
document.pk,
_LOCK_RETRY_ATTEMPTS,
)
from documents.tasks import index_document
index_document.apply_async(args=[document.pk], countdown=60)
def remove(self, doc_id: int) -> None: def remove(self, doc_id: int) -> None:
""" """
@@ -505,12 +543,27 @@ class TantivyBackend:
Convenience method for single-document removal. For bulk operations, Convenience method for single-document removal. For bulk operations,
use batch_update() context manager for better performance. use batch_update() context manager for better performance.
On lock exhaustion after all retry attempts, schedules a deferred
remove_document_from_index Celery task and returns normally.
Callers will NOT receive a SearchIndexLockError.
Args: Args:
doc_id: Primary key of the document to remove doc_id: Primary key of the document to remove
""" """
self._ensure_open() self._ensure_open()
with self.batch_update(lock_timeout=5.0) as batch: try:
batch.remove(doc_id) with self.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
batch.remove(doc_id)
except SearchIndexLockError:
logger.error(
"Search index lock exhausted for doc_id %d after %d attempts; "
"scheduling deferred index removal",
doc_id,
_LOCK_RETRY_ATTEMPTS,
)
from documents.tasks import remove_document_from_index
remove_document_from_index.apply_async(args=[doc_id], countdown=60)
def highlight_hits( def highlight_hits(
self, self,
+58
View File
@@ -56,6 +56,7 @@ from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressManager from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions from documents.plugins.helpers import ProgressStatusOptions
from documents.sanity_checker import SanityCheckFailedException from documents.sanity_checker import SanityCheckFailedException
from documents.search._backend import SearchIndexLockError
from documents.signals import document_updated from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows from documents.signals.handlers import run_workflows
@@ -84,6 +85,63 @@ def index_optimize() -> None:
) )
@shared_task(
    bind=True,
    ignore_result=True,
    autoretry_for=(SearchIndexLockError,),
    max_retries=5,
    retry_backoff=60,
    retry_jitter=True,
)
def index_document(self, document_id: int) -> None:
    """
    Write a single document to the search index as a deferred task.

    This is the fallback scheduled by add_or_update() once its lock retry
    budget is exhausted. The write goes through batch_update() directly, so
    a SearchIndexLockError raised here is not rescheduled by add_or_update();
    it propagates out of the task body, where it matches autoretry_for.

    Args:
        document_id: Primary key of the Document to index. If no such row
            exists any more, the task logs at INFO level and returns.
    """
    from documents.search import get_backend

    try:
        document = Document.objects.get(pk=document_id)
    except Document.DoesNotExist:
        # Nothing left to index; the document was removed in the meantime.
        logger.info(
            "index_document: document %d no longer exists; skipping",
            document_id,
        )
        return
    else:
        backend = get_backend()
        with backend.batch_update() as writer:
            writer.add_or_update(
                document,
                effective_content=document.get_effective_content(),
            )
@shared_task(
    bind=True,
    ignore_result=True,
    autoretry_for=(SearchIndexLockError,),
    max_retries=5,
    retry_backoff=60,
    retry_jitter=True,
)
def remove_document_from_index(self, doc_id: int) -> None:
    """
    Remove a single document from the search index as a deferred task.

    This is the fallback scheduled by remove() once its lock retry budget is
    exhausted. Only the search index is touched; the database is not
    queried. A SearchIndexLockError raised while opening the batch propagates
    out of the task body, where it matches autoretry_for.

    Args:
        doc_id: Primary key of the document whose index entry is removed.
    """
    from documents.search import get_backend

    backend = get_backend()
    with backend.batch_update() as writer:
        writer.remove(doc_id)
@shared_task @shared_task
def train_classifier( def train_classifier(
*, *,
@@ -0,0 +1,248 @@
"""Tests for search index lock backoff, retry logic, and self-healing deferred tasks."""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
import filelock
import pytest
from documents.search._backend import _LOCK_BACKOFF_CAP
from documents.search._backend import _LOCK_RETRY_ATTEMPTS
from documents.search._backend import _LOCK_TIMEOUT_SECONDS
from documents.search._backend import SearchIndexLockError
from documents.search._backend import TantivyBackend
from documents.tasks import index_document
from documents.tasks import remove_document_from_index
from documents.tests.factories import DocumentFactory
if TYPE_CHECKING:
from collections.abc import Generator
from pathlib import Path
from pytest_mock import MockerFixture
pytestmark = pytest.mark.search
@pytest.fixture
def disk_backend(tmp_path: Path) -> Generator[TantivyBackend, None, None]:
    """Yield an opened TantivyBackend stored under ``tmp_path``.

    The backend is given a real path so that the file-lock code path is
    exercised. It is closed again on teardown, even if the test fails.
    """
    backend = TantivyBackend(path=tmp_path)
    backend.open()
    try:
        yield backend
    finally:
        backend.close()
class TestWriteBatchLockRetry:
    """Test WriteBatch retry loop with backoff + full jitter."""

    @pytest.mark.django_db
    def test_lock_retries_then_succeeds(
        self,
        disk_backend: TantivyBackend,
        mocker: MockerFixture,
    ) -> None:
        """Timeout on every attempt but the last; the batch must still open.

        Verifies that the lock is attempted exactly _LOCK_RETRY_ATTEMPTS
        times, that one sleep happens per failed attempt, and that every
        sleep duration lies in [0, _LOCK_BACKOFF_CAP].
        """
        doc = DocumentFactory()
        acquire_calls = 0

        def flaky_acquire(timeout: float) -> None:
            nonlocal acquire_calls
            acquire_calls += 1
            # Raise Timeout for the first _LOCK_RETRY_ATTEMPTS - 1 calls,
            # succeed on the last one.
            if acquire_calls < _LOCK_RETRY_ATTEMPTS:
                raise filelock.Timeout("")

        sleep_values: list[float] = []
        mocker.patch(
            "documents.search._backend.filelock.FileLock.acquire",
            side_effect=flaky_acquire,
        )
        mock_sleep = mocker.patch(
            "documents.search._backend.time.sleep",
            side_effect=sleep_values.append,
        )

        # Should not raise: the final attempt succeeds.
        with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
            batch.add_or_update(doc)

        # Every attempt in the budget was used, no more and no fewer.
        assert acquire_calls == _LOCK_RETRY_ATTEMPTS
        # sleep is called once per failed attempt.
        assert mock_sleep.call_count == _LOCK_RETRY_ATTEMPTS - 1
        assert len(sleep_values) == _LOCK_RETRY_ATTEMPTS - 1
        # All sleep values must be in [0, _LOCK_BACKOFF_CAP].
        for s in sleep_values:
            assert 0 <= s <= _LOCK_BACKOFF_CAP, (
                f"Sleep value {s} outside [0, {_LOCK_BACKOFF_CAP}]"
            )

    def test_lock_exhaustion_raises_search_index_lock_error(
        self,
        disk_backend: TantivyBackend,
        mocker: MockerFixture,
    ) -> None:
        """All acquire attempts raise Timeout — WriteBatch must raise SearchIndexLockError."""
        mocker.patch(
            "documents.search._backend.filelock.FileLock.acquire",
            side_effect=filelock.Timeout(""),
        )
        mocker.patch("documents.search._backend.time.sleep")

        with (
            pytest.raises(SearchIndexLockError),
            disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS),
        ):
            pass

    def test_jitter_values_in_range(
        self,
        disk_backend: TantivyBackend,
        mocker: MockerFixture,
    ) -> None:
        """Sleep values must always lie in [0, _LOCK_BACKOFF_CAP] across many samples."""
        mocker.patch(
            "documents.search._backend.filelock.FileLock.acquire",
            side_effect=filelock.Timeout(""),
        )
        sleep_values: list[float] = []
        mocker.patch(
            "documents.search._backend.time.sleep",
            side_effect=sleep_values.append,
        )

        for _ in range(50):
            sleep_values.clear()
            with (
                pytest.raises(SearchIndexLockError),
                disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS),
            ):
                pass
            # Guard against a vacuous pass: the final failed attempt raises
            # instead of sleeping, so exactly attempts - 1 sleeps are expected.
            assert len(sleep_values) == _LOCK_RETRY_ATTEMPTS - 1
            for s in sleep_values:
                assert 0 <= s <= _LOCK_BACKOFF_CAP, (
                    f"Jitter {s} exceeds cap {_LOCK_BACKOFF_CAP}"
                )
class TestAddOrUpdateDeferredScheduling:
    """Test that add_or_update() and remove() defer to Celery on lock exhaustion."""

    @pytest.mark.django_db
    def test_lock_exhaustion_schedules_deferred_task(
        self,
        disk_backend: TantivyBackend,
        mocker: MockerFixture,
    ) -> None:
        """Lock exhaustion in add_or_update must schedule index_document task, not raise."""
        document = DocumentFactory()

        # Every acquire attempt times out; sleeping is stubbed to keep the test fast.
        mocker.patch(
            "documents.search._backend.filelock.FileLock.acquire",
            side_effect=filelock.Timeout(""),
        )
        mocker.patch("documents.search._backend.time.sleep")
        apply_async = mocker.patch("documents.tasks.index_document.apply_async")

        # Must NOT raise
        disk_backend.add_or_update(document)

        apply_async.assert_called_once_with(args=[document.pk], countdown=60)

    def test_remove_exhaustion_schedules_deferred_task(
        self,
        disk_backend: TantivyBackend,
        mocker: MockerFixture,
    ) -> None:
        """Lock exhaustion in remove() must schedule remove_document_from_index task, not raise."""
        target_id = 503

        # Every acquire attempt times out; sleeping is stubbed to keep the test fast.
        mocker.patch(
            "documents.search._backend.filelock.FileLock.acquire",
            side_effect=filelock.Timeout(""),
        )
        mocker.patch("documents.search._backend.time.sleep")
        apply_async = mocker.patch(
            "documents.tasks.remove_document_from_index.apply_async",
        )

        # Must NOT raise
        disk_backend.remove(target_id)

        apply_async.assert_called_once_with(args=[target_id], countdown=60)
@pytest.mark.django_db
class TestIndexDocumentTask:
    """Test the deferred index_document and remove_document_from_index Celery tasks."""

    def test_index_document_task_skips_deleted_document(
        self,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """index_document with a non-existent doc_id must return cleanly and log INFO."""
        missing_pk = 999999

        with caplog.at_level(logging.INFO, logger="paperless.tasks"):
            index_document(missing_pk)

        messages = [record.message for record in caplog.records]
        assert any("no longer exists" in message for message in messages), (
            "Expected INFO log about missing document"
        )

    def test_index_document_task_indexes_existing_document(
        self,
        backend: TantivyBackend,
        mocker: MockerFixture,
    ) -> None:
        """index_document task must add the document to the index via batch_update."""
        document = DocumentFactory(content="via deferred task")
        # get_backend is imported lazily inside the task: `from documents.search import get_backend`
        mocker.patch(
            "documents.search.get_backend",
            return_value=backend,
        )

        index_document(document.pk)

        found = backend.search_ids("deferred task", user=None)
        assert document.pk in found

    def test_remove_document_from_index_task_removes_existing_document(
        self,
        backend: TantivyBackend,
        mocker: MockerFixture,
    ) -> None:
        """remove_document_from_index task must remove the document from the index."""
        document = DocumentFactory(content="will be removed by deferred task")
        backend.add_or_update(document)
        # Precondition: the document is searchable before the task runs.
        assert document.pk in backend.search_ids("removed", user=None)
        mocker.patch("documents.search.get_backend", return_value=backend)

        remove_document_from_index(document.pk)

        assert document.pk not in backend.search_ids("removed", user=None)

    def test_task_does_not_swallow_lock_error(
        self,
        mocker: MockerFixture,
    ) -> None:
        """Verifies the task body propagates SearchIndexLockError so Celery's
        autoretry_for can catch it (rather than the task swallowing the error
        and silently succeeding)."""
        document = DocumentFactory()

        # A batch context manager whose __enter__ fails as if the lock budget ran out.
        failing_batch = mocker.MagicMock()
        failing_batch.__enter__.side_effect = SearchIndexLockError("exhausted")
        failing_batch.__exit__.return_value = False

        fake_backend = mocker.MagicMock()
        fake_backend.batch_update.return_value = failing_batch
        # get_backend is imported lazily inside the task: `from documents.search import get_backend`
        mocker.patch("documents.search.get_backend", return_value=fake_backend)

        with pytest.raises(SearchIndexLockError):
            index_document(document.pk)
+29 -254
View File
@@ -24,7 +24,6 @@ from typing import Self
from bleach import clean from bleach import clean
from bleach import linkify from bleach import linkify
from bleach.css_sanitizer import CSSSanitizer
from django.conf import settings from django.conf import settings
from django.utils import timezone from django.utils import timezone
from django.utils.timezone import is_naive from django.utils.timezone import is_naive
@@ -39,10 +38,6 @@ from humanize import naturalsize
from imap_tools import MailAttachment from imap_tools import MailAttachment
from imap_tools import MailMessage from imap_tools import MailMessage
from tika_client import TikaClient from tika_client import TikaClient
from tinycss2 import parse_declaration_list
from tinycss2 import parse_rule_list
from tinycss2 import parse_stylesheet
from tinycss2 import serialize
from documents.parsers import ParseError from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf from documents.parsers import make_thumbnail_from_pdf
@@ -63,238 +58,6 @@ _SUPPORTED_MIME_TYPES: dict[str, str] = {
"message/rfc822": ".eml", "message/rfc822": ".eml",
} }
_EMAIL_HTML_TAGS = {
"a",
"abbr",
"acronym",
"address",
"b",
"blockquote",
"br",
"caption",
"code",
"dd",
"del",
"div",
"dl",
"dt",
"em",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"hr",
"i",
"img",
"li",
"ol",
"p",
"pre",
"s",
"small",
"span",
"style",
"strong",
"sub",
"sup",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"tr",
"u",
"ul",
}
_EMAIL_HTML_PROTOCOLS = {"cid", "http", "https", "mailto"}
_EMAIL_HTML_GLOBAL_ATTRIBUTES = {
"abbr",
"align",
"alt",
"height",
"style",
"title",
"width",
}
_EMAIL_HTML_TAG_ATTRIBUTES = {
"a": {"href", "name", "title"},
"img": {"alt", "height", "src", "title", "width"},
"ol": {"start", "type"},
"td": {"colspan", "headers", "rowspan", "scope"},
"th": {"colspan", "headers", "rowspan", "scope"},
"ul": {"type"},
}
_EMAIL_CSS_PROPERTIES = {
"background-color",
"border",
"border-bottom",
"border-collapse",
"border-color",
"border-left",
"border-right",
"border-spacing",
"border-style",
"border-top",
"border-width",
"color",
"display",
"font",
"font-family",
"font-size",
"font-style",
"font-weight",
"height",
"line-height",
"margin",
"margin-bottom",
"margin-left",
"margin-right",
"margin-top",
"max-width",
"min-width",
"padding",
"padding-bottom",
"padding-left",
"padding-right",
"padding-top",
"text-align",
"text-decoration",
"vertical-align",
"white-space",
"width",
}
def _has_unsafe_css_value(tokens: list) -> bool:
for token in tokens:
if token.type == "url":
return True
if token.type == "function" and token.lower_name in {"expression", "url"}:
return True
if hasattr(token, "content") and _has_unsafe_css_value(token.content):
return True
return False
class EmailCSSSanitizer(CSSSanitizer):
def sanitize_css(self, style: str) -> str:
declarations = parse_declaration_list(
style,
skip_comments=True,
skip_whitespace=True,
)
sanitized = [
declaration
for declaration in declarations
if declaration.type == "declaration"
and declaration.lower_name in self.allowed_css_properties
and not _has_unsafe_css_value(declaration.value)
]
return serialize(sanitized).strip()
_EMAIL_CSS_SANITIZER = EmailCSSSanitizer(
allowed_css_properties=_EMAIL_CSS_PROPERTIES,
)
def _linkify_text_as_html(text: object) -> str:
"""Escape plain text and linkify URLs/email addresses for safe HTML output."""
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if not isinstance(text, str):
text = str(text)
text = escape(text)
text = linkify(text, parse_email=True)
return text.replace("\n", "<br>")
def _allow_email_html_attribute(tag: str, name: str, value: str) -> bool:
if name not in _EMAIL_HTML_GLOBAL_ATTRIBUTES | _EMAIL_HTML_TAG_ATTRIBUTES.get(
tag,
set(),
):
return False
if tag == "img" and name == "src":
return value.lower().startswith("cid:")
if tag == "a" and name == "href":
return value.lower().startswith(("http://", "https://", "mailto:"))
return True
def _sanitize_email_css_rules(rules: list) -> str:
sanitized_rules = []
for rule in rules:
if rule.type == "qualified-rule":
selector = serialize(rule.prelude).strip()
declarations = _EMAIL_CSS_SANITIZER.sanitize_css(
serialize(rule.content),
)
if selector and declarations:
sanitized_rules.append(f"{selector}{{{declarations}}}")
elif (
rule.type == "at-rule" and rule.lower_at_keyword == "media" and rule.content
):
media_query = serialize(rule.prelude).strip()
nested_rules = _sanitize_email_css_rules(
parse_rule_list(
rule.content,
skip_comments=True,
skip_whitespace=True,
),
)
if media_query and nested_rules:
sanitized_rules.append(f"@media {media_query}{{{nested_rules}}}")
return "".join(sanitized_rules)
def _sanitize_email_css_stylesheet(css: str) -> str:
return _sanitize_email_css_rules(
parse_stylesheet(css, skip_comments=True, skip_whitespace=True),
)
def _clean_email_html(text: str) -> str:
"""Sanitize email HTML before rendering it with Chromium."""
sanitized_style_blocks = []
def sanitize_style_block(match: re.Match[str]) -> str:
sanitized_style_blocks.append(
f"<style>{_sanitize_email_css_stylesheet(match.group(1))}</style>",
)
return f"__PAPERLESS_SANITIZED_STYLE_{len(sanitized_style_blocks) - 1}__"
text = re.sub(r"(?is)<script\b[^>]*>.*?</script\s*>", "", text)
text = re.sub(
r"(?is)<style\b[^>]*>(.*?)</style\s*>",
sanitize_style_block,
text,
)
text = re.sub(r"(?is)</?(script|style)\b[^>]*>", "", text)
for index, style_block in enumerate(sanitized_style_blocks):
text = text.replace(f"__PAPERLESS_SANITIZED_STYLE_{index}__", style_block)
return linkify(
clean(
text,
tags=_EMAIL_HTML_TAGS,
attributes=_allow_email_html_attribute,
protocols=_EMAIL_HTML_PROTOCOLS,
css_sanitizer=_EMAIL_CSS_SANITIZER,
strip=True,
strip_comments=True,
),
parse_email=True,
)
class MailDocumentParser: class MailDocumentParser:
"""Parse .eml email files for Paperless-ngx. """Parse .eml email files for Paperless-ngx.
@@ -856,29 +619,33 @@ class MailDocumentParser:
Path to the rendered HTML file inside the temporary directory. Path to the rendered HTML file inside the temporary directory.
""" """
def clean_html(text: str) -> str:
    """Escape, sanitise and linkify a value for embedding in HTML.

    Lists are joined with newlines and any other non-string value is
    converted with ``str()``. The text is then escaped, passed through
    ``clean``, linkified (including e-mail addresses), and its newlines
    are turned into ``<br>`` tags.
    """
    if isinstance(text, list):
        text = "\n".join(str(item) for item in text)
    elif not isinstance(text, str):
        text = str(text)
    sanitized = clean(escape(text))
    linked = linkify(sanitized, parse_email=True)
    return linked.replace("\n", "<br>")
data = {} data = {}
data["subject"] = _linkify_text_as_html(mail.subject) data["subject"] = clean_html(mail.subject)
if data["subject"]: if data["subject"]:
data["subject_label"] = "Subject" data["subject_label"] = "Subject"
data["from"] = _linkify_text_as_html( data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
mail.from_values.full if mail.from_values else "",
)
if data["from"]: if data["from"]:
data["from_label"] = "From" data["from_label"] = "From"
data["to"] = _linkify_text_as_html( data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
", ".join(address.full for address in mail.to_values),
)
if data["to"]: if data["to"]:
data["to_label"] = "To" data["to_label"] = "To"
data["cc"] = _linkify_text_as_html( data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
", ".join(address.full for address in mail.cc_values),
)
if data["cc"]: if data["cc"]:
data["cc_label"] = "CC" data["cc_label"] = "CC"
data["bcc"] = _linkify_text_as_html( data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
", ".join(address.full for address in mail.bcc_values),
)
if data["bcc"]: if data["bcc"]:
data["bcc_label"] = "BCC" data["bcc_label"] = "BCC"
@@ -887,14 +654,14 @@ class MailDocumentParser:
att.append( att.append(
f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})", f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
) )
data["attachments"] = _linkify_text_as_html(", ".join(att)) data["attachments"] = clean_html(", ".join(att))
if data["attachments"]: if data["attachments"]:
data["attachments_label"] = "Attachments" data["attachments_label"] = "Attachments"
data["date"] = _linkify_text_as_html( data["date"] = clean_html(
timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"), timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
) )
data["content"] = _linkify_text_as_html(mail.text.strip()) data["content"] = clean_html(mail.text.strip())
from django.template.loader import render_to_string from django.template.loader import render_to_string
@@ -994,11 +761,19 @@ class MailDocumentParser:
If Gotenberg returns an error. If Gotenberg returns an error.
""" """
def clean_html_script(text: str) -> str:
    """Rewrite ``<script``/``</script`` tag openers into hidden ``<div>`` ones.

    Matching is case-insensitive. Only the literal tag prefixes are
    replaced; attributes and contents are left in place.
    """
    substitutions = (
        ("<script", "<div hidden "),
        ("</script", "</div"),
    )
    for needle, replacement in substitutions:
        text = re.sub(re.escape(needle), replacement, text, flags=re.IGNORECASE)
    return text
logger.info("Converting message html to PDF") logger.info("Converting message html to PDF")
tempdir = Path(self._tempdir) tempdir = Path(self._tempdir)
html_clean = _clean_email_html(orig_html) html_clean = clean_html_script(orig_html)
html_clean_file = tempdir / "index.html" html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean) html_clean_file.write_text(html_clean)
-3
View File
@@ -118,7 +118,6 @@ SCRATCH_DIR = get_path_from_env(
env_apps = get_list_from_env("PAPERLESS_APPS") env_apps = get_list_from_env("PAPERLESS_APPS")
INSTALLED_APPS = [ INSTALLED_APPS = [
"whitenoise.runserver_nostatic",
"django.contrib.auth", "django.contrib.auth",
"django.contrib.contenttypes", "django.contrib.contenttypes",
"django.contrib.sessions", "django.contrib.sessions",
@@ -173,7 +172,6 @@ if DEBUG:
MIDDLEWARE = [ MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware", "django.middleware.security.SecurityMiddleware",
"whitenoise.middleware.WhiteNoiseMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware", "django.contrib.sessions.middleware.SessionMiddleware",
"corsheaders.middleware.CorsMiddleware", "corsheaders.middleware.CorsMiddleware",
"django.middleware.locale.LocaleMiddleware", "django.middleware.locale.LocaleMiddleware",
@@ -232,7 +230,6 @@ WSGI_APPLICATION = "paperless.wsgi.application"
ASGI_APPLICATION = "paperless.asgi.application" ASGI_APPLICATION = "paperless.asgi.application"
STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", BASE_URL + "static/") STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", BASE_URL + "static/")
WHITENOISE_STATIC_PREFIX = "/static/"
STORAGES = { STORAGES = {
"staticfiles": { "staticfiles": {
@@ -15,8 +15,6 @@ from documents.parsers import ParseError
from paperless.parsers import ParserContext from paperless.parsers import ParserContext
from paperless.parsers import ParserProtocol from paperless.parsers import ParserProtocol
from paperless.parsers.mail import MailDocumentParser from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.mail import _clean_email_html
from paperless.parsers.mail import _linkify_text_as_html
class TestMailParserProtocol: class TestMailParserProtocol:
@@ -74,75 +72,6 @@ class TestMailParserProtocol:
assert count > 0 assert count > 0
class TestMailHtmlCleaning:
    """
    Tests for the mail parser helpers that turn email text and HTML
    into the markup handed on for rendering.
    """

    def test_text_fields_are_escaped_before_linkifying(self) -> None:
        """
        Plain text containing markup, a newline and a URL with a query
        string comes back with the markup escaped, a <br> present and the
        URL wrapped in an anchor whose href has its ampersand escaped.
        """
        linkified = _linkify_text_as_html(
            "Hello <b>bold</b>\nhttps://example.com?a=1&b=2",
        )

        expected_fragments = (
            "&lt;b&gt;bold&lt;/b&gt;",
            "<br>",
            '<a href="https://example.com?a=1&amp;b=2"',
        )
        for fragment in expected_fragments:
            assert fragment in linkified

    def test_email_html_preserves_safe_structure(self) -> None:
        """
        Stylesheets, inline styles, tables, cid: images and bare URLs in
        the input are all still present (in normalised form) in the output.
        """
        source_html = """
<style>
.invoice { margin: 0; padding: 8px; color: #333; }
@media screen { .invoice { width: 100%; } }
</style>
<div style="margin: 0; padding: 8px; color: #333;">
<p>Hello <strong>there</strong></p>
</div>
<table style="width: 100%; border-collapse: collapse;">
<tr><td colspan="2" style="text-align: right;">Total</td></tr>
</table>
<img src="cid:logo" width="100" alt="Logo" style="display: block;">
Visit https://example.com
"""
        cleaned = _clean_email_html(source_html)

        # Checked in the same order as the elements appear in the input.
        expected_fragments = (
            "<style>.invoice{margin: 0;padding: 8px;color: #333;}",
            "@media screen{.invoice{width: 100%;}}</style>",
            'style="margin: 0;padding: 8px;color: #333;"',
            "<p>Hello <strong>there</strong></p>",
            'style="width: 100%;border-collapse: collapse;"',
            '<td colspan="2" style="text-align: right;">Total</td>',
            'style="display: block;"',
            '<img src="cid:logo" width="100" alt="Logo"',
            '<a href="https://example.com"',
        )
        for fragment in expected_fragments:
            assert fragment in cleaned

    def test_email_html_removes_executable_content(self) -> None:
        """
        Event handlers, <script>, javascript: links, remote url() styles,
        @import rules and positioning are absent from the output, while the
        visible text and the stripped-down tags remain.
        """
        source_html = """
<div onclick="alert('x')">Message</div>
<script>alert('script')</script>
<style>
@import url("https://example.com/x.css");
body { color: url("https://example.com/x"); position: fixed; }
@media screen { body { background-image: url("https://example.com/x"); } }
</style>
<a href="javascript:alert('x')">bad link</a>
<a href="cid:logo">bad cid link</a>
<img src="https://example.com/logo.png" onerror="alert('x')" alt="Logo"
style="background-image: url('https://example.com/logo.png'); position: fixed;">
"""
        cleaned = _clean_email_html(source_html)

        # The harmless text content must survive the cleaning pass.
        assert "Message" in cleaned

        forbidden_fragments = (
            "script",
            "background",
            "onclick",
            "onerror",
            "javascript:",
            "background-image",
            "position",
            "@import",
        )
        for fragment in forbidden_fragments:
            assert fragment not in cleaned

        # Tags are kept but lose their unsafe attributes.
        surviving_fragments = (
            "<a>bad link</a>",
            "<a>bad cid link</a>",
            '<img alt="Logo" style="">',
        )
        for fragment in surviving_fragments:
            assert fragment in cleaned
class TestEmailFileParsing: class TestEmailFileParsing:
""" """
Tests around reading a file and parsing it into a Tests around reading a file and parsing it into a
Generated
+2 -19
View File
@@ -313,11 +313,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" }, { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
] ]
[package.optional-dependencies]
css = [
{ name = "tinycss2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
[[package]] [[package]]
name = "brotli" name = "brotli"
version = "1.2.0" version = "1.2.0"
@@ -2893,7 +2888,7 @@ source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", extra = ["css"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3043,7 +3038,7 @@ typing = [
requires-dist = [ requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" }, { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "babel", specifier = ">=2.17" }, { name = "babel", specifier = ">=2.17" },
{ name = "bleach", extras = ["css"], specifier = "~=6.3.0" }, { name = "bleach", specifier = "~=6.3.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.6.2" }, { name = "celery", extras = ["redis"], specifier = "~=5.6.2" },
{ name = "channels", specifier = "~=4.2" }, { name = "channels", specifier = "~=4.2" },
{ name = "channels-redis", specifier = "~=4.2" }, { name = "channels-redis", specifier = "~=4.2" },
@@ -4897,18 +4892,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" }, { url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" },
] ]
[[package]]
name = "tinycss2"
version = "1.4.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "webencodings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7a/fd/7a5ee21fd08ff70d3d33a5781c255cbe779659bd03278feb98b19ee550f4/tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7", size = 87085, upload-time = "2024-10-24T14:58:29.895Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289", size = 26610, upload-time = "2024-10-24T14:58:28.029Z" },
]
[[package]] [[package]]
name = "tinytag" name = "tinytag"
version = "2.2.1" version = "2.2.1"