Fix CSS sanitizer issues

Add css sanitizer
Fix sanitize and linkify email HTML
2026-06-28 16:24:19 +00:00 · 2026-05-27 13:42:40 -07:00 · 2026-05-27 11:26:46 -07:00 · 2026-05-27 09:03:24 -07:00
11 changed files with 361 additions and 413 deletions
@@ -104,6 +104,8 @@ ARG JBIG2ENC_VERSION=0.30
 # Set Python environment variables
 ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
+    # Ignore warning from Whitenoise about async iterators
+    PYTHONWARNINGS="ignore:::django.http.response:517" \
    PNGX_CONTAINERIZED=1 \
    # https://docs.astral.sh/uv/reference/settings/#link-mode
    UV_LINK_MODE=copy
@@ -237,7 +239,7 @@ RUN set -eux \
  && echo "Making fontconfig cache writable for arbitrary container UIDs" \
    && chmod 1777 /var/cache/fontconfig \
  && echo "Collecting static files" \
-    && PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input \
+    && PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input --link \
    && PAPERLESS_SECRET_KEY=build-time-dummy s6-setuidgid paperless python3 manage.py compilemessages \
    && /usr/local/bin/deduplicate.py --verbose /usr/src/paperless/static/

@@ -8,13 +8,6 @@ export GRANIAN_HOST=${GRANIAN_HOST:-${PAPERLESS_BIND_ADDR:-"::"}}
 export GRANIAN_PORT=${GRANIAN_PORT:-${PAPERLESS_PORT:-8000}}
 export GRANIAN_WORKERS=${GRANIAN_WORKERS:-${PAPERLESS_WEBSERVER_WORKERS:-1}}

-# Static file serving: Granian matches against the raw URI path (before any
-# SCRIPT_NAME stripping), so the route must include the subpath prefix.
-_static_dir="${PAPERLESS_STATICDIR:-/usr/src/paperless/static}"
-_static_route="${PAPERLESS_FORCE_SCRIPT_NAME}/static"
-export GRANIAN_STATIC_PATH_MOUNT=${GRANIAN_STATIC_PATH_MOUNT:-${_static_dir}}
-export GRANIAN_STATIC_PATH_ROUTE=${GRANIAN_STATIC_PATH_ROUTE:-${_static_route:-/static}}
-
 # Only set GRANIAN_URL_PATH_PREFIX if PAPERLESS_FORCE_SCRIPT_NAME is set
 if [[ -n "${PAPERLESS_FORCE_SCRIPT_NAME}" ]]; then
 	export GRANIAN_URL_PATH_PREFIX=${PAPERLESS_FORCE_SCRIPT_NAME}
@@ -16,7 +16,7 @@ classifiers = [
 dependencies = [
  "azure-ai-documentintelligence>=1.0.2",
  "babel>=2.17",
-  "bleach~=6.3.0",
+  "bleach[css]~=6.3.0",
  "celery[redis]~=5.6.2",
  "channels~=4.2",
  "channels-redis~=4.2",
@@ -23,10 +23,6 @@ ExecStart=/bin/sh -c '\
  [ -n "$PAPERLESS_WEBSERVER_WORKERS" ] && export GRANIAN_WORKERS=$PAPERLESS_WEBSERVER_WORKERS; \
  # URL path prefix: only set if PAPERLESS_FORCE_SCRIPT_NAME exists \
  [ -n "$PAPERLESS_FORCE_SCRIPT_NAME" ] && export GRANIAN_URL_PATH_PREFIX=$PAPERLESS_FORCE_SCRIPT_NAME; \
-  # Static file serving: Granian matches the raw URI path (before SCRIPT_NAME stripping), \
-  # so the route must include any subpath prefix. \
-  [ -z "$GRANIAN_STATIC_PATH_MOUNT" ] && export GRANIAN_STATIC_PATH_MOUNT=${PAPERLESS_STATICDIR:-/opt/paperless/static}; \
-  [ -z "$GRANIAN_STATIC_PATH_ROUTE" ] && export GRANIAN_STATIC_PATH_ROUTE="${PAPERLESS_FORCE_SCRIPT_NAME}/static"; \
  exec granian --interface asginl --ws --loop uvloop "paperless.asgi:application"'

 [Install]
@@ -1,15 +1,12 @@
 from __future__ import annotations

 import logging
-import random
 import re
 import threading
-import time
 from datetime import UTC
 from datetime import datetime
 from enum import StrEnum
 from typing import TYPE_CHECKING
-from typing import Final
 from typing import Self
 from typing import TypedDict
 from typing import TypeVar
@@ -46,11 +43,6 @@ if TYPE_CHECKING:

 logger = logging.getLogger("paperless.search")

-_LOCK_TIMEOUT_SECONDS: Final[float] = 10.0  # per-attempt acquire timeout
-_LOCK_RETRY_ATTEMPTS: Final[int] = 4  # total attempts (1 initial + 3 retries)
-_LOCK_BACKOFF_BASE: Final[float] = 1.0  # seconds
-_LOCK_BACKOFF_CAP: Final[float] = 10.0  # seconds
-
 _WORD_RE = regex.compile(r"\w+")
 _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0  # seconds; guards against ReDoS on untrusted content

@@ -191,27 +183,12 @@ class WriteBatch:
        if self._backend._path is not None:
            lock_path = self._backend._path / ".tantivy.lock"
            self._lock = filelock.FileLock(str(lock_path))
-            for attempt in range(_LOCK_RETRY_ATTEMPTS):
-                try:
-                    self._lock.acquire(timeout=self._lock_timeout)
-                    break
-                except filelock.Timeout:
-                    if attempt == _LOCK_RETRY_ATTEMPTS - 1:
-                        raise SearchIndexLockError(
-                            f"Could not acquire index lock after {_LOCK_RETRY_ATTEMPTS} "
-                            f"attempts (timeout={self._lock_timeout}s each)",
-                        )
-                    sleep_s = random.uniform(
-                        0,
-                        min(_LOCK_BACKOFF_CAP, _LOCK_BACKOFF_BASE * (2**attempt)),
-                    )
-                    logger.debug(
-                        "Index lock contention; retrying in %.2fs (attempt %d/%d)",
-                        sleep_s,
-                        attempt + 1,
-                        _LOCK_RETRY_ATTEMPTS,
-                    )
-                    time.sleep(sleep_s)
+            try:
+                self._lock.acquire(timeout=self._lock_timeout)
+            except filelock.Timeout as e:  # pragma: no cover
+                raise SearchIndexLockError(
+                    f"Could not acquire index lock within {self._lock_timeout}s",
+                ) from e

        self._raw_writer = self._backend._index.writer()
        return self
@@ -513,28 +490,13 @@ class TantivyBackend:
        Convenience method for single-document updates. For bulk operations,
        use batch_update() context manager for better performance.

-        On lock exhaustion after all retry attempts, schedules a deferred
-        index_document Celery task and returns normally. Callers will NOT
-        receive a SearchIndexLockError; the index write is deferred silently.
-
        Args:
            document: Django Document instance to index
            effective_content: Override document.content for indexing
        """
        self._ensure_open()
-        try:
-            with self.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
-                batch.add_or_update(document, effective_content)
-        except SearchIndexLockError:
-            logger.error(
-                "Search index lock exhausted for document %d after %d attempts; "
-                "scheduling deferred index write",
-                document.pk,
-                _LOCK_RETRY_ATTEMPTS,
-            )
-            from documents.tasks import index_document
-
-            index_document.apply_async(args=[document.pk], countdown=60)
+        with self.batch_update(lock_timeout=5.0) as batch:
+            batch.add_or_update(document, effective_content)

    def remove(self, doc_id: int) -> None:
        """
@@ -543,27 +505,12 @@ class TantivyBackend:
        Convenience method for single-document removal. For bulk operations,
        use batch_update() context manager for better performance.

-        On lock exhaustion after all retry attempts, schedules a deferred
-        remove_document_from_index Celery task and returns normally.
-        Callers will NOT receive a SearchIndexLockError.
-
        Args:
            doc_id: Primary key of the document to remove
        """
        self._ensure_open()
-        try:
-            with self.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
-                batch.remove(doc_id)
-        except SearchIndexLockError:
-            logger.error(
-                "Search index lock exhausted for doc_id %d after %d attempts; "
-                "scheduling deferred index removal",
-                doc_id,
-                _LOCK_RETRY_ATTEMPTS,
-            )
-            from documents.tasks import remove_document_from_index
-
-            remove_document_from_index.apply_async(args=[doc_id], countdown=60)
+        with self.batch_update(lock_timeout=5.0) as batch:
+            batch.remove(doc_id)

    def highlight_hits(
        self,
@@ -56,7 +56,6 @@ from documents.plugins.base import StopConsumeTaskError
 from documents.plugins.helpers import ProgressManager
 from documents.plugins.helpers import ProgressStatusOptions
 from documents.sanity_checker import SanityCheckFailedException
-from documents.search._backend import SearchIndexLockError
 from documents.signals import document_updated
 from documents.signals.handlers import cleanup_document_deletion
 from documents.signals.handlers import run_workflows
@@ -85,63 +84,6 @@ def index_optimize() -> None:
    )


-@shared_task(
-    bind=True,
-    ignore_result=True,
-    autoretry_for=(SearchIndexLockError,),
-    max_retries=5,
-    retry_backoff=60,
-    retry_jitter=True,
-)
-def index_document(self, document_id: int) -> None:
-    """
-    Deferred single-document index write.
-
-    Used as a self-healing fallback when add_or_update() exhausts its lock retry
-    budget during high-concurrency consumption. Runs via batch_update() directly
-    to avoid re-entering the deferred scheduling path in add_or_update().
-
-    If the document was deleted before this task runs, it exits cleanly.
-    """
-    from documents.search import get_backend
-
-    try:
-        document = Document.objects.get(pk=document_id)
-    except Document.DoesNotExist:
-        logger.info(
-            "index_document: document %d no longer exists; skipping",
-            document_id,
-        )
-        return
-    with get_backend().batch_update() as batch:
-        batch.add_or_update(
-            document,
-            effective_content=document.get_effective_content(),
-        )
-
-
-@shared_task(
-    bind=True,
-    ignore_result=True,
-    autoretry_for=(SearchIndexLockError,),
-    max_retries=5,
-    retry_backoff=60,
-    retry_jitter=True,
-)
-def remove_document_from_index(self, doc_id: int) -> None:
-    """
-    Deferred single-document index removal.
-
-    Used as a self-healing fallback when remove() exhausts its lock retry budget.
-    Operates only on the Tantivy index; no database lookup required.
-    If the document has already been removed, the term-query delete is a no-op.
-    """
-    from documents.search import get_backend
-
-    with get_backend().batch_update() as batch:
-        batch.remove(doc_id)
-
-
@shared_task
 def train_classifier(
    *,
@@ -1,248 +0,0 @@
-"""Tests for search index lock backoff, retry logic, and self-healing deferred tasks."""
-
-from __future__ import annotations
-
-import logging
-from typing import TYPE_CHECKING
-
-import filelock
-import pytest
-
-from documents.search._backend import _LOCK_BACKOFF_CAP
-from documents.search._backend import _LOCK_RETRY_ATTEMPTS
-from documents.search._backend import _LOCK_TIMEOUT_SECONDS
-from documents.search._backend import SearchIndexLockError
-from documents.search._backend import TantivyBackend
-from documents.tasks import index_document
-from documents.tasks import remove_document_from_index
-from documents.tests.factories import DocumentFactory
-
-if TYPE_CHECKING:
-    from collections.abc import Generator
-    from pathlib import Path
-
-    from pytest_mock import MockerFixture
-
-pytestmark = pytest.mark.search
-
-
-@pytest.fixture
-def disk_backend(tmp_path: Path) -> Generator[TantivyBackend, None, None]:
-    """On-disk TantivyBackend so the file-lock code path is exercised."""
-    b = TantivyBackend(path=tmp_path)
-    b.open()
-    try:
-        yield b
-    finally:
-        b.close()
-
-
-class TestWriteBatchLockRetry:
-    """Test WriteBatch retry loop with backoff + full jitter."""
-
-    @pytest.mark.django_db
-    def test_lock_retries_then_succeeds(
-        self,
-        disk_backend: TantivyBackend,
-        mocker: MockerFixture,
-    ) -> None:
-        """Timeout on first 3 attempts then success on 4th — document must be indexed."""
-        doc = DocumentFactory()
-
-        acquire_calls = 0
-
-        def flaky_acquire(timeout: float) -> None:
-            nonlocal acquire_calls
-            acquire_calls += 1
-            # Raise Timeout for first _LOCK_RETRY_ATTEMPTS - 1 calls, succeed on last
-            if acquire_calls < _LOCK_RETRY_ATTEMPTS:
-                raise filelock.Timeout("")
-
-        sleep_values: list[float] = []
-
-        mocker.patch(
-            "documents.search._backend.filelock.FileLock.acquire",
-            side_effect=flaky_acquire,
-        )
-        mock_sleep = mocker.patch(
-            "documents.search._backend.time.sleep",
-            side_effect=lambda s: sleep_values.append(s),
-        )
-
-        # Should not raise — 4th attempt succeeds
-        with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
-            batch.add_or_update(doc)
-
-        # sleep called exactly _LOCK_RETRY_ATTEMPTS - 1 times (once per failed attempt)
-        assert mock_sleep.call_count == _LOCK_RETRY_ATTEMPTS - 1
-
-        # All sleep values must be in [0, _LOCK_BACKOFF_CAP]
-        for s in sleep_values:
-            assert 0 <= s <= _LOCK_BACKOFF_CAP, (
-                f"Sleep value {s} outside [0, {_LOCK_BACKOFF_CAP}]"
-            )
-
-    def test_lock_exhaustion_raises_search_index_lock_error(
-        self,
-        disk_backend: TantivyBackend,
-        mocker: MockerFixture,
-    ) -> None:
-        """All acquire attempts raise Timeout — WriteBatch must raise SearchIndexLockError."""
-        mocker.patch(
-            "documents.search._backend.filelock.FileLock.acquire",
-            side_effect=filelock.Timeout(""),
-        )
-        mocker.patch("documents.search._backend.time.sleep")
-
-        with pytest.raises(SearchIndexLockError):
-            with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS):
-                pass
-
-    def test_jitter_values_in_range(
-        self,
-        disk_backend: TantivyBackend,
-        mocker: MockerFixture,
-    ) -> None:
-        """Sleep values must always lie in [0, _LOCK_BACKOFF_CAP] across many samples."""
-        mocker.patch(
-            "documents.search._backend.filelock.FileLock.acquire",
-            side_effect=filelock.Timeout(""),
-        )
-        sleep_values: list[float] = []
-        mocker.patch(
-            "documents.search._backend.time.sleep",
-            side_effect=lambda s: sleep_values.append(s),
-        )
-        for _ in range(50):
-            sleep_values.clear()
-            with pytest.raises(SearchIndexLockError):
-                with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS):
-                    pass
-
-            for s in sleep_values:
-                assert 0 <= s <= _LOCK_BACKOFF_CAP, (
-                    f"Jitter {s} exceeds cap {_LOCK_BACKOFF_CAP}"
-                )
-
-
-class TestAddOrUpdateDeferredScheduling:
-    """Test that add_or_update() and remove() defer to Celery on lock exhaustion."""
-
-    @pytest.mark.django_db
-    def test_lock_exhaustion_schedules_deferred_task(
-        self,
-        disk_backend: TantivyBackend,
-        mocker: MockerFixture,
-    ) -> None:
-        """Lock exhaustion in add_or_update must schedule index_document task, not raise."""
-        doc = DocumentFactory()
-
-        mocker.patch(
-            "documents.search._backend.filelock.FileLock.acquire",
-            side_effect=filelock.Timeout(""),
-        )
-        mocker.patch("documents.search._backend.time.sleep")
-        mock_apply = mocker.patch("documents.tasks.index_document.apply_async")
-
-        # Must NOT raise
-        disk_backend.add_or_update(doc)
-
-        mock_apply.assert_called_once_with(args=[doc.pk], countdown=60)
-
-    def test_remove_exhaustion_schedules_deferred_task(
-        self,
-        disk_backend: TantivyBackend,
-        mocker: MockerFixture,
-    ) -> None:
-        """Lock exhaustion in remove() must schedule remove_document_from_index task, not raise."""
-        doc_id = 503
-
-        mocker.patch(
-            "documents.search._backend.filelock.FileLock.acquire",
-            side_effect=filelock.Timeout(""),
-        )
-        mocker.patch("documents.search._backend.time.sleep")
-        mock_apply = mocker.patch(
-            "documents.tasks.remove_document_from_index.apply_async",
-        )
-
-        # Must NOT raise
-        disk_backend.remove(doc_id)
-
-        mock_apply.assert_called_once_with(args=[doc_id], countdown=60)
-
-
-@pytest.mark.django_db
-class TestIndexDocumentTask:
-    """Test the deferred index_document and remove_document_from_index Celery tasks."""
-
-    def test_index_document_task_skips_deleted_document(
-        self,
-        caplog: pytest.LogCaptureFixture,
-    ) -> None:
-        """index_document with a non-existent doc_id must return cleanly and log INFO."""
-        nonexistent_id = 999999
-
-        with caplog.at_level(logging.INFO, logger="paperless.tasks"):
-            index_document(nonexistent_id)
-
-        assert any("no longer exists" in record.message for record in caplog.records), (
-            "Expected INFO log about missing document"
-        )
-
-    def test_index_document_task_indexes_existing_document(
-        self,
-        backend: TantivyBackend,
-        mocker: MockerFixture,
-    ) -> None:
-        """index_document task must add the document to the index via batch_update."""
-        doc = DocumentFactory(content="via deferred task")
-
-        # get_backend is imported lazily inside the task: `from documents.search import get_backend`
-        mocker.patch(
-            "documents.search.get_backend",
-            return_value=backend,
-        )
-        index_document(doc.pk)
-
-        ids = backend.search_ids("deferred task", user=None)
-        assert doc.pk in ids
-
-    def test_remove_document_from_index_task_removes_existing_document(
-        self,
-        backend: TantivyBackend,
-        mocker: MockerFixture,
-    ) -> None:
-        """remove_document_from_index task must remove the document from the index."""
-        doc = DocumentFactory(content="will be removed by deferred task")
-        backend.add_or_update(doc)
-        assert doc.pk in backend.search_ids("removed", user=None)
-
-        mocker.patch("documents.search.get_backend", return_value=backend)
-        remove_document_from_index(doc.pk)
-
-        assert doc.pk not in backend.search_ids("removed", user=None)
-
-    def test_task_does_not_swallow_lock_error(
-        self,
-        mocker: MockerFixture,
-    ) -> None:
-        """Verifies the task body propagates SearchIndexLockError so Celery's
-        autoretry_for can catch it (rather than the task swallowing the error
-        and silently succeeding)."""
-        doc = DocumentFactory()
-
-        mock_batch = mocker.MagicMock()
-        mock_batch.__enter__ = mocker.MagicMock(
-            side_effect=SearchIndexLockError("exhausted"),
-        )
-        mock_batch.__exit__ = mocker.MagicMock(return_value=False)
-
-        mock_backend = mocker.MagicMock()
-        mock_backend.batch_update.return_value = mock_batch
-
-        # get_backend is imported lazily inside the task: `from documents.search import get_backend`
-        mocker.patch("documents.search.get_backend", return_value=mock_backend)
-
-        with pytest.raises(SearchIndexLockError):
-            index_document(doc.pk)
@@ -24,6 +24,7 @@ from typing import Self

 from bleach import clean
 from bleach import linkify
+from bleach.css_sanitizer import CSSSanitizer
 from django.conf import settings
 from django.utils import timezone
 from django.utils.timezone import is_naive
@@ -38,6 +39,10 @@ from humanize import naturalsize
 from imap_tools import MailAttachment
 from imap_tools import MailMessage
 from tika_client import TikaClient
+from tinycss2 import parse_declaration_list
+from tinycss2 import parse_rule_list
+from tinycss2 import parse_stylesheet
+from tinycss2 import serialize

 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
@@ -58,6 +63,238 @@ _SUPPORTED_MIME_TYPES: dict[str, str] = {
    "message/rfc822": ".eml",
 }

+# Allow-list of HTML element names passed to bleach's ``clean`` (``tags=``) by
+# ``_clean_email_html``. That call uses ``strip=True``, so elements that are
+# not listed here are not kept as markup in the output.
+_EMAIL_HTML_TAGS = {
+    "a",
+    "abbr",
+    "acronym",
+    "address",
+    "b",
+    "blockquote",
+    "br",
+    "caption",
+    "code",
+    "dd",
+    "del",
+    "div",
+    "dl",
+    "dt",
+    "em",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "hr",
+    "i",
+    "img",
+    "li",
+    "ol",
+    "p",
+    "pre",
+    "s",
+    "small",
+    "span",
+    "style",
+    "strong",
+    "sub",
+    "sup",
+    "table",
+    "tbody",
+    "td",
+    "tfoot",
+    "th",
+    "thead",
+    "tr",
+    "u",
+    "ul",
+}
+# URL schemes passed to bleach's ``clean`` as ``protocols=``.
+_EMAIL_HTML_PROTOCOLS = {"cid", "http", "https", "mailto"}
+# Attribute names that ``_allow_email_html_attribute`` accepts on every tag.
+_EMAIL_HTML_GLOBAL_ATTRIBUTES = {
+    "abbr",
+    "align",
+    "alt",
+    "height",
+    "style",
+    "title",
+    "width",
+}
+# Additional attribute names accepted only on the given tag; merged with the
+# global attribute set by ``_allow_email_html_attribute``.
+_EMAIL_HTML_TAG_ATTRIBUTES = {
+    "a": {"href", "name", "title"},
+    "img": {"alt", "height", "src", "title", "width"},
+    "ol": {"start", "type"},
+    "td": {"colspan", "headers", "rowspan", "scope"},
+    "th": {"colspan", "headers", "rowspan", "scope"},
+    "ul": {"type"},
+}
+# CSS property names the module-level ``EmailCSSSanitizer`` instance is
+# configured with (``allowed_css_properties=``); its ``sanitize_css`` drops
+# declarations for any property not listed here.
+_EMAIL_CSS_PROPERTIES = {
+    "background-color",
+    "border",
+    "border-bottom",
+    "border-collapse",
+    "border-color",
+    "border-left",
+    "border-right",
+    "border-spacing",
+    "border-style",
+    "border-top",
+    "border-width",
+    "color",
+    "display",
+    "font",
+    "font-family",
+    "font-size",
+    "font-style",
+    "font-weight",
+    "height",
+    "line-height",
+    "margin",
+    "margin-bottom",
+    "margin-left",
+    "margin-right",
+    "margin-top",
+    "max-width",
+    "min-width",
+    "padding",
+    "padding-bottom",
+    "padding-left",
+    "padding-right",
+    "padding-top",
+    "text-align",
+    "text-decoration",
+    "vertical-align",
+    "white-space",
+    "width",
+}
+
+
+def _has_unsafe_css_value(tokens: list) -> bool:
+    """Return True if any token, at any nesting depth, is a URL or blocked function.
+
+    Args:
+        tokens: Component values as produced by tinycss2 (objects exposing
+            ``type`` and, depending on the node kind, ``lower_name``,
+            ``content`` or ``arguments``).
+
+    Returns:
+        True when a ``url`` token or an ``expression()``/``url()`` function is
+        found, including inside bracketed blocks and inside the arguments of
+        other functions; False otherwise.
+    """
+    for token in tokens:
+        if token.type == "url":
+            return True
+        if token.type == "function" and token.lower_name in {"expression", "url"}:
+            return True
+        # Bracketed blocks keep their children in ``content``; function blocks
+        # keep theirs in ``arguments``. Both must be walked, otherwise a
+        # blocked function wrapped in another one, e.g.
+        # ``var(--x, url("..."))``, slips through.
+        for attribute in ("content", "arguments"):
+            children = getattr(token, attribute, None)
+            if children and _has_unsafe_css_value(children):
+                return True
+    return False
+
+
+class EmailCSSSanitizer(CSSSanitizer):
+    """CSS sanitizer that additionally drops declarations with unsafe values."""
+
+    def sanitize_css(self, style: str) -> str:
+        """Return ``style`` reduced to its permitted declarations.
+
+        A declaration survives only if its property is listed in
+        ``allowed_css_properties`` and ``_has_unsafe_css_value`` does not flag
+        its value. Anything that does not parse as a declaration is dropped.
+
+        Args:
+            style: CSS declaration list, e.g. the content of a ``style``
+                attribute or of a rule body.
+
+        Returns:
+            The surviving declarations serialized back to CSS, stripped of
+            surrounding whitespace.
+        """
+        kept = []
+        for node in parse_declaration_list(
+            style,
+            skip_comments=True,
+            skip_whitespace=True,
+        ):
+            if node.type != "declaration":
+                continue
+            if node.lower_name not in self.allowed_css_properties:
+                continue
+            if _has_unsafe_css_value(node.value):
+                continue
+            kept.append(node)
+        return serialize(kept).strip()
+
+
+# Shared sanitizer instance restricted to ``_EMAIL_CSS_PROPERTIES``. It is used
+# both for inline ``style`` attributes (handed to bleach's ``clean`` as
+# ``css_sanitizer=``) and for declarations inside ``<style>`` blocks.
+_EMAIL_CSS_SANITIZER = EmailCSSSanitizer(
+    allowed_css_properties=_EMAIL_CSS_PROPERTIES,
+)
+
+
+def _linkify_text_as_html(text: object) -> str:
+    """Turn plain text into HTML: escape it, linkify it, and mark line breaks.
+
+    Args:
+        text: The value to render. A list is joined with newlines (each item
+            converted with ``str``); any other non-string is converted with
+            ``str``.
+
+    Returns:
+        The escaped text with URLs and email addresses turned into links and
+        every newline replaced by ``<br>``.
+    """
+    if isinstance(text, list):
+        raw = "\n".join(str(item) for item in text)
+    elif isinstance(text, str):
+        raw = text
+    else:
+        raw = str(text)
+    linked = linkify(escape(raw), parse_email=True)
+    return linked.replace("\n", "<br>")
+
+
+def _allow_email_html_attribute(tag: str, name: str, value: str) -> bool:
+    """Decide whether attribute ``name`` with ``value`` may stay on ``tag``.
+
+    Used as the ``attributes=`` callable for bleach's ``clean``.
+
+    Args:
+        tag: Element name the attribute belongs to.
+        name: Attribute name.
+        value: Attribute value.
+
+    Returns:
+        False if the attribute is in neither the global nor the per-tag
+        allow-list. For ``img``/``src`` only ``cid:`` values are accepted; for
+        ``a``/``href`` only ``http://``, ``https://`` and ``mailto:`` values
+        (both compared case-insensitively). True for every other allowed
+        attribute.
+    """
+    permitted = _EMAIL_HTML_GLOBAL_ATTRIBUTES | _EMAIL_HTML_TAG_ATTRIBUTES.get(
+        tag,
+        set(),
+    )
+    if name not in permitted:
+        return False
+
+    target = (tag, name)
+    if target == ("img", "src"):
+        return value.lower().startswith("cid:")
+    if target == ("a", "href"):
+        return value.lower().startswith(("http://", "https://", "mailto:"))
+    return True
+
+
+def _sanitize_email_css_rules(rules: list) -> str:
+    """Serialize the safe subset of a parsed CSS rule list.
+
+    Qualified rules are kept when both their selector and their sanitized
+    declarations are non-empty. ``@media`` rules are processed recursively and
+    kept when the media query and the nested result are non-empty. Every other
+    node is dropped.
+
+    Args:
+        rules: Nodes as returned by tinycss2's ``parse_stylesheet`` or
+            ``parse_rule_list``.
+
+    Returns:
+        The retained rules concatenated without separators.
+    """
+    chunks = []
+
+    for node in rules:
+        if node.type == "qualified-rule":
+            selector = serialize(node.prelude).strip()
+            body = _EMAIL_CSS_SANITIZER.sanitize_css(serialize(node.content))
+            if selector and body:
+                chunks.append(f"{selector}{{{body}}}")
+            continue
+
+        if not (
+            node.type == "at-rule" and node.lower_at_keyword == "media" and node.content
+        ):
+            continue
+
+        media_query = serialize(node.prelude).strip()
+        inner = _sanitize_email_css_rules(
+            parse_rule_list(node.content, skip_comments=True, skip_whitespace=True),
+        )
+        if media_query and inner:
+            chunks.append(f"@media {media_query}{{{inner}}}")
+
+    return "".join(chunks)
+
+
+def _sanitize_email_css_stylesheet(css: str) -> str:
+    """Parse ``css`` as a stylesheet and return its sanitized serialization."""
+    parsed_rules = parse_stylesheet(css, skip_comments=True, skip_whitespace=True)
+    return _sanitize_email_css_rules(parsed_rules)
+
+
+def _clean_email_html(text: str) -> str:
+    """Sanitize email HTML before rendering it with Chromium.
+
+    ``<script>`` elements are removed together with their content. The CSS of
+    each ``<style>`` element is run through ``_sanitize_email_css_stylesheet``
+    and parked behind a placeholder while stray ``<script>``/``<style>`` tags
+    are stripped, then restored. The result is passed through bleach's
+    ``clean`` with the module's tag/attribute/protocol/CSS allow-lists and
+    finally linkified.
+
+    Args:
+        text: Raw HTML body of the email.
+
+    Returns:
+        The sanitized, linkified HTML.
+    """
+    # Function-scope import so this block does not depend on a change to the
+    # module's import section.
+    import secrets
+
+    # The placeholder carries a per-call random token so that text already
+    # present in the message can never be mistaken for a placeholder and
+    # swapped for a style block.
+    placeholder_prefix = f"__PAPERLESS_SANITIZED_STYLE_{secrets.token_hex(16)}_"
+    sanitized_style_blocks: list[str] = []
+
+    def sanitize_style_block(match: re.Match[str]) -> str:
+        sanitized_style_blocks.append(
+            f"<style>{_sanitize_email_css_stylesheet(match.group(1))}</style>",
+        )
+        return f"{placeholder_prefix}{len(sanitized_style_blocks) - 1}__"
+
+    def restore_style_block(match: re.Match[str]) -> str:
+        index = int(match.group(1))
+        if index < len(sanitized_style_blocks):
+            return sanitized_style_blocks[index]
+        # Not one of the placeholders issued above; leave the text untouched.
+        return match.group(0)
+
+    text = re.sub(r"(?is)<script\b[^>]*>.*?</script\s*>", "", text)
+    text = re.sub(
+        r"(?is)<style\b[^>]*>(.*?)</style\s*>",
+        sanitize_style_block,
+        text,
+    )
+    # Remove unbalanced leftovers; the sanitized blocks are safe behind their
+    # placeholders while this runs.
+    text = re.sub(r"(?is)</?(script|style)\b[^>]*>", "", text)
+    # Single pass: restored CSS is not rescanned, so placeholder-like text
+    # inside one style block cannot pull in another.
+    text = re.sub(
+        re.escape(placeholder_prefix) + r"([0-9]+)__",
+        restore_style_block,
+        text,
+    )
+    return linkify(
+        clean(
+            text,
+            tags=_EMAIL_HTML_TAGS,
+            attributes=_allow_email_html_attribute,
+            protocols=_EMAIL_HTML_PROTOCOLS,
+            css_sanitizer=_EMAIL_CSS_SANITIZER,
+            strip=True,
+            strip_comments=True,
+        ),
+        parse_email=True,
+    )
+

 class MailDocumentParser:
    """Parse .eml email files for Paperless-ngx.
@@ -619,33 +856,29 @@ class MailDocumentParser:
            Path to the rendered HTML file inside the temporary directory.
        """

-        def clean_html(text: str) -> str:
-            """Attempt to clean, escape, and linkify the given HTML string."""
-            if isinstance(text, list):
-                text = "\n".join([str(e) for e in text])
-            if not isinstance(text, str):
-                text = str(text)
-            text = escape(text)
-            text = clean(text)
-            text = linkify(text, parse_email=True)
-            text = text.replace("\n", "<br>")
-            return text
-
        data = {}

-        data["subject"] = clean_html(mail.subject)
+        data["subject"] = _linkify_text_as_html(mail.subject)
        if data["subject"]:
            data["subject_label"] = "Subject"
-        data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
+        data["from"] = _linkify_text_as_html(
+            mail.from_values.full if mail.from_values else "",
+        )
        if data["from"]:
            data["from_label"] = "From"
-        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
+        data["to"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.to_values),
+        )
        if data["to"]:
            data["to_label"] = "To"
-        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
+        data["cc"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.cc_values),
+        )
        if data["cc"]:
            data["cc_label"] = "CC"
-        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
+        data["bcc"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.bcc_values),
+        )
        if data["bcc"]:
            data["bcc_label"] = "BCC"

@@ -654,14 +887,14 @@ class MailDocumentParser:
            att.append(
                f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
            )
-        data["attachments"] = clean_html(", ".join(att))
+        data["attachments"] = _linkify_text_as_html(", ".join(att))
        if data["attachments"]:
            data["attachments_label"] = "Attachments"

-        data["date"] = clean_html(
+        data["date"] = _linkify_text_as_html(
            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
        )
-        data["content"] = clean_html(mail.text.strip())
+        data["content"] = _linkify_text_as_html(mail.text.strip())

        from django.template.loader import render_to_string

@@ -761,19 +994,11 @@ class MailDocumentParser:
            If Gotenberg returns an error.
        """

-        def clean_html_script(text: str) -> str:
-            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
-            text = compiled_open.sub("<div hidden ", text)
-
-            compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
-            text = compiled_close.sub("</div", text)
-            return text
-
        logger.info("Converting message html to PDF")

        tempdir = Path(self._tempdir)

-        html_clean = clean_html_script(orig_html)
+        html_clean = _clean_email_html(orig_html)
        html_clean_file = tempdir / "index.html"
        html_clean_file.write_text(html_clean)

@@ -118,6 +118,7 @@ SCRATCH_DIR = get_path_from_env(
 env_apps = get_list_from_env("PAPERLESS_APPS")

 INSTALLED_APPS = [
+    "whitenoise.runserver_nostatic",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
@@ -172,6 +173,7 @@ if DEBUG:

 MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
+    "whitenoise.middleware.WhiteNoiseMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "corsheaders.middleware.CorsMiddleware",
    "django.middleware.locale.LocaleMiddleware",
@@ -230,6 +232,7 @@ WSGI_APPLICATION = "paperless.wsgi.application"
 ASGI_APPLICATION = "paperless.asgi.application"

 STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", BASE_URL + "static/")
+WHITENOISE_STATIC_PREFIX = "/static/"

 STORAGES = {
    "staticfiles": {
@@ -15,6 +15,8 @@ from documents.parsers import ParseError
 from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.mail import MailDocumentParser
+from paperless.parsers.mail import _clean_email_html
+from paperless.parsers.mail import _linkify_text_as_html


 class TestMailParserProtocol:
@@ -72,6 +74,75 @@ class TestMailParserProtocol:
        assert count > 0


+class TestMailHtmlCleaning:
+    """
+    Exercises the module-private helpers ``_linkify_text_as_html`` and
+    ``_clean_email_html`` imported from ``paperless.parsers.mail``.
+
+    Every check is a substring assertion against the returned string, so
+    the expected values pin the exact serialization the helpers produce.
+    """
+
+    def test_text_fields_are_escaped_before_linkifying(self) -> None:
+        """
+        GIVEN:
+            - Plain text containing HTML markup, a newline and a URL whose
+              query string contains ``&``
+        WHEN:
+            - The text is passed to ``_linkify_text_as_html``
+        THEN:
+            - The markup appears entity-escaped in the result
+            - A ``<br>`` appears in the result
+            - The URL is wrapped in an anchor whose href has ``&`` escaped
+        """
+        result = _linkify_text_as_html(
+            "Hello <b>bold</b>\nhttps://example.com?a=1&b=2",
+        )
+
+        assert "&lt;b&gt;bold&lt;/b&gt;" in result
+        assert "<br>" in result
+        assert '<a href="https://example.com?a=1&amp;b=2"' in result
+
+    def test_email_html_preserves_safe_structure(self) -> None:
+        """
+        GIVEN:
+            - HTML with a ``<style>`` block (including an ``@media`` rule),
+              inline ``style`` attributes, a table, a ``cid:`` image and a
+              bare URL in text
+        WHEN:
+            - The HTML is passed to ``_clean_email_html``
+        THEN:
+            - The style rules, inline styles, table markup and image
+              attributes are all still present
+            - The bare URL is wrapped in an anchor
+        """
+        result = _clean_email_html(
+            """
+            <style>
+              .invoice { margin: 0; padding: 8px; color: #333; }
+              @media screen { .invoice { width: 100%; } }
+            </style>
+            <div style="margin: 0; padding: 8px; color: #333;">
+              <p>Hello <strong>there</strong></p>
+            </div>
+            <table style="width: 100%; border-collapse: collapse;">
+              <tr><td colspan="2" style="text-align: right;">Total</td></tr>
+            </table>
+            <img src="cid:logo" width="100" alt="Logo" style="display: block;">
+            Visit https://example.com
+            """,
+        )
+
+        # The expected CSS is compacted relative to the input: no whitespace
+        # around braces and none after each ";", while the space after ":"
+        # inside a declaration is kept.
+        assert "<style>.invoice{margin: 0;padding: 8px;color: #333;}" in result
+        assert "@media screen{.invoice{width: 100%;}}</style>" in result
+        assert 'style="margin: 0;padding: 8px;color: #333;"' in result
+        assert "<p>Hello <strong>there</strong></p>" in result
+        assert 'style="width: 100%;border-collapse: collapse;"' in result
+        assert '<td colspan="2" style="text-align: right;">Total</td>' in result
+        assert 'style="display: block;"' in result
+        # A cid: source is expected to survive on <img> here; the next test
+        # expects the same scheme to be removed from an <a href>.
+        assert '<img src="cid:logo" width="100" alt="Logo"' in result
+        assert '<a href="https://example.com"' in result
+
+    def test_email_html_removes_executable_content(self) -> None:
+        """
+        GIVEN:
+            - HTML with event-handler attributes, a ``<script>`` element,
+              CSS using ``@import``, ``url(...)`` and ``position``, links
+              with ``javascript:`` and ``cid:`` hrefs, and an image with a
+              remote ``https`` source
+        WHEN:
+            - The HTML is passed to ``_clean_email_html``
+        THEN:
+            - The visible text "Message" is kept
+            - None of the listed constructs appear in the result
+            - Both anchors are reduced to bare ``<a>`` elements
+            - The image keeps only ``alt`` and an empty ``style`` attribute
+        """
+        result = _clean_email_html(
+            """
+            <div onclick="alert('x')">Message</div>
+            <script>alert('script')</script>
+            <style>
+              @import url("https://example.com/x.css");
+              body { color: url("https://example.com/x"); position: fixed; }
+              @media screen { body { background-image: url("https://example.com/x"); } }
+            </style>
+            <a href="javascript:alert('x')">bad link</a>
+            <a href="cid:logo">bad cid link</a>
+            <img src="https://example.com/logo.png" onerror="alert('x')" alt="Logo"
+                 style="background-image: url('https://example.com/logo.png'); position: fixed;">
+            """,
+        )
+
+        assert "Message" in result
+        # "script" not appearing anywhere means the element, its text
+        # content and the "javascript:" scheme are all gone; the narrower
+        # "javascript:" check below is therefore implied by this one.
+        assert "script" not in result
+        # Likewise this substring check implies the "background-image"
+        # check further down.
+        assert "background" not in result
+        assert "onclick" not in result
+        assert "onerror" not in result
+        assert "javascript:" not in result
+        assert "background-image" not in result
+        assert "position" not in result
+        assert "@import" not in result
+        # The anchors are expected to keep their text but lose the href.
+        assert "<a>bad link</a>" in result
+        assert "<a>bad cid link</a>" in result
+        # The remote src and the onerror handler are expected to be dropped;
+        # the style attribute remains but with every declaration removed.
+        assert '<img alt="Logo" style="">' in result
+
+
 class TestEmailFileParsing:
    """
    Tests around reading a file and parsing it into a
@@ -313,6 +313,11 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
 ]

+[package.optional-dependencies]
+css = [
+    { name = "tinycss2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+]
+
 [[package]]
 name = "brotli"
 version = "1.2.0"
@@ -2888,7 +2893,7 @@ source = { virtual = "." }
 dependencies = [
    { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "bleach", extra = ["css"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3038,7 +3043,7 @@ typing = [
 requires-dist = [
    { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
    { name = "babel", specifier = ">=2.17" },
-    { name = "bleach", specifier = "~=6.3.0" },
+    { name = "bleach", extras = ["css"], specifier = "~=6.3.0" },
    { name = "celery", extras = ["redis"], specifier = "~=5.6.2" },
    { name = "channels", specifier = "~=4.2" },
    { name = "channels-redis", specifier = "~=4.2" },
@@ -4892,6 +4897,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" },
 ]

+[[package]]
+name = "tinycss2"
+version = "1.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "webencodings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7a/fd/7a5ee21fd08ff70d3d33a5781c255cbe779659bd03278feb98b19ee550f4/tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7", size = 87085, upload-time = "2024-10-24T14:58:29.895Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289", size = 26610, upload-time = "2024-10-24T14:58:28.029Z" },
+]
+
 [[package]]
 name = "tinytag"
 version = "2.2.1"
Author	SHA1	Message	Date
shamoon	b2e4cbd980	fix css sanitizer stuff	2026-05-27 13:42:40 -07:00
shamoon	7632b49e90	Add css sanitizer	2026-05-27 11:26:46 -07:00
shamoon	1a5c370ed5	Fix sanitize and linkify email HTML	2026-05-27 09:03:24 -07:00