From 7632b49e9026490993b67a17be14d945459bdfba Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 27 May 2026 11:26:46 -0700 Subject: [PATCH] Add css sanitizer --- pyproject.toml | 2 +- src/paperless/parsers/mail.py | 43 +++++++++++++++++++ .../tests/parsers/test_mail_parser.py | 26 +++++++---- uv.lock | 21 ++++++++- 4 files changed, 81 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b6a11419..52e87d164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ dependencies = [ "azure-ai-documentintelligence>=1.0.2", "babel>=2.17", - "bleach~=6.3.0", + "bleach[css]~=6.3.0", "celery[redis]~=5.6.2", "channels~=4.2", "channels-redis~=4.2", diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py index 29355f3bd..88b54ef27 100644 --- a/src/paperless/parsers/mail.py +++ b/src/paperless/parsers/mail.py @@ -24,6 +24,7 @@ from typing import Self from bleach import clean from bleach import linkify +from bleach.css_sanitizer import CSSSanitizer from django.conf import settings from django.utils import timezone from django.utils.timezone import is_naive @@ -109,6 +110,7 @@ _EMAIL_HTML_GLOBAL_ATTRIBUTES = { "align", "alt", "height", + "style", "title", "width", } @@ -120,6 +122,46 @@ _EMAIL_HTML_TAG_ATTRIBUTES = { "th": {"colspan", "headers", "rowspan", "scope"}, "ul": {"type"}, } +_EMAIL_CSS_PROPERTIES = { + "background-color", + "border", + "border-bottom", + "border-collapse", + "border-color", + "border-left", + "border-right", + "border-spacing", + "border-style", + "border-top", + "border-width", + "color", + "display", + "font", + "font-family", + "font-size", + "font-style", + "font-weight", + "height", + "line-height", + "margin", + "margin-bottom", + "margin-left", + "margin-right", + "margin-top", + "max-width", + "min-width", + "padding", + "padding-bottom", + "padding-left", + "padding-right", + "padding-top", + "text-align", + "text-decoration", + "vertical-align", + "white-space", + "width", +} +_EMAIL_CSS_SANITIZER = CSSSanitizer(allowed_css_properties=_EMAIL_CSS_PROPERTIES) def _linkify_text_as_html(text: object) -> str: @@ -159,6 +201,7 @@ def _clean_email_html(text: str) -> str: tags=_EMAIL_HTML_TAGS, attributes=_allow_email_html_attribute, protocols=_EMAIL_HTML_PROTOCOLS, + css_sanitizer=_EMAIL_CSS_SANITIZER, strip=True, strip_comments=True, ), diff --git a/src/paperless/tests/parsers/test_mail_parser.py b/src/paperless/tests/parsers/test_mail_parser.py index 54444bc7a..0dedfbf02 100644 --- a/src/paperless/tests/parsers/test_mail_parser.py +++ b/src/paperless/tests/parsers/test_mail_parser.py @@ -87,16 +87,23 @@ class TestMailHtmlCleaning: def test_email_html_preserves_safe_structure(self) -> None: result = _clean_email_html( """ -
Hello there
| Total |
Hello there
+| Total |
Hello there
Hello there
" in result + assert 'style="width: 100%; border-collapse: collapse;"' in result + assert '
+
""",
)
@@ -117,9 +125,11 @@ class TestMailHtmlCleaning:
assert "onclick" not in result
assert "onerror" not in result
assert "javascript:" not in result
+ assert "background-image" not in result
+ assert "position" not in result
assert "bad link" in result
assert "bad cid link" in result
- assert '