From 7632b49e9026490993b67a17be14d945459bdfba Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 27 May 2026 11:26:46 -0700 Subject: [PATCH] Add css sanitizer --- pyproject.toml | 2 +- src/paperless/parsers/mail.py | 43 +++++++++++++++++++ .../tests/parsers/test_mail_parser.py | 26 +++++++---- uv.lock | 21 ++++++++- 4 files changed, 81 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b6a11419..52e87d164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ dependencies = [ "azure-ai-documentintelligence>=1.0.2", "babel>=2.17", - "bleach~=6.3.0", + "bleach[css]~=6.3.0", "celery[redis]~=5.6.2", "channels~=4.2", "channels-redis~=4.2", diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py index 29355f3bd..88b54ef27 100644 --- a/src/paperless/parsers/mail.py +++ b/src/paperless/parsers/mail.py @@ -24,6 +24,7 @@ from typing import Self from bleach import clean from bleach import linkify +from bleach.css_sanitizer import CSSSanitizer from django.conf import settings from django.utils import timezone from django.utils.timezone import is_naive @@ -109,6 +110,7 @@ _EMAIL_HTML_GLOBAL_ATTRIBUTES = { "align", "alt", "height", + "style", "title", "width", } @@ -120,6 +122,46 @@ _EMAIL_HTML_TAG_ATTRIBUTES = { "th": {"colspan", "headers", "rowspan", "scope"}, "ul": {"type"}, } +_EMAIL_CSS_PROPERTIES = { + "background-color", + "border", + "border-bottom", + "border-collapse", + "border-color", + "border-left", + "border-right", + "border-spacing", + "border-style", + "border-top", + "border-width", + "color", + "display", + "font", + "font-family", + "font-size", + "font-style", + "font-weight", + "height", + "line-height", + "margin", + "margin-bottom", + "margin-left", + "margin-right", + "margin-top", + "max-width", + "min-width", + "padding", + "padding-bottom", + "padding-left", + "padding-right", + "padding-top", + "text-align", + "text-decoration", + "vertical-align", + "white-space", + "width", +} +_EMAIL_CSS_SANITIZER = CSSSanitizer(allowed_css_properties=_EMAIL_CSS_PROPERTIES) def _linkify_text_as_html(text: object) -> str: @@ -159,6 +201,7 @@ def _clean_email_html(text: str) -> str: tags=_EMAIL_HTML_TAGS, attributes=_allow_email_html_attribute, protocols=_EMAIL_HTML_PROTOCOLS, + css_sanitizer=_EMAIL_CSS_SANITIZER, strip=True, strip_comments=True, ), diff --git a/src/paperless/tests/parsers/test_mail_parser.py b/src/paperless/tests/parsers/test_mail_parser.py index 54444bc7a..0dedfbf02 100644 --- a/src/paperless/tests/parsers/test_mail_parser.py +++ b/src/paperless/tests/parsers/test_mail_parser.py @@ -87,16 +87,23 @@ class TestMailHtmlCleaning: def test_email_html_preserves_safe_structure(self) -> None: result = _clean_email_html( """ -

Hello there

-
Total
- Logo +
+

Hello there

+
+ + +
Total
+ Logo Visit https://example.com """, ) - assert "

Hello there

" in result - assert 'Total' in result - assert 'Logo' in result + assert 'style="margin: 0; padding: 8px; color: #333;"' in result + assert "

Hello there

" in result + assert 'style="width: 100%; border-collapse: collapse;"' in result + assert 'Total' in result + assert 'style="display: block;"' in result + assert 'Logo None: @@ -107,7 +114,8 @@ class TestMailHtmlCleaning: bad link bad cid link - Logo + Logo """, ) @@ -117,9 +125,11 @@ class TestMailHtmlCleaning: assert "onclick" not in result assert "onerror" not in result assert "javascript:" not in result + assert "background-image" not in result + assert "position" not in result assert "bad link" in result assert "bad cid link" in result - assert 'Logo' in result + assert 'Logo' in result class TestEmailFileParsing: diff --git a/uv.lock b/uv.lock index 3591fa29c..ae52bc275 100644 --- a/uv.lock +++ b/uv.lock @@ -313,6 +313,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" }, ] +[package.optional-dependencies] +css = [ + { name = "tinycss2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, +] + [[package]] name = "brotli" version = "1.2.0" @@ -2888,7 +2893,7 @@ source = { virtual = "." } dependencies = [ { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "bleach", extra = ["css"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -3038,7 +3043,7 @@ typing = [ requires-dist = [ { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" }, { name = "babel", specifier = ">=2.17" }, - { name = "bleach", specifier = "~=6.3.0" }, + { name = "bleach", extras = ["css"], specifier = "~=6.3.0" }, { name = "celery", extras = ["redis"], specifier = "~=5.6.2" }, { name = "channels", specifier = "~=4.2" }, { name = "channels-redis", specifier = "~=4.2" }, @@ -4892,6 +4897,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" }, ] +[[package]] +name = "tinycss2" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "webencodings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/fd/7a5ee21fd08ff70d3d33a5781c255cbe779659bd03278feb98b19ee550f4/tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7", size = 87085, upload-time = "2024-10-24T14:58:29.895Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289", size = 26610, upload-time = "2024-10-24T14:58:28.029Z" }, +] + [[package]] name = "tinytag" version = "2.2.1"