fix css sanitizer stuff

Add css sanitizer
Fix sanitize and linkify email HTML
2026-06-29 08:44:24 +00:00 · 2026-05-27 13:42:40 -07:00 · 2026-05-27 11:26:46 -07:00 · 2026-05-27 09:03:24 -07:00
4 changed files with 345 additions and 32 deletions
@@ -16,7 +16,7 @@ classifiers = [
 dependencies = [
  "azure-ai-documentintelligence>=1.0.2",
  "babel>=2.17",
-  "bleach~=6.3.0",
+  "bleach[css]~=6.3.0",
  "celery[redis]~=5.6.2",
  "channels~=4.2",
  "channels-redis~=4.2",
@@ -24,6 +24,7 @@ from typing import Self
 from bleach import clean
 from bleach import linkify
 from bleach.css_sanitizer import CSSSanitizer
 from django.conf import settings
 from django.utils import timezone
 from django.utils.timezone import is_naive
@@ -38,6 +39,10 @@ from humanize import naturalsize
 from imap_tools import MailAttachment
 from imap_tools import MailMessage
 from tika_client import TikaClient
 from tinycss2 import parse_declaration_list
 from tinycss2 import parse_rule_list
 from tinycss2 import parse_stylesheet
 from tinycss2 import serialize
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
@@ -58,6 +63,238 @@ _SUPPORTED_MIME_TYPES: dict[str, str] = {
    "message/rfc822": ".eml",
 }
 # HTML tags that survive sanitization: this set is passed as ``tags=`` to
 # bleach ``clean`` in ``_clean_email_html``. ``style`` is included because
 # ``_clean_email_html`` sanitizes the contents of ``<style>`` blocks itself
 # before handing the markup to ``clean``; ``script`` is deliberately absent.
 _EMAIL_HTML_TAGS = {
    "a",
    "abbr",
    "acronym",
    "address",
    "b",
    "blockquote",
    "br",
    "caption",
    "code",
    "dd",
    "del",
    "div",
    "dl",
    "dt",
    "em",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "hr",
    "i",
    "img",
    "li",
    "ol",
    "p",
    "pre",
    "s",
    "small",
    "span",
    "style",
    "strong",
    "sub",
    "sup",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "tr",
    "u",
    "ul",
 }
 # URL schemes passed as ``protocols=`` to bleach ``clean`` in
 # ``_clean_email_html``. ``_allow_email_html_attribute`` narrows this further
 # for ``img src`` (``cid:`` only) and ``a href`` (http, https, mailto only).
 _EMAIL_HTML_PROTOCOLS = {"cid", "http", "https", "mailto"}
 # Attribute names that ``_allow_email_html_attribute`` accepts on every
 # allowed tag, in addition to the per-tag names in
 # ``_EMAIL_HTML_TAG_ATTRIBUTES``.
 _EMAIL_HTML_GLOBAL_ATTRIBUTES = {
    "abbr",
    "align",
    "alt",
    "height",
    "style",
    "title",
    "width",
 }
 # Extra attribute names accepted only on the given tag (tag name -> set of
 # attribute names). Looked up by ``_allow_email_html_attribute``, which also
 # checks the value of ``img src`` and ``a href`` before accepting them.
 _EMAIL_HTML_TAG_ATTRIBUTES = {
    "a": {"href", "name", "title"},
    "img": {"alt", "height", "src", "title", "width"},
    "ol": {"start", "type"},
    "td": {"colspan", "headers", "rowspan", "scope"},
    "th": {"colspan", "headers", "rowspan", "scope"},
    "ul": {"type"},
 }
 # CSS property names kept by the email CSS sanitizer. The set is handed to
 # ``EmailCSSSanitizer`` as ``allowed_css_properties``; declarations whose
 # lower-cased property name is not listed here are dropped.
 _EMAIL_CSS_PROPERTIES = {
    "background-color",
    "border",
    "border-bottom",
    "border-collapse",
    "border-color",
    "border-left",
    "border-right",
    "border-spacing",
    "border-style",
    "border-top",
    "border-width",
    "color",
    "display",
    "font",
    "font-family",
    "font-size",
    "font-style",
    "font-weight",
    "height",
    "line-height",
    "margin",
    "margin-bottom",
    "margin-left",
    "margin-right",
    "margin-top",
    "max-width",
    "min-width",
    "padding",
    "padding-bottom",
    "padding-left",
    "padding-right",
    "padding-top",
    "text-align",
    "text-decoration",
    "vertical-align",
    "white-space",
    "width",
 }
 def _has_unsafe_css_value(tokens: list) -> bool:
    """Report whether a CSS value references a URL or calls ``expression()``.

    Args:
        tokens: Component values of a declaration (tinycss2 nodes, or any
            objects exposing ``type`` plus, where relevant, ``lower_name``,
            ``content`` and ``arguments``).

    Returns:
        True if any token, at any nesting depth, is a ``url`` token or a
        ``url()`` / ``expression()`` function; False otherwise.
    """
    for token in tokens:
        if token.type == "url":
            return True
        if token.type == "function" and token.lower_name in {"expression", "url"}:
            return True
        # tinycss2 keeps the children of (), [] and {} blocks in ``content`` but
        # the children of a function block in ``arguments``. Walk both so a URL
        # nested inside another function, e.g. ``image-set(url(...))``, is
        # caught. A missing or empty child list is simply skipped.
        for attribute in ("content", "arguments"):
            children = getattr(token, attribute, None)
            if children and _has_unsafe_css_value(children):
                return True
    return False
 class EmailCSSSanitizer(CSSSanitizer):
    """CSS sanitizer that filters declarations by property name and by value.

    ``sanitize_css`` keeps a declaration only when its lower-cased property
    name is in ``allowed_css_properties`` and ``_has_unsafe_css_value``
    does not flag its value.
    """

    def sanitize_css(self, style: str) -> str:
        """Return ``style`` with disallowed or unsafe declarations removed.

        Args:
            style: A CSS declaration list, such as the text of a ``style``
                attribute or the body of a rule.

        Returns:
            The surviving declarations serialized back to text and stripped
            of surrounding whitespace.
        """
        kept = []
        for node in parse_declaration_list(
            style,
            skip_comments=True,
            skip_whitespace=True,
        ):
            # Anything that is not a plain declaration is dropped.
            if node.type != "declaration":
                continue
            if node.lower_name not in self.allowed_css_properties:
                continue
            if _has_unsafe_css_value(node.value):
                continue
            kept.append(node)
        return serialize(kept).strip()
 # Shared sanitizer instance restricted to the email CSS property allow-list.
 _EMAIL_CSS_SANITIZER = EmailCSSSanitizer(allowed_css_properties=_EMAIL_CSS_PROPERTIES)
 def _linkify_text_as_html(text: object) -> str:
    """Turn plain text into HTML: escape it, linkify it, convert newlines.

    Args:
        text: The value to render. A list is joined with newlines after
            converting each element with ``str``; any other non-string value
            is converted with ``str``.

    Returns:
        The escaped text with URLs and email addresses linkified and every
        newline replaced by ``<br>``.
    """
    if isinstance(text, list):
        text = "\n".join(str(item) for item in text)
    elif not isinstance(text, str):
        text = str(text)
    # Escape first so markup in the source text cannot reach the output.
    linked = linkify(escape(text), parse_email=True)
    return linked.replace("\n", "<br>")
 def _allow_email_html_attribute(tag: str, name: str, value: str) -> bool:
    """Decide whether attribute ``name`` with ``value`` may stay on ``tag``.

    Args:
        tag: Name of the element carrying the attribute.
        name: Attribute name.
        value: Attribute value.

    Returns:
        False when ``name`` is neither a global nor a tag-specific allowed
        attribute. For ``img src`` True only if the value starts with
        ``cid:``; for ``a href`` True only if it starts with ``http://``,
        ``https://`` or ``mailto:`` (both compared case-insensitively).
        True for every other allowed attribute.
    """
    tag_specific = _EMAIL_HTML_TAG_ATTRIBUTES.get(tag, set())
    if name not in _EMAIL_HTML_GLOBAL_ATTRIBUTES and name not in tag_specific:
        return False
    if (tag, name) == ("img", "src"):
        return value.lower().startswith("cid:")
    if (tag, name) == ("a", "href"):
        return value.lower().startswith(("http://", "https://", "mailto:"))
    return True
 def _sanitize_email_css_rules(rules: list) -> str:
    """Serialize the safe subset of a list of parsed CSS rules.

    Qualified rules are kept when both their selector and their sanitized
    declarations are non-empty. ``@media`` rules with content are processed
    recursively and kept when both the media query and the nested output are
    non-empty. Every other rule is dropped.

    Args:
        rules: Parsed rule nodes, as produced by tinycss2.

    Returns:
        The surviving rules concatenated into one string.
    """
    chunks = []
    for rule in rules:
        if rule.type == "qualified-rule":
            selector = serialize(rule.prelude).strip()
            declarations = _EMAIL_CSS_SANITIZER.sanitize_css(serialize(rule.content))
            if selector and declarations:
                chunks.append(f"{selector}{{{declarations}}}")
            continue

        is_media_block = (
            rule.type == "at-rule" and rule.lower_at_keyword == "media" and rule.content
        )
        if not is_media_block:
            continue

        media_query = serialize(rule.prelude).strip()
        nested_rules = _sanitize_email_css_rules(
            parse_rule_list(rule.content, skip_comments=True, skip_whitespace=True),
        )
        if media_query and nested_rules:
            chunks.append(f"@media {media_query}{{{nested_rules}}}")
    return "".join(chunks)
 def _sanitize_email_css_stylesheet(css: str) -> str:
    """Parse ``css`` as a stylesheet and return its sanitized serialization."""
    top_level_rules = parse_stylesheet(css, skip_comments=True, skip_whitespace=True)
    return _sanitize_email_css_rules(top_level_rules)
 def _clean_email_html(text: str) -> str:
    """Sanitize email HTML before rendering it with Chromium.

    Steps, in order:

    1. Remove complete ``<script>...</script>`` elements.
    2. Replace each complete ``<style>...</style>`` element with a placeholder
       and keep a sanitized copy of its CSS aside.
    3. Remove any remaining stray ``<script>`` / ``<style>`` open or close tags.
    4. Put the sanitized style blocks back in place of their placeholders.
    5. Run bleach ``clean`` with the email allow-lists, then ``linkify``.

    Args:
        text: The raw HTML body of the email.

    Returns:
        The sanitized, linkified HTML.
    """
    import secrets

    # A random per-call token keeps placeholders unguessable, so neither the
    # message text nor CSS inside a style block can contain a string that
    # collides with one and gets a style block substituted into it.
    nonce = secrets.token_hex(8)
    sanitized_style_blocks = []

    def placeholder(index: int) -> str:
        return f"__PAPERLESS_SANITIZED_STYLE_{nonce}_{index}__"

    def sanitize_style_block(match: re.Match[str]) -> str:
        sanitized_style_blocks.append(
            f"<style>{_sanitize_email_css_stylesheet(match.group(1))}</style>",
        )
        return placeholder(len(sanitized_style_blocks) - 1)

    text = re.sub(r"(?is)<script\b[^>]*>.*?</script\s*>", "", text)
    text = re.sub(
        r"(?is)<style\b[^>]*>(.*?)</style\s*>",
        sanitize_style_block,
        text,
    )
    # Placeholders contain no angle brackets, so this pass only removes
    # unbalanced script/style tags and leaves the placeholders untouched.
    text = re.sub(r"(?is)</?(script|style)\b[^>]*>", "", text)
    for index, style_block in enumerate(sanitized_style_blocks):
        text = text.replace(placeholder(index), style_block)
    return linkify(
        clean(
            text,
            tags=_EMAIL_HTML_TAGS,
            attributes=_allow_email_html_attribute,
            protocols=_EMAIL_HTML_PROTOCOLS,
            css_sanitizer=_EMAIL_CSS_SANITIZER,
            strip=True,
            strip_comments=True,
        ),
        # Do not linkify inside <style>: domain-like CSS text (for example a
        # selector such as ``td.name``) would otherwise be wrapped in <a> tags
        # and corrupt the sanitized stylesheet.
        skip_tags={"style"},
        parse_email=True,
    )
 class MailDocumentParser:
    """Parse .eml email files for Paperless-ngx.
@@ -619,33 +856,29 @@ class MailDocumentParser:
            Path to the rendered HTML file inside the temporary directory.
        """
        def clean_html(text: str) -> str:
            """Escape, clean, and linkify ``text``, turning newlines into ``<br>``.

            A list is joined with newlines after converting each element with
            ``str``; any other non-string value is converted with ``str``.
            """
            if isinstance(text, list):
                text = "\n".join(str(item) for item in text)
            elif not isinstance(text, str):
                text = str(text)
            linked = linkify(clean(escape(text)), parse_email=True)
            return linked.replace("\n", "<br>")
        data = {}
-        data["subject"] = clean_html(mail.subject)
+        data["subject"] = _linkify_text_as_html(mail.subject)
        if data["subject"]:
            data["subject_label"] = "Subject"
-        data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
+        data["from"] = _linkify_text_as_html(
            mail.from_values.full if mail.from_values else "",
        )
        if data["from"]:
            data["from_label"] = "From"
-        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
+        data["to"] = _linkify_text_as_html(
            ", ".join(address.full for address in mail.to_values),
        )
        if data["to"]:
            data["to_label"] = "To"
-        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
+        data["cc"] = _linkify_text_as_html(
            ", ".join(address.full for address in mail.cc_values),
        )
        if data["cc"]:
            data["cc_label"] = "CC"
-        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
+        data["bcc"] = _linkify_text_as_html(
            ", ".join(address.full for address in mail.bcc_values),
        )
        if data["bcc"]:
            data["bcc_label"] = "BCC"
@@ -654,14 +887,14 @@ class MailDocumentParser:
            att.append(
                f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
            )
-        data["attachments"] = clean_html(", ".join(att))
+        data["attachments"] = _linkify_text_as_html(", ".join(att))
        if data["attachments"]:
            data["attachments_label"] = "Attachments"
-        data["date"] = clean_html(
+        data["date"] = _linkify_text_as_html(
            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
        )
-        data["content"] = clean_html(mail.text.strip())
+        data["content"] = _linkify_text_as_html(mail.text.strip())
        from django.template.loader import render_to_string
@@ -761,19 +994,11 @@ class MailDocumentParser:
            If Gotenberg returns an error.
        """
        def clean_html_script(text: str) -> str:
            """Rewrite script tag openers/closers as hidden ``div`` tags.

            Every case-insensitive occurrence of ``<script`` becomes
            ``<div hidden `` and every ``</script`` becomes ``</div``.
            """
            for needle, substitute in (
                ("<script", "<div hidden "),
                ("</script", "</div"),
            ):
                text = re.sub(re.escape(needle), substitute, text, flags=re.IGNORECASE)
            return text
        logger.info("Converting message html to PDF")
        tempdir = Path(self._tempdir)
-        html_clean = clean_html_script(orig_html)
+        html_clean = _clean_email_html(orig_html)
        html_clean_file = tempdir / "index.html"
        html_clean_file.write_text(html_clean)
@@ -15,6 +15,8 @@ from documents.parsers import ParseError
 from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.mail import MailDocumentParser
 from paperless.parsers.mail import _clean_email_html
 from paperless.parsers.mail import _linkify_text_as_html
 class TestMailParserProtocol:
@@ -72,6 +74,75 @@ class TestMailParserProtocol:
        assert count > 0
 class TestMailHtmlCleaning:
    """Checks for the plain-text and HTML sanitizing helpers of the mail parser."""

    def test_text_fields_are_escaped_before_linkifying(self) -> None:
        """Markup in plain text is escaped, URLs are linked, newlines become <br>."""
        rendered = _linkify_text_as_html(
            "Hello <b>bold</b>\nhttps://example.com?a=1&b=2",
        )

        expected_fragments = (
            "&lt;b&gt;bold&lt;/b&gt;",
            "<br>",
            '<a href="https://example.com?a=1&amp;b=2"',
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_email_html_preserves_safe_structure(self) -> None:
        """Allowed tags, attributes, and CSS survive sanitization."""
        cleaned = _clean_email_html(
            """
            <style>
              .invoice { margin: 0; padding: 8px; color: #333; }
              @media screen { .invoice { width: 100%; } }
            </style>
            <div style="margin: 0; padding: 8px; color: #333;">
              <p>Hello <strong>there</strong></p>
            </div>
            <table style="width: 100%; border-collapse: collapse;">
              <tr><td colspan="2" style="text-align: right;">Total</td></tr>
            </table>
            <img src="cid:logo" width="100" alt="Logo" style="display: block;">
            Visit https://example.com
            """,
        )

        expected_fragments = (
            "<style>.invoice{margin: 0;padding: 8px;color: #333;}",
            "@media screen{.invoice{width: 100%;}}</style>",
            'style="margin: 0;padding: 8px;color: #333;"',
            "<p>Hello <strong>there</strong></p>",
            'style="width: 100%;border-collapse: collapse;"',
            '<td colspan="2" style="text-align: right;">Total</td>',
            'style="display: block;"',
            '<img src="cid:logo" width="100" alt="Logo"',
            '<a href="https://example.com"',
        )
        for fragment in expected_fragments:
            assert fragment in cleaned

    def test_email_html_removes_executable_content(self) -> None:
        """Scripts, event handlers, unsafe URLs, and unsafe CSS are stripped."""
        cleaned = _clean_email_html(
            """
            <div onclick="alert('x')">Message</div>
            <script>alert('script')</script>
            <style>
              @import url("https://example.com/x.css");
              body { color: url("https://example.com/x"); position: fixed; }
              @media screen { body { background-image: url("https://example.com/x"); } }
            </style>
            <a href="javascript:alert('x')">bad link</a>
            <a href="cid:logo">bad cid link</a>
            <img src="https://example.com/logo.png" onerror="alert('x')" alt="Logo"
                 style="background-image: url('https://example.com/logo.png'); position: fixed;">
            """,
        )

        assert "Message" in cleaned

        forbidden_fragments = (
            "script",
            "background",
            "onclick",
            "onerror",
            "javascript:",
            "background-image",
            "position",
            "@import",
        )
        for fragment in forbidden_fragments:
            assert fragment not in cleaned

        # Links and images survive, but without their unsafe attributes.
        surviving_fragments = (
            "<a>bad link</a>",
            "<a>bad cid link</a>",
            '<img alt="Logo" style="">',
        )
        for fragment in surviving_fragments:
            assert fragment in cleaned
 class TestEmailFileParsing:
    """
    Tests around reading a file and parsing it into a
@@ -313,6 +313,11 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
 ]
 [package.optional-dependencies]
 css = [
    { name = "tinycss2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 ]
 [[package]]
 name = "brotli"
 version = "1.2.0"
@@ -2888,7 +2893,7 @@ source = { virtual = "." }
 dependencies = [
    { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "bleach", extra = ["css"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
    { name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3038,7 +3043,7 @@ typing = [
 requires-dist = [
    { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
    { name = "babel", specifier = ">=2.17" },
-    { name = "bleach", specifier = "~=6.3.0" },
+    { name = "bleach", extras = ["css"], specifier = "~=6.3.0" },
    { name = "celery", extras = ["redis"], specifier = "~=5.6.2" },
    { name = "channels", specifier = "~=4.2" },
    { name = "channels-redis", specifier = "~=4.2" },
@@ -4892,6 +4897,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" },
 ]
 [[package]]
 name = "tinycss2"
 version = "1.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "webencodings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7a/fd/7a5ee21fd08ff70d3d33a5781c255cbe779659bd03278feb98b19ee550f4/tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7", size = 87085, upload-time = "2024-10-24T14:58:29.895Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289", size = 26610, upload-time = "2024-10-24T14:58:28.029Z" },
 ]
 [[package]]
 name = "tinytag"
 version = "2.2.1"
Author	SHA1	Message	Date
shamoon	b2e4cbd980	fix css sanitizer stuff	2026-05-27 13:42:40 -07:00
shamoon	7632b49e90	Add css sanitizer	2026-05-27 11:26:46 -07:00
shamoon	1a5c370ed5	Fix sanitize and linkify email HTML	2026-05-27 09:03:24 -07:00