Fix: sanitize and linkify email HTML

2026-08-01 16:42:18 +00:00 · 2026-05-27 09:03:24 -07:00
parent 4ce5f2022c
commit 1a5c370ed5
2 changed files with 174 additions and 29 deletions
@@ -58,6 +58,113 @@ _SUPPORTED_MIME_TYPES: dict[str, str] = {
    "message/rfc822": ".eml",
 }

+_EMAIL_HTML_TAGS = {  # tag allow-list handed to clean() via tags=
+    "a",
+    "abbr",
+    "acronym",
+    "address",
+    "b",
+    "blockquote",
+    "br",
+    "caption",
+    "code",
+    "dd",
+    "del",
+    "div",
+    "dl",
+    "dt",
+    "em",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "hr",
+    "i",
+    "img",
+    "li",
+    "ol",
+    "p",
+    "pre",
+    "s",
+    "small",
+    "span",
+    "strong",
+    "sub",
+    "sup",
+    "table",
+    "tbody",
+    "td",
+    "tfoot",
+    "th",
+    "thead",
+    "tr",
+    "u",
+    "ul",
+}
+_EMAIL_HTML_PROTOCOLS = {"cid", "http", "https", "mailto"}  # protocols= for clean()
+_EMAIL_HTML_GLOBAL_ATTRIBUTES = {  # attribute names accepted on any allowed tag
+    "abbr",
+    "align",
+    "alt",
+    "height",
+    "title",
+    "width",
+}
+_EMAIL_HTML_TAG_ATTRIBUTES = {  # extra attribute names accepted per tag
+    "a": {"href", "name", "title"},
+    "img": {"alt", "height", "src", "title", "width"},
+    "ol": {"start", "type"},
+    "td": {"colspan", "headers", "rowspan", "scope"},
+    "th": {"colspan", "headers", "rowspan", "scope"},
+    "ul": {"type"},
+}
+
+
+def _linkify_text_as_html(text: object) -> str:
+    """Escape plain text, linkify URLs/emails, and convert newlines to <br>."""
+    if isinstance(text, list):
+        text = "\n".join([str(e) for e in text])  # one list element per line
+    if not isinstance(text, str):
+        text = str(text)
+    text = escape(text)  # escape first so markup in the input stays literal
+    text = linkify(text, parse_email=True)
+    return text.replace("\n", "<br>")
+
+
+def _allow_email_html_attribute(tag: str, name: str, value: str) -> bool:
+    if name not in _EMAIL_HTML_GLOBAL_ATTRIBUTES | _EMAIL_HTML_TAG_ATTRIBUTES.get(
+        tag,
+        set(),
+    ):
+        return False  # in neither the global nor the per-tag allow-list
+
+    if tag == "img" and name == "src":  # images: only cid: sources are kept
+        return value.lower().startswith("cid:")
+
+    if tag == "a" and name == "href":  # links: http(s)/mailto only, never cid:
+        return value.lower().startswith(("http://", "https://", "mailto:"))
+
+    return True
+
+
+def _clean_email_html(text: str) -> str:
+    """Sanitize and linkify email HTML before rendering it with Chromium."""
+    text = re.sub(r"(?is)<(script|style)\b[^>]*>.*?</\1\s*>", "", text)
+    text = re.sub(r"(?is)</?(script|style)\b[^>]*>", "", text)  # leftover lone tags
+    return linkify(
+        clean(
+            text,
+            tags=_EMAIL_HTML_TAGS,
+            attributes=_allow_email_html_attribute,
+            protocols=_EMAIL_HTML_PROTOCOLS,
+            strip=True,
+            strip_comments=True,
+        ),
+        parse_email=True,  # also turn bare email addresses into links
+    )
+

 class MailDocumentParser:
    """Parse .eml email files for Paperless-ngx.
@@ -619,33 +726,29 @@ class MailDocumentParser:
            Path to the rendered HTML file inside the temporary directory.
        """

-        def clean_html(text: str) -> str:
-            """Attempt to clean, escape, and linkify the given HTML string."""
-            if isinstance(text, list):
-                text = "\n".join([str(e) for e in text])
-            if not isinstance(text, str):
-                text = str(text)
-            text = escape(text)
-            text = clean(text)
-            text = linkify(text, parse_email=True)
-            text = text.replace("\n", "<br>")
-            return text
-
        data = {}

-        data["subject"] = clean_html(mail.subject)
+        data["subject"] = _linkify_text_as_html(mail.subject)
        if data["subject"]:
            data["subject_label"] = "Subject"
-        data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
+        data["from"] = _linkify_text_as_html(
+            mail.from_values.full if mail.from_values else "",
+        )
        if data["from"]:
            data["from_label"] = "From"
-        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
+        data["to"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.to_values),
+        )
        if data["to"]:
            data["to_label"] = "To"
-        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
+        data["cc"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.cc_values),
+        )
        if data["cc"]:
            data["cc_label"] = "CC"
-        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
+        data["bcc"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.bcc_values),
+        )
        if data["bcc"]:
            data["bcc_label"] = "BCC"

@@ -654,14 +757,14 @@ class MailDocumentParser:
            att.append(
                f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
            )
-        data["attachments"] = clean_html(", ".join(att))
+        data["attachments"] = _linkify_text_as_html(", ".join(att))
        if data["attachments"]:
            data["attachments_label"] = "Attachments"

-        data["date"] = clean_html(
+        data["date"] = _linkify_text_as_html(
            timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
        )
-        data["content"] = clean_html(mail.text.strip())
+        data["content"] = _linkify_text_as_html(mail.text.strip())

        from django.template.loader import render_to_string

@@ -761,19 +864,11 @@ class MailDocumentParser:
            If Gotenberg returns an error.
        """

-        def clean_html_script(text: str) -> str:
-            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
-            text = compiled_open.sub("<div hidden ", text)
-
-            compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
-            text = compiled_close.sub("</div", text)
-            return text
-
        logger.info("Converting message html to PDF")

        tempdir = Path(self._tempdir)

-        html_clean = clean_html_script(orig_html)
+        html_clean = _clean_email_html(orig_html)
        html_clean_file = tempdir / "index.html"
        html_clean_file.write_text(html_clean)

@@ -15,6 +15,8 @@ from documents.parsers import ParseError
 from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.mail import MailDocumentParser
+from paperless.parsers.mail import _clean_email_html
+from paperless.parsers.mail import _linkify_text_as_html


 class TestMailParserProtocol:
@@ -72,6 +74,54 @@ class TestMailParserProtocol:
        assert count > 0


+class TestMailHtmlCleaning:  # unit tests for the module-level HTML helpers
+    def test_text_fields_are_escaped_before_linkifying(self) -> None:
+        result = _linkify_text_as_html(
+            "Hello <b>bold</b>\nhttps://example.com?a=1&b=2",
+        )
+
+        assert "&lt;b&gt;bold&lt;/b&gt;" in result  # input markup stays escaped
+        assert "<br>" in result  # newline was converted
+        assert '<a href="https://example.com?a=1&amp;b=2"' in result
+
+    def test_email_html_preserves_safe_structure(self) -> None:
+        result = _clean_email_html(
+            """
+            <div><p>Hello <strong>there</strong></p></div>
+            <table><tr><td colspan="2">Total</td></tr></table>
+            <img src="cid:logo" width="100" alt="Logo">
+            Visit https://example.com
+            """,
+        )
+
+        assert "<div><p>Hello <strong>there</strong></p></div>" in result
+        assert '<td colspan="2">Total</td>' in result
+        assert '<img src="cid:logo" width="100" alt="Logo">' in result
+        assert '<a href="https://example.com"' in result  # bare URL linkified
+
+    def test_email_html_removes_executable_content(self) -> None:
+        result = _clean_email_html(
+            """
+            <div onclick="alert('x')">Message</div>
+            <script>alert('script')</script>
+            <style>body { background: url("https://example.com/x"); }</style>
+            <a href="javascript:alert('x')">bad link</a>
+            <a href="cid:logo">bad cid link</a>
+            <img src="https://example.com/logo.png" onerror="alert('x')" alt="Logo">
+            """,
+        )
+
+        assert "Message" in result
+        assert "script" not in result  # tag and its body are both gone
+        assert "background" not in result
+        assert "onclick" not in result
+        assert "onerror" not in result
+        assert "javascript:" not in result
+        assert "<a>bad link</a>" in result  # href dropped, anchor text kept
+        assert "<a>bad cid link</a>" in result
+        assert '<img alt="Logo">' in result  # remote src and onerror dropped
+
+
 class TestEmailFileParsing:
    """
    Tests around reading a file and parsing it into a