From 1a5c370ed5713ec77153d91a7ae71dab7c8c2865 Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Wed, 27 May 2026 09:03:24 -0700
Subject: [PATCH] Fix: sanitize and linkify email HTML

---
 src/paperless/parsers/mail.py                 | 153 ++++++++++++++----
 .../tests/parsers/test_mail_parser.py         |  50 ++++++
 2 files changed, 174 insertions(+), 29 deletions(-)
diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py
index 8188b7933..29355f3bd 100644
--- a/src/paperless/parsers/mail.py
+++ b/src/paperless/parsers/mail.py
@@ -58,6 +58,113 @@ _SUPPORTED_MIME_TYPES: dict[str, str] = {
     "message/rfc822": ".eml",
 }
 
+_EMAIL_HTML_TAGS = {  # element allow-list passed to clean(tags=...)
+    "a",
+    "abbr",
+    "acronym",
+    "address",
+    "b",
+    "blockquote",
+    "br",
+    "caption",
+    "code",
+    "dd",
+    "del",
+    "div",
+    "dl",
+    "dt",
+    "em",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "hr",
+    "i",
+    "img",
+    "li",
+    "ol",
+    "p",
+    "pre",
+    "s",
+    "small",
+    "span",
+    "strong",
+    "sub",
+    "sup",
+    "table",
+    "tbody",
+    "td",
+    "tfoot",
+    "th",
+    "thead",
+    "tr",
+    "u",
+    "ul",
+}
+_EMAIL_HTML_PROTOCOLS = {"cid", "http", "https", "mailto"}  # clean(protocols=...)
+_EMAIL_HTML_GLOBAL_ATTRIBUTES = {  # attribute names accepted on every tag
+    "abbr",
+    "align",
+    "alt",
+    "height",
+    "title",
+    "width",
+}
+_EMAIL_HTML_TAG_ATTRIBUTES = {  # extra attribute names accepted per tag
+    "a": {"href", "name", "title"},  # href is also scheme-checked
+    "img": {"alt", "height", "src", "title", "width"},  # src must be cid:
+    "ol": {"start", "type"},
+    "td": {"colspan", "headers", "rowspan", "scope"},
+    "th": {"colspan", "headers", "rowspan", "scope"},
+    "ul": {"type"},
+}
+
+
+def _linkify_text_as_html(text: object) -> str:
+    """Escape plain text and linkify URLs/email addresses for safe HTML output."""
+    if isinstance(text, list):
+        text = "\n".join([str(e) for e in text])  # one list item per line
+    if not isinstance(text, str):
+        text = str(text)
+    text = escape(text)  # escape first so markup in the text stays literal
+    text = linkify(text, parse_email=True)
+    return text.replace("\n", "<br>")  # keep line breaks visible in HTML
+
+
+def _allow_email_html_attribute(tag: str, name: str, value: str) -> bool:
+    if name not in _EMAIL_HTML_GLOBAL_ATTRIBUTES | _EMAIL_HTML_TAG_ATTRIBUTES.get(
+        tag,
+        set(),
+    ):
+        return False  # attribute name is not allow-listed for this tag
+
+    if tag == "img" and name == "src":  # images: cid: sources only
+        return value.lower().startswith("cid:")
+
+    if tag == "a" and name == "href":  # links: http(s) or mailto only
+        return value.lower().startswith(("http://", "https://", "mailto:"))
+
+    return True  # allow-listed name with no extra value rule
+
+
+def _clean_email_html(text: str) -> str:
+    """Sanitize email HTML before rendering it with Chromium."""
+    text = re.sub(r"(?is)<(script|style)\b[^>]*>.*?</\1\s*>", "", text)  # tag and body
+    text = re.sub(r"(?is)</?(script|style)\b[^>]*>", "", text)  # leftover lone tags
+    return linkify(
+        clean(
+            text,
+            tags=_EMAIL_HTML_TAGS,
+            attributes=_allow_email_html_attribute,
+            protocols=_EMAIL_HTML_PROTOCOLS,
+            strip=True,
+            strip_comments=True,
+        ),
+        parse_email=True,
+    )
+
 
 class MailDocumentParser:
     """Parse .eml email files for Paperless-ngx.
@@ -619,33 +726,29 @@ class MailDocumentParser:
             Path to the rendered HTML file inside the temporary directory.
         """
 
-        def clean_html(text: str) -> str:
-            """Attempt to clean, escape, and linkify the given HTML string."""
-            if isinstance(text, list):
-                text = "\n".join([str(e) for e in text])
-            if not isinstance(text, str):
-                text = str(text)
-            text = escape(text)
-            text = clean(text)
-            text = linkify(text, parse_email=True)
-            text = text.replace("\n", "<br>")
-            return text
-
         data = {}
 
-        data["subject"] = clean_html(mail.subject)
+        data["subject"] = _linkify_text_as_html(mail.subject)
         if data["subject"]:
             data["subject_label"] = "Subject"
-        data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
+        data["from"] = _linkify_text_as_html(
+            mail.from_values.full if mail.from_values else "",
+        )
         if data["from"]:
             data["from_label"] = "From"
-        data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
+        data["to"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.to_values),
+        )
         if data["to"]:
             data["to_label"] = "To"
-        data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
+        data["cc"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.cc_values),
+        )
         if data["cc"]:
             data["cc_label"] = "CC"
-        data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
+        data["bcc"] = _linkify_text_as_html(
+            ", ".join(address.full for address in mail.bcc_values),
+        )
         if data["bcc"]:
             data["bcc_label"] = "BCC"
 
@@ -654,14 +757,14 @@ class MailDocumentParser:
             att.append(
                 f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
             )
-        data["attachments"] = clean_html(", ".join(att))
+        data["attachments"] = _linkify_text_as_html(", ".join(att))
         if data["attachments"]:
             data["attachments_label"] = "Attachments"
 
-        data["date"] = clean_html(
+        data["date"] = _linkify_text_as_html(
             timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
         )
-        data["content"] = clean_html(mail.text.strip())
+        data["content"] = _linkify_text_as_html(mail.text.strip())
 
         from django.template.loader import render_to_string
 
@@ -761,19 +864,11 @@ class MailDocumentParser:
             If Gotenberg returns an error.
         """
 
-        def clean_html_script(text: str) -> str:
-            compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
-            text = compiled_open.sub("<div hidden ", text)
-
-            compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
-            text = compiled_close.sub("</div", text)
-            return text
-
         logger.info("Converting message html to PDF")
 
         tempdir = Path(self._tempdir)
 
-        html_clean = clean_html_script(orig_html)
+        html_clean = _clean_email_html(orig_html)
         html_clean_file = tempdir / "index.html"
         html_clean_file.write_text(html_clean)
 
diff --git a/src/paperless/tests/parsers/test_mail_parser.py b/src/paperless/tests/parsers/test_mail_parser.py
index b875442ad..54444bc7a 100644
--- a/src/paperless/tests/parsers/test_mail_parser.py
+++ b/src/paperless/tests/parsers/test_mail_parser.py
@@ -15,6 +15,8 @@ from documents.parsers import ParseError
 from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.mail import MailDocumentParser
+from paperless.parsers.mail import _clean_email_html
+from paperless.parsers.mail import _linkify_text_as_html
 
 
 class TestMailParserProtocol:
@@ -72,6 +74,54 @@ class TestMailParserProtocol:
         assert count > 0
 
 
+class TestMailHtmlCleaning:
+    def test_text_fields_are_escaped_before_linkifying(self) -> None:
+        result = _linkify_text_as_html(
+            "Hello <b>bold</b>\nhttps://example.com?a=1&b=2",
+        )
+
+        assert "&lt;b&gt;bold&lt;/b&gt;" in result  # tags shown as text
+        assert "<br>" in result  # newline converted to <br>
+        assert '<a href="https://example.com?a=1&amp;b=2"' in result  # & encoded
+
+    def test_email_html_preserves_safe_structure(self) -> None:
+        result = _clean_email_html(
+            """
+            <div><p>Hello <strong>there</strong></p></div>
+            <table><tr><td colspan="2">Total</td></tr></table>
+            <img src="cid:logo" width="100" alt="Logo">
+            Visit https://example.com
+            """,
+        )
+
+        assert "<div><p>Hello <strong>there</strong></p></div>" in result
+        assert '<td colspan="2">Total</td>' in result
+        assert '<img src="cid:logo" width="100" alt="Logo">' in result  # cid: kept
+        assert '<a href="https://example.com"' in result  # bare URL linkified
+
+    def test_email_html_removes_executable_content(self) -> None:
+        result = _clean_email_html(
+            """
+            <div onclick="alert('x')">Message</div>
+            <script>alert('script')</script>
+            <style>body { background: url("https://example.com/x"); }</style>
+            <a href="javascript:alert('x')">bad link</a>
+            <a href="cid:logo">bad cid link</a>
+            <img src="https://example.com/logo.png" onerror="alert('x')" alt="Logo">
+            """,
+        )
+
+        assert "Message" in result
+        assert "script" not in result
+        assert "background" not in result
+        assert "onclick" not in result
+        assert "onerror" not in result
+        assert "javascript:" not in result
+        assert "<a>bad link</a>" in result  # href dropped, text kept
+        assert "<a>bad cid link</a>" in result  # cid: not allowed on links
+        assert '<img alt="Logo">' in result  # remote src and onerror removed
+
+
 class TestEmailFileParsing:
     """
     Tests around reading a file and parsing it into a