Compare commits

...

3 Commits

Author SHA1 Message Date
shamoon b2e4cbd980 fix css sanitizer stuff 2026-05-27 13:42:40 -07:00
shamoon 7632b49e90 Add css sanitizer 2026-05-27 11:26:46 -07:00
shamoon 1a5c370ed5 Fix sanitize and linkify email HTML 2026-05-27 09:03:24 -07:00
4 changed files with 345 additions and 32 deletions
+1 -1
View File
@@ -16,7 +16,7 @@ classifiers = [
dependencies = [ dependencies = [
"azure-ai-documentintelligence>=1.0.2", "azure-ai-documentintelligence>=1.0.2",
"babel>=2.17", "babel>=2.17",
"bleach~=6.3.0", "bleach[css]~=6.3.0",
"celery[redis]~=5.6.2", "celery[redis]~=5.6.2",
"channels~=4.2", "channels~=4.2",
"channels-redis~=4.2", "channels-redis~=4.2",
+254 -29
View File
@@ -24,6 +24,7 @@ from typing import Self
from bleach import clean from bleach import clean
from bleach import linkify from bleach import linkify
from bleach.css_sanitizer import CSSSanitizer
from django.conf import settings from django.conf import settings
from django.utils import timezone from django.utils import timezone
from django.utils.timezone import is_naive from django.utils.timezone import is_naive
@@ -38,6 +39,10 @@ from humanize import naturalsize
from imap_tools import MailAttachment from imap_tools import MailAttachment
from imap_tools import MailMessage from imap_tools import MailMessage
from tika_client import TikaClient from tika_client import TikaClient
from tinycss2 import parse_declaration_list
from tinycss2 import parse_rule_list
from tinycss2 import parse_stylesheet
from tinycss2 import serialize
from documents.parsers import ParseError from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf from documents.parsers import make_thumbnail_from_pdf
@@ -58,6 +63,238 @@ _SUPPORTED_MIME_TYPES: dict[str, str] = {
"message/rfc822": ".eml", "message/rfc822": ".eml",
} }
_EMAIL_HTML_TAGS = {
"a",
"abbr",
"acronym",
"address",
"b",
"blockquote",
"br",
"caption",
"code",
"dd",
"del",
"div",
"dl",
"dt",
"em",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"hr",
"i",
"img",
"li",
"ol",
"p",
"pre",
"s",
"small",
"span",
"style",
"strong",
"sub",
"sup",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"tr",
"u",
"ul",
}
_EMAIL_HTML_PROTOCOLS = {"cid", "http", "https", "mailto"}
_EMAIL_HTML_GLOBAL_ATTRIBUTES = {
"abbr",
"align",
"alt",
"height",
"style",
"title",
"width",
}
_EMAIL_HTML_TAG_ATTRIBUTES = {
"a": {"href", "name", "title"},
"img": {"alt", "height", "src", "title", "width"},
"ol": {"start", "type"},
"td": {"colspan", "headers", "rowspan", "scope"},
"th": {"colspan", "headers", "rowspan", "scope"},
"ul": {"type"},
}
_EMAIL_CSS_PROPERTIES = {
"background-color",
"border",
"border-bottom",
"border-collapse",
"border-color",
"border-left",
"border-right",
"border-spacing",
"border-style",
"border-top",
"border-width",
"color",
"display",
"font",
"font-family",
"font-size",
"font-style",
"font-weight",
"height",
"line-height",
"margin",
"margin-bottom",
"margin-left",
"margin-right",
"margin-top",
"max-width",
"min-width",
"padding",
"padding-bottom",
"padding-left",
"padding-right",
"padding-top",
"text-align",
"text-decoration",
"vertical-align",
"white-space",
"width",
}
def _has_unsafe_css_value(tokens: list) -> bool:
for token in tokens:
if token.type == "url":
return True
if token.type == "function" and token.lower_name in {"expression", "url"}:
return True
if hasattr(token, "content") and _has_unsafe_css_value(token.content):
return True
return False
class EmailCSSSanitizer(CSSSanitizer):
def sanitize_css(self, style: str) -> str:
declarations = parse_declaration_list(
style,
skip_comments=True,
skip_whitespace=True,
)
sanitized = [
declaration
for declaration in declarations
if declaration.type == "declaration"
and declaration.lower_name in self.allowed_css_properties
and not _has_unsafe_css_value(declaration.value)
]
return serialize(sanitized).strip()
_EMAIL_CSS_SANITIZER = EmailCSSSanitizer(
allowed_css_properties=_EMAIL_CSS_PROPERTIES,
)
def _linkify_text_as_html(text: object) -> str:
"""Escape plain text and linkify URLs/email addresses for safe HTML output."""
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if not isinstance(text, str):
text = str(text)
text = escape(text)
text = linkify(text, parse_email=True)
return text.replace("\n", "<br>")
def _allow_email_html_attribute(tag: str, name: str, value: str) -> bool:
if name not in _EMAIL_HTML_GLOBAL_ATTRIBUTES | _EMAIL_HTML_TAG_ATTRIBUTES.get(
tag,
set(),
):
return False
if tag == "img" and name == "src":
return value.lower().startswith("cid:")
if tag == "a" and name == "href":
return value.lower().startswith(("http://", "https://", "mailto:"))
return True
def _sanitize_email_css_rules(rules: list) -> str:
sanitized_rules = []
for rule in rules:
if rule.type == "qualified-rule":
selector = serialize(rule.prelude).strip()
declarations = _EMAIL_CSS_SANITIZER.sanitize_css(
serialize(rule.content),
)
if selector and declarations:
sanitized_rules.append(f"{selector}{{{declarations}}}")
elif (
rule.type == "at-rule" and rule.lower_at_keyword == "media" and rule.content
):
media_query = serialize(rule.prelude).strip()
nested_rules = _sanitize_email_css_rules(
parse_rule_list(
rule.content,
skip_comments=True,
skip_whitespace=True,
),
)
if media_query and nested_rules:
sanitized_rules.append(f"@media {media_query}{{{nested_rules}}}")
return "".join(sanitized_rules)
def _sanitize_email_css_stylesheet(css: str) -> str:
return _sanitize_email_css_rules(
parse_stylesheet(css, skip_comments=True, skip_whitespace=True),
)
def _clean_email_html(text: str) -> str:
"""Sanitize email HTML before rendering it with Chromium."""
sanitized_style_blocks = []
def sanitize_style_block(match: re.Match[str]) -> str:
sanitized_style_blocks.append(
f"<style>{_sanitize_email_css_stylesheet(match.group(1))}</style>",
)
return f"__PAPERLESS_SANITIZED_STYLE_{len(sanitized_style_blocks) - 1}__"
text = re.sub(r"(?is)<script\b[^>]*>.*?</script\s*>", "", text)
text = re.sub(
r"(?is)<style\b[^>]*>(.*?)</style\s*>",
sanitize_style_block,
text,
)
text = re.sub(r"(?is)</?(script|style)\b[^>]*>", "", text)
for index, style_block in enumerate(sanitized_style_blocks):
text = text.replace(f"__PAPERLESS_SANITIZED_STYLE_{index}__", style_block)
return linkify(
clean(
text,
tags=_EMAIL_HTML_TAGS,
attributes=_allow_email_html_attribute,
protocols=_EMAIL_HTML_PROTOCOLS,
css_sanitizer=_EMAIL_CSS_SANITIZER,
strip=True,
strip_comments=True,
),
parse_email=True,
)
class MailDocumentParser: class MailDocumentParser:
"""Parse .eml email files for Paperless-ngx. """Parse .eml email files for Paperless-ngx.
@@ -619,33 +856,29 @@ class MailDocumentParser:
Path to the rendered HTML file inside the temporary directory. Path to the rendered HTML file inside the temporary directory.
""" """
def clean_html(text: str) -> str:
"""Attempt to clean, escape, and linkify the given HTML string."""
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if not isinstance(text, str):
text = str(text)
text = escape(text)
text = clean(text)
text = linkify(text, parse_email=True)
text = text.replace("\n", "<br>")
return text
data = {} data = {}
data["subject"] = clean_html(mail.subject) data["subject"] = _linkify_text_as_html(mail.subject)
if data["subject"]: if data["subject"]:
data["subject_label"] = "Subject" data["subject_label"] = "Subject"
data["from"] = clean_html(mail.from_values.full if mail.from_values else "") data["from"] = _linkify_text_as_html(
mail.from_values.full if mail.from_values else "",
)
if data["from"]: if data["from"]:
data["from_label"] = "From" data["from_label"] = "From"
data["to"] = clean_html(", ".join(address.full for address in mail.to_values)) data["to"] = _linkify_text_as_html(
", ".join(address.full for address in mail.to_values),
)
if data["to"]: if data["to"]:
data["to_label"] = "To" data["to_label"] = "To"
data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values)) data["cc"] = _linkify_text_as_html(
", ".join(address.full for address in mail.cc_values),
)
if data["cc"]: if data["cc"]:
data["cc_label"] = "CC" data["cc_label"] = "CC"
data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values)) data["bcc"] = _linkify_text_as_html(
", ".join(address.full for address in mail.bcc_values),
)
if data["bcc"]: if data["bcc"]:
data["bcc_label"] = "BCC" data["bcc_label"] = "BCC"
@@ -654,14 +887,14 @@ class MailDocumentParser:
att.append( att.append(
f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})", f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
) )
data["attachments"] = clean_html(", ".join(att)) data["attachments"] = _linkify_text_as_html(", ".join(att))
if data["attachments"]: if data["attachments"]:
data["attachments_label"] = "Attachments" data["attachments_label"] = "Attachments"
data["date"] = clean_html( data["date"] = _linkify_text_as_html(
timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"), timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
) )
data["content"] = clean_html(mail.text.strip()) data["content"] = _linkify_text_as_html(mail.text.strip())
from django.template.loader import render_to_string from django.template.loader import render_to_string
@@ -761,19 +994,11 @@ class MailDocumentParser:
If Gotenberg returns an error. If Gotenberg returns an error.
""" """
def clean_html_script(text: str) -> str:
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
text = compiled_open.sub("<div hidden ", text)
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
text = compiled_close.sub("</div", text)
return text
logger.info("Converting message html to PDF") logger.info("Converting message html to PDF")
tempdir = Path(self._tempdir) tempdir = Path(self._tempdir)
html_clean = clean_html_script(orig_html) html_clean = _clean_email_html(orig_html)
html_clean_file = tempdir / "index.html" html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean) html_clean_file.write_text(html_clean)
@@ -15,6 +15,8 @@ from documents.parsers import ParseError
from paperless.parsers import ParserContext from paperless.parsers import ParserContext
from paperless.parsers import ParserProtocol from paperless.parsers import ParserProtocol
from paperless.parsers.mail import MailDocumentParser from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.mail import _clean_email_html
from paperless.parsers.mail import _linkify_text_as_html
class TestMailParserProtocol: class TestMailParserProtocol:
@@ -72,6 +74,75 @@ class TestMailParserProtocol:
assert count > 0 assert count > 0
class TestMailHtmlCleaning:
def test_text_fields_are_escaped_before_linkifying(self) -> None:
result = _linkify_text_as_html(
"Hello <b>bold</b>\nhttps://example.com?a=1&b=2",
)
assert "&lt;b&gt;bold&lt;/b&gt;" in result
assert "<br>" in result
assert '<a href="https://example.com?a=1&amp;b=2"' in result
def test_email_html_preserves_safe_structure(self) -> None:
result = _clean_email_html(
"""
<style>
.invoice { margin: 0; padding: 8px; color: #333; }
@media screen { .invoice { width: 100%; } }
</style>
<div style="margin: 0; padding: 8px; color: #333;">
<p>Hello <strong>there</strong></p>
</div>
<table style="width: 100%; border-collapse: collapse;">
<tr><td colspan="2" style="text-align: right;">Total</td></tr>
</table>
<img src="cid:logo" width="100" alt="Logo" style="display: block;">
Visit https://example.com
""",
)
assert "<style>.invoice{margin: 0;padding: 8px;color: #333;}" in result
assert "@media screen{.invoice{width: 100%;}}</style>" in result
assert 'style="margin: 0;padding: 8px;color: #333;"' in result
assert "<p>Hello <strong>there</strong></p>" in result
assert 'style="width: 100%;border-collapse: collapse;"' in result
assert '<td colspan="2" style="text-align: right;">Total</td>' in result
assert 'style="display: block;"' in result
assert '<img src="cid:logo" width="100" alt="Logo"' in result
assert '<a href="https://example.com"' in result
def test_email_html_removes_executable_content(self) -> None:
result = _clean_email_html(
"""
<div onclick="alert('x')">Message</div>
<script>alert('script')</script>
<style>
@import url("https://example.com/x.css");
body { color: url("https://example.com/x"); position: fixed; }
@media screen { body { background-image: url("https://example.com/x"); } }
</style>
<a href="javascript:alert('x')">bad link</a>
<a href="cid:logo">bad cid link</a>
<img src="https://example.com/logo.png" onerror="alert('x')" alt="Logo"
style="background-image: url('https://example.com/logo.png'); position: fixed;">
""",
)
assert "Message" in result
assert "script" not in result
assert "background" not in result
assert "onclick" not in result
assert "onerror" not in result
assert "javascript:" not in result
assert "background-image" not in result
assert "position" not in result
assert "@import" not in result
assert "<a>bad link</a>" in result
assert "<a>bad cid link</a>" in result
assert '<img alt="Logo" style="">' in result
class TestEmailFileParsing: class TestEmailFileParsing:
""" """
Tests around reading a file and parsing it into a Tests around reading a file and parsing it into a
Generated
+19 -2
View File
@@ -313,6 +313,11 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" }, { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
] ]
[package.optional-dependencies]
css = [
{ name = "tinycss2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
[[package]] [[package]]
name = "brotli" name = "brotli"
version = "1.2.0" version = "1.2.0"
@@ -2888,7 +2893,7 @@ source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "bleach", extra = ["css"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "channels-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3038,7 +3043,7 @@ typing = [
requires-dist = [ requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" }, { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "babel", specifier = ">=2.17" }, { name = "babel", specifier = ">=2.17" },
{ name = "bleach", specifier = "~=6.3.0" }, { name = "bleach", extras = ["css"], specifier = "~=6.3.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.6.2" }, { name = "celery", extras = ["redis"], specifier = "~=5.6.2" },
{ name = "channels", specifier = "~=4.2" }, { name = "channels", specifier = "~=4.2" },
{ name = "channels-redis", specifier = "~=4.2" }, { name = "channels-redis", specifier = "~=4.2" },
@@ -4892,6 +4897,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" }, { url = "https://files.pythonhosted.org/packages/40/d0/ad3feb0a392ef4e0c08bc32024950373ddc0669002cbdcbb9f3bf0c2d114/time_machine-3.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:528217cad85ede5f85c8bc78b0341868d3c3cfefc6ecb5b622e1cacb6c73247b", size = 39837, upload-time = "2025-12-17T23:32:58.283Z" },
] ]
[[package]]
name = "tinycss2"
version = "1.4.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "webencodings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7a/fd/7a5ee21fd08ff70d3d33a5781c255cbe779659bd03278feb98b19ee550f4/tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7", size = 87085, upload-time = "2024-10-24T14:58:29.895Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289", size = 26610, upload-time = "2024-10-24T14:58:28.029Z" },
]
[[package]] [[package]]
name = "tinytag" name = "tinytag"
version = "2.2.1" version = "2.2.1"