Fix: Apply unicode normalization to all paths and path components (#12993)

This commit is contained in:
Trenton H
2026-06-13 05:45:54 -07:00
committed by GitHub
parent 92c016ce47
commit 8ed4bf2011
6 changed files with 499 additions and 20 deletions
+24 -15
View File
@@ -1,6 +1,7 @@
import logging
import os
import re
import unicodedata
from collections.abc import Iterable
from pathlib import PurePath
@@ -36,10 +37,12 @@ class FilePathTemplate(Template):
def clean_filepath(value: str) -> str:
"""
Clean up a filepath by:
1. Removing newlines and carriage returns
2. Removing extra spaces before and after forward slashes
3. Preserving spaces in other parts of the path
1. Normalizing Unicode to NFC form to prevent byte-level mismatches
2. Removing newlines and carriage returns
3. Removing extra spaces before and after forward slashes
4. Preserving spaces in other parts of the path
"""
value = unicodedata.normalize("NFC", value)
value = value.replace("\n", "").replace("\r", "")
value = re.sub(r"\s*/\s*", "/", value)
@@ -181,17 +184,17 @@ def get_basic_metadata_context(
"""
return {
"title": pathvalidate.sanitize_filename(
document.title,
unicodedata.normalize("NFC", document.title),
replacement_text="-",
),
"correspondent": pathvalidate.sanitize_filename(
document.correspondent.name,
unicodedata.normalize("NFC", document.correspondent.name),
replacement_text="-",
)
if document.correspondent
else no_value_default,
"document_type": pathvalidate.sanitize_filename(
document.document_type.name,
unicodedata.normalize("NFC", document.document_type.name),
replacement_text="-",
)
if document.document_type
@@ -202,7 +205,10 @@ def get_basic_metadata_context(
"owner_username": document.owner.username
if document.owner
else no_value_default,
"original_name": PurePath(document.original_filename).with_suffix("").name
"original_name": unicodedata.normalize(
"NFC",
PurePath(document.original_filename).with_suffix("").name,
)
if document.original_filename
else no_value_default,
"doc_pk": f"{document.pk:07}",
@@ -269,12 +275,12 @@ def get_tags_context(tags: Iterable[Tag]) -> dict[str, str | list[str]]:
return {
"tag_list": pathvalidate.sanitize_filename(
",".join(
sorted(tag.name for tag in tags),
sorted(unicodedata.normalize("NFC", tag.name) for tag in tags),
),
replacement_text="-",
),
# Assumed to be ordered, but a template could loop through to find what they want
"tag_name_list": [x.name for x in tags],
"tag_name_list": [unicodedata.normalize("NFC", x.name) for x in tags],
}
@@ -301,7 +307,7 @@ def get_custom_fields_context(
CustomField.FieldDataType.LONG_TEXT,
}:
value = pathvalidate.sanitize_filename(
field_instance.value,
unicodedata.normalize("NFC", field_instance.value),
replacement_text="-",
)
elif (
@@ -310,10 +316,13 @@ def get_custom_fields_context(
):
options = field_instance.field.extra_data["select_options"]
value = pathvalidate.sanitize_filename(
next(
option["label"]
for option in options
if option["id"] == field_instance.value
unicodedata.normalize(
"NFC",
next(
option["label"]
for option in options
if option["id"] == field_instance.value
),
),
replacement_text="-",
)
@@ -321,7 +330,7 @@ def get_custom_fields_context(
value = field_instance.value
field_data["custom_fields"][
pathvalidate.sanitize_filename(
field_instance.field.name,
unicodedata.normalize("NFC", field_instance.field.name),
replacement_text="-",
)
] = {
@@ -0,0 +1,95 @@
import unicodedata
from typing import TYPE_CHECKING
from unittest import mock
import celery.result
import pytest
from django.core.files.uploadedfile import SimpleUploadedFile
if TYPE_CHECKING:
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
@pytest.fixture()
def consume_file_mock():
with mock.patch("documents.tasks.consume_file.apply_async") as m:
m.return_value = celery.result.AsyncResult(id="test-task-id")
yield m
@pytest.fixture()
def directories(tmp_path, settings, _media_settings):
scratch = tmp_path / "scratch"
scratch.mkdir()
settings.SCRATCH_DIR = scratch
return scratch
@pytest.mark.django_db
class TestPostDocumentNFCNormalization:
def test_nfd_filename_normalized_to_nfc(
self,
admin_client,
consume_file_mock: mock.MagicMock,
directories,
):
"""Uploaded file with NFD filename must have its name stored as NFC."""
nfd = unicodedata.normalize("NFD", "Rechnung März.pdf")
nfc = unicodedata.normalize("NFC", "Rechnung März.pdf")
# Verify our test strings actually differ at the byte level
assert nfd != nfc
uploaded = SimpleUploadedFile(
nfd,
b"%PDF-1.4 test",
content_type="application/pdf",
)
response = admin_client.post(
"/api/documents/post_document/",
{"document": uploaded},
)
assert response.status_code == 200
task_kwargs = consume_file_mock.call_args.kwargs["kwargs"]
input_doc: ConsumableDocument = task_kwargs["input_doc"]
overrides: DocumentMetadataOverrides = task_kwargs["overrides"]
# The temp file on disk must have an NFC name
assert input_doc.original_file.name == nfc, (
f"Expected NFC filename {nfc!r}, got {input_doc.original_file.name!r}"
)
# The override filename stored for later use must also be NFC
assert overrides.filename == nfc, (
f"Expected NFC override filename {nfc!r}, got {overrides.filename!r}"
)
assert unicodedata.is_normalized("NFC", overrides.filename)
def test_already_nfc_filename_unchanged(
self,
admin_client,
consume_file_mock: mock.MagicMock,
directories,
):
"""Uploaded file with already-NFC filename must pass through unchanged."""
nfc = unicodedata.normalize("NFC", "Invoice_2024.pdf")
uploaded = SimpleUploadedFile(
nfc,
b"%PDF-1.4 test",
content_type="application/pdf",
)
response = admin_client.post(
"/api/documents/post_document/",
{"document": uploaded},
)
assert response.status_code == 200
task_kwargs = consume_file_mock.call_args.kwargs["kwargs"]
overrides: DocumentMetadataOverrides = task_kwargs["overrides"]
assert overrides.filename == nfc
assert unicodedata.is_normalized("NFC", overrides.filename)
+187
View File
@@ -0,0 +1,187 @@
"""
Tests for NFC Unicode normalization in generate_filename / FilePathTemplate.render().
NFC `ü` (UTF-8: c3 bc) and NFD `ü` (UTF-8: 75 cc 88) are visually identical but
produce different byte sequences. On Linux (ext4, ZFS) these are distinct filenames.
All paths produced by the templating system must be NFC-normalized.
"""
import unicodedata
import pytest
from documents.file_handling import generate_filename
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.tests.factories import CorrespondentFactory
from documents.tests.factories import DocumentFactory
from documents.tests.factories import StoragePathFactory
from documents.tests.factories import TagFactory
@pytest.mark.django_db
class TestGenerateFilenameNFCNormalization:
@pytest.mark.parametrize(
"raw,display",
[
(unicodedata.normalize("NFD", "Gemüse"), "Gemüse"),
(unicodedata.normalize("NFD", "Café"), "Café"),
(unicodedata.normalize("NFD", "naïve"), "naïve"),
],
)
def test_nfd_title_normalized_to_nfc(self, settings, raw, display):
"""NFD title must produce NFC path bytes."""
settings.FILENAME_FORMAT = "{{ title }}"
nfc = unicodedata.normalize("NFC", display)
assert raw != nfc # confirm byte-level difference
doc = DocumentFactory(title=raw, mime_type="application/pdf")
result = generate_filename(doc)
assert str(result) == f"{nfc}.pdf"
assert str(result).encode() == f"{nfc}.pdf".encode()
def test_nfd_correspondent_normalized_to_nfc(self, settings):
"""NFD correspondent name must produce NFC path component."""
settings.FILENAME_FORMAT = "{{ correspondent }}/{{ title }}"
nfd = unicodedata.normalize("NFD", "Müller")
nfc = unicodedata.normalize("NFC", "Müller")
correspondent = CorrespondentFactory(name=nfd)
doc = DocumentFactory(
title="invoice",
correspondent=correspondent,
mime_type="application/pdf",
)
result = generate_filename(doc)
assert str(result) == f"{nfc}/invoice.pdf"
assert str(result).encode() == f"{nfc}/invoice.pdf".encode()
def test_nfd_storage_path_normalized_to_nfc(self, settings):
"""NFD literal in StoragePath.path template must produce NFC path bytes."""
settings.FILENAME_FORMAT = None
nfd = unicodedata.normalize("NFD", "Büro")
nfc = unicodedata.normalize("NFC", "Büro")
# StoragePath.path is used directly as the format/template string.
# Literal NFD characters in the template must survive rendering as NFC.
sp = StoragePathFactory(path=f"{nfd}/{{{{ title }}}}")
doc = DocumentFactory(title="doc", storage_path=sp, mime_type="application/pdf")
result = generate_filename(doc)
assert str(result).encode() == f"{nfc}/doc.pdf".encode()
def test_nfd_raw_document_title_normalized_to_nfc(self, settings):
"""NFD title accessed via document.title (unsanitized context) must also be NFC."""
settings.FILENAME_FORMAT = "{{ document.title }}"
nfd = unicodedata.normalize("NFD", "Café")
nfc = unicodedata.normalize("NFC", "Café")
doc = DocumentFactory(title=nfd, mime_type="application/pdf")
result = generate_filename(doc)
assert str(result) == f"{nfc}.pdf"
assert str(result).encode() == f"{nfc}.pdf".encode()
@pytest.mark.django_db
class TestContextBuilderNFCNormalization:
"""
Defense-in-depth: context builder functions must NFC-normalize string inputs
before passing them to sanitize_filename(). Task 1 already normalizes the
final rendered path via clean_filepath(), so these tests may already pass;
they exist as regression guards for the context-builder layer.
"""
def test_nfd_tag_name_normalized_in_tag_list(self, settings):
"""NFD tag name must appear as NFC bytes in the {{ tag_list }} shorthand."""
settings.FILENAME_FORMAT = "{{ tag_list }}/{{ title }}"
nfd = unicodedata.normalize("NFD", "Büro")
nfc = unicodedata.normalize("NFC", "Büro")
assert nfd != nfc # confirm they differ at byte level
tag = TagFactory(name=nfd)
doc = DocumentFactory(title="doc", mime_type="application/pdf")
doc.tags.set([tag])
result = generate_filename(doc)
assert str(result).encode() == f"{nfc}/doc.pdf".encode()
def test_nfd_original_name_normalized_to_nfc(self, settings):
settings.FILENAME_FORMAT = "{{ original_name }}"
nfd = unicodedata.normalize("NFD", "Rechnung März")
nfc = unicodedata.normalize("NFC", "Rechnung März")
doc = DocumentFactory(
original_filename=f"{nfd}.pdf",
mime_type="application/pdf",
)
result = generate_filename(doc)
assert str(result).encode() == f"{nfc}.pdf".encode()
def test_nfd_custom_field_string_value_normalized(self, settings):
"""NFD value in a STRING-type custom field must appear as NFC in the context."""
settings.FILENAME_FORMAT = (
"{{ custom_fields['Location']['value'] }}/{{ title }}"
)
nfd_value = unicodedata.normalize("NFD", "Düsseldorf")
nfc_value = unicodedata.normalize("NFC", "Düsseldorf")
assert nfd_value != nfc_value
doc = DocumentFactory(title="report", mime_type="application/pdf")
cf = CustomField.objects.create(
name="Location",
data_type=CustomField.FieldDataType.STRING,
)
CustomFieldInstance.objects.create(
document=doc,
field=cf,
value_text=nfd_value,
)
result = generate_filename(doc)
assert str(result).encode() == f"{nfc_value}/report.pdf".encode()
def test_nfd_custom_field_name_normalized_as_key(self, settings):
"""NFD characters in a custom field name must appear as NFC in the context dict key."""
nfd_name = unicodedata.normalize("NFD", "Größe")
nfc_name = unicodedata.normalize("NFC", "Größe")
assert nfd_name != nfc_name
settings.FILENAME_FORMAT = f"{{% if custom_fields['{nfc_name}'] %}}{{{{ custom_fields['{nfc_name}']['value'] }}}}/{{{{ title }}}}{{% else %}}{{{{ title }}}}{{% endif %}}"
doc = DocumentFactory(title="letter", mime_type="application/pdf")
cf = CustomField.objects.create(
name=nfd_name,
data_type=CustomField.FieldDataType.STRING,
)
CustomFieldInstance.objects.create(
document=doc,
field=cf,
value_text="Berlin",
)
result = generate_filename(doc)
# If field name key is NFC-normalized, the template condition succeeds
# and result is "Berlin/letter.pdf"; otherwise it falls back to "letter.pdf"
assert str(result) == "Berlin/letter.pdf"
def test_nfd_tag_name_list_normalized_to_nfc(self, settings):
"""NFD tag names in tag_name_list must appear as NFC bytes when iterated."""
settings.FILENAME_FORMAT = (
"{% for t in tag_name_list %}{{ t }}{% endfor %}/{{ title }}"
)
nfd = unicodedata.normalize("NFD", "Büro")
nfc = unicodedata.normalize("NFC", "Büro")
assert nfd != nfc # confirm byte-level difference
doc = DocumentFactory(title="doc", mime_type="application/pdf")
doc.tags.add(TagFactory(name=nfd))
result = generate_filename(doc)
assert str(result).encode() == f"{nfc}/doc.pdf".encode()
+1
View File
@@ -3126,6 +3126,7 @@ class PostDocumentView(GenericAPIView[Any]):
serializer.is_valid(raise_exception=True)
doc_name, doc_data = serializer.validated_data.get("document")
doc_name = normalize("NFC", doc_name)
correspondent_id = serializer.validated_data.get("correspondent")
document_type_id = serializer.validated_data.get("document_type")
storage_path_id = serializer.validated_data.get("storage_path")
+10 -5
View File
@@ -4,6 +4,7 @@ import logging
import ssl
import tempfile
import traceback
import unicodedata
from datetime import date
from datetime import timedelta
from fnmatch import fnmatch
@@ -496,10 +497,10 @@ class MailAccountHandler(LoggingMixin):
rule: MailRule,
) -> str | None:
if rule.assign_title_from == MailRule.TitleSource.FROM_SUBJECT:
return message.subject
return unicodedata.normalize("NFC", message.subject)
elif rule.assign_title_from == MailRule.TitleSource.FROM_FILENAME:
return Path(att.filename).stem
return unicodedata.normalize("NFC", Path(att.filename).stem)
elif rule.assign_title_from == MailRule.TitleSource.NONE:
return None
@@ -866,7 +867,9 @@ class MailAccountHandler(LoggingMixin):
),
)
attachment_name = pathvalidate.sanitize_filename(att.filename)
attachment_name = pathvalidate.sanitize_filename(
unicodedata.normalize("NFC", att.filename),
)
if attachment_name:
temp_filename = temp_dir / attachment_name
else: # pragma: no cover
@@ -882,7 +885,7 @@ class MailAccountHandler(LoggingMixin):
)
doc_overrides = DocumentMetadataOverrides(
title=title,
filename=pathvalidate.sanitize_filename(att.filename),
filename=attachment_name,
correspondent_id=correspondent.id if correspondent else None,
document_type_id=doc_type.id if doc_type else None,
tag_ids=tag_ids,
@@ -988,7 +991,9 @@ class MailAccountHandler(LoggingMixin):
)
doc_overrides = DocumentMetadataOverrides(
title=message.subject,
filename=pathvalidate.sanitize_filename(f"{message.subject}.eml"),
filename=pathvalidate.sanitize_filename(
unicodedata.normalize("NFC", f"{message.subject}.eml"),
),
correspondent_id=correspondent.id if correspondent else None,
document_type_id=doc_type.id if doc_type else None,
tag_ids=tag_ids,
+182
View File
@@ -0,0 +1,182 @@
"""
Tests that mail attachment filenames and EML subject filenames are
normalized to NFC Unicode before being stored as document overrides.
Filenames from MIME headers can arrive in NFD form (e.g. from macOS Mail),
and must be normalized to NFC so filenames are consistent regardless of the
sending client.
"""
import unicodedata
from pathlib import Path
from unittest import mock
import pytest
from documents.tests.utils import remove_dirs
from documents.tests.utils import setup_directories
from paperless_mail.models import MailRule
from paperless_mail.tests.factories import MailAccountFactory
from paperless_mail.tests.test_mail import MessageBuilder
from paperless_mail.tests.test_mail import _AttachmentDef
from paperless_mail.tests.test_mail import fake_magic_from_buffer
@pytest.fixture()
def directories(settings):
dirs = setup_directories()
yield dirs
remove_dirs(dirs)
@pytest.fixture()
def queue_consumption_tasks_mock():
with mock.patch("paperless_mail.mail.queue_consumption_tasks") as m:
yield m
@pytest.fixture()
def mail_account(db):
return MailAccountFactory()
@pytest.fixture()
def attachment_rule(mail_account):
rule = MailRule(
name="attachment rule",
account=mail_account,
assign_title_from=MailRule.TitleSource.FROM_FILENAME,
consumption_scope=MailRule.ConsumptionScope.ATTACHMENTS_ONLY,
attachment_type=MailRule.AttachmentProcessing.ATTACHMENTS_ONLY,
)
rule.save()
return rule
@pytest.fixture()
def eml_rule(mail_account):
rule = MailRule(
name="eml rule",
account=mail_account,
assign_title_from=MailRule.TitleSource.FROM_SUBJECT,
consumption_scope=MailRule.ConsumptionScope.EML_ONLY,
attachment_type=MailRule.AttachmentProcessing.ATTACHMENTS_ONLY,
)
rule.save()
return rule
@pytest.fixture()
def message_builder():
return MessageBuilder()
@pytest.mark.django_db
@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer)
class TestMailNFCNormalization:
"""Attachment filenames and EML subject filenames must be NFC-normalized."""
def test_attachment_nfd_filename_normalized_to_nfc(
self,
directories,
queue_consumption_tasks_mock,
attachment_rule,
mail_account_handler,
message_builder,
):
"""Attachment filename arriving as NFD must be stored as NFC in both
the overrides and the temp file written to disk.
"""
nfd_filename = unicodedata.normalize("NFD", "Rechnung März.pdf")
nfc_filename = unicodedata.normalize("NFC", "Rechnung März.pdf")
# Confirm the fixture is actually NFD (not already NFC)
assert unicodedata.is_normalized("NFD", nfd_filename)
assert not unicodedata.is_normalized("NFC", nfd_filename)
message = message_builder.create_message(
subject="Test invoice",
from_="sender@example.com",
attachments=[
_AttachmentDef(filename=nfd_filename, content=b"%PDF-1.4 test"),
],
)
result = mail_account_handler._handle_message(message, attachment_rule)
assert result == 1
queue_consumption_tasks_mock.assert_called_once()
call_kwargs = queue_consumption_tasks_mock.call_args.kwargs
consume_tasks = call_kwargs["consume_tasks"]
assert len(consume_tasks) == 1
overrides = consume_tasks[0].kwargs["overrides"]
assert overrides.filename == nfc_filename
assert unicodedata.is_normalized("NFC", overrides.filename)
assert unicodedata.is_normalized("NFC", overrides.title)
input_doc = consume_tasks[0].kwargs["input_doc"]
original_file = Path(input_doc.original_file)
assert original_file.exists()
assert original_file.name == nfc_filename
def test_eml_subject_filename_nfc(
self,
directories,
queue_consumption_tasks_mock,
eml_rule,
mail_account_handler,
message_builder,
):
"""EML filename derived from subject arriving as NFD must be stored as NFC."""
nfd_subject = unicodedata.normalize("NFD", "Rechnung März 2024")
nfc_expected_filename = unicodedata.normalize("NFC", "Rechnung März 2024.eml")
# Confirm the fixture is actually NFD
assert unicodedata.is_normalized("NFD", nfd_subject)
message = message_builder.create_message(
subject=nfd_subject,
from_="sender@example.com",
attachments=0,
)
mail_account_handler._handle_message(message, eml_rule)
queue_consumption_tasks_mock.assert_called_once()
call_kwargs = queue_consumption_tasks_mock.call_args.kwargs
consume_tasks = call_kwargs["consume_tasks"]
assert len(consume_tasks) == 1
overrides = consume_tasks[0].kwargs["overrides"]
assert overrides.filename == nfc_expected_filename
assert unicodedata.is_normalized("NFC", overrides.filename)
def test_already_nfc_attachment_filename_unchanged(
self,
directories,
queue_consumption_tasks_mock,
attachment_rule,
mail_account_handler,
message_builder,
):
"""An attachment filename already in NFC must pass through unchanged."""
nfc_filename = "Invoice_2024.pdf"
assert unicodedata.is_normalized("NFC", nfc_filename)
message = message_builder.create_message(
subject="Invoice",
from_="sender@example.com",
attachments=[
_AttachmentDef(filename=nfc_filename, content=b"%PDF-1.4 test"),
],
)
mail_account_handler._handle_message(message, attachment_rule)
call_kwargs = queue_consumption_tasks_mock.call_args.kwargs
consume_tasks = call_kwargs["consume_tasks"]
overrides = consume_tasks[0].kwargs["overrides"]
assert overrides.filename == nfc_filename