From 8ed4bf2011818bb5518ed71cc3d5d0773cce27c9 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sat, 13 Jun 2026 05:45:54 -0700 Subject: [PATCH] Fix: Apply unicode normalization to all paths and path components (#12993) --- src/documents/templating/filepath.py | 39 ++-- .../tests/test_api_post_document_nfc.py | 95 +++++++++ src/documents/tests/test_filepath_nfc.py | 187 ++++++++++++++++++ src/documents/views.py | 1 + src/paperless_mail/mail.py | 15 +- src/paperless_mail/tests/test_mail_nfc.py | 182 +++++++++++++++++ 6 files changed, 499 insertions(+), 20 deletions(-) create mode 100644 src/documents/tests/test_api_post_document_nfc.py create mode 100644 src/documents/tests/test_filepath_nfc.py create mode 100644 src/paperless_mail/tests/test_mail_nfc.py diff --git a/src/documents/templating/filepath.py b/src/documents/templating/filepath.py index 008f1d0a9..d827b7c89 100644 --- a/src/documents/templating/filepath.py +++ b/src/documents/templating/filepath.py @@ -1,6 +1,7 @@ import logging import os import re +import unicodedata from collections.abc import Iterable from pathlib import PurePath @@ -36,10 +37,12 @@ class FilePathTemplate(Template): def clean_filepath(value: str) -> str: """ Clean up a filepath by: - 1. Removing newlines and carriage returns - 2. Removing extra spaces before and after forward slashes - 3. Preserving spaces in other parts of the path + 1. Normalizing Unicode to NFC form to prevent byte-level mismatches + 2. Removing newlines and carriage returns + 3. Removing extra spaces before and after forward slashes + 4. 
Preserving spaces in other parts of the path """ + value = unicodedata.normalize("NFC", value) value = value.replace("\n", "").replace("\r", "") value = re.sub(r"\s*/\s*", "/", value) @@ -181,17 +184,17 @@ def get_basic_metadata_context( """ return { "title": pathvalidate.sanitize_filename( - document.title, + unicodedata.normalize("NFC", document.title), replacement_text="-", ), "correspondent": pathvalidate.sanitize_filename( - document.correspondent.name, + unicodedata.normalize("NFC", document.correspondent.name), replacement_text="-", ) if document.correspondent else no_value_default, "document_type": pathvalidate.sanitize_filename( - document.document_type.name, + unicodedata.normalize("NFC", document.document_type.name), replacement_text="-", ) if document.document_type @@ -202,7 +205,10 @@ def get_basic_metadata_context( "owner_username": document.owner.username if document.owner else no_value_default, - "original_name": PurePath(document.original_filename).with_suffix("").name + "original_name": unicodedata.normalize( + "NFC", + PurePath(document.original_filename).with_suffix("").name, + ) if document.original_filename else no_value_default, "doc_pk": f"{document.pk:07}", @@ -269,12 +275,12 @@ def get_tags_context(tags: Iterable[Tag]) -> dict[str, str | list[str]]: return { "tag_list": pathvalidate.sanitize_filename( ",".join( - sorted(tag.name for tag in tags), + sorted(unicodedata.normalize("NFC", tag.name) for tag in tags), ), replacement_text="-", ), # Assumed to be ordered, but a template could loop through to find what they want - "tag_name_list": [x.name for x in tags], + "tag_name_list": [unicodedata.normalize("NFC", x.name) for x in tags], } @@ -301,7 +307,7 @@ def get_custom_fields_context( CustomField.FieldDataType.LONG_TEXT, }: value = pathvalidate.sanitize_filename( - field_instance.value, + unicodedata.normalize("NFC", field_instance.value), replacement_text="-", ) elif ( @@ -310,10 +316,13 @@ def get_custom_fields_context( ): options = 
field_instance.field.extra_data["select_options"] value = pathvalidate.sanitize_filename( - next( - option["label"] - for option in options - if option["id"] == field_instance.value + unicodedata.normalize( + "NFC", + next( + option["label"] + for option in options + if option["id"] == field_instance.value + ), ), replacement_text="-", ) @@ -321,7 +330,7 @@ def get_custom_fields_context( value = field_instance.value field_data["custom_fields"][ pathvalidate.sanitize_filename( - field_instance.field.name, + unicodedata.normalize("NFC", field_instance.field.name), replacement_text="-", ) ] = { diff --git a/src/documents/tests/test_api_post_document_nfc.py b/src/documents/tests/test_api_post_document_nfc.py new file mode 100644 index 000000000..5b3088b01 --- /dev/null +++ b/src/documents/tests/test_api_post_document_nfc.py @@ -0,0 +1,95 @@ +import unicodedata +from typing import TYPE_CHECKING +from unittest import mock + +import celery.result +import pytest +from django.core.files.uploadedfile import SimpleUploadedFile + +if TYPE_CHECKING: + from documents.data_models import ConsumableDocument + from documents.data_models import DocumentMetadataOverrides + + +@pytest.fixture() +def consume_file_mock(): + with mock.patch("documents.tasks.consume_file.apply_async") as m: + m.return_value = celery.result.AsyncResult(id="test-task-id") + yield m + + +@pytest.fixture() +def directories(tmp_path, settings, _media_settings): + scratch = tmp_path / "scratch" + scratch.mkdir() + settings.SCRATCH_DIR = scratch + return scratch + + +@pytest.mark.django_db +class TestPostDocumentNFCNormalization: + def test_nfd_filename_normalized_to_nfc( + self, + admin_client, + consume_file_mock: mock.MagicMock, + directories, + ): + """Uploaded file with NFD filename must have its name stored as NFC.""" + nfd = unicodedata.normalize("NFD", "Rechnung März.pdf") + nfc = unicodedata.normalize("NFC", "Rechnung März.pdf") + + # Verify our test strings actually differ at the byte level + assert nfd 
!= nfc + + uploaded = SimpleUploadedFile( + nfd, + b"%PDF-1.4 test", + content_type="application/pdf", + ) + response = admin_client.post( + "/api/documents/post_document/", + {"document": uploaded}, + ) + + assert response.status_code == 200 + + task_kwargs = consume_file_mock.call_args.kwargs["kwargs"] + input_doc: ConsumableDocument = task_kwargs["input_doc"] + overrides: DocumentMetadataOverrides = task_kwargs["overrides"] + + # The temp file on disk must have an NFC name + assert input_doc.original_file.name == nfc, ( + f"Expected NFC filename {nfc!r}, got {input_doc.original_file.name!r}" + ) + # The override filename stored for later use must also be NFC + assert overrides.filename == nfc, ( + f"Expected NFC override filename {nfc!r}, got {overrides.filename!r}" + ) + assert unicodedata.is_normalized("NFC", overrides.filename) + + def test_already_nfc_filename_unchanged( + self, + admin_client, + consume_file_mock: mock.MagicMock, + directories, + ): + """Uploaded file with already-NFC filename must pass through unchanged.""" + nfc = unicodedata.normalize("NFC", "Invoice_2024.pdf") + + uploaded = SimpleUploadedFile( + nfc, + b"%PDF-1.4 test", + content_type="application/pdf", + ) + response = admin_client.post( + "/api/documents/post_document/", + {"document": uploaded}, + ) + + assert response.status_code == 200 + + task_kwargs = consume_file_mock.call_args.kwargs["kwargs"] + overrides: DocumentMetadataOverrides = task_kwargs["overrides"] + + assert overrides.filename == nfc + assert unicodedata.is_normalized("NFC", overrides.filename) diff --git a/src/documents/tests/test_filepath_nfc.py b/src/documents/tests/test_filepath_nfc.py new file mode 100644 index 000000000..e1d3ef9a8 --- /dev/null +++ b/src/documents/tests/test_filepath_nfc.py @@ -0,0 +1,187 @@ +""" +Tests for NFC Unicode normalization in generate_filename / FilePathTemplate.render(). 
+ +NFC `ü` (UTF-8: c3 bc) and NFD `ü` (UTF-8: 75 cc 88) are visually identical but +produce different byte sequences. On Linux (ext4, ZFS) these are distinct filenames. +All paths produced by the templating system must be NFC-normalized. +""" + +import unicodedata + +import pytest + +from documents.file_handling import generate_filename +from documents.models import CustomField +from documents.models import CustomFieldInstance +from documents.tests.factories import CorrespondentFactory +from documents.tests.factories import DocumentFactory +from documents.tests.factories import StoragePathFactory +from documents.tests.factories import TagFactory + + +@pytest.mark.django_db +class TestGenerateFilenameNFCNormalization: + @pytest.mark.parametrize( + "raw,display", + [ + (unicodedata.normalize("NFD", "Gemüse"), "Gemüse"), + (unicodedata.normalize("NFD", "Café"), "Café"), + (unicodedata.normalize("NFD", "naïve"), "naïve"), + ], + ) + def test_nfd_title_normalized_to_nfc(self, settings, raw, display): + """NFD title must produce NFC path bytes.""" + settings.FILENAME_FORMAT = "{{ title }}" + nfc = unicodedata.normalize("NFC", display) + assert raw != nfc # confirm byte-level difference + + doc = DocumentFactory(title=raw, mime_type="application/pdf") + result = generate_filename(doc) + + assert str(result) == f"{nfc}.pdf" + assert str(result).encode() == f"{nfc}.pdf".encode() + + def test_nfd_correspondent_normalized_to_nfc(self, settings): + """NFD correspondent name must produce NFC path component.""" + settings.FILENAME_FORMAT = "{{ correspondent }}/{{ title }}" + nfd = unicodedata.normalize("NFD", "Müller") + nfc = unicodedata.normalize("NFC", "Müller") + + correspondent = CorrespondentFactory(name=nfd) + doc = DocumentFactory( + title="invoice", + correspondent=correspondent, + mime_type="application/pdf", + ) + result = generate_filename(doc) + + assert str(result) == f"{nfc}/invoice.pdf" + assert str(result).encode() == f"{nfc}/invoice.pdf".encode() + + def 
test_nfd_storage_path_normalized_to_nfc(self, settings): + """NFD literal in StoragePath.path template must produce NFC path bytes.""" + settings.FILENAME_FORMAT = None + nfd = unicodedata.normalize("NFD", "Büro") + nfc = unicodedata.normalize("NFC", "Büro") + + # StoragePath.path is used directly as the format/template string. + # Literal NFD characters in the template must survive rendering as NFC. + sp = StoragePathFactory(path=f"{nfd}/{{{{ title }}}}") + doc = DocumentFactory(title="doc", storage_path=sp, mime_type="application/pdf") + result = generate_filename(doc) + + assert str(result).encode() == f"{nfc}/doc.pdf".encode() + + def test_nfd_raw_document_title_normalized_to_nfc(self, settings): + """NFD title accessed via document.title (unsanitized context) must also be NFC.""" + settings.FILENAME_FORMAT = "{{ document.title }}" + nfd = unicodedata.normalize("NFD", "Café") + nfc = unicodedata.normalize("NFC", "Café") + + doc = DocumentFactory(title=nfd, mime_type="application/pdf") + result = generate_filename(doc) + + assert str(result) == f"{nfc}.pdf" + assert str(result).encode() == f"{nfc}.pdf".encode() + + +@pytest.mark.django_db +class TestContextBuilderNFCNormalization: + """ + Defense-in-depth: context builder functions must NFC-normalize string inputs + before passing them to sanitize_filename(). clean_filepath() already normalizes the + final rendered path, so these tests may already pass; + they exist as regression guards for the context-builder layer. 
+ """ + + def test_nfd_tag_name_normalized_in_tag_list(self, settings): + """NFD tag name must appear as NFC bytes in the {{ tag_list }} shorthand.""" + settings.FILENAME_FORMAT = "{{ tag_list }}/{{ title }}" + nfd = unicodedata.normalize("NFD", "Büro") + nfc = unicodedata.normalize("NFC", "Büro") + assert nfd != nfc # confirm they differ at byte level + + tag = TagFactory(name=nfd) + doc = DocumentFactory(title="doc", mime_type="application/pdf") + doc.tags.set([tag]) + + result = generate_filename(doc) + + assert str(result).encode() == f"{nfc}/doc.pdf".encode() + + def test_nfd_original_name_normalized_to_nfc(self, settings): + settings.FILENAME_FORMAT = "{{ original_name }}" + nfd = unicodedata.normalize("NFD", "Rechnung März") + nfc = unicodedata.normalize("NFC", "Rechnung März") + + doc = DocumentFactory( + original_filename=f"{nfd}.pdf", + mime_type="application/pdf", + ) + result = generate_filename(doc) + + assert str(result).encode() == f"{nfc}.pdf".encode() + + def test_nfd_custom_field_string_value_normalized(self, settings): + """NFD value in a STRING-type custom field must appear as NFC in the context.""" + settings.FILENAME_FORMAT = ( + "{{ custom_fields['Location']['value'] }}/{{ title }}" + ) + nfd_value = unicodedata.normalize("NFD", "Düsseldorf") + nfc_value = unicodedata.normalize("NFC", "Düsseldorf") + assert nfd_value != nfc_value + + doc = DocumentFactory(title="report", mime_type="application/pdf") + cf = CustomField.objects.create( + name="Location", + data_type=CustomField.FieldDataType.STRING, + ) + CustomFieldInstance.objects.create( + document=doc, + field=cf, + value_text=nfd_value, + ) + + result = generate_filename(doc) + + assert str(result).encode() == f"{nfc_value}/report.pdf".encode() + + def test_nfd_custom_field_name_normalized_as_key(self, settings): + """NFD characters in a custom field name must appear as NFC in the context dict key.""" + nfd_name = unicodedata.normalize("NFD", "Größe") + nfc_name = 
unicodedata.normalize("NFC", "Größe") + assert nfd_name != nfc_name + + settings.FILENAME_FORMAT = f"{{% if custom_fields['{nfc_name}'] %}}{{{{ custom_fields['{nfc_name}']['value'] }}}}/{{{{ title }}}}{{% else %}}{{{{ title }}}}{{% endif %}}" + + doc = DocumentFactory(title="letter", mime_type="application/pdf") + cf = CustomField.objects.create( + name=nfd_name, + data_type=CustomField.FieldDataType.STRING, + ) + CustomFieldInstance.objects.create( + document=doc, + field=cf, + value_text="Berlin", + ) + + result = generate_filename(doc) + + # If field name key is NFC-normalized, the template condition succeeds + # and result is "Berlin/letter.pdf"; otherwise it falls back to "letter.pdf" + assert str(result) == "Berlin/letter.pdf" + + def test_nfd_tag_name_list_normalized_to_nfc(self, settings): + """NFD tag names in tag_name_list must appear as NFC bytes when iterated.""" + settings.FILENAME_FORMAT = ( + "{% for t in tag_name_list %}{{ t }}{% endfor %}/{{ title }}" + ) + nfd = unicodedata.normalize("NFD", "Büro") + nfc = unicodedata.normalize("NFC", "Büro") + assert nfd != nfc # confirm byte-level difference + + doc = DocumentFactory(title="doc", mime_type="application/pdf") + doc.tags.add(TagFactory(name=nfd)) + result = generate_filename(doc) + + assert str(result).encode() == f"{nfc}/doc.pdf".encode() diff --git a/src/documents/views.py b/src/documents/views.py index cbc4560d8..5ed6fdaf5 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -3126,6 +3126,7 @@ class PostDocumentView(GenericAPIView[Any]): serializer.is_valid(raise_exception=True) doc_name, doc_data = serializer.validated_data.get("document") + doc_name = normalize("NFC", doc_name) correspondent_id = serializer.validated_data.get("correspondent") document_type_id = serializer.validated_data.get("document_type") storage_path_id = serializer.validated_data.get("storage_path") diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index d551cc8cd..acdc72703 100644 --- 
a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -4,6 +4,7 @@ import logging import ssl import tempfile import traceback +import unicodedata from datetime import date from datetime import timedelta from fnmatch import fnmatch @@ -496,10 +497,10 @@ class MailAccountHandler(LoggingMixin): rule: MailRule, ) -> str | None: if rule.assign_title_from == MailRule.TitleSource.FROM_SUBJECT: - return message.subject + return unicodedata.normalize("NFC", message.subject) elif rule.assign_title_from == MailRule.TitleSource.FROM_FILENAME: - return Path(att.filename).stem + return unicodedata.normalize("NFC", Path(att.filename).stem) elif rule.assign_title_from == MailRule.TitleSource.NONE: return None @@ -866,7 +867,9 @@ class MailAccountHandler(LoggingMixin): ), ) - attachment_name = pathvalidate.sanitize_filename(att.filename) + attachment_name = pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", att.filename), + ) if attachment_name: temp_filename = temp_dir / attachment_name else: # pragma: no cover @@ -882,7 +885,7 @@ class MailAccountHandler(LoggingMixin): ) doc_overrides = DocumentMetadataOverrides( title=title, - filename=pathvalidate.sanitize_filename(att.filename), + filename=attachment_name, correspondent_id=correspondent.id if correspondent else None, document_type_id=doc_type.id if doc_type else None, tag_ids=tag_ids, @@ -988,7 +991,9 @@ class MailAccountHandler(LoggingMixin): ) doc_overrides = DocumentMetadataOverrides( title=message.subject, - filename=pathvalidate.sanitize_filename(f"{message.subject}.eml"), + filename=pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", f"{message.subject}.eml"), + ), correspondent_id=correspondent.id if correspondent else None, document_type_id=doc_type.id if doc_type else None, tag_ids=tag_ids, diff --git a/src/paperless_mail/tests/test_mail_nfc.py b/src/paperless_mail/tests/test_mail_nfc.py new file mode 100644 index 000000000..bfef06da9 --- /dev/null +++ 
b/src/paperless_mail/tests/test_mail_nfc.py @@ -0,0 +1,182 @@ +""" +Tests that mail attachment filenames and EML subject filenames are +normalized to NFC Unicode before being stored as document overrides. + +Filenames from MIME headers can arrive in NFD form (e.g. from macOS Mail), +and must be normalized to NFC so filenames are consistent regardless of the +sending client. +""" + +import unicodedata +from pathlib import Path +from unittest import mock + +import pytest + +from documents.tests.utils import remove_dirs +from documents.tests.utils import setup_directories +from paperless_mail.models import MailRule +from paperless_mail.tests.factories import MailAccountFactory +from paperless_mail.tests.test_mail import MessageBuilder +from paperless_mail.tests.test_mail import _AttachmentDef +from paperless_mail.tests.test_mail import fake_magic_from_buffer + + +@pytest.fixture() +def directories(settings): + dirs = setup_directories() + yield dirs + remove_dirs(dirs) + + +@pytest.fixture() +def queue_consumption_tasks_mock(): + with mock.patch("paperless_mail.mail.queue_consumption_tasks") as m: + yield m + + +@pytest.fixture() +def mail_account(db): + return MailAccountFactory() + + +@pytest.fixture() +def attachment_rule(mail_account): + rule = MailRule( + name="attachment rule", + account=mail_account, + assign_title_from=MailRule.TitleSource.FROM_FILENAME, + consumption_scope=MailRule.ConsumptionScope.ATTACHMENTS_ONLY, + attachment_type=MailRule.AttachmentProcessing.ATTACHMENTS_ONLY, + ) + rule.save() + return rule + + +@pytest.fixture() +def eml_rule(mail_account): + rule = MailRule( + name="eml rule", + account=mail_account, + assign_title_from=MailRule.TitleSource.FROM_SUBJECT, + consumption_scope=MailRule.ConsumptionScope.EML_ONLY, + attachment_type=MailRule.AttachmentProcessing.ATTACHMENTS_ONLY, + ) + rule.save() + return rule + + +@pytest.fixture() +def message_builder(): + return MessageBuilder() + + +@pytest.mark.django_db 
+@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer) +class TestMailNFCNormalization: + """Attachment filenames and EML subject filenames must be NFC-normalized.""" + + def test_attachment_nfd_filename_normalized_to_nfc( + self, + directories, + queue_consumption_tasks_mock, + attachment_rule, + mail_account_handler, + message_builder, + ): + """Attachment filename arriving as NFD must be stored as NFC in both + the overrides and the temp file written to disk. + """ + nfd_filename = unicodedata.normalize("NFD", "Rechnung März.pdf") + nfc_filename = unicodedata.normalize("NFC", "Rechnung März.pdf") + + # Confirm the fixture is actually NFD (not already NFC) + assert unicodedata.is_normalized("NFD", nfd_filename) + assert not unicodedata.is_normalized("NFC", nfd_filename) + + message = message_builder.create_message( + subject="Test invoice", + from_="sender@example.com", + attachments=[ + _AttachmentDef(filename=nfd_filename, content=b"%PDF-1.4 test"), + ], + ) + + result = mail_account_handler._handle_message(message, attachment_rule) + + assert result == 1 + queue_consumption_tasks_mock.assert_called_once() + + call_kwargs = queue_consumption_tasks_mock.call_args.kwargs + consume_tasks = call_kwargs["consume_tasks"] + assert len(consume_tasks) == 1 + + overrides = consume_tasks[0].kwargs["overrides"] + assert overrides.filename == nfc_filename + assert unicodedata.is_normalized("NFC", overrides.filename) + assert unicodedata.is_normalized("NFC", overrides.title) + + input_doc = consume_tasks[0].kwargs["input_doc"] + original_file = Path(input_doc.original_file) + assert original_file.exists() + assert original_file.name == nfc_filename + + def test_eml_subject_filename_nfc( + self, + directories, + queue_consumption_tasks_mock, + eml_rule, + mail_account_handler, + message_builder, + ): + """EML filename derived from subject arriving as NFD must be stored as NFC.""" + nfd_subject = unicodedata.normalize("NFD", "Rechnung März 2024") + 
nfc_expected_filename = unicodedata.normalize("NFC", "Rechnung März 2024.eml") + + # Confirm the fixture is actually NFD + assert unicodedata.is_normalized("NFD", nfd_subject) + + message = message_builder.create_message( + subject=nfd_subject, + from_="sender@example.com", + attachments=0, + ) + + mail_account_handler._handle_message(message, eml_rule) + + queue_consumption_tasks_mock.assert_called_once() + + call_kwargs = queue_consumption_tasks_mock.call_args.kwargs + consume_tasks = call_kwargs["consume_tasks"] + assert len(consume_tasks) == 1 + + overrides = consume_tasks[0].kwargs["overrides"] + assert overrides.filename == nfc_expected_filename + assert unicodedata.is_normalized("NFC", overrides.filename) + + def test_already_nfc_attachment_filename_unchanged( + self, + directories, + queue_consumption_tasks_mock, + attachment_rule, + mail_account_handler, + message_builder, + ): + """An attachment filename already in NFC must pass through unchanged.""" + nfc_filename = "Invoice_2024.pdf" + assert unicodedata.is_normalized("NFC", nfc_filename) + + message = message_builder.create_message( + subject="Invoice", + from_="sender@example.com", + attachments=[ + _AttachmentDef(filename=nfc_filename, content=b"%PDF-1.4 test"), + ], + ) + + mail_account_handler._handle_message(message, attachment_rule) + + call_kwargs = queue_consumption_tasks_mock.call_args.kwargs + consume_tasks = call_kwargs["consume_tasks"] + overrides = consume_tasks[0].kwargs["overrides"] + assert overrides.filename == nfc_filename