Fix: Apply unicode normalization to all paths and path components (#12993)

This commit is contained in:
Trenton H
2026-06-13 05:45:54 -07:00
committed by GitHub
parent 92c016ce47
commit 8ed4bf2011
6 changed files with 499 additions and 20 deletions
+24 -15
View File
@@ -1,6 +1,7 @@
import logging
import os
import re
import unicodedata
from collections.abc import Iterable
from pathlib import PurePath
@@ -36,10 +37,12 @@ class FilePathTemplate(Template):
def clean_filepath(value: str) -> str:
"""
Clean up a filepath by:
1. Removing newlines and carriage returns
2. Removing extra spaces before and after forward slashes
3. Preserving spaces in other parts of the path
1. Normalizing Unicode to NFC form to prevent byte-level mismatches
2. Removing newlines and carriage returns
3. Removing extra spaces before and after forward slashes
4. Preserving spaces in other parts of the path
"""
value = unicodedata.normalize("NFC", value)
value = value.replace("\n", "").replace("\r", "")
value = re.sub(r"\s*/\s*", "/", value)
@@ -181,17 +184,17 @@ def get_basic_metadata_context(
"""
return {
"title": pathvalidate.sanitize_filename(
document.title,
unicodedata.normalize("NFC", document.title),
replacement_text="-",
),
"correspondent": pathvalidate.sanitize_filename(
document.correspondent.name,
unicodedata.normalize("NFC", document.correspondent.name),
replacement_text="-",
)
if document.correspondent
else no_value_default,
"document_type": pathvalidate.sanitize_filename(
document.document_type.name,
unicodedata.normalize("NFC", document.document_type.name),
replacement_text="-",
)
if document.document_type
@@ -202,7 +205,10 @@ def get_basic_metadata_context(
"owner_username": document.owner.username
if document.owner
else no_value_default,
"original_name": PurePath(document.original_filename).with_suffix("").name
"original_name": unicodedata.normalize(
"NFC",
PurePath(document.original_filename).with_suffix("").name,
)
if document.original_filename
else no_value_default,
"doc_pk": f"{document.pk:07}",
@@ -269,12 +275,12 @@ def get_tags_context(tags: Iterable[Tag]) -> dict[str, str | list[str]]:
return {
"tag_list": pathvalidate.sanitize_filename(
",".join(
sorted(tag.name for tag in tags),
sorted(unicodedata.normalize("NFC", tag.name) for tag in tags),
),
replacement_text="-",
),
# Assumed to be ordered, but a template could loop through to find what they want
"tag_name_list": [x.name for x in tags],
"tag_name_list": [unicodedata.normalize("NFC", x.name) for x in tags],
}
@@ -301,7 +307,7 @@ def get_custom_fields_context(
CustomField.FieldDataType.LONG_TEXT,
}:
value = pathvalidate.sanitize_filename(
field_instance.value,
unicodedata.normalize("NFC", field_instance.value),
replacement_text="-",
)
elif (
@@ -310,10 +316,13 @@ def get_custom_fields_context(
):
options = field_instance.field.extra_data["select_options"]
value = pathvalidate.sanitize_filename(
next(
option["label"]
for option in options
if option["id"] == field_instance.value
unicodedata.normalize(
"NFC",
next(
option["label"]
for option in options
if option["id"] == field_instance.value
),
),
replacement_text="-",
)
@@ -321,7 +330,7 @@ def get_custom_fields_context(
value = field_instance.value
field_data["custom_fields"][
pathvalidate.sanitize_filename(
field_instance.field.name,
unicodedata.normalize("NFC", field_instance.field.name),
replacement_text="-",
)
] = {