From 889ccfd67a02f715cb00f61b1db1f41ca9c604c6 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Fri, 29 May 2026 16:42:07 -0700
Subject: [PATCH] Fix: Fold query and autocomplete terms with Tantivy's
 ascii_fold so special letters match (#12868)

---
 src/documents/search/_backend.py           |  2 +-
 src/documents/search/_normalize.py         |  8 -----
 src/documents/search/_query.py             | 24 +++++++--------
 src/documents/search/_tokenizer.py         | 34 +++++++++++++++++++++
 src/documents/tests/search/test_backend.py | 35 ++++++++++++++++++++++
 5 files changed, 82 insertions(+), 21 deletions(-)
 delete mode 100644 src/documents/search/_normalize.py
diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py
index 22c02b299..5b5c8aa08 100644
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -22,7 +22,6 @@ from django.conf import settings
 from django.utils.timezone import get_current_timezone
 from guardian.shortcuts import get_users_with_perms
 
-from documents.search._normalize import ascii_fold
 from documents.search._query import build_permission_filter
 from documents.search._query import parse_simple_text_highlight_query
 from documents.search._query import parse_simple_text_query
@@ -32,6 +31,7 @@ from documents.search._schema import _write_sentinels
 from documents.search._schema import build_schema
 from documents.search._schema import open_or_rebuild_index
 from documents.search._schema import wipe_index
+from documents.search._tokenizer import ascii_fold
 from documents.search._tokenizer import register_tokenizers
 from documents.utils import IterWrapper
 from documents.utils import identity
diff --git a/src/documents/search/_normalize.py b/src/documents/search/_normalize.py
deleted file mode 100644
index 3d7b23f33..000000000
--- a/src/documents/search/_normalize.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from __future__ import annotations
-
-import unicodedata
-
-
-def ascii_fold(text: str) -> str:
-    """Normalize unicode text to ASCII equivalents for search consistency."""
-    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode()
diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py
index cced4c64c..932d68bc1 100644
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -12,7 +12,7 @@ import tantivy
 from dateutil.relativedelta import relativedelta
 from django.conf import settings
 
-from documents.search._normalize import ascii_fold
+from documents.search._tokenizer import simple_search_tokens
 
 if TYPE_CHECKING:
     from datetime import tzinfo
@@ -78,7 +78,6 @@ _YEAR_RANGE_RE = regex.compile(
     r"(?<!\w)(?P<field>created|modified|added):\[(?P<y1>\d{4})\s+TO\s+(?P<y2>\d{4})\]",
     regex.IGNORECASE,
 )
-_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
 # Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because
 # the NOT/MUST operators require no space between the operator and the term.
 # In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator.
@@ -542,11 +541,10 @@ _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
 
 
 def _simple_query_tokens(raw_query: str) -> list[str]:
-    tokens = [
-        ascii_fold(token.lower())
-        for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
-    ]
-    return [token for token in tokens if token]
+    # Tokenize and fold via the same analyzer used to index simple_title /
+    # simple_content, so query terms fold identically to the indexed terms
+    # (single source of truth for ASCII folding).
+    return simple_search_tokens(raw_query)
 
 
 def _build_simple_field_query(
@@ -614,9 +612,10 @@ def parse_user_query(
         field_boosts=_FIELD_BOOSTS,
     )
 
-    # CJK characters are stripped by ascii_fold in the standard tokenizer, so
-    # they would never match content/title. Route CJK queries to the bigram
-    # fields, which use an ngram tokenizer that preserves non-ASCII text.
+    # The standard analyzer keeps a whitespace-free CJK run as a single token,
+    # so substring queries can't match content/title (and long runs are dropped
+    # by remove_long). Route CJK queries to the bigram fields, whose ngram
+    # tokenizer indexes overlapping 2-grams for substring matching.
     cjk_query = (
         _build_cjk_query(index, raw_query, _CJK_ALL_FIELDS)
         if _has_cjk(raw_query)
@@ -658,8 +657,9 @@ def parse_simple_query(
 
     Query string is escaped and normalized to be treated as "simple" text query.
     When cjk_fields is provided and the query contains CJK characters, an
-    additional Should clause searches those bigram-tokenized fields so that
-    CJK text is not silently dropped by ascii_fold.
+    additional Should clause searches those bigram-tokenized fields, which match
+    CJK substrings the simple analyzer can't (long whitespace-free runs are
+    dropped by remove_long).
     """
     tokens = _simple_query_tokens(raw_query)
 
diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py
index 2079ca4cc..207a876a1 100644
--- a/src/documents/search/_tokenizer.py
+++ b/src/documents/search/_tokenizer.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+from typing import Final
 
 import tantivy
 
@@ -128,3 +129,36 @@ def _simple_search_analyzer() -> tantivy.TextAnalyzer:
         .filter(tantivy.Filter.ascii_fold())
         .build()
     )
+
+
+# Shared analyzers for query-side normalization. They reuse the exact filters
+# applied at index time so query terms fold identically (single source of truth
+# for ASCII folding, instead of a separate Python implementation). tantivy-py's
+# TextAnalyzer.analyze clones internally per call, so these are safe to share.
+_SIMPLE_SEARCH_ANALYZER: Final = _simple_search_analyzer()
+# The raw tokenizer keeps the whole input as one token, so this analyzer folds
+# an arbitrary string to ASCII exactly like the content tokenizers (ß->ss, ø->o,
+# æ->ae, ...) without splitting it. Used for autocomplete words and prefixes.
+_ASCII_FOLD_ANALYZER: Final = (
+    tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.raw())
+    .filter(tantivy.Filter.ascii_fold())
+    .build()
+)
+
+
+def simple_search_tokens(text: str) -> list[str]:
+    """Tokenize a query string exactly as simple_title/simple_content are indexed."""
+    return _SIMPLE_SEARCH_ANALYZER.analyze(text)
+
+
+def ascii_fold(text: str) -> str:
+    """Fold text to ASCII using the same mapping as the content tokenizers.
+
+    Maps non-decomposable letters (ß->ss, ø->o, æ->ae, ...) identically to
+    Tantivy's ascii_fold filter used at index time, so query/autocomplete terms
+    agree with the folded content. A naive NFD strip would instead delete those
+    letters, causing silent search misses. No lowercasing happens here: callers
+    must lowercase first to match the index's lowercase -> ascii_fold order.
+    """
+    tokens = _ASCII_FOLD_ANALYZER.analyze(text)
+    return tokens[0] if tokens else ""
diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py
index c0d7b6d99..d9fa30715 100644
--- a/src/documents/tests/search/test_backend.py
+++ b/src/documents/tests/search/test_backend.py
@@ -385,6 +385,29 @@ class TestSearch:
             == 1
         )
 
+    @pytest.mark.parametrize(
+        "query",
+        [
+            pytest.param("Straße", id="eszett"),
+            pytest.param("Ærøskøbing", id="ae_and_oslash"),
+            pytest.param("strasse", id="ascii_fold_form"),
+        ],
+    )
+    def test_simple_search_folds_special_letters_like_index(
+        self,
+        backend: TantivyBackend,
+        query: str,
+    ) -> None:
+        """Query-side folding must match index-side folding for non-decomposable
+        letters (ß→ss, ø→o, ...). Accented and already-folded queries must both
+        find the doc; a naive NFD fold deletes these letters and fails to match."""
+        doc = DocumentFactory(title="report", content="Straße Ærøskøbing")
+        backend.add_or_update(doc)
+
+        assert (
+            len(backend.search_ids(query, user=None, search_mode=SearchMode.TEXT)) == 1
+        )
+
     def test_sort_field_ascending(self, backend: TantivyBackend) -> None:
         """Searching with sort_reverse=False must return results in ascending ASN order."""
         for asn in [30, 10, 20]:
@@ -564,6 +587,18 @@ class TestAutocomplete:
         results = backend.autocomplete("pay", limit=10)
         assert results.index("payment") < results.index("payslip")
 
+    def test_folds_special_letters_consistently(
+        self,
+        backend: TantivyBackend,
+    ) -> None:
+        """Autocomplete words must fold the same way as content (ß→ss), so a
+        prefix of the folded form finds them. A naive NFD fold would store the
+        word as 'strae' and the prefix 'stras' would never match it."""
+        doc = DocumentFactory(title="Straße", content="details")
+        backend.add_or_update(doc)
+
+        assert "strasse" in backend.autocomplete("stras", limit=10)
+
 
 class TestMoreLikeThis:
     """Test more like this functionality."""