Fix: Fold query and autocomplete terms with Tantivy's ascii_fold so special letters match (#12868)

2026-07-19 10:24:57 +00:00 · 2026-05-29 16:42:07 -07:00
parent bbceb5dac6
commit 889ccfd67a
5 changed files with 82 additions and 21 deletions
@@ -22,7 +22,6 @@ from django.conf import settings
 from django.utils.timezone import get_current_timezone
 from guardian.shortcuts import get_users_with_perms

-from documents.search._normalize import ascii_fold
 from documents.search._query import build_permission_filter
 from documents.search._query import parse_simple_text_highlight_query
 from documents.search._query import parse_simple_text_query
@@ -32,6 +31,7 @@ from documents.search._schema import _write_sentinels
 from documents.search._schema import build_schema
 from documents.search._schema import open_or_rebuild_index
 from documents.search._schema import wipe_index
+from documents.search._tokenizer import ascii_fold
 from documents.search._tokenizer import register_tokenizers
 from documents.utils import IterWrapper
 from documents.utils import identity
@@ -1,8 +0,0 @@
-from __future__ import annotations
-
-import unicodedata
-
-
-def ascii_fold(text: str) -> str:
-    """Normalize unicode text to ASCII equivalents for search consistency."""
-    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode()
@@ -12,7 +12,7 @@ import tantivy
 from dateutil.relativedelta import relativedelta
 from django.conf import settings

-from documents.search._normalize import ascii_fold
+from documents.search._tokenizer import simple_search_tokens

 if TYPE_CHECKING:
    from datetime import tzinfo
@@ -78,7 +78,6 @@ _YEAR_RANGE_RE = regex.compile(
    r"(?<!\w)(?P<field>created|modified|added):\[(?P<y1>\d{4})\s+TO\s+(?P<y2>\d{4})\]",
    regex.IGNORECASE,
 )
-_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
 # Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because
 # the NOT/MUST operators require no space between the operator and the term.
 # In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator.
@@ -542,11 +541,10 @@ _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}


 def _simple_query_tokens(raw_query: str) -> list[str]:
-    tokens = [
-        ascii_fold(token.lower())
-        for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
-    ]
-    return [token for token in tokens if token]
+    # Tokenize and fold via the same analyzer used to index simple_title /
+    # simple_content, so query terms fold identically to the indexed terms
+    # (single source of truth for ASCII folding).
+    return simple_search_tokens(raw_query)


 def _build_simple_field_query(
@@ -614,9 +612,10 @@ def parse_user_query(
        field_boosts=_FIELD_BOOSTS,
    )

-    # CJK characters are stripped by ascii_fold in the standard tokenizer, so
-    # they would never match content/title. Route CJK queries to the bigram
-    # fields, which use an ngram tokenizer that preserves non-ASCII text.
+    # The standard analyzer keeps a whitespace-free CJK run as a single token,
+    # so substring queries can't match content/title (and long runs are dropped
+    # by remove_long). Route CJK queries to the bigram fields, whose ngram
+    # tokenizer indexes overlapping 2-grams for substring matching.
    cjk_query = (
        _build_cjk_query(index, raw_query, _CJK_ALL_FIELDS)
        if _has_cjk(raw_query)
@@ -658,8 +657,9 @@ def parse_simple_query(

    Query string is escaped and normalized to be treated as "simple" text query.
    When cjk_fields is provided and the query contains CJK characters, an
-    additional Should clause searches those bigram-tokenized fields so that
-    CJK text is not silently dropped by ascii_fold.
+    additional Should clause searches those bigram-tokenized fields, which match
+    CJK substrings the simple analyzer can't (long whitespace-free runs are
+    dropped by remove_long).
    """
    tokens = _simple_query_tokens(raw_query)

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+from typing import Final

 import tantivy

@@ -128,3 +129,36 @@ def _simple_search_analyzer() -> tantivy.TextAnalyzer:
        .filter(tantivy.Filter.ascii_fold())
        .build()
    )
+
+
+# Shared analyzers for query-side normalization. They reuse the exact filters
+# applied at index time so query terms fold identically (single source of truth
+# for ASCII folding, instead of a separate Python implementation). tantivy-py's
+# TextAnalyzer.analyze clones internally per call, so these are safe to share.
+_SIMPLE_SEARCH_ANALYZER: Final = _simple_search_analyzer()
+# raw tokenizer keeps the whole input as one token, so this folds an arbitrary
+# string to ASCII exactly like the content tokenizers (ß->ss, ø->o, æ->ae, ...)
+# without splitting it - used for autocomplete words and prefixes.
+_ASCII_FOLD_ANALYZER: Final = (
+    tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.raw())
+    .filter(tantivy.Filter.ascii_fold())
+    .build()
+)
+
+
+def simple_search_tokens(text: str) -> list[str]:
+    """Tokenize a query string exactly as simple_title/simple_content are indexed."""
+    return _SIMPLE_SEARCH_ANALYZER.analyze(text)
+
+
+def ascii_fold(text: str) -> str:
+    """Fold text to ASCII using the same mapping as the content tokenizers.
+
+    Maps non-decomposable letters (ß->ss, ø->o, æ->ae, ...) identically to
+    Tantivy's ascii_fold filter used at index time, so query/autocomplete terms
+    agree with the folded content. A naive NFD strip would instead delete those
+    letters, causing silent search misses. Callers lowercase first, matching the
+    index pipeline's lowercase -> ascii_fold order.
+    """
+    tokens = _ASCII_FOLD_ANALYZER.analyze(text)
+    return tokens[0] if tokens else ""
@@ -385,6 +385,29 @@ class TestSearch:
            == 1
        )

+    @pytest.mark.parametrize(
+        "query",
+        [
+            pytest.param("Straße", id="eszett"),
+            pytest.param("Ærøskøbing", id="ae_and_oslash"),
+            pytest.param("strasse", id="ascii_fold_form"),
+        ],
+    )
+    def test_simple_search_folds_special_letters_like_index(
+        self,
+        backend: TantivyBackend,
+        query: str,
+    ) -> None:
+        """Query-side folding must match index-side folding for non-decomposable
+        letters (ß→ss, ø→o, ...): the original and ASCII-folded spellings both match.
+        A naive NFD fold deletes these letters, so the original silently misses."""
+        doc = DocumentFactory(title="report", content="Straße Ærøskøbing")
+        backend.add_or_update(doc)
+
+        assert (
+            len(backend.search_ids(query, user=None, search_mode=SearchMode.TEXT)) == 1
+        )
+
    def test_sort_field_ascending(self, backend: TantivyBackend) -> None:
        """Searching with sort_reverse=False must return results in ascending ASN order."""
        for asn in [30, 10, 20]:
@@ -564,6 +587,18 @@ class TestAutocomplete:
        results = backend.autocomplete("pay", limit=10)
        assert results.index("payment") < results.index("payslip")

+    def test_folds_special_letters_consistently(
+        self,
+        backend: TantivyBackend,
+    ) -> None:
+        """Autocomplete words must fold the same way as content (ß→ss), so a
+        prefix of the folded form finds them. A naive NFD fold would store the
+        word as 'strae' and the prefix 'stras' would never match it."""
+        doc = DocumentFactory(title="Straße", content="details")
+        backend.add_or_update(doc)
+
+        assert "strasse" in backend.autocomplete("stras", limit=10)
+

 class TestMoreLikeThis:
    """Test more like this functionality."""