From 889ccfd67a02f715cb00f61b1db1f41ca9c604c6 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 29 May 2026 16:42:07 -0700 Subject: [PATCH] Fix: Fold query and autocomplete terms with Tantivy's ascii_fold so special letters match (#12868) --- src/documents/search/_backend.py | 2 +- src/documents/search/_normalize.py | 8 ----- src/documents/search/_query.py | 24 +++++++-------- src/documents/search/_tokenizer.py | 34 +++++++++++++++++++++ src/documents/tests/search/test_backend.py | 35 ++++++++++++++++++++++ 5 files changed, 82 insertions(+), 21 deletions(-) delete mode 100644 src/documents/search/_normalize.py diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 22c02b299..5b5c8aa08 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -22,7 +22,6 @@ from django.conf import settings from django.utils.timezone import get_current_timezone from guardian.shortcuts import get_users_with_perms -from documents.search._normalize import ascii_fold from documents.search._query import build_permission_filter from documents.search._query import parse_simple_text_highlight_query from documents.search._query import parse_simple_text_query @@ -32,6 +31,7 @@ from documents.search._schema import _write_sentinels from documents.search._schema import build_schema from documents.search._schema import open_or_rebuild_index from documents.search._schema import wipe_index +from documents.search._tokenizer import ascii_fold from documents.search._tokenizer import register_tokenizers from documents.utils import IterWrapper from documents.utils import identity diff --git a/src/documents/search/_normalize.py b/src/documents/search/_normalize.py deleted file mode 100644 index 3d7b23f33..000000000 --- a/src/documents/search/_normalize.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -import unicodedata - - -def ascii_fold(text: str) -> str: - """Normalize unicode text to ASCII equivalents for search consistency.""" - return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode() diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index cced4c64c..932d68bc1 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -12,7 +12,7 @@ import tantivy from dateutil.relativedelta import relativedelta from django.conf import settings -from documents.search._normalize import ascii_fold +from documents.search._tokenizer import simple_search_tokens if TYPE_CHECKING: from datetime import tzinfo @@ -78,7 +78,6 @@ _YEAR_RANGE_RE = regex.compile( r"(?created|modified|added):\[(?P\d{4})\s+TO\s+(?P\d{4})\]", regex.IGNORECASE, ) -_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+") # Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because # the NOT/MUST operators require no space between the operator and the term. # In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator. @@ -542,11 +541,10 @@ _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0} def _simple_query_tokens(raw_query: str) -> list[str]: - tokens = [ - ascii_fold(token.lower()) - for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT) - ] - return [token for token in tokens if token] + # Tokenize and fold via the same analyzer used to index simple_title / + # simple_content, so query terms fold identically to the indexed terms + # (single source of truth for ASCII folding). + return simple_search_tokens(raw_query) def _build_simple_field_query( @@ -614,9 +612,10 @@ def parse_user_query( field_boosts=_FIELD_BOOSTS, ) - # CJK characters are stripped by ascii_fold in the standard tokenizer, so - # they would never match content/title. Route CJK queries to the bigram - # fields, which use an ngram tokenizer that preserves non-ASCII text. + # The standard analyzer keeps a whitespace-free CJK run as a single token, + # so substring queries can't match content/title (and long runs are dropped + # by remove_long). Route CJK queries to the bigram fields, whose ngram + # tokenizer indexes overlapping 2-grams for substring matching. cjk_query = ( _build_cjk_query(index, raw_query, _CJK_ALL_FIELDS) if _has_cjk(raw_query) @@ -658,8 +657,9 @@ def parse_simple_query( Query string is escaped and normalized to be treated as "simple" text query. When cjk_fields is provided and the query contains CJK characters, an - additional Should clause searches those bigram-tokenized fields so that - CJK text is not silently dropped by ascii_fold. + additional Should clause searches those bigram-tokenized fields, which match + CJK substrings the simple analyzer can't (long whitespace-free runs are + dropped by remove_long). """ tokens = _simple_query_tokens(raw_query) diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py index 2079ca4cc..207a876a1 100644 --- a/src/documents/search/_tokenizer.py +++ b/src/documents/search/_tokenizer.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from typing import Final import tantivy @@ -128,3 +129,36 @@ def _simple_search_analyzer() -> tantivy.TextAnalyzer: .filter(tantivy.Filter.ascii_fold()) .build() ) + + +# Shared analyzers for query-side normalization. They reuse the exact filters +# applied at index time so query terms fold identically (single source of truth +# for ASCII folding, instead of a separate Python implementation). tantivy-py's +# TextAnalyzer.analyze clones internally per call, so these are safe to share. +_SIMPLE_SEARCH_ANALYZER: Final = _simple_search_analyzer() +# raw tokenizer keeps the whole input as one token, so this folds an arbitrary +# string to ASCII exactly like the content tokenizers (ß->ss, ø->o, æ->ae, ...) +# without splitting it - used for autocomplete words and prefixes. +_ASCII_FOLD_ANALYZER: Final = ( + tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.raw()) + .filter(tantivy.Filter.ascii_fold()) + .build() +) + + +def simple_search_tokens(text: str) -> list[str]: + """Tokenize a query string exactly as simple_title/simple_content are indexed.""" + return _SIMPLE_SEARCH_ANALYZER.analyze(text) + + +def ascii_fold(text: str) -> str: + """Fold text to ASCII using the same mapping as the content tokenizers. + + Maps non-decomposable letters (ß->ss, ø->o, æ->ae, ...) identically to + Tantivy's ascii_fold filter used at index time, so query/autocomplete terms + agree with the folded content. A naive NFD strip would instead delete those + letters, causing silent search misses. Callers lowercase first, matching the + index pipeline's lowercase -> ascii_fold order. + """ + tokens = _ASCII_FOLD_ANALYZER.analyze(text) + return tokens[0] if tokens else "" diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index c0d7b6d99..d9fa30715 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -385,6 +385,29 @@ class TestSearch: == 1 ) + @pytest.mark.parametrize( + "query", + [ + pytest.param("Straße", id="eszett"), + pytest.param("Ærøskøbing", id="ae_and_oslash"), + pytest.param("strasse", id="ascii_fold_form"), + ], + ) + def test_simple_search_folds_special_letters_like_index( + self, + backend: TantivyBackend, + query: str, + ) -> None: + """Query-side folding must match index-side folding for non-decomposable + letters (ß→ss, ø→o, ...). Searching the accented form must find the doc. + A naive NFD fold deletes these letters and silently fails to match.""" + doc = DocumentFactory(title="report", content="Straße Ærøskøbing") + backend.add_or_update(doc) + + assert ( + len(backend.search_ids(query, user=None, search_mode=SearchMode.TEXT)) == 1 + ) + def test_sort_field_ascending(self, backend: TantivyBackend) -> None: """Searching with sort_reverse=False must return results in ascending ASN order.""" for asn in [30, 10, 20]: @@ -564,6 +587,18 @@ class TestAutocomplete: results = backend.autocomplete("pay", limit=10) assert results.index("payment") < results.index("payslip") + def test_folds_special_letters_consistently( + self, + backend: TantivyBackend, + ) -> None: + """Autocomplete words must fold the same way as content (ß→ss), so a + prefix of the folded form finds them. A naive NFD fold would store the + word as 'strae' and the prefix 'stras' would never match it.""" + doc = DocumentFactory(title="Straße", content="details") + backend.add_or_update(doc) + + assert "strasse" in backend.autocomplete("stras", limit=10) + class TestMoreLikeThis: """Test more like this functionality."""