mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-03 12:19:45 +00:00
Fix: Fold query and autocomplete terms with Tantivy's ascii_fold so special letters match (#12868)
This commit is contained in:
@@ -22,7 +22,6 @@ from django.conf import settings
|
||||
from django.utils.timezone import get_current_timezone
|
||||
from guardian.shortcuts import get_users_with_perms
|
||||
|
||||
from documents.search._normalize import ascii_fold
|
||||
from documents.search._query import build_permission_filter
|
||||
from documents.search._query import parse_simple_text_highlight_query
|
||||
from documents.search._query import parse_simple_text_query
|
||||
@@ -32,6 +31,7 @@ from documents.search._schema import _write_sentinels
|
||||
from documents.search._schema import build_schema
|
||||
from documents.search._schema import open_or_rebuild_index
|
||||
from documents.search._schema import wipe_index
|
||||
from documents.search._tokenizer import ascii_fold
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
from documents.utils import IterWrapper
|
||||
from documents.utils import identity
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import unicodedata
|
||||
|
||||
|
||||
# Letters that have no canonical decomposition. NFD leaves them untouched, so
# the ASCII encode step below would delete them outright ("Straße" -> "Strae")
# instead of folding them. The table is not exhaustive; it covers the common
# Latin cases (NOTE(review): intended to agree with the index-side ascii_fold
# filter for these letters - confirm against its mapping table).
_NON_DECOMPOSABLE_FOLDS = str.maketrans(
    {
        "ß": "ss",
        "ẞ": "SS",
        "æ": "ae",
        "Æ": "AE",
        "œ": "oe",
        "Œ": "OE",
        "ø": "o",
        "Ø": "O",
        "đ": "d",
        "Đ": "D",
        "ð": "d",
        "Ð": "D",
        "ł": "l",
        "Ł": "L",
        "þ": "th",
        "Þ": "TH",
    },
)


def ascii_fold(text: str) -> str:
    """Normalize unicode text to ASCII equivalents for search consistency.

    Non-decomposable letters (ß, æ, ø, ...) are first replaced with their
    ASCII spellings; the result is then NFD-decomposed and every remaining
    non-ASCII code point (combining marks, CJK, symbols) is dropped.

    Args:
        text: Arbitrary unicode text. Case is preserved.

    Returns:
        A pure-ASCII string; empty if nothing in ``text`` maps to ASCII.
    """
    # Fold before decomposing: these letters survive NFD unchanged and would
    # otherwise be silently removed by the "ignore" error handler.
    pre_folded = text.translate(_NON_DECOMPOSABLE_FOLDS)
    decomposed = unicodedata.normalize("NFD", pre_folded)
    return decomposed.encode("ascii", "ignore").decode()
|
||||
@@ -12,7 +12,7 @@ import tantivy
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from django.conf import settings
|
||||
|
||||
from documents.search._normalize import ascii_fold
|
||||
from documents.search._tokenizer import simple_search_tokens
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from datetime import tzinfo
|
||||
@@ -78,7 +78,6 @@ _YEAR_RANGE_RE = regex.compile(
|
||||
r"(?<!\w)(?P<field>created|modified|added):\[(?P<y1>\d{4})\s+TO\s+(?P<y2>\d{4})\]",
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
|
||||
# Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because
|
||||
# the NOT/MUST operators require no space between the operator and the term.
|
||||
# In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator.
|
||||
@@ -542,11 +541,10 @@ _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
|
||||
|
||||
|
||||
def _simple_query_tokens(raw_query: str) -> list[str]:
    """Split a raw simple-search query into normalized tokens.

    Tokenizes and folds via the same analyzer used to index simple_title /
    simple_content, so query terms fold identically to the indexed terms
    (single source of truth for ASCII folding).

    Args:
        raw_query: The query text as entered by the user.

    Returns:
        The tokens produced by ``simple_search_tokens`` for ``raw_query``.
    """
    # Delegate entirely to the shared analyzer; a separate Python-side
    # split + fold here could disagree with how the index folded the text.
    return simple_search_tokens(raw_query)
|
||||
|
||||
|
||||
def _build_simple_field_query(
|
||||
@@ -614,9 +612,10 @@ def parse_user_query(
|
||||
field_boosts=_FIELD_BOOSTS,
|
||||
)
|
||||
|
||||
# CJK characters are stripped by ascii_fold in the standard tokenizer, so
|
||||
# they would never match content/title. Route CJK queries to the bigram
|
||||
# fields, which use an ngram tokenizer that preserves non-ASCII text.
|
||||
# The standard analyzer keeps a whitespace-free CJK run as a single token,
|
||||
# so substring queries can't match content/title (and long runs are dropped
|
||||
# by remove_long). Route CJK queries to the bigram fields, whose ngram
|
||||
# tokenizer indexes overlapping 2-grams for substring matching.
|
||||
cjk_query = (
|
||||
_build_cjk_query(index, raw_query, _CJK_ALL_FIELDS)
|
||||
if _has_cjk(raw_query)
|
||||
@@ -658,8 +657,9 @@ def parse_simple_query(
|
||||
|
||||
Query string is escaped and normalized to be treated as "simple" text query.
|
||||
When cjk_fields is provided and the query contains CJK characters, an
|
||||
additional Should clause searches those bigram-tokenized fields so that
|
||||
CJK text is not silently dropped by ascii_fold.
|
||||
additional Should clause searches those bigram-tokenized fields, which match
|
||||
CJK substrings the simple analyzer can't (long whitespace-free runs are
|
||||
dropped by remove_long).
|
||||
"""
|
||||
tokens = _simple_query_tokens(raw_query)
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Final
|
||||
|
||||
import tantivy
|
||||
|
||||
@@ -128,3 +129,36 @@ def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
||||
.filter(tantivy.Filter.ascii_fold())
|
||||
.build()
|
||||
)
|
||||
|
||||
|
||||
# Query-side analyzers, built once and shared by the helpers below. They reuse
# the exact filters applied at index time so query terms fold identically
# (single source of truth for ASCII folding, instead of a separate Python
# implementation). tantivy-py's TextAnalyzer.analyze clones internally per
# call, so these are safe to share.
_SIMPLE_SEARCH_ANALYZER: Final = _simple_search_analyzer()

# The raw tokenizer keeps the whole input as one token, so this analyzer folds
# an arbitrary string to ASCII exactly like the content tokenizers
# (ß->ss, ø->o, æ->ae, ...) without splitting it - used for autocomplete words
# and prefixes.
_ASCII_FOLD_ANALYZER: Final = tantivy.TextAnalyzerBuilder(
    tantivy.Tokenizer.raw(),
).filter(tantivy.Filter.ascii_fold()).build()
|
||||
|
||||
|
||||
def simple_search_tokens(text: str) -> list[str]:
    """Run ``text`` through the shared simple-search analyzer.

    The analyzer is the one built by ``_simple_search_analyzer``, so the
    returned tokens match how simple_title/simple_content are indexed.
    """
    analyzed = _SIMPLE_SEARCH_ANALYZER.analyze(text)
    return analyzed
|
||||
|
||||
|
||||
def ascii_fold(text: str) -> str:
    """Fold ``text`` to ASCII with the mapping the content tokenizers use.

    The string is passed through a raw-tokenizer analyzer carrying Tantivy's
    ascii_fold filter, so non-decomposable letters (ß->ss, ø->o, æ->ae, ...)
    come out the same way they do at index time and query/autocomplete terms
    agree with the folded content. A naive NFD strip would delete those
    letters instead, causing silent search misses.

    Callers are expected to lowercase first, matching the index pipeline's
    lowercase -> ascii_fold order.

    Returns:
        The first token the analyzer emits, or "" when it emits none.
    """
    folded = _ASCII_FOLD_ANALYZER.analyze(text)
    if not folded:
        return ""
    return folded[0]
|
||||
|
||||
@@ -385,6 +385,29 @@ class TestSearch:
|
||||
== 1
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
    "query",
    [
        pytest.param("Straße", id="eszett"),
        pytest.param("Ærøskøbing", id="ae_and_oslash"),
        pytest.param("strasse", id="ascii_fold_form"),
    ],
)
def test_simple_search_folds_special_letters_like_index(
    self,
    backend: TantivyBackend,
    query: str,
) -> None:
    """Text search must fold query terms the same way the index folds content.

    A document containing "Straße Ærøskøbing" has to be found both by the
    accented spellings and by the already-folded form "strasse". A naive NFD
    fold deletes ß/ø/æ on the query side and silently fails to match.
    """
    backend.add_or_update(
        DocumentFactory(title="report", content="Straße Ærøskøbing"),
    )

    matching_ids = backend.search_ids(
        query,
        user=None,
        search_mode=SearchMode.TEXT,
    )
    assert len(matching_ids) == 1
|
||||
|
||||
def test_sort_field_ascending(self, backend: TantivyBackend) -> None:
|
||||
"""Searching with sort_reverse=False must return results in ascending ASN order."""
|
||||
for asn in [30, 10, 20]:
|
||||
@@ -564,6 +587,18 @@ class TestAutocomplete:
|
||||
results = backend.autocomplete("pay", limit=10)
|
||||
assert results.index("payment") < results.index("payslip")
|
||||
|
||||
def test_folds_special_letters_consistently(
    self,
    backend: TantivyBackend,
) -> None:
    """Autocomplete words must be folded like content (ß→ss).

    With a document titled "Straße", the prefix "stras" has to suggest
    "strasse". A naive NFD fold would store the word as 'strae', which the
    prefix 'stras' could never match.
    """
    backend.add_or_update(DocumentFactory(title="Straße", content="details"))

    suggestions = backend.autocomplete("stras", limit=10)
    assert "strasse" in suggestions
|
||||
|
||||
|
||||
class TestMoreLikeThis:
|
||||
"""Test more like this functionality."""
|
||||
|
||||
Reference in New Issue
Block a user