From 51880b04f88dd3cd2bd8d945c6955d1f548c8ae2 Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Thu, 2 Apr 2026 15:16:06 -0700
Subject: [PATCH] Anchor later query tokens in regex search

---
Reviewer notes (placed after the three-dash separator, so they are not part
of the commit message and are not part of the diff itself):

* _build_simple_field_query previously built ".*ESCAPED.*" for every token
  via a single list comprehension. With this patch only the token at index 0
  keeps the leading ".*"; every later token gets "ESCAPED.*", where ESCAPED
  is regex.escape(token).
* When exactly one pattern is produced it is passed unchanged to
  tantivy.Query.regex_query(index.schema, field, patterns[0]). How several
  patterns are combined lives in the "else:" branch, which this hunk does
  not show.
* The in-code comment justifies the anchoring by saying later tokens follow
  a whitespace boundary in the original query. How `tokens` is derived from
  the query string is outside this hunk -- TODO confirm that the assumption
  holds for the tokenizer actually used.
* The new test indexes three documents titled "Z-Berichte 6",
  "Z-Berichte 60" and "Z-Berichte 16", searches for "Z-Berichte 6" with
  search_mode=SearchMode.TEXT, and asserts that the first two ids are among
  the hits while the third is not.
* NOTE(review): line breaks and indentation below were restored assuming
  4-space Python indentation; verify the context lines against the target
  files before applying.

 src/documents/search/_query.py             | 12 ++++++-
 src/documents/tests/search/test_backend.py | 42 ++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py
index 47723db2e..b7bcbbe9c 100644
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -450,7 +450,17 @@ def _build_simple_field_query(
     field: str,
     tokens: list[str],
 ) -> tantivy.Query:
-    patterns = [f".*{regex.escape(token)}.*" for token in tokens]
+    patterns = []
+    for idx, token in enumerate(tokens):
+        escaped = regex.escape(token)
+        # For multi-token substring search, only the first token can begin mid-word.
+        # Later tokens follow a whitespace boundary in the original query, so anchor
+        # them to the start of the next indexed token to reduce false positives like
+        # matching "Z-Berichte 16" for the query "Z-Berichte 6".
+        if idx == 0:
+            patterns.append(f".*{escaped}.*")
+        else:
+            patterns.append(f"{escaped}.*")
     if len(patterns) == 1:
         query = tantivy.Query.regex_query(index.schema, field, patterns[0])
     else:
diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py
index c300f2461..ff9638e63 100644
--- a/src/documents/tests/search/test_backend.py
+++ b/src/documents/tests/search/test_backend.py
@@ -187,6 +187,48 @@ class TestSearch:
         )
         assert non_match.total == 0
 
+    def test_text_mode_anchors_later_query_tokens_to_token_starts(
+        self,
+        backend: TantivyBackend,
+    ):
+        """Multi-token simple search should not match later tokens in the middle of a word."""
+        exact_doc = Document.objects.create(
+            title="Z-Berichte 6",
+            content="monthly report",
+            checksum="TXT9",
+            pk=15,
+        )
+        prefix_doc = Document.objects.create(
+            title="Z-Berichte 60",
+            content="monthly report",
+            checksum="TXT10",
+            pk=16,
+        )
+        false_positive = Document.objects.create(
+            title="Z-Berichte 16",
+            content="monthly report",
+            checksum="TXT11",
+            pk=17,
+        )
+        backend.add_or_update(exact_doc)
+        backend.add_or_update(prefix_doc)
+        backend.add_or_update(false_positive)
+
+        results = backend.search(
+            "Z-Berichte 6",
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TEXT,
+        )
+        result_ids = {hit["id"] for hit in results.hits}
+
+        assert exact_doc.id in result_ids
+        assert prefix_doc.id in result_ids
+        assert false_positive.id not in result_ids
+
     def test_text_mode_ignores_queries_without_searchable_tokens(
         self,
         backend: TantivyBackend,