Anchor later query tokens in regex search

This commit is contained in:
shamoon
2026-04-02 15:16:06 -07:00
parent f8b686a29e
commit 51880b04f8
2 changed files with 53 additions and 1 deletions

View File

@@ -450,7 +450,17 @@ def _build_simple_field_query(
field: str,
tokens: list[str],
) -> tantivy.Query:
patterns = [f".*{regex.escape(token)}.*" for token in tokens]
patterns = []
for idx, token in enumerate(tokens):
escaped = regex.escape(token)
# For multi-token substring search, only the first token can begin mid-word.
# Later tokens follow a whitespace boundary in the original query, so anchor
# them to the start of the next indexed token to reduce false positives like
# matching "Z-Berichte 16" for the query "Z-Berichte 6".
if idx == 0:
patterns.append(f".*{escaped}.*")
else:
patterns.append(f"{escaped}.*")
if len(patterns) == 1:
query = tantivy.Query.regex_query(index.schema, field, patterns[0])
else:

View File

@@ -187,6 +187,48 @@ class TestSearch:
)
assert non_match.total == 0
def test_text_mode_anchors_later_query_tokens_to_token_starts(
    self,
    backend: TantivyBackend,
):
    """Later tokens of a multi-token text query must not match mid-word.

    Three documents share the title prefix "Z-Berichte" and differ only in
    the trailing number. Searching for "Z-Berichte 6" is expected to return
    the titles ending in "6" and "60" (the number begins with the query
    token) and to exclude the title ending in "16" (the query token only
    occurs inside the number).
    """
    # (title, checksum, pk) for each fixture; creation order is significant
    # only in that it mirrors the unpacking below.
    fixture_rows = (
        ("Z-Berichte 6", "TXT9", 15),
        ("Z-Berichte 60", "TXT10", 16),
        ("Z-Berichte 16", "TXT11", 17),
    )
    same_number, number_as_prefix, number_mid_word = (
        Document.objects.create(
            title=title,
            content="monthly report",
            checksum=checksum,
            pk=pk,
        )
        for title, checksum, pk in fixture_rows
    )

    # Index every fixture before issuing the query.
    for document in (same_number, number_as_prefix, number_mid_word):
        backend.add_or_update(document)

    search_kwargs = {
        "user": None,
        "page": 1,
        "page_size": 10,
        "sort_field": None,
        "sort_reverse": False,
        "search_mode": SearchMode.TEXT,
    }
    results = backend.search("Z-Berichte 6", **search_kwargs)

    returned_ids = set()
    for hit in results.hits:
        returned_ids.add(hit["id"])

    assert same_number.id in returned_ids
    assert number_as_prefix.id in returned_ids
    assert number_mid_word.id not in returned_ids
def test_text_mode_ignores_queries_without_searchable_tokens(
self,
backend: TantivyBackend,