Fix: Sanitize dash or plus from the text search path (#12789)

This commit is contained in:
Trenton H
2026-05-12 12:41:38 -07:00
committed by GitHub
parent 5f42854d99
commit 7e381f204e
2 changed files with 49 additions and 1 deletions
+5 -1
View File
@@ -618,7 +618,11 @@ def parse_simple_text_highlight_query(
SnippetGenerator we build a plain term query over the content field instead.
"""
tokens = _simple_query_tokens(raw_query)
# Strip Tantivy operator chars before tokenizing: this is a plain-text
# highlight query, not a structured boolean query, so +/- are separators.
tokens = _simple_query_tokens(
regex.sub(r"[-+]", " ", raw_query, timeout=_REGEX_TIMEOUT),
)
if not tokens:
return tantivy.Query.empty_query()
+44
View File
@@ -16,6 +16,7 @@ from documents.search._query import _datetime_range
from documents.search._query import _rewrite_compact_date
from documents.search._query import build_permission_filter
from documents.search._query import normalize_query
from documents.search._query import parse_simple_text_highlight_query
from documents.search._query import parse_user_query
from documents.search._query import rewrite_natural_date_keywords
from documents.search._schema import build_schema
@@ -623,6 +624,49 @@ class TestNormalizeQuery:
assert normalize_query(query) == query
class TestParseSimpleTextHighlightQuery:
"""parse_simple_text_highlight_query must not raise on natural-language queries."""
@pytest.fixture
def query_index(self) -> tantivy.Index:
schema = build_schema()
idx = tantivy.Index(schema, path=None)
register_tokenizers(idx, "")
return idx
@pytest.mark.parametrize(
"raw_query",
[
pytest.param("h52.1 - kurzsichtigkeit", id="icd_code_dash_description"),
pytest.param("H52.1 - asd", id="icd_code_uppercase"),
pytest.param("h52.1 -", id="trailing_minus"),
pytest.param(". -", id="dot_trailing_minus"),
pytest.param(".12 -", id="dot_number_trailing_minus"),
pytest.param("f84.0 - v.a. autismusspektrumstorung", id="complex_icd_dash"),
],
)
def test_spaced_dash_queries_do_not_raise(
self,
query_index: tantivy.Index,
raw_query: str,
) -> None:
assert isinstance(
parse_simple_text_highlight_query(query_index, raw_query),
tantivy.Query,
)
def test_empty_query_returns_empty_query(self, query_index: tantivy.Index) -> None:
result = parse_simple_text_highlight_query(query_index, "")
assert isinstance(result, tantivy.Query)
def test_all_operators_returns_empty_query(
self,
query_index: tantivy.Index,
) -> None:
result = parse_simple_text_highlight_query(query_index, "- +")
assert isinstance(result, tantivy.Query)
class TestPermissionFilter:
"""
build_permission_filter tests use an in-memory index - no DB access needed.