From 7e381f204e4ed3c61b17ff8cbaa6bb863ca5d9e5 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 12 May 2026 12:41:38 -0700 Subject: [PATCH] Fix: Sanitize dash or plus from the text search path (#12789) --- src/documents/search/_query.py | 6 +++- src/documents/tests/search/test_query.py | 44 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index 04d79d1ef..4fd2bfc68 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -618,7 +618,11 @@ def parse_simple_text_highlight_query( SnippetGenerator we build a plain term query over the content field instead. """ - tokens = _simple_query_tokens(raw_query) + # Strip Tantivy operator chars before tokenizing: this is a plain-text + # highlight query, not a structured boolean query, so +/- are separators. + tokens = _simple_query_tokens( + regex.sub(r"[-+]", " ", raw_query, timeout=_REGEX_TIMEOUT), + ) if not tokens: return tantivy.Query.empty_query() diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py index 0db224c5d..dfeffc253 100644 --- a/src/documents/tests/search/test_query.py +++ b/src/documents/tests/search/test_query.py @@ -16,6 +16,7 @@ from documents.search._query import _datetime_range from documents.search._query import _rewrite_compact_date from documents.search._query import build_permission_filter from documents.search._query import normalize_query +from documents.search._query import parse_simple_text_highlight_query from documents.search._query import parse_user_query from documents.search._query import rewrite_natural_date_keywords from documents.search._schema import build_schema @@ -623,6 +624,49 @@ class TestNormalizeQuery: assert normalize_query(query) == query +class TestParseSimpleTextHighlightQuery: + """parse_simple_text_highlight_query must not raise on natural-language queries.""" + + @pytest.fixture + def query_index(self) -> tantivy.Index: + schema = build_schema() + idx = tantivy.Index(schema, path=None) + register_tokenizers(idx, "") + return idx + + @pytest.mark.parametrize( + "raw_query", + [ + pytest.param("h52.1 - kurzsichtigkeit", id="icd_code_dash_description"), + pytest.param("H52.1 - asd", id="icd_code_uppercase"), + pytest.param("h52.1 -", id="trailing_minus"), + pytest.param(". -", id="dot_trailing_minus"), + pytest.param(".12 -", id="dot_number_trailing_minus"), + pytest.param("f84.0 - v.a. autismusspektrumstorung", id="complex_icd_dash"), + ], + ) + def test_spaced_dash_queries_do_not_raise( + self, + query_index: tantivy.Index, + raw_query: str, + ) -> None: + assert isinstance( + parse_simple_text_highlight_query(query_index, raw_query), + tantivy.Query, + ) + + def test_empty_query_returns_empty_query(self, query_index: tantivy.Index) -> None: + result = parse_simple_text_highlight_query(query_index, "") + assert isinstance(result, tantivy.Query) + + def test_all_operators_returns_empty_query( + self, + query_index: tantivy.Index, + ) -> None: + result = parse_simple_text_highlight_query(query_index, "- +") + assert isinstance(result, tantivy.Query) + + class TestPermissionFilter: """ build_permission_filter tests use an in-memory index - no DB access needed.