From 6ee3c62f9737ecf2d502e412d9235fce70b995c2 Mon Sep 17 00:00:00 2001
From: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Date: Thu, 7 May 2026 06:20:49 -0700
Subject: [PATCH] Handle user queries with Tantivy operators like - or + in them

Natural-language queries such as "H52.1 - Kurzsichtigkeit" contain a "-" or
"+" surrounded by whitespace, or end in a dangling operator ("h52.1 -").
normalize_query() now removes those before the query is parsed, while
operators attached to a term (-term / +term) are left untouched.

Runs of operators ("foo - - bar", "foo - -") are removed as a unit so that
no spaced or dangling operator is left behind after normalization.

---
Review notes (ignored by `git am`):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; confirm the context lines against the tree before
  applying.
- The regexes and tests were amended during review, so the post-image blob
  ids on the `index` lines no longer describe the resulting files.

 src/documents/search/_query.py           | 16 +++++-
 src/documents/tests/search/test_query.py | 84 ++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py
index 65df260c3..04d79d1ef 100644
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -76,6 +76,13 @@ _YEAR_RANGE_RE = regex.compile(
     regex.IGNORECASE,
 )
 _SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
+# Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because
+# the NOT/MUST operators require no space between the operator and the term.
+# In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator.
+# Both patterns consume a whole run of operators ("a - - b", "a - -") in one match:
+# single-operator matches cannot overlap, so part of a run would otherwise survive.
+_SPACED_OPERATOR_RE = regex.compile(r"(?:\s+[-+]+)+\s+")
+_TRAILING_OPERATOR_RE = regex.compile(r"(?:\s+[-+]+)+\s*$")
 
 
 def _fmt(dt: datetime) -> str:
@@ -430,7 +437,14 @@ def normalize_query(query: str) -> str:
             query,
             timeout=_REGEX_TIMEOUT,
         )
-        return regex.sub(r" {2,}", " ", query, timeout=_REGEX_TIMEOUT).strip()
+        query = regex.sub(r" {2,}", " ", query, timeout=_REGEX_TIMEOUT).strip()
+        # Strip trailing dangling operators before Tantivy sees them.
+        query = _TRAILING_OPERATOR_RE.sub("", query, timeout=_REGEX_TIMEOUT).strip()
+        # Replace " - " / " + " with a space: Tantivy requires no space between
+        # the operator and its operand (-term / +term), so spaces on both sides
+        # means this is a natural-language separator, not a query operator.
+        query = _SPACED_OPERATOR_RE.sub(" ", query, timeout=_REGEX_TIMEOUT).strip()
+        return query
     except TimeoutError:  # pragma: no cover
         raise ValueError("Query too complex to process (normalization timed out)")
 
diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py
index 11297eb92..0db224c5d 100644
--- a/src/documents/tests/search/test_query.py
+++ b/src/documents/tests/search/test_query.py
@@ -443,6 +443,25 @@ class TestParseUserQuery:
         q = parse_user_query(query_index, "created:today", UTC)
         assert isinstance(q, tantivy.Query)
 
+    @pytest.mark.parametrize(
+        "raw_query",
+        [
+            pytest.param("h52.1 - kurzsichtigkeit", id="icd_code_dash_description"),
+            pytest.param("H52.1 - asd", id="icd_code_uppercase"),
+            pytest.param("h52.1 -", id="trailing_minus"),
+            pytest.param(". -", id="dot_trailing_minus"),
+            pytest.param("h52. -", id="partial_code_trailing_minus"),
+            pytest.param(".12 -", id="dot_number_trailing_minus"),
+            pytest.param("h52.1 - ku", id="partial_word_after_dash"),
+        ],
+    )
+    def test_spaced_dash_queries_do_not_raise(
+        self,
+        query_index: tantivy.Index,
+        raw_query: str,
+    ) -> None:
+        assert isinstance(parse_user_query(query_index, raw_query, UTC), tantivy.Query)
+
 
 class TestYearRangeRewriting:
     """Whoosh-style year-only date ranges must be rewritten to ISO 8601."""
@@ -548,6 +567,71 @@ class TestNormalizeQuery:
     def test_normalize_no_commas_unchanged(self) -> None:
         assert normalize_query("bank statement") == "bank statement"
 
+    @pytest.mark.parametrize(
+        ("raw", "expected"),
+        [
+            pytest.param(
+                "h52.1 - kurzsichtigkeit",
+                "h52.1 kurzsichtigkeit",
+                id="icd_code_dash_description",
+            ),
+            pytest.param(
+                "H52.1 - asd",
+                "H52.1 asd",
+                id="icd_code_uppercase_dash",
+            ),
+            pytest.param(
+                "h52.1 -",
+                "h52.1",
+                id="trailing_minus",
+            ),
+            pytest.param(
+                ". -",
+                ".",
+                id="dot_trailing_minus",
+            ),
+            pytest.param(
+                "h52. -",
+                "h52.",
+                id="partial_code_trailing_minus",
+            ),
+            pytest.param(
+                "foo - bar - baz",
+                "foo bar baz",
+                id="multiple_dashes",
+            ),
+            pytest.param(
+                "foo + bar",
+                "foo bar",
+                id="spaced_plus_operator",
+            ),
+            pytest.param(
+                "foo - - bar",
+                "foo bar",
+                id="consecutive_spaced_operators",
+            ),
+            pytest.param(
+                "foo - -",
+                "foo",
+                id="trailing_operator_run",
+            ),
+        ],
+    )
+    def test_normalize_strips_dangling_operators(self, raw: str, expected: str) -> None:
+        assert normalize_query(raw) == expected
+
+    @pytest.mark.parametrize(
+        "query",
+        [
+            pytest.param("term -other", id="adjacent_not_operator"),
+            pytest.param("-term", id="leading_not_operator"),
+            pytest.param("+term", id="leading_must_operator"),
+            pytest.param("foo -bar +baz", id="mixed_adjacent_operators"),
+        ],
+    )
+    def test_normalize_preserves_valid_operators(self, query: str) -> None:
+        assert normalize_query(query) == query
+
 
 class TestPermissionFilter:
     """