diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index fef248253..0e8a543ab 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -464,8 +464,13 @@ def normalize_query(query: str) -> str: return " AND ".join(f"{field}:{v}" for v in values) try: + # Only true multi-value fields are comma-split, matching Whoosh's + # KEYWORD(commas=True) fields (tag, tag_id, viewer_id). A field-agnostic + # match would corrupt unrelated text such as URLs (http://x/a,b) or + # numeric values (title:10,20). tag_id is ordered before tag so the + # longer field name wins. query = regex.sub( - r"(\w+):([^\s\[\]]+(?:,[^\s\[\]]+)+)", + r"(? None: assert normalize_query("bank statement") == "bank statement" + def test_normalize_expands_multi_value_id_fields(self) -> None: + # tag_id and viewer_id were KEYWORD(commas=True) in Whoosh too. + assert normalize_query("tag_id:1,2") == "tag_id:1 AND tag_id:2" + assert normalize_query("viewer_id:5,6") == "viewer_id:5 AND viewer_id:6" + + @pytest.mark.parametrize( + "query", + [ + pytest.param("http://example.com/a,b", id="url_with_comma"), + pytest.param("title:10,20", id="non_multivalue_field"), + pytest.param("correspondent:foo,bar", id="text_field_not_comma_split"), + pytest.param("content:a,b,c", id="content_field"), + ], + ) + def test_normalize_does_not_expand_non_multi_value_fields( + self, + query: str, + ) -> None: + # Only true multi-value fields (tag/tag_id/viewer_id) comma-split, matching + # Whoosh's KEYWORD(commas=True) set. Everything else passes through verbatim. + assert normalize_query(query) == query + @pytest.mark.parametrize( ("raw", "expected"), [