Restrict comma expansion in queries to multi-valued fields only, instead of applying it to any schema field

This commit is contained in:
stumpylog
2026-05-29 12:47:20 -07:00
parent 97e3c75720
commit e32f974078
2 changed files with 28 additions and 1 deletions
+6 -1
View File
@@ -464,8 +464,13 @@ def normalize_query(query: str) -> str:
return " AND ".join(f"{field}:{v}" for v in values)
try:
# Only true multi-value fields are comma-split, matching Whoosh's
# KEYWORD(commas=True) fields (tag, tag_id, viewer_id). A field-agnostic
# match would corrupt unrelated text such as URLs (http://x/a,b) or
# numeric values (title:10,20). tag_id is ordered before tag so the
# longer field name wins.
query = regex.sub(
r"(\w+):([^\s\[\]]+(?:,[^\s\[\]]+)+)",
r"(?<!\w)(tag_id|viewer_id|tag):([^\s\[\]]+(?:,[^\s\[\]]+)+)",
_expand,
query,
timeout=_REGEX_TIMEOUT,
+22
View File
@@ -607,6 +607,28 @@ class TestNormalizeQuery:
def test_normalize_no_commas_unchanged(self) -> None:
    """A query that contains no commas is returned exactly as given."""
    plain_query = "bank statement"
    assert normalize_query(plain_query) == plain_query
def test_normalize_expands_multi_value_id_fields(self) -> None:
    """Comma lists on ``tag_id`` / ``viewer_id`` become AND-joined terms."""
    # tag_id and viewer_id were KEYWORD(commas=True) in Whoosh too.
    expansions = {
        "tag_id:1,2": "tag_id:1 AND tag_id:2",
        "viewer_id:5,6": "viewer_id:5 AND viewer_id:6",
    }
    for comma_query, and_query in expansions.items():
        assert normalize_query(comma_query) == and_query
@pytest.mark.parametrize(
    "query",
    [
        "http://example.com/a,b",
        "title:10,20",
        "correspondent:foo,bar",
        "content:a,b,c",
    ],
    ids=[
        "url_with_comma",
        "non_multivalue_field",
        "text_field_not_comma_split",
        "content_field",
    ],
)
def test_normalize_does_not_expand_non_multi_value_fields(
    self,
    query: str,
) -> None:
    """Comma-containing input outside the multi-value fields is left verbatim.

    Only true multi-value fields (tag/tag_id/viewer_id) comma-split, matching
    Whoosh's KEYWORD(commas=True) set. Everything else passes through
    unchanged.
    """
    normalized = normalize_query(query)
    assert normalized == query
@pytest.mark.parametrize(
("raw", "expected"),
[