mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-07 23:25:25 +00:00
Handle user queries with Tantivy operators like - or + in them
This commit is contained in:
committed by
Trenton H
parent
af0df43bac
commit
6ee3c62f97
@@ -76,6 +76,11 @@ _YEAR_RANGE_RE = regex.compile(
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
|
||||
# Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because
|
||||
# the NOT/MUST operators require no space between the operator and the term.
|
||||
# In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator.
|
||||
_SPACED_OPERATOR_RE = regex.compile(r"\s+[-+]\s+")
|
||||
_TRAILING_OPERATOR_RE = regex.compile(r"\s+[-+]+\s*$")
|
||||
|
||||
|
||||
def _fmt(dt: datetime) -> str:
|
||||
@@ -430,7 +435,14 @@ def normalize_query(query: str) -> str:
|
||||
query,
|
||||
timeout=_REGEX_TIMEOUT,
|
||||
)
|
||||
return regex.sub(r" {2,}", " ", query, timeout=_REGEX_TIMEOUT).strip()
|
||||
query = regex.sub(r" {2,}", " ", query, timeout=_REGEX_TIMEOUT).strip()
|
||||
# Strip trailing dangling operators before Tantivy sees them.
|
||||
query = _TRAILING_OPERATOR_RE.sub("", query, timeout=_REGEX_TIMEOUT).strip()
|
||||
# Replace " - " / " + " with a space: Tantivy requires no space between
|
||||
# the operator and its operand (-term / +term), so spaces on both sides
|
||||
# means this is a natural-language separator, not a query operator.
|
||||
query = _SPACED_OPERATOR_RE.sub(" ", query, timeout=_REGEX_TIMEOUT).strip()
|
||||
return query
|
||||
except TimeoutError: # pragma: no cover
|
||||
raise ValueError("Query too complex to process (normalization timed out)")
|
||||
|
||||
|
||||
@@ -443,6 +443,25 @@ class TestParseUserQuery:
|
||||
q = parse_user_query(query_index, "created:today", UTC)
|
||||
assert isinstance(q, tantivy.Query)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"raw_query",
|
||||
[
|
||||
pytest.param("h52.1 - kurzsichtigkeit", id="icd_code_dash_description"),
|
||||
pytest.param("H52.1 - asd", id="icd_code_uppercase"),
|
||||
pytest.param("h52.1 -", id="trailing_minus"),
|
||||
pytest.param(". -", id="dot_trailing_minus"),
|
||||
pytest.param("h52. -", id="partial_code_trailing_minus"),
|
||||
pytest.param(".12 -", id="dot_number_trailing_minus"),
|
||||
pytest.param("h52.1 - ku", id="partial_word_after_dash"),
|
||||
],
|
||||
)
|
||||
def test_spaced_dash_queries_do_not_raise(
|
||||
self,
|
||||
query_index: tantivy.Index,
|
||||
raw_query: str,
|
||||
) -> None:
|
||||
assert isinstance(parse_user_query(query_index, raw_query, UTC), tantivy.Query)
|
||||
|
||||
|
||||
class TestYearRangeRewriting:
|
||||
"""Whoosh-style year-only date ranges must be rewritten to ISO 8601."""
|
||||
@@ -548,6 +567,61 @@ class TestNormalizeQuery:
|
||||
def test_normalize_no_commas_unchanged(self) -> None:
|
||||
assert normalize_query("bank statement") == "bank statement"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("raw", "expected"),
|
||||
[
|
||||
pytest.param(
|
||||
"h52.1 - kurzsichtigkeit",
|
||||
"h52.1 kurzsichtigkeit",
|
||||
id="icd_code_dash_description",
|
||||
),
|
||||
pytest.param(
|
||||
"H52.1 - asd",
|
||||
"H52.1 asd",
|
||||
id="icd_code_uppercase_dash",
|
||||
),
|
||||
pytest.param(
|
||||
"h52.1 -",
|
||||
"h52.1",
|
||||
id="trailing_minus",
|
||||
),
|
||||
pytest.param(
|
||||
". -",
|
||||
".",
|
||||
id="dot_trailing_minus",
|
||||
),
|
||||
pytest.param(
|
||||
"h52. -",
|
||||
"h52.",
|
||||
id="partial_code_trailing_minus",
|
||||
),
|
||||
pytest.param(
|
||||
"foo - bar - baz",
|
||||
"foo bar baz",
|
||||
id="multiple_dashes",
|
||||
),
|
||||
pytest.param(
|
||||
"foo + bar",
|
||||
"foo bar",
|
||||
id="spaced_plus_operator",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_normalize_strips_dangling_operators(self, raw: str, expected: str) -> None:
|
||||
assert normalize_query(raw) == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"query",
|
||||
[
|
||||
pytest.param("term -other", id="adjacent_not_operator"),
|
||||
pytest.param("-term", id="leading_not_operator"),
|
||||
pytest.param("+term", id="leading_must_operator"),
|
||||
pytest.param("foo -bar +baz", id="mixed_adjacent_operators"),
|
||||
],
|
||||
)
|
||||
def test_normalize_preserves_valid_operators(self, query: str) -> None:
|
||||
assert normalize_query(query) == query
|
||||
|
||||
|
||||
class TestPermissionFilter:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user