mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-01 22:02:44 +00:00
Switch simple substring search to simple_search analyzer
This commit is contained in:
@@ -303,8 +303,10 @@ class TantivyBackend:
|
||||
doc.add_text("checksum", document.checksum)
|
||||
doc.add_text("title", document.title)
|
||||
doc.add_text("title_sort", document.title)
|
||||
doc.add_text("simple_title", document.title)
|
||||
doc.add_text("content", content)
|
||||
doc.add_text("bigram_content", content)
|
||||
doc.add_text("simple_content", content)
|
||||
|
||||
# Original filename - only add if not None/empty
|
||||
if document.original_filename:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import unicodedata
|
||||
from datetime import UTC
|
||||
from datetime import date
|
||||
from datetime import datetime
|
||||
@@ -51,7 +52,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
|
||||
)
|
||||
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
|
||||
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
|
||||
_SIMPLE_QUERY_SPECIAL_CHARS_RE = regex.compile(r'([+\-!(){}\[\]^"~*?:\\/])')
|
||||
_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
|
||||
|
||||
|
||||
def _fmt(dt: datetime) -> str:
|
||||
@@ -439,9 +440,38 @@ DEFAULT_SEARCH_FIELDS = [
|
||||
"note", # companion text field for notes content (notes JSON for structured: notes.user:x)
|
||||
"custom_field", # companion text field for CF values (custom_fields JSON for structured: custom_fields.name:x)
|
||||
]
|
||||
SIMPLE_SEARCH_FIELDS = ["title", "content"]
|
||||
TITLE_SEARCH_FIELDS = ["title"]
|
||||
SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"]
|
||||
TITLE_SEARCH_FIELDS = ["simple_title"]
|
||||
_FIELD_BOOSTS = {"title": 2.0}
|
||||
_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
|
||||
|
||||
|
||||
def _normalize_simple_token(token: str) -> str:
|
||||
return (
|
||||
unicodedata.normalize("NFD", token.lower())
|
||||
.encode(
|
||||
"ascii",
|
||||
"ignore",
|
||||
)
|
||||
.decode()
|
||||
)
|
||||
|
||||
|
||||
def _build_simple_field_query(
    index: tantivy.Index,
    field: str,
    tokens: list[str],
) -> tantivy.Query:
    """Build a substring-matching query over one field from normalized tokens.

    Each token is wrapped as ``.*token.*`` (regex-escaped) so it matches
    anywhere inside an indexed term. A single token becomes a plain regex
    query; multiple tokens become a regex phrase query, i.e. each pattern
    must match consecutive terms in order. A static per-field boost from
    ``_SIMPLE_FIELD_BOOSTS`` is applied when one is configured.
    """
    patterns = [f".*{regex.escape(token)}.*" for token in tokens]
    if len(patterns) == 1:
        query = tantivy.Query.regex_query(index.schema, field, patterns[0])
    else:
        # Ordered phrase of substring regexes across adjacent terms.
        query = tantivy.Query.regex_phrase_query(index.schema, field, patterns)

    boost = _SIMPLE_FIELD_BOOSTS.get(field, 1.0)
    if boost != 1.0:
        return tantivy.Query.boost_query(query, boost)
    return query
|
||||
|
||||
|
||||
def parse_user_query(
|
||||
@@ -512,20 +542,21 @@ def parse_simple_query(
|
||||
|
||||
Query string is escaped and normalized to be treated as "simple" text query.
|
||||
"""
|
||||
# strips special characters that would be interpreted as syntax by the parser
|
||||
query_str = regex.sub(
|
||||
_SIMPLE_QUERY_SPECIAL_CHARS_RE,
|
||||
r"\\\1",
|
||||
raw_query,
|
||||
timeout=_REGEX_TIMEOUT,
|
||||
)
|
||||
# collapse multiple spaces to a single space for cleaner parsing (and to prevent ReDoS on excessive whitespace)
|
||||
query_str = regex.sub(r" {2,}", " ", query_str, timeout=_REGEX_TIMEOUT).strip()
|
||||
return index.parse_query(
|
||||
query_str,
|
||||
fields,
|
||||
field_boosts=_FIELD_BOOSTS,
|
||||
)
|
||||
tokens = [
|
||||
_normalize_simple_token(token)
|
||||
for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
|
||||
]
|
||||
tokens = [token for token in tokens if token]
|
||||
if not tokens:
|
||||
return tantivy.Query.empty_query()
|
||||
|
||||
field_queries = [
|
||||
(tantivy.Occur.Should, _build_simple_field_query(index, field, tokens))
|
||||
for field in fields
|
||||
]
|
||||
if len(field_queries) == 1:
|
||||
return field_queries[0][1]
|
||||
return tantivy.Query.boolean_query(field_queries)
|
||||
|
||||
|
||||
def parse_simple_text_query(
|
||||
|
||||
@@ -53,6 +53,18 @@ def build_schema() -> tantivy.Schema:
|
||||
# CJK support - not stored, indexed only
|
||||
sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")
|
||||
|
||||
# Simple substring search support for title/content - not stored, indexed only
|
||||
sb.add_text_field(
|
||||
"simple_title",
|
||||
stored=False,
|
||||
tokenizer_name="simple_search_analyzer",
|
||||
)
|
||||
sb.add_text_field(
|
||||
"simple_content",
|
||||
stored=False,
|
||||
tokenizer_name="simple_search_analyzer",
|
||||
)
|
||||
|
||||
# Autocomplete prefix scan - stored, not indexed
|
||||
sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")
|
||||
|
||||
|
||||
@@ -70,6 +70,7 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||
index.register_tokenizer("paperless_text", _paperless_text(language))
|
||||
index.register_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
|
||||
index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
|
||||
# Fast-field tokenizer required for fast=True text fields in the schema
|
||||
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
|
||||
|
||||
@@ -114,3 +115,15 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
|
||||
.filter(tantivy.Filter.lowercase())
|
||||
.build()
|
||||
)
|
||||
|
||||
|
||||
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
    """Analyzer for the simple substring-search fields.

    Splits text into whole non-whitespace chunks (so punctuation stays inside
    a term), then lowercases and ASCII-folds each term. This matches the
    query-side token normalization, which lets ``.*fragment.*`` regex queries
    find substrings inside indexed terms.
    """
    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.regex(r"\S+"))
    builder = builder.filter(tantivy.Filter.lowercase())
    builder = builder.filter(tantivy.Filter.ascii_fold())
    return builder.build()
|
||||
|
||||
@@ -117,6 +117,122 @@ class TestSearch:
|
||||
)
|
||||
assert title_match.total == 1
|
||||
|
||||
    def test_text_mode_matches_partial_term_substrings(
        self,
        backend: TantivyBackend,
    ):
        """Simple text mode should support substring matching within tokens."""
        doc = Document.objects.create(
            title="Account access",
            content="password reset instructions",
            checksum="TXT3",
            pk=11,
        )
        backend.add_or_update(doc)

        # Prefix fragment: "pass" at the start of "password".
        prefix_match = backend.search(
            "pass",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )
        assert prefix_match.total == 1

        # Infix fragment: "sswo" in the middle of "password".
        infix_match = backend.search(
            "sswo",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )
        assert infix_match.total == 1

        # Two fragments spanning adjacent terms: "sswo" + "re"
        # should match the consecutive tokens "password reset".
        phrase_match = backend.search(
            "sswo re",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )
        assert phrase_match.total == 1
||||
    def test_text_mode_does_not_match_on_partial_term_overlap(
        self,
        backend: TantivyBackend,
    ):
        """Simple text mode should not match documents that merely share partial fragments."""
        doc = Document.objects.create(
            title="Adobe Acrobat PDF Files",
            content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            checksum="TXT7",
            pk=13,
        )
        backend.add_or_update(doc)

        # "raptor" shares letters with "Acrobat" but is not a substring of
        # any single indexed term, so it must not match.
        non_match = backend.search(
            "raptor",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )
        assert non_match.total == 0
||||
    def test_title_mode_matches_partial_term_substrings(
        self,
        backend: TantivyBackend,
    ):
        """Title mode should support substring matching within title tokens."""
        doc = Document.objects.create(
            title="Password guide",
            content="reset instructions",
            checksum="TXT4",
            pk=12,
        )
        backend.add_or_update(doc)

        # Prefix fragment of the title term "Password".
        prefix_match = backend.search(
            "pass",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        )
        assert prefix_match.total == 1

        # Infix fragment inside "Password".
        infix_match = backend.search(
            "sswo",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        )
        assert infix_match.total == 1

        # Fragments across adjacent title terms: "Password guide".
        phrase_match = backend.search(
            "sswo gu",
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        )
        assert phrase_match.total == 1
||||
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
|
||||
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
|
||||
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
|
||||
|
||||
@@ -8,6 +8,7 @@ import tantivy
|
||||
|
||||
from documents.search._tokenizer import _bigram_analyzer
|
||||
from documents.search._tokenizer import _paperless_text
|
||||
from documents.search._tokenizer import _simple_search_analyzer
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -41,6 +42,20 @@ class TestTokenizers:
|
||||
idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
|
||||
return idx
|
||||
|
||||
    @pytest.fixture
    def simple_search_index(self) -> tantivy.Index:
        """Index with simple-search field for Latin substring tests."""
        sb = tantivy.SchemaBuilder()
        # Indexed-only field using the substring-search analyzer under test.
        sb.add_text_field(
            "simple_content",
            stored=False,
            tokenizer_name="simple_search_analyzer",
        )
        schema = sb.build()
        # In-memory index (path=None); register the analyzer under the name
        # referenced by the schema field above.
        idx = tantivy.Index(schema, path=None)
        idx.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
        return idx
|
||||
def test_ascii_fold_finds_accented_content(
|
||||
self,
|
||||
content_index: tantivy.Index,
|
||||
@@ -66,6 +81,24 @@ class TestTokenizers:
|
||||
q = bigram_index.parse_query("東京", ["bigram_content"])
|
||||
assert bigram_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
    def test_simple_search_analyzer_supports_regex_substrings(
        self,
        simple_search_index: tantivy.Index,
    ) -> None:
        """Whitespace-preserving simple search analyzer supports substring regex matching."""
        writer = simple_search_index.writer()
        doc = tantivy.Document()
        # Terms contain ":" and "-", which word-splitting tokenizers would
        # break apart; the \S+ tokenizer must keep each chunk whole.
        doc.add_text("simple_content", "tag:invoice password-reset")
        writer.add_document(doc)
        writer.commit()
        simple_search_index.reload()
        # ".*sswo.*" should match inside the single term "password-reset".
        q = tantivy.Query.regex_query(
            simple_search_index.schema,
            "simple_content",
            ".*sswo.*",
        )
        assert simple_search_index.searcher().search(q, limit=5).count == 1
||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||
sb = tantivy.SchemaBuilder()
|
||||
|
||||
@@ -119,6 +119,47 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["count"], 0)
|
||||
|
||||
    def test_simple_text_search_matches_substrings(self) -> None:
        """The ``?text=`` filter should match substrings of indexed terms via the API."""
        matching_doc = Document.objects.create(
            title="Quarterly summary",
            content="Password reset instructions",
            checksum="T5",
            pk=15,
        )

        backend = get_backend()
        backend.add_or_update(matching_doc)

        # Prefix fragment of "Password".
        response = self.client.get("/api/documents/?text=pass")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], matching_doc.id)

        # Infix fragment of "Password".
        response = self.client.get("/api/documents/?text=sswo")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], matching_doc.id)

        # Fragments spanning the adjacent terms "Password reset".
        response = self.client.get("/api/documents/?text=sswo re")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
||||
    def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
        """``?text=`` must not match documents that only share scattered letters."""
        non_matching_doc = Document.objects.create(
            title="Adobe Acrobat PDF Files",
            content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            checksum="T7",
            pk=17,
        )

        backend = get_backend()
        backend.add_or_update(non_matching_doc)

        # "raptor" is not a substring of any single indexed term.
        response = self.client.get("/api/documents/?text=raptor")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 0)
||||
def test_simple_title_search(self) -> None:
|
||||
title_match = Document.objects.create(
|
||||
title="Quarterly summary",
|
||||
@@ -142,6 +183,32 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(response.data["results"][0]["id"], title_match.id)
|
||||
|
||||
    def test_simple_title_search_matches_substrings(self) -> None:
        """The ``?title_search=`` filter should match substrings of title terms."""
        title_match = Document.objects.create(
            title="Password handbook",
            content="No matching content here",
            checksum="T6",
            pk=16,
        )

        backend = get_backend()
        backend.add_or_update(title_match)

        # Prefix fragment of the title term "Password".
        response = self.client.get("/api/documents/?title_search=pass")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], title_match.id)

        # Infix fragment of "Password".
        response = self.client.get("/api/documents/?title_search=sswo")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], title_match.id)

        # Fragments across the adjacent title terms "Password handbook".
        response = self.client.get("/api/documents/?title_search=sswo hand")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], title_match.id)
||||
def test_search_returns_all_for_api_version_9(self) -> None:
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
|
||||
Reference in New Issue
Block a user