Switch simple substring search to simple_search analyzer

This commit is contained in:
shamoon
2026-04-01 12:22:16 -07:00
parent 7c98d29de2
commit 3539f3f66a
7 changed files with 291 additions and 17 deletions

View File

@@ -303,8 +303,10 @@ class TantivyBackend:
doc.add_text("checksum", document.checksum)
doc.add_text("title", document.title)
doc.add_text("title_sort", document.title)
doc.add_text("simple_title", document.title)
doc.add_text("content", content)
doc.add_text("bigram_content", content)
doc.add_text("simple_content", content)
# Original filename - only add if not None/empty
if document.original_filename:

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import unicodedata
from datetime import UTC
from datetime import date
from datetime import datetime
@@ -51,7 +52,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
)
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
# Characters treated as query syntax; every match is captured in group 1 so that
# a substitution can put a backslash in front of it.
_SIMPLE_QUERY_SPECIAL_CHARS_RE = regex.compile(r'([+\-!(){}\[\]^"~*?:\\/])')
# One match per run of non-whitespace characters, i.e. per whitespace-delimited
# chunk of a raw simple query.
_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
def _fmt(dt: datetime) -> str:
@@ -439,9 +440,38 @@ DEFAULT_SEARCH_FIELDS = [
"note", # companion text field for notes content (notes JSON for structured: notes.user:x)
"custom_field", # companion text field for CF values (custom_fields JSON for structured: custom_fields.name:x)
]
# Earlier field lists; both names are reassigned immediately below, so these
# values are superseded.
SIMPLE_SEARCH_FIELDS = ["title", "content"]
TITLE_SEARCH_FIELDS = ["title"]
# Fields searched by simple text / title queries. The "simple_*" fields are the
# ones declared with the "simple_search_analyzer" tokenizer in the schema.
SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"]
TITLE_SEARCH_FIELDS = ["simple_title"]
# Score multipliers handed to index.parse_query() as field_boosts.
_FIELD_BOOSTS = {"title": 2.0}
# Score multipliers looked up per field when a simple substring query is built;
# fields that are absent default to 1.0 (no boost).
_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
def _normalize_simple_token(token: str) -> str:
return (
unicodedata.normalize("NFD", token.lower())
.encode(
"ascii",
"ignore",
)
.decode()
)
def _build_simple_field_query(
    index: tantivy.Index,
    field: str,
    tokens: list[str],
) -> tantivy.Query:
    """Build the substring query for ``tokens`` against a single field.

    Every token is regex-escaped and wrapped in ``.*`` on both sides. Exactly
    one token yields a plain regex query; any other number of tokens yields a
    regex phrase query over all patterns. The result is wrapped in a boost
    query when ``_SIMPLE_FIELD_BOOSTS`` holds a value other than 1.0 for the
    field.
    """
    substring_patterns = [".*" + regex.escape(tok) + ".*" for tok in tokens]

    if len(substring_patterns) != 1:
        field_query = tantivy.Query.regex_phrase_query(
            index.schema,
            field,
            substring_patterns,
        )
    else:
        field_query = tantivy.Query.regex_query(
            index.schema,
            field,
            substring_patterns[0],
        )

    weight = _SIMPLE_FIELD_BOOSTS.get(field, 1.0)
    if weight == 1.0:
        return field_query
    return tantivy.Query.boost_query(field_query, weight)
def parse_user_query(
@@ -512,20 +542,21 @@ def parse_simple_query(
Query string is escaped and normalized to be treated as "simple" text query.
"""
# escapes special characters that would otherwise be interpreted as syntax by the parser
query_str = regex.sub(
_SIMPLE_QUERY_SPECIAL_CHARS_RE,
r"\\\1",
raw_query,
timeout=_REGEX_TIMEOUT,
)
# collapse multiple spaces to a single space for cleaner parsing (and to prevent ReDoS on excessive whitespace)
query_str = regex.sub(r" {2,}", " ", query_str, timeout=_REGEX_TIMEOUT).strip()
return index.parse_query(
query_str,
fields,
field_boosts=_FIELD_BOOSTS,
)
tokens = [
_normalize_simple_token(token)
for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
]
tokens = [token for token in tokens if token]
if not tokens:
return tantivy.Query.empty_query()
field_queries = [
(tantivy.Occur.Should, _build_simple_field_query(index, field, tokens))
for field in fields
]
if len(field_queries) == 1:
return field_queries[0][1]
return tantivy.Query.boolean_query(field_queries)
def parse_simple_text_query(

View File

@@ -53,6 +53,18 @@ def build_schema() -> tantivy.Schema:
# CJK support - not stored, indexed only
sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")
# Simple substring search support for title/content - not stored, indexed only
sb.add_text_field(
"simple_title",
stored=False,
tokenizer_name="simple_search_analyzer",
)
sb.add_text_field(
"simple_content",
stored=False,
tokenizer_name="simple_search_analyzer",
)
# Autocomplete prefix scan - stored, not indexed
sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

View File

@@ -70,6 +70,7 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
index.register_tokenizer("paperless_text", _paperless_text(language))
index.register_tokenizer("simple_analyzer", _simple_analyzer())
index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
# Fast-field tokenizer required for fast=True text fields in the schema
index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())
@@ -114,3 +115,15 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
.filter(tantivy.Filter.lowercase())
.build()
)
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
    """Analyzer for the simple substring-search fields.

    Pipeline: split into runs of non-whitespace characters, then lowercase,
    then ASCII-fold.
    """
    builder = tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.regex(r"\S+"))
    builder = builder.filter(tantivy.Filter.lowercase())
    builder = builder.filter(tantivy.Filter.ascii_fold())
    return builder.build()

View File

@@ -117,6 +117,122 @@ class TestSearch:
)
assert title_match.total == 1
def test_text_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Text mode finds a document from fragments of its content tokens."""
    backend.add_or_update(
        Document.objects.create(
            title="Account access",
            content="password reset instructions",
            checksum="TXT3",
            pk=11,
        ),
    )

    # In order: a token prefix, a token infix, fragments of two adjacent tokens.
    for fragment in ("pass", "sswo", "sswo re"):
        hits = backend.search(
            fragment,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )
        assert hits.total == 1
def test_text_mode_does_not_match_on_partial_term_overlap(
    self,
    backend: TantivyBackend,
):
    """A query whose letters only appear scattered across tokens must not match."""
    backend.add_or_update(
        Document.objects.create(
            title="Adobe Acrobat PDF Files",
            content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            checksum="TXT7",
            pk=13,
        ),
    )

    hits = backend.search(
        "raptor",
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )

    assert hits.total == 0
def test_title_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Title mode finds a document from fragments of its title tokens."""
    backend.add_or_update(
        Document.objects.create(
            title="Password guide",
            content="reset instructions",
            checksum="TXT4",
            pk=12,
        ),
    )

    def title_hits(query: str) -> int:
        # Same search parameters for every probe; only the query text varies.
        return backend.search(
            query,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        ).total

    assert title_hits("pass") == 1  # token prefix
    assert title_hits("sswo") == 1  # token infix
    assert title_hits("sswo gu") == 1  # fragments of two adjacent tokens
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):

View File

@@ -8,6 +8,7 @@ import tantivy
from documents.search._tokenizer import _bigram_analyzer
from documents.search._tokenizer import _paperless_text
from documents.search._tokenizer import _simple_search_analyzer
from documents.search._tokenizer import register_tokenizers
if TYPE_CHECKING:
@@ -41,6 +42,20 @@ class TestTokenizers:
idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
return idx
@pytest.fixture
def simple_search_index(self) -> tantivy.Index:
    """Index with a single ``simple_content`` field using the simple-search analyzer."""
    builder = tantivy.SchemaBuilder()
    builder.add_text_field(
        "simple_content",
        stored=False,
        tokenizer_name="simple_search_analyzer",
    )
    index = tantivy.Index(builder.build(), path=None)
    # The schema only names the tokenizer; it has to be registered on the index.
    index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
    return index
def test_ascii_fold_finds_accented_content(
self,
content_index: tantivy.Index,
@@ -66,6 +81,24 @@ class TestTokenizers:
q = bigram_index.parse_query("東京", ["bigram_content"])
assert bigram_index.searcher().search(q, limit=5).count == 1
def test_simple_search_analyzer_supports_regex_substrings(
    self,
    simple_search_index: tantivy.Index,
) -> None:
    """A regex query for an in-token fragment matches text indexed by the simple-search analyzer."""
    index_writer = simple_search_index.writer()
    indexed = tantivy.Document()
    indexed.add_text("simple_content", "tag:invoice password-reset")
    index_writer.add_document(indexed)
    index_writer.commit()
    simple_search_index.reload()

    substring_query = tantivy.Query.regex_query(
        simple_search_index.schema,
        "simple_content",
        ".*sswo.*",
    )
    found = simple_search_index.searcher().search(substring_query, limit=5)

    assert found.count == 1
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
"""Unsupported language codes should log a warning and disable stemming gracefully."""
sb = tantivy.SchemaBuilder()

View File

@@ -119,6 +119,47 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["count"], 0)
def test_simple_text_search_matches_substrings(self) -> None:
    """The ``text`` filter returns the document for prefix, infix and two-fragment queries."""
    matching_doc = Document.objects.create(
        title="Quarterly summary",
        content="Password reset instructions",
        checksum="T5",
        pk=15,
    )
    get_backend().add_or_update(matching_doc)

    for url in (
        "/api/documents/?text=pass",
        "/api/documents/?text=sswo",
        "/api/documents/?text=sswo re",
    ):
        response = self.client.get(url)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
    """The ``text`` filter returns nothing when no token contains the query as a substring."""
    get_backend().add_or_update(
        Document.objects.create(
            title="Adobe Acrobat PDF Files",
            content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            checksum="T7",
            pk=17,
        ),
    )

    response = self.client.get("/api/documents/?text=raptor")

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 0)
def test_simple_title_search(self) -> None:
title_match = Document.objects.create(
title="Quarterly summary",
@@ -142,6 +183,32 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data["count"], 1)
self.assertEqual(response.data["results"][0]["id"], title_match.id)
def test_simple_title_search_matches_substrings(self) -> None:
    """The ``title_search`` filter returns the document for prefix, infix and two-fragment queries."""
    title_match = Document.objects.create(
        title="Password handbook",
        content="No matching content here",
        checksum="T6",
        pk=16,
    )
    get_backend().add_or_update(title_match)

    for url in (
        "/api/documents/?title_search=pass",
        "/api/documents/?title_search=sswo",
        "/api/documents/?title_search=sswo hand",
    ):
        response = self.client.get(url)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], title_match.id)
def test_search_returns_all_for_api_version_9(self) -> None:
d1 = Document.objects.create(
title="invoice",