Enhancement: unify text search to use tantivy (#12485)

This commit is contained in:
shamoon
2026-04-03 13:53:45 -07:00
committed by GitHub
parent f32ad98d8e
commit 566afdffca
29 changed files with 1019 additions and 97 deletions
+253
View File
@@ -5,6 +5,7 @@ from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import Note
from documents.search._backend import SearchMode
from documents.search._backend import TantivyBackend
from documents.search._backend import get_backend
from documents.search._backend import reset_backend
@@ -46,6 +47,258 @@ class TestWriteBatch:
class TestSearch:
"""Test search functionality."""
def test_text_mode_limits_default_search_to_title_and_content(
    self,
    backend: TantivyBackend,
):
    """Text mode must ignore field-qualified metadata queries but match content."""
    backend.add_or_update(
        Document.objects.create(
            title="Invoice document",
            content="monthly statement",
            checksum="TXT1",
            pk=9,
        ),
    )

    def total_for(query: str):
        # Both searches share the same paging, sorting and mode arguments.
        return backend.search(
            query,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        ).total

    # A "field:value" style query must not produce a hit in text mode.
    assert total_for("document_type:invoice") == 0
    # A plain word taken from the document content must produce exactly one hit.
    assert total_for("monthly") == 1
def test_title_mode_limits_default_search_to_title_only(
    self,
    backend: TantivyBackend,
):
    """Title mode must match title terms and ignore terms found only in content."""
    indexed = Document.objects.create(
        title="Invoice document",
        content="monthly statement",
        checksum="TXT2",
        pk=10,
    )
    backend.add_or_update(indexed)

    # Arguments common to both title-mode searches below.
    title_search_args = {
        "user": None,
        "page": 1,
        "page_size": 10,
        "sort_field": None,
        "sort_reverse": False,
        "search_mode": SearchMode.TITLE,
    }

    # "monthly" occurs only in the content, so title mode must not find it.
    assert backend.search("monthly", **title_search_args).total == 0
    # "invoice" occurs in the title and must be found.
    assert backend.search("invoice", **title_search_args).total == 1
def test_text_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Text mode should find a document from prefix, infix and multi-fragment queries."""
    backend.add_or_update(
        Document.objects.create(
            title="Account access",
            content="password reset instructions",
            checksum="TXT3",
            pk=11,
        ),
    )

    # In order: a prefix of "password", an infix of "password", and that
    # infix followed by the beginning of "reset".
    for fragment in ("pass", "sswo", "sswo re"):
        found = backend.search(
            fragment,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )
        assert found.total == 1
def test_text_mode_does_not_match_on_partial_term_overlap(
    self,
    backend: TantivyBackend,
):
    """Text mode must not match a document that only shares fragments with the query."""
    backend.add_or_update(
        Document.objects.create(
            title="Adobe Acrobat PDF Files",
            content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
            checksum="TXT7",
            pk=13,
        ),
    )

    # "raptor" is not a substring of any word in the title or content above.
    outcome = backend.search(
        "raptor",
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )
    assert outcome.total == 0
def test_text_mode_anchors_later_query_tokens_to_token_starts(
    self,
    backend: TantivyBackend,
):
    """A trailing query token must match at the start of a word, not in its middle."""
    # (title, checksum, pk) for the exact match, the prefix match and the
    # document whose number merely contains the digit.
    fixtures = (
        ("Z-Berichte 6", "TXT9", 15),
        ("Z-Berichte 60", "TXT10", 16),
        ("Z-Berichte 16", "TXT11", 17),
    )
    exact_doc, prefix_doc, false_positive = [
        Document.objects.create(
            title=title,
            content="monthly report",
            checksum=checksum,
            pk=pk,
        )
        for title, checksum, pk in fixtures
    ]
    for doc in (exact_doc, prefix_doc, false_positive):
        backend.add_or_update(doc)

    outcome = backend.search(
        "Z-Berichte 6",
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )
    matched_ids = {entry["id"] for entry in outcome.hits}

    # "6" and "60" start with the query token; "16" only contains it.
    assert exact_doc.id in matched_ids
    assert prefix_doc.id in matched_ids
    assert false_positive.id not in matched_ids
def test_text_mode_ignores_queries_without_searchable_tokens(
    self,
    backend: TantivyBackend,
):
    """A symbol-only query in text mode should yield zero hits."""
    backend.add_or_update(
        Document.objects.create(
            title="Guide",
            content="This is a guide.",
            checksum="TXT8",
            pk=14,
        ),
    )

    # The query consists solely of punctuation.
    outcome = backend.search(
        "!!!",
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )
    assert outcome.total == 0
def test_title_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Title mode should find a document from prefix, infix and multi-fragment queries."""
    backend.add_or_update(
        Document.objects.create(
            title="Password guide",
            content="reset instructions",
            checksum="TXT4",
            pk=12,
        ),
    )

    # In order: a prefix of "Password", an infix of "Password", and that
    # infix followed by the beginning of "guide".
    for fragment in ("pass", "sswo", "sswo gu"):
        found = backend.search(
            fragment,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        )
        assert found.total == 1
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
@@ -8,6 +8,7 @@ import tantivy
from documents.search._tokenizer import _bigram_analyzer
from documents.search._tokenizer import _paperless_text
from documents.search._tokenizer import _simple_search_analyzer
from documents.search._tokenizer import register_tokenizers
if TYPE_CHECKING:
@@ -41,6 +42,20 @@ class TestTokenizers:
idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
return idx
@pytest.fixture
def simple_search_index(self) -> tantivy.Index:
    """Provide an index with one unstored text field using the simple-search analyzer."""
    analyzer_name = "simple_search_analyzer"

    builder = tantivy.SchemaBuilder()
    builder.add_text_field(
        "simple_content",
        stored=False,
        tokenizer_name=analyzer_name,
    )

    index = tantivy.Index(builder.build(), path=None)
    # The field refers to the analyzer by name, so register it under that name.
    index.register_tokenizer(analyzer_name, _simple_search_analyzer())
    return index
def test_ascii_fold_finds_accented_content(
self,
content_index: tantivy.Index,
@@ -66,6 +81,24 @@ class TestTokenizers:
q = bigram_index.parse_query("東京", ["bigram_content"])
assert bigram_index.searcher().search(q, limit=5).count == 1
def test_simple_search_analyzer_supports_regex_substrings(
    self,
    simple_search_index: tantivy.Index,
) -> None:
    """A regex query for an inner fragment matches text indexed by the simple analyzer."""
    indexed = tantivy.Document()
    indexed.add_text("simple_content", "tag:invoice password-reset")

    index_writer = simple_search_index.writer()
    index_writer.add_document(indexed)
    index_writer.commit()
    # Refresh the index before a searcher is requested below.
    simple_search_index.reload()

    # "sswo" sits inside "password"; the surrounding ".*" permit any context.
    substring_query = tantivy.Query.regex_query(
        simple_search_index.schema,
        "simple_content",
        ".*sswo.*",
    )
    outcome = simple_search_index.searcher().search(substring_query, limit=5)
    assert outcome.count == 1
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
"""Unsupported language codes should log a warning and disable stemming gracefully."""
sb = tantivy.SchemaBuilder()
+154
View File
@@ -91,6 +91,135 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data["count"], 0)
self.assertEqual(len(results), 0)
def test_simple_text_search(self) -> None:
    """The ``text`` parameter matches content words but not a tag-style query."""
    invoice_tag = Tag.objects.create(name="invoice")

    content_hit = Document.objects.create(
        title="Quarterly summary",
        content="Monthly bank report",
        checksum="T1",
        pk=11,
    )
    content_hit.tags.add(invoice_tag)

    tag_only = Document.objects.create(
        title="Completely unrelated",
        content="No matching terms here",
        checksum="T2",
        pk=12,
    )
    tag_only.tags.add(invoice_tag)

    index = get_backend()
    for doc in (content_hit, tag_only):
        index.add_or_update(doc)

    # Only the first document has "monthly" in its content.
    by_content = self.client.get("/api/documents/?text=monthly")
    self.assertEqual(by_content.status_code, status.HTTP_200_OK)
    self.assertEqual(by_content.data["count"], 1)
    self.assertEqual(by_content.data["results"][0]["id"], content_hit.id)

    # Both documents carry the tag, yet a "tag:" query must return nothing.
    by_tag_syntax = self.client.get("/api/documents/?text=tag:invoice")
    self.assertEqual(by_tag_syntax.status_code, status.HTTP_200_OK)
    self.assertEqual(by_tag_syntax.data["count"], 0)
def test_simple_text_search_matches_substrings(self) -> None:
    """The ``text`` parameter finds a document from prefix, infix and multi-fragment queries."""
    target = Document.objects.create(
        title="Quarterly summary",
        content="Password reset instructions",
        checksum="T5",
        pk=15,
    )
    get_backend().add_or_update(target)

    # Prefix of "Password", infix of "Password", infix plus the start of "reset".
    urls = (
        "/api/documents/?text=pass",
        "/api/documents/?text=sswo",
        "/api/documents/?text=sswo re",
    )
    for url in urls:
        reply = self.client.get(url)
        self.assertEqual(reply.status_code, status.HTTP_200_OK)
        self.assertEqual(reply.data["count"], 1)
        self.assertEqual(reply.data["results"][0]["id"], target.id)
def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
    """The ``text`` parameter must not match a document that only shares fragments."""
    unrelated = Document.objects.create(
        title="Adobe Acrobat PDF Files",
        content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        checksum="T7",
        pk=17,
    )
    get_backend().add_or_update(unrelated)

    # "raptor" is not a substring of any word in the title or content above.
    reply = self.client.get("/api/documents/?text=raptor")

    self.assertEqual(reply.status_code, status.HTTP_200_OK)
    self.assertEqual(reply.data["count"], 0)
def test_simple_title_search(self) -> None:
    """The ``title_search`` parameter returns only documents whose title matches."""
    in_title = Document.objects.create(
        title="Quarterly summary",
        content="No matching content here",
        checksum="T3",
        pk=13,
    )
    in_content = Document.objects.create(
        title="Completely unrelated",
        content="Quarterly summary appears only in content",
        checksum="T4",
        pk=14,
    )

    index = get_backend()
    for doc in (in_title, in_content):
        index.add_or_update(doc)

    reply = self.client.get("/api/documents/?title_search=quarterly")

    # Both documents contain the word, but only one has it in the title.
    self.assertEqual(reply.status_code, status.HTTP_200_OK)
    self.assertEqual(reply.data["count"], 1)
    self.assertEqual(reply.data["results"][0]["id"], in_title.id)
def test_simple_title_search_matches_substrings(self) -> None:
    """The ``title_search`` parameter finds a title from prefix, infix and multi-fragment queries."""
    target = Document.objects.create(
        title="Password handbook",
        content="No matching content here",
        checksum="T6",
        pk=16,
    )
    get_backend().add_or_update(target)

    # Prefix of "Password", infix of "Password", infix plus the start of "handbook".
    urls = (
        "/api/documents/?title_search=pass",
        "/api/documents/?title_search=sswo",
        "/api/documents/?title_search=sswo hand",
    )
    for url in urls:
        reply = self.client.get(url)
        self.assertEqual(reply.status_code, status.HTTP_200_OK)
        self.assertEqual(reply.data["count"], 1)
        self.assertEqual(reply.data["results"][0]["id"], target.id)
def test_search_rejects_multiple_search_modes(self) -> None:
    """Supplying both ``text`` and ``query`` must be rejected with HTTP 400."""
    expected_detail = "Specify only one of text, title_search, query, or more_like_id."

    rejected = self.client.get("/api/documents/?text=bank&query=bank")

    self.assertEqual(rejected.status_code, status.HTTP_400_BAD_REQUEST)
    self.assertEqual(rejected.data["detail"], expected_detail)
def test_search_returns_all_for_api_version_9(self) -> None:
d1 = Document.objects.create(
title="invoice",
@@ -1493,6 +1622,31 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id)
self.assertEqual(results["workflows"][0]["id"], workflow1.id)
def test_global_search_db_only_limits_documents_to_title_matches(self) -> None:
    """Global search with ``db_only=true`` returns only documents matching by title."""
    in_title = Document.objects.create(
        title="bank statement",
        content="no additional terms",
        checksum="GS1",
        pk=21,
    )
    in_content = Document.objects.create(
        title="not a title match",
        content="bank appears only in content",
        checksum="GS2",
        pk=22,
    )

    index = get_backend()
    for doc in (in_title, in_content):
        index.add_or_update(doc)

    self.client.force_authenticate(self.user)
    reply = self.client.get("/api/search/?query=bank&db_only=true")

    # "bank" is in one title and in the other document's content only.
    self.assertEqual(reply.status_code, status.HTTP_200_OK)
    found_documents = reply.data["documents"]
    self.assertEqual(len(found_documents), 1)
    self.assertEqual(found_documents[0]["id"], in_title.id)
def test_global_search_filters_owned_mail_objects(self) -> None:
user1 = User.objects.create_user("mail-search-user")
user2 = User.objects.create_user("other-mail-search-user")