mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-26 08:25:34 +00:00
Enhancement: unify text search to use tantivy (#12485)
This commit is contained in:
@@ -5,6 +5,7 @@ from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import Note
|
||||
from documents.search._backend import SearchMode
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.search._backend import get_backend
|
||||
from documents.search._backend import reset_backend
|
||||
@@ -46,6 +47,258 @@ class TestWriteBatch:
|
||||
class TestSearch:
|
||||
"""Test search functionality."""
|
||||
|
||||
def test_text_mode_limits_default_search_to_title_and_content(
    self,
    backend: TantivyBackend,
):
    """Text mode finds content words but not metadata-field query syntax.

    One document is indexed. Searching ``document_type:invoice`` in
    ``SearchMode.TEXT`` must yield zero hits, while the content word
    ``monthly`` must yield exactly one.
    """
    indexed = Document.objects.create(
        pk=9,
        checksum="TXT1",
        title="Invoice document",
        content="monthly statement",
    )
    backend.add_or_update(indexed)

    # Both searches share the same paging, sorting and mode arguments.
    shared_args = {
        "user": None,
        "page": 1,
        "page_size": 10,
        "sort_field": None,
        "sort_reverse": False,
        "search_mode": SearchMode.TEXT,
    }

    metadata_only = backend.search("document_type:invoice", **shared_args)
    assert metadata_only.total == 0

    content_match = backend.search("monthly", **shared_args)
    assert content_match.total == 1
|
||||
|
||||
def test_title_mode_limits_default_search_to_title_only(
    self,
    backend: TantivyBackend,
):
    """Title mode finds title words but not words that occur only in content.

    With a single indexed document, ``monthly`` (content only) must give zero
    hits in ``SearchMode.TITLE`` and ``invoice`` (in the title) exactly one.
    """

    def title_search(term):
        # Single place for the fixed paging/sorting arguments of this test.
        return backend.search(
            term,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TITLE,
        )

    indexed = Document.objects.create(
        pk=10,
        checksum="TXT2",
        title="Invoice document",
        content="monthly statement",
    )
    backend.add_or_update(indexed)

    content_only = title_search("monthly")
    assert content_only.total == 0

    title_match = title_search("invoice")
    assert title_match.total == 1
|
||||
|
||||
def test_text_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Text mode matches fragments of the content word "password".

    A prefix (``pass``), an infix (``sswo``) and a two-token query
    (``sswo re``) must each return the single indexed document.
    """
    indexed = Document.objects.create(
        pk=11,
        checksum="TXT3",
        title="Account access",
        content="password reset instructions",
    )
    backend.add_or_update(indexed)

    # Checked in the same order as before: prefix, infix, then phrase.
    for fragment in ("pass", "sswo", "sswo re"):
        outcome = backend.search(
            fragment,
            user=None,
            page=1,
            page_size=10,
            sort_field=None,
            sort_reverse=False,
            search_mode=SearchMode.TEXT,
        )
        assert outcome.total == 1
|
||||
|
||||
def test_text_mode_does_not_match_on_partial_term_overlap(
    self,
    backend: TantivyBackend,
):
    """Text mode returns nothing for ``raptor`` against an unrelated document.

    The indexed title and content do not contain ``raptor`` as a substring,
    so the search must report zero hits.
    """
    indexed = Document.objects.create(
        pk=13,
        checksum="TXT7",
        title="Adobe Acrobat PDF Files",
        content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    )
    backend.add_or_update(indexed)

    search_args = {
        "user": None,
        "page": 1,
        "page_size": 10,
        "sort_field": None,
        "sort_reverse": False,
        "search_mode": SearchMode.TEXT,
    }
    non_match = backend.search("raptor", **search_args)

    assert non_match.total == 0
|
||||
|
||||
def test_text_mode_anchors_later_query_tokens_to_token_starts(
    self,
    backend: TantivyBackend,
):
    """The query ``Z-Berichte 6`` must not match a title ending in ``16``.

    Titles ``Z-Berichte 6`` and ``Z-Berichte 60`` are expected in the hits;
    ``Z-Berichte 16`` (where ``6`` sits mid-token) must be absent.
    """
    # (title, checksum, pk) for each document, created in this order.
    specs = (
        ("Z-Berichte 6", "TXT9", 15),
        ("Z-Berichte 60", "TXT10", 16),
        ("Z-Berichte 16", "TXT11", 17),
    )
    exact_doc, prefix_doc, false_positive = (
        Document.objects.create(
            title=title,
            content="monthly report",
            checksum=checksum,
            pk=pk,
        )
        for title, checksum, pk in specs
    )
    for indexed in (exact_doc, prefix_doc, false_positive):
        backend.add_or_update(indexed)

    results = backend.search(
        "Z-Berichte 6",
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )
    result_ids = set()
    for hit in results.hits:
        result_ids.add(hit["id"])

    assert exact_doc.id in result_ids
    assert prefix_doc.id in result_ids
    assert false_positive.id not in result_ids
|
||||
|
||||
def test_text_mode_ignores_queries_without_searchable_tokens(
    self,
    backend: TantivyBackend,
):
    """A symbol-only query (``!!!``) in text mode yields zero hits."""
    indexed = Document.objects.create(
        pk=14,
        checksum="TXT8",
        title="Guide",
        content="This is a guide.",
    )
    backend.add_or_update(indexed)

    symbol_query = "!!!"
    no_tokens = backend.search(
        symbol_query,
        user=None,
        page=1,
        page_size=10,
        sort_field=None,
        sort_reverse=False,
        search_mode=SearchMode.TEXT,
    )

    assert no_tokens.total == 0
|
||||
|
||||
def test_title_mode_matches_partial_term_substrings(
    self,
    backend: TantivyBackend,
):
    """Title mode matches fragments of the title "Password guide".

    A prefix (``pass``), an infix (``sswo``) and a two-token query
    (``sswo gu``) must each return the single indexed document.
    """
    indexed = Document.objects.create(
        pk=12,
        checksum="TXT4",
        title="Password guide",
        content="reset instructions",
    )
    backend.add_or_update(indexed)

    title_args = {
        "user": None,
        "page": 1,
        "page_size": 10,
        "sort_field": None,
        "sort_reverse": False,
        "search_mode": SearchMode.TITLE,
    }

    prefix_match = backend.search("pass", **title_args)
    assert prefix_match.total == 1

    infix_match = backend.search("sswo", **title_args)
    assert infix_match.total == 1

    phrase_match = backend.search("sswo gu", **title_args)
    assert phrase_match.total == 1
|
||||
|
||||
def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
|
||||
"""Search scores must be normalized so top hit has score 1.0 for UI consistency."""
|
||||
for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
|
||||
|
||||
@@ -8,6 +8,7 @@ import tantivy
|
||||
|
||||
from documents.search._tokenizer import _bigram_analyzer
|
||||
from documents.search._tokenizer import _paperless_text
|
||||
from documents.search._tokenizer import _simple_search_analyzer
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -41,6 +42,20 @@ class TestTokenizers:
|
||||
idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
|
||||
return idx
|
||||
|
||||
@pytest.fixture
def simple_search_index(self) -> tantivy.Index:
    """Build a tantivy index with a single ``simple_content`` text field.

    The field is not stored and uses the ``simple_search_analyzer``
    tokenizer, which is registered on the index before it is returned.
    """
    analyzer_name = "simple_search_analyzer"

    builder = tantivy.SchemaBuilder()
    builder.add_text_field(
        "simple_content",
        stored=False,
        tokenizer_name=analyzer_name,
    )

    index = tantivy.Index(builder.build(), path=None)
    index.register_tokenizer(analyzer_name, _simple_search_analyzer())
    return index
|
||||
|
||||
def test_ascii_fold_finds_accented_content(
|
||||
self,
|
||||
content_index: tantivy.Index,
|
||||
@@ -66,6 +81,24 @@ class TestTokenizers:
|
||||
q = bigram_index.parse_query("東京", ["bigram_content"])
|
||||
assert bigram_index.searcher().search(q, limit=5).count == 1
|
||||
|
||||
def test_simple_search_analyzer_supports_regex_substrings(
    self,
    simple_search_index: tantivy.Index,
) -> None:
    """A ``.*sswo.*`` regex query finds text indexed via the simple analyzer.

    One document with the text ``tag:invoice password-reset`` is committed to
    ``simple_content``; the regex query must report exactly one match.
    """
    index = simple_search_index

    index_writer = index.writer()
    entry = tantivy.Document()
    entry.add_text("simple_content", "tag:invoice password-reset")
    index_writer.add_document(entry)
    index_writer.commit()
    # Reload so the searcher created below sees the committed document.
    index.reload()

    substring_query = tantivy.Query.regex_query(
        index.schema,
        "simple_content",
        ".*sswo.*",
    )
    found = index.searcher().search(substring_query, limit=5)
    assert found.count == 1
|
||||
|
||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||
sb = tantivy.SchemaBuilder()
|
||||
|
||||
@@ -91,6 +91,135 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.data["count"], 0)
|
||||
self.assertEqual(len(results), 0)
|
||||
|
||||
def test_simple_text_search(self) -> None:
    """``?text=`` matches document content and ignores ``tag:`` syntax.

    Two documents carry the tag "invoice", but only one contains "monthly".
    ``text=monthly`` must return just that document and ``text=tag:invoice``
    must return nothing.
    """
    invoice_tag = Tag.objects.create(name="invoice")

    matching_doc = Document.objects.create(
        pk=11,
        checksum="T1",
        title="Quarterly summary",
        content="Monthly bank report",
    )
    matching_doc.tags.add(invoice_tag)

    metadata_only_doc = Document.objects.create(
        pk=12,
        checksum="T2",
        title="Completely unrelated",
        content="No matching terms here",
    )
    metadata_only_doc.tags.add(invoice_tag)

    index = get_backend()
    for doc in (matching_doc, metadata_only_doc):
        index.add_or_update(doc)

    content_response = self.client.get("/api/documents/?text=monthly")
    self.assertEqual(content_response.status_code, status.HTTP_200_OK)
    self.assertEqual(content_response.data["count"], 1)
    first_hit = content_response.data["results"][0]
    self.assertEqual(first_hit["id"], matching_doc.id)

    tag_response = self.client.get("/api/documents/?text=tag:invoice")
    self.assertEqual(tag_response.status_code, status.HTTP_200_OK)
    self.assertEqual(tag_response.data["count"], 0)
|
||||
|
||||
def test_simple_text_search_matches_substrings(self) -> None:
    """``text`` search matches fragments of the content word "Password".

    A prefix (``pass``), an infix (``sswo``) and a two-token query
    (``sswo re``) must each return exactly the one indexed document.
    """
    matching_doc = Document.objects.create(
        title="Quarterly summary",
        content="Password reset instructions",
        checksum="T5",
        pk=15,
    )

    backend = get_backend()
    backend.add_or_update(matching_doc)

    for term in ("pass", "sswo", "sswo re"):
        # Pass the term as request data rather than embedding it in the path:
        # the client then URL-encodes it, so the query containing a space is
        # sent as a valid query string instead of a raw space in the URL.
        response = self.client.get("/api/documents/", {"text": term})
        failure_note = f"text={term!r}"
        self.assertEqual(response.status_code, status.HTTP_200_OK, failure_note)
        self.assertEqual(response.data["count"], 1, failure_note)
        self.assertEqual(
            response.data["results"][0]["id"],
            matching_doc.id,
            failure_note,
        )
|
||||
|
||||
def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
    """``?text=raptor`` returns no results for an unrelated indexed document."""
    unrelated = Document.objects.create(
        pk=17,
        checksum="T7",
        title="Adobe Acrobat PDF Files",
        content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    )
    get_backend().add_or_update(unrelated)

    response = self.client.get("/api/documents/?text=raptor")

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    result_count = response.data["count"]
    self.assertEqual(result_count, 0)
|
||||
|
||||
def test_simple_title_search(self) -> None:
    """``?title_search=`` matches titles only, not content.

    "Quarterly" appears in the title of one document and only in the content
    of the other; the request must return just the title match.
    """
    title_match = Document.objects.create(
        pk=13,
        checksum="T3",
        title="Quarterly summary",
        content="No matching content here",
    )
    content_only = Document.objects.create(
        pk=14,
        checksum="T4",
        title="Completely unrelated",
        content="Quarterly summary appears only in content",
    )

    index = get_backend()
    for doc in (title_match, content_only):
        index.add_or_update(doc)

    response = self.client.get("/api/documents/?title_search=quarterly")

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 1)
    top_result = response.data["results"][0]
    self.assertEqual(top_result["id"], title_match.id)
|
||||
|
||||
def test_simple_title_search_matches_substrings(self) -> None:
    """``title_search`` matches fragments of the title "Password handbook".

    A prefix (``pass``), an infix (``sswo``) and a two-token query
    (``sswo hand``) must each return exactly the one indexed document.
    """
    title_match = Document.objects.create(
        title="Password handbook",
        content="No matching content here",
        checksum="T6",
        pk=16,
    )

    backend = get_backend()
    backend.add_or_update(title_match)

    for term in ("pass", "sswo", "sswo hand"):
        # Pass the term as request data rather than embedding it in the path:
        # the client then URL-encodes it, so the query containing a space is
        # sent as a valid query string instead of a raw space in the URL.
        response = self.client.get("/api/documents/", {"title_search": term})
        failure_note = f"title_search={term!r}"
        self.assertEqual(response.status_code, status.HTTP_200_OK, failure_note)
        self.assertEqual(response.data["count"], 1, failure_note)
        self.assertEqual(
            response.data["results"][0]["id"],
            title_match.id,
            failure_note,
        )
|
||||
|
||||
def test_search_rejects_multiple_search_modes(self) -> None:
    """Supplying both ``text`` and ``query`` is rejected with HTTP 400."""
    expected_detail = "Specify only one of text, title_search, query, or more_like_id."

    response = self.client.get("/api/documents/?text=bank&query=bank")

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
    self.assertEqual(response.data["detail"], expected_detail)
|
||||
|
||||
def test_search_returns_all_for_api_version_9(self) -> None:
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
@@ -1493,6 +1622,31 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id)
|
||||
self.assertEqual(results["workflows"][0]["id"], workflow1.id)
|
||||
|
||||
def test_global_search_db_only_limits_documents_to_title_matches(self) -> None:
    """Global search with ``db_only=true`` returns only title-matching documents.

    "bank" is in the title of one document and only in the content of the
    other; the ``documents`` list of the response must contain just the
    former.
    """
    title_match = Document.objects.create(
        pk=21,
        checksum="GS1",
        title="bank statement",
        content="no additional terms",
    )
    content_only = Document.objects.create(
        pk=22,
        checksum="GS2",
        title="not a title match",
        content="bank appears only in content",
    )

    index = get_backend()
    for doc in (title_match, content_only):
        index.add_or_update(doc)

    self.client.force_authenticate(self.user)
    response = self.client.get("/api/search/?query=bank&db_only=true")

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(len(response.data["documents"]), 1)
    only_document = response.data["documents"][0]
    self.assertEqual(only_document["id"], title_match.id)
|
||||
|
||||
def test_global_search_filters_owned_mail_objects(self) -> None:
|
||||
user1 = User.objects.create_user("mail-search-user")
|
||||
user2 = User.objects.create_user("other-mail-search-user")
|
||||
|
||||
Reference in New Issue
Block a user