Enhancement: unify text search to use tantivy (#12485)

This commit is contained in:
shamoon
2026-04-03 13:53:45 -07:00
committed by GitHub
parent f32ad98d8e
commit 566afdffca
29 changed files with 1019 additions and 97 deletions
+154
View File
@@ -91,6 +91,135 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data["count"], 0)
self.assertEqual(len(results), 0)
def test_simple_text_search(self) -> None:
    """
    Check the ``text`` query parameter of the documents endpoint.

    Two documents carrying the same tag are indexed; only one of them
    contains the word "monthly" in its content. The test expects:

    - ``text=monthly`` returns exactly that one document.
    - ``text=tag:invoice`` returns nothing, even though both documents
      are tagged "invoice" (presumably because ``text`` is not treated
      as a structured field query -- confirm against the view code).
    """
    invoice_tag = Tag.objects.create(name="invoice")

    doc_with_term = Document.objects.create(
        pk=11,
        checksum="T1",
        title="Quarterly summary",
        content="Monthly bank report",
    )
    doc_with_term.tags.add(invoice_tag)

    doc_without_term = Document.objects.create(
        pk=12,
        checksum="T2",
        title="Completely unrelated",
        content="No matching terms here",
    )
    doc_without_term.tags.add(invoice_tag)

    # Both documents must be in the search index before querying.
    index = get_backend()
    for doc in (doc_with_term, doc_without_term):
        index.add_or_update(doc)

    # A plain word matches on document content.
    word_response = self.client.get("/api/documents/?text=monthly")
    self.assertEqual(word_response.status_code, status.HTTP_200_OK)
    self.assertEqual(word_response.data["count"], 1)
    first_hit = word_response.data["results"][0]
    self.assertEqual(first_hit["id"], doc_with_term.id)

    # A tag-style expression yields no hits through ``text``.
    tag_response = self.client.get("/api/documents/?text=tag:invoice")
    self.assertEqual(tag_response.status_code, status.HTTP_200_OK)
    self.assertEqual(tag_response.data["count"], 0)
def test_simple_text_search_matches_substrings(self) -> None:
    """
    Check that ``text`` search finds a document by word fragments.

    A single document whose content is "Password reset instructions" is
    indexed. It is expected to be the only result for a prefix
    ("pass"), an inner fragment ("sswo"), and a two-fragment query
    ("sswo re").
    """
    doc = Document.objects.create(
        pk=15,
        checksum="T5",
        title="Quarterly summary",
        content="Password reset instructions",
    )
    get_backend().add_or_update(doc)

    fragment_urls = (
        "/api/documents/?text=pass",
        "/api/documents/?text=sswo",
        "/api/documents/?text=sswo re",
    )
    for url in fragment_urls:
        response = self.client.get(url)
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        first_hit = response.data["results"][0]
        self.assertEqual(first_hit["id"], doc.id)
def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
    """
    Check that ``text`` search does not return false positives.

    The indexed document's title and content do not contain "raptor" as
    a substring, so searching for it is expected to return zero
    results.
    """
    unrelated_doc = Document.objects.create(
        pk=17,
        checksum="T7",
        title="Adobe Acrobat PDF Files",
        content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    )
    get_backend().add_or_update(unrelated_doc)

    result = self.client.get("/api/documents/?text=raptor")

    self.assertEqual(result.status_code, status.HTTP_200_OK)
    self.assertEqual(result.data["count"], 0)
def test_simple_title_search(self) -> None:
    """
    Check that ``title_search`` only considers document titles.

    Two documents are indexed: one with "Quarterly summary" as its
    title, the other with the same phrase only in its content.
    Searching for "quarterly" is expected to return just the former.
    """
    in_title = Document.objects.create(
        pk=13,
        checksum="T3",
        title="Quarterly summary",
        content="No matching content here",
    )
    in_content_only = Document.objects.create(
        pk=14,
        checksum="T4",
        title="Completely unrelated",
        content="Quarterly summary appears only in content",
    )

    index = get_backend()
    for doc in (in_title, in_content_only):
        index.add_or_update(doc)

    result = self.client.get("/api/documents/?title_search=quarterly")

    self.assertEqual(result.status_code, status.HTTP_200_OK)
    self.assertEqual(result.data["count"], 1)
    first_hit = result.data["results"][0]
    self.assertEqual(first_hit["id"], in_title.id)
def test_simple_title_search_matches_substrings(self) -> None:
    """
    Check that ``title_search`` finds a document by title fragments.

    A single document titled "Password handbook" is indexed. It is
    expected to be the only result for a prefix ("pass"), an inner
    fragment ("sswo"), and a two-fragment query ("sswo hand").
    """
    doc = Document.objects.create(
        pk=16,
        checksum="T6",
        title="Password handbook",
        content="No matching content here",
    )
    get_backend().add_or_update(doc)

    fragment_urls = (
        "/api/documents/?title_search=pass",
        "/api/documents/?title_search=sswo",
        "/api/documents/?title_search=sswo hand",
    )
    for url in fragment_urls:
        response = self.client.get(url)
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 1)
        first_hit = response.data["results"][0]
        self.assertEqual(first_hit["id"], doc.id)
def test_search_rejects_multiple_search_modes(self) -> None:
    """
    Check that combining search parameters is rejected.

    Sending both ``text`` and ``query`` in one request is expected to
    produce a 400 response whose ``detail`` names the mutually
    exclusive parameters.
    """
    expected_detail = (
        "Specify only one of text, title_search, query, or more_like_id."
    )

    result = self.client.get("/api/documents/?text=bank&query=bank")

    self.assertEqual(result.status_code, status.HTTP_400_BAD_REQUEST)
    self.assertEqual(result.data["detail"], expected_detail)
def test_search_returns_all_for_api_version_9(self) -> None:
d1 = Document.objects.create(
title="invoice",
@@ -1493,6 +1622,31 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(results["custom_fields"][0]["id"], custom_field1.id)
self.assertEqual(results["workflows"][0]["id"], workflow1.id)
def test_global_search_db_only_limits_documents_to_title_matches(self) -> None:
    """
    Check global search with ``db_only=true`` against document titles.

    Two documents are indexed: one with "bank" in its title, the other
    with "bank" only in its content. With ``db_only=true`` the
    ``documents`` list of the global search response is expected to
    contain only the title match.
    """
    in_title = Document.objects.create(
        pk=21,
        checksum="GS1",
        title="bank statement",
        content="no additional terms",
    )
    in_content_only = Document.objects.create(
        pk=22,
        checksum="GS2",
        title="not a title match",
        content="bank appears only in content",
    )

    index = get_backend()
    for doc in (in_title, in_content_only):
        index.add_or_update(doc)

    # The request is issued as the test user.
    self.client.force_authenticate(self.user)
    result = self.client.get("/api/search/?query=bank&db_only=true")

    self.assertEqual(result.status_code, status.HTTP_200_OK)
    found_documents = result.data["documents"]
    self.assertEqual(len(found_documents), 1)
    self.assertEqual(found_documents[0]["id"], in_title.id)
def test_global_search_filters_owned_mail_objects(self) -> None:
user1 = User.objects.create_user("mail-search-user")
user2 = User.objects.create_user("other-mail-search-user")