Enhancement: unify text search to use tantivy (#12485)

This commit is contained in:
shamoon
2026-04-03 13:53:45 -07:00
committed by GitHub
parent f32ad98d8e
commit 566afdffca
29 changed files with 1019 additions and 97 deletions
+65 -19
View File
@@ -1995,11 +1995,23 @@ class ChatStreamingView(GenericAPIView):
list=extend_schema(
description="Document views including search",
parameters=[
OpenApiParameter(
name="text",
type=OpenApiTypes.STR,
location=OpenApiParameter.QUERY,
description="Simple Tantivy-backed text search query string",
),
OpenApiParameter(
name="title_search",
type=OpenApiTypes.STR,
location=OpenApiParameter.QUERY,
description="Simple Tantivy-backed title-only search query string",
),
OpenApiParameter(
name="query",
type=OpenApiTypes.STR,
location=OpenApiParameter.QUERY,
description="Advanced search query string",
description="Advanced Tantivy search query string",
),
OpenApiParameter(
name="full_perms",
@@ -2025,22 +2037,28 @@ class ChatStreamingView(GenericAPIView):
),
)
class UnifiedSearchViewSet(DocumentViewSet):
SEARCH_PARAM_NAMES = ("text", "title_search", "query", "more_like_id")
def get_serializer_class(self):
if self._is_search_request():
return SearchResultSerializer
else:
return DocumentSerializer
def _get_active_search_params(self, request: Request | None = None) -> list[str]:
request = request or self.request
return [
param for param in self.SEARCH_PARAM_NAMES if param in request.query_params
]
def _is_search_request(self):
return (
"query" in self.request.query_params
or "more_like_id" in self.request.query_params
)
return bool(self._get_active_search_params())
def list(self, request, *args, **kwargs):
if not self._is_search_request():
return super().list(request)
from documents.search import SearchMode
from documents.search import TantivyRelevanceList
from documents.search import get_backend
@@ -2050,9 +2068,31 @@ class UnifiedSearchViewSet(DocumentViewSet):
filtered_qs = self.filter_queryset(self.get_queryset())
user = None if request.user.is_superuser else request.user
active_search_params = self._get_active_search_params(request)
if "query" in request.query_params:
query_str = request.query_params["query"]
if len(active_search_params) > 1:
raise ValidationError(
{
"detail": _(
"Specify only one of text, title_search, query, or more_like_id.",
),
},
)
if (
"text" in request.query_params
or "title_search" in request.query_params
or "query" in request.query_params
):
if "text" in request.query_params:
search_mode = SearchMode.TEXT
query_str = request.query_params["text"]
elif "title_search" in request.query_params:
search_mode = SearchMode.TITLE
query_str = request.query_params["title_search"]
else:
search_mode = SearchMode.QUERY
query_str = request.query_params["query"]
results = backend.search(
query_str,
user=user,
@@ -2060,6 +2100,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
page_size=10000,
sort_field=None,
sort_reverse=False,
search_mode=search_mode,
)
else:
# more_like_id — validate permission on the seed document first
@@ -2132,6 +2173,8 @@ class UnifiedSearchViewSet(DocumentViewSet):
if str(e.detail) == str(invalid_more_like_id_message):
return HttpResponseForbidden(invalid_more_like_id_message)
return HttpResponseForbidden(_("Insufficient permissions."))
except ValidationError:
raise
except Exception as e:
logger.warning(f"An error occurred listing search results: {e!s}")
return HttpResponseBadRequest(
@@ -3003,6 +3046,9 @@ class GlobalSearchView(PassUserMixin):
serializer_class = SearchResultSerializer
def get(self, request, *args, **kwargs):
from documents.search import SearchMode
from documents.search import get_backend
query = request.query_params.get("query", None)
if query is None:
return HttpResponseBadRequest("Query required")
@@ -3019,25 +3065,25 @@ class GlobalSearchView(PassUserMixin):
"view_document",
Document,
)
# First search by title
docs = all_docs.filter(title__icontains=query)
if not db_only and len(docs) < OBJECT_LIMIT:
# If we don't have enough results, search by content.
# Over-fetch from Tantivy (no permission filter) and rely on
# the ORM all_docs queryset for authoritative permission gating.
from documents.search import get_backend
if db_only:
docs = all_docs.filter(title__icontains=query)[:OBJECT_LIMIT]
else:
user = None if request.user.is_superuser else request.user
fts_results = get_backend().search(
query,
user=None,
user=user,
page=1,
page_size=1000,
sort_field=None,
sort_reverse=False,
search_mode=SearchMode.TEXT,
)
fts_ids = {h["id"] for h in fts_results.hits}
docs = docs | all_docs.filter(id__in=fts_ids)
docs = docs[:OBJECT_LIMIT]
docs_by_id = all_docs.in_bulk([hit["id"] for hit in fts_results.hits])
docs = [
docs_by_id[hit["id"]]
for hit in fts_results.hits
if hit["id"] in docs_by_id
][:OBJECT_LIMIT]
saved_views = (
get_objects_for_user_owner_aware(
request.user,