mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-10 23:59:43 +00:00
Enhancement: unify text search to use tantivy (#12485)
This commit is contained in:
+65
-19
@@ -1995,11 +1995,23 @@ class ChatStreamingView(GenericAPIView):
|
||||
list=extend_schema(
|
||||
description="Document views including search",
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name="text",
|
||||
type=OpenApiTypes.STR,
|
||||
location=OpenApiParameter.QUERY,
|
||||
description="Simple Tantivy-backed text search query string",
|
||||
),
|
||||
OpenApiParameter(
|
||||
name="title_search",
|
||||
type=OpenApiTypes.STR,
|
||||
location=OpenApiParameter.QUERY,
|
||||
description="Simple Tantivy-backed title-only search query string",
|
||||
),
|
||||
OpenApiParameter(
|
||||
name="query",
|
||||
type=OpenApiTypes.STR,
|
||||
location=OpenApiParameter.QUERY,
|
||||
description="Advanced search query string",
|
||||
description="Advanced Tantivy search query string",
|
||||
),
|
||||
OpenApiParameter(
|
||||
name="full_perms",
|
||||
@@ -2025,22 +2037,28 @@ class ChatStreamingView(GenericAPIView):
|
||||
),
|
||||
)
|
||||
class UnifiedSearchViewSet(DocumentViewSet):
|
||||
SEARCH_PARAM_NAMES = ("text", "title_search", "query", "more_like_id")
|
||||
|
||||
def get_serializer_class(self):
|
||||
if self._is_search_request():
|
||||
return SearchResultSerializer
|
||||
else:
|
||||
return DocumentSerializer
|
||||
|
||||
def _get_active_search_params(self, request: Request | None = None) -> list[str]:
|
||||
request = request or self.request
|
||||
return [
|
||||
param for param in self.SEARCH_PARAM_NAMES if param in request.query_params
|
||||
]
|
||||
|
||||
def _is_search_request(self):
|
||||
return (
|
||||
"query" in self.request.query_params
|
||||
or "more_like_id" in self.request.query_params
|
||||
)
|
||||
return bool(self._get_active_search_params())
|
||||
|
||||
def list(self, request, *args, **kwargs):
|
||||
if not self._is_search_request():
|
||||
return super().list(request)
|
||||
|
||||
from documents.search import SearchMode
|
||||
from documents.search import TantivyRelevanceList
|
||||
from documents.search import get_backend
|
||||
|
||||
@@ -2050,9 +2068,31 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
filtered_qs = self.filter_queryset(self.get_queryset())
|
||||
|
||||
user = None if request.user.is_superuser else request.user
|
||||
active_search_params = self._get_active_search_params(request)
|
||||
|
||||
if "query" in request.query_params:
|
||||
query_str = request.query_params["query"]
|
||||
if len(active_search_params) > 1:
|
||||
raise ValidationError(
|
||||
{
|
||||
"detail": _(
|
||||
"Specify only one of text, title_search, query, or more_like_id.",
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
if (
|
||||
"text" in request.query_params
|
||||
or "title_search" in request.query_params
|
||||
or "query" in request.query_params
|
||||
):
|
||||
if "text" in request.query_params:
|
||||
search_mode = SearchMode.TEXT
|
||||
query_str = request.query_params["text"]
|
||||
elif "title_search" in request.query_params:
|
||||
search_mode = SearchMode.TITLE
|
||||
query_str = request.query_params["title_search"]
|
||||
else:
|
||||
search_mode = SearchMode.QUERY
|
||||
query_str = request.query_params["query"]
|
||||
results = backend.search(
|
||||
query_str,
|
||||
user=user,
|
||||
@@ -2060,6 +2100,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
page_size=10000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
else:
|
||||
# more_like_id — validate permission on the seed document first
|
||||
@@ -2132,6 +2173,8 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
if str(e.detail) == str(invalid_more_like_id_message):
|
||||
return HttpResponseForbidden(invalid_more_like_id_message)
|
||||
return HttpResponseForbidden(_("Insufficient permissions."))
|
||||
except ValidationError:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.warning(f"An error occurred listing search results: {e!s}")
|
||||
return HttpResponseBadRequest(
|
||||
@@ -3003,6 +3046,9 @@ class GlobalSearchView(PassUserMixin):
|
||||
serializer_class = SearchResultSerializer
|
||||
|
||||
def get(self, request, *args, **kwargs):
|
||||
from documents.search import SearchMode
|
||||
from documents.search import get_backend
|
||||
|
||||
query = request.query_params.get("query", None)
|
||||
if query is None:
|
||||
return HttpResponseBadRequest("Query required")
|
||||
@@ -3019,25 +3065,25 @@ class GlobalSearchView(PassUserMixin):
|
||||
"view_document",
|
||||
Document,
|
||||
)
|
||||
# First search by title
|
||||
docs = all_docs.filter(title__icontains=query)
|
||||
if not db_only and len(docs) < OBJECT_LIMIT:
|
||||
# If we don't have enough results, search by content.
|
||||
# Over-fetch from Tantivy (no permission filter) and rely on
|
||||
# the ORM all_docs queryset for authoritative permission gating.
|
||||
from documents.search import get_backend
|
||||
|
||||
if db_only:
|
||||
docs = all_docs.filter(title__icontains=query)[:OBJECT_LIMIT]
|
||||
else:
|
||||
user = None if request.user.is_superuser else request.user
|
||||
fts_results = get_backend().search(
|
||||
query,
|
||||
user=None,
|
||||
user=user,
|
||||
page=1,
|
||||
page_size=1000,
|
||||
sort_field=None,
|
||||
sort_reverse=False,
|
||||
search_mode=SearchMode.TEXT,
|
||||
)
|
||||
fts_ids = {h["id"] for h in fts_results.hits}
|
||||
docs = docs | all_docs.filter(id__in=fts_ids)
|
||||
docs = docs[:OBJECT_LIMIT]
|
||||
docs_by_id = all_docs.in_bulk([hit["id"] for hit in fts_results.hits])
|
||||
docs = [
|
||||
docs_by_id[hit["id"]]
|
||||
for hit in fts_results.hits
|
||||
if hit["id"] in docs_by_id
|
||||
][:OBJECT_LIMIT]
|
||||
saved_views = (
|
||||
get_objects_for_user_owner_aware(
|
||||
request.user,
|
||||
|
||||
Reference in New Issue
Block a user