Fix (dev): resolve tantivy search-filtered documents in bulk edit (#12705)

This commit is contained in:
shamoon
2026-05-04 14:34:08 -07:00
committed by GitHub
parent 1b08417062
commit a76b6b826c
3 changed files with 184 additions and 34 deletions
+75
View File
@@ -671,6 +671,81 @@ class TestBulkEditAPI(DirectoriesMixin, APITestCase):
self.assertEqual(args[0], [self.doc2.id])
self.assertEqual(kwargs["storage_path"], self.sp1.id)
@mock.patch("documents.search.get_backend")
@mock.patch("documents.serialisers.bulk_edit.set_storage_path")
def test_api_bulk_edit_with_all_true_resolves_documents_from_search_filters(
    self,
    m,
    get_backend,
) -> None:
    """
    GIVEN:
        - A mocked search backend whose ID lookups return a single document
    WHEN:
        - Bulk edit is requested with "all": true and one search filter
          (text, title_search or more_like_id)
    THEN:
        - The request succeeds and the matching backend lookup is used once
        - For text/title searches, the bulk edit method receives the IDs
          returned by the backend
    """
    self.setup_mock(m, "set_storage_path")
    backend = get_backend.return_value

    def post_bulk_edit(search_filters):
        # Every request in this test differs only in its "filters" entry.
        return self.client.post(
            "/api/documents/bulk_edit/",
            json.dumps(
                {
                    "all": True,
                    "filters": search_filters,
                    "method": "set_storage_path",
                    "parameters": {"storage_path": self.sp1.id},
                },
            ),
            content_type="application/json",
        )

    query_filter_cases = (
        {"text": "new doc 2017-03-16"},
        {"title_search": "apple"},
    )
    for filters in query_filter_cases:
        with self.subTest(filters=filters):
            backend.search_ids.return_value = [self.doc2.id]

            response = post_bulk_edit(filters)

            self.assertEqual(response.status_code, status.HTTP_200_OK)
            backend.search_ids.assert_called_once()
            args, kwargs = m.call_args
            self.assertEqual(args[0], [self.doc2.id])
            self.assertEqual(kwargs["storage_path"], self.sp1.id)

        # Clear recorded calls so the next case starts from a clean slate.
        m.reset_mock()
        backend.search_ids.reset_mock()

    # more_like_id is a different path
    backend.more_like_this_ids.return_value = [self.doc2.id]

    response = post_bulk_edit({"more_like_id": self.doc1.id})

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    backend.more_like_this_ids.assert_called_once()
def test_api_bulk_edit_with_all_true_rejects_multiple_filters(self) -> None:
    """
    GIVEN:
        - A bulk edit payload with "all": true
    WHEN:
        - The filters contain two search parameters (text and query)
    THEN:
        - 400 Bad Request is returned
        - The response body contains the "Specify only one of" message
    """
    payload = {
        "all": True,
        "filters": {
            "text": "B",
            "query": "c1",
        },
        "method": "set_storage_path",
        "parameters": {"storage_path": self.sp1.id},
    }

    response = self.client.post(
        "/api/documents/bulk_edit/",
        json.dumps(payload),
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
    self.assertIn(b"Specify only one of", response.content)
def test_api_bulk_edit_with_all_true_rejects_unsupported_methods(self) -> None:
response = self.client.post(
"/api/documents/bulk_edit/",
+13
View File
@@ -1028,6 +1028,19 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertIn(d3.id, result_ids)
self.assertNotIn(d4.id, result_ids)
def test_more_like_requires_id_of_existing_document(self) -> None:
    """
    GIVEN:
        - The ID 9999 does not belong to any document
    WHEN:
        - The document list is requested with more_like_id=9999
    THEN:
        - 403 Forbidden is returned
        - The response body is "Invalid more_like_id"
    """
    url = "/api/documents/?more_like_id=9999"

    response = self.client.get(url)

    self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
    self.assertEqual(response.content, b"Invalid more_like_id")
def test_search_more_like_requires_view_permission_on_seed_document(
self,
) -> None:
+96 -34
View File
@@ -260,6 +260,38 @@ logger = logging.getLogger("paperless.api")
# degrades on SQLite with thousands of parameters. PostgreSQL handles large IN
# clauses efficiently, so this threshold mainly protects SQLite users.
_TANTIVY_INTERSECT_THRESHOLD = 5_000
# Names of the request/filter parameters that are answered by the Tantivy
# search backend. The bulk-edit document selection strips these keys before
# building the ORM filter set and rejects payloads that combine several of them.
_TANTIVY_SEARCH_PARAM_NAMES = ("text", "title_search", "query", "more_like_id")
def _get_tantivy_query_and_mode(params):
    """
    Pick the query string and search mode for a text/title/query search.

    The parameters are checked in the order ``text``, ``title_search``,
    ``query``; the first one present in ``params`` wins and its value is
    returned as ``(str(value), SearchMode)``.

    Returns ``None`` when none of the three is present. Callers unpack the
    result directly, so they must only call this once they know a search
    parameter was supplied.
    """
    from documents.search import SearchMode

    mode_by_param = (
        ("text", SearchMode.TEXT),
        ("title_search", SearchMode.TITLE),
        ("query", SearchMode.QUERY),
    )
    for param_name, mode in mode_by_param:
        if param_name in params:
            return str(params[param_name]), mode
    return None  # pragma: no cover
def _get_more_like_id(query_params: dict[str, Any], user: User | None) -> int:
    """
    Validate the ``more_like_id`` parameter and return it as an integer.

    Args:
        query_params: Mapping holding a ``more_like_id`` entry. A missing key
            is not handled here; callers are expected to check for it first.
        user: User whose ``view_document`` permission on the seed document is
            checked. A falsy value (e.g. ``None``) skips the permission check.

    Returns:
        The primary key of the seed document.

    Raises:
        PermissionDenied: If the value cannot be converted to an integer, no
            document with that primary key exists, or ``user`` is not allowed
            to view the document.
    """
    try:
        more_like_doc_id = int(query_params["more_like_id"])
        more_like_doc = Document.objects.select_related("owner").get(
            pk=more_like_doc_id,
        )
    except (TypeError, ValueError, Document.DoesNotExist) as err:
        # Chain explicitly so the original failure is kept as __cause__
        # instead of showing up as an incidental "during handling" context.
        raise PermissionDenied(_("Invalid more_like_id")) from err

    if user and not has_perms_owner_aware(
        user,
        "view_document",
        more_like_doc,
    ):
        raise PermissionDenied(_("Insufficient permissions."))

    return more_like_doc_id
class IndexView(TemplateView):
@@ -2165,8 +2197,6 @@ class ChatStreamingView(GenericAPIView[Any]):
),
)
class UnifiedSearchViewSet(DocumentViewSet):
SEARCH_PARAM_NAMES = ("text", "title_search", "query", "more_like_id")
def get_serializer_class(self):
if self._is_search_request():
return SearchResultSerializer
@@ -2175,7 +2205,9 @@ class UnifiedSearchViewSet(DocumentViewSet):
def _get_active_search_params(self, request: Request | None = None) -> list[str]:
    """
    Return the Tantivy search parameter names present in the query string.

    Falls back to ``self.request`` when no request is passed. The names are
    returned in the order they appear in ``_TANTIVY_SEARCH_PARAM_NAMES``.
    """
    request = request or self.request
    return [
        param
        for param in _TANTIVY_SEARCH_PARAM_NAMES
        if param in request.query_params
    ]
def _is_search_request(self):
@@ -2186,7 +2218,6 @@ class UnifiedSearchViewSet(DocumentViewSet):
return super().list(request)
from documents.search import SearchHit
from documents.search import SearchMode
from documents.search import TantivyBackend
from documents.search import TantivyRelevanceList
from documents.search import get_backend
@@ -2257,15 +2288,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
filtered_qs: QuerySet[Document],
) -> tuple[list[int], list[SearchHit], int]:
"""Handle text/title/query search: IDs, ORM intersection, page highlights."""
if "text" in request.query_params:
search_mode = SearchMode.TEXT
query_str = request.query_params["text"]
elif "title_search" in request.query_params:
search_mode = SearchMode.TITLE
query_str = request.query_params["title_search"]
else:
search_mode = SearchMode.QUERY
query_str = request.query_params["query"]
query_str, search_mode = _get_tantivy_query_and_mode(request.query_params)
# "score" is not a real Tantivy sort field — it means relevance order,
# which is Tantivy's default when no sort field is specified.
@@ -2305,20 +2328,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
filtered_qs: QuerySet[Document],
) -> tuple[list[int], list[SearchHit], int]:
"""Handle more_like_id search: permission check, IDs, stub hits."""
try:
more_like_doc_id = int(request.query_params["more_like_id"])
more_like_doc = Document.objects.select_related("owner").get(
pk=more_like_doc_id,
)
except (TypeError, ValueError, Document.DoesNotExist):
raise PermissionDenied(_("Invalid more_like_id"))
if not has_perms_owner_aware(
request.user,
"view_document",
more_like_doc,
):
raise PermissionDenied(_("Insufficient permissions."))
more_like_doc_id = _get_more_like_id(request.query_params, user)
all_ids = backend.more_like_this_ids(more_like_doc_id, user=user)
ordered_ids = intersect_and_order(
@@ -2508,6 +2518,48 @@ class SavedViewViewSet(BulkPermissionMixin, PassUserMixin, ModelViewSet[SavedVie
class DocumentSelectionMixin:
def _get_search_document_ids(
    self,
    *,
    user: User,
    filters: dict[str, Any],
) -> list[int] | None:
    """
    Resolve the document IDs matched by a search filter, if one was given.

    Returns ``None`` when ``filters`` contains none of the names in
    ``_TANTIVY_SEARCH_PARAM_NAMES``; otherwise returns whatever the search
    backend reports for the single search filter supplied.

    Raises ``ValidationError`` when more than one search filter is present.
    """
    requested = [
        name for name in _TANTIVY_SEARCH_PARAM_NAMES if name in filters
    ]

    if len(requested) > 1:
        raise ValidationError(
            {
                "detail": _(
                    "Specify only one of text, title_search, query, or more_like_id.",
                ),
            },
        )
    if not requested:
        return None

    # Looked up at call time, only once a search filter is known to be present.
    from documents.search import get_backend

    (active_filter,) = requested
    backend = get_backend()
    # Superusers are handed to the backend as ``None``.
    search_user = None if user.is_superuser else user

    if active_filter != "more_like_id":
        query_str, search_mode = _get_tantivy_query_and_mode(filters)
        return backend.search_ids(
            query_str,
            user=search_user,
            search_mode=search_mode,
        )

    # The seed document is validated against the real user, not search_user.
    more_like_doc_id = _get_more_like_id(filters, user)
    return backend.more_like_this_ids(more_like_doc_id, user=search_user)
def _resolve_document_ids(
self,
*,
@@ -2521,19 +2573,29 @@ class DocumentSelectionMixin:
# otherwise, reconstruct the document list based on the provided filters
filters = validated_data.get("filters") or {}
orm_filters = {
key: value
for key, value in filters.items()
if key not in _TANTIVY_SEARCH_PARAM_NAMES
}
permitted_documents = get_objects_for_user_owner_aware(
user,
permission_codename,
Document,
)
return list(
DocumentFilterSet(
data=filters,
queryset=permitted_documents,
)
.qs.distinct()
.values_list("pk", flat=True),
# orm-filtered docs
filtered_documents = DocumentFilterSet(
data=orm_filters,
queryset=permitted_documents,
).qs.distinct()
# tantivy-filtered docs (if search params provided)
search_filtered_ids = self._get_search_document_ids(
user=user,
filters=filters,
)
if search_filtered_ids is not None:
filtered_documents = filtered_documents.filter(pk__in=search_filtered_ids)
return list(filtered_documents.values_list("pk", flat=True))
class DocumentOperationPermissionMixin(PassUserMixin, DocumentSelectionMixin):