diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 652cc2c3e..8d2e974f2 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -405,12 +405,17 @@ class TantivyBackend: doc.add_unsigned("tag_id", tag.pk) tag_names.append(tag.name) - # Notes — JSON for structured queries (notes.user:alice, notes.note:text), - # companion text field for default full-text search. + # Notes — JSON for structured queries (notes.user:alice, notes.note:text). + # notes_text is a plain-text companion for snippet/highlight generation; + # tantivy's SnippetGenerator does not support JSON fields. num_notes = 0 + note_texts: list[str] = [] for note in document.notes.all(): num_notes += 1 doc.add_json("notes", {"note": note.note, "user": note.user.username}) + note_texts.append(note.note) + if note_texts: + doc.add_text("notes_text", " ".join(note_texts)) # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y), # companion text field for default full-text search. @@ -545,6 +550,22 @@ class TantivyBackend: self._ensure_open() user_query = self._parse_query(query, search_mode) + # For notes_text snippet generation, we need a query that targets the + # notes_text field directly. user_query may contain JSON-field terms + # (e.g. notes.note:urgent) that the SnippetGenerator cannot resolve + # against a text field. Strip field:value prefixes so bare terms like + # "urgent" are re-parsed against notes_text, producing highlights even + # when the original query used structured syntax. 
+ bare_query = re.sub(r"\w[\w.]*:", "", query).strip() + try: + notes_text_query = ( + self._index.parse_query(bare_query, ["notes_text"]) + if bare_query + else user_query + ) + except Exception: + notes_text_query = user_query + searcher = self._index.searcher() snippet_generator = None notes_snippet_generator = None @@ -585,21 +606,25 @@ class TantivyBackend: "content", ) - content_snippet = snippet_generator.snippet_from_doc(actual_doc) - if content_snippet: - highlights["content"] = str(content_snippet) + content_html = snippet_generator.snippet_from_doc(actual_doc).to_html() + if content_html: + highlights["content"] = content_html - if "notes" in doc_dict: + if "notes_text" in doc_dict: + # Use notes_text (plain text) for snippet generation — tantivy's + # SnippetGenerator does not support JSON fields. if notes_snippet_generator is None: notes_snippet_generator = tantivy.SnippetGenerator.create( searcher, - user_query, + notes_text_query, self._schema, - "notes", + "notes_text", ) - notes_snippet = notes_snippet_generator.snippet_from_doc(actual_doc) - if notes_snippet: - highlights["notes"] = str(notes_snippet) + notes_html = notes_snippet_generator.snippet_from_doc( + actual_doc, + ).to_html() + if notes_html: + highlights["notes"] = notes_html except Exception: # pragma: no cover logger.debug("Failed to generate highlights for doc %s", doc_id) diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index 5e9404235..479c60bc5 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -72,6 +72,9 @@ def build_schema() -> tantivy.Schema: # JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice sb.add_json_field("notes", stored=True, tokenizer_name="paperless_text") + # Plain-text companion for notes — tantivy's SnippetGenerator does not support + # JSON fields, so highlights require a text field with the same content. 
+ sb.add_text_field("notes_text", stored=True, tokenizer_name="paperless_text") sb.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text") for field in ( diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index 68c4ea9d1..dd745253b 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -558,3 +558,85 @@ class TestFieldHandling: assert len(ids) == 1, ( f"Expected 1, got {len(ids)}. Note content should be searchable via notes.note: prefix." ) + + +class TestHighlightHits: + """Test highlight_hits returns proper HTML strings, not raw Snippet objects.""" + + def test_highlights_content_returns_html_string(self, backend: TantivyBackend): + """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects.""" + doc = Document.objects.create( + title="Highlight Test", + content="The quick brown fox jumps over the lazy dog", + checksum="HH1", + pk=90, + ) + backend.add_or_update(doc) + + hits = backend.highlight_hits("quick", [doc.pk]) + + assert len(hits) == 1 + highlights = hits[0]["highlights"] + assert "content" in highlights + content_highlight = highlights["content"] + assert isinstance(content_highlight, str), ( + f"Expected str, got {type(content_highlight)}: {content_highlight!r}" + ) + # Tantivy wraps matched terms in <b> tags + assert "<b>" in content_highlight, ( + f"Expected HTML with <b> tags, got: {content_highlight!r}" + ) + + def test_highlights_notes_returns_html_string(self, backend: TantivyBackend): + """Note highlights must be HTML strings via notes_text companion field. + + The notes JSON field does not support tantivy SnippetGenerator; the + notes_text plain-text field is used instead.
The query uses the + notes.note: prefix so that it targets note content directly. The + backend strips that field prefix and re-parses the bare term against + notes_text, so the snippet is generated from the plain-text field, + which stores the same text. + """ + user = User.objects.create_user("hl_noteuser") + doc = Document.objects.create( + title="Doc with matching note", + content="unrelated content", + checksum="HH2", + pk=91, + ) + Note.objects.create(document=doc, note="urgent payment required", user=user) + backend.add_or_update(doc) + + # Use notes.note: prefix so the document matches the query and the + # notes_text snippet generator can produce highlights. + hits = backend.highlight_hits("notes.note:urgent", [doc.pk]) + + assert len(hits) == 1 + highlights = hits[0]["highlights"] + assert "notes" in highlights + note_highlight = highlights["notes"] + assert isinstance(note_highlight, str), ( + f"Expected str, got {type(note_highlight)}: {note_highlight!r}" + ) + assert "<b>" in note_highlight, ( + f"Expected HTML with <b> tags, got: {note_highlight!r}" + ) + + def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend): + """highlight_hits with no doc IDs must return an empty list.""" + hits = backend.highlight_hits("anything", []) + assert hits == [] + + def test_no_highlights_when_no_match(self, backend: TantivyBackend): + """Documents not matching the query should not appear in results.""" + doc = Document.objects.create( + title="Unrelated", + content="completely different text", + checksum="HH3", + pk=92, + ) + backend.add_or_update(doc) + + hits = backend.highlight_hits("quick", [doc.pk]) + + assert len(hits) == 0 diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py index 54b960719..85f479010 100644 --- a/src/documents/tests/test_api_search.py +++ b/src/documents/tests/test_api_search.py @@ -1503,6 +1503,43 @@ class TestDocumentSearchApi(DirectoriesMixin,
APITestCase): [d2.id, d1.id, d3.id], ) + def test_search_ordering_by_score(self) -> None: + """ordering=-score must return results in descending relevance order (best first).""" + backend = get_backend() + # doc_high has more occurrences of the search term → higher BM25 score + doc_low = Document.objects.create( + title="score sort low", + content="apple", + checksum="SCL1", + ) + doc_high = Document.objects.create( + title="score sort high", + content="apple apple apple apple apple", + checksum="SCH1", + ) + backend.add_or_update(doc_low) + backend.add_or_update(doc_high) + + # -score = descending = best first (highest score) + response = self.client.get("/api/documents/?query=apple&ordering=-score") + self.assertEqual(response.status_code, status.HTTP_200_OK) + ids = [r["id"] for r in response.data["results"]] + self.assertEqual( + ids[0], + doc_high.id, + "Most relevant doc should be first for -score", + ) + + # score = ascending = worst first (lowest score) + response = self.client.get("/api/documents/?query=apple&ordering=score") + self.assertEqual(response.status_code, status.HTTP_200_OK) + ids = [r["id"] for r in response.data["results"]] + self.assertEqual( + ids[0], + doc_low.id, + "Least relevant doc should be first for +score", + ) + def test_search_with_tantivy_native_sort(self) -> None: """When ordering by a Tantivy-sortable field, results must be correctly sorted.""" backend = get_backend() diff --git a/src/documents/views.py b/src/documents/views.py index 2fa1491a5..367d7b2ea 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -2087,9 +2087,13 @@ class UnifiedSearchViewSet(DocumentViewSet): ordering_param = request.query_params.get("ordering", "") sort_reverse = ordering_param.startswith("-") sort_field_name = ordering_param.lstrip("-") or None + # "score" means relevance order — Tantivy handles it natively, + # so treat it as a Tantivy sort to preserve the ranked order through + # the ORM intersection step. 
use_tantivy_sort = ( sort_field_name in TantivyBackend.SORTABLE_FIELDS or sort_field_name is None + or sort_field_name == "score" ) try: @@ -2144,10 +2148,15 @@ class UnifiedSearchViewSet(DocumentViewSet): search_mode = SearchMode.QUERY query_str = request.query_params["query"] + # "score" is not a real Tantivy sort field — it means relevance order, + # which is Tantivy's default when no sort field is specified. + is_score_sort = sort_field_name == "score" all_ids = backend.search_ids( query_str, user=user, - sort_field=sort_field_name if use_tantivy_sort else None, + sort_field=( + None if (not use_tantivy_sort or is_score_sort) else sort_field_name + ), sort_reverse=sort_reverse, search_mode=search_mode, ) @@ -2156,6 +2165,10 @@ class UnifiedSearchViewSet(DocumentViewSet): filtered_qs, use_tantivy_sort=use_tantivy_sort, ) + # Tantivy returns relevance results best-first (descending score). + # ordering=score (ascending, worst-first) requires a reversal. + if is_score_sort and not sort_reverse: + ordered_ids = list(reversed(ordered_ids)) page_offset = (page_num - 1) * page_size page_ids = ordered_ids[page_offset : page_offset + page_size]