Call to_html on snippets. JSON fields don't support snippets, so store a 'notes_text' field to highlight instead. Use the tantivy score when sorting by score, instead of discarding it

2026-07-11 22:45:10 +00:00 · 2026-04-08 15:05:04 -07:00
parent 759717404e
commit acdee63197
5 changed files with 172 additions and 12 deletions
@@ -405,12 +405,17 @@ class TantivyBackend:
            doc.add_unsigned("tag_id", tag.pk)
            tag_names.append(tag.name)

-        # Notes — JSON for structured queries (notes.user:alice, notes.note:text),
-        # companion text field for default full-text search.
+        # Notes — JSON for structured queries (notes.user:alice, notes.note:text).
+        # notes_text is a plain-text companion for snippet/highlight generation;
+        # tantivy's SnippetGenerator does not support JSON fields.
        num_notes = 0
+        note_texts: list[str] = []
        for note in document.notes.all():
            num_notes += 1
            doc.add_json("notes", {"note": note.note, "user": note.user.username})
+            note_texts.append(note.note)
+        if note_texts:
+            doc.add_text("notes_text", " ".join(note_texts))

        # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
        # companion text field for default full-text search.
@@ -545,6 +550,22 @@ class TantivyBackend:
        self._ensure_open()
        user_query = self._parse_query(query, search_mode)

+        # For notes_text snippet generation, we need a query that targets the
+        # notes_text field directly. user_query may contain JSON-field terms
+        # (e.g. notes.note:urgent) that the SnippetGenerator cannot resolve
+        # against a text field. Strip the "field:" prefixes (keeping the
+        # values) so bare terms like "urgent" are re-parsed against notes_text,
+        # producing highlights even when the query used structured syntax.
+        bare_query = re.sub(r"\w[\w.]*:", "", query).strip()
+        try:
+            notes_text_query = (
+                self._index.parse_query(bare_query, ["notes_text"])
+                if bare_query
+                else user_query
+            )
+        except Exception:
+            notes_text_query = user_query
+
        searcher = self._index.searcher()
        snippet_generator = None
        notes_snippet_generator = None
@@ -585,21 +606,25 @@ class TantivyBackend:
                        "content",
                    )

-                content_snippet = snippet_generator.snippet_from_doc(actual_doc)
-                if content_snippet:
-                    highlights["content"] = str(content_snippet)
+                content_html = snippet_generator.snippet_from_doc(actual_doc).to_html()
+                if content_html:
+                    highlights["content"] = content_html

-                if "notes" in doc_dict:
+                if "notes_text" in doc_dict:
+                    # Use notes_text (plain text) for snippet generation — tantivy's
+                    # SnippetGenerator does not support JSON fields.
                    if notes_snippet_generator is None:
                        notes_snippet_generator = tantivy.SnippetGenerator.create(
                            searcher,
-                            user_query,
+                            notes_text_query,
                            self._schema,
-                            "notes",
+                            "notes_text",
                        )
-                    notes_snippet = notes_snippet_generator.snippet_from_doc(actual_doc)
-                    if notes_snippet:
-                        highlights["notes"] = str(notes_snippet)
+                    notes_html = notes_snippet_generator.snippet_from_doc(
+                        actual_doc,
+                    ).to_html()
+                    if notes_html:
+                        highlights["notes"] = notes_html

            except Exception:  # pragma: no cover
                logger.debug("Failed to generate highlights for doc %s", doc_id)
@@ -72,6 +72,9 @@ def build_schema() -> tantivy.Schema:

    # JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice
    sb.add_json_field("notes", stored=True, tokenizer_name="paperless_text")
+    # Plain-text companion for notes — tantivy's SnippetGenerator does not support
+    # JSON fields, so highlights require a text field with the same content.
+    sb.add_text_field("notes_text", stored=True, tokenizer_name="paperless_text")
    sb.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text")

    for field in (
@@ -558,3 +558,85 @@ class TestFieldHandling:
        assert len(ids) == 1, (
            f"Expected 1, got {len(ids)}. Note content should be searchable via notes.note: prefix."
        )
+
+
+class TestHighlightHits:
+    """Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
+
+    def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
+        """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
+        doc = Document.objects.create(
+            title="Highlight Test",
+            content="The quick brown fox jumps over the lazy dog",
+            checksum="HH1",
+            pk=90,
+        )
+        backend.add_or_update(doc)
+
+        hits = backend.highlight_hits("quick", [doc.pk])
+
+        assert len(hits) == 1
+        highlights = hits[0]["highlights"]
+        assert "content" in highlights
+        content_highlight = highlights["content"]
+        assert isinstance(content_highlight, str), (
+            f"Expected str, got {type(content_highlight)}: {content_highlight!r}"
+        )
+        # Tantivy wraps matched terms in <b> tags
+        assert "<b>" in content_highlight, (
+            f"Expected HTML with <b> tags, got: {content_highlight!r}"
+        )
+
+    def test_highlights_notes_returns_html_string(self, backend: TantivyBackend):
+        """Note highlights must be HTML strings via notes_text companion field.
+
+        The notes JSON field does not support tantivy SnippetGenerator; the
+        notes_text plain-text field is used instead.  The query below uses
+        the structured notes.note: prefix rather than a bare full-text term,
+        so that it targets the note content directly and the document is
+        guaranteed to match.  The snippet itself is generated from
+        notes_text, which stores the same text as the JSON notes field.
+        """
+        user = User.objects.create_user("hl_noteuser")
+        doc = Document.objects.create(
+            title="Doc with matching note",
+            content="unrelated content",
+            checksum="HH2",
+            pk=91,
+        )
+        Note.objects.create(document=doc, note="urgent payment required", user=user)
+        backend.add_or_update(doc)
+
+        # Use notes.note: prefix so the document matches the query and the
+        # notes_text snippet generator can produce highlights.
+        hits = backend.highlight_hits("notes.note:urgent", [doc.pk])
+
+        assert len(hits) == 1
+        highlights = hits[0]["highlights"]
+        assert "notes" in highlights
+        note_highlight = highlights["notes"]
+        assert isinstance(note_highlight, str), (
+            f"Expected str, got {type(note_highlight)}: {note_highlight!r}"
+        )
+        assert "<b>" in note_highlight, (
+            f"Expected HTML with <b> tags, got: {note_highlight!r}"
+        )
+
+    def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend):
+        """highlight_hits with no doc IDs must return an empty list."""
+        hits = backend.highlight_hits("anything", [])
+        assert hits == []
+
+    def test_no_highlights_when_no_match(self, backend: TantivyBackend):
+        """Documents not matching the query should not appear in results."""
+        doc = Document.objects.create(
+            title="Unrelated",
+            content="completely different text",
+            checksum="HH3",
+            pk=92,
+        )
+        backend.add_or_update(doc)
+
+        hits = backend.highlight_hits("quick", [doc.pk])
+
+        assert len(hits) == 0
@@ -1503,6 +1503,43 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
            [d2.id, d1.id, d3.id],
        )

+    def test_search_ordering_by_score(self) -> None:
+        """ordering=-score must return results in descending relevance order (best first)."""
+        backend = get_backend()
+        # doc_high has more occurrences of the search term → higher BM25 score
+        doc_low = Document.objects.create(
+            title="score sort low",
+            content="apple",
+            checksum="SCL1",
+        )
+        doc_high = Document.objects.create(
+            title="score sort high",
+            content="apple apple apple apple apple",
+            checksum="SCH1",
+        )
+        backend.add_or_update(doc_low)
+        backend.add_or_update(doc_high)
+
+        # -score = descending = best first (highest score)
+        response = self.client.get("/api/documents/?query=apple&ordering=-score")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        ids = [r["id"] for r in response.data["results"]]
+        self.assertEqual(
+            ids[0],
+            doc_high.id,
+            "Most relevant doc should be first for -score",
+        )
+
+        # score = ascending = worst first (lowest score)
+        response = self.client.get("/api/documents/?query=apple&ordering=score")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        ids = [r["id"] for r in response.data["results"]]
+        self.assertEqual(
+            ids[0],
+            doc_low.id,
+            "Least relevant doc should be first for +score",
+        )
+
    def test_search_with_tantivy_native_sort(self) -> None:
        """When ordering by a Tantivy-sortable field, results must be correctly sorted."""
        backend = get_backend()
@@ -2087,9 +2087,13 @@ class UnifiedSearchViewSet(DocumentViewSet):
            ordering_param = request.query_params.get("ordering", "")
            sort_reverse = ordering_param.startswith("-")
            sort_field_name = ordering_param.lstrip("-") or None
+            # "score" means relevance order — Tantivy handles it natively,
+            # so treat it as a Tantivy sort to preserve the ranked order through
+            # the ORM intersection step.
            use_tantivy_sort = (
                sort_field_name in TantivyBackend.SORTABLE_FIELDS
                or sort_field_name is None
+                or sort_field_name == "score"
            )

            try:
@@ -2144,10 +2148,15 @@ class UnifiedSearchViewSet(DocumentViewSet):
                search_mode = SearchMode.QUERY
                query_str = request.query_params["query"]

+            # "score" is not a real Tantivy sort field — it means relevance order,
+            # which is Tantivy's default when no sort field is specified.
+            is_score_sort = sort_field_name == "score"
            all_ids = backend.search_ids(
                query_str,
                user=user,
-                sort_field=sort_field_name if use_tantivy_sort else None,
+                sort_field=(
+                    None if (not use_tantivy_sort or is_score_sort) else sort_field_name
+                ),
                sort_reverse=sort_reverse,
                search_mode=search_mode,
            )
@@ -2156,6 +2165,10 @@ class UnifiedSearchViewSet(DocumentViewSet):
                filtered_qs,
                use_tantivy_sort=use_tantivy_sort,
            )
+            # Tantivy returns relevance results best-first (descending score).
+            # ordering=score (ascending, worst-first) requires a reversal.
+            if is_score_sort and not sort_reverse:
+                ordered_ids = list(reversed(ordered_ids))

            page_offset = (page_num - 1) * page_size
            page_ids = ordered_ids[page_offset : page_offset + page_size]