Call to_html on snippets. JSON fields don't support snippets, so store a 'notes_text' to highlight instead. Use tantivty score when sorting for that, instead of discarding it

This commit is contained in:
Trenton H
2026-04-08 15:05:04 -07:00
parent 759717404e
commit acdee63197
5 changed files with 172 additions and 12 deletions
+36 -11
View File
@@ -405,12 +405,17 @@ class TantivyBackend:
doc.add_unsigned("tag_id", tag.pk)
tag_names.append(tag.name)
# Notes — JSON for structured queries (notes.user:alice, notes.note:text),
# companion text field for default full-text search.
# Notes — JSON for structured queries (notes.user:alice, notes.note:text).
# notes_text is a plain-text companion for snippet/highlight generation;
# tantivy's SnippetGenerator does not support JSON fields.
num_notes = 0
note_texts: list[str] = []
for note in document.notes.all():
num_notes += 1
doc.add_json("notes", {"note": note.note, "user": note.user.username})
note_texts.append(note.note)
if note_texts:
doc.add_text("notes_text", " ".join(note_texts))
# Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
# companion text field for default full-text search.
@@ -545,6 +550,22 @@ class TantivyBackend:
self._ensure_open()
user_query = self._parse_query(query, search_mode)
# For notes_text snippet generation, we need a query that targets the
# notes_text field directly. user_query may contain JSON-field terms
# (e.g. notes.note:urgent) that the SnippetGenerator cannot resolve
# against a text field. Strip field:value prefixes so bare terms like
# "urgent" are re-parsed against notes_text, producing highlights even
# when the original query used structured syntax.
bare_query = re.sub(r"\w[\w.]*:", "", query).strip()
try:
notes_text_query = (
self._index.parse_query(bare_query, ["notes_text"])
if bare_query
else user_query
)
except Exception:
notes_text_query = user_query
searcher = self._index.searcher()
snippet_generator = None
notes_snippet_generator = None
@@ -585,21 +606,25 @@ class TantivyBackend:
"content",
)
content_snippet = snippet_generator.snippet_from_doc(actual_doc)
if content_snippet:
highlights["content"] = str(content_snippet)
content_html = snippet_generator.snippet_from_doc(actual_doc).to_html()
if content_html:
highlights["content"] = content_html
if "notes" in doc_dict:
if "notes_text" in doc_dict:
# Use notes_text (plain text) for snippet generation — tantivy's
# SnippetGenerator does not support JSON fields.
if notes_snippet_generator is None:
notes_snippet_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
notes_text_query,
self._schema,
"notes",
"notes_text",
)
notes_snippet = notes_snippet_generator.snippet_from_doc(actual_doc)
if notes_snippet:
highlights["notes"] = str(notes_snippet)
notes_html = notes_snippet_generator.snippet_from_doc(
actual_doc,
).to_html()
if notes_html:
highlights["notes"] = notes_html
except Exception: # pragma: no cover
logger.debug("Failed to generate highlights for doc %s", doc_id)
+3
View File
@@ -72,6 +72,9 @@ def build_schema() -> tantivy.Schema:
# JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice
sb.add_json_field("notes", stored=True, tokenizer_name="paperless_text")
# Plain-text companion for notes — tantivy's SnippetGenerator does not support
# JSON fields, so highlights require a text field with the same content.
sb.add_text_field("notes_text", stored=True, tokenizer_name="paperless_text")
sb.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text")
for field in (
@@ -558,3 +558,85 @@ class TestFieldHandling:
assert len(ids) == 1, (
f"Expected 1, got {len(ids)}. Note content should be searchable via notes.note: prefix."
)
class TestHighlightHits:
"""Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
"""highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
doc = Document.objects.create(
title="Highlight Test",
content="The quick brown fox jumps over the lazy dog",
checksum="HH1",
pk=90,
)
backend.add_or_update(doc)
hits = backend.highlight_hits("quick", [doc.pk])
assert len(hits) == 1
highlights = hits[0]["highlights"]
assert "content" in highlights
content_highlight = highlights["content"]
assert isinstance(content_highlight, str), (
f"Expected str, got {type(content_highlight)}: {content_highlight!r}"
)
# Tantivy wraps matched terms in <b> tags
assert "<b>" in content_highlight, (
f"Expected HTML with <b> tags, got: {content_highlight!r}"
)
def test_highlights_notes_returns_html_string(self, backend: TantivyBackend):
"""Note highlights must be HTML strings via notes_text companion field.
The notes JSON field does not support tantivy SnippetGenerator; the
notes_text plain-text field is used instead. We use the full-text
query "urgent" (not notes.note:) because notes_text IS in
DEFAULT_SEARCH_FIELDS via the normal search path… actually, we use
notes.note: prefix so the query targets notes content directly, but
the snippet is generated from notes_text which stores the same text.
"""
user = User.objects.create_user("hl_noteuser")
doc = Document.objects.create(
title="Doc with matching note",
content="unrelated content",
checksum="HH2",
pk=91,
)
Note.objects.create(document=doc, note="urgent payment required", user=user)
backend.add_or_update(doc)
# Use notes.note: prefix so the document matches the query and the
# notes_text snippet generator can produce highlights.
hits = backend.highlight_hits("notes.note:urgent", [doc.pk])
assert len(hits) == 1
highlights = hits[0]["highlights"]
assert "notes" in highlights
note_highlight = highlights["notes"]
assert isinstance(note_highlight, str), (
f"Expected str, got {type(note_highlight)}: {note_highlight!r}"
)
assert "<b>" in note_highlight, (
f"Expected HTML with <b> tags, got: {note_highlight!r}"
)
def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend):
"""highlight_hits with no doc IDs must return an empty list."""
hits = backend.highlight_hits("anything", [])
assert hits == []
def test_no_highlights_when_no_match(self, backend: TantivyBackend):
"""Documents not matching the query should not appear in results."""
doc = Document.objects.create(
title="Unrelated",
content="completely different text",
checksum="HH3",
pk=92,
)
backend.add_or_update(doc)
hits = backend.highlight_hits("quick", [doc.pk])
assert len(hits) == 0
+37
View File
@@ -1503,6 +1503,43 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
[d2.id, d1.id, d3.id],
)
def test_search_ordering_by_score(self) -> None:
"""ordering=-score must return results in descending relevance order (best first)."""
backend = get_backend()
# doc_high has more occurrences of the search term → higher BM25 score
doc_low = Document.objects.create(
title="score sort low",
content="apple",
checksum="SCL1",
)
doc_high = Document.objects.create(
title="score sort high",
content="apple apple apple apple apple",
checksum="SCH1",
)
backend.add_or_update(doc_low)
backend.add_or_update(doc_high)
# -score = descending = best first (highest score)
response = self.client.get("/api/documents/?query=apple&ordering=-score")
self.assertEqual(response.status_code, status.HTTP_200_OK)
ids = [r["id"] for r in response.data["results"]]
self.assertEqual(
ids[0],
doc_high.id,
"Most relevant doc should be first for -score",
)
# score = ascending = worst first (lowest score)
response = self.client.get("/api/documents/?query=apple&ordering=score")
self.assertEqual(response.status_code, status.HTTP_200_OK)
ids = [r["id"] for r in response.data["results"]]
self.assertEqual(
ids[0],
doc_low.id,
"Least relevant doc should be first for +score",
)
def test_search_with_tantivy_native_sort(self) -> None:
"""When ordering by a Tantivy-sortable field, results must be correctly sorted."""
backend = get_backend()
+14 -1
View File
@@ -2087,9 +2087,13 @@ class UnifiedSearchViewSet(DocumentViewSet):
ordering_param = request.query_params.get("ordering", "")
sort_reverse = ordering_param.startswith("-")
sort_field_name = ordering_param.lstrip("-") or None
# "score" means relevance order — Tantivy handles it natively,
# so treat it as a Tantivy sort to preserve the ranked order through
# the ORM intersection step.
use_tantivy_sort = (
sort_field_name in TantivyBackend.SORTABLE_FIELDS
or sort_field_name is None
or sort_field_name == "score"
)
try:
@@ -2144,10 +2148,15 @@ class UnifiedSearchViewSet(DocumentViewSet):
search_mode = SearchMode.QUERY
query_str = request.query_params["query"]
# "score" is not a real Tantivy sort field — it means relevance order,
# which is Tantivy's default when no sort field is specified.
is_score_sort = sort_field_name == "score"
all_ids = backend.search_ids(
query_str,
user=user,
sort_field=sort_field_name if use_tantivy_sort else None,
sort_field=(
None if (not use_tantivy_sort or is_score_sort) else sort_field_name
),
sort_reverse=sort_reverse,
search_mode=search_mode,
)
@@ -2156,6 +2165,10 @@ class UnifiedSearchViewSet(DocumentViewSet):
filtered_qs,
use_tantivy_sort=use_tantivy_sort,
)
# Tantivy returns relevance results best-first (descending score).
# ordering=score (ascending, worst-first) requires a reversal.
if is_score_sort and not sort_reverse:
ordered_ids = list(reversed(ordered_ids))
page_offset = (page_num - 1) * page_size
page_ids = ordered_ids[page_offset : page_offset + page_size]