mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-27 00:45:23 +00:00
Call to_html on snippets. JSON fields don't support snippets, so store a 'notes_text' to highlight instead. Use tantivty score when sorting for that, instead of discarding it
This commit is contained in:
@@ -405,12 +405,17 @@ class TantivyBackend:
|
||||
doc.add_unsigned("tag_id", tag.pk)
|
||||
tag_names.append(tag.name)
|
||||
|
||||
# Notes — JSON for structured queries (notes.user:alice, notes.note:text),
|
||||
# companion text field for default full-text search.
|
||||
# Notes — JSON for structured queries (notes.user:alice, notes.note:text).
|
||||
# notes_text is a plain-text companion for snippet/highlight generation;
|
||||
# tantivy's SnippetGenerator does not support JSON fields.
|
||||
num_notes = 0
|
||||
note_texts: list[str] = []
|
||||
for note in document.notes.all():
|
||||
num_notes += 1
|
||||
doc.add_json("notes", {"note": note.note, "user": note.user.username})
|
||||
note_texts.append(note.note)
|
||||
if note_texts:
|
||||
doc.add_text("notes_text", " ".join(note_texts))
|
||||
|
||||
# Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
|
||||
# companion text field for default full-text search.
|
||||
@@ -545,6 +550,22 @@ class TantivyBackend:
|
||||
self._ensure_open()
|
||||
user_query = self._parse_query(query, search_mode)
|
||||
|
||||
# For notes_text snippet generation, we need a query that targets the
|
||||
# notes_text field directly. user_query may contain JSON-field terms
|
||||
# (e.g. notes.note:urgent) that the SnippetGenerator cannot resolve
|
||||
# against a text field. Strip field:value prefixes so bare terms like
|
||||
# "urgent" are re-parsed against notes_text, producing highlights even
|
||||
# when the original query used structured syntax.
|
||||
bare_query = re.sub(r"\w[\w.]*:", "", query).strip()
|
||||
try:
|
||||
notes_text_query = (
|
||||
self._index.parse_query(bare_query, ["notes_text"])
|
||||
if bare_query
|
||||
else user_query
|
||||
)
|
||||
except Exception:
|
||||
notes_text_query = user_query
|
||||
|
||||
searcher = self._index.searcher()
|
||||
snippet_generator = None
|
||||
notes_snippet_generator = None
|
||||
@@ -585,21 +606,25 @@ class TantivyBackend:
|
||||
"content",
|
||||
)
|
||||
|
||||
content_snippet = snippet_generator.snippet_from_doc(actual_doc)
|
||||
if content_snippet:
|
||||
highlights["content"] = str(content_snippet)
|
||||
content_html = snippet_generator.snippet_from_doc(actual_doc).to_html()
|
||||
if content_html:
|
||||
highlights["content"] = content_html
|
||||
|
||||
if "notes" in doc_dict:
|
||||
if "notes_text" in doc_dict:
|
||||
# Use notes_text (plain text) for snippet generation — tantivy's
|
||||
# SnippetGenerator does not support JSON fields.
|
||||
if notes_snippet_generator is None:
|
||||
notes_snippet_generator = tantivy.SnippetGenerator.create(
|
||||
searcher,
|
||||
user_query,
|
||||
notes_text_query,
|
||||
self._schema,
|
||||
"notes",
|
||||
"notes_text",
|
||||
)
|
||||
notes_snippet = notes_snippet_generator.snippet_from_doc(actual_doc)
|
||||
if notes_snippet:
|
||||
highlights["notes"] = str(notes_snippet)
|
||||
notes_html = notes_snippet_generator.snippet_from_doc(
|
||||
actual_doc,
|
||||
).to_html()
|
||||
if notes_html:
|
||||
highlights["notes"] = notes_html
|
||||
|
||||
except Exception: # pragma: no cover
|
||||
logger.debug("Failed to generate highlights for doc %s", doc_id)
|
||||
|
||||
@@ -72,6 +72,9 @@ def build_schema() -> tantivy.Schema:
|
||||
|
||||
# JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice
|
||||
sb.add_json_field("notes", stored=True, tokenizer_name="paperless_text")
|
||||
# Plain-text companion for notes — tantivy's SnippetGenerator does not support
|
||||
# JSON fields, so highlights require a text field with the same content.
|
||||
sb.add_text_field("notes_text", stored=True, tokenizer_name="paperless_text")
|
||||
sb.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text")
|
||||
|
||||
for field in (
|
||||
|
||||
@@ -558,3 +558,85 @@ class TestFieldHandling:
|
||||
assert len(ids) == 1, (
|
||||
f"Expected 1, got {len(ids)}. Note content should be searchable via notes.note: prefix."
|
||||
)
|
||||
|
||||
|
||||
class TestHighlightHits:
|
||||
"""Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
|
||||
|
||||
def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
|
||||
"""highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
|
||||
doc = Document.objects.create(
|
||||
title="Highlight Test",
|
||||
content="The quick brown fox jumps over the lazy dog",
|
||||
checksum="HH1",
|
||||
pk=90,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
hits = backend.highlight_hits("quick", [doc.pk])
|
||||
|
||||
assert len(hits) == 1
|
||||
highlights = hits[0]["highlights"]
|
||||
assert "content" in highlights
|
||||
content_highlight = highlights["content"]
|
||||
assert isinstance(content_highlight, str), (
|
||||
f"Expected str, got {type(content_highlight)}: {content_highlight!r}"
|
||||
)
|
||||
# Tantivy wraps matched terms in <b> tags
|
||||
assert "<b>" in content_highlight, (
|
||||
f"Expected HTML with <b> tags, got: {content_highlight!r}"
|
||||
)
|
||||
|
||||
def test_highlights_notes_returns_html_string(self, backend: TantivyBackend):
|
||||
"""Note highlights must be HTML strings via notes_text companion field.
|
||||
|
||||
The notes JSON field does not support tantivy SnippetGenerator; the
|
||||
notes_text plain-text field is used instead. We use the full-text
|
||||
query "urgent" (not notes.note:) because notes_text IS in
|
||||
DEFAULT_SEARCH_FIELDS via the normal search path… actually, we use
|
||||
notes.note: prefix so the query targets notes content directly, but
|
||||
the snippet is generated from notes_text which stores the same text.
|
||||
"""
|
||||
user = User.objects.create_user("hl_noteuser")
|
||||
doc = Document.objects.create(
|
||||
title="Doc with matching note",
|
||||
content="unrelated content",
|
||||
checksum="HH2",
|
||||
pk=91,
|
||||
)
|
||||
Note.objects.create(document=doc, note="urgent payment required", user=user)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
# Use notes.note: prefix so the document matches the query and the
|
||||
# notes_text snippet generator can produce highlights.
|
||||
hits = backend.highlight_hits("notes.note:urgent", [doc.pk])
|
||||
|
||||
assert len(hits) == 1
|
||||
highlights = hits[0]["highlights"]
|
||||
assert "notes" in highlights
|
||||
note_highlight = highlights["notes"]
|
||||
assert isinstance(note_highlight, str), (
|
||||
f"Expected str, got {type(note_highlight)}: {note_highlight!r}"
|
||||
)
|
||||
assert "<b>" in note_highlight, (
|
||||
f"Expected HTML with <b> tags, got: {note_highlight!r}"
|
||||
)
|
||||
|
||||
def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend):
|
||||
"""highlight_hits with no doc IDs must return an empty list."""
|
||||
hits = backend.highlight_hits("anything", [])
|
||||
assert hits == []
|
||||
|
||||
def test_no_highlights_when_no_match(self, backend: TantivyBackend):
|
||||
"""Documents not matching the query should not appear in results."""
|
||||
doc = Document.objects.create(
|
||||
title="Unrelated",
|
||||
content="completely different text",
|
||||
checksum="HH3",
|
||||
pk=92,
|
||||
)
|
||||
backend.add_or_update(doc)
|
||||
|
||||
hits = backend.highlight_hits("quick", [doc.pk])
|
||||
|
||||
assert len(hits) == 0
|
||||
|
||||
@@ -1503,6 +1503,43 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
[d2.id, d1.id, d3.id],
|
||||
)
|
||||
|
||||
def test_search_ordering_by_score(self) -> None:
|
||||
"""ordering=-score must return results in descending relevance order (best first)."""
|
||||
backend = get_backend()
|
||||
# doc_high has more occurrences of the search term → higher BM25 score
|
||||
doc_low = Document.objects.create(
|
||||
title="score sort low",
|
||||
content="apple",
|
||||
checksum="SCL1",
|
||||
)
|
||||
doc_high = Document.objects.create(
|
||||
title="score sort high",
|
||||
content="apple apple apple apple apple",
|
||||
checksum="SCH1",
|
||||
)
|
||||
backend.add_or_update(doc_low)
|
||||
backend.add_or_update(doc_high)
|
||||
|
||||
# -score = descending = best first (highest score)
|
||||
response = self.client.get("/api/documents/?query=apple&ordering=-score")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
ids = [r["id"] for r in response.data["results"]]
|
||||
self.assertEqual(
|
||||
ids[0],
|
||||
doc_high.id,
|
||||
"Most relevant doc should be first for -score",
|
||||
)
|
||||
|
||||
# score = ascending = worst first (lowest score)
|
||||
response = self.client.get("/api/documents/?query=apple&ordering=score")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
ids = [r["id"] for r in response.data["results"]]
|
||||
self.assertEqual(
|
||||
ids[0],
|
||||
doc_low.id,
|
||||
"Least relevant doc should be first for +score",
|
||||
)
|
||||
|
||||
def test_search_with_tantivy_native_sort(self) -> None:
|
||||
"""When ordering by a Tantivy-sortable field, results must be correctly sorted."""
|
||||
backend = get_backend()
|
||||
|
||||
+14
-1
@@ -2087,9 +2087,13 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
ordering_param = request.query_params.get("ordering", "")
|
||||
sort_reverse = ordering_param.startswith("-")
|
||||
sort_field_name = ordering_param.lstrip("-") or None
|
||||
# "score" means relevance order — Tantivy handles it natively,
|
||||
# so treat it as a Tantivy sort to preserve the ranked order through
|
||||
# the ORM intersection step.
|
||||
use_tantivy_sort = (
|
||||
sort_field_name in TantivyBackend.SORTABLE_FIELDS
|
||||
or sort_field_name is None
|
||||
or sort_field_name == "score"
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -2144,10 +2148,15 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
search_mode = SearchMode.QUERY
|
||||
query_str = request.query_params["query"]
|
||||
|
||||
# "score" is not a real Tantivy sort field — it means relevance order,
|
||||
# which is Tantivy's default when no sort field is specified.
|
||||
is_score_sort = sort_field_name == "score"
|
||||
all_ids = backend.search_ids(
|
||||
query_str,
|
||||
user=user,
|
||||
sort_field=sort_field_name if use_tantivy_sort else None,
|
||||
sort_field=(
|
||||
None if (not use_tantivy_sort or is_score_sort) else sort_field_name
|
||||
),
|
||||
sort_reverse=sort_reverse,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
@@ -2156,6 +2165,10 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
filtered_qs,
|
||||
use_tantivy_sort=use_tantivy_sort,
|
||||
)
|
||||
# Tantivy returns relevance results best-first (descending score).
|
||||
# ordering=score (ascending, worst-first) requires a reversal.
|
||||
if is_score_sort and not sort_reverse:
|
||||
ordered_ids = list(reversed(ordered_ids))
|
||||
|
||||
page_offset = (page_num - 1) * page_size
|
||||
page_ids = ordered_ids[page_offset : page_offset + page_size]
|
||||
|
||||
Reference in New Issue
Block a user