diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py
index 8d2e974f2..8e4c86896 100644
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -7,6 +7,7 @@ from collections import Counter
from datetime import UTC
from datetime import datetime
from enum import StrEnum
+from html import escape
from typing import TYPE_CHECKING
from typing import Self
from typing import TypedDict
@@ -54,6 +55,36 @@ class SearchMode(StrEnum):
TITLE = "title"
+def _render_snippet_html(snippet: tantivy.Snippet) -> str:
+    """Render a tantivy snippet as escaped HTML with ``match`` spans.
+
+    All fragment text is HTML-escaped; each highlighted range is wrapped in
+    ``<span class="match">``. Ranges are treated as UTF-8 byte offsets into
+    the fragment (tantivy's convention), so slicing is done on encoded bytes.
+    """
+    raw = snippet.fragment().encode("utf-8")
+    raw_len = len(raw)
+
+    def text(lo: int, hi: int) -> str:
+        return escape(raw[lo:hi].decode("utf-8", errors="replace"))
+
+    parts: list[str] = []
+    cursor = 0
+    for highlight in sorted(snippet.highlighted(), key=lambda r: r.start):
+        # Clamp to the fragment so out-of-range offsets cannot misplace text.
+        start = max(0, min(raw_len, highlight.start))
+        end = max(start, min(raw_len, highlight.end))
+        if end <= cursor:
+            continue  # empty, or fully covered by an earlier highlight
+        parts.append(text(cursor, start))  # gap; empty when ranges touch/overlap
+        # On overlap, emit only the part not already covered by a prior span.
+        parts.append(f'<span class="match">{text(max(start, cursor), end)}</span>')
+        cursor = end
+
+    parts.append(text(cursor, raw_len))
+    return "".join(parts)
+
+
def _extract_autocomplete_words(text_sources: list[str]) -> set[str]:
"""Extract and normalize words for autocomplete.
@@ -606,7 +637,9 @@ class TantivyBackend:
"content",
)
- content_html = snippet_generator.snippet_from_doc(actual_doc).to_html()
+ content_html = _render_snippet_html(
+ snippet_generator.snippet_from_doc(actual_doc),
+ )
if content_html:
highlights["content"] = content_html
@@ -620,9 +653,9 @@ class TantivyBackend:
self._schema,
"notes_text",
)
- notes_html = notes_snippet_generator.snippet_from_doc(
- actual_doc,
- ).to_html()
+ notes_html = _render_snippet_html(
+ notes_snippet_generator.snippet_from_doc(actual_doc),
+ )
if notes_html:
highlights["notes"] = notes_html
diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py
index dd745253b..32d3d9263 100644
--- a/src/documents/tests/search/test_backend.py
+++ b/src/documents/tests/search/test_backend.py
@@ -563,8 +563,11 @@ class TestFieldHandling:
class TestHighlightHits:
"""Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
- def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
- """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
+ def test_highlights_content_returns_match_span_html(
+ self,
+ backend: TantivyBackend,
+ ):
+ """highlight_hits must return frontend-ready highlight spans."""
doc = Document.objects.create(
title="Highlight Test",
content="The quick brown fox jumps over the lazy dog",
@@ -582,13 +585,15 @@ class TestHighlightHits:
assert isinstance(content_highlight, str), (
f"Expected str, got {type(content_highlight)}: {content_highlight!r}"
)
- # Tantivy wraps matched terms in <b> tags
- assert "<b>" in content_highlight, (
- f"Expected HTML with <b> tags, got: {content_highlight!r}"
+ assert '<span class="match">' in content_highlight, (
+ f"Expected HTML with match span, got: {content_highlight!r}"
)
- def test_highlights_notes_returns_html_string(self, backend: TantivyBackend):
- """Note highlights must be HTML strings via notes_text companion field.
+ def test_highlights_notes_returns_match_span_html(
+ self,
+ backend: TantivyBackend,
+ ):
+ """Note highlights must be frontend-ready HTML via notes_text companion field.
The notes JSON field does not support tantivy SnippetGenerator; the
notes_text plain-text field is used instead. We use the full-text
@@ -618,8 +623,8 @@ class TestHighlightHits:
assert isinstance(note_highlight, str), (
f"Expected str, got {type(note_highlight)}: {note_highlight!r}"
)
- assert "<b>" in note_highlight, (
- f"Expected HTML with <b> tags, got: {note_highlight!r}"
+ assert '<span class="match">' in note_highlight, (
+ f"Expected HTML with match span, got: {note_highlight!r}"
)
def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend):