def _render_snippet_html(snippet: tantivy.Snippet) -> str:
    """Render a tantivy snippet as HTML-escaped text with highlighted matches.

    Every piece of the fragment is passed through ``html.escape`` so document
    content can never inject markup; only the ``<span class="match">``
    wrappers placed around matched ranges are emitted as real HTML.

    Args:
        snippet: Object exposing ``fragment()`` (the snippet text) and
            ``highlighted()`` (ranges with ``start`` / ``end`` attributes).

    Returns:
        The escaped fragment with each highlighted range wrapped in a
        ``<span class="match">`` element. An empty fragment yields ``""``.
    """
    fragment = snippet.fragment()
    ranges = sorted(snippet.highlighted(), key=lambda r: r.start)

    if not ranges:
        return escape(fragment)

    # tantivy reports highlight ranges as byte offsets into the UTF-8 encoded
    # fragment, while Python strings are indexed by code point.  Slicing the
    # encoded bytes keeps highlights aligned when the text is not pure ASCII.
    raw = fragment.encode("utf-8")
    raw_len = len(raw)

    def _escaped(lo: int, hi: int) -> str:
        # errors="replace" guards against an offset that is not on a
        # character boundary instead of raising mid-search.
        return escape(raw[lo:hi].decode("utf-8", errors="replace"))

    parts: list[str] = []
    cursor = 0

    for highlight in ranges:
        # Clamp into the fragment and never step back before the cursor, so
        # overlapping ranges do not emit the shared text twice.
        start = max(cursor, min(raw_len, highlight.start))
        end = min(raw_len, highlight.end)

        if end <= start:
            # Empty range, or one fully covered by an earlier highlight.
            continue

        if start > cursor:
            parts.append(_escaped(cursor, start))

        parts.append(f'<span class="match">{_escaped(start, end)}</span>')
        cursor = end

    if cursor < raw_len:
        parts.append(_escaped(cursor, raw_len))

    return "".join(parts)
@@ -606,7 +637,9 @@ class TantivyBackend: "content", ) - content_html = snippet_generator.snippet_from_doc(actual_doc).to_html() + content_html = _render_snippet_html( + snippet_generator.snippet_from_doc(actual_doc), + ) if content_html: highlights["content"] = content_html @@ -620,9 +653,9 @@ class TantivyBackend: self._schema, "notes_text", ) - notes_html = notes_snippet_generator.snippet_from_doc( - actual_doc, - ).to_html() + notes_html = _render_snippet_html( + notes_snippet_generator.snippet_from_doc(actual_doc), + ) if notes_html: highlights["notes"] = notes_html diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index dd745253b..32d3d9263 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -563,8 +563,11 @@ class TestFieldHandling: class TestHighlightHits: """Test highlight_hits returns proper HTML strings, not raw Snippet objects.""" - def test_highlights_content_returns_html_string(self, backend: TantivyBackend): - """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects.""" + def test_highlights_content_returns_match_span_html( + self, + backend: TantivyBackend, + ): + """highlight_hits must return frontend-ready highlight spans.""" doc = Document.objects.create( title="Highlight Test", content="The quick brown fox jumps over the lazy dog", @@ -582,13 +585,15 @@ class TestHighlightHits: assert isinstance(content_highlight, str), ( f"Expected str, got {type(content_highlight)}: {content_highlight!r}" ) - # Tantivy wraps matched terms in tags - assert "" in content_highlight, ( - f"Expected HTML with tags, got: {content_highlight!r}" + assert '' in content_highlight, ( + f"Expected HTML with match span, got: {content_highlight!r}" ) - def test_highlights_notes_returns_html_string(self, backend: TantivyBackend): - """Note highlights must be HTML strings via notes_text companion field. 
+ def test_highlights_notes_returns_match_span_html( + self, + backend: TantivyBackend, + ): + """Note highlights must be frontend-ready HTML via notes_text companion field. The notes JSON field does not support tantivy SnippetGenerator; the notes_text plain-text field is used instead. We use the full-text @@ -618,8 +623,8 @@ class TestHighlightHits: assert isinstance(note_highlight, str), ( f"Expected str, got {type(note_highlight)}: {note_highlight!r}" ) - assert "" in note_highlight, ( - f"Expected HTML with tags, got: {note_highlight!r}" + assert '' in note_highlight, ( + f"Expected HTML with match span, got: {note_highlight!r}" ) def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend):