Simplify

Enhancement: add highlighting to title + content searches
2026-04-17 21:48:55 +00:00 · 2026-04-17 08:12:56 -07:00 · 2026-04-17 08:08:31 -07:00
3 changed files with 52 additions and 7 deletions
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -21,6 +21,7 @@ from guardian.shortcuts import get_users_with_perms

 from documents.search._normalize import ascii_fold
 from documents.search._query import build_permission_filter
+from documents.search._query import parse_simple_text_highlight_query
 from documents.search._query import parse_simple_text_query
 from documents.search._query import parse_simple_title_query
 from documents.search._query import parse_user_query
@@ -549,6 +550,9 @@ class TantivyBackend:

        self._ensure_open()
        user_query = self._parse_query(query, search_mode)
+        highlight_query = user_query
+        if search_mode is SearchMode.TEXT:
+            highlight_query = parse_simple_text_highlight_query(self._index, query)

        # For notes_text snippet generation, we need a query that targets the
        # notes_text field directly. user_query may contain JSON-field terms
@@ -601,7 +605,7 @@ class TantivyBackend:
                if snippet_generator is None:
                    snippet_generator = tantivy.SnippetGenerator.create(
                        searcher,
-                        user_query,
+                        highlight_query,
                        self._schema,
                        "content",
                    )
@@ -610,7 +614,7 @@ class TantivyBackend:
                if content_html:
                    highlights["content"] = content_html

-                if "notes_text" in doc_dict:
+                if search_mode is SearchMode.QUERY and "notes_text" in doc_dict:
                    # Use notes_text (plain text) for snippet generation — tantivy's
                    # SnippetGenerator does not support JSON fields.
                    if notes_snippet_generator is None:
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -452,6 +452,14 @@ _FIELD_BOOSTS = {"title": 2.0}
 _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}


+def _simple_query_tokens(raw_query: str) -> list[str]:
+    tokens = [
+        ascii_fold(token.lower())
+        for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
+    ]
+    return [token for token in tokens if token]
+
+
 def _build_simple_field_query(
    index: tantivy.Index,
    field: str,
@@ -547,11 +555,7 @@ def parse_simple_query(

     Query string is escaped and normalized to be treated as a "simple" text query.
    """
-    tokens = [
-        ascii_fold(token.lower())
-        for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
-    ]
-    tokens = [token for token in tokens if token]
+    tokens = _simple_query_tokens(raw_query)
    if not tokens:
        return tantivy.Query.empty_query()

@@ -564,6 +568,23 @@ def parse_simple_query(
    return tantivy.Query.boolean_query(field_queries)


+def parse_simple_text_highlight_query(
+    index: tantivy.Index,
+    raw_query: str,
+) -> tantivy.Query:
+    """Build a snippet-friendly query for simple text searches.
+
+    Simple search matching uses regex queries, but for compatibility with Tantivy's
+    SnippetGenerator we build a plain term query over the content field instead.
+    """
+
+    tokens = _simple_query_tokens(raw_query)
+    if not tokens:
+        return tantivy.Query.empty_query()
+
+    return index.parse_query(" ".join(tokens), ["content"])
+
+
 def parse_simple_text_query(
    index: tantivy.Index,
    raw_query: str,
--- a/src/documents/tests/search/test_backend.py
+++ b/src/documents/tests/search/test_backend.py
@@ -563,6 +563,26 @@ class TestFieldHandling:
 class TestHighlightHits:
    """Test highlight_hits returns proper HTML strings, not raw Snippet objects."""

+    def test_highlights_simple_text_mode_returns_html_string(
+        self,
+        backend: TantivyBackend,
+    ):
+        """Simple text search should still produce content highlights for exact-token hits."""
+        doc = Document.objects.create(
+            title="Highlight Test",
+            content="The quick brown fox jumps over the lazy dog",
+            checksum="HH0",
+            pk=89,
+        )
+        backend.add_or_update(doc)
+
+        hits = backend.highlight_hits("quick", [doc.pk], search_mode=SearchMode.TEXT)
+
+        assert len(hits) == 1
+        highlights = hits[0]["highlights"]
+        assert "content" in highlights
+        assert "<b>" in highlights["content"]
+
    def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
        """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
        doc = Document.objects.create(
Author	SHA1	Message	Date
shamoon	dd56c2ec25	Simplify	2026-04-17 08:12:56 -07:00
shamoon	d60cb0e21f	Enhancement: add highlighting to title + content searches	2026-04-17 08:08:31 -07:00