Enhancement: add highlighting to title + content searches

This commit is contained in:
shamoon
2026-04-15 21:17:06 -07:00
parent bf6915114b
commit 16af0d1bee
3 changed files with 68 additions and 7 deletions
+15 -2
View File
@@ -21,6 +21,7 @@ from guardian.shortcuts import get_users_with_perms
from documents.search._normalize import ascii_fold
from documents.search._query import build_permission_filter
from documents.search._query import parse_simple_highlight_query
from documents.search._query import parse_simple_text_query
from documents.search._query import parse_simple_title_query
from documents.search._query import parse_user_query
@@ -335,6 +336,17 @@ class TantivyBackend:
else:
return parse_user_query(self._index, query, tz)
def _parse_highlight_query(
    self,
    query: str,
    search_mode: SearchMode,
) -> tantivy.Query:
    """Build the query used to generate highlight snippets.

    For ``SearchMode.TEXT`` a dedicated highlight query limited to the
    ``content`` field is built with ``parse_simple_highlight_query``. Every
    other search mode reuses whatever ``_parse_query`` returns for the same
    input.

    Args:
        query: The raw query string.
        search_mode: The search mode the query was issued in.

    Returns:
        The tantivy query to use for snippet generation.
    """
    if search_mode is not SearchMode.TEXT:
        return self._parse_query(query, search_mode)

    # Title highlighting is not supported for now, so only "content" is used.
    highlight_fields = ["content"]
    return parse_simple_highlight_query(self._index, query, highlight_fields)
def _apply_permission_filter(
self,
query: tantivy.Query,
@@ -549,6 +561,7 @@ class TantivyBackend:
self._ensure_open()
user_query = self._parse_query(query, search_mode)
highlight_query = self._parse_highlight_query(query, search_mode)
# For notes_text snippet generation, we need a query that targets the
# notes_text field directly. user_query may contain JSON-field terms
@@ -601,7 +614,7 @@ class TantivyBackend:
if snippet_generator is None:
snippet_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
highlight_query,
self._schema,
"content",
)
@@ -610,7 +623,7 @@ class TantivyBackend:
if content_html:
highlights["content"] = content_html
if "notes_text" in doc_dict:
if search_mode is SearchMode.QUERY and "notes_text" in doc_dict:
# Use notes_text (plain text) for snippet generation — tantivy's
# SnippetGenerator does not support JSON fields.
if notes_snippet_generator is None:
+33 -5
View File
@@ -452,6 +452,14 @@ _FIELD_BOOSTS = {"title": 2.0}
_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
def _simple_query_tokens(raw_query: str) -> list[str]:
    """Split ``raw_query`` into lowercased, ASCII-folded tokens.

    Every match of ``_SIMPLE_QUERY_TOKEN_RE`` is lowercased and passed through
    ``ascii_fold``; results that come back falsy (e.g. empty) are dropped.
    """
    matches = _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
    folded_tokens = (ascii_fold(match.lower()) for match in matches)
    return [folded for folded in folded_tokens if folded]
def _build_simple_field_query(
index: tantivy.Index,
field: str,
@@ -547,11 +555,7 @@ def parse_simple_query(
Query string is escaped and normalized to be treated as "simple" text query.
"""
tokens = [
ascii_fold(token.lower())
for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
]
tokens = [token for token in tokens if token]
tokens = _simple_query_tokens(raw_query)
if not tokens:
return tantivy.Query.empty_query()
@@ -564,6 +568,30 @@ def parse_simple_query(
return tantivy.Query.boolean_query(field_queries)
def parse_simple_highlight_query(
    index: tantivy.Index,
    raw_query: str,
    fields: list[str],
) -> tantivy.Query:
    """Build a snippet-friendly query for simple text/title searches.

    Matching for simple searches relies on regex queries over the normalized
    shadow fields so that substrings match. Tantivy's SnippetGenerator does
    not produce highlights for that query shape, so snippet generation uses a
    plain term query over the real stored text fields instead.

    Args:
        index: The tantivy index whose query parser is used.
        raw_query: The raw user input; it is reduced to normalized tokens.
        fields: Names of the fields the highlight query should target.

    Returns:
        The parsed query, or an empty query when ``raw_query`` yields no
        tokens.
    """
    tokens = _simple_query_tokens(raw_query)
    if tokens:
        # Fields without an entry in _FIELD_BOOSTS get a neutral boost of 1.0.
        boosts = {}
        for name in fields:
            boosts[name] = _FIELD_BOOSTS.get(name, 1.0)
        query_text = " ".join(tokens)
        return index.parse_query(query_text, fields, field_boosts=boosts)
    return tantivy.Query.empty_query()
def parse_simple_text_query(
index: tantivy.Index,
raw_query: str,
@@ -563,6 +563,26 @@ class TestFieldHandling:
class TestHighlightHits:
"""Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
def test_highlights_simple_text_mode_returns_html_string(
    self,
    backend: TantivyBackend,
):
    """A TEXT-mode search on an exact content token must yield a content highlight."""
    # Arrange: index a single document whose content contains the search term.
    document = Document.objects.create(
        pk=89,
        title="Highlight Test",
        content="The quick brown fox jumps over the lazy dog",
        checksum="HH0",
    )
    backend.add_or_update(document)

    # Act: request highlights for that document in simple text mode.
    results = backend.highlight_hits(
        "quick",
        [document.pk],
        search_mode=SearchMode.TEXT,
    )

    # Assert: exactly one hit, carrying an HTML snippet with a <b> tag.
    assert len(results) == 1
    highlight_map = results[0]["highlights"]
    assert "content" in highlight_map
    assert "<b>" in highlight_map["content"]
def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
"""highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
doc = Document.objects.create(