Enhancement: add highlighting to title + content searches

This commit is contained in:
shamoon
2026-04-15 21:17:06 -07:00
parent bf6915114b
commit d60cb0e21f
3 changed files with 66 additions and 7 deletions

View File

@@ -21,6 +21,7 @@ from guardian.shortcuts import get_users_with_perms
from documents.search._normalize import ascii_fold
from documents.search._query import build_permission_filter
from documents.search._query import parse_simple_highlight_query
from documents.search._query import parse_simple_text_query
from documents.search._query import parse_simple_title_query
from documents.search._query import parse_user_query
@@ -335,6 +336,17 @@ class TantivyBackend:
else:
return parse_user_query(self._index, query, tz)
def _parse_highlight_query(
    self,
    query: str,
    search_mode: SearchMode,
) -> tantivy.Query:
    """Build the query used for highlighting in the given search mode.

    For ``SearchMode.TEXT`` the query is built by
    ``parse_simple_highlight_query`` over the ``content`` field only.
    Every other mode reuses the query produced by ``self._parse_query``.
    """
    if search_mode is not SearchMode.TEXT:
        return self._parse_query(query, search_mode)

    # Title highlighting is not supported for now, so only "content" is used.
    highlight_fields = ["content"]
    return parse_simple_highlight_query(self._index, query, highlight_fields)
def _apply_permission_filter(
self,
query: tantivy.Query,
@@ -549,6 +561,7 @@ class TantivyBackend:
self._ensure_open()
user_query = self._parse_query(query, search_mode)
highlight_query = self._parse_highlight_query(query, search_mode)
# For notes_text snippet generation, we need a query that targets the
# notes_text field directly. user_query may contain JSON-field terms
@@ -601,7 +614,7 @@ class TantivyBackend:
if snippet_generator is None:
snippet_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
highlight_query,
self._schema,
"content",
)
@@ -610,7 +623,7 @@ class TantivyBackend:
if content_html:
highlights["content"] = content_html
if "notes_text" in doc_dict:
if search_mode is SearchMode.QUERY and "notes_text" in doc_dict:
# Use notes_text (plain text) for snippet generation — tantivy's
# SnippetGenerator does not support JSON fields.
if notes_snippet_generator is None:

View File

@@ -452,6 +452,14 @@ _FIELD_BOOSTS = {"title": 2.0}
_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
def _simple_query_tokens(raw_query: str) -> list[str]:
    """Split *raw_query* into normalized simple-search tokens.

    Every match of ``_SIMPLE_QUERY_TOKEN_RE`` is lowercased and passed through
    ``ascii_fold``; tokens that end up empty are dropped. ``_REGEX_TIMEOUT``
    is forwarded as the ``timeout`` of the regex scan.
    """
    matches = _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
    folded = (ascii_fold(match.lower()) for match in matches)
    return [token for token in folded if token]
def _build_simple_field_query(
index: tantivy.Index,
field: str,
@@ -547,11 +555,7 @@ def parse_simple_query(
The query string is escaped and normalized so that it is treated as a "simple" text query.
"""
tokens = [
ascii_fold(token.lower())
for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
]
tokens = [token for token in tokens if token]
tokens = _simple_query_tokens(raw_query)
if not tokens:
return tantivy.Query.empty_query()
@@ -564,6 +568,28 @@ def parse_simple_query(
return tantivy.Query.boolean_query(field_queries)
def parse_simple_highlight_query(
    index: tantivy.Index,
    raw_query: str,
    fields: list[str],
) -> tantivy.Query:
    """Build a snippet-friendly query for simple text/title searches.

    Simple search matching uses regex queries, but for compatibility with
    Tantivy's SnippetGenerator this builds a plain term query over the actual
    text fields instead.

    The tokens from ``_simple_query_tokens`` are joined with single spaces and
    parsed against *fields*. Each field takes its boost from ``_FIELD_BOOSTS``,
    defaulting to ``1.0``. An empty query is returned when *raw_query* yields
    no tokens.
    """
    tokens = _simple_query_tokens(raw_query)
    if tokens:
        query_text = " ".join(tokens)
        boosts = {name: _FIELD_BOOSTS.get(name, 1.0) for name in fields}
        return index.parse_query(query_text, fields, field_boosts=boosts)

    return tantivy.Query.empty_query()
def parse_simple_text_query(
index: tantivy.Index,
raw_query: str,

View File

@@ -563,6 +563,26 @@ class TestFieldHandling:
class TestHighlightHits:
"""Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
def test_highlights_simple_text_mode_returns_html_string(
    self,
    backend: TantivyBackend,
):
    """Simple text search should still produce content highlights for exact-token hits."""
    indexed_doc = Document.objects.create(
        title="Highlight Test",
        content="The quick brown fox jumps over the lazy dog",
        checksum="HH0",
        pk=89,
    )
    backend.add_or_update(indexed_doc)

    results = backend.highlight_hits(
        "quick",
        [indexed_doc.pk],
        search_mode=SearchMode.TEXT,
    )

    # One document id was requested, so exactly one hit is expected.
    assert len(results) == 1

    highlights = results[0]["highlights"]
    assert "content" in highlights
    # The matched token must be wrapped in bold markup in the snippet HTML.
    assert "<b>" in highlights["content"]
def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
"""highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
doc = Document.objects.create(