Compare commits

...

2 Commits

Author SHA1 Message Date
shamoon
dd56c2ec25 Simplify 2026-04-17 08:12:56 -07:00
shamoon
d60cb0e21f Enhancement: add highlighting to title + content searches 2026-04-17 08:08:31 -07:00
3 changed files with 52 additions and 7 deletions

View File

@@ -21,6 +21,7 @@ from guardian.shortcuts import get_users_with_perms
from documents.search._normalize import ascii_fold
from documents.search._query import build_permission_filter
from documents.search._query import parse_simple_text_highlight_query
from documents.search._query import parse_simple_text_query
from documents.search._query import parse_simple_title_query
from documents.search._query import parse_user_query
@@ -549,6 +550,9 @@ class TantivyBackend:
self._ensure_open()
user_query = self._parse_query(query, search_mode)
highlight_query = user_query
if search_mode is SearchMode.TEXT:
highlight_query = parse_simple_text_highlight_query(self._index, query)
# For notes_text snippet generation, we need a query that targets the
# notes_text field directly. user_query may contain JSON-field terms
@@ -601,7 +605,7 @@ class TantivyBackend:
if snippet_generator is None:
snippet_generator = tantivy.SnippetGenerator.create(
searcher,
user_query,
highlight_query,
self._schema,
"content",
)
@@ -610,7 +614,7 @@ class TantivyBackend:
if content_html:
highlights["content"] = content_html
if "notes_text" in doc_dict:
if search_mode is SearchMode.QUERY and "notes_text" in doc_dict:
# Use notes_text (plain text) for snippet generation — tantivy's
# SnippetGenerator does not support JSON fields.
if notes_snippet_generator is None:

View File

@@ -452,6 +452,14 @@ _FIELD_BOOSTS = {"title": 2.0}
_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
def _simple_query_tokens(raw_query: str) -> list[str]:
    """Split a raw simple-search string into normalized tokens.

    Each match of ``_SIMPLE_QUERY_TOKEN_RE`` is lowercased and then passed
    through ``ascii_fold``. Tokens that are empty after normalization are
    dropped, so the result may be an empty list.
    """
    normalized: list[str] = []
    matches = _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
    for match in matches:
        folded = ascii_fold(match.lower())
        # Skip tokens that ended up empty after lowercasing and folding.
        if folded:
            normalized.append(folded)
    return normalized
def _build_simple_field_query(
index: tantivy.Index,
field: str,
@@ -547,11 +555,7 @@ def parse_simple_query(
Query string is escaped and normalized to be treated as "simple" text query.
"""
tokens = [
ascii_fold(token.lower())
for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
]
tokens = [token for token in tokens if token]
tokens = _simple_query_tokens(raw_query)
if not tokens:
return tantivy.Query.empty_query()
@@ -564,6 +568,23 @@ def parse_simple_query(
return tantivy.Query.boolean_query(field_queries)
def parse_simple_text_highlight_query(
    index: tantivy.Index,
    raw_query: str,
) -> tantivy.Query:
    """Return the query used to highlight ``content`` for simple text searches.

    Simple search matching relies on regex queries; for compatibility with
    Tantivy's SnippetGenerator, a plain query over the ``content`` field is
    built here instead. The raw input is first reduced to normalized tokens by
    ``_simple_query_tokens``; when none remain, an empty query is returned.
    """
    terms = _simple_query_tokens(raw_query)
    if terms:
        # Re-join the normalized tokens and parse them against "content" only.
        return index.parse_query(" ".join(terms), ["content"])
    return tantivy.Query.empty_query()
def parse_simple_text_query(
index: tantivy.Index,
raw_query: str,

View File

@@ -563,6 +563,26 @@ class TestFieldHandling:
class TestHighlightHits:
"""Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
def test_highlights_simple_text_mode_returns_html_string(
    self,
    backend: TantivyBackend,
):
    """TEXT-mode highlighting yields a bolded content snippet for an exact token."""
    # Arrange: index one document whose content contains the search token.
    indexed = Document.objects.create(
        title="Highlight Test",
        content="The quick brown fox jumps over the lazy dog",
        checksum="HH0",
        pk=89,
    )
    backend.add_or_update(indexed)

    # Act: request highlights for that document in simple text mode.
    results = backend.highlight_hits(
        "quick",
        [indexed.pk],
        search_mode=SearchMode.TEXT,
    )

    # Assert: exactly one hit, with a content highlight that contains markup.
    assert len(results) == 1
    snippet_by_field = results[0]["highlights"]
    assert "content" in snippet_by_field
    assert "<b>" in snippet_by_field["content"]
def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
"""highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
doc = Document.objects.create(