diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 8d2e974f2..46f56f339 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -21,6 +21,7 @@ from guardian.shortcuts import get_users_with_perms from documents.search._normalize import ascii_fold from documents.search._query import build_permission_filter +from documents.search._query import parse_simple_text_highlight_query from documents.search._query import parse_simple_text_query from documents.search._query import parse_simple_title_query from documents.search._query import parse_user_query @@ -549,6 +550,9 @@ class TantivyBackend: self._ensure_open() user_query = self._parse_query(query, search_mode) + highlight_query = user_query + if search_mode is SearchMode.TEXT: + highlight_query = parse_simple_text_highlight_query(self._index, query) # For notes_text snippet generation, we need a query that targets the # notes_text field directly. user_query may contain JSON-field terms @@ -601,7 +605,7 @@ class TantivyBackend: if snippet_generator is None: snippet_generator = tantivy.SnippetGenerator.create( searcher, - user_query, + highlight_query, self._schema, "content", ) @@ -610,7 +614,7 @@ class TantivyBackend: if content_html: highlights["content"] = content_html - if "notes_text" in doc_dict: + if search_mode is SearchMode.QUERY and "notes_text" in doc_dict: # Use notes_text (plain text) for snippet generation — tantivy's # SnippetGenerator does not support JSON fields. 
if notes_snippet_generator is None: diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index 1bd31b804..59421c763 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -490,6 +490,14 @@ _FIELD_BOOSTS = {"title": 2.0} _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0} +def _simple_query_tokens(raw_query: str) -> list[str]: + tokens = [ + ascii_fold(token.lower()) + for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT) + ] + return [token for token in tokens if token] + + def _build_simple_field_query( index: tantivy.Index, field: str, @@ -585,11 +593,7 @@ def parse_simple_query( Query string is escaped and normalized to be treated as "simple" text query. """ - tokens = [ - ascii_fold(token.lower()) - for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT) - ] - tokens = [token for token in tokens if token] + tokens = _simple_query_tokens(raw_query) if not tokens: return tantivy.Query.empty_query() @@ -602,6 +606,23 @@ def parse_simple_query( return tantivy.Query.boolean_query(field_queries) +def parse_simple_text_highlight_query( + index: tantivy.Index, + raw_query: str, +) -> tantivy.Query: + """Build a snippet-friendly query for simple text searches. + + Simple search matching uses regex queries but for compatibility with Tantivy + SnippetGenerator we build a plain term query over the content field instead. 
+ """ + + tokens = _simple_query_tokens(raw_query) + if not tokens: + return tantivy.Query.empty_query() + + return index.parse_query(" ".join(tokens), ["content"]) + + def parse_simple_text_query( index: tantivy.Index, raw_query: str, diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index dd745253b..99da1b674 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -563,6 +563,26 @@ class TestFieldHandling: class TestHighlightHits: """Test highlight_hits returns proper HTML strings, not raw Snippet objects.""" + def test_highlights_simple_text_mode_returns_html_string( + self, + backend: TantivyBackend, + ): + """Simple text search should still produce content highlights for exact-token hits.""" + doc = Document.objects.create( + title="Highlight Test", + content="The quick brown fox jumps over the lazy dog", + checksum="HH0", + pk=89, + ) + backend.add_or_update(doc) + + hits = backend.highlight_hits("quick", [doc.pk], search_mode=SearchMode.TEXT) + + assert len(hits) == 1 + highlights = hits[0]["highlights"] + assert "content" in highlights + assert "<b>" in highlights["content"] + def test_highlights_content_returns_html_string(self, backend: TantivyBackend): """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects.""" doc = Document.objects.create(