diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 8d2e974f2..13969ab27 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -21,6 +21,7 @@ from guardian.shortcuts import get_users_with_perms from documents.search._normalize import ascii_fold from documents.search._query import build_permission_filter +from documents.search._query import parse_simple_highlight_query from documents.search._query import parse_simple_text_query from documents.search._query import parse_simple_title_query from documents.search._query import parse_user_query @@ -335,6 +336,17 @@ class TantivyBackend: else: return parse_user_query(self._index, query, tz) + def _parse_highlight_query( + self, + query: str, + search_mode: SearchMode, + ) -> tantivy.Query: + if search_mode is SearchMode.TEXT: + # title does not support highlighting for now + return parse_simple_highlight_query(self._index, query, ["content"]) + else: + return self._parse_query(query, search_mode) + def _apply_permission_filter( self, query: tantivy.Query, @@ -549,6 +561,7 @@ class TantivyBackend: self._ensure_open() user_query = self._parse_query(query, search_mode) + highlight_query = self._parse_highlight_query(query, search_mode) # For notes_text snippet generation, we need a query that targets the # notes_text field directly. user_query may contain JSON-field terms @@ -601,7 +614,7 @@ class TantivyBackend: if snippet_generator is None: snippet_generator = tantivy.SnippetGenerator.create( searcher, - user_query, + highlight_query, self._schema, "content", ) @@ -610,7 +623,7 @@ class TantivyBackend: if content_html: highlights["content"] = content_html - if "notes_text" in doc_dict: + if search_mode is SearchMode.QUERY and "notes_text" in doc_dict: # Use notes_text (plain text) for snippet generation — tantivy's # SnippetGenerator does not support JSON fields. 
if notes_snippet_generator is None: diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index ed0bb4c15..0b9da49de 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -452,6 +452,14 @@ _FIELD_BOOSTS = {"title": 2.0} _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0} +def _simple_query_tokens(raw_query: str) -> list[str]: + tokens = [ + ascii_fold(token.lower()) + for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT) + ] + return [token for token in tokens if token] + + def _build_simple_field_query( index: tantivy.Index, field: str, @@ -547,11 +555,7 @@ def parse_simple_query( Query string is escaped and normalized to be treated as "simple" text query. """ - tokens = [ - ascii_fold(token.lower()) - for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT) - ] - tokens = [token for token in tokens if token] + tokens = _simple_query_tokens(raw_query) if not tokens: return tantivy.Query.empty_query() @@ -564,6 +568,28 @@ def parse_simple_query( return tantivy.Query.boolean_query(field_queries) +def parse_simple_highlight_query( + index: tantivy.Index, + raw_query: str, + fields: list[str], +) -> tantivy.Query: + """Build a snippet-friendly query for simple text/title searches. + + Simple search matching uses regex queries but for compatibility with Tantivy + SnippetGenerator we build a plain term query over the actual text fields instead. 
+ """ + + tokens = _simple_query_tokens(raw_query) + if not tokens: + return tantivy.Query.empty_query() + + return index.parse_query( + " ".join(tokens), + fields, + field_boosts={field: _FIELD_BOOSTS.get(field, 1.0) for field in fields}, + ) + + def parse_simple_text_query( index: tantivy.Index, raw_query: str, diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index dd745253b..99da1b674 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -563,6 +563,26 @@ class TestFieldHandling: class TestHighlightHits: """Test highlight_hits returns proper HTML strings, not raw Snippet objects.""" + def test_highlights_simple_text_mode_returns_html_string( + self, + backend: TantivyBackend, + ): + """Simple text search should still produce content highlights for exact-token hits.""" + doc = Document.objects.create( + title="Highlight Test", + content="The quick brown fox jumps over the lazy dog", + checksum="HH0", + pk=89, + ) + backend.add_or_update(doc) + + hits = backend.highlight_hits("quick", [doc.pk], search_mode=SearchMode.TEXT) + + assert len(hits) == 1 + highlights = hits[0]["highlights"] + assert "content" in highlights + assert "" in highlights["content"] + def test_highlights_content_returns_html_string(self, backend: TantivyBackend): """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects.""" doc = Document.objects.create(