diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index f9f857e3f..087c66550 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -303,8 +303,10 @@ class TantivyBackend: doc.add_text("checksum", document.checksum) doc.add_text("title", document.title) doc.add_text("title_sort", document.title) + doc.add_text("simple_title", document.title) doc.add_text("content", content) doc.add_text("bigram_content", content) + doc.add_text("simple_content", content) # Original filename - only add if not None/empty if document.original_filename: diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index ea9b4dfd8..44c604f8f 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -1,5 +1,6 @@ from __future__ import annotations +import unicodedata from datetime import UTC from datetime import date from datetime import datetime @@ -51,7 +52,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile( ) # Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly _DATE8_RE = regex.compile(r"(?P\w+):(?P\d{8})\b") -_SIMPLE_QUERY_SPECIAL_CHARS_RE = regex.compile(r'([+\-!(){}\[\]^"~*?:\\/])') +_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+") def _fmt(dt: datetime) -> str: @@ -439,9 +440,38 @@ DEFAULT_SEARCH_FIELDS = [ "note", # companion text field for notes content (notes JSON for structured: notes.user:x) "custom_field", # companion text field for CF values (custom_fields JSON for structured: custom_fields.name:x) ] -SIMPLE_SEARCH_FIELDS = ["title", "content"] -TITLE_SEARCH_FIELDS = ["title"] +SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"] +TITLE_SEARCH_FIELDS = ["simple_title"] _FIELD_BOOSTS = {"title": 2.0} +_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0} + + +def _normalize_simple_token(token: str) -> str: + return ( + unicodedata.normalize("NFD", token.lower()) + .encode( + "ascii", + "ignore", + ) + .decode() + ) + + +def _build_simple_field_query( + index: tantivy.Index, + field: str, + tokens: list[str], +) -> tantivy.Query: + patterns = [f".*{regex.escape(token)}.*" for token in tokens] + if len(patterns) == 1: + query = tantivy.Query.regex_query(index.schema, field, patterns[0]) + else: + query = tantivy.Query.regex_phrase_query(index.schema, field, patterns) + + boost = _SIMPLE_FIELD_BOOSTS.get(field, 1.0) + if boost != 1.0: + return tantivy.Query.boost_query(query, boost) + return query def parse_user_query( @@ -512,20 +542,21 @@ def parse_simple_query( Query string is escaped and normalized to be treated as "simple" text query. """ - # strips special characters that would be interpreted as syntax by the parser - query_str = regex.sub( - _SIMPLE_QUERY_SPECIAL_CHARS_RE, - r"\\\1", - raw_query, - timeout=_REGEX_TIMEOUT, - ) - # collapse multiple spaces to a single space for cleaner parsing (and to prevent ReDoS on excessive whitespace) - query_str = regex.sub(r" {2,}", " ", query_str, timeout=_REGEX_TIMEOUT).strip() - return index.parse_query( - query_str, - fields, - field_boosts=_FIELD_BOOSTS, - ) + tokens = [ + _normalize_simple_token(token) + for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT) + ] + tokens = [token for token in tokens if token] + if not tokens: + return tantivy.Query.empty_query() + + field_queries = [ + (tantivy.Occur.Should, _build_simple_field_query(index, field, tokens)) + for field in fields + ] + if len(field_queries) == 1: + return field_queries[0][1] + return tantivy.Query.boolean_query(field_queries) def parse_simple_text_query( diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index 705b7b023..9f111b132 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -53,6 +53,18 @@ def build_schema() -> tantivy.Schema: # CJK support - not stored, indexed only sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer") + # Simple substring search support for title/content - not stored, indexed only + sb.add_text_field( + "simple_title", + stored=False, + tokenizer_name="simple_search_analyzer", + ) + sb.add_text_field( + "simple_content", + stored=False, + tokenizer_name="simple_search_analyzer", + ) + # Autocomplete prefix scan - stored, not indexed sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw") diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py index e597a879e..c26acbaee 100644 --- a/src/documents/search/_tokenizer.py +++ b/src/documents/search/_tokenizer.py @@ -70,6 +70,7 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None: index.register_tokenizer("paperless_text", _paperless_text(language)) index.register_tokenizer("simple_analyzer", _simple_analyzer()) index.register_tokenizer("bigram_analyzer", _bigram_analyzer()) + index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer()) # Fast-field tokenizer required for fast=True text fields in the schema index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer()) @@ -114,3 +115,15 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer: .filter(tantivy.Filter.lowercase()) .build() ) + + +def _simple_search_analyzer() -> tantivy.TextAnalyzer: + """Tokenizer for simple substring search fields: non-whitespace chunks -> lowercase -> ascii_fold.""" + return ( + tantivy.TextAnalyzerBuilder( + tantivy.Tokenizer.regex(r"\S+"), + ) + .filter(tantivy.Filter.lowercase()) + .filter(tantivy.Filter.ascii_fold()) + .build() + ) diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index 64c0efeb3..fabba5883 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -117,6 +117,122 @@ class TestSearch: ) assert title_match.total == 1 + def test_text_mode_matches_partial_term_substrings( + self, + backend: TantivyBackend, + ): + """Simple text mode should support substring matching within tokens.""" + doc = Document.objects.create( + title="Account access", + content="password reset instructions", + checksum="TXT3", + pk=11, + ) + backend.add_or_update(doc) + + prefix_match = backend.search( + "pass", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert prefix_match.total == 1 + + infix_match = backend.search( + "sswo", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert infix_match.total == 1 + + phrase_match = backend.search( + "sswo re", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert phrase_match.total == 1 + + def test_text_mode_does_not_match_on_partial_term_overlap( + self, + backend: TantivyBackend, + ): + """Simple text mode should not match documents that merely share partial fragments.""" + doc = Document.objects.create( + title="Adobe Acrobat PDF Files", + content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + checksum="TXT7", + pk=13, + ) + backend.add_or_update(doc) + + non_match = backend.search( + "raptor", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TEXT, + ) + assert non_match.total == 0 + + def test_title_mode_matches_partial_term_substrings( + self, + backend: TantivyBackend, + ): + """Title mode should support substring matching within title tokens.""" + doc = Document.objects.create( + title="Password guide", + content="reset instructions", + checksum="TXT4", + pk=12, + ) + backend.add_or_update(doc) + + prefix_match = backend.search( + "pass", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert prefix_match.total == 1 + + infix_match = backend.search( + "sswo", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert infix_match.total == 1 + + phrase_match = backend.search( + "sswo gu", + user=None, + page=1, + page_size=10, + sort_field=None, + sort_reverse=False, + search_mode=SearchMode.TITLE, + ) + assert phrase_match.total == 1 + def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend): """Search scores must be normalized so top hit has score 1.0 for UI consistency.""" for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]): diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py index aee52a567..fc2c41231 100644 --- a/src/documents/tests/search/test_tokenizer.py +++ b/src/documents/tests/search/test_tokenizer.py @@ -8,6 +8,7 @@ import tantivy from documents.search._tokenizer import _bigram_analyzer from documents.search._tokenizer import _paperless_text +from documents.search._tokenizer import _simple_search_analyzer from documents.search._tokenizer import register_tokenizers if TYPE_CHECKING: @@ -41,6 +42,20 @@ class TestTokenizers: idx.register_tokenizer("bigram_analyzer", _bigram_analyzer()) return idx + @pytest.fixture + def simple_search_index(self) -> tantivy.Index: + """Index with simple-search field for Latin substring tests.""" + sb = tantivy.SchemaBuilder() + sb.add_text_field( + "simple_content", + stored=False, + tokenizer_name="simple_search_analyzer", + ) + schema = sb.build() + idx = tantivy.Index(schema, path=None) + idx.register_tokenizer("simple_search_analyzer", _simple_search_analyzer()) + return idx + def test_ascii_fold_finds_accented_content( self, content_index: tantivy.Index, @@ -66,6 +81,24 @@ class TestTokenizers: q = bigram_index.parse_query("東京", ["bigram_content"]) assert bigram_index.searcher().search(q, limit=5).count == 1 + def test_simple_search_analyzer_supports_regex_substrings( + self, + simple_search_index: tantivy.Index, + ) -> None: + """Whitespace-preserving simple search analyzer supports substring regex matching.""" + writer = simple_search_index.writer() + doc = tantivy.Document() + doc.add_text("simple_content", "tag:invoice password-reset") + writer.add_document(doc) + writer.commit() + simple_search_index.reload() + q = tantivy.Query.regex_query( + simple_search_index.schema, + "simple_content", + ".*sswo.*", + ) + assert simple_search_index.searcher().search(q, limit=5).count == 1 + def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None: """Unsupported language codes should log a warning and disable stemming gracefully.""" sb = tantivy.SchemaBuilder() diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py index cc20d0111..02626873d 100644 --- a/src/documents/tests/test_api_search.py +++ b/src/documents/tests/test_api_search.py @@ -119,6 +119,47 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data["count"], 0) + def test_simple_text_search_matches_substrings(self) -> None: + matching_doc = Document.objects.create( + title="Quarterly summary", + content="Password reset instructions", + checksum="T5", + pk=15, + ) + + backend = get_backend() + backend.add_or_update(matching_doc) + + response = self.client.get("/api/documents/?text=pass") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], matching_doc.id) + + response = self.client.get("/api/documents/?text=sswo") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], matching_doc.id) + + response = self.client.get("/api/documents/?text=sswo re") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], matching_doc.id) + + def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None: + non_matching_doc = Document.objects.create( + title="Adobe Acrobat PDF Files", + content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + checksum="T7", + pk=17, + ) + + backend = get_backend() + backend.add_or_update(non_matching_doc) + + response = self.client.get("/api/documents/?text=raptor") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 0) + def test_simple_title_search(self) -> None: title_match = Document.objects.create( title="Quarterly summary", @@ -142,6 +183,32 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): self.assertEqual(response.data["count"], 1) self.assertEqual(response.data["results"][0]["id"], title_match.id) + def test_simple_title_search_matches_substrings(self) -> None: + title_match = Document.objects.create( + title="Password handbook", + content="No matching content here", + checksum="T6", + pk=16, + ) + + backend = get_backend() + backend.add_or_update(title_match) + + response = self.client.get("/api/documents/?title_search=pass") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], title_match.id) + + response = self.client.get("/api/documents/?title_search=sswo") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], title_match.id) + + response = self.client.get("/api/documents/?title_search=sswo hand") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["count"], 1) + self.assertEqual(response.data["results"][0]["id"], title_match.id) + def test_search_returns_all_for_api_version_9(self) -> None: d1 = Document.objects.create( title="invoice",