diff --git a/src/documents/search/_tokenizer.py b/src/documents/search/_tokenizer.py index 207a876a1..6879a66af 100644 --- a/src/documents/search/_tokenizer.py +++ b/src/documents/search/_tokenizer.py @@ -48,6 +48,9 @@ _LANGUAGE_MAP: dict[str, str] = { } SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP) +# Document.title is max_length=128, so use 129 as the limit for +# Tantivy's remove_long filter +_TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129 def register_tokenizers(index: tantivy.Index, language: str | None) -> None: @@ -77,10 +80,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None: def _paperless_text(language: str | None) -> tantivy.TextAnalyzer: - """Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]""" + """Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]""" builder = ( tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple()) - .filter(tantivy.Filter.remove_long(65)) + .filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT)) .filter(tantivy.Filter.lowercase()) .filter(tantivy.Filter.ascii_fold()) ) @@ -119,12 +122,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer: def _simple_search_analyzer() -> tantivy.TextAnalyzer: - """Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold.""" + """Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold.""" return ( tantivy.TextAnalyzerBuilder( tantivy.Tokenizer.regex(r"\S+"), ) - .filter(tantivy.Filter.remove_long(65)) + .filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT)) .filter(tantivy.Filter.lowercase()) .filter(tantivy.Filter.ascii_fold()) .build() diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index d9fa30715..5c804439b 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -261,6 +261,36 @@ class TestSearch: == 1 ) + @pytest.mark.parametrize( + ("search_mode", "query"), + [ + pytest.param(SearchMode.TITLE, "12345", id="title_search"), + pytest.param(SearchMode.TEXT, "12345", id="text_search"), + pytest.param(SearchMode.QUERY, None, id="query_title_exact"), + ], + ) + def test_search_modes_match_model_limit_title_tokens( + self, + backend: TantivyBackend, + search_mode: SearchMode, + query: str | None, + ) -> None: + """Search must keep filename-like title tokens up to the model limit.""" + long_title = "1234567890" * 12 + "12345678" + doc = Document.objects.create( + title=long_title, + content="ordinary content", + checksum="TXT12", + pk=18, + ) + backend.add_or_update(doc) + + assert backend.search_ids( + query or f"title:{long_title}", + user=None, + search_mode=search_mode, + ) == [doc.pk] + @pytest.mark.parametrize( ("mode", "title", "content", "hits", "misses"), [ diff --git a/src/documents/tests/search/test_tokenizer.py b/src/documents/tests/search/test_tokenizer.py index fc2c41231..bd03cdf09 100644 --- a/src/documents/tests/search/test_tokenizer.py +++ b/src/documents/tests/search/test_tokenizer.py @@ -99,6 +99,25 @@ class TestTokenizers: ) assert simple_search_index.searcher().search(q, limit=5).count == 1 + def test_simple_search_analyzer_supports_model_limit_token_substrings( + self, + simple_search_index: tantivy.Index, + ) -> None: + """Simple substring search keeps tokens up to Document.title's model limit.""" + long_token = "abcdefghij" * 12 + "abcdefgh" + writer = simple_search_index.writer() + doc = tantivy.Document() + doc.add_text("simple_content", long_token) + writer.add_document(doc) + writer.commit() + simple_search_index.reload() + q = tantivy.Query.regex_query( + simple_search_index.schema, + "simple_content", + ".*cdefg.*", + ) + assert simple_search_index.searcher().search(q, limit=5).count == 1 + def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None: """Unsupported language codes should log a warning and disable stemming gracefully.""" sb = tantivy.SchemaBuilder()