Switch simple substring search to simple_search analyzer

2026-04-01 22:02:44 +00:00 · 2026-04-01 12:22:16 -07:00
parent 7c98d29de2
commit 3539f3f66a
7 changed files with 291 additions and 17 deletions
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -303,8 +303,10 @@ class TantivyBackend:
        doc.add_text("checksum", document.checksum)
        doc.add_text("title", document.title)
        doc.add_text("title_sort", document.title)
+        doc.add_text("simple_title", document.title)
        doc.add_text("content", content)
        doc.add_text("bigram_content", content)
+        doc.add_text("simple_content", content)

        # Original filename - only add if not None/empty
        if document.original_filename:
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import unicodedata
 from datetime import UTC
 from datetime import date
 from datetime import datetime
@@ -51,7 +52,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
 )
 # Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
 _DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
-_SIMPLE_QUERY_SPECIAL_CHARS_RE = regex.compile(r'([+\-!(){}\[\]^"~*?:\\/])')
+_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")


 def _fmt(dt: datetime) -> str:
@@ -439,9 +440,38 @@ DEFAULT_SEARCH_FIELDS = [
    "note",  # companion text field for notes content (notes JSON for structured: notes.user:x)
    "custom_field",  # companion text field for CF values (custom_fields JSON for structured: custom_fields.name:x)
 ]
-SIMPLE_SEARCH_FIELDS = ["title", "content"]
-TITLE_SEARCH_FIELDS = ["title"]
+SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"]
+TITLE_SEARCH_FIELDS = ["simple_title"]
 _FIELD_BOOSTS = {"title": 2.0}
+_SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0}
+
+
+def _normalize_simple_token(token: str) -> str:
+    """Lowercase and ASCII-fold a query token; unfoldable characters are kept."""
+    folded = []
+    for char in unicodedata.normalize("NFC", token.lower()):
+        base = unicodedata.normalize("NFD", char).encode("ascii", "ignore")
+        # Keep characters with no ASCII base (CJK, Cyrillic, ...) instead of
+        # erasing them, so non-Latin queries do not collapse to an empty token.
+        folded.append(base.decode() or char)
+    return "".join(folded)
+
+
+def _build_simple_field_query(
+    index: tantivy.Index,
+    field: str,
+    tokens: list[str],
+) -> tantivy.Query:
+    """Match ``tokens`` as substrings of ``field`` (regex, or regex phrase if several)."""
+    substrings = [f".*{regex.escape(tok)}.*" for tok in tokens]
+    field_query = (
+        tantivy.Query.regex_query(index.schema, field, substrings[0])
+        if len(substrings) == 1
+        else tantivy.Query.regex_phrase_query(index.schema, field, substrings)
+    )
+    if (weight := _SIMPLE_FIELD_BOOSTS.get(field, 1.0)) == 1.0:
+        return field_query
+    return tantivy.Query.boost_query(field_query, weight)


 def parse_user_query(
@@ -512,20 +542,21 @@ def parse_simple_query(

    Query string is escaped and normalized to be treated as "simple" text query.
    """
-    # strips special characters that would be interpreted as syntax by the parser
-    query_str = regex.sub(
-        _SIMPLE_QUERY_SPECIAL_CHARS_RE,
-        r"\\\1",
-        raw_query,
-        timeout=_REGEX_TIMEOUT,
-    )
-    # collapse multiple spaces to a single space for cleaner parsing (and to prevent ReDoS on excessive whitespace)
-    query_str = regex.sub(r" {2,}", " ", query_str, timeout=_REGEX_TIMEOUT).strip()
-    return index.parse_query(
-        query_str,
-        fields,
-        field_boosts=_FIELD_BOOSTS,
-    )
+    tokens = [
+        _normalize_simple_token(token)
+        for token in _SIMPLE_QUERY_TOKEN_RE.findall(raw_query, timeout=_REGEX_TIMEOUT)
+    ]
+    tokens = [token for token in tokens if token]
+    if not tokens:
+        return tantivy.Query.empty_query()
+
+    field_queries = [
+        (tantivy.Occur.Should, _build_simple_field_query(index, field, tokens))
+        for field in fields
+    ]
+    if len(field_queries) == 1:
+        return field_queries[0][1]
+    return tantivy.Query.boolean_query(field_queries)


 def parse_simple_text_query(
--- a/src/documents/search/_schema.py
+++ b/src/documents/search/_schema.py
@@ -53,6 +53,18 @@ def build_schema() -> tantivy.Schema:
    # CJK support - not stored, indexed only
    sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer")

+    # Simple substring search support for title/content - not stored, indexed only
+    sb.add_text_field(
+        "simple_title",
+        stored=False,
+        tokenizer_name="simple_search_analyzer",
+    )
+    sb.add_text_field(
+        "simple_content",
+        stored=False,
+        tokenizer_name="simple_search_analyzer",
+    )
+
    # Autocomplete prefix scan - stored, not indexed
    sb.add_text_field("autocomplete_word", stored=True, tokenizer_name="raw")

--- a/src/documents/search/_tokenizer.py
+++ b/src/documents/search/_tokenizer.py
@@ -70,6 +70,7 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
    index.register_tokenizer("paperless_text", _paperless_text(language))
    index.register_tokenizer("simple_analyzer", _simple_analyzer())
    index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
+    index.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
    # Fast-field tokenizer required for fast=True text fields in the schema
    index.register_fast_field_tokenizer("simple_analyzer", _simple_analyzer())

@@ -114,3 +115,15 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
        .filter(tantivy.Filter.lowercase())
        .build()
    )
+
+
+def _simple_search_analyzer() -> tantivy.TextAnalyzer:
+    """Analyzer for simple-search fields: whitespace chunks, lowercased, ASCII-folded."""
+    # Each run of non-whitespace characters is one token, so punctuation
+    # inside a chunk stays part of that token.
+    chunker = tantivy.Tokenizer.regex(r"\S+")
+    builder = tantivy.TextAnalyzerBuilder(chunker)
+    # Filters are added in this order: lowercase, then ascii_fold.
+    builder = builder.filter(tantivy.Filter.lowercase())
+    builder = builder.filter(tantivy.Filter.ascii_fold())
+    return builder.build()
--- a/src/documents/tests/search/test_backend.py
+++ b/src/documents/tests/search/test_backend.py
@@ -117,6 +117,122 @@ class TestSearch:
        )
        assert title_match.total == 1

+    def test_text_mode_matches_partial_term_substrings(
+        self,
+        backend: TantivyBackend,
+    ):
+        """Simple text mode should match prefix, infix and multi-token substrings."""
+        doc = Document.objects.create(
+            title="Account access",
+            content="password reset instructions",
+            checksum="TXT3",
+            pk=11,
+        )
+        backend.add_or_update(doc)
+
+        prefix_match = backend.search(
+            "pass",  # prefix of the content token "password"
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TEXT,
+        )
+        assert prefix_match.total == 1
+
+        infix_match = backend.search(
+            "sswo",  # infix of "password"
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TEXT,
+        )
+        assert infix_match.total == 1
+
+        phrase_match = backend.search(
+            "sswo re",  # fragments of the adjacent tokens "password" and "reset"
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TEXT,
+        )
+        assert phrase_match.total == 1
+
+    def test_text_mode_does_not_match_on_partial_term_overlap(
+        self,
+        backend: TantivyBackend,
+    ):
+        """Simple text mode should not match documents that merely share partial fragments."""
+        doc = Document.objects.create(
+            title="Adobe Acrobat PDF Files",
+            content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+            checksum="TXT7",
+            pk=13,
+        )
+        backend.add_or_update(doc)
+
+        non_match = backend.search(
+            "raptor",  # not a substring of any title or content token
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TEXT,
+        )
+        assert non_match.total == 0
+
+    def test_title_mode_matches_partial_term_substrings(
+        self,
+        backend: TantivyBackend,
+    ):
+        """Title mode should match prefix, infix and multi-token substrings of the title."""
+        doc = Document.objects.create(
+            title="Password guide",
+            content="reset instructions",
+            checksum="TXT4",
+            pk=12,
+        )
+        backend.add_or_update(doc)
+
+        prefix_match = backend.search(
+            "pass",  # prefix of the title token "Password"
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TITLE,
+        )
+        assert prefix_match.total == 1
+
+        infix_match = backend.search(
+            "sswo",  # infix of "Password"
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TITLE,
+        )
+        assert infix_match.total == 1
+
+        phrase_match = backend.search(
+            "sswo gu",  # fragments of the adjacent tokens "Password" and "guide"
+            user=None,
+            page=1,
+            page_size=10,
+            sort_field=None,
+            sort_reverse=False,
+            search_mode=SearchMode.TITLE,
+        )
+        assert phrase_match.total == 1
+
    def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
        """Search scores must be normalized so top hit has score 1.0 for UI consistency."""
        for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
--- a/src/documents/tests/search/test_tokenizer.py
+++ b/src/documents/tests/search/test_tokenizer.py
@@ -8,6 +8,7 @@ import tantivy

 from documents.search._tokenizer import _bigram_analyzer
 from documents.search._tokenizer import _paperless_text
+from documents.search._tokenizer import _simple_search_analyzer
 from documents.search._tokenizer import register_tokenizers

 if TYPE_CHECKING:
@@ -41,6 +42,20 @@ class TestTokenizers:
        idx.register_tokenizer("bigram_analyzer", _bigram_analyzer())
        return idx

+    @pytest.fixture
+    def simple_search_index(self) -> tantivy.Index:
+        """Index with one ``simple_content`` field using the simple-search analyzer."""
+        sb = tantivy.SchemaBuilder()
+        sb.add_text_field(
+            "simple_content",
+            stored=False,
+            tokenizer_name="simple_search_analyzer",  # registered on the index below
+        )
+        schema = sb.build()
+        idx = tantivy.Index(schema, path=None)
+        idx.register_tokenizer("simple_search_analyzer", _simple_search_analyzer())
+        return idx
+
    def test_ascii_fold_finds_accented_content(
        self,
        content_index: tantivy.Index,
@@ -66,6 +81,24 @@ class TestTokenizers:
        q = bigram_index.parse_query("東京", ["bigram_content"])
        assert bigram_index.searcher().search(q, limit=5).count == 1

+    def test_simple_search_analyzer_supports_regex_substrings(
+        self,
+        simple_search_index: tantivy.Index,
+    ) -> None:
+        """Whitespace-delimited simple search analyzer supports substring regex matching."""
+        writer = simple_search_index.writer()
+        doc = tantivy.Document()
+        doc.add_text("simple_content", "tag:invoice password-reset")  # two chunks
+        writer.add_document(doc)
+        writer.commit()
+        simple_search_index.reload()
+        q = tantivy.Query.regex_query(
+            simple_search_index.schema,
+            "simple_content",
+            ".*sswo.*",  # infix of the chunk "password-reset"
+        )
+        assert simple_search_index.searcher().search(q, limit=5).count == 1
+
    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
        """Unsupported language codes should log a warning and disable stemming gracefully."""
        sb = tantivy.SchemaBuilder()
--- a/src/documents/tests/test_api_search.py
+++ b/src/documents/tests/test_api_search.py
@@ -119,6 +119,47 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 0)

+    def test_simple_text_search_matches_substrings(self) -> None:
+        matching_doc = Document.objects.create(
+            title="Quarterly summary",
+            content="Password reset instructions",
+            checksum="T5",
+            pk=15,
+        )
+
+        backend = get_backend()
+        backend.add_or_update(matching_doc)
+
+        response = self.client.get("/api/documents/?text=pass")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["count"], 1)  # prefix of "Password"
+        self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
+
+        response = self.client.get("/api/documents/?text=sswo")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["count"], 1)  # infix of "Password"
+        self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
+
+        response = self.client.get("/api/documents/?text=sswo re")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["count"], 1)  # spans "Password reset"
+        self.assertEqual(response.data["results"][0]["id"], matching_doc.id)
+
+    def test_simple_text_search_does_not_match_on_partial_term_overlap(self) -> None:
+        non_matching_doc = Document.objects.create(
+            title="Adobe Acrobat PDF Files",
+            content="Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
+            checksum="T7",
+            pk=17,
+        )
+
+        backend = get_backend()
+        backend.add_or_update(non_matching_doc)
+
+        response = self.client.get("/api/documents/?text=raptor")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["count"], 0)  # no token contains "raptor"
+
    def test_simple_title_search(self) -> None:
        title_match = Document.objects.create(
            title="Quarterly summary",
@@ -142,6 +183,32 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
        self.assertEqual(response.data["count"], 1)
        self.assertEqual(response.data["results"][0]["id"], title_match.id)

+    def test_simple_title_search_matches_substrings(self) -> None:
+        title_match = Document.objects.create(
+            title="Password handbook",
+            content="No matching content here",
+            checksum="T6",
+            pk=16,
+        )
+
+        backend = get_backend()
+        backend.add_or_update(title_match)
+
+        response = self.client.get("/api/documents/?title_search=pass")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["count"], 1)  # prefix of "Password"
+        self.assertEqual(response.data["results"][0]["id"], title_match.id)
+
+        response = self.client.get("/api/documents/?title_search=sswo")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["count"], 1)  # infix of "Password"
+        self.assertEqual(response.data["results"][0]["id"], title_match.id)
+
+        response = self.client.get("/api/documents/?title_search=sswo hand")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["count"], 1)  # spans "Password handbook"
+        self.assertEqual(response.data["results"][0]["id"], title_match.id)
+
    def test_search_returns_all_for_api_version_9(self) -> None:
        d1 = Document.objects.create(
            title="invoice",