diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 2ccd8c86a..5c5930c1e 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -403,6 +403,7 @@ class TantivyBackend: doc.add_text("title", document.title) doc.add_text("title_sort", document.title) doc.add_text("simple_title", document.title) + doc.add_text("bigram_title", document.title) doc.add_text("content", content) doc.add_text("bigram_content", content) doc.add_text("simple_content", content) @@ -415,12 +416,14 @@ class TantivyBackend: if document.correspondent: doc.add_text("correspondent", document.correspondent.name) doc.add_text("correspondent_sort", document.correspondent.name) + doc.add_text("bigram_correspondent", document.correspondent.name) doc.add_unsigned("correspondent_id", document.correspondent_id) # Document type if document.document_type: doc.add_text("document_type", document.document_type.name) doc.add_text("type_sort", document.document_type.name) + doc.add_text("bigram_document_type", document.document_type.name) doc.add_unsigned("document_type_id", document.document_type_id) # Storage path @@ -432,6 +435,7 @@ class TantivyBackend: tag_names: list[str] = [] for tag in document.tags.all(): doc.add_text("tag", tag.name) + doc.add_text("bigram_tag", tag.name) doc.add_unsigned("tag_id", tag.pk) tag_names.append(tag.name) diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index dbd1f8ec2..fef248253 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -84,6 +84,38 @@ _SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+") # In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator. _SPACED_OPERATOR_RE = regex.compile(r"\s+[-+]\s+") _TRAILING_OPERATOR_RE = regex.compile(r"\s+[-+]+\s*$") +# Matches CJK/Hangul characters so queries can be routed to bigram fields. +# Uses Unicode properties to cover all blocks including Extension B+ planes. +_CJK_RE: Final = regex.compile(r"[\p{Han}\p{Hiragana}\p{Katakana}\p{Hangul}]+") + + +def _has_cjk(text: str) -> bool: + """Return True if text contains any CJK characters.""" + return bool(_CJK_RE.search(text)) + + +def _build_cjk_query( + index: tantivy.Index, + raw_query: str, + fields: list[str], +) -> tantivy.Query | None: + """Build a bigram-field query from the CJK runs in ``raw_query``. + + Only the CJK character runs are extracted and parsed; ASCII field prefixes, + boolean operators and date keywords are discarded. This keeps the CJK clause + plain-text and consistent across query/simple modes (no leaked ``field:`` + semantics, no parse failures from spaced ``-``/``+``), and avoids feeding + Latin tokens into the character-bigram matcher (which would produce spurious + matches against unrelated Latin text). Returns None when there is no CJK + text or the parse fails. + """ + cjk_text = " ".join(_CJK_RE.findall(raw_query)) + if not cjk_text: + return None + try: + return index.parse_query(cjk_text, fields) + except Exception: + return None def _fmt(dt: datetime) -> str: @@ -491,6 +523,15 @@ DEFAULT_SEARCH_FIELDS = [ ] SIMPLE_SEARCH_FIELDS = ["simple_title", "simple_content"] TITLE_SEARCH_FIELDS = ["simple_title"] +_CJK_ALL_FIELDS: Final[list[str]] = [ + "bigram_content", + "bigram_title", + "bigram_correspondent", + "bigram_document_type", + "bigram_tag", +] +_CJK_CONTENT_FIELDS: Final[list[str]] = ["bigram_content"] +_CJK_TITLE_FIELDS: Final[list[str]] = ["bigram_title"] _FIELD_BOOSTS = {"title": 2.0} _SIMPLE_FIELD_BOOSTS = {"simple_title": 2.0} @@ -568,6 +609,19 @@ def parse_user_query( field_boosts=_FIELD_BOOSTS, ) + # CJK characters are stripped by ascii_fold in the standard tokenizer, so + # they would never match content/title. Route CJK queries to the bigram + # fields, which use an ngram tokenizer that preserves non-ASCII text. + cjk_query = ( + _build_cjk_query(index, raw_query, _CJK_ALL_FIELDS) + if _has_cjk(raw_query) + else None + ) + + clauses: list[tuple[tantivy.Occur, tantivy.Query]] = [ + (tantivy.Occur.Should, exact), + ] + threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD if threshold is not None: fuzzy = index.parse_query( @@ -577,38 +631,50 @@ def parse_user_query( # (prefix=True, distance=1, transposition_cost_one=True) — edit-distance fuzziness fuzzy_fields={f: (True, 1, True) for f in DEFAULT_SEARCH_FIELDS}, ) - return tantivy.Query.boolean_query( - [ - (tantivy.Occur.Should, exact), - # 0.1 boost keeps fuzzy hits ranked below exact matches (intentional) - (tantivy.Occur.Should, tantivy.Query.boost_query(fuzzy, 0.1)), - ], - ) + # 0.1 boost keeps fuzzy hits ranked below exact matches (intentional) + clauses.append((tantivy.Occur.Should, tantivy.Query.boost_query(fuzzy, 0.1))) - return exact + if cjk_query is not None: + clauses.append((tantivy.Occur.Should, cjk_query)) + + if len(clauses) == 1: + return exact + return tantivy.Query.boolean_query(clauses) def parse_simple_query( index: tantivy.Index, raw_query: str, fields: list[str], + cjk_fields: list[str] | None = None, ) -> tantivy.Query: """ Parse a plain-text query using Tantivy over a restricted field set. Query string is escaped and normalized to be treated as "simple" text query. + When cjk_fields is provided and the query contains CJK characters, an + additional Should clause searches those bigram-tokenized fields so that + CJK text is not silently dropped by ascii_fold. """ tokens = _simple_query_tokens(raw_query) - if not tokens: - return tantivy.Query.empty_query() - field_queries = [ - (tantivy.Occur.Should, _build_simple_field_query(index, field, tokens)) - for field in fields - ] - if len(field_queries) == 1: - return field_queries[0][1] - return tantivy.Query.boolean_query(field_queries) + clauses: list[tuple[tantivy.Occur, tantivy.Query]] = [] + if tokens: + clauses = [ + (tantivy.Occur.Should, _build_simple_field_query(index, field, tokens)) + for field in fields + ] + + if cjk_fields and _has_cjk(raw_query): + cjk_q = _build_cjk_query(index, raw_query, cjk_fields) + if cjk_q is not None: + clauses.append((tantivy.Occur.Should, cjk_q)) + + if not clauses: + return tantivy.Query.empty_query() + if len(clauses) == 1: + return clauses[0][1] + return tantivy.Query.boolean_query(clauses) def parse_simple_text_highlight_query( @@ -640,7 +706,12 @@ def parse_simple_text_query( Parse a plain-text query over title/content for simple search inputs. """ - return parse_simple_query(index, raw_query, SIMPLE_SEARCH_FIELDS) + return parse_simple_query( + index, + raw_query, + SIMPLE_SEARCH_FIELDS, + cjk_fields=_CJK_CONTENT_FIELDS, + ) def parse_simple_title_query( @@ -651,4 +722,9 @@ def parse_simple_title_query( Parse a plain-text query over the title field only. """ - return parse_simple_query(index, raw_query, TITLE_SEARCH_FIELDS) + return parse_simple_query( + index, + raw_query, + TITLE_SEARCH_FIELDS, + cjk_fields=_CJK_TITLE_FIELDS, + ) diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index 1cab8f4c4..36ac83356 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -56,6 +56,18 @@ def build_schema() -> tantivy.Schema: # CJK support - not stored, indexed only sb.add_text_field("bigram_content", stored=False, tokenizer_name="bigram_analyzer") + sb.add_text_field("bigram_title", stored=False, tokenizer_name="bigram_analyzer") + sb.add_text_field( + "bigram_correspondent", + stored=False, + tokenizer_name="bigram_analyzer", + ) + sb.add_text_field( + "bigram_document_type", + stored=False, + tokenizer_name="bigram_analyzer", + ) + sb.add_text_field("bigram_tag", stored=False, tokenizer_name="bigram_analyzer") # Simple substring search support for title/content - not stored, indexed only sb.add_text_field( diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index 08057a936..b8be780d6 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -9,6 +9,10 @@ from documents.search._backend import SearchMode from documents.search._backend import TantivyBackend from documents.search._backend import get_backend from documents.search._backend import reset_backend +from documents.tests.factories import CorrespondentFactory +from documents.tests.factories import DocumentFactory +from documents.tests.factories import DocumentTypeFactory +from documents.tests.factories import TagFactory pytestmark = [pytest.mark.search, pytest.mark.django_db] @@ -214,6 +218,130 @@ class TestSearch: == 1 ) + @pytest.mark.parametrize( + ("mode", "title", "content", "hits", "misses"), + [ + pytest.param( + SearchMode.QUERY, + "CJK document", + "東京都の人口は約1400万人です", + ["東京", "人口"], + ["大阪"], + id="query_mode_cjk_content", + ), + pytest.param( + SearchMode.TEXT, + "CJK document", + "東京都の人口は約1400万人です", + ["東京"], + ["大阪"], + id="text_mode_cjk_content", + ), + pytest.param( + SearchMode.TITLE, + "東京都の報告書", + "This document is about Tokyo.", + ["東京", "報告"], + ["大阪"], + id="title_mode_cjk_title", + ), + ], + ) + def test_cjk_search_finds_matching_documents( + self, + backend: TantivyBackend, + mode: SearchMode, + title: str, + content: str, + hits: list[str], + misses: list[str], + ) -> None: + """CJK queries must match documents via bigram fields in all three search modes.""" + doc = DocumentFactory(title=title, content=content) + backend.add_or_update(doc) + + for query in hits: + assert len(backend.search_ids(query, user=None, search_mode=mode)) == 1, ( + f"Expected {query!r} to match in {mode} mode" + ) + for query in misses: + assert len(backend.search_ids(query, user=None, search_mode=mode)) == 0, ( + f"Expected {query!r} not to match in {mode} mode" + ) + + def test_title_mode_cjk_does_not_match_content_only( + self, + backend: TantivyBackend, + ) -> None: + """Title-only CJK search must not return docs where CJK appears only in content.""" + doc = DocumentFactory( + title="Tokyo report", + content="東京都の人口は約1400万人です", + ) + backend.add_or_update(doc) + + assert ( + len(backend.search_ids("東京", user=None, search_mode=SearchMode.TITLE)) + == 0 + ) + + @pytest.mark.parametrize( + ("field", "query", "miss"), + [ + pytest.param("correspondent", "東京", "大阪", id="cjk_correspondent"), + pytest.param("document_type", "請求書", "領収書", id="cjk_document_type"), + pytest.param("tag", "重要", "普通", id="cjk_tag"), + ], + ) + def test_cjk_metadata_search_via_query_mode( + self, + backend: TantivyBackend, + field: str, + query: str, + miss: str, + ) -> None: + """CJK in correspondent/document_type/tag names must be searchable via global search.""" + if field == "correspondent": + doc = DocumentFactory(correspondent=CorrespondentFactory(name=query)) + elif field == "document_type": + doc = DocumentFactory(document_type=DocumentTypeFactory(name=query)) + else: + tag = TagFactory(name=query) + doc = DocumentFactory() + doc.tags.add(tag) + backend.add_or_update(doc) + + assert ( + len(backend.search_ids(query, user=None, search_mode=SearchMode.QUERY)) == 1 + ), f"Expected CJK {field} name {query!r} to match" + assert ( + len(backend.search_ids(miss, user=None, search_mode=SearchMode.QUERY)) == 0 + ), f"Expected {miss!r} not to match" + + def test_cjk_text_mode_does_not_leak_field_query_semantics( + self, + backend: TantivyBackend, + ) -> None: + """TEXT mode is plain-text over content: a 'field:CJK' input must not be + parsed as a structured query against that field. A doc tagged 重要 with + no 重要 in its content must NOT match the TEXT-mode query 'tag:重要'.""" + tag = TagFactory(name="重要") + doc = DocumentFactory(title="report", content="just english content") + doc.tags.add(tag) + backend.add_or_update(doc) + + assert ( + len(backend.search_ids("tag:重要", user=None, search_mode=SearchMode.TEXT)) + == 0 + ) + # Sanity: the CJK run still matches when it is actually in the content. + doc2 = DocumentFactory(title="report2", content="本文に重要な情報") + backend.add_or_update(doc2) + assert ( + len(backend.search_ids("tag:重要", user=None, search_mode=SearchMode.TEXT)) + == 1 + ) + def test_sort_field_ascending(self, backend: TantivyBackend) -> None: """Searching with sort_reverse=False must return results in ascending ASN order.""" for asn in [30, 10, 20]: