mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-02 22:28:51 +00:00
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: Antoine Mérino <3023499+Merinorus@users.noreply.github.com>
79 lines
2.9 KiB
Python
79 lines
2.9 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import TYPE_CHECKING
|
|
|
|
import pytest
|
|
import tantivy
|
|
|
|
from documents.search._tokenizer import _bigram_analyzer
|
|
from documents.search._tokenizer import _paperless_text
|
|
from documents.search._tokenizer import register_tokenizers
|
|
|
|
if TYPE_CHECKING:
|
|
from _pytest.logging import LogCaptureFixture
|
|
|
|
# Module-level pytest mark: tags every test collected from this file with ``search``.
pytestmark = pytest.mark.search
|
|
|
|
|
|
class TestTokenizers:
    """Tests for the custom tokenizers, each run against a small throwaway index."""

    @staticmethod
    def _new_index(field: str, tokenizer: str, *, stored: bool) -> tantivy.Index:
        """Create an index (``path=None``) with one text field bound to ``tokenizer``.

        No tokenizer is registered here; callers register whichever
        implementation they want under the ``tokenizer`` name.
        """
        builder = tantivy.SchemaBuilder()
        builder.add_text_field(field, stored=stored, tokenizer_name=tokenizer)
        return tantivy.Index(builder.build(), path=None)

    @staticmethod
    def _index_text(index: tantivy.Index, field: str, text: str) -> None:
        """Write a single document containing ``text`` in ``field``, commit, and reload."""
        index_writer = index.writer()
        document = tantivy.Document()
        document.add_text(field, text)
        index_writer.add_document(document)
        index_writer.commit()
        # Reload so that searchers created afterwards see the committed document.
        index.reload()

    @pytest.fixture
    def content_index(self) -> tantivy.Index:
        """Provide an index with a stored ``content`` field using ``paperless_text``."""
        index = self._new_index("content", "paperless_text", stored=True)
        index.register_tokenizer("paperless_text", _paperless_text(""))
        return index

    @pytest.fixture
    def bigram_index(self) -> tantivy.Index:
        """Provide an index with an unstored ``bigram_content`` field using ``bigram_analyzer``."""
        index = self._new_index("bigram_content", "bigram_analyzer", stored=False)
        index.register_tokenizer("bigram_analyzer", _bigram_analyzer())
        return index

    def test_ascii_fold_finds_accented_content(
        self,
        content_index: tantivy.Index,
    ) -> None:
        """A document containing accented words is found by an unaccented ASCII query."""
        self._index_text(content_index, "content", "café résumé")

        query = content_index.parse_query("cafe resume", ["content"])
        hits = content_index.searcher().search(query, limit=5)

        assert hits.count == 1

    def test_bigram_finds_cjk_substring(self, bigram_index: tantivy.Index) -> None:
        """A two-character CJK query matches a longer indexed string that contains it."""
        self._index_text(bigram_index, "bigram_content", "東京都")

        query = bigram_index.parse_query("東京", ["bigram_content"])
        hits = bigram_index.searcher().search(query, limit=5)

        assert hits.count == 1

    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
        """Registering tokenizers for an unknown language code logs a message naming it."""
        index = self._new_index("content", "paperless_text", stored=True)

        # Capture WARNING-and-above records from the search logger only.
        with caplog.at_level(logging.WARNING, logger="paperless.search"):
            register_tokenizers(index, "klingon")

        captured = caplog.text
        assert "klingon" in captured
|