mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-30 17:24:22 +00:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| be882b8bcb |
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -48,9 +48,6 @@ _LANGUAGE_MAP: dict[str, str] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||||
# Document.title is max_length=128, so use 129 as the limit for
|
|
||||||
# Tantivy's remove_long filter
|
|
||||||
_TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129
|
|
||||||
|
|
||||||
|
|
||||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||||
@@ -80,10 +77,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
||||||
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]"""
|
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
|
||||||
builder = (
|
builder = (
|
||||||
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
||||||
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
.filter(tantivy.Filter.remove_long(65))
|
||||||
.filter(tantivy.Filter.lowercase())
|
.filter(tantivy.Filter.lowercase())
|
||||||
.filter(tantivy.Filter.ascii_fold())
|
.filter(tantivy.Filter.ascii_fold())
|
||||||
)
|
)
|
||||||
@@ -122,12 +119,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
|
|||||||
|
|
||||||
|
|
||||||
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
||||||
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold."""
|
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold."""
|
||||||
return (
|
return (
|
||||||
tantivy.TextAnalyzerBuilder(
|
tantivy.TextAnalyzerBuilder(
|
||||||
tantivy.Tokenizer.regex(r"\S+"),
|
tantivy.Tokenizer.regex(r"\S+"),
|
||||||
)
|
)
|
||||||
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
.filter(tantivy.Filter.remove_long(65))
|
||||||
.filter(tantivy.Filter.lowercase())
|
.filter(tantivy.Filter.lowercase())
|
||||||
.filter(tantivy.Filter.ascii_fold())
|
.filter(tantivy.Filter.ascii_fold())
|
||||||
.build()
|
.build()
|
||||||
|
|||||||
@@ -261,36 +261,6 @@ class TestSearch:
|
|||||||
== 1
|
== 1
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("search_mode", "query"),
|
|
||||||
[
|
|
||||||
pytest.param(SearchMode.TITLE, "12345", id="title_search"),
|
|
||||||
pytest.param(SearchMode.TEXT, "12345", id="text_search"),
|
|
||||||
pytest.param(SearchMode.QUERY, None, id="query_title_exact"),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_search_modes_match_model_limit_title_tokens(
|
|
||||||
self,
|
|
||||||
backend: TantivyBackend,
|
|
||||||
search_mode: SearchMode,
|
|
||||||
query: str | None,
|
|
||||||
) -> None:
|
|
||||||
"""Search must keep filename-like title tokens up to the model limit."""
|
|
||||||
long_title = "1234567890" * 12 + "12345678"
|
|
||||||
doc = Document.objects.create(
|
|
||||||
title=long_title,
|
|
||||||
content="ordinary content",
|
|
||||||
checksum="TXT12",
|
|
||||||
pk=18,
|
|
||||||
)
|
|
||||||
backend.add_or_update(doc)
|
|
||||||
|
|
||||||
assert backend.search_ids(
|
|
||||||
query or f"title:{long_title}",
|
|
||||||
user=None,
|
|
||||||
search_mode=search_mode,
|
|
||||||
) == [doc.pk]
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mode", "title", "content", "hits", "misses"),
|
("mode", "title", "content", "hits", "misses"),
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -99,25 +99,6 @@ class TestTokenizers:
|
|||||||
)
|
)
|
||||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
||||||
|
|
||||||
def test_simple_search_analyzer_supports_model_limit_token_substrings(
|
|
||||||
self,
|
|
||||||
simple_search_index: tantivy.Index,
|
|
||||||
) -> None:
|
|
||||||
"""Simple substring search keeps tokens up to Document.title's model limit."""
|
|
||||||
long_token = "abcdefghij" * 12 + "abcdefgh"
|
|
||||||
writer = simple_search_index.writer()
|
|
||||||
doc = tantivy.Document()
|
|
||||||
doc.add_text("simple_content", long_token)
|
|
||||||
writer.add_document(doc)
|
|
||||||
writer.commit()
|
|
||||||
simple_search_index.reload()
|
|
||||||
q = tantivy.Query.regex_query(
|
|
||||||
simple_search_index.schema,
|
|
||||||
"simple_content",
|
|
||||||
".*cdefg.*",
|
|
||||||
)
|
|
||||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
|
||||||
|
|
||||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||||
sb = tantivy.SchemaBuilder()
|
sb = tantivy.SchemaBuilder()
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user