mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-02 18:24:17 +00:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| da381ab9aa |
@@ -46,16 +46,16 @@ repos:
|
|||||||
- ts
|
- ts
|
||||||
- markdown
|
- markdown
|
||||||
additional_dependencies:
|
additional_dependencies:
|
||||||
- prettier@3.8.3
|
- prettier@3.8.4
|
||||||
- 'prettier-plugin-organize-imports@4.3.0'
|
- 'prettier-plugin-organize-imports@4.3.0'
|
||||||
# Python hooks
|
# Python hooks
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: v0.15.17
|
rev: v0.15.19
|
||||||
hooks:
|
hooks:
|
||||||
- id: ruff-check
|
- id: ruff-check
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
- repo: https://github.com/tox-dev/pyproject-fmt
|
- repo: https://github.com/tox-dev/pyproject-fmt
|
||||||
rev: "v2.24.1"
|
rev: "v2.25.0"
|
||||||
hooks:
|
hooks:
|
||||||
- id: pyproject-fmt
|
- id: pyproject-fmt
|
||||||
additional_dependencies: [tomli]
|
additional_dependencies: [tomli]
|
||||||
|
|||||||
@@ -48,9 +48,6 @@ _LANGUAGE_MAP: dict[str, str] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||||
# Document.title is max_length=128, so use 129 as the limit for
|
|
||||||
# Tantivy's remove_long filter
|
|
||||||
_TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129
|
|
||||||
|
|
||||||
|
|
||||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||||
@@ -80,10 +77,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
||||||
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]"""
|
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
|
||||||
builder = (
|
builder = (
|
||||||
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
||||||
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
.filter(tantivy.Filter.remove_long(65))
|
||||||
.filter(tantivy.Filter.lowercase())
|
.filter(tantivy.Filter.lowercase())
|
||||||
.filter(tantivy.Filter.ascii_fold())
|
.filter(tantivy.Filter.ascii_fold())
|
||||||
)
|
)
|
||||||
@@ -122,12 +119,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
|
|||||||
|
|
||||||
|
|
||||||
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
||||||
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold."""
|
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold."""
|
||||||
return (
|
return (
|
||||||
tantivy.TextAnalyzerBuilder(
|
tantivy.TextAnalyzerBuilder(
|
||||||
tantivy.Tokenizer.regex(r"\S+"),
|
tantivy.Tokenizer.regex(r"\S+"),
|
||||||
)
|
)
|
||||||
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
.filter(tantivy.Filter.remove_long(65))
|
||||||
.filter(tantivy.Filter.lowercase())
|
.filter(tantivy.Filter.lowercase())
|
||||||
.filter(tantivy.Filter.ascii_fold())
|
.filter(tantivy.Filter.ascii_fold())
|
||||||
.build()
|
.build()
|
||||||
|
|||||||
@@ -261,36 +261,6 @@ class TestSearch:
|
|||||||
== 1
|
== 1
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("search_mode", "query"),
|
|
||||||
[
|
|
||||||
pytest.param(SearchMode.TITLE, "12345", id="title_search"),
|
|
||||||
pytest.param(SearchMode.TEXT, "12345", id="text_search"),
|
|
||||||
pytest.param(SearchMode.QUERY, None, id="query_title_exact"),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_search_modes_match_model_limit_title_tokens(
|
|
||||||
self,
|
|
||||||
backend: TantivyBackend,
|
|
||||||
search_mode: SearchMode,
|
|
||||||
query: str | None,
|
|
||||||
) -> None:
|
|
||||||
"""Search must keep filename-like title tokens up to the model limit."""
|
|
||||||
long_title = "1234567890" * 12 + "12345678"
|
|
||||||
doc = Document.objects.create(
|
|
||||||
title=long_title,
|
|
||||||
content="ordinary content",
|
|
||||||
checksum="TXT12",
|
|
||||||
pk=18,
|
|
||||||
)
|
|
||||||
backend.add_or_update(doc)
|
|
||||||
|
|
||||||
assert backend.search_ids(
|
|
||||||
query or f"title:{long_title}",
|
|
||||||
user=None,
|
|
||||||
search_mode=search_mode,
|
|
||||||
) == [doc.pk]
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mode", "title", "content", "hits", "misses"),
|
("mode", "title", "content", "hits", "misses"),
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -99,25 +99,6 @@ class TestTokenizers:
|
|||||||
)
|
)
|
||||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
||||||
|
|
||||||
def test_simple_search_analyzer_supports_model_limit_token_substrings(
|
|
||||||
self,
|
|
||||||
simple_search_index: tantivy.Index,
|
|
||||||
) -> None:
|
|
||||||
"""Simple substring search keeps tokens up to Document.title's model limit."""
|
|
||||||
long_token = "abcdefghij" * 12 + "abcdefgh"
|
|
||||||
writer = simple_search_index.writer()
|
|
||||||
doc = tantivy.Document()
|
|
||||||
doc.add_text("simple_content", long_token)
|
|
||||||
writer.add_document(doc)
|
|
||||||
writer.commit()
|
|
||||||
simple_search_index.reload()
|
|
||||||
q = tantivy.Query.regex_query(
|
|
||||||
simple_search_index.schema,
|
|
||||||
"simple_content",
|
|
||||||
".*cdefg.*",
|
|
||||||
)
|
|
||||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
|
||||||
|
|
||||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||||
sb = tantivy.SchemaBuilder()
|
sb = tantivy.SchemaBuilder()
|
||||||
|
|||||||
Reference in New Issue
Block a user