mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-02 18:24:17 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5b8fbdcec7 | |||
| beb048b94a | |||
| b33d11778a |
+1
-1
@@ -57,7 +57,7 @@ dependencies = [
|
|||||||
"llama-index-llms-ollama>=0.9.1",
|
"llama-index-llms-ollama>=0.9.1",
|
||||||
"llama-index-llms-openai-like>=0.7.1",
|
"llama-index-llms-openai-like>=0.7.1",
|
||||||
"nltk~=3.9.1",
|
"nltk~=3.9.1",
|
||||||
"ocrmypdf~=17.7.0",
|
"ocrmypdf~=17.4.2",
|
||||||
"openai>=2.32",
|
"openai>=2.32",
|
||||||
"pathvalidate~=3.3.1",
|
"pathvalidate~=3.3.1",
|
||||||
"pdf2image~=1.17.0",
|
"pdf2image~=1.17.0",
|
||||||
|
|||||||
@@ -48,6 +48,9 @@ _LANGUAGE_MAP: dict[str, str] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
|
||||||
|
# Document.title is max_length=128, so use 129 as the limit for
|
||||||
|
# Tantivy's remove_long filter
|
||||||
|
_TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129
|
||||||
|
|
||||||
|
|
||||||
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
||||||
@@ -77,10 +80,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
|
||||||
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
|
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]"""
|
||||||
builder = (
|
builder = (
|
||||||
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
|
||||||
.filter(tantivy.Filter.remove_long(65))
|
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
||||||
.filter(tantivy.Filter.lowercase())
|
.filter(tantivy.Filter.lowercase())
|
||||||
.filter(tantivy.Filter.ascii_fold())
|
.filter(tantivy.Filter.ascii_fold())
|
||||||
)
|
)
|
||||||
@@ -119,12 +122,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
|
|||||||
|
|
||||||
|
|
||||||
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
|
||||||
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold."""
|
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold."""
|
||||||
return (
|
return (
|
||||||
tantivy.TextAnalyzerBuilder(
|
tantivy.TextAnalyzerBuilder(
|
||||||
tantivy.Tokenizer.regex(r"\S+"),
|
tantivy.Tokenizer.regex(r"\S+"),
|
||||||
)
|
)
|
||||||
.filter(tantivy.Filter.remove_long(65))
|
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
|
||||||
.filter(tantivy.Filter.lowercase())
|
.filter(tantivy.Filter.lowercase())
|
||||||
.filter(tantivy.Filter.ascii_fold())
|
.filter(tantivy.Filter.ascii_fold())
|
||||||
.build()
|
.build()
|
||||||
|
|||||||
@@ -261,6 +261,36 @@ class TestSearch:
|
|||||||
== 1
|
== 1
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("search_mode", "query"),
|
||||||
|
[
|
||||||
|
pytest.param(SearchMode.TITLE, "12345", id="title_search"),
|
||||||
|
pytest.param(SearchMode.TEXT, "12345", id="text_search"),
|
||||||
|
pytest.param(SearchMode.QUERY, None, id="query_title_exact"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_search_modes_match_model_limit_title_tokens(
|
||||||
|
self,
|
||||||
|
backend: TantivyBackend,
|
||||||
|
search_mode: SearchMode,
|
||||||
|
query: str | None,
|
||||||
|
) -> None:
|
||||||
|
"""Search must keep filename-like title tokens up to the model limit."""
|
||||||
|
long_title = "1234567890" * 12 + "12345678"
|
||||||
|
doc = Document.objects.create(
|
||||||
|
title=long_title,
|
||||||
|
content="ordinary content",
|
||||||
|
checksum="TXT12",
|
||||||
|
pk=18,
|
||||||
|
)
|
||||||
|
backend.add_or_update(doc)
|
||||||
|
|
||||||
|
assert backend.search_ids(
|
||||||
|
query or f"title:{long_title}",
|
||||||
|
user=None,
|
||||||
|
search_mode=search_mode,
|
||||||
|
) == [doc.pk]
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("mode", "title", "content", "hits", "misses"),
|
("mode", "title", "content", "hits", "misses"),
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -99,6 +99,25 @@ class TestTokenizers:
|
|||||||
)
|
)
|
||||||
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
||||||
|
|
||||||
|
def test_simple_search_analyzer_supports_model_limit_token_substrings(
|
||||||
|
self,
|
||||||
|
simple_search_index: tantivy.Index,
|
||||||
|
) -> None:
|
||||||
|
"""Simple substring search keeps tokens up to Document.title's model limit."""
|
||||||
|
long_token = "abcdefghij" * 12 + "abcdefgh"
|
||||||
|
writer = simple_search_index.writer()
|
||||||
|
doc = tantivy.Document()
|
||||||
|
doc.add_text("simple_content", long_token)
|
||||||
|
writer.add_document(doc)
|
||||||
|
writer.commit()
|
||||||
|
simple_search_index.reload()
|
||||||
|
q = tantivy.Query.regex_query(
|
||||||
|
simple_search_index.schema,
|
||||||
|
"simple_content",
|
||||||
|
".*cdefg.*",
|
||||||
|
)
|
||||||
|
assert simple_search_index.searcher().search(q, limit=5).count == 1
|
||||||
|
|
||||||
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
|
||||||
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
"""Unsupported language codes should log a warning and disable stemming gracefully."""
|
||||||
sb = tantivy.SchemaBuilder()
|
sb = tantivy.SchemaBuilder()
|
||||||
|
|||||||
@@ -2796,7 +2796,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ocrmypdf"
|
name = "ocrmypdf"
|
||||||
version = "17.7.0"
|
version = "17.4.2"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "deprecation", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "deprecation", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
@@ -2813,9 +2813,9 @@ dependencies = [
|
|||||||
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "uharfbuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "uharfbuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/0e/f4/7d8fcd1f255ce098ecb8daf40ae20eb3fd940ac0f40c5bc22f10711352f6/ocrmypdf-17.7.0.tar.gz", hash = "sha256:c0a351910e8fee09bdbfee6cc7ba0299b9d561bba42afbbf4df29edcc7d58bc6", size = 7431306, upload-time = "2026-06-18T02:59:40.938Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/6a/ba/26dd03706f5f387acb5d0e11d33dbe716f5cbce95e033e68404636eb3de5/ocrmypdf-17.4.2.tar.gz", hash = "sha256:b564557411c9a2695137cdc34e0a1a5084c5f33d7b3ef593f2659aa6a6a1c3cd", size = 7392178, upload-time = "2026-04-20T19:33:33.572Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/d5/e5/293574a8645948aebfdcf6fd2d723720882625403b6e54c5f0e896c1e57d/ocrmypdf-17.7.0-py3-none-any.whl", hash = "sha256:a9e297d5b33749b34efbb1221675ea487348a08157c08def6d9ac1711bec0af0", size = 504897, upload-time = "2026-06-18T02:59:38.851Z" },
|
{ url = "https://files.pythonhosted.org/packages/d8/cb/e98af5616f3627dcef5b3c15cc28e566f77e96fb6f9b7592c24ff063cf5d/ocrmypdf-17.4.2-py3-none-any.whl", hash = "sha256:486d783dad386826ce052269111b1e9847f2e236a20465de0688af8c7a9bf7ad", size = 489269, upload-time = "2026-04-20T19:33:31.271Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3056,7 +3056,7 @@ requires-dist = [
|
|||||||
{ name = "llama-index-llms-openai-like", specifier = ">=0.7.1" },
|
{ name = "llama-index-llms-openai-like", specifier = ">=0.7.1" },
|
||||||
{ name = "mysqlclient", marker = "extra == 'mariadb'", specifier = "~=2.2.7" },
|
{ name = "mysqlclient", marker = "extra == 'mariadb'", specifier = "~=2.2.7" },
|
||||||
{ name = "nltk", specifier = "~=3.9.1" },
|
{ name = "nltk", specifier = "~=3.9.1" },
|
||||||
{ name = "ocrmypdf", specifier = "~=17.7.0" },
|
{ name = "ocrmypdf", specifier = "~=17.4.2" },
|
||||||
{ name = "openai", specifier = ">=2.32" },
|
{ name = "openai", specifier = ">=2.32" },
|
||||||
{ name = "pathvalidate", specifier = "~=3.3.1" },
|
{ name = "pathvalidate", specifier = "~=3.3.1" },
|
||||||
{ name = "pdf2image", specifier = "~=1.17.0" },
|
{ name = "pdf2image", specifier = "~=1.17.0" },
|
||||||
|
|||||||
Reference in New Issue
Block a user