Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot] da381ab9aa Chore(deps): Bump the pre-commit-dependencies group with 3 updates
Bumps the pre-commit-dependencies group with 3 updates: prettier, [https://github.com/astral-sh/ruff-pre-commit](https://github.com/astral-sh/ruff-pre-commit) and [https://github.com/tox-dev/pyproject-fmt](https://github.com/tox-dev/pyproject-fmt).


Updates `prettier` from 3.8.3 to 3.8.4

Updates `https://github.com/astral-sh/ruff-pre-commit` from v0.15.17 to v0.15.19
- [Release notes](https://github.com/astral-sh/ruff-pre-commit/releases)
- [Commits](https://github.com/astral-sh/ruff-pre-commit/compare/v0.15.17...v0.15.19)

Updates `https://github.com/tox-dev/pyproject-fmt` from v2.24.1 to v2.25.0
- [Release notes](https://github.com/tox-dev/pyproject-fmt/releases)
- [Commits](https://github.com/tox-dev/pyproject-fmt/compare/v2.24.1...v2.25.0)

---
updated-dependencies:
- dependency-name: prettier
  dependency-version: 3.8.4
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: pre-commit-dependencies
- dependency-name: https://github.com/astral-sh/ruff-pre-commit
  dependency-version: 0.15.19
  dependency-type: direct:production
  dependency-group: pre-commit-dependencies
- dependency-name: https://github.com/tox-dev/pyproject-fmt
  dependency-version: 2.25.0
  dependency-type: direct:production
  dependency-group: pre-commit-dependencies
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-07-01 20:18:07 +00:00
4 changed files with 7 additions and 59 deletions
+3 -3
View File
@@ -46,16 +46,16 @@ repos:
- ts - ts
- markdown - markdown
additional_dependencies: additional_dependencies:
- prettier@3.8.3 - prettier@3.8.4
- 'prettier-plugin-organize-imports@4.3.0' - 'prettier-plugin-organize-imports@4.3.0'
# Python hooks # Python hooks
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.17 rev: v0.15.19
hooks: hooks:
- id: ruff-check - id: ruff-check
- id: ruff-format - id: ruff-format
- repo: https://github.com/tox-dev/pyproject-fmt - repo: https://github.com/tox-dev/pyproject-fmt
rev: "v2.24.1" rev: "v2.25.0"
hooks: hooks:
- id: pyproject-fmt - id: pyproject-fmt
additional_dependencies: [tomli] additional_dependencies: [tomli]
+4 -7
View File
@@ -48,9 +48,6 @@ _LANGUAGE_MAP: dict[str, str] = {
} }
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP) SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
# Document.title is max_length=128, so use 129 as the limit for
# Tantivy's remove_long filter
_TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129
def register_tokenizers(index: tantivy.Index, language: str | None) -> None: def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
@@ -80,10 +77,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer: def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]""" """Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
builder = ( builder = (
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple()) tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT)) .filter(tantivy.Filter.remove_long(65))
.filter(tantivy.Filter.lowercase()) .filter(tantivy.Filter.lowercase())
.filter(tantivy.Filter.ascii_fold()) .filter(tantivy.Filter.ascii_fold())
) )
@@ -122,12 +119,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
def _simple_search_analyzer() -> tantivy.TextAnalyzer: def _simple_search_analyzer() -> tantivy.TextAnalyzer:
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold.""" """Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold."""
return ( return (
tantivy.TextAnalyzerBuilder( tantivy.TextAnalyzerBuilder(
tantivy.Tokenizer.regex(r"\S+"), tantivy.Tokenizer.regex(r"\S+"),
) )
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT)) .filter(tantivy.Filter.remove_long(65))
.filter(tantivy.Filter.lowercase()) .filter(tantivy.Filter.lowercase())
.filter(tantivy.Filter.ascii_fold()) .filter(tantivy.Filter.ascii_fold())
.build() .build()
@@ -261,36 +261,6 @@ class TestSearch:
== 1 == 1
) )
@pytest.mark.parametrize(
("search_mode", "query"),
[
pytest.param(SearchMode.TITLE, "12345", id="title_search"),
pytest.param(SearchMode.TEXT, "12345", id="text_search"),
pytest.param(SearchMode.QUERY, None, id="query_title_exact"),
],
)
def test_search_modes_match_model_limit_title_tokens(
self,
backend: TantivyBackend,
search_mode: SearchMode,
query: str | None,
) -> None:
"""Search must keep filename-like title tokens up to the model limit."""
long_title = "1234567890" * 12 + "12345678"
doc = Document.objects.create(
title=long_title,
content="ordinary content",
checksum="TXT12",
pk=18,
)
backend.add_or_update(doc)
assert backend.search_ids(
query or f"title:{long_title}",
user=None,
search_mode=search_mode,
) == [doc.pk]
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mode", "title", "content", "hits", "misses"), ("mode", "title", "content", "hits", "misses"),
[ [
@@ -99,25 +99,6 @@ class TestTokenizers:
) )
assert simple_search_index.searcher().search(q, limit=5).count == 1 assert simple_search_index.searcher().search(q, limit=5).count == 1
def test_simple_search_analyzer_supports_model_limit_token_substrings(
self,
simple_search_index: tantivy.Index,
) -> None:
"""Simple substring search keeps tokens up to Document.title's model limit."""
long_token = "abcdefghij" * 12 + "abcdefgh"
writer = simple_search_index.writer()
doc = tantivy.Document()
doc.add_text("simple_content", long_token)
writer.add_document(doc)
writer.commit()
simple_search_index.reload()
q = tantivy.Query.regex_query(
simple_search_index.schema,
"simple_content",
".*cdefg.*",
)
assert simple_search_index.searcher().search(q, limit=5).count == 1
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None: def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
"""Unsupported language codes should log a warning and disable stemming gracefully.""" """Unsupported language codes should log a warning and disable stemming gracefully."""
sb = tantivy.SchemaBuilder() sb = tantivy.SchemaBuilder()