Chore(deps): Bump the pre-commit-dependencies group with 3 updates

Bumps the pre-commit-dependencies group with 3 updates: prettier, [https://github.com/astral-sh/ruff-pre-commit](https://github.com/astral-sh/ruff-pre-commit) and [https://github.com/tox-dev/pyproject-fmt](https://github.com/tox-dev/pyproject-fmt).

Updates `prettier` from 3.8.3 to 3.8.4

Updates `https://github.com/astral-sh/ruff-pre-commit` from v0.15.17 to v0.15.19
- [Release notes](https://github.com/astral-sh/ruff-pre-commit/releases)
- [Commits](https://github.com/astral-sh/ruff-pre-commit/compare/v0.15.17...v0.15.19)

Updates `https://github.com/tox-dev/pyproject-fmt` from v2.24.1 to v2.25.0
- [Release notes](https://github.com/tox-dev/pyproject-fmt/releases)
- [Commits](https://github.com/tox-dev/pyproject-fmt/compare/v2.24.1...v2.25.0)

---
updated-dependencies:
- dependency-name: prettier
  dependency-version: 3.8.4
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: pre-commit-dependencies
- dependency-name: https://github.com/astral-sh/ruff-pre-commit
  dependency-version: 0.15.19
  dependency-type: direct:production
  dependency-group: pre-commit-dependencies
- dependency-name: https://github.com/tox-dev/pyproject-fmt
  dependency-version: 2.25.0
  dependency-type: direct:production
  dependency-group: pre-commit-dependencies
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-07-02 18:24:17 +00:00 · 2026-07-01 20:18:07 +00:00
4 changed files with 7 additions and 59 deletions
@@ -46,16 +46,16 @@ repos:
          - ts
          - markdown
        additional_dependencies:
-          - prettier@3.8.3
+          - prettier@3.8.4
          - 'prettier-plugin-organize-imports@4.3.0'
  # Python hooks
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.15.17
+    rev: v0.15.19
    hooks:
      - id: ruff-check
      - id: ruff-format
  - repo: https://github.com/tox-dev/pyproject-fmt
-    rev: "v2.24.1"
+    rev: "v2.25.0"
    hooks:
      - id: pyproject-fmt
        additional_dependencies: [tomli]
@@ -48,9 +48,6 @@ _LANGUAGE_MAP: dict[str, str] = {
 }
 SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
 # Document.title is max_length=128, so use 129 as the limit for
 # Tantivy's remove_long filter
 _TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129
 def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
@@ -80,10 +77,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
 def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
-    """Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]"""
+    """Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
    builder = (
        tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
-        .filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
+        .filter(tantivy.Filter.remove_long(65))
        .filter(tantivy.Filter.lowercase())
        .filter(tantivy.Filter.ascii_fold())
    )
@@ -122,12 +119,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
 def _simple_search_analyzer() -> tantivy.TextAnalyzer:
-    """Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold."""
+    """Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold."""
    return (
        tantivy.TextAnalyzerBuilder(
            tantivy.Tokenizer.regex(r"\S+"),
        )
-        .filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
+        .filter(tantivy.Filter.remove_long(65))
        .filter(tantivy.Filter.lowercase())
        .filter(tantivy.Filter.ascii_fold())
        .build()
@@ -261,36 +261,6 @@ class TestSearch:
            == 1
        )
    @pytest.mark.parametrize(
        ("search_mode", "query"),
        [
            pytest.param(SearchMode.TITLE, "12345", id="title_search"),
            pytest.param(SearchMode.TEXT, "12345", id="text_search"),
            pytest.param(SearchMode.QUERY, None, id="query_title_exact"),
        ],
    )
    def test_search_modes_match_model_limit_title_tokens(
        self,
        backend: TantivyBackend,
        search_mode: SearchMode,
        query: str | None,
    ) -> None:
        """Search must keep filename-like title tokens up to the model limit."""
        long_title = "1234567890" * 12 + "12345678"
        doc = Document.objects.create(
            title=long_title,
            content="ordinary content",
            checksum="TXT12",
            pk=18,
        )
        backend.add_or_update(doc)
        assert backend.search_ids(
            query or f"title:{long_title}",
            user=None,
            search_mode=search_mode,
        ) == [doc.pk]
    @pytest.mark.parametrize(
        ("mode", "title", "content", "hits", "misses"),
        [
@@ -99,25 +99,6 @@ class TestTokenizers:
        )
        assert simple_search_index.searcher().search(q, limit=5).count == 1
    def test_simple_search_analyzer_supports_model_limit_token_substrings(
        self,
        simple_search_index: tantivy.Index,
    ) -> None:
        """Simple substring search keeps tokens up to Document.title's model limit."""
        long_token = "abcdefghij" * 12 + "abcdefgh"
        writer = simple_search_index.writer()
        doc = tantivy.Document()
        doc.add_text("simple_content", long_token)
        writer.add_document(doc)
        writer.commit()
        simple_search_index.reload()
        q = tantivy.Query.regex_query(
            simple_search_index.schema,
            "simple_content",
            ".*cdefg.*",
        )
        assert simple_search_index.searcher().search(q, limit=5).count == 1
    def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
        """Unsupported language codes should log a warning and disable stemming gracefully."""
        sb = tantivy.SchemaBuilder()