Merge branch 'beta' into dev

2026-06-25 23:04:19 +00:00 · 2026-06-02 08:32:54 -07:00
parent 2638554969 2c58d86380
commit ab8fe0521b
81 changed files with 3722 additions and 634 deletions
@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING
 import filelock
 import pytest
 from django.contrib.auth import get_user_model
+from django.contrib.contenttypes.models import ContentType
+from guardian.shortcuts import clear_ct_cache
 from pytest_django.fixtures import SettingsWrapper
 from rest_framework.test import APIClient

@@ -158,6 +160,19 @@ def user_client(rest_api_client: APIClient, regular_user: UserModelT) -> APIClie
    return rest_api_client


+@pytest.fixture(autouse=True)
+def _clear_content_type_caches() -> None:
+    """Clear Django's ContentType cache and guardian's lru_cache before each test.
+
+    Tests that delete and reinsert ContentType/Permission rows (e.g. the
+    importer) corrupt both caches. Without this fixture a subsequent test on
+    the same xdist worker sees stale ContentType objects and guardian raises
+    MixedContentTypeError.
+    """
+    ContentType.objects.clear_cache()
+    clear_ct_cache()
+
+
@pytest.fixture(scope="session", autouse=True)
 def faker_session_locale():
    """Set Faker locale for reproducibility."""
@@ -1,5 +1,6 @@
 import pytest
 from django.contrib.auth.models import User
+from pytest_mock import MockerFixture

 from documents.models import CustomField
 from documents.models import CustomFieldInstance
@@ -7,8 +8,13 @@ from documents.models import Document
 from documents.models import Note
 from documents.search._backend import SearchMode
 from documents.search._backend import TantivyBackend
+from documents.search._backend import WriteBatch
 from documents.search._backend import get_backend
 from documents.search._backend import reset_backend
+from documents.tests.factories import CorrespondentFactory
+from documents.tests.factories import DocumentFactory
+from documents.tests.factories import DocumentTypeFactory
+from documents.tests.factories import TagFactory

 pytestmark = [pytest.mark.search, pytest.mark.django_db]

@@ -36,6 +42,47 @@ class TestWriteBatch:
        ids = backend.search_ids("should survive", user=None)
        assert len(ids) == 1

+    def test_writer_released_when_commit_fails(
+        self,
+        backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """A commit failure must still dispose the writer (released in finally).
+
+        Otherwise the Tantivy IndexWriter lingers holding its internal lock and
+        the next batch fails with LockBusy. The real writer is created in
+        __enter__; here commit() is forced to raise via a mocked _writer.
+        """
+        doc = Document.objects.create(
+            title="Commit Fail",
+            content="indexable text",
+            checksum="WBCF1",
+            pk=42,
+        )
+
+        failing = mocker.MagicMock()
+        failing.commit.side_effect = RuntimeError("simulated commit failure")
+        mocker.patch.object(
+            WriteBatch,
+            "_writer",
+            new_callable=mocker.PropertyMock,
+            return_value=failing,
+        )
+
+        batch = backend.batch_update()
+        with pytest.raises(RuntimeError, match="simulated commit failure"):
+            with batch as b:
+                b.add_or_update(doc)
+
+        # Writer disposed despite the commit failure.
+        assert batch._raw_writer is None
+
+        # Drop the patch so a real writer can be created; a fresh batch must
+        # succeed (would raise LockBusy if the previous writer had leaked).
+        mocker.stopall()
+        backend.add_or_update(doc)
+        assert len(backend.search_ids("indexable", user=None)) == 1
+

 class TestSearch:
    """Test search query parsing and matching via search_ids."""
@@ -214,6 +261,153 @@ class TestSearch:
            == 1
        )

+    @pytest.mark.parametrize(
+        ("mode", "title", "content", "hits", "misses"),
+        [
+            pytest.param(
+                SearchMode.QUERY,
+                "CJK document",
+                "東京都の人口は約1400万人です",
+                ["東京", "人口"],
+                ["大阪"],
+                id="query_mode_cjk_content",
+            ),
+            pytest.param(
+                SearchMode.TEXT,
+                "CJK document",
+                "東京都の人口は約1400万人です",
+                ["東京"],
+                ["大阪"],
+                id="text_mode_cjk_content",
+            ),
+            pytest.param(
+                SearchMode.TITLE,
+                "東京都の報告書",
+                "This document is about Tokyo.",
+                ["東京", "報告"],
+                ["大阪"],
+                id="title_mode_cjk_title",
+            ),
+        ],
+    )
+    def test_cjk_search_finds_matching_documents(
+        self,
+        backend: TantivyBackend,
+        mode: SearchMode,
+        title: str,
+        content: str,
+        hits: list[str],
+        misses: list[str],
+    ) -> None:
+        """CJK queries must match documents via bigram fields in all three search modes."""
+        doc = DocumentFactory(title=title, content=content)
+        backend.add_or_update(doc)
+
+        for query in hits:
+            assert len(backend.search_ids(query, user=None, search_mode=mode)) == 1, (
+                f"Expected {query!r} to match in {mode} mode"
+            )
+        for query in misses:
+            assert len(backend.search_ids(query, user=None, search_mode=mode)) == 0, (
+                f"Expected {query!r} not to match in {mode} mode"
+            )
+
+    def test_title_mode_cjk_does_not_match_content_only(
+        self,
+        backend: TantivyBackend,
+    ) -> None:
+        """Title-only CJK search must not return docs where CJK appears only in content."""
+        doc = DocumentFactory(
+            title="Tokyo report",
+            content="東京都の人口は約1400万人です",
+        )
+        backend.add_or_update(doc)
+
+        assert (
+            len(backend.search_ids("東京", user=None, search_mode=SearchMode.TITLE))
+            == 0
+        )
+
+    @pytest.mark.parametrize(
+        ("field", "query", "miss"),
+        [
+            pytest.param("correspondent", "東京", "大阪", id="cjk_correspondent"),
+            pytest.param("document_type", "請求書", "領収書", id="cjk_document_type"),
+            pytest.param("tag", "重要", "普通", id="cjk_tag"),
+        ],
+    )
+    def test_cjk_metadata_search_via_query_mode(
+        self,
+        backend: TantivyBackend,
+        field: str,
+        query: str,
+        miss: str,
+    ) -> None:
+        """CJK in correspondent/document_type/tag names must be searchable via global search."""
+        if field == "correspondent":
+            doc = DocumentFactory(correspondent=CorrespondentFactory(name=query))
+        elif field == "document_type":
+            doc = DocumentFactory(document_type=DocumentTypeFactory(name=query))
+        else:
+            tag = TagFactory(name=query)
+            doc = DocumentFactory()
+            doc.tags.add(tag)
+        backend.add_or_update(doc)
+
+        assert (
+            len(backend.search_ids(query, user=None, search_mode=SearchMode.QUERY)) == 1
+        ), f"Expected CJK {field} name {query!r} to match"
+        assert (
+            len(backend.search_ids(miss, user=None, search_mode=SearchMode.QUERY)) == 0
+        ), f"Expected {miss!r} not to match"
+
+    def test_cjk_text_mode_does_not_leak_field_query_semantics(
+        self,
+        backend: TantivyBackend,
+    ) -> None:
+        """TEXT mode is plain-text over content: a 'field:CJK' input must not be
+        parsed as a structured query against that field. A doc tagged 重要 with
+        no 重要 in its content must NOT match the TEXT-mode query 'tag:重要'."""
+        tag = TagFactory(name="重要")
+        doc = DocumentFactory(title="report", content="just english content")
+        doc.tags.add(tag)
+        backend.add_or_update(doc)
+
+        assert (
+            len(backend.search_ids("tag:重要", user=None, search_mode=SearchMode.TEXT))
+            == 0
+        )
+        # Sanity: the CJK run still matches when it is actually in the content.
+        doc2 = DocumentFactory(title="report2", content="本文に重要な情報")
+        backend.add_or_update(doc2)
+        assert (
+            len(backend.search_ids("tag:重要", user=None, search_mode=SearchMode.TEXT))
+            == 1
+        )
+
+    @pytest.mark.parametrize(
+        "query",
+        [
+            pytest.param("Straße", id="eszett"),
+            pytest.param("Ærøskøbing", id="ae_and_oslash"),
+            pytest.param("strasse", id="ascii_fold_form"),
+        ],
+    )
+    def test_simple_search_folds_special_letters_like_index(
+        self,
+        backend: TantivyBackend,
+        query: str,
+    ) -> None:
+        """Query-side folding must match index-side folding for non-decomposable
+        letters (ß→ss, ø→o, ...). Searching the accented form must find the doc.
+        A naive NFD fold deletes these letters and silently fails to match."""
+        doc = DocumentFactory(title="report", content="Straße Ærøskøbing")
+        backend.add_or_update(doc)
+
+        assert (
+            len(backend.search_ids(query, user=None, search_mode=SearchMode.TEXT)) == 1
+        )
+
    def test_sort_field_ascending(self, backend: TantivyBackend) -> None:
        """Searching with sort_reverse=False must return results in ascending ASN order."""
        for asn in [30, 10, 20]:
@@ -393,6 +587,18 @@ class TestAutocomplete:
        results = backend.autocomplete("pay", limit=10)
        assert results.index("payment") < results.index("payslip")

+    def test_folds_special_letters_consistently(
+        self,
+        backend: TantivyBackend,
+    ) -> None:
+        """Autocomplete words must fold the same way as content (ß→ss), so a
+        prefix of the folded form finds them. A naive NFD fold would store the
+        word as 'strae' and the prefix 'stras' would never match it."""
+        doc = DocumentFactory(title="Straße", content="details")
+        backend.add_or_update(doc)
+
+        assert "strasse" in backend.autocomplete("stras", limit=10)
+

 class TestMoreLikeThis:
    """Test more like this functionality."""
@@ -0,0 +1,248 @@
+"""Tests for search index lock backoff, retry logic, and self-healing deferred tasks."""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import filelock
+import pytest
+
+from documents.search._backend import _LOCK_BACKOFF_CAP
+from documents.search._backend import _LOCK_RETRY_ATTEMPTS
+from documents.search._backend import _LOCK_TIMEOUT_SECONDS
+from documents.search._backend import SearchIndexLockError
+from documents.search._backend import TantivyBackend
+from documents.tasks import index_document
+from documents.tasks import remove_document_from_index
+from documents.tests.factories import DocumentFactory
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+    from pathlib import Path
+
+    from pytest_mock import MockerFixture
+
+pytestmark = pytest.mark.search
+
+
+@pytest.fixture
+def disk_backend(tmp_path: Path) -> Generator[TantivyBackend, None, None]:
+    """On-disk TantivyBackend so the file-lock code path is exercised."""
+    b = TantivyBackend(path=tmp_path)
+    b.open()
+    try:
+        yield b
+    finally:
+        b.close()
+
+
+class TestWriteBatchLockRetry:
+    """Test WriteBatch retry loop with backoff + full jitter."""
+
+    @pytest.mark.django_db
+    def test_lock_retries_then_succeeds(
+        self,
+        disk_backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """Timeout on first 3 attempts then success on 4th — document must be indexed."""
+        doc = DocumentFactory()
+
+        acquire_calls = 0
+
+        def flaky_acquire(timeout: float) -> None:
+            nonlocal acquire_calls
+            acquire_calls += 1
+            # Raise Timeout for first _LOCK_RETRY_ATTEMPTS - 1 calls, succeed on last
+            if acquire_calls < _LOCK_RETRY_ATTEMPTS:
+                raise filelock.Timeout("")
+
+        sleep_values: list[float] = []
+
+        mocker.patch(
+            "documents.search._backend.filelock.FileLock.acquire",
+            side_effect=flaky_acquire,
+        )
+        mock_sleep = mocker.patch(
+            "documents.search._backend.time.sleep",
+            side_effect=lambda s: sleep_values.append(s),
+        )
+
+        # Should not raise — 4th attempt succeeds
+        with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS) as batch:
+            batch.add_or_update(doc)
+
+        # sleep called exactly _LOCK_RETRY_ATTEMPTS - 1 times (once per failed attempt)
+        assert mock_sleep.call_count == _LOCK_RETRY_ATTEMPTS - 1
+
+        # All sleep values must be in [0, _LOCK_BACKOFF_CAP]
+        for s in sleep_values:
+            assert 0 <= s <= _LOCK_BACKOFF_CAP, (
+                f"Sleep value {s} outside [0, {_LOCK_BACKOFF_CAP}]"
+            )
+
+    def test_lock_exhaustion_raises_search_index_lock_error(
+        self,
+        disk_backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """All acquire attempts raise Timeout — WriteBatch must raise SearchIndexLockError."""
+        mocker.patch(
+            "documents.search._backend.filelock.FileLock.acquire",
+            side_effect=filelock.Timeout(""),
+        )
+        mocker.patch("documents.search._backend.time.sleep")
+
+        with pytest.raises(SearchIndexLockError):
+            with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS):
+                pass
+
+    def test_jitter_values_in_range(
+        self,
+        disk_backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """Sleep values must always lie in [0, _LOCK_BACKOFF_CAP] across many samples."""
+        mocker.patch(
+            "documents.search._backend.filelock.FileLock.acquire",
+            side_effect=filelock.Timeout(""),
+        )
+        sleep_values: list[float] = []
+        mocker.patch(
+            "documents.search._backend.time.sleep",
+            side_effect=lambda s: sleep_values.append(s),
+        )
+        for _ in range(50):
+            sleep_values.clear()
+            with pytest.raises(SearchIndexLockError):
+                with disk_backend.batch_update(lock_timeout=_LOCK_TIMEOUT_SECONDS):
+                    pass
+
+            for s in sleep_values:
+                assert 0 <= s <= _LOCK_BACKOFF_CAP, (
+                    f"Jitter {s} exceeds cap {_LOCK_BACKOFF_CAP}"
+                )
+
+
+class TestAddOrUpdateDeferredScheduling:
+    """Test that add_or_update() and remove() defer to Celery on lock exhaustion."""
+
+    @pytest.mark.django_db
+    def test_lock_exhaustion_schedules_deferred_task(
+        self,
+        disk_backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """Lock exhaustion in add_or_update must schedule index_document task, not raise."""
+        doc = DocumentFactory()
+
+        mocker.patch(
+            "documents.search._backend.filelock.FileLock.acquire",
+            side_effect=filelock.Timeout(""),
+        )
+        mocker.patch("documents.search._backend.time.sleep")
+        mock_apply = mocker.patch("documents.tasks.index_document.apply_async")
+
+        # Must NOT raise
+        disk_backend.add_or_update(doc)
+
+        mock_apply.assert_called_once_with(args=[doc.pk], countdown=60)
+
+    def test_remove_exhaustion_schedules_deferred_task(
+        self,
+        disk_backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """Lock exhaustion in remove() must schedule remove_document_from_index task, not raise."""
+        doc_id = 503
+
+        mocker.patch(
+            "documents.search._backend.filelock.FileLock.acquire",
+            side_effect=filelock.Timeout(""),
+        )
+        mocker.patch("documents.search._backend.time.sleep")
+        mock_apply = mocker.patch(
+            "documents.tasks.remove_document_from_index.apply_async",
+        )
+
+        # Must NOT raise
+        disk_backend.remove(doc_id)
+
+        mock_apply.assert_called_once_with(args=[doc_id], countdown=60)
+
+
+@pytest.mark.django_db
+class TestIndexDocumentTask:
+    """Test the deferred index_document and remove_document_from_index Celery tasks."""
+
+    def test_index_document_task_skips_deleted_document(
+        self,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """index_document with a non-existent doc_id must return cleanly and log INFO."""
+        nonexistent_id = 999999
+
+        with caplog.at_level(logging.INFO, logger="paperless.tasks"):
+            index_document(nonexistent_id)
+
+        assert any("no longer exists" in record.message for record in caplog.records), (
+            "Expected INFO log about missing document"
+        )
+
+    def test_index_document_task_indexes_existing_document(
+        self,
+        backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """index_document task must add the document to the index via batch_update."""
+        doc = DocumentFactory(content="via deferred task")
+
+        # get_backend is imported lazily inside the task: `from documents.search import get_backend`
+        mocker.patch(
+            "documents.search.get_backend",
+            return_value=backend,
+        )
+        index_document(doc.pk)
+
+        ids = backend.search_ids("deferred task", user=None)
+        assert doc.pk in ids
+
+    def test_remove_document_from_index_task_removes_existing_document(
+        self,
+        backend: TantivyBackend,
+        mocker: MockerFixture,
+    ) -> None:
+        """remove_document_from_index task must remove the document from the index."""
+        doc = DocumentFactory(content="will be removed by deferred task")
+        backend.add_or_update(doc)
+        assert doc.pk in backend.search_ids("removed", user=None)
+
+        mocker.patch("documents.search.get_backend", return_value=backend)
+        remove_document_from_index(doc.pk)
+
+        assert doc.pk not in backend.search_ids("removed", user=None)
+
+    def test_task_does_not_swallow_lock_error(
+        self,
+        mocker: MockerFixture,
+    ) -> None:
+        """Verifies the task body propagates SearchIndexLockError so Celery's
+        autoretry_for can catch it (rather than the task swallowing the error
+        and silently succeeding)."""
+        doc = DocumentFactory()
+
+        mock_batch = mocker.MagicMock()
+        mock_batch.__enter__ = mocker.MagicMock(
+            side_effect=SearchIndexLockError("exhausted"),
+        )
+        mock_batch.__exit__ = mocker.MagicMock(return_value=False)
+
+        mock_backend = mocker.MagicMock()
+        mock_backend.batch_update.return_value = mock_batch
+
+        # get_backend is imported lazily inside the task: `from documents.search import get_backend`
+        mocker.patch("documents.search.get_backend", return_value=mock_backend)
+
+        with pytest.raises(SearchIndexLockError):
+            index_document(doc.pk)
@@ -16,6 +16,7 @@ from documents.search._query import _datetime_range
 from documents.search._query import _rewrite_compact_date
 from documents.search._query import build_permission_filter
 from documents.search._query import normalize_query
+from documents.search._query import parse_simple_text_highlight_query
 from documents.search._query import parse_user_query
 from documents.search._query import rewrite_natural_date_keywords
 from documents.search._schema import build_schema
@@ -443,6 +444,149 @@ class TestParseUserQuery:
            q = parse_user_query(query_index, "created:today", UTC)
        assert isinstance(q, tantivy.Query)

+    @pytest.mark.parametrize(
+        "raw_query",
+        [
+            pytest.param("h52.1 - kurzsichtigkeit", id="icd_code_dash_description"),
+            pytest.param("H52.1 - asd", id="icd_code_uppercase"),
+            pytest.param("h52.1 -", id="trailing_minus"),
+            pytest.param(". -", id="dot_trailing_minus"),
+            pytest.param("h52. -", id="partial_code_trailing_minus"),
+            pytest.param(".12 -", id="dot_number_trailing_minus"),
+            pytest.param("h52.1 - ku", id="partial_word_after_dash"),
+        ],
+    )
+    def test_spaced_dash_queries_do_not_raise(
+        self,
+        query_index: tantivy.Index,
+        raw_query: str,
+    ) -> None:
+        assert isinstance(parse_user_query(query_index, raw_query, UTC), tantivy.Query)
+
+
+class TestYearRangeRewriting:
+    """Whoosh-style year-only date ranges must be rewritten to ISO 8601."""
+
+    @pytest.mark.parametrize(
+        ("query", "field", "expected_lo", "expected_hi"),
+        [
+            pytest.param(
+                "created:[2020 TO 2020]",
+                "created",
+                "2020-01-01T00:00:00Z",
+                "2021-01-01T00:00:00Z",
+                id="single_year_created",
+            ),
+            pytest.param(
+                "created:[2018 TO 2021]",
+                "created",
+                "2018-01-01T00:00:00Z",
+                "2022-01-01T00:00:00Z",
+                id="multi_year_range_created",
+            ),
+            pytest.param(
+                "added:[2022 TO 2023]",
+                "added",
+                "2022-01-01T00:00:00Z",
+                "2024-01-01T00:00:00Z",
+                id="added_field",
+            ),
+            pytest.param(
+                "modified:[2021 TO 2021]",
+                "modified",
+                "2021-01-01T00:00:00Z",
+                "2022-01-01T00:00:00Z",
+                id="modified_field",
+            ),
+            pytest.param(
+                "created:[2020 to 2020]",
+                "created",
+                "2020-01-01T00:00:00Z",
+                "2021-01-01T00:00:00Z",
+                id="lowercase_to_keyword",
+            ),
+        ],
+    )
+    def test_year_range_rewritten(
+        self,
+        query: str,
+        field: str,
+        expected_lo: str,
+        expected_hi: str,
+    ) -> None:
+        result = rewrite_natural_date_keywords(query, UTC)
+        lo, hi = _range(result, field)
+        assert lo == expected_lo
+        assert hi == expected_hi
+
+    def test_reversed_year_range_is_swapped(self) -> None:
+        # A reversed range must not yield lo > hi, which Tantivy treats as an
+        # empty range (silently zero results). The bounds are swapped instead.
+        result = rewrite_natural_date_keywords("created:[2025 TO 2020]", UTC)
+        lo, hi = _range(result, "created")
+        assert lo == "2020-01-01T00:00:00Z"
+        assert hi == "2026-01-01T00:00:00Z"
+
+    def test_year_range_in_complex_boolean_query(self) -> None:
+        query = "tag:steuer AND (title:2020 OR (NOT title:2019 AND NOT title:2018 AND created:[2020 TO 2020]))"
+        result = rewrite_natural_date_keywords(query, UTC)
+        lo, hi = _range(result, "created")
+        assert lo == "2020-01-01T00:00:00Z"
+        assert hi == "2021-01-01T00:00:00Z"
+        assert "title:2020" in result
+        assert "title:2019" in result
+        assert "title:2018" in result
+
+    def test_already_iso_date_range_passes_through_unchanged(self) -> None:
+        original = "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]"
+        assert rewrite_natural_date_keywords(original, UTC) == original
+
+    def test_8digit_in_brackets_not_matched_as_year_range(self) -> None:
+        # [YYYYMMDD TO YYYYMMDD] has 8-digit values - must not be caught by year rewriter
+        original = "created:[20200101 TO 20201231]"
+        result = rewrite_natural_date_keywords(original, UTC)
+        assert "20200101" in result or "2020-01-01" in result
+        assert "20201231" in result or "2020-12-31" in result
+
+
+class TestNonDateFieldsNotRewritten:
+    """Date rewriters must only fire on the date fields (created/modified/added).
+
+    Integer fields like asn/id/page_count and unknown fields would otherwise be
+    rewritten into date ranges and rejected by Tantivy as type mismatches.
+    """
+
+    @pytest.mark.parametrize(
+        "query",
+        [
+            pytest.param("asn:20240101", id="asn_8digit"),
+            pytest.param("id:20240101", id="id_8digit"),
+            pytest.param("page_count:12345678", id="page_count_8digit"),
+            pytest.param("num_notes:20231201", id="num_notes_8digit"),
+        ],
+    )
+    def test_8digit_on_integer_field_passes_through_unchanged(self, query: str) -> None:
+        assert rewrite_natural_date_keywords(query, EASTERN) == query
+
+    @pytest.mark.parametrize(
+        "query",
+        [
+            pytest.param("asn:[2000 TO 2024]", id="asn_year_range"),
+            pytest.param("id:[2000 TO 2024]", id="id_year_range"),
+            pytest.param("page_count:[2000 TO 2024]", id="page_count_year_range"),
+        ],
+    )
+    def test_year_range_on_integer_field_passes_through_unchanged(
+        self,
+        query: str,
+    ) -> None:
+        assert rewrite_natural_date_keywords(query, UTC) == query
+
+    def test_unknown_field_keyword_passes_through_unchanged(self) -> None:
+        # foobar is not a date field: 'foobar:today' must not become a date range,
+        # which Tantivy would otherwise reject as an unknown/typed field.
+        assert rewrite_natural_date_keywords("foobar:today", UTC) == "foobar:today"
+

 class TestPassthrough:
    """Queries without field prefixes or unrelated content pass through unchanged."""
@@ -471,10 +615,108 @@ class TestNormalizeQuery:
    def test_normalize_no_commas_unchanged(self) -> None:
        assert normalize_query("bank statement") == "bank statement"

+    @pytest.mark.parametrize(
+        ("raw", "expected"),
+        [
+            pytest.param(
+                "h52.1 - kurzsichtigkeit",
+                "h52.1 kurzsichtigkeit",
+                id="icd_code_dash_description",
+            ),
+            pytest.param(
+                "H52.1 - asd",
+                "H52.1 asd",
+                id="icd_code_uppercase_dash",
+            ),
+            pytest.param(
+                "h52.1 -",
+                "h52.1",
+                id="trailing_minus",
+            ),
+            pytest.param(
+                ". -",
+                ".",
+                id="dot_trailing_minus",
+            ),
+            pytest.param(
+                "h52. -",
+                "h52.",
+                id="partial_code_trailing_minus",
+            ),
+            pytest.param(
+                "foo - bar - baz",
+                "foo bar baz",
+                id="multiple_dashes",
+            ),
+            pytest.param(
+                "foo + bar",
+                "foo bar",
+                id="spaced_plus_operator",
+            ),
+        ],
+    )
+    def test_normalize_strips_dangling_operators(self, raw: str, expected: str) -> None:
+        assert normalize_query(raw) == expected
+
+    @pytest.mark.parametrize(
+        "query",
+        [
+            pytest.param("term -other", id="adjacent_not_operator"),
+            pytest.param("-term", id="leading_not_operator"),
+            pytest.param("+term", id="leading_must_operator"),
+            pytest.param("foo -bar +baz", id="mixed_adjacent_operators"),
+        ],
+    )
+    def test_normalize_preserves_valid_operators(self, query: str) -> None:
+        assert normalize_query(query) == query
+
+
+class TestParseSimpleTextHighlightQuery:
+    """parse_simple_text_highlight_query must not raise on natural-language queries."""
+
+    @pytest.fixture
+    def query_index(self) -> tantivy.Index:
+        schema = build_schema()
+        idx = tantivy.Index(schema, path=None)
+        register_tokenizers(idx, "")
+        return idx
+
+    @pytest.mark.parametrize(
+        "raw_query",
+        [
+            pytest.param("h52.1 - kurzsichtigkeit", id="icd_code_dash_description"),
+            pytest.param("H52.1 - asd", id="icd_code_uppercase"),
+            pytest.param("h52.1 -", id="trailing_minus"),
+            pytest.param(". -", id="dot_trailing_minus"),
+            pytest.param(".12 -", id="dot_number_trailing_minus"),
+            pytest.param("f84.0 - v.a. autismusspektrumstorung", id="complex_icd_dash"),
+        ],
+    )
+    def test_spaced_dash_queries_do_not_raise(
+        self,
+        query_index: tantivy.Index,
+        raw_query: str,
+    ) -> None:
+        assert isinstance(
+            parse_simple_text_highlight_query(query_index, raw_query),
+            tantivy.Query,
+        )
+
+    def test_empty_query_returns_empty_query(self, query_index: tantivy.Index) -> None:
+        result = parse_simple_text_highlight_query(query_index, "")
+        assert isinstance(result, tantivy.Query)
+
+    def test_all_operators_returns_empty_query(
+        self,
+        query_index: tantivy.Index,
+    ) -> None:
+        result = parse_simple_text_highlight_query(query_index, "- +")
+        assert isinstance(result, tantivy.Query)
+

 class TestPermissionFilter:
    """
-    build_permission_filter tests use an in-memory index — no DB access needed.
+    build_permission_filter tests use an in-memory index - no DB access needed.

    Users are constructed as unsaved model instances (django_user_model(pk=N))
    so no database round-trip occurs; only .pk is read by build_permission_filter.
@@ -74,6 +74,9 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
                "ai_enabled": False,
                "llm_embedding_backend": None,
                "llm_embedding_model": None,
+                "llm_embedding_endpoint": None,
+                "llm_embedding_chunk_size": None,
+                "llm_context_size": None,
                "llm_backend": None,
                "llm_model": None,
                "llm_api_key": None,
@@ -840,7 +843,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):

        with (
            patch("documents.tasks.llmindex_index.apply_async") as mock_update,
-            patch("paperless_ai.indexing.vector_store_file_exists") as mock_exists,
+            patch("paperless.views.vector_store_file_exists") as mock_exists,
        ):
            mock_exists.return_value = False
            self.client.patch(
@@ -855,6 +858,91 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
            )
            mock_update.assert_called_once()

+    def test_update_llm_embedding_chunk_size_triggers_rebuild(self) -> None:
+        config = ApplicationConfiguration.objects.first()
+        assert config is not None
+        config.ai_enabled = True
+        config.llm_embedding_backend = "openai-like"
+        config.llm_embedding_chunk_size = 1024
+        config.save()
+
+        with (
+            patch("documents.tasks.llmindex_index.apply_async") as mock_update,
+            patch("paperless.views.vector_store_file_exists") as mock_exists,
+        ):
+            mock_exists.return_value = True
+            self.client.patch(
+                f"{self.ENDPOINT}1/",
+                json.dumps({"llm_embedding_chunk_size": 512}),
+                content_type="application/json",
+            )
+            mock_update.assert_called_once()
+            self.assertEqual(mock_update.call_args.kwargs["kwargs"], {"rebuild": True})
+
+    def test_update_llm_context_size_triggers_rebuild(self) -> None:
+        config = ApplicationConfiguration.objects.first()
+        assert config is not None
+        config.ai_enabled = True
+        config.llm_embedding_backend = "openai-like"
+        config.llm_context_size = 8192
+        config.save()
+
+        with (
+            patch("documents.tasks.llmindex_index.apply_async") as mock_update,
+            patch("paperless.views.vector_store_file_exists") as mock_exists,
+        ):
+            mock_exists.return_value = True
+            self.client.patch(
+                f"{self.ENDPOINT}1/",
+                json.dumps({"llm_context_size": 4096}),
+                content_type="application/json",
+            )
+            mock_update.assert_called_once()
+            self.assertEqual(mock_update.call_args.kwargs["kwargs"], {"rebuild": True})
+
+    def test_update_llm_embedding_model_triggers_rebuild(self) -> None:
+        config = ApplicationConfiguration.objects.first()
+        assert config is not None
+        config.ai_enabled = True
+        config.llm_embedding_backend = "openai-like"
+        config.llm_embedding_model = "text-embedding-3-small"
+        config.save()
+
+        with patch("documents.tasks.llmindex_index.apply_async") as mock_update:
+            self.client.patch(
+                f"{self.ENDPOINT}1/",
+                json.dumps({"llm_embedding_model": "text-embedding-3-large"}),
+                content_type="application/json",
+            )
+            mock_update.assert_called_once()
+            self.assertEqual(mock_update.call_args.kwargs["kwargs"], {"rebuild": True})
+
+    def test_enable_ai_index_with_config_change_triggers_rebuild(self) -> None:
+        config = ApplicationConfiguration.objects.first()
+        assert config is not None
+        config.ai_enabled = False
+        config.llm_embedding_backend = "openai-like"
+        config.llm_embedding_model = "text-embedding-3-small"
+        config.save()
+
+        with (
+            patch("documents.tasks.llmindex_index.apply_async") as mock_update,
+            patch("paperless.views.vector_store_file_exists") as mock_exists,
+        ):
+            mock_exists.return_value = True
+            self.client.patch(
+                f"{self.ENDPOINT}1/",
+                json.dumps(
+                    {
+                        "ai_enabled": True,
+                        "llm_embedding_model": "text-embedding-3-large",
+                    },
+                ),
+                content_type="application/json",
+            )
+            mock_update.assert_called_once()
+            self.assertEqual(mock_update.call_args.kwargs["kwargs"], {"rebuild": True})
+
    @override_settings(LLM_ALLOW_INTERNAL_ENDPOINTS=False)
    def test_update_llm_endpoint_blocks_internal_endpoint_when_disallowed(self) -> None:
        response = self.client.patch(
@@ -868,3 +956,19 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
        )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertIn("non-public address", str(response.data).lower())
+
+    @override_settings(LLM_ALLOW_INTERNAL_ENDPOINTS=False)
+    def test_update_llm_embedding_endpoint_blocks_internal_endpoint_when_disallowed(
+        self,
+    ) -> None:
+        response = self.client.patch(
+            f"{self.ENDPOINT}1/",
+            json.dumps(
+                {
+                    "llm_embedding_endpoint": "http://127.0.0.1:11434",
+                },
+            ),
+            content_type="application/json",
+        )
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        self.assertIn("non-public address", str(response.data).lower())
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from unittest import mock
+
+from django.contrib.auth.models import User
+from rest_framework import status
+from rest_framework.test import APITestCase
+
+
+class TestChatStreamingViewInputValidation(APITestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self.user = User.objects.create_superuser(username="temp_admin")
+        self.client.force_authenticate(user=self.user)
+
+    def _mock_ai_enabled(self) -> mock.MagicMock:
+        """Return a mock AIConfig instance with ai_enabled=True."""
+        m = mock.MagicMock()
+        m.ai_enabled = True
+        return m
+
+    def test_oversized_question_is_rejected(self) -> None:
+        with mock.patch(
+            "documents.views.AIConfig",
+            return_value=self._mock_ai_enabled(),
+        ):
+            resp = self.client.post(
+                "/api/documents/chat/",
+                {"q": "x" * 4001},
+                format="json",
+            )
+        assert resp.status_code == status.HTTP_400_BAD_REQUEST
+
+    def test_missing_question_is_rejected(self) -> None:
+        with mock.patch(
+            "documents.views.AIConfig",
+            return_value=self._mock_ai_enabled(),
+        ):
+            resp = self.client.post(
+                "/api/documents/chat/",
+                {},
+                format="json",
+            )
+        assert resp.status_code == status.HTTP_400_BAD_REQUEST
@@ -464,6 +464,40 @@ class TestDocumentVersioningApi(DirectoriesMixin, APITestCase):
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        self.assertEqual(read_streaming_response(resp), b"thumb")

+    def test_thumb_etag_changes_when_latest_version_is_deleted(self) -> None:
+        root = self._create_pdf(title="root", checksum="root")
+        v1 = self._create_pdf(
+            title="v1",
+            checksum="v1",
+            root_document=root,
+        )
+        v2 = self._create_pdf(
+            title="v2",
+            checksum="v2",
+            root_document=root,
+        )
+        self._write_file(v1.thumbnail_path, b"thumb-v1")
+        self._write_file(v2.thumbnail_path, b"thumb-v2")
+
+        resp = self.client.get(f"/api/documents/{root.id}/thumb/")
+        self.assertEqual(resp.status_code, status.HTTP_200_OK)
+        self.assertEqual(read_streaming_response(resp), b"thumb-v2")
+        self.assertEqual(resp.headers["ETag"], '"v2"')
+
+        with mock.patch("documents.search.get_backend"):
+            delete_resp = self.client.delete(
+                f"/api/documents/{root.id}/versions/{v2.id}/",
+            )
+        self.assertEqual(delete_resp.status_code, status.HTTP_200_OK)
+
+        resp = self.client.get(
+            f"/api/documents/{root.id}/thumb/",
+            HTTP_IF_NONE_MATCH='"v2"',
+        )
+        self.assertEqual(resp.status_code, status.HTTP_200_OK)
+        self.assertEqual(resp.headers["ETag"], '"v1"')
+        self.assertEqual(read_streaming_response(resp), b"thumb-v1")
+
    def test_metadata_version_param_uses_version(self) -> None:
        root = Document.objects.create(
            title="root",
@@ -485,6 +485,42 @@ class TestDocumentApi(DirectoriesMixin, ConsumeTaskMixin, APITestCase):
        response = self.client.get(f"/api/documents/{doc.pk}/thumb/")
        self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)

+    def test_document_actions_trashed_document(self) -> None:
+        """
+        GIVEN:
+            - Document with files exists
+        WHEN:
+            - Document is soft-deleted (moved to trash)
+            - Preview and thumb endpoints are requested
+        THEN:
+            - HTTP 200 OK for both (trashed documents remain previewable)
+        """
+        _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)
+        content = b"This is a test"
+        content_thumbnail = b"thumbnail content"
+
+        with Path(filename).open("wb") as f:
+            f.write(content)
+
+        doc = Document.objects.create(
+            title="none",
+            filename=Path(filename).name,
+            mime_type="application/pdf",
+        )
+
+        with (self.dirs.thumbnail_dir / f"{doc.pk:07d}.webp").open("wb") as f:
+            f.write(content_thumbnail)
+
+        doc.delete()
+
+        response = self.client.get(f"/api/documents/{doc.pk}/preview/")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(read_streaming_response(response), content)
+
+        response = self.client.get(f"/api/documents/{doc.pk}/thumb/")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(read_streaming_response(response), content_thumbnail)
+
    def test_document_history_action(self) -> None:
        """
        GIVEN:
@@ -1305,6 +1341,35 @@ class TestDocumentApi(DirectoriesMixin, ConsumeTaskMixin, APITestCase):
        self.assertEqual(response.data["document_type_count"], 1)
        self.assertEqual(response.data["storage_path_count"], 2)

+    def test_statistics_excludes_document_versions(self) -> None:
+        root = Document.objects.create(
+            title="root",
+            checksum="A",
+            mime_type="application/pdf",
+            content="root",
+        )
+        version = Document.objects.create(
+            title="version",
+            checksum="B",
+            mime_type="application/pdf",
+            content="version",
+            root_document=root,
+            version_index=1,
+        )
+        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
+        version.tags.add(tag_inbox)
+
+        response = self.client.get("/api/statistics/")
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(response.data["documents_total"], 1)
+        self.assertEqual(response.data["documents_inbox"], 0)
+        self.assertEqual(response.data["character_count"], 4)
+        self.assertEqual(
+            response.data["document_file_type_counts"][0]["mime_type_count"],
+            1,
+        )
+
    def test_statistics_no_inbox_tag(self) -> None:
        Document.objects.create(title="none1", checksum="A")

@@ -3047,6 +3112,46 @@ class TestDocumentApi(DirectoriesMixin, ConsumeTaskMixin, APITestCase):
        # modified was updated to today
        self.assertEqual(doc.modified.day, timezone.now().day)

+    def test_create_note_only_saves_document_modified_field(self) -> None:
+        """
+        GIVEN:
+            - Existing document with a created date
+        WHEN:
+            - API request is made to add a note
+        THEN:
+            - Only the document modified field is persisted by the note endpoint
+            - Other document fields are not rewritten by the note endpoint
+        """
+        doc = Document.objects.create(
+            title="test",
+            mime_type="application/pdf",
+            content="this is a document which will have notes added",
+            created=datetime.date(2026, 3, 31),
+        )
+        original_save = Document.save
+
+        with mock.patch.object(
+            Document,
+            "save",
+            autospec=True,
+            side_effect=original_save,
+        ) as save_mock:
+            resp = self.client.post(
+                f"/api/documents/{doc.pk}/notes/",
+                data={"note": "this is a posted note"},
+            )
+
+        self.assertEqual(resp.status_code, status.HTTP_200_OK)
+        doc.refresh_from_db()
+        self.assertEqual(doc.created, datetime.date(2026, 3, 31))
+        self.assertTrue(
+            any(
+                call.kwargs.get("update_fields") == ["modified"]
+                for call in save_mock.call_args_list
+                if call.args and call.args[0].pk == doc.pk
+            ),
+        )
+
    def test_notes_permissions_aware(self) -> None:
        """
        GIVEN:
@@ -987,29 +987,32 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
        THEN:
            - The similar documents are returned from the API request
        """
-        d1 = Document.objects.create(
+        # Distinct created/added dates: documents created at the same instant
+        # share a timestamp term, and more_like_this (which cannot be scoped to
+        # content fields) would then match on it, surfacing unrelated documents.
+        d1 = DocumentFactory(
            title="invoice",
            content="the thing i bought at a shop and paid with bank account",
-            checksum="A",
-            pk=1,
+            created=datetime.date(2018, 1, 1),
+            added=timezone.make_aware(datetime.datetime(2018, 1, 1)),
        )
-        d2 = Document.objects.create(
+        d2 = DocumentFactory(
            title="bank statement 1",
            content="things i paid for in august",
-            pk=2,
-            checksum="B",
+            created=datetime.date(2019, 3, 4),
+            added=timezone.make_aware(datetime.datetime(2019, 3, 4)),
        )
-        d3 = Document.objects.create(
+        d3 = DocumentFactory(
            title="bank statement 3",
            content="things i paid for in september",
-            pk=3,
-            checksum="C",
+            created=datetime.date(2020, 7, 9),
+            added=timezone.make_aware(datetime.datetime(2020, 7, 9)),
        )
-        d4 = Document.objects.create(
+        d4 = DocumentFactory(
            title="Quarterly Report",
            content="quarterly revenue profit margin earnings growth",
-            pk=4,
-            checksum="ABC",
+            created=datetime.date(2021, 11, 30),
+            added=timezone.make_aware(datetime.datetime(2021, 11, 30)),
        )
        backend = get_backend()
        backend.add_or_update(d1)
@@ -945,6 +945,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        pages = [[1, 2], [3]]
        self.doc2.archive_serial_number = 200
        self.doc2.save()
+        errback = bulk_edit.restore_archive_serial_numbers_task.s(
+            {self.doc2.id: 200},
+        )
+        mock_chord.return_value.on_error.return_value = mock_chord.return_value

        result = bulk_edit.split(doc_ids, pages, delete_originals=True)
        self.assertEqual(result, "OK")
@@ -957,6 +961,8 @@ class TestPDFActions(DirectoriesMixin, TestCase):

        mock_delete_documents.assert_called()
        mock_chord.assert_called_once()
+        mock_chord.return_value.on_error.assert_called_once_with(errback)
+        mock_chord.return_value.apply_async.assert_called_once_with()

        delete_documents_args, _ = mock_delete_documents.call_args
        self.assertEqual(
@@ -991,6 +997,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        self.doc2.save()

        sig = mock.Mock()
+        sig.on_error.return_value = sig
        sig.apply_async.side_effect = Exception("boom")
        mock_chord.return_value = sig

@@ -1256,10 +1263,16 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        operations = [{"page": 1}, {"page": 2}]
        self.doc2.archive_serial_number = 250
        self.doc2.save()
+        errback = bulk_edit.restore_archive_serial_numbers_task.s(
+            {self.doc2.id: 250},
+        )
+        mock_chord.return_value.on_error.return_value = mock_chord.return_value

        result = bulk_edit.edit_pdf(doc_ids, operations, delete_original=True)
        self.assertEqual(result, "OK")
        mock_chord.assert_called_once()
+        mock_chord.return_value.on_error.assert_called_once_with(errback)
+        mock_chord.return_value.apply_async.assert_called_once_with()
        self.assertEqual(mock_consume_file.call_args.kwargs["overrides"].asn, 250)
        self.doc2.refresh_from_db()
        self.assertIsNone(self.doc2.archive_serial_number)
@@ -1288,6 +1301,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        self.doc2.save()

        sig = mock.Mock()
+        sig.on_error.return_value = sig
        sig.apply_async.side_effect = Exception("boom")
        mock_chord.return_value = sig

@@ -1480,6 +1494,44 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        self.assertEqual(task_kwargs["input_doc"].root_document_id, doc.id)
        self.assertIsNotNone(task_kwargs["overrides"])

+    @mock.patch("documents.bulk_edit.update_document_content_maybe_archive_file.delay")
+    @mock.patch("documents.tasks.consume_file.apply_async")
+    @mock.patch("documents.bulk_edit.tempfile.mkdtemp")
+    @mock.patch("pikepdf.open")
+    def test_remove_password_update_document_uses_source_paths(
+        self,
+        mock_open,
+        mock_mkdtemp,
+        mock_consume_delay,
+        mock_update_document,
+    ) -> None:
+        doc = self.doc1
+        source_file = self.dirs.scratch_dir / "consumption-source.pdf"
+        source_file.write_bytes(b"protected pdf content")
+        temp_dir = self.dirs.scratch_dir / "remove-password-source-file"
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        mock_mkdtemp.return_value = str(temp_dir)
+
+        fake_pdf = mock.MagicMock()
+
+        def save_side_effect(target_path):
+            Path(target_path).write_bytes(b"new pdf content")
+
+        fake_pdf.save.side_effect = save_side_effect
+        mock_open.return_value.__enter__.return_value = fake_pdf
+
+        result = bulk_edit.remove_password(
+            [doc.id],
+            password="secret",
+            update_document=True,
+            source_paths_by_id={doc.id: source_file},
+        )
+
+        self.assertEqual(result, "OK")
+        mock_open.assert_called_once_with(source_file, password="secret")
+        mock_update_document.assert_not_called()
+        mock_consume_delay.assert_called_once()
+
    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
    @mock.patch("documents.tasks.consume_file.apply_async")
    @mock.patch("pikepdf.open")
@@ -1120,12 +1120,14 @@ class TestConsumer(
            self.assertEqual(command[1], "--replace-input")

    @mock.patch("paperless_mail.models.MailRule.objects.get")
+    @mock.patch("paperless.parsers.mail.MailDocumentParser.get_thumbnail")
    @mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
    @mock.patch("documents.consumer.get_parser_registry")
    def test_mail_parser_receives_mailrule(
        self,
        mock_get_parser_registry: mock.Mock,
        mock_mail_parser_parse: mock.Mock,
+        mock_get_thumbnail: mock.Mock,
        mock_mailrule_get: mock.Mock,
    ) -> None:
        """
@@ -1136,6 +1138,7 @@ class TestConsumer(
        THEN:
            - The mail parser should receive the mail rule
        """
+        from documents.parsers import ParseError
        from paperless.parsers.mail import MailDocumentParser

        mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
@@ -1144,19 +1147,24 @@ class TestConsumer(
        mock_mailrule_get.return_value = mock.Mock(
            pdf_layout=MailRule.PdfLayout.HTML_ONLY,
        )
+        mock_get_thumbnail.side_effect = ParseError("no thumbnail")
+
+        src = (
+            Path(__file__).parent.parent.parent
+            / Path("paperless")
+            / Path("tests")
+            / Path("samples")
+            / Path("mail")
+            / "html.eml"
+        )
+        dst = self.dirs.scratch_dir / "html.eml"
+        shutil.copy(src, dst)
+
        with self.get_consumer(
-            filepath=(
-                Path(__file__).parent.parent.parent
-                / Path("paperless")
-                / Path("tests")
-                / Path("samples")
-                / Path("mail")
-            ).resolve()
-            / "html.eml",
+            filepath=dst,
            source=DocumentSource.MailFetch,
            mailrule_id=1,
        ) as consumer:
-            # fails because no gotenberg
            with self.assertRaises(
                ConsumerError,
            ):
@@ -124,7 +124,7 @@ class ShareLinkBundleAPITests(DirectoriesMixin, APITestCase):
        self.assertIn("document_ids", response.data)

    def test_download_ready_bundle_streams_file(self) -> None:
-        bundle_file = Path(self.dirs.media_dir) / "bundles" / "ready.zip"
+        bundle_file = settings.SHARE_LINK_BUNDLE_DIR / "bundles" / "ready.zip"
        bundle_file.parent.mkdir(parents=True, exist_ok=True)
        bundle_file.write_bytes(b"binary-zip-content")

@@ -132,7 +132,7 @@ class ShareLinkBundleAPITests(DirectoriesMixin, APITestCase):
            slug="readyslug",
            file_version=ShareLink.FileVersion.ARCHIVE,
            status=ShareLinkBundle.Status.READY,
-            file_path=str(bundle_file),
+            file_path=str(bundle_file.relative_to(settings.SHARE_LINK_BUNDLE_DIR)),
        )
        bundle.documents.set([self.document])

@@ -199,11 +199,11 @@ class ShareLinkBundleTaskTests(DirectoriesMixin, APITestCase):
        self.document = DocumentFactory.create()

    def test_cleanup_expired_share_link_bundles(self) -> None:
-        expired_path = Path(self.dirs.media_dir) / "expired.zip"
+        expired_path = settings.SHARE_LINK_BUNDLE_DIR / "expired.zip"
        expired_path.parent.mkdir(parents=True, exist_ok=True)
        expired_path.write_bytes(b"expired")

-        active_path = Path(self.dirs.media_dir) / "active.zip"
+        active_path = settings.SHARE_LINK_BUNDLE_DIR / "active.zip"
        active_path.write_bytes(b"active")

        expired_bundle = ShareLinkBundle.objects.create(
@@ -211,7 +211,7 @@ class ShareLinkBundleTaskTests(DirectoriesMixin, APITestCase):
            file_version=ShareLink.FileVersion.ARCHIVE,
            status=ShareLinkBundle.Status.READY,
            expiration=timezone.now() - timedelta(days=1),
-            file_path=str(expired_path),
+            file_path=expired_path.name,
        )
        expired_bundle.documents.set([self.document])

@@ -220,7 +220,7 @@ class ShareLinkBundleTaskTests(DirectoriesMixin, APITestCase):
            file_version=ShareLink.FileVersion.ARCHIVE,
            status=ShareLinkBundle.Status.READY,
            expiration=timezone.now() + timedelta(days=1),
-            file_path=str(active_path),
+            file_path=active_path.name,
        )
        active_bundle.documents.set([self.document])

@@ -424,7 +424,7 @@ class ShareLinkBundleFilterSetTests(DirectoriesMixin, APITestCase):


 class ShareLinkBundleModelTests(DirectoriesMixin, APITestCase):
-    def test_absolute_file_path_handles_relative_and_absolute(self) -> None:
+    def test_absolute_file_path_handles_relative_path(self) -> None:
        relative_path = Path("relative.zip")
        bundle = ShareLinkBundle.objects.create(
            slug="relative-bundle",
@@ -437,10 +437,23 @@ class ShareLinkBundleModelTests(DirectoriesMixin, APITestCase):
            (settings.SHARE_LINK_BUNDLE_DIR / relative_path).resolve(),
        )

-        absolute_path = Path(self.dirs.media_dir) / "absolute.zip"
-        bundle.file_path = str(absolute_path)
+    def test_absolute_file_path_rejects_absolute_path(self) -> None:
+        bundle = ShareLinkBundle.objects.create(
+            slug="absolute-bundle",
+            file_version=ShareLink.FileVersion.ORIGINAL,
+            file_path=str(Path(self.dirs.media_dir) / "absolute.zip"),
+        )

-        self.assertEqual(bundle.absolute_file_path.resolve(), absolute_path.resolve())
+        self.assertIsNone(bundle.absolute_file_path)
+
+    def test_absolute_file_path_rejects_traversal_outside_bundle_dir(self) -> None:
+        bundle = ShareLinkBundle.objects.create(
+            slug="traversal-bundle",
+            file_version=ShareLink.FileVersion.ORIGINAL,
+            file_path="../escaped.zip",
+        )
+
+        self.assertIsNone(bundle.absolute_file_path)

    def test_str_returns_translated_slug(self) -> None:
        bundle = ShareLinkBundle.objects.create(
@@ -5,6 +5,7 @@ from django.test import TestCase

 from documents.conditionals import metadata_etag
 from documents.conditionals import preview_etag
+from documents.conditionals import thumbnail_etag
 from documents.conditionals import thumbnail_last_modified
 from documents.models import Document
 from documents.tests.utils import DirectoriesMixin
@@ -30,6 +31,7 @@ class TestConditionals(DirectoriesMixin, TestCase):

        self.assertEqual(metadata_etag(request, root.id), latest.checksum)
        self.assertEqual(preview_etag(request, root.id), latest.archive_checksum)
+        self.assertEqual(thumbnail_etag(request, root.id), latest.checksum)

    def test_resolve_effective_doc_returns_none_for_invalid_or_unrelated_version(
        self,
@@ -25,6 +25,7 @@ from documents.models import DocumentType
 from documents.models import ShareLink
 from documents.models import StoragePath
 from documents.models import Tag
+from documents.models import UiSettings
 from documents.signals.handlers import update_llm_suggestions_cache
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import read_streaming_response
@@ -319,6 +320,10 @@ class TestAISuggestions(DirectoriesMixin, TestCase):
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.json(), {"tags": ["tag1", "tag2"]})
+        mock_get_cache.assert_called_once_with(
+            self.document.pk,
+            backend="mock_backend",
+        )
        mock_refresh_cache.assert_called_once_with(self.document.pk)

    @patch("documents.views.get_ai_document_classification")
@@ -359,6 +364,49 @@ class TestAISuggestions(DirectoriesMixin, TestCase):
                "dates": ["2023-01-01"],
            },
        )
+        mock_get_ai_classification.assert_called_once_with(
+            self.document,
+            self.user,
+            None,
+        )
+
+    @patch("documents.views.get_ai_document_classification")
+    @override_settings(
+        AI_ENABLED=True,
+        LLM_BACKEND="mock_backend",
+    )
+    def test_ai_suggestions_uses_user_display_language(
+        self,
+        mock_get_ai_classification,
+    ) -> None:
+        UiSettings.objects.create(user=self.user, settings={"language": "de-de"})
+        mock_get_ai_classification.return_value = {
+            "title": "KI Title",
+            "tags": [],
+            "correspondents": [],
+            "document_types": [],
+            "storage_paths": [],
+            "dates": [],
+        }
+
+        self.client.force_login(user=self.user)
+        response = self.client.get(
+            f"/api/documents/{self.document.pk}/ai_suggestions/",
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        mock_get_ai_classification.assert_called_once_with(
+            self.document,
+            self.user,
+            "de-de",
+        )
+        self.assertEqual(
+            get_llm_suggestion_cache(
+                self.document.pk,
+                backend="mock_backend:de-de",
+            ).suggestions["title"],
+            "KI Title",
+        )

    @patch("documents.views.get_ai_document_classification")
    @override_settings(
@@ -437,8 +485,14 @@ class TestAIChatStreamingView(DirectoriesMixin, TestCase):
        )
        super().setUp()

+    def grant_view_document_permission(self) -> None:
+        self.user.user_permissions.add(
+            *Permission.objects.filter(codename="view_document"),
+        )
+
    @override_settings(AI_ENABLED=False)
    def test_post_ai_disabled(self) -> None:
+        self.grant_view_document_permission()
        response = self.client.post(
            self.ENDPOINT,
            data='{"q": "question"}',
@@ -451,6 +505,7 @@ class TestAIChatStreamingView(DirectoriesMixin, TestCase):
    @patch("documents.views.get_objects_for_user_owner_aware")
    @override_settings(AI_ENABLED=True)
    def test_post_no_document_id(self, mock_get_objects, mock_stream_chat) -> None:
+        self.grant_view_document_permission()
        mock_get_objects.return_value = [self.document]
        mock_stream_chat.return_value = iter([b"data"])
        response = self.client.post(
@@ -464,6 +519,7 @@ class TestAIChatStreamingView(DirectoriesMixin, TestCase):
    @patch("documents.views.stream_chat_with_documents")
    @override_settings(AI_ENABLED=True)
    def test_post_with_document_id(self, mock_stream_chat) -> None:
+        self.grant_view_document_permission()
        mock_stream_chat.return_value = iter([b"data"])
        response = self.client.post(
            self.ENDPOINT,
@@ -475,6 +531,7 @@ class TestAIChatStreamingView(DirectoriesMixin, TestCase):

    @override_settings(AI_ENABLED=True)
    def test_post_with_invalid_document_id(self) -> None:
+        self.grant_view_document_permission()
        response = self.client.post(
            self.ENDPOINT,
            data='{"q": "question", "document_id": 999999}',
@@ -486,6 +543,7 @@ class TestAIChatStreamingView(DirectoriesMixin, TestCase):
    @patch("documents.views.has_perms_owner_aware")
    @override_settings(AI_ENABLED=True)
    def test_post_with_document_id_no_permission(self, mock_has_perms) -> None:
+        self.grant_view_document_permission()
        mock_has_perms.return_value = False
        response = self.client.post(
            self.ENDPOINT,
@@ -494,3 +552,31 @@ class TestAIChatStreamingView(DirectoriesMixin, TestCase):
        )
        self.assertEqual(response.status_code, 403)
        self.assertIn(b"Insufficient permissions", response.content)
+
+    @patch("documents.views.stream_chat_with_documents")
+    @override_settings(AI_ENABLED=True)
+    def test_post_no_document_id_requires_view_document_permission(
+        self,
+        mock_stream_chat,
+    ) -> None:
+        response = self.client.post(
+            self.ENDPOINT,
+            data='{"q": "question"}',
+            content_type="application/json",
+        )
+        self.assertEqual(response.status_code, 403)
+        mock_stream_chat.assert_not_called()
+
+    @patch("documents.views.stream_chat_with_documents")
+    @override_settings(AI_ENABLED=True)
+    def test_post_with_document_id_requires_view_document_permission(
+        self,
+        mock_stream_chat,
+    ) -> None:
+        response = self.client.post(
+            self.ENDPOINT,
+            data=f'{{"q": "question", "document_id": {self.document.pk}}}',
+            content_type="application/json",
+        )
+        self.assertEqual(response.status_code, 403)
+        mock_stream_chat.assert_not_called()
@@ -4164,7 +4164,7 @@ class TestWorkflows(
        )
        action = WorkflowAction.objects.create(
            type=WorkflowAction.WorkflowActionType.PASSWORD_REMOVAL,
-            passwords="wrong, right\n extra ",
+            passwords=["wrong", "right", "extra"],
        )
        workflow = Workflow.objects.create(name="Password workflow")
        workflow.triggers.add(trigger)
@@ -4185,12 +4185,14 @@ class TestWorkflows(
                    password="wrong",
                    update_document=True,
                    user=doc.owner,
+                    source_paths_by_id=None,
                ),
                mock.call(
                    [doc.id],
                    password="right",
                    update_document=True,
                    user=doc.owner,
+                    source_paths_by_id=None,
                ),
            ],
        )
@@ -4218,7 +4220,7 @@ class TestWorkflows(
        )
        action = WorkflowAction.objects.create(
            type=WorkflowAction.WorkflowActionType.PASSWORD_REMOVAL,
-            passwords=" \n , ",
+            passwords=[" ", "  "],
        )
        workflow = Workflow.objects.create(name="Password workflow missing passwords")
        workflow.triggers.add(trigger)
@@ -4276,7 +4278,7 @@ class TestWorkflows(
        """
        action = WorkflowAction.objects.create(
            type=WorkflowAction.WorkflowActionType.PASSWORD_REMOVAL,
-            passwords="first, second",
+            passwords=["first", "second"],
        )

        temp_dir = Path(tempfile.mkdtemp())
@@ -4304,6 +4306,7 @@ class TestWorkflows(
        document_consumption_finished.send(
            sender=self.__class__,
            document=doc,
+            original_file=original_file,
        )

        assert mock_remove_password.call_count == 2
@@ -4314,12 +4317,14 @@ class TestWorkflows(
                    password="first",
                    update_document=True,
                    user=doc.owner,
+                    source_paths_by_id={doc.id: original_file},
                ),
                mock.call(
                    [doc.id],
                    password="second",
                    update_document=True,
                    user=doc.owner,
+                    source_paths_by_id={doc.id: original_file},
                ),
            ],
        )
@@ -4331,6 +4336,53 @@ class TestWorkflows(
        )
        assert mock_remove_password.call_count == 2

+    @mock.patch("documents.bulk_edit.remove_password")
+    def test_password_removal_document_added_uses_original_file(
+        self,
+        mock_remove_password,
+    ) -> None:
+        """
+        GIVEN:
+            - Workflow password removal action on a DOCUMENT_ADDED trigger
+            - run_workflows called with an explicit original_file (staged file
+              from the consumer, before the source path is populated)
+        WHEN:
+            - The workflow runs
+        THEN:
+            - remove_password is called with source_paths_by_id pointing at the
+              staged file rather than the not-yet-existing source_path
+        """
+        doc = Document.objects.create(
+            title="Protected",
+            checksum="pw-checksum-added",
+        )
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
+        )
+        action = WorkflowAction.objects.create(
+            type=WorkflowAction.WorkflowActionType.PASSWORD_REMOVAL,
+            passwords=["secret"],
+        )
+        workflow = Workflow.objects.create(name="Password workflow added")
+        workflow.triggers.add(trigger)
+        workflow.actions.add(action)
+
+        mock_remove_password.return_value = "OK"
+
+        temp_dir = Path(tempfile.mkdtemp())
+        original_file = temp_dir / "staged.pdf"
+        original_file.write_bytes(b"pdf content")
+
+        run_workflows(trigger.type, doc, original_file=original_file)
+
+        mock_remove_password.assert_called_once_with(
+            [doc.id],
+            password="secret",
+            update_document=True,
+            user=doc.owner,
+            source_paths_by_id={doc.id: original_file},
+        )
+
    def test_workflow_trash_action_soft_delete(self) -> None:
        """
        GIVEN: