From 4aefb9b138680e0b96cf52c02dfc1c5dfc9ce280 Mon Sep 17 00:00:00 2001
From: stumpylog <797416+stumpylog@users.noreply.github.com>
Date: Wed, 3 Jun 2026 11:06:26 -0700
Subject: [PATCH] fix(ai): sort document_id filter values; add chat filter
 scoping test

- chat.py: use sorted() for doc_ids in the MetadataFilters IN clause,
  matching the pattern used in query_similar_documents. This makes filter
  construction deterministic regardless of document iteration order (the
  ids are sorted as strings, i.e. lexicographically).
- test_chat.py: add test_chat_filter_contains_only_requested_document_ids
  verifying that the retriever receives a filter scoped only to the
  requested documents (not all indexed documents). Inspired by
  test_document_filtered_retriever_applies_lancedb_metadata_filter in
  origin/feature/beta-lancedb.

Co-Authored-By: shamoon <shamoon@users.noreply.github.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/paperless_ai/chat.py            |  2 +-
 src/paperless_ai/tests/test_chat.py | 43 +++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/paperless_ai/chat.py b/src/paperless_ai/chat.py
index be1da80f6..6465cec9e 100644
--- a/src/paperless_ai/chat.py
+++ b/src/paperless_ai/chat.py
@@ -94,7 +94,7 @@ def _stream_chat_with_documents(query_str: str, documents: list[Document]):
 
     index = load_or_build_index()
 
-    doc_ids = [str(doc.pk) for doc in documents]
+    doc_ids = sorted(str(doc.pk) for doc in documents)
     filters = MetadataFilters(
         filters=[
             MetadataFilter(
diff --git a/src/paperless_ai/tests/test_chat.py b/src/paperless_ai/tests/test_chat.py
index f7edc3fc9..52e36b15a 100644
--- a/src/paperless_ai/tests/test_chat.py
+++ b/src/paperless_ai/tests/test_chat.py
@@ -238,3 +238,46 @@ class TestStreamChatRetrieval:
         # Nothing indexed for this document yet.
         out = list(chat.stream_chat_with_documents("question?", [doc]))
         assert chat.CHAT_NO_CONTENT_MESSAGE in out
+
+    def test_chat_filter_contains_only_requested_document_ids(
+        self,
+        temp_llm_index_dir,
+        mock_embed_model,
+        mocker,
+    ) -> None:
+        """The MetadataFilters given to VectorIndexRetriever must list only the
+        requested documents' ids, so content from other indexed documents is
+        not surfaced. The retriever is mocked; only the filter is checked.
+        """
+        from documents.tests.factories import DocumentFactory
+        from paperless_ai import indexing
+
+        included = DocumentFactory.create(content="included document content")
+        excluded = DocumentFactory.create(content="excluded document content")
+        indexing.llm_index_add_or_update_document(included)
+        indexing.llm_index_add_or_update_document(excluded)
+
+        # VectorIndexRetriever is imported lazily inside _stream_chat_with_documents,
+        # so it is patched on its llama_index source module, not on paperless_ai.chat.
+        captured_filters = []
+        mock_retriever = mocker.MagicMock()
+        mock_retriever.retrieve.return_value = []  # retrieval yields nothing
+
+        def capture_retriever(*args, **kwargs):
+            captured_filters.append(kwargs.get("filters"))
+            return mock_retriever
+
+        mocker.patch("paperless_ai.chat.AIClient")  # replace AIClient with a mock
+        mocker.patch(
+            "llama_index.core.retrievers.VectorIndexRetriever",
+            side_effect=capture_retriever,
+        )
+
+        list(chat.stream_chat_with_documents("question?", [included]))
+
+        assert captured_filters, "VectorIndexRetriever was never constructed"
+        filt = captured_filters[0]  # "filters" kwarg of the first construction
+        assert filt is not None, "Retriever must receive a MetadataFilters"
+        filter_values = filt.filters[0].value
+        assert str(included.pk) in filter_values
+        assert str(excluded.pk) not in filter_values