mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-06 13:49:44 +00:00
fix(ai): sort document_id filter values; add chat filter scoping test
- chat.py: use sorted() for doc_ids in the MetadataFilters IN clause, matching the pattern already used in query_similar_documents. This ensures deterministic filter construction regardless of document iteration order.
- test_chat.py: add test_chat_filter_contains_only_requested_document_ids, verifying that the retriever receives a filter scoped only to the requested documents (not all indexed documents). Inspired by test_document_filtered_retriever_applies_lancedb_metadata_filter in origin/feature/beta-lancedb.

Co-Authored-By: shamoon <shamoon@users.noreply.github.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -94,7 +94,7 @@ def _stream_chat_with_documents(query_str: str, documents: list[Document]):
|
||||
|
||||
index = load_or_build_index()
|
||||
|
||||
doc_ids = [str(doc.pk) for doc in documents]
|
||||
doc_ids = sorted(str(doc.pk) for doc in documents)
|
||||
filters = MetadataFilters(
|
||||
filters=[
|
||||
MetadataFilter(
|
||||
|
||||
@@ -238,3 +238,46 @@ class TestStreamChatRetrieval:
|
||||
# Nothing indexed for this document yet.
|
||||
out = list(chat.stream_chat_with_documents("question?", [doc]))
|
||||
assert chat.CHAT_NO_CONTENT_MESSAGE in out
|
||||
|
||||
def test_chat_filter_contains_only_requested_document_ids(
    self,
    temp_llm_index_dir,
    mock_embed_model,
    mocker,
) -> None:
    """Check that chat retrieval is filtered down to the requested documents.

    Two documents are indexed, but only one of them is handed to the chat
    stream. The ``filters`` keyword passed when the retriever is constructed
    must list that document's id and must not list the other document's id.
    """
    from documents.tests.factories import DocumentFactory
    from paperless_ai import indexing

    included = DocumentFactory.create(content="included document content")
    excluded = DocumentFactory.create(content="excluded document content")
    for document in (included, excluded):
        indexing.llm_index_add_or_update_document(document)

    # Every ``filters`` keyword seen at retriever construction lands here.
    seen_filters = []
    stub_retriever = mocker.MagicMock()
    stub_retriever.retrieve.return_value = []

    def record_filters(*args, **kwargs):
        seen_filters.append(kwargs.get("filters"))
        return stub_retriever

    mocker.patch("paperless_ai.chat.AIClient")
    # The chat code imports VectorIndexRetriever lazily, so the patch is
    # applied on the llama_index module itself rather than on
    # paperless_ai.chat.
    mocker.patch(
        "llama_index.core.retrievers.VectorIndexRetriever",
        side_effect=record_filters,
    )

    # Drain the generator so the retriever actually gets constructed.
    list(chat.stream_chat_with_documents("question?", [included]))

    assert seen_filters, "VectorIndexRetriever was never constructed"
    first_filter = seen_filters[0]
    assert first_filter is not None, "Retriever must receive a MetadataFilters"
    requested_ids = first_filter.filters[0].value
    included_id, excluded_id = str(included.pk), str(excluded.pk)
    assert included_id in requested_ids
    assert excluded_id not in requested_ids
|
||||
|
||||
Reference in New Issue
Block a user