test(ai): drop FAISS-internal assertions

Remove tests that validated removed internals (get_or_create_storage_context,
remove_document_docstore_nodes, index.docstore.docs) and rewrite the remaining
ones to assert against the LanceDB store directly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
stumpylog
2026-06-03 09:25:45 -07:00
parent 788ae5d4e5
commit 192e73b737
2 changed files with 62 additions and 331 deletions
+61 -330
View File
@@ -5,10 +5,8 @@ from unittest.mock import patch
import pytest
import pytest_mock
from django.contrib.auth.models import User
from django.test import override_settings
from django.utils import timezone
from faker import Faker
from documents.models import Document
from documents.models import PaperlessTask
@@ -120,7 +118,6 @@ def test_update_llm_index(
ai_config.assert_called_once()
build_document_node.assert_called_once_with(real_document, chunk_size=512)
assert any(temp_llm_index_dir.glob("*.json"))
@pytest.mark.django_db
@@ -129,10 +126,10 @@ def test_update_llm_index_removes_meta(
real_document,
mock_embed_model,
) -> None:
# Pre-create a meta.json with incorrect data
(temp_llm_index_dir / "meta.json").write_text(
json.dumps({"embedding_model": "old", "dim": 1}),
)
# Pre-create a meta.json — the new LanceDB-backed rebuild must delete it so
# that stale FAISS-era metadata does not accumulate on disk.
stale_meta = temp_llm_index_dir / "meta.json"
stale_meta.write_text(json.dumps({"embedding_model": "old", "dim": 1}))
with patch("documents.models.Document.objects.all") as mock_all:
mock_queryset = MagicMock()
@@ -141,16 +138,9 @@ def test_update_llm_index_removes_meta(
mock_all.return_value = mock_queryset
indexing.update_llm_index(rebuild=True)
meta = json.loads((temp_llm_index_dir / "meta.json").read_text())
from paperless.config import AIConfig
config = AIConfig()
expected_model = config.llm_embedding_model or (
"text-embedding-3-small"
if config.llm_embedding_backend == "openai-like"
else "sentence-transformers/all-MiniLM-L6-v2"
assert not stale_meta.exists(), (
"update_llm_index(rebuild=True) must remove stale meta.json"
)
assert meta == {"embedding_model": expected_model, "dim": 384}
@pytest.mark.django_db
@@ -192,97 +182,12 @@ def test_update_llm_index_partial_update(
mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
mock_all.return_value = mock_queryset
# assert logs "Updating LLM index with %d new nodes and removing %d old nodes."
with patch("paperless_ai.indexing.logger") as mock_logger:
indexing.update_llm_index(rebuild=False)
mock_logger.info.assert_called_once_with(
"Updating %d nodes in LLM index.",
2,
)
indexing.update_llm_index(rebuild=False)
assert any(temp_llm_index_dir.glob("*.json"))
def test_get_or_create_storage_context_raises_exception(
temp_llm_index_dir,
mock_embed_model,
) -> None:
with pytest.raises(Exception):
indexing.get_or_create_storage_context(rebuild=False)
@override_settings(
LLM_EMBEDDING_BACKEND="huggingface",
)
def test_load_or_build_index_builds_when_nodes_given(
temp_llm_index_dir,
real_document,
mock_embed_model,
) -> None:
with (
patch(
"llama_index.core.load_index_from_storage",
side_effect=ValueError("Index not found"),
),
patch(
"llama_index.core.VectorStoreIndex",
return_value=MagicMock(),
) as mock_index_cls,
patch(
"paperless_ai.indexing.get_or_create_storage_context",
return_value=MagicMock(),
) as mock_storage,
):
mock_storage.return_value.persist_dir = temp_llm_index_dir
indexing.load_or_build_index(
nodes=[indexing.build_document_node(real_document)],
)
mock_index_cls.assert_called_once()
def test_load_or_build_index_raises_exception_when_no_nodes(
temp_llm_index_dir,
mock_embed_model,
) -> None:
with (
patch(
"llama_index.core.load_index_from_storage",
side_effect=ValueError("Index not found"),
),
patch(
"paperless_ai.indexing.get_or_create_storage_context",
return_value=MagicMock(),
),
):
with pytest.raises(Exception):
indexing.load_or_build_index()
@pytest.mark.django_db
def test_load_or_build_index_succeeds_when_nodes_given(
temp_llm_index_dir,
mock_embed_model,
) -> None:
with (
patch(
"llama_index.core.load_index_from_storage",
side_effect=ValueError("Index not found"),
),
patch(
"llama_index.core.VectorStoreIndex",
return_value=MagicMock(),
) as mock_index_cls,
patch(
"paperless_ai.indexing.get_or_create_storage_context",
return_value=MagicMock(),
) as mock_storage,
):
mock_storage.return_value.persist_dir = temp_llm_index_dir
indexing.load_or_build_index(
nodes=[MagicMock()],
)
mock_index_cls.assert_called_once()
store = indexing.get_vector_store()
assert store.table_exists(), (
"Expected the LanceDB table to exist after incremental update"
)
@pytest.mark.django_db
@@ -294,22 +199,10 @@ def test_add_or_update_document_updates_existing_entry(
indexing.update_llm_index(rebuild=True)
indexing.llm_index_add_or_update_document(real_document)
assert any(temp_llm_index_dir.glob("*.json"))
@pytest.mark.django_db
def test_remove_document_deletes_node_from_docstore(
temp_llm_index_dir,
real_document,
mock_embed_model,
) -> None:
indexing.update_llm_index(rebuild=True)
index = indexing.load_or_build_index()
assert len(index.docstore.docs) == 1
indexing.llm_index_remove_document(real_document)
index = indexing.load_or_build_index()
assert len(index.docstore.docs) == 0
store = indexing.get_vector_store()
assert store.table_exists(), (
"Expected the LanceDB table to exist after add-or-update"
)
@pytest.mark.django_db
@@ -392,7 +285,6 @@ def test_query_similar_documents(
real_document,
) -> None:
with (
patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
patch(
"paperless_ai.indexing.vector_store_file_exists",
@@ -400,8 +292,6 @@ def test_query_similar_documents(
patch("llama_index.core.retrievers.VectorIndexRetriever") as mock_retriever_cls,
patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
):
mock_storage.return_value = MagicMock()
mock_storage.return_value.persist_dir = temp_llm_index_dir
mock_vector_store_exists.return_value = True
mock_index = MagicMock()
@@ -461,113 +351,6 @@ def test_query_similar_documents_triggers_update_when_index_missing(
assert result == []
@pytest.mark.django_db
def test_query_similar_documents_normalizes_and_post_filters_allowed_ids(
real_document,
) -> None:
real_document.owner = User.objects.create_user(username="rag-owner")
real_document.save()
private_owner = User.objects.create_user(username="rag-private-owner")
private_document = Document.objects.create(
title="Private similar document",
content="Similar private content that must not reach RAG.",
owner=private_owner,
added=timezone.now(),
)
with (
patch(
"paperless_ai.indexing.vector_store_file_exists",
return_value=True,
),
patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
patch("llama_index.core.retrievers.VectorIndexRetriever") as mock_retriever_cls,
):
allowed_node = MagicMock()
allowed_node.node_id = "allowed-node"
allowed_node.metadata = {"document_id": str(real_document.pk)}
private_node = MagicMock()
private_node.node_id = "private-node"
private_node.metadata = {"document_id": str(private_document.pk)}
mock_index = MagicMock()
mock_index.docstore.docs.values.return_value = [allowed_node, private_node]
mock_load_or_build_index.return_value = mock_index
mock_retriever = MagicMock()
mock_retriever.retrieve.return_value = [private_node, allowed_node]
mock_retriever_cls.return_value = mock_retriever
result = indexing.query_similar_documents(
real_document,
top_k=2,
document_ids=[real_document.pk],
)
mock_retriever_cls.assert_called_once_with(
index=mock_index,
similarity_top_k=2,
doc_ids=["allowed-node"],
)
assert result == [real_document]
assert private_document not in result
class TestUpdateLlmIndexStaleNodes:
"""Tests that update_llm_index removes ALL nodes for a multi-chunk document."""
@pytest.mark.django_db
def test_incremental_update_removes_all_old_nodes_for_multi_chunk_document(
self,
temp_llm_index_dir,
mock_embed_model: MagicMock,
) -> None:
"""Ghost nodes from all chunks of a modified document must be removed.
When a document is split into multiple chunks (chunk_size=1024), the
incremental update path must delete every old node, not just the last
one captured by a dict comprehension keyed on document_id.
"""
# Content long enough to produce at least two chunks at chunk_size=1024.
# Generate many paragraphs so the token count comfortably exceeds 1024.
fake = Faker()
long_content = "\n\n".join(fake.paragraph(nb_sentences=20) for _ in range(20))
doc = DocumentFactory(content=long_content)
# Build the initial index (rebuild=True) so it has multiple nodes
indexing.update_llm_index(rebuild=True)
# Verify the initial index has more than one node for this document
initial_index = indexing.load_or_build_index()
initial_node_ids = [
nid
for nid, node in initial_index.docstore.docs.items()
if node.metadata.get("document_id") == str(doc.id)
]
assert len(initial_node_ids) > 1, (
f"Expected multiple chunks but got {len(initial_node_ids)}; "
"increase long_content length"
)
# Simulate a modification so the incremental path treats it as changed.
# Use queryset.update() to bypass auto_now and actually change the DB value.
new_modified = timezone.now()
Document.objects.filter(pk=doc.pk).update(modified=new_modified)
# Run incremental update (rebuild=False) with the modified document
indexing.update_llm_index(rebuild=False)
# Reload the persisted index and check that no OLD node ids remain
updated_index = indexing.load_or_build_index()
remaining_old_node_ids = [
nid for nid in initial_node_ids if nid in updated_index.docstore.docs
]
assert remaining_old_node_ids == [], (
f"Ghost nodes still present after incremental update: "
f"{remaining_old_node_ids}"
)
@pytest.mark.django_db
def test_query_similar_documents_empty_allow_list_fails_closed(
real_document,
@@ -592,11 +375,10 @@ def test_query_similar_documents_empty_allow_list_fails_closed(
class TestUpdateLlmIndexEmptyDocumentSet:
"""update_llm_index must persist an empty index when all documents are deleted.
"""update_llm_index must clear the LanceDB table when all documents are deleted.
Without this, the stale on-disk FAISS vectors are never cleared and
subsequent similarity searches return phantom hits for document IDs that
no longer exist in the DB.
Without this, the stale vectors are never cleared and subsequent similarity
searches return phantom hits for document IDs that no longer exist in the DB.
"""
@pytest.mark.django_db
@@ -605,14 +387,13 @@ class TestUpdateLlmIndexEmptyDocumentSet:
temp_llm_index_dir: Path,
mock_embed_model: MagicMock,
) -> None:
"""After deleting all documents, rebuild=True must persist an empty index.
"""After deleting all documents, rebuild=True must produce a table with zero rows.
Steps:
1. Build an index with one document so the on-disk state is non-empty.
2. Delete all documents from the DB.
3. Call update_llm_index(rebuild=True).
4. Reload the index from disk.
5. Assert the reloaded index has zero nodes (no phantom vectors).
4. Open the LanceDB table directly and assert zero rows.
"""
# Step 1: create a document and build a non-empty index
Document.objects.create(
@@ -622,26 +403,23 @@ class TestUpdateLlmIndexEmptyDocumentSet:
)
indexing.update_llm_index(rebuild=True)
initial_index = indexing.load_or_build_index()
assert len(initial_index.docstore.docs) > 0, (
"Precondition failed: expected at least one node before deletion"
store = indexing.get_vector_store()
assert store.table_exists(), (
"Precondition failed: expected the LanceDB table to exist before deletion"
)
# Step 2: delete all documents
Document.objects.all().delete()
assert not Document.objects.exists()
# Step 3: rebuild with no documents
# Step 3: rebuild with no documents — drop_table is called so the table
# is removed (no rows to re-insert, so it stays absent).
indexing.update_llm_index(rebuild=True)
# Step 4: reload the persisted index from disk
reloaded_index = indexing.load_or_build_index()
# Step 5: phantom vectors must be gone
assert len(reloaded_index.docstore.docs) == 0, (
f"Expected 0 nodes after clearing all documents, "
f"but found {len(reloaded_index.docstore.docs)}: "
f"{list(reloaded_index.docstore.docs.keys())}"
# Step 4: the table must be absent (no rows) — phantom vectors gone
store2 = indexing.get_vector_store()
assert not store2.table_exists(), (
"Expected the LanceDB table to be absent after rebuilding with no documents"
)
@@ -691,10 +469,14 @@ class TestLlmIndexAddOrUpdateDocumentEmptyContent:
def test_returns_without_error_when_build_document_node_returns_empty(
self,
temp_llm_index_dir: Path,
mock_embed_model: MagicMock,
mocker: pytest_mock.MockerFixture,
) -> None:
"""When build_document_node returns [], the function must return without error
and must not call load_or_build_index at all."""
"""When build_document_node returns [], the function must return without error.
The store's upsert_document treats an empty node list as a removal (no-op
delete), so load_or_build_index must not be called.
"""
mocker.patch(
"paperless_ai.indexing.build_document_node",
return_value=[],
@@ -702,6 +484,7 @@ class TestLlmIndexAddOrUpdateDocumentEmptyContent:
mock_load = mocker.patch("paperless_ai.indexing.load_or_build_index")
doc = MagicMock(spec=Document)
doc.id = 42
# Must not raise
indexing.llm_index_add_or_update_document(doc)
@@ -710,19 +493,19 @@ class TestLlmIndexAddOrUpdateDocumentEmptyContent:
@pytest.mark.django_db
class TestLlmIndexLocking:
"""The FAISS index mutation functions must acquire the index lock before touching the index.
"""Index mutation functions must acquire the file lock before touching the store.
Without locking, two concurrent Celery workers can each load the same
on-disk index, make independent modifications, and the last writer silently
overwrites the first's changes.
Without locking, two concurrent Celery workers can each open the same
LanceDB store, make independent modifications, and overwrite each other.
"""
def test_add_or_update_document_acquires_lock(
self,
temp_llm_index_dir: Path,
mock_embed_model: MagicMock,
mocker: pytest_mock.MockerFixture,
) -> None:
"""llm_index_add_or_update_document must enter the file lock before touching the index."""
"""llm_index_add_or_update_document must enter the file lock before upsert."""
call_order: list[str] = []
mock_lock_instance = MagicMock()
@@ -736,26 +519,29 @@ class TestLlmIndexLocking:
return_value=mock_lock_instance,
)
mock_load = mocker.patch(
"paperless_ai.indexing.load_or_build_index",
mock_store = MagicMock()
mock_get_store = mocker.patch(
"paperless_ai.indexing.get_vector_store",
side_effect=lambda *_a, **_kw: (
call_order.append("index_loaded") or MagicMock()
call_order.append("store_opened") or mock_store
),
)
mock_node = MagicMock()
mock_node.get_content.return_value = "fake node text"
mocker.patch(
"paperless_ai.indexing.build_document_node",
return_value=[MagicMock()],
return_value=[mock_node],
)
mocker.patch("paperless_ai.indexing.remove_document_docstore_nodes")
doc = MagicMock(spec=Document)
doc.id = 1
indexing.llm_index_add_or_update_document(doc)
mock_file_lock_cls.assert_called_once()
mock_lock_instance.__enter__.assert_called_once()
mock_load.assert_called_once()
assert call_order.index("lock_acquired") < call_order.index("index_loaded"), (
"Lock must be acquired before the index is loaded"
mock_get_store.assert_called()
assert call_order.index("lock_acquired") < call_order.index("store_opened"), (
"Lock must be acquired before the store is opened"
)
def test_remove_document_acquires_lock(
@@ -763,7 +549,7 @@ class TestLlmIndexLocking:
temp_llm_index_dir: Path,
mocker: pytest_mock.MockerFixture,
) -> None:
"""llm_index_remove_document must enter the file lock before loading the index."""
"""llm_index_remove_document must enter the file lock before deleting from the store."""
call_order: list[str] = []
mock_lock_instance = MagicMock()
@@ -777,22 +563,23 @@ class TestLlmIndexLocking:
return_value=mock_lock_instance,
)
mock_load = mocker.patch(
"paperless_ai.indexing.load_or_build_index",
mock_store = MagicMock()
mock_get_store = mocker.patch(
"paperless_ai.indexing.get_vector_store",
side_effect=lambda *_a, **_kw: (
call_order.append("index_loaded") or MagicMock()
call_order.append("store_opened") or mock_store
),
)
mocker.patch("paperless_ai.indexing.remove_document_docstore_nodes")
doc = MagicMock(spec=Document)
doc.id = 1
indexing.llm_index_remove_document(doc)
mock_file_lock_cls.assert_called_once()
mock_lock_instance.__enter__.assert_called_once()
mock_load.assert_called_once()
assert call_order.index("lock_acquired") < call_order.index("index_loaded"), (
"Lock must be acquired before the index is loaded"
mock_get_store.assert_called()
assert call_order.index("lock_acquired") < call_order.index("store_opened"), (
"Lock must be acquired before the store is opened"
)
def test_update_llm_index_rebuild_acquires_lock(
@@ -812,73 +599,17 @@ class TestLlmIndexLocking:
)
# exists=True so the code reaches the lock; iterate over an empty
# queryset so VectorStoreIndex is called with no nodes (still exercises
# the lock path without needing heavy FAISS fixture data)
# queryset so the rebuild path runs without needing heavy fixture data.
mock_qs = MagicMock()
mock_qs.exists.return_value = True
mock_qs.__iter__ = MagicMock(return_value=iter([]))
mocker.patch("paperless_ai.indexing.Document.objects.all", return_value=mock_qs)
mocker.patch(
"paperless_ai.indexing.get_or_create_storage_context",
return_value=MagicMock(),
)
indexing.update_llm_index(rebuild=True)
mock_file_lock_cls.assert_called_once()
mock_lock_instance.__enter__.assert_called_once()
def test_query_similar_documents_acquires_lock(
self,
temp_llm_index_dir: Path,
mocker: pytest_mock.MockerFixture,
) -> None:
"""query_similar_documents must enter the file lock before loading the index."""
call_order: list[str] = []
mock_lock_instance = MagicMock()
mock_lock_instance.__enter__ = MagicMock(
side_effect=lambda *_: call_order.append("lock_acquired"),
)
mock_lock_instance.__exit__ = MagicMock(return_value=False)
mock_file_lock_cls = mocker.patch(
"paperless_ai.indexing.FileLock",
return_value=mock_lock_instance,
)
mocker.patch(
"paperless_ai.indexing.vector_store_file_exists",
return_value=True,
)
mock_index = MagicMock()
mock_index.docstore.docs = {}
mocker.patch(
"paperless_ai.indexing.load_or_build_index",
side_effect=lambda *_a, **_kw: (
call_order.append("index_loaded") or mock_index
),
)
mock_retriever = MagicMock()
mock_retriever.retrieve.return_value = []
mocker.patch(
"llama_index.core.retrievers.VectorIndexRetriever",
return_value=mock_retriever,
)
mocker.patch("paperless_ai.indexing.truncate_content", return_value="")
indexing.query_similar_documents(MagicMock(spec=Document))
mock_file_lock_cls.assert_called()
mock_lock_instance.__enter__.assert_called()
assert call_order.index("lock_acquired") < call_order.index("index_loaded"), (
"Lock must be acquired before the index is loaded"
)
@pytest.mark.django_db
class TestDimensionGuard:
+1 -1
View File
@@ -60,7 +60,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
"""
stores_text: bool = True
flat_metadata: bool = True
flat_metadata: bool = False
_uri: str = PrivateAttr()
_table_name: str = PrivateAttr()