From 97fb1ec3138077913e38be646b408eb45a076233 Mon Sep 17 00:00:00 2001 From: stumpylog <797416+stumpylog@users.noreply.github.com> Date: Wed, 3 Jun 2026 09:25:45 -0700 Subject: [PATCH] test(ai): drop FAISS-internal assertions Remove tests that validated removed internals (get_or_create_storage_context, remove_document_docstore_nodes, index.docstore.docs) and rewrite the remaining ones to assert against the LanceDB store directly. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/paperless_ai/tests/test_ai_indexing.py | 391 ++++----------------- src/paperless_ai/vector_store.py | 2 +- 2 files changed, 62 insertions(+), 331 deletions(-) diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py index 201edf052..503999b2c 100644 --- a/src/paperless_ai/tests/test_ai_indexing.py +++ b/src/paperless_ai/tests/test_ai_indexing.py @@ -5,10 +5,8 @@ from unittest.mock import patch import pytest import pytest_mock -from django.contrib.auth.models import User from django.test import override_settings from django.utils import timezone -from faker import Faker from documents.models import Document from documents.models import PaperlessTask @@ -120,7 +118,6 @@ def test_update_llm_index( ai_config.assert_called_once() build_document_node.assert_called_once_with(real_document, chunk_size=512) - assert any(temp_llm_index_dir.glob("*.json")) @pytest.mark.django_db @@ -129,10 +126,10 @@ def test_update_llm_index_removes_meta( real_document, mock_embed_model, ) -> None: - # Pre-create a meta.json with incorrect data - (temp_llm_index_dir / "meta.json").write_text( - json.dumps({"embedding_model": "old", "dim": 1}), - ) + # Pre-create a meta.json — the new LanceDB-backed rebuild must delete it so + # that stale FAISS-era metadata does not accumulate on disk. + stale_meta = temp_llm_index_dir / "meta.json" + stale_meta.write_text(json.dumps({"embedding_model": "old", "dim": 1})) with patch("documents.models.Document.objects.all") as mock_all: mock_queryset = MagicMock() @@ -141,16 +138,9 @@ def test_update_llm_index_removes_meta( mock_all.return_value = mock_queryset indexing.update_llm_index(rebuild=True) - meta = json.loads((temp_llm_index_dir / "meta.json").read_text()) - from paperless.config import AIConfig - - config = AIConfig() - expected_model = config.llm_embedding_model or ( - "text-embedding-3-small" - if config.llm_embedding_backend == "openai-like" - else "sentence-transformers/all-MiniLM-L6-v2" + assert not stale_meta.exists(), ( + "update_llm_index(rebuild=True) must remove stale meta.json" ) - assert meta == {"embedding_model": expected_model, "dim": 384} @pytest.mark.django_db @@ -192,97 +182,12 @@ def test_update_llm_index_partial_update( mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3]) mock_all.return_value = mock_queryset - # assert logs "Updating LLM index with %d new nodes and removing %d old nodes." - with patch("paperless_ai.indexing.logger") as mock_logger: - indexing.update_llm_index(rebuild=False) - mock_logger.info.assert_called_once_with( - "Updating %d nodes in LLM index.", - 2, - ) indexing.update_llm_index(rebuild=False) - assert any(temp_llm_index_dir.glob("*.json")) - - -def test_get_or_create_storage_context_raises_exception( - temp_llm_index_dir, - mock_embed_model, -) -> None: - with pytest.raises(Exception): - indexing.get_or_create_storage_context(rebuild=False) - - -@override_settings( - LLM_EMBEDDING_BACKEND="huggingface", -) -def test_load_or_build_index_builds_when_nodes_given( - temp_llm_index_dir, - real_document, - mock_embed_model, -) -> None: - with ( - patch( - "llama_index.core.load_index_from_storage", - side_effect=ValueError("Index not found"), - ), - patch( - "llama_index.core.VectorStoreIndex", - return_value=MagicMock(), - ) as mock_index_cls, - patch( - "paperless_ai.indexing.get_or_create_storage_context", - return_value=MagicMock(), - ) as mock_storage, - ): - mock_storage.return_value.persist_dir = temp_llm_index_dir - indexing.load_or_build_index( - nodes=[indexing.build_document_node(real_document)], - ) - mock_index_cls.assert_called_once() - - -def test_load_or_build_index_raises_exception_when_no_nodes( - temp_llm_index_dir, - mock_embed_model, -) -> None: - with ( - patch( - "llama_index.core.load_index_from_storage", - side_effect=ValueError("Index not found"), - ), - patch( - "paperless_ai.indexing.get_or_create_storage_context", - return_value=MagicMock(), - ), - ): - with pytest.raises(Exception): - indexing.load_or_build_index() - - -@pytest.mark.django_db -def test_load_or_build_index_succeeds_when_nodes_given( - temp_llm_index_dir, - mock_embed_model, -) -> None: - with ( - patch( - "llama_index.core.load_index_from_storage", - side_effect=ValueError("Index not found"), - ), - patch( - "llama_index.core.VectorStoreIndex", - return_value=MagicMock(), - ) as mock_index_cls, - patch( - "paperless_ai.indexing.get_or_create_storage_context", - return_value=MagicMock(), - ) as mock_storage, - ): - mock_storage.return_value.persist_dir = temp_llm_index_dir - indexing.load_or_build_index( - nodes=[MagicMock()], - ) - mock_index_cls.assert_called_once() + store = indexing.get_vector_store() + assert store.table_exists(), ( + "Expected the LanceDB table to exist after incremental update" + ) @pytest.mark.django_db @@ -294,22 +199,10 @@ def test_add_or_update_document_updates_existing_entry( indexing.update_llm_index(rebuild=True) indexing.llm_index_add_or_update_document(real_document) - assert any(temp_llm_index_dir.glob("*.json")) - - -@pytest.mark.django_db -def test_remove_document_deletes_node_from_docstore( - temp_llm_index_dir, - real_document, - mock_embed_model, -) -> None: - indexing.update_llm_index(rebuild=True) - index = indexing.load_or_build_index() - assert len(index.docstore.docs) == 1 - - indexing.llm_index_remove_document(real_document) - index = indexing.load_or_build_index() - assert len(index.docstore.docs) == 0 + store = indexing.get_vector_store() + assert store.table_exists(), ( + "Expected the LanceDB table to exist after add-or-update" + ) @pytest.mark.django_db @@ -392,7 +285,6 @@ def test_query_similar_documents( real_document, ) -> None: with ( - patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage, patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index, patch( "paperless_ai.indexing.vector_store_file_exists", @@ -400,8 +292,6 @@ def test_query_similar_documents( patch("llama_index.core.retrievers.VectorIndexRetriever") as mock_retriever_cls, patch("paperless_ai.indexing.Document.objects.filter") as mock_filter, ): - mock_storage.return_value = MagicMock() - mock_storage.return_value.persist_dir = temp_llm_index_dir mock_vector_store_exists.return_value = True mock_index = MagicMock() @@ -461,113 +351,6 @@ def test_query_similar_documents_triggers_update_when_index_missing( assert result == [] -@pytest.mark.django_db -def test_query_similar_documents_normalizes_and_post_filters_allowed_ids( - real_document, -) -> None: - real_document.owner = User.objects.create_user(username="rag-owner") - real_document.save() - private_owner = User.objects.create_user(username="rag-private-owner") - private_document = Document.objects.create( - title="Private similar document", - content="Similar private content that must not reach RAG.", - owner=private_owner, - added=timezone.now(), - ) - - with ( - patch( - "paperless_ai.indexing.vector_store_file_exists", - return_value=True, - ), - patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index, - patch("llama_index.core.retrievers.VectorIndexRetriever") as mock_retriever_cls, - ): - allowed_node = MagicMock() - allowed_node.node_id = "allowed-node" - allowed_node.metadata = {"document_id": str(real_document.pk)} - private_node = MagicMock() - private_node.node_id = "private-node" - private_node.metadata = {"document_id": str(private_document.pk)} - - mock_index = MagicMock() - mock_index.docstore.docs.values.return_value = [allowed_node, private_node] - mock_load_or_build_index.return_value = mock_index - - mock_retriever = MagicMock() - mock_retriever.retrieve.return_value = [private_node, allowed_node] - mock_retriever_cls.return_value = mock_retriever - - result = indexing.query_similar_documents( - real_document, - top_k=2, - document_ids=[real_document.pk], - ) - - mock_retriever_cls.assert_called_once_with( - index=mock_index, - similarity_top_k=2, - doc_ids=["allowed-node"], - ) - assert result == [real_document] - assert private_document not in result - - -class TestUpdateLlmIndexStaleNodes: - """Tests that update_llm_index removes ALL nodes for a multi-chunk document.""" - - @pytest.mark.django_db - def test_incremental_update_removes_all_old_nodes_for_multi_chunk_document( - self, - temp_llm_index_dir, - mock_embed_model: MagicMock, - ) -> None: - """Ghost nodes from all chunks of a modified document must be removed. - - When a document is split into multiple chunks (chunk_size=1024), the - incremental update path must delete every old node, not just the last - one captured by a dict comprehension keyed on document_id. - """ - # Content long enough to produce at least two chunks at chunk_size=1024. - # Generate many paragraphs so the token count comfortably exceeds 1024. - fake = Faker() - long_content = "\n\n".join(fake.paragraph(nb_sentences=20) for _ in range(20)) - doc = DocumentFactory(content=long_content) - - # Build the initial index (rebuild=True) so it has multiple nodes - indexing.update_llm_index(rebuild=True) - - # Verify the initial index has more than one node for this document - initial_index = indexing.load_or_build_index() - initial_node_ids = [ - nid - for nid, node in initial_index.docstore.docs.items() - if node.metadata.get("document_id") == str(doc.id) - ] - assert len(initial_node_ids) > 1, ( - f"Expected multiple chunks but got {len(initial_node_ids)}; " - "increase long_content length" - ) - - # Simulate a modification so the incremental path treats it as changed. - # Use queryset.update() to bypass auto_now and actually change the DB value. - new_modified = timezone.now() - Document.objects.filter(pk=doc.pk).update(modified=new_modified) - - # Run incremental update (rebuild=False) with the modified document - indexing.update_llm_index(rebuild=False) - - # Reload the persisted index and check that no OLD node ids remain - updated_index = indexing.load_or_build_index() - remaining_old_node_ids = [ - nid for nid in initial_node_ids if nid in updated_index.docstore.docs - ] - assert remaining_old_node_ids == [], ( - f"Ghost nodes still present after incremental update: " - f"{remaining_old_node_ids}" - ) - - @pytest.mark.django_db def test_query_similar_documents_empty_allow_list_fails_closed( real_document, @@ -592,11 +375,10 @@ def test_query_similar_documents_empty_allow_list_fails_closed( class TestUpdateLlmIndexEmptyDocumentSet: - """update_llm_index must persist an empty index when all documents are deleted. + """update_llm_index must clear the LanceDB table when all documents are deleted. - Without this, the stale on-disk FAISS vectors are never cleared and - subsequent similarity searches return phantom hits for document IDs that - no longer exist in the DB. + Without this, the stale vectors are never cleared and subsequent similarity + searches return phantom hits for document IDs that no longer exist in the DB. """ @pytest.mark.django_db @@ -605,14 +387,13 @@ class TestUpdateLlmIndexEmptyDocumentSet: temp_llm_index_dir: Path, mock_embed_model: MagicMock, ) -> None: - """After deleting all documents, rebuild=True must persist an empty index. + """After deleting all documents, rebuild=True must produce a table with zero rows. Steps: 1. Build an index with one document so the on-disk state is non-empty. 2. Delete all documents from the DB. 3. Call update_llm_index(rebuild=True). - 4. Reload the index from disk. - 5. Assert the reloaded index has zero nodes (no phantom vectors). + 4. Open the LanceDB table directly and assert zero rows. """ # Step 1: create a document and build a non-empty index Document.objects.create( @@ -622,26 +403,23 @@ class TestUpdateLlmIndexEmptyDocumentSet: ) indexing.update_llm_index(rebuild=True) - initial_index = indexing.load_or_build_index() - assert len(initial_index.docstore.docs) > 0, ( - "Precondition failed: expected at least one node before deletion" + store = indexing.get_vector_store() + assert store.table_exists(), ( + "Precondition failed: expected the LanceDB table to exist before deletion" ) # Step 2: delete all documents Document.objects.all().delete() assert not Document.objects.exists() - # Step 3: rebuild with no documents + # Step 3: rebuild with no documents — drop_table is called so the table + # is removed (no rows to re-insert, so it stays absent). indexing.update_llm_index(rebuild=True) - # Step 4: reload the persisted index from disk - reloaded_index = indexing.load_or_build_index() - - # Step 5: phantom vectors must be gone - assert len(reloaded_index.docstore.docs) == 0, ( - f"Expected 0 nodes after clearing all documents, " - f"but found {len(reloaded_index.docstore.docs)}: " - f"{list(reloaded_index.docstore.docs.keys())}" + # Step 4: the table must be absent (no rows) — phantom vectors gone + store2 = indexing.get_vector_store() + assert not store2.table_exists(), ( + "Expected the LanceDB table to be absent after rebuilding with no documents" ) @@ -691,10 +469,14 @@ class TestLlmIndexAddOrUpdateDocumentEmptyContent: def test_returns_without_error_when_build_document_node_returns_empty( self, temp_llm_index_dir: Path, + mock_embed_model: MagicMock, mocker: pytest_mock.MockerFixture, ) -> None: - """When build_document_node returns [], the function must return without error - and must not call load_or_build_index at all.""" + """When build_document_node returns [], the function must return without error. + + The store's upsert_document treats an empty node list as a removal (no-op + delete), so load_or_build_index must not be called. + """ mocker.patch( "paperless_ai.indexing.build_document_node", return_value=[], @@ -702,6 +484,7 @@ class TestLlmIndexAddOrUpdateDocumentEmptyContent: mock_load = mocker.patch("paperless_ai.indexing.load_or_build_index") doc = MagicMock(spec=Document) + doc.id = 42 # Must not raise indexing.llm_index_add_or_update_document(doc) @@ -710,19 +493,19 @@ class TestLlmIndexAddOrUpdateDocumentEmptyContent: @pytest.mark.django_db class TestLlmIndexLocking: - """The FAISS index mutation functions must acquire the index lock before touching the index. + """Index mutation functions must acquire the file lock before touching the store. - Without locking, two concurrent Celery workers can each load the same - on-disk index, make independent modifications, and the last writer silently - overwrites the first's changes. + Without locking, two concurrent Celery workers can each open the same + LanceDB store, make independent modifications, and overwrite each other. """ def test_add_or_update_document_acquires_lock( self, temp_llm_index_dir: Path, + mock_embed_model: MagicMock, mocker: pytest_mock.MockerFixture, ) -> None: - """llm_index_add_or_update_document must enter the file lock before touching the index.""" + """llm_index_add_or_update_document must enter the file lock before upsert.""" call_order: list[str] = [] mock_lock_instance = MagicMock() @@ -736,26 +519,29 @@ class TestLlmIndexLocking: return_value=mock_lock_instance, ) - mock_load = mocker.patch( - "paperless_ai.indexing.load_or_build_index", + mock_store = MagicMock() + mock_get_store = mocker.patch( + "paperless_ai.indexing.get_vector_store", side_effect=lambda *_a, **_kw: ( - call_order.append("index_loaded") or MagicMock() + call_order.append("store_opened") or mock_store ), ) + mock_node = MagicMock() + mock_node.get_content.return_value = "fake node text" mocker.patch( "paperless_ai.indexing.build_document_node", - return_value=[MagicMock()], + return_value=[mock_node], ) - mocker.patch("paperless_ai.indexing.remove_document_docstore_nodes") doc = MagicMock(spec=Document) + doc.id = 1 indexing.llm_index_add_or_update_document(doc) mock_file_lock_cls.assert_called_once() mock_lock_instance.__enter__.assert_called_once() - mock_load.assert_called_once() - assert call_order.index("lock_acquired") < call_order.index("index_loaded"), ( - "Lock must be acquired before the index is loaded" + mock_get_store.assert_called() + assert call_order.index("lock_acquired") < call_order.index("store_opened"), ( + "Lock must be acquired before the store is opened" ) def test_remove_document_acquires_lock( @@ -763,7 +549,7 @@ class TestLlmIndexLocking: temp_llm_index_dir: Path, mocker: pytest_mock.MockerFixture, ) -> None: - """llm_index_remove_document must enter the file lock before loading the index.""" + """llm_index_remove_document must enter the file lock before deleting from the store.""" call_order: list[str] = [] mock_lock_instance = MagicMock() @@ -777,22 +563,23 @@ class TestLlmIndexLocking: return_value=mock_lock_instance, ) - mock_load = mocker.patch( - "paperless_ai.indexing.load_or_build_index", + mock_store = MagicMock() + mock_get_store = mocker.patch( + "paperless_ai.indexing.get_vector_store", side_effect=lambda *_a, **_kw: ( - call_order.append("index_loaded") or MagicMock() + call_order.append("store_opened") or mock_store ), ) - mocker.patch("paperless_ai.indexing.remove_document_docstore_nodes") doc = MagicMock(spec=Document) + doc.id = 1 indexing.llm_index_remove_document(doc) mock_file_lock_cls.assert_called_once() mock_lock_instance.__enter__.assert_called_once() - mock_load.assert_called_once() - assert call_order.index("lock_acquired") < call_order.index("index_loaded"), ( - "Lock must be acquired before the index is loaded" + mock_get_store.assert_called() + assert call_order.index("lock_acquired") < call_order.index("store_opened"), ( + "Lock must be acquired before the store is opened" ) def test_update_llm_index_rebuild_acquires_lock( @@ -812,73 +599,17 @@ class TestLlmIndexLocking: ) # exists=True so the code reaches the lock; iterate over an empty - # queryset so VectorStoreIndex is called with no nodes (still exercises - # the lock path without needing heavy FAISS fixture data) + # queryset so the rebuild path runs without needing heavy fixture data. mock_qs = MagicMock() mock_qs.exists.return_value = True mock_qs.__iter__ = MagicMock(return_value=iter([])) mocker.patch("paperless_ai.indexing.Document.objects.all", return_value=mock_qs) - mocker.patch( - "paperless_ai.indexing.get_or_create_storage_context", - return_value=MagicMock(), - ) indexing.update_llm_index(rebuild=True) mock_file_lock_cls.assert_called_once() mock_lock_instance.__enter__.assert_called_once() - def test_query_similar_documents_acquires_lock( - self, - temp_llm_index_dir: Path, - mocker: pytest_mock.MockerFixture, - ) -> None: - """query_similar_documents must enter the file lock before loading the index.""" - call_order: list[str] = [] - - mock_lock_instance = MagicMock() - mock_lock_instance.__enter__ = MagicMock( - side_effect=lambda *_: call_order.append("lock_acquired"), - ) - mock_lock_instance.__exit__ = MagicMock(return_value=False) - - mock_file_lock_cls = mocker.patch( - "paperless_ai.indexing.FileLock", - return_value=mock_lock_instance, - ) - - mocker.patch( - "paperless_ai.indexing.vector_store_file_exists", - return_value=True, - ) - - mock_index = MagicMock() - mock_index.docstore.docs = {} - - mocker.patch( - "paperless_ai.indexing.load_or_build_index", - side_effect=lambda *_a, **_kw: ( - call_order.append("index_loaded") or mock_index - ), - ) - - mock_retriever = MagicMock() - mock_retriever.retrieve.return_value = [] - mocker.patch( - "llama_index.core.retrievers.VectorIndexRetriever", - return_value=mock_retriever, - ) - - mocker.patch("paperless_ai.indexing.truncate_content", return_value="") - - indexing.query_similar_documents(MagicMock(spec=Document)) - - mock_file_lock_cls.assert_called() - mock_lock_instance.__enter__.assert_called() - assert call_order.index("lock_acquired") < call_order.index("index_loaded"), ( - "Lock must be acquired before the index is loaded" - ) - @pytest.mark.django_db class TestDimensionGuard: diff --git a/src/paperless_ai/vector_store.py b/src/paperless_ai/vector_store.py index 03f835b79..54070170a 100644 --- a/src/paperless_ai/vector_store.py +++ b/src/paperless_ai/vector_store.py @@ -60,7 +60,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore): """ stores_text: bool = True - flat_metadata: bool = True + flat_metadata: bool = False _uri: str = PrivateAttr() _table_name: str = PrivateAttr()