From a2d66a232e95365e40de7971247614296865be62 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue, 2 Jun 2026 09:27:49 -0700 Subject: [PATCH] Ok, first swap out storage stuff --- src/paperless_ai/indexing.py | 37 +++++++++++----------- src/paperless_ai/tests/test_ai_indexing.py | 8 +++-- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py index 7ec1fdba3..dd33dfce9 100644 --- a/src/paperless_ai/indexing.py +++ b/src/paperless_ai/indexing.py @@ -80,26 +80,24 @@ def get_or_create_storage_context(*, rebuild=False): shutil.rmtree(settings.LLM_INDEX_DIR, ignore_errors=True) settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True) - if rebuild or not settings.LLM_INDEX_DIR.exists(): - import faiss - from llama_index.core import StorageContext - from llama_index.core.storage.docstore import SimpleDocumentStore - from llama_index.core.storage.index_store import SimpleIndexStore - from llama_index.vector_stores.faiss import FaissVectorStore + from llama_index.core import StorageContext + from llama_index.core.storage.docstore import SimpleDocumentStore + from llama_index.core.storage.index_store import SimpleIndexStore + from llama_index.vector_stores.lancedb import LanceDBVectorStore - settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True) - embedding_dim = get_embedding_dim() - faiss_index = faiss.IndexFlatL2(embedding_dim) - vector_store = FaissVectorStore(faiss_index=faiss_index) + settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True) + vector_store = LanceDBVectorStore( + uri=str(settings.LLM_INDEX_DIR / "lancedb"), + table_name="vectors", + mode="overwrite" if rebuild else "create", + stores_text=False, + flat_metadata=False, + ) + + if rebuild or not (settings.LLM_INDEX_DIR / "docstore.json").exists(): docstore = SimpleDocumentStore() index_store = SimpleIndexStore() else: - from llama_index.core import StorageContext - from llama_index.core.storage.docstore import SimpleDocumentStore - from llama_index.core.storage.index_store import SimpleIndexStore - from llama_index.vector_stores.faiss import FaissVectorStore - - vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR) docstore = SimpleDocumentStore.from_persist_dir(settings.LLM_INDEX_DIR) index_store = SimpleIndexStore.from_persist_dir(settings.LLM_INDEX_DIR) @@ -211,9 +209,11 @@ def remove_document_docstore_nodes(document: Document, index: "VectorStoreIndex" def vector_store_file_exists(): """ - Check if the vector store file exists in the LLM index directory. + Check if the LanceDB vector store exists in the LLM index directory. """ - return Path(settings.LLM_INDEX_DIR / "default__vector_store.json").exists() + return Path( + settings.LLM_INDEX_DIR / "lancedb" / "vectors.lance", + ).exists() def get_rag_chunk_size() -> int: @@ -281,6 +281,7 @@ def update_llm_index( embed_model = get_embedding_model() llama_settings.Settings.embed_model = embed_model storage_context = get_or_create_storage_context(rebuild=True) + get_embedding_dim() for document in iter_wrapper(documents): document_nodes = build_document_node(document, chunk_size=chunk_size) nodes.extend(document_nodes) diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py index 339d75ead..9855089c3 100644 --- a/src/paperless_ai/tests/test_ai_indexing.py +++ b/src/paperless_ai/tests/test_ai_indexing.py @@ -222,12 +222,14 @@ def test_update_llm_index_partial_update( assert any(temp_llm_index_dir.glob("*.json")) -def test_get_or_create_storage_context_raises_exception( +def test_get_or_create_storage_context_creates_empty_context( temp_llm_index_dir, mock_embed_model, ) -> None: - with pytest.raises(Exception): - indexing.get_or_create_storage_context(rebuild=False) + storage_context = indexing.get_or_create_storage_context(rebuild=False) + + assert storage_context.vector_store is not None + assert not indexing.vector_store_file_exists() @override_settings(