diff --git a/src/paperless_ai/embedding.py b/src/paperless_ai/embedding.py
index 2695e9fb3..8524e77bf 100644
--- a/src/paperless_ai/embedding.py
+++ b/src/paperless_ai/embedding.py
@@ -132,6 +132,14 @@ def get_embedding_dim() -> int:
     return dim
 
 
+def current_embedding_dim() -> int:
+    """Return the embedding dimension for the configured model.
+
+    Delegates to :func:`get_embedding_dim` (probes if not cached).
+    """
+    return get_embedding_dim()
+
+
 def _normalize_llm_index_text(text: str) -> str:
     text = OCR_LEADER_REGEX.sub(" ", text)
     return HORIZONTAL_WHITESPACE_REGEX.sub(" ", text)
diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index a55479acf..47239d4dd 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import shutil
 from collections.abc import Iterable
 from datetime import timedelta
 from pathlib import Path
@@ -155,6 +156,46 @@ def vector_store_file_exists() -> bool:
     return get_vector_store().table_exists()
 
 
+def migrate_stale_faiss_index() -> None:
+    """Remove a pre-LanceDB FAISS index directory so it is rebuilt fresh.
+
+    The presence of ``default__vector_store.json`` in ``LLM_INDEX_DIR`` is
+    treated as the marker of a stale FAISS index. When found, the whole index
+    directory is deleted and recreated empty; otherwise nothing happens.
+    """
+    stale_marker = settings.LLM_INDEX_DIR / "default__vector_store.json"
+    if not stale_marker.exists():
+        return
+
+    logger.info("Removing stale FAISS LLM index; it will be rebuilt.")
+    # Best-effort removal: ignore_errors=True suppresses deletion failures.
+    shutil.rmtree(settings.LLM_INDEX_DIR, ignore_errors=True)
+    settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)
+    if stale_marker.exists():
+        # rmtree swallowed an error and the marker survived, so the next call
+        # would wipe the directory again -- including any freshly built index.
+        logger.warning(
+            "Could not remove stale FAISS marker %s; the LLM index directory "
+            "will be wiped again on the next update until it is deleted.",
+            stale_marker,
+        )
+
+
+def embedding_dim_mismatch() -> bool:
+    """Return True when the stored table's vector dim differs from the model's.
+
+    Returns False when the store reports no vector dimension (``None``).
+    """
+    store = get_vector_store()
+    stored = store.vector_dim()
+    if stored is None:
+        return False
+    # Imported at call time, and only once a stored dimension exists.
+    from paperless_ai.embedding import current_embedding_dim
+
+    return stored != current_embedding_dim()
+
+
 def get_rag_chunk_size() -> int:
     return AIConfig().llm_embedding_chunk_size
 
@@ -211,6 +252,13 @@ def update_llm_index(
     """Rebuild or incrementally update the LLM index."""
     from llama_index.core.schema import MetadataMode
 
+    # Drop any legacy FAISS index before inspecting the vector store.
+    migrate_stale_faiss_index()
+    # Stored vectors no longer match the model's dimension: force a rebuild.
+    if not rebuild and vector_store_file_exists() and embedding_dim_mismatch():
+        logger.warning("Embedding dimension changed; forcing LLM index rebuild.")
+        rebuild = True
+
     documents = Document.objects.all()
     if not documents.exists():
         logger.warning("No documents found to index.")
diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py
index 81cd9d227..fda025b5d 100644
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -910,6 +910,21 @@ class TestLlmIndexLocking:
         )
 
 
+@pytest.mark.django_db
+class TestFaissMigration:
+    def test_migration_wipes_stale_faiss_files(
+        self,
+        temp_llm_index_dir: Path,
+    ) -> None:
+        """A stale FAISS marker file is gone after the migration runs."""
+        stale = temp_llm_index_dir / "default__vector_store.json"
+        stale.write_text("{}")
+        indexing.migrate_stale_faiss_index()
+        assert not stale.exists()
+        # The index directory itself is recreated, not left missing.
+        assert temp_llm_index_dir.is_dir()
+
+
 @pytest.mark.django_db
 class TestLanceDbIndexing:
     def test_get_vector_store_roundtrip(