feat(ai): dimension guard and FAISS index migration

Adds current_embedding_dim() to embedding.py, and migrate_stale_faiss_index() and embedding_dim_mismatch() to indexing.py. update_llm_index now calls both indexing helpers before doing any work, so that a stale FAISS index directory is wiped at the start of each index update and a change in embedding dimension forces a full index rebuild.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-08-01 08:32:18 +00:00 · 2026-06-05 11:43:42 -07:00
parent 1f2af9087c
commit b855eba878
3 changed files with 43 additions and 0 deletions
@@ -132,6 +132,11 @@ def get_embedding_dim() -> int:
    return dim


+def current_embedding_dim() -> int:
+    """Return the embedding dimension reported by ``get_embedding_dim()``.
+
+    Thin alias: the call is forwarded unchanged and its result returned as-is.
+    NOTE(review): the original note says the underlying call probes the model
+    when no value is cached; that body is outside this hunk, so confirm there.
+    """
+    dim = get_embedding_dim()
+    return dim
+
+
 def _normalize_llm_index_text(text: str) -> str:
    text = OCR_LEADER_REGEX.sub(" ", text)
    return HORIZONTAL_WHITESPACE_REGEX.sub(" ", text)
@@ -1,5 +1,6 @@
 import json
 import logging
+import shutil
 from collections.abc import Iterable
 from datetime import timedelta
 from pathlib import Path
@@ -155,6 +156,26 @@ def vector_store_file_exists() -> bool:
    return get_vector_store().table_exists()


+def migrate_stale_faiss_index() -> None:
+    """Remove a pre-LanceDB FAISS index directory so it is rebuilt fresh.
+
+    The presence of ``default__vector_store.json`` inside
+    ``settings.LLM_INDEX_DIR`` is treated as the marker of a stale FAISS
+    index. When it is found, the whole directory (and anything else stored
+    in it) is deleted and an empty directory is recreated in its place.
+    When the marker is absent this is a no-op.
+
+    Removal is best-effort: filesystem errors do not raise. If the marker
+    is still present afterwards a warning is logged, because the next call
+    would otherwise silently wipe the directory again.
+    """
+    index_dir = settings.LLM_INDEX_DIR
+    stale_marker = index_dir / "default__vector_store.json"
+    if not stale_marker.exists():
+        return
+
+    logger.info("Removing stale FAISS LLM index; it will be rebuilt.")
+    # ignore_errors keeps index updates from crashing on a partial delete.
+    shutil.rmtree(index_dir, ignore_errors=True)
+    index_dir.mkdir(parents=True, exist_ok=True)
+
+    # rmtree swallowed any failure above, so verify the marker is really gone.
+    if stale_marker.exists():
+        logger.warning(
+            "Could not fully remove stale FAISS LLM index at %s; "
+            "cleanup will be retried on the next index update.",
+            index_dir,
+        )
+
+
+def embedding_dim_mismatch() -> bool:
+    """Report whether the stored vector dimension differs from the current one.
+
+    Returns ``False`` when the vector store reports no dimension
+    (``vector_dim()`` is ``None``); otherwise compares that value against
+    ``current_embedding_dim()``.
+    """
+    stored_dim = get_vector_store().vector_dim()
+    if stored_dim is not None:
+        # Function-scope import kept from the original; presumably it avoids
+        # a circular import with paperless_ai.embedding — TODO confirm.
+        from paperless_ai.embedding import current_embedding_dim
+
+        return stored_dim != current_embedding_dim()
+    return False
+
+
 def get_rag_chunk_size() -> int:
    return AIConfig().llm_embedding_chunk_size

@@ -211,6 +232,11 @@ def update_llm_index(
    """Rebuild or incrementally update the LLM index."""
    from llama_index.core.schema import MetadataMode

+    migrate_stale_faiss_index()
+    if not rebuild and vector_store_file_exists() and embedding_dim_mismatch():
+        logger.warning("Embedding dimension changed; forcing LLM index rebuild.")
+        rebuild = True
+
    documents = Document.objects.all()
    if not documents.exists():
        logger.warning("No documents found to index.")
@@ -910,6 +910,18 @@ class TestLlmIndexLocking:
        )


+@pytest.mark.django_db
+class TestFaissMigration:
+    """Covers ``indexing.migrate_stale_faiss_index``."""
+
+    def test_migration_wipes_stale_faiss_files(
+        self,
+        temp_llm_index_dir: Path,
+    ) -> None:
+        """Marker file is removed and the index directory is recreated."""
+        # Assumes the temp_llm_index_dir fixture points settings.LLM_INDEX_DIR
+        # at this directory; the fixture is defined outside this hunk.
+        stale = temp_llm_index_dir / "default__vector_store.json"
+        stale.write_text("{}")
+        indexing.migrate_stale_faiss_index()
+        assert not stale.exists()
+        # The migration recreates the directory after wiping it.
+        assert temp_llm_index_dir.is_dir()
+
+
@pytest.mark.django_db
 class TestLanceDbIndexing:
    def test_get_vector_store_roundtrip(