feat(ai): dimension guard and FAISS index migration

Adds current_embedding_dim() to embedding.py, migrate_stale_faiss_index()
and embedding_dim_mismatch() to indexing.py, and wires the latter two into
update_llm_index so that a stale FAISS index directory is wiped at the
start of an index update and an embedding model change (detected as a
vector dimension mismatch) forces a full index rebuild.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
stumpylog
2026-06-03 08:44:36 -07:00
parent 1f2af9087c
commit b855eba878
3 changed files with 43 additions and 0 deletions
+5
View File
@@ -132,6 +132,11 @@ def get_embedding_dim() -> int:
return dim
def current_embedding_dim() -> int:
    """Return the embedding dimension for the configured model.

    Delegates directly to ``get_embedding_dim()``; that call is documented
    as probing the model when no cached value is available.
    """
    dim = get_embedding_dim()
    return dim
def _normalize_llm_index_text(text: str) -> str:
    """Normalize *text* before it is used for the LLM index.

    Two substitutions run in order: every ``OCR_LEADER_REGEX`` match is
    replaced by a single space, then every ``HORIZONTAL_WHITESPACE_REGEX``
    match in the result is replaced by a single space.
    """
    without_leaders = OCR_LEADER_REGEX.sub(" ", text)
    collapsed = HORIZONTAL_WHITESPACE_REGEX.sub(" ", without_leaders)
    return collapsed
+26
View File
@@ -1,5 +1,6 @@
import json
import logging
import shutil
from collections.abc import Iterable
from datetime import timedelta
from pathlib import Path
@@ -155,6 +156,26 @@ def vector_store_file_exists() -> bool:
return get_vector_store().table_exists()
def migrate_stale_faiss_index() -> None:
    """Remove a pre-LanceDB FAISS index directory so it is rebuilt fresh.

    The file ``default__vector_store.json`` inside ``settings.LLM_INDEX_DIR``
    is used as the marker for a stale index. When it is present, the whole
    index directory is deleted and recreated empty.

    Removal is best-effort: filesystem errors never propagate to the caller.
    If the marker file is still present afterwards, a warning is logged so
    the failed wipe is not silent.
    """
    index_dir = settings.LLM_INDEX_DIR
    stale_marker = index_dir / "default__vector_store.json"
    if stale_marker.exists():
        logger.info("Removing stale FAISS LLM index; it will be rebuilt.")
        # ignore_errors=True keeps this from raising, but it also hides
        # partial failures -- hence the marker re-check below.
        shutil.rmtree(index_dir, ignore_errors=True)
        index_dir.mkdir(parents=True, exist_ok=True)
        if stale_marker.exists():
            logger.warning(
                "Could not fully remove stale FAISS LLM index at %s; "
                "leftover files may need to be deleted manually.",
                index_dir,
            )
def embedding_dim_mismatch() -> bool:
    """Report whether the stored vector dimension differs from the current one.

    Returns ``False`` when the vector store reports no dimension (``None``);
    otherwise returns ``True`` only if the stored dimension is not equal to
    ``current_embedding_dim()``.
    """
    stored_dim = get_vector_store().vector_dim()
    if stored_dim is not None:
        # Imported here, and only on this path, exactly as the original did.
        from paperless_ai.embedding import current_embedding_dim

        return stored_dim != current_embedding_dim()
    return False
def get_rag_chunk_size() -> int:
    """Return ``llm_embedding_chunk_size`` from a newly constructed ``AIConfig``."""
    ai_config = AIConfig()
    return ai_config.llm_embedding_chunk_size
@@ -211,6 +232,11 @@ def update_llm_index(
"""Rebuild or incrementally update the LLM index."""
from llama_index.core.schema import MetadataMode
migrate_stale_faiss_index()
if not rebuild and vector_store_file_exists() and embedding_dim_mismatch():
logger.warning("Embedding dimension changed; forcing LLM index rebuild.")
rebuild = True
documents = Document.objects.all()
if not documents.exists():
logger.warning("No documents found to index.")
@@ -910,6 +910,18 @@ class TestLlmIndexLocking:
)
@pytest.mark.django_db
class TestFaissMigration:
    """Tests for ``indexing.migrate_stale_faiss_index``."""

    def test_migration_wipes_stale_faiss_files(
        self,
        temp_llm_index_dir: Path,
    ) -> None:
        """A stale marker file is removed and the index directory still exists."""
        stale = temp_llm_index_dir / "default__vector_store.json"
        stale.write_text("{}")
        indexing.migrate_stale_faiss_index()
        assert not stale.exists()
        # The migration recreates the directory after wiping it, so later
        # index writes have somewhere to go.
        assert temp_llm_index_dir.is_dir()
@pytest.mark.django_db
class TestLanceDbIndexing:
def test_get_vector_store_roundtrip(