mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-06 13:49:44 +00:00
feat(ai): dimension guard and FAISS index migration
Adds current_embedding_dim() to embedding.py, migrate_stale_faiss_index() and embedding_dim_mismatch() to indexing.py, and wires both into update_llm_index so that stale FAISS directories are wiped on startup and embedding model changes force a full index rebuild. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -132,6 +132,11 @@ def get_embedding_dim() -> int:
|
||||
return dim
|
||||
|
||||
|
||||
def current_embedding_dim() -> int:
    """Return the embedding dimension of the currently configured model.

    Thin public alias that delegates to ``get_embedding_dim()``; any probing
    or caching is handled by that function.
    """
    dim = get_embedding_dim()
    return dim
|
||||
|
||||
|
||||
def _normalize_llm_index_text(text: str) -> str:
    """Normalize document text before it is fed to the LLM index.

    Every match of ``OCR_LEADER_REGEX`` is replaced with a single space
    first, then every match of ``HORIZONTAL_WHITESPACE_REGEX`` is replaced
    with a single space.
    """
    without_leaders = OCR_LEADER_REGEX.sub(" ", text)
    normalized = HORIZONTAL_WHITESPACE_REGEX.sub(" ", without_leaders)
    return normalized
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
from collections.abc import Iterable
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
@@ -155,6 +156,26 @@ def vector_store_file_exists() -> bool:
|
||||
return get_vector_store().table_exists()
|
||||
|
||||
|
||||
def migrate_stale_faiss_index() -> None:
    """Remove a pre-LanceDB FAISS index directory so it is rebuilt fresh.

    The file ``default__vector_store.json`` inside ``settings.LLM_INDEX_DIR``
    is treated as the marker of a stale FAISS index. When it is present the
    whole index directory is deleted and recreated empty; when it is absent
    this function does nothing.

    Removal is best-effort: filesystem errors during deletion are not
    raised. If the marker file is still present afterwards, a warning is
    logged so the incomplete migration is visible instead of silently
    repeating on every run.
    """
    index_dir = settings.LLM_INDEX_DIR
    stale_marker = index_dir / "default__vector_store.json"
    if not stale_marker.exists():
        return

    logger.info("Removing stale FAISS LLM index; it will be rebuilt.")
    # ignore_errors=True keeps deleting as much as possible instead of
    # aborting on the first entry that cannot be removed.
    shutil.rmtree(index_dir, ignore_errors=True)
    index_dir.mkdir(parents=True, exist_ok=True)

    # rmtree swallowed any failure above, so verify the outcome explicitly.
    if stale_marker.exists():
        logger.warning(
            "Stale FAISS LLM index at %s could not be fully removed; "
            "leftover files may need manual cleanup.",
            index_dir,
        )
|
||||
|
||||
|
||||
def embedding_dim_mismatch() -> bool:
    """Report whether the stored vectors and the current model disagree on size.

    Returns ``False`` when the vector store reports no dimension
    (``vector_dim()`` is ``None``); otherwise returns ``True`` exactly when
    the stored dimension differs from ``current_embedding_dim()``.
    """
    stored_dim = get_vector_store().vector_dim()
    if stored_dim is not None:
        # Imported lazily, and only once a stored dimension is known, so the
        # embedding module is not loaded when there is nothing to compare.
        from paperless_ai.embedding import current_embedding_dim

        return stored_dim != current_embedding_dim()
    return False
|
||||
|
||||
|
||||
def get_rag_chunk_size() -> int:
    """Return ``llm_embedding_chunk_size`` from a freshly built ``AIConfig``."""
    ai_config = AIConfig()
    return ai_config.llm_embedding_chunk_size
|
||||
|
||||
@@ -211,6 +232,11 @@ def update_llm_index(
|
||||
"""Rebuild or incrementally update the LLM index."""
|
||||
from llama_index.core.schema import MetadataMode
|
||||
|
||||
migrate_stale_faiss_index()
|
||||
if not rebuild and vector_store_file_exists() and embedding_dim_mismatch():
|
||||
logger.warning("Embedding dimension changed; forcing LLM index rebuild.")
|
||||
rebuild = True
|
||||
|
||||
documents = Document.objects.all()
|
||||
if not documents.exists():
|
||||
logger.warning("No documents found to index.")
|
||||
|
||||
@@ -910,6 +910,18 @@ class TestLlmIndexLocking:
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.django_db
class TestFaissMigration:
    """Checks for ``indexing.migrate_stale_faiss_index``."""

    def test_migration_wipes_stale_faiss_files(self, temp_llm_index_dir: Path) -> None:
        """A leftover FAISS marker file is gone after the migration runs."""
        # Arrange: drop the stale marker file into the index directory.
        marker_path = temp_llm_index_dir / "default__vector_store.json"
        marker_path.write_text("{}")

        # Act
        indexing.migrate_stale_faiss_index()

        # Assert
        assert not marker_path.exists()
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestLanceDbIndexing:
|
||||
def test_get_vector_store_roundtrip(
|
||||
|
||||
Reference in New Issue
Block a user