Implements `config_mismatch`, which checks for either a dimension or a model-name mismatch. The model name is now stored as internal metadata of the vector store.

This commit is contained in:
Trenton Holmes
2026-06-06 14:48:50 -07:00
parent 6ee84af8ad
commit 7f5053cbe3
6 changed files with 180 additions and 148 deletions
+11 -34
View File
@@ -1,12 +1,9 @@
import json
import re
from typing import TYPE_CHECKING
from django.conf import settings
if TYPE_CHECKING:
from pathlib import Path
from llama_index.core.base.embeddings.base import BaseEmbedding
from documents.models import Document
@@ -95,41 +92,21 @@ def get_embedding_model() -> "BaseEmbedding":
)
def get_embedding_dim() -> int:
"""
Loads embedding dimension from meta.json if available, otherwise infers it
from a dummy embedding and stores it for future use.
"""
_DEFAULT_MODEL_NAMES = {
LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small",
LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2",
LLMEmbeddingBackend.OLLAMA: "embeddinggemma",
}
def get_configured_model_name() -> str:
"""Return the canonical name of the currently configured embedding model."""
config = AIConfig()
default_model = {
LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small",
LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2",
LLMEmbeddingBackend.OLLAMA: "embeddinggemma",
}.get(
default = _DEFAULT_MODEL_NAMES.get(
config.llm_embedding_backend,
"sentence-transformers/all-MiniLM-L6-v2",
)
model = config.llm_embedding_model or default_model
meta_path: Path = settings.LLM_INDEX_DIR / "meta.json"
if meta_path.exists():
with meta_path.open() as f:
meta = json.load(f)
if meta.get("embedding_model") != model:
raise RuntimeError(
f"Embedding model changed from {meta.get('embedding_model')} to {model}. "
"You must rebuild the index.",
)
return meta["dim"]
embedding_model = get_embedding_model()
test_embed = embedding_model.get_text_embedding("test")
dim = len(test_embed)
with meta_path.open("w") as f:
json.dump({"embedding_model": model, "dim": dim}, f)
return dim
return config.llm_embedding_model or default
def _normalize_llm_index_text(text: str) -> str:
+23 -27
View File
@@ -14,6 +14,7 @@ from documents.utils import IterWrapper
from documents.utils import identity
from paperless.config import AIConfig
from paperless_ai.embedding import build_llm_index_text
from paperless_ai.embedding import get_configured_model_name
from paperless_ai.embedding import get_embedding_model
if TYPE_CHECKING:
@@ -72,16 +73,25 @@ def get_vector_store() -> "PaperlessLanceVectorStore":
@contextmanager
def write_store():
def write_store(embed_model_name: str | None = None):
"""Acquire the write lock and yield the vector store.
All mutating operations (upsert, delete, rebuild, compact) must go through
this context manager to serialise concurrent Celery writers.
Read paths use ``get_vector_store()`` directly — no lock needed.
Pass ``embed_model_name`` whenever the operation may create the table so
the model name is recorded in the schema metadata for future mismatch checks.
"""
from paperless_ai.vector_store import PaperlessLanceVectorStore
settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)
with FileLock(settings.LLM_INDEX_LOCK):
yield get_vector_store()
yield PaperlessLanceVectorStore(
uri=str(settings.LLM_INDEX_DIR),
table_name=LLM_INDEX_TABLE,
embed_model_name=embed_model_name,
)
def build_document_node(
@@ -148,25 +158,10 @@ def llm_index_exists() -> bool:
return get_vector_store().table_exists()
def embedding_dim_mismatch() -> bool:
    """Report whether the stored table's vector dim disagrees with the current model.

    Returns False when the store reports no dimension (``vector_dim()`` is
    ``None``), because there is nothing to compare against.
    """
    stored_dim = get_vector_store().vector_dim()
    if stored_dim is not None:
        # Deferred import: only needed once there is a stored dim to compare.
        from paperless_ai.embedding import get_embedding_dim

        return stored_dim != get_embedding_dim()
    return False
def get_rag_chunk_size() -> int:
    """Return ``llm_embedding_chunk_size`` read from a freshly built ``AIConfig``."""
    ai_config = AIConfig()
    return ai_config.llm_embedding_chunk_size
def get_rag_context_size() -> int:
    """Return ``llm_context_size`` read from a freshly built ``AIConfig``."""
    ai_config = AIConfig()
    return ai_config.llm_context_size
def get_rag_chunk_overlap(chunk_size: int | None = None) -> int:
    """Return the chunk overlap, capped so it stays below the chunk size.

    A falsy ``chunk_size`` (``None`` or ``0``) falls back to
    ``get_rag_chunk_size()``. The result is the smaller of
    ``RAG_CHUNK_OVERLAP`` and ``chunk_size - 1``.
    """
    effective_size = chunk_size or get_rag_chunk_size()
    largest_allowed = effective_size - 1
    return min(RAG_CHUNK_OVERLAP, largest_allowed)
@@ -222,19 +217,20 @@ def _document_id_filters(doc_ids):
)
def get_llm_index_compaction_retention() -> int:
    """Seconds of MVCC version history to keep during compaction."""
    # 1 hour: safe for in-flight readers, reclaims daily
    minutes = 60
    seconds_per_minute = 60
    return minutes * seconds_per_minute
def update_llm_index(
*,
iter_wrapper: IterWrapper[Document] = identity,
rebuild=False,
) -> str:
"""Rebuild or incrementally update the LLM index."""
if not rebuild and llm_index_exists() and embedding_dim_mismatch():
logger.warning("Embedding dimension changed; forcing LLM index rebuild.")
model_name = get_configured_model_name()
if (
not rebuild
and llm_index_exists()
and get_vector_store().config_mismatch(model_name)
):
logger.warning("Embedding model changed; forcing LLM index rebuild.")
rebuild = True
documents = Document.objects.all()
@@ -246,7 +242,7 @@ def update_llm_index(
chunk_size = AIConfig().llm_embedding_chunk_size
embed_model = get_embedding_model()
with write_store() as store:
with write_store(embed_model_name=model_name) as store:
if rebuild or not store.table_exists():
(settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True)
logger.info("Rebuilding LLM index.")
@@ -275,7 +271,7 @@ def update_llm_index(
store.ensure_document_id_scalar_index()
store.maybe_create_ann_index()
store.compact(retention_seconds=get_llm_index_compaction_retention())
store.compact(retention_seconds=60 * 60) # 1 hour: safe for in-flight readers
return msg
@@ -285,7 +281,7 @@ def llm_index_add_or_update_document(document: Document):
if new_nodes:
_embed_nodes(new_nodes, get_embedding_model())
with write_store() as store:
with write_store(embed_model_name=get_configured_model_name()) as store:
store.upsert_document(str(document.id), new_nodes)
store.ensure_document_id_scalar_index()
+39 -39
View File
@@ -155,13 +155,13 @@ def test_update_llm_index(
@pytest.mark.django_db
def test_update_llm_index_removes_meta(
def test_update_llm_index_cleans_stale_meta_on_rebuild(
temp_llm_index_dir: Path,
real_document: Document,
mock_embed_model: FakeEmbedding,
) -> None:
# Pre-create a meta.json — the new LanceDB-backed rebuild must delete it so
# that stale FAISS-era metadata does not accumulate on disk.
# A meta.json left over from the FAISS era (or written by older code) must be
# deleted on rebuild so stale artifacts don't accumulate on disk.
stale_meta = temp_llm_index_dir / "meta.json"
stale_meta.write_text(json.dumps({"embedding_model": "old", "dim": 1}))
@@ -177,6 +177,42 @@ def test_update_llm_index_removes_meta(
)
@pytest.mark.django_db
def test_update_llm_index_rebuilds_on_model_name_change(
    temp_llm_index_dir: Path,
    real_document: Document,
    mock_embed_model: FakeEmbedding,
) -> None:
    """An incremental update after the configured model name changes must rebuild."""
    # First run builds the initial index with "model-a"; the second simulates a
    # config change to "model-b" and must force a rebuild despite rebuild=False.
    runs = (("model-a", True), ("model-b", False))

    for configured_name, force_rebuild in runs:
        queryset = MagicMock()
        queryset.exists.return_value = True
        queryset.__iter__.return_value = iter([real_document])

        with patch("documents.models.Document.objects.all") as all_documents:
            all_documents.return_value = queryset
            with patch(
                "paperless_ai.indexing.get_configured_model_name",
                return_value=configured_name,
            ):
                indexing.update_llm_index(rebuild=force_rebuild)

    # Schema metadata only updates when the table is dropped and recreated, never on
    # incremental writes -- so "model-b" here proves a full rebuild happened.
    recorded_name = indexing.get_vector_store().stored_model_name()
    assert recorded_name == "model-b"
@pytest.mark.django_db
def test_update_llm_index_partial_update(
temp_llm_index_dir: Path,
@@ -641,42 +677,6 @@ class TestLlmIndexLocking:
@pytest.mark.django_db
class TestDimensionGuard:
    """Tests for the embedding-dimension guard around ``update_llm_index``."""

    def test_embedding_dim_mismatch_false_when_no_table(
        self,
        temp_llm_index_dir: Path,
        mock_embed_model: FakeEmbedding,
    ) -> None:
        """No table yet — dim mismatch must return False (nothing to compare)."""
        mismatch = indexing.embedding_dim_mismatch()
        assert not mismatch

    def test_update_llm_index_forces_rebuild_on_dim_mismatch(
        self,
        temp_llm_index_dir: Path,
        mock_embed_model: FakeEmbedding,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """When the stored dim differs from the current model, update must force a rebuild."""
        # Pretend an index exists and that its dimension no longer matches.
        mocker.patch("paperless_ai.indexing.embedding_dim_mismatch", return_value=True)
        mocker.patch("paperless_ai.indexing.llm_index_exists", return_value=True)

        # write_store() is replaced by a context manager yielding a stand-in store.
        fake_store = MagicMock()
        store_context = mocker.MagicMock()
        store_context.__enter__ = mocker.MagicMock(return_value=fake_store)
        store_context.__exit__ = mocker.MagicMock(return_value=False)
        mocker.patch("paperless_ai.indexing.write_store", return_value=store_context)

        # A queryset that claims to have documents but iterates as empty.
        queryset = MagicMock()
        queryset.exists.return_value = True
        queryset.__iter__ = MagicMock(return_value=iter([]))
        mocker.patch("paperless_ai.indexing.Document.objects.all", return_value=queryset)

        indexing.update_llm_index(rebuild=False)

        fake_store.drop_table.assert_called_once()
@pytest.mark.django_db
class TestLanceDbIndexing:
def test_get_vector_store_roundtrip(
+24 -45
View File
@@ -1,4 +1,3 @@
import json
from unittest.mock import ANY
from unittest.mock import MagicMock
from unittest.mock import patch
@@ -10,7 +9,7 @@ from documents.models import Document
from paperless.models import LLMEmbeddingBackend
from paperless_ai.embedding import _normalize_llm_index_text
from paperless_ai.embedding import build_llm_index_text
from paperless_ai.embedding import get_embedding_dim
from paperless_ai.embedding import get_configured_model_name
from paperless_ai.embedding import get_embedding_model
@@ -186,52 +185,32 @@ def test_get_embedding_model_invalid_backend(mock_ai_config):
get_embedding_model()
def test_get_embedding_dim_infers_and_saves(temp_llm_index_dir, mock_ai_config):
mock_ai_config.return_value.llm_embedding_backend = "openai-like"
@pytest.mark.parametrize(
("backend", "expected_default"),
[
(LLMEmbeddingBackend.OPENAI_LIKE, "text-embedding-3-small"),
(LLMEmbeddingBackend.HUGGINGFACE, "sentence-transformers/all-MiniLM-L6-v2"),
(LLMEmbeddingBackend.OLLAMA, "embeddinggemma"),
],
)
def test_get_configured_model_name_falls_back_to_backend_default(
mock_ai_config,
backend,
expected_default,
):
"""When no model is explicitly configured, each backend has a distinct default."""
mock_ai_config.return_value.llm_embedding_backend = backend
mock_ai_config.return_value.llm_embedding_model = None
class DummyEmbedding:
def get_text_embedding(self, text):
return [0.0] * 7
with patch(
"paperless_ai.embedding.get_embedding_model",
return_value=DummyEmbedding(),
) as mock_get:
dim = get_embedding_dim()
mock_get.assert_called_once()
assert dim == 7
meta = json.loads((temp_llm_index_dir / "meta.json").read_text())
assert meta == {"embedding_model": "text-embedding-3-small", "dim": 7}
assert get_configured_model_name() == expected_default
def test_get_embedding_dim_reads_existing_meta(temp_llm_index_dir, mock_ai_config):
    """An existing meta.json for the same model supplies the dim without loading the model."""
    ai_config = mock_ai_config.return_value
    ai_config.llm_embedding_backend = "openai-like"
    ai_config.llm_embedding_model = None

    meta_file = temp_llm_index_dir / "meta.json"
    meta_payload = {"embedding_model": "text-embedding-3-small", "dim": 11}
    meta_file.write_text(json.dumps(meta_payload))

    with patch("paperless_ai.embedding.get_embedding_model") as model_factory:
        resolved_dim = get_embedding_dim()

    assert resolved_dim == 11
    model_factory.assert_not_called()
def test_get_embedding_dim_raises_on_model_change(temp_llm_index_dir, mock_ai_config):
    """A meta.json recorded for a different model makes get_embedding_dim raise."""
    ai_config = mock_ai_config.return_value
    ai_config.llm_embedding_backend = "openai-like"
    ai_config.llm_embedding_model = None

    meta_file = temp_llm_index_dir / "meta.json"
    meta_file.write_text(json.dumps({"embedding_model": "old", "dim": 11}))

    expected_message = "Embedding model changed from old to text-embedding-3-small"
    with pytest.raises(RuntimeError, match=expected_message):
        get_embedding_dim()
def test_get_configured_model_name_explicit_overrides_default(mock_ai_config):
    """An explicitly configured model name wins over the backend default."""
    ai_config = mock_ai_config.return_value
    ai_config.llm_embedding_backend = LLMEmbeddingBackend.OPENAI_LIKE
    ai_config.llm_embedding_model = "my-custom-model"

    resolved_name = get_configured_model_name()

    # The backend default for OPENAI_LIKE is "text-embedding-3-small", so if
    # the explicit name was ignored we'd get the wrong result.
    assert resolved_name == "my-custom-model"
def test_build_llm_index_text(mock_document):
@@ -320,6 +320,57 @@ class TestPaperlessLanceVectorStoreMaintenance:
store.ensure_document_id_scalar_index() # no table yet — must not raise
class TestConfigMismatch:
    """Tests for model-name tracking and ``config_mismatch`` on the vector store."""

    @pytest.fixture
    def uri(self, tmp_path: Path) -> str:
        """Path string for an index directory under pytest's ``tmp_path``."""
        index_dir = tmp_path / "idx"
        return str(index_dir)

    def test_stored_model_name_returns_none_when_no_table(self, uri: str) -> None:
        empty_store = PaperlessLanceVectorStore(uri=uri)
        recorded = empty_store.stored_model_name()
        assert recorded is None

    def test_model_name_stored_in_schema_after_add(self, uri: str) -> None:
        vector_store = PaperlessLanceVectorStore(
            uri=uri,
            embed_model_name="all-MiniLM-L6-v2",
        )
        vector_store.add([_node("1-0", "1", "text", 0.1)])
        recorded = vector_store.stored_model_name()
        assert recorded == "all-MiniLM-L6-v2"

    def test_model_name_stored_in_schema_after_upsert(self, uri: str) -> None:
        vector_store = PaperlessLanceVectorStore(
            uri=uri,
            embed_model_name="nomic-embed",
        )
        vector_store.upsert_document("1", [_node("1-0", "1", "text", 0.1)])
        recorded = vector_store.stored_model_name()
        assert recorded == "nomic-embed"

    def test_model_name_persists_after_reopen(self, uri: str) -> None:
        # Create the table through one store instance ...
        writer = PaperlessLanceVectorStore(
            uri=uri,
            embed_model_name="all-MiniLM-L6-v2",
        )
        writer.add(
            [_node("1-0", "1", "text", 0.1)],
        )
        # ... then read the name back through a second instance on the same uri.
        reopened = PaperlessLanceVectorStore(uri=uri)
        assert reopened.stored_model_name() == "all-MiniLM-L6-v2"

    def test_config_mismatch_returns_false_when_no_table(self, uri: str) -> None:
        empty_store = PaperlessLanceVectorStore(uri=uri)
        result = empty_store.config_mismatch("any-model")
        assert result is False

    def test_config_mismatch_returns_false_when_model_matches(self, uri: str) -> None:
        vector_store = PaperlessLanceVectorStore(
            uri=uri,
            embed_model_name="all-MiniLM-L6-v2",
        )
        vector_store.add([_node("1-0", "1", "text", 0.1)])
        result = vector_store.config_mismatch("all-MiniLM-L6-v2")
        assert result is False

    def test_config_mismatch_returns_true_when_model_differs(self, uri: str) -> None:
        vector_store = PaperlessLanceVectorStore(uri=uri, embed_model_name="old-model")
        vector_store.add([_node("1-0", "1", "text", 0.1)])
        result = vector_store.config_mismatch("new-model")
        assert result is True

    def test_config_mismatch_returns_false_when_no_metadata_stored(
        self,
        uri: str,
    ) -> None:
        # Tables created before model-name tracking was added have no schema metadata.
        # Conservative default: assume compatible rather than force a rebuild.
        untracked_store = PaperlessLanceVectorStore(uri=uri)
        untracked_store.add([_node("1-0", "1", "text", 0.1)])
        result = untracked_store.config_mismatch("any-model")
        assert result is False
class TestGetModifiedTimes:
@pytest.fixture
def store(self, tmp_path: Path) -> PaperlessLanceVectorStore:
+32 -3
View File
@@ -71,13 +71,20 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
_uri: str = PrivateAttr()
_table_name: str = PrivateAttr()
_embed_model_name: str | None = PrivateAttr()
_conn: Any = PrivateAttr()
_table: Any = PrivateAttr()
def __init__(self, uri: str, table_name: str = DEFAULT_TABLE_NAME) -> None:
def __init__(
self,
uri: str,
table_name: str = DEFAULT_TABLE_NAME,
embed_model_name: str | None = None,
) -> None:
super().__init__(stores_text=True, flat_metadata=False)
self._uri = uri
self._table_name = table_name
self._embed_model_name = embed_model_name
self._conn = lancedb.connect(uri)
existing = self._conn.list_tables().tables
self._table = (
@@ -101,8 +108,29 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
self._conn.drop_table(self._table_name)
self._table = None
def stored_model_name(self) -> str | None:
"""Return the embedding model name stored in table schema metadata, or None."""
if self._table is None:
return None
meta = self._table.schema.metadata or {}
value = meta.get(b"embed_model")
return value.decode() if value else None
def config_mismatch(self, model_name: str) -> bool:
    """True when the stored model name differs from ``model_name``.

    Returns False whenever ``stored_model_name()`` yields None — no table
    exists, or the table predates model-name tracking (schema has no
    metadata). This conservative default avoids spurious rebuilds on upgrade.
    """
    recorded_name = self.stored_model_name()
    return recorded_name is not None and recorded_name != model_name
@staticmethod
def _schema(dim: int) -> pa.Schema:
def _schema(dim: int, model_name: str | None = None) -> pa.Schema:
meta = {b"embed_model": model_name.encode()} if model_name else None
return pa.schema(
[
pa.field("id", pa.string()),
@@ -112,6 +140,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
pa.field("vector", pa.list_(pa.float32(), dim)),
pa.field("node_content", pa.string()),
],
metadata=meta,
)
def _row(self, node: BaseNode) -> dict[str, Any]:
@@ -140,7 +169,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
self._table = self._conn.create_table(
self._table_name,
rows,
schema=self._schema(dim),
schema=self._schema(dim, self._embed_model_name),
)
return True