From 7f5053cbe3db4c6dfe0f786a18ab0680d64194d6 Mon Sep 17 00:00:00 2001
From: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Date: Sat, 6 Jun 2026 14:48:50 -0700
Subject: [PATCH] Implements config_mismatch, which checks for an embedding
 model name mismatch. The model name is now stored in the schema metadata
 of the vector store

---
 src/paperless_ai/embedding.py               | 45 +++---------
 src/paperless_ai/indexing.py                | 50 ++++++-------
 src/paperless_ai/tests/test_ai_indexing.py  | 78 ++++++++++-----------
 src/paperless_ai/tests/test_embedding.py    | 69 +++++++-----------
 src/paperless_ai/tests/test_vector_store.py | 51 ++++++++++++++
 src/paperless_ai/vector_store.py            | 35 ++++++++-
 6 files changed, 180 insertions(+), 148 deletions(-)

diff --git a/src/paperless_ai/embedding.py b/src/paperless_ai/embedding.py
index 59582130f..8480cb76d 100644
--- a/src/paperless_ai/embedding.py
+++ b/src/paperless_ai/embedding.py
@@ -1,12 +1,9 @@
-import json
 import re
 from typing import TYPE_CHECKING
 
 from django.conf import settings
 
 if TYPE_CHECKING:
-    from pathlib import Path
-
     from llama_index.core.base.embeddings.base import BaseEmbedding
 
 from documents.models import Document
@@ -95,41 +92,21 @@ def get_embedding_model() -> "BaseEmbedding":
             )
 
 
-def get_embedding_dim() -> int:
-    """
-    Loads embedding dimension from meta.json if available, otherwise infers it
-    from a dummy embedding and stores it for future use.
-    """
+_DEFAULT_MODEL_NAMES = {
+    LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small",
+    LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2",
+    LLMEmbeddingBackend.OLLAMA: "embeddinggemma",
+}
+
+
+def get_configured_model_name() -> str:
+    """Return the configured embedding model name, or the backend default if unset."""
     config = AIConfig()
-    default_model = {
-        LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small",
-        LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2",
-        LLMEmbeddingBackend.OLLAMA: "embeddinggemma",
-    }.get(
+    default = _DEFAULT_MODEL_NAMES.get(
         config.llm_embedding_backend,
         "sentence-transformers/all-MiniLM-L6-v2",
     )
-    model = config.llm_embedding_model or default_model
-
-    meta_path: Path = settings.LLM_INDEX_DIR / "meta.json"
-    if meta_path.exists():
-        with meta_path.open() as f:
-            meta = json.load(f)
-        if meta.get("embedding_model") != model:
-            raise RuntimeError(
-                f"Embedding model changed from {meta.get('embedding_model')} to {model}. "
-                "You must rebuild the index.",
-            )
-        return meta["dim"]
-
-    embedding_model = get_embedding_model()
-    test_embed = embedding_model.get_text_embedding("test")
-    dim = len(test_embed)
-
-    with meta_path.open("w") as f:
-        json.dump({"embedding_model": model, "dim": dim}, f)
-
-    return dim
+    return config.llm_embedding_model or default
 
 
 def _normalize_llm_index_text(text: str) -> str:
diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index 3e6cff3d3..bfd4edd72 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -14,6 +14,7 @@ from documents.utils import IterWrapper
 from documents.utils import identity
 from paperless.config import AIConfig
 from paperless_ai.embedding import build_llm_index_text
+from paperless_ai.embedding import get_configured_model_name
 from paperless_ai.embedding import get_embedding_model
 
 if TYPE_CHECKING:
@@ -72,16 +73,25 @@ def get_vector_store() -> "PaperlessLanceVectorStore":
 
 
 @contextmanager
-def write_store():
+def write_store(embed_model_name: str | None = None):
     """Acquire the write lock and yield the vector store.
 
     All mutating operations (upsert, delete, rebuild, compact) must go through
     this context manager to serialise concurrent Celery writers.
     Read paths use ``get_vector_store()`` directly — no lock needed.
+
+    Pass ``embed_model_name`` whenever the operation may create the table so
+    the model name is recorded in the schema metadata for future mismatch checks.
     """
+    from paperless_ai.vector_store import PaperlessLanceVectorStore
+
     settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)
     with FileLock(settings.LLM_INDEX_LOCK):
-        yield get_vector_store()
+        yield PaperlessLanceVectorStore(
+            uri=str(settings.LLM_INDEX_DIR),
+            table_name=LLM_INDEX_TABLE,
+            embed_model_name=embed_model_name,
+        )
 
 
 def build_document_node(
@@ -148,25 +158,10 @@ def llm_index_exists() -> bool:
     return get_vector_store().table_exists()
 
 
-def embedding_dim_mismatch() -> bool:
-    """True when the stored table's vector dim differs from the current model."""
-    store = get_vector_store()
-    stored = store.vector_dim()
-    if stored is None:
-        return False
-    from paperless_ai.embedding import get_embedding_dim
-
-    return stored != get_embedding_dim()
-
-
 def get_rag_chunk_size() -> int:
     return AIConfig().llm_embedding_chunk_size
 
 
-def get_rag_context_size() -> int:
-    return AIConfig().llm_context_size
-
-
 def get_rag_chunk_overlap(chunk_size: int | None = None) -> int:
     chunk_size = chunk_size or get_rag_chunk_size()
     return min(RAG_CHUNK_OVERLAP, chunk_size - 1)
@@ -222,19 +217,20 @@ def _document_id_filters(doc_ids):
     )
 
 
-def get_llm_index_compaction_retention() -> int:
-    """Seconds of MVCC version history to keep during compaction."""
-    return 60 * 60  # 1 hour: safe for in-flight readers, reclaims daily
-
-
 def update_llm_index(
     *,
     iter_wrapper: IterWrapper[Document] = identity,
     rebuild=False,
 ) -> str:
     """Rebuild or incrementally update the LLM index."""
-    if not rebuild and llm_index_exists() and embedding_dim_mismatch():
-        logger.warning("Embedding dimension changed; forcing LLM index rebuild.")
+    model_name = get_configured_model_name()
+
+    if (
+        not rebuild
+        and llm_index_exists()
+        and get_vector_store().config_mismatch(model_name)
+    ):
+        logger.warning("Embedding model changed; forcing LLM index rebuild.")
         rebuild = True
 
     documents = Document.objects.all()
@@ -246,7 +242,7 @@ def update_llm_index(
     chunk_size = AIConfig().llm_embedding_chunk_size
     embed_model = get_embedding_model()
 
-    with write_store() as store:
+    with write_store(embed_model_name=model_name) as store:
         if rebuild or not store.table_exists():
             (settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True)
             logger.info("Rebuilding LLM index.")
@@ -275,7 +271,7 @@ def update_llm_index(
 
         store.ensure_document_id_scalar_index()
         store.maybe_create_ann_index()
-        store.compact(retention_seconds=get_llm_index_compaction_retention())
+        store.compact(retention_seconds=60 * 60)  # 1 hour: safe for in-flight readers
     return msg
 
 
@@ -285,7 +281,7 @@ def llm_index_add_or_update_document(document: Document):
     if new_nodes:
         _embed_nodes(new_nodes, get_embedding_model())
 
-    with write_store() as store:
+    with write_store(embed_model_name=get_configured_model_name()) as store:
         store.upsert_document(str(document.id), new_nodes)
         store.ensure_document_id_scalar_index()
 
diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py
index 64a3868e9..31e1f6bc8 100644
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -155,13 +155,13 @@ def test_update_llm_index(
 
 
 @pytest.mark.django_db
-def test_update_llm_index_removes_meta(
+def test_update_llm_index_cleans_stale_meta_on_rebuild(
     temp_llm_index_dir: Path,
     real_document: Document,
     mock_embed_model: FakeEmbedding,
 ) -> None:
-    # Pre-create a meta.json — the new LanceDB-backed rebuild must delete it so
-    # that stale FAISS-era metadata does not accumulate on disk.
+    # A meta.json left over from the FAISS era (or written by older code) must be
+    # deleted on rebuild so stale artifacts don't accumulate on disk.
     stale_meta = temp_llm_index_dir / "meta.json"
     stale_meta.write_text(json.dumps({"embedding_model": "old", "dim": 1}))
 
@@ -177,6 +177,42 @@ def test_update_llm_index_removes_meta(
     )
 
 
+@pytest.mark.django_db
+def test_update_llm_index_rebuilds_on_model_name_change(
+    temp_llm_index_dir: Path,
+    real_document: Document,
+    mock_embed_model: FakeEmbedding,
+) -> None:
+    # Build initial index with model "model-a".
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        with patch(
+            "paperless_ai.indexing.get_configured_model_name",
+            return_value="model-a",
+        ):
+            indexing.update_llm_index(rebuild=True)
+
+    # Simulate config change to "model-b"; the incremental run must force a rebuild.
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        with patch(
+            "paperless_ai.indexing.get_configured_model_name",
+            return_value="model-b",
+        ):
+            indexing.update_llm_index(rebuild=False)
+
+    store = indexing.get_vector_store()
+    # Schema metadata only updates when the table is dropped and recreated, never on
+    # incremental writes -- so "model-b" here proves a full rebuild happened.
+    assert store.stored_model_name() == "model-b"
+
+
 @pytest.mark.django_db
 def test_update_llm_index_partial_update(
     temp_llm_index_dir: Path,
@@ -641,42 +677,6 @@ class TestLlmIndexLocking:
 
 
 @pytest.mark.django_db
-class TestDimensionGuard:
-    def test_embedding_dim_mismatch_false_when_no_table(
-        self,
-        temp_llm_index_dir: Path,
-        mock_embed_model: FakeEmbedding,
-    ) -> None:
-        """No table yet — dim mismatch must return False (nothing to compare)."""
-        assert not indexing.embedding_dim_mismatch()
-
-    def test_update_llm_index_forces_rebuild_on_dim_mismatch(
-        self,
-        temp_llm_index_dir: Path,
-        mock_embed_model: FakeEmbedding,
-        mocker: pytest_mock.MockerFixture,
-    ) -> None:
-        """When the stored dim differs from the current model, update must force a rebuild."""
-        mocker.patch("paperless_ai.indexing.embedding_dim_mismatch", return_value=True)
-        mocker.patch("paperless_ai.indexing.llm_index_exists", return_value=True)
-        mock_store = MagicMock()
-        mocker.patch(
-            "paperless_ai.indexing.write_store",
-            return_value=mocker.MagicMock(
-                __enter__=mocker.MagicMock(return_value=mock_store),
-                __exit__=mocker.MagicMock(return_value=False),
-            ),
-        )
-        mock_qs = MagicMock()
-        mock_qs.exists.return_value = True
-        mock_qs.__iter__ = MagicMock(return_value=iter([]))
-        mocker.patch("paperless_ai.indexing.Document.objects.all", return_value=mock_qs)
-
-        indexing.update_llm_index(rebuild=False)
-
-        mock_store.drop_table.assert_called_once()
-
-
 @pytest.mark.django_db
 class TestLanceDbIndexing:
     def test_get_vector_store_roundtrip(
diff --git a/src/paperless_ai/tests/test_embedding.py b/src/paperless_ai/tests/test_embedding.py
index 102a88367..d1d0754d1 100644
--- a/src/paperless_ai/tests/test_embedding.py
+++ b/src/paperless_ai/tests/test_embedding.py
@@ -1,4 +1,3 @@
-import json
 from unittest.mock import ANY
 from unittest.mock import MagicMock
 from unittest.mock import patch
@@ -10,7 +9,7 @@ from documents.models import Document
 from paperless.models import LLMEmbeddingBackend
 from paperless_ai.embedding import _normalize_llm_index_text
 from paperless_ai.embedding import build_llm_index_text
-from paperless_ai.embedding import get_embedding_dim
+from paperless_ai.embedding import get_configured_model_name
 from paperless_ai.embedding import get_embedding_model
 
 
@@ -186,52 +185,32 @@ def test_get_embedding_model_invalid_backend(mock_ai_config):
         get_embedding_model()
 
 
-def test_get_embedding_dim_infers_and_saves(temp_llm_index_dir, mock_ai_config):
-    mock_ai_config.return_value.llm_embedding_backend = "openai-like"
+@pytest.mark.parametrize(
+    ("backend", "expected_default"),
+    [
+        (LLMEmbeddingBackend.OPENAI_LIKE, "text-embedding-3-small"),
+        (LLMEmbeddingBackend.HUGGINGFACE, "sentence-transformers/all-MiniLM-L6-v2"),
+        (LLMEmbeddingBackend.OLLAMA, "embeddinggemma"),
+    ],
+)
+def test_get_configured_model_name_falls_back_to_backend_default(
+    mock_ai_config,
+    backend,
+    expected_default,
+):
+    """When no model is explicitly configured, each backend has a distinct default."""
+    mock_ai_config.return_value.llm_embedding_backend = backend
     mock_ai_config.return_value.llm_embedding_model = None
-
-    class DummyEmbedding:
-        def get_text_embedding(self, text):
-            return [0.0] * 7
-
-    with patch(
-        "paperless_ai.embedding.get_embedding_model",
-        return_value=DummyEmbedding(),
-    ) as mock_get:
-        dim = get_embedding_dim()
-        mock_get.assert_called_once()
-
-    assert dim == 7
-    meta = json.loads((temp_llm_index_dir / "meta.json").read_text())
-    assert meta == {"embedding_model": "text-embedding-3-small", "dim": 7}
+    assert get_configured_model_name() == expected_default
 
 
-def test_get_embedding_dim_reads_existing_meta(temp_llm_index_dir, mock_ai_config):
-    mock_ai_config.return_value.llm_embedding_backend = "openai-like"
-    mock_ai_config.return_value.llm_embedding_model = None
-
-    (temp_llm_index_dir / "meta.json").write_text(
-        json.dumps({"embedding_model": "text-embedding-3-small", "dim": 11}),
-    )
-
-    with patch("paperless_ai.embedding.get_embedding_model") as mock_get:
-        assert get_embedding_dim() == 11
-        mock_get.assert_not_called()
-
-
-def test_get_embedding_dim_raises_on_model_change(temp_llm_index_dir, mock_ai_config):
-    mock_ai_config.return_value.llm_embedding_backend = "openai-like"
-    mock_ai_config.return_value.llm_embedding_model = None
-
-    (temp_llm_index_dir / "meta.json").write_text(
-        json.dumps({"embedding_model": "old", "dim": 11}),
-    )
-
-    with pytest.raises(
-        RuntimeError,
-        match="Embedding model changed from old to text-embedding-3-small",
-    ):
-        get_embedding_dim()
+def test_get_configured_model_name_explicit_overrides_default(mock_ai_config):
+    """An explicit model name takes precedence over the backend default."""
+    mock_ai_config.return_value.llm_embedding_backend = LLMEmbeddingBackend.OPENAI_LIKE
+    mock_ai_config.return_value.llm_embedding_model = "my-custom-model"
+    # The backend default for OPENAI_LIKE is "text-embedding-3-small", so if
+    # the explicit name were ignored we'd get that default instead.
+    assert get_configured_model_name() == "my-custom-model"
 
 
 def test_build_llm_index_text(mock_document):
diff --git a/src/paperless_ai/tests/test_vector_store.py b/src/paperless_ai/tests/test_vector_store.py
index bf4bfe149..b409ed1c9 100644
--- a/src/paperless_ai/tests/test_vector_store.py
+++ b/src/paperless_ai/tests/test_vector_store.py
@@ -320,6 +320,57 @@ class TestPaperlessLanceVectorStoreMaintenance:
         store.ensure_document_id_scalar_index()  # no table yet — must not raise
 
 
+class TestConfigMismatch:
+    @pytest.fixture
+    def uri(self, tmp_path: Path) -> str:
+        return str(tmp_path / "idx")
+
+    def test_stored_model_name_returns_none_when_no_table(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri)
+        assert store.stored_model_name() is None
+
+    def test_model_name_stored_in_schema_after_add(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2")
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.stored_model_name() == "all-MiniLM-L6-v2"
+
+    def test_model_name_stored_in_schema_after_upsert(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="nomic-embed")
+        store.upsert_document("1", [_node("1-0", "1", "text", 0.1)])
+        assert store.stored_model_name() == "nomic-embed"
+
+    def test_model_name_persists_after_reopen(self, uri: str) -> None:
+        PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2").add(
+            [_node("1-0", "1", "text", 0.1)],
+        )
+        reopened = PaperlessLanceVectorStore(uri=uri)
+        assert reopened.stored_model_name() == "all-MiniLM-L6-v2"
+
+    def test_config_mismatch_returns_false_when_no_table(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri)
+        assert store.config_mismatch("any-model") is False
+
+    def test_config_mismatch_returns_false_when_model_matches(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2")
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.config_mismatch("all-MiniLM-L6-v2") is False
+
+    def test_config_mismatch_returns_true_when_model_differs(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="old-model")
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.config_mismatch("new-model") is True
+
+    def test_config_mismatch_returns_false_when_no_metadata_stored(
+        self,
+        uri: str,
+    ) -> None:
+        # Tables created before model-name tracking was added have no schema metadata.
+        # Conservative default: assume compatible rather than force a rebuild.
+        store = PaperlessLanceVectorStore(uri=uri)
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.config_mismatch("any-model") is False
+
+
 class TestGetModifiedTimes:
     @pytest.fixture
     def store(self, tmp_path: Path) -> PaperlessLanceVectorStore:
diff --git a/src/paperless_ai/vector_store.py b/src/paperless_ai/vector_store.py
index cd9bf154d..f91d50baf 100644
--- a/src/paperless_ai/vector_store.py
+++ b/src/paperless_ai/vector_store.py
@@ -71,13 +71,20 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
 
     _uri: str = PrivateAttr()
     _table_name: str = PrivateAttr()
+    _embed_model_name: str | None = PrivateAttr()
     _conn: Any = PrivateAttr()
     _table: Any = PrivateAttr()
 
-    def __init__(self, uri: str, table_name: str = DEFAULT_TABLE_NAME) -> None:
+    def __init__(
+        self,
+        uri: str,
+        table_name: str = DEFAULT_TABLE_NAME,
+        embed_model_name: str | None = None,
+    ) -> None:
         super().__init__(stores_text=True, flat_metadata=False)
         self._uri = uri
         self._table_name = table_name
+        self._embed_model_name = embed_model_name
         self._conn = lancedb.connect(uri)
         existing = self._conn.list_tables().tables
         self._table = (
@@ -101,8 +108,29 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
             self._conn.drop_table(self._table_name)
         self._table = None
 
+    def stored_model_name(self) -> str | None:
+        """Return the embedding model name stored in table schema metadata, or None."""
+        if self._table is None:
+            return None
+        meta = self._table.schema.metadata or {}
+        value = meta.get(b"embed_model")
+        return value.decode() if value else None
+
+    def config_mismatch(self, model_name: str) -> bool:
+        """True when the stored model name differs from ``model_name``.
+
+        Returns False when no table exists or when the table predates model-name
+        tracking (no ``embed_model`` entry in the schema metadata); this
+        conservative default avoids spurious rebuilds on upgrade.
+        """
+        stored = self.stored_model_name()
+        if stored is None:
+            return False
+        return stored != model_name
+
     @staticmethod
-    def _schema(dim: int) -> pa.Schema:
+    def _schema(dim: int, model_name: str | None = None) -> pa.Schema:
+        meta = {b"embed_model": model_name.encode()} if model_name else None
         return pa.schema(
             [
                 pa.field("id", pa.string()),
@@ -112,6 +140,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
                 pa.field("vector", pa.list_(pa.float32(), dim)),
                 pa.field("node_content", pa.string()),
             ],
+            metadata=meta,
         )
 
     def _row(self, node: BaseNode) -> dict[str, Any]:
@@ -140,7 +169,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
         self._table = self._conn.create_table(
             self._table_name,
             rows,
-            schema=self._schema(dim),
+            schema=self._schema(dim, self._embed_model_name),
         )
         return True