Implements config_mismatch, which checks for either a dimension or model name mismatch. The model name is now stored as internal metadata of the vector store.

2026-08-02 17:12:18 +00:00 · 2026-06-06 14:48:50 -07:00
parent 6ee84af8ad
commit 7f5053cbe3
6 changed files with 180 additions and 148 deletions
@@ -1,12 +1,9 @@
-import json
 import re
 from typing import TYPE_CHECKING

 from django.conf import settings

 if TYPE_CHECKING:
-    from pathlib import Path
-
    from llama_index.core.base.embeddings.base import BaseEmbedding

 from documents.models import Document
@@ -95,41 +92,21 @@ def get_embedding_model() -> "BaseEmbedding":
            )


-def get_embedding_dim() -> int:
-    """
-    Loads embedding dimension from meta.json if available, otherwise infers it
-    from a dummy embedding and stores it for future use.
-    """
+_DEFAULT_MODEL_NAMES = {
+    LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small",
+    LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2",
+    LLMEmbeddingBackend.OLLAMA: "embeddinggemma",
+}
+
+
+def get_configured_model_name() -> str:
+    """Return the canonical name of the currently configured embedding model."""
    config = AIConfig()
-    default_model = {
-        LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small",
-        LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2",
-        LLMEmbeddingBackend.OLLAMA: "embeddinggemma",
-    }.get(
+    default = _DEFAULT_MODEL_NAMES.get(
        config.llm_embedding_backend,
        "sentence-transformers/all-MiniLM-L6-v2",
    )
-    model = config.llm_embedding_model or default_model
-
-    meta_path: Path = settings.LLM_INDEX_DIR / "meta.json"
-    if meta_path.exists():
-        with meta_path.open() as f:
-            meta = json.load(f)
-        if meta.get("embedding_model") != model:
-            raise RuntimeError(
-                f"Embedding model changed from {meta.get('embedding_model')} to {model}. "
-                "You must rebuild the index.",
-            )
-        return meta["dim"]
-
-    embedding_model = get_embedding_model()
-    test_embed = embedding_model.get_text_embedding("test")
-    dim = len(test_embed)
-
-    with meta_path.open("w") as f:
-        json.dump({"embedding_model": model, "dim": dim}, f)
-
-    return dim
+    return config.llm_embedding_model or default


 def _normalize_llm_index_text(text: str) -> str:
@@ -14,6 +14,7 @@ from documents.utils import IterWrapper
 from documents.utils import identity
 from paperless.config import AIConfig
 from paperless_ai.embedding import build_llm_index_text
+from paperless_ai.embedding import get_configured_model_name
 from paperless_ai.embedding import get_embedding_model

 if TYPE_CHECKING:
@@ -72,16 +73,25 @@ def get_vector_store() -> "PaperlessLanceVectorStore":


@contextmanager
-def write_store():
+def write_store(embed_model_name: str | None = None):
    """Acquire the write lock and yield the vector store.

    All mutating operations (upsert, delete, rebuild, compact) must go through
    this context manager to serialise concurrent Celery writers.
    Read paths use ``get_vector_store()`` directly — no lock needed.
+
+    Pass ``embed_model_name`` whenever the operation may create the table so
+    the model name is recorded in the schema metadata for future mismatch checks.
    """
+    from paperless_ai.vector_store import PaperlessLanceVectorStore
+
    settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)
    with FileLock(settings.LLM_INDEX_LOCK):
-        yield get_vector_store()
+        yield PaperlessLanceVectorStore(
+            uri=str(settings.LLM_INDEX_DIR),
+            table_name=LLM_INDEX_TABLE,
+            embed_model_name=embed_model_name,
+        )


 def build_document_node(
@@ -148,25 +158,10 @@ def llm_index_exists() -> bool:
    return get_vector_store().table_exists()


-def embedding_dim_mismatch() -> bool:
-    """True when the stored table's vector dim differs from the current model."""
-    store = get_vector_store()
-    stored = store.vector_dim()
-    if stored is None:
-        return False
-    from paperless_ai.embedding import get_embedding_dim
-
-    return stored != get_embedding_dim()
-
-
 def get_rag_chunk_size() -> int:
    return AIConfig().llm_embedding_chunk_size


-def get_rag_context_size() -> int:
-    return AIConfig().llm_context_size
-
-
 def get_rag_chunk_overlap(chunk_size: int | None = None) -> int:
    chunk_size = chunk_size or get_rag_chunk_size()
    return min(RAG_CHUNK_OVERLAP, chunk_size - 1)
@@ -222,19 +217,20 @@ def _document_id_filters(doc_ids):
    )


-def get_llm_index_compaction_retention() -> int:
-    """Seconds of MVCC version history to keep during compaction."""
-    return 60 * 60  # 1 hour: safe for in-flight readers, reclaims daily
-
-
 def update_llm_index(
    *,
    iter_wrapper: IterWrapper[Document] = identity,
    rebuild=False,
 ) -> str:
    """Rebuild or incrementally update the LLM index."""
-    if not rebuild and llm_index_exists() and embedding_dim_mismatch():
-        logger.warning("Embedding dimension changed; forcing LLM index rebuild.")
+    model_name = get_configured_model_name()
+
+    if (
+        not rebuild
+        and llm_index_exists()
+        and get_vector_store().config_mismatch(model_name)
+    ):
+        logger.warning("Embedding model changed; forcing LLM index rebuild.")
        rebuild = True

    documents = Document.objects.all()
@@ -246,7 +242,7 @@ def update_llm_index(
    chunk_size = AIConfig().llm_embedding_chunk_size
    embed_model = get_embedding_model()

-    with write_store() as store:
+    with write_store(embed_model_name=model_name) as store:
        if rebuild or not store.table_exists():
            (settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True)
            logger.info("Rebuilding LLM index.")
@@ -275,7 +271,7 @@ def update_llm_index(

        store.ensure_document_id_scalar_index()
        store.maybe_create_ann_index()
-        store.compact(retention_seconds=get_llm_index_compaction_retention())
+        store.compact(retention_seconds=60 * 60)  # 1 hour: safe for in-flight readers
    return msg


@@ -285,7 +281,7 @@ def llm_index_add_or_update_document(document: Document):
    if new_nodes:
        _embed_nodes(new_nodes, get_embedding_model())

-    with write_store() as store:
+    with write_store(embed_model_name=get_configured_model_name()) as store:
        store.upsert_document(str(document.id), new_nodes)
        store.ensure_document_id_scalar_index()

@@ -155,13 +155,13 @@ def test_update_llm_index(


@pytest.mark.django_db
-def test_update_llm_index_removes_meta(
+def test_update_llm_index_cleans_stale_meta_on_rebuild(
    temp_llm_index_dir: Path,
    real_document: Document,
    mock_embed_model: FakeEmbedding,
 ) -> None:
-    # Pre-create a meta.json — the new LanceDB-backed rebuild must delete it so
-    # that stale FAISS-era metadata does not accumulate on disk.
+    # A meta.json left over from the FAISS era (or written by older code) must be
+    # deleted on rebuild so stale artifacts don't accumulate on disk.
    stale_meta = temp_llm_index_dir / "meta.json"
    stale_meta.write_text(json.dumps({"embedding_model": "old", "dim": 1}))

@@ -177,6 +177,42 @@ def test_update_llm_index_removes_meta(
    )


+@pytest.mark.django_db
+def test_update_llm_index_rebuilds_on_model_name_change(
+    temp_llm_index_dir: Path,
+    real_document: Document,
+    mock_embed_model: FakeEmbedding,
+) -> None:
+    # Build initial index with model "model-a".
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        with patch(
+            "paperless_ai.indexing.get_configured_model_name",
+            return_value="model-a",
+        ):
+            indexing.update_llm_index(rebuild=True)
+
+    # Simulate config change to "model-b"; the incremental run must force a rebuild.
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        with patch(
+            "paperless_ai.indexing.get_configured_model_name",
+            return_value="model-b",
+        ):
+            indexing.update_llm_index(rebuild=False)
+
+    store = indexing.get_vector_store()
+    # Schema metadata only updates when the table is dropped and recreated, never on
+    # incremental writes -- so "model-b" here proves a full rebuild happened.
+    assert store.stored_model_name() == "model-b"
+
+
@pytest.mark.django_db
 def test_update_llm_index_partial_update(
    temp_llm_index_dir: Path,
@@ -641,42 +677,6 @@ class TestLlmIndexLocking:


@pytest.mark.django_db
-class TestDimensionGuard:
-    def test_embedding_dim_mismatch_false_when_no_table(
-        self,
-        temp_llm_index_dir: Path,
-        mock_embed_model: FakeEmbedding,
-    ) -> None:
-        """No table yet — dim mismatch must return False (nothing to compare)."""
-        assert not indexing.embedding_dim_mismatch()
-
-    def test_update_llm_index_forces_rebuild_on_dim_mismatch(
-        self,
-        temp_llm_index_dir: Path,
-        mock_embed_model: FakeEmbedding,
-        mocker: pytest_mock.MockerFixture,
-    ) -> None:
-        """When the stored dim differs from the current model, update must force a rebuild."""
-        mocker.patch("paperless_ai.indexing.embedding_dim_mismatch", return_value=True)
-        mocker.patch("paperless_ai.indexing.llm_index_exists", return_value=True)
-        mock_store = MagicMock()
-        mocker.patch(
-            "paperless_ai.indexing.write_store",
-            return_value=mocker.MagicMock(
-                __enter__=mocker.MagicMock(return_value=mock_store),
-                __exit__=mocker.MagicMock(return_value=False),
-            ),
-        )
-        mock_qs = MagicMock()
-        mock_qs.exists.return_value = True
-        mock_qs.__iter__ = MagicMock(return_value=iter([]))
-        mocker.patch("paperless_ai.indexing.Document.objects.all", return_value=mock_qs)
-
-        indexing.update_llm_index(rebuild=False)
-
-        mock_store.drop_table.assert_called_once()
-
-
@pytest.mark.django_db
 class TestLanceDbIndexing:
    def test_get_vector_store_roundtrip(
@@ -1,4 +1,3 @@
-import json
 from unittest.mock import ANY
 from unittest.mock import MagicMock
 from unittest.mock import patch
@@ -10,7 +9,7 @@ from documents.models import Document
 from paperless.models import LLMEmbeddingBackend
 from paperless_ai.embedding import _normalize_llm_index_text
 from paperless_ai.embedding import build_llm_index_text
-from paperless_ai.embedding import get_embedding_dim
+from paperless_ai.embedding import get_configured_model_name
 from paperless_ai.embedding import get_embedding_model


@@ -186,52 +185,32 @@ def test_get_embedding_model_invalid_backend(mock_ai_config):
        get_embedding_model()


-def test_get_embedding_dim_infers_and_saves(temp_llm_index_dir, mock_ai_config):
-    mock_ai_config.return_value.llm_embedding_backend = "openai-like"
+@pytest.mark.parametrize(
+    ("backend", "expected_default"),
+    [
+        (LLMEmbeddingBackend.OPENAI_LIKE, "text-embedding-3-small"),
+        (LLMEmbeddingBackend.HUGGINGFACE, "sentence-transformers/all-MiniLM-L6-v2"),
+        (LLMEmbeddingBackend.OLLAMA, "embeddinggemma"),
+    ],
+)
+def test_get_configured_model_name_falls_back_to_backend_default(
+    mock_ai_config,
+    backend,
+    expected_default,
+):
+    """When no model is explicitly configured, each backend has a distinct default."""
+    mock_ai_config.return_value.llm_embedding_backend = backend
    mock_ai_config.return_value.llm_embedding_model = None
-
-    class DummyEmbedding:
-        def get_text_embedding(self, text):
-            return [0.0] * 7
-
-    with patch(
-        "paperless_ai.embedding.get_embedding_model",
-        return_value=DummyEmbedding(),
-    ) as mock_get:
-        dim = get_embedding_dim()
-        mock_get.assert_called_once()
-
-    assert dim == 7
-    meta = json.loads((temp_llm_index_dir / "meta.json").read_text())
-    assert meta == {"embedding_model": "text-embedding-3-small", "dim": 7}
+    assert get_configured_model_name() == expected_default


-def test_get_embedding_dim_reads_existing_meta(temp_llm_index_dir, mock_ai_config):
-    mock_ai_config.return_value.llm_embedding_backend = "openai-like"
-    mock_ai_config.return_value.llm_embedding_model = None
-
-    (temp_llm_index_dir / "meta.json").write_text(
-        json.dumps({"embedding_model": "text-embedding-3-small", "dim": 11}),
-    )
-
-    with patch("paperless_ai.embedding.get_embedding_model") as mock_get:
-        assert get_embedding_dim() == 11
-        mock_get.assert_not_called()
-
-
-def test_get_embedding_dim_raises_on_model_change(temp_llm_index_dir, mock_ai_config):
-    mock_ai_config.return_value.llm_embedding_backend = "openai-like"
-    mock_ai_config.return_value.llm_embedding_model = None
-
-    (temp_llm_index_dir / "meta.json").write_text(
-        json.dumps({"embedding_model": "old", "dim": 11}),
-    )
-
-    with pytest.raises(
-        RuntimeError,
-        match="Embedding model changed from old to text-embedding-3-small",
-    ):
-        get_embedding_dim()
+def test_get_configured_model_name_explicit_overrides_default(mock_ai_config):
+    """An explicit model name overrides the backend default for all backends."""
+    mock_ai_config.return_value.llm_embedding_backend = LLMEmbeddingBackend.OPENAI_LIKE
+    mock_ai_config.return_value.llm_embedding_model = "my-custom-model"
+    # The backend default for OPENAI_LIKE is "text-embedding-3-small", so if
+    # the explicit name was ignored we'd get the wrong result.
+    assert get_configured_model_name() == "my-custom-model"


 def test_build_llm_index_text(mock_document):
@@ -320,6 +320,57 @@ class TestPaperlessLanceVectorStoreMaintenance:
        store.ensure_document_id_scalar_index()  # no table yet — must not raise


+class TestConfigMismatch:
+    @pytest.fixture
+    def uri(self, tmp_path: Path) -> str:
+        return str(tmp_path / "idx")
+
+    def test_stored_model_name_returns_none_when_no_table(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri)
+        assert store.stored_model_name() is None
+
+    def test_model_name_stored_in_schema_after_add(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2")
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.stored_model_name() == "all-MiniLM-L6-v2"
+
+    def test_model_name_stored_in_schema_after_upsert(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="nomic-embed")
+        store.upsert_document("1", [_node("1-0", "1", "text", 0.1)])
+        assert store.stored_model_name() == "nomic-embed"
+
+    def test_model_name_persists_after_reopen(self, uri: str) -> None:
+        PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2").add(
+            [_node("1-0", "1", "text", 0.1)],
+        )
+        reopened = PaperlessLanceVectorStore(uri=uri)
+        assert reopened.stored_model_name() == "all-MiniLM-L6-v2"
+
+    def test_config_mismatch_returns_false_when_no_table(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri)
+        assert store.config_mismatch("any-model") is False
+
+    def test_config_mismatch_returns_false_when_model_matches(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2")
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.config_mismatch("all-MiniLM-L6-v2") is False
+
+    def test_config_mismatch_returns_true_when_model_differs(self, uri: str) -> None:
+        store = PaperlessLanceVectorStore(uri=uri, embed_model_name="old-model")
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.config_mismatch("new-model") is True
+
+    def test_config_mismatch_returns_false_when_no_metadata_stored(
+        self,
+        uri: str,
+    ) -> None:
+        # Tables created before model-name tracking was added have no schema metadata.
+        # Conservative default: assume compatible rather than force a rebuild.
+        store = PaperlessLanceVectorStore(uri=uri)
+        store.add([_node("1-0", "1", "text", 0.1)])
+        assert store.config_mismatch("any-model") is False
+
+
 class TestGetModifiedTimes:
    @pytest.fixture
    def store(self, tmp_path: Path) -> PaperlessLanceVectorStore:
@@ -71,13 +71,20 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):

    _uri: str = PrivateAttr()
    _table_name: str = PrivateAttr()
+    _embed_model_name: str | None = PrivateAttr()
    _conn: Any = PrivateAttr()
    _table: Any = PrivateAttr()

-    def __init__(self, uri: str, table_name: str = DEFAULT_TABLE_NAME) -> None:
+    def __init__(
+        self,
+        uri: str,
+        table_name: str = DEFAULT_TABLE_NAME,
+        embed_model_name: str | None = None,
+    ) -> None:
        super().__init__(stores_text=True, flat_metadata=False)
        self._uri = uri
        self._table_name = table_name
+        self._embed_model_name = embed_model_name
        self._conn = lancedb.connect(uri)
        existing = self._conn.list_tables().tables
        self._table = (
@@ -101,8 +108,29 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
            self._conn.drop_table(self._table_name)
        self._table = None

+    def stored_model_name(self) -> str | None:
+        """Return the embedding model name stored in table schema metadata, or None."""
+        if self._table is None:
+            return None
+        meta = self._table.schema.metadata or {}
+        value = meta.get(b"embed_model")
+        return value.decode() if value else None
+
+    def config_mismatch(self, model_name: str) -> bool:
+        """True when the stored model name differs from ``model_name``.
+
+        Returns False when no table exists or when the table predates model-name
+        tracking (schema has no metadata) — conservative default avoids spurious
+        rebuilds on upgrade.
+        """
+        stored = self.stored_model_name()
+        if stored is None:
+            return False
+        return stored != model_name
+
    @staticmethod
-    def _schema(dim: int) -> pa.Schema:
+    def _schema(dim: int, model_name: str | None = None) -> pa.Schema:
+        meta = {b"embed_model": model_name.encode()} if model_name else None
        return pa.schema(
            [
                pa.field("id", pa.string()),
@@ -112,6 +140,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
                pa.field("vector", pa.list_(pa.float32(), dim)),
                pa.field("node_content", pa.string()),
            ],
+            metadata=meta,
        )

    def _row(self, node: BaseNode) -> dict[str, Any]:
@@ -140,7 +169,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore):
        self._table = self._conn.create_table(
            self._table_name,
            rows,
-            schema=self._schema(dim),
+            schema=self._schema(dim, self._embed_model_name),
        )
        return True