From 7f5053cbe3db4c6dfe0f786a18ab0680d64194d6 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sat, 6 Jun 2026 14:48:50 -0700 Subject: [PATCH] Implements config_mismatch which checks for either a dimension or model name mismatch. Model name is now stored as internal metadata of the vector store --- src/paperless_ai/embedding.py | 45 +++--------- src/paperless_ai/indexing.py | 50 ++++++------- src/paperless_ai/tests/test_ai_indexing.py | 78 ++++++++++----------- src/paperless_ai/tests/test_embedding.py | 69 +++++++----------- src/paperless_ai/tests/test_vector_store.py | 51 ++++++++++++++ src/paperless_ai/vector_store.py | 35 ++++++++- 6 files changed, 180 insertions(+), 148 deletions(-) diff --git a/src/paperless_ai/embedding.py b/src/paperless_ai/embedding.py index 59582130f..8480cb76d 100644 --- a/src/paperless_ai/embedding.py +++ b/src/paperless_ai/embedding.py @@ -1,12 +1,9 @@ -import json import re from typing import TYPE_CHECKING from django.conf import settings if TYPE_CHECKING: - from pathlib import Path - from llama_index.core.base.embeddings.base import BaseEmbedding from documents.models import Document @@ -95,41 +92,21 @@ def get_embedding_model() -> "BaseEmbedding": ) -def get_embedding_dim() -> int: - """ - Loads embedding dimension from meta.json if available, otherwise infers it - from a dummy embedding and stores it for future use. - """ +_DEFAULT_MODEL_NAMES = { + LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small", + LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2", + LLMEmbeddingBackend.OLLAMA: "embeddinggemma", +} + + +def get_configured_model_name() -> str: + """Return the canonical name of the currently configured embedding model.""" config = AIConfig() - default_model = { - LLMEmbeddingBackend.OPENAI_LIKE: "text-embedding-3-small", - LLMEmbeddingBackend.HUGGINGFACE: "sentence-transformers/all-MiniLM-L6-v2", - LLMEmbeddingBackend.OLLAMA: "embeddinggemma", - }.get( + default = _DEFAULT_MODEL_NAMES.get( config.llm_embedding_backend, "sentence-transformers/all-MiniLM-L6-v2", ) - model = config.llm_embedding_model or default_model - - meta_path: Path = settings.LLM_INDEX_DIR / "meta.json" - if meta_path.exists(): - with meta_path.open() as f: - meta = json.load(f) - if meta.get("embedding_model") != model: - raise RuntimeError( - f"Embedding model changed from {meta.get('embedding_model')} to {model}. " - "You must rebuild the index.", - ) - return meta["dim"] - - embedding_model = get_embedding_model() - test_embed = embedding_model.get_text_embedding("test") - dim = len(test_embed) - - with meta_path.open("w") as f: - json.dump({"embedding_model": model, "dim": dim}, f) - - return dim + return config.llm_embedding_model or default def _normalize_llm_index_text(text: str) -> str: diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py index 3e6cff3d3..bfd4edd72 100644 --- a/src/paperless_ai/indexing.py +++ b/src/paperless_ai/indexing.py @@ -14,6 +14,7 @@ from documents.utils import IterWrapper from documents.utils import identity from paperless.config import AIConfig from paperless_ai.embedding import build_llm_index_text +from paperless_ai.embedding import get_configured_model_name from paperless_ai.embedding import get_embedding_model if TYPE_CHECKING: @@ -72,16 +73,25 @@ def get_vector_store() -> "PaperlessLanceVectorStore": @contextmanager -def write_store(): +def write_store(embed_model_name: str | None = None): """Acquire the write lock and yield the vector store. All mutating operations (upsert, delete, rebuild, compact) must go through this context manager to serialise concurrent Celery writers. Read paths use ``get_vector_store()`` directly — no lock needed. + + Pass ``embed_model_name`` whenever the operation may create the table so + the model name is recorded in the schema metadata for future mismatch checks. """ + from paperless_ai.vector_store import PaperlessLanceVectorStore + settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True) with FileLock(settings.LLM_INDEX_LOCK): - yield get_vector_store() + yield PaperlessLanceVectorStore( + uri=str(settings.LLM_INDEX_DIR), + table_name=LLM_INDEX_TABLE, + embed_model_name=embed_model_name, + ) def build_document_node( @@ -148,25 +158,10 @@ def llm_index_exists() -> bool: return get_vector_store().table_exists() -def embedding_dim_mismatch() -> bool: - """True when the stored table's vector dim differs from the current model.""" - store = get_vector_store() - stored = store.vector_dim() - if stored is None: - return False - from paperless_ai.embedding import get_embedding_dim - - return stored != get_embedding_dim() - - def get_rag_chunk_size() -> int: return AIConfig().llm_embedding_chunk_size -def get_rag_context_size() -> int: - return AIConfig().llm_context_size - - def get_rag_chunk_overlap(chunk_size: int | None = None) -> int: chunk_size = chunk_size or get_rag_chunk_size() return min(RAG_CHUNK_OVERLAP, chunk_size - 1) @@ -222,19 +217,20 @@ def _document_id_filters(doc_ids): ) -def get_llm_index_compaction_retention() -> int: - """Seconds of MVCC version history to keep during compaction.""" - return 60 * 60 # 1 hour: safe for in-flight readers, reclaims daily - - def update_llm_index( *, iter_wrapper: IterWrapper[Document] = identity, rebuild=False, ) -> str: """Rebuild or incrementally update the LLM index.""" - if not rebuild and llm_index_exists() and embedding_dim_mismatch(): - logger.warning("Embedding dimension changed; forcing LLM index rebuild.") + model_name = get_configured_model_name() + + if ( + not rebuild + and llm_index_exists() + and get_vector_store().config_mismatch(model_name) + ): + logger.warning("Embedding model changed; forcing LLM index rebuild.") rebuild = True documents = Document.objects.all() @@ -246,7 +242,7 @@ def update_llm_index( chunk_size = AIConfig().llm_embedding_chunk_size embed_model = get_embedding_model() - with write_store() as store: + with write_store(embed_model_name=model_name) as store: if rebuild or not store.table_exists(): (settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True) logger.info("Rebuilding LLM index.") @@ -275,7 +271,7 @@ def update_llm_index( store.ensure_document_id_scalar_index() store.maybe_create_ann_index() - store.compact(retention_seconds=get_llm_index_compaction_retention()) + store.compact(retention_seconds=60 * 60) # 1 hour: safe for in-flight readers return msg @@ -285,7 +281,7 @@ def llm_index_add_or_update_document(document: Document): if new_nodes: _embed_nodes(new_nodes, get_embedding_model()) - with write_store() as store: + with write_store(embed_model_name=get_configured_model_name()) as store: store.upsert_document(str(document.id), new_nodes) store.ensure_document_id_scalar_index() diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py index 64a3868e9..31e1f6bc8 100644 --- a/src/paperless_ai/tests/test_ai_indexing.py +++ b/src/paperless_ai/tests/test_ai_indexing.py @@ -155,13 +155,13 @@ def test_update_llm_index( @pytest.mark.django_db -def test_update_llm_index_removes_meta( +def test_update_llm_index_cleans_stale_meta_on_rebuild( temp_llm_index_dir: Path, real_document: Document, mock_embed_model: FakeEmbedding, ) -> None: - # Pre-create a meta.json — the new LanceDB-backed rebuild must delete it so - # that stale FAISS-era metadata does not accumulate on disk. + # A meta.json left over from the FAISS era (or written by older code) must be + # deleted on rebuild so stale artifacts don't accumulate on disk. stale_meta = temp_llm_index_dir / "meta.json" stale_meta.write_text(json.dumps({"embedding_model": "old", "dim": 1})) @@ -177,6 +177,42 @@ def test_update_llm_index_removes_meta( ) +@pytest.mark.django_db +def test_update_llm_index_rebuilds_on_model_name_change( + temp_llm_index_dir: Path, + real_document: Document, + mock_embed_model: FakeEmbedding, +) -> None: + # Build initial index with model "model-a". + with patch("documents.models.Document.objects.all") as mock_all: + mock_queryset = MagicMock() + mock_queryset.exists.return_value = True + mock_queryset.__iter__.return_value = iter([real_document]) + mock_all.return_value = mock_queryset + with patch( + "paperless_ai.indexing.get_configured_model_name", + return_value="model-a", + ): + indexing.update_llm_index(rebuild=True) + + # Simulate config change to "model-b"; the incremental run must force a rebuild. + with patch("documents.models.Document.objects.all") as mock_all: + mock_queryset = MagicMock() + mock_queryset.exists.return_value = True + mock_queryset.__iter__.return_value = iter([real_document]) + mock_all.return_value = mock_queryset + with patch( + "paperless_ai.indexing.get_configured_model_name", + return_value="model-b", + ): + indexing.update_llm_index(rebuild=False) + + store = indexing.get_vector_store() + # Schema metadata only updates when the table is dropped and recreated, never on + # incremental writes -- so "model-b" here proves a full rebuild happened. + assert store.stored_model_name() == "model-b" + + @pytest.mark.django_db def test_update_llm_index_partial_update( temp_llm_index_dir: Path, @@ -641,42 +677,6 @@ class TestLlmIndexLocking: @pytest.mark.django_db -class TestDimensionGuard: - def test_embedding_dim_mismatch_false_when_no_table( - self, - temp_llm_index_dir: Path, - mock_embed_model: FakeEmbedding, - ) -> None: - """No table yet — dim mismatch must return False (nothing to compare).""" - assert not indexing.embedding_dim_mismatch() - - def test_update_llm_index_forces_rebuild_on_dim_mismatch( - self, - temp_llm_index_dir: Path, - mock_embed_model: FakeEmbedding, - mocker: pytest_mock.MockerFixture, - ) -> None: - """When the stored dim differs from the current model, update must force a rebuild.""" - mocker.patch("paperless_ai.indexing.embedding_dim_mismatch", return_value=True) - mocker.patch("paperless_ai.indexing.llm_index_exists", return_value=True) - mock_store = MagicMock() - mocker.patch( - "paperless_ai.indexing.write_store", - return_value=mocker.MagicMock( - __enter__=mocker.MagicMock(return_value=mock_store), - __exit__=mocker.MagicMock(return_value=False), - ), - ) - mock_qs = MagicMock() - mock_qs.exists.return_value = True - mock_qs.__iter__ = MagicMock(return_value=iter([])) - mocker.patch("paperless_ai.indexing.Document.objects.all", return_value=mock_qs) - - indexing.update_llm_index(rebuild=False) - - mock_store.drop_table.assert_called_once() - - @pytest.mark.django_db class TestLanceDbIndexing: def test_get_vector_store_roundtrip( diff --git a/src/paperless_ai/tests/test_embedding.py b/src/paperless_ai/tests/test_embedding.py index 102a88367..d1d0754d1 100644 --- a/src/paperless_ai/tests/test_embedding.py +++ b/src/paperless_ai/tests/test_embedding.py @@ -1,4 +1,3 @@ -import json from unittest.mock import ANY from unittest.mock import MagicMock from unittest.mock import patch @@ -10,7 +9,7 @@ from documents.models import Document from paperless.models import LLMEmbeddingBackend from paperless_ai.embedding import _normalize_llm_index_text from paperless_ai.embedding import build_llm_index_text -from paperless_ai.embedding import get_embedding_dim +from paperless_ai.embedding import get_configured_model_name from paperless_ai.embedding import get_embedding_model @@ -186,52 +185,32 @@ def test_get_embedding_model_invalid_backend(mock_ai_config): get_embedding_model() -def test_get_embedding_dim_infers_and_saves(temp_llm_index_dir, mock_ai_config): - mock_ai_config.return_value.llm_embedding_backend = "openai-like" +@pytest.mark.parametrize( + ("backend", "expected_default"), + [ + (LLMEmbeddingBackend.OPENAI_LIKE, "text-embedding-3-small"), + (LLMEmbeddingBackend.HUGGINGFACE, "sentence-transformers/all-MiniLM-L6-v2"), + (LLMEmbeddingBackend.OLLAMA, "embeddinggemma"), + ], +) +def test_get_configured_model_name_falls_back_to_backend_default( + mock_ai_config, + backend, + expected_default, +): + """When no model is explicitly configured, each backend has a distinct default.""" + mock_ai_config.return_value.llm_embedding_backend = backend mock_ai_config.return_value.llm_embedding_model = None - - class DummyEmbedding: - def get_text_embedding(self, text): - return [0.0] * 7 - - with patch( - "paperless_ai.embedding.get_embedding_model", - return_value=DummyEmbedding(), - ) as mock_get: - dim = get_embedding_dim() - mock_get.assert_called_once() - - assert dim == 7 - meta = json.loads((temp_llm_index_dir / "meta.json").read_text()) - assert meta == {"embedding_model": "text-embedding-3-small", "dim": 7} + assert get_configured_model_name() == expected_default -def test_get_embedding_dim_reads_existing_meta(temp_llm_index_dir, mock_ai_config): - mock_ai_config.return_value.llm_embedding_backend = "openai-like" - mock_ai_config.return_value.llm_embedding_model = None - - (temp_llm_index_dir / "meta.json").write_text( - json.dumps({"embedding_model": "text-embedding-3-small", "dim": 11}), - ) - - with patch("paperless_ai.embedding.get_embedding_model") as mock_get: - assert get_embedding_dim() == 11 - mock_get.assert_not_called() - - -def test_get_embedding_dim_raises_on_model_change(temp_llm_index_dir, mock_ai_config): - mock_ai_config.return_value.llm_embedding_backend = "openai-like" - mock_ai_config.return_value.llm_embedding_model = None - - (temp_llm_index_dir / "meta.json").write_text( - json.dumps({"embedding_model": "old", "dim": 11}), - ) - - with pytest.raises( - RuntimeError, - match="Embedding model changed from old to text-embedding-3-small", - ): - get_embedding_dim() +def test_get_configured_model_name_explicit_overrides_default(mock_ai_config): + """An explicit model name overrides the backend default for all backends.""" + mock_ai_config.return_value.llm_embedding_backend = LLMEmbeddingBackend.OPENAI_LIKE + mock_ai_config.return_value.llm_embedding_model = "my-custom-model" + # The backend default for OPENAI_LIKE is "text-embedding-3-small", so if + # the explicit name was ignored we'd get the wrong result. + assert get_configured_model_name() == "my-custom-model" def test_build_llm_index_text(mock_document): diff --git a/src/paperless_ai/tests/test_vector_store.py b/src/paperless_ai/tests/test_vector_store.py index bf4bfe149..b409ed1c9 100644 --- a/src/paperless_ai/tests/test_vector_store.py +++ b/src/paperless_ai/tests/test_vector_store.py @@ -320,6 +320,57 @@ class TestPaperlessLanceVectorStoreMaintenance: store.ensure_document_id_scalar_index() # no table yet — must not raise +class TestConfigMismatch: + @pytest.fixture + def uri(self, tmp_path: Path) -> str: + return str(tmp_path / "idx") + + def test_stored_model_name_returns_none_when_no_table(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri) + assert store.stored_model_name() is None + + def test_model_name_stored_in_schema_after_add(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2") + store.add([_node("1-0", "1", "text", 0.1)]) + assert store.stored_model_name() == "all-MiniLM-L6-v2" + + def test_model_name_stored_in_schema_after_upsert(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri, embed_model_name="nomic-embed") + store.upsert_document("1", [_node("1-0", "1", "text", 0.1)]) + assert store.stored_model_name() == "nomic-embed" + + def test_model_name_persists_after_reopen(self, uri: str) -> None: + PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2").add( + [_node("1-0", "1", "text", 0.1)], + ) + reopened = PaperlessLanceVectorStore(uri=uri) + assert reopened.stored_model_name() == "all-MiniLM-L6-v2" + + def test_config_mismatch_returns_false_when_no_table(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri) + assert store.config_mismatch("any-model") is False + + def test_config_mismatch_returns_false_when_model_matches(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri, embed_model_name="all-MiniLM-L6-v2") + store.add([_node("1-0", "1", "text", 0.1)]) + assert store.config_mismatch("all-MiniLM-L6-v2") is False + + def test_config_mismatch_returns_true_when_model_differs(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri, embed_model_name="old-model") + store.add([_node("1-0", "1", "text", 0.1)]) + assert store.config_mismatch("new-model") is True + + def test_config_mismatch_returns_false_when_no_metadata_stored( + self, + uri: str, + ) -> None: + # Tables created before model-name tracking was added have no schema metadata. + # Conservative default: assume compatible rather than force a rebuild. + store = PaperlessLanceVectorStore(uri=uri) + store.add([_node("1-0", "1", "text", 0.1)]) + assert store.config_mismatch("any-model") is False + + class TestGetModifiedTimes: @pytest.fixture def store(self, tmp_path: Path) -> PaperlessLanceVectorStore: diff --git a/src/paperless_ai/vector_store.py b/src/paperless_ai/vector_store.py index cd9bf154d..f91d50baf 100644 --- a/src/paperless_ai/vector_store.py +++ b/src/paperless_ai/vector_store.py @@ -71,13 +71,20 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore): _uri: str = PrivateAttr() _table_name: str = PrivateAttr() + _embed_model_name: str | None = PrivateAttr() _conn: Any = PrivateAttr() _table: Any = PrivateAttr() - def __init__(self, uri: str, table_name: str = DEFAULT_TABLE_NAME) -> None: + def __init__( + self, + uri: str, + table_name: str = DEFAULT_TABLE_NAME, + embed_model_name: str | None = None, + ) -> None: super().__init__(stores_text=True, flat_metadata=False) self._uri = uri self._table_name = table_name + self._embed_model_name = embed_model_name self._conn = lancedb.connect(uri) existing = self._conn.list_tables().tables self._table = ( @@ -101,8 +108,29 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore): self._conn.drop_table(self._table_name) self._table = None + def stored_model_name(self) -> str | None: + """Return the embedding model name stored in table schema metadata, or None.""" + if self._table is None: + return None + meta = self._table.schema.metadata or {} + value = meta.get(b"embed_model") + return value.decode() if value else None + + def config_mismatch(self, model_name: str) -> bool: + """True when the stored model name differs from ``model_name``. + + Returns False when no table exists or when the table predates model-name + tracking (schema has no metadata) — conservative default avoids spurious + rebuilds on upgrade. + """ + stored = self.stored_model_name() + if stored is None: + return False + return stored != model_name + @staticmethod - def _schema(dim: int) -> pa.Schema: + def _schema(dim: int, model_name: str | None = None) -> pa.Schema: + meta = {b"embed_model": model_name.encode()} if model_name else None return pa.schema( [ pa.field("id", pa.string()), @@ -112,6 +140,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore): pa.field("vector", pa.list_(pa.float32(), dim)), pa.field("node_content", pa.string()), ], + metadata=meta, ) def _row(self, node: BaseNode) -> dict[str, Any]: @@ -140,7 +169,7 @@ class PaperlessLanceVectorStore(BasePydanticVectorStore): self._table = self._conn.create_table( self._table_name, rows, - schema=self._schema(dim), + schema=self._schema(dim, self._embed_model_name), ) return True