From 045d6dd723137cafe8991125a40388e4f420a2af Mon Sep 17 00:00:00 2001
From: stumpylog <797416+stumpylog@users.noreply.github.com>
Date: Tue, 9 Jun 2026 09:39:30 -0700
Subject: [PATCH] feat(ai): wire schema migrations into update_llm_index; structural changes avoid re-embed

Structural migrations (requires_reembed=False) are applied in-place before
the incremental update path. If any pending migration requires re-embedding,
a full drop+rebuild is triggered automatically, mirroring the model-name
mismatch detection that already existed.

Co-Authored-By: Claude Sonnet 4.6
---
NOTE(review): line breaks and indentation in this patch were reconstructed
from a whitespace-collapsed copy. Hunk line counts match the headers, but the
indentation of context lines and the position of the blank context line in
the first hunk are assumptions; confirm against the target files before
applying (or apply with `git apply --ignore-whitespace`).

 src/paperless_ai/indexing.py               |  7 ++
 src/paperless_ai/tests/test_ai_indexing.py | 87 ++++++++++++++++++++++
 2 files changed, 94 insertions(+)

diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index dd96106a6..37c341eed 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -249,6 +249,13 @@ def update_llm_index(
     embed_model = get_embedding_model(config)
 
     with write_store(embed_model_name=model_name) as store:
+        if not rebuild and store.table_exists():
+            store.apply_structural_migrations()
+            if store.requires_reembed_migration():
+                logger.warning(
+                    "Schema migration requires re-embedding; forcing LLM index rebuild.",
+                )
+                rebuild = True
         if rebuild or not store.table_exists():
             (settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True)
             logger.info("Rebuilding LLM index.")
diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py
index 31e1f6bc8..5fbb6806d 100644
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -213,6 +213,93 @@ def test_update_llm_index_rebuilds_on_model_name_change(
     assert store.stored_model_name() == "model-b"
 
 
+@pytest.mark.django_db
+def test_update_llm_index_applies_structural_migration_without_rebuild(
+    temp_llm_index_dir: Path,
+    real_document: Document,
+    mock_embed_model: FakeEmbedding,
+    mocker: pytest_mock.MockerFixture,
+) -> None:
+    """Structural migrations are applied in-place; no full rebuild (drop) occurs."""
+    from paperless_ai.vector_store import Migration
+    from paperless_ai.vector_store import PaperlessLanceVectorStore
+
+    column_added: list[bool] = []
+
+    def _add_extra(table) -> None:
+        table.add_columns({"extra": "CAST(NULL AS string)"})
+        column_added.append(True)
+
+    # Build the initial index at version 1 (the real CURRENT_SCHEMA_VERSION; no patches needed).
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        indexing.update_llm_index(rebuild=True)
+
+    # Simulate a new v2 structural migration being introduced after the initial index was built.
+    m2 = Migration(
+        version=2,
+        description="add extra col",
+        requires_reembed=False,
+        apply=_add_extra,
+    )
+    mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2])
+    mocker.patch("paperless_ai.vector_store.CURRENT_SCHEMA_VERSION", 2)
+    drop_spy = mocker.spy(PaperlessLanceVectorStore, "drop_table")
+
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        indexing.update_llm_index(rebuild=False)
+
+    assert column_added, "Structural migration apply() was not called"
+    drop_spy.assert_not_called()
+
+
+@pytest.mark.django_db
+def test_update_llm_index_forces_rebuild_on_reembed_migration(
+    temp_llm_index_dir: Path,
+    real_document: Document,
+    mock_embed_model: FakeEmbedding,
+    mocker: pytest_mock.MockerFixture,
+) -> None:
+    """A pending reembed migration causes a full drop+rebuild on next update."""
+    from paperless_ai.vector_store import Migration
+    from paperless_ai.vector_store import PaperlessLanceVectorStore
+
+    # Build the initial index at version 1 (the real CURRENT_SCHEMA_VERSION; no patches needed).
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        indexing.update_llm_index(rebuild=True)
+
+    # Simulate a reembed migration at v2 being introduced after the initial index was built.
+    m2 = Migration(
+        version=2,
+        description="requires reembed",
+        requires_reembed=True,
+        apply=lambda t: None,
+    )
+    mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2])
+    mocker.patch("paperless_ai.vector_store.CURRENT_SCHEMA_VERSION", 2)
+    drop_spy = mocker.spy(PaperlessLanceVectorStore, "drop_table")
+
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document])
+        mock_all.return_value = mock_queryset
+        indexing.update_llm_index(rebuild=False)
+
+    drop_spy.assert_called()
+
+
 @pytest.mark.django_db
 def test_update_llm_index_partial_update(
     temp_llm_index_dir: Path,