mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-10 23:59:43 +00:00
feat(ai): wire schema migrations into update_llm_index; structural changes avoid re-embed
Structural migrations (requires_reembed=False) are applied in place before the incremental update path. If any pending migration requires re-embedding, a full drop+rebuild is triggered automatically, mirroring the existing model-name mismatch detection.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -249,6 +249,13 @@ def update_llm_index(
|
||||
embed_model = get_embedding_model(config)
|
||||
|
||||
with write_store(embed_model_name=model_name) as store:
|
||||
if not rebuild and store.table_exists():
|
||||
store.apply_structural_migrations()
|
||||
if store.requires_reembed_migration():
|
||||
logger.warning(
|
||||
"Schema migration requires re-embedding; forcing LLM index rebuild.",
|
||||
)
|
||||
rebuild = True
|
||||
if rebuild or not store.table_exists():
|
||||
(settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True)
|
||||
logger.info("Rebuilding LLM index.")
|
||||
|
||||
@@ -213,6 +213,93 @@ def test_update_llm_index_rebuilds_on_model_name_change(
|
||||
assert store.stored_model_name() == "model-b"
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_update_llm_index_applies_structural_migration_without_rebuild(
    temp_llm_index_dir: Path,
    real_document: Document,
    mock_embed_model: FakeEmbedding,
    mocker: pytest_mock.MockerFixture,
) -> None:
    """A pending structural migration runs in place and never drops the table."""
    from paperless_ai.vector_store import Migration
    from paperless_ai.vector_store import PaperlessLanceVectorStore

    # Records one entry per invocation of the migration's apply callback.
    apply_calls: list[bool] = []

    def _add_extra(table) -> None:
        table.add_columns({"extra": "CAST(NULL AS string)"})
        apply_calls.append(True)

    def _run_update(*, rebuild: bool) -> None:
        # Serve exactly one document through Document.objects.all() for the
        # duration of a single update_llm_index call.
        queryset = MagicMock()
        queryset.exists.return_value = True
        queryset.__iter__.return_value = iter([real_document])
        with patch("documents.models.Document.objects.all") as objects_all:
            objects_all.return_value = queryset
            indexing.update_llm_index(rebuild=rebuild)

    # First pass: create the index with the unpatched schema version.
    _run_update(rebuild=True)

    # Then pretend a version-2 structural migration shipped after that build.
    structural_migration = Migration(
        version=2,
        description="add extra col",
        requires_reembed=False,
        apply=_add_extra,
    )
    mocker.patch("paperless_ai.vector_store.MIGRATIONS", [structural_migration])
    mocker.patch("paperless_ai.vector_store.CURRENT_SCHEMA_VERSION", 2)
    drop_table_spy = mocker.spy(PaperlessLanceVectorStore, "drop_table")

    # Second pass: an incremental update must pick the migration up.
    _run_update(rebuild=False)

    assert apply_calls, "Structural migration apply() was not called"
    drop_table_spy.assert_not_called()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_update_llm_index_forces_rebuild_on_reembed_migration(
    temp_llm_index_dir: Path,
    real_document: Document,
    mock_embed_model: FakeEmbedding,
    mocker: pytest_mock.MockerFixture,
) -> None:
    """An outstanding re-embed migration makes the next update drop the table."""
    from paperless_ai.vector_store import Migration
    from paperless_ai.vector_store import PaperlessLanceVectorStore

    def _noop(table) -> None:
        # The migration body is irrelevant here; only requires_reembed matters.
        return None

    def _run_update(*, rebuild: bool) -> None:
        # Serve exactly one document through Document.objects.all() for the
        # duration of a single update_llm_index call.
        queryset = MagicMock()
        queryset.exists.return_value = True
        queryset.__iter__.return_value = iter([real_document])
        with patch("documents.models.Document.objects.all") as objects_all:
            objects_all.return_value = queryset
            indexing.update_llm_index(rebuild=rebuild)

    # First pass: create the index with the unpatched schema version.
    _run_update(rebuild=True)

    # Then pretend a version-2 migration that needs re-embedding shipped afterwards.
    reembed_migration = Migration(
        version=2,
        description="requires reembed",
        requires_reembed=True,
        apply=_noop,
    )
    mocker.patch("paperless_ai.vector_store.MIGRATIONS", [reembed_migration])
    mocker.patch("paperless_ai.vector_store.CURRENT_SCHEMA_VERSION", 2)
    drop_table_spy = mocker.spy(PaperlessLanceVectorStore, "drop_table")

    # Second pass: requested as incremental, but expected to escalate to a rebuild.
    _run_update(rebuild=False)

    drop_table_spy.assert_called()
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_update_llm_index_partial_update(
|
||||
temp_llm_index_dir: Path,
|
||||
|
||||
Reference in New Issue
Block a user