From 8e894f78a005c3340153f47b7dc9e56a687dccf2 Mon Sep 17 00:00:00 2001 From: stumpylog <797416+stumpylog@users.noreply.github.com> Date: Tue, 9 Jun 2026 11:14:34 -0700 Subject: [PATCH] Additional ideas and plans --- .../2026-06-09-lancedb-schema-migrations.md | 745 ++++++++++++++++++ .../2026-06-09-node-metadata-enrichment.md | 446 +++++++++++ .../2026-05-20-ai-taxonomy-hints-design.md | 204 +++-- .../2026-06-09-node-metadata-enrichment.md | 115 +++ 4 files changed, 1403 insertions(+), 107 deletions(-) create mode 100644 docs/superpowers/plans/2026-06-09-lancedb-schema-migrations.md create mode 100644 docs/superpowers/plans/2026-06-09-node-metadata-enrichment.md create mode 100644 docs/superpowers/specs/2026-06-09-node-metadata-enrichment.md diff --git a/docs/superpowers/plans/2026-06-09-lancedb-schema-migrations.md b/docs/superpowers/plans/2026-06-09-lancedb-schema-migrations.md new file mode 100644 index 000000000..7b4472c89 --- /dev/null +++ b/docs/superpowers/plans/2026-06-09-lancedb-schema-migrations.md @@ -0,0 +1,745 @@ +# LanceDB Schema Migration Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a schema versioning and migration system to the LanceDB vector store so that structural column changes can be applied in-place without re-embedding documents, avoiding token costs for users on paid embedding APIs. + +**Architecture:** A `schema_version.json` file is written alongside the LanceDB data directory and tracks the current applied version. A `Migration` dataclass registry in `vector_store.py` holds ordered, typed migration steps; each migration is classified as `requires_reembed=True/False`. 
At index update time, structural-only migrations are applied in-place via LanceDB's `add_columns`/`alter_columns`/`drop_columns` APIs; if any pending migration requires re-embedding, the existing model-mismatch rebuild path is reused. + +**Tech Stack:** Python 3.11, lancedb 0.33, pyarrow, pytest, pytest-mock, factory-boy + +--- + +## File Map + +| File | Change | +| --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `src/paperless_ai/vector_store.py` | Add `CURRENT_SCHEMA_VERSION`, `Migration` dataclass, version file helpers, migration methods; modify `_ensure_table` and `drop_table` | +| `src/paperless_ai/indexing.py` | Call migration inside `update_llm_index`'s `write_store` block | +| `src/paperless_ai/tests/test_vector_store.py` | New `TestSchemaVersioning` and `TestMigrations` test classes | +| `src/paperless_ai/tests/test_ai_indexing.py` | Two new integration tests for migration path | + +--- + +## Task 1: Schema version file helpers + +**Files:** + +- Modify: `src/paperless_ai/vector_store.py` +- Test: `src/paperless_ai/tests/test_vector_store.py` + +- [ ] **Step 1: Write the failing tests** + +Add a new class at the bottom of `test_vector_store.py`: + +```python +class TestSchemaVersioning: + @pytest.fixture + def uri(self, tmp_path: Path) -> str: + return str(tmp_path / "idx") + + def test_version_file_written_on_table_creation(self, uri: str) -> None: + from paperless_ai.vector_store import CURRENT_SCHEMA_VERSION + + store = PaperlessLanceVectorStore(uri=uri) + store.add([_node("1-0", "1", "text", 0.1)]) + + version_file = Path(uri) / "schema_version.json" + assert version_file.exists() + assert json.loads(version_file.read_text())["version"] == CURRENT_SCHEMA_VERSION + + def test_stored_schema_version_returns_current_when_file_missing( + self, uri: str + ) -> None: + from paperless_ai.vector_store import 
CURRENT_SCHEMA_VERSION + + store = PaperlessLanceVectorStore(uri=uri) + store.add([_node("1-0", "1", "text", 0.1)]) + (Path(uri) / "schema_version.json").unlink() + + reopened = PaperlessLanceVectorStore(uri=uri) + assert reopened.stored_schema_version() == CURRENT_SCHEMA_VERSION + + def test_stored_schema_version_persists_after_reopen(self, uri: str) -> None: + from paperless_ai.vector_store import CURRENT_SCHEMA_VERSION + + PaperlessLanceVectorStore(uri=uri).add([_node("1-0", "1", "text", 0.1)]) + + reopened = PaperlessLanceVectorStore(uri=uri) + assert reopened.stored_schema_version() == CURRENT_SCHEMA_VERSION + + def test_drop_table_removes_version_file(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri) + store.add([_node("1-0", "1", "text", 0.1)]) + assert (Path(uri) / "schema_version.json").exists() + + store.drop_table() + assert not (Path(uri) / "schema_version.json").exists() + + def test_version_file_written_on_upsert_creation(self, uri: str) -> None: + from paperless_ai.vector_store import CURRENT_SCHEMA_VERSION + + store = PaperlessLanceVectorStore(uri=uri) + store.upsert_document("1", [_node("1-0", "1", "text", 0.1)]) + + version_file = Path(uri) / "schema_version.json" + assert json.loads(version_file.read_text())["version"] == CURRENT_SCHEMA_VERSION +``` + +Add `import json` and `import pytest_mock` to the top of `test_vector_store.py`. + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestSchemaVersioning -v" +``` + +Expected: all 5 tests fail with `ImportError` or `AttributeError` — `CURRENT_SCHEMA_VERSION` and `stored_schema_version` don't exist yet. 
+ +- [ ] **Step 3: Implement the schema version helpers in `vector_store.py`** + +After the existing imports and before the `DEFAULT_TABLE_NAME` constant, add: + +```python +import json +from pathlib import Path +``` + +After `DEFAULT_TABLE_NAME = "documents"`, add: + +```python +CURRENT_SCHEMA_VERSION: int = 1 +``` + +After the `ANN_PQ_SUB_VECTORS` constant, add nothing yet — version methods go on the class. + +Inside `PaperlessLanceVectorStore`, add these methods after `stored_model_name`: + +```python +@property +def _schema_version_path(self) -> Path: + return Path(self._uri) / "schema_version.json" + +def stored_schema_version(self) -> int: + """Return the schema version recorded on disk, or CURRENT_SCHEMA_VERSION if missing. + + Missing means either the table predates versioning or was just created and the + write hasn't happened yet — treat conservatively as already current. + """ + try: + return int(json.loads(self._schema_version_path.read_text())["version"]) + except (FileNotFoundError, KeyError, ValueError): + return CURRENT_SCHEMA_VERSION + +def _write_schema_version(self, version: int) -> None: + self._schema_version_path.parent.mkdir(parents=True, exist_ok=True) + self._schema_version_path.write_text(json.dumps({"version": version})) +``` + +Modify `_ensure_table` to write the version after creating the table. 
Replace the current method body: + +```python +def _ensure_table(self, rows: list[dict[str, Any]], dim: int) -> bool: + if self._table is not None: + return False + self._table = self._conn.create_table( + self._table_name, + rows, + schema=self._schema(dim, self._embed_model_name), + ) + self._write_schema_version(CURRENT_SCHEMA_VERSION) + return True +``` + +Modify `drop_table` to also remove the version file: + +```python +def drop_table(self) -> None: + if self.table_exists(): + self._conn.drop_table(self._table_name) + self._table = None + self._schema_version_path.unlink(missing_ok=True) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestSchemaVersioning -v" +``` + +Expected: all 5 tests pass. + +- [ ] **Step 5: Verify no regressions** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py -v" +``` + +Expected: all existing tests still pass. + +- [ ] **Step 6: Lint** + +```bash +ruff check src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +ruff format src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +``` + +Expected: no errors. 
+ +- [ ] **Step 7: Commit** + +```bash +git add src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +git commit -m "feat(ai): add schema version file tracking to LanceDB vector store" +``` + +--- + +## Task 2: Migration dataclass and pending migration detection + +**Files:** + +- Modify: `src/paperless_ai/vector_store.py` +- Test: `src/paperless_ai/tests/test_vector_store.py` + +- [ ] **Step 1: Write the failing tests** + +Add a new class to `test_vector_store.py`: + +```python +class TestMigrationRegistry: + @pytest.fixture + def uri(self, tmp_path: Path) -> str: + return str(tmp_path / "idx") + + def _store_at_version(self, uri: str, version: int) -> PaperlessLanceVectorStore: + """Create a store with a table and then fake its on-disk version.""" + store = PaperlessLanceVectorStore(uri=uri) + store.add([_node("1-0", "1", "text", 0.1)]) + store._write_schema_version(version) + return PaperlessLanceVectorStore(uri=uri) # reopen to pick up written version + + def test_pending_migrations_empty_at_current_version(self, uri: str) -> None: + from paperless_ai.vector_store import CURRENT_SCHEMA_VERSION, Migration + + store = self._store_at_version(uri, CURRENT_SCHEMA_VERSION) + assert store.pending_migrations() == [] + + def test_pending_migrations_returns_migrations_above_stored_version( + self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + from paperless_ai.vector_store import Migration + + m2 = Migration(version=2, description="add col", requires_reembed=False, apply=lambda t: None) + m3 = Migration(version=3, description="reindex", requires_reembed=True, apply=lambda t: None) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2, m3]) + + store = self._store_at_version(uri, 1) + pending = store.pending_migrations() + assert pending == [m2, m3] + + def test_pending_migrations_excludes_already_applied( + self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + from paperless_ai.vector_store import Migration + + m2 = 
Migration(version=2, description="add col", requires_reembed=False, apply=lambda t: None) + m3 = Migration(version=3, description="reindex", requires_reembed=True, apply=lambda t: None) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2, m3]) + + store = self._store_at_version(uri, 2) + pending = store.pending_migrations() + assert pending == [m3] + + def test_pending_migrations_empty_when_no_table(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri) + assert store.pending_migrations() == [] + + def test_requires_reembed_migration_false_when_none_pending(self, uri: str) -> None: + store = self._store_at_version(uri, 1) + assert store.requires_reembed_migration() is False + + def test_requires_reembed_migration_false_when_only_structural_pending( + self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + from paperless_ai.vector_store import Migration + + m2 = Migration(version=2, description="add col", requires_reembed=False, apply=lambda t: None) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2]) + + store = self._store_at_version(uri, 1) + assert store.requires_reembed_migration() is False + + def test_requires_reembed_migration_true_when_reembed_migration_pending( + self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + from paperless_ai.vector_store import Migration + + m2 = Migration(version=2, description="reindex", requires_reembed=True, apply=lambda t: None) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2]) + + store = self._store_at_version(uri, 1) + assert store.requires_reembed_migration() is True +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestMigrationRegistry -v" +``` + +Expected: all 7 tests fail — `Migration`, `MIGRATIONS`, `pending_migrations`, `requires_reembed_migration` don't exist yet. 
+ +- [ ] **Step 3: Add `Migration` dataclass and registry to `vector_store.py`** + +Add near the top of the file, after the existing imports: + +```python +from dataclasses import dataclass, field +from typing import Callable +``` + +After the `CURRENT_SCHEMA_VERSION` constant, add: + +```python +@dataclass(frozen=True) +class Migration: + version: int + description: str + requires_reembed: bool + apply: Callable[[Any], None] = field(compare=False, hash=False) +``` + +(`compare=False, hash=False` excludes `apply` from `__eq__` and `__hash__` — equality and hashing are driven by the remaining fields (`version`, `description`, `requires_reembed`), with `version` serving as the natural identity key. This avoids lambda identity issues in tests and makes the API safe for callers that construct `Migration` instances inline.) +```python +# Ordered list of schema migrations. Each entry upgrades the table to `version`. + +# Structural migrations (requires_reembed=False) are applied in-place via LanceDB's + +# add_columns/alter_columns/drop_columns APIs — no re-embedding needed. + +# Migrations with requires_reembed=True cause a full rebuild on next index update, + +# exactly like a model-name change does today. + +# + +# To add a migration: + +# 1. Increment CURRENT_SCHEMA_VERSION. + +# 2. Append a Migration entry here with the new version number. + +# 3. For structural changes, call table.add_columns/alter_columns/drop_columns in apply(). + +# 4. For embedding-invalidating changes, set requires_reembed=True; apply() can be a no-op. 
+ +MIGRATIONS: list[Migration] = [] + +```` + +Inside `PaperlessLanceVectorStore`, add both of the following methods after `_write_schema_version`: + +```python +def pending_migrations(self) -> list[Migration]: + """Return migrations not yet applied to this table, in version order.""" + if self._table is None: + return [] + current = self.stored_schema_version() + return [m for m in MIGRATIONS if m.version > current] + +def requires_reembed_migration(self) -> bool: + """True when any pending migration requires a full re-embedding.""" + return any(m.requires_reembed for m in self.pending_migrations()) +```` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestMigrationRegistry -v" +``` + +Expected: all 7 tests pass. + +- [ ] **Step 5: Lint** + +```bash +ruff check src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +ruff format src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +``` + +- [ ] **Step 6: Commit** + +```bash +git add src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +git commit -m "feat(ai): add Migration registry and pending migration detection" +``` + +--- + +## Task 3: Apply structural migrations in-place + +**Files:** + +- Modify: `src/paperless_ai/vector_store.py` +- Test: `src/paperless_ai/tests/test_vector_store.py` + +- [ ] **Step 1: Write the failing tests** + +Add a new class to `test_vector_store.py`: + +```python +class TestApplyStructuralMigrations: + @pytest.fixture + def uri(self, tmp_path: Path) -> str: + return str(tmp_path / "idx") + + def _store_at_version(self, uri: str, version: int) -> PaperlessLanceVectorStore: + store = PaperlessLanceVectorStore(uri=uri) + store.add([_node("1-0", "1", "text", 0.1)]) + store._write_schema_version(version) + return PaperlessLanceVectorStore(uri=uri) + + def test_apply_structural_adds_column_via_lancedb( 
self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + from paperless_ai.vector_store import Migration + + def _add_extra(table: Any) -> None: + table.add_columns({"extra": "CAST(NULL AS VARCHAR)"}) + + m2 = Migration(version=2, description="add extra col", requires_reembed=False, apply=_add_extra) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2]) + + store = self._store_at_version(uri, 1) + applied = store.apply_structural_migrations() + + assert len(applied) == 1 + assert applied[0] == m2 + # Column actually present in the table schema. + reopened = PaperlessLanceVectorStore(uri=uri) + field_names = [f.name for f in reopened._table.schema] + assert "extra" in field_names + + def test_apply_structural_updates_version_file( + self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + from paperless_ai.vector_store import Migration + + m2 = Migration(version=2, description="add col", requires_reembed=False, apply=lambda t: t.add_columns({"c": "CAST(NULL AS VARCHAR)"})) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2]) + + store = self._store_at_version(uri, 1) + store.apply_structural_migrations() + + assert store.stored_schema_version() == 2 + + def test_apply_structural_skips_reembed_migrations( + self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + from paperless_ai.vector_store import Migration + + applied_versions: list[int] = [] + m2 = Migration(version=2, description="structural", requires_reembed=False, apply=lambda t: applied_versions.append(2) or t.add_columns({"c": "CAST(NULL AS VARCHAR)"})) + m3 = Migration(version=3, description="reembed", requires_reembed=True, apply=lambda t: applied_versions.append(3)) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2, m3]) + + store = self._store_at_version(uri, 1) + applied = store.apply_structural_migrations() + + assert [m.version for m in applied] == [2] + assert 3 not in applied_versions + # Version advances only to the last structural migration 
applied. + assert store.stored_schema_version() == 2 + + def test_apply_structural_noop_at_current_version(self, uri: str) -> None: + store = self._store_at_version(uri, 1) + applied = store.apply_structural_migrations() + assert applied == [] + + def test_apply_structural_noop_when_no_table(self, uri: str) -> None: + store = PaperlessLanceVectorStore(uri=uri) + applied = store.apply_structural_migrations() + assert applied == [] + + def test_apply_structural_refreshes_table_reference( + self, uri: str, mocker: pytest_mock.MockerFixture + ) -> None: + """After add_columns the in-memory table object must reflect the new schema.""" + from paperless_ai.vector_store import Migration + + m2 = Migration(version=2, description="add col", requires_reembed=False, apply=lambda t: t.add_columns({"extra": "CAST(NULL AS VARCHAR)"})) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2]) + + store = self._store_at_version(uri, 1) + store.apply_structural_migrations() + + # The store's own _table reference (not a re-open) must see the new column. + field_names = [f.name for f in store._table.schema] + assert "extra" in field_names +``` + +Add `from typing import Any` to the test file imports if not already present. + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestApplyStructuralMigrations -v" +``` + +Expected: all 6 tests fail — `apply_structural_migrations` doesn't exist yet. + +- [ ] **Step 3: Implement `apply_structural_migrations` in `vector_store.py`** + +Add after `requires_reembed_migration` on the class: + +```python +def apply_structural_migrations(self) -> list[Migration]: + """Apply all pending structural (non-reembed) migrations in version order. + + Each applied migration's ``apply`` callable receives the live LanceDB table + object and should call ``add_columns``, ``alter_columns``, or ``drop_columns`` + as needed. 
After all structural migrations run, the version file is updated + to the highest version applied and the in-memory table reference is refreshed. + + Migrations with ``requires_reembed=True`` are skipped — the caller is + responsible for detecting them via ``requires_reembed_migration()`` and + triggering a full rebuild. + """ + if self._table is None: + return [] + structural = [m for m in self.pending_migrations() if not m.requires_reembed] + if not structural: + return [] + for migration in structural: + logger.info("Applying schema migration v%d: %s", migration.version, migration.description) + migration.apply(self._table) + # Refresh the in-memory table so subsequent operations see the new schema. + self._table = self._conn.open_table(self._table_name) + self._write_schema_version(structural[-1].version) + return structural +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestApplyStructuralMigrations -v" +``` + +Expected: all 6 tests pass. + +- [ ] **Step 5: Full test_vector_store regression check** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py -v" +``` + +Expected: all tests pass. 
+ +- [ ] **Step 6: Lint** + +```bash +ruff check src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +ruff format src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +``` + +- [ ] **Step 7: Commit** + +```bash +git add src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py +git commit -m "feat(ai): implement apply_structural_migrations for in-place schema changes" +``` + +--- + +## Task 4: Wire migrations into `update_llm_index` + +**Files:** + +- Modify: `src/paperless_ai/indexing.py` +- Test: `src/paperless_ai/tests/test_ai_indexing.py` + +- [ ] **Step 1: Write the failing tests** + +Add these two tests to `test_ai_indexing.py`, after the existing `test_update_llm_index_rebuilds_on_model_name_change` test: + +```python +@pytest.mark.django_db +def test_update_llm_index_applies_structural_migration_without_rebuild( + temp_llm_index_dir: Path, + real_document: Document, + mock_embed_model: FakeEmbedding, + mocker: pytest_mock.MockerFixture, +) -> None: + """Structural migrations are applied in-place; no full rebuild (drop) occurs.""" + from paperless_ai.vector_store import Migration, PaperlessLanceVectorStore + + column_added: list[bool] = [] + + def _add_extra(table) -> None: + table.add_columns({"extra": "CAST(NULL AS VARCHAR)"}) + column_added.append(True) + + # Build the initial index at version 1 (the real CURRENT_SCHEMA_VERSION; no patches needed). + with patch("documents.models.Document.objects.all") as mock_all: + mock_queryset = MagicMock() + mock_queryset.exists.return_value = True + mock_queryset.__iter__.return_value = iter([real_document]) + mock_all.return_value = mock_queryset + indexing.update_llm_index(rebuild=True) + + # Simulate a new v2 structural migration being introduced after the initial index was built. 
+ m2 = Migration(version=2, description="add extra col", requires_reembed=False, apply=_add_extra) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2]) + mocker.patch("paperless_ai.vector_store.CURRENT_SCHEMA_VERSION", 2) + drop_spy = mocker.spy(PaperlessLanceVectorStore, "drop_table") + + with patch("documents.models.Document.objects.all") as mock_all: + mock_queryset = MagicMock() + mock_queryset.exists.return_value = True + mock_queryset.__iter__.return_value = iter([real_document]) + mock_all.return_value = mock_queryset + indexing.update_llm_index(rebuild=False) + + assert column_added, "Structural migration apply() was not called" + drop_spy.assert_not_called() + + +@pytest.mark.django_db +def test_update_llm_index_forces_rebuild_on_reembed_migration( + temp_llm_index_dir: Path, + real_document: Document, + mock_embed_model: FakeEmbedding, + mocker: pytest_mock.MockerFixture, +) -> None: + """A pending reembed migration causes a full drop+rebuild on next update.""" + from paperless_ai.vector_store import Migration, PaperlessLanceVectorStore + + # Build the initial index at version 1 (the real CURRENT_SCHEMA_VERSION; no patches needed). + with patch("documents.models.Document.objects.all") as mock_all: + mock_queryset = MagicMock() + mock_queryset.exists.return_value = True + mock_queryset.__iter__.return_value = iter([real_document]) + mock_all.return_value = mock_queryset + indexing.update_llm_index(rebuild=True) + + # Simulate a reembed migration at v2 being introduced after the initial index was built. 
+ m2 = Migration(version=2, description="requires reembed", requires_reembed=True, apply=lambda t: None) + mocker.patch("paperless_ai.vector_store.MIGRATIONS", [m2]) + mocker.patch("paperless_ai.vector_store.CURRENT_SCHEMA_VERSION", 2) + drop_spy = mocker.spy(PaperlessLanceVectorStore, "drop_table") + + with patch("documents.models.Document.objects.all") as mock_all: + mock_queryset = MagicMock() + mock_queryset.exists.return_value = True + mock_queryset.__iter__.return_value = iter([real_document]) + mock_all.return_value = mock_queryset + indexing.update_llm_index(rebuild=False) + + drop_spy.assert_called() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py::test_update_llm_index_applies_structural_migration_without_rebuild src/paperless_ai/tests/test_ai_indexing.py::test_update_llm_index_forces_rebuild_on_reembed_migration -v" +``` + +Expected: both tests fail because `update_llm_index` doesn't call migration methods yet. 
+ +- [ ] **Step 3: Add migration check inside `update_llm_index` in `indexing.py`** + +Inside the `with write_store(embed_model_name=model_name) as store:` block in `update_llm_index`, insert the migration check immediately before the `if rebuild or not store.table_exists():` line: + +```python + if not rebuild and store.table_exists(): + store.apply_structural_migrations() + if store.requires_reembed_migration(): + logger.warning("Schema migration requires re-embedding; forcing LLM index rebuild.") + rebuild = True +``` + +The relevant section of `update_llm_index` should now look like: + +```python + with write_store(embed_model_name=model_name) as store: + if not rebuild and store.table_exists(): + store.apply_structural_migrations() + if store.requires_reembed_migration(): + logger.warning("Schema migration requires re-embedding; forcing LLM index rebuild.") + rebuild = True + if rebuild or not store.table_exists(): + (settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True) + logger.info("Rebuilding LLM index.") + store.drop_table() + ... +``` + +- [ ] **Step 4: Run new tests to verify they pass** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py::test_update_llm_index_applies_structural_migration_without_rebuild src/paperless_ai/tests/test_ai_indexing.py::test_update_llm_index_forces_rebuild_on_reembed_migration -v" +``` + +Expected: both tests pass. + +- [ ] **Step 5: Full indexing regression check** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py -v" +``` + +Expected: all existing tests still pass. + +- [ ] **Step 6: Full AI module test run** + +```bash +bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/ -v" +``` + +Expected: all tests pass. 
+ +- [ ] **Step 7: Lint** + +```bash +ruff check src/paperless_ai/indexing.py src/paperless_ai/tests/test_ai_indexing.py +ruff format src/paperless_ai/indexing.py src/paperless_ai/tests/test_ai_indexing.py +``` + +- [ ] **Step 8: Commit** + +```bash +git add src/paperless_ai/indexing.py src/paperless_ai/tests/test_ai_indexing.py +git commit -m "feat(ai): wire schema migrations into update_llm_index; structural changes avoid re-embed" +``` + +--- + +## How to add a migration (reference for future developers) + +When a future schema change is needed: + +1. Increment `CURRENT_SCHEMA_VERSION` in `vector_store.py`. +2. Append a `Migration` to `MIGRATIONS` with the new version number. +3. If the change is **structural only** (add/rename/drop a column, no embedding content changed): + - Set `requires_reembed=False` + - In `apply`, call `table.add_columns({"col": "CAST(NULL AS string)"})`, `table.drop_columns(["col"])`, or `table.alter_columns({"path": "col", "rename": "new_name"})` as appropriate. +4. If the change affects **what text gets embedded** (new fields in `build_llm_index_text`, chunk size change baked into schema, etc.): + - Set `requires_reembed=True` + - `apply` can be a no-op (`lambda t: None`) — the framework will trigger a full rebuild. Caveat: `update_llm_index` applies structural migrations (advancing the stored version to the highest structural one) before it checks `requires_reembed_migration()`, so a re-embed migration numbered below a structural migration that is pending in the same upgrade would be skipped — confirm the ordering before mixing the two kinds. +5. Write tests for the migration in `test_vector_store.py` following the `TestApplyStructuralMigrations` patterns. 
+ +Example structural migration adding a `language` column: + +```python +CURRENT_SCHEMA_VERSION: int = 2 + +MIGRATIONS: list[Migration] = [ + Migration( + version=2, + description="Add language column for future locale-aware filtering", + requires_reembed=False, + apply=lambda table: table.add_columns({"language": "CAST(NULL AS string)"}), + ), +] +``` diff --git a/docs/superpowers/plans/2026-06-09-node-metadata-enrichment.md b/docs/superpowers/plans/2026-06-09-node-metadata-enrichment.md new file mode 100644 index 000000000..1a8f6ede2 --- /dev/null +++ b/docs/superpowers/plans/2026-06-09-node-metadata-enrichment.md @@ -0,0 +1,446 @@ +# Node Metadata Enrichment Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Move `filename`, `storage_path`, and `archive_serial_number` from the LanceDB embedding text into `node.metadata`, and register a schema migration that triggers an automatic index rebuild on upgrade. + +**Architecture:** Three small, independent changes to two source files, tested first. The migration is a no-op `apply` (the rebuild regenerates all nodes with correct metadata). All three tests go red first, then each implementation makes them green. + +**Tech Stack:** pytest, pytest-django, pytest-mock, factory_boy, llama_index `MetadataMode`, `feature-lancedb-schema-migrate` branch (must be the base branch for this work). 
+ +**Branch base:** `feature-lancedb-schema-migrate` + +--- + +### Task 1: Fail — embedding text no longer contains the three fields + +**Files:** + +- Modify: `src/paperless_ai/tests/test_embedding.py` + +- [ ] **Step 1: Update `mock_document` fixture to set an explicit `storage_path`** + + The fixture currently doesn't set `storage_path`, so the existing code path (`doc.storage_path.name if doc.storage_path else ''`) would call `.name` on a `MagicMock`. Give it an explicit value so assertions are unambiguous. + + Add these two lines to the `mock_document` fixture after `doc.archive_serial_number = "12345"`: + + ```python + doc.storage_path = MagicMock() + doc.storage_path.name = "Finance/Bills" + ``` + +- [ ] **Step 2: Update `test_build_llm_index_text` — flip and add assertions** + + The existing test asserts these fields ARE in the result. Change them to assert they are NOT, and add the two missing ones: + + ```python + # was: assert "Filename: test_file.pdf" in result + assert "Filename: test_file.pdf" not in result + assert "Storage Path: Finance/Bills" not in result + assert "Archive Serial Number: 12345" not in result + ``` + + The assertions for `Notes`, `Content`, and `Custom Field` lines are unchanged — leave them as-is. + +- [ ] **Step 3: Run the test to confirm it fails** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_embedding.py::test_build_llm_index_text -v" + ``` + + Expected: `FAILED` — `AssertionError: assert 'Filename: test_file.pdf' not in '...'` + +--- + +### Task 2: Pass — remove the three fields from `build_llm_index_text` + +**Files:** + +- Modify: `src/paperless_ai/embedding.py` + +- [ ] **Step 1: Remove the three lines and the TODO comment** + + Current `build_llm_index_text` (lines 114–133). 
Replace the function body: + + ```python + def build_llm_index_text(doc: Document) -> str: + lines = [ + f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}", + ] + + for instance in doc.custom_fields.all(): + lines.append(f"Custom Field - {instance.field.name}: {instance}") + + lines.append("\nContent:\n") + lines.append(doc.content or "") + + return _normalize_llm_index_text("\n".join(lines)) + ``` + +- [ ] **Step 2: Run the test to confirm it passes** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_embedding.py::test_build_llm_index_text -v" + ``` + + Expected: `PASSED` + +- [ ] **Step 3: Run the full embedding test module to catch regressions** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_embedding.py -v" + ``` + + Expected: all green. + +- [ ] **Step 4: Commit** + + ```bash + git add src/paperless_ai/embedding.py src/paperless_ai/tests/test_embedding.py + git commit -m "refactor(ai): remove filename/storage_path/asn from embedding text" + ``` + +--- + +### Task 3: Fail — `build_document_node` exposes the three fields in metadata + +**Files:** + +- Modify: `src/paperless_ai/tests/test_ai_indexing.py` + +- [ ] **Step 1: Extend `test_build_document_node_structured_fields_in_metadata`** + + This test already checks for `title`, `tags`, etc. Add the three new keys. The `real_document` fixture creates a document with no storage path set, so `storage_path` will be `None` — the key must still be present. 
+ + Replace the existing test body: + + ```python + @pytest.mark.django_db + def test_build_document_node_structured_fields_in_metadata( + real_document: Document, + ) -> None: + """Structured fields must be in node.metadata so the LLM receives them via metadata prepend.""" + nodes = indexing.build_document_node(real_document) + assert len(nodes) > 0 + for node in nodes: + assert "title" in node.metadata + assert "tags" in node.metadata + assert "correspondent" in node.metadata + assert "document_type" in node.metadata + assert "created" in node.metadata + assert "added" in node.metadata + assert "modified" in node.metadata + assert "filename" in node.metadata + assert "storage_path" in node.metadata # None is fine; key must exist + assert "archive_serial_number" in node.metadata + ``` + +- [ ] **Step 2: Add a test that storage_path carries the name when set** + + Add a new test function after `test_build_document_node_structured_fields_in_metadata`: + + ```python + @pytest.mark.django_db + def test_build_document_node_storage_path_name_in_metadata() -> None: + """storage_path metadata value is the StoragePath name, not None, when set.""" + from documents.tests.factories import DocumentFactory, StoragePathFactory + + sp = StoragePathFactory(name="Finance/Bills") + doc = DocumentFactory(storage_path=sp) + + nodes = indexing.build_document_node(doc) + + assert len(nodes) > 0 + for node in nodes: + assert node.metadata["storage_path"] == "Finance/Bills" + ``` + +- [ ] **Step 3: Add a test that all three new fields are in `excluded_embed_metadata_keys`** + + Add after the previous test: + + ```python + @pytest.mark.django_db + def test_build_document_node_new_fields_excluded_from_embedding( + real_document: Document, + ) -> None: + """filename, storage_path, and archive_serial_number must not appear in embedding text.""" + from llama_index.core.schema import MetadataMode + + nodes = indexing.build_document_node(real_document) + assert len(nodes) > 0 + for node in 
nodes: + assert "filename" in node.excluded_embed_metadata_keys + assert "storage_path" in node.excluded_embed_metadata_keys + assert "archive_serial_number" in node.excluded_embed_metadata_keys + embed_text = node.get_content(metadata_mode=MetadataMode.EMBED) + assert "filename" not in embed_text + assert "storage_path" not in embed_text + assert "archive_serial_number" not in embed_text + ``` + +- [ ] **Step 4: Run the new tests to confirm they fail** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py::test_build_document_node_structured_fields_in_metadata src/paperless_ai/tests/test_ai_indexing.py::test_build_document_node_storage_path_name_in_metadata src/paperless_ai/tests/test_ai_indexing.py::test_build_document_node_new_fields_excluded_from_embedding -v" + ``` + + Expected: all `FAILED` — keys not yet in `node.metadata`. + +--- + +### Task 4: Pass — add the three fields to `build_document_node` + +**Files:** + +- Modify: `src/paperless_ai/indexing.py` + +- [ ] **Step 1: Update the `metadata` dict in `build_document_node`** + + Current metadata dict starts at line 106. 
Replace it: + + ```python + metadata = { + "document_id": str(document.id), + "title": document.title, + "filename": document.filename or "", + "storage_path": document.storage_path.name if document.storage_path else None, + "archive_serial_number": document.archive_serial_number, + "tags": [t.name for t in document.tags.all()], + "correspondent": document.correspondent.name + if document.correspondent + else None, + "document_type": document.document_type.name + if document.document_type + else None, + "created": document.created.isoformat() if document.created else None, + "added": document.added.isoformat() if document.added else None, + "modified": document.modified.isoformat(), + } + ``` + +- [ ] **Step 2: Update `excluded_embed_metadata_keys`** + + The `LlamaDocument(...)` call currently has: + + ```python + excluded_embed_metadata_keys=list(metadata.keys()), + ``` + + This already excludes all keys, so no change needed here — the new keys are automatically included since they're in the dict. Verify `excluded_llm_metadata_keys` still only excludes `"document_id"`: + + ```python + excluded_llm_metadata_keys=["document_id"], + ``` + + No change needed. + +- [ ] **Step 3: Run the failing tests to confirm they pass** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py::test_build_document_node_structured_fields_in_metadata src/paperless_ai/tests/test_ai_indexing.py::test_build_document_node_storage_path_name_in_metadata src/paperless_ai/tests/test_ai_indexing.py::test_build_document_node_new_fields_excluded_from_embedding -v" + ``` + + Expected: all `PASSED`. + +- [ ] **Step 4: Run the full indexing test module** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py -v" + ``` + + Expected: all green. 
+ +- [ ] **Step 5: Commit** + + ```bash + git add src/paperless_ai/indexing.py src/paperless_ai/tests/test_ai_indexing.py + git commit -m "feat(ai): add filename/storage_path/asn to node metadata" + ``` + +--- + +### Task 5: Fail — migration v2 is registered + +**Files:** + +- Modify: `src/paperless_ai/tests/test_vector_store.py` + +These tests use the real (non-mocked) `MIGRATIONS` list, so they go red until the migration is registered in Task 6. + +- [ ] **Step 1: Add a `TestMetadataEnrichmentMigration` class** + + Add this class near the end of `test_vector_store.py`, before the final `TestApplyStructuralMigrations`. The last two tests request a `uri` fixture — if the module only defines `uri` inside other test classes (as `TestSchemaVersioning` does), copy that fixture into this class as well: + + ```python + class TestMetadataEnrichmentMigration: + def test_current_schema_version_is_2(self) -> None: + from paperless_ai.vector_store import CURRENT_SCHEMA_VERSION + assert CURRENT_SCHEMA_VERSION == 2 + + def test_migration_v2_registered(self) -> None: + from paperless_ai.vector_store import MIGRATIONS + assert len(MIGRATIONS) == 1 + assert MIGRATIONS[0].version == 2 + assert MIGRATIONS[0].requires_reembed is True + + def test_store_at_v1_requires_reembed(self, uri: str) -> None: + store = _store_at_version(uri, 1) + assert store.requires_reembed_migration() is True + + def test_store_at_v2_no_pending_migrations(self, uri: str) -> None: + store = _store_at_version(uri, 2) + assert store.pending_migrations() == [] + assert store.requires_reembed_migration() is False + ``` + +- [ ] **Step 2: Run the tests to confirm they fail** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestMetadataEnrichmentMigration -v" + ``` + + Expected: the first three tests `FAILED` — `CURRENT_SCHEMA_VERSION` is still 1 and `MIGRATIONS` is still empty. `test_store_at_v2_no_pending_migrations` may already pass, because an empty registry has no pending migrations at any stored version.
+ +--- + +### Task 6: Pass — register migration v2 in `vector_store.py` + +**Files:** + +- Modify: `src/paperless_ai/vector_store.py` + +- [ ] **Step 1: Add the migration and bump the version constant** + + On the `feature-lancedb-schema-migrate` branch, `vector_store.py` has: + + ```python + CURRENT_SCHEMA_VERSION: Final[int] = 1 + ... + MIGRATIONS: list[Migration] = [] + ``` + + Change both: + + ```python + CURRENT_SCHEMA_VERSION: Final[int] = 2 + + MIGRATIONS: list[Migration] = [ + Migration( + version=2, + description="move filename/storage_path/asn from embedding text to metadata; rebuild required", + requires_reembed=True, + apply=lambda table: None, + ), + ] + ``` + +- [ ] **Step 2: Run the migration tests to confirm they pass** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py::TestMetadataEnrichmentMigration -v" + ``` + + Expected: all `PASSED`. + +- [ ] **Step 3: Run the full vector store test module** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_vector_store.py -v" + ``` + + Expected: all green. In particular, `TestSchemaVersioning::test_stored_schema_version_persists_after_reopen` and the `TestMigrationRegistry` tests should still pass — they use `CURRENT_SCHEMA_VERSION` as the baseline. 
+ +--- + +### Task 7: Integration — `update_llm_index` rebuilds when schema version is stale + +**Files:** + +- Modify: `src/paperless_ai/tests/test_ai_indexing.py` + +- [ ] **Step 1: Write the failing integration test** + + Add this test near `test_update_llm_index_rebuilds_on_model_name_change`: + + ```python + @pytest.mark.django_db + def test_update_llm_index_rebuilds_on_pending_reembed_migration( + temp_llm_index_dir: Path, + real_document: Document, + mock_embed_model: FakeEmbedding, + ) -> None: + """A stale schema version (v1) must trigger a full rebuild on the next index run.""" + from paperless_ai.vector_store import PaperlessLanceVectorStore + + # Build an initial index and then rewind the schema version to 1 to simulate + # an index created before migration v2 was registered. + indexing.update_llm_index(rebuild=True) + store = indexing.get_vector_store() + store._write_schema_version(1) + + # An incremental run (rebuild=False) must detect the stale version and rebuild. + with patch("documents.models.Document.objects.all") as mock_all: + mock_queryset = MagicMock() + mock_queryset.exists.return_value = True + mock_queryset.__iter__.return_value = iter([real_document]) + mock_all.return_value = mock_queryset + indexing.update_llm_index(rebuild=False) + + # After rebuild the schema version must be current. + reopened = PaperlessLanceVectorStore(uri=str(temp_llm_index_dir)) + assert reopened.stored_schema_version() == 2 + ``` + +- [ ] **Step 2: Run the test to confirm it fails** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py::test_update_llm_index_rebuilds_on_pending_reembed_migration -v" + ``` + + Expected: `FAILED` — schema version stays at 1 because migration v2 isn't registered yet. 
+ + _(This red run is only observable before Task 6: when the tasks are executed in order, migration v2 is already registered and the test passes on its first run. If it passes for any other reason — for example because `update_llm_index` detects a different condition — verify the assertion is actually exercising the migration path and not the model-name path.)_ + +- [ ] **Step 3: Run the test again now that migration v2 is registered (Task 6)** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py::test_update_llm_index_rebuilds_on_pending_reembed_migration -v" + ``` + + Expected: `PASSED`. + +- [ ] **Step 4: Run the full indexing test module** + + ``` + bash /c/Users/tholmes/Documents/Coding/paperless/vmtest.sh "src/paperless_ai/tests/test_ai_indexing.py -v" + ``` + + Expected: all green. + +- [ ] **Step 5: Final commit** + + ```bash + git add src/paperless_ai/vector_store.py src/paperless_ai/tests/test_vector_store.py src/paperless_ai/tests/test_ai_indexing.py + git commit -m "feat(ai): register schema migration v2; triggers rebuild for metadata enrichment" + ``` + +--- + +## Self-review checklist + +**Spec coverage:** + +- ✅ `build_llm_index_text` — three lines removed (Tasks 1–2) +- ✅ `build_document_node` — three fields added to metadata + excluded_embed_metadata_keys (Tasks 3–4) +- ✅ Migration v2 registered with `requires_reembed=True` and no-op apply (Tasks 5–6) +- ✅ `update_llm_index` triggers rebuild on stale schema (Task 7) +- ✅ Tests: `test_embedding.py`, `test_ai_indexing.py`, `test_vector_store.py` + +**Placeholder scan:** None found. Every step has exact code or exact commands. + +**Type consistency:** + +- `metadata` dict key names (`"filename"`, `"storage_path"`, `"archive_serial_number"`) used consistently across Tasks 3–4. +- `CURRENT_SCHEMA_VERSION = 2` and `MIGRATIONS[0].version == 2` are consistent across Tasks 5–6. +- `_store_at_version` and `_node` helpers referenced in Task 5 are defined in the existing `test_vector_store.py` on the `feature-lancedb-schema-migrate` branch.
diff --git a/docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md b/docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md index 08176cd6f..4cc4a9370 100644 --- a/docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md +++ b/docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md @@ -1,9 +1,11 @@ # AI Suggestions: Inject existing taxonomy as candidates -**Status:** Design (v2 — frequency-only) +**Status:** Design (v3 — RAG-sourced, node metadata) **Date:** 2026-05-20 +**Updated:** 2026-06-09 (v3: switch from frequency DB queries to node metadata from RAG retrieval) **Related:** [Discussion #12787](https://github.com/paperless-ngx/paperless-ngx/discussions/12787) **Branch target:** `dev` +**Depends on:** `2026-06-09-node-metadata-enrichment.md` (adds `storage_path`, `filename`, `asn` to node metadata; must land first) ## Problem @@ -19,21 +21,41 @@ Result: the LLM invents new metadata names that duplicate existing taxonomy entr Tell the LLM what already exists, so it can prefer existing names verbatim. Fuzzy matching becomes the fallback for typos and for legitimately novel suggestions, not the primary semantic-equivalence mechanism. -Non-goals: changing the LLM client, embedding model selection, or RAG retrieval. Replacing fuzzy matching entirely. Custom-field option values. Embedding-based shortlisting (deferred to a v2 if frequency proves insufficient). +Non-goals: changing the LLM client, embedding model selection, or RAG retrieval. Replacing fuzzy matching entirely. Custom-field option values. Frequency-based DB queries (superseded by RAG-sourced approach). ## Approach -For each of Tags, DocumentTypes, Correspondents, StoragePaths: +Hints are sourced from the LanceDB node metadata of the similar documents already retrieved for RAG context — no separate DB queries, no new user-facing configuration. 
The feature is **gated on `llm_embedding_backend`**: when no embedding backend is configured, no hints are built and today's behavior is unchanged. -1. Take the user-visible queryset (owner-aware, matching `matching.py`). -2. Annotate by document-usage count and take the top `X` names by frequency. `X` is configurable per category cap (single setting, applied to all four categories). -3. Inject those names into the LLM prompt as "Available " blocks, with the instruction to prefer them verbatim. -4. When the LLM responds, tell `matching.py` which names were hinted so an exact normalized match short-circuits past fuzzy. Names not in the hint list keep today's fuzzy fallback. +LanceDB nodes already store `tags`, `correspondent`, `document_type`, `title`, and date fields per document (see `indexing.py:build_document_node`). `storage_path` is not currently stored; the prerequisite spec (`2026-06-09-node-metadata-enrichment.md`) adds it, together with schema migration v2 (`requires_reembed=True`), which triggers a full index rebuild — this feature adds no migration of its own. -No FAISS index, no signals, no Celery tasks, no locks. Pure DB-side queries on each suggestion request. +For each suggestion request (when embedding backend is on): + +1. Run the ANN retrieval once → get raw `NodeWithScore` results. +2. Extract taxonomy from the node metadata: `tags` (list), `document_type`, `correspondent`, `storage_path`. +3. Inject the unique names into the LLM prompt as "Available {category}" blocks (one per non-empty category). +4. Pass the same name sets to `matching.py` as `hinted_names` so an exact normalized match short-circuits past fuzzy. + +When embedding backend is off → `hints = None` → prompt and matching are identical to today. ## Components +### `paperless_ai/indexing.py` (modify — `retrieve_similar_nodes`) + +Extract the shared retriever logic from `query_similar_documents` into a new lower-level function: + +```python +def retrieve_similar_nodes( + document: Document, + document_ids: Iterable[int | str] | None = None, + top_k: int = 5, +) -> list["NodeWithScore"]: + """Run ANN retrieval and return raw NodeWithScore results.""" + ...
+``` + +Refactor `query_similar_documents` to call `retrieve_similar_nodes` and convert to ORM objects (behavior unchanged). The taxonomy hints path calls `retrieve_similar_nodes` directly — no DB round-trip, no second ANN query. + ### `paperless_ai/taxonomy.py` (new) ```python @@ -43,23 +65,23 @@ class TaxonomyHints(TypedDict): correspondents: list[str] storage_paths: list[str] -def build_taxonomy_hints(document: Document, user: User | None) -> TaxonomyHints: ... +def build_taxonomy_hints_from_nodes(nodes: list["NodeWithScore"]) -> TaxonomyHints: ... +def get_taxonomy_hints_for_document(document: Document, user: User | None) -> TaxonomyHints | None: ... def format_hints_for_prompt(hints: TaxonomyHints) -> str: ... ``` -Internals: +`get_taxonomy_hints_for_document`: -- `_visible_queryset(model_cls, perm: str, user)` — wraps `get_objects_for_user_owner_aware` exactly as `matching.py` does. If `user` is `None`, returns the unfiltered manager queryset (parity with how `matching.py` behaves today). -- `_shortlist_by_frequency(queryset, max_per_category)` — DB-side: - ```python - return list( - queryset - .annotate(usage=Count("documents")) - .order_by("-usage", "name") - .values_list("name", flat=True)[:max_per_category] - ) - ``` - Confirmed reverse relation name is `documents` for all four models (`documents/models.py:164,173,184,211`). Secondary order by `name` keeps results stable when usage ties (common with 0-usage tails). `StoragePath` uses the human `name` field, not the `path` template. +- Returns `None` immediately if `AIConfig().llm_embedding_backend` is falsy. +- Applies the same owner-aware document ID filter as `get_context_for_document` (`get_objects_for_user_owner_aware(user, "view_document", Document)` when `user` is not `None`; unfiltered otherwise). +- Calls `retrieve_similar_nodes(document=document, document_ids=visible_document_ids)`. +- Passes results to `build_taxonomy_hints_from_nodes`. 
+ +`build_taxonomy_hints_from_nodes(nodes)`: + +- Extracts from each `node.metadata`: `tags` (list), `document_type` (str | None), `correspondent` (str | None), `storage_path` (str | None). +- Collects unique values across all nodes, sorted. Empty/`None` values skipped. +- Returns a `TaxonomyHints`. No cap — naturally bounded by `top_k=5` in retrieval. `format_hints_for_prompt` emits one `Available :` block per non-empty category. Empty categories produce no block (avoid prompting the LLM with "Available tags: (none)"). A single instruction line follows: @@ -70,22 +92,17 @@ if none of the existing names fits. ### `paperless_ai/ai_classifier.py` (modify) -> **Note (updated 2026-06-09):** Since this spec was written, two commits changed this file: -> -> - `27426c04b` (#12894) added `llm_output_language` to `AIConfig`, added a new `build_localization_prompt(suggestions, output_language)` function that runs _after_ the LLM call (post-classification localization step), and added `output_language: str | None = None` to `get_ai_document_classification`. -> - `eb292baa6` (#12944) switched the vector store to LanceDB (minor changes to this file). -> -> The current signatures are: +> **Note (updated 2026-06-09):** Current signatures after #12894 and #12944: > > - `build_prompt_without_rag(document: Document, config: AIConfig) -> str` > - `build_prompt_with_rag(document: Document, config: AIConfig, user: User | None = None) -> str` > - `get_ai_document_classification(document, user, output_language: str | None = None) -> dict` > -> `build_localization_prompt` is a separate downstream step and does **not** interact with taxonomy hints — hints inject into the base prompt only, before the LLM call. +> `build_localization_prompt` (added in #12894) runs after the LLM call and does **not** interact with taxonomy hints — hints inject into the base prompt only, before the LLM call. 
-Current signatures already take `config: AIConfig`; no `user` addition is needed in `build_prompt_without_rag` (the view owns hint construction). Both prompt builders accept a new optional `hints: TaxonomyHints | None = None` parameter. When non-`None`, `format_hints_for_prompt(hints)` is spliced in before the "Analyze the following document" instruction. When `None` (default), the prompt is built as today. +Both `build_prompt_without_rag` and `build_prompt_with_rag` accept a new optional `hints: TaxonomyHints | None = None` parameter. When non-`None`, `format_hints_for_prompt(hints)` is spliced in before the "Analyze the following document" instruction. When `None` (default), the prompt is built as today. -`get_ai_document_classification(document, user, output_language: str | None = None, hints: TaxonomyHints | None = None)` accepts the same optional `hints` and forwards it to the prompt builder. Return shape is **unchanged** (`dict`). The view layer owns hint construction so the same `TaxonomyHints` object can be used both for the prompt and for `hinted_names` in matching — no need to thread it back out of the classifier. Callers in tests pass `hints=None` (or omit) to preserve existing behavior. +`get_ai_document_classification(document, user, output_language: str | None = None, hints: TaxonomyHints | None = None)` accepts the same optional `hints` and forwards it to the prompt builder. Return shape **unchanged** (`dict`). Callers in tests pass `hints=None` (or omit) to preserve existing behavior. ### `paperless_ai/matching.py` (modify) @@ -98,82 +115,61 @@ Current signatures already take `config: AIConfig`; no `user` addition is needed ### `documents/views.py` (modify) -The suggestion endpoint (around line 1482) is the single production caller of `get_ai_document_classification` and the call site for `match_*_by_name`. 
Update it to: +The suggestion endpoint (around line 1498) is the single production caller of `get_ai_document_classification` and the call site for `match_*_by_name`. Update it to: -1. Build hints once: `hints = build_taxonomy_hints(document, request.user)` (when `AIConfig().taxonomy_hints_enabled` and `max_per_category > 0`; otherwise `hints = None`). -2. Pass `hints` into the classifier: `parsed = get_ai_document_classification(document, request.user, output_language, hints=hints)` — `output_language` is already resolved at this point (added in #12894, `views.py:1472`). +1. Build hints: `hints = get_taxonomy_hints_for_document(doc, request.user)` — returns `None` when embedding backend is off; no additional config check needed in the view. +2. Pass `hints` into the classifier: `parsed = get_ai_document_classification(doc, request.user, output_language, hints=hints)` — `output_language` is already resolved at this point (`views.py:1472`). 3. Pass `hinted_names=set(hints["tags"])` (etc., one per category, or `None` when `hints` is `None`) into each `match_*_by_name` call. -**Cache interaction:** the AI suggestion path is wrapped by `cached_llm_suggestions` / `refresh_suggestions_cache` (views.py:1477). A cached response bypasses the LLM call entirely — so changes to hints config don't take effect until the cache entry is invalidated. Acceptable for v1 (cache is short-lived). If experience shows users change the toggle and expect immediate effect, follow up by including a hash of the hint-relevant config (`taxonomy_hints_enabled`, `_max`) in the cache key. +**Cache interaction:** the AI suggestion path is wrapped by `cached_llm_suggestions` / `refresh_suggestions_cache` (views.py:1488). A cached response bypasses both the LLM call and hint construction entirely. Acceptable for v1. 
-### `paperless/config.py` (`AIConfig`) + DB model + settings +### No `AIConfig` / DB model / settings changes -`AIConfig.__post_init__` reads values from the `ApplicationConfiguration` DB row **and** falls back to `settings.*` constants (pattern at `paperless/config.py:207` for `ai_enabled`). Both layers are needed. - -Two new fields, threaded through three places: - -1. **`paperless/settings/*.py`** — add module-level constants read from env: - - `AI_TAXONOMY_HINTS: bool = __get_boolean("PAPERLESS_AI_TAXONOMY_HINTS", "yes")` (default on) - - `AI_TAXONOMY_HINTS_MAX: int = int(os.getenv("PAPERLESS_AI_TAXONOMY_HINTS_MAX", "30"))` - -2. **`paperless/models.py` (`ApplicationConfiguration`)** — add two nullable columns: - - `taxonomy_hints_enabled = models.BooleanField(null=True)` - - `taxonomy_hints_max_per_category = models.PositiveSmallIntegerField(null=True)` (range 0–32767; `PositiveSmallIntegerField` is sufficient) - - One Django migration. - -3. **`paperless/config.py` (`AIConfig`)** — read with **explicit None check, not `or`** (because `0` and `False` are legitimate user values that would otherwise silently fall back to the settings default): - ```python - self.taxonomy_hints_enabled = ( - app_config.taxonomy_hints_enabled - if app_config.taxonomy_hints_enabled is not None - else settings.AI_TAXONOMY_HINTS - ) - self.taxonomy_hints_max_per_category = ( - app_config.taxonomy_hints_max_per_category - if app_config.taxonomy_hints_max_per_category is not None - else settings.AI_TAXONOMY_HINTS_MAX - ) - ``` - (Other fields in this file use `or`; we deliberately diverge here to support `0` and `False`. A short comment in code records why.) - -**Frontend** (`src-ui/src/app/data/paperless-config.ts`): add two entries to the `PaperlessConfigOptions` declarative list (one `Boolean`, one `Number`, `category: ConfigCategory.AI`) plus two fields on the `PaperlessConfig` interface. No component changes; the form is generated from this list. 
- -`paperless.conf.example` and the configuration docs page get entries. +No new configuration fields, DB columns, Django migrations, env vars, or frontend changes. The feature is automatically active for users who have an embedding backend configured and invisible to everyone else. ## Data flow -Suggestion request: +Suggestion request (embedding backend on): -1. View checks `AIConfig().taxonomy_hints_enabled`; if enabled, calls `hints = build_taxonomy_hints(document, user)`; otherwise `hints = None`. -2. View calls `parsed = get_ai_document_classification(document, user, hints=hints)`. -3. Classifier splices `format_hints_for_prompt(hints)` into the prompt (when non-`None`), calls LLM, returns parsed dict. -4. View calls `match_*_by_name(names, user, hinted_names=set(hints[]) if hints else None)` per category. Exact-on-hint short-circuit; fuzzy fallback unchanged for misses. +1. View calls `get_taxonomy_hints_for_document(doc, user)` → `retrieve_similar_nodes` → extract metadata → `TaxonomyHints`. +2. View calls `get_ai_document_classification(doc, user, output_language, hints=hints)`. +3. Classifier builds RAG prompt via `build_prompt_with_rag` (internally calls `query_similar_documents` → `retrieve_similar_nodes` for context text) + splices hints block → LLM → parsed dict. +4. View calls `match_*_by_name(names, user, hinted_names=set(hints[category]))` per category. -No background processing. No persisted state. Each suggestion request runs four lightweight `Count("documents")` queries (could be combined into a single query per model via `.annotate().order_by().values_list()`, no joins beyond the existing reverse relation). +Suggestion request (embedding backend off): + +- `get_taxonomy_hints_for_document` returns `None` immediately (no retrieval runs). +- Rest of the flow identical to today.
+ +**Note on retrieval calls:** `retrieve_similar_nodes` is called once directly (for hints) and once indirectly via `build_prompt_with_rag` → `get_context_for_document` → `query_similar_documents`. Both calls use identical parameters. Acceptable for v1; can be eliminated later by lifting `retrieve_similar_nodes` up to `get_ai_document_classification` and threading results to both callers. ## Error handling -- **Empty visible queryset for a category:** omit that category's block from the prompt. -- **`taxonomy_hints_enabled = False` or `max_per_category = 0`:** `build_taxonomy_hints` returns an empty `TaxonomyHints`; prompt is identical to today; matching is called without `hinted_names`; behavior identical to today. -- **LLM returns a name not in hints but exactly matching an existing visible name:** still treated as exact match. `_match_names_to_queryset` always tries exact-on-full-queryset before fuzzy; `hinted_names` only governs whether fuzzy is consulted for that specific name. -- **DB query failure during shortlist build:** propagate. Suggestion failures already surface as 5xx; adding silent fallbacks here would mask real problems. +- **Embedding backend off:** `get_taxonomy_hints_for_document` returns `None`; no hints; behavior identical to today. +- **No similar documents found:** `build_taxonomy_hints_from_nodes([])` returns all-empty `TaxonomyHints`; `format_hints_for_prompt` produces no blocks; effectively `hints = None`. +- **Node missing `storage_path` key** (index predates the metadata enrichment prerequisite): `node.metadata.get("storage_path")` returns `None`; skipped gracefully. Storage path hints absent until rebuild completes. +- **LLM returns a name not in hints but exactly matching an existing visible name:** still treated as exact match — `_match_names_to_queryset` always tries exact-on-full-queryset before fuzzy. +- **Retrieval failure:** propagates; suggestion failures already surface as 5xx. 
## Testing -All new and modified tests use pytest style — functions/classes, no `unittest.TestCase` subclasses; `pytest-django` with per-class `@pytest.mark.django_db`; `pytest-mock`'s `mocker` fixture for patching; every fixture parameter, fixture return, and test signature type-annotated. Tests grouped under classes (`class TestBuildTaxonomyHints:`), not flat free functions. Shared fixtures live in `paperless_ai/tests/conftest.py`. Format with `ruff` directly (not `uv run ruff`). +All tests use pytest style — grouped under classes, `@pytest.mark.django_db` on the class, `pytest-mock`'s `mocker` fixture, every fixture parameter/return/test signature type-annotated. Format with `ruff` directly (not `uv run ruff`). ### `paperless_ai/tests/test_taxonomy.py` (new) -- `class TestBuildTaxonomyHints:` +- `class TestBuildTaxonomyHintsFromNodes:` - Returns a `TaxonomyHints` with all four keys. - - Top-K limit respected (`max_per_category` honored from `AIConfig`). - - Frequency ordering: tag used on 5 docs ranks above tag used on 2 docs. - - Tie-break by name (alphabetical) for stable output. - - Owner-aware: user lacking `view_tag` perm gets `tags=[]`; `view_documenttype` likewise per category. - - Empty queryset for a category → empty list; `format_hints_for_prompt` omits the block. - - `taxonomy_hints_enabled=False` returns zero-filled `TaxonomyHints` and runs no taxonomy DB queries (`django_assert_num_queries`). - - `max_per_category=0` same behavior as disabled. - - `StoragePath` shortlist uses the `name` field, not `path` template (asserted on returned values). + - Deduplicates tag names shared across multiple nodes. + - `None` values in node metadata skipped gracefully. + - Missing `storage_path` key in metadata handled gracefully (pre-migration nodes). + - Empty node list → all-empty `TaxonomyHints`. + - Sorted output is stable across calls. 
+ +- `class TestGetTaxonomyHintsForDocument:` + - Returns `None` when `AIConfig().llm_embedding_backend` is falsy; `retrieve_similar_nodes` not called (`mocker.spy`). + - Calls `retrieve_similar_nodes` with owner-aware document ID filter when user is provided. + - Returns populated `TaxonomyHints` when nodes are found. + - Returns all-empty `TaxonomyHints` (not `None`) when `retrieve_similar_nodes` returns `[]`. - `class TestFormatHintsForPrompt:` - All four blocks present when all categories non-empty. @@ -184,40 +180,34 @@ All new and modified tests use pytest style — functions/classes, no `unittest. ### `paperless_ai/tests/test_ai_classifier.py` (extend) - `class TestBuildPrompt:` - - `build_prompt_without_rag(doc, user)` now accepts `user`; produces a prompt containing the hints block when hints are non-empty. - - `build_prompt_with_rag(doc, user)` includes both the RAG context block (unchanged) and the hints block. - - `taxonomy_hints_enabled=False`: prompt matches today's baseline (string equality against a fixture). - - `get_ai_document_classification(doc, user, hints=...)` forwards hints into the prompt; return shape unchanged (still `dict`). + - `build_prompt_without_rag(doc, config, hints=hints)` produces a prompt containing the hints block when hints are non-empty. + - `build_prompt_with_rag(doc, config, user, hints=hints)` includes both the RAG context block (unchanged) and the hints block. + - `hints=None`: prompt matches today's baseline (string equality against a fixture). + - `get_ai_document_classification(doc, user, hints=...)` forwards hints into the prompt; return shape unchanged. ### `paperless_ai/tests/test_matching.py` (extend) - `class TestHintedMatching:` - - LLM returns `"Bloodwork"` verbatim, `hinted_names={"Bloodwork", ...}` → exact match returned; `difflib.get_close_matches` not called (`mocker.spy` on `difflib.get_close_matches`). 
+ - LLM returns `"Bloodwork"` verbatim, `hinted_names={"Bloodwork"}` → exact match returned; `difflib.get_close_matches` not called (`mocker.spy`). - LLM returns `"blood test"` not in `hinted_names`, no existing exact → fuzzy fallback runs; behavior unchanged from today (regression guard). - - LLM returns `"Bloodwork "` (whitespace) with hinted_names containing `"Bloodwork"` → normalized exact match wins, fuzzy not consulted. - - Backward compatibility: `match_tags_by_name(names, user)` without the kwarg behaves identically to today (snapshot of an existing test, parameterized). - -Markers: no `live` marker needed. + - LLM returns `"Bloodwork "` (whitespace) with `hinted_names={"Bloodwork"}` → normalized exact match wins, fuzzy not consulted. + - Backward compatibility: `match_tags_by_name(names, user)` without the kwarg behaves identically to today. ## Migration / rollout -- One Django migration adding two columns to `ApplicationConfiguration` (`taxonomy_hints_enabled BooleanField`, `taxonomy_hints_max_per_category PositiveSmallIntegerField`). Both nullable with sensible defaults so existing rows aren't broken. -- Feature defaults to on for new and existing installs. Set `PAPERLESS_AI_TAXONOMY_HINTS=false` (or via the Application Configuration UI) to restore today's behavior. -- Frontend admin form updated to expose the two fields under the existing AI section. +No migration in this feature. The prerequisite spec (`2026-06-09-node-metadata-enrichment.md`) handles the LanceDB schema migration (v2, `requires_reembed=True`) and the resulting index rebuild. Once that lands, `storage_path` is in every node's metadata and this feature needs no additional migration steps. -## Open questions deferred to implementation - -- `paperless_ai/tests/conftest.py` already exists — verify fixture-naming conventions match before adding new fixtures. -- Confirm `parse_ai_response` doesn't need to know about hints (it's a pure parser; hints flow alongside, not through it). 
-- The view layer applying `hinted_names` needs to read the same `AIConfig` instance the classifier used; pass the `TaxonomyHints` through the response tuple (chosen) rather than re-deriving in the view. +No Django migration. No new config. Users with an embedding backend get taxonomy hints automatically once both specs are shipped; users without one see no change. ## Interplay with `extract_unmatched_names` -`extract_unmatched_names` (used downstream of matching) surfaces LLM-returned names that didn't match any existing taxonomy entry — the UI uses these to offer "create new tag?" affordances. With hints in place, fewer names will be unmatched, which is the desired outcome. No behavior change is required: a hinted name that the LLM repeats verbatim will exact-match and not appear in the unmatched list; a name the LLM invents anyway (despite the hint instruction) still flows through fuzzy and, if no match, surfaces as "new" exactly as today. Out of scope: filtering unmatched results based on what was in the hint set. +`extract_unmatched_names` surfaces LLM-returned names that didn't match any existing taxonomy entry — the UI uses these to offer "create new tag?" affordances. With hints in place, fewer names will be unmatched. No behavior change required: a hinted name the LLM returns verbatim will exact-match and not appear in the unmatched list; a name the LLM invents anyway still flows through fuzzy and, if no match, surfaces as "new" exactly as today. ## Out of scope (potential v2) -- Embedding-based shortlisting (for users with very large taxonomies where frequency misses the right tag). Would re-introduce a FAISS-shaped subsystem with signals, debounce, and locks. Defer until evidence frequency is insufficient. -- Tag hierarchy awareness — hinting `Medical/Bloodwork` vs `Bloodwork` when tags are nested. +- Capping hint list length per category (currently unbounded within `top_k=5` retrieved nodes; revisit if prompt length becomes a concern). 
+- Eliminating the double `retrieve_similar_nodes` call by threading nodes through `get_ai_document_classification`. +- Frequency-based hints as a fallback for users without an embedding backend. +- Structured output / JSON schema enum constraints as an alternative to prompt injection. +- Tag hierarchy awareness. - Custom field option values. -- `StoragePath` template-expression hinting (vs raw `name`). diff --git a/docs/superpowers/specs/2026-06-09-node-metadata-enrichment.md b/docs/superpowers/specs/2026-06-09-node-metadata-enrichment.md new file mode 100644 index 000000000..d0c5ad502 --- /dev/null +++ b/docs/superpowers/specs/2026-06-09-node-metadata-enrichment.md @@ -0,0 +1,115 @@ +# LanceDB Node Metadata Enrichment + +**Status:** Design +**Date:** 2026-06-09 +**Branch target:** `dev` +**Prerequisite for:** AI taxonomy hints (`2026-05-20-ai-taxonomy-hints-design.md`) +**Depends on:** `feature-lancedb-schema-migrate` + +## Problem + +`build_llm_index_text` currently includes three short structured values in the embedding text: + +```python +lines = [ + f"Filename: {doc.filename}", + f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}", + f"Archive Serial Number: {doc.archive_serial_number or ''}", + ... +] +``` + +These don't belong in the embedding. The embedding should capture semantic content — the meaning of the document — not structured identifiers. Including them means vectors are partly "polluted" with filing metadata, making similarity search less accurate. The existing TODO in `embedding.py:115` explicitly calls this out. + +The right home for structured values is `node.metadata` (excluded from the embedding, but surfaced to the LLM when nodes are retrieved as context). `title`, `tags`, `correspondent`, and `document_type` already follow this pattern. + +Notes and custom fields stay in the embedding text — Notes is long free text, custom fields are dynamic and their semantic content belongs in the vector. 
+ +## Changes + +### `paperless_ai/embedding.py` — `build_llm_index_text` + +Remove the three lines and the TODO comment: + +```python +# remove: +f"Filename: {doc.filename}", +f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}", +f"Archive Serial Number: {doc.archive_serial_number or ''}", +``` + +`Notes` and `Custom Fields` lines remain. + +### `paperless_ai/indexing.py` — `build_document_node` + +Add the three fields to the metadata dict: + +```python +metadata = { + "document_id": str(document.id), + "title": document.title, + "filename": document.filename or "", + "storage_path": document.storage_path.name if document.storage_path else None, + "archive_serial_number": document.archive_serial_number, + "tags": [t.name for t in document.tags.all()], + "correspondent": document.correspondent.name if document.correspondent else None, + "document_type": document.document_type.name if document.document_type else None, + "created": document.created.isoformat() if document.created else None, + "added": document.added.isoformat() if document.added else None, + "modified": document.modified.isoformat(), +} +``` + +All three new keys must also appear in `excluded_embed_metadata_keys` (consistent with all existing keys — none of the metadata is included in the embedding text). + +### `paperless_ai/vector_store.py` — schema migration + +Register migration version 2 on the `feature-lancedb-schema-migrate` framework. The embedding text changes, so all existing vectors are stale — a full rebuild is required. The migration's `apply` is a no-op; the rebuild handles regenerating all nodes with the correct metadata. 
+ +```python +MIGRATIONS: list[Migration] = [ + Migration( + version=2, + description="move filename/storage_path/asn from embedding text to metadata", + requires_reembed=True, + apply=lambda table: None, + ), +] +CURRENT_SCHEMA_VERSION: Final[int] = 2 +``` + +On the next `update_llm_index` run, `requires_reembed_migration()` returns `True`, triggering a full drop-and-rebuild. All new nodes carry the three metadata fields. No manual intervention required. + +## Impact + +- Similarity search quality improves slightly — vectors are more purely semantic. +- The LLM receives `filename`, `storage_path`, and `archive_serial_number` as structured metadata alongside retrieved chunks, rather than embedded in the chunk text. Same information, cleaner separation. +- One forced index rebuild on upgrade (acceptable while the feature is in beta). +- `node.metadata["storage_path"]`, `node.metadata["filename"]`, `node.metadata["archive_serial_number"]` are available on all retrieved nodes after rebuild — unblocks the taxonomy hints feature. + +## Testing + +All tests use pytest style — grouped under classes, `@pytest.mark.django_db` on the class, `pytest-mock`'s `mocker` fixture, every fixture and test signature type-annotated. Format with `ruff` directly. + +### `paperless_ai/tests/test_embedding.py` (modify) + +- `class TestBuildLlmIndexText:` + - Assert `"Filename:"` is **not** in the output. + - Assert `"Storage Path:"` is **not** in the output. + - Assert `"Archive Serial Number:"` is **not** in the output. + - Assert Notes and Custom Fields lines are still present (regression guard). + +### `paperless_ai/tests/test_ai_indexing.py` (modify) + +- `class TestBuildDocumentNode:` + - `filename` is in `node.metadata` and in `excluded_embed_metadata_keys`. + - `storage_path` is in `node.metadata` (name string) and in `excluded_embed_metadata_keys`; `None` when document has no storage path. + - `archive_serial_number` is in `node.metadata` and in `excluded_embed_metadata_keys`; `None` when unset.
+ - None of the three appear in the embedding text produced for the node. + +### `paperless_ai/tests/test_vector_store.py` (modify) + +- `class TestSchemaMigrations:` + - `pending_migrations()` returns the v2 migration when stored version is 1. + - `requires_reembed_migration()` returns `True` when stored version is 1. + - `apply_structural_migrations()` does not apply the v2 migration in place (entries with `requires_reembed=True` are left to the rebuild path).