From da02f3ef2d633505c97f8a7ea71c7137f1a9f4e6 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:09:26 -0700 Subject: [PATCH] Storing more ideas/plans --- ...026-05-11-ai-taxonomy-candidates-design.md | 695 +++++++ .../plans/2026-06-10-sqlite-vec-transition.md | 1669 +++++++++++++++++ .../2026-06-11-unicode-nfc-normalization.md | 462 +++++ .../2026-05-15-scheduled-backup-design.md | 225 +++ ...-05-26-interactive-shell-contenv-design.md | 81 + ...06-10-llmindex-schema-migrations-design.md | 138 ++ ...26-06-10-sqlite-vec-vector-store-design.md | 155 ++ 7 files changed, 3425 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md create mode 100644 docs/superpowers/plans/2026-06-10-sqlite-vec-transition.md create mode 100644 docs/superpowers/plans/2026-06-11-unicode-nfc-normalization.md create mode 100644 docs/superpowers/specs/2026-05-15-scheduled-backup-design.md create mode 100644 docs/superpowers/specs/2026-05-26-interactive-shell-contenv-design.md create mode 100644 docs/superpowers/specs/2026-06-10-llmindex-schema-migrations-design.md create mode 100644 docs/superpowers/specs/2026-06-10-sqlite-vec-vector-store-design.md diff --git a/docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md b/docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md new file mode 100644 index 000000000..fb6efd594 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md @@ -0,0 +1,695 @@ +# AI Taxonomy Candidate Injection Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Inject the user's existing taxonomy (tags, correspondents, document types, storage paths) as candidates into the LLM prompt so it prefers exact existing names over inventing new ones. + +**Architecture:** A new `get_taxonomy_candidates(user)` helper fetches each category permission-filtered to the requesting user, annotated with document-count for frequency ordering, and capped at 200 per category. A private `_format_candidates_section` helper renders the candidate lists into a prompt appendix. `build_prompt_without_rag` and `build_prompt_with_rag` each gain an optional `candidates` parameter. `get_ai_document_classification` wires it all together — fetch candidates then pass them to the prompt builder. No changes to the view, matching layer, or response format. + +**Tech Stack:** Django ORM (`annotate`, `Count`), `get_objects_for_user_owner_aware` (already used in `matching.py`), pytest + `unittest.mock` + +--- + +## File Map + +- **Modify:** `src/paperless_ai/ai_classifier.py` + - Add constant `TAXONOMY_CANDIDATE_LIMIT = 200` + - Add `get_taxonomy_candidates(user)` helper + - Add `_format_candidates_section(candidates)` helper + - Update `build_prompt_without_rag` signature and body + - Update `build_prompt_with_rag` signature and body + - Update `get_ai_document_classification` body +- **Create:** `src/paperless_ai/tests/test_taxonomy_candidates.py` + - All new tests for the above + +--- + +### Task 1: `get_taxonomy_candidates` — tests + implementation + +**Files:** + +- Modify: `src/paperless_ai/ai_classifier.py` +- Create: `src/paperless_ai/tests/test_taxonomy_candidates.py` + +- [ ] **Step 1: Write the failing tests** + +Create `src/paperless_ai/tests/test_taxonomy_candidates.py`: + +```python +import pytest +from unittest.mock import patch + +from django.contrib.auth.models import User + +from documents.models import Correspondent +from documents.models import Document +from documents.models import DocumentType +from documents.models import 
StoragePath +from documents.models import Tag +from paperless_ai.ai_classifier import TAXONOMY_CANDIDATE_LIMIT +from paperless_ai.ai_classifier import get_taxonomy_candidates + + +def test_get_taxonomy_candidates_returns_none_for_none_user(): + assert get_taxonomy_candidates(None) is None + + +@pytest.mark.django_db +class TestGetTaxonomyCandidates: + def test_returns_dict_with_four_keys(self): + user = User.objects.create_user(username="tc_user1", password="x") + with patch( + "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", + ) as mock_get: + mock_get.side_effect = [ + Tag.objects.none(), + Correspondent.objects.none(), + DocumentType.objects.none(), + StoragePath.objects.none(), + ] + result = get_taxonomy_candidates(user) + assert result is not None + assert set(result.keys()) == { + "tags", + "correspondents", + "document_types", + "storage_paths", + } + + def test_returns_names_as_strings(self): + user = User.objects.create_user(username="tc_user2", password="x") + tag = Tag.objects.create(name="Bloodwork") + with patch( + "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", + ) as mock_get: + mock_get.side_effect = [ + Tag.objects.filter(pk=tag.pk), + Correspondent.objects.none(), + DocumentType.objects.none(), + StoragePath.objects.none(), + ] + result = get_taxonomy_candidates(user) + assert result["tags"] == ["Bloodwork"] + + def test_orders_tags_by_document_count_descending(self): + user = User.objects.create_user(username="tc_user3", password="x") + tag_low = Tag.objects.create(name="LowUse") + tag_high = Tag.objects.create(name="HighUse") + + doc1 = Document.objects.create(mime_type="text/plain", checksum="tc_doc1") + doc2 = Document.objects.create(mime_type="text/plain", checksum="tc_doc2") + doc3 = Document.objects.create(mime_type="text/plain", checksum="tc_doc3") + doc1.tags.add(tag_high) + doc2.tags.add(tag_high) + doc3.tags.add(tag_low) + + with patch( + "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", + ) as 
mock_get: + mock_get.side_effect = [ + Tag.objects.filter(pk__in=[tag_low.pk, tag_high.pk]), + Correspondent.objects.none(), + DocumentType.objects.none(), + StoragePath.objects.none(), + ] + result = get_taxonomy_candidates(user) + + assert result["tags"] == ["HighUse", "LowUse"] + + def test_caps_results_at_taxonomy_candidate_limit(self): + user = User.objects.create_user(username="tc_user4", password="x") + tags = [Tag.objects.create(name=f"Tag{i}") for i in range(5)] + + with ( + patch( + "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", + ) as mock_get, + patch("paperless_ai.ai_classifier.TAXONOMY_CANDIDATE_LIMIT", 3), + ): + mock_get.side_effect = [ + Tag.objects.filter(pk__in=[t.pk for t in tags]), + Correspondent.objects.none(), + DocumentType.objects.none(), + StoragePath.objects.none(), + ] + result = get_taxonomy_candidates(user) + + assert len(result["tags"]) == 3 + + def test_all_four_categories_are_fetched(self): + user = User.objects.create_user(username="tc_user5", password="x") + tag = Tag.objects.create(name="MyTag") + corr = Correspondent.objects.create(name="MyCorr") + dt = DocumentType.objects.create(name="MyType") + sp = StoragePath.objects.create(name="MyPath", path="/my/path") + + with patch( + "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", + ) as mock_get: + mock_get.side_effect = [ + Tag.objects.filter(pk=tag.pk), + Correspondent.objects.filter(pk=corr.pk), + DocumentType.objects.filter(pk=dt.pk), + StoragePath.objects.filter(pk=sp.pk), + ] + result = get_taxonomy_candidates(user) + + assert result["tags"] == ["MyTag"] + assert result["correspondents"] == ["MyCorr"] + assert result["document_types"] == ["MyType"] + assert result["storage_paths"] == ["MyPath"] +``` + +- [ ] **Step 2: Run to confirm they all fail** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: `ImportError` or `FAILED` — `get_taxonomy_candidates` 
does not exist yet. + +- [ ] **Step 3: Add the implementation to `ai_classifier.py`** + +At the top of `src/paperless_ai/ai_classifier.py`, add new imports after the existing ones: + +```python +from django.db.models import Count + +from documents.models import Correspondent +from documents.models import DocumentType +from documents.models import StoragePath +from documents.models import Tag +from documents.permissions import get_objects_for_user_owner_aware +``` + +Add the constant and helper right after the `logger` line: + +```python +TAXONOMY_CANDIDATE_LIMIT = 200 + + +def get_taxonomy_candidates(user: User | None) -> dict[str, list[str]] | None: + if user is None: + return None + + tags = ( + get_objects_for_user_owner_aware(user, ["view_tag"], Tag) + .annotate(doc_count=Count("documents")) + .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] + ) + correspondents = ( + get_objects_for_user_owner_aware(user, ["view_correspondent"], Correspondent) + .annotate(doc_count=Count("documents")) + .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] + ) + document_types = ( + get_objects_for_user_owner_aware(user, ["view_documenttype"], DocumentType) + .annotate(doc_count=Count("documents")) + .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] + ) + storage_paths = ( + get_objects_for_user_owner_aware(user, ["view_storagepath"], StoragePath) + .annotate(doc_count=Count("documents")) + .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] + ) + + return { + "tags": [t.name for t in tags], + "correspondents": [c.name for c in correspondents], + "document_types": [d.name for d in document_types], + "storage_paths": [s.name for s in storage_paths], + } +``` + +- [ ] **Step 4: Run to confirm Task 1 tests pass** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: all 6 tests `PASSED`. 
+ +- [ ] **Step 5: Confirm existing AI classifier tests still pass** + +```bash +cd src && uv run pytest paperless_ai/tests/test_ai_classifier.py --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: all tests `PASSED`. + +- [ ] **Step 6: Commit** + +```bash +git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py +git commit -m "feat: add get_taxonomy_candidates helper with frequency ordering and cap" +``` + +--- + +### Task 2: Prompt injection — `_format_candidates_section` + `build_prompt_without_rag` + +**Files:** + +- Modify: `src/paperless_ai/ai_classifier.py` +- Modify: `src/paperless_ai/tests/test_taxonomy_candidates.py` + +- [ ] **Step 1: Write the failing tests** + +Append to `src/paperless_ai/tests/test_taxonomy_candidates.py`: + +```python +from unittest.mock import MagicMock + +from paperless_ai.ai_classifier import build_prompt_without_rag + + +@pytest.fixture +def mock_doc(): + doc = MagicMock(spec=Document) + doc.filename = "invoice.pdf" + doc.content = "Some document content." + return doc + + +class TestBuildPromptWithoutRag: + def test_no_candidates_section_when_candidates_is_none(self, mock_doc): + prompt = build_prompt_without_rag(mock_doc, candidates=None) + assert "Existing metadata" not in prompt + + def test_no_candidates_section_when_candidates_is_empty_dict(self, mock_doc): + prompt = build_prompt_without_rag(mock_doc, candidates={}) + assert "Existing metadata" not in prompt + + def test_candidates_section_present_when_provided(self, mock_doc): + candidates = { + "tags": ["Bloodwork", "Insurance"], + "correspondents": ["Dr. Smith"], + "document_types": [], + "storage_paths": [], + } + prompt = build_prompt_without_rag(mock_doc, candidates=candidates) + assert "Existing metadata" in prompt + assert "Bloodwork" in prompt + assert "Dr. 
Smith" in prompt + + def test_empty_categories_omitted_from_section(self, mock_doc): + candidates = { + "tags": ["Bloodwork"], + "correspondents": [], + "document_types": [], + "storage_paths": [], + } + prompt = build_prompt_without_rag(mock_doc, candidates=candidates) + assert "Correspondents:" not in prompt + assert "Document types:" not in prompt + assert "Storage paths:" not in prompt + + def test_existing_prompt_content_preserved(self, mock_doc): + prompt = build_prompt_without_rag(mock_doc, candidates=None) + assert "invoice.pdf" in prompt + assert "Some document content." in prompt +``` + +- [ ] **Step 2: Run to confirm they fail** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithoutRag --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: `FAILED` — `build_prompt_without_rag` doesn't accept `candidates` yet. + +- [ ] **Step 3: Add `_format_candidates_section` and update `build_prompt_without_rag` in `ai_classifier.py`** + +Add `_format_candidates_section` immediately after `get_taxonomy_candidates`: + +```python +def _format_candidates_section(candidates: dict[str, list[str]]) -> str: + lines = [ + "Existing metadata (use exact names where they fit; suggest new ones only if nothing matches):", + ] + for key, label in [ + ("tags", "Tags"), + ("correspondents", "Correspondents"), + ("document_types", "Document types"), + ("storage_paths", "Storage paths"), + ]: + names = candidates.get(key, []) + if names: + lines.append(f"{label}: {', '.join(names)}") + return "\n".join(lines) +``` + +Replace the existing `build_prompt_without_rag`: + +```python +def build_prompt_without_rag( + document: Document, + candidates: dict[str, list[str]] | None = None, +) -> str: + filename = document.filename or "" + content = truncate_content(document.content[:4000] or "") + + prompt = f""" + You are a document classification assistant. 
+ + Analyze the following document and extract the following information: + - A short descriptive title + - Tags that reflect the content + - Names of people or organizations mentioned + - The type or category of the document + - Suggested folder paths for storing the document + - Up to 3 relevant dates in YYYY-MM-DD format + + Filename: + {filename} + + Content: + {content} + """.strip() + + if candidates: + prompt += "\n\n" + _format_candidates_section(candidates) + + return prompt +``` + +- [ ] **Step 4: Run to confirm Task 2 tests pass** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithoutRag --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: all 5 tests `PASSED`. + +- [ ] **Step 5: Run full test file to check no regressions** + +```bash +cd src && uv run pytest paperless_ai/tests/ --override-ini="addopts=" -v 2>&1 | tail -30 +``` + +Expected: all tests `PASSED`. + +- [ ] **Step 6: Commit** + +```bash +git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py +git commit -m "feat: inject taxonomy candidates into build_prompt_without_rag" +``` + +--- + +### Task 3: Update `build_prompt_with_rag` + +**Files:** + +- Modify: `src/paperless_ai/ai_classifier.py` +- Modify: `src/paperless_ai/tests/test_taxonomy_candidates.py` + +- [ ] **Step 1: Write the failing tests** + +Append to `src/paperless_ai/tests/test_taxonomy_candidates.py`: + +```python +from paperless_ai.ai_classifier import build_prompt_with_rag + + +class TestBuildPromptWithRag: + def test_no_candidates_section_when_candidates_is_none(self, mock_doc): + with patch( + "paperless_ai.ai_classifier.get_context_for_document", + return_value="similar doc context", + ): + prompt = build_prompt_with_rag(mock_doc, candidates=None) + assert "Existing metadata" not in prompt + + def test_candidates_section_present_when_provided(self, mock_doc): + candidates = { + "tags": ["Insurance"], + "correspondents": [], + 
"document_types": ["Invoice"], + "storage_paths": [], + } + with patch( + "paperless_ai.ai_classifier.get_context_for_document", + return_value="similar doc context", + ): + prompt = build_prompt_with_rag(mock_doc, candidates=candidates) + assert "Existing metadata" in prompt + assert "Insurance" in prompt + assert "Invoice" in prompt + + def test_rag_context_still_present(self, mock_doc): + with patch( + "paperless_ai.ai_classifier.get_context_for_document", + return_value="similar doc context", + ): + prompt = build_prompt_with_rag(mock_doc, candidates=None) + assert "similar doc context" in prompt +``` + +- [ ] **Step 2: Run to confirm they fail** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithRag --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: `FAILED` — `build_prompt_with_rag` doesn't accept `candidates` yet. + +- [ ] **Step 3: Update `build_prompt_with_rag` in `ai_classifier.py`** + +Replace the existing `build_prompt_with_rag`: + +```python +def build_prompt_with_rag( + document: Document, + user: User | None = None, + candidates: dict[str, list[str]] | None = None, +) -> str: + base_prompt = build_prompt_without_rag(document, candidates) + context = truncate_content(get_context_for_document(document, user)) + + return f"""{base_prompt} + + Additional context from similar documents: + {context} + """.strip() +``` + +- [ ] **Step 4: Run to confirm Task 3 tests pass** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithRag --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: all 3 tests `PASSED`. + +- [ ] **Step 5: Run full test file to check no regressions** + +```bash +cd src && uv run pytest paperless_ai/tests/ --override-ini="addopts=" -v 2>&1 | tail -30 +``` + +Expected: all tests `PASSED`. 
+ +- [ ] **Step 6: Commit** + +```bash +git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py +git commit -m "feat: pass taxonomy candidates through build_prompt_with_rag" +``` + +--- + +### Task 4: Wire `get_ai_document_classification` + +**Files:** + +- Modify: `src/paperless_ai/ai_classifier.py` +- Modify: `src/paperless_ai/tests/test_taxonomy_candidates.py` + +- [ ] **Step 1: Write the failing tests** + +Append to `src/paperless_ai/tests/test_taxonomy_candidates.py`: + +```python +from django.test import override_settings + +from paperless_ai.ai_classifier import get_ai_document_classification + + +@pytest.mark.django_db +class TestGetAiDocumentClassificationCandidateWiring: + @override_settings(LLM_BACKEND="ollama", LLM_MODEL="some_model") + def test_candidates_fetched_and_passed_when_user_provided(self, mock_doc): + user = User.objects.create_user(username="tc_wire_user1", password="x") + fake_candidates = { + "tags": ["Bloodwork"], + "correspondents": [], + "document_types": [], + "storage_paths": [], + } + with ( + patch( + "paperless_ai.ai_classifier.get_taxonomy_candidates", + return_value=fake_candidates, + ) as mock_candidates, + patch( + "paperless_ai.ai_classifier.build_prompt_without_rag", + return_value="prompt", + ) as mock_build, + patch("paperless_ai.client.AIClient.run_llm_query") as mock_llm, + ): + mock_llm.return_value = { + "title": "", + "tags": [], + "correspondents": [], + "document_types": [], + "storage_paths": [], + "dates": [], + } + get_ai_document_classification(mock_doc, user) + + mock_candidates.assert_called_once_with(user) + mock_build.assert_called_once_with(mock_doc, fake_candidates) + + @override_settings(LLM_BACKEND="ollama", LLM_MODEL="some_model") + def test_no_candidates_when_user_is_none(self, mock_doc): + with ( + patch( + "paperless_ai.ai_classifier.get_taxonomy_candidates", + ) as mock_candidates, + patch( + "paperless_ai.ai_classifier.build_prompt_without_rag", + 
return_value="prompt", + ) as mock_build, + patch("paperless_ai.client.AIClient.run_llm_query") as mock_llm, + ): + mock_llm.return_value = { + "title": "", + "tags": [], + "correspondents": [], + "document_types": [], + "storage_paths": [], + "dates": [], + } + get_ai_document_classification(mock_doc, user=None) + + mock_candidates.assert_not_called() + mock_build.assert_called_once_with(mock_doc, None) + + @override_settings( + LLM_BACKEND="ollama", + LLM_MODEL="some_model", + LLM_EMBEDDING_BACKEND="huggingface", + LLM_EMBEDDING_MODEL="some_model", + ) + def test_candidates_passed_to_rag_prompt_when_embedding_configured(self, mock_doc): + user = User.objects.create_user(username="tc_wire_user2", password="x") + fake_candidates = { + "tags": ["Tax"], + "correspondents": [], + "document_types": [], + "storage_paths": [], + } + with ( + patch( + "paperless_ai.ai_classifier.get_taxonomy_candidates", + return_value=fake_candidates, + ), + patch( + "paperless_ai.ai_classifier.build_prompt_with_rag", + return_value="rag_prompt", + ) as mock_rag, + patch("paperless_ai.client.AIClient.run_llm_query") as mock_llm, + ): + mock_llm.return_value = { + "title": "", + "tags": [], + "correspondents": [], + "document_types": [], + "storage_paths": [], + "dates": [], + } + get_ai_document_classification(mock_doc, user) + + mock_rag.assert_called_once_with(mock_doc, user, fake_candidates) +``` + +- [ ] **Step 2: Run to confirm they fail** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestGetAiDocumentClassificationCandidateWiring --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: `FAILED` — `get_ai_document_classification` doesn't pass candidates yet. 
+ +- [ ] **Step 3: Update `get_ai_document_classification` in `ai_classifier.py`** + +Replace the existing `get_ai_document_classification`: + +```python +def get_ai_document_classification( + document: Document, + user: User | None = None, +) -> dict: + ai_config = AIConfig() + candidates = get_taxonomy_candidates(user) if user is not None else None + + prompt = ( + build_prompt_with_rag(document, user, candidates) + if ai_config.llm_embedding_backend + else build_prompt_without_rag(document, candidates) + ) + + client = AIClient() + result = client.run_llm_query(prompt) + return parse_ai_response(result) +``` + +- [ ] **Step 4: Run Task 4 tests** + +```bash +cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestGetAiDocumentClassificationCandidateWiring --override-ini="addopts=" -v 2>&1 | tail -20 +``` + +Expected: all 3 tests `PASSED`. + +- [ ] **Step 5: Run the full `paperless_ai` test suite** + +```bash +cd src && uv run pytest paperless_ai/tests/ --override-ini="addopts=" -v 2>&1 | tail -40 +``` + +Expected: all tests `PASSED`. + +- [ ] **Step 6: Commit** + +```bash +git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py +git commit -m "feat: wire taxonomy candidates into get_ai_document_classification" +``` + +--- + +### Task 5: Final verification + +- [ ] **Step 1: Run the broader backend test suite to catch any regressions** + +```bash +cd src && uv run pytest documents/tests/test_api_documents.py documents/tests/test_views.py paperless_ai/tests/ --override-ini="addopts=" -q 2>&1 | tail -20 +``` + +Expected: all `PASSED`, no errors. + +- [ ] **Step 2: Verify `ai_classifier.py` import order follows project conventions** + +Project convention: stdlib → Django → third-party → local, alphabetical within each group. Open `src/paperless_ai/ai_classifier.py` and confirm the new imports (`Count`, model imports, `get_objects_for_user_owner_aware`) are placed in the correct groups in alphabetical order. 
+ +- [ ] **Step 3: Final commit if any formatting fixes were needed** + +If Step 2 required changes: + +```bash +git add src/paperless_ai/ai_classifier.py +git commit -m "chore: fix import ordering in ai_classifier.py" +``` diff --git a/docs/superpowers/plans/2026-06-10-sqlite-vec-transition.md b/docs/superpowers/plans/2026-06-10-sqlite-vec-transition.md new file mode 100644 index 000000000..dc2b04b8d --- /dev/null +++ b/docs/superpowers/plans/2026-06-10-sqlite-vec-transition.md @@ -0,0 +1,1669 @@ +# sqlite-vec Vector Store Transition Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the LanceDB-backed AI vector store with a sqlite-vec-backed one, fixing #12970 (SIGILL on non-AVX2 CPUs) by removing the lancedb dependency entirely. + +**Architecture:** `PaperlessSqliteVecVectorStore` keeps the exact `BasePydanticVectorStore` surface of today's `PaperlessLanceVectorStore`, backed by one SQLite file (`LLM_INDEX_DIR/llmindex.db`) holding a vec0 virtual table plus a small `index_meta` key/value table. Writers stay serialized by the existing FileLock; readers run concurrently via WAL. Beta policy: upgrading users re-embed (a leftover Lance directory triggers a forced rebuild and is deleted). + +**Tech Stack:** Python/Django, `sqlite-vec==0.1.9` (pinned, see Risk register), stdlib `sqlite3` + `struct`, llama-index `BasePydanticVectorStore`, pytest. + +**Spec:** `docs/superpowers/specs/2026-06-10-sqlite-vec-vector-store-design.md` — read it first; every schema and semantics decision below was empirically verified there. + +**Included here (user decision):** the `embedding.py:115` TODO (move Filename / Storage Path / Archive Serial Number from embedded body text into `node.metadata`) is Task 5. 
It changes every document's embedded text, which would normally require a re-embed migration, but this transition forces a full rebuild anyway, so it rides along for free (one user-visible re-embed instead of two). + +**Deferred to a second spec (do NOT implement here):** schema-migration machinery (`docs/superpowers/specs/2026-06-10-llmindex-schema-migrations-design.md`, the PR #12968 idea rebuilt for sqlite-vec). It lands after this branch, with an empty migration registry. + +--- + +## Context for an implementer with zero history + +**Why sqlite-vec:** issue #12970: lancedb wheels are compiled for `target-cpu=haswell` and SIGILL at import on pre-AVX2 CPUs; upstream will not fix the published wheel. sqlite-vec 0.1.9's wheel contains no baked SIMD (verified under qemu `-cpu Westmere`). Research: `2026-06-10-vector-store-alternatives-research.md`. + +**Critical version constraint:** `sqlite-vec==0.1.9` exactly. The 0.1.10-alpha wheels bake `-mavx` (no runtime dispatch) and would reintroduce the crash class. Any future bump requires re-checking wheel build flags (`SELECT vec_debug()`) and ideally re-running the qemu check. An upstream issue about runtime dispatch is being raised separately; do not bump as part of this work. + +**Verified vec0 semantics this plan relies on** (all tested against the real 0.1.9 wheel; see spec): + +- `document_id` must be a plain metadata column, NOT `PARTITION KEY` (partition keys make `k` apply per partition with `IN` filters; metadata columns give a correct global top-k). +- KNN queries need `WHERE embedding MATCH ? AND k = ?`; `LIMIT` cannot be combined with `k`; results arrive distance-sorted ascending. +- `INSERT OR REPLACE` is broken on vec0 (upstream #259): always DELETE + INSERT inside one transaction. +- Metadata columns reject NULL (upstream #141): every value goes through `str(... or "")`. +- Vectors must be bound as packed float32 BLOBs, never JSON text (locale bug upstream #241). 
+- Aux column `+node_content` stores the JSON payload; it cannot appear in KNN WHERE clauses (we never put it there) but is selectable everywhere. +- DELETE never reclaims file space (upstream #54/#220); `compact()` is implemented as a rebuild (create temp table, copy, drop, rename, VACUUM). +- `DROP TABLE` on the vtab drops all its shadow tables. +- Full scans (`SELECT ... FROM vtab` without MATCH) work. +- The cumulative-vs-live bloat ratio is observable as `count(*)` of the `_rowids` shadow table vs the vtab itself. + +**Key existing files:** + +- `src/paperless_ai/vector_store.py` — the Lance store being replaced (334 lines). Read it fully before Task 2; the new class mirrors its docstrings and surface. +- `src/paperless_ai/indexing.py` — the only construction sites: `get_vector_store()` (read path) and `write_store()` (FileLock-serialized write path). `update_llm_index()` calls `store.ensure_document_id_scalar_index()`, `store.maybe_create_ann_index()`, `store.compact(retention_seconds=...)` — the first two disappear, the third changes signature. +- `src/paperless/settings/__init__.py:99-100` — `LLM_INDEX_DIR = DATA_DIR / "llm_index"`, `LLM_INDEX_LOCK` inside it. Unchanged. +- `src/paperless_ai/tests/conftest.py` — `temp_llm_index_dir` fixture (points `LLM_INDEX_DIR`/`LLM_INDEX_LOCK` at `tmp_path`) and `FakeEmbedding` (dim 384). Reuse both. +- `src/documents/management/commands/document_llmindex.py` — `rebuild|update|compact` subcommands; `compact` calls `paperless_ai.indexing.llm_index_compact()`. + +**Project conventions (from CLAUDE.md and memory):** + +- All Python through `uv run`; single test file: `cd src && uv run pytest <path/to/test_file.py> --override-ini="addopts="`. +- pytest style only (no Django TestCase); new tests in dedicated files per subject; no trivial existence tests. +- `rg`/`fd`, not grep/find. Conventional commits, Co-Authored-By trailer for Claude commits. +- The base branch for this feature is `beta`; branch from it. 
+ +--- + +### Task 1: Branch and dependency swap groundwork + +**Files:** + +- Modify: `pyproject.toml`, `uv.lock` (via uv only, never by hand) + +- [ ] **Step 1: Branch** + +```bash +cd /tank/users/trenton/projects/paperless/paperless-ngx +git checkout beta && git pull +git checkout -b feature-sqlitevec-vector-store +``` + +- [ ] **Step 2: Add sqlite-vec (keep lancedb for now; it goes away in Task 7 after everything is ported)** + +```bash +uv add "sqlite-vec==0.1.9" +``` + +- [ ] **Step 3: Sanity-check the wheel loads and report its build flags** + +```bash +cd src && uv run python -c " +import sqlite3, sqlite_vec +db = sqlite3.connect(':memory:') +db.enable_load_extension(True) +sqlite_vec.load(db) +print(db.execute('select vec_version()').fetchone()[0]) +print(db.execute('select vec_debug()').fetchone()[0]) +" +``` + +Expected: `v0.1.9` and a `Build flags:` line that does NOT contain `avx`. If it contains `avx`, STOP: the wheel is not the ISA-safe build this whole transition depends on. + +- [ ] **Step 4: Commit** + +```bash +git add pyproject.toml uv.lock +git commit -m "Chore(beta): add sqlite-vec 0.1.9 dependency + +Pinned exactly: the 0.1.9 wheels carry no baked SIMD flags (safe on +pre-AVX2 CPUs, the point of this migration); the 0.1.10 alphas bake +-mavx and would reintroduce the #12970 crash class. + +Co-Authored-By: Claude Fable 5 " +``` + +--- + +### Task 2: Rewrite the vector store tests for the new backend + +**Files:** + +- Rewrite: `src/paperless_ai/tests/test_vector_store.py` + +The existing file (417 lines) tests the Lance store. Port its surface to the new class and add the sqlite-vec-specific behaviors. Read the old file first; the helpers below intentionally mirror its node-building helpers so test intent stays comparable in review. 
+
+- [ ] **Step 1: Replace the file content**
+
+```python
+import json
+import sqlite3
+from pathlib import Path
+
+import pytest
+from llama_index.core.schema import TextNode
+
+from paperless_ai.vector_store import DB_FILENAME
+from paperless_ai.vector_store import PaperlessSqliteVecVectorStore
+
+# Embedding dimension shared by every node built in this module.
+DIM = 16
+
+
+def make_node(
+    node_id: str,
+    document_id: str,
+    *,
+    modified: str = "2026-06-10T00:00:00",
+    seed: float = 0.0,
+    text: str = "some text",
+) -> TextNode:
+    """Build a TextNode carrying a deterministic DIM-length embedding.
+
+    ``seed`` is added to every embedding component, so nodes built with
+    different seeds get different vectors. ``document_id`` and ``modified``
+    are stored in the node metadata.
+    """
+    node = TextNode(
+        id_=node_id,
+        text=text,
+        metadata={"document_id": document_id, "modified": modified},
+    )
+    node.relationships = {}
+    # ref_doc_id source: llama-index derives it from relationships; for unit
+    # tests, setting metadata document_id is what our _row() consumes.
+    node.embedding = [seed + i / 100 for i in range(DIM)]
+    return node
+
+
+@pytest.fixture
+def store(tmp_path: Path) -> PaperlessSqliteVecVectorStore:
+    """Return a store rooted in pytest's per-test temporary directory."""
+    return PaperlessSqliteVecVectorStore(uri=str(tmp_path))
+
+
+def _query(store: PaperlessSqliteVecVectorStore, embedding: list[float], top_k: int = 5, filters=None):
+    """Run a similarity query against ``store`` and return the raw result.
+
+    NOTE(review): many tests below pass an all-zero ``embedding``. A zero
+    vector has no direction, so cosine distance is undefined for it --
+    confirm vec0 returns usable rows/distances in that case, or switch those
+    calls to a non-zero query vector.
+    """
+    from llama_index.core.vector_stores.types import VectorStoreQuery
+
+    return store.query(
+        VectorStoreQuery(
+            query_embedding=embedding,
+            similarity_top_k=top_k,
+            filters=filters,
+        ),
+    )
+
+
+def _in_filter(document_ids: list[str]):
+    """Build a MetadataFilters holding one ``document_id IN (...)`` filter."""
+    from llama_index.core.vector_stores.types import (
+        FilterOperator,
+        MetadataFilter,
+        MetadataFilters,
+    )
+
+    return MetadataFilters(
+        filters=[
+            MetadataFilter(
+                key="document_id", operator=FilterOperator.IN, value=document_ids
+            )
+        ],
+    )
+
+
+class TestCrud:
+    """Add / query / delete / get_nodes / drop behaviour of the store."""
+
+    def test_add_then_query_returns_node(self, store) -> None:
+        node = make_node("n1", "1")
+        assert store.add([node]) == ["n1"]
+        result = _query(store, node.embedding, top_k=1)
+        assert result.ids == ["n1"]
+        assert result.nodes[0].metadata["document_id"] == "1"
+        # cosine distance of the identical vector is 0 -> similarity 1
+        assert result.similarities[0] == pytest.approx(1.0)
+
+    def test_query_empty_store_returns_empty_no_raise(self, store) -> None:
+        result = _query(store, [0.0] * DIM)
+        assert result.ids == [] and result.nodes == [] and result.similarities == []
+
+    def test_add_empty_list_is_noop(self, store) -> None:
+        assert store.add([]) == []
+        assert not store.table_exists()
+
+    def test_delete_removes_all_chunks_of_document(self, store) -> None:
+        store.add([make_node("a1", "1"), make_node("a2", "1"), make_node("b1", "2")])
+        store.delete("1")
+        result = _query(store, [0.0] * DIM, top_k=10)
+        assert result.ids == ["b1"]
+
+    def test_query_with_in_filter_scopes_results(self, store) -> None:
+        store.add(
+            [
+                make_node("a1", "1", seed=0.0),
+                make_node("b1", "2", seed=1.0),
+                make_node("c1", "3", seed=2.0),
+            ],
+        )
+        result = _query(store, [0.0] * DIM, top_k=10, filters=_in_filter(["2", "3"]))
+        assert sorted(result.ids) == ["b1", "c1"]
+
+    def test_query_respects_top_k_with_filter(self, store) -> None:
+        # k semantics: global top-k even with IN filters (document_id is a
+        # metadata column, not a partition key — see design doc).
+        store.add(
+            [make_node(f"n{i}", str(i % 4), seed=float(i)) for i in range(12)],
+        )
+        result = _query(
+            store, [0.0] * DIM, top_k=3, filters=_in_filter(["0", "1", "2", "3"])
+        )
+        assert len(result.ids) == 3
+        assert result.similarities == sorted(result.similarities, reverse=True)
+
+    def test_get_nodes_filter_and_empty_paths(self, store) -> None:
+        assert store.get_nodes(filters=_in_filter(["1"])) == []  # no table yet
+        store.add([make_node("a1", "1"), make_node("b1", "2")])
+        nodes = store.get_nodes(filters=_in_filter(["1"]))
+        assert [n.node_id for n in nodes] == ["a1"]
+        assert nodes[0].embedding is not None
+        assert store.get_nodes(filters=_in_filter(["999"])) == []
+
+    def test_get_nodes_node_ids_not_implemented(self, store) -> None:
+        with pytest.raises(NotImplementedError):
+            store.get_nodes(node_ids=["x"])
+
+    def test_fresh_instance_sees_existing_table(self, store, tmp_path: Path) -> None:
+        store.add([make_node("a1", "1")])
+        reopened = PaperlessSqliteVecVectorStore(uri=str(tmp_path))
+        assert reopened.table_exists()
+        assert reopened.vector_dim() == DIM
+        assert _query(reopened, [0.0] * DIM, top_k=1).ids == ["a1"]
+
+    def test_table_exists_and_drop(self, store) -> None:
+        assert not store.table_exists()
+        store.add([make_node("a1", "1")])
+        assert store.table_exists()
+        store.drop_table()
+        assert not store.table_exists()
+        assert store.vector_dim() is None
+
+
+class TestUpsert:
+    """upsert_document() replaces one document's chunks and leaves others."""
+
+    def test_upsert_replaces_and_prunes_stale_chunks(self, store) -> None:
+        store.add(
+            [make_node("d1c1", "1"), make_node("d1c2", "1"), make_node("d2c1", "2")],
+        )
+        store.upsert_document("1", [make_node("d1new", "1")])
+        result = _query(store, [0.0] * DIM, top_k=10)
+        assert sorted(result.ids) == ["d1new", "d2c1"]
+
+    def test_upsert_creates_table_when_missing(self, store) -> None:
+        store.upsert_document("1", [make_node("a1", "1")])
+        assert _query(store, [0.0] * DIM, top_k=1).ids == ["a1"]
+
+    def test_upsert_empty_nodes_removes_document(self, store) -> None:
+        store.add([make_node("a1", "1"), make_node("b1", "2")])
+        store.upsert_document("1", [])
+        assert _query(store, [0.0] * DIM, top_k=10).ids == ["b1"]
+
+    def test_upsert_is_atomic_for_concurrent_readers(self, store, tmp_path: Path) -> None:
+        """A second connection must never observe document 1 half-replaced."""
+        # The reader is a second store instance (its own connection) on the
+        # same directory; it reads only after the upsert has returned.
+        store.add([make_node("a1", "1"), make_node("a2", "1")])
+        reader = PaperlessSqliteVecVectorStore(uri=str(tmp_path))
+        store.upsert_document("1", [make_node("a3", "1")])
+        ids = [n.node_id for n in reader.get_nodes(filters=_in_filter(["1"]))]
+        assert ids == ["a3"]
+
+
+class TestMetadataCoercion:
+    """None metadata values must be stored as empty strings."""
+
+    def test_none_metadata_values_become_empty_strings(self, store) -> None:
+        node = make_node("a1", "1")
+        node.metadata["modified"] = None
+        store.add([node])  # must not raise (vec0 rejects NULL metadata)
+        assert store.get_modified_times() == {"1": ""}
+
+
+class TestModelNameTracking:
+    """stored_model_name() / config_mismatch() across store instances."""
+
+    def test_stored_model_name_none_without_table(self, tmp_path: Path) -> None:
+        store = PaperlessSqliteVecVectorStore(
+            uri=str(tmp_path), embed_model_name="model-a"
+        )
+        assert store.stored_model_name() is None
+
+    def test_model_name_stored_after_add_and_persists(self, tmp_path: Path) -> None:
+        store = PaperlessSqliteVecVectorStore(
+            uri=str(tmp_path), embed_model_name="model-a"
+        )
+        store.add([make_node("a1", "1")])
+        assert store.stored_model_name() == "model-a"
+        reopened = PaperlessSqliteVecVectorStore(uri=str(tmp_path))
+        assert reopened.stored_model_name() == "model-a"
+
+    def test_config_mismatch_semantics(self, tmp_path: Path) -> None:
+        store = PaperlessSqliteVecVectorStore(
+            uri=str(tmp_path), embed_model_name="model-a"
+        )
+        assert not store.config_mismatch("anything")  # no table yet
+        store.add([make_node("a1", "1")])
+        assert not store.config_mismatch("model-a")
+        assert store.config_mismatch("model-b")
+
+    def test_config_mismatch_false_when_table_predates_tracking(
+        self, tmp_path: Path
+    ) -> None:
+        store = PaperlessSqliteVecVectorStore(uri=str(tmp_path))  # no model name
+        store.add([make_node("a1", "1")])
+        assert not store.config_mismatch("model-a")
+
+
+class TestGetModifiedTimes:
+    """get_modified_times() returns one ``modified`` value per document id."""
+
+    def test_empty_store_returns_empty_dict(self, store) -> None:
+        assert store.get_modified_times() == {}
+
+    def test_returns_one_entry_per_document(self, store) -> None:
+        store.add(
+            [
+                make_node("a1", "1", modified="2026-01-01T00:00:00"),
+                make_node("a2", "1", modified="2026-01-01T00:00:00"),
+                make_node("b1", "2", modified="2026-02-02T00:00:00"),
+            ],
+        )
+        assert store.get_modified_times() == {
+            "1": "2026-01-01T00:00:00",
+            "2": "2026-02-02T00:00:00",
+        }
+
+
+class TestCompact:
+    """compact() threshold, forced rebuild, and missing-table behaviour."""
+
+    def _bloat_ratio(self, store) -> float:
+        """Return the ``<table>_rowids`` row count divided by the live row count."""
+        live = store.client.execute(
+            f"SELECT count(*) FROM {store._table_name}"  # noqa: SLF001
+        ).fetchone()[0]
+        total = store.client.execute(
+            f"SELECT count(*) FROM {store._table_name}_rowids"  # noqa: SLF001
+        ).fetchone()[0]
+        return total / max(live, 1)
+
+    def _churn(self, store, cycles: int) -> None:
+        """Re-upsert document "1" ``cycles`` times with 20 fresh node ids each."""
+        for i in range(cycles):
+            store.upsert_document(
+                "1", [make_node(f"gen{i}-{j}", "1", seed=float(j)) for j in range(20)]
+            )
+
+    def test_compact_noop_below_threshold(self, store) -> None:
+        store.add([make_node("a1", "1")])
+        store.compact()
+        assert _query(store, [0.0] * DIM, top_k=1).ids == ["a1"]
+
+    def test_force_compact_preserves_rows_and_metadata(self, store) -> None:
+        store.add([make_node("a1", "1"), make_node("b1", "2", seed=3.0)])
+        self._churn(store, 5)
+        before = {
+            n.node_id: n.metadata for n in store.get_nodes(filters=_in_filter(["1", "2"]))
+        }
+        store.compact(force=True)
+        after = {
+            n.node_id: n.metadata for n in store.get_nodes(filters=_in_filter(["1", "2"]))
+        }
+        assert after == before
+        assert self._bloat_ratio(store) == pytest.approx(1.0)
+        # store remains fully usable after the rebuild
+        store.upsert_document("3", [make_node("c1", "3", seed=9.0)])
+        assert "c1" in _query(store, [9.0] * DIM, top_k=1).ids
+
+    def test_auto_compact_triggers_on_churn(self, store) -> None:
+        store.add([make_node(f"s{j}", "1", seed=float(j)) for j in range(20)])
+        self._churn(store, 5)
+        assert self._bloat_ratio(store) > 2
+        store.compact()
+        assert self._bloat_ratio(store) == pytest.approx(1.0)
+
+    def test_compact_on_missing_table_is_noop(self, store) -> None:
+        store.compact()
+        store.compact(force=True)
+
+
+class TestDbFile:
+    """On-disk layout: one database file in the index dir, opened in WAL mode."""
+
+    def test_single_db_file_in_index_dir(self, store, tmp_path: Path) -> None:
+        store.add([make_node("a1", "1")])
+        assert (tmp_path / DB_FILENAME).exists()
+
+    def test_wal_mode_enabled(self, store) -> None:
+        assert (
+            store.client.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal"
+        )
+```
+
+- [ ] **Step 2: Run to verify the import fails (class does not exist yet)**
+
+```bash
+cd src && uv run pytest paperless_ai/tests/test_vector_store.py --override-ini="addopts=" 2>&1 | tail -3
+```
+
+Expected: collection error, `ImportError: cannot import name 'PaperlessSqliteVecVectorStore'`.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/paperless_ai/tests/test_vector_store.py
+git commit -m "Test(beta): port vector store tests to sqlite-vec backend
+
+Co-Authored-By: Claude Fable 5 "
+```
+
+---
+
+### Task 3: Implement PaperlessSqliteVecVectorStore
+
+**Files:**
+
+- Modify: `src/paperless_ai/vector_store.py`
+- Create: `src/bench_vector_store.py`
+
+---
+
+#### Phase A: Benchmark coexistence
+
+Add `PaperlessSqliteVecVectorStore` alongside the existing Lance class so both can be benchmarked head-to-head before the Lance class is removed. No commit is made in Phase A; the final commit (Phase B Step 6) captures the clean state.
+
+- [ ] **Step 1: Add `PaperlessSqliteVecVectorStore` alongside the existing Lance class**
+
+Add these imports to the top of `src/paperless_ai/vector_store.py` (insert after the existing `from llama_index...` block, before `logger = ...`):
+
+```python
+import sqlite3
+import struct
+from collections.abc import Iterator
+from contextlib import contextmanager
+
+import sqlite_vec
+```
+
+Then append the content from Phase B Step 4 -- everything from `DB_FILENAME = "llmindex.db"` through the end of `PaperlessSqliteVecVectorStore` -- to the **bottom** of the existing file. When appending, make one change: rename the appended `_build_where` to `_build_sqlite_where` and update its two call sites inside `PaperlessSqliteVecVectorStore` (`get_nodes` and `query` methods). This avoids shadowing the existing Lance `_build_where`. All other names (`DB_FILENAME`, `COMPACT_BLOAT_RATIO`, `_FILTER_COLUMNS`, `_pack`, `_unpack`) are safe to append verbatim.
+
+Verify:
+
+```bash
+rg -n "^class Paperless" src/paperless_ai/vector_store.py
+# Expected: PaperlessLanceVectorStore on one line, PaperlessSqliteVecVectorStore on another
+```
+
+- [ ] **Step 2: Write `src/bench_vector_store.py`**
+
+```python
+#!/usr/bin/env python3
+"""Head-to-head benchmark: PaperlessLanceVectorStore vs PaperlessSqliteVecVectorStore.
+
+Run from src/ with:
+    uv run python bench_vector_store.py [OPTIONS]
+
+Phase 1 (skipped if bench_data.pkl already exists): generate fake documents with
+Faker and embed chunks via Ollama; save to disk for reuse.
+Phase 2: benchmark both stores against identical data and print a comparison table.
+
+Requires both classes to coexist in paperless_ai.vector_store (Task 3 Phase A).
+After Phase B replaces the file, the Lance import fails gracefully and only the
+sqlite-vec half runs.
+"""
+from __future__ import annotations
+
+import argparse
+import pickle
+import statistics
+import tempfile
+import time
+import uuid
+from pathlib import Path
+
+import httpx
+from faker import Faker
+from llama_index.core.schema import TextNode
+from llama_index.core.vector_stores.types import (
+    FilterOperator,
+    MetadataFilter,
+    MetadataFilters,
+    VectorStoreQuery,
+)
+
+try:
+    from paperless_ai.vector_store import PaperlessLanceVectorStore
+
+    _LANCE_OK = True
+except ImportError:
+    _LANCE_OK = False
+
+from paperless_ai.vector_store import DB_FILENAME
+from paperless_ai.vector_store import PaperlessSqliteVecVectorStore
+
+DEFAULT_OLLAMA_URL = "http://192.168.1.87:11434"
+DEFAULT_EMBED_MODEL = "qwen3-embedding:4b"
+DEFAULT_DATA_FILE = "bench_data.pkl"
+DEFAULT_N_DOCS = 2000
+DEFAULT_CHUNKS_PER_DOC = 3
+DEFAULT_QUERY_ITERS = 50
+_BATCH = 32
+
+
+def _embed(texts: list[str], url: str, model: str) -> list[list[float]]:
+    """Embed ``texts`` with one POST to ``{url}/api/embed``; one vector per text."""
+    r = httpx.post(
+        f"{url}/api/embed",
+        json={"model": model, "input": texts},
+        timeout=120.0,
+    )
+    r.raise_for_status()
+    return r.json()["embeddings"]
+
+
+def warm_up(url: str, model: str) -> int:
+    """Fire one embed call to load the model into GPU; return embedding dim."""
+    print(f"Warming up {model}...", end=" ", flush=True)
+    dim = len(_embed(["warm"], url, model)[0])
+    print(f"dim={dim}")
+    return dim
+
+
+def generate_and_save(
+    n_docs: int,
+    chunks_per_doc: int,
+    url: str,
+    model: str,
+    out: str,
+) -> list[dict]:
+    """Generate ``n_docs`` fake documents, embed every chunk, pickle to ``out``.
+
+    Each document body is split into ``chunks_per_doc`` contiguous slices; the
+    last slice absorbs the remainder. Returns the list that was written.
+    """
+    fake = Faker()
+    Faker.seed(42)
+    print(f"Generating {n_docs} docs ({chunks_per_doc} chunks each)...")
+    docs = []
+    for i in range(n_docs):
+        body = "\n\n".join(fake.paragraph(nb_sentences=8) for _ in range(3))
+        clen = max(1, len(body) // chunks_per_doc)
+        chunks = []
+        for j in range(chunks_per_doc):
+            s = j * clen
+            e = s + clen if j < chunks_per_doc - 1 else len(body)
+            chunks.append({"node_id": str(uuid.uuid4()), "text": body[s:e], "embedding": None})
+        docs.append({
+            "doc_id": str(i + 1),
+            "title": fake.catch_phrase(),
+            "modified": fake.date_time_this_decade().isoformat(),
+            "chunks": chunks,
+        })
+
+    all_texts = [c["text"] for d in docs for c in d["chunks"]]
+    print(f"Embedding {len(all_texts)} chunks in batches of {_BATCH}...")
+    embeddings: list[list[float]] = []
+    for i in range(0, len(all_texts), _BATCH):
+        embeddings.extend(_embed(all_texts[i : i + _BATCH], url, model))
+        print(f"  {min(i + _BATCH, len(all_texts))}/{len(all_texts)}", end="\r", flush=True)
+    print()
+
+    # Embeddings come back in the same flattened doc/chunk order as all_texts.
+    idx = 0
+    for d in docs:
+        for c in d["chunks"]:
+            c["embedding"] = embeddings[idx]
+            idx += 1
+
+    with open(out, "wb") as f:
+        pickle.dump(docs, f)
+    print(f"Saved to {out}")
+    return docs
+
+
+def _build_nodes(docs: list[dict]) -> list[TextNode]:
+    """Turn the pickled doc/chunk dicts into TextNodes with embeddings attached."""
+    nodes = []
+    for d in docs:
+        for c in d["chunks"]:
+            n = TextNode(
+                id_=c["node_id"],
+                text=c["text"],
+                metadata={"document_id": d["doc_id"], "modified": d["modified"]},
+            )
+            n.relationships = {}
+            n.embedding = c["embedding"]
+            nodes.append(n)
+    return nodes
+
+
+def _in_filter(ids: list[str]) -> MetadataFilters:
+    """Build a ``document_id IN ids`` metadata filter."""
+    return MetadataFilters(
+        filters=[MetadataFilter(key="document_id", operator=FilterOperator.IN, value=ids)]
+    )
+
+
+def _dir_bytes(path: str) -> int:
+    """Total size in bytes of every regular file under ``path``."""
+    return sum(f.stat().st_size for f in Path(path).rglob("*") if f.is_file())
+
+
+def _sqlite_bytes(uri: str) -> int:
+    """On-disk size of the sqlite-vec index: main database plus its WAL file.
+
+    The store runs in WAL mode, so recently written pages live in the
+    ``-wal`` sidecar until a checkpoint; measuring only the main file would
+    understate the pre-compact size.
+    """
+    base = Path(uri) / DB_FILENAME
+    files = (base, base.with_name(base.name + "-wal"))
+    return sum(p.stat().st_size for p in files if p.exists())
+
+
+def _p50_p95(times: list[float]) -> tuple[float, float]:
+    """Return (median, 95th-percentile sample) of ``times``; zeros when empty."""
+    if not times:
+        return 0.0, 0.0
+    ordered = sorted(times)
+    return statistics.median(ordered), ordered[int(len(ordered) * 0.95)]
+
+
+def run_bench(
+    store,
+    nodes: list[TextNode],
+    docs: list[dict],
+    q_iters: int,
+    is_lance: bool,
+) -> dict:
+    """Time insert, query, get_modified_times, upsert and compact on ``store``.
+
+    ``is_lance`` selects how on-disk size is measured and which ``compact``
+    signature is called. Returns a dict of timings (seconds) and sizes (bytes).
+    """
+    doc_ids = [d["doc_id"] for d in docs]
+    filter_ids = doc_ids[: max(1, len(doc_ids) // 5)]
+    q_vecs = [nodes[i * 10 % len(nodes)].embedding for i in range(q_iters)]
+    by_doc: dict[str, list[TextNode]] = {}
+    for n in nodes:
+        by_doc.setdefault(n.metadata["document_id"], []).append(n)
+    uri = store._uri
+
+    # insert
+    t0 = time.perf_counter()
+    store.add(list(nodes))
+    r: dict = {"insert": time.perf_counter() - t0}
+
+    # query plain
+    times = []
+    for emb in q_vecs:
+        t0 = time.perf_counter()
+        store.query(VectorStoreQuery(query_embedding=emb, similarity_top_k=10))
+        times.append(time.perf_counter() - t0)
+    r["qp50"], r["qp95"] = _p50_p95(times)
+
+    # query filtered
+    times = []
+    flt = _in_filter(filter_ids)
+    for emb in q_vecs:
+        t0 = time.perf_counter()
+        store.query(VectorStoreQuery(query_embedding=emb, similarity_top_k=10, filters=flt))
+        times.append(time.perf_counter() - t0)
+    r["qfp50"], r["qfp95"] = _p50_p95(times)
+
+    # get_modified_times
+    times = []
+    for _ in range(20):
+        t0 = time.perf_counter()
+        store.get_modified_times()
+        times.append(time.perf_counter() - t0)
+    r["gmt_p50"] = statistics.median(times)
+
+    # upsert (fresh node IDs, same embeddings)
+    times = []
+    for doc in docs[:q_iters]:
+        orig = by_doc.get(doc["doc_id"], [])
+        if not orig:
+            continue
+        fresh = []
+        for o in orig:
+            fn = TextNode(
+                id_=str(uuid.uuid4()),
+                text=o.text,
+                metadata=o.metadata.copy(),
+            )
+            fn.relationships = {}
+            fn.embedding = o.embedding
+            fresh.append(fn)
+        t0 = time.perf_counter()
+        store.upsert_document(doc["doc_id"], fresh)
+        times.append(time.perf_counter() - t0)
+    r["up50"], r["up95"] = _p50_p95(times)
+
+    r["size_pre"] = _dir_bytes(uri) if is_lance else _sqlite_bytes(uri)
+
+    # compact
+    t0 = time.perf_counter()
+    if is_lance:
+        store.compact(retention_seconds=0)
+    else:
+        store.compact(force=True)
+    r["compact"] = time.perf_counter() - t0
+
+    r["size_post"] = _dir_bytes(uri) if is_lance else _sqlite_bytes(uri)
+    return r
+
+
+def _pct(lv: float | None, sv: float) -> str:
+    """Signed percentage change of ``sv`` relative to ``lv``; "N/A" without a baseline."""
+    if lv is None or lv == 0:
+        return "N/A"
+    p = (sv - lv) / lv * 100
+    return f"{'+' if p > 0 else ''}{p:.0f}%"
+
+
+def print_results(nodes: list[TextNode], q_iters: int, lance: dict | None, sq: dict) -> None:
+    """Print the comparison table; ``lance`` is None when LanceDB was skipped."""
+    W = 30
+    n, dim = len(nodes), len(nodes[0].embedding)
+    print("\n=== Vector Store Benchmark ===")
+    print(f"Nodes: {n}  |  Dim: {dim}  |  Query iters: {q_iters}\n")
+    lh = "LanceDB" if lance else "LanceDB (N/A)"
+    print(f"{'Operation':<{W}} {lh:<22} {'sqlite-vec':<22} {'Delta'}")
+    print("-" * (W + 66))
+
+    def _s(v: float) -> str:
+        return f"{v:.3f}s"
+
+    def _ms(v: float) -> str:
+        return f"{v * 1000:.1f}ms"
+
+    def _mb(v: float) -> str:
+        return f"{v / 1e6:.1f} MB"
+
+    def row(label: str, lv: float | None, sv: float, fmt) -> None:
+        ls = fmt(lv) if lv is not None else "N/A"
+        print(f"{label:<{W}} {ls:<22} {fmt(sv):<22} {_pct(lv, sv)}")
+
+    def row2(label: str, lv1: float | None, lv2: float | None, sv1: float, sv2: float) -> None:
+        def ms_pair(a: float, b: float) -> str:
+            return f"{_ms(a)} / {_ms(b)}"
+        ls = ms_pair(lv1, lv2) if lv1 is not None else "N/A"
+        print(f"{label:<{W}} {ls:<22} {ms_pair(sv1, sv2):<22} {_pct(lv1, sv1)}")
+
+    L = lance
+    row(f"insert ({n} nodes)", L["insert"] if L else None, sq["insert"], _s)
+    row2("query plain p50/p95",
+         L["qp50"] if L else None, L["qp95"] if L else None, sq["qp50"], sq["qp95"])
+    row2("query filtered p50/p95",
+         L["qfp50"] if L else None, L["qfp95"] if L else None, sq["qfp50"], sq["qfp95"])
+    row("get_modified_times p50", L["gmt_p50"] if L else None, sq["gmt_p50"], _ms)
+    row2("upsert p50/p95",
+         L["up50"] if L else None, L["up95"] if L else None, sq["up50"], sq["up95"])
+    row("compact", L["compact"] if L else None, sq["compact"], _s)
+    row("file size pre-compact", L["size_pre"] if L else None, sq["size_pre"], _mb)
+    row("file size post-compact", L["size_post"] if L else None, sq["size_post"], _mb)
+
+
+def main() -> None:
+    """Parse options, load or generate the dataset, benchmark, print the table."""
+    ap = argparse.ArgumentParser(description="Vector store head-to-head benchmark")
+    ap.add_argument("--n-docs", type=int, default=DEFAULT_N_DOCS)
+    ap.add_argument("--chunks-per-doc", type=int, default=DEFAULT_CHUNKS_PER_DOC)
+    ap.add_argument("--data-file", default=DEFAULT_DATA_FILE)
+    ap.add_argument("--regenerate", action="store_true")
+    ap.add_argument("--ollama-url", default=DEFAULT_OLLAMA_URL)
+    ap.add_argument("--embed-model", default=DEFAULT_EMBED_MODEL)
+    ap.add_argument("--query-iters", type=int, default=DEFAULT_QUERY_ITERS)
+    args = ap.parse_args()
+    # Zero or negative counts would leave nothing to time and crash later on
+    # empty samples; reject them up front with a usage error.
+    if min(args.n_docs, args.chunks_per_doc, args.query_iters) < 1:
+        ap.error("--n-docs, --chunks-per-doc and --query-iters must all be >= 1")
+
+    warm_up(args.ollama_url, args.embed_model)
+
+    data_path = Path(args.data_file)
+    if args.regenerate or not data_path.exists():
+        docs = generate_and_save(
+            args.n_docs, args.chunks_per_doc, args.ollama_url, args.embed_model, args.data_file
+        )
+    else:
+        print(f"Loading {args.data_file}...")
+        # SECURITY: unpickling executes arbitrary code. Only load a data file
+        # that this script generated itself; never one from an untrusted source.
+        with open(data_path, "rb") as f:
+            docs = pickle.load(f)
+        print(f"Loaded {len(docs)} docs ({sum(len(d['chunks']) for d in docs)} nodes)")
+
+    all_nodes = _build_nodes(docs)
+    if not all_nodes:
+        raise SystemExit(f"{args.data_file} contains no nodes; rerun with --regenerate")
+
+    lance_r = None
+    if _LANCE_OK:
+        print("\nBenchmarking LanceDB...")
+        with tempfile.TemporaryDirectory() as d:
+            store = PaperlessLanceVectorStore(uri=d)
+            lance_r = run_bench(store, all_nodes, docs, args.query_iters, is_lance=True)
+    else:
+        print("Skipping LanceDB (PaperlessLanceVectorStore not importable).")
+
+    print("\nBenchmarking sqlite-vec...")
+    with tempfile.TemporaryDirectory() as d:
+        store = PaperlessSqliteVecVectorStore(uri=d)
+        sqlite_r = run_bench(store, all_nodes, docs, args.query_iters, is_lance=False)
+
+    print_results(all_nodes, args.query_iters, lance_r, sqlite_r)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+- [ ] **Step 3: Run the benchmark and save output**
+
+```bash
+cd src && uv run python bench_vector_store.py 2>&1 | tee bench_results.txt
+```
+
+First run: Faker generates docs and embeds ~6000 chunks via Ollama (a few minutes). The warm-up call fires first so model-load time does not skew timings. Results are written to `bench_results.txt`.
+
+Expected: both stores complete all operations without error; a comparison table is printed. A regression is any sqlite-vec operation significantly (>2x) slower than LanceDB. Note that `compact` differs in character (Lance uses MVCC cleanup vs. sqlite-vec full table rebuild) and is not a direct apples-to-apples comparison.
+
+---
+
+#### Phase B: Final implementation
+
+- [ ] **Step 4: Replace the file content**
+
+```python
+import json
+import logging
+import sqlite3
+import struct
+from collections.abc import Iterator
+from collections.abc import Sequence
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any
+
+import sqlite_vec
+from llama_index.core.bridge.pydantic import PrivateAttr
+from llama_index.core.schema import BaseNode
+from llama_index.core.vector_stores.types import BasePydanticVectorStore
+from llama_index.core.vector_stores.types import FilterCondition
+from llama_index.core.vector_stores.types import FilterOperator
+from llama_index.core.vector_stores.types import MetadataFilters
+from llama_index.core.vector_stores.types import VectorStoreQuery
+from llama_index.core.vector_stores.types import VectorStoreQueryResult
+from llama_index.core.vector_stores.utils import metadata_dict_to_node
+from llama_index.core.vector_stores.utils import node_to_metadata_dict
+
+logger = logging.getLogger("paperless_ai.vector_store")
+
+DB_FILENAME = "llmindex.db"
+DEFAULT_TABLE_NAME = "documents"
+
+# compact(): rebuild when the cumulative rowid count exceeds this multiple of
+# the live row count. DELETEs on vec0 tables never reclaim space (upstream
+# asg017/sqlite-vec#54), so per-document re-index churn grows the file until
+# a rebuild copies the live rows into a fresh table.
+COMPACT_BLOAT_RATIO = 2.0
+
+# Filterable vec0 metadata columns. _build_where() only ever receives filter
+# keys we construct ourselves, but allowlisting keeps SQL identifiers safe by
+# construction.
+_FILTER_COLUMNS = frozenset({"document_id", "modified"})
+
+
+def _pack(embedding: Sequence[float]) -> bytes:
+    """Serialize an embedding to raw native-order float32 bytes."""
+    return struct.pack(f"{len(embedding)}f", *embedding)
+
+
+def _unpack(blob: bytes) -> list[float]:
+    """Inverse of ``_pack``: decode raw float32 bytes into a list of floats."""
+    return list(struct.unpack(f"{len(blob) // 4}f", blob))
+
+
+def _build_where(filters: MetadataFilters | None) -> tuple[str, list[str]]:
+    """Translate the EQ / IN filters we use into a parameterized SQL clause
+    on vec0 metadata columns. Returns ("", []) when there is nothing to filter.
+
+    Raises NotImplementedError for nested filter groups, for columns outside
+    ``_FILTER_COLUMNS`` and for operators other than EQ / IN.
+    """
+    if filters is None or not filters.filters:
+        return "", []
+    clauses: list[str] = []
+    params: list[str] = []
+    for f in filters.filters:
+        if isinstance(f, MetadataFilters):  # pragma: no cover - we never nest
+            # A nested group has no key/operator of its own; reject it with
+            # the documented exception type instead of an attribute error.
+            raise NotImplementedError("Nested metadata filters are not supported")
+        if f.key not in _FILTER_COLUMNS:  # pragma: no cover - we build the keys
+            raise NotImplementedError(f"Unsupported filter column: {f.key}")
+        if f.operator == FilterOperator.IN:
+            values = [str(v) for v in f.value]
+            if not values:
+                # IN () is not valid SQL; an empty set matches nothing.
+                clauses.append("1 = 0")
+                continue
+            placeholders = ",".join("?" for _ in values)
+            clauses.append(f"{f.key} IN ({placeholders})")
+            params.extend(values)
+        elif f.operator == FilterOperator.EQ:
+            clauses.append(f"{f.key} = ?")
+            params.append(str(f.value))
+        else:  # pragma: no cover - we only ever build EQ/IN filters
+            raise NotImplementedError(f"Unsupported filter operator: {f.operator}")
+    joiner = " OR " if filters.condition == FilterCondition.OR else " AND "
+    return "(" + joiner.join(clauses) + ")", params
+
+
+class PaperlessSqliteVecVectorStore(BasePydanticVectorStore):
+    """A llama-index vector store backed by a sqlite-vec vec0 table.
+
+    Stores one row per node: the node id (TEXT primary key), its document id
+    (metadata column, used for EQ/IN filtering and per-document delete), the
+    document's modified timestamp, the embedding (float32, cosine metric), and
+    the serialized node (text + metadata) as JSON in an auxiliary column.
+    ``stores_text`` lets llama-index run off this store alone, with no
+    separate docstore or index store.
+
+    Everything lives in one SQLite database file (``DB_FILENAME``) inside the
+    directory given as ``uri`` (kept as a directory for compatibility with the
+    previous LanceDB layout). WAL mode allows readers in other processes to
+    proceed while the (FileLock-serialized) writer holds a transaction.
+
+    Implemented surface of ``BasePydanticVectorStore``
+    ---------------------------------------------------
+    Only the methods actively used by this codebase are implemented.
+    ``delete_nodes`` and the ``node_ids`` lookup path of ``get_nodes`` are
+    part of the llama-index interface contract and may be needed if a future
+    retriever or extension invokes them — add them then, with tests.
+    """
+
+    stores_text: bool = True
+    flat_metadata: bool = False
+
+    _uri: str = PrivateAttr()
+    _table_name: str = PrivateAttr()
+    _embed_model_name: str | None = PrivateAttr()
+    _conn: Any = PrivateAttr()
+
+    def __init__(
+        self,
+        uri: str,
+        table_name: str = DEFAULT_TABLE_NAME,
+        embed_model_name: str | None = None,
+    ) -> None:
+        """Open (creating if needed) the index database inside directory ``uri``.
+
+        Raises ValueError when ``table_name`` is not a plain identifier: the
+        name is interpolated into SQL statements and cannot be parameterized.
+        """
+        if not table_name.isidentifier():
+            raise ValueError(f"Invalid table name: {table_name!r}")
+        super().__init__(stores_text=True, flat_metadata=False)
+        self._uri = uri
+        self._table_name = table_name
+        self._embed_model_name = embed_model_name
+        # sqlite3.connect() creates the database file but not its parent
+        # directory, so make sure the index directory exists first.
+        index_dir = Path(uri)
+        index_dir.mkdir(parents=True, exist_ok=True)
+        self._conn = sqlite3.connect(
+            str(index_dir / DB_FILENAME),
+            timeout=30,
+            isolation_level=None,  # autocommit; explicit transactions below
+        )
+        self._conn.row_factory = sqlite3.Row
+        # Extension loading is enabled only for the sqlite_vec.load() call.
+        self._conn.enable_load_extension(True)
+        sqlite_vec.load(self._conn)
+        self._conn.enable_load_extension(False)
+        self._conn.execute("PRAGMA journal_mode=WAL")
+        self._conn.execute("PRAGMA synchronous=NORMAL")
+        self._conn.execute(
+            "CREATE TABLE IF NOT EXISTS index_meta (key TEXT PRIMARY KEY, value TEXT)",
+        )
+
+    @property
+    def client(self) -> Any:
+        """The underlying ``sqlite3`` connection."""
+        return self._conn
+
+    @contextmanager
+    def _transaction(self) -> Iterator[None]:
+        """Run the body in a BEGIN IMMEDIATE transaction; roll back on any error."""
+        self._conn.execute("BEGIN IMMEDIATE")
+        try:
+            yield
+        except BaseException:
+            self._conn.execute("ROLLBACK")
+            raise
+        else:
+            self._conn.execute("COMMIT")
+
+    def _meta_get(self, key: str) -> str | None:
+        """Return the ``index_meta`` value stored under ``key``, or None."""
+        row = self._conn.execute(
+            "SELECT value FROM index_meta WHERE key = ?",
+            (key,),
+        ).fetchone()
+        return row["value"] if row else None
+
+    def _meta_set(self, key: str, value: str) -> None:
+        """Insert or overwrite the ``index_meta`` value stored under ``key``."""
+        self._conn.execute(
+            "INSERT INTO index_meta (key, value) VALUES (?, ?) "
+            "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
+            (key, value),
+        )
+
+    def table_exists(self) -> bool:
+        """True when the vector table is present in ``sqlite_master``."""
+        return (
+            self._conn.execute(
+                "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
+                (self._table_name,),
+            ).fetchone()
+            is not None
+        )
+
+    def vector_dim(self) -> int | None:
+        """Return the embedding dimension recorded at table creation, or None."""
+        if not self.table_exists():
+            return None
+        value = self._meta_get("dim")
+        return int(value) if value else None
+
+    def drop_table(self) -> None:
+        """Drop the vector table and clear ``index_meta`` in one transaction."""
+        # Atomic so that no reader sees metadata without its table or a table
+        # without its metadata.
+        with self._transaction():
+            self._conn.execute(f"DROP TABLE IF EXISTS {self._table_name}")
+            self._conn.execute("DELETE FROM index_meta")
+
+    def stored_model_name(self) -> str | None:
+        """Return the embedding model name recorded at table creation, or None."""
+        if not self.table_exists():
+            return None
+        return self._meta_get("embed_model")
+
+    def config_mismatch(self, model_name: str) -> bool:
+        """True when the stored model name differs from ``model_name``.
+
+        Returns False when no table exists or when the table predates
+        model-name tracking — conservative default avoids spurious rebuilds.
+        """
+        stored = self.stored_model_name()
+        if stored is None:
+            return False
+        return stored != model_name
+
+    def _create_vec_table(self, name: str, dim: int) -> None:
+        """Create a vec0 table called ``name``; touches no ``index_meta`` rows."""
+        # document_id is deliberately a metadata column, NOT a partition key:
+        # partition keys change KNN `k` to per-partition semantics under IN
+        # filters (asg017/sqlite-vec#142); metadata columns give a correct
+        # global top-k.
+        self._conn.execute(
+            f"""CREATE VIRTUAL TABLE {name} USING vec0(
+                id TEXT PRIMARY KEY,
+                document_id TEXT,
+                modified TEXT,
+                +node_content TEXT,
+                embedding float[{dim}] distance_metric=cosine
+            )""",
+        )
+
+    def _create_table(self, dim: int) -> None:
+        """Create the vector table and record its dimension and embed model."""
+        self._create_vec_table(self._table_name, dim)
+        self._meta_set("dim", str(dim))
+        if self._embed_model_name:
+            self._meta_set("embed_model", self._embed_model_name)
+
+    def _ensure_table(self, dim: int) -> None:
+        """Create the vector table on first use."""
+        if not self.table_exists():
+            self._create_table(dim)
+
+    def _row(self, node: BaseNode) -> tuple[str, str, str, str, bytes]:
+        """Flatten ``node`` into the column tuple expected by ``_INSERT``."""
+        meta = node_to_metadata_dict(
+            node,
+            remove_text=False,
+            flat_metadata=self.flat_metadata,
+        )
+        # vec0 metadata columns reject NULL (asg017/sqlite-vec#141): coerce
+        # every value to a string, with "" as the absent sentinel.
+        document_id = node.ref_doc_id or node.metadata.get("document_id")
+        return (
+            node.node_id,
+            str(document_id or ""),
+            str(node.metadata.get("modified") or ""),
+            json.dumps(meta),
+            _pack(node.get_embedding()),
+        )
+
+    _INSERT = "INSERT INTO {t} (id, document_id, modified, node_content, embedding) VALUES (?, ?, ?, ?, ?)"
+
+    def add(self, nodes: Sequence[BaseNode], **add_kwargs: Any) -> list[str]:
+        """Insert ``nodes`` in one transaction and return their node ids.
+
+        The table is created on first use with the dimension of the first
+        node's embedding. An empty ``nodes`` is a no-op.
+        """
+        if not nodes:
+            return []
+        rows = [self._row(node) for node in nodes]
+        with self._transaction():
+            self._ensure_table(len(nodes[0].get_embedding()))
+            self._conn.executemany(self._INSERT.format(t=self._table_name), rows)
+        return [node.node_id for node in nodes]
+
+    def upsert_document(self, document_id: str, nodes: list[BaseNode]) -> list[str]:
+        """Atomically replace all stored chunks of ``document_id`` with ``nodes``.
+
+        One transaction deletes the document's existing rows and inserts the
+        new set (vec0's INSERT OR REPLACE is broken upstream, #259, so
+        delete+insert it is). WAL readers in other processes see either the
+        old or the new chunk set, never a partial state.
+        """
+        rows = [self._row(node) for node in nodes]
+        with self._transaction():
+            if nodes:
+                self._ensure_table(len(nodes[0].get_embedding()))
+            if self.table_exists():
+                self._conn.execute(
+                    f"DELETE FROM {self._table_name} WHERE document_id = ?",
+                    (str(document_id),),
+                )
+            if rows:
+                self._conn.executemany(self._INSERT.format(t=self._table_name), rows)
+        return [node.node_id for node in nodes]
+
+    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+        """Delete every chunk whose document id equals ``ref_doc_id``."""
+        if self.table_exists():
+            with self._transaction():
+                self._conn.execute(
+                    f"DELETE FROM {self._table_name} WHERE document_id = ?",
+                    (str(ref_doc_id),),
+                )
+
+    def _rows_to_nodes(self, rows: list[sqlite3.Row]) -> list[BaseNode]:
+        """Rebuild nodes from rows carrying ``node_content`` and ``embedding``."""
+        nodes: list[BaseNode] = []
+        for row in rows:
+            node = metadata_dict_to_node(json.loads(row["node_content"]))
+            node.embedding = _unpack(row["embedding"])
+            nodes.append(node)
+        return nodes
+
+    def get_nodes(
+        self,
+        node_ids: list[str] | None = None,
+        filters: MetadataFilters | None = None,
+        **kwargs: Any,
+    ) -> list[BaseNode]:
+        """Return stored nodes matching ``filters`` (all nodes when None).
+
+        Raises NotImplementedError when ``node_ids`` is given.
+        """
+        if node_ids is not None:  # pragma: no cover
+            # node_ids lookup is not implemented; see class docstring.
+            raise NotImplementedError(
+                "PaperlessSqliteVecVectorStore does not support node_ids lookup",
+            )
+        if not self.table_exists():
+            return []
+        where, params = _build_where(filters)
+        sql = f"SELECT node_content, embedding FROM {self._table_name}"
+        if where:
+            sql += f" WHERE {where}"
+        return self._rows_to_nodes(self._conn.execute(sql, params).fetchall())
+
+    def query(
+        self,
+        query: VectorStoreQuery,
+        **kwargs: Any,
+    ) -> VectorStoreQueryResult:
+        """Run a KNN search, optionally restricted by metadata filters.
+
+        Returns an empty result when no table exists yet. Raises ValueError
+        when the table exists but ``query.query_embedding`` is missing.
+        """
+        if not self.table_exists():
+            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+        if query.query_embedding is None:
+            raise ValueError("query_embedding is required for a vector query")
+        top_k = query.similarity_top_k if query.similarity_top_k is not None else 10
+        where, params = _build_where(query.filters)
+        sql = (
+            f"SELECT id, node_content, embedding, distance FROM {self._table_name} "
+            "WHERE embedding MATCH ? AND k = ?"
+        )
+        if where:
+            sql += f" AND {where}"
+        rows = self._conn.execute(
+            sql,
+            [_pack(query.query_embedding), top_k, *params],
+        ).fetchall()
+        # vec0 returns rows distance-sorted ascending; slice defensively in
+        # case future schema changes alter k semantics (e.g. partition keys
+        # return k rows per partition).
+        rows = rows[:top_k]
+        nodes = self._rows_to_nodes(rows)
+        # Cosine distance in [0, 2]; map to a descending similarity.
+        # NOTE(review): a zero-norm vector makes cosine distance undefined and
+        # SQLite stores NaN as NULL; treat a missing distance as similarity
+        # 0.0 instead of raising -- confirm against vec0's actual behaviour.
+        sims = [
+            1.0 - float(row["distance"]) if row["distance"] is not None else 0.0
+            for row in rows
+        ]
+        ids = [row["id"] for row in rows]
+        return VectorStoreQueryResult(nodes=nodes, similarities=sims, ids=ids)
+
+    def get_modified_times(self) -> dict[str, str]:
+        """Return {document_id: stored_modified_isoformat} for all indexed documents.
+
+        All chunks of a document share the same ``modified`` value, so the
+        first row seen per document is sufficient.
+        """
+        if not self.table_exists():
+            return {}
+        result: dict[str, str] = {}
+        for row in self._conn.execute(
+            f"SELECT document_id, modified FROM {self._table_name}",
+        ):
+            doc_id = str(row["document_id"])
+            if doc_id not in result:
+                result[doc_id] = str(row["modified"] or "")
+        return result
+
+    def compact(self, *, force: bool = False) -> None:
+        """Rebuild the table to reclaim space left behind by DELETEs.
+
+        vec0 DELETE only invalidates rows; the vector data stays in the file
+        forever (asg017/sqlite-vec#54), and per-document re-indexing is a
+        delete+insert. When the cumulative rowid count exceeds
+        ``COMPACT_BLOAT_RATIO`` x the live row count (or when forced), copy
+        the live rows into a fresh table, swap it in, and VACUUM.
+
+        The rebuild leaves ``index_meta`` untouched: the recorded dimension
+        and embedding model describe the vectors being copied, not this
+        instance's current configuration, so ``config_mismatch()`` keeps
+        reporting correctly after a compaction.
+        """
+        if not self.table_exists():
+            return
+        live = self._conn.execute(
+            f"SELECT count(*) FROM {self._table_name}",
+        ).fetchone()[0]
+        total = self._conn.execute(
+            f"SELECT count(*) FROM {self._table_name}_rowids",
+        ).fetchone()[0]
+        if not force and total <= max(live, 1) * COMPACT_BLOAT_RATIO:
+            return
+        dim = self.vector_dim()
+        if dim is None:  # pragma: no cover - dim is written at creation
+            logger.warning("Skipping compact: no stored vector dimension")
+            return
+        logger.info(
+            "Compacting LLM index (%d live rows, %d cumulative)",
+            live,
+            total,
+        )
+        original, tmp = self._table_name, f"{self._table_name}_compact"
+        with self._transaction():
+            # Clear any leftover from an earlier, interrupted compaction.
+            self._conn.execute(f"DROP TABLE IF EXISTS {tmp}")
+            self._create_vec_table(tmp, dim)
+            self._conn.execute(
+                f"INSERT INTO {tmp} (id, document_id, modified, node_content, embedding) "
+                f"SELECT id, document_id, modified, node_content, embedding FROM {original}",
+            )
+            self._conn.execute(f"DROP TABLE {original}")
+            self._conn.execute(f"ALTER TABLE {tmp} RENAME TO {original}")
+        # VACUUM cannot run inside a transaction, so it follows the commit.
+        self._conn.execute("VACUUM")
+```
+
+- [ ] **Step 5: Run the vector store tests**
+
+```bash
+cd src && uv run pytest paperless_ai/tests/test_vector_store.py --override-ini="addopts=" 2>&1 | tail -5
+```
+
+Expected: all PASS. Debugging notes for likely failures:
+
+- `OperationalError: no such module: vec0` -> the extension did not load; check `sqlite_vec.load` ordering against `enable_load_extension`.
+- `UNIQUE constraint failed on t primary key` in upsert tests -> the DELETE did not run inside the same transaction before the INSERT.
+- A failure in `test_force_compact_preserves_rows_and_metadata` around `ALTER TABLE ... RENAME` would mean vec0 0.1.9's rename path misbehaves for this schema (the 0.1.10 alphas fixed rename bugs for non-FLAT tables; FLAT tables are expected to work).
Fallback design if that happens: rebuild into a brand-new database file (`llmindex.db.compact`), checkpoint, `os.replace` onto `llmindex.db`, then reconnect; implement that instead and keep the same tests. + +- [ ] **Step 6: Commit** + +```bash +git add src/paperless_ai/vector_store.py src/bench_vector_store.py +git commit -m "Enhancement(beta): switch AI vector store from LanceDB to sqlite-vec + +Fixes the non-AVX2 SIGILL class (#12970) at the root: lancedb is no +longer imported. sqlite-vec 0.1.9 wheels carry no baked SIMD, vec0 +metadata columns give parameterized EQ/IN filtering, WAL preserves the +lock-free-reader model, and compact() rebuilds the table because vec0 +DELETEs never reclaim space. + +Co-Authored-By: Claude Fable 5 " +``` + +--- + +### Task 4: Switch indexing.py to the new store + +**Files:** + +- Modify: `src/paperless_ai/indexing.py` + +- [ ] **Step 1: Update imports/type hints and construction sites** + +In `src/paperless_ai/indexing.py`, replace every `PaperlessLanceVectorStore` with `PaperlessSqliteVecVectorStore` (TYPE_CHECKING import at ~line 23, `get_vector_store()` body and return annotation at ~lines 65-72, `write_store()` body at ~lines 86-94). The `uri=str(settings.LLM_INDEX_DIR)` argument and `LLM_INDEX_TABLE = "documents"` stay as they are. + +- [ ] **Step 2: Add the legacy-Lance cleanup helper** + +Add `import shutil` to the imports, then below `get_vector_store()`: + +```python +def _cleanup_legacy_lance_index() -> bool: + """Delete a LanceDB index left by a pre-sqlite-vec version, if present. + + Beta transition policy: no cross-store conversion; the caller forces a + full rebuild (re-embed) instead. Returns True when leftovers were found. 
+ """ + legacy_table = settings.LLM_INDEX_DIR / f"{LLM_INDEX_TABLE}.lance" + found = legacy_table.exists() + if found: + shutil.rmtree(legacy_table, ignore_errors=True) + # faiss-era metadata file, removed on the same occasion + (settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True) + return found +``` + +- [ ] **Step 3: Wire it into update_llm_index and drop the Lance-only maintenance calls** + +In `update_llm_index()`: + +1. At the very top of the function body (before the `documents = ...` line), add: + +```python + if _cleanup_legacy_lance_index(): + logger.warning( + "Found a LanceDB index from a previous version; forcing a full rebuild.", + ) + rebuild = True +``` + +2. In the rebuild branch, delete the now-redundant line `(settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True)` (the helper handles it). + +3. At the end of the `with write_store(...)` block, replace: + +```python + store.ensure_document_id_scalar_index() + store.maybe_create_ann_index() + store.compact(retention_seconds=60 * 60) # 1 hour: safe for in-flight readers +``` + +with: + +```python + store.compact() +``` + +(`compact()` is now threshold-gated and rebuild-based; WAL snapshot isolation protects in-flight readers, so no retention window is needed.) + +- [ ] **Step 4: Update the other two call sites** + +In `llm_index_add_or_update_document()`, delete the `store.ensure_document_id_scalar_index()` line. 
In `llm_index_compact()`, change `store.compact(retention_seconds=0)` to `store.compact(force=True)` and update its docstring to: `"""Compact the index immediately, rebuilding the table to reclaim space."""` + +- [ ] **Step 5: Run the indexing tests, expect failures only in Lance-specific assertions** + +```bash +cd src && uv run pytest paperless_ai/tests/test_ai_indexing.py --override-ini="addopts=" 2>&1 | tail -15 +``` + +Expected: most tests pass; failures concentrated where tests reach into Lance internals (`isinstance` check ~line 690, direct-table row counts ~lines 482-508). Those are fixed next. + +- [ ] **Step 6: Port the Lance-specific test assertions** + +In `src/paperless_ai/tests/test_ai_indexing.py`: + +1. Rename `class TestLanceDbIndexing` to `class TestVectorStoreIndexing` and change its isinstance assertion (~lines 687-690) to: + +```python + from paperless_ai.vector_store import PaperlessSqliteVecVectorStore + + store = indexing.get_vector_store() + assert isinstance(store, PaperlessSqliteVecVectorStore) +``` + +2. Find the direct-row-count assertions (`rg -n "to_list|count_rows|open_table" src/paperless_ai/tests/test_ai_indexing.py`). Replace each direct Lance table read with the store's own connection, e.g. a zero-rows assertion becomes: + +```python + store = indexing.get_vector_store() + assert not store.table_exists() or ( + store.client.execute("SELECT count(*) FROM documents").fetchone()[0] == 0 + ) +``` + +and a "table exists with N rows" precondition becomes: + +```python + store = indexing.get_vector_store() + assert store.table_exists() + assert store.client.execute("SELECT count(*) FROM documents").fetchone()[0] > 0 +``` + +3. Update the docstring/comment mentions of "LanceDB" in this file (`rg -n "Lance" ...`) to "the vector store" or "sqlite-vec" as reads naturally; do not change test logic beyond the direct-access ports. 
+ +- [ ] **Step 7: Run indexing + chat tests** + +```bash +cd src && uv run pytest paperless_ai/tests/test_ai_indexing.py paperless_ai/tests/test_chat.py --override-ini="addopts=" 2>&1 | tail -5 +``` + +Expected: all PASS (`chat.py` only consumes `load_or_build_index()`, no direct store APIs; verify with `rg -n "vector_store|lancedb" src/paperless_ai/chat.py` -> no hits). + +- [ ] **Step 8: Commit** + +```bash +git add src/paperless_ai/indexing.py src/paperless_ai/tests/test_ai_indexing.py +git commit -m "Enhancement(beta): wire indexing pipeline to the sqlite-vec store + +Co-Authored-By: Claude Fable 5 " +``` + +--- + +### Task 5: Move Filename / Storage Path / ASN from embedded text to node metadata + +**Files:** + +- Modify: `src/paperless_ai/embedding.py` (`build_llm_index_text`, ~lines 114-128) +- Modify: `src/paperless_ai/indexing.py` (`build_document_node` metadata dict, ~lines 106-119) +- Modify: `src/paperless_ai/tests/test_embedding.py`, `src/paperless_ai/tests/test_ai_indexing.py` + +Resolves the `embedding.py:115` TODO. These three short structured values get the same treatment title/tags/correspondent/document_type received in PR #12944: excluded from the embedded text (they add noise, not semantic signal) but visible to the LLM via llama-index's metadata prepend. Notes and Custom Fields deliberately stay in the body (long free text / dynamic count). This changes every document's embedded text, which is exactly why it ships inside this rebuild-everything transition instead of later. + +- [ ] **Step 1: Update the embedding-text test expectations** + +In `src/paperless_ai/tests/test_embedding.py`, find the test asserting body-text content (~lines 227-238, the blocks commented "Structured fields live in node.metadata..." / "Fields without a metadata equivalent stay in body text"). 
Move the three fields to the excluded group: + +```python + # Structured fields live in node.metadata for LLM context — not body text + assert "Title: Test Title" not in result + assert "Created: 2023-01-01" not in result + assert "Tags: Tag1, Tag2" not in result + assert "Document Type: Invoice" not in result + assert "Correspondent: Test Correspondent" not in result + assert "Filename:" not in result + assert "Storage Path:" not in result + assert "Archive Serial Number:" not in result + + # Fields without a metadata equivalent stay in body text + assert "Notes: Note1,Note2" in result + assert "Content:\n\nThis is the document content." in result + assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result +``` + +- [ ] **Step 2: Add node-metadata expectations** + +In `src/paperless_ai/tests/test_ai_indexing.py`, find the `build_document_node` test asserting `nodes[0].metadata["document_id"]` (~line 35) and extend it: + +```python + assert nodes[0].metadata["filename"] == real_document.filename + assert nodes[0].metadata["storage_path"] == ( + real_document.storage_path.name if real_document.storage_path else None + ) + assert ( + nodes[0].metadata["archive_serial_number"] + == real_document.archive_serial_number + ) + assert "filename" in nodes[0].excluded_embed_metadata_keys + assert "filename" not in nodes[0].excluded_llm_metadata_keys +``` + +(Check the `real_document` fixture's actual attribute values first and mirror them; if it sets none of these, the assertions above still hold with `None` values, which is the point: absent values are `None` in node metadata, consistent with the existing `correspondent`/`document_type` convention.) 
+ +- [ ] **Step 3: Run both test files to verify the new assertions fail** + +```bash +cd src && uv run pytest paperless_ai/tests/test_embedding.py paperless_ai/tests/test_ai_indexing.py --override-ini="addopts=" 2>&1 | tail -5 +``` + +Expected: FAIL on the new assertions only ("Filename:" still in result; KeyError "filename" in metadata). + +- [ ] **Step 4: Implement** + +In `src/paperless_ai/embedding.py` `build_llm_index_text()`, delete these three lines from the `lines` list: + +```python + f"Filename: {doc.filename}", + f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}", + f"Archive Serial Number: {doc.archive_serial_number or ''}", +``` + +and replace the TODO comment above the list with: + +```python + # Short structured fields (filename, storage path, ASN, title, tags, ...) live + # in node.metadata: excluded from embeddings, shown to the LLM via metadata + # prepend. Notes and Custom Fields stay in the body: Notes can be long free + # text, Custom Fields are dynamic in count and best kept in the embedding. +``` + +In `src/paperless_ai/indexing.py` `build_document_node()`, extend the metadata dict (after the `"document_type"` entry): + +```python + "filename": document.filename, + "storage_path": document.storage_path.name if document.storage_path else None, + "archive_serial_number": document.archive_serial_number, +``` + +(`None`/int values are fine here: this dict is serialized into the node-content JSON, not into vec0 metadata columns; only `document_id` and `modified` are columns with the NULL restriction. `excluded_embed_metadata_keys=list(metadata.keys())` already covers the new keys; `excluded_llm_metadata_keys` stays `["document_id"]`.) + +- [ ] **Step 5: Run the tests again** + +```bash +cd src && uv run pytest paperless_ai/tests/test_embedding.py paperless_ai/tests/test_ai_indexing.py --override-ini="addopts=" 2>&1 | tail -5 +``` + +Expected: all PASS. 
+ +- [ ] **Step 6: Commit** + +```bash +git add src/paperless_ai/embedding.py src/paperless_ai/indexing.py src/paperless_ai/tests/test_embedding.py src/paperless_ai/tests/test_ai_indexing.py +git commit -m "Enhancement(beta): move filename/storage path/ASN to node metadata + +Same treatment as title/tags/correspondent in #12944: excluded from +the embedded text, visible to the LLM via metadata prepend. Changes +embedded text for every document, so it ships inside the sqlite-vec +transition, whose forced rebuild re-embeds everything anyway. + +Co-Authored-By: Claude Fable 5 " +``` + +--- + +### Task 6: Legacy Lance index transition tests + +**Files:** + +- Create: `src/paperless_ai/tests/test_legacy_lance_cleanup.py` + +Per project convention, the transition behavior gets its own dedicated test file. + +- [ ] **Step 1: Write the tests** + +```python +from pathlib import Path + +import pytest + +from documents.models import Document +from paperless_ai import indexing + + +@pytest.fixture +def legacy_lance_dir(temp_llm_index_dir: Path) -> Path: + """Simulate leftovers of a pre-sqlite-vec LanceDB index.""" + lance_table = temp_llm_index_dir / "documents.lance" + (lance_table / "data").mkdir(parents=True) + (lance_table / "data" / "0000.lance").write_bytes(b"not a real lance file") + (temp_llm_index_dir / "meta.json").write_text("{}") + return lance_table + + +@pytest.mark.django_db +class TestLegacyLanceCleanup: + def test_update_removes_legacy_dir_and_forces_rebuild( + self, + legacy_lance_dir: Path, + temp_llm_index_dir: Path, + mock_embed_model, + document_factory, + caplog: pytest.LogCaptureFixture, + ) -> None: + document_factory(title="doc a", content="first document") + indexing.update_llm_index(rebuild=False) + assert not legacy_lance_dir.exists() + assert not (temp_llm_index_dir / "meta.json").exists() + assert "forcing a full rebuild" in caplog.text + store = indexing.get_vector_store() + assert store.table_exists() + + def 
test_update_without_legacy_dir_does_not_force_rebuild( + self, + temp_llm_index_dir: Path, + mock_embed_model, + document_factory, + caplog: pytest.LogCaptureFixture, + ) -> None: + document_factory(title="doc a", content="first document") + indexing.update_llm_index(rebuild=False) + caplog.clear() + indexing.update_llm_index(rebuild=False) + assert "forcing a full rebuild" not in caplog.text + + def test_cleanup_helper_reports_absence(self, temp_llm_index_dir: Path) -> None: + assert indexing._cleanup_legacy_lance_index() is False # noqa: SLF001 +``` + +Note: check `src/paperless_ai/tests/conftest.py` and `src/documents/tests/conftest.py` for the existing document-creation fixture name before running — `rg -n "document_factory|def make_document" src/paperless_ai/tests/ src/documents/tests/conftest.py`. If the AI tests build documents differently (e.g. direct `Document.objects.create(...)` with `checksum`/`title`), mirror that exact pattern here instead of `document_factory`; `test_ai_indexing.py` is the reference for how this app's tests create documents and invoke `update_llm_index`. + +- [ ] **Step 2: Run them** + +```bash +cd src && uv run pytest paperless_ai/tests/test_legacy_lance_cleanup.py --override-ini="addopts=" 2>&1 | tail -5 +``` + +Expected: all PASS (the implementation landed in Task 4; this task is the dedicated coverage for it). + +- [ ] **Step 3: Commit** + +```bash +git add src/paperless_ai/tests/test_legacy_lance_cleanup.py +git commit -m "Test(beta): cover legacy LanceDB index cleanup and forced rebuild + +Co-Authored-By: Claude Fable 5 " +``` + +--- + +### Task 7: Remove lancedb (and check pyarrow), update lazy-import guard + +**Files:** + +- Modify: `pyproject.toml`, `uv.lock`, `src/paperless_ai/tests/test_lazy_imports.py` + +- [ ] **Step 1: Confirm nothing references lancedb anymore** + +```bash +rg -ln "lancedb" src/ --iglob '!**/test_lazy_imports.py' +``` + +Expected: no hits. If any remain, fix them before proceeding. 
+ +- [ ] **Step 2: Remove the dependency** + +```bash +uv remove lancedb +``` + +- [ ] **Step 3: Check whether pyarrow is still needed** + +```bash +rg -n "pyarrow" pyproject.toml src/ --iglob '!uv.lock' +uv pip list 2>/dev/null | rg -i pyarrow +``` + +If `pyproject.toml` lists pyarrow as a direct dependency and the only `src/` references are the lazy-import test string, run `uv remove pyarrow`. If pyarrow remains in `uv.lock` as a transitive dependency of something else, leave it; the lazy-import test still guards against it leaking into the light path. + +- [ ] **Step 4: Update the lazy-import leak list** + +In `src/paperless_ai/tests/test_lazy_imports.py`, change the leak list line to: + +```python + "leaked = [m for m in ('lancedb', 'pyarrow', 'llama_index', 'sqlite_vec') " +``` + +(Keeping `lancedb`/`pyarrow` in the list is free: absent packages can never appear in `sys.modules`, and the guard survives any accidental reintroduction.) + +- [ ] **Step 5: Run the lazy import test and the full AI app suite** + +```bash +cd src && uv run pytest paperless_ai/tests/test_lazy_imports.py --override-ini="addopts=" 2>&1 | tail -3 +cd src && uv run pytest paperless_ai/ --override-ini="addopts=" 2>&1 | tail -5 +``` + +Expected: all PASS. + +- [ ] **Step 6: Commit** + +```bash +git add pyproject.toml uv.lock src/paperless_ai/tests/test_lazy_imports.py +git commit -m "Chore(beta): drop lancedb dependency + +Fixes #12970: the package whose wheels SIGILL on non-AVX2 CPUs is no +longer installed at all. + +Co-Authored-By: Claude Fable 5 " +``` + +--- + +### Task 8: Full verification sweep + +**Files:** none (verification only) + +- [ ] **Step 1: Management command tests and the wider documents suite** + +```bash +cd src && uv run pytest documents/tests/management/test_management_document_llmindex.py --override-ini="addopts=" 2>&1 | tail -5 +cd src && uv run pytest documents/ paperless_ai/ -n auto 2>&1 | tail -5 +``` + +Expected: all PASS. 
The llmindex command tests exercise `rebuild|update|compact` through `indexing.py`; if a compact test asserted Lance-version behavior, port the assertion to the new semantics (file shrinks / table intact), keeping the test's intent. + +- [ ] **Step 2: Lint** + +```bash +prek run --files $(git diff --name-only beta...HEAD | tr '\n' ' ') +``` + +Expected: clean (or auto-fixed; re-add and amend if prek rewrites files). + +- [ ] **Step 3: The point of it all — ISA verification** + +If qemu-user is available (`which qemu-x86_64`), run the smoke check that the AI path no longer imports anything AVX2-baked: + +```bash +cd src && uv run python -c "import sys; print(sys.executable)" +# then, using that interpreter path (substitute it for /path/printed/above/python): +qemu-x86_64 -cpu Westmere /path/printed/above/python -c " +import sqlite3, sqlite_vec, struct +db = sqlite3.connect(':memory:') +db.enable_load_extension(True) +sqlite_vec.load(db) +db.execute('create virtual table v using vec0(embedding float[384])') +db.execute('insert into v(rowid, embedding) values (1, ?)', (struct.pack('384f', *([0.5]*384)),)) +print(db.execute('select rowid from v where embedding match ? and k = 1', (struct.pack('384f', *([0.4]*384)),)).fetchone()) +print('OK: sqlite-vec works on a pre-AVX2 CPU') +" +``` + +Expected: `(1,)` then the OK line, exit 0. + +- [ ] **Step 4: Update project memory** + +Append to the memory file `project_vector_store_alternatives.md`: transition implemented on branch `feature-sqlitevec-vector-store`, lancedb removed, and any gotchas discovered during implementation (especially if the compact rename fallback from Task 3 Step 2 was needed). + +- [ ] **Step 5: Final commit if anything moved, then summarize the branch** + +```bash +git log --oneline beta..HEAD +git diff --stat beta...HEAD +``` + +Hand the branch to the user for PR creation (PRs are the user's call; do not push or open one unprompted). 
+ +--- + +## Self-review checklist (already applied) + +- Spec coverage: every section of the design doc maps to a task (pin+canary -> Task 1; schema/store -> Tasks 2-3; indexing wiring + compact semantics -> Task 4; metadata restructure -> Task 5; migration-from-Lance -> Tasks 4 and 6; dependency changes -> Task 7; test plan + ISA check -> Tasks 2, 6, 8). Deliberately out of scope per the user: schema-migration machinery (second spec, lands after this branch with an empty registry). +- The `_INSERT` class attribute is defined in Task 3 before both uses; `DB_FILENAME` is exported and imported by the Task 2 tests; `compact(force=...)` keyword matches between store (Task 3) and indexing (Task 4); `_cleanup_legacy_lance_index` is defined in Task 4 and referenced by the Task 6 tests. +- Known soft spots called out inline rather than hidden: the `document_factory` fixture name (Task 6 Step 1 note), possible llmindex-command test assertions (Task 8 Step 1), and the vec0 ALTER RENAME fallback (Task 3 Step 2). +- `query()` slices to `top_k` defensively even though metadata-column `k` is already global, and `_build_where` allowlists filter columns, so user data never reaches SQL identifiers; values always travel as bound parameters. diff --git a/docs/superpowers/plans/2026-06-11-unicode-nfc-normalization.md b/docs/superpowers/plans/2026-06-11-unicode-nfc-normalization.md new file mode 100644 index 000000000..81c3ed817 --- /dev/null +++ b/docs/superpowers/plans/2026-06-11-unicode-nfc-normalization.md @@ -0,0 +1,462 @@ +# Unicode NFC Normalization for Filesystem Paths Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Ensure all filesystem paths stored in the database and written to disk use NFC Unicode normalization, preventing "file not found" failures caused by byte-level mismatches between visually identical filenames (e.g., NFD `ü` = `u + combining diaeresis` vs NFC `ü` = single codepoint U+00FC). + +**Architecture:** The fix has two layers. The primary fix normalizes the output of `clean_filepath()` in `FilePathTemplate.render()` — this is the single choke point through which all template-rendered filenames pass. Defense-in-depth changes normalize input strings before `pathvalidate.sanitize_filename()` in the context builder functions. A separate fix normalizes mail attachment filenames at the entry point. Existing documents with NFD paths will be transparently migrated to NFC on their next save (the file move logic already handles the case where old and new paths differ). + +**Tech Stack:** Python `unicodedata.normalize('NFC', ...)`, `pathvalidate`, Django, Jinja2, pytest + +--- + +## Background: The Bug + +`pathvalidate.sanitize_filename()` removes illegal filesystem characters but does **not** normalize Unicode. NFC `ü` (UTF-8: `c3 bc`) and NFD `ü` (UTF-8: `75 cc 88`) are visually identical but produce different byte sequences. On Linux filesystems with no normalization (default ZFS, ext4), these are treated as distinct filenames. If an LLM or OCR engine produces NFD text for a document title, the generated filesystem path contains NFD bytes. If the same title is later regenerated in NFC form (LLM output is non-deterministic), the path lookup fails: `old_source_path.is_file()` returns `False` even though a file with the same visual name exists on disk. 
+ +## File Structure + +| File | Change | +| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `src/documents/templating/filepath.py` | Add NFC normalization in `clean_filepath()` (primary fix) + input normalization in `get_basic_metadata_context()`, `get_tags_context()`, `get_custom_fields_context()` (defense-in-depth) | +| `src/paperless_mail/mail.py` | Normalize attachment filenames before `pathvalidate.sanitize_filename()` | +| `src/documents/tests/test_file_handling.py` | Tests for NFC normalization in `generate_filename()` | +| `src/paperless_mail/tests/test_mail.py` | Tests for NFC normalization in mail attachment handling | + +--- + +## Task 1: Normalize `clean_filepath()` output (primary fix) + +This is the single choke point. ALL template-rendered paths pass through `clean_filepath()` before being stored in `document.filename`. Fixing this alone prevents the bug for every path generated via the filename format system — including `{{ title }}` (sanitized context), `{{ document.title }}` (raw context), `{{ correspondent }}`, and every other template variable. 
+ +**Files:** + +- Modify: `src/documents/templating/filepath.py:36-48` +- Test: `src/documents/tests/test_file_handling.py` + +- [ ] **Step 1: Write failing tests** + +Add these tests to `src/documents/tests/test_file_handling.py`, inside `class TestFileHandling`: + +```python +import unicodedata + +@override_settings(FILENAME_FORMAT="{{ title }}") +def test_generate_filename_nfc_normalizes_nfd_title(self) -> None: + """NFD title (u + combining diaeresis) must produce NFC path bytes.""" + nfd_title = unicodedata.normalize("NFD", "Gemüse") + nfc_title = unicodedata.normalize("NFC", "Gemüse") + assert nfd_title != nfc_title # confirm inputs differ at byte level + + doc = Document.objects.create(title=nfd_title, mime_type="application/pdf") + result = generate_filename(doc) + + assert str(result) == f"{nfc_title}.pdf" + assert str(result).encode() == f"{nfc_title}.pdf".encode() + +@override_settings(FILENAME_FORMAT="{{ correspondent }}/{{ title }}") +def test_generate_filename_nfc_normalizes_nfd_correspondent(self) -> None: + """NFD correspondent name must produce NFC path component.""" + nfd_name = unicodedata.normalize("NFD", "Müller") + nfc_name = unicodedata.normalize("NFC", "Müller") + + correspondent = Correspondent.objects.create(name=nfd_name) + doc = Document.objects.create( + title="invoice", + correspondent=correspondent, + mime_type="application/pdf", + ) + result = generate_filename(doc) + + assert str(result) == f"{nfc_name}/invoice.pdf" + assert str(result).encode() == f"{nfc_name}/invoice.pdf".encode() + +@override_settings(FILENAME_FORMAT="{{ document.title }}") +def test_generate_filename_nfc_normalizes_raw_document_title_in_template(self) -> None: + """NFD title accessed via document.title (unsanitized context) must also be NFC.""" + nfd_title = unicodedata.normalize("NFD", "Café") + nfc_title = unicodedata.normalize("NFC", "Café") + + doc = Document.objects.create(title=nfd_title, mime_type="application/pdf") + result = generate_filename(doc) + + 
assert str(result) == f"{nfc_title}.pdf" + assert str(result).encode() == f"{nfc_title}.pdf".encode() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +uv run pytest --override-ini="addopts=" src/documents/tests/test_file_handling.py::TestFileHandling::test_generate_filename_nfc_normalizes_nfd_title src/documents/tests/test_file_handling.py::TestFileHandling::test_generate_filename_nfc_normalizes_nfd_correspondent src/documents/tests/test_file_handling.py::TestFileHandling::test_generate_filename_nfc_normalizes_raw_document_title_in_template -v +``` + +Expected: all three FAIL (NFD title produces NFD path, assertion fails). + +- [ ] **Step 3: Add NFC normalization to `clean_filepath()`** + +In `src/documents/templating/filepath.py`, add `import unicodedata` at the top of the file and modify `clean_filepath()`: + +```python +import unicodedata # add to top-of-file imports + +class FilePathTemplate(Template): + def render(self, *args, **kwargs) -> str: + def clean_filepath(value: str) -> str: + """ + Clean up a filepath by: + 1. Normalizing to NFC Unicode form to prevent byte-level mismatches + between visually identical filenames on filesystems that do not + normalize Unicode themselves + 2. Removing newlines and carriage returns + 3. Removing extra spaces before and after forward slashes + 4. 
Preserving spaces in other parts of the path + """ + value = unicodedata.normalize("NFC", value) + value = value.replace("\n", "").replace("\r", "") + value = re.sub(r"\s*/\s*", "/", value) + + # We remove trailing and leading separators, as these are always relative paths, not absolute, even if the user + # tries + return value.strip().strip(os.sep) + + original_render = super().render(*args, **kwargs) + + return clean_filepath(original_render) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +uv run pytest --override-ini="addopts=" src/documents/tests/test_file_handling.py::TestFileHandling::test_generate_filename_nfc_normalizes_nfd_title src/documents/tests/test_file_handling.py::TestFileHandling::test_generate_filename_nfc_normalizes_nfd_correspondent src/documents/tests/test_file_handling.py::TestFileHandling::test_generate_filename_nfc_normalizes_raw_document_title_in_template -v +``` + +Expected: all three PASS. + +- [ ] **Step 5: Run the full file-handling test suite to check for regressions** + +```bash +uv run pytest --override-ini="addopts=" src/documents/tests/test_file_handling.py -v +``` + +Expected: all existing tests continue to pass (ASCII titles are unaffected by NFC normalization). + +- [ ] **Step 6: Commit** + +```bash +git add src/documents/templating/filepath.py src/documents/tests/test_file_handling.py +git commit -m "Fix: normalize filesystem paths to NFC Unicode to prevent byte-level mismatches" +``` + +--- + +## Task 2: Defense-in-depth normalization in context builders + +`clean_filepath()` (Task 1) fixes the rendered path. These changes normalize the input strings that go into `pathvalidate.sanitize_filename()` within the context builders — belt-and-suspenders so the sanitized shorthand variables (`{{ title }}`, `{{ correspondent }}`, `{{ tag_list }}`, `{{ custom_fields }}`) are also NFC before sanitization. 
This matters because the sanitized strings could theoretically be compared directly against DB-stored values in other contexts. + +**Files:** + +- Modify: `src/documents/templating/filepath.py:171-319` +- Test: `src/documents/tests/test_file_handling.py` + +- [ ] **Step 1: Write failing tests** + +Add these tests to `TestFileHandling` in `src/documents/tests/test_file_handling.py`: + +```python +@override_settings(FILENAME_FORMAT="{{ tag_list }}/{{ title }}") +def test_generate_filename_nfc_normalizes_nfd_tag_list(self) -> None: + """NFD tag names must produce NFC path component in tag_list.""" + nfd_name = unicodedata.normalize("NFD", "Büro") + nfc_name = unicodedata.normalize("NFC", "Büro") + + doc = Document.objects.create(title="doc", mime_type="application/pdf") + doc.tags.create(name=nfd_name) + result = generate_filename(doc) + + assert str(result) == f"{nfc_name}/doc.pdf" + assert str(result).encode() == f"{nfc_name}/doc.pdf".encode() +``` + +- [ ] **Step 2: Run test to verify it fails** + +```bash +uv run pytest --override-ini="addopts=" src/documents/tests/test_file_handling.py::TestFileHandling::test_generate_filename_nfc_normalizes_nfd_tag_list -v +``` + +Expected: FAIL only on a tree without Task 1. (Once Task 1 has landed, the rendered `tag_list` is already normalized by `clean_filepath()`, so this test cannot by itself tell input normalization apart from output normalization; it pins the NFC result for tag names.) + +Note: this test may already pass after Task 1 due to `clean_filepath()`. If so, keep the test as a regression guard and move straight to the implementation. + +- [ ] **Step 3: Normalize inputs in `get_basic_metadata_context()`** + +In `src/documents/templating/filepath.py`, update `get_basic_metadata_context()`. The `unicodedata` import was added in Task 1. + +```python +def get_basic_metadata_context( + document: Document, + *, + no_value_default: str = NO_VALUE_PLACEHOLDER, +) -> dict[str, str]: + """ + Given a Document, constructs some basic information about it. 
If certain values are not set, + they will be replaced with the no_value_default. + + Regardless of set or not, the values will be sanitized + """ + return { + "title": pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", document.title), + replacement_text="-", + ), + "correspondent": pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", document.correspondent.name), + replacement_text="-", + ) + if document.correspondent + else no_value_default, + "document_type": pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", document.document_type.name), + replacement_text="-", + ) + if document.document_type + else no_value_default, + "asn": str(document.archive_serial_number) + if document.archive_serial_number + else no_value_default, + "owner_username": document.owner.username + if document.owner + else no_value_default, + "original_name": PurePath(document.original_filename).with_suffix("").name + if document.original_filename + else no_value_default, + "doc_pk": f"{document.pk:07}", + } +``` + +- [ ] **Step 4: Normalize inputs in `get_tags_context()`** + +Update `get_tags_context()` in the same file: + +```python +def get_tags_context(tags: Iterable[Tag]) -> dict[str, str | list[str]]: + """ + Given an Iterable of tags, constructs some context from them for usage + """ + return { + "tag_list": pathvalidate.sanitize_filename( + ",".join( + sorted(unicodedata.normalize("NFC", tag.name) for tag in tags), + ), + replacement_text="-", + ), + # Assumed to be ordered, but a template could loop through to find what they want + "tag_name_list": [unicodedata.normalize("NFC", x.name) for x in tags], + } +``` + +- [ ] **Step 5: Normalize string-type inputs in `get_custom_fields_context()`** + +Update `get_custom_fields_context()` in the same file. Only string-type fields (MONETARY, STRING, URL, LONG_TEXT, SELECT) go through `sanitize_filename()`; the others (dates, numbers, booleans) cannot contain non-ASCII unicode. 
Also normalize the field name itself. + +```python +def get_custom_fields_context( + custom_fields: Iterable[CustomFieldInstance], +) -> dict[str, dict[str, dict[str, str]]]: + """ + Given an Iterable of CustomFieldInstance, builds a dictionary mapping the field name + to its type and value + """ + field_data = {"custom_fields": {}} + for field_instance in custom_fields: + type_ = pathvalidate.sanitize_filename( + field_instance.field.data_type, + replacement_text="-", + ) + if field_instance.value is None: + value = None + # String types need to be sanitized + elif field_instance.field.data_type in { + CustomField.FieldDataType.MONETARY, + CustomField.FieldDataType.STRING, + CustomField.FieldDataType.URL, + CustomField.FieldDataType.LONG_TEXT, + }: + value = pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", field_instance.value), + replacement_text="-", + ) + elif ( + field_instance.field.data_type == CustomField.FieldDataType.SELECT + and field_instance.field.extra_data["select_options"] is not None + ): + options = field_instance.field.extra_data["select_options"] + value = pathvalidate.sanitize_filename( + unicodedata.normalize( + "NFC", + next( + option["label"] + for option in options + if option["id"] == field_instance.value + ), + ), + replacement_text="-", + ) + else: + value = field_instance.value + field_data["custom_fields"][ + pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", field_instance.field.name), + replacement_text="-", + ) + ] = { + "type": type_, + "value": value, + } + return field_data +``` + +- [ ] **Step 6: Run the new test and full test suite** + +```bash +uv run pytest --override-ini="addopts=" src/documents/tests/test_file_handling.py -v +``` + +Expected: all tests pass, including the new tag test. 
+ +- [ ] **Step 7: Commit** + +```bash +git add src/documents/templating/filepath.py src/documents/tests/test_file_handling.py +git commit -m "Fix: normalize context builder inputs to NFC before sanitize_filename (defense-in-depth)" +``` + +--- + +## Task 3: Normalize mail attachment filenames + +Email attachment filenames come from MIME headers and can be in any Unicode normalization depending on the sending client. These flow into `document.original_filename` and then into `{{ original_name }}` template context. They also become the temp file name created on disk. + +**Files:** + +- Modify: `src/paperless_mail/mail.py` +- Test: `src/paperless_mail/tests/test_mail.py` + +- [ ] **Step 1: Find the exact lines in mail.py** + +```bash +grep -n "sanitize_filename" src/paperless_mail/mail.py +``` + +Expected output (line numbers may vary): + +``` +NNN: attachment_name = pathvalidate.sanitize_filename(att.filename) +NNN: filename=pathvalidate.sanitize_filename(att.filename), +NNN: filename=pathvalidate.sanitize_filename(f"{message.subject}.eml"), +``` + +Note the line numbers for the next step. + +- [ ] **Step 2: Write a failing test** + +Find an existing test in `src/paperless_mail/tests/test_mail.py` that exercises attachment filename handling (search for `sanitize_filename` or `att.filename` in that file to find a good base test to copy). Add a new test that uses an NFD attachment filename. + +The following test goes into the appropriate `TestCase` class in `src/paperless_mail/tests/test_mail.py`. 
Look at the file first to confirm the right class and mock patterns — the test below follows the existing pattern for mocking `MailMessage` and `Attachment` objects: + +```python +def test_attachment_filename_nfd_normalized_to_nfc(self) -> None: + """Mail attachment filenames with NFD encoding must be normalized to NFC.""" + import unicodedata + nfd_name = unicodedata.normalize("NFD", "Rechnung März.pdf") + nfc_name = unicodedata.normalize("NFC", "Rechnung März.pdf") + assert nfd_name != nfc_name # confirm inputs differ at byte level + + # Use whatever mock/factory pattern exists in this test file for creating + # a fake attachment with a specific filename, then run the mail handler, + # and assert that document.original_filename == nfc_name (not nfd_name). + # Adapt the mock setup to match the test file's existing patterns exactly. +``` + +To find the right mock pattern: `grep -n "att.filename\|Attachment\|MailMessage\|MagicMock" src/paperless_mail/tests/test_mail.py | head -20` + +- [ ] **Step 3: Run the test to verify it fails** + +```bash +uv run pytest --override-ini="addopts=" src/paperless_mail/tests/test_mail.py -k "test_attachment_filename_nfd" -v +``` + +Expected: FAIL. 
+ +- [ ] **Step 4: Add `import unicodedata` to mail.py** + +At the top of `src/paperless_mail/mail.py`, add: + +```python +import unicodedata +``` + +- [ ] **Step 5: Normalize attachment filenames in mail.py** + +At each of the three `pathvalidate.sanitize_filename` call sites found in Step 1, wrap the input string with `unicodedata.normalize("NFC", ...)`: + +For the attachment temp file creation: + +```python +attachment_name = pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", att.filename) +) +``` + +For the metadata override filename: + +```python +filename=pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", att.filename) +), +``` + +For the EML subject filename: + +```python +filename=pathvalidate.sanitize_filename( + unicodedata.normalize("NFC", f"{message.subject}.eml") +), +``` + +- [ ] **Step 6: Run the mail test suite** + +```bash +uv run pytest --override-ini="addopts=" src/paperless_mail/tests/test_mail.py -v +``` + +Expected: all tests pass, including the new NFD normalization test. 
+ +- [ ] **Step 7: Commit** + +```bash +git add src/paperless_mail/mail.py src/paperless_mail/tests/test_mail.py +git commit -m "Fix: normalize mail attachment filenames to NFC Unicode" +``` + +--- + +## Self-Review Checklist + +### Spec coverage + +| Requirement | Covered by | +| --------------------------------------------------------- | ----------------------------------------------------- | +| `clean_filepath()` normalizes all template-rendered paths | Task 1 Step 3 | +| `{{ title }}` (sanitized context) produces NFC output | Task 1 test + Task 2 Step 3 | +| `{{ document.title }}` (raw context) produces NFC output | Task 1 test | +| `{{ correspondent }}` produces NFC output | Task 1 test + Task 2 Step 3 | +| `{{ tag_list }}` and `tag_name_list` produce NFC output | Task 2 Steps 1+4 | +| Custom field string values produce NFC output | Task 2 Step 5 | +| Mail attachment filenames normalized at entry point | Task 3 | +| Existing NFD files auto-migrate to NFC on next save | Handled by existing move logic; no code change needed | + +### Notes for implementer + +- The `FILENAME_FORMAT` setting accepts old-style `{title}` format strings, which `convert_format_str_to_template_format()` converts to Jinja2 `{{ title }}` before rendering. Tests using `@override_settings(FILENAME_FORMAT="{{ title }}")` use Jinja2 syntax directly. +- Run tests with `--override-ini="addopts="` to disable coverage and parallelism for faster iteration. +- The `unicodedata` module is part of the Python standard library — no new dependency. +- NFC is the right normalization form for filenames: it is the form most databases and text processing tools produce, and byte-preserving filesystems (e.g. ext4) store whatever form they are given, so paths must be normalized before they are written or compared. Decomposed (NFD-style) names typically come from macOS (HFS+ stores filenames in a decomposed form, while APFS preserves the form it is given) and from some OCR/LLM outputs. 
diff --git a/docs/superpowers/specs/2026-05-15-scheduled-backup-design.md b/docs/superpowers/specs/2026-05-15-scheduled-backup-design.md new file mode 100644 index 000000000..dc8ece25c --- /dev/null +++ b/docs/superpowers/specs/2026-05-15-scheduled-backup-design.md @@ -0,0 +1,225 @@ +# Scheduled Backup Design + +**Date**: 2026-05-15 +**Status**: Approved + +## Overview + +Add a scheduled backup system to paperless-ngx that exports documents as zip files on a user-configurable schedule, retaining the last N backups. The schedule timing is configured via an env var (consistent with all other scheduled tasks), while the backup-specific configuration (output directory, keep count) lives in a new database model editable through the API and UI. + +## Goals + +- Automated periodic exports without manual intervention +- Zip-based output for simple, unambiguous rotation +- Opt-in: no backup runs unless explicitly configured +- Strongly typed export contract usable by both the CLI and the scheduled task +- UI-editable backup config, no additional env vars beyond the cron schedule + +## Non-Goals + +- Encrypted backups (future enhancement) +- Age-based or size-based rotation (count-only for now) +- Remote/cloud backup destinations +- Import automation + +--- + +## Section 1: Data Model and API + +### `BackupConfiguration` model + +New singleton model in `src/paperless/models.py`, following the same `AbstractSingletonModel` pattern as `ApplicationConfiguration`. + +```python +class BackupConfiguration(AbstractSingletonModel): + output_dir = models.CharField( + verbose_name=_("Backup output directory"), + max_length=1024, + blank=True, + default="", + ) + keep_count = models.PositiveIntegerField( + verbose_name=_("Number of backups to keep"), + default=5, + help_text=_("Set to 0 to keep all backups."), + ) + + class Meta: + verbose_name = _("Backup configuration") +``` + +- `output_dir` blank/empty means backup is disabled (the task treats it as a no-op). 
+- `output_dir` must be an absolute path. The serializer validates this via a custom validator; `run_export` also calls `.resolve()` on the path unconditionally. +- `keep_count = 0` means keep all backups; no rotation is performed. + +### Migration + +The migration is created in `src/paperless/migrations/` (not `src/documents/migrations/`), since `BackupConfiguration` lives in the `paperless` app. + +### API + +- **Serializer**: `BackupConfigurationSerializer` in `src/paperless/serialisers.py` +- **ViewSet**: `BackupConfigurationViewSet` in `src/paperless/views.py` — singleton GET/PATCH, same pattern as `ApplicationConfiguration` +- **Route**: `/api/backup_config/` registered in `src/paperless/urls.py` + +--- + +## Section 2: Export Module + +New module `src/documents/export.py` contains the export contract and core logic, extracted from `document_exporter`'s `handle()` method. + +### `ExportOptions` dataclass + +```python +@dataclass +class ExportOptions: + target: Path + compare_checksums: bool = False + compare_json: bool = False + delete: bool = False + use_filename_format: bool = False + no_archive: bool = False + no_thumbnail: bool = False + use_folder_prefix: bool = False + split_manifest: bool = False + zip_export: bool = False + zip_name: str | None = None # None -> default date-based name + data_only: bool = False + passphrase: str | None = None + batch_size: int = 500 +``` + +`zip_name = None` means the caller wants the default date-based name. `run_export` resolves `None` internally to `f"export-{timezone.localdate().isoformat()}"` before use — callers never need to supply a default. The scheduled task always passes an explicit timestamped name. + +### `run_export(options: ExportOptions) -> None` + +The body of the current `Command.handle()` in `document_exporter` moves here, reading from `ExportOptions` instead of parsed CLI options. No behaviour changes. + +### Refactored `document_exporter` management command + +Becomes a thin CLI adapter: + +1. 
Parse arguments (unchanged) +2. Construct `ExportOptions` from parsed args +3. Call `run_export(options)` + +--- + +## Section 3: Scheduled Task and Rotation + +### `scheduled_backup` task in `src/documents/tasks.py` + +``` +1. Load BackupConfiguration (singleton) +2. If output_dir is blank, log a debug message and return (no-op, no PaperlessTask created) +3. The run is tracked by a PaperlessTask record (TriggerSource.SCHEDULED); the signal handlers create it, not the task body +4. Build zip_name as local-time timestamp: "export-YYYY-MM-DD-HHMMSS" + using Django's timezone.localtime() +5. Construct ExportOptions( + target=Path(config.output_dir), + zip_export=True, + zip_name=zip_name, + ) +6. Call run_export(options) +7. If keep_count > 0: + zips = sorted(Path(config.output_dir).glob("export-*.zip"), key=lambda p: p.stat().st_mtime) + for old_zip in zips[:-keep_count]: + old_zip.unlink() +8. Mark PaperlessTask as complete (handled by signal handlers) +``` + +Key design notes: + +- Rotation uses `export-*.zip` glob, not `*.zip`, to avoid matching zip files in the directory that paperless did not create. +- Rotation occurs only after a successful export, so a failed run does not consume a rotation slot. +- The timestamp format `YYYY-MM-DD-HHMMSS` in local time ensures multiple runs per day produce distinct filenames without collision. + +### PaperlessTask integration + +`PaperlessTask` lifecycle is managed entirely by the Celery signal handlers in `src/documents/signals/handlers.py`, not manually inside the task body. + +**Changes to `TRACKED_TASKS` and `PaperlessTask.TaskType`:** + +- Add `PaperlessTask.TaskType.BACKUP` to the `TaskType` enum in `src/documents/models.py` +- Add `"documents.tasks.scheduled_backup": PaperlessTask.TaskType.BACKUP` to `TRACKED_TASKS` + +**Conditional tracking — the no-op case:** + +When `BackupConfiguration.output_dir` is blank the task returns immediately, so no record should appear in the Tasks panel. This requires explicit handling in all five signal handlers. 
Relying on incidental safety (filters that match 0 rows, `DoesNotExist` guards) is fragile and unclear to future maintainers. + +The approach for each handler when the task type is `BACKUP`: + +| Handler | Current behaviour when no record exists | Required change | +| ----------------------------- | ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | +| `before_task_publish_handler` | Creates the record | Check `BackupConfiguration.get_solo().output_dir`; skip `PaperlessTask.objects.create()` if blank | +| `task_prerun_handler` | `.filter().update()` — silent no-op | Add explicit early return if `BACKUP` task type and no record exists for `task_id` | +| `task_postrun_handler` | `DoesNotExist: return` — incidentally safe | Add explicit early return if `BACKUP` task type and no record exists for `task_id` | +| `task_failure_handler` | `.filter().first()` returns `None`, update skipped — incidentally safe | Add explicit early return if `BACKUP` task type and no record exists for `task_id` | +| `task_revoked_handler` | `.filter().update()` — silent no-op | Add explicit early return if `BACKUP` task type and no record exists for `task_id` | + +Extract a helper `_backup_task_is_tracked(task_id: str) -> bool` that returns `PaperlessTask.objects.filter(task_id=task_id).exists()`. The four downstream handlers call this after the `TRACKED_TASKS` check and return early if it returns `False` for a `BACKUP` task. This makes the intent explicit: "this task was intentionally not tracked for this invocation." 
+ +--- + +## Section 4: Beat Schedule + +Add to the task list in `parse_beat_schedule()` in `src/paperless/settings/custom.py`: + +```python +{ + "name": "Scheduled document backup", + "env_key": "PAPERLESS_EXPORT_TASK_CRON", + "env_default": "disable", + "task": "documents.tasks.scheduled_backup", + "options": { + "expires": 1.0 * 60.0 * 60.0, # 1 hour + }, +}, +``` + +- Default is `"disable"` — the task is not added to the beat schedule unless the env var is explicitly set. +- Setting `PAPERLESS_EXPORT_TASK_CRON=disable` (or simply not setting it) produces no scheduled task and no noise. +- Typical user value: `"0 2 * * *"` (daily at 02:00 local server time). +- `expires` is set to 1 hour: if a scheduled backup has not started within 1 hour of its trigger time (e.g., the Celery worker was down), it is discarded rather than running late. Unlike other tasks whose expiry is tied to a known default interval, this task has a user-defined schedule. 1 hour is a conservative value that prevents stale backup tasks from piling up without being so short that it causes problems on a normally-running worker. + +--- + +## Section 5: Frontend + +Location to be decided by co-maintainer (dedicated "Backup" page vs. section within Application Settings). The API contract is independent of this decision. + +The UI requires two fields: + +- **Output directory** — text input for `output_dir` (absolute path on the server) +- **Keep count** — number input for `keep_count`, with a note that 0 means keep all + +The component performs a GET to `/api/backup_config/` on load and a PATCH on save, identical to how the Application Settings component works. 
+ +--- + +## File Change Summary + +| File | Change | +| -------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | +| `src/paperless/models.py` | Add `BackupConfiguration` model | +| `src/paperless/serialisers.py` | Add `BackupConfigurationSerializer` | +| `src/paperless/views.py` | Add `BackupConfigurationViewSet` | +| `src/paperless/urls.py` | Register `/api/backup_config/` route | +| `src/paperless/settings/custom.py` | Add `PAPERLESS_EXPORT_TASK_CRON` beat entry | +| `src/documents/export.py` | New module: `ExportOptions`, `run_export()` | +| `src/documents/management/commands/document_exporter.py` | Thin wrapper around `run_export()` | +| `src/documents/models.py` | Add `PaperlessTask.TaskType.BACKUP` | +| `src/documents/signals/handlers.py` | Add `BACKUP` to `TRACKED_TASKS`; add `_backup_task_is_tracked()`; update all 5 signal handlers | +| `src/documents/tasks.py` | Add `scheduled_backup` task | +| `src-ui/` | New or extended settings component (location TBD) | +| `src/paperless/migrations/` | New migration for `BackupConfiguration` | + +--- + +## Testing + +- **`src/paperless/tests/test_backup_config.py`** — model, serializer, API (GET/PATCH) +- **`src/documents/tests/test_export.py`** — new unit tests for `run_export()` directly; `test_management_exporter.py` retains its existing CLI wiring tests and gains tests for the thin-wrapper behaviour +- **`src/documents/tests/test_tasks_backup.py`** — `scheduled_backup` task: no-op when `output_dir` blank, export called with correct options, rotation deletes correct files, rotation skipped when `keep_count=0` +- **`src/documents/tests/test_task_signals.py`** — signal handler behaviour for `BACKUP` task type: no record created when `output_dir` blank, all downstream handlers skip cleanly when no record exists, normal lifecycle when `output_dir` is set +- Frontend unit tests for the settings component diff --git 
a/docs/superpowers/specs/2026-05-26-interactive-shell-contenv-design.md b/docs/superpowers/specs/2026-05-26-interactive-shell-contenv-design.md new file mode 100644 index 000000000..bb587b391 --- /dev/null +++ b/docs/superpowers/specs/2026-05-26-interactive-shell-contenv-design.md @@ -0,0 +1,81 @@ +# Interactive Shell Container Environment + +**Date:** 2026-05-26 +**Branch:** fix-tanvity-index-lock (to be implemented on a new branch) +**Status:** Approved + +## Problem + +When paperless-ngx users open an interactive shell in the running container via `docker exec -it container bash`, they do not see environment variables resolved from `*_FILE` secret injection. + +The `init-env-file` s6 init script reads `PAPERLESS_*_FILE` variables (e.g. `PAPERLESS_SECRET_KEY_FILE=/run/secrets/key`), reads the referenced file, and writes the resolved value (e.g. `PAPERLESS_SECRET_KEY=abc123`) to `/run/s6/container_environment/`. All s6-managed services and management command wrappers use the `#!/command/with-contenv` shebang, which reads that directory and injects all vars into the process environment before execution. + +`docker exec bash` bypasses s6 entirely. It is a non-login interactive shell launched directly by the Docker daemon, which provides only the original Docker-configured environment (the `*_FILE` paths, not the resolved values). Any manual command a user runs — such as `document_exporter` or `manage.py` calls — will be missing the resolved secrets unless they happen to also be set as plain Docker env vars. + +## Approach + +Source `/run/s6/container_environment/` in every interactive bash shell opened in the container, mirroring what `with-contenv` does for s6 services. 
+ +Two hooks are needed because Debian uses different rc files for different shell types: + +- **Non-login interactive** (`docker exec bash`): sources `/etc/bash.bashrc` +- **Login interactive** (`docker exec bash --login`): sources `/etc/profile`, which auto-sources all `/etc/profile.d/*.sh` + +## Changes + +### 1. `docker/rootfs/etc/profile.d/contenv.sh` (new file) + +A POSIX-compatible shell script that exports all files in `/run/s6/container_environment/` as environment variables. Placed here so login shells pick it up automatically. + +```sh +#!/bin/sh +# Source s6 container environment for interactive shells. +# Ensures variables resolved from *_FILE secret injection are visible +# when using 'docker exec bash'. Does not affect s6 services (those +# use with-contenv directly). Has no effect in non-container contexts +# because the directory will not exist. +# Note: sh/dash shells opened via 'docker exec sh' are not covered; +# only bash-based sessions benefit from this file. +_pngx_contenv="/run/s6/container_environment" +if [ -d "${_pngx_contenv}" ]; then + for _pngx_f in "${_pngx_contenv}"/*; do + [ -f "${_pngx_f}" ] || continue + _pngx_name=$(basename "${_pngx_f}") + _pngx_val=$(cat "${_pngx_f}") + export "${_pngx_name}=${_pngx_val}" + done +fi +unset _pngx_contenv _pngx_f _pngx_name _pngx_val +``` + +### 2. Dockerfile `main-app` stage (one line added) + +Appends a source line to `/etc/bash.bashrc` so non-login interactive shells also pick up contenv. Added after the runtime package installation block, before the Python dependency installation. + +```dockerfile +RUN echo '. /etc/profile.d/contenv.sh' >> /etc/bash.bashrc +``` + +`/etc/bash.bashrc` is provided by the Debian base image and installed during the apt step, so it exists by the time this `RUN` executes. + +## Coverage + +| How user gets a shell | Gets contenv? 
| Mechanism | +| ---------------------------------------- | --------------------- | ---------------------------------------- | +| `docker exec -it container bash` | Yes | `/etc/bash.bashrc` sources `contenv.sh` | +| `docker exec -it container bash --login` | Yes | `/etc/profile.d/contenv.sh` auto-sourced | +| `docker exec -it container sh` | No (known limitation) | `sh` sources neither file | +| Management command wrappers | Already worked | `with-contenv` shebang | +| s6 services | Already worked | `with-contenv` shebang | + +## Edge Cases + +**Shell opened before `init-env-file` completes:** The directory exists but may not yet contain all resolved vars. The script exports what is present; missing vars are simply absent. No error is produced. + +**Variable value contains special characters:** `$(cat file)` strips only trailing newlines (which `init-env-file` already warns about). Other special characters are preserved correctly by the `export "NAME=VALUE"` form. + +**Directory does not exist (non-container use):** The `[ -d ]` guard makes the script a no-op. Safe to include in any Debian-based image. + +## Testing + +No automated test is added. This is container-bootstrap shell plumbing with no Python code path. Manual verification: run the container with a `*_FILE` secret, `docker exec bash`, and confirm the resolved variable is present in the environment. diff --git a/docs/superpowers/specs/2026-06-10-llmindex-schema-migrations-design.md b/docs/superpowers/specs/2026-06-10-llmindex-schema-migrations-design.md new file mode 100644 index 000000000..8f52f0356 --- /dev/null +++ b/docs/superpowers/specs/2026-06-10-llmindex-schema-migrations-design.md @@ -0,0 +1,138 @@ +# LLM Index Schema Migrations (second spec) + +Date: 2026-06-10 +Depends on: `docs/superpowers/specs/2026-06-10-sqlite-vec-vector-store-design.md` and its implementation plan (`docs/superpowers/plans/2026-06-10-sqlite-vec-transition.md`). 
This spec layers on top of the completed sqlite-vec transition; do not start it before that branch lands. +Supersedes: PR #12968 (in-place LanceDB migrations). The machinery design there is carried over nearly verbatim; only the storage backend specifics change. #12968 should be closed with a pointer here once this ships. + +Scope update (user decision, 2026-06-10): the `embedding.py:115` metadata restructure originally drafted as Part 2 of this spec was folded into the transition plan instead (its Task 5), because the transition forces a full rebuild anyway, so the embedded-text change rides along with no extra re-embed cost. This spec is now machinery-only: it ships with an EMPTY migration registry, ready for whatever schema change comes next. Part 2 below is retained as the worked example of how a re-embed migration would be registered, since the next one will not have a free rebuild to piggyback on. + +## Part 1: Schema migration machinery (ported from PR #12968) + +### What carries over unchanged + +The PR's design survives the store swap intact and is adopted as-is: + +- `Migration` frozen dataclass: `version: int`, `description: str`, `requires_reembed: bool`, `apply: Callable` (compare/hash-excluded field). +- `MIGRATIONS: list[Migration]` ordered registry + `CURRENT_SCHEMA_VERSION: Final[int]` in `vector_store.py`. To add a migration: bump the constant, append an entry. +- Store surface: `stored_schema_version() -> int` (0 when unrecorded, so pre-versioning tables treat every migration as pending), `pending_migrations()`, `requires_reembed_migration()`, `apply_structural_migrations() -> list[Migration]`. +- The stop-at-first-reembed-boundary rule in `apply_structural_migrations()`: structural migrations are applied in version order only up to the first pending `requires_reembed=True` entry, so the version counter can never jump past a re-embed boundary and silently skip the rebuild. (This was the subtle correctness insight of #12968; preserve the comment.) 
+- The `update_llm_index()` hook, verbatim from the PR: + +```python + with write_store(embed_model_name=model_name) as store: + if not rebuild and store.table_exists(): + store.apply_structural_migrations() + if store.requires_reembed_migration(): + logger.warning( + "Schema migration requires re-embedding; forcing LLM index rebuild.", + ) + rebuild = True +``` + +- Test approach from the PR: mock `MIGRATIONS`/`CURRENT_SCHEMA_VERSION` with `mocker.patch`, spy on `drop_table` to distinguish in-place from rebuild, one test per path (structural applied without rebuild; pending re-embed forces rebuild). + +### What changes for sqlite-vec + +**1. Version storage: `index_meta['schema_version']` instead of `schema_version.json`.** +The Lance store needed a sidecar JSON file because Lance had no convenient mutable metadata. The sqlite-vec store already has the `index_meta` key/value table, which is transactional with the data itself (a migration and its version bump commit atomically, which the file never could). Concretely: + +- `_create_table(dim)` additionally writes `schema_version = str(CURRENT_SCHEMA_VERSION)` (fresh tables are always current). +- `stored_schema_version()` reads the meta key, returns 0 on absence/garbage. +- `drop_table()` already does `DELETE FROM index_meta`, which clears the version with it. No sidecar file, no unlink bookkeeping. +- `apply_structural_migrations()` writes the new version inside the same transaction as the last applied migration. + +**2. `apply` receives the store, not a table handle.** +Lance migrations got the raw table for `add_columns`/`alter_columns`. vec0 virtual tables do not support arbitrary `ALTER TABLE`, so structural migrations are SQL against the store's connection. Signature: `apply: Callable[[PaperlessSqliteVecVectorStore], None]`. The store exposes what migrations need: `.client` (connection), `._table_name`, `.vector_dim()`, and the rebuild helper below. + +**3. 
Structural migrations are create+copy+rename, sharing the compact() machinery.** +The sqlite-vec `compact()` already implements the only structural mutation vec0 supports: build a new table, `INSERT INTO ... SELECT` (vectors copied bit-for-bit, no re-embedding), drop old, rename. Factor it into a shared helper on the store: + +```python +def rebuild_table( + self, + *, + create_sql: str | None = None, + copy_select: str | None = None, +) -> None: + """Copy live rows into a freshly created table and swap it in. + + Defaults reproduce the current schema (compaction). Structural + migrations pass a modified CREATE statement and a matching SELECT + (e.g. adding a column with a literal default). Runs in one + transaction; VACUUM afterwards. + """ +``` + +`compact()` becomes a thin caller (threshold check + `rebuild_table()`), and a structural migration like "add a `+page_count` aux column" is: + +```python +Migration( + version=2, + description="add page_count auxiliary column", + requires_reembed=False, + apply=lambda store: store.rebuild_table( + create_sql=..., # CREATE VIRTUAL TABLE ... with the new column + copy_select="SELECT id, document_id, modified, node_content, embedding, '' FROM {old}", + ), +) +``` + +A pleasant consequence: every structural migration is also a compaction (the copy drops dead rows), and the file-format risk surface is one helper with one test suite instead of two code paths. + +**4. Bootstrap version for the sqlite-vec store is 1.** +The transition plan ships the new store without machinery; tables it creates carry no `schema_version` key and therefore read as 0. This release lands with `CURRENT_SCHEMA_VERSION = 1` and `MIGRATIONS = []`, so the bootstrap is unconditionally safe: a 0-version table has no pending migrations and `apply_structural_migrations()` simply stamps it to 1. 
(The metadata restructure having moved into the transition itself is what makes this clean; the registry's first real entry will be v2, written against tables that are all stamped.) + +## Part 2 (worked example, IMPLEMENTED IN THE TRANSITION): the metadata TODO as a re-embed migration + +This section was implemented as Task 5 of the transition plan and ships with the store swap, not with this spec. It is kept as the reference example of how to register the next re-embed migration. + +### The change + +`build_llm_index_text()` currently embeds three short structured values in the body text: + +```python + f"Filename: {doc.filename}", + f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}", + f"Archive Serial Number: {doc.archive_serial_number or ''}", +``` + +Per the TODO, move them to `node.metadata` (excluded from embeddings, visible to the LLM via llama-index's metadata prepend), the same treatment title/tags/correspondent/document_type got in PR #12944. Notes and Custom Fields stay in the body (long free text / dynamic count, as the TODO says). + +1. `embedding.py build_llm_index_text()`: delete the three lines above (the `lines` list keeps Notes, Custom Fields, and Content). Update the TODO comment to describe only what remains intentional (Notes/Custom Fields stay embedded), or delete it. +2. `indexing.py build_document_node()` metadata dict gains: + +```python + "filename": document.filename, + "storage_path": document.storage_path.name if document.storage_path else None, + "archive_serial_number": document.archive_serial_number, +``` + +(`None`/int values are fine here: this dict lives in the node-content JSON, not in vec0 metadata columns; only `document_id`/`modified` are columns with the NULL restriction. Matches the existing convention of `correspondent: None`.) + +3. `excluded_embed_metadata_keys=list(metadata.keys())` already covers the new keys; `excluded_llm_metadata_keys` stays `["document_id"]` so the LLM sees the new fields. 
+ +### Why this class of change needs a migration + +Removing the three lines changes the embedded text of every document, so stored vectors no longer match what the current code would embed. Incremental updates only re-embed documents whose `modified` changed, so without a forced rebuild the index would be a mixed old/new-text population indefinitely. This particular change escaped that fate only because the transition's forced rebuild covers it. The next embedded-text change will not have that luxury and gets registered like this: + +```python +CURRENT_SCHEMA_VERSION: Final[int] = 2 + +MIGRATIONS: list[Migration] = [ + Migration( + version=2, + description="<what changed in the embedded text>", + requires_reembed=True, + apply=lambda store: None, + ), +] +``` + +On the first `update_llm_index` after upgrade, the hook sees the pending re-embed migration, logs, and rebuilds. + +### Test plan + +Machinery only (the metadata change is tested in the transition plan's Task 5). Port of the #12968 tests, dedicated file `test_vector_store_migrations.py`: structural migration applies in-place without `drop_table`; pending re-embed forces rebuild; version stamping on create/drop; bootstrap stamping of a pre-machinery 0-version table to 1; stop-at-boundary with a mixed [structural v2, reembed v3, structural v4] registry asserting v4 is NOT applied and the stored version stays at 2; `rebuild_table()` round-trips rows byte-for-byte (shared with compact tests). + +### Open questions + +- PR #12968 disposition: close with a comment pointing at this spec once the machinery lands (the Lance-specific `add_columns` path has no successor; vec0 cannot do in-place column adds). +- `created`/`added` fields are also candidates for future structural metadata work, but nothing needs them now (YAGNI; noted only so the next reader does not re-derive it). 
diff --git a/docs/superpowers/specs/2026-06-10-sqlite-vec-vector-store-design.md b/docs/superpowers/specs/2026-06-10-sqlite-vec-vector-store-design.md new file mode 100644 index 000000000..28fed7b6d --- /dev/null +++ b/docs/superpowers/specs/2026-06-10-sqlite-vec-vector-store-design.md @@ -0,0 +1,155 @@ +# sqlite-vec Vector Store Design (replaces PaperlessLanceVectorStore) + +Date: 2026-06-10 + +Context: LanceDB wheels SIGILL on non-AVX2 CPUs (#12970); research in `2026-06-10-vector-store-alternatives-research.md` selected sqlite-vec. This is a beta feature, so a one-time re-embed on upgrade is acceptable. Every claim marked [VERIFIED] below was empirically tested against the actual PyPI wheel (0.1.9, and 0.1.10a4 where noted), either in this repo's scratch harness (`/tmp/vstore-avx-test/explore_sqlitevec*.py`) or by the issues-audit agent. + +## Version pin: `sqlite-vec==0.1.9`, and why it is load-bearing + +- The 0.1.9 linux x86_64 wheel is built with **no SIMD flags at all** (`vec_debug()` shows empty build flags) and passed our qemu Westmere (SSE4.2, no AVX) and SandyBridge (AVX, no AVX2) emulation tests [VERIFIED]. This is the entire point of the migration. +- The **0.1.10-alpha.4 wheel regresses this**: built with `-mavx -DSQLITE_VEC_ENABLE_AVX` file-wide, no runtime CPU dispatch. It can SIGILL on AVX-less CPUs, including Goldmont Atom/Celeron NAS boxes, exactly the #12970 user base [VERIFIED via vec_debug on the wheel]. +- Guardrails: pin `==0.1.9` exactly; log `SELECT vec_version(), vec_debug()` at store init as an AVX canary; before ever bumping to 0.1.10+, re-check the wheel flags (and consider raising the runtime-dispatch issue upstream first). +- arm64: 0.1.9 manylinux aarch64 wheel is a proper ELF64 binary, no NEON flags baked [VERIFIED]. (The broken 32-bit "aarch64" wheel era was 0.1.6, fixed since.) +- No sdist on PyPI (asg017/sqlite-vec#211, open) and no musl wheels; fine for our Debian-based image, blocks Alpine bare-metal installs. 
+ +## Schema + +One dedicated SQLite database file in `LLM_INDEX_DIR` (e.g. `llmindex.db`), never the Django DB. Connections set `PRAGMA journal_mode=WAL`, `busy_timeout`, `synchronous=NORMAL`. + +```sql +CREATE VIRTUAL TABLE nodes USING vec0( + id TEXT PRIMARY KEY, -- node_id (uuid) + document_id TEXT, -- METADATA column, deliberately NOT a partition key + modified TEXT, -- ISO timestamp; never NULL (sentinel "") + +node_content TEXT, -- auxiliary column: JSON payload, any size + embedding float[{dim}] distance_metric=cosine +); + +CREATE TABLE IF NOT EXISTS index_meta (key TEXT PRIMARY KEY, value TEXT); +-- rows: embed_model, dim, schema_version, created_by_vec_version +``` + +Design decisions, each verified on 0.1.9: + +- **`document_id` is a metadata column, not a partition key.** With a partition key, `k` applies per partition: `k=5 AND document_id IN (3 docs)` returns 15 rows (asg017/sqlite-vec#142, open) [VERIFIED]. As a metadata column the same query returns a correct global top-k of exactly 5 [VERIFIED]. `query_similar_documents()` passes permission-scoped `IN` lists, so per-partition semantics would over-fetch k x N(docs). At our scale the partition-pruning speedup is not needed (filtered KNN at 20K x 1024 was _faster_ than unfiltered: 39 ms vs 74 ms). +- **One document column, not two.** The Lance store carried both `doc_id` (ref_doc_id) and `document_id`; in our usage they are always the same value (`str(document.id)`), so the new schema keeps only `document_id`. +- **TEXT primary key works** (insert, UPDATE, DELETE, duplicate rejection) [VERIFIED]. There is no usable rowid mapping with a TEXT pk, which we do not need. +- **Aux column for the payload.** `+node_content` holds the multi-KB JSON; aux columns cannot appear in KNN WHERE clauses (loud error, not silent) [VERIFIED], which we never do, and are selectable in scans and KNN results [VERIFIED]. +- **Metadata columns reject NULL** (asg017/sqlite-vec#141, open) [VERIFIED]. 
`_row()` must keep coercing everything through `str(... or "")` as it already does today. +- **`distance_metric=cosine`**: similarity maps as `1 - distance` (identical vector gives distance 0.0 [VERIFIED]). For unit-norm embeddings the ranking equals today's L2 ranking; for non-normalized models cosine is the safer default, and the beta re-embed makes the behavior change free. (L2 + `1/(1+d)` remains available if exact parity is ever wanted.) +- **Vectors are always bound as float32 BLOBs** (`struct.pack`/`np.tobytes`), never JSON text: bypasses the locale-dependent `strtod` parsing bug (asg017/sqlite-vec#241, open) entirely. +- Limits, all comfortable: dims <= 8192, k <= 4096, chunk_size default 1024 [VERIFIED]. TEXT metadata has no length cap; values > 12 bytes go to a shadow text table with a prefix fast-path, and the one historical bug at that boundary (long-metadata DELETE, #274) is fixed in 0.1.9. + +## Method mapping (PaperlessLanceVectorStore -> PaperlessSqliteVecVectorStore) + +| Current method | sqlite-vec implementation | Notes | +| --------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `__init__(uri, table_name, embed_model_name)` | `sqlite3.connect(path)` + `enable_load_extension` + `sqlite_vec.load()` + PRAGMAs | Same lazy "table may not exist yet" stance | +| `client` property | the `sqlite3.Connection` | | +| `table_exists()` | `SELECT 1 FROM sqlite_master WHERE name='nodes'` | | +| `vector_dim()` | `index_meta['dim']` | Written at table creation; wrong-dim inserts are rejected by vec0 anyway [VERIFIED] | +| `drop_table()` | `DROP TABLE nodes` | Drops all 7 shadow tables with it [VERIFIED]; also clear 
`index_meta` | +| `stored_model_name()` / `config_mismatch()` | `index_meta['embed_model']` | Same conservative None handling | +| `_schema(dim, model)` | the CREATE statements above | dim from first batch, as today (`_ensure_table`) | +| `_row(node)` | same dict, vector packed to bytes | keep `str(... or "")` coercion (NULL rejection) | +| `add(nodes)` | `executemany(INSERT ...)` inside one transaction | ~3,300 rows/s at 1024 dims measured; batching via transactions | +| `upsert_document(document_id, nodes)` | `BEGIN; DELETE FROM nodes WHERE document_id = ?; executemany(INSERT); COMMIT` | **Not** `INSERT OR REPLACE`: broken on vec0 (asg017/sqlite-vec#259, open). Transaction gives the same no-transient-empty-state guarantee as merge_insert; rollback verified [VERIFIED] | +| `delete(ref_doc_id)` | `DELETE FROM nodes WHERE document_id = ?` | | +| `get_nodes(filters)` | `SELECT id, document_id, node_content, embedding FROM nodes [WHERE ...]` | full scans on vec0 work [VERIFIED]; 45 ms / 20K rows | +| `query(VectorStoreQuery)` | `SELECT id, node_content, embedding, distance FROM nodes WHERE embedding MATCH ? AND k = ? 
[AND filters]` then Python-slice to `top_k` | `k = ?` is mandatory; `LIMIT` cannot be combined with `k` [VERIFIED]; results arrive distance-sorted [VERIFIED]; similarities = `1 - distance` | +| `_build_where(filters)` | same EQ/IN translation, but emitting `?` placeholders + params list | **Upgrade**: bound parameters replace today's manual `_escape()` string interpolation | +| `get_modified_times()` | `SELECT document_id, modified FROM nodes` + first-seen dedupe in Python | identical logic | +| `ensure_document_id_scalar_index()` | no-op (delete if nothing else needs it) | metadata filters are evaluated in the chunk scan; nothing to create | +| `maybe_create_ann_index()` | no-op on 0.1.9 | ANN (rescore/diskann) is 0.1.10-alpha territory; adopting an ANN index makes the file unreadable by 0.1.9 (one-way door), while flat tables round-trip 0.1.9 <-> 0.1.10a4 cleanly [VERIFIED]. Revisit post-0.1.10-final | +| `compact(retention_seconds)` | **rebuild-based compaction**, see below | replaces Lance MVCC cleanup | + +Filter constraint surface (loud errors otherwise, [VERIFIED]): only `=, !=, <, <=, >, >=, IN` on metadata columns in KNN queries. We use only EQ/IN. Never use `NOT IN` (the vtab cannot see it; SQLite post-filters and silently under-delivers below k, asg017/sqlite-vec#116). + +## Compaction: the one real behavioral difference + +vec0 DELETE only flips a validity bit; space is never reclaimed, and VACUUM recovers only about half (asg017/sqlite-vec#54, #220, open; fix PRs #243/#210 unmerged). Measured: 5 delete+reinsert cycles on 2K rows grew the file 3.32 MB -> 6.56 MB; VACUUM got back to 4.94 MB. Paperless's per-document churn (every document edit is a delete+reinsert) hits this directly. 
+ +So `compact()` becomes the maintainer-endorsed rebuild (asg017/sqlite-vec#205): + +```sql +CREATE VIRTUAL TABLE nodes_new USING vec0(...); +INSERT INTO nodes_new SELECT id, document_id, modified, node_content, embedding FROM nodes; +DROP TABLE nodes; +ALTER TABLE nodes_new RENAME TO nodes; -- then VACUUM +``` + +This copies vectors without re-embedding, runs under the existing write FileLock, and slots into the existing `document_llmindex compact` command and the scheduled maintenance task. A cheap trigger heuristic: rebuild when `count(*) in nodes_rowids shadow` (cumulative) exceeds ~2x live rows, or just keep the existing scheduled cadence. + +## Concurrency + +vec0 is a plain vtab over ordinary shadow tables, so standard SQLite WAL semantics apply, and the existing architecture is already the textbook arrangement: writers serialized by `settings.LLM_INDEX_LOCK` FileLock, readers concurrent via WAL. Verified across processes: a reader during another process's open write transaction does not block and sees a consistent pre-transaction snapshot; post-commit it sees the new rows [VERIFIED]. No sqlite-vec-specific multi-process corruption, locking, or segfault reports exist in the tracker. The 0.1.10a4 cached-statement fix (#295) is a Firefox/mozStorage `sqlite3_close()` issue; CPython's `sqlite3` is unaffected, no Python-side reports. + +Same caveat as the main SQLite DB: `LLM_INDEX_DIR` should not be on NFS. + +## Performance expectations (measured on the 0.1.9 no-SIMD wheel) + +- KNN 20K rows x 1024 dims: ~74 ms plain, ~39 ms with a metadata EQ filter. +- 100K x 768: 185 ms/query (vs 497 ms for LanceDB exact search on identical data). +- Extrapolated 500K x 1024-1536: ~0.9-1.8 s/query; 384 dims roughly 4x faster. Acceptable for suggestions/chat at the extreme tail; typical installs (low tens of thousands of chunks) are tens of ms. +- Insert: ~3,300 rows/s at 1024 dims in a single transaction. 
+- File size: ~raw vector size (~4.3 KB/row at 1024 dims), no compression; plus the bloat behavior above. + +## Migration from the Lance store + +Beta policy: re-embed. On startup/first index task: if `LLM_INDEX_DIR` contains a Lance table but no `llmindex.db`, log and queue a full rebuild, then remove the Lance directory. No cross-store vector copy, no lancedb import anywhere in the path (which is what un-breaks #12970 hosts: they currently crash at import, have no usable index, and get a fresh build). + +PR #12968's migration machinery maps onto `index_meta['schema_version']`: structural migrations = create-new-table + `INSERT ... SELECT` + rename (vectors copied, no re-embed; same shape as the compaction rebuild); re-embed migrations = drop + full rebuild, jumping straight to the current version. + +## Dependency changes + +- Add: `sqlite-vec==0.1.9` (one ~100 KB platform wheel, zero Python deps). +- Remove: `lancedb~=0.33.0` (and its pylance/lancedb wheels, ~40 MB). `pyarrow` leaves this module; check whether anything else in the AI stack still needs it before dropping from pyproject. + +## Test plan notes + +- pytest-style per project convention; the store tests can run against a tmp_path DB file (or `:memory:` for pure-logic tests; extension loading works on uv-managed CPython [VERIFIED]). +- Port the existing `test_vector_store.py` surface; add dedicated tests for: upsert transactionality (no transient empty state mid-upsert from a second connection), NULL-coercion in `_row()`, k-slice behavior, EQ/IN filter correctness, compaction rebuild preserving rows byte-for-byte, vec_debug canary logging. +- The qemu matrix (`/tmp/vstore-avx-test/`) can be re-run against any future sqlite-vec bump: `qemu-x86_64 -cpu Westmere venv/bin/python candidate_test.py sqlite_vec `. 
+ +## Benchmark harness + +`src/bench_vector_store.py` -- standalone head-to-head comparison run during the migration window when both `PaperlessLanceVectorStore` and `PaperlessSqliteVecVectorStore` coexist (Task 3 Phase A of the implementation plan). After Phase B replaces `vector_store.py`, the Lance import fails gracefully and only the sqlite-vec half runs (useful for post-migration baseline checks). + +```bash +cd src +uv run python bench_vector_store.py # auto-generates bench_data.pkl on first run +uv run python bench_vector_store.py --regenerate # force re-embed +``` + +**Phase 1 (data generation, skipped if `bench_data.pkl` exists):** Faker generates `--n-docs` (default 2000) fake documents -- title, body, correspondent, ISO timestamp. Each body is split into `--chunks-per-doc` (default 3) equal-length chunks (~6000 total nodes). A warm-up embed call fires before generation to ensure the model is resident in GPU. All chunk texts are embedded via Ollama `/api/embed` in batches of 32 and saved to `bench_data.pkl`. Faker seed 42 for reproducibility. + +**Phase 2 (benchmark):** Each store runs in an isolated `tempfile.TemporaryDirectory()`. Query vectors are drawn reproducibly from the corpus (every 10th node, wrapping). + +| Operation | Reps | Metric | +| ----------------------------------------- | ---- | --------------------- | +| `add()` bulk insert | 1 | total time | +| `query()` plain | 50 | p50 / p95 | +| `query()` filtered (IN on 20% of doc IDs) | 50 | p50 / p95 | +| `get_modified_times()` | 20 | p50 | +| `upsert_document()` | 50 | p50 / p95 | +| `compact()` | 1 | total time | +| File size | -- | pre- and post-compact | + +**CLI flags:** `--n-docs` (2000), `--chunks-per-doc` (3), `--data-file` (`bench_data.pkl`), `--regenerate`, `--ollama-url` (`http://192.168.1.87:11434`), `--embed-model` (`qwen3-embedding:4b`), `--query-iters` (50). + +**Dependencies:** `faker` and `httpx` must be available (`uv add --dev faker httpx` if not already installed). 
+ +## Risk register (from the 2026-06-10 issues audit) + +| Risk | Ref | State | Disposition | +| ------------------------------------------- | --------------------------------------- | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| 0.1.10+ wheels bake AVX, no dispatch | release CI change, verified on 0.1.10a4 | current | Pin 0.1.9; vec_debug canary; upstream ask before any bump | +| DELETE never reclaims space; VACUUM ~50% | #54, #220 | open | Rebuild-based `compact()` above | +| INSERT OR REPLACE broken on vec0 | #259 | open | Use DELETE+INSERT in txn (design already does) | +| NULL metadata rejected | #141 | open | Sentinel `""` coercion (already current behavior) | +| Partition-key IN returns k per partition | #142 | open | Avoided: document_id is a metadata column | +| NOT IN silently under-delivers | #116 | open | Never emit NOT IN | +| Locale strtod breaks JSON vector parsing | #241 | open | Always BLOB-bind vectors | +| Single weekend maintainer; fix PRs languish | #226 | open | Mitigated by Mozilla sponsorship + Firefox vendoring (release-train consumer); pin + vendor-from-source remains the escape hatch (no sdist: #211) | +| ANN index = one-way file format | 0.1.10 alphas | — | Do not adopt ANN until 0.1.10 final + flag audit | +| Long-TEXT metadata DELETE bug | #274 | fixed in 0.1.9 | Floor requirement `>=0.1.9` already implied by pin |