From ef8b4b453d93ec53b3841f042a5637b21ddd3ff2 Mon Sep 17 00:00:00 2001 From: stumpylog <797416+stumpylog@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:27:15 -0700 Subject: [PATCH] Removes these for starting implementation --- ...026-05-11-ai-taxonomy-candidates-design.md | 695 ------------------ .../2026-05-20-ai-taxonomy-hints-design.md | 213 ------ 2 files changed, 908 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md delete mode 100644 docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md diff --git a/docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md b/docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md deleted file mode 100644 index fb6efd594..000000000 --- a/docs/superpowers/plans/2026-05-11-ai-taxonomy-candidates-design.md +++ /dev/null @@ -1,695 +0,0 @@ -# AI Taxonomy Candidate Injection Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Inject the user's existing taxonomy (tags, correspondents, document types, storage paths) as candidates into the LLM prompt so it prefers exact existing names over inventing new ones. - -**Architecture:** A new `get_taxonomy_candidates(user)` helper fetches each category permission-filtered to the requesting user, annotated with document-count for frequency ordering, and capped at 200 per category. A private `_format_candidates_section` helper renders the candidate lists into a prompt appendix. `build_prompt_without_rag` and `build_prompt_with_rag` each gain an optional `candidates` parameter. `get_ai_document_classification` wires it all together — fetch candidates then pass them to the prompt builder. No changes to the view, matching layer, or response format. - -**Tech Stack:** Django ORM (`annotate`, `Count`), `get_objects_for_user_owner_aware` (already used in `matching.py`), pytest + `unittest.mock` - ---- - -## File Map - -- **Modify:** `src/paperless_ai/ai_classifier.py` - - Add constant `TAXONOMY_CANDIDATE_LIMIT = 200` - - Add `get_taxonomy_candidates(user)` helper - - Add `_format_candidates_section(candidates)` helper - - Update `build_prompt_without_rag` signature and body - - Update `build_prompt_with_rag` signature and body - - Update `get_ai_document_classification` body -- **Create:** `src/paperless_ai/tests/test_taxonomy_candidates.py` - - All new tests for the above - ---- - -### Task 1: `get_taxonomy_candidates` — tests + implementation - -**Files:** - -- Modify: `src/paperless_ai/ai_classifier.py` -- Create: `src/paperless_ai/tests/test_taxonomy_candidates.py` - -- [ ] **Step 1: Write the failing tests** - -Create `src/paperless_ai/tests/test_taxonomy_candidates.py`: - -```python -import pytest -from unittest.mock import patch - -from django.contrib.auth.models import User - -from documents.models import Correspondent -from documents.models import Document -from documents.models import DocumentType -from documents.models import StoragePath -from documents.models import Tag -from paperless_ai.ai_classifier import TAXONOMY_CANDIDATE_LIMIT -from paperless_ai.ai_classifier import get_taxonomy_candidates - - -def test_get_taxonomy_candidates_returns_none_for_none_user(): - assert get_taxonomy_candidates(None) is None - - -@pytest.mark.django_db -class TestGetTaxonomyCandidates: - def test_returns_dict_with_four_keys(self): - user = User.objects.create_user(username="tc_user1", password="x") - with patch( - "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", - ) as mock_get: - mock_get.side_effect = [ - Tag.objects.none(), - Correspondent.objects.none(), - DocumentType.objects.none(), - StoragePath.objects.none(), - ] - result = get_taxonomy_candidates(user) - assert result is not None - assert set(result.keys()) == { - "tags", - "correspondents", - "document_types", - "storage_paths", - } - - def test_returns_names_as_strings(self): - user = User.objects.create_user(username="tc_user2", password="x") - tag = Tag.objects.create(name="Bloodwork") - with patch( - "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", - ) as mock_get: - mock_get.side_effect = [ - Tag.objects.filter(pk=tag.pk), - Correspondent.objects.none(), - DocumentType.objects.none(), - StoragePath.objects.none(), - ] - result = get_taxonomy_candidates(user) - assert result["tags"] == ["Bloodwork"] - - def test_orders_tags_by_document_count_descending(self): - user = User.objects.create_user(username="tc_user3", password="x") - tag_low = Tag.objects.create(name="LowUse") - tag_high = Tag.objects.create(name="HighUse") - - doc1 = Document.objects.create(mime_type="text/plain", checksum="tc_doc1") - doc2 = Document.objects.create(mime_type="text/plain", checksum="tc_doc2") - doc3 = Document.objects.create(mime_type="text/plain", checksum="tc_doc3") - doc1.tags.add(tag_high) - doc2.tags.add(tag_high) - doc3.tags.add(tag_low) - - with patch( - "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", - ) as mock_get: - mock_get.side_effect = [ - Tag.objects.filter(pk__in=[tag_low.pk, tag_high.pk]), - Correspondent.objects.none(), - DocumentType.objects.none(), - StoragePath.objects.none(), - ] - result = get_taxonomy_candidates(user) - - assert result["tags"] == ["HighUse", "LowUse"] - - def test_caps_results_at_taxonomy_candidate_limit(self): - user = User.objects.create_user(username="tc_user4", password="x") - tags = [Tag.objects.create(name=f"Tag{i}") for i in range(5)] - - with ( - patch( - "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", - ) as mock_get, - patch("paperless_ai.ai_classifier.TAXONOMY_CANDIDATE_LIMIT", 3), - ): - mock_get.side_effect = [ - Tag.objects.filter(pk__in=[t.pk for t in tags]), - Correspondent.objects.none(), - DocumentType.objects.none(), - StoragePath.objects.none(), - ] - result = get_taxonomy_candidates(user) - - assert len(result["tags"]) == 3 - - def test_all_four_categories_are_fetched(self): - user = User.objects.create_user(username="tc_user5", password="x") - tag = Tag.objects.create(name="MyTag") - corr = Correspondent.objects.create(name="MyCorr") - dt = DocumentType.objects.create(name="MyType") - sp = StoragePath.objects.create(name="MyPath", path="/my/path") - - with patch( - "paperless_ai.ai_classifier.get_objects_for_user_owner_aware", - ) as mock_get: - mock_get.side_effect = [ - Tag.objects.filter(pk=tag.pk), - Correspondent.objects.filter(pk=corr.pk), - DocumentType.objects.filter(pk=dt.pk), - StoragePath.objects.filter(pk=sp.pk), - ] - result = get_taxonomy_candidates(user) - - assert result["tags"] == ["MyTag"] - assert result["correspondents"] == ["MyCorr"] - assert result["document_types"] == ["MyType"] - assert result["storage_paths"] == ["MyPath"] -``` - -- [ ] **Step 2: Run to confirm they all fail** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: `ImportError` or `FAILED` — `get_taxonomy_candidates` does not exist yet. - -- [ ] **Step 3: Add the implementation to `ai_classifier.py`** - -At the top of `src/paperless_ai/ai_classifier.py`, add new imports after the existing ones: - -```python -from django.db.models import Count - -from documents.models import Correspondent -from documents.models import DocumentType -from documents.models import StoragePath -from documents.models import Tag -from documents.permissions import get_objects_for_user_owner_aware -``` - -Add the constant and helper right after the `logger` line: - -```python -TAXONOMY_CANDIDATE_LIMIT = 200 - - -def get_taxonomy_candidates(user: User | None) -> dict[str, list[str]] | None: - if user is None: - return None - - tags = ( - get_objects_for_user_owner_aware(user, ["view_tag"], Tag) - .annotate(doc_count=Count("documents")) - .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] - ) - correspondents = ( - get_objects_for_user_owner_aware(user, ["view_correspondent"], Correspondent) - .annotate(doc_count=Count("documents")) - .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] - ) - document_types = ( - get_objects_for_user_owner_aware(user, ["view_documenttype"], DocumentType) - .annotate(doc_count=Count("documents")) - .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] - ) - storage_paths = ( - get_objects_for_user_owner_aware(user, ["view_storagepath"], StoragePath) - .annotate(doc_count=Count("documents")) - .order_by("-doc_count")[:TAXONOMY_CANDIDATE_LIMIT] - ) - - return { - "tags": [t.name for t in tags], - "correspondents": [c.name for c in correspondents], - "document_types": [d.name for d in document_types], - "storage_paths": [s.name for s in storage_paths], - } -``` - -- [ ] **Step 4: Run to confirm Task 1 tests pass** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: all 6 tests `PASSED`. - -- [ ] **Step 5: Confirm existing AI classifier tests still pass** - -```bash -cd src && uv run pytest paperless_ai/tests/test_ai_classifier.py --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: all tests `PASSED`. - -- [ ] **Step 6: Commit** - -```bash -git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py -git commit -m "feat: add get_taxonomy_candidates helper with frequency ordering and cap" -``` - ---- - -### Task 2: Prompt injection — `_format_candidates_section` + `build_prompt_without_rag` - -**Files:** - -- Modify: `src/paperless_ai/ai_classifier.py` -- Modify: `src/paperless_ai/tests/test_taxonomy_candidates.py` - -- [ ] **Step 1: Write the failing tests** - -Append to `src/paperless_ai/tests/test_taxonomy_candidates.py`: - -```python -from unittest.mock import MagicMock - -from paperless_ai.ai_classifier import build_prompt_without_rag - - -@pytest.fixture -def mock_doc(): - doc = MagicMock(spec=Document) - doc.filename = "invoice.pdf" - doc.content = "Some document content." - return doc - - -class TestBuildPromptWithoutRag: - def test_no_candidates_section_when_candidates_is_none(self, mock_doc): - prompt = build_prompt_without_rag(mock_doc, candidates=None) - assert "Existing metadata" not in prompt - - def test_no_candidates_section_when_candidates_is_empty_dict(self, mock_doc): - prompt = build_prompt_without_rag(mock_doc, candidates={}) - assert "Existing metadata" not in prompt - - def test_candidates_section_present_when_provided(self, mock_doc): - candidates = { - "tags": ["Bloodwork", "Insurance"], - "correspondents": ["Dr. Smith"], - "document_types": [], - "storage_paths": [], - } - prompt = build_prompt_without_rag(mock_doc, candidates=candidates) - assert "Existing metadata" in prompt - assert "Bloodwork" in prompt - assert "Dr. Smith" in prompt - - def test_empty_categories_omitted_from_section(self, mock_doc): - candidates = { - "tags": ["Bloodwork"], - "correspondents": [], - "document_types": [], - "storage_paths": [], - } - prompt = build_prompt_without_rag(mock_doc, candidates=candidates) - assert "Correspondents:" not in prompt - assert "Document types:" not in prompt - assert "Storage paths:" not in prompt - - def test_existing_prompt_content_preserved(self, mock_doc): - prompt = build_prompt_without_rag(mock_doc, candidates=None) - assert "invoice.pdf" in prompt - assert "Some document content." in prompt -``` - -- [ ] **Step 2: Run to confirm they fail** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithoutRag --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: `FAILED` — `build_prompt_without_rag` doesn't accept `candidates` yet. - -- [ ] **Step 3: Add `_format_candidates_section` and update `build_prompt_without_rag` in `ai_classifier.py`** - -Add `_format_candidates_section` immediately after `get_taxonomy_candidates`: - -```python -def _format_candidates_section(candidates: dict[str, list[str]]) -> str: - lines = [ - "Existing metadata (use exact names where they fit; suggest new ones only if nothing matches):", - ] - for key, label in [ - ("tags", "Tags"), - ("correspondents", "Correspondents"), - ("document_types", "Document types"), - ("storage_paths", "Storage paths"), - ]: - names = candidates.get(key, []) - if names: - lines.append(f"{label}: {', '.join(names)}") - return "\n".join(lines) -``` - -Replace the existing `build_prompt_without_rag`: - -```python -def build_prompt_without_rag( - document: Document, - candidates: dict[str, list[str]] | None = None, -) -> str: - filename = document.filename or "" - content = truncate_content(document.content[:4000] or "") - - prompt = f""" - You are a document classification assistant. - - Analyze the following document and extract the following information: - - A short descriptive title - - Tags that reflect the content - - Names of people or organizations mentioned - - The type or category of the document - - Suggested folder paths for storing the document - - Up to 3 relevant dates in YYYY-MM-DD format - - Filename: - {filename} - - Content: - {content} - """.strip() - - if candidates: - prompt += "\n\n" + _format_candidates_section(candidates) - - return prompt -``` - -- [ ] **Step 4: Run to confirm Task 2 tests pass** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithoutRag --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: all 5 tests `PASSED`. - -- [ ] **Step 5: Run full test file to check no regressions** - -```bash -cd src && uv run pytest paperless_ai/tests/ --override-ini="addopts=" -v 2>&1 | tail -30 -``` - -Expected: all tests `PASSED`. - -- [ ] **Step 6: Commit** - -```bash -git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py -git commit -m "feat: inject taxonomy candidates into build_prompt_without_rag" -``` - ---- - -### Task 3: Update `build_prompt_with_rag` - -**Files:** - -- Modify: `src/paperless_ai/ai_classifier.py` -- Modify: `src/paperless_ai/tests/test_taxonomy_candidates.py` - -- [ ] **Step 1: Write the failing tests** - -Append to `src/paperless_ai/tests/test_taxonomy_candidates.py`: - -```python -from paperless_ai.ai_classifier import build_prompt_with_rag - - -class TestBuildPromptWithRag: - def test_no_candidates_section_when_candidates_is_none(self, mock_doc): - with patch( - "paperless_ai.ai_classifier.get_context_for_document", - return_value="similar doc context", - ): - prompt = build_prompt_with_rag(mock_doc, candidates=None) - assert "Existing metadata" not in prompt - - def test_candidates_section_present_when_provided(self, mock_doc): - candidates = { - "tags": ["Insurance"], - "correspondents": [], - "document_types": ["Invoice"], - "storage_paths": [], - } - with patch( - "paperless_ai.ai_classifier.get_context_for_document", - return_value="similar doc context", - ): - prompt = build_prompt_with_rag(mock_doc, candidates=candidates) - assert "Existing metadata" in prompt - assert "Insurance" in prompt - assert "Invoice" in prompt - - def test_rag_context_still_present(self, mock_doc): - with patch( - "paperless_ai.ai_classifier.get_context_for_document", - return_value="similar doc context", - ): - prompt = build_prompt_with_rag(mock_doc, candidates=None) - assert "similar doc context" in prompt -``` - -- [ ] **Step 2: Run to confirm they fail** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithRag --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: `FAILED` — `build_prompt_with_rag` doesn't accept `candidates` yet. - -- [ ] **Step 3: Update `build_prompt_with_rag` in `ai_classifier.py`** - -Replace the existing `build_prompt_with_rag`: - -```python -def build_prompt_with_rag( - document: Document, - user: User | None = None, - candidates: dict[str, list[str]] | None = None, -) -> str: - base_prompt = build_prompt_without_rag(document, candidates) - context = truncate_content(get_context_for_document(document, user)) - - return f"""{base_prompt} - - Additional context from similar documents: - {context} - """.strip() -``` - -- [ ] **Step 4: Run to confirm Task 3 tests pass** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestBuildPromptWithRag --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: all 3 tests `PASSED`. - -- [ ] **Step 5: Run full test file to check no regressions** - -```bash -cd src && uv run pytest paperless_ai/tests/ --override-ini="addopts=" -v 2>&1 | tail -30 -``` - -Expected: all tests `PASSED`. - -- [ ] **Step 6: Commit** - -```bash -git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py -git commit -m "feat: pass taxonomy candidates through build_prompt_with_rag" -``` - ---- - -### Task 4: Wire `get_ai_document_classification` - -**Files:** - -- Modify: `src/paperless_ai/ai_classifier.py` -- Modify: `src/paperless_ai/tests/test_taxonomy_candidates.py` - -- [ ] **Step 1: Write the failing tests** - -Append to `src/paperless_ai/tests/test_taxonomy_candidates.py`: - -```python -from django.test import override_settings - -from paperless_ai.ai_classifier import get_ai_document_classification - - -@pytest.mark.django_db -class TestGetAiDocumentClassificationCandidateWiring: - @override_settings(LLM_BACKEND="ollama", LLM_MODEL="some_model") - def test_candidates_fetched_and_passed_when_user_provided(self, mock_doc): - user = User.objects.create_user(username="tc_wire_user1", password="x") - fake_candidates = { - "tags": ["Bloodwork"], - "correspondents": [], - "document_types": [], - "storage_paths": [], - } - with ( - patch( - "paperless_ai.ai_classifier.get_taxonomy_candidates", - return_value=fake_candidates, - ) as mock_candidates, - patch( - "paperless_ai.ai_classifier.build_prompt_without_rag", - return_value="prompt", - ) as mock_build, - patch("paperless_ai.client.AIClient.run_llm_query") as mock_llm, - ): - mock_llm.return_value = { - "title": "", - "tags": [], - "correspondents": [], - "document_types": [], - "storage_paths": [], - "dates": [], - } - get_ai_document_classification(mock_doc, user) - - mock_candidates.assert_called_once_with(user) - mock_build.assert_called_once_with(mock_doc, fake_candidates) - - @override_settings(LLM_BACKEND="ollama", LLM_MODEL="some_model") - def test_no_candidates_when_user_is_none(self, mock_doc): - with ( - patch( - "paperless_ai.ai_classifier.get_taxonomy_candidates", - ) as mock_candidates, - patch( - "paperless_ai.ai_classifier.build_prompt_without_rag", - return_value="prompt", - ) as mock_build, - patch("paperless_ai.client.AIClient.run_llm_query") as mock_llm, - ): - mock_llm.return_value = { - "title": "", - "tags": [], - "correspondents": [], - "document_types": [], - "storage_paths": [], - "dates": [], - } - get_ai_document_classification(mock_doc, user=None) - - mock_candidates.assert_not_called() - mock_build.assert_called_once_with(mock_doc, None) - - @override_settings( - LLM_BACKEND="ollama", - LLM_MODEL="some_model", - LLM_EMBEDDING_BACKEND="huggingface", - LLM_EMBEDDING_MODEL="some_model", - ) - def test_candidates_passed_to_rag_prompt_when_embedding_configured(self, mock_doc): - user = User.objects.create_user(username="tc_wire_user2", password="x") - fake_candidates = { - "tags": ["Tax"], - "correspondents": [], - "document_types": [], - "storage_paths": [], - } - with ( - patch( - "paperless_ai.ai_classifier.get_taxonomy_candidates", - return_value=fake_candidates, - ), - patch( - "paperless_ai.ai_classifier.build_prompt_with_rag", - return_value="rag_prompt", - ) as mock_rag, - patch("paperless_ai.client.AIClient.run_llm_query") as mock_llm, - ): - mock_llm.return_value = { - "title": "", - "tags": [], - "correspondents": [], - "document_types": [], - "storage_paths": [], - "dates": [], - } - get_ai_document_classification(mock_doc, user) - - mock_rag.assert_called_once_with(mock_doc, user, fake_candidates) -``` - -- [ ] **Step 2: Run to confirm they fail** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestGetAiDocumentClassificationCandidateWiring --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: `FAILED` — `get_ai_document_classification` doesn't pass candidates yet. - -- [ ] **Step 3: Update `get_ai_document_classification` in `ai_classifier.py`** - -Replace the existing `get_ai_document_classification`: - -```python -def get_ai_document_classification( - document: Document, - user: User | None = None, -) -> dict: - ai_config = AIConfig() - candidates = get_taxonomy_candidates(user) if user is not None else None - - prompt = ( - build_prompt_with_rag(document, user, candidates) - if ai_config.llm_embedding_backend - else build_prompt_without_rag(document, candidates) - ) - - client = AIClient() - result = client.run_llm_query(prompt) - return parse_ai_response(result) -``` - -- [ ] **Step 4: Run Task 4 tests** - -```bash -cd src && uv run pytest paperless_ai/tests/test_taxonomy_candidates.py::TestGetAiDocumentClassificationCandidateWiring --override-ini="addopts=" -v 2>&1 | tail -20 -``` - -Expected: all 3 tests `PASSED`. - -- [ ] **Step 5: Run the full `paperless_ai` test suite** - -```bash -cd src && uv run pytest paperless_ai/tests/ --override-ini="addopts=" -v 2>&1 | tail -40 -``` - -Expected: all tests `PASSED`. - -- [ ] **Step 6: Commit** - -```bash -git add src/paperless_ai/ai_classifier.py src/paperless_ai/tests/test_taxonomy_candidates.py -git commit -m "feat: wire taxonomy candidates into get_ai_document_classification" -``` - ---- - -### Task 5: Final verification - -- [ ] **Step 1: Run the broader backend test suite to catch any regressions** - -```bash -cd src && uv run pytest documents/tests/test_api_documents.py documents/tests/test_views.py paperless_ai/tests/ --override-ini="addopts=" -q 2>&1 | tail -20 -``` - -Expected: all `PASSED`, no errors. - -- [ ] **Step 2: Verify `ai_classifier.py` import order follows project conventions** - -Project convention: stdlib → Django → third-party → local, alphabetical within each group. Open `src/paperless_ai/ai_classifier.py` and confirm the new imports (`Count`, model imports, `get_objects_for_user_owner_aware`) are placed in the correct groups in alphabetical order. - -- [ ] **Step 3: Final commit if any formatting fixes were needed** - -If Step 2 required changes: - -```bash -git add src/paperless_ai/ai_classifier.py -git commit -m "chore: fix import ordering in ai_classifier.py" -``` diff --git a/docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md b/docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md deleted file mode 100644 index 4cc4a9370..000000000 --- a/docs/superpowers/specs/2026-05-20-ai-taxonomy-hints-design.md +++ /dev/null @@ -1,213 +0,0 @@ -# AI Suggestions: Inject existing taxonomy as candidates - -**Status:** Design (v3 — RAG-sourced, node metadata) -**Date:** 2026-05-20 -**Updated:** 2026-06-09 (v3: switch from frequency DB queries to node metadata from RAG retrieval) -**Related:** [Discussion #12787](https://github.com/paperless-ngx/paperless-ngx/discussions/12787) -**Branch target:** `dev` -**Depends on:** `2026-06-09-node-metadata-enrichment.md` (adds `storage_path`, `filename`, `asn` to node metadata; must land first) - -## Problem - -AI Suggestions currently asks the LLM for free-form tag/document-type/correspondent/storage-path names, then reconciles via `difflib` fuzzy matching (cutoff 0.8) in `paperless_ai/matching.py`. This works for typos but not for semantic equivalents: - -- `blood test` does not fuzzy-match `Bloodwork` -- `IRS` does not fuzzy-match `Taxes` -- `doctor visit` does not fuzzy-match `Medical` - -Result: the LLM invents new metadata names that duplicate existing taxonomy entries. - -## Goal - -Tell the LLM what already exists, so it can prefer existing names verbatim. Fuzzy matching becomes the fallback for typos and for legitimately novel suggestions, not the primary semantic-equivalence mechanism. - -Non-goals: changing the LLM client, embedding model selection, or RAG retrieval. Replacing fuzzy matching entirely. Custom-field option values. Frequency-based DB queries (superseded by RAG-sourced approach). - -## Approach - -Hints are sourced from the LanceDB node metadata of the similar documents already retrieved for RAG context — no separate DB queries, no new user-facing configuration. The feature is **gated on `llm_embedding_backend`**: when no embedding backend is configured, no hints are built and today's behavior is unchanged. - -LanceDB nodes already store `tags`, `correspondent`, `document_type`, `title`, and date fields per document (see `indexing.py:build_document_node`). `storage_path` is not currently stored; this feature adds it via a structural schema migration (no re-embed required). - -For each suggestion request (when embedding backend is on): - -1. Run the ANN retrieval once → get raw `NodeWithScore` results. -2. Extract taxonomy from the node metadata: `tags` (list), `document_type`, `correspondent`, `storage_path`. -3. Inject the unique names into the LLM prompt as "Available " blocks. -4. Pass the same name sets to `matching.py` as `hinted_names` so an exact normalized match short-circuits past fuzzy. - -When embedding backend is off → `hints = None` → prompt and matching are identical to today. - -## Components - -### `paperless_ai/indexing.py` (modify — `retrieve_similar_nodes`) - -Extract the shared retriever logic from `query_similar_documents` into a new lower-level function: - -```python -def retrieve_similar_nodes( - document: Document, - document_ids: Iterable[int | str] | None = None, - top_k: int = 5, -) -> list["NodeWithScore"]: - """Run ANN retrieval and return raw NodeWithScore results.""" - ... -``` - -Refactor `query_similar_documents` to call `retrieve_similar_nodes` and convert to ORM objects (behavior unchanged). The taxonomy hints path calls `retrieve_similar_nodes` directly — no DB round-trip, no second ANN query. - -### `paperless_ai/taxonomy.py` (new) - -```python -class TaxonomyHints(TypedDict): - tags: list[str] - document_types: list[str] - correspondents: list[str] - storage_paths: list[str] - -def build_taxonomy_hints_from_nodes(nodes: list["NodeWithScore"]) -> TaxonomyHints: ... -def get_taxonomy_hints_for_document(document: Document, user: User | None) -> TaxonomyHints | None: ... -def format_hints_for_prompt(hints: TaxonomyHints) -> str: ... -``` - -`get_taxonomy_hints_for_document`: - -- Returns `None` immediately if `AIConfig().llm_embedding_backend` is falsy. -- Applies the same owner-aware document ID filter as `get_context_for_document` (`get_objects_for_user_owner_aware(user, "view_document", Document)` when `user` is not `None`; unfiltered otherwise). -- Calls `retrieve_similar_nodes(document=document, document_ids=visible_document_ids)`. -- Passes results to `build_taxonomy_hints_from_nodes`. - -`build_taxonomy_hints_from_nodes(nodes)`: - -- Extracts from each `node.metadata`: `tags` (list), `document_type` (str | None), `correspondent` (str | None), `storage_path` (str | None). -- Collects unique values across all nodes, sorted. Empty/`None` values skipped. -- Returns a `TaxonomyHints`. No cap — naturally bounded by `top_k=5` in retrieval. - -`format_hints_for_prompt` emits one `Available :` block per non-empty category. Empty categories produce no block (avoid prompting the LLM with "Available tags: (none)"). A single instruction line follows: - -``` -Prefer existing names from these lists verbatim. Only propose a new value -if none of the existing names fits. -``` - -### `paperless_ai/ai_classifier.py` (modify) - -> **Note (updated 2026-06-09):** Current signatures after #12894 and #12944: -> -> - `build_prompt_without_rag(document: Document, config: AIConfig) -> str` -> - `build_prompt_with_rag(document: Document, config: AIConfig, user: User | None = None) -> str` -> - `get_ai_document_classification(document, user, output_language: str | None = None) -> dict` -> -> `build_localization_prompt` (added in #12894) runs after the LLM call and does **not** interact with taxonomy hints — hints inject into the base prompt only, before the LLM call. - -Both `build_prompt_without_rag` and `build_prompt_with_rag` accept a new optional `hints: TaxonomyHints | None = None` parameter. When non-`None`, `format_hints_for_prompt(hints)` is spliced in before the "Analyze the following document" instruction. When `None` (default), the prompt is built as today. - -`get_ai_document_classification(document, user, output_language: str | None = None, hints: TaxonomyHints | None = None)` accepts the same optional `hints` and forwards it to the prompt builder. Return shape **unchanged** (`dict`). Callers in tests pass `hints=None` (or omit) to preserve existing behavior. - -### `paperless_ai/matching.py` (modify) - -- `_match_names_to_queryset(names, queryset, attr, hinted_names: set[str] | None = None)`: - - Normalization unchanged. - - Exact-match-on-full-queryset behavior unchanged (always tried first). - - When `hinted_names` is provided and the LLM-returned name (normalized) matches a hinted name (normalized) → treated as exact-only; fuzzy is skipped for that name. - - When `hinted_names` is `None` or the name isn't in it → existing 0.8 fuzzy fallback runs. -- `match_tags_by_name(names, user, hinted_names=None)` etc. — optional kwarg, backward compatible. - -### `documents/views.py` (modify) - -The suggestion endpoint (around line 1498) is the single production caller of `get_ai_document_classification` and the call site for `match_*_by_name`. Update it to: - -1. Build hints: `hints = get_taxonomy_hints_for_document(doc, request.user)` — returns `None` when embedding backend is off; no additional config check needed in the view. -2. Pass `hints` into the classifier: `parsed = get_ai_document_classification(doc, request.user, output_language, hints=hints)` — `output_language` is already resolved at this point (`views.py:1472`). -3. Pass `hinted_names=set(hints["tags"])` (etc., one per category, or `None` when `hints` is `None`) into each `match_*_by_name` call. - -**Cache interaction:** the AI suggestion path is wrapped by `cached_llm_suggestions` / `refresh_suggestions_cache` (views.py:1488). A cached response bypasses both the LLM call and hint construction entirely. Acceptable for v1. - -### No `AIConfig` / DB model / settings changes - -No new configuration fields, DB columns, Django migrations, env vars, or frontend changes. The feature is automatically active for users who have an embedding backend configured and invisible to everyone else. - -## Data flow - -Suggestion request (embedding backend on): - -1. View calls `get_taxonomy_hints_for_document(doc, user)` → `retrieve_similar_nodes` → extract metadata → `TaxonomyHints`. -2. View calls `get_ai_document_classification(doc, user, output_language, hints=hints)`. -3. Classifier builds RAG prompt via `build_prompt_with_rag` (internally calls `query_similar_documents` → `retrieve_similar_nodes` for context text) + splices hints block → LLM → parsed dict. -4. View calls `match_*_by_name(names, user, hinted_names=set(hints[]))` per category. - -Suggestion request (embedding backend off): - -- `get_taxonomy_hints_for_document` returns `None` immediately (no retrieval runs). -- Rest of the flow identical to today. - -**Note on retrieval calls:** `retrieve_similar_nodes` is called once directly (for hints) and once indirectly via `build_prompt_with_rag` → `get_context_for_document` → `query_similar_documents`. Both calls use identical parameters. Acceptable for v1; can be eliminated later by lifting `retrieve_similar_nodes` up to `get_ai_document_classification` and threading results to both callers. - -## Error handling - -- **Embedding backend off:** `get_taxonomy_hints_for_document` returns `None`; no hints; behavior identical to today. -- **No similar documents found:** `build_taxonomy_hints_from_nodes([])` returns all-empty `TaxonomyHints`; `format_hints_for_prompt` produces no blocks; effectively `hints = None`. -- **Node missing `storage_path` key** (index predates the metadata enrichment prerequisite): `node.metadata.get("storage_path")` returns `None`; skipped gracefully. Storage path hints absent until rebuild completes. -- **LLM returns a name not in hints but exactly matching an existing visible name:** still treated as exact match — `_match_names_to_queryset` always tries exact-on-full-queryset before fuzzy. -- **Retrieval failure:** propagates; suggestion failures already surface as 5xx. - -## Testing - -All tests use pytest style — grouped under classes, `@pytest.mark.django_db` on the class, `pytest-mock`'s `mocker` fixture, every fixture parameter/return/test signature type-annotated. Format with `ruff` directly (not `uv run ruff`). - -### `paperless_ai/tests/test_taxonomy.py` (new) - -- `class TestBuildTaxonomyHintsFromNodes:` - - Returns a `TaxonomyHints` with all four keys. - - Deduplicates tag names shared across multiple nodes. - - `None` values in node metadata skipped gracefully. - - Missing `storage_path` key in metadata handled gracefully (pre-migration nodes). - - Empty node list → all-empty `TaxonomyHints`. - - Sorted output is stable across calls. - -- `class TestGetTaxonomyHintsForDocument:` - - Returns `None` when `AIConfig().llm_embedding_backend` is falsy; `retrieve_similar_nodes` not called (`mocker.spy`). - - Calls `retrieve_similar_nodes` with owner-aware document ID filter when user is provided. - - Returns populated `TaxonomyHints` when nodes are found. - - Returns all-empty `TaxonomyHints` (not `None`) when `retrieve_similar_nodes` returns `[]`. - -- `class TestFormatHintsForPrompt:` - - All four blocks present when all categories non-empty. - - Empty category produces no block. - - All-empty hints produces empty string (no stray instruction line). - - Instruction line appears exactly once when at least one block is rendered. - -### `paperless_ai/tests/test_ai_classifier.py` (extend) - -- `class TestBuildPrompt:` - - `build_prompt_without_rag(doc, config, hints=hints)` produces a prompt containing the hints block when hints are non-empty. - - `build_prompt_with_rag(doc, config, user, hints=hints)` includes both the RAG context block (unchanged) and the hints block. - - `hints=None`: prompt matches today's baseline (string equality against a fixture). - - `get_ai_document_classification(doc, user, hints=...)` forwards hints into the prompt; return shape unchanged. - -### `paperless_ai/tests/test_matching.py` (extend) - -- `class TestHintedMatching:` - - LLM returns `"Bloodwork"` verbatim, `hinted_names={"Bloodwork"}` → exact match returned; `difflib.get_close_matches` not called (`mocker.spy`). - - LLM returns `"blood test"` not in `hinted_names`, no existing exact → fuzzy fallback runs; behavior unchanged from today (regression guard). - - LLM returns `"Bloodwork "` (whitespace) with `hinted_names={"Bloodwork"}` → normalized exact match wins, fuzzy not consulted. - - Backward compatibility: `match_tags_by_name(names, user)` without the kwarg behaves identically to today. - -## Migration / rollout - -No migration in this feature. The prerequisite spec (`2026-06-09-node-metadata-enrichment.md`) handles the LanceDB schema migration (v2, `requires_reembed=True`) and the resulting index rebuild. Once that lands, `storage_path` is in every node's metadata and this feature needs no additional migration steps. - -No Django migration. No new config. Users with an embedding backend get taxonomy hints automatically once both specs are shipped; users without one see no change. - -## Interplay with `extract_unmatched_names` - -`extract_unmatched_names` surfaces LLM-returned names that didn't match any existing taxonomy entry — the UI uses these to offer "create new tag?" affordances. With hints in place, fewer names will be unmatched. No behavior change required: a hinted name the LLM returns verbatim will exact-match and not appear in the unmatched list; a name the LLM invents anyway still flows through fuzzy and, if no match, surfaces as "new" exactly as today. - -## Out of scope (potential v2) - -- Capping hint list length per category (currently unbounded within `top_k=5` retrieved nodes; revisit if prompt length becomes a concern). -- Eliminating the double `retrieve_similar_nodes` call by threading nodes through `get_ai_document_classification`. -- Frequency-based hints as a fallback for users without an embedding backend. -- Structured output / JSON schema enum constraints as an alternative to prompt injection. -- Tag hierarchy awareness. -- Custom field option values.