From 3aa83c9e4ce4ddcf945cb72ab25ad9e3878f3c90 Mon Sep 17 00:00:00 2001 From: stumpylog <797416+stumpylog@users.noreply.github.com> Date: Fri, 5 Jun 2026 10:15:22 -0700 Subject: [PATCH] To reduce embedding size, don't store the metadata in the body. Body is content + a few other things, metadata keys hold the metadata --- src/paperless_ai/embedding.py | 12 ++++----- src/paperless_ai/indexing.py | 1 + src/paperless_ai/tests/test_ai_indexing.py | 31 ++++++++++++++++++++++ src/paperless_ai/tests/test_embedding.py | 13 +++++---- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/src/paperless_ai/embedding.py b/src/paperless_ai/embedding.py index 2695e9fb3..59582130f 100644 --- a/src/paperless_ai/embedding.py +++ b/src/paperless_ai/embedding.py @@ -138,15 +138,13 @@ def _normalize_llm_index_text(text: str) -> str: def build_llm_index_text(doc: Document) -> str: + # TODO: Filename, Storage Path, and Archive Serial Number are short structured + # values that could move to node.metadata (excluded from embeddings, visible to + # LLM via metadata prepend) — same pattern as title/tags/correspondent. Notes + # and Custom Fields should stay here: Notes can be long free text, Custom Fields + # are dynamic in count and best kept in the embedding. lines = [ - f"Title: {doc.title}", f"Filename: {doc.filename}", - f"Created: {doc.created}", - f"Added: {doc.added}", - f"Modified: {doc.modified}", - f"Tags: {', '.join(tag.name for tag in doc.tags.all())}", - f"Document Type: {doc.document_type.name if doc.document_type else ''}", - f"Correspondent: {doc.correspondent.name if doc.correspondent else ''}", f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}", f"Archive Serial Number: {doc.archive_serial_number or ''}", f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}", diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py index 31b7b7c5c..caae1cbb9 100644 --- a/src/paperless_ai/indexing.py +++ b/src/paperless_ai/indexing.py @@ -119,6 +119,7 @@ def build_document_node( text=text, metadata=metadata, excluded_embed_metadata_keys=list(metadata.keys()), + excluded_llm_metadata_keys=["document_id"], ) chunk_size = chunk_size or get_rag_chunk_size() parser = SimpleNodeParser( diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py index c1bb8aab1..5333e70d9 100644 --- a/src/paperless_ai/tests/test_ai_indexing.py +++ b/src/paperless_ai/tests/test_ai_indexing.py @@ -69,6 +69,37 @@ def test_build_document_node_excludes_metadata_from_embedding( ) +@pytest.mark.django_db +def test_build_document_node_structured_fields_in_metadata( + real_document: Document, +) -> None: + """Structured fields must be in node.metadata so the LLM receives them via metadata prepend.""" + nodes = indexing.build_document_node(real_document) + assert len(nodes) > 0 + for node in nodes: + assert "title" in node.metadata + assert "tags" in node.metadata + assert "correspondent" in node.metadata + assert "document_type" in node.metadata + assert "created" in node.metadata + assert "added" in node.metadata + assert "modified" in node.metadata + + +@pytest.mark.django_db +def test_build_document_node_excludes_document_id_from_llm_context( + real_document: Document, +) -> None: + """document_id is an internal key and must not appear in LLM context text.""" + from llama_index.core.schema import MetadataMode + + nodes = indexing.build_document_node(real_document) + assert len(nodes) > 0 + for node in nodes: + assert "document_id" in node.excluded_llm_metadata_keys + assert "document_id" not in node.get_content(metadata_mode=MetadataMode.LLM) + + @pytest.mark.django_db def test_build_document_node_uses_rag_chunk_settings(real_document: Document) -> None: app_config, _ = ApplicationConfiguration.objects.get_or_create() diff --git a/src/paperless_ai/tests/test_embedding.py b/src/paperless_ai/tests/test_embedding.py index 1dbd0ab99..102a88367 100644 --- a/src/paperless_ai/tests/test_embedding.py +++ b/src/paperless_ai/tests/test_embedding.py @@ -243,12 +243,15 @@ def test_build_llm_index_text(mock_document): result = build_llm_index_text(mock_document) - assert "Title: Test Title" in result + # Structured fields live in node.metadata for LLM context — not body text + assert "Title: Test Title" not in result + assert "Created: 2023-01-01" not in result + assert "Tags: Tag1, Tag2" not in result + assert "Document Type: Invoice" not in result + assert "Correspondent: Test Correspondent" not in result + + # Fields without a metadata equivalent stay in body text assert "Filename: test_file.pdf" in result - assert "Created: 2023-01-01" in result - assert "Tags: Tag1, Tag2" in result - assert "Document Type: Invoice" in result - assert "Correspondent: Test Correspondent" in result assert "Notes: Note1,Note2" in result assert "Content:\n\nThis is the document content." in result assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result