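To reduce embedding size, don't store the metadata in the body text. The body now holds the document content plus a few remaining fields (filename, storage path, archive serial number, notes, custom fields); the structured metadata (title, dates, tags, document type, correspondent) lives under the node's metadata keys instead.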

2026-08-02 17:12:18 +00:00 · 2026-06-05 11:43:42 -07:00
parent e7f8bf0542
commit 3aa83c9e4c
4 changed files with 45 additions and 12 deletions
@@ -138,15 +138,13 @@ def _normalize_llm_index_text(text: str) -> str:


 def build_llm_index_text(doc: Document) -> str:
+    # TODO: Filename, Storage Path, and Archive Serial Number are short structured
+    # values that could move to node.metadata (excluded from embeddings, visible to
+    # LLM via metadata prepend) — same pattern as title/tags/correspondent. Notes
+    # and Custom Fields should stay here: Notes can be long free text, Custom Fields
+    # are dynamic in count and best kept in the embedding.
    lines = [
-        f"Title: {doc.title}",
        f"Filename: {doc.filename}",
-        f"Created: {doc.created}",
-        f"Added: {doc.added}",
-        f"Modified: {doc.modified}",
-        f"Tags: {', '.join(tag.name for tag in doc.tags.all())}",
-        f"Document Type: {doc.document_type.name if doc.document_type else ''}",
-        f"Correspondent: {doc.correspondent.name if doc.correspondent else ''}",
        f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}",
        f"Archive Serial Number: {doc.archive_serial_number or ''}",
        f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}",
@@ -119,6 +119,7 @@ def build_document_node(
        text=text,
        metadata=metadata,
        excluded_embed_metadata_keys=list(metadata.keys()),
+        excluded_llm_metadata_keys=["document_id"],
    )
    chunk_size = chunk_size or get_rag_chunk_size()
    parser = SimpleNodeParser(
@@ -69,6 +69,37 @@ def test_build_document_node_excludes_metadata_from_embedding(
            )


+@pytest.mark.django_db
+def test_build_document_node_structured_fields_in_metadata(
+    real_document: Document,
+) -> None:
+    """Structured fields must be in node.metadata so the LLM receives them via metadata prepend."""
+    nodes = indexing.build_document_node(real_document)
+    assert len(nodes) > 0
+    for node in nodes:
+        assert "title" in node.metadata
+        assert "tags" in node.metadata
+        assert "correspondent" in node.metadata
+        assert "document_type" in node.metadata
+        assert "created" in node.metadata
+        assert "added" in node.metadata
+        assert "modified" in node.metadata
+
+
+@pytest.mark.django_db
+def test_build_document_node_excludes_document_id_from_llm_context(
+    real_document: Document,
+) -> None:
+    """document_id is an internal key and must not appear in LLM context text."""
+    from llama_index.core.schema import MetadataMode
+
+    nodes = indexing.build_document_node(real_document)
+    assert len(nodes) > 0
+    for node in nodes:
+        assert "document_id" in node.excluded_llm_metadata_keys
+        assert "document_id" not in node.get_content(metadata_mode=MetadataMode.LLM)
+
+
@pytest.mark.django_db
 def test_build_document_node_uses_rag_chunk_settings(real_document: Document) -> None:
    app_config, _ = ApplicationConfiguration.objects.get_or_create()
@@ -243,12 +243,15 @@ def test_build_llm_index_text(mock_document):

        result = build_llm_index_text(mock_document)

-        assert "Title: Test Title" in result
+        # Structured fields live in node.metadata for LLM context — not body text
+        assert "Title: Test Title" not in result
+        assert "Created: 2023-01-01" not in result
+        assert "Tags: Tag1, Tag2" not in result
+        assert "Document Type: Invoice" not in result
+        assert "Correspondent: Test Correspondent" not in result
+
+        # Fields without a metadata equivalent stay in body text
        assert "Filename: test_file.pdf" in result
-        assert "Created: 2023-01-01" in result
-        assert "Tags: Tag1, Tag2" in result
-        assert "Document Type: Invoice" in result
-        assert "Correspondent: Test Correspondent" in result
        assert "Notes: Note1,Note2" in result
        assert "Content:\n\nThis is the document content." in result
        assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result