To reduce embedding size, don't store the metadata in the body. Body is content + a few other things, metadata keys hold the metadata

This commit is contained in:
stumpylog
2026-06-05 10:15:22 -07:00
parent e7f8bf0542
commit 3aa83c9e4c
4 changed files with 45 additions and 12 deletions
+5 -7
View File
@@ -138,15 +138,13 @@ def _normalize_llm_index_text(text: str) -> str:
def build_llm_index_text(doc: Document) -> str:
# TODO: Filename, Storage Path, and Archive Serial Number are short structured
# values that could move to node.metadata (excluded from embeddings, visible to
# LLM via metadata prepend) — same pattern as title/tags/correspondent. Notes
# and Custom Fields should stay here: Notes can be long free text, Custom Fields
# are dynamic in count and best kept in the embedding.
lines = [
f"Title: {doc.title}",
f"Filename: {doc.filename}",
f"Created: {doc.created}",
f"Added: {doc.added}",
f"Modified: {doc.modified}",
f"Tags: {', '.join(tag.name for tag in doc.tags.all())}",
f"Document Type: {doc.document_type.name if doc.document_type else ''}",
f"Correspondent: {doc.correspondent.name if doc.correspondent else ''}",
f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}",
f"Archive Serial Number: {doc.archive_serial_number or ''}",
f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}",
+1
View File
@@ -119,6 +119,7 @@ def build_document_node(
text=text,
metadata=metadata,
excluded_embed_metadata_keys=list(metadata.keys()),
excluded_llm_metadata_keys=["document_id"],
)
chunk_size = chunk_size or get_rag_chunk_size()
parser = SimpleNodeParser(
@@ -69,6 +69,37 @@ def test_build_document_node_excludes_metadata_from_embedding(
)
@pytest.mark.django_db
def test_build_document_node_structured_fields_in_metadata(
real_document: Document,
) -> None:
"""Structured fields must be in node.metadata so the LLM receives them via metadata prepend."""
nodes = indexing.build_document_node(real_document)
assert len(nodes) > 0
for node in nodes:
assert "title" in node.metadata
assert "tags" in node.metadata
assert "correspondent" in node.metadata
assert "document_type" in node.metadata
assert "created" in node.metadata
assert "added" in node.metadata
assert "modified" in node.metadata
@pytest.mark.django_db
def test_build_document_node_excludes_document_id_from_llm_context(
real_document: Document,
) -> None:
"""document_id is an internal key and must not appear in LLM context text."""
from llama_index.core.schema import MetadataMode
nodes = indexing.build_document_node(real_document)
assert len(nodes) > 0
for node in nodes:
assert "document_id" in node.excluded_llm_metadata_keys
assert "document_id" not in node.get_content(metadata_mode=MetadataMode.LLM)
@pytest.mark.django_db
def test_build_document_node_uses_rag_chunk_settings(real_document: Document) -> None:
app_config, _ = ApplicationConfiguration.objects.get_or_create()
+8 -5
View File
@@ -243,12 +243,15 @@ def test_build_llm_index_text(mock_document):
result = build_llm_index_text(mock_document)
assert "Title: Test Title" in result
# Structured fields live in node.metadata for LLM context — not body text
assert "Title: Test Title" not in result
assert "Created: 2023-01-01" not in result
assert "Tags: Tag1, Tag2" not in result
assert "Document Type: Invoice" not in result
assert "Correspondent: Test Correspondent" not in result
# Fields without a metadata equivalent stay in body text
assert "Filename: test_file.pdf" in result
assert "Created: 2023-01-01" in result
assert "Tags: Tag1, Tag2" in result
assert "Document Type: Invoice" in result
assert "Correspondent: Test Correspondent" in result
assert "Notes: Note1,Note2" in result
assert "Content:\n\nThis is the document content." in result
assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result