mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-06 13:49:44 +00:00
To reduce embedding size, don't store the metadata in the body. Body is content + a few other things, metadata keys hold the metadata
This commit is contained in:
@@ -138,15 +138,13 @@ def _normalize_llm_index_text(text: str) -> str:
|
||||
|
||||
|
||||
def build_llm_index_text(doc: Document) -> str:
|
||||
# TODO: Filename, Storage Path, and Archive Serial Number are short structured
|
||||
# values that could move to node.metadata (excluded from embeddings, visible to
|
||||
# LLM via metadata prepend) — same pattern as title/tags/correspondent. Notes
|
||||
# and Custom Fields should stay here: Notes can be long free text, Custom Fields
|
||||
# are dynamic in count and best kept in the embedding.
|
||||
lines = [
|
||||
f"Title: {doc.title}",
|
||||
f"Filename: {doc.filename}",
|
||||
f"Created: {doc.created}",
|
||||
f"Added: {doc.added}",
|
||||
f"Modified: {doc.modified}",
|
||||
f"Tags: {', '.join(tag.name for tag in doc.tags.all())}",
|
||||
f"Document Type: {doc.document_type.name if doc.document_type else ''}",
|
||||
f"Correspondent: {doc.correspondent.name if doc.correspondent else ''}",
|
||||
f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}",
|
||||
f"Archive Serial Number: {doc.archive_serial_number or ''}",
|
||||
f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}",
|
||||
|
||||
@@ -119,6 +119,7 @@ def build_document_node(
|
||||
text=text,
|
||||
metadata=metadata,
|
||||
excluded_embed_metadata_keys=list(metadata.keys()),
|
||||
excluded_llm_metadata_keys=["document_id"],
|
||||
)
|
||||
chunk_size = chunk_size or get_rag_chunk_size()
|
||||
parser = SimpleNodeParser(
|
||||
|
||||
@@ -69,6 +69,37 @@ def test_build_document_node_excludes_metadata_from_embedding(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_build_document_node_structured_fields_in_metadata(
|
||||
real_document: Document,
|
||||
) -> None:
|
||||
"""Structured fields must be in node.metadata so the LLM receives them via metadata prepend."""
|
||||
nodes = indexing.build_document_node(real_document)
|
||||
assert len(nodes) > 0
|
||||
for node in nodes:
|
||||
assert "title" in node.metadata
|
||||
assert "tags" in node.metadata
|
||||
assert "correspondent" in node.metadata
|
||||
assert "document_type" in node.metadata
|
||||
assert "created" in node.metadata
|
||||
assert "added" in node.metadata
|
||||
assert "modified" in node.metadata
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_build_document_node_excludes_document_id_from_llm_context(
|
||||
real_document: Document,
|
||||
) -> None:
|
||||
"""document_id is an internal key and must not appear in LLM context text."""
|
||||
from llama_index.core.schema import MetadataMode
|
||||
|
||||
nodes = indexing.build_document_node(real_document)
|
||||
assert len(nodes) > 0
|
||||
for node in nodes:
|
||||
assert "document_id" in node.excluded_llm_metadata_keys
|
||||
assert "document_id" not in node.get_content(metadata_mode=MetadataMode.LLM)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_build_document_node_uses_rag_chunk_settings(real_document: Document) -> None:
|
||||
app_config, _ = ApplicationConfiguration.objects.get_or_create()
|
||||
|
||||
@@ -243,12 +243,15 @@ def test_build_llm_index_text(mock_document):
|
||||
|
||||
result = build_llm_index_text(mock_document)
|
||||
|
||||
assert "Title: Test Title" in result
|
||||
# Structured fields live in node.metadata for LLM context — not body text
|
||||
assert "Title: Test Title" not in result
|
||||
assert "Created: 2023-01-01" not in result
|
||||
assert "Tags: Tag1, Tag2" not in result
|
||||
assert "Document Type: Invoice" not in result
|
||||
assert "Correspondent: Test Correspondent" not in result
|
||||
|
||||
# Fields without a metadata equivalent stay in body text
|
||||
assert "Filename: test_file.pdf" in result
|
||||
assert "Created: 2023-01-01" in result
|
||||
assert "Tags: Tag1, Tag2" in result
|
||||
assert "Document Type: Invoice" in result
|
||||
assert "Correspondent: Test Correspondent" in result
|
||||
assert "Notes: Note1,Note2" in result
|
||||
assert "Content:\n\nThis is the document content." in result
|
||||
assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result
|
||||
|
||||
Reference in New Issue
Block a user