feat(ai): tie LlamaDocument id to the paperless document id

Set id_=str(document.id) on the LlamaDocument constructor in
build_document_node so that every chunk node's ref_doc_id equals the
paperless document pk, enabling the LanceDB adapter's delete(str(doc.id))
and doc_id column to work correctly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
stumpylog
2026-06-02 15:11:21 -07:00
parent b2e0dbef46
commit 9a40b4ac9d
2 changed files with 13 additions and 0 deletions
+1
View File
@@ -142,6 +142,7 @@ def build_document_node(
# the token count and exceed embedding models with small context windows
# (e.g. nomic-embed-text via Ollama defaults to num_ctx=2048).
doc = LlamaDocument(
id_=str(document.id),
text=text,
metadata=metadata,
excluded_embed_metadata_keys=list(metadata.keys()),
@@ -66,6 +66,18 @@ def test_build_document_node(real_document) -> None:
assert nodes[0].metadata["document_id"] == str(real_document.id)
@pytest.mark.django_db
def test_build_document_node_sets_ref_doc_id(real_document: Document) -> None:
    """Check that every node's ref_doc_id is the paperless document id.

    The LanceDB adapter's delete(str(doc.id)) relies on each node produced by
    build_document_node carrying the stringified document id as its
    ref_doc_id.
    """
    # ref_doc_id is compared as a string, so build the expected value once and
    # reuse it in the failure message; printing the raw pk there would hide an
    # int-vs-str mismatch ("Expected ref_doc_id=5, got 5").
    expected_ref_doc_id = str(real_document.id)

    nodes = indexing.build_document_node(real_document)

    # Guard against a vacuous pass: the loop below asserts nothing if no nodes
    # were produced.
    assert len(nodes) > 0, "Expected at least one node"
    for node in nodes:
        assert node.ref_doc_id == expected_ref_doc_id, (
            f"Expected ref_doc_id={expected_ref_doc_id!r}, got {node.ref_doc_id!r}"
        )
@pytest.mark.django_db
def test_build_document_node_excludes_metadata_from_embedding(real_document) -> None:
"""Metadata keys must not be prepended to the embedding text.