feat(ai): tie LlamaDocument id to the paperless document id

Set id_=str(document.id) on the LlamaDocument constructor in build_document_node so that every chunk node's ref_doc_id equals the paperless document pk, enabling the LanceDB adapter's delete(str(doc.id)) and doc_id column to work correctly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-08-02 00:52:20 +00:00 · 2026-06-03 09:41:06 -07:00
parent b2e0dbef46
commit 9a40b4ac9d
2 changed files with 13 additions and 0 deletions
@@ -142,6 +142,7 @@ def build_document_node(
    # the token count and exceed embedding models with small context windows
    # (e.g. nomic-embed-text via Ollama defaults to num_ctx=2048).
    doc = LlamaDocument(
+        id_=str(document.id),
        text=text,
        metadata=metadata,
        excluded_embed_metadata_keys=list(metadata.keys()),
@@ -66,6 +66,18 @@ def test_build_document_node(real_document) -> None:
    assert nodes[0].metadata["document_id"] == str(real_document.id)


+@pytest.mark.django_db
+def test_build_document_node_sets_ref_doc_id(real_document: Document) -> None:
+    """Every node produced by build_document_node must carry str(document.id), the
+    paperless pk, as its ref_doc_id so the LanceDB adapter's delete(str(doc.id)) works."""
+    nodes = indexing.build_document_node(real_document)
+    assert len(nodes) > 0, "Expected at least one node"
+    for node in nodes:
+        assert node.ref_doc_id == str(real_document.id), (
+            f"Expected ref_doc_id={str(real_document.id)!r}, got {node.ref_doc_id!r}"
+        )
+
+
@pytest.mark.django_db
 def test_build_document_node_excludes_metadata_from_embedding(real_document) -> None:
    """Metadata keys must not be prepended to the embedding text.