From 2f5d199fefd573a8c8cd49ff9a8d84c9e609584d Mon Sep 17 00:00:00 2001
From: stumpylog <797416+stumpylog@users.noreply.github.com>
Date: Tue, 2 Jun 2026 15:11:21 -0700
Subject: [PATCH] feat(ai): tie LlamaDocument id to the paperless document id

Set id_=str(document.id) on the LlamaDocument constructor in
build_document_node so that every chunk node's ref_doc_id equals the
paperless document pk, enabling the LanceDB adapter's
delete(str(doc.id)) and doc_id column to work correctly.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 src/paperless_ai/indexing.py               |  1 +
 src/paperless_ai/tests/test_ai_indexing.py | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index 7ec1fdba3..cf68ebc53 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -142,6 +142,7 @@ def build_document_node(
     # the token count and exceed embedding models with small context windows
     # (e.g. nomic-embed-text via Ollama defaults to num_ctx=2048).
     doc = LlamaDocument(
+        id_=str(document.id),
         text=text,
         metadata=metadata,
         excluded_embed_metadata_keys=list(metadata.keys()),
diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py
index 339d75ead..c1ddbb7d2 100644
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -66,6 +66,18 @@ def test_build_document_node(real_document) -> None:
     assert nodes[0].metadata["document_id"] == str(real_document.id)
 
 
+@pytest.mark.django_db
+def test_build_document_node_sets_ref_doc_id(real_document: Document) -> None:
+    """Every node produced by build_document_node must carry the paperless document id
+    as its ref_doc_id so that the LanceDB adapter's delete(str(doc.id)) works correctly."""
+    nodes = indexing.build_document_node(real_document)
+    assert len(nodes) > 0, "Expected at least one node"
+    for node in nodes:
+        assert node.ref_doc_id == str(real_document.id), (
+            f"Expected ref_doc_id={real_document.id!r}, got {node.ref_doc_id!r}"
+        )
+
+
 @pytest.mark.django_db
 def test_build_document_node_excludes_metadata_from_embedding(real_document) -> None:
     """Metadata keys must not be prepended to the embedding text.