diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index 7ec1fdba3..cf68ebc53 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -142,6 +142,7 @@ def build_document_node(
     # the token count and exceed embedding models with small context windows
     # (e.g. nomic-embed-text via Ollama defaults to num_ctx=2048).
     doc = LlamaDocument(
+        id_=str(document.id),
         text=text,
         metadata=metadata,
         excluded_embed_metadata_keys=list(metadata.keys()),
diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py
index 339d75ead..c1ddbb7d2 100644
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -66,6 +66,18 @@ def test_build_document_node(real_document) -> None:
     assert nodes[0].metadata["document_id"] == str(real_document.id)
 
 
+@pytest.mark.django_db
+def test_build_document_node_sets_ref_doc_id(real_document: Document) -> None:
+    """Every node produced by build_document_node must carry the paperless document id
+    as its ref_doc_id so that the LanceDB adapter's delete(str(doc.id)) works correctly."""
+    nodes = indexing.build_document_node(real_document)
+    assert len(nodes) > 0, "Expected at least one node"
+    for node in nodes:
+        assert node.ref_doc_id == str(real_document.id), (
+            f"Expected ref_doc_id={real_document.id!r}, got {node.ref_doc_id!r}"
+        )
+
+
 @pytest.mark.django_db
 def test_build_document_node_excludes_metadata_from_embedding(real_document) -> None:
     """Metadata keys must not be prepended to the embedding text.