mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-06 21:59:46 +00:00
feat(ai): tie LlamaDocument id to the paperless document id
Set id_=str(document.id) on the LlamaDocument constructor in build_document_node so that every chunk node's ref_doc_id equals the paperless document pk. This enables the LanceDB adapter's delete(str(doc.id)) call and its doc_id column to work correctly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -142,6 +142,7 @@ def build_document_node(
|
||||
# the token count and exceed embedding models with small context windows
|
||||
# (e.g. nomic-embed-text via Ollama defaults to num_ctx=2048).
|
||||
doc = LlamaDocument(
|
||||
id_=str(document.id),
|
||||
text=text,
|
||||
metadata=metadata,
|
||||
excluded_embed_metadata_keys=list(metadata.keys()),
|
||||
|
||||
@@ -66,6 +66,18 @@ def test_build_document_node(real_document) -> None:
|
||||
assert nodes[0].metadata["document_id"] == str(real_document.id)
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_build_document_node_sets_ref_doc_id(real_document: Document) -> None:
    """Check that every node's ref_doc_id is the paperless document id.

    Every node produced by build_document_node must carry the stringified
    document id as its ref_doc_id so that the LanceDB adapter's
    delete(str(doc.id)) works correctly.
    """
    # Build the expected value once, in the same string form that is compared,
    # so the failure message reports exactly what the assertion checked.
    expected_ref_doc_id = str(real_document.id)

    nodes = indexing.build_document_node(real_document)

    # Guard against a vacuous pass: the loop below asserts nothing on an
    # empty result.
    assert len(nodes) > 0, "Expected at least one node"
    for node in nodes:
        assert node.ref_doc_id == expected_ref_doc_id, (
            f"Expected ref_doc_id={expected_ref_doc_id!r}, got {node.ref_doc_id!r}"
        )
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_build_document_node_excludes_metadata_from_embedding(real_document) -> None:
|
||||
"""Metadata keys must not be prepended to the embedding text.
|
||||
|
||||
Reference in New Issue
Block a user