mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-10 03:01:23 +00:00
Compare commits
3 Commits
dependabot
...
fix-versio
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d9628f7255 | ||
|
|
fcbe4b200c | ||
|
|
2b434916a0 |
@@ -169,7 +169,7 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
|
||||
def matches(matching_model: MatchingModel, document: Document):
|
||||
search_flags = 0
|
||||
|
||||
document_content = document.content
|
||||
document_content = document.get_effective_content() or ""
|
||||
|
||||
# Check that match is not empty
|
||||
if not matching_model.match.strip():
|
||||
|
||||
@@ -361,6 +361,42 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
res += f" {self.title}"
|
||||
return res
|
||||
|
||||
def get_effective_content(self) -> str | None:
|
||||
"""
|
||||
Returns the effective content for the document.
|
||||
|
||||
For root documents, this is the latest version's content when available.
|
||||
For version documents, this is always the document's own content.
|
||||
If the queryset already annotated ``effective_content``, that value is used.
|
||||
"""
|
||||
if hasattr(self, "effective_content"):
|
||||
return getattr(self, "effective_content")
|
||||
|
||||
if self.root_document_id is not None or self.pk is None:
|
||||
return self.content
|
||||
|
||||
prefetched_cache = getattr(self, "_prefetched_objects_cache", None)
|
||||
prefetched_versions = (
|
||||
prefetched_cache.get("versions")
|
||||
if isinstance(prefetched_cache, dict)
|
||||
else None
|
||||
)
|
||||
if prefetched_versions:
|
||||
latest_prefetched = max(prefetched_versions, key=lambda doc: doc.id)
|
||||
return latest_prefetched.content
|
||||
|
||||
latest_version_content = (
|
||||
Document.objects.filter(root_document=self)
|
||||
.order_by("-id")
|
||||
.values_list("content", flat=True)
|
||||
.first()
|
||||
)
|
||||
return (
|
||||
latest_version_content
|
||||
if latest_version_content is not None
|
||||
else self.content
|
||||
)
|
||||
|
||||
@property
|
||||
def suggestion_content(self):
|
||||
"""
|
||||
@@ -373,15 +409,21 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
This improves processing speed for large documents while keeping
|
||||
enough context for accurate suggestions.
|
||||
"""
|
||||
if not self.content or len(self.content) <= 1200000:
|
||||
return self.content
|
||||
effective_content = self.get_effective_content()
|
||||
if not effective_content or len(effective_content) <= 1200000:
|
||||
return effective_content
|
||||
else:
|
||||
# Use 80% from the start and 20% from the end
|
||||
# to preserve both opening and closing context.
|
||||
head_len = 800000
|
||||
tail_len = 200000
|
||||
|
||||
return " ".join((self.content[:head_len], self.content[-tail_len:]))
|
||||
return " ".join(
|
||||
(
|
||||
effective_content[:head_len],
|
||||
effective_content[-tail_len:],
|
||||
),
|
||||
)
|
||||
|
||||
@property
|
||||
def source_path(self) -> Path:
|
||||
|
||||
@@ -156,6 +156,46 @@ class TestDocument(TestCase):
|
||||
)
|
||||
self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
|
||||
|
||||
def test_suggestion_content_uses_latest_version_content_for_root_documents(
    self,
) -> None:
    """
    A root document's ``suggestion_content`` must equal the content of the
    version created for it, not the root's own content.
    """
    pdf_mime = "application/pdf"

    root = Document.objects.create(
        title="root",
        checksum="root",
        mime_type=pdf_mime,
        content="outdated root content",
    )
    version = Document.objects.create(
        title="v1",
        checksum="v1",
        mime_type=pdf_mime,
        root_document=root,
        content="latest version content",
    )

    actual = root.suggestion_content
    self.assertEqual(actual, version.content)
|
||||
|
||||
def test_content_length_is_per_document_row_for_versions(self) -> None:
    """
    After reloading from the database, ``content_length`` reflects each
    row's own content: 3 for the root ("abc"), 8 for the version ("abcdefgh").
    """
    common = {"mime_type": "application/pdf"}

    root = Document.objects.create(
        title="root",
        checksum="root",
        content="abc",
        **common,
    )
    version = Document.objects.create(
        title="v1",
        checksum="v1",
        root_document=root,
        content="abcdefgh",
        **common,
    )

    # Reload both rows before checking the lengths.
    for doc in (root, version):
        doc.refresh_from_db()

    self.assertEqual(root.content_length, 3)
    self.assertEqual(version.content_length, 8)
|
||||
|
||||
|
||||
def test_suggestion_content() -> None:
|
||||
"""
|
||||
|
||||
@@ -48,6 +48,52 @@ class _TestMatchingBase(TestCase):
|
||||
|
||||
|
||||
class TestMatching(_TestMatchingBase):
|
||||
def test_matches_uses_latest_version_content_for_root_documents(self) -> None:
    """
    A tag whose keyword appears only in the version's content must still
    match when ``matching.matches`` is given the root document.
    """
    pdf_mime = "application/pdf"

    root = Document.objects.create(
        title="root",
        checksum="root",
        mime_type=pdf_mime,
        content="root content without token",
    )
    # The version is the only document containing the keyword.
    Document.objects.create(
        title="v1",
        checksum="v1",
        mime_type=pdf_mime,
        root_document=root,
        content="latest version contains keyword",
    )
    keyword_tag = Tag.objects.create(
        name="tag",
        match="keyword",
        matching_algorithm=Tag.MATCH_ANY,
    )

    matched = matching.matches(keyword_tag, root)
    self.assertTrue(matched)
|
||||
|
||||
def test_matches_does_not_fall_back_to_root_content_when_version_exists(
    self,
) -> None:
    """
    When a version exists, a keyword found only in the root's own content
    must not produce a match for the root document.
    """
    pdf_mime = "application/pdf"

    root = Document.objects.create(
        title="root",
        checksum="root",
        mime_type=pdf_mime,
        content="root contains keyword",
    )
    # The version deliberately lacks the keyword.
    Document.objects.create(
        title="v1",
        checksum="v1",
        mime_type=pdf_mime,
        root_document=root,
        content="latest version without token",
    )
    keyword_tag = Tag.objects.create(
        name="tag",
        match="keyword",
        matching_algorithm=Tag.MATCH_ANY,
    )

    matched = matching.matches(keyword_tag, root)
    self.assertFalse(matched)
|
||||
|
||||
def test_match_none(self) -> None:
|
||||
self._test_matching(
|
||||
"",
|
||||
|
||||
Reference in New Issue
Block a user