mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-10 03:01:23 +00:00
Compare commits
3 Commits
dev
...
fix-versio
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d9628f7255 | ||
|
|
fcbe4b200c | ||
|
|
2b434916a0 |
@@ -169,7 +169,7 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
|
|||||||
def matches(matching_model: MatchingModel, document: Document):
|
def matches(matching_model: MatchingModel, document: Document):
|
||||||
search_flags = 0
|
search_flags = 0
|
||||||
|
|
||||||
document_content = document.content
|
document_content = document.get_effective_content() or ""
|
||||||
|
|
||||||
# Check that match is not empty
|
# Check that match is not empty
|
||||||
if not matching_model.match.strip():
|
if not matching_model.match.strip():
|
||||||
|
|||||||
@@ -361,6 +361,42 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
|||||||
res += f" {self.title}"
|
res += f" {self.title}"
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
def get_effective_content(self) -> str | None:
|
||||||
|
"""
|
||||||
|
Returns the effective content for the document.
|
||||||
|
|
||||||
|
For root documents, this is the latest version's content when available.
|
||||||
|
For version documents, this is always the document's own content.
|
||||||
|
If the queryset already annotated ``effective_content``, that value is used.
|
||||||
|
"""
|
||||||
|
if hasattr(self, "effective_content"):
|
||||||
|
return getattr(self, "effective_content")
|
||||||
|
|
||||||
|
if self.root_document_id is not None or self.pk is None:
|
||||||
|
return self.content
|
||||||
|
|
||||||
|
prefetched_cache = getattr(self, "_prefetched_objects_cache", None)
|
||||||
|
prefetched_versions = (
|
||||||
|
prefetched_cache.get("versions")
|
||||||
|
if isinstance(prefetched_cache, dict)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
if prefetched_versions:
|
||||||
|
latest_prefetched = max(prefetched_versions, key=lambda doc: doc.id)
|
||||||
|
return latest_prefetched.content
|
||||||
|
|
||||||
|
latest_version_content = (
|
||||||
|
Document.objects.filter(root_document=self)
|
||||||
|
.order_by("-id")
|
||||||
|
.values_list("content", flat=True)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
latest_version_content
|
||||||
|
if latest_version_content is not None
|
||||||
|
else self.content
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def suggestion_content(self):
|
def suggestion_content(self):
|
||||||
"""
|
"""
|
||||||
@@ -373,15 +409,21 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
|||||||
This improves processing speed for large documents while keeping
|
This improves processing speed for large documents while keeping
|
||||||
enough context for accurate suggestions.
|
enough context for accurate suggestions.
|
||||||
"""
|
"""
|
||||||
if not self.content or len(self.content) <= 1200000:
|
effective_content = self.get_effective_content()
|
||||||
return self.content
|
if not effective_content or len(effective_content) <= 1200000:
|
||||||
|
return effective_content
|
||||||
else:
|
else:
|
||||||
# Use 80% from the start and 20% from the end
|
# Use 80% from the start and 20% from the end
|
||||||
# to preserve both opening and closing context.
|
# to preserve both opening and closing context.
|
||||||
head_len = 800000
|
head_len = 800000
|
||||||
tail_len = 200000
|
tail_len = 200000
|
||||||
|
|
||||||
return " ".join((self.content[:head_len], self.content[-tail_len:]))
|
return " ".join(
|
||||||
|
(
|
||||||
|
effective_content[:head_len],
|
||||||
|
effective_content[-tail_len:],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def source_path(self) -> Path:
|
def source_path(self) -> Path:
|
||||||
|
|||||||
@@ -156,6 +156,46 @@ class TestDocument(TestCase):
|
|||||||
)
|
)
|
||||||
self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
|
self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
|
||||||
|
|
||||||
|
def test_suggestion_content_uses_latest_version_content_for_root_documents(
    self,
) -> None:
    """A root document's ``suggestion_content`` equals its version's content."""
    pdf_mime = "application/pdf"

    root_doc = Document.objects.create(
        content="outdated root content",
        mime_type=pdf_mime,
        checksum="root",
        title="root",
    )
    newest_version = Document.objects.create(
        content="latest version content",
        root_document=root_doc,
        mime_type=pdf_mime,
        checksum="v1",
        title="v1",
    )

    # The root's own (outdated) content must not be what is suggested on.
    suggested = root_doc.suggestion_content
    self.assertEqual(suggested, newest_version.content)
|
||||||
|
|
||||||
|
def test_content_length_is_per_document_row_for_versions(self) -> None:
    """Root and version rows each report the length of their own content."""
    pdf_mime = "application/pdf"

    root_doc = Document.objects.create(
        content="abc",
        mime_type=pdf_mime,
        checksum="root",
        title="root",
    )
    version_doc = Document.objects.create(
        content="abcdefgh",
        root_document=root_doc,
        mime_type=pdf_mime,
        checksum="v1",
        title="v1",
    )

    # Reload both rows so ``content_length`` reflects what the database holds.
    for doc in (root_doc, version_doc):
        doc.refresh_from_db()

    self.assertEqual(root_doc.content_length, 3)
    self.assertEqual(version_doc.content_length, 8)
|
||||||
|
|
||||||
|
|
||||||
def test_suggestion_content() -> None:
|
def test_suggestion_content() -> None:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -48,6 +48,52 @@ class _TestMatchingBase(TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class TestMatching(_TestMatchingBase):
|
class TestMatching(_TestMatchingBase):
|
||||||
|
def test_matches_uses_latest_version_content_for_root_documents(self) -> None:
    """A keyword found only in the version's content matches the root."""
    pdf_mime = "application/pdf"

    root_doc = Document.objects.create(
        content="root content without token",
        mime_type=pdf_mime,
        checksum="root",
        title="root",
    )
    # The version row is only needed in the database; no handle is kept.
    Document.objects.create(
        content="latest version contains keyword",
        root_document=root_doc,
        mime_type=pdf_mime,
        checksum="v1",
        title="v1",
    )
    keyword_tag = Tag.objects.create(
        matching_algorithm=Tag.MATCH_ANY,
        match="keyword",
        name="tag",
    )

    is_match = matching.matches(keyword_tag, root_doc)
    self.assertTrue(is_match)
|
||||||
|
|
||||||
|
def test_matches_does_not_fall_back_to_root_content_when_version_exists(
    self,
) -> None:
    """A keyword found only in the root's own content does not match."""
    pdf_mime = "application/pdf"

    root_doc = Document.objects.create(
        content="root contains keyword",
        mime_type=pdf_mime,
        checksum="root",
        title="root",
    )
    # The version row is only needed in the database; no handle is kept.
    Document.objects.create(
        content="latest version without token",
        root_document=root_doc,
        mime_type=pdf_mime,
        checksum="v1",
        title="v1",
    )
    keyword_tag = Tag.objects.create(
        matching_algorithm=Tag.MATCH_ANY,
        match="keyword",
        name="tag",
    )

    is_match = matching.matches(keyword_tag, root_doc)
    self.assertFalse(is_match)
|
||||||
|
|
||||||
def test_match_none(self) -> None:
|
def test_match_none(self) -> None:
|
||||||
self._test_matching(
|
self._test_matching(
|
||||||
"",
|
"",
|
||||||
|
|||||||
Reference in New Issue
Block a user