From 3efc9a5733b956b342dd2c7b9a4275dcb2d2b3ab Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:45:56 -0700 Subject: [PATCH] Fix: use effective content for matching and suggestion content (#12293) --- src/documents/matching.py | 2 +- src/documents/models.py | 48 ++++++++++++++++++++-- src/documents/tests/test_document_model.py | 40 ++++++++++++++++++ src/documents/tests/test_matchables.py | 46 +++++++++++++++++++++ 4 files changed, 132 insertions(+), 4 deletions(-) diff --git a/src/documents/matching.py b/src/documents/matching.py index fb458b17c..e023adae7 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -169,7 +169,7 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user def matches(matching_model: MatchingModel, document: Document): search_flags = 0 - document_content = document.content + document_content = document.get_effective_content() or "" # Check that match is not empty if not matching_model.match.strip(): diff --git a/src/documents/models.py b/src/documents/models.py index b1b914069..6147ac001 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -361,6 +361,42 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager- res += f" {self.title}" return res + def get_effective_content(self) -> str | None: + """ + Returns the effective content for the document. + + For root documents, this is the latest version's content when available. + For version documents, this is always the document's own content. + If the queryset already annotated ``effective_content``, that value is used. + """ + if hasattr(self, "effective_content"): + return getattr(self, "effective_content") + + if self.root_document_id is not None or self.pk is None: + return self.content + + prefetched_cache = getattr(self, "_prefetched_objects_cache", None) + prefetched_versions = ( + prefetched_cache.get("versions") + if isinstance(prefetched_cache, dict) + else None + ) + if prefetched_versions: + latest_prefetched = max(prefetched_versions, key=lambda doc: doc.id) + return latest_prefetched.content + + latest_version_content = ( + Document.objects.filter(root_document=self) + .order_by("-id") + .values_list("content", flat=True) + .first() + ) + return ( + latest_version_content + if latest_version_content is not None + else self.content + ) + @property def suggestion_content(self): """ @@ -373,15 +409,21 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager- This improves processing speed for large documents while keeping enough context for accurate suggestions. """ - if not self.content or len(self.content) <= 1200000: - return self.content + effective_content = self.get_effective_content() + if not effective_content or len(effective_content) <= 1200000: + return effective_content else: # Use 80% from the start and 20% from the end # to preserve both opening and closing context. head_len = 800000 tail_len = 200000 - return " ".join((self.content[:head_len], self.content[-tail_len:])) + return " ".join( + ( + effective_content[:head_len], + effective_content[-tail_len:], + ), + ) @property def source_path(self) -> Path: diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py index 57486a757..8a58f4b13 100644 --- a/src/documents/tests/test_document_model.py +++ b/src/documents/tests/test_document_model.py @@ -156,6 +156,46 @@ class TestDocument(TestCase): ) self.assertEqual(doc.get_public_filename(), "2020-12-25 test") + def test_suggestion_content_uses_latest_version_content_for_root_documents( + self, + ) -> None: + root = Document.objects.create( + title="root", + checksum="root", + mime_type="application/pdf", + content="outdated root content", + ) + version = Document.objects.create( + title="v1", + checksum="v1", + mime_type="application/pdf", + root_document=root, + content="latest version content", + ) + + self.assertEqual(root.suggestion_content, version.content) + + def test_content_length_is_per_document_row_for_versions(self) -> None: + root = Document.objects.create( + title="root", + checksum="root", + mime_type="application/pdf", + content="abc", + ) + version = Document.objects.create( + title="v1", + checksum="v1", + mime_type="application/pdf", + root_document=root, + content="abcdefgh", + ) + + root.refresh_from_db() + version.refresh_from_db() + + self.assertEqual(root.content_length, 3) + self.assertEqual(version.content_length, 8) + def test_suggestion_content() -> None: """ diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 04ff3f6d3..e038bf786 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -48,6 +48,52 @@ class _TestMatchingBase(TestCase): class TestMatching(_TestMatchingBase): + def test_matches_uses_latest_version_content_for_root_documents(self) -> None: + root = Document.objects.create( + title="root", + checksum="root", + mime_type="application/pdf", + content="root content without token", + ) + Document.objects.create( + title="v1", + checksum="v1", + mime_type="application/pdf", + root_document=root, + content="latest version contains keyword", + ) + tag = Tag.objects.create( + name="tag", + match="keyword", + matching_algorithm=Tag.MATCH_ANY, + ) + + self.assertTrue(matching.matches(tag, root)) + + def test_matches_does_not_fall_back_to_root_content_when_version_exists( + self, + ) -> None: + root = Document.objects.create( + title="root", + checksum="root", + mime_type="application/pdf", + content="root contains keyword", + ) + Document.objects.create( + title="v1", + checksum="v1", + mime_type="application/pdf", + root_document=root, + content="latest version without token", + ) + tag = Tag.objects.create( + name="tag", + match="keyword", + matching_algorithm=Tag.MATCH_ANY, + ) + + self.assertFalse(matching.matches(tag, root)) + def test_match_none(self) -> None: self._test_matching( "",