From df03207eef9fa13c8024aeb40594746df738cab9 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:28:07 -0800 Subject: [PATCH] Fix: correct doc version filename handling (#12223) --- docs/advanced_usage.md | 6 + docs/usage.md | 1 + src/documents/consumer.py | 32 ++-- src/documents/file_handling.py | 19 ++- .../0016_document_version_index_and_more.py | 37 +++++ src/documents/models.py | 18 +++ src/documents/signals/handlers.py | 10 ++ src/documents/tests/test_consumer.py | 74 ++++++++- src/documents/tests/test_file_handling.py | 140 ++++++++++++++++++ 9 files changed, 309 insertions(+), 28 deletions(-) create mode 100644 src/documents/migrations/0016_document_version_index_and_more.py diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index a293eb91d..13194fe87 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -262,6 +262,10 @@ your files differently, you can do that by adjusting the or using [storage paths (see below)](#storage-paths). Paperless adds the correct file extension e.g. `.pdf`, `.jpg` automatically. +When a document has file versions, each version uses the same naming rules and +storage path resolution as any other document file, with an added version suffix +such as `_v1`, `_v2`, etc. + This variable allows you to configure the filename (folders are allowed) using placeholders. For example, configuring this to @@ -353,6 +357,8 @@ If paperless detects that two documents share the same filename, paperless will automatically append `_01`, `_02`, etc to the filename. This happens if all the placeholders in a filename evaluate to the same value. +For versioned files, this counter is appended after the version suffix +(for example `statement_v2_01.pdf`). If there are any errors in the placeholders included in `PAPERLESS_FILENAME_FORMAT`, paperless will fall back to using the default naming scheme instead. diff --git a/docs/usage.md b/docs/usage.md index bcaf7df7a..b56401351 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -95,6 +95,7 @@ Think of versions as **file history** for a document. - Versions track the underlying file and extracted text content (OCR/text). - Metadata such as tags, correspondent, document type, storage path and custom fields stay on the "root" document. +- Version files follow normal filename formatting (including storage paths) and add a `_vN` suffix (for example `_v1`, `_v2`). - By default, search and document content use the latest version. - In document detail, selecting a version switches the preview, file metadata and content (and download etc buttons) to that version. - Deleting a non-root version keeps metadata and falls back to the latest remaining version. diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b00c4aee3..3f1ffc9d1 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -11,6 +11,7 @@ import magic from django.conf import settings from django.contrib.auth.models import User from django.db import transaction +from django.db.models import Max from django.db.models import Q from django.utils import timezone from filelock import FileLock @@ -124,22 +125,6 @@ class ConsumerPluginMixin: self.filename = self.metadata.filename or self.input_doc.original_file.name - if input_doc.root_document_id: - self.log.debug( - f"Document root document id: {input_doc.root_document_id}", - ) - root_document = Document.objects.get(pk=input_doc.root_document_id) - version_index = Document.objects.filter(root_document=root_document).count() - filename_path = Path(self.filename) - if filename_path.suffix: - self.filename = str( - filename_path.with_name( - f"{filename_path.stem}_v{version_index}{filename_path.suffix}", - ), - ) - else: - self.filename = f"{self.filename}_v{version_index}" - def _send_progress( self, current_progress: int, @@ -194,9 +179,19 @@ class ConsumerPlugin( mime_type: str, ) -> Document: self.log.debug("Saving record for updated version to database") - version_doc = Document.objects.get(pk=root_doc.pk) + root_doc_frozen = Document.objects.select_for_update().get(pk=root_doc.pk) + next_version_index = ( + Document.global_objects.filter( + root_document_id=root_doc_frozen.pk, + ).aggregate( + max_index=Max("version_index"), + )["max_index"] + or 0 + ) + version_doc = Document.objects.get(pk=root_doc_frozen.pk) setattr(version_doc, "pk", None) - version_doc.root_document = root_doc + version_doc.root_document = root_doc_frozen + version_doc.version_index = next_version_index + 1 file_for_checksum = ( self.unmodified_original if self.unmodified_original is not None @@ -209,7 +204,6 @@ class ConsumerPlugin( version_doc.page_count = page_count version_doc.mime_type = mime_type version_doc.original_filename = self.filename - version_doc.storage_path = root_doc.storage_path # Clear unique file path fields so they can be generated uniquely later version_doc.filename = None version_doc.archive_filename = None diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index 17ae84d9b..7f59d4557 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -129,12 +129,19 @@ def generate_filename( archive_filename=False, use_format=True, ) -> Path: + # version docs use the root document for formatting, just with a suffix + context_doc = doc if doc.root_document_id is None else doc.root_document + version_suffix = ( + f"_v{doc.version_index}" + if doc.root_document_id is not None and doc.version_index is not None + else "" + ) base_path: Path | None = None # Determine the source of the format string if use_format: - if doc.storage_path is not None: - filename_format = doc.storage_path.path + if context_doc.storage_path is not None: + filename_format = context_doc.storage_path.path elif settings.FILENAME_FORMAT is not None: # Maybe convert old to new style filename_format = convert_format_str_to_template_format( @@ -147,7 +154,7 @@ def generate_filename( # If we have one, render it if filename_format is not None: - rendered_path: str | None = format_filename(doc, filename_format) + rendered_path: str | None = format_filename(context_doc, filename_format) if rendered_path: base_path = Path(rendered_path) @@ -161,7 +168,7 @@ def generate_filename( base_filename = base_path.name # Build the final filename with counter and filetype - final_filename = f"{base_filename}{counter_str}{filetype_str}" + final_filename = f"{base_filename}{version_suffix}{counter_str}{filetype_str}" # If we have a directory component, include it if str(directory) != ".": @@ -170,7 +177,9 @@ def generate_filename( full_path = Path(final_filename) else: # No template, use document ID - final_filename = f"{doc.pk:07}{counter_str}{filetype_str}" + final_filename = ( + f"{context_doc.pk:07}{version_suffix}{counter_str}{filetype_str}" + ) full_path = Path(final_filename) return full_path diff --git a/src/documents/migrations/0016_document_version_index_and_more.py b/src/documents/migrations/0016_document_version_index_and_more.py new file mode 100644 index 000000000..9c85b05d3 --- /dev/null +++ b/src/documents/migrations/0016_document_version_index_and_more.py @@ -0,0 +1,37 @@ +# Generated by Django 5.2.11 on 2026-03-02 17:48 + +from django.conf import settings +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + dependencies = [ + ("documents", "0015_savedview_visibility_to_ui_settings"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AddField( + model_name="document", + name="version_index", + field=models.PositiveIntegerField( + blank=True, + db_index=True, + help_text="Index of this version within the root document.", + null=True, + verbose_name="version index", + ), + ), + migrations.AddConstraint( + model_name="document", + constraint=models.UniqueConstraint( + condition=models.Q( + ("root_document__isnull", False), + ("version_index__isnull", False), + ), + fields=("root_document", "version_index"), + name="documents_document_root_version_index_uniq", + ), + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index f0191b47f..b1b914069 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -319,6 +319,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager- verbose_name=_("root document for this version"), ) + version_index = models.PositiveIntegerField( + _("version index"), + blank=True, + null=True, + db_index=True, + help_text=_("Index of this version within the root document."), + ) + version_label = models.CharField( _("version label"), max_length=64, @@ -331,6 +339,16 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager- ordering = ("-created",) verbose_name = _("document") verbose_name_plural = _("documents") + constraints = [ + models.UniqueConstraint( + fields=["root_document", "version_index"], + condition=models.Q( + root_document__isnull=False, + version_index__isnull=False, + ), + name="documents_document_root_version_index_uniq", + ), + ] def __str__(self) -> str: created = self.created.isoformat() diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index e2f1e7bc2..81dd3ddd9 100644 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -620,6 +620,16 @@ def update_filename_and_move_files( root=settings.ARCHIVE_DIR, ) + # Keep version files in sync with root + if instance.root_document_id is None: + for version_doc in Document.objects.filter(root_document_id=instance.pk).only( + "pk", + ): + update_filename_and_move_files( + Document, + version_doc, + ) + @shared_task def process_cf_select_update(custom_field: CustomField) -> None: diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index d0ce5ff8c..121717f08 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -726,6 +726,13 @@ class TestConsumer( self.assertIsNotNone(root_doc) assert root_doc is not None + root_storage_path = StoragePath.objects.create( + name="version-root-path", + path="root/{{title}}", + ) + root_doc.storage_path = root_storage_path + root_doc.save() + actor = User.objects.create_user( username="actor", email="actor@example.com", @@ -762,7 +769,7 @@ class TestConsumer( ) consumer.setup() try: - self.assertTrue(consumer.filename.endswith("_v0.pdf")) + self.assertEqual(consumer.filename, version_file.name) consumer.run() finally: consumer.cleanup() @@ -772,8 +779,9 @@ class TestConsumer( version = versions.first() assert version is not None assert version.original_filename is not None + self.assertEqual(version.version_index, 1) self.assertEqual(version.version_label, "v2") - self.assertTrue(version.original_filename.endswith("_v0.pdf")) + self.assertEqual(version.original_filename, version_file.name) self.assertTrue(bool(version.content)) @override_settings(AUDIT_LOG_ENABLED=True) @@ -822,7 +830,7 @@ class TestConsumer( ) consumer.setup() try: - self.assertEqual(consumer.filename, "valid_pdf_version-upload_v0") + self.assertEqual(consumer.filename, "valid_pdf_version-upload") consumer.run() finally: consumer.cleanup() @@ -832,9 +840,67 @@ class TestConsumer( ) self.assertIsNotNone(version) assert version is not None - self.assertEqual(version.original_filename, "valid_pdf_version-upload_v0") + self.assertEqual(version.version_index, 1) + self.assertEqual(version.original_filename, "valid_pdf_version-upload") self.assertTrue(bool(version.content)) + @override_settings(AUDIT_LOG_ENABLED=True) + @mock.patch("documents.consumer.load_classifier") + def test_consume_version_index_monotonic_after_version_deletion(self, m) -> None: + m.return_value = MagicMock() + + with self.get_consumer(self.get_test_file()) as consumer: + consumer.run() + + root_doc = Document.objects.first() + self.assertIsNotNone(root_doc) + assert root_doc is not None + + def consume_version(version_file: Path) -> Document: + status = DummyProgressManager(version_file.name, None) + overrides = DocumentMetadataOverrides() + doc = ConsumableDocument( + DocumentSource.ApiUpload, + original_file=version_file, + root_document_id=root_doc.pk, + ) + preflight = ConsumerPreflightPlugin( + doc, + overrides, + status, # type: ignore[arg-type] + self.dirs.scratch_dir, + "task-id", + ) + preflight.setup() + preflight.run() + + consumer = ConsumerPlugin( + doc, + overrides, + status, # type: ignore[arg-type] + self.dirs.scratch_dir, + "task-id", + ) + consumer.setup() + try: + consumer.run() + finally: + consumer.cleanup() + + version = ( + Document.objects.filter(root_document=root_doc).order_by("-id").first() + ) + assert version is not None + return version + + v1 = consume_version(self.get_test_file2()) + self.assertEqual(v1.version_index, 1) + v1.delete() + + # The next version should have version_index 2, even though version_index 1 was deleted + v2 = consume_version(self.get_test_file()) + self.assertEqual(v2.version_index, 2) + @mock.patch("documents.consumer.load_classifier") def testClassifyDocument(self, m) -> None: correspondent = Correspondent.objects.create( diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index 8417c4ce7..0cd34e939 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -77,6 +77,58 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase): settings.ORIGINALS_DIR / "test" / "test.pdf", ) + @override_settings(FILENAME_FORMAT=None) + def test_root_storage_path_change_updates_version_files(self) -> None: + old_storage_path = StoragePath.objects.create( + name="old-path", + path="old/{{title}}", + ) + new_storage_path = StoragePath.objects.create( + name="new-path", + path="new/{{title}}", + ) + + root_doc = Document.objects.create( + title="rootdoc", + mime_type="application/pdf", + checksum="root-checksum", + storage_path=old_storage_path, + ) + version_doc = Document.objects.create( + title="version-title", + mime_type="application/pdf", + checksum="version-checksum", + root_document=root_doc, + version_index=1, + ) + + Document.objects.filter(pk=root_doc.pk).update( + filename=generate_filename(root_doc), + ) + Document.objects.filter(pk=version_doc.pk).update( + filename=generate_filename(version_doc), + ) + root_doc.refresh_from_db() + version_doc.refresh_from_db() + + create_source_path_directory(root_doc.source_path) + Path(root_doc.source_path).touch() + create_source_path_directory(version_doc.source_path) + Path(version_doc.source_path).touch() + + root_doc.storage_path = new_storage_path + root_doc.save() + + root_doc.refresh_from_db() + version_doc.refresh_from_db() + + self.assertEqual(root_doc.filename, "new/rootdoc.pdf") + self.assertEqual(version_doc.filename, "new/rootdoc_v1.pdf") + self.assertIsFile(root_doc.source_path) + self.assertIsFile(version_doc.source_path) + self.assertIsNotFile(settings.ORIGINALS_DIR / "old" / "rootdoc.pdf") + self.assertIsNotFile(settings.ORIGINALS_DIR / "old" / "rootdoc_v1.pdf") + @override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming_missing_permissions(self) -> None: document = Document() @@ -1226,6 +1278,94 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase): Path("logs.pdf"), ) + @override_settings(FILENAME_FORMAT="{title}") + def test_version_index_suffix_for_template_filename(self) -> None: + root_doc = Document.objects.create( + title="the_doc", + mime_type="application/pdf", + checksum="root-checksum", + ) + version_doc = Document.objects.create( + title="the_doc", + mime_type="application/pdf", + checksum="version-checksum", + root_document=root_doc, + version_index=1, + ) + + self.assertEqual(generate_filename(version_doc), Path("the_doc_v1.pdf")) + self.assertEqual( + generate_filename(version_doc, counter=1), + Path("the_doc_v1_01.pdf"), + ) + + @override_settings(FILENAME_FORMAT=None) + def test_version_index_suffix_for_default_filename(self) -> None: + root_doc = Document.objects.create( + title="root", + mime_type="text/plain", + checksum="root-checksum", + ) + version_doc = Document.objects.create( + title="root", + mime_type="text/plain", + checksum="version-checksum", + root_document=root_doc, + version_index=2, + ) + + self.assertEqual( + generate_filename(version_doc), + Path(f"{root_doc.pk:07d}_v2.txt"), + ) + self.assertEqual( + generate_filename(version_doc, archive_filename=True), + Path(f"{root_doc.pk:07d}_v2.pdf"), + ) + + @override_settings(FILENAME_FORMAT="{original_name}") + def test_version_index_suffix_with_original_name_placeholder(self) -> None: + root_doc = Document.objects.create( + title="root", + mime_type="application/pdf", + checksum="root-checksum", + original_filename="root-upload.pdf", + ) + version_doc = Document.objects.create( + title="root", + mime_type="application/pdf", + checksum="version-checksum", + root_document=root_doc, + version_index=1, + original_filename="version-upload.pdf", + ) + + self.assertEqual(generate_filename(version_doc), Path("root-upload_v1.pdf")) + + def test_version_index_suffix_with_storage_path(self) -> None: + storage_path = StoragePath.objects.create( + name="vtest", + path="folder/{{title}}", + ) + root_doc = Document.objects.create( + title="storage_doc", + mime_type="application/pdf", + checksum="root-checksum", + storage_path=storage_path, + ) + version_doc = Document.objects.create( + title="version_title_should_not_be_used", + mime_type="application/pdf", + checksum="version-checksum", + root_document=root_doc, + version_index=3, + ) + + self.assertEqual( + generate_filename(version_doc), + Path("folder/storage_doc_v3.pdf"), + ) + @override_settings( FILENAME_FORMAT="XX{correspondent}/{title}", FILENAME_FORMAT_REMOVE_NONE=True,