Adds a docstring in a form that IDEs render better

Handles the rename of the migration
Fixes logging so that the log output is actually visible
2026-03-25 18:32:45 +00:00 · 2026-03-25 11:16:10 -07:00 · 2026-03-25 11:16:10 -07:00 · 2026-03-25 11:16:10 -07:00 · 2026-03-25 11:16:10 -07:00 · 2026-03-25 11:16:05 -07:00
12 changed files with 441 additions and 276 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,5 +1,4 @@
 import datetime
-import hashlib
 import os
 import tempfile
 from enum import StrEnum
@@ -46,6 +45,7 @@ from documents.signals import document_consumption_started
 from documents.signals import document_updated
 from documents.signals.handlers import run_workflows
 from documents.templating.workflows import parse_w_workflow_placeholders
+from documents.utils import compute_checksum
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
@@ -196,9 +196,7 @@ class ConsumerPlugin(
        version_doc = Document(
            root_document=root_doc_frozen,
            version_index=next_version_index + 1,
-            checksum=hashlib.md5(
-                file_for_checksum.read_bytes(),
-            ).hexdigest(),
+            checksum=compute_checksum(file_for_checksum),
            content=text or "",
            page_count=page_count,
            mime_type=mime_type,
@@ -338,15 +336,18 @@ class ConsumerPlugin(
        Return the document object if it was successfully created.
        """

-        # Preflight has already run including progress update to 0%
-        self.log.info(f"Consuming {self.filename}")
+        tempdir = None

-        # For the actual work, copy the file into a tempdir
-        with tempfile.TemporaryDirectory(
-            prefix="paperless-ngx",
-            dir=settings.SCRATCH_DIR,
-        ) as tmpdir:
-            self.working_copy = Path(tmpdir) / Path(self.filename)
+        try:
+            # Preflight has already run including progress update to 0%
+            self.log.info(f"Consuming {self.filename}")
+
+            # For the actual work, copy the file into a tempdir
+            tempdir = tempfile.TemporaryDirectory(
+                prefix="paperless-ngx",
+                dir=settings.SCRATCH_DIR,
+            )
+            self.working_copy = Path(tempdir.name) / Path(self.filename)
            copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
            self.unmodified_original = None

@@ -378,7 +379,7 @@ class ConsumerPlugin(
                    self.log.debug(f"Detected mime type after qpdf: {mime_type}")
                    # Save the original file for later
                    self.unmodified_original = (
-                        Path(tmpdir) / Path("uo") / Path(self.filename)
+                        Path(tempdir.name) / Path("uo") / Path(self.filename)
                    )
                    self.unmodified_original.parent.mkdir(exist_ok=True)
                    copy_file_with_basic_stats(
@@ -397,6 +398,7 @@ class ConsumerPlugin(
                )
            )
            if not parser_class:
+                tempdir.cleanup()
                self._fail(
                    ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
                    f"Unsupported mime type {mime_type}",
@@ -411,275 +413,277 @@ class ConsumerPlugin(
            )

            self.run_pre_consume_script()
+        except:
+            if tempdir:
+                tempdir.cleanup()
+            raise

-            # This doesn't parse the document yet, but gives us a parser.
-            with parser_class() as document_parser:
-                document_parser.configure(
-                    ParserContext(mailrule_id=self.input_doc.mailrule_id),
-                )
+        # This doesn't parse the document yet, but gives us a parser.
+        with parser_class() as document_parser:
+            document_parser.configure(
+                ParserContext(mailrule_id=self.input_doc.mailrule_id),
+            )

-                self.log.debug(
-                    f"Parser: {document_parser.name} v{document_parser.version}",
-                )
+            self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")

-                # Parse the document. This may take some time.
+            # Parse the document. This may take some time.

-                text = None
-                date = None
-                thumbnail = None
-                archive_path = None
-                page_count = None
-
-                try:
-                    self._send_progress(
-                        20,
-                        100,
-                        ProgressStatusOptions.WORKING,
-                        ConsumerStatusShortMessage.PARSING_DOCUMENT,
-                    )
-                    self.log.debug(f"Parsing {self.filename}...")
-
-                    document_parser.parse(self.working_copy, mime_type)
-
-                    self.log.debug(f"Generating thumbnail for {self.filename}...")
-                    self._send_progress(
-                        70,
-                        100,
-                        ProgressStatusOptions.WORKING,
-                        ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
-                    )
-                    thumbnail = document_parser.get_thumbnail(
-                        self.working_copy,
-                        mime_type,
-                    )
-
-                    text = document_parser.get_text()
-                    date = document_parser.get_date()
-                    if date is None:
-                        self._send_progress(
-                            90,
-                            100,
-                            ProgressStatusOptions.WORKING,
-                            ConsumerStatusShortMessage.PARSE_DATE,
-                        )
-                        with get_date_parser() as date_parser:
-                            date = next(date_parser.parse(self.filename, text), None)
-                    archive_path = document_parser.get_archive_path()
-                    page_count = document_parser.get_page_count(
-                        self.working_copy,
-                        mime_type,
-                    )
-
-                except ParseError as e:
-                    self._fail(
-                        str(e),
-                        f"Error occurred while consuming document {self.filename}: {e}",
-                        exc_info=True,
-                        exception=e,
-                    )
-                except Exception as e:
-                    self._fail(
-                        str(e),
-                        f"Unexpected error while consuming document {self.filename}: {e}",
-                        exc_info=True,
-                        exception=e,
-                    )
-
-                # Prepare the document classifier.
-
-                # TODO: I don't really like to do this here, but this way we avoid
-                #   reloading the classifier multiple times, since there are multiple
-                #   post-consume hooks that all require the classifier.
-
-                classifier = load_classifier()
+            text = None
+            date = None
+            thumbnail = None
+            archive_path = None
+            page_count = None

+            try:
                self._send_progress(
-                    95,
+                    20,
                    100,
                    ProgressStatusOptions.WORKING,
-                    ConsumerStatusShortMessage.SAVE_DOCUMENT,
+                    ConsumerStatusShortMessage.PARSING_DOCUMENT,
                )
-                # now that everything is done, we can start to store the document
-                # in the system. This will be a transaction and reasonably fast.
-                try:
-                    with transaction.atomic():
-                        # store the document.
-                        if self.input_doc.root_document_id:
-                            # If this is a new version of an existing document, we need
-                            # to make sure we're not creating a new document, but updating
-                            # the existing one.
-                            root_doc = Document.objects.get(
-                                pk=self.input_doc.root_document_id,
-                            )
-                            original_document = self._create_version_from_root(
-                                root_doc,
-                                text=text,
-                                page_count=page_count,
-                                mime_type=mime_type,
-                            )
-                            actor = None
+                self.log.debug(f"Parsing {self.filename}...")

-                            # Save the new version, potentially creating an audit log entry for the version addition if enabled.
-                            if (
-                                settings.AUDIT_LOG_ENABLED
-                                and self.metadata.actor_id is not None
-                            ):
-                                actor = User.objects.filter(
-                                    pk=self.metadata.actor_id,
-                                ).first()
-                                if actor is not None:
-                                    from auditlog.context import (  # type: ignore[import-untyped]
-                                        set_actor,
-                                    )
+                document_parser.parse(self.working_copy, mime_type)

-                                    with set_actor(actor):
-                                        original_document.save()
-                                else:
+                self.log.debug(f"Generating thumbnail for {self.filename}...")
+                self._send_progress(
+                    70,
+                    100,
+                    ProgressStatusOptions.WORKING,
+                    ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
+                )
+                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
+
+                text = document_parser.get_text()
+                date = document_parser.get_date()
+                if date is None:
+                    self._send_progress(
+                        90,
+                        100,
+                        ProgressStatusOptions.WORKING,
+                        ConsumerStatusShortMessage.PARSE_DATE,
+                    )
+                    with get_date_parser() as date_parser:
+                        date = next(date_parser.parse(self.filename, text), None)
+                archive_path = document_parser.get_archive_path()
+                page_count = document_parser.get_page_count(
+                    self.working_copy,
+                    mime_type,
+                )
+
+            except ParseError as e:
+                if tempdir:
+                    tempdir.cleanup()
+                self._fail(
+                    str(e),
+                    f"Error occurred while consuming document {self.filename}: {e}",
+                    exc_info=True,
+                    exception=e,
+                )
+            except Exception as e:
+                if tempdir:
+                    tempdir.cleanup()
+                self._fail(
+                    str(e),
+                    f"Unexpected error while consuming document {self.filename}: {e}",
+                    exc_info=True,
+                    exception=e,
+                )
+
+            # Prepare the document classifier.
+
+            # TODO: I don't really like to do this here, but this way we avoid
+            #   reloading the classifier multiple times, since there are multiple
+            #   post-consume hooks that all require the classifier.
+
+            classifier = load_classifier()
+
+            self._send_progress(
+                95,
+                100,
+                ProgressStatusOptions.WORKING,
+                ConsumerStatusShortMessage.SAVE_DOCUMENT,
+            )
+            # now that everything is done, we can start to store the document
+            # in the system. This will be a transaction and reasonably fast.
+            try:
+                with transaction.atomic():
+                    # store the document.
+                    if self.input_doc.root_document_id:
+                        # If this is a new version of an existing document, we need
+                        # to make sure we're not creating a new document, but updating
+                        # the existing one.
+                        root_doc = Document.objects.get(
+                            pk=self.input_doc.root_document_id,
+                        )
+                        original_document = self._create_version_from_root(
+                            root_doc,
+                            text=text,
+                            page_count=page_count,
+                            mime_type=mime_type,
+                        )
+                        actor = None
+
+                        # Save the new version, potentially creating an audit log entry for the version addition if enabled.
+                        if (
+                            settings.AUDIT_LOG_ENABLED
+                            and self.metadata.actor_id is not None
+                        ):
+                            actor = User.objects.filter(
+                                pk=self.metadata.actor_id,
+                            ).first()
+                            if actor is not None:
+                                from auditlog.context import (  # type: ignore[import-untyped]
+                                    set_actor,
+                                )
+
+                                with set_actor(actor):
                                    original_document.save()
                            else:
                                original_document.save()
-
-                            # Create a log entry for the version addition, if enabled
-                            if settings.AUDIT_LOG_ENABLED:
-                                from auditlog.models import (  # type: ignore[import-untyped]
-                                    LogEntry,
-                                )
-
-                                LogEntry.objects.log_create(
-                                    instance=root_doc,
-                                    changes={
-                                        "Version Added": ["None", original_document.id],
-                                    },
-                                    action=LogEntry.Action.UPDATE,
-                                    actor=actor,
-                                    additional_data={
-                                        "reason": "Version added",
-                                        "version_id": original_document.id,
-                                    },
-                                )
-                            document = original_document
                        else:
-                            document = self._store(
-                                text=text,
-                                date=date,
-                                page_count=page_count,
-                                mime_type=mime_type,
+                            original_document.save()
+
+                        # Create a log entry for the version addition, if enabled
+                        if settings.AUDIT_LOG_ENABLED:
+                            from auditlog.models import (  # type: ignore[import-untyped]
+                                LogEntry,
                            )

-                        # If we get here, it was successful. Proceed with post-consume
-                        # hooks. If they fail, nothing will get changed.
-
-                        document_consumption_finished.send(
-                            sender=self.__class__,
-                            document=document,
-                            logging_group=self.logging_group,
-                            classifier=classifier,
-                            original_file=self.unmodified_original
-                            if self.unmodified_original
-                            else self.working_copy,
+                            LogEntry.objects.log_create(
+                                instance=root_doc,
+                                changes={
+                                    "Version Added": ["None", original_document.id],
+                                },
+                                action=LogEntry.Action.UPDATE,
+                                actor=actor,
+                                additional_data={
+                                    "reason": "Version added",
+                                    "version_id": original_document.id,
+                                },
+                            )
+                        document = original_document
+                    else:
+                        document = self._store(
+                            text=text,
+                            date=date,
+                            page_count=page_count,
+                            mime_type=mime_type,
                        )

-                        # After everything is in the database, copy the files into
-                        # place. If this fails, we'll also rollback the transaction.
-                        with FileLock(settings.MEDIA_LOCK):
-                            generated_filename = generate_unique_filename(document)
+                    # If we get here, it was successful. Proceed with post-consume
+                    # hooks. If they fail, nothing will get changed.
+
+                    document_consumption_finished.send(
+                        sender=self.__class__,
+                        document=document,
+                        logging_group=self.logging_group,
+                        classifier=classifier,
+                        original_file=self.unmodified_original
+                        if self.unmodified_original
+                        else self.working_copy,
+                    )
+
+                    # After everything is in the database, copy the files into
+                    # place. If this fails, we'll also rollback the transaction.
+                    with FileLock(settings.MEDIA_LOCK):
+                        generated_filename = generate_unique_filename(document)
+                        if (
+                            len(str(generated_filename))
+                            > Document.MAX_STORED_FILENAME_LENGTH
+                        ):
+                            self.log.warning(
+                                "Generated source filename exceeds db path limit, falling back to default naming",
+                            )
+                            generated_filename = generate_filename(
+                                document,
+                                use_format=False,
+                            )
+                        document.filename = generated_filename
+                        create_source_path_directory(document.source_path)
+
+                        self._write(
+                            self.unmodified_original
+                            if self.unmodified_original is not None
+                            else self.working_copy,
+                            document.source_path,
+                        )
+
+                        self._write(
+                            thumbnail,
+                            document.thumbnail_path,
+                        )
+
+                        if archive_path and Path(archive_path).is_file():
+                            generated_archive_filename = generate_unique_filename(
+                                document,
+                                archive_filename=True,
+                            )
                            if (
-                                len(str(generated_filename))
+                                len(str(generated_archive_filename))
                                > Document.MAX_STORED_FILENAME_LENGTH
                            ):
                                self.log.warning(
-                                    "Generated source filename exceeds db path limit, falling back to default naming",
+                                    "Generated archive filename exceeds db path limit, falling back to default naming",
                                )
-                                generated_filename = generate_filename(
-                                    document,
-                                    use_format=False,
-                                )
-                            document.filename = generated_filename
-                            create_source_path_directory(document.source_path)
-
-                            self._write(
-                                self.unmodified_original
-                                if self.unmodified_original is not None
-                                else self.working_copy,
-                                document.source_path,
-                            )
-
-                            self._write(
-                                thumbnail,
-                                document.thumbnail_path,
-                            )
-
-                            if archive_path and Path(archive_path).is_file():
-                                generated_archive_filename = generate_unique_filename(
+                                generated_archive_filename = generate_filename(
                                    document,
                                    archive_filename=True,
+                                    use_format=False,
                                )
-                                if (
-                                    len(str(generated_archive_filename))
-                                    > Document.MAX_STORED_FILENAME_LENGTH
-                                ):
-                                    self.log.warning(
-                                        "Generated archive filename exceeds db path limit, falling back to default naming",
-                                    )
-                                    generated_archive_filename = generate_filename(
-                                        document,
-                                        archive_filename=True,
-                                        use_format=False,
-                                    )
-                                document.archive_filename = generated_archive_filename
-                                create_source_path_directory(document.archive_path)
-                                self._write(
-                                    archive_path,
-                                    document.archive_path,
-                                )
-
-                                with Path(archive_path).open("rb") as f:
-                                    document.archive_checksum = hashlib.md5(
-                                        f.read(),
-                                    ).hexdigest()
-
-                        # Don't save with the lock active. Saving will cause the file
-                        # renaming logic to acquire the lock as well.
-                        # This triggers things like file renaming
-                        document.save()
-
-                        if document.root_document_id:
-                            document_updated.send(
-                                sender=self.__class__,
-                                document=document.root_document,
+                            document.archive_filename = generated_archive_filename
+                            create_source_path_directory(document.archive_path)
+                            self._write(
+                                archive_path,
+                                document.archive_path,
                            )

-                        # Delete the file only if it was successfully consumed
-                        self.log.debug(
-                            f"Deleting original file {self.input_doc.original_file}",
-                        )
-                        self.input_doc.original_file.unlink()
-                        self.log.debug(f"Deleting working copy {self.working_copy}")
-                        self.working_copy.unlink()
-                        if self.unmodified_original is not None:  # pragma: no cover
-                            self.log.debug(
-                                f"Deleting unmodified original file {self.unmodified_original}",
-                            )
-                            self.unmodified_original.unlink()
+                            document.archive_checksum = compute_checksum(document.archive_path)

-                        # https://github.com/jonaswinkler/paperless-ng/discussions/1037
-                        shadow_file = (
-                            Path(self.input_doc.original_file).parent
-                            / f"._{Path(self.input_doc.original_file).name}"
+                    # Don't save with the lock active. Saving will cause the file
+                    # renaming logic to acquire the lock as well.
+                    # This triggers things like file renaming
+                    document.save()
+
+                    if document.root_document_id:
+                        document_updated.send(
+                            sender=self.__class__,
+                            document=document.root_document,
                        )

-                        if Path(shadow_file).is_file():
-                            self.log.debug(f"Deleting shadow file {shadow_file}")
-                            Path(shadow_file).unlink()
-
-                except Exception as e:
-                    self._fail(
-                        str(e),
-                        f"The following error occurred while storing document "
-                        f"{self.filename} after parsing: {e}",
-                        exc_info=True,
-                        exception=e,
+                    # Delete the file only if it was successfully consumed
+                    self.log.debug(
+                        f"Deleting original file {self.input_doc.original_file}",
                    )
+                    self.input_doc.original_file.unlink()
+                    self.log.debug(f"Deleting working copy {self.working_copy}")
+                    self.working_copy.unlink()
+                    if self.unmodified_original is not None:  # pragma: no cover
+                        self.log.debug(
+                            f"Deleting unmodified original file {self.unmodified_original}",
+                        )
+                        self.unmodified_original.unlink()
+
+                    # https://github.com/jonaswinkler/paperless-ng/discussions/1037
+                    shadow_file = (
+                        Path(self.input_doc.original_file).parent
+                        / f"._{Path(self.input_doc.original_file).name}"
+                    )
+
+                    if Path(shadow_file).is_file():
+                        self.log.debug(f"Deleting shadow file {shadow_file}")
+                        Path(shadow_file).unlink()
+
+            except Exception as e:
+                self._fail(
+                    str(e),
+                    f"The following error occurred while storing document "
+                    f"{self.filename} after parsing: {e}",
+                    exc_info=True,
+                    exception=e,
+                )
+            finally:
+                tempdir.cleanup()

        self.run_post_consume_script(document)

@@ -776,7 +780,7 @@ class ConsumerPlugin(
            title=title[:127],
            content=text,
            mime_type=mime_type,
-            checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
+            checksum=compute_checksum(file_for_checksum),
            created=create_date,
            modified=create_date,
            page_count=page_count,
@@ -893,10 +897,9 @@ class ConsumerPreflightPlugin(

    def pre_check_duplicate(self) -> None:
        """
-        Using the MD5 of the file, check this exact file doesn't already exist
+        Using the SHA256 of the file, check this exact file doesn't already exist
        """
-        with Path(self.input_doc.original_file).open("rb") as f:
-            checksum = hashlib.md5(f.read()).hexdigest()
+        checksum = compute_checksum(Path(self.input_doc.original_file))
        existing_doc = Document.global_objects.filter(
            Q(checksum=checksum) | Q(archive_checksum=checksum),
        )
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger
 from documents.settings import EXPORTER_ARCHIVE_NAME
 from documents.settings import EXPORTER_FILE_NAME
 from documents.settings import EXPORTER_THUMBNAIL_NAME
+from documents.utils import compute_checksum
 from documents.utils import copy_file_with_basic_stats
 from paperless import version
 from paperless.models import ApplicationConfiguration
@@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand):
            source_stat = source.stat()
            target_stat = target.stat()
            if self.compare_checksums and source_checksum:
-                target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
+                target_checksum = compute_checksum(target)
                perform_copy = target_checksum != source_checksum
            elif (
                source_stat.st_mtime != target_stat.st_mtime
--- a/src/documents/migrations/0016_sha256_checksums.py
+++ b/src/documents/migrations/0016_sha256_checksums.py
@@ -0,0 +1,130 @@
+import hashlib
+import logging
+from pathlib import Path
+
+from django.conf import settings
+from django.db import migrations
+from django.db import models
+
+logger = logging.getLogger("paperless.migrations")
+
+_CHUNK_SIZE = 65536  # 64 KiB — avoids loading entire files into memory
+_BATCH_SIZE = 500  # rows per iterator() fetch and documents per bulk_update call
+_PROGRESS_INTERVAL = 500  # log a progress line every N documents
+
+
+def _sha256(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:  # hashed in _CHUNK_SIZE reads
+        for block in iter(lambda: stream.read(_CHUNK_SIZE), b""):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def recompute_checksums(apps, schema_editor):
+    """
+    Replace each document's stored MD5 checksum with a SHA-256 digest.
+
+    Originals and archive versions are re-hashed from disk; a file that is
+    missing only logs a warning and keeps the value already stored.
+    """
+    Document = apps.get_model("documents", "Document")
+    hashed_fields = ["checksum", "archive_checksum"]
+
+    total = Document.objects.count()
+    if not total:
+        return
+
+    logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
+
+    # Only the columns needed to locate and re-hash the files are loaded
+    rows = Document.objects.only(
+        "pk",
+        "filename",
+        "checksum",
+        "archive_filename",
+        "archive_checksum",
+    ).iterator(chunk_size=_BATCH_SIZE)
+    pending: list = []
+
+    for seen, doc in enumerate(rows, start=1):
+        # Reconstruct source path the same way Document.source_path does
+        name = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
+        original = (settings.ORIGINALS_DIR / Path(name)).resolve()
+
+        rehashed = original.exists()
+        if rehashed:
+            doc.checksum = _sha256(original)
+        else:
+            logger.warning(
+                "Document %s: original file %s not found, checksum not updated.",
+                doc.pk,
+                original,
+            )
+
+        # Mirror Document.has_archive_version: archive_filename is not None
+        if doc.archive_filename is not None:
+            archive_name = Path(str(doc.archive_filename))
+            archive = (settings.ARCHIVE_DIR / archive_name).resolve()
+            if archive.exists():
+                doc.archive_checksum = _sha256(archive)
+                rehashed = True
+            else:
+                logger.warning(
+                    "Document %s: archive file %s not found, checksum not updated.",
+                    doc.pk,
+                    archive,
+                )
+
+        # Queue the row only when at least one digest was recomputed
+        if rehashed:
+            pending.append(doc)
+        if len(pending) >= _BATCH_SIZE:
+            Document.objects.bulk_update(pending, hashed_fields)
+            pending = []
+
+        if seen % _PROGRESS_INTERVAL == 0:
+            percent = seen * 100 // total
+            logger.info(
+                "SHA-256 checksum progress: %d/%d (%d%%)", seen, total, percent
+            )
+
+    if pending:
+        Document.objects.bulk_update(pending, hashed_fields)
+
+    logger.info(
+        "SHA-256 checksum recomputation complete: %d document(s) processed.",
+        total,
+    )
+
+
+class Migration(migrations.Migration):
+    # Widen both checksum columns to 64 characters (SHA-256 hex digest length)
+    # and re-hash stored documents; the data step reverses as a no-op.
+    dependencies = [("documents", "0015_document_version_index_and_more")]
+
+    operations = [
+        migrations.AlterField(
+            model_name="document",
+            name="checksum",
+            field=models.CharField(
+                max_length=64,
+                editable=False,
+                verbose_name="checksum",
+                help_text="The checksum of the original document.",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="document",
+            name="archive_checksum",
+            field=models.CharField(
+                max_length=64,
+                null=True,
+                blank=True,
+                editable=False,
+                verbose_name="archive checksum",
+                help_text="The checksum of the archived document.",
+            ),
+        ),
+        migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner):  # type: ignore[django-manager-

    checksum = models.CharField(
        _("checksum"),
-        max_length=32,
+        max_length=64,
        editable=False,
        help_text=_("The checksum of the original document."),
    )

    archive_checksum = models.CharField(
        _("archive checksum"),
-        max_length=32,
+        max_length=64,
        editable=False,
        blank=True,
        null=True,
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@@ -11,7 +11,6 @@ is an identity function that adds no overhead.

 from __future__ import annotations

-import hashlib
 import logging
 import uuid
 from collections import defaultdict
@@ -30,6 +29,7 @@ from django.utils import timezone

 from documents.models import Document
 from documents.models import PaperlessTask
+from documents.utils import compute_checksum
 from paperless.config import GeneralConfig

 logger = logging.getLogger("paperless.sanity_checker")
@@ -218,7 +218,7 @@ def _check_original(

    present_files.discard(source_path)
    try:
-        checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
+        checksum = compute_checksum(source_path)
    except OSError as e:
        messages.error(doc.pk, f"Cannot read original file of document: {e}")
    else:
@@ -255,7 +255,7 @@ def _check_archive(

        present_files.discard(archive_path)
        try:
-            checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
+            checksum = compute_checksum(archive_path)
        except OSError as e:
            messages.error(
                doc.pk,
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -1,5 +1,4 @@
 import datetime
-import hashlib
 import logging
 import shutil
 import uuid
@@ -61,6 +60,7 @@ from documents.signals import document_updated
 from documents.signals.handlers import cleanup_document_deletion
 from documents.signals.handlers import run_workflows
 from documents.signals.handlers import send_websocket_document_updated
+from documents.utils import compute_checksum
 from documents.workflows.utils import get_workflows_for_trigger
 from paperless.config import AIConfig
 from paperless.parsers import ParserContext
@@ -328,8 +328,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
            with transaction.atomic():
                oldDocument = Document.objects.get(pk=document.pk)
                if parser.get_archive_path():
-                    with Path(parser.get_archive_path()).open("rb") as f:
-                        checksum = hashlib.md5(f.read()).hexdigest()
+                    checksum = compute_checksum(parser.get_archive_path())
                    # I'm going to save first so that in case the file move
                    # fails, the database is rolled back.
                    # We also don't use save() since that triggers the filehandling
--- a/src/documents/tests/conftest.py
+++ b/src/documents/tests/conftest.py
@@ -82,8 +82,8 @@ def sample_doc(

    return DocumentFactory(
        title="test",
-        checksum="42995833e01aea9b3edee44bbfdd7ce1",
-        archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+        checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+        archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
        content="test content",
        pk=1,
        filename="0000001.pdf",
--- a/src/documents/tests/factories.py
+++ b/src/documents/tests/factories.py
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
        model = Document

    title = factory.Faker("sentence", nb_words=4)
-    checksum = factory.Faker("md5")
+    checksum = factory.Faker("sha256")
    content = factory.Faker("paragraph")
    correspondent = None
    document_type = None
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -261,8 +261,14 @@ class TestConsumer(

        self.assertIsFile(document.archive_path)

-        self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
-        self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
+        self.assertEqual(
+            document.checksum,
+            "1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+        )
+        self.assertEqual(
+            document.archive_checksum,
+            "706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
+        )

        self.assertIsNotFile(filename)

--- a/src/documents/tests/test_management_exporter.py
+++ b/src/documents/tests/test_management_exporter.py
@@ -63,8 +63,8 @@ class TestExportImport(

        self.d1 = Document.objects.create(
            content="Content",
-            checksum="42995833e01aea9b3edee44bbfdd7ce1",
-            archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+            checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+            archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
            title="wow1",
            filename="0000001.pdf",
            mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
        )
        self.d2 = Document.objects.create(
            content="Content",
-            checksum="9c9691e51741c1f4f41a20896af31770",
+            checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
            title="wow2",
            filename="0000002.pdf",
            mime_type="application/pdf",
        )
        self.d3 = Document.objects.create(
            content="Content",
-            checksum="d38d7ed02e988e072caf924e0f3fcb76",
+            checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
            title="wow2",
            filename="0000003.pdf",
            mime_type="application/pdf",
        )
        self.d4 = Document.objects.create(
            content="Content",
-            checksum="82186aaa94f0b98697d704b90fd1c072",
+            checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
            title="wow_dec",
            filename="0000004.pdf",
            mime_type="application/pdf",
@@ -239,7 +239,7 @@ class TestExportImport(
                )

                with Path(fname).open("rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
+                    checksum = hashlib.sha256(f.read()).hexdigest()
                self.assertEqual(checksum, element["fields"]["checksum"])

                # Generated field "content_length" should not be exported,
@@ -253,7 +253,7 @@ class TestExportImport(
                    self.assertIsFile(fname)

                    with Path(fname).open("rb") as f:
-                        checksum = hashlib.md5(f.read()).hexdigest()
+                        checksum = hashlib.sha256(f.read()).hexdigest()
                    self.assertEqual(checksum, element["fields"]["archive_checksum"])

            elif element["model"] == "documents.note":
--- a/src/documents/tests/test_management_importer.py
+++ b/src/documents/tests/test_management_importer.py
@@ -277,8 +277,8 @@ class TestCommandImport(

        Document.objects.create(
            content="Content",
-            checksum="42995833e01aea9b3edee44bbfdd7ce1",
-            archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
+            checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
+            archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
            title="wow1",
            filename="0000001.pdf",
            mime_type="application/pdf",
--- a/src/documents/utils.py
+++ b/src/documents/utils.py
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import shutil
 from os import utime
@@ -128,3 +129,28 @@ def get_boolean(boolstr: str) -> bool:
    Return a boolean value from a string representation.
    """
    return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
+
+
+def compute_checksum(path: Path | str, chunk_size: int = 65536) -> str:
+    """
+    Compute the SHA-256 checksum of a file.
+
+    Reads the file in chunks to avoid loading the entire file into memory.
+
+    Args:
+        path (Path | str): Path to the file to hash; a plain string works too.
+        chunk_size (int, optional): Number of bytes to read per chunk.
+            Defaults to 65536.
+
+    Returns:
+        str: Hexadecimal SHA-256 digest of the file contents.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        OSError: If the file cannot be read.
+    """
+    h = hashlib.sha256()
+    with Path(path).open("rb") as f:
+        while chunk := f.read(chunk_size):
+            h.update(chunk)
+    return h.hexdigest()
Author	SHA1	Message	Date
Trenton H	4934ce0ce8	Adds a docstring that an IDE will render better	2026-03-25 11:16:10 -07:00
Trenton H	a6393aef58	Handles the rename of the migration	2026-03-25 11:16:10 -07:00
Trenton H	0c0ba6510d	Fixes logging so I can see it	2026-03-25 11:16:10 -07:00
Trenton H	f97e8c0452	Batch based iteration and bulk updates, with chunked file reading	2026-03-25 11:16:10 -07:00
Trenton H	226e4b3696	Transitions to SHA256 based checksums	2026-03-25 11:16:05 -07:00