diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 53fd1ca0f..8de83727f 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,5 +1,4 @@ import datetime -import hashlib import os import tempfile from enum import StrEnum @@ -48,6 +47,7 @@ from documents.signals import document_consumption_started from documents.signals import document_updated from documents.signals.handlers import run_workflows from documents.templating.workflows import parse_w_workflow_placeholders +from documents.utils import compute_checksum from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess @@ -214,9 +214,7 @@ class ConsumerPlugin( version_doc = Document( root_document=root_doc_frozen, version_index=next_version_index + 1, - checksum=hashlib.md5( - file_for_checksum.read_bytes(), - ).hexdigest(), + checksum=compute_checksum(file_for_checksum), content=text or "", page_count=page_count, mime_type=mime_type, @@ -687,10 +685,9 @@ class ConsumerPlugin( document.archive_path, ) - with Path(archive_path).open("rb") as f: - document.archive_checksum = hashlib.md5( - f.read(), - ).hexdigest() + document.archive_checksum = compute_checksum( + Path(archive_path), + ) # Don't save with the lock active. Saving will cause the file # renaming logic to acquire the lock as well. 
@@ -831,7 +828,7 @@ class ConsumerPlugin( title=title[:127], content=text, mime_type=mime_type, - checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(), + checksum=compute_checksum(file_for_checksum), created=create_date, modified=create_date, page_count=page_count, @@ -948,10 +945,9 @@ class ConsumerPreflightPlugin( def pre_check_duplicate(self) -> None: """ - Using the MD5 of the file, check this exact file doesn't already exist + Using the SHA256 of the file, check this exact file doesn't already exist """ - with Path(self.input_doc.original_file).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() + checksum = compute_checksum(Path(self.input_doc.original_file)) existing_doc = Document.global_objects.filter( Q(checksum=checksum) | Q(archive_checksum=checksum), ) diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index b8ccca0ab..cd1cee6b3 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger from documents.settings import EXPORTER_ARCHIVE_NAME from documents.settings import EXPORTER_FILE_NAME from documents.settings import EXPORTER_THUMBNAIL_NAME +from documents.utils import compute_checksum from documents.utils import copy_file_with_basic_stats from paperless import version from paperless.models import ApplicationConfiguration @@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand): source_stat = source.stat() target_stat = target.stat() if self.compare_checksums and source_checksum: - target_checksum = hashlib.md5(target.read_bytes()).hexdigest() + target_checksum = compute_checksum(target) perform_copy = target_checksum != source_checksum elif ( source_stat.st_mtime != target_stat.st_mtime diff --git a/src/documents/migrations/0017_sha256_checksums.py b/src/documents/migrations/0017_sha256_checksums.py new file mode 
def _sha256_of_file(path, chunk_size=65536):
    """
    Return the SHA256 hex digest of the file at *path*.

    The file is read in pieces of *chunk_size* bytes so that large documents
    are never held in memory as a whole while the migration runs. The helper
    lives in the migration itself so the migration does not depend on
    application code that may change after it has been written.
    """
    digest = hashlib.sha256()
    with path.open("rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()


def recompute_checksums(apps, schema_editor):
    """
    Recompute all document checksums from MD5 to SHA256.

    For every document, the original file (and the archive file, when an
    archive filename is recorded) is hashed and the matching checksum field
    is overwritten. A file that does not exist on disk is logged with a
    warning and its stored checksum is left untouched. Only the fields that
    were actually recomputed are written back.

    apps: the historical app registry passed in by ``RunPython``.
    schema_editor: unused; part of the ``RunPython`` callable signature.
    """
    Document = apps.get_model("documents", "Document")

    for doc in Document.objects.all().iterator():
        updated_fields = []

        # Reconstruct source path the same way Document.source_path does
        fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
        source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()

        if source_path.exists():
            doc.checksum = _sha256_of_file(source_path)
            updated_fields.append("checksum")
        else:
            logger.warning(
                "Document %s: original file %s not found, checksum not updated.",
                doc.pk,
                source_path,
            )

        # Mirror Document.has_archive_version: archive_filename is not None
        if doc.archive_filename is not None:
            archive_path = (
                settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
            ).resolve()
            if archive_path.exists():
                doc.archive_checksum = _sha256_of_file(archive_path)
                updated_fields.append("archive_checksum")
            else:
                logger.warning(
                    "Document %s: archive file %s not found, checksum not updated.",
                    doc.pk,
                    archive_path,
                )

        # Skip the write entirely when neither file could be hashed.
        if updated_fields:
            doc.save(update_fields=updated_fields)
name="archive_checksum", + field=models.CharField( + blank=True, + editable=False, + help_text="The checksum of the archived document.", + max_length=64, + null=True, + verbose_name="archive checksum", + ), + ), + migrations.RunPython(recompute_checksums, migrations.RunPython.noop), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 6147ac001..08f995ff6 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager- checksum = models.CharField( _("checksum"), - max_length=32, + max_length=64, editable=False, help_text=_("The checksum of the original document."), ) archive_checksum = models.CharField( _("archive checksum"), - max_length=32, + max_length=64, editable=False, blank=True, null=True, diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py index ef0d37e7d..b53ed8cfb 100644 --- a/src/documents/sanity_checker.py +++ b/src/documents/sanity_checker.py @@ -11,7 +11,6 @@ is an identity function that adds no overhead. 
from __future__ import annotations -import hashlib import logging import uuid from collections import defaultdict @@ -30,6 +29,7 @@ from django.utils import timezone from documents.models import Document from documents.models import PaperlessTask +from documents.utils import compute_checksum from paperless.config import GeneralConfig logger = logging.getLogger("paperless.sanity_checker") @@ -218,7 +218,7 @@ def _check_original( present_files.discard(source_path) try: - checksum = hashlib.md5(source_path.read_bytes()).hexdigest() + checksum = compute_checksum(source_path) except OSError as e: messages.error(doc.pk, f"Cannot read original file of document: {e}") else: @@ -255,7 +255,7 @@ def _check_archive( present_files.discard(archive_path) try: - checksum = hashlib.md5(archive_path.read_bytes()).hexdigest() + checksum = compute_checksum(archive_path) except OSError as e: messages.error( doc.pk, diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 378695731..4827edf47 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -1,5 +1,4 @@ import datetime -import hashlib import logging import shutil import uuid @@ -63,6 +62,7 @@ from documents.signals import document_updated from documents.signals.handlers import cleanup_document_deletion from documents.signals.handlers import run_workflows from documents.signals.handlers import send_websocket_document_updated +from documents.utils import compute_checksum from documents.workflows.utils import get_workflows_for_trigger from paperless.config import AIConfig from paperless_ai.indexing import llm_index_add_or_update_document @@ -327,8 +327,7 @@ def update_document_content_maybe_archive_file(document_id) -> None: with transaction.atomic(): oldDocument = Document.objects.get(pk=document.pk) if parser.get_archive_path(): - with Path(parser.get_archive_path()).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() + checksum = compute_checksum(Path(parser.get_archive_path())) # I'm going to 
save first so that in case the file move # fails, the database is rolled back. # We also don't use save() since that triggers the filehandling diff --git a/src/documents/tests/conftest.py b/src/documents/tests/conftest.py index a33771fd1..7e75b9194 100644 --- a/src/documents/tests/conftest.py +++ b/src/documents/tests/conftest.py @@ -82,8 +82,8 @@ def sample_doc( return DocumentFactory( title="test", - checksum="42995833e01aea9b3edee44bbfdd7ce1", - archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", + checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab", + archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf", content="test content", pk=1, filename="0000001.pdf", diff --git a/src/documents/tests/factories.py b/src/documents/tests/factories.py index d1d88587c..b0fd68428 100644 --- a/src/documents/tests/factories.py +++ b/src/documents/tests/factories.py @@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory): model = Document title = factory.Faker("sentence", nb_words=4) - checksum = factory.Faker("md5") + checksum = factory.Faker("sha256") content = factory.Faker("paragraph") correspondent = None document_type = None diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a59c7d676..f46042e7a 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -245,8 +245,14 @@ class TestConsumer( self.assertIsFile(document.archive_path) - self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1") - self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b") + self.assertEqual( + document.checksum, + "1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab", + ) + self.assertEqual( + document.archive_checksum, + "706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf", + ) self.assertIsNotFile(filename) diff --git a/src/documents/tests/test_management_exporter.py 
b/src/documents/tests/test_management_exporter.py index 9307bab45..fb9effa0e 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -63,8 +63,8 @@ class TestExportImport( self.d1 = Document.objects.create( content="Content", - checksum="42995833e01aea9b3edee44bbfdd7ce1", - archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", + checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab", + archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf", title="wow1", filename="0000001.pdf", mime_type="application/pdf", @@ -72,21 +72,21 @@ class TestExportImport( ) self.d2 = Document.objects.create( content="Content", - checksum="9c9691e51741c1f4f41a20896af31770", + checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b", title="wow2", filename="0000002.pdf", mime_type="application/pdf", ) self.d3 = Document.objects.create( content="Content", - checksum="d38d7ed02e988e072caf924e0f3fcb76", + checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915", title="wow2", filename="0000003.pdf", mime_type="application/pdf", ) self.d4 = Document.objects.create( content="Content", - checksum="82186aaa94f0b98697d704b90fd1c072", + checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b", title="wow_dec", filename="0000004.pdf", mime_type="application/pdf", @@ -239,7 +239,7 @@ class TestExportImport( ) with Path(fname).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() + checksum = hashlib.sha256(f.read()).hexdigest() self.assertEqual(checksum, element["fields"]["checksum"]) # Generated field "content_length" should not be exported, @@ -253,7 +253,7 @@ class TestExportImport( self.assertIsFile(fname) with Path(fname).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() + checksum = hashlib.sha256(f.read()).hexdigest() self.assertEqual(checksum, element["fields"]["archive_checksum"]) elif element["model"] 
== "documents.note": diff --git a/src/documents/tests/test_management_importer.py b/src/documents/tests/test_management_importer.py index ef20cf895..a890718a1 100644 --- a/src/documents/tests/test_management_importer.py +++ b/src/documents/tests/test_management_importer.py @@ -277,8 +277,8 @@ class TestCommandImport( Document.objects.create( content="Content", - checksum="42995833e01aea9b3edee44bbfdd7ce1", - archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", + checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab", + archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf", title="wow1", filename="0000001.pdf", mime_type="application/pdf", diff --git a/src/documents/utils.py b/src/documents/utils.py index 2b6a60749..b13fcb466 100644 --- a/src/documents/utils.py +++ b/src/documents/utils.py @@ -1,3 +1,4 @@ +import hashlib import logging import shutil from os import utime @@ -128,3 +129,15 @@ def get_boolean(boolstr: str) -> bool: Return a boolean value from a string representation. """ return bool(boolstr.lower() in ("yes", "y", "1", "t", "true")) + + +def compute_checksum(path: Path, chunk_size: int = 65536) -> str: + """ + Return the SHA256 hex digest of the file at *path*, reading in chunks + of *chunk_size* bytes to avoid loading the entire file into memory. + """ + h = hashlib.sha256() + with path.open("rb") as f: + while chunk := f.read(chunk_size): + h.update(chunk) + return h.hexdigest()