diff --git a/src/documents/consumer.py b/src/documents/consumer.py index a80594409..170849153 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,5 +1,4 @@ import datetime -import hashlib import os import shutil import tempfile @@ -47,6 +46,7 @@ from documents.signals import document_consumption_started from documents.signals import document_updated from documents.signals.handlers import run_workflows from documents.templating.workflows import parse_w_workflow_placeholders +from documents.utils import compute_checksum from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess @@ -237,9 +237,7 @@ class ConsumerPlugin( version_doc = Document( root_document=root_doc_frozen, version_index=next_version_index + 1, - checksum=hashlib.md5( - file_for_checksum.read_bytes(), - ).hexdigest(), + checksum=compute_checksum(file_for_checksum), content=text or "", page_count=page_count, mime_type=mime_type, @@ -683,10 +681,9 @@ class ConsumerPlugin( document.archive_path, ) - with Path(archive_path).open("rb") as f: - document.archive_checksum = hashlib.md5( - f.read(), - ).hexdigest() + document.archive_checksum = compute_checksum( + document.archive_path, + ) # Don't save with the lock active. Saving will cause the file # renaming logic to acquire the lock as well. 
@@ -826,7 +823,7 @@ class ConsumerPlugin( title=title[:127], content=text, mime_type=mime_type, - checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(), + checksum=compute_checksum(file_for_checksum), created=create_date, modified=create_date, page_count=page_count, @@ -943,10 +940,9 @@ class ConsumerPreflightPlugin( def pre_check_duplicate(self) -> None: """ - Using the MD5 of the file, check this exact file doesn't already exist + Using the SHA256 of the file, check this exact file doesn't already exist """ - with Path(self.input_doc.original_file).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() + checksum = compute_checksum(Path(self.input_doc.original_file)) existing_doc = Document.global_objects.filter( Q(checksum=checksum) | Q(archive_checksum=checksum), ) diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index b8ccca0ab..cd1cee6b3 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger from documents.settings import EXPORTER_ARCHIVE_NAME from documents.settings import EXPORTER_FILE_NAME from documents.settings import EXPORTER_THUMBNAIL_NAME +from documents.utils import compute_checksum from documents.utils import copy_file_with_basic_stats from paperless import version from paperless.models import ApplicationConfiguration @@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand): source_stat = source.stat() target_stat = target.stat() if self.compare_checksums and source_checksum: - target_checksum = hashlib.md5(target.read_bytes()).hexdigest() + target_checksum = compute_checksum(target) perform_copy = target_checksum != source_checksum elif ( source_stat.st_mtime != target_stat.st_mtime diff --git a/src/documents/migrations/0016_sha256_checksums.py b/src/documents/migrations/0016_sha256_checksums.py new file mode 
import hashlib
import logging
from pathlib import Path

from django.conf import settings
from django.db import migrations
from django.db import models

logger = logging.getLogger("paperless.migrations")

_CHUNK_SIZE = 65536  # 64 KiB — avoids loading entire files into memory
_BATCH_SIZE = 500  # documents loaded and written per batch
_PROGRESS_INTERVAL = 500  # log a progress line every N documents

# The only columns this data migration writes.
_CHECKSUM_FIELDS = ["checksum", "archive_checksum"]


def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in ``_CHUNK_SIZE`` pieces so large documents are never
    held in memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        while chunk := fh.read(_CHUNK_SIZE):
            digest.update(chunk)
    return digest.hexdigest()


def recompute_checksums(apps, schema_editor):
    """Recompute all document checksums from MD5 to SHA256.

    For every document the original file (and the archive file, when
    ``archive_filename`` is set) is re-hashed from disk.  Documents whose
    files are missing keep their previous checksum and a warning is logged.

    Args:
        apps: Historical app registry supplied by the migration framework.
        schema_editor: Unused; required by the ``RunPython`` signature.
    """
    Document = apps.get_model("documents", "Document")

    # Snapshot the primary keys up front instead of streaming the table with
    # ``.iterator()``: rows of this same table are updated while we go, and
    # SQLite offers no isolation between a still-open cursor and writes made
    # on the same connection.
    pks = list(Document.objects.order_by("pk").values_list("pk", flat=True))
    total = len(pks)
    if total == 0:
        return

    logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)

    processed = 0

    for start in range(0, total, _BATCH_SIZE):
        # Iterating a plain queryset loads the whole batch before the loop
        # body runs, so no read cursor is open when bulk_update() executes.
        batch_docs = Document.objects.filter(
            pk__in=pks[start : start + _BATCH_SIZE],
        ).only(
            "pk",
            "filename",
            "checksum",
            "archive_filename",
            "archive_checksum",
        )

        changed = []

        for doc in batch_docs:
            modified = False

            # Reconstruct source path the same way Document.source_path does
            fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
            source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()

            if source_path.exists():
                doc.checksum = _sha256(source_path)
                modified = True
            else:
                logger.warning(
                    "Document %s: original file %s not found, checksum not updated.",
                    doc.pk,
                    source_path,
                )

            # Mirror Document.has_archive_version: archive_filename is not None
            if doc.archive_filename is not None:
                archive_path = (
                    settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
                ).resolve()
                if archive_path.exists():
                    doc.archive_checksum = _sha256(archive_path)
                    modified = True
                else:
                    logger.warning(
                        "Document %s: archive file %s not found, checksum not updated.",
                        doc.pk,
                        archive_path,
                    )

            if modified:
                changed.append(doc)

            processed += 1

            if processed % _PROGRESS_INTERVAL == 0:
                logger.info(
                    "SHA-256 checksum progress: %d/%d (%d%%)",
                    processed,
                    total,
                    processed * 100 // total,
                )

        if changed:
            Document.objects.bulk_update(changed, _CHECKSUM_FIELDS)

    # Report what was actually visited rather than the initial count.
    logger.info(
        "SHA-256 checksum recomputation complete: %d document(s) processed.",
        processed,
    )


class Migration(migrations.Migration):
    """Widen both checksum columns to 64 characters, then re-hash every file."""

    dependencies = [
        ("documents", "0015_document_version_index_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                help_text="The checksum of the original document.",
                max_length=64,
                verbose_name="checksum",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="archive_checksum",
            field=models.CharField(
                blank=True,
                editable=False,
                help_text="The checksum of the archived document.",
                max_length=64,
                null=True,
                verbose_name="archive checksum",
            ),
        ),
        # NOTE(review): the reverse step is a no-op, so rolling back leaves
        # 64-character digests in place while the columns shrink back to 32;
        # confirm this is acceptable on backends that enforce column length.
        migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
    ]
100644 --- a/src/documents/sanity_checker.py +++ b/src/documents/sanity_checker.py @@ -11,7 +11,6 @@ is an identity function that adds no overhead. from __future__ import annotations -import hashlib import logging import uuid from collections import defaultdict @@ -30,6 +29,7 @@ from django.utils import timezone from documents.models import Document from documents.models import PaperlessTask +from documents.utils import compute_checksum from paperless.config import GeneralConfig logger = logging.getLogger("paperless.sanity_checker") @@ -218,7 +218,7 @@ def _check_original( present_files.discard(source_path) try: - checksum = hashlib.md5(source_path.read_bytes()).hexdigest() + checksum = compute_checksum(source_path) except OSError as e: messages.error(doc.pk, f"Cannot read original file of document: {e}") else: @@ -255,7 +255,7 @@ def _check_archive( present_files.discard(archive_path) try: - checksum = hashlib.md5(archive_path.read_bytes()).hexdigest() + checksum = compute_checksum(archive_path) except OSError as e: messages.error( doc.pk, diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 8b5db781c..6da02486f 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -1,5 +1,4 @@ import datetime -import hashlib import logging import shutil import uuid @@ -62,6 +61,7 @@ from documents.signals import document_updated from documents.signals.handlers import cleanup_document_deletion from documents.signals.handlers import run_workflows from documents.signals.handlers import send_websocket_document_updated +from documents.utils import compute_checksum from documents.workflows.utils import get_workflows_for_trigger from paperless.config import AIConfig from paperless.parsers import ParserContext @@ -338,8 +338,7 @@ def update_document_content_maybe_archive_file(document_id) -> None: with transaction.atomic(): oldDocument = Document.objects.get(pk=document.pk) if parser.get_archive_path(): - with Path(parser.get_archive_path()).open("rb") as f: - 
checksum = hashlib.md5(f.read()).hexdigest() + checksum = compute_checksum(parser.get_archive_path()) # I'm going to save first so that in case the file move # fails, the database is rolled back. # We also don't use save() since that triggers the filehandling diff --git a/src/documents/tests/conftest.py b/src/documents/tests/conftest.py index a33771fd1..7e75b9194 100644 --- a/src/documents/tests/conftest.py +++ b/src/documents/tests/conftest.py @@ -82,8 +82,8 @@ def sample_doc( return DocumentFactory( title="test", - checksum="42995833e01aea9b3edee44bbfdd7ce1", - archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", + checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab", + archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf", content="test content", pk=1, filename="0000001.pdf", diff --git a/src/documents/tests/factories.py b/src/documents/tests/factories.py index d1d88587c..b0fd68428 100644 --- a/src/documents/tests/factories.py +++ b/src/documents/tests/factories.py @@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory): model = Document title = factory.Faker("sentence", nb_words=4) - checksum = factory.Faker("md5") + checksum = factory.Faker("sha256") content = factory.Faker("paragraph") correspondent = None document_type = None diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 279e4c1b0..3b91c9b5e 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -261,8 +261,14 @@ class TestConsumer( self.assertIsFile(document.archive_path) - self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1") - self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b") + self.assertEqual( + document.checksum, + "1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab", + ) + self.assertEqual( + document.archive_checksum, + "706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf", + ) 
self.assertIsNotFile(filename) diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index 9307bab45..fb9effa0e 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -63,8 +63,8 @@ class TestExportImport( self.d1 = Document.objects.create( content="Content", - checksum="42995833e01aea9b3edee44bbfdd7ce1", - archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", + checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab", + archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf", title="wow1", filename="0000001.pdf", mime_type="application/pdf", @@ -72,21 +72,21 @@ class TestExportImport( ) self.d2 = Document.objects.create( content="Content", - checksum="9c9691e51741c1f4f41a20896af31770", + checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b", title="wow2", filename="0000002.pdf", mime_type="application/pdf", ) self.d3 = Document.objects.create( content="Content", - checksum="d38d7ed02e988e072caf924e0f3fcb76", + checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915", title="wow2", filename="0000003.pdf", mime_type="application/pdf", ) self.d4 = Document.objects.create( content="Content", - checksum="82186aaa94f0b98697d704b90fd1c072", + checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b", title="wow_dec", filename="0000004.pdf", mime_type="application/pdf", @@ -239,7 +239,7 @@ class TestExportImport( ) with Path(fname).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() + checksum = hashlib.sha256(f.read()).hexdigest() self.assertEqual(checksum, element["fields"]["checksum"]) # Generated field "content_length" should not be exported, @@ -253,7 +253,7 @@ class TestExportImport( self.assertIsFile(fname) with Path(fname).open("rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() + checksum = 
import hashlib
import shutil
import tempfile
from pathlib import Path

from django.conf import settings
from django.db import connection
from django.test import override_settings

from documents.tests.utils import TestMigrations


def _sha256(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *data*."""
    return hashlib.sha256(data).hexdigest()


class TestSha256ChecksumDataMigration(TestMigrations):
    """recompute_checksums correctly updates document checksums from MD5 to SHA256."""

    migrate_from = "0015_document_version_index_and_more"
    migrate_to = "0016_sha256_checksums"
    reset_sequences = True

    ORIGINAL_CONTENT = b"original file content for sha256 migration test"
    ARCHIVE_CONTENT = b"archive file content for sha256 migration test"

    def setUpBeforeMigration(self, apps) -> None:
        """Create five documents covering each present/missing file combination.

        Presumably invoked by ``TestMigrations`` before the migration under
        test runs — verify against the base class.
        """
        # Register cleanups immediately after acquiring each resource:
        # unittest runs them even when setUp() fails (e.g. the migration under
        # test raises), whereas tearDown() would be skipped and the settings
        # override would leak into later tests.
        self._originals_dir = Path(tempfile.mkdtemp())
        self.addCleanup(shutil.rmtree, self._originals_dir, ignore_errors=True)
        self._archive_dir = Path(tempfile.mkdtemp())
        self.addCleanup(shutil.rmtree, self._archive_dir, ignore_errors=True)

        self._settings_override = override_settings(
            ORIGINALS_DIR=self._originals_dir,
            ARCHIVE_DIR=self._archive_dir,
        )
        self._settings_override.enable()
        # Cleanups run last-in-first-out, so the override is disabled before
        # the directories are removed — same order as the former tearDown().
        self.addCleanup(self._settings_override.disable)

        Document = apps.get_model("documents", "Document")

        # doc1: original file present, no archive
        (settings.ORIGINALS_DIR / "doc1.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc1_id = Document.objects.create(
            title="Doc 1",
            mime_type="text/plain",
            filename="doc1.txt",
            checksum="a" * 32,
        ).pk

        # doc2: original and archive both present
        (settings.ORIGINALS_DIR / "doc2.txt").write_bytes(self.ORIGINAL_CONTENT)
        (settings.ARCHIVE_DIR / "doc2.pdf").write_bytes(self.ARCHIVE_CONTENT)
        self.doc2_id = Document.objects.create(
            title="Doc 2",
            mime_type="text/plain",
            filename="doc2.txt",
            checksum="b" * 32,
            archive_filename="doc2.pdf",
            archive_checksum="c" * 32,
        ).pk

        # doc3: original file missing — checksum must stay unchanged
        self.doc3_id = Document.objects.create(
            title="Doc 3",
            mime_type="text/plain",
            filename="missing_original.txt",
            checksum="d" * 32,
        ).pk

        # doc4: original present, archive_filename set but archive file missing
        (settings.ORIGINALS_DIR / "doc4.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc4_id = Document.objects.create(
            title="Doc 4",
            mime_type="text/plain",
            filename="doc4.txt",
            checksum="e" * 32,
            archive_filename="missing_archive.pdf",
            archive_checksum="f" * 32,
        ).pk

        # doc5: original present, archive_filename is None — archive_checksum must stay null
        (settings.ORIGINALS_DIR / "doc5.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc5_id = Document.objects.create(
            title="Doc 5",
            mime_type="text/plain",
            filename="doc5.txt",
            checksum="0" * 32,
            archive_filename=None,
            archive_checksum=None,
        ).pk

    def _fixture_teardown(self) -> None:
        """Flush fixtures, then reset the SQLite autoincrement counter for documents."""
        super()._fixture_teardown()
        # Django's SQLite backend returns [] from sequence_reset_sql(), so
        # reset_sequences=True flushes rows but never clears sqlite_sequence.
        # Explicitly delete the entry so subsequent tests start from pk=1.
        if connection.vendor == "sqlite":
            with connection.cursor() as cursor:
                cursor.execute(
                    "DELETE FROM sqlite_sequence WHERE name='documents_document'",
                )

    def test_original_checksum_updated_to_sha256_when_file_exists(self) -> None:
        """doc1: the original's checksum becomes the SHA-256 of its content."""
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc1_id)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))

    def test_both_checksums_updated_when_original_and_archive_exist(self) -> None:
        """doc2: both the original and archive checksums are re-hashed."""
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc2_id)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
        self.assertEqual(doc.archive_checksum, _sha256(self.ARCHIVE_CONTENT))

    def test_checksum_unchanged_when_original_file_missing(self) -> None:
        """doc3: a missing original leaves the stored checksum untouched."""
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc3_id)
        self.assertEqual(doc.checksum, "d" * 32)

    def test_archive_checksum_unchanged_when_archive_file_missing(self) -> None:
        """doc4: the original is re-hashed while the missing archive is left alone."""
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc4_id)
        # Original was updated (file exists)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
        # Archive was not updated (file missing)
        self.assertEqual(doc.archive_checksum, "f" * 32)

    def test_archive_checksum_stays_null_when_no_archive_filename(self) -> None:
        """doc5: without an archive filename the archive checksum remains NULL."""
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc5_id)
        self.assertIsNone(doc.archive_checksum)
+1,4 @@ +import hashlib import logging import shutil from os import utime @@ -128,3 +129,28 @@ def get_boolean(boolstr: str) -> bool: Return a boolean value from a string representation. """ return bool(boolstr.lower() in ("yes", "y", "1", "t", "true")) + + +def compute_checksum(path: Path, chunk_size: int = 65536) -> str: + """ + Compute the SHA-256 checksum of a file. + + Reads the file in chunks to avoid loading the entire file into memory. + + Args: + path (Path): Path to the file to hash. + chunk_size (int, optional): Number of bytes to read per chunk. + Defaults to 65536. + + Returns: + str: Hexadecimal SHA-256 digest of the file contents. + + Raises: + FileNotFoundError: If the file does not exist. + OSError: If the file cannot be read. + """ + h = hashlib.sha256() + with path.open("rb") as f: + while chunk := f.read(chunk_size): + h.update(chunk) + return h.hexdigest()