Compare commits

...

3 Commits

Author SHA1 Message Date
Trenton H
3ae0b8e219 Fixes logging so I can see it 2026-03-06 12:04:54 -08:00
Trenton H
4a6fd02492 Batch based iteration and bulk updates, with chunked file reading 2026-03-06 11:44:41 -08:00
Trenton H
76fb8f3770 Transitions to SHA256 based checksums 2026-03-06 11:33:33 -08:00
12 changed files with 182 additions and 37 deletions

View File

@@ -1,5 +1,4 @@
import datetime
import hashlib
import os
import tempfile
from enum import StrEnum
@@ -48,6 +47,7 @@ from documents.signals import document_consumption_started
from documents.signals import document_updated
from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import compute_checksum
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
@@ -196,9 +196,7 @@ class ConsumerPlugin(
version_doc = Document(
root_document=root_doc_frozen,
version_index=next_version_index + 1,
checksum=hashlib.md5(
file_for_checksum.read_bytes(),
).hexdigest(),
checksum=compute_checksum(file_for_checksum),
content=text or "",
page_count=page_count,
mime_type=mime_type,
@@ -656,10 +654,9 @@ class ConsumerPlugin(
document.archive_path,
)
with Path(archive_path).open("rb") as f:
document.archive_checksum = hashlib.md5(
f.read(),
).hexdigest()
document.archive_checksum = compute_checksum(
Path(archive_path),
)
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
@@ -800,7 +797,7 @@ class ConsumerPlugin(
title=title[:127],
content=text,
mime_type=mime_type,
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
checksum=compute_checksum(file_for_checksum),
created=create_date,
modified=create_date,
page_count=page_count,
@@ -917,10 +914,9 @@ class ConsumerPreflightPlugin(
def pre_check_duplicate(self) -> None:
"""
Using the MD5 of the file, check this exact file doesn't already exist
Using the SHA256 of the file, check this exact file doesn't already exist
"""
with Path(self.input_doc.original_file).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = compute_checksum(Path(self.input_doc.original_file))
existing_doc = Document.global_objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)

View File

@@ -58,6 +58,7 @@ from documents.models import WorkflowTrigger
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import compute_checksum
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.models import ApplicationConfiguration
@@ -549,14 +550,14 @@ class Command(CryptMixin, BaseCommand):
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)
if self.compare_json:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
target_checksum = compute_checksum(target)
src_str = json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
)
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
src_checksum = hashlib.sha256(src_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum:
perform_write = False
@@ -592,7 +593,7 @@ class Command(CryptMixin, BaseCommand):
source_stat = source.stat()
target_stat = target.stat()
if self.compare_checksums and source_checksum:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
target_checksum = compute_checksum(target)
perform_copy = target_checksum != source_checksum
elif (
source_stat.st_mtime != target_stat.st_mtime

View File

@@ -0,0 +1,130 @@
import hashlib
import logging
from pathlib import Path
from django.conf import settings
from django.db import migrations
from django.db import models
logger = logging.getLogger("paperless.migrations")
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
_BATCH_SIZE = 500 # documents per bulk_update call
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
def _sha256(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as fh:
while chunk := fh.read(_CHUNK_SIZE):
h.update(chunk)
return h.hexdigest()
def recompute_checksums(apps, schema_editor):
"""Recompute all document checksums from MD5 to SHA256."""
Document = apps.get_model("documents", "Document")
total = Document.objects.count()
if total == 0:
return
logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
batch: list = []
processed = 0
for doc in Document.objects.only(
"pk",
"filename",
"checksum",
"archive_filename",
"archive_checksum",
).iterator(chunk_size=_BATCH_SIZE):
updated_fields: list[str] = []
# Reconstruct source path the same way Document.source_path does
fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
if source_path.exists():
doc.checksum = _sha256(source_path)
updated_fields.append("checksum")
else:
logger.warning(
"Document %s: original file %s not found, checksum not updated.",
doc.pk,
source_path,
)
# Mirror Document.has_archive_version: archive_filename is not None
if doc.archive_filename is not None:
archive_path = (
settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
).resolve()
if archive_path.exists():
doc.archive_checksum = _sha256(archive_path)
updated_fields.append("archive_checksum")
else:
logger.warning(
"Document %s: archive file %s not found, checksum not updated.",
doc.pk,
archive_path,
)
if updated_fields:
batch.append(doc)
processed += 1
if len(batch) >= _BATCH_SIZE:
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
batch.clear()
if processed % _PROGRESS_INTERVAL == 0:
logger.info(
"SHA-256 checksum progress: %d/%d (%d%%)",
processed,
total,
processed * 100 // total,
)
if batch:
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
logger.info(
"SHA-256 checksum recomputation complete: %d document(s) processed.",
total,
)
class Migration(migrations.Migration):
dependencies = [
("documents", "0016_document_version_index_and_more"),
]
operations = [
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=64,
verbose_name="checksum",
),
),
migrations.AlterField(
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=64,
null=True,
verbose_name="archive checksum",
),
),
migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
]

View File

@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
checksum = models.CharField(
_("checksum"),
max_length=32,
max_length=64,
editable=False,
help_text=_("The checksum of the original document."),
)
archive_checksum = models.CharField(
_("archive checksum"),
max_length=32,
max_length=64,
editable=False,
blank=True,
null=True,

View File

@@ -11,7 +11,6 @@ is an identity function that adds no overhead.
from __future__ import annotations
import hashlib
import logging
import uuid
from collections import defaultdict
@@ -30,6 +29,7 @@ from django.utils import timezone
from documents.models import Document
from documents.models import PaperlessTask
from documents.utils import compute_checksum
from paperless.config import GeneralConfig
logger = logging.getLogger("paperless.sanity_checker")
@@ -218,7 +218,7 @@ def _check_original(
present_files.discard(source_path)
try:
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
checksum = compute_checksum(source_path)
except OSError as e:
messages.error(doc.pk, f"Cannot read original file of document: {e}")
else:
@@ -255,7 +255,7 @@ def _check_archive(
present_files.discard(archive_path)
try:
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
checksum = compute_checksum(archive_path)
except OSError as e:
messages.error(
doc.pk,

View File

@@ -1,5 +1,4 @@
import datetime
import hashlib
import logging
import shutil
import uuid
@@ -63,6 +62,7 @@ from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.utils import compute_checksum
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless_ai.indexing import llm_index_add_or_update_document
@@ -323,8 +323,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
if parser.get_archive_path():
with Path(parser.get_archive_path()).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = compute_checksum(Path(parser.get_archive_path()))
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling

View File

@@ -82,8 +82,8 @@ def sample_doc(
return DocumentFactory(
title="test",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
content="test content",
pk=1,
filename="0000001.pdf",

View File

@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
model = Document
title = factory.Faker("sentence", nb_words=4)
checksum = factory.Faker("md5")
checksum = factory.Faker("sha256")
content = factory.Faker("paragraph")
correspondent = None
document_type = None

View File

@@ -245,8 +245,14 @@ class TestConsumer(
self.assertIsFile(document.archive_path)
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
self.assertEqual(
document.checksum,
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
)
self.assertEqual(
document.archive_checksum,
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
)
self.assertIsNotFile(filename)

View File

@@ -63,8 +63,8 @@ class TestExportImport(
self.d1 = Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
)
self.d2 = Document.objects.create(
content="Content",
checksum="9c9691e51741c1f4f41a20896af31770",
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
title="wow2",
filename="0000002.pdf",
mime_type="application/pdf",
)
self.d3 = Document.objects.create(
content="Content",
checksum="d38d7ed02e988e072caf924e0f3fcb76",
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
title="wow2",
filename="0000003.pdf",
mime_type="application/pdf",
)
self.d4 = Document.objects.create(
content="Content",
checksum="82186aaa94f0b98697d704b90fd1c072",
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
title="wow_dec",
filename="0000004.pdf",
mime_type="application/pdf",
@@ -240,7 +240,7 @@ class TestExportImport(
)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["checksum"])
# Generated field "content_length" should not be exported,
@@ -254,7 +254,7 @@ class TestExportImport(
self.assertIsFile(fname)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["archive_checksum"])
elif element["model"] == "documents.note":

View File

@@ -260,8 +260,8 @@ class TestCommandImport(
Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",

View File

@@ -1,3 +1,4 @@
import hashlib
import logging
import shutil
from os import utime
@@ -128,3 +129,15 @@ def get_boolean(boolstr: str) -> bool:
Return a boolean value from a string representation.
"""
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
"""
Return the SHA256 hex digest of the file at *path*, reading in chunks
of *chunk_size* bytes to avoid loading the entire file into memory.
"""
h = hashlib.sha256()
with path.open("rb") as f:
while chunk := f.read(chunk_size):
h.update(chunk)
return h.hexdigest()