Compare commits

..

5 Commits

Author SHA1 Message Date
Trenton H
4934ce0ce8 Adds a docstring that an IDE will render better 2026-03-25 11:16:10 -07:00
Trenton H
a6393aef58 Handles the rename of the migration 2026-03-25 11:16:10 -07:00
Trenton H
0c0ba6510d Fixes logging so I can see it 2026-03-25 11:16:10 -07:00
Trenton H
f97e8c0452 Batch based iteration and bulk updates, with chunked file reading 2026-03-25 11:16:10 -07:00
Trenton H
226e4b3696 Transitions to SHA256 based checksums 2026-03-25 11:16:05 -07:00
12 changed files with 441 additions and 276 deletions

View File

@@ -1,5 +1,4 @@
import datetime
import hashlib
import os
import tempfile
from enum import StrEnum
@@ -46,6 +45,7 @@ from documents.signals import document_consumption_started
from documents.signals import document_updated
from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import compute_checksum
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
@@ -196,9 +196,7 @@ class ConsumerPlugin(
version_doc = Document(
root_document=root_doc_frozen,
version_index=next_version_index + 1,
checksum=hashlib.md5(
file_for_checksum.read_bytes(),
).hexdigest(),
checksum=compute_checksum(file_for_checksum),
content=text or "",
page_count=page_count,
mime_type=mime_type,
@@ -338,15 +336,18 @@ class ConsumerPlugin(
Return the document object if it was successfully created.
"""
# Preflight has already run including progress update to 0%
self.log.info(f"Consuming {self.filename}")
tempdir = None
# For the actual work, copy the file into a tempdir
with tempfile.TemporaryDirectory(
prefix="paperless-ngx",
dir=settings.SCRATCH_DIR,
) as tmpdir:
self.working_copy = Path(tmpdir) / Path(self.filename)
try:
# Preflight has already run including progress update to 0%
self.log.info(f"Consuming {self.filename}")
# For the actual work, copy the file into a tempdir
tempdir = tempfile.TemporaryDirectory(
prefix="paperless-ngx",
dir=settings.SCRATCH_DIR,
)
self.working_copy = Path(tempdir.name) / Path(self.filename)
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
self.unmodified_original = None
@@ -378,7 +379,7 @@ class ConsumerPlugin(
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
# Save the original file for later
self.unmodified_original = (
Path(tmpdir) / Path("uo") / Path(self.filename)
Path(tempdir.name) / Path("uo") / Path(self.filename)
)
self.unmodified_original.parent.mkdir(exist_ok=True)
copy_file_with_basic_stats(
@@ -397,6 +398,7 @@ class ConsumerPlugin(
)
)
if not parser_class:
tempdir.cleanup()
self._fail(
ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
f"Unsupported mime type {mime_type}",
@@ -411,275 +413,277 @@ class ConsumerPlugin(
)
self.run_pre_consume_script()
except:
if tempdir:
tempdir.cleanup()
raise
# This doesn't parse the document yet, but gives us a parser.
with parser_class() as document_parser:
document_parser.configure(
ParserContext(mailrule_id=self.input_doc.mailrule_id),
)
# This doesn't parse the document yet, but gives us a parser.
with parser_class() as document_parser:
document_parser.configure(
ParserContext(mailrule_id=self.input_doc.mailrule_id),
)
self.log.debug(
f"Parser: {document_parser.name} v{document_parser.version}",
)
self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")
# Parse the document. This may take some time.
# Parse the document. This may take some time.
text = None
date = None
thumbnail = None
archive_path = None
page_count = None
try:
self._send_progress(
20,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
self.log.debug(f"Parsing {self.filename}...")
document_parser.parse(self.working_copy, mime_type)
self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(
70,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
thumbnail = document_parser.get_thumbnail(
self.working_copy,
mime_type,
)
text = document_parser.get_text()
date = document_parser.get_date()
if date is None:
self._send_progress(
90,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.PARSE_DATE,
)
with get_date_parser() as date_parser:
date = next(date_parser.parse(self.filename, text), None)
archive_path = document_parser.get_archive_path()
page_count = document_parser.get_page_count(
self.working_copy,
mime_type,
)
except ParseError as e:
self._fail(
str(e),
f"Error occurred while consuming document {self.filename}: {e}",
exc_info=True,
exception=e,
)
except Exception as e:
self._fail(
str(e),
f"Unexpected error while consuming document {self.filename}: {e}",
exc_info=True,
exception=e,
)
# Prepare the document classifier.
# TODO: I don't really like to do this here, but this way we avoid
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
classifier = load_classifier()
text = None
date = None
thumbnail = None
archive_path = None
page_count = None
try:
self._send_progress(
95,
20,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.SAVE_DOCUMENT,
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
with transaction.atomic():
# store the document.
if self.input_doc.root_document_id:
# If this is a new version of an existing document, we need
# to make sure we're not creating a new document, but updating
# the existing one.
root_doc = Document.objects.get(
pk=self.input_doc.root_document_id,
)
original_document = self._create_version_from_root(
root_doc,
text=text,
page_count=page_count,
mime_type=mime_type,
)
actor = None
self.log.debug(f"Parsing {self.filename}...")
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
if (
settings.AUDIT_LOG_ENABLED
and self.metadata.actor_id is not None
):
actor = User.objects.filter(
pk=self.metadata.actor_id,
).first()
if actor is not None:
from auditlog.context import ( # type: ignore[import-untyped]
set_actor,
)
document_parser.parse(self.working_copy, mime_type)
with set_actor(actor):
original_document.save()
else:
self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(
70,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
text = document_parser.get_text()
date = document_parser.get_date()
if date is None:
self._send_progress(
90,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.PARSE_DATE,
)
with get_date_parser() as date_parser:
date = next(date_parser.parse(self.filename, text), None)
archive_path = document_parser.get_archive_path()
page_count = document_parser.get_page_count(
self.working_copy,
mime_type,
)
except ParseError as e:
if tempdir:
tempdir.cleanup()
self._fail(
str(e),
f"Error occurred while consuming document {self.filename}: {e}",
exc_info=True,
exception=e,
)
except Exception as e:
if tempdir:
tempdir.cleanup()
self._fail(
str(e),
f"Unexpected error while consuming document {self.filename}: {e}",
exc_info=True,
exception=e,
)
# Prepare the document classifier.
# TODO: I don't really like to do this here, but this way we avoid
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
classifier = load_classifier()
self._send_progress(
95,
100,
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.SAVE_DOCUMENT,
)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
with transaction.atomic():
# store the document.
if self.input_doc.root_document_id:
# If this is a new version of an existing document, we need
# to make sure we're not creating a new document, but updating
# the existing one.
root_doc = Document.objects.get(
pk=self.input_doc.root_document_id,
)
original_document = self._create_version_from_root(
root_doc,
text=text,
page_count=page_count,
mime_type=mime_type,
)
actor = None
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
if (
settings.AUDIT_LOG_ENABLED
and self.metadata.actor_id is not None
):
actor = User.objects.filter(
pk=self.metadata.actor_id,
).first()
if actor is not None:
from auditlog.context import ( # type: ignore[import-untyped]
set_actor,
)
with set_actor(actor):
original_document.save()
else:
original_document.save()
# Create a log entry for the version addition, if enabled
if settings.AUDIT_LOG_ENABLED:
from auditlog.models import ( # type: ignore[import-untyped]
LogEntry,
)
LogEntry.objects.log_create(
instance=root_doc,
changes={
"Version Added": ["None", original_document.id],
},
action=LogEntry.Action.UPDATE,
actor=actor,
additional_data={
"reason": "Version added",
"version_id": original_document.id,
},
)
document = original_document
else:
document = self._store(
text=text,
date=date,
page_count=page_count,
mime_type=mime_type,
original_document.save()
# Create a log entry for the version addition, if enabled
if settings.AUDIT_LOG_ENABLED:
from auditlog.models import ( # type: ignore[import-untyped]
LogEntry,
)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier,
original_file=self.unmodified_original
if self.unmodified_original
else self.working_copy,
LogEntry.objects.log_create(
instance=root_doc,
changes={
"Version Added": ["None", original_document.id],
},
action=LogEntry.Action.UPDATE,
actor=actor,
additional_data={
"reason": "Version added",
"version_id": original_document.id,
},
)
document = original_document
else:
document = self._store(
text=text,
date=date,
page_count=page_count,
mime_type=mime_type,
)
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
with FileLock(settings.MEDIA_LOCK):
generated_filename = generate_unique_filename(document)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier,
original_file=self.unmodified_original
if self.unmodified_original
else self.working_copy,
)
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
with FileLock(settings.MEDIA_LOCK):
generated_filename = generate_unique_filename(document)
if (
len(str(generated_filename))
> Document.MAX_STORED_FILENAME_LENGTH
):
self.log.warning(
"Generated source filename exceeds db path limit, falling back to default naming",
)
generated_filename = generate_filename(
document,
use_format=False,
)
document.filename = generated_filename
create_source_path_directory(document.source_path)
self._write(
self.unmodified_original
if self.unmodified_original is not None
else self.working_copy,
document.source_path,
)
self._write(
thumbnail,
document.thumbnail_path,
)
if archive_path and Path(archive_path).is_file():
generated_archive_filename = generate_unique_filename(
document,
archive_filename=True,
)
if (
len(str(generated_filename))
len(str(generated_archive_filename))
> Document.MAX_STORED_FILENAME_LENGTH
):
self.log.warning(
"Generated source filename exceeds db path limit, falling back to default naming",
"Generated archive filename exceeds db path limit, falling back to default naming",
)
generated_filename = generate_filename(
document,
use_format=False,
)
document.filename = generated_filename
create_source_path_directory(document.source_path)
self._write(
self.unmodified_original
if self.unmodified_original is not None
else self.working_copy,
document.source_path,
)
self._write(
thumbnail,
document.thumbnail_path,
)
if archive_path and Path(archive_path).is_file():
generated_archive_filename = generate_unique_filename(
generated_archive_filename = generate_filename(
document,
archive_filename=True,
use_format=False,
)
if (
len(str(generated_archive_filename))
> Document.MAX_STORED_FILENAME_LENGTH
):
self.log.warning(
"Generated archive filename exceeds db path limit, falling back to default naming",
)
generated_archive_filename = generate_filename(
document,
archive_filename=True,
use_format=False,
)
document.archive_filename = generated_archive_filename
create_source_path_directory(document.archive_path)
self._write(
archive_path,
document.archive_path,
)
with Path(archive_path).open("rb") as f:
document.archive_checksum = hashlib.md5(
f.read(),
).hexdigest()
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
# This triggers things like file renaming
document.save()
if document.root_document_id:
document_updated.send(
sender=self.__class__,
document=document.root_document,
document.archive_filename = generated_archive_filename
create_source_path_directory(document.archive_path)
self._write(
archive_path,
document.archive_path,
)
# Delete the file only if it was successfully consumed
self.log.debug(
f"Deleting original file {self.input_doc.original_file}",
)
self.input_doc.original_file.unlink()
self.log.debug(f"Deleting working copy {self.working_copy}")
self.working_copy.unlink()
if self.unmodified_original is not None: # pragma: no cover
self.log.debug(
f"Deleting unmodified original file {self.unmodified_original}",
)
self.unmodified_original.unlink()
document.archive_checksum = compute_checksum(document.archive_path)
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = (
Path(self.input_doc.original_file).parent
/ f"._{Path(self.input_doc.original_file).name}"
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
# This triggers things like file renaming
document.save()
if document.root_document_id:
document_updated.send(
sender=self.__class__,
document=document.root_document,
)
if Path(shadow_file).is_file():
self.log.debug(f"Deleting shadow file {shadow_file}")
Path(shadow_file).unlink()
except Exception as e:
self._fail(
str(e),
f"The following error occurred while storing document "
f"{self.filename} after parsing: {e}",
exc_info=True,
exception=e,
# Delete the file only if it was successfully consumed
self.log.debug(
f"Deleting original file {self.input_doc.original_file}",
)
self.input_doc.original_file.unlink()
self.log.debug(f"Deleting working copy {self.working_copy}")
self.working_copy.unlink()
if self.unmodified_original is not None: # pragma: no cover
self.log.debug(
f"Deleting unmodified original file {self.unmodified_original}",
)
self.unmodified_original.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = (
Path(self.input_doc.original_file).parent
/ f"._{Path(self.input_doc.original_file).name}"
)
if Path(shadow_file).is_file():
self.log.debug(f"Deleting shadow file {shadow_file}")
Path(shadow_file).unlink()
except Exception as e:
self._fail(
str(e),
f"The following error occurred while storing document "
f"{self.filename} after parsing: {e}",
exc_info=True,
exception=e,
)
finally:
tempdir.cleanup()
self.run_post_consume_script(document)
@@ -776,7 +780,7 @@ class ConsumerPlugin(
title=title[:127],
content=text,
mime_type=mime_type,
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
checksum=compute_checksum(file_for_checksum),
created=create_date,
modified=create_date,
page_count=page_count,
@@ -893,10 +897,9 @@ class ConsumerPreflightPlugin(
def pre_check_duplicate(self) -> None:
"""
Using the MD5 of the file, check this exact file doesn't already exist
Using the SHA256 of the file, check this exact file doesn't already exist
"""
with Path(self.input_doc.original_file).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = compute_checksum(Path(self.input_doc.original_file))
existing_doc = Document.global_objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)

View File

@@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import compute_checksum
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.models import ApplicationConfiguration
@@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand):
source_stat = source.stat()
target_stat = target.stat()
if self.compare_checksums and source_checksum:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
target_checksum = compute_checksum(target)
perform_copy = target_checksum != source_checksum
elif (
source_stat.st_mtime != target_stat.st_mtime

View File

@@ -0,0 +1,130 @@
import hashlib
import logging
from pathlib import Path
from django.conf import settings
from django.db import migrations
from django.db import models
logger = logging.getLogger("paperless.migrations")
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
_BATCH_SIZE = 500 # documents per bulk_update call
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
def _sha256(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as fh:
while chunk := fh.read(_CHUNK_SIZE):
h.update(chunk)
return h.hexdigest()
def recompute_checksums(apps, schema_editor):
"""Recompute all document checksums from MD5 to SHA256."""
Document = apps.get_model("documents", "Document")
total = Document.objects.count()
if total == 0:
return
logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
batch: list = []
processed = 0
for doc in Document.objects.only(
"pk",
"filename",
"checksum",
"archive_filename",
"archive_checksum",
).iterator(chunk_size=_BATCH_SIZE):
updated_fields: list[str] = []
# Reconstruct source path the same way Document.source_path does
fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
if source_path.exists():
doc.checksum = _sha256(source_path)
updated_fields.append("checksum")
else:
logger.warning(
"Document %s: original file %s not found, checksum not updated.",
doc.pk,
source_path,
)
# Mirror Document.has_archive_version: archive_filename is not None
if doc.archive_filename is not None:
archive_path = (
settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
).resolve()
if archive_path.exists():
doc.archive_checksum = _sha256(archive_path)
updated_fields.append("archive_checksum")
else:
logger.warning(
"Document %s: archive file %s not found, checksum not updated.",
doc.pk,
archive_path,
)
if updated_fields:
batch.append(doc)
processed += 1
if len(batch) >= _BATCH_SIZE:
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
batch.clear()
if processed % _PROGRESS_INTERVAL == 0:
logger.info(
"SHA-256 checksum progress: %d/%d (%d%%)",
processed,
total,
processed * 100 // total,
)
if batch:
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
logger.info(
"SHA-256 checksum recomputation complete: %d document(s) processed.",
total,
)
class Migration(migrations.Migration):
dependencies = [
("documents", "0015_document_version_index_and_more"),
]
operations = [
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=64,
verbose_name="checksum",
),
),
migrations.AlterField(
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=64,
null=True,
verbose_name="archive checksum",
),
),
migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
]

View File

@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
checksum = models.CharField(
_("checksum"),
max_length=32,
max_length=64,
editable=False,
help_text=_("The checksum of the original document."),
)
archive_checksum = models.CharField(
_("archive checksum"),
max_length=32,
max_length=64,
editable=False,
blank=True,
null=True,

View File

@@ -11,7 +11,6 @@ is an identity function that adds no overhead.
from __future__ import annotations
import hashlib
import logging
import uuid
from collections import defaultdict
@@ -30,6 +29,7 @@ from django.utils import timezone
from documents.models import Document
from documents.models import PaperlessTask
from documents.utils import compute_checksum
from paperless.config import GeneralConfig
logger = logging.getLogger("paperless.sanity_checker")
@@ -218,7 +218,7 @@ def _check_original(
present_files.discard(source_path)
try:
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
checksum = compute_checksum(source_path)
except OSError as e:
messages.error(doc.pk, f"Cannot read original file of document: {e}")
else:
@@ -255,7 +255,7 @@ def _check_archive(
present_files.discard(archive_path)
try:
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
checksum = compute_checksum(archive_path)
except OSError as e:
messages.error(
doc.pk,

View File

@@ -1,5 +1,4 @@
import datetime
import hashlib
import logging
import shutil
import uuid
@@ -61,6 +60,7 @@ from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.utils import compute_checksum
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless.parsers import ParserContext
@@ -328,8 +328,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
if parser.get_archive_path():
with Path(parser.get_archive_path()).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = compute_checksum(parser.get_archive_path())
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling

View File

@@ -82,8 +82,8 @@ def sample_doc(
return DocumentFactory(
title="test",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
content="test content",
pk=1,
filename="0000001.pdf",

View File

@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
model = Document
title = factory.Faker("sentence", nb_words=4)
checksum = factory.Faker("md5")
checksum = factory.Faker("sha256")
content = factory.Faker("paragraph")
correspondent = None
document_type = None

View File

@@ -261,8 +261,14 @@ class TestConsumer(
self.assertIsFile(document.archive_path)
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
self.assertEqual(
document.checksum,
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
)
self.assertEqual(
document.archive_checksum,
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
)
self.assertIsNotFile(filename)

View File

@@ -63,8 +63,8 @@ class TestExportImport(
self.d1 = Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
)
self.d2 = Document.objects.create(
content="Content",
checksum="9c9691e51741c1f4f41a20896af31770",
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
title="wow2",
filename="0000002.pdf",
mime_type="application/pdf",
)
self.d3 = Document.objects.create(
content="Content",
checksum="d38d7ed02e988e072caf924e0f3fcb76",
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
title="wow2",
filename="0000003.pdf",
mime_type="application/pdf",
)
self.d4 = Document.objects.create(
content="Content",
checksum="82186aaa94f0b98697d704b90fd1c072",
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
title="wow_dec",
filename="0000004.pdf",
mime_type="application/pdf",
@@ -239,7 +239,7 @@ class TestExportImport(
)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["checksum"])
# Generated field "content_length" should not be exported,
@@ -253,7 +253,7 @@ class TestExportImport(
self.assertIsFile(fname)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["archive_checksum"])
elif element["model"] == "documents.note":

View File

@@ -277,8 +277,8 @@ class TestCommandImport(
Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",

View File

@@ -1,3 +1,4 @@
import hashlib
import logging
import shutil
from os import utime
@@ -128,3 +129,28 @@ def get_boolean(boolstr: str) -> bool:
Return a boolean value from a string representation.
"""
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
"""
Compute the SHA-256 checksum of a file.
Reads the file in chunks to avoid loading the entire file into memory.
Args:
path (Path): Path to the file to hash.
chunk_size (int, optional): Number of bytes to read per chunk.
Defaults to 65536.
Returns:
str: Hexadecimal SHA-256 digest of the file contents.
Raises:
FileNotFoundError: If the file does not exist.
OSError: If the file cannot be read.
"""
h = hashlib.sha256()
with path.open("rb") as f:
while chunk := f.read(chunk_size):
h.update(chunk)
return h.hexdigest()