Compare commits

..

1 Commits

13 changed files with 41 additions and 330 deletions

View File

@@ -1,5 +1,7 @@
import datetime
import hashlib
import os
import shutil
import tempfile
from enum import StrEnum
from pathlib import Path
@@ -45,7 +47,6 @@ from documents.signals import document_consumption_started
from documents.signals import document_updated
from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import compute_checksum
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
@@ -196,7 +197,9 @@ class ConsumerPlugin(
version_doc = Document(
root_document=root_doc_frozen,
version_index=next_version_index + 1,
checksum=compute_checksum(file_for_checksum),
checksum=hashlib.md5(
file_for_checksum.read_bytes(),
).hexdigest(),
content=text or "",
page_count=page_count,
mime_type=mime_type,
@@ -638,9 +641,10 @@ class ConsumerPlugin(
document.archive_path,
)
document.archive_checksum = compute_checksum(
document.archive_path,
)
with Path(archive_path).open("rb") as f:
document.archive_checksum = hashlib.md5(
f.read(),
).hexdigest()
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
@@ -782,7 +786,7 @@ class ConsumerPlugin(
title=title[:127],
content=text,
mime_type=mime_type,
checksum=compute_checksum(file_for_checksum),
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
created=create_date,
modified=create_date,
page_count=page_count,
@@ -830,7 +834,7 @@ class ConsumerPlugin(
self.metadata.view_users is not None
or self.metadata.view_groups is not None
or self.metadata.change_users is not None
or self.metadata.change_users is not None
or self.metadata.change_groups is not None
):
permissions = {
"view": {
@@ -863,7 +867,7 @@ class ConsumerPlugin(
Path(source).open("rb") as read_file,
Path(target).open("wb") as write_file,
):
write_file.write(read_file.read())
shutil.copyfileobj(read_file, write_file)
# Attempt to copy file's original stats, but it's ok if we can't
try:
@@ -899,9 +903,10 @@ class ConsumerPreflightPlugin(
def pre_check_duplicate(self) -> None:
"""
Using the SHA256 of the file, check this exact file doesn't already exist
Using the MD5 of the file, check this exact file doesn't already exist
"""
checksum = compute_checksum(Path(self.input_doc.original_file))
with Path(self.input_doc.original_file).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
existing_doc = Document.global_objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)

View File

@@ -56,7 +56,6 @@ from documents.models import WorkflowTrigger
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import compute_checksum
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.models import ApplicationConfiguration
@@ -694,7 +693,7 @@ class Command(CryptMixin, PaperlessCommand):
source_stat = source.stat()
target_stat = target.stat()
if self.compare_checksums and source_checksum:
target_checksum = compute_checksum(target)
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
perform_copy = target_checksum != source_checksum
elif (
source_stat.st_mtime != target_stat.st_mtime

View File

@@ -1,130 +0,0 @@
import hashlib
import logging
from pathlib import Path
from django.conf import settings
from django.db import migrations
from django.db import models
logger = logging.getLogger("paperless.migrations")
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
_BATCH_SIZE = 500 # documents per bulk_update call
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
def _sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    Reads in ``_CHUNK_SIZE`` pieces so large documents are never loaded
    into memory all at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        for piece in iter(lambda: stream.read(_CHUNK_SIZE), b""):
            digest.update(piece)
    return digest.hexdigest()
def recompute_checksums(apps, schema_editor):
    """Recompute all document checksums from MD5 to SHA256.

    Walks every Document row, re-hashes the original file (and the archive
    file when one is recorded) with SHA-256, and persists the new digests
    via batched ``bulk_update`` calls.  Documents whose files are missing
    on disk keep their old checksum and are logged as warnings.
    """
    # Use the historical model so field definitions match this migration state.
    Document = apps.get_model("documents", "Document")
    total = Document.objects.count()
    if total == 0:
        return
    logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
    batch: list = []
    processed = 0
    # Stream rows with iterator() to keep memory flat; only() limits the
    # fetched columns to what is read or written below.
    for doc in Document.objects.only(
        "pk",
        "filename",
        "checksum",
        "archive_filename",
        "archive_checksum",
    ).iterator(chunk_size=_BATCH_SIZE):
        updated_fields: list[str] = []
        # Reconstruct source path the same way Document.source_path does
        fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
        source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
        if source_path.exists():
            doc.checksum = _sha256(source_path)
            updated_fields.append("checksum")
        else:
            # Missing original: leave the stored checksum untouched.
            logger.warning(
                "Document %s: original file %s not found, checksum not updated.",
                doc.pk,
                source_path,
            )
        # Mirror Document.has_archive_version: archive_filename is not None
        if doc.archive_filename is not None:
            archive_path = (
                settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
            ).resolve()
            if archive_path.exists():
                doc.archive_checksum = _sha256(archive_path)
                updated_fields.append("archive_checksum")
            else:
                # Missing archive: keep the old archive_checksum value.
                logger.warning(
                    "Document %s: archive file %s not found, checksum not updated.",
                    doc.pk,
                    archive_path,
                )
        if updated_fields:
            batch.append(doc)
        processed += 1
        # Flush a full batch; both columns are listed even when only one
        # changed for a given doc — bulk_update writes whatever is on the
        # instance, which is unchanged for the untouched field.
        if len(batch) >= _BATCH_SIZE:
            Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
            batch.clear()
        if processed % _PROGRESS_INTERVAL == 0:
            logger.info(
                "SHA-256 checksum progress: %d/%d (%d%%)",
                processed,
                total,
                processed * 100 // total,
            )
    # Flush the final partial batch, if any.
    if batch:
        Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
    logger.info(
        "SHA-256 checksum recomputation complete: %d document(s) processed.",
        total,
    )
class Migration(migrations.Migration):
    """Widen checksum columns for SHA-256 and recompute stored digests.

    MD5 hex digests are 32 chars; SHA-256 hex digests are 64.  The
    AlterField operations widen both columns first so the subsequent
    data migration can store the longer values.
    """

    dependencies = [
        ("documents", "0015_document_version_index_and_more"),
    ]
    operations = [
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                help_text="The checksum of the original document.",
                max_length=64,
                verbose_name="checksum",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="archive_checksum",
            field=models.CharField(
                blank=True,
                editable=False,
                help_text="The checksum of the archived document.",
                max_length=64,
                null=True,
                verbose_name="archive checksum",
            ),
        ),
        # Reverse is a no-op: the widened column can still hold the old
        # MD5 values, so no data needs rewriting on rollback.
        migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
    ]

View File

@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
checksum = models.CharField(
_("checksum"),
max_length=64,
max_length=32,
editable=False,
help_text=_("The checksum of the original document."),
)
archive_checksum = models.CharField(
_("archive checksum"),
max_length=64,
max_length=32,
editable=False,
blank=True,
null=True,

View File

@@ -11,6 +11,7 @@ is an identity function that adds no overhead.
from __future__ import annotations
import hashlib
import logging
import uuid
from collections import defaultdict
@@ -29,7 +30,6 @@ from django.utils import timezone
from documents.models import Document
from documents.models import PaperlessTask
from documents.utils import compute_checksum
from paperless.config import GeneralConfig
logger = logging.getLogger("paperless.sanity_checker")
@@ -218,7 +218,7 @@ def _check_original(
present_files.discard(source_path)
try:
checksum = compute_checksum(source_path)
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(doc.pk, f"Cannot read original file of document: {e}")
else:
@@ -255,7 +255,7 @@ def _check_archive(
present_files.discard(archive_path)
try:
checksum = compute_checksum(archive_path)
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(
doc.pk,

View File

@@ -1,4 +1,5 @@
import datetime
import hashlib
import logging
import shutil
import uuid
@@ -52,15 +53,14 @@ from documents.models import Tag
from documents.models import WorkflowRun
from documents.models import WorkflowTrigger
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import ProgressManager
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions
from documents.sanity_checker import SanityCheckFailedException
from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.utils import compute_checksum
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless.parsers import ParserContext
@@ -328,7 +328,8 @@ def update_document_content_maybe_archive_file(document_id) -> None:
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
if parser.get_archive_path():
checksum = compute_checksum(parser.get_archive_path())
with Path(parser.get_archive_path()).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling
@@ -532,13 +533,13 @@ def check_scheduled_workflows() -> None:
id__in=matched_ids,
)
if documents.count() > 0:
if documents.exists():
documents = prefilter_documents_by_workflowtrigger(
documents,
trigger,
)
if documents.count() > 0:
if documents.exists():
logger.debug(
f"Found {documents.count()} documents for trigger {trigger}",
)

View File

@@ -82,8 +82,8 @@ def sample_doc(
return DocumentFactory(
title="test",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
content="test content",
pk=1,
filename="0000001.pdf",

View File

@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
model = Document
title = factory.Faker("sentence", nb_words=4)
checksum = factory.Faker("sha256")
checksum = factory.Faker("md5")
content = factory.Faker("paragraph")
correspondent = None
document_type = None

View File

@@ -261,14 +261,8 @@ class TestConsumer(
self.assertIsFile(document.archive_path)
self.assertEqual(
document.checksum,
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
)
self.assertEqual(
document.archive_checksum,
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
)
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
self.assertIsNotFile(filename)

View File

@@ -63,8 +63,8 @@ class TestExportImport(
self.d1 = Document.objects.create(
content="Content",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
)
self.d2 = Document.objects.create(
content="Content",
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
checksum="9c9691e51741c1f4f41a20896af31770",
title="wow2",
filename="0000002.pdf",
mime_type="application/pdf",
)
self.d3 = Document.objects.create(
content="Content",
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
checksum="d38d7ed02e988e072caf924e0f3fcb76",
title="wow2",
filename="0000003.pdf",
mime_type="application/pdf",
)
self.d4 = Document.objects.create(
content="Content",
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
checksum="82186aaa94f0b98697d704b90fd1c072",
title="wow_dec",
filename="0000004.pdf",
mime_type="application/pdf",
@@ -239,7 +239,7 @@ class TestExportImport(
)
with Path(fname).open("rb") as f:
checksum = hashlib.sha256(f.read()).hexdigest()
checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["checksum"])
# Generated field "content_length" should not be exported,
@@ -253,7 +253,7 @@ class TestExportImport(
self.assertIsFile(fname)
with Path(fname).open("rb") as f:
checksum = hashlib.sha256(f.read()).hexdigest()
checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["archive_checksum"])
elif element["model"] == "documents.note":

View File

@@ -277,8 +277,8 @@ class TestCommandImport(
Document.objects.create(
content="Content",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",

View File

@@ -1,132 +0,0 @@
import hashlib
import shutil
import tempfile
from pathlib import Path
from django.conf import settings
from django.db import connection
from django.test import override_settings
from documents.tests.utils import TestMigrations
def _sha256(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
class TestSha256ChecksumDataMigration(TestMigrations):
    """recompute_checksums correctly updates document checksums from MD5 to SHA256."""

    migrate_from = "0015_document_version_index_and_more"
    migrate_to = "0016_sha256_checksums"
    reset_sequences = True

    # Known payloads whose SHA-256 digests the assertions recompute below.
    ORIGINAL_CONTENT = b"original file content for sha256 migration test"
    ARCHIVE_CONTENT = b"archive file content for sha256 migration test"

    def setUpBeforeMigration(self, apps) -> None:
        """Create five documents covering every file-presence combination.

        Temp dirs stand in for ORIGINALS_DIR / ARCHIVE_DIR so the data
        migration resolves fixture files instead of real storage.
        """
        self._originals_dir = Path(tempfile.mkdtemp())
        self._archive_dir = Path(tempfile.mkdtemp())
        self._settings_override = override_settings(
            ORIGINALS_DIR=self._originals_dir,
            ARCHIVE_DIR=self._archive_dir,
        )
        self._settings_override.enable()
        Document = apps.get_model("documents", "Document")
        # doc1: original file present, no archive
        (settings.ORIGINALS_DIR / "doc1.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc1_id = Document.objects.create(
            title="Doc 1",
            mime_type="text/plain",
            filename="doc1.txt",
            checksum="a" * 32,  # placeholder MD5-length value
            ).pk
        # doc2: original and archive both present
        (settings.ORIGINALS_DIR / "doc2.txt").write_bytes(self.ORIGINAL_CONTENT)
        (settings.ARCHIVE_DIR / "doc2.pdf").write_bytes(self.ARCHIVE_CONTENT)
        self.doc2_id = Document.objects.create(
            title="Doc 2",
            mime_type="text/plain",
            filename="doc2.txt",
            checksum="b" * 32,
            archive_filename="doc2.pdf",
            archive_checksum="c" * 32,
        ).pk
        # doc3: original file missing — checksum must stay unchanged
        self.doc3_id = Document.objects.create(
            title="Doc 3",
            mime_type="text/plain",
            filename="missing_original.txt",
            checksum="d" * 32,
        ).pk
        # doc4: original present, archive_filename set but archive file missing
        (settings.ORIGINALS_DIR / "doc4.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc4_id = Document.objects.create(
            title="Doc 4",
            mime_type="text/plain",
            filename="doc4.txt",
            checksum="e" * 32,
            archive_filename="missing_archive.pdf",
            archive_checksum="f" * 32,
        ).pk
        # doc5: original present, archive_filename is None — archive_checksum must stay null
        (settings.ORIGINALS_DIR / "doc5.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc5_id = Document.objects.create(
            title="Doc 5",
            mime_type="text/plain",
            filename="doc5.txt",
            checksum="0" * 32,
            archive_filename=None,
            archive_checksum=None,
        ).pk

    def _fixture_teardown(self) -> None:
        """Work around SQLite not resetting autoincrement counters."""
        super()._fixture_teardown()
        # Django's SQLite backend returns [] from sequence_reset_sql(), so
        # reset_sequences=True flushes rows but never clears sqlite_sequence.
        # Explicitly delete the entry so subsequent tests start from pk=1.
        if connection.vendor == "sqlite":
            with connection.cursor() as cursor:
                cursor.execute(
                    "DELETE FROM sqlite_sequence WHERE name='documents_document'",
                )

    def tearDown(self) -> None:
        """Undo the settings override and remove the temp directories."""
        super().tearDown()
        self._settings_override.disable()
        shutil.rmtree(self._originals_dir, ignore_errors=True)
        shutil.rmtree(self._archive_dir, ignore_errors=True)

    def test_original_checksum_updated_to_sha256_when_file_exists(self) -> None:
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc1_id)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))

    def test_both_checksums_updated_when_original_and_archive_exist(self) -> None:
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc2_id)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
        self.assertEqual(doc.archive_checksum, _sha256(self.ARCHIVE_CONTENT))

    def test_checksum_unchanged_when_original_file_missing(self) -> None:
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc3_id)
        self.assertEqual(doc.checksum, "d" * 32)

    def test_archive_checksum_unchanged_when_archive_file_missing(self) -> None:
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc4_id)
        # Original was updated (file exists)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
        # Archive was not updated (file missing)
        self.assertEqual(doc.archive_checksum, "f" * 32)

    def test_archive_checksum_stays_null_when_no_archive_filename(self) -> None:
        Document = self.apps.get_model("documents", "Document")
        doc = Document.objects.get(pk=self.doc5_id)
        self.assertIsNone(doc.archive_checksum)

View File

@@ -1,4 +1,3 @@
import hashlib
import logging
import shutil
from os import utime
@@ -129,28 +128,3 @@ def get_boolean(boolstr: str) -> bool:
Return a boolean value from a string representation.
"""
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
    """
    Compute the SHA-256 checksum of a file.

    Reads the file in fixed-size chunks so arbitrarily large files can be
    hashed without loading them fully into memory.

    Args:
        path (Path): Path to the file to hash.
        chunk_size (int, optional): Number of bytes to read per chunk.
            Defaults to 65536.

    Returns:
        str: Hexadecimal SHA-256 digest of the file contents.

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If the file cannot be read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a b"" sentinel stops at EOF, when read() returns empty.
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()