mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-08 02:01:22 +00:00
Compare commits
3 Commits
feature-mi
...
feature-sh
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3ae0b8e219 | ||
|
|
4a6fd02492 | ||
|
|
76fb8f3770 |
@@ -1,5 +1,4 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import tempfile
|
||||
from enum import StrEnum
|
||||
@@ -48,6 +47,7 @@ from documents.signals import document_consumption_started
|
||||
from documents.signals import document_updated
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
@@ -196,9 +196,7 @@ class ConsumerPlugin(
|
||||
version_doc = Document(
|
||||
root_document=root_doc_frozen,
|
||||
version_index=next_version_index + 1,
|
||||
checksum=hashlib.md5(
|
||||
file_for_checksum.read_bytes(),
|
||||
).hexdigest(),
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
content=text or "",
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
@@ -656,10 +654,9 @@ class ConsumerPlugin(
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
document.archive_checksum = compute_checksum(
|
||||
Path(archive_path),
|
||||
)
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
@@ -800,7 +797,7 @@ class ConsumerPlugin(
|
||||
title=title[:127],
|
||||
content=text,
|
||||
mime_type=mime_type,
|
||||
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
created=create_date,
|
||||
modified=create_date,
|
||||
page_count=page_count,
|
||||
@@ -917,10 +914,9 @@ class ConsumerPreflightPlugin(
|
||||
|
||||
def pre_check_duplicate(self) -> None:
|
||||
"""
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
Using the SHA256 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
with Path(self.input_doc.original_file).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
checksum = compute_checksum(Path(self.input_doc.original_file))
|
||||
existing_doc = Document.global_objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
|
||||
@@ -304,7 +304,7 @@ class PaperlessCommand(RichCommand):
|
||||
|
||||
Progress output is directed to stderr to match the convention that
|
||||
progress bars are transient UI feedback, not command output. This
|
||||
mirrors the convention that progress bars are transient UI feedback and prevents progress bar rendering
|
||||
mirrors tqdm's default behavior and prevents progress bar rendering
|
||||
from interfering with stdout-based assertions in tests or piped
|
||||
command output.
|
||||
|
||||
|
||||
@@ -17,7 +17,6 @@ class Command(PaperlessCommand):
|
||||
"modified) after their initial import."
|
||||
)
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = True
|
||||
|
||||
def add_arguments(self, parser):
|
||||
|
||||
@@ -8,6 +8,7 @@ from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import tqdm
|
||||
from allauth.mfa.models import Authenticator
|
||||
from allauth.socialaccount.models import SocialAccount
|
||||
from allauth.socialaccount.models import SocialApp
|
||||
@@ -18,6 +19,7 @@ from django.contrib.auth.models import Permission
|
||||
from django.contrib.auth.models import User
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.core import serializers
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.core.management.base import CommandError
|
||||
from django.core.serializers.json import DjangoJSONEncoder
|
||||
from django.db import transaction
|
||||
@@ -36,7 +38,6 @@ if settings.AUDIT_LOG_ENABLED:
|
||||
|
||||
from documents.file_handling import delete_empty_directories
|
||||
from documents.file_handling import generate_filename
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.management.commands.mixins import CryptMixin
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
@@ -57,6 +58,7 @@ from documents.models import WorkflowTrigger
|
||||
from documents.settings import EXPORTER_ARCHIVE_NAME
|
||||
from documents.settings import EXPORTER_FILE_NAME
|
||||
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from paperless import version
|
||||
from paperless.models import ApplicationConfiguration
|
||||
@@ -80,18 +82,14 @@ def serialize_queryset_batched(
|
||||
yield serializers.serialize("python", chunk)
|
||||
|
||||
|
||||
class Command(CryptMixin, PaperlessCommand):
|
||||
class Command(CryptMixin, BaseCommand):
|
||||
help = (
|
||||
"Decrypt and rename all files in our collection into a given target "
|
||||
"directory. And include a manifest file containing document data for "
|
||||
"easy import."
|
||||
)
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def add_arguments(self, parser) -> None:
|
||||
super().add_arguments(parser)
|
||||
parser.add_argument("target")
|
||||
|
||||
parser.add_argument(
|
||||
@@ -198,6 +196,13 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
help="If set, only the database will be imported, not files",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--no-progress-bar",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="If set, the progress bar will not be shown",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--passphrase",
|
||||
help="If provided, is used to encrypt sensitive data in the export",
|
||||
@@ -226,6 +231,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
self.no_thumbnail: bool = options["no_thumbnail"]
|
||||
self.zip_export: bool = options["zip"]
|
||||
self.data_only: bool = options["data_only"]
|
||||
self.no_progress_bar: bool = options["no_progress_bar"]
|
||||
self.passphrase: str | None = options.get("passphrase")
|
||||
self.batch_size: int = options["batch_size"]
|
||||
|
||||
@@ -342,12 +348,10 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
document_manifest = manifest_dict["documents"]
|
||||
|
||||
# 3. Export files from each document
|
||||
for index, document_dict in enumerate(
|
||||
self.track(
|
||||
document_manifest,
|
||||
description="Exporting documents...",
|
||||
total=len(document_manifest),
|
||||
),
|
||||
for index, document_dict in tqdm.tqdm(
|
||||
enumerate(document_manifest),
|
||||
total=len(document_manifest),
|
||||
disable=self.no_progress_bar,
|
||||
):
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
@@ -546,14 +550,14 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
if target in self.files_in_export_dir:
|
||||
self.files_in_export_dir.remove(target)
|
||||
if self.compare_json:
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
target_checksum = compute_checksum(target)
|
||||
src_str = json.dumps(
|
||||
content,
|
||||
cls=DjangoJSONEncoder,
|
||||
indent=2,
|
||||
ensure_ascii=False,
|
||||
)
|
||||
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
|
||||
src_checksum = hashlib.sha256(src_str.encode("utf-8")).hexdigest()
|
||||
if src_checksum == target_checksum:
|
||||
perform_write = False
|
||||
|
||||
@@ -589,7 +593,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
source_stat = source.stat()
|
||||
target_stat = target.stat()
|
||||
if self.compare_checksums and source_checksum:
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
target_checksum = compute_checksum(target)
|
||||
perform_copy = target_checksum != source_checksum
|
||||
elif (
|
||||
source_stat.st_mtime != target_stat.st_mtime
|
||||
|
||||
@@ -40,7 +40,6 @@ def _process_and_match(work: _WorkPackage) -> _WorkResult:
|
||||
class Command(PaperlessCommand):
|
||||
help = "Searches for documents where the content almost matches"
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = True
|
||||
|
||||
def add_arguments(self, parser):
|
||||
|
||||
@@ -8,12 +8,14 @@ from pathlib import Path
|
||||
from zipfile import ZipFile
|
||||
from zipfile import is_zipfile
|
||||
|
||||
import tqdm
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import Permission
|
||||
from django.contrib.auth.models import User
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.core.exceptions import FieldDoesNotExist
|
||||
from django.core.management import call_command
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.core.management.base import CommandError
|
||||
from django.core.serializers.base import DeserializationError
|
||||
from django.db import IntegrityError
|
||||
@@ -23,7 +25,6 @@ from django.db.models.signals import post_save
|
||||
from filelock import FileLock
|
||||
|
||||
from documents.file_handling import create_source_path_directory
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.management.commands.mixins import CryptMixin
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
@@ -56,19 +57,22 @@ def disable_signal(sig, receiver, sender, *, weak: bool | None = None) -> Genera
|
||||
sig.connect(receiver=receiver, sender=sender, **kwargs)
|
||||
|
||||
|
||||
class Command(CryptMixin, PaperlessCommand):
|
||||
class Command(CryptMixin, BaseCommand):
|
||||
help = (
|
||||
"Using a manifest.json file, load the data from there, and import the "
|
||||
"documents it refers to."
|
||||
)
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def add_arguments(self, parser) -> None:
|
||||
super().add_arguments(parser)
|
||||
parser.add_argument("source")
|
||||
|
||||
parser.add_argument(
|
||||
"--no-progress-bar",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="If set, the progress bar will not be shown",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--data-only",
|
||||
default=False,
|
||||
@@ -227,6 +231,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
|
||||
self.source = Path(options["source"]).resolve()
|
||||
self.data_only: bool = options["data_only"]
|
||||
self.no_progress_bar: bool = options["no_progress_bar"]
|
||||
self.passphrase: str | None = options.get("passphrase")
|
||||
self.version: str | None = None
|
||||
self.salt: str | None = None
|
||||
@@ -360,7 +365,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
filter(lambda r: r["model"] == "documents.document", self.manifest),
|
||||
)
|
||||
|
||||
for record in self.track(manifest_documents, description="Copying files..."):
|
||||
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
|
||||
document = Document.objects.get(pk=record["pk"])
|
||||
|
||||
doc_file = record[EXPORTER_FILE_NAME]
|
||||
|
||||
@@ -8,9 +8,6 @@ from documents.tasks import index_reindex
|
||||
class Command(PaperlessCommand):
|
||||
help = "Manages the document index."
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def add_arguments(self, parser):
|
||||
super().add_arguments(parser)
|
||||
parser.add_argument("command", choices=["reindex", "optimize"])
|
||||
|
||||
@@ -7,9 +7,6 @@ from documents.tasks import llmindex_index
|
||||
class Command(PaperlessCommand):
|
||||
help = "Manages the LLM-based vector index for Paperless."
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def add_arguments(self, parser: Any) -> None:
|
||||
super().add_arguments(parser)
|
||||
parser.add_argument("command", choices=["rebuild", "update"])
|
||||
|
||||
@@ -7,9 +7,6 @@ from documents.models import Document
|
||||
class Command(PaperlessCommand):
|
||||
help = "Rename all documents"
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def handle(self, *args, **options):
|
||||
for document in self.track(Document.objects.all(), description="Renaming..."):
|
||||
post_save.send(Document, instance=document, created=False)
|
||||
|
||||
@@ -180,9 +180,6 @@ class Command(PaperlessCommand):
|
||||
"modified) after their initial import."
|
||||
)
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def add_arguments(self, parser) -> None:
|
||||
super().add_arguments(parser)
|
||||
parser.add_argument("-c", "--correspondent", default=False, action="store_true")
|
||||
|
||||
@@ -24,9 +24,6 @@ _LEVEL_STYLE: dict[int, tuple[str, str]] = {
|
||||
class Command(PaperlessCommand):
|
||||
help = "This command checks your document archive for issues."
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def _render_results(self, messages: SanityCheckMessages) -> None:
|
||||
"""Render sanity check results as a Rich table."""
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@ def _process_document(doc_id: int) -> None:
|
||||
class Command(PaperlessCommand):
|
||||
help = "This will regenerate the thumbnails for all documents."
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = True
|
||||
|
||||
def add_arguments(self, parser) -> None:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import base64
|
||||
import os
|
||||
from argparse import ArgumentParser
|
||||
from typing import TypedDict
|
||||
|
||||
from cryptography.fernet import Fernet
|
||||
@@ -20,6 +21,25 @@ class CryptFields(TypedDict):
|
||||
fields: list[str]
|
||||
|
||||
|
||||
class ProgressBarMixin:
|
||||
"""
|
||||
Many commands use a progress bar, which can be disabled
|
||||
via this class
|
||||
"""
|
||||
|
||||
def add_argument_progress_bar_mixin(self, parser: ArgumentParser) -> None:
|
||||
parser.add_argument(
|
||||
"--no-progress-bar",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="If set, the progress bar will not be shown",
|
||||
)
|
||||
|
||||
def handle_progress_bar_mixin(self, *args, **options) -> None:
|
||||
self.no_progress_bar = options["no_progress_bar"]
|
||||
self.use_progress_bar = not self.no_progress_bar
|
||||
|
||||
|
||||
class CryptMixin:
|
||||
"""
|
||||
Fully based on:
|
||||
|
||||
@@ -9,9 +9,6 @@ class Command(PaperlessCommand):
|
||||
|
||||
help = "Prunes the audit logs of objects that no longer exist."
|
||||
|
||||
supports_progress_bar = True
|
||||
supports_multiprocessing = False
|
||||
|
||||
def handle(self, *args, **options):
|
||||
with transaction.atomic():
|
||||
for log_entry in self.track(
|
||||
|
||||
130
src/documents/migrations/0017_sha256_checksums.py
Normal file
130
src/documents/migrations/0017_sha256_checksums.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
logger = logging.getLogger("paperless.migrations")
|
||||
|
||||
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
|
||||
_BATCH_SIZE = 500 # documents per bulk_update call
|
||||
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
|
||||
|
||||
|
||||
def _sha256(path: Path) -> str:
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as fh:
|
||||
while chunk := fh.read(_CHUNK_SIZE):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def recompute_checksums(apps, schema_editor):
|
||||
"""Recompute all document checksums from MD5 to SHA256."""
|
||||
Document = apps.get_model("documents", "Document")
|
||||
|
||||
total = Document.objects.count()
|
||||
if total == 0:
|
||||
return
|
||||
|
||||
logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
|
||||
|
||||
batch: list = []
|
||||
processed = 0
|
||||
|
||||
for doc in Document.objects.only(
|
||||
"pk",
|
||||
"filename",
|
||||
"checksum",
|
||||
"archive_filename",
|
||||
"archive_checksum",
|
||||
).iterator(chunk_size=_BATCH_SIZE):
|
||||
updated_fields: list[str] = []
|
||||
|
||||
# Reconstruct source path the same way Document.source_path does
|
||||
fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
|
||||
source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
|
||||
|
||||
if source_path.exists():
|
||||
doc.checksum = _sha256(source_path)
|
||||
updated_fields.append("checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: original file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
source_path,
|
||||
)
|
||||
|
||||
# Mirror Document.has_archive_version: archive_filename is not None
|
||||
if doc.archive_filename is not None:
|
||||
archive_path = (
|
||||
settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
|
||||
).resolve()
|
||||
if archive_path.exists():
|
||||
doc.archive_checksum = _sha256(archive_path)
|
||||
updated_fields.append("archive_checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: archive file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
archive_path,
|
||||
)
|
||||
|
||||
if updated_fields:
|
||||
batch.append(doc)
|
||||
|
||||
processed += 1
|
||||
|
||||
if len(batch) >= _BATCH_SIZE:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
batch.clear()
|
||||
|
||||
if processed % _PROGRESS_INTERVAL == 0:
|
||||
logger.info(
|
||||
"SHA-256 checksum progress: %d/%d (%d%%)",
|
||||
processed,
|
||||
total,
|
||||
processed * 100 // total,
|
||||
)
|
||||
|
||||
if batch:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
|
||||
logger.info(
|
||||
"SHA-256 checksum recomputation complete: %d document(s) processed.",
|
||||
total,
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "0016_document_version_index_and_more"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="checksum",
|
||||
field=models.CharField(
|
||||
editable=False,
|
||||
help_text="The checksum of the original document.",
|
||||
max_length=64,
|
||||
verbose_name="checksum",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="archive_checksum",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
editable=False,
|
||||
help_text="The checksum of the archived document.",
|
||||
max_length=64,
|
||||
null=True,
|
||||
verbose_name="archive checksum",
|
||||
),
|
||||
),
|
||||
migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
|
||||
]
|
||||
@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
|
||||
checksum = models.CharField(
|
||||
_("checksum"),
|
||||
max_length=32,
|
||||
max_length=64,
|
||||
editable=False,
|
||||
help_text=_("The checksum of the original document."),
|
||||
)
|
||||
|
||||
archive_checksum = models.CharField(
|
||||
_("archive checksum"),
|
||||
max_length=32,
|
||||
max_length=64,
|
||||
editable=False,
|
||||
blank=True,
|
||||
null=True,
|
||||
|
||||
@@ -11,7 +11,6 @@ is an identity function that adds no overhead.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
@@ -30,6 +29,7 @@ from django.utils import timezone
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
from documents.utils import compute_checksum
|
||||
from paperless.config import GeneralConfig
|
||||
|
||||
logger = logging.getLogger("paperless.sanity_checker")
|
||||
@@ -218,7 +218,7 @@ def _check_original(
|
||||
|
||||
present_files.discard(source_path)
|
||||
try:
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
checksum = compute_checksum(source_path)
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
||||
else:
|
||||
@@ -255,7 +255,7 @@ def _check_archive(
|
||||
|
||||
present_files.discard(archive_path)
|
||||
try:
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
checksum = compute_checksum(archive_path)
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
import uuid
|
||||
@@ -63,6 +62,7 @@ from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.utils import compute_checksum
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
@@ -323,8 +323,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
checksum = compute_checksum(Path(parser.get_archive_path()))
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
|
||||
@@ -82,8 +82,8 @@ def sample_doc(
|
||||
|
||||
return DocumentFactory(
|
||||
title="test",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
content="test content",
|
||||
pk=1,
|
||||
filename="0000001.pdf",
|
||||
|
||||
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
|
||||
model = Document
|
||||
|
||||
title = factory.Faker("sentence", nb_words=4)
|
||||
checksum = factory.Faker("md5")
|
||||
checksum = factory.Faker("sha256")
|
||||
content = factory.Faker("paragraph")
|
||||
correspondent = None
|
||||
document_type = None
|
||||
|
||||
@@ -245,8 +245,14 @@ class TestConsumer(
|
||||
|
||||
self.assertIsFile(document.archive_path)
|
||||
|
||||
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
|
||||
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
|
||||
self.assertEqual(
|
||||
document.checksum,
|
||||
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
)
|
||||
self.assertEqual(
|
||||
document.archive_checksum,
|
||||
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
)
|
||||
|
||||
self.assertIsNotFile(filename)
|
||||
|
||||
|
||||
@@ -63,8 +63,8 @@ class TestExportImport(
|
||||
|
||||
self.d1 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -72,21 +72,21 @@ class TestExportImport(
|
||||
)
|
||||
self.d2 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="9c9691e51741c1f4f41a20896af31770",
|
||||
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
|
||||
title="wow2",
|
||||
filename="0000002.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d3 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="d38d7ed02e988e072caf924e0f3fcb76",
|
||||
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
|
||||
title="wow2",
|
||||
filename="0000003.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d4 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="82186aaa94f0b98697d704b90fd1c072",
|
||||
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
|
||||
title="wow_dec",
|
||||
filename="0000004.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -240,7 +240,7 @@ class TestExportImport(
|
||||
)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["checksum"])
|
||||
|
||||
# Generated field "content_length" should not be exported,
|
||||
@@ -254,7 +254,7 @@ class TestExportImport(
|
||||
self.assertIsFile(fname)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["archive_checksum"])
|
||||
|
||||
elif element["model"] == "documents.note":
|
||||
|
||||
@@ -260,8 +260,8 @@ class TestCommandImport(
|
||||
|
||||
Document.objects.create(
|
||||
content="Content",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
from os import utime
|
||||
@@ -128,3 +129,15 @@ def get_boolean(boolstr: str) -> bool:
|
||||
Return a boolean value from a string representation.
|
||||
"""
|
||||
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
|
||||
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
|
||||
"""
|
||||
Return the SHA256 hex digest of the file at *path*, reading in chunks
|
||||
of *chunk_size* bytes to avoid loading the entire file into memory.
|
||||
"""
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as f:
|
||||
while chunk := f.read(chunk_size):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user