mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-25 18:32:45 +00:00
Compare commits
1 Commits
feature-sh
...
fix/contex
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d3bd374e83 |
@@ -1,4 +1,5 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import tempfile
|
||||
from enum import StrEnum
|
||||
@@ -45,7 +46,6 @@ from documents.signals import document_consumption_started
|
||||
from documents.signals import document_updated
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
@@ -196,7 +196,9 @@ class ConsumerPlugin(
|
||||
version_doc = Document(
|
||||
root_document=root_doc_frozen,
|
||||
version_index=next_version_index + 1,
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
checksum=hashlib.md5(
|
||||
file_for_checksum.read_bytes(),
|
||||
).hexdigest(),
|
||||
content=text or "",
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
@@ -336,18 +338,15 @@ class ConsumerPlugin(
|
||||
Return the document object if it was successfully created.
|
||||
"""
|
||||
|
||||
tempdir = None
|
||||
# Preflight has already run including progress update to 0%
|
||||
self.log.info(f"Consuming {self.filename}")
|
||||
|
||||
try:
|
||||
# Preflight has already run including progress update to 0%
|
||||
self.log.info(f"Consuming {self.filename}")
|
||||
|
||||
# For the actual work, copy the file into a tempdir
|
||||
tempdir = tempfile.TemporaryDirectory(
|
||||
prefix="paperless-ngx",
|
||||
dir=settings.SCRATCH_DIR,
|
||||
)
|
||||
self.working_copy = Path(tempdir.name) / Path(self.filename)
|
||||
# For the actual work, copy the file into a tempdir
|
||||
with tempfile.TemporaryDirectory(
|
||||
prefix="paperless-ngx",
|
||||
dir=settings.SCRATCH_DIR,
|
||||
) as tmpdir:
|
||||
self.working_copy = Path(tmpdir) / Path(self.filename)
|
||||
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
|
||||
self.unmodified_original = None
|
||||
|
||||
@@ -379,7 +378,7 @@ class ConsumerPlugin(
|
||||
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
|
||||
# Save the original file for later
|
||||
self.unmodified_original = (
|
||||
Path(tempdir.name) / Path("uo") / Path(self.filename)
|
||||
Path(tmpdir) / Path("uo") / Path(self.filename)
|
||||
)
|
||||
self.unmodified_original.parent.mkdir(exist_ok=True)
|
||||
copy_file_with_basic_stats(
|
||||
@@ -398,7 +397,6 @@ class ConsumerPlugin(
|
||||
)
|
||||
)
|
||||
if not parser_class:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
|
||||
f"Unsupported mime type {mime_type}",
|
||||
@@ -413,278 +411,276 @@ class ConsumerPlugin(
|
||||
)
|
||||
|
||||
self.run_pre_consume_script()
|
||||
except:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
raise
|
||||
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
with parser_class() as document_parser:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
|
||||
self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
with parser_class() as document_parser:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
self.log.debug(
|
||||
f"Parser: {document_parser.name} v{document_parser.version}",
|
||||
)
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
90,
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
except ParseError as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
|
||||
# Prepare the document classifier.
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
self._send_progress(
|
||||
90,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(
|
||||
pk=self.metadata.actor_id,
|
||||
).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
except ParseError as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(
|
||||
pk=self.metadata.actor_id,
|
||||
).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
use_format=False,
|
||||
)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
document.archive_checksum = compute_checksum(document.archive_path)
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(
|
||||
f"Deleting original file {self.input_doc.original_file}",
|
||||
)
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
f"Deleting original file {self.input_doc.original_file}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
finally:
|
||||
tempdir.cleanup()
|
||||
|
||||
self.run_post_consume_script(document)
|
||||
|
||||
self.log.info(f"Document {document} consumption finished")
|
||||
@@ -780,7 +776,7 @@ class ConsumerPlugin(
|
||||
title=title[:127],
|
||||
content=text,
|
||||
mime_type=mime_type,
|
||||
checksum=compute_checksum(file_for_checksum),
|
||||
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
|
||||
created=create_date,
|
||||
modified=create_date,
|
||||
page_count=page_count,
|
||||
@@ -897,9 +893,10 @@ class ConsumerPreflightPlugin(
|
||||
|
||||
def pre_check_duplicate(self) -> None:
|
||||
"""
|
||||
Using the SHA256 of the file, check this exact file doesn't already exist
|
||||
Using the MD5 of the file, check this exact file doesn't already exist
|
||||
"""
|
||||
checksum = compute_checksum(Path(self.input_doc.original_file))
|
||||
with Path(self.input_doc.original_file).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
existing_doc = Document.global_objects.filter(
|
||||
Q(checksum=checksum) | Q(archive_checksum=checksum),
|
||||
)
|
||||
|
||||
@@ -56,7 +56,6 @@ from documents.models import WorkflowTrigger
|
||||
from documents.settings import EXPORTER_ARCHIVE_NAME
|
||||
from documents.settings import EXPORTER_FILE_NAME
|
||||
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
||||
from documents.utils import compute_checksum
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from paperless import version
|
||||
from paperless.models import ApplicationConfiguration
|
||||
@@ -694,7 +693,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
source_stat = source.stat()
|
||||
target_stat = target.stat()
|
||||
if self.compare_checksums and source_checksum:
|
||||
target_checksum = compute_checksum(target)
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
perform_copy = target_checksum != source_checksum
|
||||
elif (
|
||||
source_stat.st_mtime != target_stat.st_mtime
|
||||
|
||||
@@ -1,130 +0,0 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
logger = logging.getLogger("paperless.migrations")
|
||||
|
||||
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
|
||||
_BATCH_SIZE = 500 # documents per bulk_update call
|
||||
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
|
||||
|
||||
|
||||
def _sha256(path: Path) -> str:
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as fh:
|
||||
while chunk := fh.read(_CHUNK_SIZE):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def recompute_checksums(apps, schema_editor):
|
||||
"""Recompute all document checksums from MD5 to SHA256."""
|
||||
Document = apps.get_model("documents", "Document")
|
||||
|
||||
total = Document.objects.count()
|
||||
if total == 0:
|
||||
return
|
||||
|
||||
logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
|
||||
|
||||
batch: list = []
|
||||
processed = 0
|
||||
|
||||
for doc in Document.objects.only(
|
||||
"pk",
|
||||
"filename",
|
||||
"checksum",
|
||||
"archive_filename",
|
||||
"archive_checksum",
|
||||
).iterator(chunk_size=_BATCH_SIZE):
|
||||
updated_fields: list[str] = []
|
||||
|
||||
# Reconstruct source path the same way Document.source_path does
|
||||
fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
|
||||
source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
|
||||
|
||||
if source_path.exists():
|
||||
doc.checksum = _sha256(source_path)
|
||||
updated_fields.append("checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: original file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
source_path,
|
||||
)
|
||||
|
||||
# Mirror Document.has_archive_version: archive_filename is not None
|
||||
if doc.archive_filename is not None:
|
||||
archive_path = (
|
||||
settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
|
||||
).resolve()
|
||||
if archive_path.exists():
|
||||
doc.archive_checksum = _sha256(archive_path)
|
||||
updated_fields.append("archive_checksum")
|
||||
else:
|
||||
logger.warning(
|
||||
"Document %s: archive file %s not found, checksum not updated.",
|
||||
doc.pk,
|
||||
archive_path,
|
||||
)
|
||||
|
||||
if updated_fields:
|
||||
batch.append(doc)
|
||||
|
||||
processed += 1
|
||||
|
||||
if len(batch) >= _BATCH_SIZE:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
batch.clear()
|
||||
|
||||
if processed % _PROGRESS_INTERVAL == 0:
|
||||
logger.info(
|
||||
"SHA-256 checksum progress: %d/%d (%d%%)",
|
||||
processed,
|
||||
total,
|
||||
processed * 100 // total,
|
||||
)
|
||||
|
||||
if batch:
|
||||
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
|
||||
|
||||
logger.info(
|
||||
"SHA-256 checksum recomputation complete: %d document(s) processed.",
|
||||
total,
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("documents", "0015_document_version_index_and_more"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="checksum",
|
||||
field=models.CharField(
|
||||
editable=False,
|
||||
help_text="The checksum of the original document.",
|
||||
max_length=64,
|
||||
verbose_name="checksum",
|
||||
),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name="document",
|
||||
name="archive_checksum",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
editable=False,
|
||||
help_text="The checksum of the archived document.",
|
||||
max_length=64,
|
||||
null=True,
|
||||
verbose_name="archive checksum",
|
||||
),
|
||||
),
|
||||
migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
|
||||
]
|
||||
@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
|
||||
|
||||
checksum = models.CharField(
|
||||
_("checksum"),
|
||||
max_length=64,
|
||||
max_length=32,
|
||||
editable=False,
|
||||
help_text=_("The checksum of the original document."),
|
||||
)
|
||||
|
||||
archive_checksum = models.CharField(
|
||||
_("archive checksum"),
|
||||
max_length=64,
|
||||
max_length=32,
|
||||
editable=False,
|
||||
blank=True,
|
||||
null=True,
|
||||
|
||||
@@ -11,6 +11,7 @@ is an identity function that adds no overhead.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
@@ -29,7 +30,6 @@ from django.utils import timezone
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
from documents.utils import compute_checksum
|
||||
from paperless.config import GeneralConfig
|
||||
|
||||
logger = logging.getLogger("paperless.sanity_checker")
|
||||
@@ -218,7 +218,7 @@ def _check_original(
|
||||
|
||||
present_files.discard(source_path)
|
||||
try:
|
||||
checksum = compute_checksum(source_path)
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
||||
else:
|
||||
@@ -255,7 +255,7 @@ def _check_archive(
|
||||
|
||||
present_files.discard(archive_path)
|
||||
try:
|
||||
checksum = compute_checksum(archive_path)
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
import uuid
|
||||
@@ -60,7 +61,6 @@ from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.utils import compute_checksum
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
@@ -328,7 +328,8 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
checksum = compute_checksum(parser.get_archive_path())
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
|
||||
@@ -82,8 +82,8 @@ def sample_doc(
|
||||
|
||||
return DocumentFactory(
|
||||
title="test",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
content="test content",
|
||||
pk=1,
|
||||
filename="0000001.pdf",
|
||||
|
||||
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
|
||||
model = Document
|
||||
|
||||
title = factory.Faker("sentence", nb_words=4)
|
||||
checksum = factory.Faker("sha256")
|
||||
checksum = factory.Faker("md5")
|
||||
content = factory.Faker("paragraph")
|
||||
correspondent = None
|
||||
document_type = None
|
||||
|
||||
@@ -261,14 +261,8 @@ class TestConsumer(
|
||||
|
||||
self.assertIsFile(document.archive_path)
|
||||
|
||||
self.assertEqual(
|
||||
document.checksum,
|
||||
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
)
|
||||
self.assertEqual(
|
||||
document.archive_checksum,
|
||||
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
)
|
||||
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
|
||||
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
|
||||
|
||||
self.assertIsNotFile(filename)
|
||||
|
||||
|
||||
@@ -63,8 +63,8 @@ class TestExportImport(
|
||||
|
||||
self.d1 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -72,21 +72,21 @@ class TestExportImport(
|
||||
)
|
||||
self.d2 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
|
||||
checksum="9c9691e51741c1f4f41a20896af31770",
|
||||
title="wow2",
|
||||
filename="0000002.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d3 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
|
||||
checksum="d38d7ed02e988e072caf924e0f3fcb76",
|
||||
title="wow2",
|
||||
filename="0000003.pdf",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
self.d4 = Document.objects.create(
|
||||
content="Content",
|
||||
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
|
||||
checksum="82186aaa94f0b98697d704b90fd1c072",
|
||||
title="wow_dec",
|
||||
filename="0000004.pdf",
|
||||
mime_type="application/pdf",
|
||||
@@ -239,7 +239,7 @@ class TestExportImport(
|
||||
)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["checksum"])
|
||||
|
||||
# Generated field "content_length" should not be exported,
|
||||
@@ -253,7 +253,7 @@ class TestExportImport(
|
||||
self.assertIsFile(fname)
|
||||
|
||||
with Path(fname).open("rb") as f:
|
||||
checksum = hashlib.sha256(f.read()).hexdigest()
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element["fields"]["archive_checksum"])
|
||||
|
||||
elif element["model"] == "documents.note":
|
||||
|
||||
@@ -277,8 +277,8 @@ class TestCommandImport(
|
||||
|
||||
Document.objects.create(
|
||||
content="Content",
|
||||
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
|
||||
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
|
||||
checksum="42995833e01aea9b3edee44bbfdd7ce1",
|
||||
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
|
||||
title="wow1",
|
||||
filename="0000001.pdf",
|
||||
mime_type="application/pdf",
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
from os import utime
|
||||
@@ -129,28 +128,3 @@ def get_boolean(boolstr: str) -> bool:
|
||||
Return a boolean value from a string representation.
|
||||
"""
|
||||
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
|
||||
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
|
||||
"""
|
||||
Compute the SHA-256 checksum of a file.
|
||||
|
||||
Reads the file in chunks to avoid loading the entire file into memory.
|
||||
|
||||
Args:
|
||||
path (Path): Path to the file to hash.
|
||||
chunk_size (int, optional): Number of bytes to read per chunk.
|
||||
Defaults to 65536.
|
||||
|
||||
Returns:
|
||||
str: Hexadecimal SHA-256 digest of the file contents.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the file does not exist.
|
||||
OSError: If the file cannot be read.
|
||||
"""
|
||||
h = hashlib.sha256()
|
||||
with path.open("rb") as f:
|
||||
while chunk := f.read(chunk_size):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user