Compare commits

..

5 Commits

Author SHA1 Message Date
Trenton H
6a4da4c46e Adds a docstring that an IDE will render better 2026-03-18 15:26:10 -07:00
Trenton H
741486df16 Handles the rename of the migration 2026-03-18 15:23:37 -07:00
Trenton H
cdbb118f1c Fixes logging so I can see it 2026-03-18 15:22:18 -07:00
Trenton H
cea5971ad8 Batch based iteration and bulk updates, with chunked file reading 2026-03-18 15:22:18 -07:00
Trenton H
156ee4e2ee Transitions to SHA256 based checksums 2026-03-18 15:22:17 -07:00
35 changed files with 454 additions and 737 deletions

View File

@@ -256,7 +256,7 @@ lint.isort.force-single-line = true
[tool.codespell]
write-changes = true
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"
[tool.pytest]
minversion = "9.0"

View File

@@ -1,5 +1,4 @@
import datetime
import hashlib
import os
import tempfile
from enum import StrEnum
@@ -48,12 +47,13 @@ from documents.signals import document_consumption_started
from documents.signals import document_updated
from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import compute_checksum
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_mail.parsers import MailDocumentParser
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -68,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
TODO(stumpylog): Remove me in the future
"""
if isinstance(parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser)):
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
parser.__exit__(None, None, None)
else:
parser.cleanup()
@@ -214,9 +214,7 @@ class ConsumerPlugin(
version_doc = Document(
root_document=root_doc_frozen,
version_index=next_version_index + 1,
checksum=hashlib.md5(
file_for_checksum.read_bytes(),
).hexdigest(),
checksum=compute_checksum(file_for_checksum),
content=text or "",
page_count=page_count,
mime_type=mime_type,
@@ -477,12 +475,14 @@ class ConsumerPlugin(
isinstance(document_parser, MailDocumentParser)
and self.input_doc.mailrule_id
):
document_parser.mailrule_id = self.input_doc.mailrule_id
if isinstance(
document_parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
document_parser.parse(
self.working_copy,
mime_type,
self.filename,
self.input_doc.mailrule_id,
)
elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
# TODO(stumpylog): Remove me in the future
document_parser.parse(self.working_copy, mime_type)
else:
document_parser.parse(self.working_copy, mime_type, self.filename)
@@ -494,11 +494,8 @@ class ConsumerPlugin(
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
if isinstance(
document_parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
# TODO(stumpylog): Remove me in the future
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
else:
thumbnail = document_parser.get_thumbnail(
@@ -688,10 +685,9 @@ class ConsumerPlugin(
document.archive_path,
)
with Path(archive_path).open("rb") as f:
document.archive_checksum = hashlib.md5(
f.read(),
).hexdigest()
document.archive_checksum = compute_checksum(
Path(archive_path),
)
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
@@ -832,7 +828,7 @@ class ConsumerPlugin(
title=title[:127],
content=text,
mime_type=mime_type,
checksum=hashlib.md5(file_for_checksum.read_bytes()).hexdigest(),
checksum=compute_checksum(file_for_checksum),
created=create_date,
modified=create_date,
page_count=page_count,
@@ -949,10 +945,9 @@ class ConsumerPreflightPlugin(
def pre_check_duplicate(self) -> None:
"""
Using the MD5 of the file, check this exact file doesn't already exist
Using the SHA256 of the file, check this exact file doesn't already exist
"""
with Path(self.input_doc.original_file).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = compute_checksum(Path(self.input_doc.original_file))
existing_doc = Document.global_objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)

View File

@@ -56,6 +56,7 @@ from documents.models import WorkflowTrigger
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import compute_checksum
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.models import ApplicationConfiguration
@@ -693,7 +694,7 @@ class Command(CryptMixin, PaperlessCommand):
source_stat = source.stat()
target_stat = target.stat()
if self.compare_checksums and source_checksum:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
target_checksum = compute_checksum(target)
perform_copy = target_checksum != source_checksum
elif (
source_stat.st_mtime != target_stat.st_mtime

View File

@@ -0,0 +1,130 @@
import hashlib
import logging
from pathlib import Path
from django.conf import settings
from django.db import migrations
from django.db import models
logger = logging.getLogger("paperless.migrations")
_CHUNK_SIZE = 65536 # 64 KiB — avoids loading entire files into memory
_BATCH_SIZE = 500 # documents per bulk_update call
_PROGRESS_INTERVAL = 500 # log a progress line every N documents
def _sha256(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as fh:
while chunk := fh.read(_CHUNK_SIZE):
h.update(chunk)
return h.hexdigest()
def recompute_checksums(apps, schema_editor):
"""Recompute all document checksums from MD5 to SHA256."""
Document = apps.get_model("documents", "Document")
total = Document.objects.count()
if total == 0:
return
logger.info("Recomputing SHA-256 checksums for %d document(s)...", total)
batch: list = []
processed = 0
for doc in Document.objects.only(
"pk",
"filename",
"checksum",
"archive_filename",
"archive_checksum",
).iterator(chunk_size=_BATCH_SIZE):
updated_fields: list[str] = []
# Reconstruct source path the same way Document.source_path does
fname = str(doc.filename) if doc.filename else f"{doc.pk:07}.pdf"
source_path = (settings.ORIGINALS_DIR / Path(fname)).resolve()
if source_path.exists():
doc.checksum = _sha256(source_path)
updated_fields.append("checksum")
else:
logger.warning(
"Document %s: original file %s not found, checksum not updated.",
doc.pk,
source_path,
)
# Mirror Document.has_archive_version: archive_filename is not None
if doc.archive_filename is not None:
archive_path = (
settings.ARCHIVE_DIR / Path(str(doc.archive_filename))
).resolve()
if archive_path.exists():
doc.archive_checksum = _sha256(archive_path)
updated_fields.append("archive_checksum")
else:
logger.warning(
"Document %s: archive file %s not found, checksum not updated.",
doc.pk,
archive_path,
)
if updated_fields:
batch.append(doc)
processed += 1
if len(batch) >= _BATCH_SIZE:
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
batch.clear()
if processed % _PROGRESS_INTERVAL == 0:
logger.info(
"SHA-256 checksum progress: %d/%d (%d%%)",
processed,
total,
processed * 100 // total,
)
if batch:
Document.objects.bulk_update(batch, ["checksum", "archive_checksum"])
logger.info(
"SHA-256 checksum recomputation complete: %d document(s) processed.",
total,
)
class Migration(migrations.Migration):
dependencies = [
("documents", "0015_document_version_index_and_more"),
]
operations = [
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=64,
verbose_name="checksum",
),
),
migrations.AlterField(
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=64,
null=True,
verbose_name="archive checksum",
),
),
migrations.RunPython(recompute_checksums, migrations.RunPython.noop),
]

View File

@@ -216,14 +216,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
checksum = models.CharField(
_("checksum"),
max_length=32,
max_length=64,
editable=False,
help_text=_("The checksum of the original document."),
)
archive_checksum = models.CharField(
_("archive checksum"),
max_length=32,
max_length=64,
editable=False,
blank=True,
null=True,

View File

@@ -11,7 +11,6 @@ is an identity function that adds no overhead.
from __future__ import annotations
import hashlib
import logging
import uuid
from collections import defaultdict
@@ -30,6 +29,7 @@ from django.utils import timezone
from documents.models import Document
from documents.models import PaperlessTask
from documents.utils import compute_checksum
from paperless.config import GeneralConfig
logger = logging.getLogger("paperless.sanity_checker")
@@ -218,7 +218,7 @@ def _check_original(
present_files.discard(source_path)
try:
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
checksum = compute_checksum(source_path)
except OSError as e:
messages.error(doc.pk, f"Cannot read original file of document: {e}")
else:
@@ -255,7 +255,7 @@ def _check_archive(
present_files.discard(archive_path)
try:
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
checksum = compute_checksum(archive_path)
except OSError as e:
messages.error(
doc.pk,

View File

@@ -1,5 +1,4 @@
import datetime
import hashlib
import logging
import shutil
import uuid
@@ -63,6 +62,7 @@ from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.utils import compute_checksum
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless_ai.indexing import llm_index_add_or_update_document
@@ -327,8 +327,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
if parser.get_archive_path():
with Path(parser.get_archive_path()).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = compute_checksum(Path(parser.get_archive_path()))
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling

View File

@@ -82,8 +82,8 @@ def sample_doc(
return DocumentFactory(
title="test",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
content="test content",
pk=1,
filename="0000001.pdf",

View File

@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
model = Document
title = factory.Faker("sentence", nb_words=4)
checksum = factory.Faker("md5")
checksum = factory.Faker("sha256")
content = factory.Faker("paragraph")
correspondent = None
document_type = None

View File

@@ -36,6 +36,7 @@ from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import GetConsumerMixin
from paperless_mail.models import MailRule
from paperless_mail.parsers import MailDocumentParser
class _BaseTestParser(DocumentParser):
@@ -244,8 +245,14 @@ class TestConsumer(
self.assertIsFile(document.archive_path)
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
self.assertEqual(
document.checksum,
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
)
self.assertEqual(
document.archive_checksum,
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
)
self.assertIsNotFile(filename)
@@ -1090,7 +1097,7 @@ class TestConsumer(
self.assertEqual(command[1], "--replace-input")
@mock.patch("paperless_mail.models.MailRule.objects.get")
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
@mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_mail_parser_receives_mailrule(
self,
@@ -1106,13 +1113,11 @@ class TestConsumer(
THEN:
- The mail parser should receive the mail rule
"""
from paperless_mail.signals import get_parser as mail_get_parser
mock_consumer_declaration_send.return_value = [
(
None,
{
"parser": mail_get_parser,
"parser": MailDocumentParser,
"mime_types": {"message/rfc822": ".eml"},
"weight": 0,
},
@@ -1124,10 +1129,9 @@ class TestConsumer(
with self.get_consumer(
filepath=(
Path(__file__).parent.parent.parent
/ Path("paperless")
/ Path("paperless_mail")
/ Path("tests")
/ Path("samples")
/ Path("mail")
).resolve()
/ "html.eml",
source=DocumentSource.MailFetch,
@@ -1138,10 +1142,12 @@ class TestConsumer(
ConsumerError,
):
consumer.run()
mock_mail_parser_parse.assert_called_once_with(
consumer.working_copy,
"message/rfc822",
)
mock_mail_parser_parse.assert_called_once_with(
consumer.working_copy,
"message/rfc822",
file_name="sample.pdf",
mailrule=mock_mailrule_get.return_value,
)
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)

View File

@@ -63,8 +63,8 @@ class TestExportImport(
self.d1 = Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
)
self.d2 = Document.objects.create(
content="Content",
checksum="9c9691e51741c1f4f41a20896af31770",
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
title="wow2",
filename="0000002.pdf",
mime_type="application/pdf",
)
self.d3 = Document.objects.create(
content="Content",
checksum="d38d7ed02e988e072caf924e0f3fcb76",
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
title="wow2",
filename="0000003.pdf",
mime_type="application/pdf",
)
self.d4 = Document.objects.create(
content="Content",
checksum="82186aaa94f0b98697d704b90fd1c072",
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
title="wow_dec",
filename="0000004.pdf",
mime_type="application/pdf",
@@ -239,7 +239,7 @@ class TestExportImport(
)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["checksum"])
# Generated field "content_length" should not be exported,
@@ -253,7 +253,7 @@ class TestExportImport(
self.assertIsFile(fname)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["archive_checksum"])
elif element["model"] == "documents.note":

View File

@@ -277,8 +277,8 @@ class TestCommandImport(
Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",

View File

@@ -1,3 +1,4 @@
import hashlib
import logging
import shutil
from os import utime
@@ -128,3 +129,28 @@ def get_boolean(boolstr: str) -> bool:
Return a boolean value from a string representation.
"""
return bool(boolstr.lower() in ("yes", "y", "1", "t", "true"))
def compute_checksum(path: Path, chunk_size: int = 65536) -> str:
"""
Compute the SHA-256 checksum of a file.
Reads the file in chunks to avoid loading the entire file into memory.
Args:
path (Path): Path to the file to hash.
chunk_size (int, optional): Number of bytes to read per chunk.
Defaults to 65536.
Returns:
str: Hexadecimal SHA-256 digest of the file contents.
Raises:
FileNotFoundError: If the file does not exist.
OSError: If the file cannot be read.
"""
h = hashlib.sha256()
with path.open("rb") as f:
while chunk := f.read(chunk_size):
h.update(chunk)
return h.hexdigest()

View File

@@ -193,13 +193,11 @@ class ParserRegistry:
that log output is predictable; scoring determines which parser wins
at runtime regardless of registration order.
"""
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
self.register_builtin(TextDocumentParser)
self.register_builtin(TikaDocumentParser)
self.register_builtin(MailDocumentParser)
# ------------------------------------------------------------------
# Discovery

View File

@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING
import pytest
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
@@ -159,166 +158,3 @@ def tika_parser() -> Generator[TikaDocumentParser, None, None]:
"""
with TikaDocumentParser() as parser:
yield parser
# ------------------------------------------------------------------
# Mail parser sample files
# ------------------------------------------------------------------
@pytest.fixture(scope="session")
def mail_samples_dir(samples_dir: Path) -> Path:
"""Absolute path to the mail parser sample files directory.
Returns
-------
Path
``<samples_dir>/mail/``
"""
return samples_dir / "mail"
@pytest.fixture(scope="session")
def broken_email_file(mail_samples_dir: Path) -> Path:
"""Path to a broken/malformed EML sample file.
Returns
-------
Path
Absolute path to ``mail/broken.eml``.
"""
return mail_samples_dir / "broken.eml"
@pytest.fixture(scope="session")
def simple_txt_email_file(mail_samples_dir: Path) -> Path:
"""Path to a plain-text email sample file.
Returns
-------
Path
Absolute path to ``mail/simple_text.eml``.
"""
return mail_samples_dir / "simple_text.eml"
@pytest.fixture(scope="session")
def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
"""Path to the expected PDF rendition of the plain-text email.
Returns
-------
Path
Absolute path to ``mail/simple_text.eml.pdf``.
"""
return mail_samples_dir / "simple_text.eml.pdf"
@pytest.fixture(scope="session")
def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
"""Path to the expected thumbnail for the plain-text email.
Returns
-------
Path
Absolute path to ``mail/simple_text.eml.pdf.webp``.
"""
return mail_samples_dir / "simple_text.eml.pdf.webp"
@pytest.fixture(scope="session")
def html_email_file(mail_samples_dir: Path) -> Path:
"""Path to an HTML email sample file.
Returns
-------
Path
Absolute path to ``mail/html.eml``.
"""
return mail_samples_dir / "html.eml"
@pytest.fixture(scope="session")
def html_email_pdf_file(mail_samples_dir: Path) -> Path:
"""Path to the expected PDF rendition of the HTML email.
Returns
-------
Path
Absolute path to ``mail/html.eml.pdf``.
"""
return mail_samples_dir / "html.eml.pdf"
@pytest.fixture(scope="session")
def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
"""Path to the expected thumbnail for the HTML email.
Returns
-------
Path
Absolute path to ``mail/html.eml.pdf.webp``.
"""
return mail_samples_dir / "html.eml.pdf.webp"
@pytest.fixture(scope="session")
def html_email_html_file(mail_samples_dir: Path) -> Path:
"""Path to the HTML body of the HTML email sample.
Returns
-------
Path
Absolute path to ``mail/html.eml.html``.
"""
return mail_samples_dir / "html.eml.html"
@pytest.fixture(scope="session")
def merged_pdf_first(mail_samples_dir: Path) -> Path:
"""Path to the first PDF used in PDF-merge tests.
Returns
-------
Path
Absolute path to ``mail/first.pdf``.
"""
return mail_samples_dir / "first.pdf"
@pytest.fixture(scope="session")
def merged_pdf_second(mail_samples_dir: Path) -> Path:
"""Path to the second PDF used in PDF-merge tests.
Returns
-------
Path
Absolute path to ``mail/second.pdf``.
"""
return mail_samples_dir / "second.pdf"
# ------------------------------------------------------------------
# Mail parser instance
# ------------------------------------------------------------------
@pytest.fixture()
def mail_parser() -> Generator[MailDocumentParser, None, None]:
"""Yield a MailDocumentParser and clean up its temporary directory afterwards.
Yields
------
MailDocumentParser
A ready-to-use parser instance.
"""
with MailDocumentParser() as parser:
yield parser
@pytest.fixture(scope="session")
def nginx_base_url() -> Generator[str, None, None]:
"""
The base URL for the nginx HTTP server we expect to be alive
"""
yield "http://localhost:8080"

View File

@@ -1,26 +1,6 @@
"""
Built-in mail document parser.
Handles message/rfc822 (EML) MIME type by:
- Parsing the email using imap_tools
- Generating a PDF via Gotenberg (for display and archive)
- Extracting text via Tika for HTML content
- Extracting metadata from email headers
The parser always produces a PDF because EML files cannot be rendered
natively in a browser (requires_pdf_rendition=True).
"""
from __future__ import annotations
import logging
import re
import shutil
import tempfile
from html import escape
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self
from bleach import clean
from bleach import linkify
@@ -39,353 +19,65 @@ from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.models import OutputTypeChoices
from paperless.version import __full_version_str__
from paperless_mail.models import MailRule
if TYPE_CHECKING:
import datetime
from types import TracebackType
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsing.mail")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"message/rfc822": ".eml",
}
class MailDocumentParser:
"""Parse .eml email files for Paperless-ngx.
Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
and sends the HTML part to a Tika server for text extraction. Because
EML files cannot be rendered natively in a browser, the parser always
produces a PDF rendition (requires_pdf_rendition=True).
The mailrule_id instance attribute may be set by the consumer before
calling parse() to apply mail-rule-specific PDF layout options:
parser.mailrule_id = rule.pk
parser.parse(path, mime_type)
Class attributes
----------------
name : str
Human-readable parser name.
version : str
Semantic version string, kept in sync with Paperless-ngx releases.
author : str
Maintainer name.
url : str
Issue tracker / source URL.
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates pdf using
Gotenberg and sends the html part to a Tika server for text extraction.
"""
name: str = "Paperless-ngx Mail Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
logging_name = "paperless.parsing.mail"
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
"""Return the MIME types this parser handles.
Returns
-------
dict[str, str]
Mapping of MIME type to preferred file extension.
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
"""
return _SUPPORTED_MIME_TYPES
@classmethod
def score(
cls,
mime_type: str,
filename: str,
path: Path | None = None,
) -> int | None:
"""Return the priority score for handling this file.
Parameters
----------
mime_type:
Detected MIME type of the file.
filename:
Original filename including extension.
path:
Optional filesystem path. Not inspected by this parser.
Returns
-------
int | None
20 if the MIME type is supported (higher than the default 10 to
give the mail parser clear priority), otherwise None.
Converts our requested PDF/A output into the Gotenberg API
format
"""
if mime_type in _SUPPORTED_MIME_TYPES:
return 20
if settings.OCR_OUTPUT_TYPE in {
OutputTypeChoices.PDF_A,
OutputTypeChoices.PDF_A2,
}:
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
self.log.warning(
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
)
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
return PdfAFormat.A3b
return None
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
@property
def can_produce_archive(self) -> bool:
"""Whether this parser can produce a searchable PDF archive copy.
Returns
-------
bool
Always False the mail parser produces a display PDF
(requires_pdf_rendition=True), not an optional OCR archive.
"""
return False
@property
def requires_pdf_rendition(self) -> bool:
"""Whether the parser must produce a PDF for the frontend to display.
Returns
-------
bool
Always True EML files cannot be rendered natively in a browser,
so a PDF conversion is always required for display.
"""
return True
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
self._tempdir = Path(
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
)
self._text: str | None = None
self._date: datetime.datetime | None = None
self._archive_path: Path | None = None
self.mailrule_id: int | None = None
def __enter__(self) -> Self:
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
logger.debug("Cleaning up temporary directory %s", self._tempdir)
shutil.rmtree(self._tempdir, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def parse(
self,
document_path: Path,
mime_type: str,
*,
produce_archive: bool = True,
) -> None:
"""Parse the given .eml into formatted text and a PDF archive.
The consumer may set ``self.mailrule_id`` before calling this method
to apply mail-rule-specific PDF layout options. The ``produce_archive``
flag is accepted for protocol compatibility but is always honoured
the mail parser always produces a PDF since EML files cannot be
displayed natively.
Parameters
----------
document_path:
Absolute path to the .eml file.
mime_type:
Detected MIME type of the document (should be "message/rfc822").
produce_archive:
Accepted for protocol compatibility. The PDF rendition is always
produced since EML files cannot be displayed natively in a browser.
Raises
------
documents.parsers.ParseError
If the file cannot be parsed or PDF generation fails.
"""
def strip_text(text: str) -> str:
"""Reduces the spacing of the given text string."""
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
def build_formatted_text(mail_message: MailMessage) -> str:
"""Constructs a formatted string based on the given email."""
fmt_text = f"Subject: {mail_message.subject}\n\n"
fmt_text += f"From: {mail_message.from_values.full}\n\n"
to_list = [address.full for address in mail_message.to_values]
fmt_text += f"To: {', '.join(to_list)}\n\n"
if mail_message.cc_values:
fmt_text += (
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
)
if mail_message.bcc_values:
fmt_text += (
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
)
if mail_message.attachments:
att = []
for a in mail.attachments:
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
att.append(
f"{a.filename} ({attachment_size})",
)
fmt_text += f"Attachments: {', '.join(att)}\n\n"
if mail.html:
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
fmt_text += f"\n\n{strip_text(mail.text)}"
return fmt_text
logger.debug("Parsing file %s into an email", document_path.name)
mail = self.parse_file_to_message(document_path)
logger.debug("Building formatted text from email")
self._text = build_formatted_text(mail)
if is_naive(mail.date):
self._date = make_aware(mail.date)
else:
self._date = mail.date
logger.debug("Creating a PDF from the email")
if self.mailrule_id:
rule = MailRule.objects.get(pk=self.mailrule_id)
self._archive_path = self.generate_pdf(mail, rule.pdf_layout)
else:
self._archive_path = self.generate_pdf(mail)
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if parse has not been called yet.
"""
return self._text
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
Returns
-------
datetime.datetime | None
Date from the email headers, or None if not detected.
"""
return self._date
def get_archive_path(self) -> Path | None:
"""Return the path to the generated archive PDF, or None.
Returns
-------
Path | None
Path to the PDF produced by Gotenberg, or None if parse has not
been called yet.
"""
return self._archive_path
# ------------------------------------------------------------------
# Thumbnail and metadata
# ------------------------------------------------------------------
def get_thumbnail(
self,
document_path: Path,
mime_type: str,
file_name: str | None = None,
file_name=None,
) -> Path:
"""Generate a thumbnail from the PDF rendition of the email.
Converts the document to PDF first if not already done.
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
file_name:
Kept for backward compatibility; not used.
Returns
-------
Path
Path to the generated WebP thumbnail inside the temporary directory.
"""
if not self._archive_path:
self._archive_path = self.generate_pdf(
if not self.archive_path:
self.archive_path = self.generate_pdf(
self.parse_file_to_message(document_path),
)
return make_thumbnail_from_pdf(
self._archive_path,
self._tempdir,
self.archive_path,
self.tempdir,
self.logging_group,
)
def get_page_count(
self,
document_path: Path,
mime_type: str,
) -> int | None:
"""Return the number of pages in the document.
Returns
-------
int | None
Always None page count is not available for email files.
"""
return None
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list[MetadataEntry]:
"""Extract metadata from the email headers.
Returns email headers as metadata entries with prefix "header",
plus summary entries for attachments and date.
Returns
-------
list[MetadataEntry]
Sorted list of metadata entries, or ``[]`` on parse failure.
"""
result: list[MetadataEntry] = []
def extract_metadata(self, document_path: Path, mime_type: str):
result = []
try:
mail = self.parse_file_to_message(document_path)
except ParseError as e:
logger.warning(
"Error while fetching document metadata for %s: %s",
document_path,
e,
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
)
return result
@@ -394,7 +86,7 @@ class MailDocumentParser:
try:
value.encode("utf-8")
except UnicodeEncodeError as e: # pragma: no cover
logger.debug("Skipping header %s: %s", key, e)
self.log.debug(f"Skipping header {key}: {e}")
continue
result.append(
@@ -431,44 +123,81 @@ class MailDocumentParser:
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
# ------------------------------------------------------------------
# Email-specific methods
# ------------------------------------------------------------------
def parse(
self,
document_path: Path,
mime_type: str,
file_name=None,
mailrule_id: int | None = None,
) -> None:
"""
Parses the given .eml into formatted text, based on the decoded email.
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
"""Convert the OCR output type setting to a Gotenberg PdfAFormat."""
if settings.OCR_OUTPUT_TYPE in {
OutputTypeChoices.PDF_A,
OutputTypeChoices.PDF_A2,
}:
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
logger.warning(
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
)
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
return PdfAFormat.A3b
return None
"""
def strip_text(text: str):
"""
Reduces the spacing of the given text string
"""
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
def build_formatted_text(mail_message: MailMessage) -> str:
"""
Constructs a formatted string, based on the given email. Basically tries
to get most of the email content, included front matter, into a nice string
"""
fmt_text = f"Subject: {mail_message.subject}\n\n"
fmt_text += f"From: {mail_message.from_values.full}\n\n"
to_list = [address.full for address in mail_message.to_values]
fmt_text += f"To: {', '.join(to_list)}\n\n"
if mail_message.cc_values:
fmt_text += (
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
)
if mail_message.bcc_values:
fmt_text += (
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
)
if mail_message.attachments:
att = []
for a in mail.attachments:
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
att.append(
f"{a.filename} ({attachment_size})",
)
fmt_text += f"Attachments: {', '.join(att)}\n\n"
if mail.html:
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
fmt_text += f"\n\n{strip_text(mail.text)}"
return fmt_text
self.log.debug(f"Parsing file {document_path.name} into an email")
mail = self.parse_file_to_message(document_path)
self.log.debug("Building formatted text from email")
self.text = build_formatted_text(mail)
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
self.log.debug("Creating a PDF from the email")
if mailrule_id:
rule = MailRule.objects.get(pk=mailrule_id)
self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
else:
self.archive_path = self.generate_pdf(mail)
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
"""Parse the given .eml file into a MailMessage object.
Parameters
----------
filepath:
Path to the .eml file.
Returns
-------
MailMessage
Parsed mail message.
Raises
------
documents.parsers.ParseError
If the file cannot be parsed or is missing required fields.
"""
Parses the given .eml file into a MailMessage object
"""
try:
with filepath.open("rb") as eml:
@@ -484,25 +213,8 @@ class MailDocumentParser:
return parsed
def tika_parse(self, html: str) -> str:
"""Send HTML content to the Tika server for text extraction.
Parameters
----------
html:
HTML string to parse.
Returns
-------
str
Extracted plain text.
Raises
------
documents.parsers.ParseError
If the Tika server cannot be reached or returns an error.
"""
logger.info("Sending content to Tika server")
def tika_parse(self, html: str):
self.log.info("Sending content to Tika server")
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
@@ -522,32 +234,16 @@ class MailDocumentParser:
mail_message: MailMessage,
pdf_layout: MailRule.PdfLayout | None = None,
) -> Path:
"""Generate a PDF from the email message.
Creates separate PDFs for the email body and HTML content, then
merges them according to the requested layout.
Parameters
----------
mail_message:
Parsed email message.
pdf_layout:
Layout option for the PDF. Falls back to the
EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
Returns
-------
Path
Path to the generated PDF inside the temporary directory.
"""
archive_path = Path(self._tempdir) / "merged.pdf"
archive_path = Path(self.tempdir) / "merged.pdf"
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
pdf_layout = pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
pdf_layout = (
pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
# If no HTML content, create the PDF from the message.
# Otherwise, create 2 PDFs and merge them with Gotenberg.
# If no HTML content, create the PDF from the message
# Otherwise, create 2 PDFs and merge them with Gotenberg
if not mail_message.html:
archive_path.write_bytes(mail_pdf_file.read_bytes())
else:
@@ -556,7 +252,7 @@ class MailDocumentParser:
mail_message.attachments,
)
logger.debug("Merging email text and HTML content into single PDF")
self.log.debug("Merging email text and HTML content into single PDF")
with (
GotenbergClient(
@@ -591,21 +287,15 @@ class MailDocumentParser:
return archive_path
def mail_to_html(self, mail: MailMessage) -> Path:
"""Convert the given email into an HTML file using a template.
Parameters
----------
mail:
Parsed mail message.
Returns
-------
Path
Path to the rendered HTML file inside the temporary directory.
"""
Converts the given email into an HTML file, formatted
based on the given template
"""
def clean_html(text: str) -> str:
"""Attempt to clean, escape, and linkify the given HTML string."""
"""
Attempts to clean, escape and linkify the given HTML string
"""
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if not isinstance(text, str):
@@ -650,37 +340,19 @@ class MailDocumentParser:
from django.template.loader import render_to_string
html_file = Path(self._tempdir) / "email_as_html.html"
html_file = Path(self.tempdir) / "email_as_html.html"
html_file.write_text(render_to_string("email_msg_template.html", context=data))
return html_file
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
"""Create a PDF from the email body using an HTML template and Gotenberg.
Parameters
----------
mail:
Parsed mail message.
Returns
-------
Path
Path to the generated PDF inside the temporary directory.
Raises
------
documents.parsers.ParseError
If Gotenberg returns an error.
"""
logger.info("Converting mail to PDF")
Creates a PDF based on the given email, using the email's values in a
an HTML template
"""
self.log.info("Converting mail to PDF")
css_file = (
Path(__file__).parent.parent.parent
/ "paperless_mail"
/ "templates"
/ "output.css"
)
css_file = Path(__file__).parent / "templates" / "output.css"
email_html_file = self.mail_to_html(mail)
with (
@@ -716,7 +388,7 @@ class MailDocumentParser:
f"Error while converting email to PDF: {err}",
) from err
email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
email_as_pdf_file.write_bytes(response.content)
return email_as_pdf_file
@@ -726,27 +398,11 @@ class MailDocumentParser:
orig_html: str,
attachments: list[MailAttachment],
) -> Path:
"""Generate a PDF from the HTML content of the email.
Parameters
----------
orig_html:
Raw HTML string from the email body.
attachments:
List of email attachments (used as inline resources).
Returns
-------
Path
Path to the generated PDF inside the temporary directory.
Raises
------
documents.parsers.ParseError
If Gotenberg returns an error.
"""
Generates a PDF file based on the HTML and attachments of the email
"""
def clean_html_script(text: str) -> str:
def clean_html_script(text: str):
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
text = compiled_open.sub("<div hidden ", text)
@@ -754,9 +410,9 @@ class MailDocumentParser:
text = compiled_close.sub("</div", text)
return text
logger.info("Converting message html to PDF")
self.log.info("Converting message html to PDF")
tempdir = Path(self._tempdir)
tempdir = Path(self.tempdir)
html_clean = clean_html_script(orig_html)
html_clean_file = tempdir / "index.html"
@@ -817,3 +473,9 @@ class MailDocumentParser:
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)
return html_pdf
def get_settings(self) -> None:
"""
This parser does not implement additional settings yet
"""
return None

View File

@@ -1,12 +1,7 @@
def get_parser(*args, **kwargs):
from paperless.parsers.mail import MailDocumentParser
from paperless_mail.parsers import MailDocumentParser
# MailDocumentParser accepts no constructor args in the new-style protocol.
# Pop legacy args that arrive from the signal-based consumer path.
# Phase 4 will replace this signal path with the ParserRegistry.
kwargs.pop("logging_group", None)
kwargs.pop("progress_callback", None)
return MailDocumentParser()
return MailDocumentParser(*args, **kwargs)
def mail_consumer_declaration(sender, **kwargs):

View File

@@ -1,9 +1,71 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount
from paperless_mail.parsers import MailDocumentParser
@pytest.fixture(scope="session")
def sample_dir() -> Path:
return (Path(__file__).parent / Path("samples")).resolve()
@pytest.fixture(scope="session")
def broken_email_file(sample_dir: Path) -> Path:
return sample_dir / "broken.eml"
@pytest.fixture(scope="session")
def simple_txt_email_file(sample_dir: Path) -> Path:
return sample_dir / "simple_text.eml"
@pytest.fixture(scope="session")
def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
return sample_dir / "simple_text.eml.pdf"
@pytest.fixture(scope="session")
def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
return sample_dir / "simple_text.eml.pdf.webp"
@pytest.fixture(scope="session")
def html_email_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml"
@pytest.fixture(scope="session")
def html_email_pdf_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.pdf"
@pytest.fixture(scope="session")
def html_email_thumbnail_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.pdf.webp"
@pytest.fixture(scope="session")
def html_email_html_file(sample_dir: Path) -> Path:
return sample_dir / "html.eml.html"
@pytest.fixture(scope="session")
def merged_pdf_first(sample_dir: Path) -> Path:
return sample_dir / "first.pdf"
@pytest.fixture(scope="session")
def merged_pdf_second(sample_dir: Path) -> Path:
return sample_dir / "second.pdf"
@pytest.fixture()
def mail_parser() -> MailDocumentParser:
return MailDocumentParser(logging_group=None)
@pytest.fixture()
@@ -27,3 +89,11 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
@pytest.fixture()
def mail_account_handler() -> MailAccountHandler:
return MailAccountHandler()
@pytest.fixture(scope="session")
def nginx_base_url() -> Generator[str, None, None]:
"""
The base URL for the nginx HTTP server we expect to be alive
"""
yield "http://localhost:8080"

View File

Before

Width:  |  Height:  |  Size: 6.0 KiB

After

Width:  |  Height:  |  Size: 6.0 KiB

View File

Before

Width:  |  Height:  |  Size: 2.8 KiB

After

Width:  |  Height:  |  Size: 2.8 KiB

View File

Before

Width:  |  Height:  |  Size: 6.9 KiB

After

Width:  |  Height:  |  Size: 6.9 KiB

View File

Before

Width:  |  Height:  |  Size: 5.2 KiB

After

Width:  |  Height:  |  Size: 5.2 KiB

View File

@@ -12,7 +12,7 @@ from pytest_httpx import HTTPXMock
from pytest_mock import MockerFixture
from documents.parsers import ParseError
from paperless.parsers.mail import MailDocumentParser
from paperless_mail.parsers import MailDocumentParser
class TestEmailFileParsing:
@@ -24,7 +24,7 @@ class TestEmailFileParsing:
def test_parse_error_missing_file(
self,
mail_parser: MailDocumentParser,
mail_samples_dir: Path,
sample_dir: Path,
) -> None:
"""
GIVEN:
@@ -35,7 +35,7 @@ class TestEmailFileParsing:
- An Exception is thrown
"""
# Check if exception is raised when parsing fails.
test_file = mail_samples_dir / "doesntexist.eml"
test_file = sample_dir / "doesntexist.eml"
assert not test_file.exists()
@@ -246,12 +246,12 @@ class TestEmailThumbnailGenerate:
"""
mocked_return = "Passing the return value through.."
mock_make_thumbnail_from_pdf = mocker.patch(
"paperless.parsers.mail.make_thumbnail_from_pdf",
"paperless_mail.parsers.make_thumbnail_from_pdf",
)
mock_make_thumbnail_from_pdf.return_value = mocked_return
mock_generate_pdf = mocker.patch(
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
mock_generate_pdf.return_value = "Mocked return value.."
@@ -260,7 +260,8 @@ class TestEmailThumbnailGenerate:
mock_generate_pdf.assert_called_once()
mock_make_thumbnail_from_pdf.assert_called_once_with(
"Mocked return value..",
mail_parser._tempdir,
mail_parser.tempdir,
None,
)
assert mocked_return == thumb
@@ -372,7 +373,7 @@ class TestParser:
"""
# Validate parsing returns the expected results
mock_generate_pdf = mocker.patch(
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
mail_parser.parse(simple_txt_email_file, "message/rfc822")
@@ -384,7 +385,7 @@ class TestParser:
"BCC: fdf@fvf.de\n\n"
"\n\nThis is just a simple Text Mail."
)
assert text_expected == mail_parser.get_text()
assert text_expected == mail_parser.text
assert (
datetime.datetime(
2022,
@@ -395,7 +396,7 @@ class TestParser:
43,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
)
== mail_parser.get_date()
== mail_parser.date
)
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
@@ -418,7 +419,7 @@ class TestParser:
"""
mock_generate_pdf = mocker.patch(
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
# Validate parsing returns the expected results
@@ -442,7 +443,7 @@ class TestParser:
mail_parser.parse(html_email_file, "message/rfc822")
mock_generate_pdf.assert_called_once()
assert text_expected == mail_parser.get_text()
assert text_expected == mail_parser.text
assert (
datetime.datetime(
2022,
@@ -453,7 +454,7 @@ class TestParser:
19,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
)
== mail_parser.get_date()
== mail_parser.date
)
def test_generate_pdf_parse_error(
@@ -500,7 +501,7 @@ class TestParser:
mail_parser.parse(simple_txt_email_file, "message/rfc822")
assert mail_parser.get_archive_path() is not None
assert mail_parser.archive_path is not None
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
def test_generate_pdf_html_email(
@@ -541,7 +542,7 @@ class TestParser:
)
mail_parser.parse(html_email_file, "message/rfc822")
assert mail_parser.get_archive_path() is not None
assert mail_parser.archive_path is not None
def test_generate_pdf_html_email_html_to_pdf_failure(
self,
@@ -711,10 +712,10 @@ class TestParser:
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
mail_parser.mailrule_id = 1
mail_parser.parse(
document_path=html_email_file,
mime_type="message/rfc822",
mailrule_id=1,
)
args, _ = mock_merge_route.call_args
assert len(args[0]) == expected_calls

View File

@@ -11,7 +11,7 @@ from PIL import Image
from pytest_mock import MockerFixture
from documents.tests.utils import util_call_with_backoff
from paperless.parsers.mail import MailDocumentParser
from paperless_mail.parsers import MailDocumentParser
def extract_text(pdf_path: Path) -> str:
@@ -159,7 +159,7 @@ class TestParserLive:
- The returned thumbnail image file shall match the expected hash
"""
mock_generate_pdf = mocker.patch(
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
mock_generate_pdf.return_value = simple_txt_email_pdf_file
@@ -216,10 +216,10 @@ class TestParserLive:
- The merged PDF shall contain text from both source PDFs
"""
mock_generate_pdf_from_html = mocker.patch(
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
)
mock_generate_pdf_from_mail = mocker.patch(
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
)
mock_generate_pdf_from_mail.return_value = merged_pdf_first
mock_generate_pdf_from_html.return_value = merged_pdf_second

22
uv.lock generated
View File

@@ -5643,7 +5643,7 @@ wheels = [
[[package]]
name = "zensical"
version = "0.0.26"
version = "0.0.25"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -5653,18 +5653,16 @@ dependencies = [
{ name = "pymdown-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d5/1f/0a0b1ce8e0553a9dabaedc736d0f34b11fc33d71ff46bce44d674996d41f/zensical-0.0.26.tar.gz", hash = "sha256:f4d9c8403df25fbb3d6dd9577122dc2f23c73a2d16ab778bb7d40370dd71e987", size = 3841473, upload-time = "2026-03-11T09:51:38.838Z" }
sdist = { url = "https://files.pythonhosted.org/packages/18/69/4b49ce778059b4888ea854cf4db40e1b2080fe828b7280198999048d6fce/zensical-0.0.25.tar.gz", hash = "sha256:462808359d949469fa7209d367f2e38ed796744074e5dadeac9ddfef0c44be25", size = 3841318, upload-time = "2026-03-10T19:32:35.048Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/41/58/fa3d9538ff1ea8cf4a193edbf47254f374fa7983fcfa876bb4336d72c53a/zensical-0.0.26-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:7823b25afe7d36099253aa59d643abaac940f80fd015d4a37954210c87d3da56", size = 12263607, upload-time = "2026-03-11T09:50:49.202Z" },
{ url = "https://files.pythonhosted.org/packages/5f/6e/44a3b21bd3569b9cad203364d73a956768d28a879e4c2be91bd889f74d2c/zensical-0.0.26-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:c0254814382cdd3769bc7689180d09bf41de8879871dd736dc52d5f141e8ada7", size = 12144562, upload-time = "2026-03-11T09:50:53.685Z" },
{ url = "https://files.pythonhosted.org/packages/07/ae/31b9885745b3e7ef23a3ae7f175b879807288d11b3fb7e2d3c119c916258/zensical-0.0.26-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c8e601b2bbd239e564b04cf235eefb9777e7dfc7e1857b8871d6cdcfb577aa0", size = 12506728, upload-time = "2026-03-11T09:50:57.775Z" },
{ url = "https://files.pythonhosted.org/packages/bd/93/f5291e2c47076474f181f6eef35ef0428117d3f192da4358c0511e2ce09e/zensical-0.0.26-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2dc43c7e6c25d9724fc0450f0273ca4e5e2506eeb7f89f52f1405a592896ca3b", size = 12454975, upload-time = "2026-03-11T09:51:01.514Z" },
{ url = "https://files.pythonhosted.org/packages/aa/2e/61cac4f2ebad31dab768eb02753ffde9e56d4d34b8f876b949bf516fbd50/zensical-0.0.26-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24ed236d1254cc474c19227eaa3670a1ccf921af53134ec5542b05853bdcd59c", size = 12791930, upload-time = "2026-03-11T09:51:05.162Z" },
{ url = "https://files.pythonhosted.org/packages/02/86/51995d1ed2dd6ad8a1a70bcdf3c5eb16b50e62ea70e638d454a6b9061c4d/zensical-0.0.26-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1110147710d1dd025d932c4a7eada836bdf079c91b70fb0ae5b202e14b094617", size = 12548166, upload-time = "2026-03-11T09:51:09.218Z" },
{ url = "https://files.pythonhosted.org/packages/3d/93/decbafdbfc77170cbc3851464632390846e9aaf45e743c8dd5a24d5673e9/zensical-0.0.26-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7d21596a785428cdebc20859bd94a05334abe14ad24f1bb9cd80d19219e3c220", size = 12682103, upload-time = "2026-03-11T09:51:12.68Z" },
{ url = "https://files.pythonhosted.org/packages/fb/e2/391d2d08dde621177da069a796a886b549fefb15734aeeb6e696af99b662/zensical-0.0.26-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:680a3c7bb71499b4da784d6072e44b3d7b8c0df3ce9bbd9974e24bd8058c2736", size = 12724219, upload-time = "2026-03-11T09:51:17.32Z" },
{ url = "https://files.pythonhosted.org/packages/80/2a/21b40c5c40a67da8a841f278d61dbd8d5e035e489de6fe1cef5f4e211b4f/zensical-0.0.26-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:e3294a79f98218b6fc2219232e166aa0932ae4dad58f6c8dbc0dbe0ecbff9c25", size = 12862117, upload-time = "2026-03-11T09:51:22.161Z" },
{ url = "https://files.pythonhosted.org/packages/51/76/e1910d6d75d207654c867b8efbda6822dedda9fed3601bf4a864a1f4fe26/zensical-0.0.26-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:630229587df1fb47be184a4a69d0772ce59a44cd2c481ae9f7e8852fffaff11e", size = 12815714, upload-time = "2026-03-11T09:51:26.24Z" },
{ url = "https://files.pythonhosted.org/packages/42/7c/f6f5eb1903b5a557d98f48d09e3d4bc33033ed78508986250dabe5387d91/zensical-0.0.25-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c481dd16a968f97d43f6b596e10e941d8294ed446b8b117235a6b149c0d6965", size = 12263809, upload-time = "2026-03-10T19:31:49.907Z" },
{ url = "https://files.pythonhosted.org/packages/37/b2/3f8be43526a68c52c84f099887d1903c2526a22aa4344378a72671bf6070/zensical-0.0.25-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ae51751e8b11f50df04641b40c1e07d4b703fed9d9548b16dbcb0cf260da229a", size = 12146107, upload-time = "2026-03-10T19:31:53.576Z" },
{ url = "https://files.pythonhosted.org/packages/16/59/89a3a715b1fe538b4b5ee382d71b86bd06d4f351383e36eefd36e824c150/zensical-0.0.25-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56ccf88245bd0b3684bf313384164972f1890802d4a51dd9b7ae6ea126a810bc", size = 12505963, upload-time = "2026-03-10T19:31:57.517Z" },
{ url = "https://files.pythonhosted.org/packages/ad/5b/cc0bada291818bdf36be777af9c16f655a021f16578a31e6fb233affca03/zensical-0.0.25-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2f4e58bcc06f3e50cc518666a0c9d8f82246255a42b37bb1d7c7343e214fbac", size = 12455496, upload-time = "2026-03-10T19:32:02.37Z" },
{ url = "https://files.pythonhosted.org/packages/e9/16/ff91ee42d8b14a1b63e2e0d74922e6c4b0ec1da3819377f20b7ca2742f76/zensical-0.0.25-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:69895273b1319a45667abac543c3e5065ff2a646d9a698eae056b6a35b57e00a", size = 12683609, upload-time = "2026-03-10T19:32:06.144Z" },
{ url = "https://files.pythonhosted.org/packages/01/fd/a85acc4234d31658f4bb54c4900edfc8d4227ad83e4c79de92cfdcd05c79/zensical-0.0.25-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c51a00ae1de2e9647bfd0ea1965b223fb3891111a00930416e1277e06f3ab3c4", size = 12725420, upload-time = "2026-03-10T19:32:09.938Z" },
{ url = "https://files.pythonhosted.org/packages/37/c7/896c91e457af3d5769d8d70d2bd66a8a287ad129879b51ab5e985ac68889/zensical-0.0.25-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:28e56ec1f06ea66227c1f5af9d7a6ed3bd4246e6af1e45d29e09f40251b52e1f", size = 12861970, upload-time = "2026-03-10T19:32:13.471Z" },
{ url = "https://files.pythonhosted.org/packages/41/06/5d804cf19e4e093394674d9f213546dc1364a34fd85d81a1153b05733c5a/zensical-0.0.25-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2d5997baad148b65eb0de6baf81973110538e01a3f64467d06d0c5ac23b0d70", size = 12816321, upload-time = "2026-03-10T19:32:17.031Z" },
]
[[package]]