Feat(parsers): migrate MailDocumentParser to ParserProtocol

Move the mail parser from paperless_mail/parsers.py to
paperless/parsers/mail.py and refactor it to implement ParserProtocol:

- Class-level name/version/author/url attributes
- supported_mime_types() and score() classmethods (score=20)
- can_produce_archive=False, requires_pdf_rendition=True
- Context manager lifecycle (__enter__/__exit__)
- New parse() signature without mailrule_id kwarg; consumer sets
  parser.mailrule_id before calling parse() instead
- get_text()/get_date()/get_archive_path() accessor methods
- extract_metadata() returning email headers and attachment info

Register MailDocumentParser in the ParserRegistry alongside Text and
Tika parsers. Update consumer, signals, and all import sites to use
the new location. Update tests to use the new accessor API, patch
paths, and context-manager fixture.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-18 14:41:26 -07:00
parent d107c8c531
commit 3236bbd0c5
8 changed files with 523 additions and 175 deletions
+13 -12
View File
@@ -51,9 +51,9 @@ from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_mail.parsers import MailDocumentParser
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -68,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
TODO(stumpylog): Remove me in the future
"""
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
if isinstance(parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser)):
parser.__exit__(None, None, None)
else:
parser.cleanup()
@@ -477,14 +477,12 @@ class ConsumerPlugin(
isinstance(document_parser, MailDocumentParser)
and self.input_doc.mailrule_id
):
document_parser.parse(
self.working_copy,
mime_type,
self.filename,
self.input_doc.mailrule_id,
)
elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
# TODO(stumpylog): Remove me in the future
document_parser.mailrule_id = self.input_doc.mailrule_id
if isinstance(
document_parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
document_parser.parse(self.working_copy, mime_type)
else:
document_parser.parse(self.working_copy, mime_type, self.filename)
@@ -496,8 +494,11 @@ class ConsumerPlugin(
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
# TODO(stumpylog): Remove me in the future
if isinstance(
document_parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
else:
thumbnail = document_parser.get_thumbnail(
+4 -3
View File
@@ -35,8 +35,8 @@ from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import GetConsumerMixin
from paperless.parsers.mail import MailDocumentParser
from paperless_mail.models import MailRule
from paperless_mail.parsers import MailDocumentParser
class _BaseTestParser(DocumentParser):
@@ -1091,7 +1091,7 @@ class TestConsumer(
self.assertEqual(command[1], "--replace-input")
@mock.patch("paperless_mail.models.MailRule.objects.get")
@mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_mail_parser_receives_mailrule(
self,
@@ -1123,9 +1123,10 @@ class TestConsumer(
with self.get_consumer(
filepath=(
Path(__file__).parent.parent.parent
/ Path("paperless_mail")
/ Path("paperless")
/ Path("tests")
/ Path("samples")
/ Path("mail")
).resolve()
/ "html.eml",
source=DocumentSource.MailFetch,
+475 -137
View File
@@ -1,6 +1,26 @@
"""
Built-in mail document parser.
Handles message/rfc822 (EML) MIME type by:
- Parsing the email using imap_tools
- Generating a PDF via Gotenberg (for display and archive)
- Extracting text via Tika for HTML content
- Extracting metadata from email headers
The parser always produces a PDF because EML files cannot be rendered
natively in a browser (requires_pdf_rendition=True).
"""
from __future__ import annotations
import logging
import re
import shutil
import tempfile
from html import escape
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self
from bleach import clean
from bleach import linkify
@@ -19,65 +39,353 @@ from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.models import OutputTypeChoices
from paperless.version import __full_version_str__
from paperless_mail.models import MailRule
if TYPE_CHECKING:
import datetime
from types import TracebackType
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates pdf using
Gotenberg and sends the html part to a Tika server for text extraction.
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsing.mail")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"message/rfc822": ".eml",
}
class MailDocumentParser:
"""Parse .eml email files for Paperless-ngx.
Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
and sends the HTML part to a Tika server for text extraction. Because
EML files cannot be rendered natively in a browser, the parser always
produces a PDF rendition (requires_pdf_rendition=True).
The mailrule_id instance attribute may be set by the consumer before
calling parse() to apply mail-rule-specific PDF layout options:
parser.mailrule_id = rule.pk
parser.parse(path, mime_type)
Class attributes
----------------
name : str
Human-readable parser name.
version : str
Semantic version string, kept in sync with Paperless-ngx releases.
author : str
Maintainer name.
url : str
Issue tracker / source URL.
"""
logging_name = "paperless.parsing.mail"
name: str = "Paperless-ngx Mail Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
"""Return the MIME types this parser handles.
Returns
-------
dict[str, str]
Mapping of MIME type to preferred file extension.
"""
Converts our requested PDF/A output into the Gotenberg API
format
return _SUPPORTED_MIME_TYPES
@classmethod
def score(
cls,
mime_type: str,
filename: str,
path: Path | None = None,
) -> int | None:
"""Return the priority score for handling this file.
Parameters
----------
mime_type:
Detected MIME type of the file.
filename:
Original filename including extension.
path:
Optional filesystem path. Not inspected by this parser.
Returns
-------
int | None
20 if the MIME type is supported (higher than the default 10 to
give the mail parser clear priority), otherwise None.
"""
if settings.OCR_OUTPUT_TYPE in {
OutputTypeChoices.PDF_A,
OutputTypeChoices.PDF_A2,
}:
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
self.log.warning(
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
)
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
return PdfAFormat.A3b
if mime_type in _SUPPORTED_MIME_TYPES:
return 20
return None
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
@property
def can_produce_archive(self) -> bool:
    """Report whether a searchable archive copy can be produced.

    Returns
    -------
    bool
        ``False`` unconditionally. The PDF this parser generates is the
        display rendition (see ``requires_pdf_rendition``), not an
        optional OCR archive.
    """
    return False
@property
def requires_pdf_rendition(self) -> bool:
    """Report whether a PDF must be generated for the frontend.

    Returns
    -------
    bool
        ``True`` unconditionally. A browser cannot render an EML file
        natively, so displaying the document always needs a PDF
        conversion.
    """
    return True
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
    """Create the per-instance scratch directory and reset parse results.

    Parameters
    ----------
    logging_group:
        Accepted but not read anywhere in this method.
    """
    scratch_root = settings.SCRATCH_DIR
    # The scratch root has to exist before mkdtemp can create a
    # sub-directory inside it.
    scratch_root.mkdir(parents=True, exist_ok=True)
    self._tempdir = Path(tempfile.mkdtemp(prefix="paperless-", dir=scratch_root))

    # Results filled in by parse(); None until then.
    self._text: str | None = None
    self._date: datetime.datetime | None = None
    self._archive_path: Path | None = None

    # Optional mail rule primary key, assigned by the caller before parse().
    self.mailrule_id: int | None = None
def __enter__(self) -> Self:
return self
def __exit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: TracebackType | None,
) -> None:
    """Remove the parser's temporary directory on context exit.

    The exception arguments are not inspected. The method returns None,
    so an exception raised inside the ``with`` body is not suppressed.
    Errors during directory removal are ignored.
    """
    scratch = self._tempdir
    logger.debug("Cleaning up temporary directory %s", scratch)
    shutil.rmtree(scratch, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def parse(
    self,
    document_path: Path,
    mime_type: str,
    *,
    produce_archive: bool = True,
) -> None:
    """Parse the given .eml into formatted text and a PDF rendition.

    The consumer may set ``self.mailrule_id`` before calling this method
    to apply mail-rule-specific PDF layout options. On return the
    extracted text, the (timezone-aware) date, and the path of the
    generated PDF are stored on the instance.

    Parameters
    ----------
    document_path:
        Absolute path to the .eml file.
    mime_type:
        Detected MIME type of the document (should be "message/rfc822").
        Not read by this method.
    produce_archive:
        Accepted for protocol compatibility only; the value is ignored.
        The PDF rendition is always produced since EML files cannot be
        displayed natively in a browser.

    Raises
    ------
    documents.parsers.ParseError
        If the file cannot be parsed or PDF generation fails.
    """

    def strip_text(text: str) -> str:
        """Reduce the spacing of the given text string."""
        # NOTE(review): the first substitution already turns newlines into
        # spaces, so the second one looks like a no-op; it is kept so the
        # produced text stays byte-identical.
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"(\n *)+", "\n", text)
        return text.strip()

    def build_formatted_text(mail_message: MailMessage) -> str:
        """Construct a formatted string based on the given email.

        Only ``mail_message`` is read here; the helper does not depend on
        any variable of the enclosing scope other than ``self`` and
        ``strip_text``.
        """
        recipients = ", ".join(address.full for address in mail_message.to_values)
        parts = [
            f"Subject: {mail_message.subject}\n\n",
            f"From: {mail_message.from_values.full}\n\n",
            f"To: {recipients}\n\n",
        ]
        if mail_message.cc_values:
            cc_list = ", ".join(address.full for address in mail_message.cc_values)
            parts.append(f"CC: {cc_list}\n\n")
        if mail_message.bcc_values:
            bcc_list = ", ".join(
                address.full for address in mail_message.bcc_values
            )
            parts.append(f"BCC: {bcc_list}\n\n")
        if mail_message.attachments:
            described = [
                f"{a.filename} "
                f"({naturalsize(a.size, binary=True, format='%.2f')})"
                for a in mail_message.attachments
            ]
            parts.append(f"Attachments: {', '.join(described)}\n\n")
        if mail_message.html:
            # The HTML body is converted to plain text by the Tika server.
            parts.append(
                "HTML content: " + strip_text(self.tika_parse(mail_message.html)),
            )
        parts.append(f"\n\n{strip_text(mail_message.text)}")
        return "".join(parts)

    logger.debug("Parsing file %s into an email", document_path.name)
    mail = self.parse_file_to_message(document_path)

    logger.debug("Building formatted text from email")
    self._text = build_formatted_text(mail)

    # Store an aware datetime regardless of what the mail header carried.
    self._date = make_aware(mail.date) if is_naive(mail.date) else mail.date

    logger.debug("Creating a PDF from the email")
    if self.mailrule_id:
        # A mail rule was supplied by the consumer: use its PDF layout.
        rule = MailRule.objects.get(pk=self.mailrule_id)
        self._archive_path = self.generate_pdf(mail, rule.pdf_layout)
    else:
        self._archive_path = self.generate_pdf(mail)
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
    """Return the text stored by the most recent ``parse`` call.

    Returns
    -------
    str | None
        The formatted text, or None while nothing has been parsed yet.
    """
    return self._text
def get_date(self) -> datetime.datetime | None:
    """Return the date stored by the most recent ``parse`` call.

    Returns
    -------
    datetime.datetime | None
        The email's date, or None if no date has been stored.
    """
    return self._date
def get_archive_path(self) -> Path | None:
    """Return the location of the PDF generated for this email.

    Returns
    -------
    Path | None
        Path of the generated PDF, or None while no PDF has been
        generated yet.
    """
    return self._archive_path
# ------------------------------------------------------------------
# Thumbnail and metadata
# ------------------------------------------------------------------
def get_thumbnail(
self,
document_path: Path,
mime_type: str,
file_name=None,
file_name: str | None = None,
) -> Path:
if not self.archive_path:
self.archive_path = self.generate_pdf(
"""Generate a thumbnail from the PDF rendition of the email.
Converts the document to PDF first if not already done.
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
file_name:
Kept for backward compatibility; not used.
Returns
-------
Path
Path to the generated WebP thumbnail inside the temporary directory.
"""
if not self._archive_path:
self._archive_path = self.generate_pdf(
self.parse_file_to_message(document_path),
)
return make_thumbnail_from_pdf(
self.archive_path,
self.tempdir,
self.logging_group,
self._archive_path,
self._tempdir,
)
def extract_metadata(self, document_path: Path, mime_type: str):
result = []
def get_page_count(
    self,
    document_path: Path,
    mime_type: str,
) -> int | None:
    """Return the page count of the document.

    Neither argument is inspected.

    Returns
    -------
    int | None
        None in every case; no page count is reported for email files.
    """
    return None
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list[MetadataEntry]:
"""Extract metadata from the email headers.
Returns email headers as metadata entries with prefix "header",
plus summary entries for attachments and date.
Returns
-------
list[MetadataEntry]
Sorted list of metadata entries, or ``[]`` on parse failure.
"""
result: list[MetadataEntry] = []
try:
mail = self.parse_file_to_message(document_path)
except ParseError as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
logger.warning(
"Error while fetching document metadata for %s: %s",
document_path,
e,
)
return result
@@ -86,7 +394,7 @@ class MailDocumentParser(DocumentParser):
try:
value.encode("utf-8")
except UnicodeEncodeError as e: # pragma: no cover
self.log.debug(f"Skipping header {key}: {e}")
logger.debug("Skipping header %s: %s", key, e)
continue
result.append(
@@ -123,81 +431,44 @@ class MailDocumentParser(DocumentParser):
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
def parse(
self,
document_path: Path,
mime_type: str,
file_name=None,
mailrule_id: int | None = None,
) -> None:
"""
Parses the given .eml into formatted text, based on the decoded email.
# ------------------------------------------------------------------
# Email-specific methods
# ------------------------------------------------------------------
"""
def strip_text(text: str):
"""
Reduces the spacing of the given text string
"""
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
def build_formatted_text(mail_message: MailMessage) -> str:
"""
Constructs a formatted string, based on the given email. Basically tries
to get most of the email content, included front matter, into a nice string
"""
fmt_text = f"Subject: {mail_message.subject}\n\n"
fmt_text += f"From: {mail_message.from_values.full}\n\n"
to_list = [address.full for address in mail_message.to_values]
fmt_text += f"To: {', '.join(to_list)}\n\n"
if mail_message.cc_values:
fmt_text += (
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
)
if mail_message.bcc_values:
fmt_text += (
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
)
if mail_message.attachments:
att = []
for a in mail.attachments:
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
att.append(
f"{a.filename} ({attachment_size})",
)
fmt_text += f"Attachments: {', '.join(att)}\n\n"
if mail.html:
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
fmt_text += f"\n\n{strip_text(mail.text)}"
return fmt_text
self.log.debug(f"Parsing file {document_path.name} into an email")
mail = self.parse_file_to_message(document_path)
self.log.debug("Building formatted text from email")
self.text = build_formatted_text(mail)
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
self.log.debug("Creating a PDF from the email")
if mailrule_id:
rule = MailRule.objects.get(pk=mailrule_id)
self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
else:
self.archive_path = self.generate_pdf(mail)
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
    """Map the configured OCR output type onto a Gotenberg PDF/A format.

    Returns
    -------
    PdfAFormat | None
        ``A2b`` for the PDF/A, PDF/A-2 and PDF/A-1 settings (the latter
        with a logged warning), ``A3b`` for PDF/A-3, and None for any
        other output type.
    """
    requested = settings.OCR_OUTPUT_TYPE

    if requested in {OutputTypeChoices.PDF_A, OutputTypeChoices.PDF_A2}:
        return PdfAFormat.A2b

    if requested == OutputTypeChoices.PDF_A1:  # pragma: no cover
        # PDF/A-1 is downgraded to 2b; the warning tells the operator why.
        logger.warning(
            "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
        )
        return PdfAFormat.A2b

    if requested == OutputTypeChoices.PDF_A3:  # pragma: no cover
        return PdfAFormat.A3b

    return None
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
"""
Parses the given .eml file into a MailMessage object
"""Parse the given .eml file into a MailMessage object.
Parameters
----------
filepath:
Path to the .eml file.
Returns
-------
MailMessage
Parsed mail message.
Raises
------
documents.parsers.ParseError
If the file cannot be parsed or is missing required fields.
"""
try:
with filepath.open("rb") as eml:
@@ -213,8 +484,25 @@ class MailDocumentParser(DocumentParser):
return parsed
def tika_parse(self, html: str):
self.log.info("Sending content to Tika server")
def tika_parse(self, html: str) -> str:
"""Send HTML content to the Tika server for text extraction.
Parameters
----------
html:
HTML string to parse.
Returns
-------
str
Extracted plain text.
Raises
------
documents.parsers.ParseError
If the Tika server cannot be reached or returns an error.
"""
logger.info("Sending content to Tika server")
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
@@ -234,16 +522,32 @@ class MailDocumentParser(DocumentParser):
mail_message: MailMessage,
pdf_layout: MailRule.PdfLayout | None = None,
) -> Path:
archive_path = Path(self.tempdir) / "merged.pdf"
"""Generate a PDF from the email message.
Creates separate PDFs for the email body and HTML content, then
merges them according to the requested layout.
Parameters
----------
mail_message:
Parsed email message.
pdf_layout:
Layout option for the PDF. Falls back to the
EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
Returns
-------
Path
Path to the generated PDF inside the temporary directory.
"""
archive_path = Path(self._tempdir) / "merged.pdf"
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
pdf_layout = (
pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
pdf_layout = pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
# If no HTML content, create the PDF from the message
# Otherwise, create 2 PDFs and merge them with Gotenberg
# If no HTML content, create the PDF from the message.
# Otherwise, create 2 PDFs and merge them with Gotenberg.
if not mail_message.html:
archive_path.write_bytes(mail_pdf_file.read_bytes())
else:
@@ -252,7 +556,7 @@ class MailDocumentParser(DocumentParser):
mail_message.attachments,
)
self.log.debug("Merging email text and HTML content into single PDF")
logger.debug("Merging email text and HTML content into single PDF")
with (
GotenbergClient(
@@ -287,15 +591,21 @@ class MailDocumentParser(DocumentParser):
return archive_path
def mail_to_html(self, mail: MailMessage) -> Path:
"""
Converts the given email into an HTML file, formatted
based on the given template
"""Convert the given email into an HTML file using a template.
Parameters
----------
mail:
Parsed mail message.
Returns
-------
Path
Path to the rendered HTML file inside the temporary directory.
"""
def clean_html(text: str) -> str:
"""
Attempts to clean, escape and linkify the given HTML string
"""
"""Attempt to clean, escape, and linkify the given HTML string."""
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if not isinstance(text, str):
@@ -340,19 +650,37 @@ class MailDocumentParser(DocumentParser):
from django.template.loader import render_to_string
html_file = Path(self.tempdir) / "email_as_html.html"
html_file = Path(self._tempdir) / "email_as_html.html"
html_file.write_text(render_to_string("email_msg_template.html", context=data))
return html_file
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
"""
Creates a PDF based on the given email, using the email's values in a
an HTML template
"""
self.log.info("Converting mail to PDF")
"""Create a PDF from the email body using an HTML template and Gotenberg.
css_file = Path(__file__).parent / "templates" / "output.css"
Parameters
----------
mail:
Parsed mail message.
Returns
-------
Path
Path to the generated PDF inside the temporary directory.
Raises
------
documents.parsers.ParseError
If Gotenberg returns an error.
"""
logger.info("Converting mail to PDF")
css_file = (
Path(__file__).parent.parent.parent
/ "paperless_mail"
/ "templates"
/ "output.css"
)
email_html_file = self.mail_to_html(mail)
with (
@@ -388,7 +716,7 @@ class MailDocumentParser(DocumentParser):
f"Error while converting email to PDF: {err}",
) from err
email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
email_as_pdf_file.write_bytes(response.content)
return email_as_pdf_file
@@ -398,11 +726,27 @@ class MailDocumentParser(DocumentParser):
orig_html: str,
attachments: list[MailAttachment],
) -> Path:
"""
Generates a PDF file based on the HTML and attachments of the email
"""Generate a PDF from the HTML content of the email.
Parameters
----------
orig_html:
Raw HTML string from the email body.
attachments:
List of email attachments (used as inline resources).
Returns
-------
Path
Path to the generated PDF inside the temporary directory.
Raises
------
documents.parsers.ParseError
If Gotenberg returns an error.
"""
def clean_html_script(text: str):
def clean_html_script(text: str) -> str:
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
text = compiled_open.sub("<div hidden ", text)
@@ -410,9 +754,9 @@ class MailDocumentParser(DocumentParser):
text = compiled_close.sub("</div", text)
return text
self.log.info("Converting message html to PDF")
logger.info("Converting message html to PDF")
tempdir = Path(self.tempdir)
tempdir = Path(self._tempdir)
html_clean = clean_html_script(orig_html)
html_clean_file = tempdir / "index.html"
@@ -473,9 +817,3 @@ class MailDocumentParser(DocumentParser):
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)
return html_pdf
def get_settings(self) -> None:
"""
This parser does not implement additional settings yet
"""
return None
+2
View File
@@ -193,11 +193,13 @@ class ParserRegistry:
that log output is predictable; scoring determines which parser wins
at runtime regardless of registration order.
"""
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
self.register_builtin(TextDocumentParser)
self.register_builtin(TikaDocumentParser)
self.register_builtin(MailDocumentParser)
# ------------------------------------------------------------------
# Discovery
+14 -15
View File
@@ -24,7 +24,7 @@ class TestEmailFileParsing:
def test_parse_error_missing_file(
self,
mail_parser: MailDocumentParser,
sample_dir: Path,
mail_samples_dir: Path,
) -> None:
"""
GIVEN:
@@ -35,7 +35,7 @@ class TestEmailFileParsing:
- An Exception is thrown
"""
# Check if exception is raised when parsing fails.
test_file = sample_dir / "doesntexist.eml"
test_file = mail_samples_dir / "doesntexist.eml"
assert not test_file.exists()
@@ -246,12 +246,12 @@ class TestEmailThumbnailGenerate:
"""
mocked_return = "Passing the return value through.."
mock_make_thumbnail_from_pdf = mocker.patch(
"paperless_mail.parsers.make_thumbnail_from_pdf",
"paperless.parsers.mail.make_thumbnail_from_pdf",
)
mock_make_thumbnail_from_pdf.return_value = mocked_return
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
)
mock_generate_pdf.return_value = "Mocked return value.."
@@ -260,8 +260,7 @@ class TestEmailThumbnailGenerate:
mock_generate_pdf.assert_called_once()
mock_make_thumbnail_from_pdf.assert_called_once_with(
"Mocked return value..",
mail_parser.tempdir,
None,
mail_parser._tempdir,
)
assert mocked_return == thumb
@@ -373,7 +372,7 @@ class TestParser:
"""
# Validate parsing returns the expected results
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
)
mail_parser.parse(simple_txt_email_file, "message/rfc822")
@@ -385,7 +384,7 @@ class TestParser:
"BCC: fdf@fvf.de\n\n"
"\n\nThis is just a simple Text Mail."
)
assert text_expected == mail_parser.text
assert text_expected == mail_parser.get_text()
assert (
datetime.datetime(
2022,
@@ -396,7 +395,7 @@ class TestParser:
43,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
)
== mail_parser.date
== mail_parser.get_date()
)
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
@@ -419,7 +418,7 @@ class TestParser:
"""
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
)
# Validate parsing returns the expected results
@@ -443,7 +442,7 @@ class TestParser:
mail_parser.parse(html_email_file, "message/rfc822")
mock_generate_pdf.assert_called_once()
assert text_expected == mail_parser.text
assert text_expected == mail_parser.get_text()
assert (
datetime.datetime(
2022,
@@ -454,7 +453,7 @@ class TestParser:
19,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
)
== mail_parser.date
== mail_parser.get_date()
)
def test_generate_pdf_parse_error(
@@ -501,7 +500,7 @@ class TestParser:
mail_parser.parse(simple_txt_email_file, "message/rfc822")
assert mail_parser.archive_path is not None
assert mail_parser.get_archive_path() is not None
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
def test_generate_pdf_html_email(
@@ -542,7 +541,7 @@ class TestParser:
)
mail_parser.parse(html_email_file, "message/rfc822")
assert mail_parser.archive_path is not None
assert mail_parser.get_archive_path() is not None
def test_generate_pdf_html_email_html_to_pdf_failure(
self,
@@ -712,10 +711,10 @@ class TestParser:
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
mail_parser.mailrule_id = 1
mail_parser.parse(
document_path=html_email_file,
mime_type="message/rfc822",
mailrule_id=1,
)
args, _ = mock_merge_route.call_args
assert len(args[0]) == expected_calls
@@ -159,7 +159,7 @@ class TestParserLive:
- The returned thumbnail image file shall match the expected hash
"""
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
)
mock_generate_pdf.return_value = simple_txt_email_pdf_file
@@ -216,10 +216,10 @@ class TestParserLive:
- The merged PDF shall contain text from both source PDFs
"""
mock_generate_pdf_from_html = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
)
mock_generate_pdf_from_mail = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
)
mock_generate_pdf_from_mail.return_value = merged_pdf_first
mock_generate_pdf_from_html.return_value = merged_pdf_second
+1 -1
View File
@@ -1,5 +1,5 @@
def get_parser(*args, **kwargs):
from paperless_mail.parsers import MailDocumentParser
from paperless.parsers.mail import MailDocumentParser
return MailDocumentParser(*args, **kwargs)
+11 -4
View File
@@ -3,14 +3,20 @@ from pathlib import Path
import pytest
from paperless.parsers.mail import MailDocumentParser
from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount
from paperless_mail.parsers import MailDocumentParser
@pytest.fixture(scope="session")
def sample_dir() -> Path:
return (Path(__file__).parent / Path("samples")).resolve()
return (
Path(__file__).parent.parent.parent
/ Path("paperless")
/ Path("tests")
/ Path("samples")
/ Path("mail")
).resolve()
@pytest.fixture(scope="session")
@@ -64,8 +70,9 @@ def merged_pdf_second(sample_dir: Path) -> Path:
@pytest.fixture()
def mail_parser() -> MailDocumentParser:
return MailDocumentParser(logging_group=None)
def mail_parser() -> Generator[MailDocumentParser, None, None]:
with MailDocumentParser() as parser:
yield parser
@pytest.fixture()