diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 53fd1ca0f..f36ff7ee2 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -51,9 +51,9 @@ from documents.templating.workflows import parse_w_workflow_placeholders from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.parsers.mail import MailDocumentParser from paperless.parsers.text import TextDocumentParser from paperless.parsers.tika import TikaDocumentParser -from paperless_mail.parsers import MailDocumentParser LOGGING_NAME: Final[str] = "paperless.consumer" @@ -68,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None: TODO(stumpylog): Remove me in the future """ - if isinstance(parser, (TextDocumentParser, TikaDocumentParser)): + if isinstance(parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser)): parser.__exit__(None, None, None) else: parser.cleanup() @@ -477,14 +477,12 @@ class ConsumerPlugin( isinstance(document_parser, MailDocumentParser) and self.input_doc.mailrule_id ): - document_parser.parse( - self.working_copy, - mime_type, - self.filename, - self.input_doc.mailrule_id, - ) - elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)): - # TODO(stumpylog): Remove me in the future + document_parser.mailrule_id = self.input_doc.mailrule_id + if isinstance( + document_parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + # TODO(stumpylog): Remove me in the future when all parsers use new protocol document_parser.parse(self.working_copy, mime_type) else: document_parser.parse(self.working_copy, mime_type, self.filename) @@ -496,8 +494,11 @@ class ConsumerPlugin( ProgressStatusOptions.WORKING, ConsumerStatusShortMessage.GENERATING_THUMBNAIL, ) - if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)): - # TODO(stumpylog): Remove me in the future + if isinstance( + 
document_parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + # TODO(stumpylog): Remove me in the future when all parsers use new protocol thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) else: thumbnail = document_parser.get_thumbnail( diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a59c7d676..006a61b07 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -35,8 +35,8 @@ from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DummyProgressManager from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import GetConsumerMixin +from paperless.parsers.mail import MailDocumentParser from paperless_mail.models import MailRule -from paperless_mail.parsers import MailDocumentParser class _BaseTestParser(DocumentParser): @@ -1091,7 +1091,7 @@ class TestConsumer( self.assertEqual(command[1], "--replace-input") @mock.patch("paperless_mail.models.MailRule.objects.get") - @mock.patch("paperless_mail.parsers.MailDocumentParser.parse") + @mock.patch("paperless.parsers.mail.MailDocumentParser.parse") @mock.patch("documents.parsers.document_consumer_declaration.send") def test_mail_parser_receives_mailrule( self, @@ -1123,9 +1123,10 @@ class TestConsumer( with self.get_consumer( filepath=( Path(__file__).parent.parent.parent - / Path("paperless_mail") + / Path("paperless") / Path("tests") / Path("samples") + / Path("mail") ).resolve() / "html.eml", source=DocumentSource.MailFetch, diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py index 86cba23ab..9b73ae2e1 100644 --- a/src/paperless/parsers/mail.py +++ b/src/paperless/parsers/mail.py @@ -1,6 +1,26 @@ +""" +Built-in mail document parser. 
+ +Handles message/rfc822 (EML) MIME type by: +- Parsing the email using imap_tools +- Generating a PDF via Gotenberg (for display and archive) +- Extracting text via Tika for HTML content +- Extracting metadata from email headers + +The parser always produces a PDF because EML files cannot be rendered +natively in a browser (requires_pdf_rendition=True). +""" + +from __future__ import annotations + +import logging import re +import shutil +import tempfile from html import escape from pathlib import Path +from typing import TYPE_CHECKING +from typing import Self from bleach import clean from bleach import linkify @@ -19,65 +39,353 @@ from imap_tools import MailAttachment from imap_tools import MailMessage from tika_client import TikaClient -from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf from paperless.models import OutputTypeChoices +from paperless.version import __full_version_str__ from paperless_mail.models import MailRule +if TYPE_CHECKING: + import datetime + from types import TracebackType -class MailDocumentParser(DocumentParser): - """ - This parser uses imap_tools to parse .eml files, generates pdf using - Gotenberg and sends the html part to a Tika server for text extraction. + from paperless.parsers import MetadataEntry + +logger = logging.getLogger("paperless.parsing.mail") + +_SUPPORTED_MIME_TYPES: dict[str, str] = { + "message/rfc822": ".eml", +} + + +class MailDocumentParser: + """Parse .eml email files for Paperless-ngx. + + Uses imap_tools to parse .eml files, generates a PDF using Gotenberg, + and sends the HTML part to a Tika server for text extraction. Because + EML files cannot be rendered natively in a browser, the parser always + produces a PDF rendition (requires_pdf_rendition=True). 
+ + The mailrule_id instance attribute may be set by the consumer before + calling parse() to apply mail-rule-specific PDF layout options: + + parser.mailrule_id = rule.pk + parser.parse(path, mime_type) + + Class attributes + ---------------- + name : str + Human-readable parser name. + version : str + Semantic version string, kept in sync with Paperless-ngx releases. + author : str + Maintainer name. + url : str + Issue tracker / source URL. """ - logging_name = "paperless.parsing.mail" + name: str = "Paperless-ngx Mail Parser" + version: str = __full_version_str__ + author: str = "Paperless-ngx Contributors" + url: str = "https://github.com/paperless-ngx/paperless-ngx" - def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None: + # ------------------------------------------------------------------ + # Class methods + # ------------------------------------------------------------------ + + @classmethod + def supported_mime_types(cls) -> dict[str, str]: + """Return the MIME types this parser handles. + + Returns + ------- + dict[str, str] + Mapping of MIME type to preferred file extension. """ - Converts our requested PDF/A output into the Gotenberg API - format + return _SUPPORTED_MIME_TYPES + + @classmethod + def score( + cls, + mime_type: str, + filename: str, + path: Path | None = None, + ) -> int | None: + """Return the priority score for handling this file. + + Parameters + ---------- + mime_type: + Detected MIME type of the file. + filename: + Original filename including extension. + path: + Optional filesystem path. Not inspected by this parser. + + Returns + ------- + int | None + 20 if the MIME type is supported (higher than the default 10 to + give the mail parser clear priority), otherwise None. 
""" - if settings.OCR_OUTPUT_TYPE in { - OutputTypeChoices.PDF_A, - OutputTypeChoices.PDF_A2, - }: - return PdfAFormat.A2b - elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover - self.log.warning( - "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", - ) - return PdfAFormat.A2b - elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover - return PdfAFormat.A3b + if mime_type in _SUPPORTED_MIME_TYPES: + return 20 return None + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def can_produce_archive(self) -> bool: + """Whether this parser can produce a searchable PDF archive copy. + + Returns + ------- + bool + Always False — the mail parser produces a display PDF + (requires_pdf_rendition=True), not an optional OCR archive. + """ + return False + + @property + def requires_pdf_rendition(self) -> bool: + """Whether the parser must produce a PDF for the frontend to display. + + Returns + ------- + bool + Always True — EML files cannot be rendered natively in a browser, + so a PDF conversion is always required for display. 
+ """ + return True + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def __init__(self, logging_group: object = None) -> None: + settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) + self._tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), + ) + self._text: str | None = None + self._date: datetime.datetime | None = None + self._archive_path: Path | None = None + self.mailrule_id: int | None = None + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + logger.debug("Cleaning up temporary directory %s", self._tempdir) + shutil.rmtree(self._tempdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Core parsing interface + # ------------------------------------------------------------------ + + def parse( + self, + document_path: Path, + mime_type: str, + *, + produce_archive: bool = True, + ) -> None: + """Parse the given .eml into formatted text and a PDF archive. + + The consumer may set ``self.mailrule_id`` before calling this method + to apply mail-rule-specific PDF layout options. The ``produce_archive`` + flag is accepted for protocol compatibility but is ignored — + the mail parser always produces a PDF since EML files cannot be + displayed natively. + + Parameters + ---------- + document_path: + Absolute path to the .eml file. + mime_type: + Detected MIME type of the document (should be "message/rfc822"). + produce_archive: + Accepted for protocol compatibility. The PDF rendition is always + produced since EML files cannot be displayed natively in a browser. + + Raises + ------ + documents.parsers.ParseError + If the file cannot be parsed or PDF generation fails. 
+ """ + + def strip_text(text: str) -> str: + """Reduces the spacing of the given text string.""" + text = re.sub(r"\s+", " ", text) + text = re.sub(r"(\n *)+", "\n", text) + return text.strip() + + def build_formatted_text(mail_message: MailMessage) -> str: + """Constructs a formatted string based on the given email.""" + fmt_text = f"Subject: {mail_message.subject}\n\n" + fmt_text += f"From: {mail_message.from_values.full}\n\n" + to_list = [address.full for address in mail_message.to_values] + fmt_text += f"To: {', '.join(to_list)}\n\n" + if mail_message.cc_values: + fmt_text += ( + f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n" + ) + if mail_message.bcc_values: + fmt_text += ( + f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n" + ) + if mail_message.attachments: + att = [] + for a in mail.attachments: + attachment_size = naturalsize(a.size, binary=True, format="%.2f") + att.append( + f"{a.filename} ({attachment_size})", + ) + fmt_text += f"Attachments: {', '.join(att)}\n\n" + + if mail.html: + fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html)) + + fmt_text += f"\n\n{strip_text(mail.text)}" + + return fmt_text + + logger.debug("Parsing file %s into an email", document_path.name) + mail = self.parse_file_to_message(document_path) + + logger.debug("Building formatted text from email") + self._text = build_formatted_text(mail) + + if is_naive(mail.date): + self._date = make_aware(mail.date) + else: + self._date = mail.date + + logger.debug("Creating a PDF from the email") + if self.mailrule_id: + rule = MailRule.objects.get(pk=self.mailrule_id) + self._archive_path = self.generate_pdf(mail, rule.pdf_layout) + else: + self._archive_path = self.generate_pdf(mail) + + # ------------------------------------------------------------------ + # Result accessors + # ------------------------------------------------------------------ + + def get_text(self) -> str | None: + """Return the plain-text content extracted 
during parse. + + Returns + ------- + str | None + Extracted text, or None if parse has not been called yet. + """ + return self._text + + def get_date(self) -> datetime.datetime | None: + """Return the document date detected during parse. + + Returns + ------- + datetime.datetime | None + Date from the email headers, or None if not detected. + """ + return self._date + + def get_archive_path(self) -> Path | None: + """Return the path to the generated archive PDF, or None. + + Returns + ------- + Path | None + Path to the PDF produced by Gotenberg, or None if parse has not + been called yet. + """ + return self._archive_path + + # ------------------------------------------------------------------ + # Thumbnail and metadata + # ------------------------------------------------------------------ + def get_thumbnail( self, document_path: Path, mime_type: str, - file_name=None, + file_name: str | None = None, ) -> Path: - if not self.archive_path: - self.archive_path = self.generate_pdf( + """Generate a thumbnail from the PDF rendition of the email. + + Converts the document to PDF first if not already done. + + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + file_name: + Kept for backward compatibility; not used. + + Returns + ------- + Path + Path to the generated WebP thumbnail inside the temporary directory. + """ + if not self._archive_path: + self._archive_path = self.generate_pdf( self.parse_file_to_message(document_path), ) return make_thumbnail_from_pdf( - self.archive_path, - self.tempdir, - self.logging_group, + self._archive_path, + self._tempdir, ) - def extract_metadata(self, document_path: Path, mime_type: str): - result = [] + def get_page_count( + self, + document_path: Path, + mime_type: str, + ) -> int | None: + """Return the number of pages in the document. + + Returns + ------- + int | None + Always None — page count is not available for email files. 
+ """ + return None + + def extract_metadata( + self, + document_path: Path, + mime_type: str, + ) -> list[MetadataEntry]: + """Extract metadata from the email headers. + + Returns email headers as metadata entries with prefix "header", + plus summary entries for attachments and date. + + Returns + ------- + list[MetadataEntry] + Sorted list of metadata entries, or ``[]`` on parse failure. + """ + result: list[MetadataEntry] = [] try: mail = self.parse_file_to_message(document_path) except ParseError as e: - self.log.warning( - f"Error while fetching document metadata for {document_path}: {e}", + logger.warning( + "Error while fetching document metadata for %s: %s", + document_path, + e, ) return result @@ -86,7 +394,7 @@ class MailDocumentParser(DocumentParser): try: value.encode("utf-8") except UnicodeEncodeError as e: # pragma: no cover - self.log.debug(f"Skipping header {key}: {e}") + logger.debug("Skipping header %s: %s", key, e) continue result.append( @@ -123,81 +431,44 @@ class MailDocumentParser(DocumentParser): result.sort(key=lambda item: (item["prefix"], item["key"])) return result - def parse( - self, - document_path: Path, - mime_type: str, - file_name=None, - mailrule_id: int | None = None, - ) -> None: - """ - Parses the given .eml into formatted text, based on the decoded email. + # ------------------------------------------------------------------ + # Email-specific methods + # ------------------------------------------------------------------ - """ - - def strip_text(text: str): - """ - Reduces the spacing of the given text string - """ - text = re.sub(r"\s+", " ", text) - text = re.sub(r"(\n *)+", "\n", text) - return text.strip() - - def build_formatted_text(mail_message: MailMessage) -> str: - """ - Constructs a formatted string, based on the given email. 
Basically tries - to get most of the email content, included front matter, into a nice string - """ - fmt_text = f"Subject: {mail_message.subject}\n\n" - fmt_text += f"From: {mail_message.from_values.full}\n\n" - to_list = [address.full for address in mail_message.to_values] - fmt_text += f"To: {', '.join(to_list)}\n\n" - if mail_message.cc_values: - fmt_text += ( - f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n" - ) - if mail_message.bcc_values: - fmt_text += ( - f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n" - ) - if mail_message.attachments: - att = [] - for a in mail.attachments: - attachment_size = naturalsize(a.size, binary=True, format="%.2f") - att.append( - f"{a.filename} ({attachment_size})", - ) - fmt_text += f"Attachments: {', '.join(att)}\n\n" - - if mail.html: - fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html)) - - fmt_text += f"\n\n{strip_text(mail.text)}" - - return fmt_text - - self.log.debug(f"Parsing file {document_path.name} into an email") - mail = self.parse_file_to_message(document_path) - - self.log.debug("Building formatted text from email") - self.text = build_formatted_text(mail) - - if is_naive(mail.date): - self.date = make_aware(mail.date) - else: - self.date = mail.date - - self.log.debug("Creating a PDF from the email") - if mailrule_id: - rule = MailRule.objects.get(pk=mailrule_id) - self.archive_path = self.generate_pdf(mail, rule.pdf_layout) - else: - self.archive_path = self.generate_pdf(mail) + def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None: + """Convert the OCR output type setting to a Gotenberg PdfAFormat.""" + if settings.OCR_OUTPUT_TYPE in { + OutputTypeChoices.PDF_A, + OutputTypeChoices.PDF_A2, + }: + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover + logger.warning( + "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", + ) + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == 
OutputTypeChoices.PDF_A3: # pragma: no cover + return PdfAFormat.A3b + return None @staticmethod def parse_file_to_message(filepath: Path) -> MailMessage: - """ - Parses the given .eml file into a MailMessage object + """Parse the given .eml file into a MailMessage object. + + Parameters + ---------- + filepath: + Path to the .eml file. + + Returns + ------- + MailMessage + Parsed mail message. + + Raises + ------ + documents.parsers.ParseError + If the file cannot be parsed or is missing required fields. """ try: with filepath.open("rb") as eml: @@ -213,8 +484,25 @@ class MailDocumentParser(DocumentParser): return parsed - def tika_parse(self, html: str): - self.log.info("Sending content to Tika server") + def tika_parse(self, html: str) -> str: + """Send HTML content to the Tika server for text extraction. + + Parameters + ---------- + html: + HTML string to parse. + + Returns + ------- + str + Extracted plain text. + + Raises + ------ + documents.parsers.ParseError + If the Tika server cannot be reached or returns an error. + """ + logger.info("Sending content to Tika server") try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: @@ -234,16 +522,32 @@ class MailDocumentParser(DocumentParser): mail_message: MailMessage, pdf_layout: MailRule.PdfLayout | None = None, ) -> Path: - archive_path = Path(self.tempdir) / "merged.pdf" + """Generate a PDF from the email message. + + Creates separate PDFs for the email body and HTML content, then + merges them according to the requested layout. + + Parameters + ---------- + mail_message: + Parsed email message. + pdf_layout: + Layout option for the PDF. Falls back to the + EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided. + + Returns + ------- + Path + Path to the generated PDF inside the temporary directory. 
+ """ + archive_path = Path(self._tempdir) / "merged.pdf" mail_pdf_file = self.generate_pdf_from_mail(mail_message) - pdf_layout = ( - pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT - ) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout + pdf_layout = pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT - # If no HTML content, create the PDF from the message - # Otherwise, create 2 PDFs and merge them with Gotenberg + # If no HTML content, create the PDF from the message. + # Otherwise, create 2 PDFs and merge them with Gotenberg. if not mail_message.html: archive_path.write_bytes(mail_pdf_file.read_bytes()) else: @@ -252,7 +556,7 @@ class MailDocumentParser(DocumentParser): mail_message.attachments, ) - self.log.debug("Merging email text and HTML content into single PDF") + logger.debug("Merging email text and HTML content into single PDF") with ( GotenbergClient( @@ -287,15 +591,21 @@ class MailDocumentParser(DocumentParser): return archive_path def mail_to_html(self, mail: MailMessage) -> Path: - """ - Converts the given email into an HTML file, formatted - based on the given template + """Convert the given email into an HTML file using a template. + + Parameters + ---------- + mail: + Parsed mail message. + + Returns + ------- + Path + Path to the rendered HTML file inside the temporary directory. 
""" def clean_html(text: str) -> str: - """ - Attempts to clean, escape and linkify the given HTML string - """ + """Attempt to clean, escape, and linkify the given HTML string.""" if isinstance(text, list): text = "\n".join([str(e) for e in text]) if not isinstance(text, str): @@ -340,19 +650,37 @@ class MailDocumentParser(DocumentParser): from django.template.loader import render_to_string - html_file = Path(self.tempdir) / "email_as_html.html" + html_file = Path(self._tempdir) / "email_as_html.html" html_file.write_text(render_to_string("email_msg_template.html", context=data)) return html_file def generate_pdf_from_mail(self, mail: MailMessage) -> Path: - """ - Creates a PDF based on the given email, using the email's values in a - an HTML template - """ - self.log.info("Converting mail to PDF") + """Create a PDF from the email body using an HTML template and Gotenberg. - css_file = Path(__file__).parent / "templates" / "output.css" + Parameters + ---------- + mail: + Parsed mail message. + + Returns + ------- + Path + Path to the generated PDF inside the temporary directory. + + Raises + ------ + documents.parsers.ParseError + If Gotenberg returns an error. + """ + logger.info("Converting mail to PDF") + + css_file = ( + Path(__file__).parent.parent.parent + / "paperless_mail" + / "templates" + / "output.css" + ) email_html_file = self.mail_to_html(mail) with ( @@ -388,7 +716,7 @@ class MailDocumentParser(DocumentParser): f"Error while converting email to PDF: {err}", ) from err - email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf" + email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf" email_as_pdf_file.write_bytes(response.content) return email_as_pdf_file @@ -398,11 +726,27 @@ class MailDocumentParser(DocumentParser): orig_html: str, attachments: list[MailAttachment], ) -> Path: - """ - Generates a PDF file based on the HTML and attachments of the email + """Generate a PDF from the HTML content of the email. 
+ + Parameters + ---------- + orig_html: + Raw HTML string from the email body. + attachments: + List of email attachments (used as inline resources). + + Returns + ------- + Path + Path to the generated PDF inside the temporary directory. + + Raises + ------ + documents.parsers.ParseError + If Gotenberg returns an error. """ - def clean_html_script(text: str): + def clean_html_script(text: str) -> str: compiled_open = re.compile(re.escape("