From 3236bbd0c50c3f5b07b34a56170000aed56085a3 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 18 Mar 2026 14:41:26 -0700 Subject: [PATCH] Feat(parsers): migrate MailDocumentParser to ParserProtocol Move the mail parser from paperless_mail/parsers.py to paperless/parsers/mail.py and refactor it to implement ParserProtocol: - Class-level name/version/author/url attributes - supported_mime_types() and score() classmethods (score=20) - can_produce_archive=False, requires_pdf_rendition=True - Context manager lifecycle (__enter__/__exit__) - New parse() signature without mailrule_id kwarg; consumer sets parser.mailrule_id before calling parse() instead - get_text()/get_date()/get_archive_path() accessor methods - extract_metadata() returning email headers and attachment info Register MailDocumentParser in the ParserRegistry alongside Text and Tika parsers. Update consumer, signals, and all import sites to use the new location. Update tests to use the new accessor API, patch paths, and context-manager fixture. 
Co-Authored-By: Claude Sonnet 4.6 --- src/documents/consumer.py | 25 +- src/documents/tests/test_consumer.py | 7 +- src/paperless/parsers/mail.py | 612 ++++++++++++++---- src/paperless/parsers/registry.py | 2 + .../tests/parsers/test_mail_parser.py | 29 +- .../tests/parsers/test_mail_parser_live.py | 6 +- src/paperless_mail/signals.py | 2 +- src/paperless_mail/tests/conftest.py | 15 +- 8 files changed, 523 insertions(+), 175 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 53fd1ca0f..f36ff7ee2 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -51,9 +51,9 @@ from documents.templating.workflows import parse_w_workflow_placeholders from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.parsers.mail import MailDocumentParser from paperless.parsers.text import TextDocumentParser from paperless.parsers.tika import TikaDocumentParser -from paperless_mail.parsers import MailDocumentParser LOGGING_NAME: Final[str] = "paperless.consumer" @@ -68,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None: TODO(stumpylog): Remove me in the future """ - if isinstance(parser, (TextDocumentParser, TikaDocumentParser)): + if isinstance(parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser)): parser.__exit__(None, None, None) else: parser.cleanup() @@ -477,14 +477,12 @@ class ConsumerPlugin( isinstance(document_parser, MailDocumentParser) and self.input_doc.mailrule_id ): - document_parser.parse( - self.working_copy, - mime_type, - self.filename, - self.input_doc.mailrule_id, - ) - elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)): - # TODO(stumpylog): Remove me in the future + document_parser.mailrule_id = self.input_doc.mailrule_id + if isinstance( + document_parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + # TODO(stumpylog): Remove me in the 
future when all parsers use new protocol document_parser.parse(self.working_copy, mime_type) else: document_parser.parse(self.working_copy, mime_type, self.filename) @@ -496,8 +494,11 @@ class ConsumerPlugin( ProgressStatusOptions.WORKING, ConsumerStatusShortMessage.GENERATING_THUMBNAIL, ) - if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)): - # TODO(stumpylog): Remove me in the future + if isinstance( + document_parser, + (MailDocumentParser, TextDocumentParser, TikaDocumentParser), + ): + # TODO(stumpylog): Remove me in the future when all parsers use new protocol thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type) else: thumbnail = document_parser.get_thumbnail( diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index a59c7d676..006a61b07 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -35,8 +35,8 @@ from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DummyProgressManager from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import GetConsumerMixin +from paperless.parsers.mail import MailDocumentParser from paperless_mail.models import MailRule -from paperless_mail.parsers import MailDocumentParser class _BaseTestParser(DocumentParser): @@ -1091,7 +1091,7 @@ class TestConsumer( self.assertEqual(command[1], "--replace-input") @mock.patch("paperless_mail.models.MailRule.objects.get") - @mock.patch("paperless_mail.parsers.MailDocumentParser.parse") + @mock.patch("paperless.parsers.mail.MailDocumentParser.parse") @mock.patch("documents.parsers.document_consumer_declaration.send") def test_mail_parser_receives_mailrule( self, @@ -1123,9 +1123,10 @@ class TestConsumer( with self.get_consumer( filepath=( Path(__file__).parent.parent.parent - / Path("paperless_mail") + / Path("paperless") / Path("tests") / Path("samples") + / Path("mail") ).resolve() / "html.eml", 
source=DocumentSource.MailFetch, diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py index 86cba23ab..9b73ae2e1 100644 --- a/src/paperless/parsers/mail.py +++ b/src/paperless/parsers/mail.py @@ -1,6 +1,26 @@ +""" +Built-in mail document parser. + +Handles message/rfc822 (EML) MIME type by: +- Parsing the email using imap_tools +- Generating a PDF via Gotenberg (for display and archive) +- Extracting text via Tika for HTML content +- Extracting metadata from email headers + +The parser always produces a PDF because EML files cannot be rendered +natively in a browser (requires_pdf_rendition=True). +""" + +from __future__ import annotations + +import logging import re +import shutil +import tempfile from html import escape from pathlib import Path +from typing import TYPE_CHECKING +from typing import Self from bleach import clean from bleach import linkify @@ -19,65 +39,353 @@ from imap_tools import MailAttachment from imap_tools import MailMessage from tika_client import TikaClient -from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf from paperless.models import OutputTypeChoices +from paperless.version import __full_version_str__ from paperless_mail.models import MailRule +if TYPE_CHECKING: + import datetime + from types import TracebackType -class MailDocumentParser(DocumentParser): - """ - This parser uses imap_tools to parse .eml files, generates pdf using - Gotenberg and sends the html part to a Tika server for text extraction. + from paperless.parsers import MetadataEntry + +logger = logging.getLogger("paperless.parsing.mail") + +_SUPPORTED_MIME_TYPES: dict[str, str] = { + "message/rfc822": ".eml", +} + + +class MailDocumentParser: + """Parse .eml email files for Paperless-ngx. + + Uses imap_tools to parse .eml files, generates a PDF using Gotenberg, + and sends the HTML part to a Tika server for text extraction. 
Because + EML files cannot be rendered natively in a browser, the parser always + produces a PDF rendition (requires_pdf_rendition=True). + + The mailrule_id instance attribute may be set by the consumer before + calling parse() to apply mail-rule-specific PDF layout options: + + parser.mailrule_id = rule.pk + parser.parse(path, mime_type) + + Class attributes + ---------------- + name : str + Human-readable parser name. + version : str + Semantic version string, kept in sync with Paperless-ngx releases. + author : str + Maintainer name. + url : str + Issue tracker / source URL. """ - logging_name = "paperless.parsing.mail" + name: str = "Paperless-ngx Mail Parser" + version: str = __full_version_str__ + author: str = "Paperless-ngx Contributors" + url: str = "https://github.com/paperless-ngx/paperless-ngx" - def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None: + # ------------------------------------------------------------------ + # Class methods + # ------------------------------------------------------------------ + + @classmethod + def supported_mime_types(cls) -> dict[str, str]: + """Return the MIME types this parser handles. + + Returns + ------- + dict[str, str] + Mapping of MIME type to preferred file extension. """ - Converts our requested PDF/A output into the Gotenberg API - format + return _SUPPORTED_MIME_TYPES + + @classmethod + def score( + cls, + mime_type: str, + filename: str, + path: Path | None = None, + ) -> int | None: + """Return the priority score for handling this file. + + Parameters + ---------- + mime_type: + Detected MIME type of the file. + filename: + Original filename including extension. + path: + Optional filesystem path. Not inspected by this parser. + + Returns + ------- + int | None + 20 if the MIME type is supported (higher than the default 10 to + give the mail parser clear priority), otherwise None. 
""" - if settings.OCR_OUTPUT_TYPE in { - OutputTypeChoices.PDF_A, - OutputTypeChoices.PDF_A2, - }: - return PdfAFormat.A2b - elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover - self.log.warning( - "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", - ) - return PdfAFormat.A2b - elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover - return PdfAFormat.A3b + if mime_type in _SUPPORTED_MIME_TYPES: + return 20 return None + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def can_produce_archive(self) -> bool: + """Whether this parser can produce a searchable PDF archive copy. + + Returns + ------- + bool + Always False — the mail parser produces a display PDF + (requires_pdf_rendition=True), not an optional OCR archive. + """ + return False + + @property + def requires_pdf_rendition(self) -> bool: + """Whether the parser must produce a PDF for the frontend to display. + + Returns + ------- + bool + Always True — EML files cannot be rendered natively in a browser, + so a PDF conversion is always required for display. 
+ """ + return True + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def __init__(self, logging_group: object = None) -> None: + settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) + self._tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), + ) + self._text: str | None = None + self._date: datetime.datetime | None = None + self._archive_path: Path | None = None + self.mailrule_id: int | None = None + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + logger.debug("Cleaning up temporary directory %s", self._tempdir) + shutil.rmtree(self._tempdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Core parsing interface + # ------------------------------------------------------------------ + + def parse( + self, + document_path: Path, + mime_type: str, + *, + produce_archive: bool = True, + ) -> None: + """Parse the given .eml into formatted text and a PDF archive. + + The consumer may set ``self.mailrule_id`` before calling this method + to apply mail-rule-specific PDF layout options. The ``produce_archive`` + flag is accepted for protocol compatibility but has no effect — + the mail parser always produces a PDF since EML files cannot be + displayed natively. + + Parameters + ---------- + document_path: + Absolute path to the .eml file. + mime_type: + Detected MIME type of the document (should be "message/rfc822"). + produce_archive: + Accepted for protocol compatibility. The PDF rendition is always + produced since EML files cannot be displayed natively in a browser. + + Raises + ------ + documents.parsers.ParseError + If the file cannot be parsed or PDF generation fails. 
+ """ + + def strip_text(text: str) -> str: + """Reduces the spacing of the given text string.""" + text = re.sub(r"\s+", " ", text) + text = re.sub(r"(\n *)+", "\n", text) + return text.strip() + + def build_formatted_text(mail_message: MailMessage) -> str: + """Constructs a formatted string based on the given email.""" + fmt_text = f"Subject: {mail_message.subject}\n\n" + fmt_text += f"From: {mail_message.from_values.full}\n\n" + to_list = [address.full for address in mail_message.to_values] + fmt_text += f"To: {', '.join(to_list)}\n\n" + if mail_message.cc_values: + fmt_text += ( + f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n" + ) + if mail_message.bcc_values: + fmt_text += ( + f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n" + ) + if mail_message.attachments: + att = [] + for a in mail.attachments: + attachment_size = naturalsize(a.size, binary=True, format="%.2f") + att.append( + f"{a.filename} ({attachment_size})", + ) + fmt_text += f"Attachments: {', '.join(att)}\n\n" + + if mail.html: + fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html)) + + fmt_text += f"\n\n{strip_text(mail.text)}" + + return fmt_text + + logger.debug("Parsing file %s into an email", document_path.name) + mail = self.parse_file_to_message(document_path) + + logger.debug("Building formatted text from email") + self._text = build_formatted_text(mail) + + if is_naive(mail.date): + self._date = make_aware(mail.date) + else: + self._date = mail.date + + logger.debug("Creating a PDF from the email") + if self.mailrule_id: + rule = MailRule.objects.get(pk=self.mailrule_id) + self._archive_path = self.generate_pdf(mail, rule.pdf_layout) + else: + self._archive_path = self.generate_pdf(mail) + + # ------------------------------------------------------------------ + # Result accessors + # ------------------------------------------------------------------ + + def get_text(self) -> str | None: + """Return the plain-text content extracted 
during parse. + + Returns + ------- + str | None + Extracted text, or None if parse has not been called yet. + """ + return self._text + + def get_date(self) -> datetime.datetime | None: + """Return the document date detected during parse. + + Returns + ------- + datetime.datetime | None + Date from the email headers, or None if not detected. + """ + return self._date + + def get_archive_path(self) -> Path | None: + """Return the path to the generated archive PDF, or None. + + Returns + ------- + Path | None + Path to the PDF produced by Gotenberg, or None if parse has not + been called yet. + """ + return self._archive_path + + # ------------------------------------------------------------------ + # Thumbnail and metadata + # ------------------------------------------------------------------ + def get_thumbnail( self, document_path: Path, mime_type: str, - file_name=None, + file_name: str | None = None, ) -> Path: - if not self.archive_path: - self.archive_path = self.generate_pdf( + """Generate a thumbnail from the PDF rendition of the email. + + Converts the document to PDF first if not already done. + + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + file_name: + Kept for backward compatibility; not used. + + Returns + ------- + Path + Path to the generated WebP thumbnail inside the temporary directory. + """ + if not self._archive_path: + self._archive_path = self.generate_pdf( self.parse_file_to_message(document_path), ) return make_thumbnail_from_pdf( - self.archive_path, - self.tempdir, - self.logging_group, + self._archive_path, + self._tempdir, ) - def extract_metadata(self, document_path: Path, mime_type: str): - result = [] + def get_page_count( + self, + document_path: Path, + mime_type: str, + ) -> int | None: + """Return the number of pages in the document. + + Returns + ------- + int | None + Always None — page count is not available for email files. 
+ """ + return None + + def extract_metadata( + self, + document_path: Path, + mime_type: str, + ) -> list[MetadataEntry]: + """Extract metadata from the email headers. + + Returns email headers as metadata entries with prefix "header", + plus summary entries for attachments and date. + + Returns + ------- + list[MetadataEntry] + Sorted list of metadata entries, or ``[]`` on parse failure. + """ + result: list[MetadataEntry] = [] try: mail = self.parse_file_to_message(document_path) except ParseError as e: - self.log.warning( - f"Error while fetching document metadata for {document_path}: {e}", + logger.warning( + "Error while fetching document metadata for %s: %s", + document_path, + e, ) return result @@ -86,7 +394,7 @@ class MailDocumentParser(DocumentParser): try: value.encode("utf-8") except UnicodeEncodeError as e: # pragma: no cover - self.log.debug(f"Skipping header {key}: {e}") + logger.debug("Skipping header %s: %s", key, e) continue result.append( @@ -123,81 +431,44 @@ class MailDocumentParser(DocumentParser): result.sort(key=lambda item: (item["prefix"], item["key"])) return result - def parse( - self, - document_path: Path, - mime_type: str, - file_name=None, - mailrule_id: int | None = None, - ) -> None: - """ - Parses the given .eml into formatted text, based on the decoded email. + # ------------------------------------------------------------------ + # Email-specific methods + # ------------------------------------------------------------------ - """ - - def strip_text(text: str): - """ - Reduces the spacing of the given text string - """ - text = re.sub(r"\s+", " ", text) - text = re.sub(r"(\n *)+", "\n", text) - return text.strip() - - def build_formatted_text(mail_message: MailMessage) -> str: - """ - Constructs a formatted string, based on the given email. 
Basically tries - to get most of the email content, included front matter, into a nice string - """ - fmt_text = f"Subject: {mail_message.subject}\n\n" - fmt_text += f"From: {mail_message.from_values.full}\n\n" - to_list = [address.full for address in mail_message.to_values] - fmt_text += f"To: {', '.join(to_list)}\n\n" - if mail_message.cc_values: - fmt_text += ( - f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n" - ) - if mail_message.bcc_values: - fmt_text += ( - f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n" - ) - if mail_message.attachments: - att = [] - for a in mail.attachments: - attachment_size = naturalsize(a.size, binary=True, format="%.2f") - att.append( - f"{a.filename} ({attachment_size})", - ) - fmt_text += f"Attachments: {', '.join(att)}\n\n" - - if mail.html: - fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html)) - - fmt_text += f"\n\n{strip_text(mail.text)}" - - return fmt_text - - self.log.debug(f"Parsing file {document_path.name} into an email") - mail = self.parse_file_to_message(document_path) - - self.log.debug("Building formatted text from email") - self.text = build_formatted_text(mail) - - if is_naive(mail.date): - self.date = make_aware(mail.date) - else: - self.date = mail.date - - self.log.debug("Creating a PDF from the email") - if mailrule_id: - rule = MailRule.objects.get(pk=mailrule_id) - self.archive_path = self.generate_pdf(mail, rule.pdf_layout) - else: - self.archive_path = self.generate_pdf(mail) + def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None: + """Convert the OCR output type setting to a Gotenberg PdfAFormat.""" + if settings.OCR_OUTPUT_TYPE in { + OutputTypeChoices.PDF_A, + OutputTypeChoices.PDF_A2, + }: + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover + logger.warning( + "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", + ) + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == 
OutputTypeChoices.PDF_A3: # pragma: no cover + return PdfAFormat.A3b + return None @staticmethod def parse_file_to_message(filepath: Path) -> MailMessage: - """ - Parses the given .eml file into a MailMessage object + """Parse the given .eml file into a MailMessage object. + + Parameters + ---------- + filepath: + Path to the .eml file. + + Returns + ------- + MailMessage + Parsed mail message. + + Raises + ------ + documents.parsers.ParseError + If the file cannot be parsed or is missing required fields. """ try: with filepath.open("rb") as eml: @@ -213,8 +484,25 @@ class MailDocumentParser(DocumentParser): return parsed - def tika_parse(self, html: str): - self.log.info("Sending content to Tika server") + def tika_parse(self, html: str) -> str: + """Send HTML content to the Tika server for text extraction. + + Parameters + ---------- + html: + HTML string to parse. + + Returns + ------- + str + Extracted plain text. + + Raises + ------ + documents.parsers.ParseError + If the Tika server cannot be reached or returns an error. + """ + logger.info("Sending content to Tika server") try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: @@ -234,16 +522,32 @@ class MailDocumentParser(DocumentParser): mail_message: MailMessage, pdf_layout: MailRule.PdfLayout | None = None, ) -> Path: - archive_path = Path(self.tempdir) / "merged.pdf" + """Generate a PDF from the email message. + + Creates separate PDFs for the email body and HTML content, then + merges them according to the requested layout. + + Parameters + ---------- + mail_message: + Parsed email message. + pdf_layout: + Layout option for the PDF. Falls back to the + EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided. + + Returns + ------- + Path + Path to the generated PDF inside the temporary directory. 
+ """ + archive_path = Path(self._tempdir) / "merged.pdf" mail_pdf_file = self.generate_pdf_from_mail(mail_message) - pdf_layout = ( - pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT - ) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout + pdf_layout = pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT - # If no HTML content, create the PDF from the message - # Otherwise, create 2 PDFs and merge them with Gotenberg + # If no HTML content, create the PDF from the message. + # Otherwise, create 2 PDFs and merge them with Gotenberg. if not mail_message.html: archive_path.write_bytes(mail_pdf_file.read_bytes()) else: @@ -252,7 +556,7 @@ class MailDocumentParser(DocumentParser): mail_message.attachments, ) - self.log.debug("Merging email text and HTML content into single PDF") + logger.debug("Merging email text and HTML content into single PDF") with ( GotenbergClient( @@ -287,15 +591,21 @@ class MailDocumentParser(DocumentParser): return archive_path def mail_to_html(self, mail: MailMessage) -> Path: - """ - Converts the given email into an HTML file, formatted - based on the given template + """Convert the given email into an HTML file using a template. + + Parameters + ---------- + mail: + Parsed mail message. + + Returns + ------- + Path + Path to the rendered HTML file inside the temporary directory. 
""" def clean_html(text: str) -> str: - """ - Attempts to clean, escape and linkify the given HTML string - """ + """Attempt to clean, escape, and linkify the given HTML string.""" if isinstance(text, list): text = "\n".join([str(e) for e in text]) if not isinstance(text, str): @@ -340,19 +650,37 @@ class MailDocumentParser(DocumentParser): from django.template.loader import render_to_string - html_file = Path(self.tempdir) / "email_as_html.html" + html_file = Path(self._tempdir) / "email_as_html.html" html_file.write_text(render_to_string("email_msg_template.html", context=data)) return html_file def generate_pdf_from_mail(self, mail: MailMessage) -> Path: - """ - Creates a PDF based on the given email, using the email's values in a - an HTML template - """ - self.log.info("Converting mail to PDF") + """Create a PDF from the email body using an HTML template and Gotenberg. - css_file = Path(__file__).parent / "templates" / "output.css" + Parameters + ---------- + mail: + Parsed mail message. + + Returns + ------- + Path + Path to the generated PDF inside the temporary directory. + + Raises + ------ + documents.parsers.ParseError + If Gotenberg returns an error. + """ + logger.info("Converting mail to PDF") + + css_file = ( + Path(__file__).parent.parent.parent + / "paperless_mail" + / "templates" + / "output.css" + ) email_html_file = self.mail_to_html(mail) with ( @@ -388,7 +716,7 @@ class MailDocumentParser(DocumentParser): f"Error while converting email to PDF: {err}", ) from err - email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf" + email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf" email_as_pdf_file.write_bytes(response.content) return email_as_pdf_file @@ -398,11 +726,27 @@ class MailDocumentParser(DocumentParser): orig_html: str, attachments: list[MailAttachment], ) -> Path: - """ - Generates a PDF file based on the HTML and attachments of the email + """Generate a PDF from the HTML content of the email. 
+ + Parameters + ---------- + orig_html: + Raw HTML string from the email body. + attachments: + List of email attachments (used as inline resources). + + Returns + ------- + Path + Path to the generated PDF inside the temporary directory. + + Raises + ------ + documents.parsers.ParseError + If Gotenberg returns an error. """ - def clean_html_script(text: str): + def clean_html_script(text: str) -> str: compiled_open = re.compile(re.escape("