diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f36ff7ee2..95f707010 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -51,6 +51,7 @@ from documents.templating.workflows import parse_w_workflow_placeholders from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.parsers import ParserContext from paperless.parsers.mail import MailDocumentParser from paperless.parsers.text import TextDocumentParser from paperless.parsers.tika import TikaDocumentParser @@ -473,16 +474,14 @@ class ConsumerPlugin( ConsumerStatusShortMessage.PARSING_DOCUMENT, ) self.log.debug(f"Parsing {self.filename}...") - if ( - isinstance(document_parser, MailDocumentParser) - and self.input_doc.mailrule_id - ): - document_parser.mailrule_id = self.input_doc.mailrule_id if isinstance( document_parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser), ): # TODO(stumpylog): Remove me in the future when all parsers use new protocol + document_parser.configure( + ParserContext(mailrule_id=self.input_doc.mailrule_id), + ) document_parser.parse(self.working_copy, mime_type) else: document_parser.parse(self.working_copy, mime_type, self.filename) diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py index ea67ade00..c9c1530a5 100644 --- a/src/paperless/parsers/__init__.py +++ b/src/paperless/parsers/__init__.py @@ -35,6 +35,7 @@ Usage example (third-party parser):: from __future__ import annotations +from dataclasses import dataclass from typing import TYPE_CHECKING from typing import Protocol from typing import Self @@ -48,6 +49,7 @@ if TYPE_CHECKING: __all__ = [ "MetadataEntry", + "ParserContext", "ParserProtocol", ] @@ -73,6 +75,44 @@ class MetadataEntry(TypedDict): """String representation of the field value.""" +@dataclass(frozen=True, slots=True) +class ParserContext: + """Immutable context passed to a 
parser before parse(). + + The consumer assembles this from the ingestion event and Django + settings, then calls ``parser.configure(context)`` before + ``parser.parse()``. Parsers read only the fields relevant to them; + unneeded fields are ignored. + + ``frozen=True`` prevents accidental mutation after the consumer + hands the context off. ``slots=True`` keeps instances lightweight. + + Fields + ------ + mailrule_id : int | None + Primary key of the ``MailRule`` that triggered this ingestion, + or ``None`` when the document did not arrive via a mail rule. + Used by ``MailDocumentParser`` to select the PDF layout. + + Notes + ----- + Future fields (not yet implemented): + + * ``output_type`` — PDF/A variant for archive generation + (replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers). + * ``ocr_mode`` — skip-text, redo, force, etc. + (replaces ``settings.OCR_MODE`` reads inside parsers). + * ``ocr_language`` — Tesseract language string + (replaces ``settings.OCR_LANGUAGE`` reads inside parsers). + + When those fields are added the consumer will read from Django + settings once and populate them here, decoupling parsers from + ``settings.*`` entirely. + """ + + mailrule_id: int | None = None + + @runtime_checkable class ParserProtocol(Protocol): """Structural contract for all Paperless-ngx document parsers. @@ -191,6 +231,21 @@ class ParserProtocol(Protocol): # Core parsing interface # ------------------------------------------------------------------ + def configure(self, context: ParserContext) -> None: + """Apply source context before parse(). + + Called by the consumer after instantiation and before parse(). + Parsers that need no context implement this as a no-op; others + read only the fields relevant to them. + + Parameters + ---------- + context: + Immutable context assembled by the consumer for this + specific ingestion event. + """ + ...
+ def parse( self, document_path: Path, diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py index 9b73ae2e1..f00bd5663 100644 --- a/src/paperless/parsers/mail.py +++ b/src/paperless/parsers/mail.py @@ -50,6 +50,7 @@ if TYPE_CHECKING: from types import TracebackType from paperless.parsers import MetadataEntry + from paperless.parsers import ParserContext logger = logging.getLogger("paperless.parsing.mail") @@ -66,10 +67,10 @@ class MailDocumentParser: EML files cannot be rendered natively in a browser, the parser always produces a PDF rendition (requires_pdf_rendition=True). - The mailrule_id instance attribute may be set by the consumer before - calling parse() to apply mail-rule-specific PDF layout options: + Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to + apply mail-rule-specific PDF layout options: - parser.mailrule_id = rule.pk + parser.configure(ParserContext(mailrule_id=rule.pk)) parser.parse(path, mime_type) Class attributes @@ -172,7 +173,7 @@ class MailDocumentParser: self._text: str | None = None self._date: datetime.datetime | None = None self._archive_path: Path | None = None - self.mailrule_id: int | None = None + self._mailrule_id: int | None = None def __enter__(self) -> Self: return self @@ -190,6 +191,9 @@ class MailDocumentParser: # Core parsing interface # ------------------------------------------------------------------ + def configure(self, context: ParserContext) -> None: + self._mailrule_id = context.mailrule_id + def parse( self, document_path: Path, @@ -199,7 +203,7 @@ class MailDocumentParser: ) -> None: """Parse the given .eml into formatted text and a PDF archive. - The consumer may set ``self.mailrule_id`` before calling this method + Call ``configure(ParserContext(mailrule_id=...))`` before this method to apply mail-rule-specific PDF layout options. 
The ``produce_archive`` flag is accepted for protocol compatibility but is always honoured — the mail parser always produces a PDF since EML files cannot be @@ -269,8 +273,8 @@ class MailDocumentParser: self._date = mail.date logger.debug("Creating a PDF from the email") - if self.mailrule_id: - rule = MailRule.objects.get(pk=self.mailrule_id) + if self._mailrule_id: + rule = MailRule.objects.get(pk=self._mailrule_id) self._archive_path = self.generate_pdf(mail, rule.pdf_layout) else: self._archive_path = self.generate_pdf(mail) diff --git a/src/paperless/parsers/text.py b/src/paperless/parsers/text.py index 99d9dab08..00d738995 100644 --- a/src/paperless/parsers/text.py +++ b/src/paperless/parsers/text.py @@ -27,6 +27,7 @@ if TYPE_CHECKING: from types import TracebackType from paperless.parsers import MetadataEntry + from paperless.parsers import ParserContext logger = logging.getLogger("paperless.parsing.text") @@ -156,6 +157,9 @@ class TextDocumentParser: # Core parsing interface # ------------------------------------------------------------------ + def configure(self, context: ParserContext) -> None: + pass + def parse( self, document_path: Path, diff --git a/src/paperless/parsers/tika.py b/src/paperless/parsers/tika.py index b9307858a..f7e41e444 100644 --- a/src/paperless/parsers/tika.py +++ b/src/paperless/parsers/tika.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: from types import TracebackType from paperless.parsers import MetadataEntry + from paperless.parsers import ParserContext logger = logging.getLogger("paperless.parsing.tika") @@ -205,6 +206,9 @@ class TikaDocumentParser: # Core parsing interface # ------------------------------------------------------------------ + def configure(self, context: ParserContext) -> None: + pass + def parse( self, document_path: Path, diff --git a/src/paperless/tests/parsers/test_mail_parser.py b/src/paperless/tests/parsers/test_mail_parser.py index 316489dba..cfcd341d6 100644 --- 
a/src/paperless/tests/parsers/test_mail_parser.py +++ b/src/paperless/tests/parsers/test_mail_parser.py @@ -12,6 +12,7 @@ from pytest_httpx import HTTPXMock from pytest_mock import MockerFixture from documents.parsers import ParseError +from paperless.parsers import ParserContext from paperless.parsers.mail import MailDocumentParser @@ -711,7 +712,7 @@ class TestParser: def test_layout_option(layout_option, expected_calls, expected_pdf_names): mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option) - mail_parser.mailrule_id = 1 + mail_parser.configure(ParserContext(mailrule_id=1)) mail_parser.parse( document_path=html_email_file, mime_type="message/rfc822",