Feat(parsers): add ParserContext and configure() to ParserProtocol

Replace the ad-hoc mailrule_id attribute assignment with a typed, immutable ParserContext dataclass and a configure() method on the Protocol:

- ParserContext(frozen=True, slots=True) lives in paperless/parsers/ alongside ParserProtocol and MetadataEntry; it currently carries only mailrule_id but is designed to grow with output_type, ocr_mode, and ocr_language in a future phase (decoupling parsers from settings.*)
- ParserProtocol.configure(context: ParserContext) -> None is the extension point; no-op by default
- MailDocumentParser.configure() reads mailrule_id into _mailrule_id
- TextDocumentParser and TikaDocumentParser implement a no-op configure()
- Consumer calls document_parser.configure(ParserContext(...)) before parse(), replacing the isinstance(parser, MailDocumentParser) guard and the direct attribute mutation

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 05:35:24 +00:00 · 2026-03-19 08:19:17 -07:00
parent 07237bde6a
commit a36b6ecbef
6 changed files with 80 additions and 13 deletions
@@ -51,6 +51,7 @@ from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.parsers import ParserContext
 from paperless.parsers.mail import MailDocumentParser
 from paperless.parsers.text import TextDocumentParser
 from paperless.parsers.tika import TikaDocumentParser
@@ -473,16 +474,14 @@ class ConsumerPlugin(
                ConsumerStatusShortMessage.PARSING_DOCUMENT,
            )
            self.log.debug(f"Parsing {self.filename}...")
-            if (
-                isinstance(document_parser, MailDocumentParser)
-                and self.input_doc.mailrule_id
-            ):
-                document_parser.mailrule_id = self.input_doc.mailrule_id
            if isinstance(
                document_parser,
                (MailDocumentParser, TextDocumentParser, TikaDocumentParser),
            ):
                # TODO(stumpylog): Remove me in the future when all parsers use new protocol
+                document_parser.configure(
+                    ParserContext(mailrule_id=self.input_doc.mailrule_id),
+                )
                document_parser.parse(self.working_copy, mime_type)
            else:
                document_parser.parse(self.working_copy, mime_type, self.filename)
@@ -35,6 +35,7 @@ Usage example (third-party parser)::

 from __future__ import annotations

+from dataclasses import dataclass
 from typing import TYPE_CHECKING
 from typing import Protocol
 from typing import Self
@@ -48,6 +49,7 @@ if TYPE_CHECKING:

 __all__ = [
    "MetadataEntry",
+    "ParserContext",
    "ParserProtocol",
 ]

@@ -73,6 +75,44 @@ class MetadataEntry(TypedDict):
    """String representation of the field value."""


+@dataclass(frozen=True, slots=True)
+class ParserContext:
+    """Immutable context passed to a parser before parse().
+
+    The consumer assembles this from the ingestion event (and, in a
+    future phase, from Django settings), then calls
+    ``parser.configure(context)`` before ``parser.parse()``.  Parsers
+    read only the fields relevant to them; unneeded fields are ignored.
+
+    ``frozen=True`` prevents accidental mutation after the consumer
+    hands the context off.  ``slots=True`` keeps instances lightweight.
+
+    Fields
+    ------
+    mailrule_id : int | None
+        Primary key of the ``MailRule`` that triggered this ingestion,
+        or ``None`` when the document did not arrive via a mail rule.
+        Used by ``MailDocumentParser`` to select the PDF layout.
+
+    Notes
+    -----
+    Future fields (not yet implemented):
+
+    * ``output_type`` — PDF/A variant for archive generation
+      (replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers).
+    * ``ocr_mode`` — skip-text, redo, force, etc.
+      (replaces ``settings.OCR_MODE`` reads inside parsers).
+    * ``ocr_language`` — Tesseract language string.
+      (replaces ``settings.OCR_LANGUAGE`` reads inside parsers).
+
+    When those fields are added the consumer will read from Django
+    settings once and populate them here, decoupling parsers from
+    ``settings.*`` entirely.
+    """
+
+    mailrule_id: int | None = None
+
+
@runtime_checkable
 class ParserProtocol(Protocol):
    """Structural contract for all Paperless-ngx document parsers.
@@ -191,6 +231,21 @@ class ParserProtocol(Protocol):
    # Core parsing interface
    # ------------------------------------------------------------------

+    def configure(self, context: ParserContext) -> None:
+        """Apply source context before parse().
+
+        Called by the consumer after instantiation and before parse().
+        The default implementation is a no-op; parsers read only the
+        fields they need.
+
+        Parameters
+        ----------
+        context:
+            Immutable context assembled by the consumer for this
+            specific ingestion event.
+        """
+        ...
+
    def parse(
        self,
        document_path: Path,
@@ -50,6 +50,7 @@ if TYPE_CHECKING:
    from types import TracebackType

    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext

 logger = logging.getLogger("paperless.parsing.mail")

@@ -66,10 +67,10 @@ class MailDocumentParser:
    EML files cannot be rendered natively in a browser, the parser always
    produces a PDF rendition (requires_pdf_rendition=True).

-    The mailrule_id instance attribute may be set by the consumer before
-    calling parse() to apply mail-rule-specific PDF layout options:
+    Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
+    apply mail-rule-specific PDF layout options:

-        parser.mailrule_id = rule.pk
+        parser.configure(ParserContext(mailrule_id=rule.pk))
        parser.parse(path, mime_type)

    Class attributes
@@ -172,7 +173,7 @@ class MailDocumentParser:
        self._text: str | None = None
        self._date: datetime.datetime | None = None
        self._archive_path: Path | None = None
-        self.mailrule_id: int | None = None
+        self._mailrule_id: int | None = None

    def __enter__(self) -> Self:
        return self
@@ -190,6 +191,9 @@ class MailDocumentParser:
    # Core parsing interface
    # ------------------------------------------------------------------

+    def configure(self, context: ParserContext) -> None:
+        self._mailrule_id = context.mailrule_id
+
    def parse(
        self,
        document_path: Path,
@@ -199,7 +203,7 @@ class MailDocumentParser:
    ) -> None:
        """Parse the given .eml into formatted text and a PDF archive.

-        The consumer may set ``self.mailrule_id`` before calling this method
+        Call ``configure(ParserContext(mailrule_id=...))`` before this method
        to apply mail-rule-specific PDF layout options.  The ``produce_archive``
        flag is accepted for protocol compatibility but is always honoured —
        the mail parser always produces a PDF since EML files cannot be
@@ -269,8 +273,8 @@ class MailDocumentParser:
            self._date = mail.date

        logger.debug("Creating a PDF from the email")
-        if self.mailrule_id:
-            rule = MailRule.objects.get(pk=self.mailrule_id)
+        if self._mailrule_id:
+            rule = MailRule.objects.get(pk=self._mailrule_id)
            self._archive_path = self.generate_pdf(mail, rule.pdf_layout)
        else:
            self._archive_path = self.generate_pdf(mail)
@@ -27,6 +27,7 @@ if TYPE_CHECKING:
    from types import TracebackType

    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext

 logger = logging.getLogger("paperless.parsing.text")

@@ -156,6 +157,9 @@ class TextDocumentParser:
    # Core parsing interface
    # ------------------------------------------------------------------

+    def configure(self, context: ParserContext) -> None:
+        pass
+
    def parse(
        self,
        document_path: Path,
@@ -35,6 +35,7 @@ if TYPE_CHECKING:
    from types import TracebackType

    from paperless.parsers import MetadataEntry
+    from paperless.parsers import ParserContext

 logger = logging.getLogger("paperless.parsing.tika")

@@ -205,6 +206,9 @@ class TikaDocumentParser:
    # Core parsing interface
    # ------------------------------------------------------------------

+    def configure(self, context: ParserContext) -> None:
+        pass
+
    def parse(
        self,
        document_path: Path,
@@ -12,6 +12,7 @@ from pytest_httpx import HTTPXMock
 from pytest_mock import MockerFixture

 from documents.parsers import ParseError
+from paperless.parsers import ParserContext
 from paperless.parsers.mail import MailDocumentParser


@@ -711,7 +712,7 @@ class TestParser:

        def test_layout_option(layout_option, expected_calls, expected_pdf_names):
            mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
-            mail_parser.mailrule_id = 1
+            mail_parser.configure(ParserContext(mailrule_id=1))
            mail_parser.parse(
                document_path=html_email_file,
                mime_type="message/rfc822",