Feat(parsers): add ParserContext and configure() to ParserProtocol

Replace the ad-hoc mailrule_id attribute assignment with a typed,
immutable ParserContext dataclass and a configure() method on the
Protocol:

- ParserContext(frozen=True, slots=True) lives in paperless/parsers/
  alongside ParserProtocol and MetadataEntry; currently carries only
  mailrule_id but is designed to grow with output_type, ocr_mode, and
  ocr_language in a future phase (decoupling parsers from settings.*)
- ParserProtocol.configure(context: ParserContext) -> None is the
  extension point; no-op by default
- MailDocumentParser.configure() reads mailrule_id into _mailrule_id
- TextDocumentParser and TikaDocumentParser implement a no-op configure()
- Consumer calls document_parser.configure(ParserContext(...)) before
  parse(), replacing the isinstance(parser, MailDocumentParser) guard
  and the direct attribute mutation

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-19 08:19:17 -07:00
parent 07237bde6a
commit a36b6ecbef
6 changed files with 80 additions and 13 deletions

View File

@@ -51,6 +51,7 @@ from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.parsers import ParserContext
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
@@ -473,16 +474,14 @@ class ConsumerPlugin(
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
self.log.debug(f"Parsing {self.filename}...")
if (
isinstance(document_parser, MailDocumentParser)
and self.input_doc.mailrule_id
):
document_parser.mailrule_id = self.input_doc.mailrule_id
if isinstance(
document_parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
):
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
document_parser.configure(
ParserContext(mailrule_id=self.input_doc.mailrule_id),
)
document_parser.parse(self.working_copy, mime_type)
else:
document_parser.parse(self.working_copy, mime_type, self.filename)

View File

@@ -35,6 +35,7 @@ Usage example (third-party parser)::
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Protocol
from typing import Self
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
__all__ = [
"MetadataEntry",
"ParserContext",
"ParserProtocol",
]
@@ -73,6 +75,44 @@ class MetadataEntry(TypedDict):
"""String representation of the field value."""
@dataclass(frozen=True, slots=True)
class ParserContext:
    """Immutable context passed to a parser before parse().

    The consumer assembles this from the ingestion event and Django
    settings, then calls ``parser.configure(context)`` before
    ``parser.parse()``.  Parsers read only the fields relevant to them;
    unneeded fields are ignored.

    ``frozen=True`` prevents accidental mutation after the consumer
    hands the context off.  ``slots=True`` keeps instances lightweight.

    Fields
    ------
    mailrule_id : int | None
        Primary key of the ``MailRule`` that triggered this ingestion,
        or ``None`` when the document did not arrive via a mail rule.
        Used by ``MailDocumentParser`` to select the PDF layout.

    Notes
    -----
    Future fields (not yet implemented):

    * ``output_type`` — PDF/A variant for archive generation
      (replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers).
    * ``ocr_mode`` — skip-text, redo, force, etc.
      (replaces ``settings.OCR_MODE`` reads inside parsers).
    * ``ocr_language`` — Tesseract language string
      (replaces ``settings.OCR_LANGUAGE`` reads inside parsers).

    When those fields are added the consumer will read from Django
    settings once and populate them here, decoupling parsers from
    ``settings.*`` entirely.
    """

    # Defaults to None so a context can be built for non-mail ingestion.
    mailrule_id: int | None = None
@runtime_checkable
class ParserProtocol(Protocol):
"""Structural contract for all Paperless-ngx document parsers.
@@ -191,6 +231,21 @@ class ParserProtocol(Protocol):
# Core parsing interface
# ------------------------------------------------------------------
def configure(self, context: ParserContext) -> None:
    """Apply source context before parse().

    Called by the consumer after instantiation and before parse().
    This protocol stub does nothing.  Parsers that satisfy the protocol
    structurally (without subclassing it) define their own
    ``configure()``; a parser that needs nothing from the context can
    implement it as a no-op, and every parser reads only the context
    fields it needs.

    Parameters
    ----------
    context:
        Immutable context assembled by the consumer for this
        specific ingestion event.
    """
    ...
def parse(
self,
document_path: Path,

View File

@@ -50,6 +50,7 @@ if TYPE_CHECKING:
from types import TracebackType
from paperless.parsers import MetadataEntry
from paperless.parsers import ParserContext
logger = logging.getLogger("paperless.parsing.mail")
@@ -66,10 +67,10 @@ class MailDocumentParser:
EML files cannot be rendered natively in a browser, the parser always
produces a PDF rendition (requires_pdf_rendition=True).
The mailrule_id instance attribute may be set by the consumer before
calling parse() to apply mail-rule-specific PDF layout options:
Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
apply mail-rule-specific PDF layout options:
parser.mailrule_id = rule.pk
parser.configure(ParserContext(mailrule_id=rule.pk))
parser.parse(path, mime_type)
Class attributes
@@ -172,7 +173,7 @@ class MailDocumentParser:
self._text: str | None = None
self._date: datetime.datetime | None = None
self._archive_path: Path | None = None
self.mailrule_id: int | None = None
self._mailrule_id: int | None = None
def __enter__(self) -> Self:
return self
@@ -190,6 +191,9 @@ class MailDocumentParser:
# Core parsing interface
# ------------------------------------------------------------------
def configure(self, context: ParserContext) -> None:
    """Store the mail rule id from *context* for the next parse() call.

    parse() uses the stored id to look up the mail rule and apply its
    PDF layout; ``None`` means the document did not come from a rule.
    """
    rule_pk = context.mailrule_id
    self._mailrule_id = rule_pk
def parse(
self,
document_path: Path,
@@ -199,7 +203,7 @@ class MailDocumentParser:
) -> None:
"""Parse the given .eml into formatted text and a PDF archive.
The consumer may set ``self.mailrule_id`` before calling this method
Call ``configure(ParserContext(mailrule_id=...))`` before this method
to apply mail-rule-specific PDF layout options. The ``produce_archive``
flag is accepted for protocol compatibility but has no effect —
the mail parser always produces a PDF since EML files cannot be
@@ -269,8 +273,8 @@ class MailDocumentParser:
self._date = mail.date
logger.debug("Creating a PDF from the email")
if self.mailrule_id:
rule = MailRule.objects.get(pk=self.mailrule_id)
if self._mailrule_id:
rule = MailRule.objects.get(pk=self._mailrule_id)
self._archive_path = self.generate_pdf(mail, rule.pdf_layout)
else:
self._archive_path = self.generate_pdf(mail)

View File

@@ -27,6 +27,7 @@ if TYPE_CHECKING:
from types import TracebackType
from paperless.parsers import MetadataEntry
from paperless.parsers import ParserContext
logger = logging.getLogger("paperless.parsing.text")
@@ -156,6 +157,9 @@ class TextDocumentParser:
# Core parsing interface
# ------------------------------------------------------------------
def configure(self, context: ParserContext) -> None:
    """Accept the parser context without using it.

    The text parser reads nothing from *context*; the method exists so
    the consumer can call ``configure()`` before ``parse()``.
    """
def parse(
self,
document_path: Path,

View File

@@ -35,6 +35,7 @@ if TYPE_CHECKING:
from types import TracebackType
from paperless.parsers import MetadataEntry
from paperless.parsers import ParserContext
logger = logging.getLogger("paperless.parsing.tika")
@@ -205,6 +206,9 @@ class TikaDocumentParser:
# Core parsing interface
# ------------------------------------------------------------------
def configure(self, context: ParserContext) -> None:
    """Accept the parser context without using it.

    The Tika parser reads nothing from *context*; the method exists so
    the consumer can call ``configure()`` before ``parse()``.
    """
def parse(
self,
document_path: Path,

View File

@@ -12,6 +12,7 @@ from pytest_httpx import HTTPXMock
from pytest_mock import MockerFixture
from documents.parsers import ParseError
from paperless.parsers import ParserContext
from paperless.parsers.mail import MailDocumentParser
@@ -711,7 +712,7 @@ class TestParser:
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
mail_parser.mailrule_id = 1
mail_parser.configure(ParserContext(mailrule_id=1))
mail_parser.parse(
document_path=html_email_file,
mime_type="message/rfc822",