From 8c404910341cc105d6440d5a5311de96c2f76adb Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Mon, 9 Mar 2026 14:09:32 -0700
Subject: [PATCH] Refactor: Clean up ParserProtocol docstrings and drop
 file_name parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove all Sphinx cross-reference markup (:meth:, :class:, :func:,
  :attr:, :data:, backtick quoting) from registry.py and __init__.py
  docstrings; use plain prose matching the rest of the codebase
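- Remove unused file_name parameter from parse() and get_thumbnail()
  in ParserProtocol — no existing parser reads it and the path already
  carries the filename
- Reword the can_produce_archive and requires_pdf_rendition docstrings
  to describe how they interact with the ARCHIVE_FILE_GENERATION
  setting; requires_pdf_rendition is now documented as "the parser must
  produce a PDF for the frontend to display" instead of "the pipeline
  converts the source to PDF before calling parse()"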

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/paperless/parsers/__init__.py | 154 +++++++++++++-----------------
 src/paperless/parsers/registry.py | 122 +++++++++++------------
 2 files changed, 123 insertions(+), 153 deletions(-)

diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py
index f70fb771e..dfe88e452 100644
--- a/src/paperless/parsers/__init__.py
+++ b/src/paperless/parsers/__init__.py
@@ -4,15 +4,14 @@ paperless.parsers
 
 Public interface for the Paperless-ngx parser plugin system.
 
-This module defines :class:`ParserProtocol` — the structural contract that
-every document parser must satisfy — whether it is a built-in parser shipped
-with Paperless-ngx or a third-party parser installed via a Python entrypoint.
+This module defines ParserProtocol — the structural contract that every
+document parser must satisfy, whether it is a built-in parser shipped with
+Paperless-ngx or a third-party parser installed via a Python entrypoint.
 
-Phase 1/2 scope
----------------
-Only the Protocol is defined here.  The transitional :class:`DocumentParser`
-ABC (Phase 3) and concrete built-in parsers (Phase 3+) will be added in later
-phases, so there are intentionally no imports of parser implementations here.
+Phase 1/2 scope: only the Protocol is defined here. The transitional
+DocumentParser ABC (Phase 3) and concrete built-in parsers (Phase 3+) will
+be added in later phases, so there are intentionally no imports of parser
+implementations here.
 
 Usage example (third-party parser)::
 
@@ -58,21 +57,18 @@ class ParserProtocol(Protocol):
     """Structural contract for all Paperless-ngx document parsers.
 
     Both built-in parsers and third-party plugins (discovered via the
-    ``paperless_ngx.parsers`` entrypoint group) must satisfy this Protocol.
-    Because it is decorated with :func:`typing.runtime_checkable`,
-    ``isinstance(obj, ParserProtocol)`` works at runtime based on method
-    presence, which is useful for validation in :meth:`ParserRegistry.discover`.
+    "paperless_ngx.parsers" entrypoint group) must satisfy this Protocol.
+    Because it is decorated with runtime_checkable, isinstance(obj,
+    ParserProtocol) works at runtime based on method presence, which is
+    useful for validation in ParserRegistry.discover.
 
-    Class-level identity attributes
-    --------------------------------
-    Parsers are required to expose four string attributes at the **class**
-    level so the registry can log attribution information without
-    instantiating the parser:
+    Parsers must expose four string attributes at the class level so the
+    registry can log attribution information without instantiating the parser:
 
     name : str
-        Human-readable parser name (e.g. ``"Tesseract OCR"``).
+        Human-readable parser name (e.g. "Tesseract OCR").
     version : str
-        Semantic version string (e.g. ``"1.2.3"``).
+        Semantic version string (e.g. "1.2.3").
     author : str
         Author or organisation name.
     url : str
@@ -96,16 +92,16 @@ class ParserProtocol(Protocol):
     def supported_mime_types(cls) -> dict[str, str]:
         """Return a mapping of supported MIME types to preferred file extensions.
 
-        The keys are MIME type strings (e.g. ``"application/pdf"``), and the
-        values are the preferred file extension **including** the leading dot
-        (e.g. ``".pdf"``).  The registry uses this mapping both to decide
-        whether a parser is a candidate for a given file and to determine the
-        default extension when creating archive copies.
+        The keys are MIME type strings (e.g. "application/pdf"), and the
+        values are the preferred file extension including the leading dot
+        (e.g. ".pdf"). The registry uses this mapping both to decide whether
+        a parser is a candidate for a given file and to determine the default
+        extension when creating archive copies.
 
         Returns
         -------
         dict[str, str]
-            ``{mime_type: extension}`` mapping — may be empty if the parser
+            {mime_type: extension} mapping — may be empty if the parser
             has been temporarily disabled.
         """
         ...
@@ -117,15 +113,15 @@ class ParserProtocol(Protocol):
         filename: str,
         path: Path | None = None,
     ) -> int | None:
-        """Return a priority score for handling ``mime_type`` on ``filename``.
+        """Return a priority score for handling this file, or None to decline.
 
-        The registry calls this method after confirming that the MIME type is
-        in :meth:`supported_mime_types`.  Parsers may inspect ``filename``
-        (and optionally the file at ``path``) to refine their confidence level.
+        The registry calls this after confirming that the MIME type is in
+        supported_mime_types. Parsers may inspect filename and optionally
+        the file at path to refine their confidence level.
 
-        A higher score wins.  Return ``None`` to explicitly decline handling
-        a file even though the MIME type is listed as supported (e.g. when the
-        parser detects a feature flag is disabled, or a licence has expired).
+        A higher score wins. Return None to explicitly decline handling a file
+        even though the MIME type is listed as supported (e.g. when a feature
+        flag is disabled, or a required service is not configured).
 
         Parameters
         ----------
@@ -134,15 +130,14 @@ class ParserProtocol(Protocol):
         filename:
             The original filename, including extension.
         path:
-            Optional filesystem path to the file.  Parsers that need to
+            Optional filesystem path to the file. Parsers that need to
             inspect file content (e.g. magic-byte sniffing) may use this.
-            The path may be ``None`` when scoring happens before the file
-            is available locally.
+            May be None when scoring happens before the file is available locally.
 
         Returns
         -------
         int | None
-            Priority score (higher wins), or ``None`` to decline.
+            Priority score (higher wins), or None to decline.
         """
         ...
 
@@ -154,19 +149,20 @@ class ParserProtocol(Protocol):
     def can_produce_archive(self) -> bool:
         """Whether this parser can produce a searchable PDF archive copy.
 
-        If ``True``, the consumption pipeline will request an archive version
-        when the document is processed.  If ``False``, only the thumbnail and
-        text extraction will be performed.
+        If True, the consumption pipeline may request an archive version when
+        processing the document, subject to the ARCHIVE_FILE_GENERATION
+        setting. If False, only thumbnail and text extraction are performed.
         """
         ...
 
     @property
     def requires_pdf_rendition(self) -> bool:
-        """Whether the parser requires a pre-rendered PDF before parsing.
+        """Whether the parser must produce a PDF for the frontend to display.
 
-        Some parsers (e.g. image-based OCR engines) work on rasterised PDFs
-        rather than the original file.  When ``True``, the pipeline will
-        convert the source document to PDF before calling :meth:`parse`.
+        True for formats the browser cannot display natively (e.g. DOCX, ODT).
+        When True, the pipeline always stores the PDF output regardless of the
+        ARCHIVE_FILE_GENERATION setting, since the original format cannot be
+        shown to the user.
         """
         ...
 
@@ -178,14 +174,13 @@ class ParserProtocol(Protocol):
         self,
         document_path: Path,
         mime_type: str,
-        file_name: str | None = None,
         *,
         produce_archive: bool = True,
     ) -> None:
-        """Parse ``document_path`` and populate internal state.
+        """Parse document_path and populate internal state.
 
-        After a successful call, callers retrieve results via
-        :meth:`get_text`, :meth:`get_date`, and :meth:`get_archive_path`.
+        After a successful call, callers retrieve results via get_text,
+        get_date, and get_archive_path.
 
         Parameters
         ----------
@@ -193,21 +188,16 @@ class ParserProtocol(Protocol):
             Absolute path to the document file to parse.
         mime_type:
             Detected MIME type of the document.
-        file_name:
-            Original filename as provided by the user.  May differ from the
-            stem of ``document_path`` (which is usually a UUID-based name).
         produce_archive:
-            When ``True`` (the default) and :attr:`can_produce_archive` is
-            also ``True``, the parser should produce a searchable PDF at the
-            path returned by :meth:`get_archive_path`.  Pass ``False`` when
-            only text extraction and thumbnail generation are required and
-            disk I/O should be minimised.
+            When True (the default) and can_produce_archive is also True,
+            the parser should produce a searchable PDF at the path returned
+            by get_archive_path. Pass False when only text extraction and
+            thumbnail generation are required and disk I/O should be minimised.
 
         Raises
         ------
         documents.parsers.ParseError
-            If parsing fails for any reason.  The consumption pipeline will
-            catch this and handle failure appropriately.
+            If parsing fails for any reason.
         """
         ...
 
@@ -216,35 +206,34 @@ class ParserProtocol(Protocol):
     # ------------------------------------------------------------------
 
     def get_text(self) -> str | None:
-        """Return the plain-text content extracted during :meth:`parse`.
+        """Return the plain-text content extracted during parse.
 
         Returns
         -------
         str | None
-            Extracted text, or ``None`` if no text could be found.
+            Extracted text, or None if no text could be found.
         """
         ...
 
     def get_date(self) -> datetime.datetime | None:
-        """Return the document date detected during :meth:`parse`.
+        """Return the document date detected during parse.
 
         Returns
         -------
         datetime.datetime | None
-            Detected document date, or ``None`` if no date was found.
+            Detected document date, or None if no date was found.
         """
         ...
 
     def get_archive_path(self) -> Path | None:
-        """Return the path to the generated archive PDF (if any).
+        """Return the path to the generated archive PDF, or None.
 
         Returns
         -------
         Path | None
-            Path to the searchable PDF archive, or ``None`` if no archive
-            was produced (e.g. because ``produce_archive=False`` was passed
-            to :meth:`parse`, or the parser does not support archive
-            production).
+            Path to the searchable PDF archive, or None if no archive was
+            produced (e.g. because produce_archive=False or the parser does
+            not support archive generation).
         """
         ...
 
@@ -252,17 +241,12 @@ class ParserProtocol(Protocol):
     # Thumbnail and metadata
     # ------------------------------------------------------------------
 
-    def get_thumbnail(
-        self,
-        document_path: Path,
-        mime_type: str,
-        file_name: str | None = None,
-    ) -> Path:
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
         """Generate and return the path to a thumbnail image for the document.
 
-        Unlike :meth:`parse`, this method may be called independently of
-        :meth:`parse`.  The returned path must point to an existing WebP image
-        file inside the parser's temporary working directory.
+        May be called independently of parse. The returned path must point to
+        an existing WebP image file inside the parser's temporary working
+        directory.
 
         Parameters
         ----------
@@ -270,8 +254,6 @@ class ParserProtocol(Protocol):
             Absolute path to the source document.
         mime_type:
             Detected MIME type of the document.
-        file_name:
-            Original filename.
 
         Returns
         -------
@@ -297,7 +279,7 @@ class ParserProtocol(Protocol):
         Returns
         -------
         int | None
-            Page count, or ``None`` if the parser cannot determine it.
+            Page count, or None if the parser cannot determine it.
         """
         ...
 
@@ -308,8 +290,8 @@ class ParserProtocol(Protocol):
     def __enter__(self) -> Self:
         """Enter the parser context, returning the parser instance.
 
-        Implementations should perform any resource allocation (e.g. creating
-        a temporary working directory) here if not done in ``__init__``.
+        Implementations should perform any resource allocation here if not
+        done in __init__ (e.g. creating API clients or temp directories).
 
         Returns
         -------
@@ -324,18 +306,18 @@ class ParserProtocol(Protocol):
         exc_val: BaseException | None,
         exc_tb: object,
     ) -> None:
-        """Exit the parser context and release resources.
+        """Exit the parser context and release all resources.
 
-        Implementations must clean up all temporary files and other
-        resources regardless of whether an exception occurred.
+        Implementations must clean up all temporary files and other resources
+        regardless of whether an exception occurred.
 
         Parameters
         ----------
         exc_type:
-            The exception class, or ``None`` if no exception was raised.
+            The exception class, or None if no exception was raised.
         exc_val:
-            The exception instance, or ``None``.
+            The exception instance, or None.
         exc_tb:
-            The traceback, or ``None``.
+            The traceback, or None.
         """
         ...
diff --git a/src/paperless/parsers/registry.py b/src/paperless/parsers/registry.py
index 3b2502c75..6c46fd7d1 100644
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -8,30 +8,29 @@ plugins installed via Python entrypoints.
 
 Public surface
 --------------
-:func:`get_parser_registry`
-    Lazy-initialise and return the shared :class:`ParserRegistry`.  This is
-    the primary entry point for production code.
+get_parser_registry
+    Lazy-initialise and return the shared ParserRegistry. This is the primary
+    entry point for production code.
 
-:func:`init_builtin_parsers`
-    Register built-in parsers only, without entrypoint discovery.  Safe to
-    call from Celery ``worker_process_init`` where importing all entrypoints
+init_builtin_parsers
+    Register built-in parsers only, without entrypoint discovery. Safe to
+    call from Celery worker_process_init where importing all entrypoints
     would be wasteful or cause side effects.
 
-:func:`reset_parser_registry`
-    Reset module-level state.  **For tests only.**
+reset_parser_registry
+    Reset module-level state. For tests only.
 
 Entrypoint group
 ----------------
 Third-party parsers must advertise themselves under the
-``paperless_ngx.parsers`` entrypoint group in their ``pyproject.toml``::
+"paperless_ngx.parsers" entrypoint group in their pyproject.toml::
 
     [project.entry-points."paperless_ngx.parsers"]
     my_parser = "my_package.parsers:MyParser"
 
-The loaded class must expose the following attributes *at the class level*
+The loaded class must expose the following attributes at the class level
 (not just on instances) for the registry to accept it:
-``name``, ``version``, ``author``, ``url``,
-``supported_mime_types`` (callable), ``score`` (callable).
+name, version, author, url, supported_mime_types (callable), score (callable).
 """
 
 from __future__ import annotations
@@ -69,16 +68,14 @@ _REQUIRED_ATTRS: tuple[str, ...] = (
 
 
 def get_parser_registry() -> ParserRegistry:
-    """Return the shared :class:`ParserRegistry` instance.
+    """Return the shared ParserRegistry instance.
 
     On the first call this function:
 
-    1. Creates a new :class:`ParserRegistry`.
-    2. Calls :meth:`~ParserRegistry.register_defaults` to install built-in
-       parsers.
-    3. Calls :meth:`~ParserRegistry.discover` to load third-party plugins via
-       ``importlib.metadata`` entrypoints.
-    4. Calls :meth:`~ParserRegistry.log_summary` to emit a startup summary.
+    1. Creates a new ParserRegistry.
+    2. Calls register_defaults to install built-in parsers.
+    3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
+    4. Calls log_summary to emit a startup summary.
 
     Subsequent calls return the same instance immediately.
 
@@ -104,13 +101,12 @@ def get_parser_registry() -> ParserRegistry:
 def init_builtin_parsers() -> None:
     """Register built-in parsers without performing entrypoint discovery.
 
-    This function is intended for use in Celery ``worker_process_init``
-    handlers and similar contexts where importing all installed entrypoints
-    would be wasteful, slow, or could produce undesirable side effects.
+    Intended for use in Celery worker_process_init handlers where importing
+    all installed entrypoints would be wasteful, slow, or could produce
+    undesirable side effects. Entrypoint discovery (third-party plugins) is
+    deliberately not performed.
 
-    It is safe to call this function multiple times; subsequent calls are
-    no-ops.  Entrypoint discovery (i.e. third-party plugins) is deliberately
-    **not** performed.
+    Safe to call multiple times — subsequent calls are no-ops.
 
     Returns
     -------
@@ -126,15 +122,13 @@ def init_builtin_parsers() -> None:
 def reset_parser_registry() -> None:
     """Reset the module-level registry state to its initial values.
 
-    This resets both :data:`_registry` and :data:`_discovery_complete` so
-    that the next call to :func:`get_parser_registry` will re-initialise
-    everything from scratch.
+    Resets _registry and _discovery_complete so the next call to
+    get_parser_registry will re-initialise everything from scratch.
 
-    .. warning::
-        **FOR TESTS ONLY.**  Do not call this function in production code.
-        Resetting the registry mid-request will cause all subsequent parser
-        lookups to go through discovery again, which is expensive and may
-        have unexpected side effects in multi-threaded environments.
+    FOR TESTS ONLY. Do not call this in production code — resetting the
+    registry mid-request causes all subsequent parser lookups to go through
+    discovery again, which is expensive and may have unexpected side effects
+    in multi-threaded environments.
 
     Returns
     -------
@@ -156,19 +150,18 @@ class ParserRegistry:
 
     Parsers are partitioned into two lists:
 
-    ``_builtins``
-        Parser classes registered via :meth:`register_builtin` (populated by
-        :meth:`register_defaults` in Phase 3+).
+    _builtins
+        Parser classes registered via register_builtin (populated by
+        register_defaults in Phase 3+).
 
-    ``_external``
-        Parser classes loaded from installed Python entrypoints via
-        :meth:`discover`.
+    _external
+        Parser classes loaded from installed Python entrypoints via discover.
 
     When resolving a parser for a file, external parsers are evaluated
-    alongside built-in parsers using a uniform scoring mechanism.  Both lists
-    are iterated together; the class with the highest :meth:`~ParserProtocol.score`
-    wins.  If an external parser wins, its attribution details are logged so
-    users can identify which third-party package handled their document.
+    alongside built-in parsers using a uniform scoring mechanism. Both lists
+    are iterated together; the class with the highest score wins. If an
+    external parser wins, its attribution details are logged so users can
+    identify which third-party package handled their document.
     """
 
     def __init__(self) -> None:
@@ -183,14 +176,13 @@ class ParserRegistry:
         """Register a built-in parser class.
 
         Built-in parsers are shipped with Paperless-ngx and are appended to
-        the ``_builtins`` list.  They are never overridden by external parsers;
+        the _builtins list. They are never overridden by external parsers;
         instead, scoring determines which parser wins for any given file.
 
         Parameters
         ----------
         parser_class:
-            The parser class to register.  Must satisfy
-            :class:`~paperless.parsers.ParserProtocol`.
+            The parser class to register. Must satisfy ParserProtocol.
         """
         self._builtins.append(parser_class)
 
@@ -208,19 +200,18 @@ class ParserRegistry:
     # ------------------------------------------------------------------
 
     def discover(self) -> None:
-        """Load third-party parsers from the ``paperless_ngx.parsers`` entrypoint group.
+        """Load third-party parsers from the "paperless_ngx.parsers" entrypoint group.
 
         For each advertised entrypoint the method:
 
-        1. Calls ``ep.load()`` to import the class.
+        1. Calls ep.load() to import the class.
         2. Validates that the class exposes all required attributes.
-        3. On success, appends the class to :attr:`_external` and logs an
-           info message.
-        4. On failure (import error or missing attributes), logs an
-           appropriate warning/error and continues to the next entrypoint.
+        3. On success, appends the class to _external and logs an info message.
+        4. On failure (import error or missing attributes), logs an appropriate
+           warning/error and continues to the next entrypoint.
 
-        Errors during discovery of a single parser do not prevent other
-        parsers from being loaded.
+        Errors during discovery of a single parser do not prevent other parsers
+        from being loaded.
 
         Returns
         -------
@@ -313,22 +304,20 @@ class ParserRegistry:
         filename: str,
         path: Path | None = None,
     ) -> type | None:
-        """Return the best parser class for the given file, or ``None``.
+        """Return the best parser class for the given file, or None.
 
         All registered parsers (external first, then built-ins) are evaluated
-        against the file.  A parser is eligible if:
+        against the file. A parser is eligible if mime_type appears in the dict
+        returned by its supported_mime_types classmethod, and its score
+        classmethod returns a non-None integer.
 
-        * ``mime_type`` appears in the dict returned by its
-          ``supported_mime_types()`` classmethod, **and**
-        * its ``score()`` classmethod returns a non-``None`` integer.
-
-        The parser with the highest score wins.  When two parsers return the
+        The parser with the highest score wins. When two parsers return the
         same score, the one that appears earlier in the evaluation order wins
         (external parsers are evaluated before built-ins, giving third-party
         packages a chance to override defaults at equal priority).
 
-        When an external parser is selected, its identity is logged at
-        ``INFO`` level so operators can trace which package handled a document.
+        When an external parser is selected, its identity is logged at INFO
+        level so operators can trace which package handled a document.
 
         Parameters
         ----------
@@ -337,14 +326,13 @@ class ParserRegistry:
         filename:
             The original filename, including extension.
         path:
-            Optional filesystem path to the file.  Forwarded to each
-            parser's ``score()`` method.
+            Optional filesystem path to the file. Forwarded to each
+            parser's score method.
 
         Returns
         -------
         type | None
-            The winning parser class, or ``None`` if no parser can handle
-            the file.
+            The winning parser class, or None if no parser can handle the file.
         """
         best_score: int | None = None
         best_parser: type | None = None