From 8c404910341cc105d6440d5a5311de96c2f76adb Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:09:32 -0700 Subject: [PATCH] Refactor: Clean up ParserProtocol docstrings and drop file_name parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove all Sphinx cross-reference markup (:meth:, :class:, :func:, :attr:, :data:, backtick quoting) from registry.py and __init__.py docstrings; use plain prose matching the rest of the codebase - Remove unused file_name parameter from parse() and get_thumbnail() in ParserProtocol — no existing parser reads it and the path already carries the filename Co-Authored-By: Claude Sonnet 4.6 --- src/paperless/parsers/__init__.py | 154 +++++++++++++----------------- src/paperless/parsers/registry.py | 122 +++++++++++------------ 2 files changed, 123 insertions(+), 153 deletions(-) diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py index f70fb771e..dfe88e452 100644 --- a/src/paperless/parsers/__init__.py +++ b/src/paperless/parsers/__init__.py @@ -4,15 +4,14 @@ paperless.parsers Public interface for the Paperless-ngx parser plugin system. -This module defines :class:`ParserProtocol` — the structural contract that -every document parser must satisfy — whether it is a built-in parser shipped -with Paperless-ngx or a third-party parser installed via a Python entrypoint. +This module defines ParserProtocol — the structural contract that every +document parser must satisfy, whether it is a built-in parser shipped with +Paperless-ngx or a third-party parser installed via a Python entrypoint. -Phase 1/2 scope ---------------- -Only the Protocol is defined here. The transitional :class:`DocumentParser` -ABC (Phase 3) and concrete built-in parsers (Phase 3+) will be added in later -phases, so there are intentionally no imports of parser implementations here. +Phase 1/2 scope: only the Protocol is defined here. The transitional +DocumentParser ABC (Phase 3) and concrete built-in parsers (Phase 3+) will +be added in later phases, so there are intentionally no imports of parser +implementations here. Usage example (third-party parser):: @@ -58,21 +57,18 @@ class ParserProtocol(Protocol): """Structural contract for all Paperless-ngx document parsers. Both built-in parsers and third-party plugins (discovered via the - ``paperless_ngx.parsers`` entrypoint group) must satisfy this Protocol. - Because it is decorated with :func:`typing.runtime_checkable`, - ``isinstance(obj, ParserProtocol)`` works at runtime based on method - presence, which is useful for validation in :meth:`ParserRegistry.discover`. + "paperless_ngx.parsers" entrypoint group) must satisfy this Protocol. + Because it is decorated with runtime_checkable, isinstance(obj, + ParserProtocol) works at runtime based on method presence, which is + useful for validation in ParserRegistry.discover. - Class-level identity attributes - -------------------------------- - Parsers are required to expose four string attributes at the **class** - level so the registry can log attribution information without - instantiating the parser: + Parsers must expose four string attributes at the class level so the + registry can log attribution information without instantiating the parser: name : str - Human-readable parser name (e.g. ``"Tesseract OCR"``). + Human-readable parser name (e.g. "Tesseract OCR"). version : str - Semantic version string (e.g. ``"1.2.3"``). + Semantic version string (e.g. "1.2.3"). author : str Author or organisation name. url : str @@ -96,16 +92,16 @@ class ParserProtocol(Protocol): def supported_mime_types(cls) -> dict[str, str]: """Return a mapping of supported MIME types to preferred file extensions. - The keys are MIME type strings (e.g. ``"application/pdf"``), and the - values are the preferred file extension **including** the leading dot - (e.g. ``".pdf"``). The registry uses this mapping both to decide - whether a parser is a candidate for a given file and to determine the - default extension when creating archive copies. + The keys are MIME type strings (e.g. "application/pdf"), and the + values are the preferred file extension including the leading dot + (e.g. ".pdf"). The registry uses this mapping both to decide whether + a parser is a candidate for a given file and to determine the default + extension when creating archive copies. Returns ------- dict[str, str] - ``{mime_type: extension}`` mapping — may be empty if the parser + {mime_type: extension} mapping — may be empty if the parser has been temporarily disabled. """ ... @@ -117,15 +113,15 @@ class ParserProtocol(Protocol): filename: str, path: Path | None = None, ) -> int | None: - """Return a priority score for handling ``mime_type`` on ``filename``. + """Return a priority score for handling this file, or None to decline. - The registry calls this method after confirming that the MIME type is - in :meth:`supported_mime_types`. Parsers may inspect ``filename`` - (and optionally the file at ``path``) to refine their confidence level. + The registry calls this after confirming that the MIME type is in + supported_mime_types. Parsers may inspect filename and optionally + the file at path to refine their confidence level. - A higher score wins. Return ``None`` to explicitly decline handling - a file even though the MIME type is listed as supported (e.g. when the - parser detects a feature flag is disabled, or a licence has expired). + A higher score wins. Return None to explicitly decline handling a file + even though the MIME type is listed as supported (e.g. when a feature + flag is disabled, or a required service is not configured). Parameters ---------- @@ -134,15 +130,14 @@ class ParserProtocol(Protocol): filename: The original filename, including extension. path: - Optional filesystem path to the file. Parsers that need to + Optional filesystem path to the file. Parsers that need to inspect file content (e.g. magic-byte sniffing) may use this. - The path may be ``None`` when scoring happens before the file - is available locally. + May be None when scoring happens before the file is available locally. Returns ------- int | None - Priority score (higher wins), or ``None`` to decline. + Priority score (higher wins), or None to decline. """ ... @@ -154,19 +149,20 @@ class ParserProtocol(Protocol): def can_produce_archive(self) -> bool: """Whether this parser can produce a searchable PDF archive copy. - If ``True``, the consumption pipeline will request an archive version - when the document is processed. If ``False``, only the thumbnail and - text extraction will be performed. + If True, the consumption pipeline may request an archive version when + processing the document, subject to the ARCHIVE_FILE_GENERATION + setting. If False, only thumbnail and text extraction are performed. """ ... @property def requires_pdf_rendition(self) -> bool: - """Whether the parser requires a pre-rendered PDF before parsing. + """Whether the parser must produce a PDF for the frontend to display. - Some parsers (e.g. image-based OCR engines) work on rasterised PDFs - rather than the original file. When ``True``, the pipeline will - convert the source document to PDF before calling :meth:`parse`. + True for formats the browser cannot display natively (e.g. DOCX, ODT). + When True, the pipeline always stores the PDF output regardless of the + ARCHIVE_FILE_GENERATION setting, since the original format cannot be + shown to the user. """ ... @@ -178,14 +174,13 @@ class ParserProtocol(Protocol): self, document_path: Path, mime_type: str, - file_name: str | None = None, *, produce_archive: bool = True, ) -> None: - """Parse ``document_path`` and populate internal state. + """Parse document_path and populate internal state. - After a successful call, callers retrieve results via - :meth:`get_text`, :meth:`get_date`, and :meth:`get_archive_path`. + After a successful call, callers retrieve results via get_text, + get_date, and get_archive_path. Parameters ---------- @@ -193,21 +188,16 @@ class ParserProtocol(Protocol): Absolute path to the document file to parse. mime_type: Detected MIME type of the document. - file_name: - Original filename as provided by the user. May differ from the - stem of ``document_path`` (which is usually a UUID-based name). produce_archive: - When ``True`` (the default) and :attr:`can_produce_archive` is - also ``True``, the parser should produce a searchable PDF at the - path returned by :meth:`get_archive_path`. Pass ``False`` when - only text extraction and thumbnail generation are required and - disk I/O should be minimised. + When True (the default) and can_produce_archive is also True, + the parser should produce a searchable PDF at the path returned + by get_archive_path. Pass False when only text extraction and + thumbnail generation are required and disk I/O should be minimised. Raises ------ documents.parsers.ParseError - If parsing fails for any reason. The consumption pipeline will - catch this and handle failure appropriately. + If parsing fails for any reason. """ ... @@ -216,35 +206,34 @@ class ParserProtocol(Protocol): # ------------------------------------------------------------------ def get_text(self) -> str | None: - """Return the plain-text content extracted during :meth:`parse`. + """Return the plain-text content extracted during parse. Returns ------- str | None - Extracted text, or ``None`` if no text could be found. + Extracted text, or None if no text could be found. """ ... def get_date(self) -> datetime.datetime | None: - """Return the document date detected during :meth:`parse`. + """Return the document date detected during parse. Returns ------- datetime.datetime | None - Detected document date, or ``None`` if no date was found. + Detected document date, or None if no date was found. """ ... def get_archive_path(self) -> Path | None: - """Return the path to the generated archive PDF (if any). + """Return the path to the generated archive PDF, or None. Returns ------- Path | None - Path to the searchable PDF archive, or ``None`` if no archive - was produced (e.g. because ``produce_archive=False`` was passed - to :meth:`parse`, or the parser does not support archive - production). + Path to the searchable PDF archive, or None if no archive was + produced (e.g. because produce_archive=False or the parser does + not support archive generation). """ ... @@ -252,17 +241,12 @@ class ParserProtocol(Protocol): # Thumbnail and metadata # ------------------------------------------------------------------ - def get_thumbnail( - self, - document_path: Path, - mime_type: str, - file_name: str | None = None, - ) -> Path: + def get_thumbnail(self, document_path: Path, mime_type: str) -> Path: """Generate and return the path to a thumbnail image for the document. - Unlike :meth:`parse`, this method may be called independently of - :meth:`parse`. The returned path must point to an existing WebP image - file inside the parser's temporary working directory. + May be called independently of parse. The returned path must point to + an existing WebP image file inside the parser's temporary working + directory. Parameters ---------- @@ -270,8 +254,6 @@ class ParserProtocol(Protocol): Absolute path to the source document. mime_type: Detected MIME type of the document. - file_name: - Original filename. Returns ------- @@ -297,7 +279,7 @@ class ParserProtocol(Protocol): Returns ------- int | None - Page count, or ``None`` if the parser cannot determine it. + Page count, or None if the parser cannot determine it. """ ... @@ -308,8 +290,8 @@ class ParserProtocol(Protocol): def __enter__(self) -> Self: """Enter the parser context, returning the parser instance. - Implementations should perform any resource allocation (e.g. creating - a temporary working directory) here if not done in ``__init__``. + Implementations should perform any resource allocation here if not + done in __init__ (e.g. creating API clients or temp directories). Returns ------- @@ -324,18 +306,18 @@ class ParserProtocol(Protocol): exc_val: BaseException | None, exc_tb: object, ) -> None: - """Exit the parser context and release resources. + """Exit the parser context and release all resources. - Implementations must clean up all temporary files and other - resources regardless of whether an exception occurred. + Implementations must clean up all temporary files and other resources + regardless of whether an exception occurred. Parameters ---------- exc_type: - The exception class, or ``None`` if no exception was raised. + The exception class, or None if no exception was raised. exc_val: - The exception instance, or ``None``. + The exception instance, or None. exc_tb: - The traceback, or ``None``. + The traceback, or None. """ ... diff --git a/src/paperless/parsers/registry.py b/src/paperless/parsers/registry.py index 3b2502c75..6c46fd7d1 100644 --- a/src/paperless/parsers/registry.py +++ b/src/paperless/parsers/registry.py @@ -8,30 +8,29 @@ plugins installed via Python entrypoints. Public surface -------------- -:func:`get_parser_registry` - Lazy-initialise and return the shared :class:`ParserRegistry`. This is - the primary entry point for production code. +get_parser_registry + Lazy-initialise and return the shared ParserRegistry. This is the primary + entry point for production code. -:func:`init_builtin_parsers` - Register built-in parsers only, without entrypoint discovery. Safe to - call from Celery ``worker_process_init`` where importing all entrypoints +init_builtin_parsers + Register built-in parsers only, without entrypoint discovery. Safe to + call from Celery worker_process_init where importing all entrypoints would be wasteful or cause side effects. -:func:`reset_parser_registry` - Reset module-level state. **For tests only.** +reset_parser_registry + Reset module-level state. For tests only. Entrypoint group ---------------- Third-party parsers must advertise themselves under the -``paperless_ngx.parsers`` entrypoint group in their ``pyproject.toml``:: +"paperless_ngx.parsers" entrypoint group in their pyproject.toml:: [project.entry-points."paperless_ngx.parsers"] my_parser = "my_package.parsers:MyParser" -The loaded class must expose the following attributes *at the class level* +The loaded class must expose the following attributes at the class level (not just on instances) for the registry to accept it: -``name``, ``version``, ``author``, ``url``, -``supported_mime_types`` (callable), ``score`` (callable). +name, version, author, url, supported_mime_types (callable), score (callable). """ from __future__ import annotations @@ -69,16 +68,14 @@ _REQUIRED_ATTRS: tuple[str, ...] = ( def get_parser_registry() -> ParserRegistry: - """Return the shared :class:`ParserRegistry` instance. + """Return the shared ParserRegistry instance. On the first call this function: - 1. Creates a new :class:`ParserRegistry`. - 2. Calls :meth:`~ParserRegistry.register_defaults` to install built-in - parsers. - 3. Calls :meth:`~ParserRegistry.discover` to load third-party plugins via - ``importlib.metadata`` entrypoints. - 4. Calls :meth:`~ParserRegistry.log_summary` to emit a startup summary. + 1. Creates a new ParserRegistry. + 2. Calls register_defaults to install built-in parsers. + 3. Calls discover to load third-party plugins via importlib.metadata entrypoints. + 4. Calls log_summary to emit a startup summary. Subsequent calls return the same instance immediately. @@ -104,13 +101,12 @@ def get_parser_registry() -> ParserRegistry: def init_builtin_parsers() -> None: """Register built-in parsers without performing entrypoint discovery. - This function is intended for use in Celery ``worker_process_init`` - handlers and similar contexts where importing all installed entrypoints - would be wasteful, slow, or could produce undesirable side effects. + Intended for use in Celery worker_process_init handlers where importing + all installed entrypoints would be wasteful, slow, or could produce + undesirable side effects. Entrypoint discovery (third-party plugins) is + deliberately not performed. - It is safe to call this function multiple times; subsequent calls are - no-ops. Entrypoint discovery (i.e. third-party plugins) is deliberately - **not** performed. + Safe to call multiple times — subsequent calls are no-ops. Returns ------- @@ -126,15 +122,13 @@ def init_builtin_parsers() -> None: def reset_parser_registry() -> None: """Reset the module-level registry state to its initial values. - This resets both :data:`_registry` and :data:`_discovery_complete` so - that the next call to :func:`get_parser_registry` will re-initialise - everything from scratch. + Resets _registry and _discovery_complete so the next call to + get_parser_registry will re-initialise everything from scratch. - .. warning:: - **FOR TESTS ONLY.** Do not call this function in production code. - Resetting the registry mid-request will cause all subsequent parser - lookups to go through discovery again, which is expensive and may - have unexpected side effects in multi-threaded environments. + FOR TESTS ONLY. Do not call this in production code — resetting the + registry mid-request causes all subsequent parser lookups to go through + discovery again, which is expensive and may have unexpected side effects + in multi-threaded environments. Returns ------- @@ -156,19 +150,18 @@ class ParserRegistry: Parsers are partitioned into two lists: - ``_builtins`` - Parser classes registered via :meth:`register_builtin` (populated by - :meth:`register_defaults` in Phase 3+). + _builtins + Parser classes registered via register_builtin (populated by + register_defaults in Phase 3+). - ``_external`` - Parser classes loaded from installed Python entrypoints via - :meth:`discover`. + _external + Parser classes loaded from installed Python entrypoints via discover. When resolving a parser for a file, external parsers are evaluated - alongside built-in parsers using a uniform scoring mechanism. Both lists - are iterated together; the class with the highest :meth:`~ParserProtocol.score` - wins. If an external parser wins, its attribution details are logged so - users can identify which third-party package handled their document. + alongside built-in parsers using a uniform scoring mechanism. Both lists + are iterated together; the class with the highest score wins. If an + external parser wins, its attribution details are logged so users can + identify which third-party package handled their document. """ def __init__(self) -> None: @@ -183,14 +176,13 @@ class ParserRegistry: """Register a built-in parser class. Built-in parsers are shipped with Paperless-ngx and are appended to - the ``_builtins`` list. They are never overridden by external parsers; + the _builtins list. They are never overridden by external parsers; instead, scoring determines which parser wins for any given file. Parameters ---------- parser_class: - The parser class to register. Must satisfy - :class:`~paperless.parsers.ParserProtocol`. + The parser class to register. Must satisfy ParserProtocol. """ self._builtins.append(parser_class) @@ -208,19 +200,18 @@ class ParserRegistry: # ------------------------------------------------------------------ def discover(self) -> None: - """Load third-party parsers from the ``paperless_ngx.parsers`` entrypoint group. + """Load third-party parsers from the "paperless_ngx.parsers" entrypoint group. For each advertised entrypoint the method: - 1. Calls ``ep.load()`` to import the class. + 1. Calls ep.load() to import the class. 2. Validates that the class exposes all required attributes. - 3. On success, appends the class to :attr:`_external` and logs an - info message. - 4. On failure (import error or missing attributes), logs an - appropriate warning/error and continues to the next entrypoint. + 3. On success, appends the class to _external and logs an info message. + 4. On failure (import error or missing attributes), logs an appropriate + warning/error and continues to the next entrypoint. - Errors during discovery of a single parser do not prevent other - parsers from being loaded. + Errors during discovery of a single parser do not prevent other parsers + from being loaded. Returns ------- @@ -313,22 +304,20 @@ class ParserRegistry: filename: str, path: Path | None = None, ) -> type | None: - """Return the best parser class for the given file, or ``None``. + """Return the best parser class for the given file, or None. All registered parsers (external first, then built-ins) are evaluated - against the file. A parser is eligible if: + against the file. A parser is eligible if mime_type appears in the dict + returned by its supported_mime_types classmethod, and its score + classmethod returns a non-None integer. - * ``mime_type`` appears in the dict returned by its - ``supported_mime_types()`` classmethod, **and** - * its ``score()`` classmethod returns a non-``None`` integer. - - The parser with the highest score wins. When two parsers return the + The parser with the highest score wins. When two parsers return the same score, the one that appears earlier in the evaluation order wins (external parsers are evaluated before built-ins, giving third-party packages a chance to override defaults at equal priority). - When an external parser is selected, its identity is logged at - ``INFO`` level so operators can trace which package handled a document. + When an external parser is selected, its identity is logged at INFO + level so operators can trace which package handled a document. Parameters ---------- @@ -337,14 +326,13 @@ class ParserRegistry: filename: The original filename, including extension. path: - Optional filesystem path to the file. Forwarded to each - parser's ``score()`` method. + Optional filesystem path to the file. Forwarded to each + parser's score method. Returns ------- type | None - The winning parser class, or ``None`` if no parser can handle - the file. + The winning parser class, or None if no parser can handle the file. """ best_score: int | None = None best_parser: type | None = None