Renames so it aligns better in the browser view

Fix: require context manager for TikaDocumentParser; clean up client lifecycle
- consumer.py: call __enter__ for new-style parsers so _tika_client and _gotenberg_client are set before parse() is invoked
- views.py: use `with parser` (via nullcontext for old-style parsers) in get_metadata so extract_metadata always runs inside a context manager
- tika.py: GotenbergClient added to ExitStack alongside TikaClient; inline client creation removed from extract_metadata and _convert_to_pdf; __exit__ uses ExitStack.close() instead of __exit__ pass-through

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-13 12:41:23 +00:00 · 2026-03-12 19:38:16 -07:00 · 2026-03-12 19:29:37 -07:00 · 2026-03-12 15:46:26 -07:00 · 2026-03-12 15:38:25 -07:00 · 2026-03-12 15:30:59 -07:00
30 changed files with 2841 additions and 366 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -51,11 +51,29 @@ from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
 from paperless_mail.parsers import MailDocumentParser

 LOGGING_NAME: Final[str] = "paperless.consumer"


+def _parser_cleanup(parser: DocumentParser) -> None:
+    """
+    Call cleanup on a parser, handling the new-style context-manager parsers.
+
+    New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
+    instead of a cleanup() method.  This shim will be removed once all existing parsers
+    have switched to the new style and this consumer is updated to use it.
+
+    TODO(stumpylog): Remove me in the future
+    """
+    if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
+        parser.__exit__(None, None, None)
+    else:
+        parser.cleanup()
+
+
 class WorkflowTriggerPlugin(
    NoCleanupPluginMixin,
    NoSetupPluginMixin,
@@ -431,6 +449,12 @@ class ConsumerPlugin(
            progress_callback=progress_callback,
        )

+        # New-style parsers use __enter__/__exit__ for resource management.
+        # _parser_cleanup() calls the matching __exit__; call __enter__ here.
+        # TODO(stumpylog): Remove me in the future
+        if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+            document_parser.__enter__()
+
        self.log.debug(f"Parser: {type(document_parser).__name__}")

        # Parse the document. This may take some time.
@@ -459,6 +483,9 @@ class ConsumerPlugin(
                    self.filename,
                    self.input_doc.mailrule_id,
                )
+            elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+                # TODO(stumpylog): Remove me in the future
+                document_parser.parse(self.working_copy, mime_type)
            else:
                document_parser.parse(self.working_copy, mime_type, self.filename)

@@ -469,11 +496,15 @@ class ConsumerPlugin(
                ProgressStatusOptions.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
-            thumbnail = document_parser.get_thumbnail(
-                self.working_copy,
-                mime_type,
-                self.filename,
-            )
+            if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
+                # TODO(stumpylog): Remove me in the future
+                thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
+            else:
+                thumbnail = document_parser.get_thumbnail(
+                    self.working_copy,
+                    mime_type,
+                    self.filename,
+                )

            text = document_parser.get_text()
            date = document_parser.get_date()
@@ -490,7 +521,7 @@ class ConsumerPlugin(
            page_count = document_parser.get_page_count(self.working_copy, mime_type)

        except ParseError as e:
-            document_parser.cleanup()
+            _parser_cleanup(document_parser)
            if tempdir:
                tempdir.cleanup()
            self._fail(
@@ -500,7 +531,7 @@ class ConsumerPlugin(
                exception=e,
            )
        except Exception as e:
-            document_parser.cleanup()
+            _parser_cleanup(document_parser)
            if tempdir:
                tempdir.cleanup()
            self._fail(
@@ -702,7 +733,7 @@ class ConsumerPlugin(
                exception=e,
            )
        finally:
-            document_parser.cleanup()
+            _parser_cleanup(document_parser)
            tempdir.cleanup()

        self.run_post_consume_script(document)
--- a/src/documents/management/commands/document_thumbnails.py
+++ b/src/documents/management/commands/document_thumbnails.py
@@ -30,6 +30,7 @@ def _process_document(doc_id: int) -> None:
        )
        shutil.move(thumb, document.thumbnail_path)
    finally:
+        # TODO(stumpylog): Cleanup once all parsers are handled
        parser.cleanup()


--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -399,6 +399,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
            f"Error while parsing document {document} (ID: {document_id})",
        )
    finally:
+        # TODO(stumpylog): Cleanup once all parsers are handled
        parser.cleanup()


--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -9,9 +9,9 @@ from documents.parsers import get_default_file_extension
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
+from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
 from paperless_tesseract.parsers import RasterisedDocumentParser
-from paperless_text.parsers import TextDocumentParser
-from paperless_tika.parsers import TikaDocumentParser


 class TestParserDiscovery(TestCase):
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -7,6 +7,7 @@ import tempfile
 import zipfile
 from collections import defaultdict
 from collections import deque
+from contextlib import nullcontext
 from datetime import datetime
 from pathlib import Path
 from time import mktime
@@ -219,6 +220,7 @@ from paperless.celery import app as celery_app
 from paperless.config import AIConfig
 from paperless.config import GeneralConfig
 from paperless.models import ApplicationConfiguration
+from paperless.parsers import ParserProtocol
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import UserSerializer
 from paperless.views import StandardPagination
@@ -1078,9 +1080,11 @@ class DocumentViewSet(
        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(progress_callback=None, logging_group=None)
+            cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)

            try:
-                return parser.extract_metadata(file, mime_type)
+                with cm:
+                    return parser.extract_metadata(file, mime_type)
            except Exception:  # pragma: no cover
                logger.exception(f"Issue getting metadata for {file}")
                # TODO: cover GPG errors, remove later.
--- a/src/paperless/celery.py
+++ b/src/paperless/celery.py
@@ -1,6 +1,7 @@
 import os

 from celery import Celery
+from celery.signals import worker_process_init

 # Set the default Django settings module for the 'celery' program.
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
@@ -15,3 +16,19 @@ app.config_from_object("django.conf:settings", namespace="CELERY")

 # Load task modules from all registered Django apps.
 app.autodiscover_tasks()
+
+
+@worker_process_init.connect
+def on_worker_process_init(**kwargs) -> None:  # pragma: no cover
+    """
+    Register built-in parsers eagerly in each Celery worker process.
+
+    This registers only the built-in parsers (no entrypoint discovery) so
+    that workers can begin consuming documents immediately.  Entrypoint
+    discovery for third-party parsers is deferred to the first call of
+    get_parser_registry() inside a task, keeping worker_process_init
+    well within its 4-second timeout budget.
+    """
+    from paperless.parsers.registry import init_builtin_parsers
+
+    init_builtin_parsers()
--- a/src/paperless/parsers/__init__.py
+++ b/src/paperless/parsers/__init__.py
@@ -0,0 +1,379 @@
+"""
+Public interface for the Paperless-ngx parser plugin system.
+
+This module defines ParserProtocol — the structural contract that every
+document parser must satisfy, whether it is a built-in parser shipped with
+Paperless-ngx or a third-party parser installed via a Python entrypoint.
+
+Phase 1/2 scope: only the Protocol is defined here. The transitional
+DocumentParser ABC (Phase 3) and concrete built-in parsers (Phase 3+) will
+be added in later phases, so there are intentionally no imports of parser
+implementations here.
+
+Usage example (third-party parser)::
+
+    from paperless.parsers import ParserProtocol
+
+    class MyParser:
+        name = "my-parser"
+        version = "1.0.0"
+        author = "Acme Corp"
+        url = "https://example.com/my-parser"
+
+        @classmethod
+        def supported_mime_types(cls) -> dict[str, str]:
+            return {"application/x-my-format": ".myf"}
+
+        @classmethod
+        def score(cls, mime_type, filename, path=None):
+            return 10
+
+        # … implement remaining protocol methods …
+
+    assert isinstance(MyParser(), ParserProtocol)
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Protocol
+from typing import Self
+from typing import TypedDict
+from typing import runtime_checkable
+
+if TYPE_CHECKING:
+    import datetime
+    from pathlib import Path
+    from types import TracebackType
+
+__all__ = [
+    "MetadataEntry",
+    "ParserProtocol",
+]
+
+
+class MetadataEntry(TypedDict):
+    """A single metadata field extracted from a document.
+
+    All four keys are required. Values are always serialised to strings —
+    type-specific conversion (dates, integers, lists) is the responsibility
+    of the parser before returning.
+    """
+
+    namespace: str
+    """URI of the metadata namespace (e.g. 'http://ns.adobe.com/pdf/1.3/')."""
+
+    prefix: str
+    """Conventional namespace prefix (e.g. 'pdf', 'xmp', 'dc')."""
+
+    key: str
+    """Field name within the namespace (e.g. 'Author', 'CreateDate')."""
+
+    value: str
+    """String representation of the field value."""
+
+
+@runtime_checkable
+class ParserProtocol(Protocol):
+    """Structural contract for all Paperless-ngx document parsers.
+
+    Both built-in parsers and third-party plugins (discovered via the
+    "paperless_ngx.parsers" entrypoint group) must satisfy this Protocol.
+    Because it is decorated with runtime_checkable, isinstance(obj,
+    ParserProtocol) works at runtime based on member presence (signatures are
+    not checked), which is useful for validation in ParserRegistry.discover.
+
+    Parsers must expose four string attributes at the class level so the
+    registry can log attribution information without instantiating the parser:
+
+    name : str
+        Human-readable parser name (e.g. "Tesseract OCR").
+    version : str
+        Semantic version string (e.g. "1.2.3").
+    author : str
+        Author or organisation name.
+    url : str
+        URL for documentation, source code, or issue tracker.
+    """
+
+    # ------------------------------------------------------------------
+    # Class-level identity (checked by the registry, not Protocol methods)
+    # ------------------------------------------------------------------
+
+    name: str
+    version: str
+    author: str
+    url: str
+
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return a mapping of supported MIME types to preferred file extensions.
+
+        The keys are MIME type strings (e.g. "application/pdf"), and the
+        values are the preferred file extension including the leading dot
+        (e.g. ".pdf").  The registry uses this mapping both to decide whether
+        a parser is a candidate for a given file and to determine the default
+        extension when creating archive copies.
+
+        Returns
+        -------
+        dict[str, str]
+            {mime_type: extension} mapping — may be empty if the parser
+            has been temporarily disabled.
+        """
+        ...
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return a priority score for handling this file, or None to decline.
+
+        The registry calls this after confirming that the MIME type is in
+        supported_mime_types. Parsers may inspect filename and optionally
+        the file at path to refine their confidence level.
+
+        A higher score wins. Return None to explicitly decline handling a file
+        even though the MIME type is listed as supported (e.g. when a feature
+        flag is disabled, or a required service is not configured).
+
+        Parameters
+        ----------
+        mime_type:
+            The detected MIME type of the file to be parsed.
+        filename:
+            The original filename, including extension.
+        path:
+            Optional filesystem path to the file. Parsers that need to
+            inspect file content (e.g. magic-byte sniffing) may use this.
+            May be None when scoring happens before the file is available locally.
+
+        Returns
+        -------
+        int | None
+            Priority score (higher wins), or None to decline.
+        """
+        ...
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        If True, the consumption pipeline may request an archive version when
+        processing the document, subject to the ARCHIVE_FILE_GENERATION
+        setting. If False, only thumbnail and text extraction are performed.
+        """
+        ...
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        True for formats the browser cannot display natively (e.g. DOCX, ODT).
+        When True, the pipeline always stores the PDF output regardless of the
+        ARCHIVE_FILE_GENERATION setting, since the original format cannot be
+        shown to the user.
+        """
+        ...
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Parse document_path and populate internal state.
+
+        After a successful call, callers retrieve results via get_text,
+        get_date, and get_archive_path.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the document file to parse.
+        mime_type:
+            Detected MIME type of the document.
+        produce_archive:
+            When True (the default) and can_produce_archive is also True,
+            the parser should produce a searchable PDF at the path returned
+            by get_archive_path. Pass False when only text extraction and
+            thumbnail generation are required and disk I/O should be minimised.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If parsing fails for any reason.
+        """
+        ...
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if no text could be found.
+        """
+        ...
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Detected document date, or None if no date was found.
+        """
+        ...
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to the generated archive PDF, or None.
+
+        Returns
+        -------
+        Path | None
+            Path to the searchable PDF archive, or None if no archive was
+            produced (e.g. because produce_archive=False or the parser does
+            not support archive generation).
+        """
+        ...
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Generate and return the path to a thumbnail image for the document.
+
+        May be called independently of parse. The returned path must point to
+        an existing WebP image file inside the parser's temporary working
+        directory.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        Path
+            Path to the generated thumbnail image (WebP format preferred).
+        """
+        ...
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document, if determinable.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        int | None
+            Page count, or None if the parser cannot determine it.
+        """
+        ...
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract format-specific metadata from the document.
+
+        Called by the API view layer on demand — not during the consumption
+        pipeline. Results are returned to the frontend for per-file display.
+
+        For documents with an archive version, this method is called twice:
+        once for the original file (with its native MIME type) and once for
+        the archive file (with ``"application/pdf"``). Parsers that produce
+        archives should handle both cases.
+
+        Implementations must not raise. A failure to read metadata is not
+        fatal — log a warning and return whatever partial results were
+        collected, or ``[]`` if none.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the file to extract metadata from.
+        mime_type:
+            MIME type of the file at ``document_path``. May be
+            ``"application/pdf"`` when called for the archive version.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Zero or more metadata entries. Returns ``[]`` if no metadata
+            could be extracted or the format does not support it.
+        """
+        ...
+
+    # ------------------------------------------------------------------
+    # Context manager
+    # ------------------------------------------------------------------
+
+    def __enter__(self) -> Self:
+        """Enter the parser context, returning the parser instance.
+
+        Implementations should perform any resource allocation here if not
+        done in __init__ (e.g. creating API clients or temp directories).
+
+        Returns
+        -------
+        Self
+            The parser instance itself.
+        """
+        ...
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Exit the parser context and release all resources.
+
+        Implementations must clean up all temporary files and other resources
+        regardless of whether an exception occurred.
+
+        Parameters
+        ----------
+        exc_type:
+            The exception class, or None if no exception was raised.
+        exc_val:
+            The exception instance, or None.
+        exc_tb:
+            The traceback, or None.
+        """
+        ...
--- a/src/paperless/parsers/registry.py
+++ b/src/paperless/parsers/registry.py
@@ -0,0 +1,364 @@
+"""
+Singleton registry that tracks all document parsers available to
+Paperless-ngx — both built-ins shipped with the application and third-party
+plugins installed via Python entrypoints.
+
+Public surface
+--------------
+get_parser_registry
+    Lazy-initialise and return the shared ParserRegistry. This is the primary
+    entry point for production code.
+
+init_builtin_parsers
+    Register built-in parsers only, without entrypoint discovery. Safe to
+    call from Celery worker_process_init where importing all entrypoints
+    would be wasteful or cause side effects.
+
+reset_parser_registry
+    Reset module-level state. For tests only.
+
+Entrypoint group
+----------------
+Third-party parsers must advertise themselves under the
+"paperless_ngx.parsers" entrypoint group in their pyproject.toml::
+
+    [project.entry-points."paperless_ngx.parsers"]
+    my_parser = "my_package.parsers:MyParser"
+
+The loaded class must expose the following attributes at the class level
+(not just on instances) for the registry to accept it:
+name, version, author, url, supported_mime_types (callable), score (callable).
+"""
+
+from __future__ import annotations
+
+import logging
+from importlib.metadata import entry_points
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from paperless.parsers import ParserProtocol
+
+logger = logging.getLogger("paperless.parsers.registry")
+
+# ---------------------------------------------------------------------------
+# Module-level singleton state
+# ---------------------------------------------------------------------------
+
+_registry: ParserRegistry | None = None
+_discovery_complete: bool = False
+
+# Attribute names that every registered external parser class must expose.
+_REQUIRED_ATTRS: tuple[str, ...] = (
+    "name",
+    "version",
+    "author",
+    "url",
+    "supported_mime_types",
+    "score",
+)
+
+
+# ---------------------------------------------------------------------------
+# Module-level accessor functions
+# ---------------------------------------------------------------------------
+
+
+def get_parser_registry() -> ParserRegistry:
+    """Return the shared ParserRegistry instance.
+
+    On the first call this function:
+
+    1. Creates a new ParserRegistry, unless init_builtin_parsers already did.
+    2. Calls register_defaults to install built-in parsers (same condition).
+    3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
+    4. Calls log_summary to emit a startup summary.
+
+    Subsequent calls return the same instance immediately.
+
+    Returns
+    -------
+    ParserRegistry
+        The shared registry singleton.
+    """
+    global _registry, _discovery_complete
+
+    if _registry is None:
+        _registry = ParserRegistry()
+        _registry.register_defaults()
+
+    if not _discovery_complete:
+        _registry.discover()
+        _registry.log_summary()
+        _discovery_complete = True
+
+    return _registry
+
+
+def init_builtin_parsers() -> None:
+    """Register built-in parsers without performing entrypoint discovery.
+
+    Intended for use in Celery worker_process_init handlers where importing
+    all installed entrypoints would be wasteful, slow, or could produce
+    undesirable side effects. Entrypoint discovery (third-party plugins) is
+    deliberately not performed.
+
+    Safe to call multiple times — subsequent calls are no-ops.
+
+    Returns
+    -------
+    None
+    """
+    global _registry
+
+    if _registry is None:
+        _registry = ParserRegistry()
+        _registry.register_defaults()
+
+
+def reset_parser_registry() -> None:
+    """Reset the module-level registry state to its initial values.
+
+    Resets _registry and _discovery_complete so the next call to
+    get_parser_registry will re-initialise everything from scratch.
+
+    FOR TESTS ONLY. Do not call this in production code — resetting the
+    registry mid-request causes all subsequent parser lookups to go through
+    discovery again, which is expensive and may have unexpected side effects
+    in multi-threaded environments.
+
+    Returns
+    -------
+    None
+    """
+    global _registry, _discovery_complete
+
+    _registry = None
+    _discovery_complete = False
+
+
+# ---------------------------------------------------------------------------
+# Registry class
+# ---------------------------------------------------------------------------
+
+
+class ParserRegistry:
+    """Registry that maps MIME types to the best available parser class.
+
+    Parsers are partitioned into two lists:
+
+    _builtins
+        Parser classes registered via register_builtin (populated by
+        register_defaults in Phase 3+).
+
+    _external
+        Parser classes loaded from installed Python entrypoints via discover.
+
+    When resolving a parser for a file, external parsers are evaluated
+    alongside built-in parsers using a uniform scoring mechanism. Both lists
+    are iterated together; the class with the highest score wins. If an
+    external parser wins, its attribution details are logged so users can
+    identify which third-party package handled their document.
+    """
+
+    def __init__(self) -> None:
+        self._external: list[type[ParserProtocol]] = []
+        self._builtins: list[type[ParserProtocol]] = []
+
+    # ------------------------------------------------------------------
+    # Registration
+    # ------------------------------------------------------------------
+
+    def register_builtin(self, parser_class: type[ParserProtocol]) -> None:
+        """Register a built-in parser class.
+
+        Built-in parsers are shipped with Paperless-ngx and are appended to
+        the _builtins list. They are never overridden by external parsers;
+        instead, scoring determines which parser wins for any given file.
+
+        Parameters
+        ----------
+        parser_class:
+            The parser class to register. Must satisfy ParserProtocol.
+        """
+        self._builtins.append(parser_class)
+
+    def register_defaults(self) -> None:
+        """Register the built-in parsers that ship with Paperless-ngx.
+
+        Each parser that has been migrated to the new ParserProtocol interface
+        is registered here.  Parsers are added in ascending weight order so
+        that log output is predictable; scoring determines which parser wins
+        at runtime regardless of registration order.
+        """
+        from paperless.parsers.text import TextDocumentParser
+
+        self.register_builtin(TextDocumentParser)
+
+    # ------------------------------------------------------------------
+    # Discovery
+    # ------------------------------------------------------------------
+
+    def discover(self) -> None:
+        """Load third-party parsers from the "paperless_ngx.parsers" entrypoint group.
+
+        For each advertised entrypoint the method:
+
+        1. Calls ep.load() to import the class.
+        2. Validates that the class exposes all required attributes.
+        3. On success, appends the class to _external and logs an info message.
+        4. On failure (import error or missing attributes), logs an appropriate
+           warning/error and continues to the next entrypoint.
+
+        Errors during discovery of a single parser do not prevent other parsers
+        from being loaded.
+
+        Returns
+        -------
+        None
+        """
+        eps = entry_points(group="paperless_ngx.parsers")
+
+        for ep in eps:
+            try:
+                parser_class = ep.load()
+            except Exception:
+                logger.exception(
+                    "Failed to load parser entrypoint '%s' — skipping.",
+                    ep.name,
+                )
+                continue
+
+            missing = [
+                attr for attr in _REQUIRED_ATTRS if not hasattr(parser_class, attr)
+            ]
+            if missing:
+                logger.warning(
+                    "Parser loaded from entrypoint '%s' is missing required "
+                    "attributes %r — skipping.",
+                    ep.name,
+                    missing,
+                )
+                continue
+
+            self._external.append(parser_class)
+            logger.info(
+                "Loaded third-party parser '%s' v%s by %s (entrypoint: '%s').",
+                parser_class.name,
+                parser_class.version,
+                parser_class.author,
+                ep.name,
+            )
+
+    # ------------------------------------------------------------------
+    # Summary logging
+    # ------------------------------------------------------------------
+
+    def log_summary(self) -> None:
+        """Log a startup summary of all registered parsers.
+
+        Built-in parsers are listed first, followed by any external parsers
+        discovered from entrypoints.  If no external parsers were found a
+        short informational message is logged instead of an empty list.
+
+        Returns
+        -------
+        None
+        """
+        logger.info(
+            "Built-in parsers (%d):",
+            len(self._builtins),
+        )
+        for cls in self._builtins:
+            logger.info(
+                "  [built-in] %s v%s — %s",
+                getattr(cls, "name", repr(cls)),
+                getattr(cls, "version", "unknown"),
+                getattr(cls, "url", "built-in"),
+            )
+
+        if not self._external:
+            logger.info("No third-party parsers discovered.")
+            return
+
+        logger.info(
+            "Third-party parsers (%d):",
+            len(self._external),
+        )
+        for cls in self._external:
+            logger.info(
+                "  [external] %s v%s by %s — report issues at %s",
+                getattr(cls, "name", repr(cls)),
+                getattr(cls, "version", "unknown"),
+                getattr(cls, "author", "unknown"),
+                getattr(cls, "url", "unknown"),
+            )
+
+    # ------------------------------------------------------------------
+    # Parser resolution
+    # ------------------------------------------------------------------
+
+    def get_parser_for_file(
+        self,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> type[ParserProtocol] | None:
+        """Return the best parser class for the given file, or None.
+
+        All registered parsers (external first, then built-ins) are evaluated
+        against the file. A parser is eligible if mime_type appears in the dict
+        returned by its supported_mime_types classmethod, and its score
+        classmethod returns a non-None integer.
+
+        The parser with the highest score wins. When two parsers return the
+        same score, the one that appears earlier in the evaluation order wins
+        (external parsers are evaluated before built-ins, giving third-party
+        packages a chance to override defaults at equal priority).
+
+        When an external parser is selected, its identity is logged at INFO
+        level so operators can trace which package handled a document.
+
+        Parameters
+        ----------
+        mime_type:
+            The detected MIME type of the file.
+        filename:
+            The original filename, including extension.
+        path:
+            Optional filesystem path to the file. Forwarded to each
+            parser's score method.
+
+        Returns
+        -------
+        type[ParserProtocol] | None
+            The winning parser class, or None if no parser can handle the file.
+        """
+        best_score: int | None = None
+        best_parser: type[ParserProtocol] | None = None
+
+        # External parsers are placed first so that, at equal scores, an
+        # external parser wins over a built-in (first-seen policy).
+        for parser_class in (*self._external, *self._builtins):
+            if mime_type not in parser_class.supported_mime_types():
+                continue
+
+            score = parser_class.score(mime_type, filename, path)
+            if score is None:
+                continue
+
+            if best_score is None or score > best_score:
+                best_score = score
+                best_parser = parser_class
+
+        if best_parser is not None and best_parser in self._external:
+            logger.info(
+                "Document handled by third-party parser '%s' v%s — %s",
+                getattr(best_parser, "name", repr(best_parser)),
+                getattr(best_parser, "version", "unknown"),
+                getattr(best_parser, "url", "unknown"),
+            )
+
+        return best_parser
--- a/src/paperless/parsers/text.py
+++ b/src/paperless/parsers/text.py
@@ -0,0 +1,320 @@
+"""
+Built-in plain-text document parser.
+
+Handles text/plain, text/csv, and application/csv MIME types by reading the
+file content directly.  Thumbnails are generated by rendering a page-sized
+WebP image from the first 100,000 characters using Pillow.
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self
+
+from django.conf import settings
+from PIL import Image
+from PIL import ImageDraw
+from PIL import ImageFont
+
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsing.text")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "text/plain": ".txt",
+    "text/csv": ".csv",
+    "application/csv": ".csv",
+}
+
+
+class TextDocumentParser:
+    """Parse plain-text documents (txt, csv) for Paperless-ngx.
+
+    This parser reads the file content directly as UTF-8 text and renders a
+    simple thumbnail using Pillow.  It does not perform OCR and does not
+    produce a searchable PDF archive copy.
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
+    """
+
+    name: str = "Paperless-ngx Text Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"
+
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser handles.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Report how strongly this parser wants to handle the given file.
+
+        Only the MIME type is consulted; ``filename`` and ``path`` are
+        accepted but never inspected.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.  Unused here.
+        path:
+            Optional filesystem path.  Unused here.
+
+        Returns
+        -------
+        int | None
+            10 when ``mime_type`` is one of the supported text types,
+            otherwise None.
+        """
+        return 10 if mime_type in _SUPPORTED_MIME_TYPES else None
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always False — the text parser does not produce a PDF archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always False — plain text files are displayable as-is.
+        """
+        return False
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        """Create a private scratch directory and clear the parse result.
+
+        The scratch root (``settings.SCRATCH_DIR``) is created if it does
+        not exist yet, then a fresh ``paperless-`` prefixed directory is
+        made inside it for this instance.
+
+        Parameters
+        ----------
+        logging_group:
+            Accepted but not used by this constructor.
+        """
+        scratch_root = settings.SCRATCH_DIR
+        scratch_root.mkdir(parents=True, exist_ok=True)
+        workdir = tempfile.mkdtemp(prefix="paperless-", dir=scratch_root)
+        self._tempdir = Path(workdir)
+        self._text: str | None = None
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Delete the parser's temporary directory.
+
+        Removal errors are ignored.  Nothing is returned, so an exception
+        raised inside the ``with`` body is not suppressed.
+        """
+        workdir = self._tempdir
+        logger.debug("Cleaning up temporary directory %s", workdir)
+        shutil.rmtree(workdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Read the document and store its text content.
+
+        The text is kept on the instance and can be retrieved afterwards
+        with ``get_text``.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the text file.
+        mime_type:
+            Detected MIME type of the document.  Not inspected here.
+        produce_archive:
+            Ignored — this parser never produces a PDF archive.
+
+        Raises
+        ------
+        OSError
+            If the file cannot be opened or read.  The error from the
+            underlying read is propagated unchanged; it is not wrapped in
+            ``documents.parsers.ParseError``.
+        """
+        self._text = self._read_text(document_path)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if parse has not been called yet.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Always None — the text parser does not detect dates.
+        """
+        return None
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to a generated archive PDF, or None.
+
+        Returns
+        -------
+        Path | None
+            Always None — the text parser does not produce a PDF archive.
+        """
+        return None
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Draw the beginning of the document onto a WebP thumbnail.
+
+        Up to 100,000 characters are read from the file (undecodable bytes
+        are replaced) and drawn in black on a white 500x700 canvas.  Files
+        larger than 50 MiB are not read; a fixed placeholder message is
+        drawn instead.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.  Not inspected here.
+
+        Returns
+        -------
+        Path
+            Path to ``thumb.webp`` inside this parser's temporary directory.
+        """
+        char_limit = 100_000
+        byte_limit = 50 * 1024 * 1024
+
+        if document_path.stat().st_size <= byte_limit:
+            source = Path(document_path)
+            with source.open("r", encoding="utf-8", errors="replace") as handle:
+                preview = handle.read(char_limit)
+        else:
+            preview = "[File too large to preview]"
+
+        canvas = Image.new("RGB", (500, 700), color="white")
+        pen = ImageDraw.Draw(canvas)
+        thumb_font = ImageFont.truetype(
+            font=settings.THUMBNAIL_FONT_NAME,
+            size=20,
+            layout_engine=ImageFont.Layout.BASIC,
+        )
+        pen.multiline_text((5, 5), preview, font=thumb_font, fill="black", spacing=4)
+
+        target = self._tempdir / "thumb.webp"
+        canvas.save(target, format="WEBP")
+        return target
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        int | None
+            Always None — page count is not meaningful for plain text.
+        """
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract format-specific metadata from the document.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Always ``[]`` — plain text files carry no structured metadata.
+        """
+        return []
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _read_text(self, filepath: Path) -> str:
+        """Read file content, replacing invalid UTF-8 bytes rather than failing.
+
+        A strict UTF-8 read is attempted first.  If that fails, the file is
+        read again with ``errors="replace"`` so undecodable bytes become
+        U+FFFD.  Both attempts go through ``Path.read_text``, so line endings
+        are treated the same way whether or not the file contained bad bytes.
+
+        Parameters
+        ----------
+        filepath:
+            Path to the file to read.
+
+        Returns
+        -------
+        str
+            File content as a string.
+
+        Raises
+        ------
+        OSError
+            If the file cannot be opened or read.
+        """
+        try:
+            return filepath.read_text(encoding="utf-8")
+        except UnicodeDecodeError as exc:
+            logger.warning(
+                "Unicode error reading %s, replacing bad bytes: %s",
+                filepath,
+                exc,
+            )
+            # Stay in text mode for the fallback: decoding raw bytes by hand
+            # would skip the newline translation the strict read applies.
+            return filepath.read_text(encoding="utf-8", errors="replace")
--- a/src/paperless/parsers/tika.py
+++ b/src/paperless/parsers/tika.py
@@ -0,0 +1,440 @@
+"""
+Built-in Tika document parser.
+
+Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
+sending them to an Apache Tika server for text extraction and a Gotenberg
+server for PDF conversion.  Because the source formats cannot be rendered by
+a browser natively, the parser always produces a PDF rendition for display.
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
+from contextlib import ExitStack
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self
+
+import httpx
+from django.conf import settings
+from django.utils import timezone
+from gotenberg_client import GotenbergClient
+from gotenberg_client.options import PdfAFormat
+from tika_client import TikaClient
+
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from paperless.config import OutputTypeConfig
+from paperless.models import OutputTypeChoices
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+    from paperless.parsers import MetadataEntry
+
+logger = logging.getLogger("paperless.parsing.tika")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "application/msword": ".doc",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+    "application/vnd.ms-excel": ".xls",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+    "application/vnd.ms-powerpoint": ".ppt",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
+    "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
+    "application/vnd.oasis.opendocument.presentation": ".odp",
+    "application/vnd.oasis.opendocument.spreadsheet": ".ods",
+    "application/vnd.oasis.opendocument.text": ".odt",
+    "application/vnd.oasis.opendocument.graphics": ".odg",
+    "text/rtf": ".rtf",
+}
+
+
+class TikaDocumentParser:
+    """Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
+
+    Text extraction is handled by the Tika server.  PDF conversion for display
+    is handled by Gotenberg (LibreOffice route).  Because the source formats
+    cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
+    True and the PDF is always produced regardless of the ``produce_archive``
+    flag passed to ``parse``.
+
+    Both ``TikaClient`` and ``GotenbergClient`` are opened once in
+    ``__enter__`` via an ``ExitStack`` and shared across ``parse``,
+    ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
+    ``ExitStack.close()`` in ``__exit__``.  The parser must always be used
+    as a context manager.
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
+    """
+
+    name: str = "Paperless-ngx Tika Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"
+
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser handles.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Report how strongly this parser wants to handle the given file.
+
+        The parser opts out completely (returns ``None``) while the
+        ``TIKA_ENABLED`` setting is falsy, regardless of MIME type.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.  Unused here.
+        path:
+            Optional filesystem path.  Unused here.
+
+        Returns
+        -------
+        int | None
+            10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
+        """
+        if settings.TIKA_ENABLED and mime_type in _SUPPORTED_MIME_TYPES:
+            return 10
+        return None
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always False — Tika produces a display PDF, not an OCR archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always True — Office formats cannot be rendered natively in a
+            browser, so a PDF conversion is always required for display.
+        """
+        return True
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        """Prepare a scratch directory and empty result and client slots.
+
+        No clients are created here; ``_tika_client`` and
+        ``_gotenberg_client`` start out as None.
+
+        Parameters
+        ----------
+        logging_group:
+            Accepted but not used by this constructor.
+        """
+        scratch_root = settings.SCRATCH_DIR
+        scratch_root.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=scratch_root),
+        )
+
+        # Result slots, populated later.
+        self._text: str | None = None
+        self._date: datetime.datetime | None = None
+        self._archive_path: Path | None = None
+
+        # Client bookkeeping.
+        self._exit_stack = ExitStack()
+        self._tika_client: TikaClient | None = None
+        self._gotenberg_client: GotenbergClient | None = None
+
+    def __enter__(self) -> Self:
+        """Open the Tika and Gotenberg clients and return the parser.
+
+        Both clients are registered on the instance's ``ExitStack`` so they
+        can be closed together.  If opening either client fails, whatever
+        was already opened is closed before the error is re-raised, because
+        a ``with`` statement does not call ``__exit__`` when ``__enter__``
+        itself raises.
+
+        Returns
+        -------
+        Self
+            This parser instance, with both client attributes set.
+        """
+        try:
+            self._tika_client = self._exit_stack.enter_context(
+                TikaClient(
+                    tika_url=settings.TIKA_ENDPOINT,
+                    timeout=settings.CELERY_TASK_TIME_LIMIT,
+                ),
+            )
+            self._gotenberg_client = self._exit_stack.enter_context(
+                GotenbergClient(
+                    host=settings.TIKA_GOTENBERG_ENDPOINT,
+                    timeout=settings.CELERY_TASK_TIME_LIMIT,
+                ),
+            )
+        except BaseException:
+            # Unwind a half-built stack so the first client is not leaked
+            # when the second one cannot be created or entered.
+            self._exit_stack.close()
+            self._tika_client = None
+            self._gotenberg_client = None
+            raise
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Close both clients and delete the temporary directory.
+
+        The temporary directory is removed even if closing a client raises;
+        in that case the closing error still propagates afterwards.
+        Nothing is returned, so an exception raised inside the ``with``
+        body is not suppressed.
+        """
+        try:
+            self._exit_stack.close()
+        finally:
+            # Must run regardless of how client shutdown went, otherwise a
+            # failing close would leave the scratch directory behind.
+            logger.debug("Cleaning up temporary directory %s", self._tempdir)
+            shutil.rmtree(self._tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Extract text and date through Tika, then render a PDF via Gotenberg.
+
+        The file is first submitted to Tika as a file upload.  If Tika
+        answers with HTTP 500, the raw bytes are submitted instead.  The
+        extracted text is stripped of surrounding whitespace and a naive
+        creation date is made timezone-aware.  The PDF conversion always
+        runs; ``produce_archive`` is never consulted.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the document file to parse.
+        mime_type:
+            Detected MIME type of the document.
+        produce_archive:
+            Accepted for protocol compatibility but ignored.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If text extraction through Tika fails for any reason.  Errors
+            raised by the PDF conversion step propagate as well.
+        """
+        if TYPE_CHECKING:
+            assert self._tika_client is not None
+
+        logger.info("Sending %s to Tika server", document_path)
+
+        try:
+            try:
+                result = self._tika_client.tika.as_text.from_file(
+                    document_path,
+                    mime_type,
+                )
+            except httpx.HTTPStatusError as http_err:
+                # Workaround https://issues.apache.org/jira/browse/TIKA-4110
+                # Tika fails with some files as multi-part form data
+                status = http_err.response.status_code
+                if status != httpx.codes.INTERNAL_SERVER_ERROR:  # pragma: no cover
+                    raise
+                result = self._tika_client.tika.as_text.from_buffer(
+                    document_path.read_bytes(),
+                    mime_type,
+                )
+        except Exception as failure:
+            raise ParseError(
+                f"Could not parse {document_path} with tika server at "
+                f"{settings.TIKA_ENDPOINT}: {failure}",
+            ) from failure
+
+        content = result.content
+        self._text = None if content is None else content.strip()
+
+        created = result.created
+        if created is not None and timezone.is_naive(created):
+            created = timezone.make_aware(created)
+        self._date = created
+
+        # The source format cannot be shown by a browser, so the PDF is
+        # produced unconditionally.
+        self._archive_path = self._convert_to_pdf(document_path)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if parse has not been called yet.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Creation date from Tika metadata, or None if not detected.
+        """
+        return self._date
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to the generated PDF rendition, or None.
+
+        Returns
+        -------
+        Path | None
+            Path to the PDF produced by Gotenberg, or None if parse has not
+            been called yet.
+        """
+        return self._archive_path
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Produce a thumbnail from the document's PDF rendition.
+
+        Reuses the PDF remembered on the instance when there is one;
+        otherwise the document is converted first and the resulting path is
+        stored for later calls.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.  Not inspected here.
+
+        Returns
+        -------
+        Path
+            The path returned by ``make_thumbnail_from_pdf`` (presumably a
+            WebP file inside the temporary directory).
+        """
+        rendition = self._archive_path
+        if rendition is None:
+            rendition = self._convert_to_pdf(document_path)
+            self._archive_path = rendition
+        return make_thumbnail_from_pdf(rendition, self._tempdir)
+
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document.
+
+        Returns
+        -------
+        int | None
+            Always None — page count is not available from Tika.
+        """
+        return None
+
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Fetch the document's metadata from Tika as a flat list of entries.
+
+        Every key reported by Tika becomes one entry with empty
+        ``namespace`` and ``prefix``.  Any failure is logged as a warning
+        and swallowed.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            All key/value pairs returned by Tika, or ``[]`` on error.
+        """
+        if TYPE_CHECKING:
+            assert self._tika_client is not None
+
+        try:
+            fields = self._tika_client.metadata.from_file(
+                document_path,
+                mime_type,
+            ).data
+            return [
+                {"namespace": "", "prefix": "", "key": name, "value": fields[name]}
+                for name in fields
+            ]
+        except Exception as problem:
+            logger.warning(
+                "Error while fetching document metadata for %s: %s",
+                document_path,
+                problem,
+            )
+        return []
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _convert_to_pdf(self, document_path: Path) -> Path:
+        """Render the document to ``convert.pdf`` through Gotenberg.
+
+        The configured output type selects the PDF/A flavour requested from
+        Gotenberg: PDF/A and PDF/A-2 map to A2b, PDF/A-3 maps to A3b, and
+        PDF/A-1 falls back to A2b with a warning.  Any other output type
+        leaves the format unset.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+
+        Returns
+        -------
+        Path
+            Path to the generated PDF inside the temporary directory.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If running the conversion or writing its result fails.
+        """
+        if TYPE_CHECKING:
+            assert self._gotenberg_client is not None
+
+        pdf_path = self._tempdir / "convert.pdf"
+
+        logger.info("Converting %s to PDF as %s", document_path, pdf_path)
+
+        with self._gotenberg_client.libre_office.to_pdf() as route:
+            # OutputTypeConfig reads the database-stored ApplicationConfiguration
+            # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
+            output_type = OutputTypeConfig().output_type
+
+            if output_type == OutputTypeChoices.PDF_A1:
+                logger.warning(
+                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
+                )
+
+            pdfa_by_output_type = {
+                OutputTypeChoices.PDF_A: PdfAFormat.A2b,
+                OutputTypeChoices.PDF_A2: PdfAFormat.A2b,
+                OutputTypeChoices.PDF_A1: PdfAFormat.A2b,
+                OutputTypeChoices.PDF_A3: PdfAFormat.A3b,
+            }
+            pdfa_format = pdfa_by_output_type.get(output_type)
+            if pdfa_format is not None:
+                route.pdf_format(pdfa_format)
+
+            route.convert(document_path)
+
+            try:
+                pdf_path.write_bytes(route.run().content)
+            except Exception as failure:
+                raise ParseError(
+                    f"Error while converting document to PDF: {failure}",
+                ) from failure
+            return pdf_path
--- a/src/paperless/tests/conftest.py
+++ b/src/paperless/tests/conftest.py
@@ -0,0 +1,48 @@
+"""
+Fixtures defined here are available to every test module under
+src/paperless/tests/ (including sub-packages such as parsers/).
+
+Session-scoped fixtures for the shared samples directory live here so
+sub-package conftest files can reference them without duplicating path logic.
+Parser-specific fixtures (concrete parser instances, format-specific sample
+files) live in paperless/tests/parsers/conftest.py.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+from paperless.parsers.registry import reset_parser_registry
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+@pytest.fixture(scope="session")
+def samples_dir() -> Path:
+    """Resolved path of the ``samples`` directory next to this conftest file.
+
+    Sub-package conftest files derive format-specific paths from this root,
+    e.g. ``samples_dir / "text" / "test.txt"``.
+
+    Returns
+    -------
+    Path
+        Directory containing all sample documents used by parser tests.
+    """
+    tests_root = Path(__file__).parent
+    return (tests_root / "samples").resolve()
+
+
+@pytest.fixture(autouse=True)
+def clean_registry() -> Generator[None, None, None]:
+    """Reset the parser registry before and after every test.
+
+    This prevents registry state from leaking between tests that call
+    get_parser_registry() or init_builtin_parsers().
+    """
+    reset_parser_registry()
+    yield
+    reset_parser_registry()
--- a/src/paperless/tests/parsers/__init__.py
+++ b/src/paperless/tests/parsers/__init__.py
--- a/src/paperless/tests/parsers/conftest.py
+++ b/src/paperless/tests/parsers/conftest.py
@@ -0,0 +1,160 @@
+"""
+Parser fixtures that are used across multiple test modules in this package
+are defined here.  Format-specific sample-file fixtures are grouped by parser
+so it is easy to see which files belong to which test module.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+from paperless.parsers.text import TextDocumentParser
+from paperless.parsers.tika import TikaDocumentParser
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+    from pathlib import Path
+
+
+# ------------------------------------------------------------------
+# Text parser sample files
+# ------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def text_samples_dir(samples_dir: Path) -> Path:
+    """Absolute path to the text parser sample files directory.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/text/``
+    """
+    return samples_dir / "text"
+
+
+@pytest.fixture(scope="session")
+def sample_txt_file(text_samples_dir: Path) -> Path:
+    """Path to a valid UTF-8 plain-text sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``text/test.txt``.
+    """
+    return text_samples_dir / "test.txt"
+
+
+@pytest.fixture(scope="session")
+def malformed_txt_file(text_samples_dir: Path) -> Path:
+    """Path to a text file containing invalid UTF-8 bytes.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``text/decode_error.txt``.
+    """
+    return text_samples_dir / "decode_error.txt"
+
+
+# ------------------------------------------------------------------
+# Text parser instance
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def text_parser() -> Generator[TextDocumentParser, None, None]:
+    """Yield a TextDocumentParser and clean up its temporary directory afterwards.
+
+    Yields
+    ------
+    TextDocumentParser
+        A ready-to-use parser instance.
+    """
+    with TextDocumentParser() as parser:
+        yield parser
+
+
+# ------------------------------------------------------------------
+# Tika parser sample files
+# ------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def tika_samples_dir(samples_dir: Path) -> Path:
+    """Absolute path to the Tika parser sample files directory.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/tika/``
+    """
+    return samples_dir / "tika"
+
+
+@pytest.fixture(scope="session")
+def sample_odt_file(tika_samples_dir: Path) -> Path:
+    """Path to a sample ODT file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/sample.odt``.
+    """
+    return tika_samples_dir / "sample.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_docx_file(tika_samples_dir: Path) -> Path:
+    """Path to a sample DOCX file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/sample.docx``.
+    """
+    return tika_samples_dir / "sample.docx"
+
+
+@pytest.fixture(scope="session")
+def sample_doc_file(tika_samples_dir: Path) -> Path:
+    """Path to a sample DOC file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/sample.doc``.
+    """
+    return tika_samples_dir / "sample.doc"
+
+
+@pytest.fixture(scope="session")
+def sample_broken_odt(tika_samples_dir: Path) -> Path:
+    """Path to a broken ODT file that triggers the multi-part fallback.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``tika/multi-part-broken.odt``.
+    """
+    return tika_samples_dir / "multi-part-broken.odt"
+
+
+# ------------------------------------------------------------------
+# Tika parser instance
+# ------------------------------------------------------------------
+
+
+@pytest.fixture()
+def tika_parser() -> Generator[TikaDocumentParser, None, None]:
+    """Yield a TikaDocumentParser that is inside its ``with`` block.
+
+    Setup and teardown are delegated to the parser's own context-manager
+    methods; teardown runs once the test using the fixture has finished.
+
+    Yields
+    ------
+    TikaDocumentParser
+        A ready-to-use parser instance.
+    """
+    parser = TikaDocumentParser()
+    with parser:
+        yield parser
--- a/src/paperless/tests/parsers/test_text_parser.py
+++ b/src/paperless/tests/parsers/test_text_parser.py
@@ -0,0 +1,256 @@
+"""
+Tests for paperless.parsers.text.TextDocumentParser.
+
+All tests use the context-manager protocol for parser lifecycle.  Sample
+files are provided by session-scoped fixtures defined in conftest.py.
+"""
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from paperless.parsers import ParserProtocol
+from paperless.parsers.text import TextDocumentParser
+
+
+class TestTextParserProtocol:
+    """Verify that TextDocumentParser satisfies the ParserProtocol contract."""
+
+    def test_isinstance_satisfies_protocol(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        assert isinstance(text_parser, ParserProtocol)
+
+    def test_class_attributes_present(self) -> None:
+        assert isinstance(TextDocumentParser.name, str) and TextDocumentParser.name
+        assert (
+            isinstance(TextDocumentParser.version, str) and TextDocumentParser.version
+        )
+        assert isinstance(TextDocumentParser.author, str) and TextDocumentParser.author
+        assert isinstance(TextDocumentParser.url, str) and TextDocumentParser.url
+
+    def test_supported_mime_types_returns_dict(self) -> None:
+        mime_types = TextDocumentParser.supported_mime_types()
+        assert isinstance(mime_types, dict)
+        assert "text/plain" in mime_types
+        assert "text/csv" in mime_types
+        assert "application/csv" in mime_types
+
+    @pytest.mark.parametrize(
+        ("mime_type", "expected"),
+        [
+            ("text/plain", 10),
+            ("text/csv", 10),
+            ("application/csv", 10),
+            ("application/pdf", None),
+            ("image/png", None),
+        ],
+    )
+    def test_score(self, mime_type: str, expected: int | None) -> None:
+        assert TextDocumentParser.score(mime_type, "file.txt") == expected
+
+    def test_can_produce_archive_is_false(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        assert text_parser.can_produce_archive is False
+
+    def test_requires_pdf_rendition_is_false(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        assert text_parser.requires_pdf_rendition is False
+
+
+class TestTextParserLifecycle:
+    """Verify context-manager behaviour and temporary directory cleanup."""
+
+    def test_context_manager_cleans_up_tempdir(self) -> None:
+        with TextDocumentParser() as parser:
+            tempdir = parser._tempdir
+            assert tempdir.exists()
+        assert not tempdir.exists()
+
+    def test_context_manager_cleans_up_after_exception(self) -> None:
+        tempdir: Path | None = None
+        with pytest.raises(RuntimeError):
+            with TextDocumentParser() as parser:
+                tempdir = parser._tempdir
+                raise RuntimeError("boom")
+        assert tempdir is not None
+        assert not tempdir.exists()
+
+
+class TestTextParserParse:
+    """Verify parse() and the result accessors."""
+
+    def test_parse_valid_utf8(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        text_parser.parse(sample_txt_file, "text/plain")
+
+        assert text_parser.get_text() == "This is a test file.\n"
+
+    def test_parse_returns_none_for_archive_path(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        text_parser.parse(sample_txt_file, "text/plain")
+
+        assert text_parser.get_archive_path() is None
+
+    def test_parse_returns_none_for_date(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        text_parser.parse(sample_txt_file, "text/plain")
+
+        assert text_parser.get_date() is None
+
+    def test_parse_invalid_utf8_bytes_replaced(
+        self,
+        text_parser: TextDocumentParser,
+        malformed_txt_file: Path,
+    ) -> None:
+        """
+        GIVEN:
+            - A text file containing invalid UTF-8 byte sequences
+        WHEN:
+            - The file is parsed
+        THEN:
+            - Parsing succeeds
+            - Invalid bytes are replaced with the Unicode replacement character
+        """
+        text_parser.parse(malformed_txt_file, "text/plain")
+
+        assert text_parser.get_text() == "Pantothens\ufffdure\n"
+
+    def test_get_text_none_before_parse(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        assert text_parser.get_text() is None
+
+
+class TestTextParserThumbnail:
+    """Verify thumbnail generation and page-count reporting."""
+
+    def test_thumbnail_exists_and_is_file(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")
+
+        assert thumb.exists()
+        assert thumb.is_file()
+
+    def test_thumbnail_large_file_does_not_read_all(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        """
+        GIVEN:
+            - A text file larger than 50 MB
+        WHEN:
+            - A thumbnail is requested
+        THEN:
+            - A thumbnail file is produced (the partial read itself is not asserted)
+        """
+        with tempfile.NamedTemporaryFile(
+            delete=False,
+            mode="w",
+            encoding="utf-8",
+            suffix=".txt",
+        ) as tmp:
+            tmp.write("A" * (51 * 1024 * 1024))
+            large_file = Path(tmp.name)
+
+        try:
+            thumb = text_parser.get_thumbnail(large_file, "text/plain")
+            assert thumb.exists()
+            assert thumb.is_file()
+        finally:
+            large_file.unlink(missing_ok=True)
+
+    def test_get_page_count_returns_none(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
+
+
+class TestTextParserMetadata:
+    """Verify extract_metadata behaviour."""
+
+    def test_extract_metadata_returns_empty_list(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        result = text_parser.extract_metadata(sample_txt_file, "text/plain")
+
+        assert result == []
+
+    def test_extract_metadata_returns_list_type(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        result = text_parser.extract_metadata(sample_txt_file, "text/plain")
+
+        assert isinstance(result, list)
+
+    def test_extract_metadata_ignores_mime_type(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        """extract_metadata returns [] regardless of the mime_type argument."""
+        assert text_parser.extract_metadata(sample_txt_file, "application/pdf") == []
+        assert text_parser.extract_metadata(sample_txt_file, "text/csv") == []
+
+
+class TestTextParserRegistry:
+    """Verify default registration and registry lookups for TextDocumentParser."""
+
+    def test_registered_in_defaults(self) -> None:
+        from paperless.parsers.registry import ParserRegistry
+
+        registry = ParserRegistry()
+        registry.register_defaults()
+
+        assert TextDocumentParser in registry._builtins
+
+    def test_get_parser_for_text_plain(self) -> None:
+        from paperless.parsers.registry import get_parser_registry
+
+        registry = get_parser_registry()
+        parser_cls = registry.get_parser_for_file("text/plain", "doc.txt")
+
+        assert parser_cls is TextDocumentParser
+
+    def test_get_parser_for_text_csv(self) -> None:
+        from paperless.parsers.registry import get_parser_registry
+
+        registry = get_parser_registry()
+        parser_cls = registry.get_parser_for_file("text/csv", "data.csv")
+
+        assert parser_cls is TextDocumentParser
+
+    def test_get_parser_for_unknown_type_returns_none(self) -> None:
+        from paperless.parsers.registry import get_parser_registry
+
+        registry = get_parser_registry()
+        parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
+
+        assert parser_cls is None
--- a/src/paperless/tests/parsers/test_tika_liva.py
+++ b/src/paperless/tests/parsers/test_tika_liva.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import pytest

 from documents.tests.utils import util_call_with_backoff
-from paperless_tika.parsers import TikaDocumentParser
+from paperless.parsers.tika import TikaDocumentParser


@pytest.mark.skipif(
@@ -42,14 +42,15 @@ class TestTikaParserAgainstServer:
        )

        assert (
-            tika_parser.text
+            tika_parser.get_text()
            == "This is an ODT test document, created September 14, 2022"
        )
-        assert tika_parser.archive_path is not None
-        assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        assert b"PDF-" in archive.read_bytes()[:10]

        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
-        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
+        # assert tika_parser.get_date() == datetime.datetime(2022, 9, 14)

    def test_basic_parse_docx(
        self,
@@ -74,14 +75,15 @@ class TestTikaParserAgainstServer:
        )

        assert (
-            tika_parser.text
+            tika_parser.get_text()
            == "This is an DOCX test document, also made September 14, 2022"
        )
-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]

-        # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
+        # assert tika_parser.get_date() == datetime.datetime(2022, 9, 14)

    def test_basic_parse_doc(
        self,
@@ -102,13 +104,12 @@ class TestTikaParserAgainstServer:
            [sample_doc_file, "application/msword"],
        )

-        assert tika_parser.text is not None
-        assert (
-            "This is a test document, saved in the older .doc format"
-            in tika_parser.text
-        )
-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        text = tika_parser.get_text()
+        assert text is not None
+        assert "This is a test document, saved in the older .doc format" in text
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]

    def test_tika_fails_multi_part(
@@ -133,6 +134,7 @@ class TestTikaParserAgainstServer:
            [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
        )

-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        archive = tika_parser.get_archive_path()
+        assert archive is not None
+        with archive.open("rb") as f:
            assert b"PDF-" in f.read()[:10]
--- a/src/paperless/tests/parsers/test_tika_parser.py
+++ b/src/paperless/tests/parsers/test_tika_parser.py
@@ -9,7 +9,56 @@ from pytest_django.fixtures import SettingsWrapper
 from pytest_httpx import HTTPXMock

 from documents.parsers import ParseError
-from paperless_tika.parsers import TikaDocumentParser
+from paperless.parsers import ParserProtocol
+from paperless.parsers.tika import TikaDocumentParser
+
+
+class TestTikaParserRegistryInterface:
+    """Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
+
+    def test_satisfies_parser_protocol(self) -> None:
+        assert isinstance(TikaDocumentParser(), ParserProtocol)
+
+    def test_supported_mime_types_is_classmethod(self) -> None:
+        mime_types = TikaDocumentParser.supported_mime_types()
+        assert isinstance(mime_types, dict)
+        assert len(mime_types) > 0
+
+    def test_score_returns_none_when_tika_disabled(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = False
+        result = TikaDocumentParser.score(
+            "application/vnd.oasis.opendocument.text",
+            "sample.odt",
+        )
+        assert result is None
+
+    def test_score_returns_int_when_tika_enabled(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = True
+        result = TikaDocumentParser.score(
+            "application/vnd.oasis.opendocument.text",
+            "sample.odt",
+        )
+        assert isinstance(result, int)
+
+    def test_score_returns_none_for_unsupported_mime(
+        self,
+        settings: SettingsWrapper,
+    ) -> None:
+        settings.TIKA_ENABLED = True
+        result = TikaDocumentParser.score("application/pdf", "doc.pdf")
+        assert result is None
+
+    def test_can_produce_archive_is_false(self) -> None:
+        assert TikaDocumentParser().can_produce_archive is False
+
+    def test_requires_pdf_rendition_is_true(self) -> None:
+        assert TikaDocumentParser().requires_pdf_rendition is True


@pytest.mark.django_db()
@@ -36,12 +85,12 @@ class TestTikaParser:

        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")

-        assert tika_parser.text == "the content"
-        assert tika_parser.archive_path is not None
-        with Path(tika_parser.archive_path).open("rb") as f:
+        assert tika_parser.get_text() == "the content"
+        assert tika_parser.get_archive_path() is not None
+        with Path(tika_parser.get_archive_path()).open("rb") as f:
            assert f.read() == b"PDF document"

-        assert tika_parser.date == datetime.datetime(
+        assert tika_parser.get_date() == datetime.datetime(
            2020,
            11,
            21,
@@ -89,7 +138,7 @@ class TestTikaParser:
        httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)

        with pytest.raises(ParseError):
-            tika_parser.convert_to_pdf(sample_odt_file, None)
+            tika_parser._convert_to_pdf(sample_odt_file)

    @pytest.mark.parametrize(
        ("setting_value", "expected_form_value"),
@@ -106,7 +155,6 @@ class TestTikaParser:
        expected_form_value: str,
        httpx_mock: HTTPXMock,
        settings: SettingsWrapper,
-        tika_parser: TikaDocumentParser,
        sample_odt_file: Path,
    ) -> None:
        """
@@ -117,6 +165,8 @@ class TestTikaParser:
        THEN:
            - Request to Gotenberg contains the expected PDF/A format string
        """
+        # Parser must be created after the setting is changed so that
+        # OutputTypeConfig reads the correct value at __init__ time.
        settings.OCR_OUTPUT_TYPE = setting_value
        httpx_mock.add_response(
            status_code=codes.OK,
@@ -124,7 +174,8 @@ class TestTikaParser:
            method="POST",
        )

-        tika_parser.convert_to_pdf(sample_odt_file, None)
+        with TikaDocumentParser() as parser:
+            parser._convert_to_pdf(sample_odt_file)

        request = httpx_mock.get_request()

--- a/src/paperless/tests/samples/text/decode_error.txt
+++ b/src/paperless/tests/samples/text/decode_error.txt
--- a/src/paperless/tests/samples/text/test.txt
+++ b/src/paperless/tests/samples/text/test.txt
--- a/src/paperless/tests/samples/tika/multi-part-broken.odt
+++ b/src/paperless/tests/samples/tika/multi-part-broken.odt
--- a/src/paperless/tests/samples/tika/sample.doc
+++ b/src/paperless/tests/samples/tika/sample.doc
--- a/src/paperless/tests/samples/tika/sample.docx
+++ b/src/paperless/tests/samples/tika/sample.docx
--- a/src/paperless/tests/samples/tika/sample.odt
+++ b/src/paperless/tests/samples/tika/sample.odt
--- a/src/paperless/tests/test_registry.py
+++ b/src/paperless/tests/test_registry.py
@@ -0,0 +1,714 @@
+"""
+Tests for :mod:`paperless.parsers` (ParserProtocol) and
+:mod:`paperless.parsers.registry` (ParserRegistry + module-level helpers).
+
+All tests use pytest-style functions/classes — no unittest.TestCase.
+The ``clean_registry`` fixture ensures complete isolation between tests by
+resetting the module-level singleton before and after every test.
+"""
+
+from __future__ import annotations
+
+import logging
+from importlib.metadata import EntryPoint
+from pathlib import Path
+from typing import Self
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+import pytest
+
+from paperless.parsers import ParserProtocol
+from paperless.parsers.registry import ParserRegistry
+from paperless.parsers.registry import get_parser_registry
+from paperless.parsers.registry import init_builtin_parsers
+from paperless.parsers.registry import reset_parser_registry
+
+
+@pytest.fixture()
+def dummy_parser_cls() -> type:
+    """Return a class that fully satisfies :class:`ParserProtocol`.
+
+    GIVEN: A need to exercise registry and Protocol logic with a minimal
+           but complete parser.
+    WHEN:  A test requests this fixture.
+    THEN:  A class with all required attributes and methods is returned.
+    """
+
+    class DummyParser:
+        name = "dummy-parser"
+        version = "0.1.0"
+        author = "Test Author"
+        url = "https://example.com/dummy-parser"
+
+        @classmethod
+        def supported_mime_types(cls) -> dict[str, str]:
+            return {"text/plain": ".txt"}
+
+        @classmethod
+        def score(
+            cls,
+            mime_type: str,
+            filename: str,
+            path: Path | None = None,
+        ) -> int | None:
+            return 10
+
+        @property
+        def can_produce_archive(self) -> bool:
+            return False
+
+        @property
+        def requires_pdf_rendition(self) -> bool:
+            return False
+
+        def parse(
+            self,
+            document_path: Path,
+            mime_type: str,
+            *,
+            produce_archive: bool = True,
+        ) -> None:
+            """
+            Required to exist, but doesn't need to do anything
+            """
+
+        def get_text(self) -> str | None:
+            return None
+
+        def get_date(self) -> None:
+            return None
+
+        def get_archive_path(self) -> Path | None:
+            return None
+
+        def get_thumbnail(
+            self,
+            document_path: Path,
+            mime_type: str,
+        ) -> Path:
+            return Path("/tmp/thumbnail.webp")
+
+        def get_page_count(
+            self,
+            document_path: Path,
+            mime_type: str,
+        ) -> int | None:
+            return None
+
+        def extract_metadata(
+            self,
+            document_path: Path,
+            mime_type: str,
+        ) -> list:
+            return []
+
+        def __enter__(self) -> Self:
+            return self
+
+        def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+            """
+            Required to exist, but doesn't need to do anything
+            """
+
+    return DummyParser
+
+
+class TestParserProtocol:
+    """Verify runtime isinstance() checks against ParserProtocol."""
+
+    def test_compliant_class_instance_passes_isinstance(
+        self,
+        dummy_parser_cls: type,
+    ) -> None:
+        """
+        GIVEN: A class that implements every method required by ParserProtocol.
+        WHEN:  isinstance() is called with the Protocol.
+        THEN:  The check passes (returns True).
+        """
+        instance = dummy_parser_cls()
+        assert isinstance(instance, ParserProtocol)
+
+    def test_non_compliant_class_instance_fails_isinstance(self) -> None:
+        """
+        GIVEN: A plain class with no parser-related methods.
+        WHEN:  isinstance() is called with ParserProtocol.
+        THEN:  The check fails (returns False).
+        """
+
+        class Unrelated:
+            pass
+
+        assert not isinstance(Unrelated(), ParserProtocol)
+
+    @pytest.mark.parametrize(
+        "missing_method",
+        [
+            pytest.param("parse", id="missing-parse"),
+            pytest.param("get_text", id="missing-get_text"),
+            pytest.param("get_thumbnail", id="missing-get_thumbnail"),
+            pytest.param("__enter__", id="missing-__enter__"),
+            pytest.param("__exit__", id="missing-__exit__"),
+        ],
+    )
+    def test_partial_compliant_fails_isinstance(
+        self,
+        dummy_parser_cls: type,
+        missing_method: str,
+    ) -> None:
+        """
+        GIVEN: A class that satisfies ParserProtocol except for one method.
+        WHEN:  isinstance() is called with ParserProtocol.
+        THEN:  The check fails because the Protocol is not fully satisfied.
+        """
+        # Create a subclass that shadows the specified method with None.
+        partial_cls = type(
+            "PartialParser",
+            (dummy_parser_cls,),
+            {missing_method: None},  # Replace with None — not callable
+        )
+        assert not isinstance(partial_cls(), ParserProtocol)
+
+
+class TestRegistrySingleton:
+    """Verify the module-level singleton lifecycle functions."""
+
+    def test_get_parser_registry_returns_instance(self) -> None:
+        """
+        GIVEN: No registry has been created yet.
+        WHEN:  get_parser_registry() is called.
+        THEN:  A ParserRegistry instance is returned.
+        """
+        registry = get_parser_registry()
+        assert isinstance(registry, ParserRegistry)
+
+    def test_get_parser_registry_same_instance_on_repeated_calls(self) -> None:
+        """
+        GIVEN: A registry instance was created by a prior call.
+        WHEN:  get_parser_registry() is called a second time.
+        THEN:  The exact same object (identity) is returned.
+        """
+        first = get_parser_registry()
+        second = get_parser_registry()
+        assert first is second
+
+    def test_reset_parser_registry_gives_fresh_instance(self) -> None:
+        """
+        GIVEN: A registry instance already exists.
+        WHEN:  reset_parser_registry() is called and then get_parser_registry()
+               is called again.
+        THEN:  A new, distinct registry instance is returned.
+        """
+        first = get_parser_registry()
+        reset_parser_registry()
+        second = get_parser_registry()
+        assert first is not second
+
+    def test_init_builtin_parsers_does_not_run_discover(
+        self,
+        monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        """
+        GIVEN: discover() would raise an exception if called.
+        WHEN:  init_builtin_parsers() is called.
+        THEN:  No exception is raised, confirming discover() was not invoked.
+        """
+
+        def exploding_discover(self) -> None:
+            raise RuntimeError(
+                "discover() must not be called from init_builtin_parsers",
+            )
+
+        monkeypatch.setattr(ParserRegistry, "discover", exploding_discover)
+
+        # Should complete without raising.
+        init_builtin_parsers()
+
+    def test_init_builtin_parsers_idempotent(self) -> None:
+        """
+        GIVEN: init_builtin_parsers() has already been called once.
+        WHEN:  init_builtin_parsers() is called a second time.
+        THEN:  No error is raised and the same registry instance is reused.
+        """
+        init_builtin_parsers()
+        # Capture the registry created by the first call.
+        import paperless.parsers.registry as reg_module
+
+        first_registry = reg_module._registry
+
+        init_builtin_parsers()
+
+        assert reg_module._registry is first_registry
+
+
+class TestParserRegistryGetParserForFile:
+    """Verify parser selection logic in get_parser_for_file()."""
+
+    def test_returns_none_when_no_parsers_registered(self) -> None:
+        """
+        GIVEN: A registry with no parsers registered.
+        WHEN:  get_parser_for_file() is called for any MIME type.
+        THEN:  None is returned.
+        """
+        registry = ParserRegistry()
+        result = registry.get_parser_for_file("text/plain", "doc.txt")
+        assert result is None
+
+    def test_returns_none_for_unsupported_mime_type(
+        self,
+        dummy_parser_cls: type,
+    ) -> None:
+        """
+        GIVEN: A registry with a parser that supports only 'text/plain'.
+        WHEN:  get_parser_for_file() is called with 'application/pdf'.
+        THEN:  None is returned.
+        """
+        registry = ParserRegistry()
+        registry.register_builtin(dummy_parser_cls)
+        result = registry.get_parser_for_file("application/pdf", "file.pdf")
+        assert result is None
+
+    def test_returns_parser_for_supported_mime_type(
+        self,
+        dummy_parser_cls: type,
+    ) -> None:
+        """
+        GIVEN: A registry with a parser registered for 'text/plain'.
+        WHEN:  get_parser_for_file() is called with 'text/plain'.
+        THEN:  The registered parser class is returned.
+        """
+        registry = ParserRegistry()
+        registry.register_builtin(dummy_parser_cls)
+        result = registry.get_parser_for_file("text/plain", "readme.txt")
+        assert result is dummy_parser_cls
+
+    def test_highest_score_wins(self) -> None:
+        """
+        GIVEN: Two parsers both supporting 'text/plain' with scores 5 and 20.
+        WHEN:  get_parser_for_file() is called for 'text/plain'.
+        THEN:  The parser with score 20 is returned.
+        """
+
+        class LowScoreParser:
+            name = "low"
+            version = "1.0"
+            author = "A"
+            url = "https://example.com/low"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return 5
+
+        class HighScoreParser:
+            name = "high"
+            version = "1.0"
+            author = "B"
+            url = "https://example.com/high"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return 20
+
+        registry = ParserRegistry()
+        registry.register_builtin(LowScoreParser)
+        registry.register_builtin(HighScoreParser)
+        result = registry.get_parser_for_file("text/plain", "readme.txt")
+        assert result is HighScoreParser
+
+    def test_parser_returning_none_score_is_skipped(self) -> None:
+        """
+        GIVEN: A parser that returns None from score() for the given file.
+        WHEN:  get_parser_for_file() is called.
+        THEN:  That parser is skipped and None is returned (no other candidates).
+        """
+
+        class DecliningParser:
+            name = "declining"
+            version = "1.0"
+            author = "A"
+            url = "https://example.com"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return None  # Explicitly declines
+
+        registry = ParserRegistry()
+        registry.register_builtin(DecliningParser)
+        result = registry.get_parser_for_file("text/plain", "readme.txt")
+        assert result is None
+
+    def test_all_parsers_decline_returns_none(self) -> None:
+        """
+        GIVEN: The same declining parser registered as both built-in and external.
+        WHEN:  get_parser_for_file() is called.
+        THEN:  None is returned.
+        """
+
+        class AlwaysDeclines:
+            name = "declines"
+            version = "1.0"
+            author = "A"
+            url = "https://example.com"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return None
+
+        registry = ParserRegistry()
+        registry.register_builtin(AlwaysDeclines)
+        registry._external.append(AlwaysDeclines)
+        result = registry.get_parser_for_file("text/plain", "file.txt")
+        assert result is None
+
+    def test_external_parser_beats_builtin_same_score(self) -> None:
+        """
+        GIVEN: An external and a built-in parser both returning score 10.
+        WHEN:  get_parser_for_file() is called.
+        THEN:  The external parser wins because externals are evaluated first
+               and the first-seen-wins policy applies at equal scores.
+        """
+
+        class BuiltinParser:
+            name = "builtin"
+            version = "1.0"
+            author = "Core"
+            url = "https://example.com/builtin"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return 10
+
+        class ExternalParser:
+            name = "external"
+            version = "2.0"
+            author = "Third Party"
+            url = "https://example.com/external"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return 10
+
+        registry = ParserRegistry()
+        registry.register_builtin(BuiltinParser)
+        registry._external.append(ExternalParser)
+        result = registry.get_parser_for_file("text/plain", "file.txt")
+        assert result is ExternalParser
+
+    def test_builtin_wins_when_external_declines(self) -> None:
+        """
+        GIVEN: An external parser that declines (score None) and a built-in
+               that returns score 5.
+        WHEN:  get_parser_for_file() is called.
+        THEN:  The built-in parser is returned.
+        """
+
+        class DecliningExternal:
+            name = "declining-external"
+            version = "1.0"
+            author = "Third Party"
+            url = "https://example.com/declining"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return None
+
+        class AcceptingBuiltin:
+            name = "accepting-builtin"
+            version = "1.0"
+            author = "Core"
+            url = "https://example.com/accepting"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return 5
+
+        registry = ParserRegistry()
+        registry.register_builtin(AcceptingBuiltin)
+        registry._external.append(DecliningExternal)
+        result = registry.get_parser_for_file("text/plain", "file.txt")
+        assert result is AcceptingBuiltin
+
+
+class TestDiscover:
+    """Verify entrypoint discovery in ParserRegistry.discover()."""
+
+    def test_discover_with_no_entrypoints(self) -> None:
+        """
+        GIVEN: No entrypoints are registered under 'paperless_ngx.parsers'.
+        WHEN:  discover() is called.
+        THEN:  _external remains empty and no errors are raised.
+        """
+        registry = ParserRegistry()
+
+        with patch(
+            "paperless.parsers.registry.entry_points",
+            return_value=[],
+        ):
+            registry.discover()
+
+        assert registry._external == []
+
+    def test_discover_adds_valid_external_parser(self) -> None:
+        """
+        GIVEN: One valid entrypoint whose loaded class has all required attrs.
+        WHEN:  discover() is called.
+        THEN:  The class is appended to _external.
+        """
+
+        class ValidExternal:
+            name = "valid-external"
+            version = "3.0.0"
+            author = "Someone"
+            url = "https://example.com/valid"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"application/pdf": ".pdf"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return 5
+
+        mock_ep = MagicMock(spec=EntryPoint)
+        mock_ep.name = "valid_external"
+        mock_ep.load.return_value = ValidExternal
+
+        registry = ParserRegistry()
+
+        with patch(
+            "paperless.parsers.registry.entry_points",
+            return_value=[mock_ep],
+        ):
+            registry.discover()
+
+        assert ValidExternal in registry._external
+
+    def test_discover_skips_entrypoint_with_load_error(
+        self,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """
+        GIVEN: An entrypoint whose load() method raises ImportError.
+        WHEN:  discover() is called.
+        THEN:  The entrypoint is skipped, an error is logged, and _external
+               remains empty.
+        """
+        mock_ep = MagicMock(spec=EntryPoint)
+        mock_ep.name = "broken_ep"
+        mock_ep.load.side_effect = ImportError("missing dependency")
+
+        registry = ParserRegistry()
+
+        with caplog.at_level(logging.ERROR, logger="paperless.parsers.registry"):
+            with patch(
+                "paperless.parsers.registry.entry_points",
+                return_value=[mock_ep],
+            ):
+                registry.discover()
+
+        assert registry._external == []
+        assert any(
+            "broken_ep" in record.message
+            for record in caplog.records
+            if record.levelno >= logging.ERROR
+        )
+
+    def test_discover_skips_entrypoint_with_missing_attrs(
+        self,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """
+        GIVEN: A class loaded from an entrypoint that is missing the 'score'
+               attribute.
+        WHEN:  discover() is called.
+        THEN:  The entrypoint is skipped, a warning is logged, and _external
+               remains empty.
+        """
+
+        class MissingScore:
+            name = "missing-score"
+            version = "1.0"
+            author = "Someone"
+            url = "https://example.com"
+
+            # 'score' classmethod is intentionally absent.
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"text/plain": ".txt"}
+
+        mock_ep = MagicMock(spec=EntryPoint)
+        mock_ep.name = "missing_score_ep"
+        mock_ep.load.return_value = MissingScore
+
+        registry = ParserRegistry()
+
+        with caplog.at_level(logging.WARNING, logger="paperless.parsers.registry"):
+            with patch(
+                "paperless.parsers.registry.entry_points",
+                return_value=[mock_ep],
+            ):
+                registry.discover()
+
+        assert registry._external == []
+        assert any(
+            "missing_score_ep" in record.message
+            for record in caplog.records
+            if record.levelno >= logging.WARNING
+        )
+
+    def test_discover_logs_loaded_parser_info(
+        self,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """
+        GIVEN: A valid entrypoint that loads successfully.
+        WHEN:  discover() is called.
+        THEN:  An INFO log message is emitted containing the parser name,
+               version, author, and entrypoint name.
+        """
+
+        class LoggableParser:
+            name = "loggable"
+            version = "4.2.0"
+            author = "Log Tester"
+            url = "https://example.com/loggable"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {"image/png": ".png"}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return 1
+
+        mock_ep = MagicMock(spec=EntryPoint)
+        mock_ep.name = "loggable_ep"
+        mock_ep.load.return_value = LoggableParser
+
+        registry = ParserRegistry()
+
+        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
+            with patch(
+                "paperless.parsers.registry.entry_points",
+                return_value=[mock_ep],
+            ):
+                registry.discover()
+
+        info_messages = " ".join(
+            r.message for r in caplog.records if r.levelno == logging.INFO
+        )
+        assert "loggable" in info_messages
+        assert "4.2.0" in info_messages
+        assert "Log Tester" in info_messages
+        assert "loggable_ep" in info_messages
+
+
+class TestLogSummary:
+    """Verify log output from ParserRegistry.log_summary()."""
+
+    def test_log_summary_with_no_external_parsers(
+        self,
+        dummy_parser_cls: type,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """
+        GIVEN: A registry with one built-in parser and no external parsers.
+        WHEN:  log_summary() is called.
+        THEN:  The built-in parser name appears in the logs.
+        """
+        registry = ParserRegistry()
+        registry.register_builtin(dummy_parser_cls)
+
+        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
+            registry.log_summary()
+
+        all_messages = " ".join(r.message for r in caplog.records)
+        assert dummy_parser_cls.name in all_messages
+
+    def test_log_summary_with_external_parsers(
+        self,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """
+        GIVEN: A registry with one external parser registered.
+        WHEN:  log_summary() is called.
+        THEN:  The external parser name, version, author, and url appear in
+               the log output.
+        """
+
+        class ExtParser:
+            name = "ext-parser"
+            version = "9.9.9"
+            author = "Ext Corp"
+            url = "https://ext.example.com"
+
+            @classmethod
+            def supported_mime_types(cls):
+                return {}
+
+            @classmethod
+            def score(cls, mime_type, filename, path=None):
+                return None
+
+        registry = ParserRegistry()
+        registry._external.append(ExtParser)
+
+        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
+            registry.log_summary()
+
+        all_messages = " ".join(r.message for r in caplog.records)
+        assert "ext-parser" in all_messages
+        assert "9.9.9" in all_messages
+        assert "Ext Corp" in all_messages
+        assert "https://ext.example.com" in all_messages
+
+    def test_log_summary_logs_no_third_party_message_when_none(
+        self,
+        caplog: pytest.LogCaptureFixture,
+    ) -> None:
+        """
+        GIVEN: A registry with no external parsers.
+        WHEN:  log_summary() is called.
+        THEN:  A message containing 'No third-party parsers discovered.' is
+               logged.
+        """
+        registry = ParserRegistry()
+
+        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
+            registry.log_summary()
+
+        all_messages = " ".join(r.message for r in caplog.records)
+        assert "No third-party parsers discovered." in all_messages
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -1,50 +0,0 @@
-from pathlib import Path
-
-from django.conf import settings
-from PIL import Image
-from PIL import ImageDraw
-from PIL import ImageFont
-
-from documents.parsers import DocumentParser
-
-
-class TextDocumentParser(DocumentParser):
-    """
-    This parser directly parses a text document (.txt, .md, or .csv)
-    """
-
-    logging_name = "paperless.parsing.text"
-
-    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
-        # Avoid reading entire file into memory
-        max_chars = 100_000
-        file_size_limit = 50 * 1024 * 1024
-
-        if document_path.stat().st_size > file_size_limit:
-            text = "[File too large to preview]"
-        else:
-            with Path(document_path).open("r", encoding="utf-8", errors="replace") as f:
-                text = f.read(max_chars)
-
-        img = Image.new("RGB", (500, 700), color="white")
-        draw = ImageDraw.Draw(img)
-        font = ImageFont.truetype(
-            font=settings.THUMBNAIL_FONT_NAME,
-            size=20,
-            layout_engine=ImageFont.Layout.BASIC,
-        )
-        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
-
-        out_path = self.tempdir / "thumb.webp"
-        img.save(out_path, format="WEBP")
-
-        return out_path
-
-    def parse(self, document_path, mime_type, file_name=None) -> None:
-        self.text = self.read_file_handle_unicode_errors(document_path)
-
-    def get_settings(self) -> None:
-        """
-        This parser does not implement additional settings yet
-        """
-        return None
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,7 +1,13 @@
 def get_parser(*args, **kwargs):
-    from paperless_text.parsers import TextDocumentParser
+    from paperless.parsers.text import TextDocumentParser

-    return TextDocumentParser(*args, **kwargs)
+    # The new TextDocumentParser does not accept the legacy logging_group /
+    # progress_callback kwargs injected by the old signal-based consumer, so
+    # it is built with no arguments; the pops only make the drop explicit.
+    # Phase 4 replaces this signal path with ParserRegistry; remove the shim then.
+    kwargs.pop("logging_group", None)
+    kwargs.pop("progress_callback", None)
+    return TextDocumentParser()


 def text_consumer_declaration(sender, **kwargs):
--- a/src/paperless_text/tests/conftest.py
+++ b/src/paperless_text/tests/conftest.py
@@ -1,30 +0,0 @@
-from collections.abc import Generator
-from pathlib import Path
-
-import pytest
-
-from paperless_text.parsers import TextDocumentParser
-
-
-@pytest.fixture(scope="session")
-def sample_dir() -> Path:
-    return (Path(__file__).parent / Path("samples")).resolve()
-
-
-@pytest.fixture()
-def text_parser() -> Generator[TextDocumentParser, None, None]:
-    try:
-        parser = TextDocumentParser(logging_group=None)
-        yield parser
-    finally:
-        parser.cleanup()
-
-
-@pytest.fixture(scope="session")
-def sample_txt_file(sample_dir: Path) -> Path:
-    return sample_dir / "test.txt"
-
-
-@pytest.fixture(scope="session")
-def malformed_txt_file(sample_dir: Path) -> Path:
-    return sample_dir / "decode_error.txt"
--- a/src/paperless_text/tests/test_parser.py
+++ b/src/paperless_text/tests/test_parser.py
@@ -1,69 +0,0 @@
-import tempfile
-from pathlib import Path
-
-from paperless_text.parsers import TextDocumentParser
-
-
-class TestTextParser:
-    def test_thumbnail(
-        self,
-        text_parser: TextDocumentParser,
-        sample_txt_file: Path,
-    ) -> None:
-        # just make sure that it does not crash
-        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
-        assert f.exists()
-        assert f.is_file()
-
-    def test_parse(
-        self,
-        text_parser: TextDocumentParser,
-        sample_txt_file: Path,
-    ) -> None:
-        text_parser.parse(sample_txt_file, "text/plain")
-
-        assert text_parser.get_text() == "This is a test file.\n"
-        assert text_parser.get_archive_path() is None
-
-    def test_parse_invalid_bytes(
-        self,
-        text_parser: TextDocumentParser,
-        malformed_txt_file: Path,
-    ) -> None:
-        """
-        GIVEN:
-            - Text file which contains invalid UTF bytes
-        WHEN:
-            - The file is parsed
-        THEN:
-            - Parsing continues
-            - Invalid bytes are removed
-        """
-
-        text_parser.parse(malformed_txt_file, "text/plain")
-
-        assert text_parser.get_text() == "Pantothens<EFBFBD>ure\n"
-        assert text_parser.get_archive_path() is None
-
-    def test_thumbnail_large_file(self, text_parser: TextDocumentParser) -> None:
-        """
-        GIVEN:
-            - A very large text file (>50MB)
-        WHEN:
-            - A thumbnail is requested
-        THEN:
-            - A thumbnail is created without reading the entire file into memory
-        """
-        with tempfile.NamedTemporaryFile(
-            delete=False,
-            mode="w",
-            encoding="utf-8",
-            suffix=".txt",
-        ) as tmp:
-            tmp.write("A" * (51 * 1024 * 1024))  # 51 MB of 'A'
-            large_file = Path(tmp.name)
-
-            thumb = text_parser.get_thumbnail(large_file, "text/plain")
-            assert thumb.exists()
-            assert thumb.is_file()
-            large_file.unlink()
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,136 +0,0 @@
-from pathlib import Path
-
-import httpx
-from django.conf import settings
-from django.utils import timezone
-from gotenberg_client import GotenbergClient
-from gotenberg_client.options import PdfAFormat
-from tika_client import TikaClient
-
-from documents.parsers import DocumentParser
-from documents.parsers import ParseError
-from documents.parsers import make_thumbnail_from_pdf
-from paperless.config import OutputTypeConfig
-from paperless.models import OutputTypeChoices
-
-
-class TikaDocumentParser(DocumentParser):
-    """
-    This parser sends documents to a local tika server
-    """
-
-    logging_name = "paperless.parsing.tika"
-
-    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        if not self.archive_path:
-            self.archive_path = self.convert_to_pdf(document_path, file_name)
-
-        return make_thumbnail_from_pdf(
-            self.archive_path,
-            self.tempdir,
-            self.logging_group,
-        )
-
-    def extract_metadata(self, document_path, mime_type):
-        try:
-            with TikaClient(
-                tika_url=settings.TIKA_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client:
-                parsed = client.metadata.from_file(document_path, mime_type)
-                return [
-                    {
-                        "namespace": "",
-                        "prefix": "",
-                        "key": key,
-                        "value": parsed.data[key],
-                    }
-                    for key in parsed.data
-                ]
-        except Exception as e:
-            self.log.warning(
-                f"Error while fetching document metadata for {document_path}: {e}",
-            )
-            return []
-
-    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
-        self.log.info(f"Sending {document_path} to Tika server")
-
-        try:
-            with TikaClient(
-                tika_url=settings.TIKA_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client:
-                try:
-                    parsed = client.tika.as_text.from_file(document_path, mime_type)
-                except httpx.HTTPStatusError as err:
-                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
-                    # Tika fails with some files as multi-part form data
-                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
-                        parsed = client.tika.as_text.from_buffer(
-                            document_path.read_bytes(),
-                            mime_type,
-                        )
-                    else:  # pragma: no cover
-                        raise
-        except Exception as err:
-            raise ParseError(
-                f"Could not parse {document_path} with tika server at "
-                f"{settings.TIKA_ENDPOINT}: {err}",
-            ) from err
-
-        self.text = parsed.content
-        if self.text is not None:
-            self.text = self.text.strip()
-
-        self.date = parsed.created
-        if self.date is not None and timezone.is_naive(self.date):
-            self.date = timezone.make_aware(self.date)
-
-        self.archive_path = self.convert_to_pdf(document_path, file_name)
-
-    def convert_to_pdf(self, document_path: Path, file_name):
-        pdf_path = Path(self.tempdir) / "convert.pdf"
-
-        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
-
-        with (
-            GotenbergClient(
-                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            ) as client,
-            client.libre_office.to_pdf() as route,
-        ):
-            # Set the output format of the resulting PDF
-            if settings.OCR_OUTPUT_TYPE in {
-                OutputTypeChoices.PDF_A,
-                OutputTypeChoices.PDF_A2,
-            }:
-                route.pdf_format(PdfAFormat.A2b)
-            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
-                self.log.warning(
-                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
-                )
-                route.pdf_format(PdfAFormat.A2b)
-            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
-                route.pdf_format(PdfAFormat.A3b)
-
-            route.convert(document_path)
-
-            try:
-                response = route.run()
-
-                pdf_path.write_bytes(response.content)
-
-                return pdf_path
-
-            except Exception as err:
-                raise ParseError(
-                    f"Error while converting document to PDF: {err}",
-                ) from err
-
-    def get_settings(self) -> OutputTypeConfig:
-        """
-        This parser only uses the PDF output type configuration currently
-        """
-        return OutputTypeConfig()
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@@ -1,7 +1,13 @@
 def get_parser(*args, **kwargs):
-    from paperless_tika.parsers import TikaDocumentParser
+    from paperless.parsers.tika import TikaDocumentParser

-    return TikaDocumentParser(*args, **kwargs)
+    # The new TikaDocumentParser does not accept the legacy logging_group /
+    # progress_callback kwargs injected by the old signal-based consumer, so
+    # it is built with no arguments; the pops only make the drop explicit.
+    # Phase 4 replaces this signal path with ParserRegistry; remove the shim then.
+    kwargs.pop("logging_group", None)
+    kwargs.pop("progress_callback", None)
+    return TikaDocumentParser()


 def tika_consumer_declaration(sender, **kwargs):
--- a/src/paperless_tika/tests/conftest.py
+++ b/src/paperless_tika/tests/conftest.py
@@ -1,40 +0,0 @@
-from collections.abc import Generator
-from pathlib import Path
-
-import pytest
-
-from paperless_tika.parsers import TikaDocumentParser
-
-
-@pytest.fixture()
-def tika_parser() -> Generator[TikaDocumentParser, None, None]:
-    try:
-        parser = TikaDocumentParser(logging_group=None)
-        yield parser
-    finally:
-        parser.cleanup()
-
-
-@pytest.fixture(scope="session")
-def sample_dir() -> Path:
-    return (Path(__file__).parent / Path("samples")).resolve()
-
-
-@pytest.fixture(scope="session")
-def sample_odt_file(sample_dir: Path) -> Path:
-    return sample_dir / "sample.odt"
-
-
-@pytest.fixture(scope="session")
-def sample_docx_file(sample_dir: Path) -> Path:
-    return sample_dir / "sample.docx"
-
-
-@pytest.fixture(scope="session")
-def sample_doc_file(sample_dir: Path) -> Path:
-    return sample_dir / "sample.doc"
-
-
-@pytest.fixture(scope="session")
-def sample_broken_odt(sample_dir: Path) -> Path:
-    return sample_dir / "multi-part-broken.odt"
Author	SHA1	Message	Date
Trenton H	c7e1981064	Renames so it aligns better in the browser view	2026-03-12 19:38:16 -07:00
Trenton H	d6edb6c9f7	Fix: require context manager for TikaDocumentParser; clean up client lifecycle - consumer.py: call __enter__ for new-style parsers so _tika_client and _gotenberg_client are set before parse() is invoked - views.py: use `with parser` (via nullcontext for old-style parsers) in get_metadata so extract_metadata always runs inside a context manager - tika.py: GotenbergClient added to ExitStack alongside TikaClient; inline client creation removed from extract_metadata and _convert_to_pdf; __exit__ uses ExitStack.close() instead of __exit__ pass-through Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-12 19:29:37 -07:00
Trenton H	29ccac98e0	Fix: satisfy mypy and pyrefly for TikaDocumentParser Use a TYPE_CHECKING-guarded assert to narrow self._tika_client from TikaClient | None to TikaClient at the point of use in parse(). The assert is visible to type checkers (TYPE_CHECKING=True) so both mypy and pyrefly accept the subsequent attribute accesses without error; at runtime TYPE_CHECKING is False so the assert never executes and no ruff S101 suppression is required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-12 15:46:26 -07:00
Trenton H	8c432afb82	Fix: update remaining imports and move live Tika tests after parser migration - src/documents/tests/test_parsers.py: import TikaDocumentParser from paperless.parsers.tika (old paperless_tika.parsers no longer exists) - git mv paperless_tika/tests/test_live_tika.py → paperless/tests/parsers/test_live_tika.py to co-locate all Tika tests with the parser; update import and replace old attribute API (tika_parser.text/.archive_path) with accessor methods (get_text/get_archive_path) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-12 15:38:25 -07:00
Trenton H	a0d0ea28cf	Feature: Phase 3 — migrate TikaDocumentParser to ParserProtocol Refactor TikaDocumentParser to satisfy ParserProtocol without subclassing the legacy DocumentParser ABC: - Add ClassVars: name, version, author, url - Add supported_mime_types() classmethod (12 Office/ODF/RTF MIME types) - Add score() classmethod — returns None when TIKA_ENABLED is False, 10 otherwise - can_produce_archive = False (PDF is for display, not an OCR archive) - requires_pdf_rendition = True (Office formats need PDF for browser display) - __enter__/__exit__ via ExitStack: TikaClient opened once per parser lifetime and shared across parse() and extract_metadata() calls - extract_metadata() falls back to a short-lived TikaClient when called outside a context manager (legacy view-layer metadata path) - _convert_to_pdf() uses OutputTypeConfig() to honour the database-stored ApplicationConfiguration before falling back to the env-var setting - Rename convert_to_pdf → _convert_to_pdf (private helper) Update paperless_tika/signals.py shim to import from the new module path and drop the legacy logging_group/progress_callback kwargs. Update documents/consumer.py to extend the existing TextDocumentParser special cases to also cover TikaDocumentParser (parse/get_thumbnail signatures, __exit__ cleanup). Add TestTikaParserRegistryInterface (7 tests) covering score(), properties, and ParserProtocol isinstance check. Update existing tests to use the new accessor API (get_text, get_date, get_archive_path, _convert_to_pdf). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-12 15:30:59 -07:00
Trenton H	da76c16274	Chore: move Tika parser and tests to paperless/ Move TikaDocumentParser and its tests to the canonical parser package location, matching the pattern established for TextDocumentParser: - src/paperless_tika/parsers.py → src/paperless/parsers/tika.py - src/paperless_tika/tests/test_tika_parser.py → src/paperless/tests/parsers/test_tika_parser.py - src/paperless_tika/tests/samples/ → src/paperless/tests/samples/tika/ Merge tika fixtures (tika_parser, sample_odt_file, sample_docx_file, sample_doc_file, sample_broken_odt) into the shared parsers conftest. Remove the now-empty src/paperless_tika/tests/conftest.py. Content is unchanged — this commit is rename-only so git history is preserved on the moved files. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-12 15:06:20 -07:00
Trenton H	f2eef8a6d1	The worker restarts every task, so don't log again and again	2026-03-10 09:20:48 -07:00
Trenton H	c8e5e6c4e2	Some additional places which will need updating	2026-03-10 09:13:57 -07:00
Trenton H	0b00c66b96	Special cases for the new parsers as I create them + don't forget to cleanup the shims	2026-03-10 08:49:15 -07:00
Trenton H	e4cadff749	Merge remote-tracking branch 'origin/dev' into feature-parser-plugin	2026-03-10 07:42:51 -07:00
Trenton H	d4b6075a2a	No cover the worker init (tested manually) and make sonar happy	2026-03-10 07:42:21 -07:00
Trenton H	130a73ec71	Linting	2026-03-10 07:28:47 -07:00
Trenton H	8ebc24bcfa	Fix: update paperless_text signal shim to import from new parser location paperless_text/parsers.py was moved to paperless/parsers/text.py as part of the Phase 3 parser migration. Update the signal-based get_parser() factory to import from the new location and strip the legacy logging_group / progress_callback kwargs that the new TextDocumentParser no longer accepts. This shim keeps document consumption functional until Phase 4 replaces the signal path with the new ParserRegistry. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-10 07:22:22 -07:00
Trenton H	d7052b8dee	Linting	2026-03-09 20:56:11 -07:00
Trenton H	c96e9f5dc7	Feat: add MetadataEntry TypedDict and extract_metadata to ParserProtocol - Define MetadataEntry TypedDict (namespace, prefix, key, value) in paperless.parsers and export it from __all__ - Add extract_metadata(document_path, mime_type) -> list[MetadataEntry] to ParserProtocol; implementations must not raise - Implement extract_metadata on TextDocumentParser (returns []) - Update DummyParser fixture in test_registry to include extract_metadata and align parse/get_thumbnail signatures with the current Protocol - Add TestTextParserMetadata tests covering empty-list return and mime_type-agnostic behaviour Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-09 20:54:00 -07:00
Trenton H	f7f162424b	Feature: Phase 3 — migrate TextDocumentParser to ParserProtocol Implement ParserProtocol on the moved TextDocumentParser without inheriting from the old DocumentParser ABC: - Add class-level identity attributes (name, version, author, url) - Add supported_mime_types() and score() classmethods - Add can_produce_archive and requires_pdf_rendition properties (both False) - Replace tempdir / read_file_handle_unicode_errors from old base class with a self-contained __init__, __enter__, __exit__, and _read_text helper - Drop file_name parameter from parse() and get_thumbnail(); add produce_archive kwarg - Use Self as __enter__ return type; align __exit__ exc_tb type to TracebackType | None - Register TextDocumentParser in ParserRegistry.register_defaults() Tests: - Rewrite test_text_parser.py with 20 tests covering protocol compliance, lifecycle/cleanup, parse, thumbnail, and registry integration - Update parsers/conftest.py with text_parser fixture and sample file fixtures - Update top-level tests/conftest.py with shared clean_registry autouse fixture Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-09 20:53:51 -07:00
Trenton H	cdeabaf75d	Chore: move paperless_text parser and tests to paperless/ Move TextDocumentParser and its test suite from paperless_text/ into the new paperless/ package where parsers are being consolidated: - paperless_text/parsers.py → paperless/parsers/text.py - paperless_text/tests/test_parser.py → paperless/tests/parsers/test_text_parser.py - paperless_text/tests/conftest.py → paperless/tests/parsers/conftest.py - paperless_text/tests/samples/*.txt → paperless/tests/samples/text/ Also add paperless/tests/__init__.py, paperless/tests/parsers/__init__.py, and a new top-level paperless/tests/conftest.py for shared fixtures. The parser and test files are unchanged; subsequent commits will update them to implement ParserProtocol. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-09 20:53:20 -07:00
Trenton H	404ef6b40d	Formatting	2026-03-09 14:25:33 -07:00
Trenton H	8c40491034	Refactor: Clean up ParserProtocol docstrings and drop file_name parameter - Remove all Sphinx cross-reference markup (:meth:, :class:, :func:, :attr:, :data:, backtick quoting) from registry.py and __init__.py docstrings; use plain prose matching the rest of the codebase - Remove unused file_name parameter from parse() and get_thumbnail() in ParserProtocol — no existing parser reads it and the path already carries the filename Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-09 14:09:32 -07:00
Trenton H	0f6bdaf5de	Feature: Add parser plugin registry and ParserProtocol (Phase 1 & 2) Introduces the foundation of the entrypoint-based parser discovery system to replace the signal-based document_consumer_declaration approach. - Add ParserProtocol: runtime_checkable Protocol defining the full contract for document parsers (supported_mime_types, score, parse, context manager, result accessors) - Add ParserRegistry: lazy singleton with entrypoint discovery via importlib.metadata group 'paperless_ngx.parsers', uniform score-based selection across external and built-in parsers - Add get_parser_registry(), init_builtin_parsers(), reset_parser_registry() module-level helpers - Wire Celery worker_process_init to call init_builtin_parsers() eagerly in each worker, deferring third-party discovery to first task use - Add 28 pytest tests covering Protocol compliance, singleton lifecycle, scoring logic, entrypoint discovery, and log output Built-in parsers and consumer migration follow in Phases 3-6. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-09 13:54:52 -07:00