Feature: Phase 3 — migrate TextDocumentParser to ParserProtocol

Implement ParserProtocol on the moved TextDocumentParser without inheriting from the old DocumentParser ABC: - Add class-level identity attributes (name, version, author, url) - Add supported_mime_types() and score() classmethods - Add can_produce_archive and requires_pdf_rendition properties (both False) - Replace tempdir / read_file_handle_unicode_errors from old base class with a self-contained __init__, __enter__, __exit__, and _read_text helper - Drop file_name parameter from parse() and get_thumbnail(); add produce_archive kwarg - Use Self as __enter__ return type; align __exit__ exc_tb type to TracebackType | None - Register TextDocumentParser in ParserRegistry.register_defaults() Tests: - Rewrite test_text_parser.py with 20 tests covering protocol compliance, lifecycle/cleanup, parse, thumbnail, and registry integration - Update parsers/conftest.py with text_parser fixture and sample file fixtures - Update top-level tests/conftest.py with shared clean_registry autouse fixture Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-08-01 08:32:18 +00:00 · 2026-03-09 20:53:51 -07:00
parent cdeabaf75d
commit f7f162424b
6 changed files with 528 additions and 84 deletions
@@ -1,7 +1,4 @@
 """
-paperless.parsers
-=================
-
 Public interface for the Paperless-ngx parser plugin system.

 This module defines ParserProtocol — the structural contract that every
@@ -46,6 +43,7 @@ from typing import runtime_checkable
 if TYPE_CHECKING:
    import datetime
    from pathlib import Path
+    from types import TracebackType

 __all__ = [
    "ParserProtocol",
@@ -304,7 +302,7 @@ class ParserProtocol(Protocol):
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
-        exc_tb: object,
+        exc_tb: TracebackType | None,
    ) -> None:
        """Exit the parser context and release all resources.

@@ -1,7 +1,4 @@
 """
-paperless.parsers.registry
-==========================
-
 Singleton registry that tracks all document parsers available to
 Paperless-ngx — both built-ins shipped with the application and third-party
 plugins installed via Python entrypoints.
@@ -42,6 +39,8 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
    from pathlib import Path

+    from paperless.parsers import ParserProtocol
+
 logger = logging.getLogger("paperless.parsers.registry")

 # ---------------------------------------------------------------------------
@@ -117,6 +116,7 @@ def init_builtin_parsers() -> None:
    if _registry is None:
        _registry = ParserRegistry()
        _registry.register_defaults()
+        _registry.log_summary()


 def reset_parser_registry() -> None:
@@ -165,14 +165,14 @@ class ParserRegistry:
    """

    def __init__(self) -> None:
-        self._external: list[type] = []
-        self._builtins: list[type] = []
+        self._external: list[type[ParserProtocol]] = []
+        self._builtins: list[type[ParserProtocol]] = []

    # ------------------------------------------------------------------
    # Registration
    # ------------------------------------------------------------------

-    def register_builtin(self, parser_class: type) -> None:
+    def register_builtin(self, parser_class: type[ParserProtocol]) -> None:
        """Register a built-in parser class.

        Built-in parsers are shipped with Paperless-ngx and are appended to
@@ -189,11 +189,14 @@ class ParserRegistry:
    def register_defaults(self) -> None:
        """Register the built-in parsers that ship with Paperless-ngx.

-        Populated in Phase 3 when built-in parsers implement the new
-        interface.  In Phase 1/2 this is intentionally a no-op so that the
-        registry infrastructure can be tested in isolation without depending
-        on any concrete parser implementations.
+        Each parser that has been migrated to the new ParserProtocol interface
+        is registered here.  Parsers are added in ascending weight order so
+        that log output is predictable; scoring determines which parser wins
+        at runtime regardless of registration order.
        """
+        from paperless.parsers.text import TextDocumentParser
+
+        self.register_builtin(TextDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
@@ -303,7 +306,7 @@ class ParserRegistry:
        mime_type: str,
        filename: str,
        path: Path | None = None,
-    ) -> type | None:
+    ) -> type[ParserProtocol] | None:
        """Return the best parser class for the given file, or None.

        All registered parsers (external first, then built-ins) are evaluated
@@ -331,11 +334,11 @@ class ParserRegistry:

        Returns
        -------
-        type | None
+        type[ParserProtocol] | None
            The winning parser class, or None if no parser can handle the file.
        """
        best_score: int | None = None
-        best_parser: type | None = None
+        best_parser: type[ParserProtocol] | None = None

        # External parsers are placed first so that, at equal scores, an
        # external parser wins over a built-in (first-seen policy).
@@ -1,22 +1,237 @@
+"""
+Built-in plain-text document parser.
+
+Handles text/plain, text/csv, and application/csv MIME types by reading the
+file content directly.  Thumbnails are generated by rendering a page-sized
+WebP image from the first 100,000 characters using Pillow.
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
 from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Self

 from django.conf import settings
 from PIL import Image
 from PIL import ImageDraw
 from PIL import ImageFont

-from documents.parsers import DocumentParser
+from paperless.version import __full_version_str__
+
+if TYPE_CHECKING:
+    import datetime
+    from types import TracebackType
+
+logger = logging.getLogger("paperless.parsing.text")
+
+_SUPPORTED_MIME_TYPES: dict[str, str] = {
+    "text/plain": ".txt",
+    "text/csv": ".csv",
+    "application/csv": ".csv",
+}


-class TextDocumentParser(DocumentParser):
-    """
-    This parser directly parses a text document (.txt, .md, or .csv)
+class TextDocumentParser:
+    """Parse plain-text documents (txt, csv) for Paperless-ngx.
+
+    This parser reads the file content directly as UTF-8 text and renders a
+    simple thumbnail using Pillow.  It does not perform OCR and does not
+    produce a searchable PDF archive copy.
+
+    Class attributes
+    ----------------
+    name : str
+        Human-readable parser name.
+    version : str
+        Semantic version string, kept in sync with Paperless-ngx releases.
+    author : str
+        Maintainer name.
+    url : str
+        Issue tracker / source URL.
    """

-    logging_name = "paperless.parsing.text"
+    name: str = "Paperless-ngx Text Parser"
+    version: str = __full_version_str__
+    author: str = "Paperless-ngx Contributors"
+    url: str = "https://github.com/paperless-ngx/paperless-ngx"

-    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
-        # Avoid reading entire file into memory
+    # ------------------------------------------------------------------
+    # Class methods
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def supported_mime_types(cls) -> dict[str, str]:
+        """Return the MIME types this parser handles.
+
+        Returns
+        -------
+        dict[str, str]
+            Mapping of MIME type to preferred file extension.
+        """
+        return _SUPPORTED_MIME_TYPES
+
+    @classmethod
+    def score(
+        cls,
+        mime_type: str,
+        filename: str,
+        path: Path | None = None,
+    ) -> int | None:
+        """Return the priority score for handling this file.
+
+        Parameters
+        ----------
+        mime_type:
+            Detected MIME type of the file.
+        filename:
+            Original filename including extension.
+        path:
+            Optional filesystem path. Not inspected by this parser.
+
+        Returns
+        -------
+        int | None
+            10 if the MIME type is supported, otherwise None.
+        """
+        if mime_type in _SUPPORTED_MIME_TYPES:
+            return 10
+        return None
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def can_produce_archive(self) -> bool:
+        """Whether this parser can produce a searchable PDF archive copy.
+
+        Returns
+        -------
+        bool
+            Always False — the text parser does not produce a PDF archive.
+        """
+        return False
+
+    @property
+    def requires_pdf_rendition(self) -> bool:
+        """Whether the parser must produce a PDF for the frontend to display.
+
+        Returns
+        -------
+        bool
+            Always False — plain text files are displayable as-is.
+        """
+        return False
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def __init__(self, logging_group: object = None) -> None:
+        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
+        self._tempdir = Path(
+            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
+        )
+        self._text: str | None = None
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        logger.debug("Cleaning up temporary directory %s", self._tempdir)
+        shutil.rmtree(self._tempdir, ignore_errors=True)
+
+    # ------------------------------------------------------------------
+    # Core parsing interface
+    # ------------------------------------------------------------------
+
+    def parse(
+        self,
+        document_path: Path,
+        mime_type: str,
+        *,
+        produce_archive: bool = True,
+    ) -> None:
+        """Read the document and store its text content.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the text file.
+        mime_type:
+            Detected MIME type of the document.
+        produce_archive:
+            Ignored — this parser never produces a PDF archive.
+
+        Raises
+        ------
+        documents.parsers.ParseError
+            If the file cannot be read.
+        """
+        self._text = self._read_text(document_path)
+
+    # ------------------------------------------------------------------
+    # Result accessors
+    # ------------------------------------------------------------------
+
+    def get_text(self) -> str | None:
+        """Return the plain-text content extracted during parse.
+
+        Returns
+        -------
+        str | None
+            Extracted text, or None if parse has not been called yet.
+        """
+        return self._text
+
+    def get_date(self) -> datetime.datetime | None:
+        """Return the document date detected during parse.
+
+        Returns
+        -------
+        datetime.datetime | None
+            Always None — the text parser does not detect dates.
+        """
+        return None
+
+    def get_archive_path(self) -> Path | None:
+        """Return the path to a generated archive PDF, or None.
+
+        Returns
+        -------
+        Path | None
+            Always None — the text parser does not produce a PDF archive.
+        """
+        return None
+
+    # ------------------------------------------------------------------
+    # Thumbnail and metadata
+    # ------------------------------------------------------------------
+
+    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
+        """Render the first portion of the document as a WebP thumbnail.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        Path
+            Path to the generated WebP thumbnail inside the temporary directory.
+        """
        max_chars = 100_000
        file_size_limit = 50 * 1024 * 1024

@@ -35,16 +250,55 @@ class TextDocumentParser(DocumentParser):
        )
        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)

-        out_path = self.tempdir / "thumb.webp"
+        out_path = self._tempdir / "thumb.webp"
        img.save(out_path, format="WEBP")

        return out_path

-    def parse(self, document_path, mime_type, file_name=None) -> None:
-        self.text = self.read_file_handle_unicode_errors(document_path)
+    def get_page_count(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> int | None:
+        """Return the number of pages in the document.

-    def get_settings(self) -> None:
-        """
-        This parser does not implement additional settings yet
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the source document.
+        mime_type:
+            Detected MIME type of the document.
+
+        Returns
+        -------
+        int | None
+            Always None — page count is not meaningful for plain text.
        """
        return None
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _read_text(self, filepath: Path) -> str:
+        """Read file content, replacing invalid UTF-8 bytes rather than failing.
+
+        Parameters
+        ----------
+        filepath:
+            Path to the file to read.
+
+        Returns
+        -------
+        str
+            File content as a string.
+        """
+        try:
+            return filepath.read_text(encoding="utf-8")
+        except UnicodeDecodeError as exc:
+            logger.warning(
+                "Unicode error reading %s, replacing bad bytes: %s",
+                filepath,
+                exc,
+            )
+            return filepath.read_bytes().decode("utf-8", errors="replace")
@@ -1,29 +1,76 @@
-from collections.abc import Generator
-from pathlib import Path
+"""
+Parser fixtures that are used across multiple test modules in this package
+are defined here.  Format-specific sample-file fixtures are grouped by parser
+so it is easy to see which files belong to which test module.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING

 import pytest
-from paperless_text.parsers import TextDocumentParser
+
+from paperless.parsers.text import TextDocumentParser
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+    from pathlib import Path
+
+
+# ------------------------------------------------------------------
+# Text parser sample files
+# ------------------------------------------------------------------


@pytest.fixture(scope="session")
-def sample_dir() -> Path:
-    return (Path(__file__).parent / Path("samples")).resolve()
+def text_samples_dir(samples_dir: Path) -> Path:
+    """Absolute path to the text parser sample files directory.
+
+    Returns
+    -------
+    Path
+        ``<samples_dir>/text/``
+    """
+    return samples_dir / "text"
+
+
+@pytest.fixture(scope="session")
+def sample_txt_file(text_samples_dir: Path) -> Path:
+    """Path to a valid UTF-8 plain-text sample file.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``text/test.txt``.
+    """
+    return text_samples_dir / "test.txt"
+
+
+@pytest.fixture(scope="session")
+def malformed_txt_file(text_samples_dir: Path) -> Path:
+    """Path to a text file containing invalid UTF-8 bytes.
+
+    Returns
+    -------
+    Path
+        Absolute path to ``text/decode_error.txt``.
+    """
+    return text_samples_dir / "decode_error.txt"
+
+
+# ------------------------------------------------------------------
+# Text parser instance
+# ------------------------------------------------------------------


@pytest.fixture()
 def text_parser() -> Generator[TextDocumentParser, None, None]:
-    try:
-        parser = TextDocumentParser(logging_group=None)
+    """Yield a TextDocumentParser and clean up its temporary directory afterwards.
+
+    Yields
+    ------
+    TextDocumentParser
+        A ready-to-use parser instance.
+    """
+    with TextDocumentParser() as parser:
        yield parser
-    finally:
-        parser.cleanup()
-
-
-@pytest.fixture(scope="session")
-def sample_txt_file(sample_dir: Path) -> Path:
-    return sample_dir / "test.txt"
-
-
-@pytest.fixture(scope="session")
-def malformed_txt_file(sample_dir: Path) -> Path:
-    return sample_dir / "decode_error.txt"
@@ -1,21 +1,94 @@
+"""
+Tests for paperless.parsers.text.TextDocumentParser.
+
+All tests use the context-manager protocol for parser lifecycle.  Sample
+files are provided by session-scoped fixtures defined in conftest.py.
+"""
+
+from __future__ import annotations
+
 import tempfile
 from pathlib import Path

-from paperless_text.parsers import TextDocumentParser
+import pytest
+
+from paperless.parsers import ParserProtocol
+from paperless.parsers.text import TextDocumentParser


-class TestTextParser:
-    def test_thumbnail(
+class TestTextParserProtocol:
+    """Verify that TextDocumentParser satisfies the ParserProtocol contract."""
+
+    def test_isinstance_satisfies_protocol(
        self,
        text_parser: TextDocumentParser,
-        sample_txt_file: Path,
    ) -> None:
-        # just make sure that it does not crash
-        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
-        assert f.exists()
-        assert f.is_file()
+        assert isinstance(text_parser, ParserProtocol)

-    def test_parse(
+    def test_class_attributes_present(self) -> None:
+        assert isinstance(TextDocumentParser.name, str) and TextDocumentParser.name
+        assert (
+            isinstance(TextDocumentParser.version, str) and TextDocumentParser.version
+        )
+        assert isinstance(TextDocumentParser.author, str) and TextDocumentParser.author
+        assert isinstance(TextDocumentParser.url, str) and TextDocumentParser.url
+
+    def test_supported_mime_types_returns_dict(self) -> None:
+        mime_types = TextDocumentParser.supported_mime_types()
+        assert isinstance(mime_types, dict)
+        assert "text/plain" in mime_types
+        assert "text/csv" in mime_types
+        assert "application/csv" in mime_types
+
+    @pytest.mark.parametrize(
+        ("mime_type", "expected"),
+        [
+            ("text/plain", 10),
+            ("text/csv", 10),
+            ("application/csv", 10),
+            ("application/pdf", None),
+            ("image/png", None),
+        ],
+    )
+    def test_score(self, mime_type: str, expected: int | None) -> None:
+        assert TextDocumentParser.score(mime_type, "file.txt") == expected
+
+    def test_can_produce_archive_is_false(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        assert text_parser.can_produce_archive is False
+
+    def test_requires_pdf_rendition_is_false(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        assert text_parser.requires_pdf_rendition is False
+
+
+class TestTextParserLifecycle:
+    """Verify context-manager behaviour and temporary directory cleanup."""
+
+    def test_context_manager_cleans_up_tempdir(self) -> None:
+        with TextDocumentParser() as parser:
+            tempdir = parser._tempdir
+            assert tempdir.exists()
+        assert not tempdir.exists()
+
+    def test_context_manager_cleans_up_after_exception(self) -> None:
+        tempdir: Path | None = None
+        with pytest.raises(RuntimeError):
+            with TextDocumentParser() as parser:
+                tempdir = parser._tempdir
+                raise RuntimeError("boom")
+        assert tempdir is not None
+        assert not tempdir.exists()
+
+
+class TestTextParserParse:
+    """Verify parse() and the result accessors."""
+
+    def test_parse_valid_utf8(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
@@ -23,36 +96,74 @@ class TestTextParser:
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
+
+    def test_parse_returns_none_for_archive_path(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        text_parser.parse(sample_txt_file, "text/plain")
+
        assert text_parser.get_archive_path() is None

-    def test_parse_invalid_bytes(
+    def test_parse_returns_none_for_date(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        text_parser.parse(sample_txt_file, "text/plain")
+
+        assert text_parser.get_date() is None
+
+    def test_parse_invalid_utf8_bytes_replaced(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ) -> None:
        """
        GIVEN:
-            - Text file which contains invalid UTF bytes
+            - A text file containing invalid UTF-8 byte sequences
        WHEN:
            - The file is parsed
        THEN:
-            - Parsing continues
-            - Invalid bytes are removed
+            - Parsing succeeds
+            - Invalid bytes are replaced with the Unicode replacement character
        """
-
        text_parser.parse(malformed_txt_file, "text/plain")

-        assert text_parser.get_text() == "Pantothens�ure\n"
-        assert text_parser.get_archive_path() is None
+        assert text_parser.get_text() == "Pantothens\ufffdure\n"

-    def test_thumbnail_large_file(self, text_parser: TextDocumentParser) -> None:
+    def test_get_text_none_before_parse(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
+        assert text_parser.get_text() is None
+
+
+class TestTextParserThumbnail:
+    """Verify thumbnail generation."""
+
+    def test_thumbnail_exists_and_is_file(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")
+
+        assert thumb.exists()
+        assert thumb.is_file()
+
+    def test_thumbnail_large_file_does_not_read_all(
+        self,
+        text_parser: TextDocumentParser,
+    ) -> None:
        """
        GIVEN:
-            - A very large text file (>50MB)
+            - A text file larger than 50 MB
        WHEN:
            - A thumbnail is requested
        THEN:
-            - A thumbnail is created without reading the entire file into memory
+            - The thumbnail is generated without loading the full file
        """
        with tempfile.NamedTemporaryFile(
            delete=False,
@@ -60,10 +171,55 @@ class TestTextParser:
            encoding="utf-8",
            suffix=".txt",
        ) as tmp:
-            tmp.write("A" * (51 * 1024 * 1024))  # 51 MB of 'A'
+            tmp.write("A" * (51 * 1024 * 1024))
            large_file = Path(tmp.name)

+        try:
            thumb = text_parser.get_thumbnail(large_file, "text/plain")
            assert thumb.exists()
            assert thumb.is_file()
-            large_file.unlink()
+        finally:
+            large_file.unlink(missing_ok=True)
+
+    def test_get_page_count_returns_none(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
+
+
+class TestTextParserRegistry:
+    """Verify that TextDocumentParser is registered by default."""
+
+    def test_registered_in_defaults(self) -> None:
+        from paperless.parsers.registry import ParserRegistry
+
+        registry = ParserRegistry()
+        registry.register_defaults()
+
+        assert TextDocumentParser in registry._builtins
+
+    def test_get_parser_for_text_plain(self) -> None:
+        from paperless.parsers.registry import get_parser_registry
+
+        registry = get_parser_registry()
+        parser_cls = registry.get_parser_for_file("text/plain", "doc.txt")
+
+        assert parser_cls is TextDocumentParser
+
+    def test_get_parser_for_text_csv(self) -> None:
+        from paperless.parsers.registry import get_parser_registry
+
+        registry = get_parser_registry()
+        parser_cls = registry.get_parser_for_file("text/csv", "data.csv")
+
+        assert parser_cls is TextDocumentParser
+
+    def test_get_parser_for_unknown_type_returns_none(self) -> None:
+        from paperless.parsers.registry import get_parser_registry
+
+        registry = get_parser_registry()
+        parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
+
+        assert parser_cls is None
@@ -25,20 +25,6 @@ from paperless.parsers.registry import init_builtin_parsers
 from paperless.parsers.registry import reset_parser_registry


-@pytest.fixture(autouse=True)
-def clean_registry() -> None:
-    """Reset the global parser registry before and after every test.
-
-    GIVEN: The registry module carries module-level singleton state.
-    WHEN:  Any test is executed.
-    THEN:  Each test starts and ends with a clean slate, preventing state
-           leak between tests.
-    """
-    reset_parser_registry()
-    yield
-    reset_parser_registry()
-
-
@pytest.fixture()
 def dummy_parser_cls() -> type:
    """Return a class that fully satisfies :class:`ParserProtocol`.