diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py index dfe88e452..7be3abadd 100644 --- a/src/paperless/parsers/__init__.py +++ b/src/paperless/parsers/__init__.py @@ -1,7 +1,4 @@ """ -paperless.parsers -================= - Public interface for the Paperless-ngx parser plugin system. This module defines ParserProtocol — the structural contract that every @@ -46,6 +43,7 @@ from typing import runtime_checkable if TYPE_CHECKING: import datetime from pathlib import Path + from types import TracebackType __all__ = [ "ParserProtocol", @@ -304,7 +302,7 @@ class ParserProtocol(Protocol): self, exc_type: type[BaseException] | None, exc_val: BaseException | None, - exc_tb: object, + exc_tb: TracebackType | None, ) -> None: """Exit the parser context and release all resources. diff --git a/src/paperless/parsers/registry.py b/src/paperless/parsers/registry.py index 6c46fd7d1..3f5a10df1 100644 --- a/src/paperless/parsers/registry.py +++ b/src/paperless/parsers/registry.py @@ -1,7 +1,4 @@ """ -paperless.parsers.registry -========================== - Singleton registry that tracks all document parsers available to Paperless-ngx — both built-ins shipped with the application and third-party plugins installed via Python entrypoints. @@ -42,6 +39,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: from pathlib import Path + from paperless.parsers import ParserProtocol + logger = logging.getLogger("paperless.parsers.registry") # --------------------------------------------------------------------------- @@ -117,6 +116,7 @@ def init_builtin_parsers() -> None: if _registry is None: _registry = ParserRegistry() _registry.register_defaults() + _registry.log_summary() def reset_parser_registry() -> None: @@ -165,14 +165,14 @@ class ParserRegistry: """ def __init__(self) -> None: - self._external: list[type] = [] - self._builtins: list[type] = [] + self._external: list[type[ParserProtocol]] = [] + self._builtins: list[type[ParserProtocol]] = [] # ------------------------------------------------------------------ # Registration # ------------------------------------------------------------------ - def register_builtin(self, parser_class: type) -> None: + def register_builtin(self, parser_class: type[ParserProtocol]) -> None: """Register a built-in parser class. Built-in parsers are shipped with Paperless-ngx and are appended to @@ -189,11 +189,14 @@ class ParserRegistry: def register_defaults(self) -> None: """Register the built-in parsers that ship with Paperless-ngx. - Populated in Phase 3 when built-in parsers implement the new - interface. In Phase 1/2 this is intentionally a no-op so that the - registry infrastructure can be tested in isolation without depending - on any concrete parser implementations. + Each parser that has been migrated to the new ParserProtocol interface + is registered here. Parsers are added in ascending weight order so + that log output is predictable; scoring determines which parser wins + at runtime regardless of registration order. """ + from paperless.parsers.text import TextDocumentParser + + self.register_builtin(TextDocumentParser) # ------------------------------------------------------------------ # Discovery @@ -303,7 +306,7 @@ class ParserRegistry: mime_type: str, filename: str, path: Path | None = None, - ) -> type | None: + ) -> type[ParserProtocol] | None: """Return the best parser class for the given file, or None. All registered parsers (external first, then built-ins) are evaluated @@ -331,11 +334,11 @@ class ParserRegistry: Returns ------- - type | None + type[ParserProtocol] | None The winning parser class, or None if no parser can handle the file. """ best_score: int | None = None - best_parser: type | None = None + best_parser: type[ParserProtocol] | None = None # External parsers are placed first so that, at equal scores, an # external parser wins over a built-in (first-seen policy). diff --git a/src/paperless/parsers/text.py b/src/paperless/parsers/text.py index a6c149a0a..9e4d89c85 100644 --- a/src/paperless/parsers/text.py +++ b/src/paperless/parsers/text.py @@ -1,22 +1,237 @@ +""" +Built-in plain-text document parser. + +Handles text/plain, text/csv, and application/csv MIME types by reading the +file content directly. Thumbnails are generated by rendering a page-sized +WebP image from the first 100,000 characters using Pillow. +""" + +from __future__ import annotations + +import logging +import shutil +import tempfile from pathlib import Path +from typing import TYPE_CHECKING +from typing import Self from django.conf import settings from PIL import Image from PIL import ImageDraw from PIL import ImageFont -from documents.parsers import DocumentParser +from paperless.version import __full_version_str__ + +if TYPE_CHECKING: + import datetime + from types import TracebackType + +logger = logging.getLogger("paperless.parsing.text") + +_SUPPORTED_MIME_TYPES: dict[str, str] = { + "text/plain": ".txt", + "text/csv": ".csv", + "application/csv": ".csv", +} -class TextDocumentParser(DocumentParser): - """ - This parser directly parses a text document (.txt, .md, or .csv) +class TextDocumentParser: + """Parse plain-text documents (txt, csv) for Paperless-ngx. + + This parser reads the file content directly as UTF-8 text and renders a + simple thumbnail using Pillow. It does not perform OCR and does not + produce a searchable PDF archive copy. + + Class attributes + ---------------- + name : str + Human-readable parser name. + version : str + Semantic version string, kept in sync with Paperless-ngx releases. + author : str + Maintainer name. + url : str + Issue tracker / source URL. """ - logging_name = "paperless.parsing.text" + name: str = "Paperless-ngx Text Parser" + version: str = __full_version_str__ + author: str = "Paperless-ngx Contributors" + url: str = "https://github.com/paperless-ngx/paperless-ngx" - def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path: - # Avoid reading entire file into memory + # ------------------------------------------------------------------ + # Class methods + # ------------------------------------------------------------------ + + @classmethod + def supported_mime_types(cls) -> dict[str, str]: + """Return the MIME types this parser handles. + + Returns + ------- + dict[str, str] + Mapping of MIME type to preferred file extension. + """ + return _SUPPORTED_MIME_TYPES + + @classmethod + def score( + cls, + mime_type: str, + filename: str, + path: Path | None = None, + ) -> int | None: + """Return the priority score for handling this file. + + Parameters + ---------- + mime_type: + Detected MIME type of the file. + filename: + Original filename including extension. + path: + Optional filesystem path. Not inspected by this parser. + + Returns + ------- + int | None + 10 if the MIME type is supported, otherwise None. + """ + if mime_type in _SUPPORTED_MIME_TYPES: + return 10 + return None + + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def can_produce_archive(self) -> bool: + """Whether this parser can produce a searchable PDF archive copy. + + Returns + ------- + bool + Always False — the text parser does not produce a PDF archive. + """ + return False + + @property + def requires_pdf_rendition(self) -> bool: + """Whether the parser must produce a PDF for the frontend to display. + + Returns + ------- + bool + Always False — plain text files are displayable as-is. + """ + return False + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def __init__(self, logging_group: object = None) -> None: + settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) + self._tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), + ) + self._text: str | None = None + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + logger.debug("Cleaning up temporary directory %s", self._tempdir) + shutil.rmtree(self._tempdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Core parsing interface + # ------------------------------------------------------------------ + + def parse( + self, + document_path: Path, + mime_type: str, + *, + produce_archive: bool = True, + ) -> None: + """Read the document and store its text content. + + Parameters + ---------- + document_path: + Absolute path to the text file. + mime_type: + Detected MIME type of the document. + produce_archive: + Ignored — this parser never produces a PDF archive. + + Raises + ------ + documents.parsers.ParseError + If the file cannot be read. + """ + self._text = self._read_text(document_path) + + # ------------------------------------------------------------------ + # Result accessors + # ------------------------------------------------------------------ + + def get_text(self) -> str | None: + """Return the plain-text content extracted during parse. + + Returns + ------- + str | None + Extracted text, or None if parse has not been called yet. + """ + return self._text + + def get_date(self) -> datetime.datetime | None: + """Return the document date detected during parse. + + Returns + ------- + datetime.datetime | None + Always None — the text parser does not detect dates. + """ + return None + + def get_archive_path(self) -> Path | None: + """Return the path to a generated archive PDF, or None. + + Returns + ------- + Path | None + Always None — the text parser does not produce a PDF archive. + """ + return None + + # ------------------------------------------------------------------ + # Thumbnail and metadata + # ------------------------------------------------------------------ + + def get_thumbnail(self, document_path: Path, mime_type: str) -> Path: + """Render the first portion of the document as a WebP thumbnail. + + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + + Returns + ------- + Path + Path to the generated WebP thumbnail inside the temporary directory. + """ max_chars = 100_000 file_size_limit = 50 * 1024 * 1024 @@ -35,16 +250,55 @@ class TextDocumentParser(DocumentParser): ) draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4) - out_path = self.tempdir / "thumb.webp" + out_path = self._tempdir / "thumb.webp" img.save(out_path, format="WEBP") return out_path - def parse(self, document_path, mime_type, file_name=None) -> None: - self.text = self.read_file_handle_unicode_errors(document_path) + def get_page_count( + self, + document_path: Path, + mime_type: str, + ) -> int | None: + """Return the number of pages in the document. - def get_settings(self) -> None: - """ - This parser does not implement additional settings yet + Parameters + ---------- + document_path: + Absolute path to the source document. + mime_type: + Detected MIME type of the document. + + Returns + ------- + int | None + Always None — page count is not meaningful for plain text. """ return None + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _read_text(self, filepath: Path) -> str: + """Read file content, replacing invalid UTF-8 bytes rather than failing. + + Parameters + ---------- + filepath: + Path to the file to read. + + Returns + ------- + str + File content as a string. + """ + try: + return filepath.read_text(encoding="utf-8") + except UnicodeDecodeError as exc: + logger.warning( + "Unicode error reading %s, replacing bad bytes: %s", + filepath, + exc, + ) + return filepath.read_bytes().decode("utf-8", errors="replace") diff --git a/src/paperless/tests/parsers/conftest.py b/src/paperless/tests/parsers/conftest.py index fa03e6f6b..2d5deb684 100644 --- a/src/paperless/tests/parsers/conftest.py +++ b/src/paperless/tests/parsers/conftest.py @@ -1,29 +1,76 @@ -from collections.abc import Generator -from pathlib import Path +""" +Parser fixtures that are used across multiple test modules in this package +are defined here. Format-specific sample-file fixtures are grouped by parser +so it is easy to see which files belong to which test module. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING import pytest -from paperless_text.parsers import TextDocumentParser + +from paperless.parsers.text import TextDocumentParser + +if TYPE_CHECKING: + from collections.abc import Generator + from pathlib import Path + + +# ------------------------------------------------------------------ +# Text parser sample files +# ------------------------------------------------------------------ @pytest.fixture(scope="session") -def sample_dir() -> Path: - return (Path(__file__).parent / Path("samples")).resolve() +def text_samples_dir(samples_dir: Path) -> Path: + """Absolute path to the text parser sample files directory. + + Returns + ------- + Path + ``/text/`` + """ + return samples_dir / "text" + + +@pytest.fixture(scope="session") +def sample_txt_file(text_samples_dir: Path) -> Path: + """Path to a valid UTF-8 plain-text sample file. + + Returns + ------- + Path + Absolute path to ``text/test.txt``. + """ + return text_samples_dir / "test.txt" + + +@pytest.fixture(scope="session") +def malformed_txt_file(text_samples_dir: Path) -> Path: + """Path to a text file containing invalid UTF-8 bytes. + + Returns + ------- + Path + Absolute path to ``text/decode_error.txt``. + """ + return text_samples_dir / "decode_error.txt" + + +# ------------------------------------------------------------------ +# Text parser instance +# ------------------------------------------------------------------ @pytest.fixture() def text_parser() -> Generator[TextDocumentParser, None, None]: - try: - parser = TextDocumentParser(logging_group=None) + """Yield a TextDocumentParser and clean up its temporary directory afterwards. + + Yields + ------ + TextDocumentParser + A ready-to-use parser instance. + """ + with TextDocumentParser() as parser: yield parser - finally: - parser.cleanup() - - -@pytest.fixture(scope="session") -def sample_txt_file(sample_dir: Path) -> Path: - return sample_dir / "test.txt" - - -@pytest.fixture(scope="session") -def malformed_txt_file(sample_dir: Path) -> Path: - return sample_dir / "decode_error.txt" diff --git a/src/paperless/tests/parsers/test_text_parser.py b/src/paperless/tests/parsers/test_text_parser.py index b1086bc3d..0b702eda2 100644 --- a/src/paperless/tests/parsers/test_text_parser.py +++ b/src/paperless/tests/parsers/test_text_parser.py @@ -1,21 +1,94 @@ +""" +Tests for paperless.parsers.text.TextDocumentParser. + +All tests use the context-manager protocol for parser lifecycle. Sample +files are provided by session-scoped fixtures defined in conftest.py. +""" + +from __future__ import annotations + import tempfile from pathlib import Path -from paperless_text.parsers import TextDocumentParser +import pytest + +from paperless.parsers import ParserProtocol +from paperless.parsers.text import TextDocumentParser -class TestTextParser: - def test_thumbnail( +class TestTextParserProtocol: + """Verify that TextDocumentParser satisfies the ParserProtocol contract.""" + + def test_isinstance_satisfies_protocol( self, text_parser: TextDocumentParser, - sample_txt_file: Path, ) -> None: - # just make sure that it does not crash - f = text_parser.get_thumbnail(sample_txt_file, "text/plain") - assert f.exists() - assert f.is_file() + assert isinstance(text_parser, ParserProtocol) - def test_parse( + def test_class_attributes_present(self) -> None: + assert isinstance(TextDocumentParser.name, str) and TextDocumentParser.name + assert ( + isinstance(TextDocumentParser.version, str) and TextDocumentParser.version + ) + assert isinstance(TextDocumentParser.author, str) and TextDocumentParser.author + assert isinstance(TextDocumentParser.url, str) and TextDocumentParser.url + + def test_supported_mime_types_returns_dict(self) -> None: + mime_types = TextDocumentParser.supported_mime_types() + assert isinstance(mime_types, dict) + assert "text/plain" in mime_types + assert "text/csv" in mime_types + assert "application/csv" in mime_types + + @pytest.mark.parametrize( + ("mime_type", "expected"), + [ + ("text/plain", 10), + ("text/csv", 10), + ("application/csv", 10), + ("application/pdf", None), + ("image/png", None), + ], + ) + def test_score(self, mime_type: str, expected: int | None) -> None: + assert TextDocumentParser.score(mime_type, "file.txt") == expected + + def test_can_produce_archive_is_false( + self, + text_parser: TextDocumentParser, + ) -> None: + assert text_parser.can_produce_archive is False + + def test_requires_pdf_rendition_is_false( + self, + text_parser: TextDocumentParser, + ) -> None: + assert text_parser.requires_pdf_rendition is False + + +class TestTextParserLifecycle: + """Verify context-manager behaviour and temporary directory cleanup.""" + + def test_context_manager_cleans_up_tempdir(self) -> None: + with TextDocumentParser() as parser: + tempdir = parser._tempdir + assert tempdir.exists() + assert not tempdir.exists() + + def test_context_manager_cleans_up_after_exception(self) -> None: + tempdir: Path | None = None + with pytest.raises(RuntimeError): + with TextDocumentParser() as parser: + tempdir = parser._tempdir + raise RuntimeError("boom") + assert tempdir is not None + assert not tempdir.exists() + + +class TestTextParserParse: + """Verify parse() and the result accessors.""" + + def test_parse_valid_utf8( self, text_parser: TextDocumentParser, sample_txt_file: Path, @@ -23,36 +96,74 @@ class TestTextParser: text_parser.parse(sample_txt_file, "text/plain") assert text_parser.get_text() == "This is a test file.\n" + + def test_parse_returns_none_for_archive_path( + self, + text_parser: TextDocumentParser, + sample_txt_file: Path, + ) -> None: + text_parser.parse(sample_txt_file, "text/plain") + assert text_parser.get_archive_path() is None - def test_parse_invalid_bytes( + def test_parse_returns_none_for_date( + self, + text_parser: TextDocumentParser, + sample_txt_file: Path, + ) -> None: + text_parser.parse(sample_txt_file, "text/plain") + + assert text_parser.get_date() is None + + def test_parse_invalid_utf8_bytes_replaced( self, text_parser: TextDocumentParser, malformed_txt_file: Path, ) -> None: """ GIVEN: - - Text file which contains invalid UTF bytes + - A text file containing invalid UTF-8 byte sequences WHEN: - The file is parsed THEN: - - Parsing continues - - Invalid bytes are removed + - Parsing succeeds + - Invalid bytes are replaced with the Unicode replacement character """ - text_parser.parse(malformed_txt_file, "text/plain") - assert text_parser.get_text() == "Pantothens�ure\n" - assert text_parser.get_archive_path() is None + assert text_parser.get_text() == "Pantothens\ufffdure\n" - def test_thumbnail_large_file(self, text_parser: TextDocumentParser) -> None: + def test_get_text_none_before_parse( + self, + text_parser: TextDocumentParser, + ) -> None: + assert text_parser.get_text() is None + + +class TestTextParserThumbnail: + """Verify thumbnail generation.""" + + def test_thumbnail_exists_and_is_file( + self, + text_parser: TextDocumentParser, + sample_txt_file: Path, + ) -> None: + thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain") + + assert thumb.exists() + assert thumb.is_file() + + def test_thumbnail_large_file_does_not_read_all( + self, + text_parser: TextDocumentParser, + ) -> None: """ GIVEN: - - A very large text file (>50MB) + - A text file larger than 50 MB WHEN: - A thumbnail is requested THEN: - - A thumbnail is created without reading the entire file into memory + - The thumbnail is generated without loading the full file """ with tempfile.NamedTemporaryFile( delete=False, @@ -60,10 +171,55 @@ class TestTextParser: encoding="utf-8", suffix=".txt", ) as tmp: - tmp.write("A" * (51 * 1024 * 1024)) # 51 MB of 'A' + tmp.write("A" * (51 * 1024 * 1024)) large_file = Path(tmp.name) + try: thumb = text_parser.get_thumbnail(large_file, "text/plain") assert thumb.exists() assert thumb.is_file() - large_file.unlink() + finally: + large_file.unlink(missing_ok=True) + + def test_get_page_count_returns_none( + self, + text_parser: TextDocumentParser, + sample_txt_file: Path, + ) -> None: + assert text_parser.get_page_count(sample_txt_file, "text/plain") is None + + +class TestTextParserRegistry: + """Verify that TextDocumentParser is registered by default.""" + + def test_registered_in_defaults(self) -> None: + from paperless.parsers.registry import ParserRegistry + + registry = ParserRegistry() + registry.register_defaults() + + assert TextDocumentParser in registry._builtins + + def test_get_parser_for_text_plain(self) -> None: + from paperless.parsers.registry import get_parser_registry + + registry = get_parser_registry() + parser_cls = registry.get_parser_for_file("text/plain", "doc.txt") + + assert parser_cls is TextDocumentParser + + def test_get_parser_for_text_csv(self) -> None: + from paperless.parsers.registry import get_parser_registry + + registry = get_parser_registry() + parser_cls = registry.get_parser_for_file("text/csv", "data.csv") + + assert parser_cls is TextDocumentParser + + def test_get_parser_for_unknown_type_returns_none(self) -> None: + from paperless.parsers.registry import get_parser_registry + + registry = get_parser_registry() + parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf") + + assert parser_cls is None diff --git a/src/paperless/tests/test_registry.py b/src/paperless/tests/test_registry.py index 1c0da1d2d..05b0d45a2 100644 --- a/src/paperless/tests/test_registry.py +++ b/src/paperless/tests/test_registry.py @@ -25,20 +25,6 @@ from paperless.parsers.registry import init_builtin_parsers from paperless.parsers.registry import reset_parser_registry -@pytest.fixture(autouse=True) -def clean_registry() -> None: - """Reset the global parser registry before and after every test. - - GIVEN: The registry module carries module-level singleton state. - WHEN: Any test is executed. - THEN: Each test starts and ends with a clean slate, preventing state - leak between tests. - """ - reset_parser_registry() - yield - reset_parser_registry() - - @pytest.fixture() def dummy_parser_cls() -> type: """Return a class that fully satisfies :class:`ParserProtocol`.