Feature: Phase 3 — migrate TextDocumentParser to ParserProtocol

Implement ParserProtocol on the moved TextDocumentParser without inheriting
from the old DocumentParser ABC:

- Add class-level identity attributes (name, version, author, url)
- Add supported_mime_types() and score() classmethods
- Add can_produce_archive and requires_pdf_rendition properties (both False)
- Replace tempdir / read_file_handle_unicode_errors from old base class with
  a self-contained __init__, __enter__, __exit__, and _read_text helper
- Drop file_name parameter from parse() and get_thumbnail(); add produce_archive kwarg
- Use Self as __enter__ return type; align __exit__ exc_tb type to TracebackType | None
- Register TextDocumentParser in ParserRegistry.register_defaults()

Tests:
- Rewrite test_text_parser.py with 20 tests covering protocol compliance,
  lifecycle/cleanup, parse, thumbnail, and registry integration
- Update parsers/conftest.py with text_parser fixture and sample file fixtures
- Update top-level tests/conftest.py with shared clean_registry autouse fixture

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-09 20:53:51 -07:00
parent cdeabaf75d
commit f7f162424b
6 changed files with 528 additions and 84 deletions
+2 -4
View File
@@ -1,7 +1,4 @@
"""
paperless.parsers
=================
Public interface for the Paperless-ngx parser plugin system.
This module defines ParserProtocol — the structural contract that every
@@ -46,6 +43,7 @@ from typing import runtime_checkable
if TYPE_CHECKING:
import datetime
from pathlib import Path
from types import TracebackType
__all__ = [
"ParserProtocol",
@@ -304,7 +302,7 @@ class ParserProtocol(Protocol):
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object,
exc_tb: TracebackType | None,
) -> None:
"""Exit the parser context and release all resources.
+16 -13
View File
@@ -1,7 +1,4 @@
"""
paperless.parsers.registry
==========================
Singleton registry that tracks all document parsers available to
Paperless-ngx — both built-ins shipped with the application and third-party
plugins installed via Python entrypoints.
@@ -42,6 +39,8 @@ from typing import TYPE_CHECKING
if TYPE_CHECKING:
from pathlib import Path
from paperless.parsers import ParserProtocol
logger = logging.getLogger("paperless.parsers.registry")
# ---------------------------------------------------------------------------
@@ -117,6 +116,7 @@ def init_builtin_parsers() -> None:
if _registry is None:
_registry = ParserRegistry()
_registry.register_defaults()
_registry.log_summary()
def reset_parser_registry() -> None:
@@ -165,14 +165,14 @@ class ParserRegistry:
"""
def __init__(self) -> None:
self._external: list[type] = []
self._builtins: list[type] = []
self._external: list[type[ParserProtocol]] = []
self._builtins: list[type[ParserProtocol]] = []
# ------------------------------------------------------------------
# Registration
# ------------------------------------------------------------------
def register_builtin(self, parser_class: type) -> None:
def register_builtin(self, parser_class: type[ParserProtocol]) -> None:
"""Register a built-in parser class.
Built-in parsers are shipped with Paperless-ngx and are appended to
@@ -189,11 +189,14 @@ class ParserRegistry:
def register_defaults(self) -> None:
"""Register the built-in parsers that ship with Paperless-ngx.
Populated in Phase 3 when built-in parsers implement the new
interface. In Phase 1/2 this is intentionally a no-op so that the
registry infrastructure can be tested in isolation without depending
on any concrete parser implementations.
Each parser that has been migrated to the new ParserProtocol interface
is registered here. Parsers are added in ascending weight order so
that log output is predictable; scoring determines which parser wins
at runtime regardless of registration order.
"""
from paperless.parsers.text import TextDocumentParser
self.register_builtin(TextDocumentParser)
# ------------------------------------------------------------------
# Discovery
@@ -303,7 +306,7 @@ class ParserRegistry:
mime_type: str,
filename: str,
path: Path | None = None,
) -> type | None:
) -> type[ParserProtocol] | None:
"""Return the best parser class for the given file, or None.
All registered parsers (external first, then built-ins) are evaluated
@@ -331,11 +334,11 @@ class ParserRegistry:
Returns
-------
type | None
type[ParserProtocol] | None
The winning parser class, or None if no parser can handle the file.
"""
best_score: int | None = None
best_parser: type | None = None
best_parser: type[ParserProtocol] | None = None
# External parsers are placed first so that, at equal scores, an
# external parser wins over a built-in (first-seen policy).
+267 -13
View File
@@ -1,22 +1,237 @@
"""
Built-in plain-text document parser.
Handles text/plain, text/csv, and application/csv MIME types by reading the
file content directly. Thumbnails are generated by rendering a page-sized
WebP image from the first 100,000 characters using Pillow.
"""
from __future__ import annotations
import logging
import shutil
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self
from django.conf import settings
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from documents.parsers import DocumentParser
from paperless.version import __full_version_str__
if TYPE_CHECKING:
import datetime
from types import TracebackType
logger = logging.getLogger("paperless.parsing.text")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"text/plain": ".txt",
"text/csv": ".csv",
"application/csv": ".csv",
}
class TextDocumentParser(DocumentParser):
"""
This parser directly parses a text document (.txt, .md, or .csv)
class TextDocumentParser:
"""Parse plain-text documents (txt, csv) for Paperless-ngx.
This parser reads the file content directly as UTF-8 text and renders a
simple thumbnail using Pillow. It does not perform OCR and does not
produce a searchable PDF archive copy.
Class attributes
----------------
name : str
Human-readable parser name.
version : str
Semantic version string, kept in sync with Paperless-ngx releases.
author : str
Maintainer name.
url : str
Issue tracker / source URL.
"""
logging_name = "paperless.parsing.text"
name: str = "Paperless-ngx Text Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
# Avoid reading entire file into memory
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
    """Return the MIME types this parser handles.

    A new dictionary is built on every call, so callers may modify the
    result without altering the module-level table that ``score`` consults.

    Returns
    -------
    dict[str, str]
        Mapping of MIME type to preferred file extension.
    """
    # Hand out a copy: the module-level mapping is shared state, and a
    # caller mutating it would silently change which files score() accepts.
    return dict(_SUPPORTED_MIME_TYPES)
@classmethod
def score(
    cls,
    mime_type: str,
    filename: str,
    path: Path | None = None,
) -> int | None:
    """Rate how well this parser can handle the given file.

    Only the MIME type takes part in the decision; ``filename`` and
    ``path`` are accepted but never inspected here.

    Parameters
    ----------
    mime_type:
        Detected MIME type of the file.
    filename:
        Original filename including extension.
    path:
        Optional filesystem path to the file.

    Returns
    -------
    int | None
        10 when ``mime_type`` is one of the supported types, None otherwise.
    """
    supported = mime_type in _SUPPORTED_MIME_TYPES
    return 10 if supported else None
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
@property
def can_produce_archive(self) -> bool:
    """Report whether a searchable PDF archive copy can be produced.

    Returns
    -------
    bool
        False in every case; this parser has no archive output.
    """
    return False
@property
def requires_pdf_rendition(self) -> bool:
    """Report whether a PDF must be produced for the frontend to display.

    Returns
    -------
    bool
        False in every case; plain text files are displayable as-is.
    """
    return False
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
    """Set up a private scratch directory and an empty text result.

    The configured scratch root is created if it does not exist yet, and a
    fresh ``paperless-``-prefixed temporary directory is made inside it.

    Parameters
    ----------
    logging_group:
        Accepted but not used by this method.
    """
    scratch_root = settings.SCRATCH_DIR
    scratch_root.mkdir(parents=True, exist_ok=True)
    workdir = tempfile.mkdtemp(prefix="paperless-", dir=scratch_root)
    self._tempdir = Path(workdir)
    # Stays None until parse() stores the extracted content.
    self._text: str | None = None
def __enter__(self) -> Self:
    """Enter the parser context.

    Returns
    -------
    Self
        This same parser instance, for use as the ``with`` target.
    """
    return self
def __exit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: TracebackType | None,
) -> None:
    """Leave the parser context and delete the temporary directory.

    Removal is best-effort: errors raised while deleting are ignored.
    Nothing is returned, so an exception raised inside the ``with`` body is
    not suppressed.

    Parameters
    ----------
    exc_type:
        Exception class raised inside the context, if any.
    exc_val:
        Exception instance raised inside the context, if any.
    exc_tb:
        Traceback of that exception, if any.
    """
    scratch = self._tempdir
    logger.debug("Cleaning up temporary directory %s", scratch)
    shutil.rmtree(scratch, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def parse(
    self,
    document_path: Path,
    mime_type: str,
    *,
    produce_archive: bool = True,
) -> None:
    """Read the document and store its text content.

    The content is kept on the instance and can be retrieved afterwards
    with ``get_text``.

    Parameters
    ----------
    document_path:
        Absolute path to the text file.
    mime_type:
        Detected MIME type of the document. Not used by this method.
    produce_archive:
        Ignored — this parser never produces a PDF archive.

    Raises
    ------
    OSError
        Propagated unchanged from the underlying file read when the file
        cannot be opened or read. No wrapping into a parser-specific
        exception type is performed here.
    """
    self._text = self._read_text(document_path)
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
    """Return the text stored by the most recent ``parse`` call.

    Returns
    -------
    str | None
        The stored text, or None when nothing has been parsed yet.
    """
    stored = self._text
    return stored
def get_date(self) -> datetime.datetime | None:
    """Return the document date detected during parse.

    Returns
    -------
    datetime.datetime | None
        None in every case; this parser performs no date detection.
    """
    return None
def get_archive_path(self) -> Path | None:
    """Return the location of a generated archive PDF, if there is one.

    Returns
    -------
    Path | None
        None in every case; this parser never writes an archive file.
    """
    return None
# ------------------------------------------------------------------
# Thumbnail and metadata
# ------------------------------------------------------------------
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
"""Render the first portion of the document as a WebP thumbnail.
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
Returns
-------
Path
Path to the generated WebP thumbnail inside the temporary directory.
"""
max_chars = 100_000
file_size_limit = 50 * 1024 * 1024
@@ -35,16 +250,55 @@ class TextDocumentParser(DocumentParser):
)
draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
out_path = self.tempdir / "thumb.webp"
out_path = self._tempdir / "thumb.webp"
img.save(out_path, format="WEBP")
return out_path
def parse(self, document_path, mime_type, file_name=None) -> None:
self.text = self.read_file_handle_unicode_errors(document_path)
def get_page_count(
self,
document_path: Path,
mime_type: str,
) -> int | None:
"""Return the number of pages in the document.
def get_settings(self) -> None:
"""
This parser does not implement additional settings yet
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
Returns
-------
int | None
Always None — page count is not meaningful for plain text.
"""
return None
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _read_text(self, filepath: Path) -> str:
    """Read file content, replacing invalid UTF-8 bytes rather than failing.

    Both the normal path and the fallback path read the file in text mode,
    so line endings are treated identically whether or not the file
    contains undecodable bytes.

    Parameters
    ----------
    filepath:
        Path to the file to read.

    Returns
    -------
    str
        File content as a string. Byte sequences that are not valid UTF-8
        are replaced with U+FFFD.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    try:
        return filepath.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        logger.warning(
            "Unicode error reading %s, replacing bad bytes: %s",
            filepath,
            exc,
        )
        # Stay in text mode for the retry: decoding the raw bytes directly
        # would skip the newline translation that the strict read applies,
        # making the result depend on whether the file happened to contain
        # invalid bytes.
        return filepath.read_text(encoding="utf-8", errors="replace")
+66 -19
View File
@@ -1,29 +1,76 @@
from collections.abc import Generator
from pathlib import Path
"""
Parser fixtures that are used across multiple test modules in this package
are defined here. Format-specific sample-file fixtures are grouped by parser
so it is easy to see which files belong to which test module.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from paperless_text.parsers import TextDocumentParser
from paperless.parsers.text import TextDocumentParser
if TYPE_CHECKING:
from collections.abc import Generator
from pathlib import Path
# ------------------------------------------------------------------
# Text parser sample files
# ------------------------------------------------------------------
@pytest.fixture(scope="session")
def sample_dir() -> Path:
return (Path(__file__).parent / Path("samples")).resolve()
def text_samples_dir(samples_dir: Path) -> Path:
"""Absolute path to the text parser sample files directory.
Returns
-------
Path
``<samples_dir>/text/``
"""
return samples_dir / "text"
@pytest.fixture(scope="session")
def sample_txt_file(text_samples_dir: Path) -> Path:
"""Path to a valid UTF-8 plain-text sample file.
Returns
-------
Path
Absolute path to ``text/test.txt``.
"""
return text_samples_dir / "test.txt"
@pytest.fixture(scope="session")
def malformed_txt_file(text_samples_dir: Path) -> Path:
"""Path to a text file containing invalid UTF-8 bytes.
Returns
-------
Path
Absolute path to ``text/decode_error.txt``.
"""
return text_samples_dir / "decode_error.txt"
# ------------------------------------------------------------------
# Text parser instance
# ------------------------------------------------------------------
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
try:
parser = TextDocumentParser(logging_group=None)
"""Yield a TextDocumentParser and clean up its temporary directory afterwards.
Yields
------
TextDocumentParser
A ready-to-use parser instance.
"""
with TextDocumentParser() as parser:
yield parser
finally:
parser.cleanup()
@pytest.fixture(scope="session")
def sample_txt_file(sample_dir: Path) -> Path:
return sample_dir / "test.txt"
@pytest.fixture(scope="session")
def malformed_txt_file(sample_dir: Path) -> Path:
return sample_dir / "decode_error.txt"
+177 -21
View File
@@ -1,21 +1,94 @@
"""
Tests for paperless.parsers.text.TextDocumentParser.
All tests use the context-manager protocol for parser lifecycle. Sample
files are provided by session-scoped fixtures defined in conftest.py.
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from paperless_text.parsers import TextDocumentParser
import pytest
from paperless.parsers import ParserProtocol
from paperless.parsers.text import TextDocumentParser
class TestTextParser:
def test_thumbnail(
class TestTextParserProtocol:
"""Verify that TextDocumentParser satisfies the ParserProtocol contract."""
def test_isinstance_satisfies_protocol(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
# just make sure that it does not crash
f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
assert f.exists()
assert f.is_file()
assert isinstance(text_parser, ParserProtocol)
def test_parse(
def test_class_attributes_present(self) -> None:
assert isinstance(TextDocumentParser.name, str) and TextDocumentParser.name
assert (
isinstance(TextDocumentParser.version, str) and TextDocumentParser.version
)
assert isinstance(TextDocumentParser.author, str) and TextDocumentParser.author
assert isinstance(TextDocumentParser.url, str) and TextDocumentParser.url
def test_supported_mime_types_returns_dict(self) -> None:
mime_types = TextDocumentParser.supported_mime_types()
assert isinstance(mime_types, dict)
assert "text/plain" in mime_types
assert "text/csv" in mime_types
assert "application/csv" in mime_types
@pytest.mark.parametrize(
("mime_type", "expected"),
[
("text/plain", 10),
("text/csv", 10),
("application/csv", 10),
("application/pdf", None),
("image/png", None),
],
)
def test_score(self, mime_type: str, expected: int | None) -> None:
assert TextDocumentParser.score(mime_type, "file.txt") == expected
def test_can_produce_archive_is_false(
self,
text_parser: TextDocumentParser,
) -> None:
assert text_parser.can_produce_archive is False
def test_requires_pdf_rendition_is_false(
self,
text_parser: TextDocumentParser,
) -> None:
assert text_parser.requires_pdf_rendition is False
class TestTextParserLifecycle:
"""Verify context-manager behaviour and temporary directory cleanup."""
def test_context_manager_cleans_up_tempdir(self) -> None:
with TextDocumentParser() as parser:
tempdir = parser._tempdir
assert tempdir.exists()
assert not tempdir.exists()
def test_context_manager_cleans_up_after_exception(self) -> None:
tempdir: Path | None = None
with pytest.raises(RuntimeError):
with TextDocumentParser() as parser:
tempdir = parser._tempdir
raise RuntimeError("boom")
assert tempdir is not None
assert not tempdir.exists()
class TestTextParserParse:
"""Verify parse() and the result accessors."""
def test_parse_valid_utf8(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
@@ -23,36 +96,74 @@ class TestTextParser:
text_parser.parse(sample_txt_file, "text/plain")
assert text_parser.get_text() == "This is a test file.\n"
def test_parse_returns_none_for_archive_path(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
text_parser.parse(sample_txt_file, "text/plain")
assert text_parser.get_archive_path() is None
def test_parse_invalid_bytes(
def test_parse_returns_none_for_date(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
text_parser.parse(sample_txt_file, "text/plain")
assert text_parser.get_date() is None
def test_parse_invalid_utf8_bytes_replaced(
self,
text_parser: TextDocumentParser,
malformed_txt_file: Path,
) -> None:
"""
GIVEN:
- Text file which contains invalid UTF bytes
- A text file containing invalid UTF-8 byte sequences
WHEN:
- The file is parsed
THEN:
- Parsing continues
- Invalid bytes are removed
- Parsing succeeds
- Invalid bytes are replaced with the Unicode replacement character
"""
text_parser.parse(malformed_txt_file, "text/plain")
assert text_parser.get_text() == "Pantothensure\n"
assert text_parser.get_archive_path() is None
assert text_parser.get_text() == "Pantothens\ufffdure\n"
def test_thumbnail_large_file(self, text_parser: TextDocumentParser) -> None:
def test_get_text_none_before_parse(
self,
text_parser: TextDocumentParser,
) -> None:
assert text_parser.get_text() is None
class TestTextParserThumbnail:
"""Verify thumbnail generation."""
def test_thumbnail_exists_and_is_file(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")
assert thumb.exists()
assert thumb.is_file()
def test_thumbnail_large_file_does_not_read_all(
self,
text_parser: TextDocumentParser,
) -> None:
"""
GIVEN:
- A very large text file (>50MB)
- A text file larger than 50 MB
WHEN:
- A thumbnail is requested
THEN:
- A thumbnail is created without reading the entire file into memory
- The thumbnail is generated without loading the full file
"""
with tempfile.NamedTemporaryFile(
delete=False,
@@ -60,10 +171,55 @@ class TestTextParser:
encoding="utf-8",
suffix=".txt",
) as tmp:
tmp.write("A" * (51 * 1024 * 1024)) # 51 MB of 'A'
tmp.write("A" * (51 * 1024 * 1024))
large_file = Path(tmp.name)
try:
thumb = text_parser.get_thumbnail(large_file, "text/plain")
assert thumb.exists()
assert thumb.is_file()
large_file.unlink()
finally:
large_file.unlink(missing_ok=True)
def test_get_page_count_returns_none(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
class TestTextParserRegistry:
"""Verify that TextDocumentParser is registered by default."""
def test_registered_in_defaults(self) -> None:
from paperless.parsers.registry import ParserRegistry
registry = ParserRegistry()
registry.register_defaults()
assert TextDocumentParser in registry._builtins
def test_get_parser_for_text_plain(self) -> None:
from paperless.parsers.registry import get_parser_registry
registry = get_parser_registry()
parser_cls = registry.get_parser_for_file("text/plain", "doc.txt")
assert parser_cls is TextDocumentParser
def test_get_parser_for_text_csv(self) -> None:
from paperless.parsers.registry import get_parser_registry
registry = get_parser_registry()
parser_cls = registry.get_parser_for_file("text/csv", "data.csv")
assert parser_cls is TextDocumentParser
def test_get_parser_for_unknown_type_returns_none(self) -> None:
from paperless.parsers.registry import get_parser_registry
registry = get_parser_registry()
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
assert parser_cls is None
-14
View File
@@ -25,20 +25,6 @@ from paperless.parsers.registry import init_builtin_parsers
from paperless.parsers.registry import reset_parser_registry
@pytest.fixture(autouse=True)
def clean_registry() -> None:
"""Reset the global parser registry before and after every test.
GIVEN: The registry module carries module-level singleton state.
WHEN: Any test is executed.
THEN: Each test starts and ends with a clean slate, preventing state
leak between tests.
"""
reset_parser_registry()
yield
reset_parser_registry()
@pytest.fixture()
def dummy_parser_cls() -> type:
"""Return a class that fully satisfies :class:`ParserProtocol`.