mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-10 23:59:43 +00:00
Feature: Phase 3 — migrate TextDocumentParser to ParserProtocol
Implement ParserProtocol on the moved TextDocumentParser without inheriting from the old DocumentParser ABC: - Add class-level identity attributes (name, version, author, url) - Add supported_mime_types() and score() classmethods - Add can_produce_archive and requires_pdf_rendition properties (both False) - Replace tempdir / read_file_handle_unicode_errors from old base class with a self-contained __init__, __enter__, __exit__, and _read_text helper - Drop file_name parameter from parse() and get_thumbnail(); add produce_archive kwarg - Use Self as __enter__ return type; align __exit__ exc_tb type to TracebackType | None - Register TextDocumentParser in ParserRegistry.register_defaults() Tests: - Rewrite test_text_parser.py with 20 tests covering protocol compliance, lifecycle/cleanup, parse, thumbnail, and registry integration - Update parsers/conftest.py with text_parser fixture and sample file fixtures - Update top-level tests/conftest.py with shared clean_registry autouse fixture Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,4 @@
|
||||
"""
|
||||
paperless.parsers
|
||||
=================
|
||||
|
||||
Public interface for the Paperless-ngx parser plugin system.
|
||||
|
||||
This module defines ParserProtocol — the structural contract that every
|
||||
@@ -46,6 +43,7 @@ from typing import runtime_checkable
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
from types import TracebackType
|
||||
|
||||
__all__ = [
|
||||
"ParserProtocol",
|
||||
@@ -304,7 +302,7 @@ class ParserProtocol(Protocol):
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: object,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
"""Exit the parser context and release all resources.
|
||||
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
"""
|
||||
paperless.parsers.registry
|
||||
==========================
|
||||
|
||||
Singleton registry that tracks all document parsers available to
|
||||
Paperless-ngx — both built-ins shipped with the application and third-party
|
||||
plugins installed via Python entrypoints.
|
||||
@@ -42,6 +39,8 @@ from typing import TYPE_CHECKING
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import ParserProtocol
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.registry")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -117,6 +116,7 @@ def init_builtin_parsers() -> None:
|
||||
if _registry is None:
|
||||
_registry = ParserRegistry()
|
||||
_registry.register_defaults()
|
||||
_registry.log_summary()
|
||||
|
||||
|
||||
def reset_parser_registry() -> None:
|
||||
@@ -165,14 +165,14 @@ class ParserRegistry:
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._external: list[type] = []
|
||||
self._builtins: list[type] = []
|
||||
self._external: list[type[ParserProtocol]] = []
|
||||
self._builtins: list[type[ParserProtocol]] = []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Registration
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def register_builtin(self, parser_class: type) -> None:
|
||||
def register_builtin(self, parser_class: type[ParserProtocol]) -> None:
|
||||
"""Register a built-in parser class.
|
||||
|
||||
Built-in parsers are shipped with Paperless-ngx and are appended to
|
||||
@@ -189,11 +189,14 @@ class ParserRegistry:
|
||||
def register_defaults(self) -> None:
|
||||
"""Register the built-in parsers that ship with Paperless-ngx.
|
||||
|
||||
Populated in Phase 3 when built-in parsers implement the new
|
||||
interface. In Phase 1/2 this is intentionally a no-op so that the
|
||||
registry infrastructure can be tested in isolation without depending
|
||||
on any concrete parser implementations.
|
||||
Each parser that has been migrated to the new ParserProtocol interface
|
||||
is registered here. Parsers are added in ascending weight order so
|
||||
that log output is predictable; scoring determines which parser wins
|
||||
at runtime regardless of registration order.
|
||||
"""
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
self.register_builtin(TextDocumentParser)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Discovery
|
||||
@@ -303,7 +306,7 @@ class ParserRegistry:
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> type | None:
|
||||
) -> type[ParserProtocol] | None:
|
||||
"""Return the best parser class for the given file, or None.
|
||||
|
||||
All registered parsers (external first, then built-ins) are evaluated
|
||||
@@ -331,11 +334,11 @@ class ParserRegistry:
|
||||
|
||||
Returns
|
||||
-------
|
||||
type | None
|
||||
type[ParserProtocol] | None
|
||||
The winning parser class, or None if no parser can handle the file.
|
||||
"""
|
||||
best_score: int | None = None
|
||||
best_parser: type | None = None
|
||||
best_parser: type[ParserProtocol] | None = None
|
||||
|
||||
# External parsers are placed first so that, at equal scores, an
|
||||
# external parser wins over a built-in (first-seen policy).
|
||||
|
||||
+267
-13
@@ -1,22 +1,237 @@
|
||||
"""
|
||||
Built-in plain-text document parser.
|
||||
|
||||
Handles text/plain, text/csv, and application/csv MIME types by reading the
|
||||
file content directly. Thumbnails are generated by rendering a page-sized
|
||||
WebP image from the first 100,000 characters using Pillow.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
from PIL import ImageDraw
|
||||
from PIL import ImageFont
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.text")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv",
|
||||
"application/csv": ".csv",
|
||||
}
|
||||
|
||||
|
||||
class TextDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser directly parses a text document (.txt, .md, or .csv)
|
||||
class TextDocumentParser:
|
||||
"""Parse plain-text documents (txt, csv) for Paperless-ngx.
|
||||
|
||||
This parser reads the file content directly as UTF-8 text and renders a
|
||||
simple thumbnail using Pillow. It does not perform OCR and does not
|
||||
produce a searchable PDF archive copy.
|
||||
|
||||
Class attributes
|
||||
----------------
|
||||
name : str
|
||||
Human-readable parser name.
|
||||
version : str
|
||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||
author : str
|
||||
Maintainer name.
|
||||
url : str
|
||||
Issue tracker / source URL.
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.text"
|
||||
name: str = "Paperless-ngx Text Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
|
||||
# Avoid reading entire file into memory
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
"""Return the MIME types this parser handles.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, str]
|
||||
Mapping of MIME type to preferred file extension.
|
||||
"""
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
"""Return the priority score for handling this file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mime_type:
|
||||
Detected MIME type of the file.
|
||||
filename:
|
||||
Original filename including extension.
|
||||
path:
|
||||
Optional filesystem path. Not inspected by this parser.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
10 if the MIME type is supported, otherwise None.
|
||||
"""
|
||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||
return 10
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
"""Whether this parser can produce a searchable PDF archive copy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — the text parser does not produce a PDF archive.
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
"""Whether the parser must produce a PDF for the frontend to display.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — plain text files are displayable as-is.
|
||||
"""
|
||||
return False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self._tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self._text: str | None = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
"""Read the document and store its text content.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the text file.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
produce_archive:
|
||||
Ignored — this parser never produces a PDF archive.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If the file cannot be read.
|
||||
"""
|
||||
self._text = self._read_text(document_path)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
"""
|
||||
return self._text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
datetime.datetime | None
|
||||
Always None — the text parser does not detect dates.
|
||||
"""
|
||||
return None
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
"""Return the path to a generated archive PDF, or None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path | None
|
||||
Always None — the text parser does not produce a PDF archive.
|
||||
"""
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||
"""Render the first portion of the document as a WebP thumbnail.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated WebP thumbnail inside the temporary directory.
|
||||
"""
|
||||
max_chars = 100_000
|
||||
file_size_limit = 50 * 1024 * 1024
|
||||
|
||||
@@ -35,16 +250,55 @@ class TextDocumentParser(DocumentParser):
|
||||
)
|
||||
draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)
|
||||
|
||||
out_path = self.tempdir / "thumb.webp"
|
||||
out_path = self._tempdir / "thumb.webp"
|
||||
img.save(out_path, format="WEBP")
|
||||
|
||||
return out_path
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = self.read_file_handle_unicode_errors(document_path)
|
||||
def get_page_count(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in the document.
|
||||
|
||||
def get_settings(self) -> None:
|
||||
"""
|
||||
This parser does not implement additional settings yet
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Always None — page count is not meaningful for plain text.
|
||||
"""
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _read_text(self, filepath: Path) -> str:
|
||||
"""Read file content, replacing invalid UTF-8 bytes rather than failing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath:
|
||||
Path to the file to read.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
File content as a string.
|
||||
"""
|
||||
try:
|
||||
return filepath.read_text(encoding="utf-8")
|
||||
except UnicodeDecodeError as exc:
|
||||
logger.warning(
|
||||
"Unicode error reading %s, replacing bad bytes: %s",
|
||||
filepath,
|
||||
exc,
|
||||
)
|
||||
return filepath.read_bytes().decode("utf-8", errors="replace")
|
||||
|
||||
@@ -1,29 +1,76 @@
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
"""
|
||||
Parser fixtures that are used across multiple test modules in this package
|
||||
are defined here. Format-specific sample-file fixtures are grouped by parser
|
||||
so it is easy to see which files belong to which test module.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Text parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_dir() -> Path:
|
||||
return (Path(__file__).parent / Path("samples")).resolve()
|
||||
def text_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the text parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/text/``
|
||||
"""
|
||||
return samples_dir / "text"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_txt_file(text_samples_dir: Path) -> Path:
|
||||
"""Path to a valid UTF-8 plain-text sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``text/test.txt``.
|
||||
"""
|
||||
return text_samples_dir / "test.txt"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def malformed_txt_file(text_samples_dir: Path) -> Path:
|
||||
"""Path to a text file containing invalid UTF-8 bytes.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``text/decode_error.txt``.
|
||||
"""
|
||||
return text_samples_dir / "decode_error.txt"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Text parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def text_parser() -> Generator[TextDocumentParser, None, None]:
|
||||
try:
|
||||
parser = TextDocumentParser(logging_group=None)
|
||||
"""Yield a TextDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Yields
|
||||
------
|
||||
TextDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
with TextDocumentParser() as parser:
|
||||
yield parser
|
||||
finally:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_txt_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "test.txt"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def malformed_txt_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "decode_error.txt"
|
||||
|
||||
@@ -1,21 +1,94 @@
|
||||
"""
|
||||
Tests for paperless.parsers.text.TextDocumentParser.
|
||||
|
||||
All tests use the context-manager protocol for parser lifecycle. Sample
|
||||
files are provided by session-scoped fixtures defined in conftest.py.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
|
||||
class TestTextParser:
|
||||
def test_thumbnail(
|
||||
class TestTextParserProtocol:
|
||||
"""Verify that TextDocumentParser satisfies the ParserProtocol contract."""
|
||||
|
||||
def test_isinstance_satisfies_protocol(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
# just make sure that it does not crash
|
||||
f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
|
||||
assert f.exists()
|
||||
assert f.is_file()
|
||||
assert isinstance(text_parser, ParserProtocol)
|
||||
|
||||
def test_parse(
|
||||
def test_class_attributes_present(self) -> None:
|
||||
assert isinstance(TextDocumentParser.name, str) and TextDocumentParser.name
|
||||
assert (
|
||||
isinstance(TextDocumentParser.version, str) and TextDocumentParser.version
|
||||
)
|
||||
assert isinstance(TextDocumentParser.author, str) and TextDocumentParser.author
|
||||
assert isinstance(TextDocumentParser.url, str) and TextDocumentParser.url
|
||||
|
||||
def test_supported_mime_types_returns_dict(self) -> None:
|
||||
mime_types = TextDocumentParser.supported_mime_types()
|
||||
assert isinstance(mime_types, dict)
|
||||
assert "text/plain" in mime_types
|
||||
assert "text/csv" in mime_types
|
||||
assert "application/csv" in mime_types
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("mime_type", "expected"),
|
||||
[
|
||||
("text/plain", 10),
|
||||
("text/csv", 10),
|
||||
("application/csv", 10),
|
||||
("application/pdf", None),
|
||||
("image/png", None),
|
||||
],
|
||||
)
|
||||
def test_score(self, mime_type: str, expected: int | None) -> None:
|
||||
assert TextDocumentParser.score(mime_type, "file.txt") == expected
|
||||
|
||||
def test_can_produce_archive_is_false(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
) -> None:
|
||||
assert text_parser.can_produce_archive is False
|
||||
|
||||
def test_requires_pdf_rendition_is_false(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
) -> None:
|
||||
assert text_parser.requires_pdf_rendition is False
|
||||
|
||||
|
||||
class TestTextParserLifecycle:
|
||||
"""Verify context-manager behaviour and temporary directory cleanup."""
|
||||
|
||||
def test_context_manager_cleans_up_tempdir(self) -> None:
|
||||
with TextDocumentParser() as parser:
|
||||
tempdir = parser._tempdir
|
||||
assert tempdir.exists()
|
||||
assert not tempdir.exists()
|
||||
|
||||
def test_context_manager_cleans_up_after_exception(self) -> None:
|
||||
tempdir: Path | None = None
|
||||
with pytest.raises(RuntimeError):
|
||||
with TextDocumentParser() as parser:
|
||||
tempdir = parser._tempdir
|
||||
raise RuntimeError("boom")
|
||||
assert tempdir is not None
|
||||
assert not tempdir.exists()
|
||||
|
||||
|
||||
class TestTextParserParse:
|
||||
"""Verify parse() and the result accessors."""
|
||||
|
||||
def test_parse_valid_utf8(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
@@ -23,36 +96,74 @@ class TestTextParser:
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_text() == "This is a test file.\n"
|
||||
|
||||
def test_parse_returns_none_for_archive_path(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_archive_path() is None
|
||||
|
||||
def test_parse_invalid_bytes(
|
||||
def test_parse_returns_none_for_date(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_date() is None
|
||||
|
||||
def test_parse_invalid_utf8_bytes_replaced(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
malformed_txt_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Text file which contains invalid UTF bytes
|
||||
- A text file containing invalid UTF-8 byte sequences
|
||||
WHEN:
|
||||
- The file is parsed
|
||||
THEN:
|
||||
- Parsing continues
|
||||
- Invalid bytes are removed
|
||||
- Parsing succeeds
|
||||
- Invalid bytes are replaced with the Unicode replacement character
|
||||
"""
|
||||
|
||||
text_parser.parse(malformed_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_text() == "Pantothens�ure\n"
|
||||
assert text_parser.get_archive_path() is None
|
||||
assert text_parser.get_text() == "Pantothens\ufffdure\n"
|
||||
|
||||
def test_thumbnail_large_file(self, text_parser: TextDocumentParser) -> None:
|
||||
def test_get_text_none_before_parse(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
) -> None:
|
||||
assert text_parser.get_text() is None
|
||||
|
||||
|
||||
class TestTextParserThumbnail:
|
||||
"""Verify thumbnail generation."""
|
||||
|
||||
def test_thumbnail_exists_and_is_file(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")
|
||||
|
||||
assert thumb.exists()
|
||||
assert thumb.is_file()
|
||||
|
||||
def test_thumbnail_large_file_does_not_read_all(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- A very large text file (>50MB)
|
||||
- A text file larger than 50 MB
|
||||
WHEN:
|
||||
- A thumbnail is requested
|
||||
THEN:
|
||||
- A thumbnail is created without reading the entire file into memory
|
||||
- The thumbnail is generated without loading the full file
|
||||
"""
|
||||
with tempfile.NamedTemporaryFile(
|
||||
delete=False,
|
||||
@@ -60,10 +171,55 @@ class TestTextParser:
|
||||
encoding="utf-8",
|
||||
suffix=".txt",
|
||||
) as tmp:
|
||||
tmp.write("A" * (51 * 1024 * 1024)) # 51 MB of 'A'
|
||||
tmp.write("A" * (51 * 1024 * 1024))
|
||||
large_file = Path(tmp.name)
|
||||
|
||||
try:
|
||||
thumb = text_parser.get_thumbnail(large_file, "text/plain")
|
||||
assert thumb.exists()
|
||||
assert thumb.is_file()
|
||||
large_file.unlink()
|
||||
finally:
|
||||
large_file.unlink(missing_ok=True)
|
||||
|
||||
def test_get_page_count_returns_none(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
|
||||
|
||||
|
||||
class TestTextParserRegistry:
|
||||
"""Verify that TextDocumentParser is registered by default."""
|
||||
|
||||
def test_registered_in_defaults(self) -> None:
|
||||
from paperless.parsers.registry import ParserRegistry
|
||||
|
||||
registry = ParserRegistry()
|
||||
registry.register_defaults()
|
||||
|
||||
assert TextDocumentParser in registry._builtins
|
||||
|
||||
def test_get_parser_for_text_plain(self) -> None:
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
registry = get_parser_registry()
|
||||
parser_cls = registry.get_parser_for_file("text/plain", "doc.txt")
|
||||
|
||||
assert parser_cls is TextDocumentParser
|
||||
|
||||
def test_get_parser_for_text_csv(self) -> None:
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
registry = get_parser_registry()
|
||||
parser_cls = registry.get_parser_for_file("text/csv", "data.csv")
|
||||
|
||||
assert parser_cls is TextDocumentParser
|
||||
|
||||
def test_get_parser_for_unknown_type_returns_none(self) -> None:
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
registry = get_parser_registry()
|
||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||
|
||||
assert parser_cls is None
|
||||
|
||||
@@ -25,20 +25,6 @@ from paperless.parsers.registry import init_builtin_parsers
|
||||
from paperless.parsers.registry import reset_parser_registry
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clean_registry() -> None:
|
||||
"""Reset the global parser registry before and after every test.
|
||||
|
||||
GIVEN: The registry module carries module-level singleton state.
|
||||
WHEN: Any test is executed.
|
||||
THEN: Each test starts and ends with a clean slate, preventing state
|
||||
leak between tests.
|
||||
"""
|
||||
reset_parser_registry()
|
||||
yield
|
||||
reset_parser_registry()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def dummy_parser_cls() -> type:
|
||||
"""Return a class that fully satisfies :class:`ParserProtocol`.
|
||||
|
||||
Reference in New Issue
Block a user