def _parser_cleanup(parser: DocumentParser) -> None:
    """
    Release a parser's resources, whichever teardown API it implements.

    Parsers other than TextDocumentParser are torn down through their
    ``cleanup()`` method. TextDocumentParser is a new-style, context-manager
    parser whose teardown lives in ``__exit__``; it is invoked here with
    ``(None, None, None)``, i.e. as if a ``with`` block had finished without
    an exception.

    This is a transitional shim: it can go away once every parser has been
    migrated to the new style and the consumer uses parsers as context
    managers directly.

    TODO(stumpylog): Remove me in the future
    """
    # Guard clause: every parser that is not the migrated text parser still
    # uses the old cleanup() teardown.
    if not isinstance(parser, TextDocumentParser):
        parser.cleanup()
        return

    parser.__exit__(None, None, None)
document_parser.get_text() date = document_parser.get_date() @@ -490,7 +514,7 @@ class ConsumerPlugin( page_count = document_parser.get_page_count(self.working_copy, mime_type) except ParseError as e: - document_parser.cleanup() + _parser_cleanup(document_parser) if tempdir: tempdir.cleanup() self._fail( @@ -500,7 +524,7 @@ class ConsumerPlugin( exception=e, ) except Exception as e: - document_parser.cleanup() + _parser_cleanup(document_parser) if tempdir: tempdir.cleanup() self._fail( @@ -702,7 +726,7 @@ class ConsumerPlugin( exception=e, ) finally: - document_parser.cleanup() + _parser_cleanup(document_parser) tempdir.cleanup() self.run_post_consume_script(document) diff --git a/src/documents/management/commands/document_thumbnails.py b/src/documents/management/commands/document_thumbnails.py index 2d8609588..e4ae88766 100644 --- a/src/documents/management/commands/document_thumbnails.py +++ b/src/documents/management/commands/document_thumbnails.py @@ -30,6 +30,7 @@ def _process_document(doc_id: int) -> None: ) shutil.move(thumb, document.thumbnail_path) finally: + # TODO(stumpylog): Cleanup once all parsers are handled parser.cleanup() diff --git a/src/documents/tasks.py b/src/documents/tasks.py index ff25adbc7..86b6b2716 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -399,6 +399,7 @@ def update_document_content_maybe_archive_file(document_id) -> None: f"Error while parsing document {document} (ID: {document_id})", ) finally: + # TODO(stumpylog): Cleanup once all parsers are handled parser.cleanup() diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 1447d5c30..5383975d1 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -9,8 +9,8 @@ from documents.parsers import get_default_file_extension from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_supported_file_extensions from documents.parsers import 
@worker_process_init.connect
def on_worker_process_init(**kwargs) -> None:  # pragma: no cover
    """
    Eagerly register the built-in parsers in each Celery worker process.

    Connected to Celery's ``worker_process_init`` signal. Only the built-in
    parsers are registered here; no entrypoint discovery takes place, so the
    worker can start consuming documents right away. Third-party parsers are
    discovered later, on the first get_parser_registry() call made from
    inside a task, which keeps this handler well within the 4-second
    worker_process_init timeout budget.

    Any keyword arguments supplied by the signal are accepted and ignored.
    """
    # Imported lazily so the registry module is only loaded inside the
    # worker process, not when this module is first imported.
    from paperless.parsers import registry as parser_registry

    parser_registry.init_builtin_parsers()
# Names re-exported as the public surface of the parser plugin package.
__all__ = [
    "MetadataEntry",
    "ParserProtocol",
]


class MetadataEntry(TypedDict):
    """A single metadata field extracted from a document.

    All four keys are required. Values are always serialised to strings —
    type-specific conversion (dates, integers, lists) is the responsibility
    of the parser before returning.
    """

    namespace: str
    """URI of the metadata namespace (e.g. 'http://ns.adobe.com/pdf/1.3/')."""

    prefix: str
    """Conventional namespace prefix (e.g. 'pdf', 'xmp', 'dc')."""

    key: str
    """Field name within the namespace (e.g. 'Author', 'CreateDate')."""

    value: str
    """String representation of the field value."""


@runtime_checkable
class ParserProtocol(Protocol):
    """Structural contract for all Paperless-ngx document parsers.

    Both built-in parsers and third-party plugins (discovered via the
    "paperless_ngx.parsers" entrypoint group) must satisfy this Protocol.
    Because it is decorated with runtime_checkable, isinstance(obj,
    ParserProtocol) works at runtime. Such a check only verifies that the
    protocol members are present on the object; signatures and return types
    are not inspected, so it is a sanity check rather than full validation.

    Parsers must expose four string attributes at the class level so the
    registry can log attribution information without instantiating the parser:

    name : str
        Human-readable parser name (e.g. "Tesseract OCR").
    version : str
        Semantic version string (e.g. "1.2.3").
    author : str
        Author or organisation name.
    url : str
        URL for documentation, source code, or issue tracker.
    """

    # ------------------------------------------------------------------
    # Class-level identity (checked by the registry, not Protocol methods)
    # ------------------------------------------------------------------

    # Human-readable parser name shown in registry log output.
    name: str
    # Version string of the parser implementation.
    version: str
    # Author or organisation responsible for the parser.
    author: str
    # Where users should look for documentation or report issues.
    url: str

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return a mapping of supported MIME types to preferred file extensions.

        The keys are MIME type strings (e.g. "application/pdf"), and the
        values are the preferred file extension including the leading dot
        (e.g. ".pdf"). The registry uses this mapping both to decide whether
        a parser is a candidate for a given file and to determine the default
        extension when creating archive copies.

        Returns
        -------
        dict[str, str]
            {mime_type: extension} mapping — may be empty if the parser
            has been temporarily disabled.
        """
        ...

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return a priority score for handling this file, or None to decline.

        The registry calls this after confirming that the MIME type is in
        supported_mime_types. Parsers may inspect filename and optionally
        the file at path to refine their confidence level.

        A higher score wins. Return None to explicitly decline handling a file
        even though the MIME type is listed as supported (e.g. when a feature
        flag is disabled, or a required service is not configured).

        Parameters
        ----------
        mime_type:
            The detected MIME type of the file to be parsed.
        filename:
            The original filename, including extension.
        path:
            Optional filesystem path to the file. Parsers that need to
            inspect file content (e.g. magic-byte sniffing) may use this.
            May be None when scoring happens before the file is available locally.

        Returns
        -------
        int | None
            Priority score (higher wins), or None to decline.
        """
        ...

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        If True, the consumption pipeline may request an archive version when
        processing the document, subject to the ARCHIVE_FILE_GENERATION
        setting. If False, only thumbnail and text extraction are performed.
        """
        ...

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        True for formats the browser cannot display natively (e.g. DOCX, ODT).
        When True, the pipeline always stores the PDF output regardless of the
        ARCHIVE_FILE_GENERATION setting, since the original format cannot be
        shown to the user.
        """
        ...

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Parse document_path and populate internal state.

        After a successful call, callers retrieve results via get_text,
        get_date, and get_archive_path.

        Parameters
        ----------
        document_path:
            Absolute path to the document file to parse.
        mime_type:
            Detected MIME type of the document.
        produce_archive:
            When True (the default) and can_produce_archive is also True,
            the parser should produce a searchable PDF at the path returned
            by get_archive_path. Pass False when only text extraction and
            thumbnail generation are required and disk I/O should be minimised.

        Raises
        ------
        documents.parsers.ParseError
            If parsing fails for any reason.
        """
        ...

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str | None:
        """Return the plain-text content extracted during parse.

        Returns
        -------
        str | None
            Extracted text, or None if no text could be found.
        """
        ...

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Detected document date, or None if no date was found.
        """
        ...

    def get_archive_path(self) -> Path | None:
        """Return the path to the generated archive PDF, or None.

        Returns
        -------
        Path | None
            Path to the searchable PDF archive, or None if no archive was
            produced (e.g. because produce_archive=False or the parser does
            not support archive generation).
        """
        ...

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Generate and return the path to a thumbnail image for the document.

        May be called independently of parse. The returned path must point to
        an existing WebP image file inside the parser's temporary working
        directory.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail image.
        """
        ...

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in the document, if determinable.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        int | None
            Page count, or None if the parser cannot determine it.
        """
        ...

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata from the document.

        Called by the API view layer on demand — not during the consumption
        pipeline. Results are returned to the frontend for per-file display.

        For documents with an archive version, this method is called twice:
        once for the original file (with its native MIME type) and once for
        the archive file (with ``"application/pdf"``). Parsers that produce
        archives should handle both cases.

        Implementations must not raise. A failure to read metadata is not
        fatal — log a warning and return whatever partial results were
        collected, or ``[]`` if none.

        Parameters
        ----------
        document_path:
            Absolute path to the file to extract metadata from.
        mime_type:
            MIME type of the file at ``document_path``. May be
            ``"application/pdf"`` when called for the archive version.

        Returns
        -------
        list[MetadataEntry]
            Zero or more metadata entries. Returns ``[]`` if no metadata
            could be extracted or the format does not support it.
        """
        ...

    # ------------------------------------------------------------------
    # Context manager
    # ------------------------------------------------------------------

    def __enter__(self) -> Self:
        """Enter the parser context, returning the parser instance.

        Implementations should perform any resource allocation here if not
        done in __init__ (e.g. creating API clients or temp directories).

        Returns
        -------
        Self
            The parser instance itself.
        """
        ...

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit the parser context and release all resources.

        Implementations must clean up all temporary files and other resources
        regardless of whether an exception occurred. The return type is None,
        so an exception raised inside the ``with`` block is not suppressed.

        Parameters
        ----------
        exc_type:
            The exception class, or None if no exception was raised.
        exc_val:
            The exception instance, or None.
        exc_tb:
            The traceback, or None.
        """
        ...
logger = logging.getLogger("paperless.parsers.registry")

# ---------------------------------------------------------------------------
# Module-level singleton state
# ---------------------------------------------------------------------------

_registry: ParserRegistry | None = None
_discovery_complete: bool = False

# Attribute names that every registered external parser class must expose.
_REQUIRED_ATTRS: tuple[str, ...] = (
    "name",
    "version",
    "author",
    "url",
    "supported_mime_types",
    "score",
)

# Subset of _REQUIRED_ATTRS that the registry *calls* while resolving a
# parser. They must be callable, not merely present, or get_parser_for_file
# would raise TypeError for every matching document.
_CALLABLE_ATTRS: tuple[str, ...] = (
    "supported_mime_types",
    "score",
)


# ---------------------------------------------------------------------------
# Module-level accessor functions
# ---------------------------------------------------------------------------


def _ensure_registry() -> ParserRegistry:
    """Create the shared registry with built-in parsers if it does not exist.

    The new instance is only published to the module-level ``_registry``
    after register_defaults has succeeded, so a failure while registering
    built-ins never leaves a half-initialised registry behind as the
    singleton.

    Returns
    -------
    ParserRegistry
        The shared registry singleton (built-ins registered).
    """
    global _registry

    if _registry is None:
        registry = ParserRegistry()
        registry.register_defaults()
        _registry = registry

    return _registry


def get_parser_registry() -> ParserRegistry:
    """Return the shared ParserRegistry instance.

    On the first call this function:

    1. Creates a new ParserRegistry (unless init_builtin_parsers already did).
    2. Calls register_defaults to install built-in parsers.
    3. Calls discover to load third-party plugins via importlib.metadata entrypoints.
    4. Calls log_summary to emit a startup summary.

    Subsequent calls return the same instance immediately.

    Returns
    -------
    ParserRegistry
        The shared registry singleton.
    """
    global _discovery_complete

    registry = _ensure_registry()

    # Discovery is tracked separately from creation because
    # init_builtin_parsers may have created the registry without it.
    if not _discovery_complete:
        registry.discover()
        registry.log_summary()
        _discovery_complete = True

    return registry


def init_builtin_parsers() -> None:
    """Register built-in parsers without performing entrypoint discovery.

    Intended for use in Celery worker_process_init handlers where importing
    all installed entrypoints would be wasteful, slow, or could produce
    undesirable side effects. Entrypoint discovery (third-party plugins) is
    deliberately not performed.

    Safe to call multiple times — subsequent calls are no-ops.

    Returns
    -------
    None
    """
    _ensure_registry()


def reset_parser_registry() -> None:
    """Reset the module-level registry state to its initial values.

    Resets _registry and _discovery_complete so the next call to
    get_parser_registry will re-initialise everything from scratch.

    FOR TESTS ONLY. Do not call this in production code — resetting the
    registry mid-request causes all subsequent parser lookups to go through
    discovery again, which is expensive and may have unexpected side effects
    in multi-threaded environments.

    Returns
    -------
    None
    """
    global _registry, _discovery_complete

    _registry = None
    _discovery_complete = False


# ---------------------------------------------------------------------------
# Registry class
# ---------------------------------------------------------------------------


class ParserRegistry:
    """Registry that maps MIME types to the best available parser class.

    Parsers are partitioned into two lists:

    _builtins
        Parser classes registered via register_builtin (populated by
        register_defaults).

    _external
        Parser classes loaded from installed Python entrypoints via discover.

    When resolving a parser for a file, external parsers are evaluated
    alongside built-in parsers using a uniform scoring mechanism. Both lists
    are iterated together; the class with the highest score wins. If an
    external parser wins, its attribution details are logged so users can
    identify which third-party package handled their document.
    """

    def __init__(self) -> None:
        self._external: list[type[ParserProtocol]] = []
        self._builtins: list[type[ParserProtocol]] = []

    # ------------------------------------------------------------------
    # Registration
    # ------------------------------------------------------------------

    def register_builtin(self, parser_class: type[ParserProtocol]) -> None:
        """Register a built-in parser class.

        Built-in parsers are shipped with Paperless-ngx and are appended to
        the _builtins list. They are never overridden by external parsers;
        instead, scoring determines which parser wins for any given file.

        Parameters
        ----------
        parser_class:
            The parser class to register. Must satisfy ParserProtocol.
        """
        self._builtins.append(parser_class)

    def register_defaults(self) -> None:
        """Register the built-in parsers that ship with Paperless-ngx.

        Each parser that has been migrated to the new ParserProtocol interface
        is registered here. Parsers are added in ascending weight order so
        that log output is predictable; scoring determines which parser wins
        at runtime regardless of registration order.
        """
        # Imported lazily so that importing the registry module does not pull
        # in parser implementations.
        from paperless.parsers.text import TextDocumentParser

        self.register_builtin(TextDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------

    @staticmethod
    def _check_required_attrs(parser_class: object) -> tuple[list[str], list[str]]:
        """Validate the class-level contract of a candidate parser class.

        Parameters
        ----------
        parser_class:
            The object returned by loading a parser entrypoint.

        Returns
        -------
        tuple[list[str], list[str]]
            ``(missing, not_callable)``: the names from _REQUIRED_ATTRS that
            are absent, and the names from _CALLABLE_ATTRS that are present
            but not callable. Both lists are empty for a valid parser class.
        """
        missing = [attr for attr in _REQUIRED_ATTRS if not hasattr(parser_class, attr)]
        not_callable = [
            attr
            for attr in _CALLABLE_ATTRS
            if attr not in missing and not callable(getattr(parser_class, attr))
        ]
        return missing, not_callable

    def discover(self) -> None:
        """Load third-party parsers from the "paperless_ngx.parsers" entrypoint group.

        For each advertised entrypoint the method:

        1. Calls ep.load() to import the class.
        2. Validates that the class exposes all required attributes and that
           supported_mime_types and score are callable.
        3. On success, appends the class to _external and logs an info message.
        4. On failure (import error, missing or non-callable attributes),
           logs an appropriate warning/error and continues to the next
           entrypoint.

        Errors during discovery of a single parser do not prevent other parsers
        from being loaded.

        Returns
        -------
        None
        """
        eps = entry_points(group="paperless_ngx.parsers")

        for ep in eps:
            try:
                parser_class = ep.load()
            except Exception:
                # Boundary with third-party code: any failure while importing
                # a plugin is logged and must not stop discovery.
                logger.exception(
                    "Failed to load parser entrypoint '%s' — skipping.",
                    ep.name,
                )
                continue

            missing, not_callable = self._check_required_attrs(parser_class)
            if missing:
                logger.warning(
                    "Parser loaded from entrypoint '%s' is missing required "
                    "attributes %r — skipping.",
                    ep.name,
                    missing,
                )
                continue
            if not_callable:
                # Rejecting here avoids a TypeError on every later call to
                # get_parser_for_file.
                logger.warning(
                    "Parser loaded from entrypoint '%s' has non-callable "
                    "required attributes %r — skipping.",
                    ep.name,
                    not_callable,
                )
                continue

            self._external.append(parser_class)
            logger.info(
                "Loaded third-party parser '%s' v%s by %s (entrypoint: '%s').",
                parser_class.name,
                parser_class.version,
                parser_class.author,
                ep.name,
            )

    # ------------------------------------------------------------------
    # Summary logging
    # ------------------------------------------------------------------

    def log_summary(self) -> None:
        """Log a startup summary of all registered parsers.

        Built-in parsers are listed first, followed by any external parsers
        discovered from entrypoints. If no external parsers were found a
        short informational message is logged instead of an empty list.

        Returns
        -------
        None
        """
        logger.info(
            "Built-in parsers (%d):",
            len(self._builtins),
        )
        for cls in self._builtins:
            logger.info(
                "  [built-in] %s v%s — %s",
                getattr(cls, "name", repr(cls)),
                getattr(cls, "version", "unknown"),
                getattr(cls, "url", "built-in"),
            )

        if not self._external:
            logger.info("No third-party parsers discovered.")
            return

        logger.info(
            "Third-party parsers (%d):",
            len(self._external),
        )
        for cls in self._external:
            logger.info(
                "  [external] %s v%s by %s — report issues at %s",
                getattr(cls, "name", repr(cls)),
                getattr(cls, "version", "unknown"),
                getattr(cls, "author", "unknown"),
                getattr(cls, "url", "unknown"),
            )

    # ------------------------------------------------------------------
    # Parser resolution
    # ------------------------------------------------------------------

    def get_parser_for_file(
        self,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> type[ParserProtocol] | None:
        """Return the best parser class for the given file, or None.

        All registered parsers (external first, then built-ins) are evaluated
        against the file. A parser is eligible if mime_type appears in the dict
        returned by its supported_mime_types classmethod, and its score
        classmethod returns a non-None integer.

        The parser with the highest score wins. When two parsers return the
        same score, the one that appears earlier in the evaluation order wins
        (external parsers are evaluated before built-ins, giving third-party
        packages a chance to override defaults at equal priority).

        When an external parser is selected, its identity is logged at INFO
        level so operators can trace which package handled a document.

        Parameters
        ----------
        mime_type:
            The detected MIME type of the file.
        filename:
            The original filename, including extension.
        path:
            Optional filesystem path to the file. Forwarded to each
            parser's score method.

        Returns
        -------
        type[ParserProtocol] | None
            The winning parser class, or None if no parser can handle the file.
        """
        best_score: int | None = None
        best_parser: type[ParserProtocol] | None = None

        # External parsers are placed first so that, at equal scores, an
        # external parser wins over a built-in (first-seen policy).
        for parser_class in (*self._external, *self._builtins):
            if mime_type not in parser_class.supported_mime_types():
                continue

            score = parser_class.score(mime_type, filename, path)
            if score is None:
                continue

            # Strict comparison keeps the first-seen parser on ties.
            if best_score is None or score > best_score:
                best_score = score
                best_parser = parser_class

        if best_parser is not None and best_parser in self._external:
            logger.info(
                "Document handled by third-party parser '%s' v%s — %s",
                getattr(best_parser, "name", repr(best_parser)),
                getattr(best_parser, "version", "unknown"),
                getattr(best_parser, "url", "unknown"),
            )

        return best_parser
class TextDocumentParser:
    """Parse plain-text documents (txt, csv) for Paperless-ngx.

    This parser reads the file content directly as UTF-8 text and renders a
    simple thumbnail using Pillow. It does not perform OCR and does not
    produce a searchable PDF archive copy.

    Each instance creates its own scratch directory in ``__init__`` and
    removes it in ``__exit__``, so instances are meant to be used as context
    managers.

    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Semantic version string, kept in sync with Paperless-ngx releases.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """

    name: str = "Paperless-ngx Text Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return the MIME types this parser handles.

        Returns
        -------
        dict[str, str]
            Mapping of MIME type to preferred file extension. A fresh copy is
            returned so callers cannot mutate the module-level table.
        """
        return dict(_SUPPORTED_MIME_TYPES)

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return the priority score for handling this file.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension. Not inspected by this parser.
        path:
            Optional filesystem path. Not inspected by this parser.

        Returns
        -------
        int | None
            10 if the MIME type is supported, otherwise None.
        """
        if mime_type in _SUPPORTED_MIME_TYPES:
            return 10
        return None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        Returns
        -------
        bool
            Always False — the text parser does not produce a PDF archive.
        """
        return False

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        Returns
        -------
        bool
            Always False — plain text files are displayable as-is.
        """
        return False

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def __init__(self, logging_group: object = None) -> None:
        """Create the parser and its private scratch directory.

        Parameters
        ----------
        logging_group:
            Accepted but not used by this parser.
        """
        # The scratch root may not exist yet on a fresh installation.
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self._tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        self._text: str | None = None

    def __enter__(self) -> Self:
        """Return the parser itself for use inside a ``with`` block."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Remove the scratch directory.

        Removal errors are ignored. Nothing is returned, so an exception
        raised inside the ``with`` block keeps propagating.
        """
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        shutil.rmtree(self._tempdir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Read the document and store its text content.

        Parameters
        ----------
        document_path:
            Absolute path to the text file.
        mime_type:
            Detected MIME type of the document. Not inspected.
        produce_archive:
            Ignored — this parser never produces a PDF archive.

        Raises
        ------
        OSError
            If the file cannot be opened or read. Invalid UTF-8 does not
            raise; the offending bytes are replaced instead.
        """
        self._text = self._read_text(document_path)

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str | None:
        """Return the plain-text content extracted during parse.

        Returns
        -------
        str | None
            Extracted text, or None if parse has not been called yet.
        """
        return self._text

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Always None — the text parser does not detect dates.
        """
        return None

    def get_archive_path(self) -> Path | None:
        """Return the path to a generated archive PDF, or None.

        Returns
        -------
        Path | None
            Always None — the text parser does not produce a PDF archive.
        """
        return None

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Render the first portion of the document as a WebP thumbnail.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document. Not inspected.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail inside the temporary directory.
        """
        # Avoid reading an entire (possibly huge) file into memory.
        max_chars = 100_000
        file_size_limit = 50 * 1024 * 1024

        if document_path.stat().st_size > file_size_limit:
            text = "[File too large to preview]"
        else:
            with document_path.open("r", encoding="utf-8", errors="replace") as f:
                text = f.read(max_chars)

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.Layout.BASIC,
        )
        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)

        out_path = self._tempdir / "thumb.webp"
        img.save(out_path, format="WEBP")

        return out_path

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in the document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        int | None
            Always None — page count is not meaningful for plain text.
        """
        return None

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata from the document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document. Not inspected.
        mime_type:
            Detected MIME type of the document. Not inspected.

        Returns
        -------
        list[MetadataEntry]
            Always ``[]`` — plain text files carry no structured metadata.
        """
        return []

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _read_text(self, filepath: Path) -> str:
        """Read file content, replacing invalid UTF-8 bytes rather than failing.

        Parameters
        ----------
        filepath:
            Path to the file to read.

        Returns
        -------
        str
            File content as a string. Invalid byte sequences appear as the
            Unicode replacement character.
        """
        try:
            return filepath.read_text(encoding="utf-8")
        except UnicodeDecodeError as exc:
            logger.warning(
                "Unicode error reading %s, replacing bad bytes: %s",
                filepath,
                exc,
            )
            # Re-read in text mode so the fallback gets the same newline
            # translation as the strict path above; decoding raw bytes would
            # leave "\r\n" sequences untouched.
            return filepath.read_text(encoding="utf-8", errors="replace")
@pytest.fixture(scope="session")
def text_samples_dir(samples_dir: Path) -> Path:
    """Locate the sample files used by the text parser tests.

    Returns
    -------
    Path
        The ``text`` sub-directory of the shared ``samples_dir`` fixture.
    """
    return samples_dir.joinpath("text")
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
    """Provide a TextDocumentParser whose scratch directory is removed on teardown.

    Yields
    ------
    TextDocumentParser
        A parser instance that is inside its context manager for the
        duration of the test.
    """
    instance = TextDocumentParser()
    with instance as active_parser:
        yield active_parser
class TestTextParserLifecycle:
    """Check that the context manager always removes the scratch directory."""

    def test_context_manager_cleans_up_tempdir(self) -> None:
        """
        GIVEN: A parser used as a context manager.
        WHEN: The ``with`` block exits normally.
        THEN: The scratch directory existed inside the block and is gone after it.
        """
        with TextDocumentParser() as active:
            scratch = active._tempdir
            assert scratch.exists()

        assert not scratch.exists()

    def test_context_manager_cleans_up_after_exception(self) -> None:
        """
        GIVEN: A parser used as a context manager.
        WHEN: The ``with`` block raises.
        THEN: The exception propagates and the scratch directory is still removed.
        """
        scratch: Path | None = None

        with pytest.raises(RuntimeError), TextDocumentParser() as active:
            scratch = active._tempdir
            raise RuntimeError("boom")

        assert scratch is not None
        assert not scratch.exists()
class TestTextParserThumbnail:
    """Verify thumbnail generation."""

    def test_thumbnail_exists_and_is_file(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """
        GIVEN:
            - A small, valid text file
        WHEN:
            - A thumbnail is requested
        THEN:
            - The returned path points at an existing regular file
        """
        thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")

        assert thumb.exists()
        assert thumb.is_file()

    def test_thumbnail_large_file_does_not_read_all(
        self,
        text_parser: TextDocumentParser,
    ) -> None:
        """
        GIVEN:
            - A text file larger than 50 MB
        WHEN:
            - A thumbnail is requested
        THEN:
            - The thumbnail is generated without loading the full file
        """
        size_limit = 50 * 1024 * 1024

        # Reserve a unique path; the file is populated below so that the
        # cleanup in ``finally`` also covers a failure while creating it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
            large_file = Path(tmp.name)

        try:
            # Only the reported size matters to the parser, so extend the
            # file with truncate() instead of writing 51 MiB of real data.
            with large_file.open("wb") as handle:
                handle.truncate(size_limit + 1024 * 1024)
            assert large_file.stat().st_size > size_limit

            thumb = text_parser.get_thumbnail(large_file, "text/plain")
            assert thumb.exists()
            assert thumb.is_file()
        finally:
            large_file.unlink(missing_ok=True)

    def test_get_page_count_returns_none(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """Page count is never reported for plain-text documents."""
        assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
class TestTextParserRegistry:
    """Check how TextDocumentParser is wired into the parser registry."""

    def test_registered_in_defaults(self) -> None:
        """A fresh registry lists the text parser among its built-ins after register_defaults()."""
        from paperless.parsers.registry import ParserRegistry

        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()

        assert TextDocumentParser in fresh_registry._builtins

    def test_get_parser_for_text_plain(self) -> None:
        """The shared registry selects the text parser for ``text/plain``."""
        from paperless.parsers.registry import get_parser_registry

        selected = get_parser_registry().get_parser_for_file("text/plain", "doc.txt")

        assert selected is TextDocumentParser

    def test_get_parser_for_text_csv(self) -> None:
        """The shared registry selects the text parser for ``text/csv``."""
        from paperless.parsers.registry import get_parser_registry

        selected = get_parser_registry().get_parser_for_file("text/csv", "data.csv")

        assert selected is TextDocumentParser

    def test_get_parser_for_unknown_type_returns_none(self) -> None:
        """The shared registry returns no parser class for ``application/pdf``."""
        from paperless.parsers.registry import get_parser_registry

        selected = get_parser_registry().get_parser_for_file(
            "application/pdf",
            "doc.pdf",
        )

        assert selected is None
@pytest.fixture()
def dummy_parser_cls() -> type:
    """Provide a do-nothing parser class exposing the complete parser interface.

    GIVEN: Registry and Protocol tests need a minimal but complete parser.
    WHEN: A test requests this fixture.
    THEN: A class defining every attribute and method used by those tests
          is returned (the class itself, not an instance).
    """

    class DummyParser:
        # --- descriptive metadata -------------------------------------
        name = "dummy-parser"
        version = "0.1.0"
        author = "Test Author"
        url = "https://example.com/dummy-parser"

        # --- context-manager lifecycle --------------------------------
        def __enter__(self) -> Self:
            return self

        def __exit__(self, exc_type, exc_val, exc_tb) -> None:
            """Nothing to release; present only to complete the interface."""

        # --- selection ------------------------------------------------
        @classmethod
        def supported_mime_types(cls) -> dict[str, str]:
            return {"text/plain": ".txt"}

        @classmethod
        def score(
            cls,
            mime_type: str,
            filename: str,
            path: Path | None = None,
        ) -> int | None:
            return 10

        # --- capabilities ---------------------------------------------
        @property
        def can_produce_archive(self) -> bool:
            return False

        @property
        def requires_pdf_rendition(self) -> bool:
            return False

        # --- parsing and results --------------------------------------
        def parse(
            self,
            document_path: Path,
            mime_type: str,
            *,
            produce_archive: bool = True,
        ) -> None:
            """Intentionally a no-op; present only to complete the interface."""

        def get_text(self) -> str | None:
            return None

        def get_date(self) -> None:
            return None

        def get_archive_path(self) -> Path | None:
            return None

        def get_thumbnail(
            self,
            document_path: Path,
            mime_type: str,
        ) -> Path:
            return Path("/tmp/thumbnail.webp")

        def get_page_count(
            self,
            document_path: Path,
            mime_type: str,
        ) -> int | None:
            return None

        def extract_metadata(
            self,
            document_path: Path,
            mime_type: str,
        ) -> list:
            return []

    return DummyParser
@pytest.mark.parametrize(
    "missing_method",
    [
        pytest.param("parse", id="missing-parse"),
        pytest.param("get_text", id="missing-get_text"),
        pytest.param("get_thumbnail", id="missing-get_thumbnail"),
        pytest.param("__enter__", id="missing-__enter__"),
        pytest.param("__exit__", id="missing-__exit__"),
    ],
)
def test_partial_compliant_fails_isinstance(
    self,
    dummy_parser_cls: type,
    missing_method: str,
) -> None:
    """
    GIVEN: A class that satisfies ParserProtocol except for one method.
    WHEN: isinstance() is called with ParserProtocol.
    THEN: The check fails because the Protocol is not fully satisfied.
    """
    # Build a subclass that shadows the named method with None. The
    # attribute is not actually deleted; it still exists on the subclass
    # but is no longer callable, which is what breaks compliance here.
    partial_cls = type(
        "PartialParser",
        (dummy_parser_cls,),
        {missing_method: None},  # Replace with None — not callable
    )
    assert not isinstance(partial_cls(), ParserProtocol)
+ WHEN: reset_parser_registry() is called and then get_parser_registry() + is called again. + THEN: A new, distinct registry instance is returned. + """ + first = get_parser_registry() + reset_parser_registry() + second = get_parser_registry() + assert first is not second + + def test_init_builtin_parsers_does_not_run_discover( + self, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + """ + GIVEN: discover() would raise an exception if called. + WHEN: init_builtin_parsers() is called. + THEN: No exception is raised, confirming discover() was not invoked. + """ + + def exploding_discover(self) -> None: + raise RuntimeError( + "discover() must not be called from init_builtin_parsers", + ) + + monkeypatch.setattr(ParserRegistry, "discover", exploding_discover) + + # Should complete without raising. + init_builtin_parsers() + + def test_init_builtin_parsers_idempotent(self) -> None: + """ + GIVEN: init_builtin_parsers() has already been called once. + WHEN: init_builtin_parsers() is called a second time. + THEN: No error is raised and the same registry instance is reused. + """ + init_builtin_parsers() + # Capture the registry created by the first call. + import paperless.parsers.registry as reg_module + + first_registry = reg_module._registry + + init_builtin_parsers() + + assert reg_module._registry is first_registry + + +class TestParserRegistryGetParserForFile: + """Verify parser selection logic in get_parser_for_file().""" + + def test_returns_none_when_no_parsers_registered(self) -> None: + """ + GIVEN: A registry with no parsers registered. + WHEN: get_parser_for_file() is called for any MIME type. + THEN: None is returned. + """ + registry = ParserRegistry() + result = registry.get_parser_for_file("text/plain", "doc.txt") + assert result is None + + def test_returns_none_for_unsupported_mime_type( + self, + dummy_parser_cls: type, + ) -> None: + """ + GIVEN: A registry with a parser that supports only 'text/plain'. 
def test_highest_score_wins(self) -> None:
    """
    GIVEN: Two built-in parsers for 'text/plain', one scoring 5 and one
           scoring 20.
    WHEN: get_parser_for_file() is asked for a 'text/plain' file.
    THEN: The class scoring 20 is selected.
    """

    class LowScoreParser:
        name = "low"
        version = "1.0"
        author = "A"
        url = "https://example.com/low"

        @classmethod
        def supported_mime_types(cls):
            return {"text/plain": ".txt"}

        @classmethod
        def score(cls, mime_type, filename, path=None):
            return 5

    class HighScoreParser:
        name = "high"
        version = "1.0"
        author = "B"
        url = "https://example.com/high"

        @classmethod
        def supported_mime_types(cls):
            return {"text/plain": ".txt"}

        @classmethod
        def score(cls, mime_type, filename, path=None):
            return 20

    registry = ParserRegistry()
    # Register the weaker candidate first so the outcome cannot be
    # explained by registration order alone.
    for candidate in (LowScoreParser, HighScoreParser):
        registry.register_builtin(candidate)

    winner = registry.get_parser_for_file("text/plain", "readme.txt")

    assert winner is HighScoreParser
+ """ + + class DecliningParser: + name = "declining" + version = "1.0" + author = "A" + url = "https://example.com" + + @classmethod + def supported_mime_types(cls): + return {"text/plain": ".txt"} + + @classmethod + def score(cls, mime_type, filename, path=None): + return None # Explicitly declines + + registry = ParserRegistry() + registry.register_builtin(DecliningParser) + result = registry.get_parser_for_file("text/plain", "readme.txt") + assert result is None + + def test_all_parsers_decline_returns_none(self) -> None: + """ + GIVEN: Multiple parsers that all return None from score(). + WHEN: get_parser_for_file() is called. + THEN: None is returned. + """ + + class AlwaysDeclines: + name = "declines" + version = "1.0" + author = "A" + url = "https://example.com" + + @classmethod + def supported_mime_types(cls): + return {"text/plain": ".txt"} + + @classmethod + def score(cls, mime_type, filename, path=None): + return None + + registry = ParserRegistry() + registry.register_builtin(AlwaysDeclines) + registry._external.append(AlwaysDeclines) + result = registry.get_parser_for_file("text/plain", "file.txt") + assert result is None + + def test_external_parser_beats_builtin_same_score(self) -> None: + """ + GIVEN: An external and a built-in parser both returning score 10. + WHEN: get_parser_for_file() is called. + THEN: The external parser wins because externals are evaluated first + and the first-seen-wins policy applies at equal scores. 
+ """ + + class BuiltinParser: + name = "builtin" + version = "1.0" + author = "Core" + url = "https://example.com/builtin" + + @classmethod + def supported_mime_types(cls): + return {"text/plain": ".txt"} + + @classmethod + def score(cls, mime_type, filename, path=None): + return 10 + + class ExternalParser: + name = "external" + version = "2.0" + author = "Third Party" + url = "https://example.com/external" + + @classmethod + def supported_mime_types(cls): + return {"text/plain": ".txt"} + + @classmethod + def score(cls, mime_type, filename, path=None): + return 10 + + registry = ParserRegistry() + registry.register_builtin(BuiltinParser) + registry._external.append(ExternalParser) + result = registry.get_parser_for_file("text/plain", "file.txt") + assert result is ExternalParser + + def test_builtin_wins_when_external_declines(self) -> None: + """ + GIVEN: An external parser that declines (score None) and a built-in + that returns score 5. + WHEN: get_parser_for_file() is called. + THEN: The built-in parser is returned. 
def test_discover_with_no_entrypoints(self) -> None:
    """
    GIVEN: entry_points() is patched to report no entrypoints.
    WHEN: discover() is called on a fresh registry.
    THEN: No error is raised and _external is still an empty list.
    """
    registry = ParserRegistry()
    no_entrypoints = patch(
        "paperless.parsers.registry.entry_points",
        return_value=[],
    )

    with no_entrypoints:
        registry.discover()

    assert registry._external == []
+ """ + + class ValidExternal: + name = "valid-external" + version = "3.0.0" + author = "Someone" + url = "https://example.com/valid" + + @classmethod + def supported_mime_types(cls): + return {"application/pdf": ".pdf"} + + @classmethod + def score(cls, mime_type, filename, path=None): + return 5 + + mock_ep = MagicMock(spec=EntryPoint) + mock_ep.name = "valid_external" + mock_ep.load.return_value = ValidExternal + + registry = ParserRegistry() + + with patch( + "paperless.parsers.registry.entry_points", + return_value=[mock_ep], + ): + registry.discover() + + assert ValidExternal in registry._external + + def test_discover_skips_entrypoint_with_load_error( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + """ + GIVEN: An entrypoint whose load() method raises ImportError. + WHEN: discover() is called. + THEN: The entrypoint is skipped, an error is logged, and _external + remains empty. + """ + mock_ep = MagicMock(spec=EntryPoint) + mock_ep.name = "broken_ep" + mock_ep.load.side_effect = ImportError("missing dependency") + + registry = ParserRegistry() + + with caplog.at_level(logging.ERROR, logger="paperless.parsers.registry"): + with patch( + "paperless.parsers.registry.entry_points", + return_value=[mock_ep], + ): + registry.discover() + + assert registry._external == [] + assert any( + "broken_ep" in record.message + for record in caplog.records + if record.levelno >= logging.ERROR + ) + + def test_discover_skips_entrypoint_with_missing_attrs( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + """ + GIVEN: A class loaded from an entrypoint that is missing the 'score' + attribute. + WHEN: discover() is called. + THEN: The entrypoint is skipped, a warning is logged, and _external + remains empty. + """ + + class MissingScore: + name = "missing-score" + version = "1.0" + author = "Someone" + url = "https://example.com" + + # 'score' classmethod is intentionally absent. 
+ + @classmethod + def supported_mime_types(cls): + return {"text/plain": ".txt"} + + mock_ep = MagicMock(spec=EntryPoint) + mock_ep.name = "missing_score_ep" + mock_ep.load.return_value = MissingScore + + registry = ParserRegistry() + + with caplog.at_level(logging.WARNING, logger="paperless.parsers.registry"): + with patch( + "paperless.parsers.registry.entry_points", + return_value=[mock_ep], + ): + registry.discover() + + assert registry._external == [] + assert any( + "missing_score_ep" in record.message + for record in caplog.records + if record.levelno >= logging.WARNING + ) + + def test_discover_logs_loaded_parser_info( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + """ + GIVEN: A valid entrypoint that loads successfully. + WHEN: discover() is called. + THEN: An INFO log message is emitted containing the parser name, + version, author, and entrypoint name. + """ + + class LoggableParser: + name = "loggable" + version = "4.2.0" + author = "Log Tester" + url = "https://example.com/loggable" + + @classmethod + def supported_mime_types(cls): + return {"image/png": ".png"} + + @classmethod + def score(cls, mime_type, filename, path=None): + return 1 + + mock_ep = MagicMock(spec=EntryPoint) + mock_ep.name = "loggable_ep" + mock_ep.load.return_value = LoggableParser + + registry = ParserRegistry() + + with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"): + with patch( + "paperless.parsers.registry.entry_points", + return_value=[mock_ep], + ): + registry.discover() + + info_messages = " ".join( + r.message for r in caplog.records if r.levelno == logging.INFO + ) + assert "loggable" in info_messages + assert "4.2.0" in info_messages + assert "Log Tester" in info_messages + assert "loggable_ep" in info_messages + + +class TestLogSummary: + """Verify log output from ParserRegistry.log_summary().""" + + def test_log_summary_with_no_external_parsers( + self, + dummy_parser_cls: type, + caplog: pytest.LogCaptureFixture, + ) -> None: + """ + 
GIVEN: A registry with one built-in parser and no external parsers. + WHEN: log_summary() is called. + THEN: The built-in parser name appears in the logs. + """ + registry = ParserRegistry() + registry.register_builtin(dummy_parser_cls) + + with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"): + registry.log_summary() + + all_messages = " ".join(r.message for r in caplog.records) + assert dummy_parser_cls.name in all_messages + + def test_log_summary_with_external_parsers( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + """ + GIVEN: A registry with one external parser registered. + WHEN: log_summary() is called. + THEN: The external parser name, version, author, and url appear in + the log output. + """ + + class ExtParser: + name = "ext-parser" + version = "9.9.9" + author = "Ext Corp" + url = "https://ext.example.com" + + @classmethod + def supported_mime_types(cls): + return {} + + @classmethod + def score(cls, mime_type, filename, path=None): + return None + + registry = ParserRegistry() + registry._external.append(ExtParser) + + with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"): + registry.log_summary() + + all_messages = " ".join(r.message for r in caplog.records) + assert "ext-parser" in all_messages + assert "9.9.9" in all_messages + assert "Ext Corp" in all_messages + assert "https://ext.example.com" in all_messages + + def test_log_summary_logs_no_third_party_message_when_none( + self, + caplog: pytest.LogCaptureFixture, + ) -> None: + """ + GIVEN: A registry with no external parsers. + WHEN: log_summary() is called. + THEN: A message containing 'No third-party parsers discovered.' is + logged. + """ + registry = ParserRegistry() + + with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"): + registry.log_summary() + + all_messages = " ".join(r.message for r in caplog.records) + assert "No third-party parsers discovered." 
in all_messages diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py deleted file mode 100644 index a6c149a0a..000000000 --- a/src/paperless_text/parsers.py +++ /dev/null @@ -1,50 +0,0 @@ -from pathlib import Path - -from django.conf import settings -from PIL import Image -from PIL import ImageDraw -from PIL import ImageFont - -from documents.parsers import DocumentParser - - -class TextDocumentParser(DocumentParser): - """ - This parser directly parses a text document (.txt, .md, or .csv) - """ - - logging_name = "paperless.parsing.text" - - def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path: - # Avoid reading entire file into memory - max_chars = 100_000 - file_size_limit = 50 * 1024 * 1024 - - if document_path.stat().st_size > file_size_limit: - text = "[File too large to preview]" - else: - with Path(document_path).open("r", encoding="utf-8", errors="replace") as f: - text = f.read(max_chars) - - img = Image.new("RGB", (500, 700), color="white") - draw = ImageDraw.Draw(img) - font = ImageFont.truetype( - font=settings.THUMBNAIL_FONT_NAME, - size=20, - layout_engine=ImageFont.Layout.BASIC, - ) - draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4) - - out_path = self.tempdir / "thumb.webp" - img.save(out_path, format="WEBP") - - return out_path - - def parse(self, document_path, mime_type, file_name=None) -> None: - self.text = self.read_file_handle_unicode_errors(document_path) - - def get_settings(self) -> None: - """ - This parser does not implement additional settings yet - """ - return None diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py index 05804c6d6..cf74d1c0e 100644 --- a/src/paperless_text/signals.py +++ b/src/paperless_text/signals.py @@ -1,7 +1,13 @@ def get_parser(*args, **kwargs): - from paperless_text.parsers import TextDocumentParser + from paperless.parsers.text import TextDocumentParser - return TextDocumentParser(*args, **kwargs) + # The new 
TextDocumentParser does not accept the legacy logging_group / + # progress_callback kwargs injected by the old signal-based consumer. + # These are dropped here; Phase 4 will replace this signal path with the + # new ParserRegistry so the shim can be removed at that point. + kwargs.pop("logging_group", None) + kwargs.pop("progress_callback", None) + return TextDocumentParser() def text_consumer_declaration(sender, **kwargs): diff --git a/src/paperless_text/tests/conftest.py b/src/paperless_text/tests/conftest.py deleted file mode 100644 index 1d9e4fc2f..000000000 --- a/src/paperless_text/tests/conftest.py +++ /dev/null @@ -1,30 +0,0 @@ -from collections.abc import Generator -from pathlib import Path - -import pytest - -from paperless_text.parsers import TextDocumentParser - - -@pytest.fixture(scope="session") -def sample_dir() -> Path: - return (Path(__file__).parent / Path("samples")).resolve() - - -@pytest.fixture() -def text_parser() -> Generator[TextDocumentParser, None, None]: - try: - parser = TextDocumentParser(logging_group=None) - yield parser - finally: - parser.cleanup() - - -@pytest.fixture(scope="session") -def sample_txt_file(sample_dir: Path) -> Path: - return sample_dir / "test.txt" - - -@pytest.fixture(scope="session") -def malformed_txt_file(sample_dir: Path) -> Path: - return sample_dir / "decode_error.txt" diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py deleted file mode 100644 index b1086bc3d..000000000 --- a/src/paperless_text/tests/test_parser.py +++ /dev/null @@ -1,69 +0,0 @@ -import tempfile -from pathlib import Path - -from paperless_text.parsers import TextDocumentParser - - -class TestTextParser: - def test_thumbnail( - self, - text_parser: TextDocumentParser, - sample_txt_file: Path, - ) -> None: - # just make sure that it does not crash - f = text_parser.get_thumbnail(sample_txt_file, "text/plain") - assert f.exists() - assert f.is_file() - - def test_parse( - self, - text_parser: 
TextDocumentParser, - sample_txt_file: Path, - ) -> None: - text_parser.parse(sample_txt_file, "text/plain") - - assert text_parser.get_text() == "This is a test file.\n" - assert text_parser.get_archive_path() is None - - def test_parse_invalid_bytes( - self, - text_parser: TextDocumentParser, - malformed_txt_file: Path, - ) -> None: - """ - GIVEN: - - Text file which contains invalid UTF bytes - WHEN: - - The file is parsed - THEN: - - Parsing continues - - Invalid bytes are removed - """ - - text_parser.parse(malformed_txt_file, "text/plain") - - assert text_parser.get_text() == "Pantothens�ure\n" - assert text_parser.get_archive_path() is None - - def test_thumbnail_large_file(self, text_parser: TextDocumentParser) -> None: - """ - GIVEN: - - A very large text file (>50MB) - WHEN: - - A thumbnail is requested - THEN: - - A thumbnail is created without reading the entire file into memory - """ - with tempfile.NamedTemporaryFile( - delete=False, - mode="w", - encoding="utf-8", - suffix=".txt", - ) as tmp: - tmp.write("A" * (51 * 1024 * 1024)) # 51 MB of 'A' - large_file = Path(tmp.name) - - thumb = text_parser.get_thumbnail(large_file, "text/plain") - assert thumb.exists() - assert thumb.is_file() - large_file.unlink() diff --git a/src/paperless_tika/tests/conftest.py b/src/paperless_tika/tests/conftest.py index 657192e4e..5a54dae95 100644 --- a/src/paperless_tika/tests/conftest.py +++ b/src/paperless_tika/tests/conftest.py @@ -12,6 +12,7 @@ def tika_parser() -> Generator[TikaDocumentParser, None, None]: parser = TikaDocumentParser(logging_group=None) yield parser finally: + # TODO(stumpylog): Cleanup once all parsers are handled parser.cleanup()