From 97bc53ccdc3ada038abcbeaa5b5aa4e9a65a22f2 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:02:43 -0700 Subject: [PATCH] Refactor RasterisedDocumentParser to ParserProtocol interface - Add RasterisedDocumentParser to registry.register_defaults() - Update parser class: remove DocumentParser inheritance, add Protocol class attrs/classmethods/properties, context-manager lifecycle - Add read_file_handle_unicode_errors() to shared parsers/utils.py - Replace inline unicode-error-handling with shared utility call Co-Authored-By: Claude Sonnet 4.6 --- src/paperless/parsers/registry.py | 2 + src/paperless/parsers/tesseract.py | 206 ++++++++++++++++++++--------- src/paperless/parsers/utils.py | 28 ++++ 3 files changed, 170 insertions(+), 66 deletions(-) diff --git a/src/paperless/parsers/registry.py b/src/paperless/parsers/registry.py index dc227ce7a..7effe554f 100644 --- a/src/paperless/parsers/registry.py +++ b/src/paperless/parsers/registry.py @@ -195,6 +195,7 @@ class ParserRegistry: """ from paperless.parsers.mail import MailDocumentParser from paperless.parsers.remote import RemoteDocumentParser + from paperless.parsers.tesseract import RasterisedDocumentParser from paperless.parsers.text import TextDocumentParser from paperless.parsers.tika import TikaDocumentParser @@ -202,6 +203,7 @@ class ParserRegistry: self.register_builtin(RemoteDocumentParser) self.register_builtin(TikaDocumentParser) self.register_builtin(MailDocumentParser) + self.register_builtin(RasterisedDocumentParser) # ------------------------------------------------------------------ # Discovery diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index 73532caa0..f61482d02 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -1,13 +1,17 @@ +from __future__ import annotations + +import logging import os import re +import shutil import tempfile from pathlib import Path from typing import TYPE_CHECKING +from typing import Self from django.conf import settings from PIL import Image -from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf from documents.utils import maybe_override_pixel_limit @@ -16,6 +20,28 @@ from paperless.config import OcrConfig from paperless.models import ArchiveFileChoices from paperless.models import CleanChoices from paperless.models import ModeChoices +from paperless.parsers.utils import read_file_handle_unicode_errors +from paperless.version import __full_version_str__ + +if TYPE_CHECKING: + import datetime + from types import TracebackType + + from paperless.parsers import MetadataEntry + from paperless.parsers import ParserContext + +logger = logging.getLogger("paperless.parsing.tesseract") + +_SUPPORTED_MIME_TYPES: dict[str, str] = { + "application/pdf": ".pdf", + "image/jpeg": ".jpg", + "image/png": ".png", + "image/tiff": ".tif", + "image/gif": ".gif", + "image/bmp": ".bmp", + "image/webp": ".webp", + "image/heic": ".heic", +} class NoTextFoundException(Exception): @@ -26,79 +52,123 @@ class RtlLanguageException(Exception): pass -class RasterisedDocumentParser(DocumentParser): +class RasterisedDocumentParser: """ This parser uses Tesseract to try and get some text out of a rasterised image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) """ - logging_name = "paperless.parsing.tesseract" + name: str = "Paperless-ngx Tesseract OCR Parser" + version: str = __full_version_str__ + author: str = "Paperless-ngx Contributors" + url: str = "https://github.com/paperless-ngx/paperless-ngx" - def get_settings(self) -> OcrConfig: - """ - This parser uses the OCR configuration settings to parse documents - """ - return OcrConfig() + # ------------------------------------------------------------------ + # Class methods + # ------------------------------------------------------------------ - def get_page_count(self, document_path, mime_type): - page_count = None - if mime_type == "application/pdf": - try: - import pikepdf + @classmethod + def supported_mime_types(cls) -> dict[str, str]: + return _SUPPORTED_MIME_TYPES - with pikepdf.Pdf.open(document_path) as pdf: - page_count = len(pdf.pages) - except Exception as e: - self.log.warning( - f"Unable to determine PDF page count {document_path}: {e}", - ) - return page_count + @classmethod + def score( + cls, + mime_type: str, + filename: str, + path: Path | None = None, + ) -> int | None: + if mime_type in _SUPPORTED_MIME_TYPES: + return 10 + return None - def extract_metadata(self, document_path, mime_type): - result = [] - if mime_type == "application/pdf": - import pikepdf + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ - namespace_pattern = re.compile(r"\{(.*)\}(.*)") + @property + def can_produce_archive(self) -> bool: + return True - pdf = pikepdf.open(document_path) - meta = pdf.open_metadata() - for key, value in meta.items(): - if isinstance(value, list): - value = " ".join([str(e) for e in value]) - value = str(value) - try: - m = namespace_pattern.match(key) - if m is None: # pragma: no cover - continue - namespace = m.group(1) - key_value = m.group(2) - try: - namespace.encode("utf-8") - key_value.encode("utf-8") - except UnicodeEncodeError as e: # pragma: no cover - self.log.debug(f"Skipping metadata key {key}: {e}") - continue - result.append( - { - "namespace": namespace, - "prefix": meta.REVERSE_NS[namespace], - "key": key_value, - "value": value, - }, - ) - except Exception as e: - self.log.warning( - f"Error while reading metadata {key}: {value}. Error: {e}", - ) - return result + @property + def requires_pdf_rendition(self) -> bool: + return False - def get_thumbnail(self, document_path, mime_type, file_name=None): - return make_thumbnail_from_pdf( - self.archive_path or document_path, - self.tempdir, - self.logging_group, + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def __init__(self, logging_group: object = None) -> None: + settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) + self.tempdir = Path( + tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), ) + self.settings = OcrConfig() + self.archive_path: Path | None = None + self.text: str | None = None + self.date: datetime.datetime | None = None + self.log = logger + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: TracebackType | None, + ) -> None: + logger.debug("Cleaning up temporary directory %s", self.tempdir) + shutil.rmtree(self.tempdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # Core parsing interface + # ------------------------------------------------------------------ + + def configure(self, context: ParserContext) -> None: + pass + + # ------------------------------------------------------------------ + # Result accessors + # ------------------------------------------------------------------ + + def get_text(self) -> str | None: + return self.text + + def get_date(self) -> datetime.datetime | None: + return self.date + + def get_archive_path(self) -> Path | None: + return self.archive_path + + # ------------------------------------------------------------------ + # Thumbnail, page count, and metadata + # ------------------------------------------------------------------ + + def get_thumbnail(self, document_path: Path, mime_type: str) -> Path: + return make_thumbnail_from_pdf( + self.archive_path or Path(document_path), + self.tempdir, + ) + + def get_page_count(self, document_path: Path, mime_type: str) -> int | None: + if mime_type == "application/pdf": + from paperless.parsers.utils import get_page_count_for_pdf + + return get_page_count_for_pdf(Path(document_path), log=self.log) + return None + + def extract_metadata( + self, + document_path: Path, + mime_type: str, + ) -> list[MetadataEntry]: + if mime_type != "application/pdf": + return [] + + from paperless.parsers.utils import extract_pdf_metadata + + return extract_pdf_metadata(Path(document_path), log=self.log) def is_image(self, mime_type) -> bool: return mime_type in [ @@ -163,7 +233,7 @@ class RasterisedDocumentParser(DocumentParser): and sidecar_file.is_file() and self.settings.mode != "redo" ): - text = self.read_file_handle_unicode_errors(sidecar_file) + text = read_file_handle_unicode_errors(sidecar_file) if "[OCR skipped on page" not in text: # This happens when there's already text in the input file. @@ -196,7 +266,7 @@ class RasterisedDocumentParser(DocumentParser): ], logger=self.log, ) - text = self.read_file_handle_unicode_errors(Path(tmp.name)) + text = read_file_handle_unicode_errors(Path(tmp.name)) return post_process_text(text) @@ -218,8 +288,6 @@ class RasterisedDocumentParser(DocumentParser): *, safe_fallback=False, ): - if TYPE_CHECKING: - assert isinstance(self.settings, OcrConfig) ocrmypdf_args = { "input_file_or_options": input_file, "output_file": output_file, @@ -330,7 +398,13 @@ class RasterisedDocumentParser(DocumentParser): return ocrmypdf_args - def parse(self, document_path: Path, mime_type, file_name=None) -> None: + def parse( + self, + document_path: Path, + mime_type: str, + *, + produce_archive: bool = True, + ) -> None: # This forces tesseract to use one core per page. os.environ["OMP_THREAD_LIMIT"] = "1" VALID_TEXT_LENGTH = 50 diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py index b72f31a28..5378e2de5 100644 --- a/src/paperless/parsers/utils.py +++ b/src/paperless/parsers/utils.py @@ -20,6 +20,34 @@ if TYPE_CHECKING: logger = logging.getLogger("paperless.parsers.utils") +def read_file_handle_unicode_errors( + filepath: Path, + log: logging.Logger | None = None, +) -> str: + """Read a file as UTF-8 text, replacing invalid bytes rather than raising. + + Parameters + ---------- + filepath: + Absolute path to the file to read. + log: + Logger to use for warnings. Falls back to the module-level logger + when omitted. + + Returns + ------- + str + File content as a string, with any invalid UTF-8 sequences replaced + by the Unicode replacement character. + """ + _log = log or logger + try: + return filepath.read_text(encoding="utf-8") + except UnicodeDecodeError as e: + _log.warning("Unicode error during text reading, continuing: %s", e) + return filepath.read_bytes().decode("utf-8", errors="replace") + + def get_page_count_for_pdf( document_path: Path, log: logging.Logger | None = None,