From 5cbbe0be89c69aed3acfc9a7745a0b1055182e23 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sat, 28 Mar 2026 13:21:52 -0700 Subject: [PATCH] Improvements for typing purposes mostly + some reuse --- src/documents/consumer.py | 2 +- src/paperless/checks.py | 17 +++-- src/paperless/config.py | 17 +++-- src/paperless/parsers/tesseract.py | 73 +++++++++---------- src/paperless/parsers/utils.py | 3 +- .../tests/parsers/test_parse_modes.py | 2 +- 6 files changed, 61 insertions(+), 53 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 92eec23fc..14f7904a7 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -839,7 +839,7 @@ class ConsumerPlugin( return document - def apply_overrides(self, document) -> None: + def apply_overrides(self, document: Document) -> None: if self.metadata.correspondent_id: document.correspondent = Correspondent.objects.get( pk=self.metadata.correspondent_id, diff --git a/src/paperless/checks.py b/src/paperless/checks.py index efeebb4ee..fbcae320a 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -5,6 +5,7 @@ import shutil import stat import subprocess from pathlib import Path +from typing import Any from django.conf import settings from django.core.checks import Error @@ -22,7 +23,7 @@ writeable_hint = ( ) -def path_check(var, directory: Path) -> list[Error]: +def path_check(var: str, directory: Path) -> list[Error]: messages: list[Error] = [] if directory: if not directory.is_dir(): @@ -59,7 +60,7 @@ def path_check(var, directory: Path) -> list[Error]: @register() -def paths_check(app_configs, **kwargs) -> list[Error]: +def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]: """ Check the various paths for existence, readability and writeability """ @@ -73,7 +74,7 @@ def paths_check(app_configs, **kwargs) -> list[Error]: @register() -def binaries_check(app_configs, **kwargs): +def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]: """ Paperless requires the existence of a few binaries, so we do some checks for those here. @@ -93,7 +94,7 @@ def binaries_check(app_configs, **kwargs): @register() -def debug_mode_check(app_configs, **kwargs): +def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]: if settings.DEBUG: return [ Warning( @@ -109,7 +110,7 @@ def debug_mode_check(app_configs, **kwargs): @register() -def settings_values_check(app_configs, **kwargs): +def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]: """ Validates at least some of the user provided settings """ @@ -182,7 +183,7 @@ def settings_values_check(app_configs, **kwargs): @register() -def audit_log_check(app_configs, **kwargs): +def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]: db_conn = connections["default"] all_tables = db_conn.introspection.table_names() result = [] @@ -329,7 +330,7 @@ def check_deprecated_v2_ocr_env_vars( @register() -def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]: +def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]: if settings.REMOTE_OCR_ENGINE == "azureai" and not ( settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY ): @@ -355,7 +356,7 @@ def get_tesseract_langs(): @register() -def check_default_language_available(app_configs, **kwargs): +def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]: errs = [] if not settings.OCR_LANGUAGE: diff --git a/src/paperless/config.py b/src/paperless/config.py index 50cd80b6a..8363cfb1f 100644 --- a/src/paperless/config.py +++ b/src/paperless/config.py @@ -4,6 +4,11 @@ import json from django.conf import settings from paperless.models import ApplicationConfiguration +from paperless.models import ArchiveFileGenerationChoices +from paperless.models import CleanChoices +from paperless.models import ColorConvertChoices +from paperless.models import ModeChoices +from paperless.models import OutputTypeChoices @dataclasses.dataclass @@ -28,7 +33,7 @@ class OutputTypeConfig(BaseConfig): Almost all parsers care about the chosen PDF output format """ - output_type: str = dataclasses.field(init=False) + output_type: OutputTypeChoices = dataclasses.field(init=False) def __post_init__(self) -> None: app_config = self._get_config_instance() @@ -45,15 +50,17 @@ class OcrConfig(OutputTypeConfig): pages: int | None = dataclasses.field(init=False) language: str = dataclasses.field(init=False) - mode: str = dataclasses.field(init=False) - archive_file_generation: str = dataclasses.field(init=False) + mode: ModeChoices = dataclasses.field(init=False) + archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field( + init=False, + ) image_dpi: int | None = dataclasses.field(init=False) - clean: str = dataclasses.field(init=False) + clean: CleanChoices = dataclasses.field(init=False) deskew: bool = dataclasses.field(init=False) rotate: bool = dataclasses.field(init=False) rotate_threshold: float = dataclasses.field(init=False) max_image_pixel: float | None = dataclasses.field(init=False) - color_conversion_strategy: str = dataclasses.field(init=False) + color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False) user_args: dict[str, str] | None = dataclasses.field(init=False) def __post_init__(self) -> None: diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index fb02c8e64..ccdcc3ecb 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -9,6 +9,8 @@ import tempfile from pathlib import Path from typing import TYPE_CHECKING from typing import Any +from typing import Final +from typing import NoReturn from typing import Self from django.conf import settings @@ -36,7 +38,11 @@ if TYPE_CHECKING: logger = logging.getLogger("paperless.parsing.tesseract") -_SUPPORTED_MIME_TYPES: dict[str, str] = { +_SRGB_ICC_DATA: Final[bytes] = ( + importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes() +) + +_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = { "application/pdf": ".pdf", "image/jpeg": ".jpg", "image/png": ".png", @@ -102,7 +108,7 @@ class RasterisedDocumentParser: # Lifecycle # ------------------------------------------------------------------ - def __init__(self, logging_group: object = None) -> None: + def __init__(self, logging_group: object | None = None) -> None: settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) self.tempdir = Path( tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), @@ -236,7 +242,7 @@ class RasterisedDocumentParser: if ( sidecar_file is not None and sidecar_file.is_file() - and self.settings.mode != "redo" + and self.settings.mode != ModeChoices.REDO ): text = read_file_handle_unicode_errors(sidecar_file) @@ -374,7 +380,7 @@ class RasterisedDocumentParser: return ocrmypdf_args - def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path: + def _convert_image_to_pdfa(self, document_path: Path) -> Path: """Convert an image to a PDF/A-2b file without invoking the OCR engine. Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp @@ -400,14 +406,10 @@ class RasterisedDocumentParser: f"img2pdf conversion failed for {document_path}: {e!s}", ) from e - icc_data = ( - importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes() - ) - pdfa_path = Path(self.tempdir) / "archive.pdf" try: with pikepdf.open(plain_pdf_path) as pdf: - cs = pdf.make_stream(icc_data) + cs = pdf.make_stream(_SRGB_ICC_DATA) cs["/N"] = 3 output_intent = pikepdf.Dictionary( Type=pikepdf.Name("/OutputIntent"), @@ -430,6 +432,22 @@ class RasterisedDocumentParser: return pdfa_path + def _handle_subprocess_output_error(self, e: Exception) -> NoReturn: + """Log context for Ghostscript failures and raise ParseError. + + Called from the SubprocessOutputError handlers in parse() to avoid + duplicating the Ghostscript hint and re-raise logic. + """ + if "Ghostscript PDF/A rendering" in str(e): + self.log.warning( + "Ghostscript PDF/A rendering failed, consider setting " + "PAPERLESS_OCR_USER_ARGS: " + "'{\"continue_on_soft_render_error\": true}'", + ) + raise ParseError( + f"SubprocessOutputError: {e!s}. See logs for more information.", + ) from e + def parse( self, document_path: Path, @@ -440,6 +458,13 @@ class RasterisedDocumentParser: # This forces tesseract to use one core per page. os.environ["OMP_THREAD_LIMIT"] = "1" + import ocrmypdf + from ocrmypdf import EncryptedPdfError + from ocrmypdf import InputFileError + from ocrmypdf import SubprocessOutputError + from ocrmypdf.exceptions import DigitalSignatureError + from ocrmypdf.exceptions import PriorOcrFoundError + if mime_type == "application/pdf": text_original = self.extract_text(None, document_path) original_has_text = is_tagged_pdf(document_path, log=self.log) or ( @@ -458,7 +483,6 @@ class RasterisedDocumentParser: try: self.archive_path = self._convert_image_to_pdfa( document_path, - mime_type, ) self.text = "" except Exception as e: @@ -467,9 +491,6 @@ class RasterisedDocumentParser: ) from e return # PDFs in off mode: PDF/A conversion only via skip_text - import ocrmypdf - from ocrmypdf import SubprocessOutputError - archive_path = Path(self.tempdir) / "archive.pdf" sidecar_file = Path(self.tempdir) / "sidecar.txt" args = self.construct_ocrmypdf_parameters( @@ -487,15 +508,7 @@ class RasterisedDocumentParser: self.archive_path = archive_path self.text = self.extract_text(None, archive_path) or text_original or "" except SubprocessOutputError as e: - if "Ghostscript PDF/A rendering" in str(e): - self.log.warning( - "Ghostscript PDF/A rendering failed, consider setting " - "PAPERLESS_OCR_USER_ARGS: " - "'{\"continue_on_soft_render_error\": true}'", - ) - raise ParseError( - f"SubprocessOutputError: {e!s}. See logs for more information.", - ) from e + self._handle_subprocess_output_error(e) except Exception as e: raise ParseError(f"{e.__class__.__name__}: {e!s}") from e return @@ -513,13 +526,6 @@ class RasterisedDocumentParser: return # --- All other paths: run ocrmypdf --- - import ocrmypdf - from ocrmypdf import EncryptedPdfError - from ocrmypdf import InputFileError - from ocrmypdf import SubprocessOutputError - from ocrmypdf.exceptions import DigitalSignatureError - from ocrmypdf.exceptions import PriorOcrFoundError - archive_path = Path(self.tempdir) / "archive.pdf" sidecar_file = Path(self.tempdir) / "sidecar.txt" @@ -553,14 +559,7 @@ class RasterisedDocumentParser: if original_has_text: self.text = text_original except SubprocessOutputError as e: - if "Ghostscript PDF/A rendering" in str(e): - self.log.warning( - "Ghostscript PDF/A rendering failed, consider setting " - "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'", - ) - raise ParseError( - f"SubprocessOutputError: {e!s}. See logs for more information.", - ) from e + self._handle_subprocess_output_error(e) except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e: self.log.warning( f"Encountered an error while running OCR: {e!s}. " diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py index 0e65aadd3..8cc4630bf 100644 --- a/src/paperless/parsers/utils.py +++ b/src/paperless/parsers/utils.py @@ -13,6 +13,7 @@ import re import tempfile from pathlib import Path from typing import TYPE_CHECKING +from typing import Final if TYPE_CHECKING: from paperless.parsers import MetadataEntry @@ -21,7 +22,7 @@ logger = logging.getLogger("paperless.parsers.utils") # Minimum character count for a PDF to be considered "born-digital" (has real text). # Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision). -PDF_TEXT_MIN_LENGTH = 50 +PDF_TEXT_MIN_LENGTH: Final[int] = 50 def is_tagged_pdf( diff --git a/src/paperless/tests/parsers/test_parse_modes.py b/src/paperless/tests/parsers/test_parse_modes.py index 6766379c3..f101e6561 100644 --- a/src/paperless/tests/parsers/test_parse_modes.py +++ b/src/paperless/tests/parsers/test_parse_modes.py @@ -354,7 +354,7 @@ class TestOffModeImage: tesseract_parser.settings.mode = "off" tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True) - mock_convert.assert_called_once_with(simple_png_file, "image/png") + mock_convert.assert_called_once_with(simple_png_file) mock_ocr.assert_not_called() assert tesseract_parser.archive_path == fake_archive assert tesseract_parser.get_text() == ""