From 5cbbe0be89c69aed3acfc9a7745a0b1055182e23 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Sat, 28 Mar 2026 13:21:52 -0700
Subject: [PATCH] Improve type annotations and deduplicate tesseract parser code

---
 src/documents/consumer.py                     |  2 +-
 src/paperless/checks.py                       | 17 +++--
 src/paperless/config.py                       | 17 +++--
 src/paperless/parsers/tesseract.py            | 73 +++++++++----------
 src/paperless/parsers/utils.py                |  3 +-
 .../tests/parsers/test_parse_modes.py         |  2 +-
 6 files changed, 61 insertions(+), 53 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 92eec23fc..14f7904a7 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -839,7 +839,7 @@ class ConsumerPlugin(
 
         return document
 
-    def apply_overrides(self, document) -> None:
+    def apply_overrides(self, document: Document) -> None:
         if self.metadata.correspondent_id:
             document.correspondent = Correspondent.objects.get(
                 pk=self.metadata.correspondent_id,
diff --git a/src/paperless/checks.py b/src/paperless/checks.py
index efeebb4ee..fbcae320a 100644
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -5,6 +5,7 @@ import shutil
 import stat
 import subprocess
 from pathlib import Path
+from typing import Any
 
 from django.conf import settings
 from django.core.checks import Error
@@ -22,7 +23,7 @@ writeable_hint = (
 )
 
 
-def path_check(var, directory: Path) -> list[Error]:
+def path_check(var: str, directory: Path) -> list[Error]:
     messages: list[Error] = []
     if directory:
         if not directory.is_dir():
@@ -59,7 +60,7 @@ def path_check(var, directory: Path) -> list[Error]:
 
 
 @register()
-def paths_check(app_configs, **kwargs) -> list[Error]:
+def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
     """
     Check the various paths for existence, readability and writeability
     """
@@ -73,7 +74,7 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
 
 
 @register()
-def binaries_check(app_configs, **kwargs):
+def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
     """
     Paperless requires the existence of a few binaries, so we do some checks
     for those here.
@@ -93,7 +94,7 @@ def binaries_check(app_configs, **kwargs):
 
 
 @register()
-def debug_mode_check(app_configs, **kwargs):
+def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
     if settings.DEBUG:
         return [
             Warning(
@@ -109,7 +110,7 @@ def debug_mode_check(app_configs, **kwargs):
 
 
 @register()
-def settings_values_check(app_configs, **kwargs):
+def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
     """
     Validates at least some of the user provided settings
     """
@@ -182,7 +183,7 @@ def settings_values_check(app_configs, **kwargs):
 
 
 @register()
-def audit_log_check(app_configs, **kwargs):
+def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
     db_conn = connections["default"]
     all_tables = db_conn.introspection.table_names()
     result = []
@@ -329,7 +330,7 @@ def check_deprecated_v2_ocr_env_vars(
 
 
 @register()
-def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
+def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
     if settings.REMOTE_OCR_ENGINE == "azureai" and not (
         settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
     ):
@@ -355,7 +356,7 @@ def get_tesseract_langs():
 
 
 @register()
-def check_default_language_available(app_configs, **kwargs):
+def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
     errs = []
 
     if not settings.OCR_LANGUAGE:
diff --git a/src/paperless/config.py b/src/paperless/config.py
index 50cd80b6a..8363cfb1f 100644
--- a/src/paperless/config.py
+++ b/src/paperless/config.py
@@ -4,6 +4,11 @@ import json
 from django.conf import settings
 
 from paperless.models import ApplicationConfiguration
+from paperless.models import ArchiveFileGenerationChoices
+from paperless.models import CleanChoices
+from paperless.models import ColorConvertChoices
+from paperless.models import ModeChoices
+from paperless.models import OutputTypeChoices
 
 
 @dataclasses.dataclass
@@ -28,7 +33,7 @@ class OutputTypeConfig(BaseConfig):
     Almost all parsers care about the chosen PDF output format
     """
 
-    output_type: str = dataclasses.field(init=False)
+    output_type: OutputTypeChoices = dataclasses.field(init=False)
 
     def __post_init__(self) -> None:
         app_config = self._get_config_instance()
@@ -45,15 +50,17 @@ class OcrConfig(OutputTypeConfig):
 
     pages: int | None = dataclasses.field(init=False)
     language: str = dataclasses.field(init=False)
-    mode: str = dataclasses.field(init=False)
-    archive_file_generation: str = dataclasses.field(init=False)
+    mode: ModeChoices = dataclasses.field(init=False)
+    archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
+        init=False,
+    )
     image_dpi: int | None = dataclasses.field(init=False)
-    clean: str = dataclasses.field(init=False)
+    clean: CleanChoices = dataclasses.field(init=False)
     deskew: bool = dataclasses.field(init=False)
     rotate: bool = dataclasses.field(init=False)
     rotate_threshold: float = dataclasses.field(init=False)
     max_image_pixel: float | None = dataclasses.field(init=False)
-    color_conversion_strategy: str = dataclasses.field(init=False)
+    color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
     user_args: dict[str, str] | None = dataclasses.field(init=False)
 
     def __post_init__(self) -> None:
diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py
index fb02c8e64..ccdcc3ecb 100644
--- a/src/paperless/parsers/tesseract.py
+++ b/src/paperless/parsers/tesseract.py
@@ -9,6 +9,8 @@ import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Final
+from typing import NoReturn
 from typing import Self
 
 from django.conf import settings
@@ -36,7 +38,11 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger("paperless.parsing.tesseract")
 
-_SUPPORTED_MIME_TYPES: dict[str, str] = {
+_SRGB_ICC_DATA: Final[bytes] = (
+    importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
+)
+
+_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
     "application/pdf": ".pdf",
     "image/jpeg": ".jpg",
     "image/png": ".png",
@@ -102,7 +108,7 @@ class RasterisedDocumentParser:
     # Lifecycle
     # ------------------------------------------------------------------
 
-    def __init__(self, logging_group: object = None) -> None:
+    def __init__(self, logging_group: object | None = None) -> None:
         settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
         self.tempdir = Path(
             tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
@@ -236,7 +242,7 @@ class RasterisedDocumentParser:
         if (
             sidecar_file is not None
             and sidecar_file.is_file()
-            and self.settings.mode != "redo"
+            and self.settings.mode != ModeChoices.REDO
         ):
             text = read_file_handle_unicode_errors(sidecar_file)
 
@@ -374,7 +380,7 @@ class RasterisedDocumentParser:
 
         return ocrmypdf_args
 
-    def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
+    def _convert_image_to_pdfa(self, document_path: Path) -> Path:
         """Convert an image to a PDF/A-2b file without invoking the OCR engine.
 
         Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
@@ -400,14 +406,10 @@ class RasterisedDocumentParser:
                 f"img2pdf conversion failed for {document_path}: {e!s}",
             ) from e
 
-        icc_data = (
-            importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
-        )
-
         pdfa_path = Path(self.tempdir) / "archive.pdf"
         try:
             with pikepdf.open(plain_pdf_path) as pdf:
-                cs = pdf.make_stream(icc_data)
+                cs = pdf.make_stream(_SRGB_ICC_DATA)
                 cs["/N"] = 3
                 output_intent = pikepdf.Dictionary(
                     Type=pikepdf.Name("/OutputIntent"),
@@ -430,6 +432,22 @@ class RasterisedDocumentParser:
 
         return pdfa_path
 
+    def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
+        """Raise ParseError for ``e``, first logging a hint if Ghostscript failed.
+
+        Shared by the SubprocessOutputError handlers in parse(). The hint is only
+        logged when the message mentions "Ghostscript PDF/A rendering".
+        """
+        if "Ghostscript PDF/A rendering" in str(e):
+            self.log.warning(
+                "Ghostscript PDF/A rendering failed, consider setting "
+                "PAPERLESS_OCR_USER_ARGS: "
+                "'{\"continue_on_soft_render_error\": true}'",
+            )
+        raise ParseError(
+            f"SubprocessOutputError: {e!s}. See logs for more information.",
+        ) from e
+
     def parse(
         self,
         document_path: Path,
@@ -440,6 +458,13 @@ class RasterisedDocumentParser:
         # This forces tesseract to use one core per page.
         os.environ["OMP_THREAD_LIMIT"] = "1"
 
+        import ocrmypdf
+        from ocrmypdf import EncryptedPdfError
+        from ocrmypdf import InputFileError
+        from ocrmypdf import SubprocessOutputError
+        from ocrmypdf.exceptions import DigitalSignatureError
+        from ocrmypdf.exceptions import PriorOcrFoundError
+
         if mime_type == "application/pdf":
             text_original = self.extract_text(None, document_path)
             original_has_text = is_tagged_pdf(document_path, log=self.log) or (
@@ -458,7 +483,6 @@ class RasterisedDocumentParser:
                 try:
                     self.archive_path = self._convert_image_to_pdfa(
                         document_path,
-                        mime_type,
                     )
                     self.text = ""
                 except Exception as e:
@@ -467,9 +491,6 @@ class RasterisedDocumentParser:
                     ) from e
                 return
             # PDFs in off mode: PDF/A conversion only via skip_text
-            import ocrmypdf
-            from ocrmypdf import SubprocessOutputError
-
             archive_path = Path(self.tempdir) / "archive.pdf"
             sidecar_file = Path(self.tempdir) / "sidecar.txt"
             args = self.construct_ocrmypdf_parameters(
@@ -487,15 +508,7 @@ class RasterisedDocumentParser:
                 self.archive_path = archive_path
                 self.text = self.extract_text(None, archive_path) or text_original or ""
             except SubprocessOutputError as e:
-                if "Ghostscript PDF/A rendering" in str(e):
-                    self.log.warning(
-                        "Ghostscript PDF/A rendering failed, consider setting "
-                        "PAPERLESS_OCR_USER_ARGS: "
-                        "'{\"continue_on_soft_render_error\": true}'",
-                    )
-                raise ParseError(
-                    f"SubprocessOutputError: {e!s}. See logs for more information.",
-                ) from e
+                self._handle_subprocess_output_error(e)
             except Exception as e:
                 raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
             return
@@ -513,13 +526,6 @@ class RasterisedDocumentParser:
             return
 
         # --- All other paths: run ocrmypdf ---
-        import ocrmypdf
-        from ocrmypdf import EncryptedPdfError
-        from ocrmypdf import InputFileError
-        from ocrmypdf import SubprocessOutputError
-        from ocrmypdf.exceptions import DigitalSignatureError
-        from ocrmypdf.exceptions import PriorOcrFoundError
-
         archive_path = Path(self.tempdir) / "archive.pdf"
         sidecar_file = Path(self.tempdir) / "sidecar.txt"
 
@@ -553,14 +559,7 @@ class RasterisedDocumentParser:
             if original_has_text:
                 self.text = text_original
         except SubprocessOutputError as e:
-            if "Ghostscript PDF/A rendering" in str(e):
-                self.log.warning(
-                    "Ghostscript PDF/A rendering failed, consider setting "
-                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
-                )
-            raise ParseError(
-                f"SubprocessOutputError: {e!s}. See logs for more information.",
-            ) from e
+            self._handle_subprocess_output_error(e)
         except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
             self.log.warning(
                 f"Encountered an error while running OCR: {e!s}. "
diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py
index 0e65aadd3..8cc4630bf 100644
--- a/src/paperless/parsers/utils.py
+++ b/src/paperless/parsers/utils.py
@@ -13,6 +13,7 @@ import re
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
+from typing import Final
 
 if TYPE_CHECKING:
     from paperless.parsers import MetadataEntry
@@ -21,7 +22,7 @@ logger = logging.getLogger("paperless.parsers.utils")
 
 # Minimum character count for a PDF to be considered "born-digital" (has real text).
 # Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
-PDF_TEXT_MIN_LENGTH = 50
+PDF_TEXT_MIN_LENGTH: Final[int] = 50
 
 
 def is_tagged_pdf(
diff --git a/src/paperless/tests/parsers/test_parse_modes.py b/src/paperless/tests/parsers/test_parse_modes.py
index 6766379c3..f101e6561 100644
--- a/src/paperless/tests/parsers/test_parse_modes.py
+++ b/src/paperless/tests/parsers/test_parse_modes.py
@@ -354,7 +354,7 @@ class TestOffModeImage:
         tesseract_parser.settings.mode = "off"
         tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
 
-        mock_convert.assert_called_once_with(simple_png_file, "image/png")
+        mock_convert.assert_called_once_with(simple_png_file)
         mock_ocr.assert_not_called()
         assert tesseract_parser.archive_path == fake_archive
         assert tesseract_parser.get_text() == ""