Mostly typing improvements, plus some code reuse

2026-06-20 12:24:17 +00:00 · 2026-03-28 13:21:52 -07:00
parent d5248838ca
commit 5cbbe0be89
6 changed files with 61 additions and 53 deletions
@@ -839,7 +839,7 @@ class ConsumerPlugin(

        return document

-    def apply_overrides(self, document) -> None:
+    def apply_overrides(self, document: Document) -> None:
        if self.metadata.correspondent_id:
            document.correspondent = Correspondent.objects.get(
                pk=self.metadata.correspondent_id,
@@ -5,6 +5,7 @@ import shutil
 import stat
 import subprocess
 from pathlib import Path
+from typing import Any

 from django.conf import settings
 from django.core.checks import Error
@@ -22,7 +23,7 @@ writeable_hint = (
 )


-def path_check(var, directory: Path) -> list[Error]:
+def path_check(var: str, directory: Path) -> list[Error]:
    messages: list[Error] = []
    if directory:
        if not directory.is_dir():
@@ -59,7 +60,7 @@ def path_check(var, directory: Path) -> list[Error]:


@register()
-def paths_check(app_configs, **kwargs) -> list[Error]:
+def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
    """
    Check the various paths for existence, readability and writeability
    """
@@ -73,7 +74,7 @@ def paths_check(app_configs, **kwargs) -> list[Error]:


@register()
-def binaries_check(app_configs, **kwargs):
+def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
    """
    Paperless requires the existence of a few binaries, so we do some checks
    for those here.
@@ -93,7 +94,7 @@ def binaries_check(app_configs, **kwargs):


@register()
-def debug_mode_check(app_configs, **kwargs):
+def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
    if settings.DEBUG:
        return [
            Warning(
@@ -109,7 +110,7 @@ def debug_mode_check(app_configs, **kwargs):


@register()
-def settings_values_check(app_configs, **kwargs):
+def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
    """
    Validates at least some of the user provided settings
    """
@@ -182,7 +183,7 @@ def settings_values_check(app_configs, **kwargs):


@register()
-def audit_log_check(app_configs, **kwargs):
+def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
    db_conn = connections["default"]
    all_tables = db_conn.introspection.table_names()
    result = []
@@ -329,7 +330,7 @@ def check_deprecated_v2_ocr_env_vars(


@register()
-def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
+def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
    if settings.REMOTE_OCR_ENGINE == "azureai" and not (
        settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
    ):
@@ -355,7 +356,7 @@ def get_tesseract_langs():


@register()
-def check_default_language_available(app_configs, **kwargs):
+def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
    errs = []

    if not settings.OCR_LANGUAGE:
@@ -4,6 +4,11 @@ import json
 from django.conf import settings

 from paperless.models import ApplicationConfiguration
+from paperless.models import ArchiveFileGenerationChoices
+from paperless.models import CleanChoices
+from paperless.models import ColorConvertChoices
+from paperless.models import ModeChoices
+from paperless.models import OutputTypeChoices


@dataclasses.dataclass
@@ -28,7 +33,7 @@ class OutputTypeConfig(BaseConfig):
    Almost all parsers care about the chosen PDF output format
    """

-    output_type: str = dataclasses.field(init=False)
+    output_type: OutputTypeChoices = dataclasses.field(init=False)

    def __post_init__(self) -> None:
        app_config = self._get_config_instance()
@@ -45,15 +50,17 @@ class OcrConfig(OutputTypeConfig):

    pages: int | None = dataclasses.field(init=False)
    language: str = dataclasses.field(init=False)
-    mode: str = dataclasses.field(init=False)
-    archive_file_generation: str = dataclasses.field(init=False)
+    mode: ModeChoices = dataclasses.field(init=False)
+    archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
+        init=False,
+    )
    image_dpi: int | None = dataclasses.field(init=False)
-    clean: str = dataclasses.field(init=False)
+    clean: CleanChoices = dataclasses.field(init=False)
    deskew: bool = dataclasses.field(init=False)
    rotate: bool = dataclasses.field(init=False)
    rotate_threshold: float = dataclasses.field(init=False)
    max_image_pixel: float | None = dataclasses.field(init=False)
-    color_conversion_strategy: str = dataclasses.field(init=False)
+    color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
    user_args: dict[str, str] | None = dataclasses.field(init=False)

    def __post_init__(self) -> None:
@@ -9,6 +9,8 @@ import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Final
+from typing import NoReturn
 from typing import Self

 from django.conf import settings
@@ -36,7 +38,11 @@ if TYPE_CHECKING:

 logger = logging.getLogger("paperless.parsing.tesseract")

-_SUPPORTED_MIME_TYPES: dict[str, str] = {
+_SRGB_ICC_DATA: Final[bytes] = (
+    importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
+)
+
+_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
    "application/pdf": ".pdf",
    "image/jpeg": ".jpg",
    "image/png": ".png",
@@ -102,7 +108,7 @@ class RasterisedDocumentParser:
    # Lifecycle
    # ------------------------------------------------------------------

-    def __init__(self, logging_group: object = None) -> None:
+    def __init__(self, logging_group: object | None = None) -> None:
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self.tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
@@ -236,7 +242,7 @@ class RasterisedDocumentParser:
        if (
            sidecar_file is not None
            and sidecar_file.is_file()
-            and self.settings.mode != "redo"
+            and self.settings.mode != ModeChoices.REDO
        ):
            text = read_file_handle_unicode_errors(sidecar_file)

@@ -374,7 +380,7 @@ class RasterisedDocumentParser:

        return ocrmypdf_args

-    def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
+    def _convert_image_to_pdfa(self, document_path: Path) -> Path:
        """Convert an image to a PDF/A-2b file without invoking the OCR engine.

        Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
@@ -400,14 +406,10 @@ class RasterisedDocumentParser:
                f"img2pdf conversion failed for {document_path}: {e!s}",
            ) from e

-        icc_data = (
-            importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
-        )
-
        pdfa_path = Path(self.tempdir) / "archive.pdf"
        try:
            with pikepdf.open(plain_pdf_path) as pdf:
-                cs = pdf.make_stream(icc_data)
+                cs = pdf.make_stream(_SRGB_ICC_DATA)
                cs["/N"] = 3
                output_intent = pikepdf.Dictionary(
                    Type=pikepdf.Name("/OutputIntent"),
@@ -430,6 +432,22 @@ class RasterisedDocumentParser:

        return pdfa_path

+    def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
+        """Log a Ghostscript hint when applicable, then always raise ParseError.
+
+        Shared by the SubprocessOutputError handlers in parse(). Never returns:
+        the ParseError is chained from ``e`` so the original cause is preserved.
+        """
+        if "Ghostscript PDF/A rendering" in str(e):
+            self.log.warning(
+                "Ghostscript PDF/A rendering failed, consider setting "
+                "PAPERLESS_OCR_USER_ARGS: "
+                "'{\"continue_on_soft_render_error\": true}'",
+            )
+        raise ParseError(
+            f"SubprocessOutputError: {e!s}. See logs for more information.",
+        ) from e
+
    def parse(
        self,
        document_path: Path,
@@ -440,6 +458,13 @@ class RasterisedDocumentParser:
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"

+        import ocrmypdf
+        from ocrmypdf import EncryptedPdfError
+        from ocrmypdf import InputFileError
+        from ocrmypdf import SubprocessOutputError
+        from ocrmypdf.exceptions import DigitalSignatureError
+        from ocrmypdf.exceptions import PriorOcrFoundError
+
        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
            original_has_text = is_tagged_pdf(document_path, log=self.log) or (
@@ -458,7 +483,6 @@ class RasterisedDocumentParser:
                try:
                    self.archive_path = self._convert_image_to_pdfa(
                        document_path,
-                        mime_type,
                    )
                    self.text = ""
                except Exception as e:
@@ -467,9 +491,6 @@ class RasterisedDocumentParser:
                    ) from e
                return
            # PDFs in off mode: PDF/A conversion only via skip_text
-            import ocrmypdf
-            from ocrmypdf import SubprocessOutputError
-
            archive_path = Path(self.tempdir) / "archive.pdf"
            sidecar_file = Path(self.tempdir) / "sidecar.txt"
            args = self.construct_ocrmypdf_parameters(
@@ -487,15 +508,7 @@ class RasterisedDocumentParser:
                self.archive_path = archive_path
                self.text = self.extract_text(None, archive_path) or text_original or ""
            except SubprocessOutputError as e:
-                if "Ghostscript PDF/A rendering" in str(e):
-                    self.log.warning(
-                        "Ghostscript PDF/A rendering failed, consider setting "
-                        "PAPERLESS_OCR_USER_ARGS: "
-                        "'{\"continue_on_soft_render_error\": true}'",
-                    )
-                raise ParseError(
-                    f"SubprocessOutputError: {e!s}. See logs for more information.",
-                ) from e
+                self._handle_subprocess_output_error(e)
            except Exception as e:
                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
            return
@@ -513,13 +526,6 @@ class RasterisedDocumentParser:
            return

        # --- All other paths: run ocrmypdf ---
-        import ocrmypdf
-        from ocrmypdf import EncryptedPdfError
-        from ocrmypdf import InputFileError
-        from ocrmypdf import SubprocessOutputError
-        from ocrmypdf.exceptions import DigitalSignatureError
-        from ocrmypdf.exceptions import PriorOcrFoundError
-
        archive_path = Path(self.tempdir) / "archive.pdf"
        sidecar_file = Path(self.tempdir) / "sidecar.txt"

@@ -553,14 +559,7 @@ class RasterisedDocumentParser:
            if original_has_text:
                self.text = text_original
        except SubprocessOutputError as e:
-            if "Ghostscript PDF/A rendering" in str(e):
-                self.log.warning(
-                    "Ghostscript PDF/A rendering failed, consider setting "
-                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
-                )
-            raise ParseError(
-                f"SubprocessOutputError: {e!s}. See logs for more information.",
-            ) from e
+            self._handle_subprocess_output_error(e)
        except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
            self.log.warning(
                f"Encountered an error while running OCR: {e!s}. "
@@ -13,6 +13,7 @@ import re
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
+from typing import Final

 if TYPE_CHECKING:
    from paperless.parsers import MetadataEntry
@@ -21,7 +22,7 @@ logger = logging.getLogger("paperless.parsers.utils")

 # Minimum character count for a PDF to be considered "born-digital" (has real text).
 # Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
-PDF_TEXT_MIN_LENGTH = 50
+PDF_TEXT_MIN_LENGTH: Final[int] = 50


 def is_tagged_pdf(
@@ -354,7 +354,7 @@ class TestOffModeImage:
        tesseract_parser.settings.mode = "off"
        tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)

-        mock_convert.assert_called_once_with(simple_png_file, "image/png")
+        mock_convert.assert_called_once_with(simple_png_file)
        mock_ocr.assert_not_called()
        assert tesseract_parser.archive_path == fake_archive
        assert tesseract_parser.get_text() == ""