Mostly typing improvements, plus some code reuse

This commit is contained in:
Trenton H
2026-03-28 13:21:52 -07:00
parent d5248838ca
commit 5cbbe0be89
6 changed files with 61 additions and 53 deletions
+1 -1
View File
@@ -839,7 +839,7 @@ class ConsumerPlugin(
return document
def apply_overrides(self, document) -> None:
def apply_overrides(self, document: Document) -> None:
if self.metadata.correspondent_id:
document.correspondent = Correspondent.objects.get(
pk=self.metadata.correspondent_id,
+9 -8
View File
@@ -5,6 +5,7 @@ import shutil
import stat
import subprocess
from pathlib import Path
from typing import Any
from django.conf import settings
from django.core.checks import Error
@@ -22,7 +23,7 @@ writeable_hint = (
)
def path_check(var, directory: Path) -> list[Error]:
def path_check(var: str, directory: Path) -> list[Error]:
messages: list[Error] = []
if directory:
if not directory.is_dir():
@@ -59,7 +60,7 @@ def path_check(var, directory: Path) -> list[Error]:
@register()
def paths_check(app_configs, **kwargs) -> list[Error]:
def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
"""
Check the various paths for existence, readability and writeability
"""
@@ -73,7 +74,7 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
@register()
def binaries_check(app_configs, **kwargs):
def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
"""
Paperless requires the existence of a few binaries, so we do some checks
for those here.
@@ -93,7 +94,7 @@ def binaries_check(app_configs, **kwargs):
@register()
def debug_mode_check(app_configs, **kwargs):
def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
if settings.DEBUG:
return [
Warning(
@@ -109,7 +110,7 @@ def debug_mode_check(app_configs, **kwargs):
@register()
def settings_values_check(app_configs, **kwargs):
def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
"""
Validates at least some of the user provided settings
"""
@@ -182,7 +183,7 @@ def settings_values_check(app_configs, **kwargs):
@register()
def audit_log_check(app_configs, **kwargs):
def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
db_conn = connections["default"]
all_tables = db_conn.introspection.table_names()
result = []
@@ -329,7 +330,7 @@ def check_deprecated_v2_ocr_env_vars(
@register()
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
):
@@ -355,7 +356,7 @@ def get_tesseract_langs():
@register()
def check_default_language_available(app_configs, **kwargs):
def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
errs = []
if not settings.OCR_LANGUAGE:
+12 -5
View File
@@ -4,6 +4,11 @@ import json
from django.conf import settings
from paperless.models import ApplicationConfiguration
from paperless.models import ArchiveFileGenerationChoices
from paperless.models import CleanChoices
from paperless.models import ColorConvertChoices
from paperless.models import ModeChoices
from paperless.models import OutputTypeChoices
@dataclasses.dataclass
@@ -28,7 +33,7 @@ class OutputTypeConfig(BaseConfig):
Almost all parsers care about the chosen PDF output format
"""
output_type: str = dataclasses.field(init=False)
output_type: OutputTypeChoices = dataclasses.field(init=False)
def __post_init__(self) -> None:
app_config = self._get_config_instance()
@@ -45,15 +50,17 @@ class OcrConfig(OutputTypeConfig):
pages: int | None = dataclasses.field(init=False)
language: str = dataclasses.field(init=False)
mode: str = dataclasses.field(init=False)
archive_file_generation: str = dataclasses.field(init=False)
mode: ModeChoices = dataclasses.field(init=False)
archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
init=False,
)
image_dpi: int | None = dataclasses.field(init=False)
clean: str = dataclasses.field(init=False)
clean: CleanChoices = dataclasses.field(init=False)
deskew: bool = dataclasses.field(init=False)
rotate: bool = dataclasses.field(init=False)
rotate_threshold: float = dataclasses.field(init=False)
max_image_pixel: float | None = dataclasses.field(init=False)
color_conversion_strategy: str = dataclasses.field(init=False)
color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
user_args: dict[str, str] | None = dataclasses.field(init=False)
def __post_init__(self) -> None:
+36 -37
View File
@@ -9,6 +9,8 @@ import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Any
from typing import Final
from typing import NoReturn
from typing import Self
from django.conf import settings
@@ -36,7 +38,11 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.parsing.tesseract")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
_SRGB_ICC_DATA: Final[bytes] = (
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
)
_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
"application/pdf": ".pdf",
"image/jpeg": ".jpg",
"image/png": ".png",
@@ -102,7 +108,7 @@ class RasterisedDocumentParser:
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
def __init__(self, logging_group: object | None = None) -> None:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
self.tempdir = Path(
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
@@ -236,7 +242,7 @@ class RasterisedDocumentParser:
if (
sidecar_file is not None
and sidecar_file.is_file()
and self.settings.mode != "redo"
and self.settings.mode != ModeChoices.REDO
):
text = read_file_handle_unicode_errors(sidecar_file)
@@ -374,7 +380,7 @@ class RasterisedDocumentParser:
return ocrmypdf_args
def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
def _convert_image_to_pdfa(self, document_path: Path) -> Path:
"""Convert an image to a PDF/A-2b file without invoking the OCR engine.
Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
@@ -400,14 +406,10 @@ class RasterisedDocumentParser:
f"img2pdf conversion failed for {document_path}: {e!s}",
) from e
icc_data = (
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
)
pdfa_path = Path(self.tempdir) / "archive.pdf"
try:
with pikepdf.open(plain_pdf_path) as pdf:
cs = pdf.make_stream(icc_data)
cs = pdf.make_stream(_SRGB_ICC_DATA)
cs["/N"] = 3
output_intent = pikepdf.Dictionary(
Type=pikepdf.Name("/OutputIntent"),
@@ -430,6 +432,22 @@ class RasterisedDocumentParser:
return pdfa_path
def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
    """Turn a subprocess output failure into a ParseError.

    When the error text mentions a Ghostscript PDF/A rendering failure, a
    warning is logged first that points at the PAPERLESS_OCR_USER_ARGS
    setting. In every case a ParseError chained to the original exception
    is raised, so this method never returns normally.

    Args:
        e: The exception caught by the caller.

    Raises:
        ParseError: Always, with ``e`` attached as the cause.
    """
    detail = str(e)

    # Only the Ghostscript PDF/A failure gets the extra configuration hint.
    if "Ghostscript PDF/A rendering" in detail:
        hint = (
            "Ghostscript PDF/A rendering failed, consider setting "
            "PAPERLESS_OCR_USER_ARGS: "
            "'{\"continue_on_soft_render_error\": true}'"
        )
        self.log.warning(hint)

    message = f"SubprocessOutputError: {detail}. See logs for more information."
    raise ParseError(message) from e
def parse(
self,
document_path: Path,
@@ -440,6 +458,13 @@ class RasterisedDocumentParser:
# This forces tesseract to use one core per page.
os.environ["OMP_THREAD_LIMIT"] = "1"
import ocrmypdf
from ocrmypdf import EncryptedPdfError
from ocrmypdf import InputFileError
from ocrmypdf import SubprocessOutputError
from ocrmypdf.exceptions import DigitalSignatureError
from ocrmypdf.exceptions import PriorOcrFoundError
if mime_type == "application/pdf":
text_original = self.extract_text(None, document_path)
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
@@ -458,7 +483,6 @@ class RasterisedDocumentParser:
try:
self.archive_path = self._convert_image_to_pdfa(
document_path,
mime_type,
)
self.text = ""
except Exception as e:
@@ -467,9 +491,6 @@ class RasterisedDocumentParser:
) from e
return
# PDFs in off mode: PDF/A conversion only via skip_text
import ocrmypdf
from ocrmypdf import SubprocessOutputError
archive_path = Path(self.tempdir) / "archive.pdf"
sidecar_file = Path(self.tempdir) / "sidecar.txt"
args = self.construct_ocrmypdf_parameters(
@@ -487,15 +508,7 @@ class RasterisedDocumentParser:
self.archive_path = archive_path
self.text = self.extract_text(None, archive_path) or text_original or ""
except SubprocessOutputError as e:
if "Ghostscript PDF/A rendering" in str(e):
self.log.warning(
"Ghostscript PDF/A rendering failed, consider setting "
"PAPERLESS_OCR_USER_ARGS: "
"'{\"continue_on_soft_render_error\": true}'",
)
raise ParseError(
f"SubprocessOutputError: {e!s}. See logs for more information.",
) from e
self._handle_subprocess_output_error(e)
except Exception as e:
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
return
@@ -513,13 +526,6 @@ class RasterisedDocumentParser:
return
# --- All other paths: run ocrmypdf ---
import ocrmypdf
from ocrmypdf import EncryptedPdfError
from ocrmypdf import InputFileError
from ocrmypdf import SubprocessOutputError
from ocrmypdf.exceptions import DigitalSignatureError
from ocrmypdf.exceptions import PriorOcrFoundError
archive_path = Path(self.tempdir) / "archive.pdf"
sidecar_file = Path(self.tempdir) / "sidecar.txt"
@@ -553,14 +559,7 @@ class RasterisedDocumentParser:
if original_has_text:
self.text = text_original
except SubprocessOutputError as e:
if "Ghostscript PDF/A rendering" in str(e):
self.log.warning(
"Ghostscript PDF/A rendering failed, consider setting "
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
)
raise ParseError(
f"SubprocessOutputError: {e!s}. See logs for more information.",
) from e
self._handle_subprocess_output_error(e)
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
self.log.warning(
f"Encountered an error while running OCR: {e!s}. "
+2 -1
View File
@@ -13,6 +13,7 @@ import re
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Final
if TYPE_CHECKING:
from paperless.parsers import MetadataEntry
@@ -21,7 +22,7 @@ logger = logging.getLogger("paperless.parsers.utils")
# Minimum character count for a PDF to be considered "born-digital" (has real text).
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
PDF_TEXT_MIN_LENGTH = 50
PDF_TEXT_MIN_LENGTH: Final[int] = 50
def is_tagged_pdf(
@@ -354,7 +354,7 @@ class TestOffModeImage:
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
mock_convert.assert_called_once_with(simple_png_file, "image/png")
mock_convert.assert_called_once_with(simple_png_file)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path == fake_archive
assert tesseract_parser.get_text() == ""