mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-04 13:45:25 +00:00
Improvements for typing purposes mostly + some reuse
This commit is contained in:
@@ -839,7 +839,7 @@ class ConsumerPlugin(
|
||||
|
||||
return document
|
||||
|
||||
def apply_overrides(self, document) -> None:
|
||||
def apply_overrides(self, document: Document) -> None:
|
||||
if self.metadata.correspondent_id:
|
||||
document.correspondent = Correspondent.objects.get(
|
||||
pk=self.metadata.correspondent_id,
|
||||
|
||||
@@ -5,6 +5,7 @@ import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
@@ -22,7 +23,7 @@ writeable_hint = (
|
||||
)
|
||||
|
||||
|
||||
def path_check(var, directory: Path) -> list[Error]:
|
||||
def path_check(var: str, directory: Path) -> list[Error]:
|
||||
messages: list[Error] = []
|
||||
if directory:
|
||||
if not directory.is_dir():
|
||||
@@ -59,7 +60,7 @@ def path_check(var, directory: Path) -> list[Error]:
|
||||
|
||||
|
||||
@register()
|
||||
def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||
def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
"""
|
||||
Check the various paths for existence, readability and writeability
|
||||
"""
|
||||
@@ -73,7 +74,7 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||
|
||||
|
||||
@register()
|
||||
def binaries_check(app_configs, **kwargs):
|
||||
def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
"""
|
||||
Paperless requires the existence of a few binaries, so we do some checks
|
||||
for those here.
|
||||
@@ -93,7 +94,7 @@ def binaries_check(app_configs, **kwargs):
|
||||
|
||||
|
||||
@register()
|
||||
def debug_mode_check(app_configs, **kwargs):
|
||||
def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]:
|
||||
if settings.DEBUG:
|
||||
return [
|
||||
Warning(
|
||||
@@ -109,7 +110,7 @@ def debug_mode_check(app_configs, **kwargs):
|
||||
|
||||
|
||||
@register()
|
||||
def settings_values_check(app_configs, **kwargs):
|
||||
def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]:
|
||||
"""
|
||||
Validates at least some of the user provided settings
|
||||
"""
|
||||
@@ -182,7 +183,7 @@ def settings_values_check(app_configs, **kwargs):
|
||||
|
||||
|
||||
@register()
|
||||
def audit_log_check(app_configs, **kwargs):
|
||||
def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
db_conn = connections["default"]
|
||||
all_tables = db_conn.introspection.table_names()
|
||||
result = []
|
||||
@@ -329,7 +330,7 @@ def check_deprecated_v2_ocr_env_vars(
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
|
||||
def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
@@ -355,7 +356,7 @@ def get_tesseract_langs():
|
||||
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]:
|
||||
errs = []
|
||||
|
||||
if not settings.OCR_LANGUAGE:
|
||||
|
||||
+12
-5
@@ -4,6 +4,11 @@ import json
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ColorConvertChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.models import OutputTypeChoices
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@@ -28,7 +33,7 @@ class OutputTypeConfig(BaseConfig):
|
||||
Almost all parsers care about the chosen PDF output format
|
||||
"""
|
||||
|
||||
output_type: str = dataclasses.field(init=False)
|
||||
output_type: OutputTypeChoices = dataclasses.field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
app_config = self._get_config_instance()
|
||||
@@ -45,15 +50,17 @@ class OcrConfig(OutputTypeConfig):
|
||||
|
||||
pages: int | None = dataclasses.field(init=False)
|
||||
language: str = dataclasses.field(init=False)
|
||||
mode: str = dataclasses.field(init=False)
|
||||
archive_file_generation: str = dataclasses.field(init=False)
|
||||
mode: ModeChoices = dataclasses.field(init=False)
|
||||
archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field(
|
||||
init=False,
|
||||
)
|
||||
image_dpi: int | None = dataclasses.field(init=False)
|
||||
clean: str = dataclasses.field(init=False)
|
||||
clean: CleanChoices = dataclasses.field(init=False)
|
||||
deskew: bool = dataclasses.field(init=False)
|
||||
rotate: bool = dataclasses.field(init=False)
|
||||
rotate_threshold: float = dataclasses.field(init=False)
|
||||
max_image_pixel: float | None = dataclasses.field(init=False)
|
||||
color_conversion_strategy: str = dataclasses.field(init=False)
|
||||
color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False)
|
||||
user_args: dict[str, str] | None = dataclasses.field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
|
||||
@@ -9,6 +9,8 @@ import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
from typing import Final
|
||||
from typing import NoReturn
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
@@ -36,7 +38,11 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
_SRGB_ICC_DATA: Final[bytes] = (
|
||||
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
|
||||
)
|
||||
|
||||
_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
@@ -102,7 +108,7 @@ class RasterisedDocumentParser:
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
def __init__(self, logging_group: object | None = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self.tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
@@ -236,7 +242,7 @@ class RasterisedDocumentParser:
|
||||
if (
|
||||
sidecar_file is not None
|
||||
and sidecar_file.is_file()
|
||||
and self.settings.mode != "redo"
|
||||
and self.settings.mode != ModeChoices.REDO
|
||||
):
|
||||
text = read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
@@ -374,7 +380,7 @@ class RasterisedDocumentParser:
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
|
||||
def _convert_image_to_pdfa(self, document_path: Path) -> Path:
|
||||
"""Convert an image to a PDF/A-2b file without invoking the OCR engine.
|
||||
|
||||
Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
|
||||
@@ -400,14 +406,10 @@ class RasterisedDocumentParser:
|
||||
f"img2pdf conversion failed for {document_path}: {e!s}",
|
||||
) from e
|
||||
|
||||
icc_data = (
|
||||
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
|
||||
)
|
||||
|
||||
pdfa_path = Path(self.tempdir) / "archive.pdf"
|
||||
try:
|
||||
with pikepdf.open(plain_pdf_path) as pdf:
|
||||
cs = pdf.make_stream(icc_data)
|
||||
cs = pdf.make_stream(_SRGB_ICC_DATA)
|
||||
cs["/N"] = 3
|
||||
output_intent = pikepdf.Dictionary(
|
||||
Type=pikepdf.Name("/OutputIntent"),
|
||||
@@ -430,6 +432,22 @@ class RasterisedDocumentParser:
|
||||
|
||||
return pdfa_path
|
||||
|
||||
def _handle_subprocess_output_error(self, e: Exception) -> NoReturn:
|
||||
"""Log context for Ghostscript failures and raise ParseError.
|
||||
|
||||
Called from the SubprocessOutputError handlers in parse() to avoid
|
||||
duplicating the Ghostscript hint and re-raise logic.
|
||||
"""
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: "
|
||||
"'{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
@@ -440,6 +458,13 @@ class RasterisedDocumentParser:
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
|
||||
@@ -458,7 +483,6 @@ class RasterisedDocumentParser:
|
||||
try:
|
||||
self.archive_path = self._convert_image_to_pdfa(
|
||||
document_path,
|
||||
mime_type,
|
||||
)
|
||||
self.text = ""
|
||||
except Exception as e:
|
||||
@@ -467,9 +491,6 @@ class RasterisedDocumentParser:
|
||||
) from e
|
||||
return
|
||||
# PDFs in off mode: PDF/A conversion only via skip_text
|
||||
import ocrmypdf
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
@@ -487,15 +508,7 @@ class RasterisedDocumentParser:
|
||||
self.archive_path = archive_path
|
||||
self.text = self.extract_text(None, archive_path) or text_original or ""
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: "
|
||||
"'{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
self._handle_subprocess_output_error(e)
|
||||
except Exception as e:
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
return
|
||||
@@ -513,13 +526,6 @@ class RasterisedDocumentParser:
|
||||
return
|
||||
|
||||
# --- All other paths: run ocrmypdf ---
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
|
||||
@@ -553,14 +559,7 @@ class RasterisedDocumentParser:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
self._handle_subprocess_output_error(e)
|
||||
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
|
||||
@@ -13,6 +13,7 @@ import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from paperless.parsers import MetadataEntry
|
||||
@@ -21,7 +22,7 @@ logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
# Minimum character count for a PDF to be considered "born-digital" (has real text).
|
||||
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
|
||||
PDF_TEXT_MIN_LENGTH = 50
|
||||
PDF_TEXT_MIN_LENGTH: Final[int] = 50
|
||||
|
||||
|
||||
def is_tagged_pdf(
|
||||
|
||||
@@ -354,7 +354,7 @@ class TestOffModeImage:
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
|
||||
|
||||
mock_convert.assert_called_once_with(simple_png_file, "image/png")
|
||||
mock_convert.assert_called_once_with(simple_png_file)
|
||||
mock_ocr.assert_not_called()
|
||||
assert tesseract_parser.archive_path == fake_archive
|
||||
assert tesseract_parser.get_text() == ""
|
||||
|
||||
Reference in New Issue
Block a user