mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-27 03:12:45 +00:00
feat!: restructure parse() for OCR_MODE=auto/off and produce_archive flag
Implement the new decoupled archive/OCR control in RasterisedDocumentParser:

- construct_ocrmypdf_parameters(): add skip_text parameter; fix AUTO mode dispatch so skip_text is only added when explicitly requested (text-present + produce_archive case) rather than unconditionally; add OFF mode support.
- parse(): remove archive_file_generation checks; control archive creation exclusively via the produce_archive bool passed by the consumer.
- OFF + no archive: return pdftotext text, skip OCRmyPDF entirely.
- OFF + image + archive: use new _convert_image_to_pdfa() helper.
- OFF + PDF + archive: run OCRmyPDF with skip_text=True (PDF/A only).
- AUTO + text + no archive: skip OCRmyPDF entirely (fast path).
- AUTO + text + archive: run OCRmyPDF with skip_text=True.
- AUTO + no text: run normal OCR regardless of produce_archive.
- FORCE/REDO: always run OCRmyPDF; set archive_path only when produce_archive.
- Add _convert_image_to_pdfa(): img2pdf wrapping + pikepdf PDF/A-2b stamping without invoking Tesseract or Ghostscript.
- Add PriorOcrFoundError to the fallback exception list (same treatment as InputFileError: retry with force_ocr).
- Update existing tests to use produce_archive instead of archive_file_generation: TestSkipArchive rewritten; RTL test uses mode=off to preserve Arabic text layer; AUTO mode tests clarified.
- Add test_parse_modes.py: 11 focused unit tests with mocked ocrmypdf.ocr verifying control flow for all mode/produce_archive combinations.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.resources
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@@ -18,7 +19,6 @@ from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
@@ -289,6 +289,7 @@ class RasterisedDocumentParser:
|
||||
sidecar_file: Path,
|
||||
*,
|
||||
safe_fallback: bool = False,
|
||||
skip_text: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
ocrmypdf_args: dict[str, Any] = {
|
||||
"input_file_or_options": input_file,
|
||||
@@ -307,12 +308,14 @@ class RasterisedDocumentParser:
|
||||
self.settings.color_conversion_strategy
|
||||
)
|
||||
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
if safe_fallback or self.settings.mode == ModeChoices.FORCE:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.settings.mode == ModeChoices.AUTO:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
elif skip_text or self.settings.mode == ModeChoices.OFF:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.AUTO:
|
||||
pass # no extra flag: normal OCR (text not found case)
|
||||
else: # pragma: no cover
|
||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||
|
||||
@@ -397,6 +400,62 @@ class RasterisedDocumentParser:
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
    """Convert an image to a PDF/A-2b file without invoking the OCR engine.

    Uses img2pdf for the initial image->PDF wrapping, then pikepdf to attach
    an sRGB output intent and stamp PDF/A-2b conformance metadata.

    No Tesseract and no Ghostscript are invoked.

    Args:
        document_path: Path of the source image.
        mime_type: MIME type of the source image. Not consulted by this
            method; kept so the signature matches its caller.

    Returns:
        Path of ``archive.pdf`` inside ``self.tempdir``. If the PDF/A
        stamping step fails, this file is a byte copy of the plain img2pdf
        output and a warning is logged.

    Raises:
        ParseError: If img2pdf cannot convert the image.
    """
    import img2pdf
    import pikepdf

    plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
    try:
        layout_fun = None
        if self.settings.image_dpi is not None:
            # Force the configured DPI on both axes instead of relying on
            # whatever resolution the image file declares.
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (self.settings.image_dpi, self.settings.image_dpi),
            )
        plain_pdf_path.write_bytes(
            img2pdf.convert(str(document_path), layout_fun=layout_fun),
        )
    except Exception as e:
        raise ParseError(
            f"img2pdf conversion failed for {document_path}: {e!s}",
        ) from e

    # The sRGB ICC profile shipped with ocrmypdf is reused for the output
    # intent. This read is outside the try below, so a failure here
    # propagates to the caller instead of triggering the plain-PDF fallback.
    icc_data = (
        importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
    )

    pdfa_path = Path(self.tempdir) / "archive.pdf"
    try:
        with pikepdf.open(plain_pdf_path) as pdf:
            cs = pdf.make_stream(icc_data)
            cs["/N"] = 3  # number of colour components in the ICC profile
            output_intent = pikepdf.Dictionary(
                Type=pikepdf.Name("/OutputIntent"),
                S=pikepdf.Name("/GTS_PDFA1"),
                OutputConditionIdentifier=pikepdf.String("sRGB"),
                DestOutputProfile=cs,
            )
            pdf.Root["/OutputIntents"] = pdf.make_indirect(
                pikepdf.Array([output_intent]),
            )
            # pikepdf only permits XMP edits inside the context manager and
            # writes them back to the PDF on exit. Assigning to the metadata
            # object outside a ``with`` block raises, which previously sent
            # every conversion down the plain-PDF fallback.
            with pdf.open_metadata(set_pikepdf_as_editor=False) as meta:
                meta["pdfaid:part"] = "2"
                meta["pdfaid:conformance"] = "B"
            pdf.save(pdfa_path)
    except Exception as e:
        # Best effort: an archive without PDF/A markers is preferred over
        # failing the whole document.
        self.log.warning(
            f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
        )
        pdfa_path.write_bytes(plain_pdf_path.read_bytes())

    return pdfa_path
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
@@ -417,48 +476,96 @@ class RasterisedDocumentParser:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here (but not when force/redo mode is explicitly requested)
|
||||
skip_archive_for_text = self.settings.mode not in {
|
||||
ModeChoices.FORCE,
|
||||
ModeChoices.REDO,
|
||||
} and self.settings.archive_file_generation in {
|
||||
ArchiveFileGenerationChoices.NEVER,
|
||||
ArchiveFileGenerationChoices.AUTO,
|
||||
}
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
# --- OCR_MODE=off: never invoke OCR engine ---
|
||||
if self.settings.mode == ModeChoices.OFF:
|
||||
if not produce_archive:
|
||||
self.text = text_original or ""
|
||||
return
|
||||
if self.is_image(mime_type):
|
||||
try:
|
||||
self.archive_path = self._convert_image_to_pdfa(
|
||||
document_path,
|
||||
mime_type,
|
||||
)
|
||||
self.text = ""
|
||||
except Exception as e:
|
||||
raise ParseError(
|
||||
f"Image to PDF/A conversion failed: {e!s}",
|
||||
) from e
|
||||
return
|
||||
# PDFs in off mode: PDF/A conversion only via skip_text
|
||||
import ocrmypdf
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=True,
|
||||
)
|
||||
try:
|
||||
self.log.debug(
|
||||
f"Calling OCRmyPDF (off mode, PDF/A conversion only): {args}",
|
||||
)
|
||||
ocrmypdf.ocr(**args)
|
||||
self.archive_path = archive_path
|
||||
self.text = self.extract_text(None, archive_path) or text_original or ""
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: "
|
||||
"'{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except Exception as e:
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
return
|
||||
|
||||
# --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed ---
|
||||
if (
|
||||
self.settings.mode == ModeChoices.AUTO
|
||||
and original_has_text
|
||||
and not produce_archive
|
||||
):
|
||||
self.log.debug(
|
||||
"Document has text and no archive requested; skipping OCRmyPDF entirely.",
|
||||
)
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
# Either no text was in the original or there should be an archive
|
||||
# file created, so OCR the file and create an archive with any
|
||||
# text located via OCR
|
||||
|
||||
# --- All other paths: run ocrmypdf ---
|
||||
import ocrmypdf
|
||||
from ocrmypdf import EncryptedPdfError
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
|
||||
# auto mode with existing text: PDF/A conversion only (no OCR).
|
||||
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=skip_text,
|
||||
)
|
||||
|
||||
try:
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if (
|
||||
self.settings.archive_file_generation
|
||||
!= ArchiveFileGenerationChoices.NEVER
|
||||
):
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
@@ -478,11 +585,10 @@ class RasterisedDocumentParser:
|
||||
"Ghostscript PDF/A rendering failed, consider setting "
|
||||
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
|
||||
)
|
||||
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
@@ -491,8 +597,6 @@ class RasterisedDocumentParser:
|
||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||
|
||||
# Attempt to run OCR with safe settings.
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
@@ -504,25 +608,16 @@ class RasterisedDocumentParser:
|
||||
try:
|
||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
# Don't return the archived file here, since this file
|
||||
# is bigger and blurry due to --force-ocr.
|
||||
|
||||
self.text = self.extract_text(
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# If this fails, we have a serious issue at hand.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
except Exception as e:
|
||||
# Anything else is probably serious.
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
# As a last resort, if we still don't have any text for any reason,
|
||||
# try to extract the text from the original document.
|
||||
if not self.text:
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
|
||||
436
src/paperless/tests/parsers/test_parse_modes.py
Normal file
436
src/paperless/tests/parsers/test_parse_modes.py
Normal file
@@ -0,0 +1,436 @@
|
||||
"""
|
||||
Focused tests for RasterisedDocumentParser.parse() mode behaviour.
|
||||
|
||||
These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF
|
||||
installation and execute quickly. The intent is to verify the *control flow*
|
||||
introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic,
|
||||
not to test OCRmyPDF itself.
|
||||
|
||||
Fixtures are pulled from conftest.py in this package.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars
|
||||
_SHORT_TEXT = "Hi." # <50 chars
|
||||
|
||||
|
||||
def _make_extract_text(text: str | None):
|
||||
"""Return a side_effect function for ``extract_text`` that returns *text*."""
|
||||
|
||||
def _extract(sidecar_file, pdf_file):
|
||||
return text
|
||||
|
||||
return _extract
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF with sufficient text layer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeWithText:
    """AUTO mode against a PDF whose text layer is long enough to count (>50 chars)."""

    def test_auto_text_no_archive_skips_ocrmypdf(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        simple_digital_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is AUTO and the caller passes produce_archive=False
            - The PDF yields more text than VALID_TEXT_LENGTH
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf.ocr is never invoked (early-return path)
            - No archive path is recorded
            - The parser text equals the text of the original
        """
        parser = tesseract_parser
        # Pretend the original already exposes a long text layer.
        mocker.patch.object(parser, "extract_text", return_value=_LONG_TEXT)
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "auto"

        parser.parse(
            simple_digital_pdf_file,
            "application/pdf",
            produce_archive=False,
        )

        ocr_spy.assert_not_called()
        assert parser.archive_path is None
        assert parser.get_text() == _LONG_TEXT

    def test_auto_text_with_archive_calls_ocrmypdf_skip_text(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        simple_digital_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is AUTO and the caller passes produce_archive=True
            - The PDF yields more text than VALID_TEXT_LENGTH
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf.ocr is invoked exactly once with skip_text=True
            - Neither force_ocr nor redo_ocr is passed
            - An archive path is recorded
        """
        parser = tesseract_parser
        mocker.patch.object(parser, "extract_text", return_value=_LONG_TEXT)
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "auto"

        parser.parse(
            simple_digital_pdf_file,
            "application/pdf",
            produce_archive=True,
        )

        ocr_spy.assert_called_once()
        passed = ocr_spy.call_args.kwargs
        assert passed.get("skip_text") is True
        assert passed.keys().isdisjoint({"force_ocr", "redo_ocr"})
        assert parser.archive_path is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AUTO mode — PDF without text layer (or too short)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAutoModeNoText:
    """AUTO mode against a PDF with no usable text layer (<= 50 chars)."""

    def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        multi_page_images_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is AUTO and the caller passes produce_archive=True
            - The PDF has no text (or text <= VALID_TEXT_LENGTH)
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf.ocr is invoked once with none of
              skip_text / force_ocr / redo_ocr
            - An archive path is recorded (produce_archive=True)
        """
        parser = tesseract_parser
        # The first extract_text call (the original) reports no text; every
        # later call (the archive after OCR) reports real text.
        first_answer = iter([None])

        def _extract_side(sidecar_file, pdf_file):
            return next(first_answer, _LONG_TEXT)

        mocker.patch.object(parser, "extract_text", side_effect=_extract_side)
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "auto"

        parser.parse(
            multi_page_images_pdf_file,
            "application/pdf",
            produce_archive=True,
        )

        ocr_spy.assert_called_once()
        passed = ocr_spy.call_args.kwargs
        assert passed.keys().isdisjoint({"skip_text", "force_ocr", "redo_ocr"})
        assert parser.archive_path is not None

    def test_auto_no_text_no_archive_calls_ocrmypdf(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        multi_page_images_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is AUTO and the caller passes produce_archive=False
            - The PDF has no text
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf.ocr is still invoked once (no text, so no early return)
            - No archive path is recorded (produce_archive=False)
        """
        parser = tesseract_parser
        # None for the first lookup, long text for every lookup after that.
        first_answer = iter([None])

        def _extract_side(sidecar_file, pdf_file):
            return next(first_answer, _LONG_TEXT)

        mocker.patch.object(parser, "extract_text", side_effect=_extract_side)
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "auto"

        parser.parse(
            multi_page_images_pdf_file,
            "application/pdf",
            produce_archive=False,
        )

        ocr_spy.assert_called_once()
        assert parser.archive_path is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModePdf:
    """OCR_MODE=off with a PDF input."""

    def test_off_no_archive_returns_pdftotext(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        simple_digital_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is OFF and the caller passes produce_archive=False
            - The PDF has text
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf.ocr is never invoked
            - No archive path is recorded
            - The parser text is whatever extract_text (pdftotext) returned
        """
        parser = tesseract_parser
        mocker.patch.object(parser, "extract_text", return_value=_LONG_TEXT)
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "off"

        parser.parse(
            simple_digital_pdf_file,
            "application/pdf",
            produce_archive=False,
        )

        ocr_spy.assert_not_called()
        assert parser.archive_path is None
        assert parser.get_text() == _LONG_TEXT

    def test_off_with_archive_calls_ocrmypdf_skip_text(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        simple_digital_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is OFF and the caller passes produce_archive=True
            - The input is a PDF
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf.ocr is invoked once with skip_text=True
              (PDF/A conversion only)
            - Neither force_ocr nor redo_ocr is passed
            - An archive path is recorded
        """
        parser = tesseract_parser
        mocker.patch.object(parser, "extract_text", return_value=_LONG_TEXT)
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "off"

        parser.parse(
            simple_digital_pdf_file,
            "application/pdf",
            produce_archive=True,
        )

        ocr_spy.assert_called_once()
        passed = ocr_spy.call_args.kwargs
        assert passed.get("skip_text") is True
        assert passed.keys().isdisjoint({"force_ocr", "redo_ocr"})
        assert parser.archive_path is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OFF mode — image
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOffModeImage:
    """OCR_MODE=off with an image (PNG) input."""

    def test_off_image_no_archive_no_ocrmypdf(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        simple_png_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is OFF and the caller passes produce_archive=False
            - The input is a PNG image
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf.ocr is never invoked
            - No archive path is recorded
            - The parser text is the empty string (images have no text layer)
        """
        parser = tesseract_parser
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "off"

        parser.parse(simple_png_file, "image/png", produce_archive=False)

        ocr_spy.assert_not_called()
        assert parser.archive_path is None
        assert parser.get_text() == ""

    def test_off_image_with_archive_uses_img2pdf_path(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        simple_png_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is OFF and the caller passes produce_archive=True
            - The input is a PNG image
        WHEN:
            - The document is parsed
        THEN:
            - _convert_image_to_pdfa() handles the file; ocrmypdf.ocr does not
            - archive_path is exactly the path that helper returned
            - The parser text is the empty string
        """
        parser = tesseract_parser
        # The helper is mocked, so this path is never touched on disk.
        stub_archive = Path("/tmp/fake-archive.pdf")
        convert_spy = mocker.patch.object(
            parser,
            "_convert_image_to_pdfa",
            return_value=stub_archive,
        )
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "off"

        parser.parse(simple_png_file, "image/png", produce_archive=True)

        convert_spy.assert_called_once_with(simple_png_file, "image/png")
        ocr_spy.assert_not_called()
        assert parser.archive_path == stub_archive
        assert parser.get_text() == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProduceArchiveFalse:
    """produce_archive=False must leave archive_path unset in every mode covered here."""

    @pytest.mark.parametrize("mode", ["force", "redo"])
    def test_produce_archive_false_force_redo_modes(
        self,
        mode: str,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        multi_page_images_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is FORCE or REDO and the caller passes produce_archive=False
            - Any PDF
        WHEN:
            - The document is parsed with ocrmypdf mocked to succeed
        THEN:
            - No archive path is recorded even though ocrmypdf ran
            - Some text is available from the parser
        """
        parser = tesseract_parser
        mocker.patch.object(parser, "extract_text", return_value=_LONG_TEXT)
        mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = mode

        parser.parse(
            multi_page_images_pdf_file,
            "application/pdf",
            produce_archive=False,
        )

        assert parser.archive_path is None
        assert parser.get_text() is not None

    def test_produce_archive_false_auto_with_text(
        self,
        mocker: MockerFixture,
        tesseract_parser: RasterisedDocumentParser,
        simple_digital_pdf_file: Path,
    ) -> None:
        """
        GIVEN:
            - Mode is AUTO and the caller passes produce_archive=False
            - The PDF yields more text than VALID_TEXT_LENGTH
        WHEN:
            - The document is parsed
        THEN:
            - ocrmypdf is skipped entirely (early return)
            - No archive path is recorded
        """
        parser = tesseract_parser
        mocker.patch.object(parser, "extract_text", return_value=_LONG_TEXT)
        ocr_spy = mocker.patch("ocrmypdf.ocr")
        parser.settings.mode = "auto"

        parser.parse(
            simple_digital_pdf_file,
            "application/pdf",
            produce_archive=False,
        )

        ocr_spy.assert_not_called()
        assert parser.archive_path is None
|
||||
@@ -89,15 +89,35 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
WHEN:
|
||||
- OCR parameters are constructed
|
||||
THEN:
|
||||
- Configuration from database is utilized
|
||||
- Configuration from database is utilized (AUTO mode with skip_text=True
|
||||
triggers skip_text; AUTO mode alone does not add any extra flag)
|
||||
"""
|
||||
# AUTO mode with skip_text=True explicitly passed: skip_text is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
|
||||
input_file="input.pdf",
|
||||
output_file="output.pdf",
|
||||
sidecar_file="sidecar.txt",
|
||||
mime_type="application/pdf",
|
||||
safe_fallback=False,
|
||||
skip_text=True,
|
||||
)
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
# AUTO mode alone (no skip_text): no extra OCR flag is set
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
self.assertTrue(params["skip_text"])
|
||||
self.assertNotIn("skip_text", params)
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
|
||||
@@ -370,15 +370,26 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Multi-page digital PDF with sufficient text layer
|
||||
- Default settings (mode=auto, produce_archive=True)
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Archive is created (AUTO mode + text present + produce_archive=True
|
||||
→ PDF/A conversion via skip_text)
|
||||
- Text is extracted
|
||||
"""
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "simple-digital.pdf",
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert tesseract_parser.archive_path.is_file()
|
||||
assert_ordered_substrings(
|
||||
tesseract_parser.get_text(),
|
||||
["This is a test document."],
|
||||
tesseract_parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_with_form_default(
|
||||
@@ -738,16 +749,18 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; no archive created (text exists, auto skips OCR)
|
||||
- Text extracted from original; no archive created (text exists +
|
||||
produce_archive=False skips OCRmyPDF entirely)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -781,46 +794,58 @@ class TestSkipArchive:
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("skip_archive_file", "filename", "expect_archive"),
|
||||
("produce_archive", "filename", "expect_archive"),
|
||||
[
|
||||
pytest.param(
|
||||
"always",
|
||||
True,
|
||||
"multi-page-digital.pdf",
|
||||
True,
|
||||
id="always-with-text",
|
||||
),
|
||||
pytest.param("always", "multi-page-images.pdf", True, id="always-no-text"),
|
||||
pytest.param(
|
||||
"auto",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="auto-with-text-layer",
|
||||
id="produce-archive-with-text",
|
||||
),
|
||||
pytest.param(
|
||||
"auto",
|
||||
True,
|
||||
"multi-page-images.pdf",
|
||||
True,
|
||||
id="auto-no-text-layer",
|
||||
id="produce-archive-no-text",
|
||||
),
|
||||
pytest.param(
|
||||
"never",
|
||||
False,
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="never-with-text",
|
||||
id="no-archive-with-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
"multi-page-images.pdf",
|
||||
False,
|
||||
id="no-archive-no-text-layer",
|
||||
),
|
||||
pytest.param("never", "multi-page-images.pdf", False, id="never-no-text"),
|
||||
],
|
||||
)
|
||||
def test_skip_archive_file_setting(
|
||||
def test_produce_archive_flag(
|
||||
self,
|
||||
skip_archive_file: str,
|
||||
produce_archive: bool, # noqa: FBT001
|
||||
filename: str,
|
||||
expect_archive: str,
|
||||
expect_archive: bool, # noqa: FBT001
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.archive_file_generation = skip_archive_file
|
||||
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
|
||||
"""
|
||||
GIVEN:
|
||||
- Various PDFs (with and without text layers)
|
||||
- produce_archive flag set to True or False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- archive_path is set if and only if produce_archive=True
|
||||
- Text is always extracted
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / filename,
|
||||
"application/pdf",
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
text = tesseract_parser.get_text().lower()
|
||||
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
|
||||
if expect_archive:
|
||||
@@ -907,17 +932,18 @@ class TestParseMixed:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with mixed pages
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
- File with mixed pages (some with text, some image-only)
|
||||
- Mode: auto, produce_archive=False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- No archive created (file has text layer); later-page text present
|
||||
- No archive created (produce_archive=False); text from text layer present
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -964,12 +990,19 @@ class TestParseRtl:
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- PDF with RTL Arabic text
|
||||
- PDF with RTL Arabic text in its text layer (short: 18 chars)
|
||||
- mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Arabic content is extracted (normalised for bidi)
|
||||
- Arabic content is extracted from the PDF text layer (normalised for bidi)
|
||||
|
||||
Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode
|
||||
would attempt full OCR, which fails due to PriorOcrFoundError and falls back to
|
||||
force-ocr with English Tesseract (producing garbage). Using mode="off" forces
|
||||
skip_text=True so the Arabic text layer is preserved through PDF/A conversion.
|
||||
"""
|
||||
tesseract_parser.settings.mode = "off"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "rtl-test.pdf",
|
||||
"application/pdf",
|
||||
|
||||
Reference in New Issue
Block a user