feat!: restructure parse() for OCR_MODE=auto/off and produce_archive flag

Implement the new decoupled archive/OCR control in RasterisedDocumentParser:

- construct_ocrmypdf_parameters(): add skip_text parameter; fix AUTO mode
  dispatch so skip_text is only added when explicitly requested (text-present
  + produce_archive case) rather than unconditionally; add OFF mode support.

- parse(): remove archive_file_generation checks; control archive creation
  exclusively via the produce_archive bool passed by the consumer.
  - OFF + no archive: return pdftotext text, skip OCRmyPDF entirely.
  - OFF + image + archive: use new _convert_image_to_pdfa() helper.
  - OFF + PDF + archive: run OCRmyPDF with skip_text=True (PDF/A only).
  - AUTO + text + no archive: skip OCRmyPDF entirely (fast path).
  - AUTO + text + archive: run OCRmyPDF with skip_text=True.
  - AUTO + no text: run normal OCR regardless of produce_archive.
  - FORCE/REDO: always run OCRmyPDF; set archive_path only when produce_archive.

- Add _convert_image_to_pdfa(): img2pdf wrapping + pikepdf PDF/A-2b stamping
  without invoking Tesseract or Ghostscript.

- Add PriorOcrFoundError to the fallback exception list (same treatment as
  InputFileError: retry with force_ocr).

- Update existing tests to use produce_archive instead of archive_file_generation:
  TestSkipArchive rewritten; RTL test uses mode=off to preserve Arabic text
  layer; AUTO mode tests clarified.

- Add test_parse_modes.py: 11 focused unit tests with mocked ocrmypdf.ocr
  verifying control flow for all mode/produce_archive combinations.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-26 14:10:02 -07:00
parent 6658d94f77
commit 55c51cb89e
4 changed files with 652 additions and 68 deletions

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import importlib.resources
import logging
import os
import re
@@ -18,7 +19,6 @@ from documents.parsers import make_thumbnail_from_pdf
from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.models import ArchiveFileGenerationChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
from paperless.parsers.utils import read_file_handle_unicode_errors
@@ -289,6 +289,7 @@ class RasterisedDocumentParser:
sidecar_file: Path,
*,
safe_fallback: bool = False,
skip_text: bool = False,
) -> dict[str, Any]:
ocrmypdf_args: dict[str, Any] = {
"input_file_or_options": input_file,
@@ -307,12 +308,14 @@ class RasterisedDocumentParser:
self.settings.color_conversion_strategy
)
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
if safe_fallback or self.settings.mode == ModeChoices.FORCE:
ocrmypdf_args["force_ocr"] = True
elif self.settings.mode == ModeChoices.AUTO:
ocrmypdf_args["skip_text"] = True
elif self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
elif skip_text or self.settings.mode == ModeChoices.OFF:
ocrmypdf_args["skip_text"] = True
elif self.settings.mode == ModeChoices.AUTO:
pass # no extra flag: normal OCR (text not found case)
else: # pragma: no cover
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
@@ -397,6 +400,62 @@ class RasterisedDocumentParser:
return ocrmypdf_args
def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
"""Convert an image to a PDF/A-2b file without invoking the OCR engine.
Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp
PDF/A-2b conformance metadata.
No Tesseract and no Ghostscript are invoked.
"""
import img2pdf
import pikepdf
plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
try:
layout_fun = None
if self.settings.image_dpi is not None:
layout_fun = img2pdf.get_fixed_dpi_layout_fun(
(self.settings.image_dpi, self.settings.image_dpi),
)
plain_pdf_path.write_bytes(
img2pdf.convert(str(document_path), layout_fun=layout_fun),
)
except Exception as e:
raise ParseError(
f"img2pdf conversion failed for {document_path}: {e!s}",
) from e
icc_data = (
importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes()
)
pdfa_path = Path(self.tempdir) / "archive.pdf"
try:
with pikepdf.open(plain_pdf_path) as pdf:
cs = pdf.make_stream(icc_data)
cs["/N"] = 3
output_intent = pikepdf.Dictionary(
Type=pikepdf.Name("/OutputIntent"),
S=pikepdf.Name("/GTS_PDFA1"),
OutputConditionIdentifier=pikepdf.String("sRGB"),
DestOutputProfile=cs,
)
pdf.Root["/OutputIntents"] = pdf.make_indirect(
pikepdf.Array([output_intent]),
)
meta = pdf.open_metadata(set_pikepdf_as_editor=False)
meta["pdfaid:part"] = "2"
meta["pdfaid:conformance"] = "B"
pdf.save(pdfa_path)
except Exception as e:
self.log.warning(
f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
)
pdfa_path.write_bytes(plain_pdf_path.read_bytes())
return pdfa_path
def parse(
self,
document_path: Path,
@@ -417,48 +476,96 @@ class RasterisedDocumentParser:
text_original = None
original_has_text = False
# If the original has text, and the user doesn't want an archive,
# we're done here (but not when force/redo mode is explicitly requested)
skip_archive_for_text = self.settings.mode not in {
ModeChoices.FORCE,
ModeChoices.REDO,
} and self.settings.archive_file_generation in {
ArchiveFileGenerationChoices.NEVER,
ArchiveFileGenerationChoices.AUTO,
}
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
# --- OCR_MODE=off: never invoke OCR engine ---
if self.settings.mode == ModeChoices.OFF:
if not produce_archive:
self.text = text_original or ""
return
if self.is_image(mime_type):
try:
self.archive_path = self._convert_image_to_pdfa(
document_path,
mime_type,
)
self.text = ""
except Exception as e:
raise ParseError(
f"Image to PDF/A conversion failed: {e!s}",
) from e
return
# PDFs in off mode: PDF/A conversion only via skip_text
import ocrmypdf
from ocrmypdf import SubprocessOutputError
archive_path = Path(self.tempdir) / "archive.pdf"
sidecar_file = Path(self.tempdir) / "sidecar.txt"
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,
archive_path,
sidecar_file,
skip_text=True,
)
try:
self.log.debug(
f"Calling OCRmyPDF (off mode, PDF/A conversion only): {args}",
)
ocrmypdf.ocr(**args)
self.archive_path = archive_path
self.text = self.extract_text(None, archive_path) or text_original or ""
except SubprocessOutputError as e:
if "Ghostscript PDF/A rendering" in str(e):
self.log.warning(
"Ghostscript PDF/A rendering failed, consider setting "
"PAPERLESS_OCR_USER_ARGS: "
"'{\"continue_on_soft_render_error\": true}'",
)
raise ParseError(
f"SubprocessOutputError: {e!s}. See logs for more information.",
) from e
except Exception as e:
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
return
# --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed ---
if (
self.settings.mode == ModeChoices.AUTO
and original_has_text
and not produce_archive
):
self.log.debug(
"Document has text and no archive requested; skipping OCRmyPDF entirely.",
)
self.text = text_original
return
# Either no text was in the original or there should be an archive
# file created, so OCR the file and create an archive with any
# text located via OCR
# --- All other paths: run ocrmypdf ---
import ocrmypdf
from ocrmypdf import EncryptedPdfError
from ocrmypdf import InputFileError
from ocrmypdf import SubprocessOutputError
from ocrmypdf.exceptions import DigitalSignatureError
from ocrmypdf.exceptions import PriorOcrFoundError
archive_path = Path(self.tempdir) / "archive.pdf"
sidecar_file = Path(self.tempdir) / "sidecar.txt"
# auto mode with existing text: PDF/A conversion only (no OCR).
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,
archive_path,
sidecar_file,
skip_text=skip_text,
)
try:
self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if (
self.settings.archive_file_generation
!= ArchiveFileGenerationChoices.NEVER
):
if produce_archive:
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
@@ -478,11 +585,10 @@ class RasterisedDocumentParser:
"Ghostscript PDF/A rendering failed, consider setting "
"PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
)
raise ParseError(
f"SubprocessOutputError: {e!s}. See logs for more information.",
) from e
except (NoTextFoundException, InputFileError) as e:
except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e:
self.log.warning(
f"Encountered an error while running OCR: {e!s}. "
f"Attempting force OCR to get the text.",
@@ -491,8 +597,6 @@ class RasterisedDocumentParser:
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
# Attempt to run OCR with safe settings.
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,
@@ -504,25 +608,16 @@ class RasterisedDocumentParser:
try:
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
# Don't return the archived file here, since this file
# is bigger and blurry due to --force-ocr.
self.text = self.extract_text(
sidecar_file_fallback,
archive_path_fallback,
)
except Exception as e:
# If this fails, we have a serious issue at hand.
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
except Exception as e:
# Anything else is probably serious.
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
# As a last resort, if we still don't have any text for any reason,
# try to extract the text from the original document.
if not self.text:
if original_has_text:
self.text = text_original

View File

@@ -0,0 +1,436 @@
"""
Focused tests for RasterisedDocumentParser.parse() mode behaviour.
These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF
installation and execute quickly. The intent is to verify the *control flow*
introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic,
not to test OCRmyPDF itself.
Fixtures are pulled from conftest.py in this package.
"""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
import pytest
if TYPE_CHECKING:
from pytest_mock import MockerFixture
from paperless.parsers.tesseract import RasterisedDocumentParser
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars
_SHORT_TEXT = "Hi." # <50 chars
def _make_extract_text(text: str | None):
"""Return a side_effect function for ``extract_text`` that returns *text*."""
def _extract(sidecar_file, pdf_file):
return text
return _extract
# ---------------------------------------------------------------------------
# AUTO mode — PDF with sufficient text layer
# ---------------------------------------------------------------------------
class TestAutoModeWithText:
"""AUTO mode, original PDF has detectable text (>50 chars)."""
def test_auto_text_no_archive_skips_ocrmypdf(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=False
- PDF with text > VALID_TEXT_LENGTH
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr is NOT called (early return path)
- archive_path remains None
- text is set from the original
"""
# Patch extract_text to return long text (simulating detectable text layer)
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() == _LONG_TEXT
def test_auto_text_with_archive_calls_ocrmypdf_skip_text(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=True
- PDF with text > VALID_TEXT_LENGTH
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called with skip_text=True
- archive_path is set
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=True,
)
mock_ocr.assert_called_once()
call_kwargs = mock_ocr.call_args.kwargs
assert call_kwargs.get("skip_text") is True
assert "force_ocr" not in call_kwargs
assert "redo_ocr" not in call_kwargs
assert tesseract_parser.archive_path is not None
# ---------------------------------------------------------------------------
# AUTO mode — PDF without text layer (or too short)
# ---------------------------------------------------------------------------
class TestAutoModeNoText:
"""AUTO mode, original PDF has no detectable text (<= 50 chars)."""
def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
multi_page_images_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=True
- PDF with no text (or text <= VALID_TEXT_LENGTH)
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr
- archive_path is set (since produce_archive=True)
"""
# Return "no text" for the original; return real text for archive
extract_call_count = 0
def _extract_side(sidecar_file, pdf_file):
nonlocal extract_call_count
extract_call_count += 1
if extract_call_count == 1:
return None # original has no text
return _LONG_TEXT # text from archive after OCR
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
multi_page_images_pdf_file,
"application/pdf",
produce_archive=True,
)
mock_ocr.assert_called_once()
call_kwargs = mock_ocr.call_args.kwargs
assert "skip_text" not in call_kwargs
assert "force_ocr" not in call_kwargs
assert "redo_ocr" not in call_kwargs
assert tesseract_parser.archive_path is not None
def test_auto_no_text_no_archive_calls_ocrmypdf(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
multi_page_images_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=False
- PDF with no text
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called (no early return since no text detected)
- archive_path is NOT set (produce_archive=False)
"""
extract_call_count = 0
def _extract_side(sidecar_file, pdf_file):
nonlocal extract_call_count
extract_call_count += 1
if extract_call_count == 1:
return None
return _LONG_TEXT
mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
multi_page_images_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_called_once()
assert tesseract_parser.archive_path is None
# ---------------------------------------------------------------------------
# OFF mode — PDF
# ---------------------------------------------------------------------------
class TestOffModePdf:
"""OCR_MODE=off, document is a PDF."""
def test_off_no_archive_returns_pdftotext(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=False
- PDF with text
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr is NOT called
- archive_path is None
- text comes from pdftotext (extract_text)
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() == _LONG_TEXT
def test_off_with_archive_calls_ocrmypdf_skip_text(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=True
- PDF document
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr IS called with skip_text=True (PDF/A conversion only)
- archive_path is set
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=True,
)
mock_ocr.assert_called_once()
call_kwargs = mock_ocr.call_args.kwargs
assert call_kwargs.get("skip_text") is True
assert "force_ocr" not in call_kwargs
assert "redo_ocr" not in call_kwargs
assert tesseract_parser.archive_path is not None
# ---------------------------------------------------------------------------
# OFF mode — image
# ---------------------------------------------------------------------------
class TestOffModeImage:
"""OCR_MODE=off, document is an image (PNG)."""
def test_off_image_no_archive_no_ocrmypdf(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_png_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=False
- Image document (PNG)
WHEN:
- parse() is called
THEN:
- ocrmypdf.ocr is NOT called
- archive_path is None
- text is empty string (images have no text layer)
"""
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() == ""
def test_off_image_with_archive_uses_img2pdf_path(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_png_file: Path,
) -> None:
"""
GIVEN:
- OFF mode, produce_archive=True
- Image document (PNG)
WHEN:
- parse() is called
THEN:
- _convert_image_to_pdfa() is called instead of ocrmypdf.ocr
- archive_path is set to the returned path
- text is empty string
"""
fake_archive = Path("/tmp/fake-archive.pdf")
mock_convert = mocker.patch.object(
tesseract_parser,
"_convert_image_to_pdfa",
return_value=fake_archive,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True)
mock_convert.assert_called_once_with(simple_png_file, "image/png")
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path == fake_archive
assert tesseract_parser.get_text() == ""
# ---------------------------------------------------------------------------
# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes
# ---------------------------------------------------------------------------
class TestProduceArchiveFalse:
"""Verify produce_archive=False never results in an archive regardless of mode."""
@pytest.mark.parametrize("mode", ["force", "redo"])
def test_produce_archive_false_force_redo_modes(
self,
mode: str,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
multi_page_images_pdf_file: Path,
) -> None:
"""
GIVEN:
- FORCE or REDO mode, produce_archive=False
- Any PDF
WHEN:
- parse() is called (ocrmypdf mocked to succeed)
THEN:
- archive_path is NOT set even though ocrmypdf ran
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = mode
tesseract_parser.parse(
multi_page_images_pdf_file,
"application/pdf",
produce_archive=False,
)
assert tesseract_parser.archive_path is None
assert tesseract_parser.get_text() is not None
def test_produce_archive_false_auto_with_text(
self,
mocker: MockerFixture,
tesseract_parser: RasterisedDocumentParser,
simple_digital_pdf_file: Path,
) -> None:
"""
GIVEN:
- AUTO mode, produce_archive=False
- PDF with text > VALID_TEXT_LENGTH
WHEN:
- parse() is called
THEN:
- ocrmypdf is skipped entirely (early return)
- archive_path is None
"""
mocker.patch.object(
tesseract_parser,
"extract_text",
return_value=_LONG_TEXT,
)
mock_ocr = mocker.patch("ocrmypdf.ocr")
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
simple_digital_pdf_file,
"application/pdf",
produce_archive=False,
)
mock_ocr.assert_not_called()
assert tesseract_parser.archive_path is None

View File

@@ -89,15 +89,35 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
- Configuration from database is utilized (AUTO mode with skip_text=True
triggers skip_text; AUTO mode alone does not add any extra flag)
"""
# AUTO mode with skip_text=True explicitly passed: skip_text is set
with override_settings(OCR_MODE="redo"):
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.AUTO
instance.save()
params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
input_file="input.pdf",
output_file="output.pdf",
sidecar_file="sidecar.txt",
mime_type="application/pdf",
safe_fallback=False,
skip_text=True,
)
self.assertTrue(params["skip_text"])
self.assertNotIn("redo_ocr", params)
self.assertNotIn("force_ocr", params)
# AUTO mode alone (no skip_text): no extra OCR flag is set
with override_settings(OCR_MODE="redo"):
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.AUTO
instance.save()
params = self.get_params()
self.assertTrue(params["skip_text"])
self.assertNotIn("skip_text", params)
self.assertNotIn("redo_ocr", params)
self.assertNotIn("force_ocr", params)

View File

@@ -370,15 +370,26 @@ class TestParsePdf:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
"""
GIVEN:
- Multi-page digital PDF with sufficient text layer
- Default settings (mode=auto, produce_archive=True)
WHEN:
- Document is parsed
THEN:
- Archive is created (AUTO mode + text present + produce_archive=True
→ PDF/A conversion via skip_text)
- Text is extracted
"""
tesseract_parser.parse(
tesseract_samples_dir / "simple-digital.pdf",
tesseract_samples_dir / "multi-page-digital.pdf",
"application/pdf",
)
assert tesseract_parser.archive_path is not None
assert tesseract_parser.archive_path.is_file()
assert_ordered_substrings(
tesseract_parser.get_text(),
["This is a test document."],
tesseract_parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
def test_with_form_default(
@@ -738,16 +749,18 @@ class TestSkipArchive:
"""
GIVEN:
- File with existing text layer
- Mode: auto, skip_archive_file: auto
- Mode: auto, produce_archive=False
WHEN:
- Document is parsed
THEN:
- Text extracted; no archive created (text exists, auto skips OCR)
- Text extracted from original; no archive created (text exists +
produce_archive=False skips OCRmyPDF entirely)
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-digital.pdf",
"application/pdf",
produce_archive=False,
)
assert tesseract_parser.archive_path is None
assert_ordered_substrings(
@@ -781,46 +794,58 @@ class TestSkipArchive:
)
@pytest.mark.parametrize(
("skip_archive_file", "filename", "expect_archive"),
("produce_archive", "filename", "expect_archive"),
[
pytest.param(
"always",
True,
"multi-page-digital.pdf",
True,
id="always-with-text",
),
pytest.param("always", "multi-page-images.pdf", True, id="always-no-text"),
pytest.param(
"auto",
"multi-page-digital.pdf",
False,
id="auto-with-text-layer",
id="produce-archive-with-text",
),
pytest.param(
"auto",
True,
"multi-page-images.pdf",
True,
id="auto-no-text-layer",
id="produce-archive-no-text",
),
pytest.param(
"never",
False,
"multi-page-digital.pdf",
False,
id="never-with-text",
id="no-archive-with-text-layer",
),
pytest.param(
False,
"multi-page-images.pdf",
False,
id="no-archive-no-text-layer",
),
pytest.param("never", "multi-page-images.pdf", False, id="never-no-text"),
],
)
def test_skip_archive_file_setting(
def test_produce_archive_flag(
self,
skip_archive_file: str,
produce_archive: bool, # noqa: FBT001
filename: str,
expect_archive: str,
expect_archive: bool, # noqa: FBT001
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.archive_file_generation = skip_archive_file
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
"""
GIVEN:
- Various PDFs (with and without text layers)
- produce_archive flag set to True or False
WHEN:
- Document is parsed
THEN:
- archive_path is set if and only if produce_archive=True
- Text is always extracted
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / filename,
"application/pdf",
produce_archive=produce_archive,
)
text = tesseract_parser.get_text().lower()
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
if expect_archive:
@@ -907,17 +932,18 @@ class TestParseMixed:
) -> None:
"""
GIVEN:
- File with mixed pages
- Mode: auto, skip_archive_file: auto
- File with mixed pages (some with text, some image-only)
- Mode: auto, produce_archive=False
WHEN:
- Document is parsed
THEN:
- No archive created (file has text layer); later-page text present
- No archive created (produce_archive=False); text from text layer present
"""
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-mixed.pdf",
"application/pdf",
produce_archive=False,
)
assert tesseract_parser.archive_path is None
assert_ordered_substrings(
@@ -964,12 +990,19 @@ class TestParseRtl:
) -> None:
"""
GIVEN:
- PDF with RTL Arabic text
- PDF with RTL Arabic text in its text layer (short: 18 chars)
- mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine
WHEN:
- Document is parsed
THEN:
- Arabic content is extracted (normalised for bidi)
- Arabic content is extracted from the PDF text layer (normalised for bidi)
Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode
would attempt full OCR, which fails due to PriorOcrFoundError and falls back to
force-ocr with English Tesseract (producing garbage). Using mode="off" forces
skip_text=True so the Arabic text layer is preserved through PDF/A conversion.
"""
tesseract_parser.settings.mode = "off"
tesseract_parser.parse(
tesseract_samples_dir / "rtl-test.pdf",
"application/pdf",