def _convert_image_to_pdfa(self, document_path: Path, mime_type: str) -> Path:
    """Wrap an image in a PDF and stamp it as PDF/A-2b without running OCR.

    img2pdf performs the image -> PDF wrapping. pikepdf then attaches an
    sRGB output intent and the ``pdfaid`` XMP properties. Neither Tesseract
    nor Ghostscript is invoked, and the result is not validated for actual
    PDF/A conformance.

    Stamping is best-effort: if any part of it fails (including loading the
    ICC profile), a warning is logged and the plain img2pdf output is used
    as the archive instead.

    Args:
        document_path: Image file to convert.
        mime_type: MIME type of the image. Not used by the conversion itself.

    Returns:
        Path to ``archive.pdf`` inside ``self.tempdir``.

    Raises:
        ParseError: If img2pdf cannot convert the image.
    """
    # Local imports keep this block self-contained and avoid paying for the
    # heavy PDF libraries unless an image is actually converted.
    import importlib.resources
    import shutil

    import img2pdf
    import pikepdf

    plain_pdf_path = Path(self.tempdir) / "image_plain.pdf"
    try:
        layout_fun = None
        dpi = self.settings.image_dpi
        if dpi is not None:
            # Force the configured DPI on both axes instead of trusting the
            # resolution recorded in the image file.
            layout_fun = img2pdf.get_fixed_dpi_layout_fun((dpi, dpi))
        plain_pdf_path.write_bytes(
            img2pdf.convert(str(document_path), layout_fun=layout_fun),
        )
    except Exception as e:
        raise ParseError(
            f"img2pdf conversion failed for {document_path}: {e!s}",
        ) from e

    pdfa_path = Path(self.tempdir) / "archive.pdf"
    try:
        # Loaded inside the try so a missing/unreadable profile degrades to
        # the plain-PDF fallback rather than aborting the whole conversion.
        icc_data = (
            importlib.resources.files("ocrmypdf.data")
            .joinpath("sRGB.icc")
            .read_bytes()
        )
        with pikepdf.open(plain_pdf_path) as pdf:
            icc_stream = pdf.make_stream(icc_data)
            icc_stream["/N"] = 3  # component count of the RGB profile
            output_intent = pikepdf.Dictionary(
                Type=pikepdf.Name("/OutputIntent"),
                S=pikepdf.Name("/GTS_PDFA1"),
                OutputConditionIdentifier=pikepdf.String("sRGB"),
                DestOutputProfile=icc_stream,
            )
            pdf.Root["/OutputIntents"] = pdf.make_indirect(
                pikepdf.Array([output_intent]),
            )
            # XMP edits must happen inside the context manager: pikepdf
            # rejects assignments outside a ``with`` block and only writes
            # the changes back to the PDF when the block exits.
            with pdf.open_metadata(set_pikepdf_as_editor=False) as meta:
                meta["pdfaid:part"] = "2"
                meta["pdfaid:conformance"] = "B"
            pdf.save(pdfa_path)
    except Exception as e:
        self.log.warning(
            f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.",
        )
        # Overwrites any partially written archive from the failed save.
        shutil.copyfile(plain_pdf_path, pdfa_path)

    return pdfa_path
skipping OCRmyPDF entirely.") + # --- OCR_MODE=off: never invoke OCR engine --- + if self.settings.mode == ModeChoices.OFF: + if not produce_archive: + self.text = text_original or "" + return + if self.is_image(mime_type): + try: + self.archive_path = self._convert_image_to_pdfa( + document_path, + mime_type, + ) + self.text = "" + except Exception as e: + raise ParseError( + f"Image to PDF/A conversion failed: {e!s}", + ) from e + return + # PDFs in off mode: PDF/A conversion only via skip_text + import ocrmypdf + from ocrmypdf import SubprocessOutputError + + archive_path = Path(self.tempdir) / "archive.pdf" + sidecar_file = Path(self.tempdir) / "sidecar.txt" + args = self.construct_ocrmypdf_parameters( + document_path, + mime_type, + archive_path, + sidecar_file, + skip_text=True, + ) + try: + self.log.debug( + f"Calling OCRmyPDF (off mode, PDF/A conversion only): {args}", + ) + ocrmypdf.ocr(**args) + self.archive_path = archive_path + self.text = self.extract_text(None, archive_path) or text_original or "" + except SubprocessOutputError as e: + if "Ghostscript PDF/A rendering" in str(e): + self.log.warning( + "Ghostscript PDF/A rendering failed, consider setting " + "PAPERLESS_OCR_USER_ARGS: " + "'{\"continue_on_soft_render_error\": true}'", + ) + raise ParseError( + f"SubprocessOutputError: {e!s}. 
See logs for more information.", + ) from e + except Exception as e: + raise ParseError(f"{e.__class__.__name__}: {e!s}") from e + return + + # --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed --- + if ( + self.settings.mode == ModeChoices.AUTO + and original_has_text + and not produce_archive + ): + self.log.debug( + "Document has text and no archive requested; skipping OCRmyPDF entirely.", + ) self.text = text_original return - # Either no text was in the original or there should be an archive - # file created, so OCR the file and create an archive with any - # text located via OCR - + # --- All other paths: run ocrmypdf --- import ocrmypdf from ocrmypdf import EncryptedPdfError from ocrmypdf import InputFileError from ocrmypdf import SubprocessOutputError from ocrmypdf.exceptions import DigitalSignatureError + from ocrmypdf.exceptions import PriorOcrFoundError archive_path = Path(self.tempdir) / "archive.pdf" sidecar_file = Path(self.tempdir) / "sidecar.txt" + # auto mode with existing text: PDF/A conversion only (no OCR). + skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text + args = self.construct_ocrmypdf_parameters( document_path, mime_type, archive_path, sidecar_file, + skip_text=skip_text, ) try: self.log.debug(f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - if ( - self.settings.archive_file_generation - != ArchiveFileGenerationChoices.NEVER - ): + if produce_archive: self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) @@ -478,11 +585,10 @@ class RasterisedDocumentParser: "Ghostscript PDF/A rendering failed, consider setting " "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'", ) - raise ParseError( f"SubprocessOutputError: {e!s}. 
See logs for more information.", ) from e - except (NoTextFoundException, InputFileError) as e: + except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e: self.log.warning( f"Encountered an error while running OCR: {e!s}. " f"Attempting force OCR to get the text.", @@ -491,8 +597,6 @@ class RasterisedDocumentParser: archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf" sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt" - # Attempt to run OCR with safe settings. - args = self.construct_ocrmypdf_parameters( document_path, mime_type, @@ -504,25 +608,16 @@ class RasterisedDocumentParser: try: self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - - # Don't return the archived file here, since this file - # is bigger and blurry due to --force-ocr. - self.text = self.extract_text( sidecar_file_fallback, archive_path_fallback, ) - except Exception as e: - # If this fails, we have a serious issue at hand. raise ParseError(f"{e.__class__.__name__}: {e!s}") from e except Exception as e: - # Anything else is probably serious. raise ParseError(f"{e.__class__.__name__}: {e!s}") from e - # As a last resort, if we still don't have any text for any reason, - # try to extract the text from the original document. if not self.text: if original_has_text: self.text = text_original diff --git a/src/paperless/tests/parsers/test_parse_modes.py b/src/paperless/tests/parsers/test_parse_modes.py new file mode 100644 index 000000000..6766379c3 --- /dev/null +++ b/src/paperless/tests/parsers/test_parse_modes.py @@ -0,0 +1,436 @@ +""" +Focused tests for RasterisedDocumentParser.parse() mode behaviour. + +These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF +installation and execute quickly. The intent is to verify the *control flow* +introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic, +not to test OCRmyPDF itself. 
+ +Fixtures are pulled from conftest.py in this package. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from pytest_mock import MockerFixture + + from paperless.parsers.tesseract import RasterisedDocumentParser + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars +_SHORT_TEXT = "Hi." # <50 chars + + +def _make_extract_text(text: str | None): + """Return a side_effect function for ``extract_text`` that returns *text*.""" + + def _extract(sidecar_file, pdf_file): + return text + + return _extract + + +# --------------------------------------------------------------------------- +# AUTO mode — PDF with sufficient text layer +# --------------------------------------------------------------------------- + + +class TestAutoModeWithText: + """AUTO mode, original PDF has detectable text (>50 chars).""" + + def test_auto_text_no_archive_skips_ocrmypdf( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=False + - PDF with text > VALID_TEXT_LENGTH + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr is NOT called (early return path) + - archive_path remains None + - text is set from the original + """ + # Patch extract_text to return long text (simulating detectable text layer) + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert 
tesseract_parser.get_text() == _LONG_TEXT + + def test_auto_text_with_archive_calls_ocrmypdf_skip_text( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=True + - PDF with text > VALID_TEXT_LENGTH + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr IS called with skip_text=True + - archive_path is set + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=True, + ) + + mock_ocr.assert_called_once() + call_kwargs = mock_ocr.call_args.kwargs + assert call_kwargs.get("skip_text") is True + assert "force_ocr" not in call_kwargs + assert "redo_ocr" not in call_kwargs + assert tesseract_parser.archive_path is not None + + +# --------------------------------------------------------------------------- +# AUTO mode — PDF without text layer (or too short) +# --------------------------------------------------------------------------- + + +class TestAutoModeNoText: + """AUTO mode, original PDF has no detectable text (<= 50 chars).""" + + def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + multi_page_images_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=True + - PDF with no text (or text <= VALID_TEXT_LENGTH) + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr + - archive_path is set (since produce_archive=True) + """ + # Return "no text" for the original; return real text for archive + extract_call_count = 0 + + def _extract_side(sidecar_file, pdf_file): + nonlocal extract_call_count + extract_call_count += 1 + if extract_call_count == 1: + return None # original has no 
text + return _LONG_TEXT # text from archive after OCR + + mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + multi_page_images_pdf_file, + "application/pdf", + produce_archive=True, + ) + + mock_ocr.assert_called_once() + call_kwargs = mock_ocr.call_args.kwargs + assert "skip_text" not in call_kwargs + assert "force_ocr" not in call_kwargs + assert "redo_ocr" not in call_kwargs + assert tesseract_parser.archive_path is not None + + def test_auto_no_text_no_archive_calls_ocrmypdf( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + multi_page_images_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=False + - PDF with no text + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr IS called (no early return since no text detected) + - archive_path is NOT set (produce_archive=False) + """ + extract_call_count = 0 + + def _extract_side(sidecar_file, pdf_file): + nonlocal extract_call_count + extract_call_count += 1 + if extract_call_count == 1: + return None + return _LONG_TEXT + + mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + multi_page_images_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_called_once() + assert tesseract_parser.archive_path is None + + +# --------------------------------------------------------------------------- +# OFF mode — PDF +# --------------------------------------------------------------------------- + + +class TestOffModePdf: + """OCR_MODE=off, document is a PDF.""" + + def test_off_no_archive_returns_pdftotext( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, 
produce_archive=False + - PDF with text + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr is NOT called + - archive_path is None + - text comes from pdftotext (extract_text) + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() == _LONG_TEXT + + def test_off_with_archive_calls_ocrmypdf_skip_text( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, produce_archive=True + - PDF document + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr IS called with skip_text=True (PDF/A conversion only) + - archive_path is set + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=True, + ) + + mock_ocr.assert_called_once() + call_kwargs = mock_ocr.call_args.kwargs + assert call_kwargs.get("skip_text") is True + assert "force_ocr" not in call_kwargs + assert "redo_ocr" not in call_kwargs + assert tesseract_parser.archive_path is not None + + +# --------------------------------------------------------------------------- +# OFF mode — image +# --------------------------------------------------------------------------- + + +class TestOffModeImage: + """OCR_MODE=off, document is an image (PNG).""" + + def test_off_image_no_archive_no_ocrmypdf( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, produce_archive=False + - Image 
document (PNG) + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr is NOT called + - archive_path is None + - text is empty string (images have no text layer) + """ + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() == "" + + def test_off_image_with_archive_uses_img2pdf_path( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, produce_archive=True + - Image document (PNG) + WHEN: + - parse() is called + THEN: + - _convert_image_to_pdfa() is called instead of ocrmypdf.ocr + - archive_path is set to the returned path + - text is empty string + """ + fake_archive = Path("/tmp/fake-archive.pdf") + mock_convert = mocker.patch.object( + tesseract_parser, + "_convert_image_to_pdfa", + return_value=fake_archive, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True) + + mock_convert.assert_called_once_with(simple_png_file, "image/png") + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path == fake_archive + assert tesseract_parser.get_text() == "" + + +# --------------------------------------------------------------------------- +# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes +# --------------------------------------------------------------------------- + + +class TestProduceArchiveFalse: + """Verify produce_archive=False never results in an archive regardless of mode.""" + + @pytest.mark.parametrize("mode", ["force", "redo"]) + def test_produce_archive_false_force_redo_modes( + self, + mode: str, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + multi_page_images_pdf_file: 
Path, + ) -> None: + """ + GIVEN: + - FORCE or REDO mode, produce_archive=False + - Any PDF + WHEN: + - parse() is called (ocrmypdf mocked to succeed) + THEN: + - archive_path is NOT set even though ocrmypdf ran + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = mode + tesseract_parser.parse( + multi_page_images_pdf_file, + "application/pdf", + produce_archive=False, + ) + + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() is not None + + def test_produce_archive_false_auto_with_text( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=False + - PDF with text > VALID_TEXT_LENGTH + WHEN: + - parse() is called + THEN: + - ocrmypdf is skipped entirely (early return) + - archive_path is None + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None diff --git a/src/paperless/tests/parsers/test_tesseract_custom_settings.py b/src/paperless/tests/parsers/test_tesseract_custom_settings.py index bade65ef1..e6ab6cf81 100644 --- a/src/paperless/tests/parsers/test_tesseract_custom_settings.py +++ b/src/paperless/tests/parsers/test_tesseract_custom_settings.py @@ -89,15 +89,35 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas WHEN: - OCR parameters are constructed THEN: - - Configuration from database is utilized + - Configuration from database is utilized (AUTO mode with skip_text=True + triggers skip_text; AUTO mode alone does not add any extra flag) """ + # AUTO mode with 
skip_text=True explicitly passed: skip_text is set + with override_settings(OCR_MODE="redo"): + instance = ApplicationConfiguration.objects.all().first() + instance.mode = ModeChoices.AUTO + instance.save() + + params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters( + input_file="input.pdf", + output_file="output.pdf", + sidecar_file="sidecar.txt", + mime_type="application/pdf", + safe_fallback=False, + skip_text=True, + ) + self.assertTrue(params["skip_text"]) + self.assertNotIn("redo_ocr", params) + self.assertNotIn("force_ocr", params) + + # AUTO mode alone (no skip_text): no extra OCR flag is set with override_settings(OCR_MODE="redo"): instance = ApplicationConfiguration.objects.all().first() instance.mode = ModeChoices.AUTO instance.save() params = self.get_params() - self.assertTrue(params["skip_text"]) + self.assertNotIn("skip_text", params) self.assertNotIn("redo_ocr", params) self.assertNotIn("force_ocr", params) diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py index d3d08bc41..088031766 100644 --- a/src/paperless/tests/parsers/test_tesseract_parser.py +++ b/src/paperless/tests/parsers/test_tesseract_parser.py @@ -370,15 +370,26 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: + """ + GIVEN: + - Multi-page digital PDF with sufficient text layer + - Default settings (mode=auto, produce_archive=True) + WHEN: + - Document is parsed + THEN: + - Archive is created (AUTO mode + text present + produce_archive=True + → PDF/A conversion via skip_text) + - Text is extracted + """ tesseract_parser.parse( - tesseract_samples_dir / "simple-digital.pdf", + tesseract_samples_dir / "multi-page-digital.pdf", "application/pdf", ) assert tesseract_parser.archive_path is not None assert tesseract_parser.archive_path.is_file() assert_ordered_substrings( - tesseract_parser.get_text(), - ["This is a test document."], + 
tesseract_parser.get_text().lower(), + ["page 1", "page 2", "page 3"], ) def test_with_form_default( @@ -738,16 +749,18 @@ class TestSkipArchive: """ GIVEN: - File with existing text layer - - Mode: auto, skip_archive_file: auto + - Mode: auto, produce_archive=False WHEN: - Document is parsed THEN: - - Text extracted; no archive created (text exists, auto skips OCR) + - Text extracted from original; no archive created (text exists + + produce_archive=False skips OCRmyPDF entirely) """ tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-digital.pdf", "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -781,46 +794,58 @@ class TestSkipArchive: ) @pytest.mark.parametrize( - ("skip_archive_file", "filename", "expect_archive"), + ("produce_archive", "filename", "expect_archive"), [ pytest.param( - "always", + True, "multi-page-digital.pdf", True, - id="always-with-text", - ), - pytest.param("always", "multi-page-images.pdf", True, id="always-no-text"), - pytest.param( - "auto", - "multi-page-digital.pdf", - False, - id="auto-with-text-layer", + id="produce-archive-with-text", ), pytest.param( - "auto", + True, "multi-page-images.pdf", True, - id="auto-no-text-layer", + id="produce-archive-no-text", ), pytest.param( - "never", + False, "multi-page-digital.pdf", False, - id="never-with-text", + id="no-archive-with-text-layer", + ), + pytest.param( + False, + "multi-page-images.pdf", + False, + id="no-archive-no-text-layer", ), - pytest.param("never", "multi-page-images.pdf", False, id="never-no-text"), ], ) - def test_skip_archive_file_setting( + def test_produce_archive_flag( self, - skip_archive_file: str, + produce_archive: bool, # noqa: FBT001 filename: str, - expect_archive: str, + expect_archive: bool, # noqa: FBT001 tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.archive_file_generation 
= skip_archive_file - tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf") + """ + GIVEN: + - Various PDFs (with and without text layers) + - produce_archive flag set to True or False + WHEN: + - Document is parsed + THEN: + - archive_path is set if and only if produce_archive=True + - Text is always extracted + """ + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + tesseract_samples_dir / filename, + "application/pdf", + produce_archive=produce_archive, + ) text = tesseract_parser.get_text().lower() assert_ordered_substrings(text, ["page 1", "page 2", "page 3"]) if expect_archive: @@ -907,17 +932,18 @@ class TestParseMixed: ) -> None: """ GIVEN: - - File with mixed pages - - Mode: auto, skip_archive_file: auto + - File with mixed pages (some with text, some image-only) + - Mode: auto, produce_archive=False WHEN: - Document is parsed THEN: - - No archive created (file has text layer); later-page text present + - No archive created (produce_archive=False); text from text layer present """ tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-mixed.pdf", "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -964,12 +990,19 @@ class TestParseRtl: ) -> None: """ GIVEN: - - PDF with RTL Arabic text + - PDF with RTL Arabic text in its text layer (short: 18 chars) + - mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine WHEN: - Document is parsed THEN: - - Arabic content is extracted (normalised for bidi) + - Arabic content is extracted from the PDF text layer (normalised for bidi) + + Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode + would attempt full OCR, which fails due to PriorOcrFoundError and falls back to + force-ocr with English Tesseract (producing garbage). 
Using mode="off" forces + skip_text=True so the Arabic text layer is preserved through PDF/A conversion. """ + tesseract_parser.settings.mode = "off" tesseract_parser.parse( tesseract_samples_dir / "rtl-test.pdf", "application/pdf",