From 6eb6e352daf71a60facf89bdc18ae86aeb9043fe Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 27 Mar 2026 08:45:20 -0700 Subject: [PATCH] Adds a tagged PDF check as well, for an even better decision to skip OCR in auto mode --- docs/configuration.md | 4 +- src/documents/consumer.py | 3 ++ src/documents/tests/test_consumer_archive.py | 44 ++++++++++++--- src/paperless/parsers/tesseract.py | 3 +- src/paperless/parsers/utils.py | 39 ++++++++++++++ .../tests/parsers/test_tesseract_parser.py | 53 +++++++++++++++++++ src/paperless/tests/test_parser_utils.py | 25 +++++++++ 7 files changed, 162 insertions(+), 9 deletions(-) create mode 100644 src/paperless/tests/test_parser_utils.py diff --git a/docs/configuration.md b/docs/configuration.md index cc2e0183c..79e94ed51 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -858,8 +858,8 @@ for display in the web interface. | Document type | `never` | `auto` (default) | `always` | | -------------------------- | ------- | -------------------------- | -------- | | Scanned image (TIFF, JPEG) | No | **Yes** | Yes | - | Image-based PDF | No | **Yes** (short/no text) | Yes | - | Born-digital PDF | No | No (has embedded text) | Yes | + | Image-based PDF | No | **Yes** (short/no text, untagged) | Yes | + | Born-digital PDF | No | No (tagged or has embedded text) | Yes | | Plain text, email, HTML | No | No | No | | DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* | diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 170849153..92eec23fc 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -57,6 +57,7 @@ from paperless.parsers import ParserProtocol from paperless.parsers.registry import get_parser_registry from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH from paperless.parsers.utils import extract_pdf_text +from paperless.parsers.utils import is_tagged_pdf LOGGING_NAME: Final[str] = "paperless.consumer" @@ -140,6 +141,8 @@ def should_produce_archive( if mime_type.startswith("image/"): return True if mime_type == "application/pdf": + if is_tagged_pdf(document_path): + return False text = extract_pdf_text(document_path) return text is None or len(text) <= PDF_TEXT_MIN_LENGTH return False diff --git a/src/documents/tests/test_consumer_archive.py b/src/documents/tests/test_consumer_archive.py index 68f95056c..265bd7bc6 100644 --- a/src/documents/tests/test_consumer_archive.py +++ b/src/documents/tests/test_consumer_archive.py @@ -3,13 +3,16 @@ from __future__ import annotations from pathlib import Path +from typing import TYPE_CHECKING from unittest.mock import MagicMock -from unittest.mock import patch import pytest from documents.consumer import should_produce_archive +if TYPE_CHECKING: + from pytest_mock import MockerFixture + def _parser_instance( *, @@ -144,14 +147,43 @@ class TestShouldProduceArchive: ) def test_auto_pdf_archive_decision( self, + mocker: MockerFixture, settings, extracted_text: str | None, expected: bool, # noqa: FBT001 ) -> None: settings.ARCHIVE_FILE_GENERATION = "auto" + mocker.patch("documents.consumer.is_tagged_pdf", return_value=False) + mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text) parser = _parser_instance(can_produce=True, requires_rendition=False) - with patch("documents.consumer.extract_pdf_text", return_value=extracted_text): - assert ( - should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf")) - is expected - ) + assert ( + should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf")) + is expected + ) + + def test_tagged_pdf_skips_archive_in_auto_mode( + self, + mocker: MockerFixture, + settings, + ) -> None: + """Tagged PDFs (e.g. Word exports) are treated as born-digital regardless of text length.""" + settings.ARCHIVE_FILE_GENERATION = "auto" + mocker.patch("documents.consumer.is_tagged_pdf", return_value=True) + parser = _parser_instance(can_produce=True, requires_rendition=False) + assert ( + should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf")) + is False + ) + + def test_tagged_pdf_does_not_call_pdftotext( + self, + mocker: MockerFixture, + settings, + ) -> None: + """When a PDF is tagged, pdftotext is not invoked (fast path).""" + settings.ARCHIVE_FILE_GENERATION = "auto" + mocker.patch("documents.consumer.is_tagged_pdf", return_value=True) + mock_extract = mocker.patch("documents.consumer.extract_pdf_text") + parser = _parser_instance(can_produce=True, requires_rendition=False) + should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf")) + mock_extract.assert_not_called() diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index 566acd92e..fb02c8e64 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -23,6 +23,7 @@ from paperless.models import CleanChoices from paperless.models import ModeChoices from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH from paperless.parsers.utils import extract_pdf_text +from paperless.parsers.utils import is_tagged_pdf from paperless.parsers.utils import read_file_handle_unicode_errors from paperless.version import __full_version_str__ @@ -441,7 +442,7 @@ class RasterisedDocumentParser: if mime_type == "application/pdf": text_original = self.extract_text(None, document_path) - original_has_text = ( + original_has_text = is_tagged_pdf(document_path, log=self.log) or ( text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH ) else: diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py index 68dded3b7..0e65aadd3 100644 --- a/src/paperless/parsers/utils.py +++ b/src/paperless/parsers/utils.py @@ -24,6 +24,45 @@ logger = logging.getLogger("paperless.parsers.utils") PDF_TEXT_MIN_LENGTH = 50 +def is_tagged_pdf( + path: Path, + log: logging.Logger | None = None, +) -> bool: + """Return True if the PDF declares itself as tagged (born-digital indicator). + + Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo`` + with ``/Marked true`` in the document root. This is a reliable signal + that the document has a logical structure and embedded text — running OCR + on it is unnecessary and archive generation can be skipped. + + https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449 + + Parameters + ---------- + path: + Absolute path to the PDF file. + log: + Logger for warnings. Falls back to the module-level logger when omitted. + + Returns + ------- + bool + ``True`` when the PDF is tagged, ``False`` otherwise or on any error. + """ + import pikepdf + + _log = log or logger + try: + with pikepdf.open(path) as pdf: + mark_info = pdf.Root.get("/MarkInfo") + if mark_info is None: + return False + return bool(mark_info.get("/Marked", False)) + except Exception: + _log.warning("Could not check PDF tag status for %s", path, exc_info=True) + return False + + def extract_pdf_text( path: Path, log: logging.Logger | None = None, diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py index 565361185..b686b6c75 100644 --- a/src/paperless/tests/parsers/test_tesseract_parser.py +++ b/src/paperless/tests/parsers/test_tesseract_parser.py @@ -851,6 +851,59 @@ class TestSkipArchive: else: assert tesseract_parser.archive_path is None + def test_tagged_pdf_skips_ocr_in_auto_mode( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + tesseract_samples_dir: Path, + ) -> None: + """ + GIVEN: + - A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true) + - Mode: auto, produce_archive=False + WHEN: + - Document is parsed + THEN: + - OCRmyPDF is not invoked (tagged ⇒ original_has_text=True) + - Text is extracted from the original via pdftotext + - No archive is produced + """ + tesseract_parser.settings.mode = "auto" + mock_ocr = mocker.patch("ocrmypdf.ocr") + tesseract_parser.parse( + tesseract_samples_dir / "simple-digital.pdf", + "application/pdf", + produce_archive=False, + ) + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() + + def test_tagged_pdf_produces_pdfa_archive_without_ocr( + self, + tesseract_parser: RasterisedDocumentParser, + tesseract_samples_dir: Path, + ) -> None: + """ + GIVEN: + - A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true) + - Mode: auto, produce_archive=True + WHEN: + - Document is parsed + THEN: + - OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR) + - Archive is produced + - Text is preserved from the original + """ + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + tesseract_samples_dir / "simple-digital.pdf", + "application/pdf", + produce_archive=True, + ) + assert tesseract_parser.archive_path is not None + assert tesseract_parser.get_text() + # --------------------------------------------------------------------------- # Parse — mixed pages / sidecar diff --git a/src/paperless/tests/test_parser_utils.py b/src/paperless/tests/test_parser_utils.py new file mode 100644 index 000000000..ca6d9e6fe --- /dev/null +++ b/src/paperless/tests/test_parser_utils.py @@ -0,0 +1,25 @@ +"""Tests for paperless.parsers.utils helpers.""" + +from __future__ import annotations + +from pathlib import Path + +from paperless.parsers.utils import is_tagged_pdf + +SAMPLES = Path(__file__).parent / "samples" / "tesseract" + + +class TestIsTaggedPdf: + def test_tagged_pdf_returns_true(self) -> None: + assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True + + def test_untagged_pdf_returns_false(self) -> None: + assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False + + def test_nonexistent_path_returns_false(self) -> None: + assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False + + def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None: + bad = tmp_path / "bad.pdf" + bad.write_bytes(b"not a pdf") + assert is_tagged_pdf(bad) is False