From 6eb6e352daf71a60facf89bdc18ae86aeb9043fe Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Fri, 27 Mar 2026 08:45:20 -0700
Subject: [PATCH] Adds a tagged PDF check as well, for an even better decision
 to skip OCR in auto mode

---
 docs/configuration.md                         |  4 +-
 src/documents/consumer.py                     |  3 ++
 src/documents/tests/test_consumer_archive.py  | 44 ++++++++++++---
 src/paperless/parsers/tesseract.py            |  3 +-
 src/paperless/parsers/utils.py                | 39 ++++++++++++++
 .../tests/parsers/test_tesseract_parser.py    | 53 +++++++++++++++++++
 src/paperless/tests/test_parser_utils.py      | 25 +++++++++
 7 files changed, 162 insertions(+), 9 deletions(-)
 create mode 100644 src/paperless/tests/test_parser_utils.py

diff --git a/docs/configuration.md b/docs/configuration.md
index cc2e0183c..79e94ed51 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -858,8 +858,8 @@ for display in the web interface.
     | Document type              | `never` | `auto` (default)           | `always` |
     | -------------------------- | ------- | -------------------------- | -------- |
     | Scanned image (TIFF, JPEG) | No      | **Yes**                    | Yes      |
-    | Image-based PDF            | No      | **Yes** (short/no text)    | Yes      |
-    | Born-digital PDF           | No      | No (has embedded text)     | Yes      |
+    | Image-based PDF            | No      | **Yes** (short/no text, untagged) | Yes |
+    | Born-digital PDF           | No      | No (tagged or has embedded text)  | Yes |
     | Plain text, email, HTML    | No      | No                         | No       |
     | DOCX / ODT (via Tika)      | Yes\*   | Yes\*                      | Yes\*    |
 
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 170849153..92eec23fc 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -57,6 +57,7 @@ from paperless.parsers import ParserProtocol
 from paperless.parsers.registry import get_parser_registry
 from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
 from paperless.parsers.utils import extract_pdf_text
+from paperless.parsers.utils import is_tagged_pdf
 
 LOGGING_NAME: Final[str] = "paperless.consumer"
 
@@ -140,6 +141,8 @@ def should_produce_archive(
     if mime_type.startswith("image/"):
         return True
     if mime_type == "application/pdf":
+        if is_tagged_pdf(document_path):
+            return False
         text = extract_pdf_text(document_path)
         return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
     return False
diff --git a/src/documents/tests/test_consumer_archive.py b/src/documents/tests/test_consumer_archive.py
index 68f95056c..265bd7bc6 100644
--- a/src/documents/tests/test_consumer_archive.py
+++ b/src/documents/tests/test_consumer_archive.py
@@ -3,13 +3,16 @@
 from __future__ import annotations
 
 from pathlib import Path
+from typing import TYPE_CHECKING
 from unittest.mock import MagicMock
-from unittest.mock import patch
 
 import pytest
 
 from documents.consumer import should_produce_archive
 
+if TYPE_CHECKING:
+    from pytest_mock import MockerFixture
+
 
 def _parser_instance(
     *,
@@ -144,14 +147,43 @@ class TestShouldProduceArchive:
     )
     def test_auto_pdf_archive_decision(
         self,
+        mocker: MockerFixture,
         settings,
         extracted_text: str | None,
         expected: bool,  # noqa: FBT001
     ) -> None:
         settings.ARCHIVE_FILE_GENERATION = "auto"
+        mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
+        mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
         parser = _parser_instance(can_produce=True, requires_rendition=False)
-        with patch("documents.consumer.extract_pdf_text", return_value=extracted_text):
-            assert (
-                should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
-                is expected
-            )
+        assert (
+            should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
+            is expected
+        )
+
+    def test_tagged_pdf_skips_archive_in_auto_mode(
+        self,
+        mocker: MockerFixture,
+        settings,
+    ) -> None:
+        """A tagged PDF yields no archive in auto mode, before any text-length check."""
+        settings.ARCHIVE_FILE_GENERATION = "auto"
+        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        assert (
+            should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
+            is False
+        )
+
+    def test_tagged_pdf_does_not_call_pdftotext(
+        self,
+        mocker: MockerFixture,
+        settings,
+    ) -> None:
+        """When a PDF is tagged, extract_pdf_text is never called (early return)."""
+        settings.ARCHIVE_FILE_GENERATION = "auto"
+        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
+        mock_extract = mocker.patch("documents.consumer.extract_pdf_text")
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
+        mock_extract.assert_not_called()
diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py
index 566acd92e..fb02c8e64 100644
--- a/src/paperless/parsers/tesseract.py
+++ b/src/paperless/parsers/tesseract.py
@@ -23,6 +23,7 @@ from paperless.models import CleanChoices
 from paperless.models import ModeChoices
 from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
 from paperless.parsers.utils import extract_pdf_text
+from paperless.parsers.utils import is_tagged_pdf
 from paperless.parsers.utils import read_file_handle_unicode_errors
 from paperless.version import __full_version_str__
 
@@ -441,7 +442,7 @@ class RasterisedDocumentParser:
 
         if mime_type == "application/pdf":
             text_original = self.extract_text(None, document_path)
-            original_has_text = (
+            original_has_text = is_tagged_pdf(document_path, log=self.log) or (
                 text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
             )
         else:
diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py
index 68dded3b7..0e65aadd3 100644
--- a/src/paperless/parsers/utils.py
+++ b/src/paperless/parsers/utils.py
@@ -24,6 +24,45 @@ logger = logging.getLogger("paperless.parsers.utils")
 PDF_TEXT_MIN_LENGTH = 50
 
 
+def is_tagged_pdf(
+    path: Path,
+    log: logging.Logger | None = None,
+) -> bool:
+    """Return True if the PDF's document root marks it as tagged.
+
+    Opens the file with pikepdf and reads ``/MarkInfo`` ``/Marked`` from the
+    document root.  Tagged PDFs (e.g. exported from Word or LibreOffice) are
+    treated as born-digital, so callers can skip OCR and archive generation.
+
+    https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449
+
+    Parameters
+    ----------
+    path:
+        Path to the PDF file to inspect.
+    log:
+        Logger for warnings.  Falls back to the module-level logger when omitted.
+
+    Returns
+    -------
+    bool
+        ``True`` when the PDF is tagged, ``False`` otherwise or on any error.
+    """
+    import pikepdf
+
+    _log = log or logger
+    try:
+        with pikepdf.open(path) as pdf:
+            mark_info = pdf.Root.get("/MarkInfo")
+            if mark_info is None:
+                return False
+            return bool(mark_info.get("/Marked", False))
+    except Exception:
+        # Best effort: unreadable or corrupt files are logged and reported as untagged.
+        _log.warning("Could not check PDF tag status for %s", path, exc_info=True)
+        return False
+
+
 def extract_pdf_text(
     path: Path,
     log: logging.Logger | None = None,
diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py
index 565361185..b686b6c75 100644
--- a/src/paperless/tests/parsers/test_tesseract_parser.py
+++ b/src/paperless/tests/parsers/test_tesseract_parser.py
@@ -851,6 +851,59 @@ class TestSkipArchive:
         else:
             assert tesseract_parser.archive_path is None
 
+    def test_tagged_pdf_skips_ocr_in_auto_mode(
+        self,
+        mocker: MockerFixture,
+        tesseract_parser: RasterisedDocumentParser,
+        tesseract_samples_dir: Path,
+    ) -> None:
+        """
+        GIVEN:
+            - A tagged PDF sample (simple-digital.pdf, /MarkInfo /Marked true)
+            - Mode: auto, produce_archive=False
+        WHEN:
+            - Document is parsed
+        THEN:
+            - OCRmyPDF is not invoked (tagged ⇒ original_has_text=True)
+            - Text is available from the parser (from the original, not OCR)
+            - No archive is produced
+        """
+        tesseract_parser.settings.mode = "auto"
+        mock_ocr = mocker.patch("ocrmypdf.ocr")
+        tesseract_parser.parse(
+            tesseract_samples_dir / "simple-digital.pdf",
+            "application/pdf",
+            produce_archive=False,
+        )
+        mock_ocr.assert_not_called()
+        assert tesseract_parser.archive_path is None
+        assert tesseract_parser.get_text()
+
+    def test_tagged_pdf_produces_pdfa_archive_without_ocr(
+        self,
+        tesseract_parser: RasterisedDocumentParser,
+        tesseract_samples_dir: Path,
+    ) -> None:
+        """
+        GIVEN:
+            - A tagged PDF sample (simple-digital.pdf, /MarkInfo /Marked true)
+            - Mode: auto, produce_archive=True
+        WHEN:
+            - Document is parsed
+        THEN:
+            - OCRmyPDF is expected to run with skip_text (not asserted here)
+            - Archive is produced
+            - Text is available from the parser
+        """
+        tesseract_parser.settings.mode = "auto"
+        tesseract_parser.parse(
+            tesseract_samples_dir / "simple-digital.pdf",
+            "application/pdf",
+            produce_archive=True,
+        )
+        assert tesseract_parser.archive_path is not None
+        assert tesseract_parser.get_text()
+
 
 # ---------------------------------------------------------------------------
 # Parse — mixed pages / sidecar
diff --git a/src/paperless/tests/test_parser_utils.py b/src/paperless/tests/test_parser_utils.py
new file mode 100644
index 000000000..ca6d9e6fe
--- /dev/null
+++ b/src/paperless/tests/test_parser_utils.py
@@ -0,0 +1,25 @@
+"""Tests for paperless.parsers.utils helpers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from paperless.parsers.utils import is_tagged_pdf
+
+SAMPLES = Path(__file__).parent / "samples" / "tesseract"
+
+
+class TestIsTaggedPdf:  # is_tagged_pdf() on sample PDFs and on unreadable input
+    def test_tagged_pdf_returns_true(self) -> None:
+        assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True
+
+    def test_untagged_pdf_returns_false(self) -> None:
+        assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False
+
+    def test_nonexistent_path_returns_false(self) -> None:
+        assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False  # error -> False
+
+    def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
+        bad = tmp_path / "bad.pdf"
+        bad.write_bytes(b"not a pdf")  # content that is not a PDF
+        assert is_tagged_pdf(bad) is False