Adds a tagged PDF check as well, for an even better decision to skip OCR in auto mode

2026-05-12 17:45:24 +00:00 · 2026-03-27 08:45:20 -07:00
parent d89a86643d
commit 6eb6e352da
7 changed files with 162 additions and 9 deletions
@@ -858,8 +858,8 @@ for display in the web interface.
    | Document type              | `never` | `auto` (default)           | `always` |
    | -------------------------- | ------- | -------------------------- | -------- |
    | Scanned image (TIFF, JPEG) | No      | **Yes**                    | Yes      |
-    | Image-based PDF            | No      | **Yes** (short/no text)    | Yes      |
-    | Born-digital PDF           | No      | No (has embedded text)     | Yes      |
+    | Image-based PDF            | No      | **Yes** (short/no text, untagged) | Yes |
+    | Born-digital PDF           | No      | No (tagged or has embedded text)  | Yes |
    | Plain text, email, HTML    | No      | No                         | No       |
    | DOCX / ODT (via Tika)      | Yes\*   | Yes\*                      | Yes\*    |

@@ -57,6 +57,7 @@ from paperless.parsers import ParserProtocol
 from paperless.parsers.registry import get_parser_registry
 from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
 from paperless.parsers.utils import extract_pdf_text
+from paperless.parsers.utils import is_tagged_pdf

 LOGGING_NAME: Final[str] = "paperless.consumer"

@@ -140,6 +141,8 @@ def should_produce_archive(
    if mime_type.startswith("image/"):
        return True
    if mime_type == "application/pdf":
+        if is_tagged_pdf(document_path):
+            return False
        text = extract_pdf_text(document_path)
        return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
    return False
@@ -3,13 +3,16 @@
 from __future__ import annotations

 from pathlib import Path
+from typing import TYPE_CHECKING
 from unittest.mock import MagicMock
-from unittest.mock import patch

 import pytest

 from documents.consumer import should_produce_archive

+if TYPE_CHECKING:
+    from pytest_mock import MockerFixture
+

 def _parser_instance(
    *,
@@ -144,14 +147,43 @@ class TestShouldProduceArchive:
    )
    def test_auto_pdf_archive_decision(
        self,
+        mocker: MockerFixture,
        settings,
        extracted_text: str | None,
        expected: bool,  # noqa: FBT001
    ) -> None:
        settings.ARCHIVE_FILE_GENERATION = "auto"
+        mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
+        mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
        parser = _parser_instance(can_produce=True, requires_rendition=False)
-        with patch("documents.consumer.extract_pdf_text", return_value=extracted_text):
-            assert (
-                should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
-                is expected
-            )
+        assert (
+            should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
+            is expected
+        )
+
+    def test_tagged_pdf_skips_archive_in_auto_mode(
+        self,
+        mocker: MockerFixture,
+        settings,
+    ) -> None:
+        """In auto mode, a PDF reported as tagged (e.g. a Word export) gets no archive, regardless of text length."""
+        settings.ARCHIVE_FILE_GENERATION = "auto"
+        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)  # simulate a tagged PDF
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        assert (
+            should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
+            is False
+        )
+
+    def test_tagged_pdf_does_not_call_pdftotext(
+        self,
+        mocker: MockerFixture,
+        settings,
+    ) -> None:
+        """When is_tagged_pdf reports True, extract_pdf_text is never called (fast path)."""
+        settings.ARCHIVE_FILE_GENERATION = "auto"
+        mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)  # simulate a tagged PDF
+        mock_extract = mocker.patch("documents.consumer.extract_pdf_text")  # spy; must stay uncalled
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
+        mock_extract.assert_not_called()
@@ -23,6 +23,7 @@ from paperless.models import CleanChoices
 from paperless.models import ModeChoices
 from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
 from paperless.parsers.utils import extract_pdf_text
+from paperless.parsers.utils import is_tagged_pdf
 from paperless.parsers.utils import read_file_handle_unicode_errors
 from paperless.version import __full_version_str__

@@ -441,7 +442,7 @@ class RasterisedDocumentParser:

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
-            original_has_text = (
+            original_has_text = is_tagged_pdf(document_path, log=self.log) or (
                text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
            )
        else:
@@ -24,6 +24,45 @@ logger = logging.getLogger("paperless.parsers.utils")
 PDF_TEXT_MIN_LENGTH = 50


+def is_tagged_pdf(
+    path: Path,
+    log: logging.Logger | None = None,
+) -> bool:
+    """Return True if the PDF declares itself as tagged (born-digital indicator).
+
+    Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo``
+    with ``/Marked true`` in the document root.  Callers treat this as a signal
+    that the document has a logical structure and embedded text — running OCR
+    on it is unnecessary and archive generation can be skipped.
+
+    https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449
+
+    Parameters
+    ----------
+    path:
+        Absolute path to the PDF file.
+    log:
+        Logger for warnings.  Falls back to the module-level logger when omitted.
+
+    Returns
+    -------
+    bool
+        ``True`` when ``/MarkInfo`` has a truthy ``/Marked``; ``False`` otherwise or on any error (logged as a warning).
+    """
+    import pikepdf
+
+    _log = log or logger  # caller's logger if given, else the module-level one
+    try:
+        with pikepdf.open(path) as pdf:
+            mark_info = pdf.Root.get("/MarkInfo")
+            if mark_info is None:  # no /MarkInfo entry in the document root
+                return False
+            return bool(mark_info.get("/Marked", False))  # a missing /Marked counts as untagged
+    except Exception:  # deliberately broad: any failure is reported as "not tagged"
+        _log.warning("Could not check PDF tag status for %s", path, exc_info=True)
+        return False
+
+
 def extract_pdf_text(
    path: Path,
    log: logging.Logger | None = None,
@@ -851,6 +851,59 @@ class TestSkipArchive:
        else:
            assert tesseract_parser.archive_path is None

+    def test_tagged_pdf_skips_ocr_in_auto_mode(
+        self,
+        mocker: MockerFixture,
+        tesseract_parser: RasterisedDocumentParser,
+        tesseract_samples_dir: Path,
+    ) -> None:
+        """
+        GIVEN:
+            - A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
+            - Mode: auto, produce_archive=False
+        WHEN:
+            - Document is parsed
+        THEN:
+            - OCRmyPDF is not invoked (tagged ⇒ original_has_text=True)
+            - Text is extracted from the original (only checked as non-empty here)
+            - No archive is produced
+        """
+        tesseract_parser.settings.mode = "auto"
+        mock_ocr = mocker.patch("ocrmypdf.ocr")  # spy on the OCRmyPDF entry point
+        tesseract_parser.parse(
+            tesseract_samples_dir / "simple-digital.pdf",
+            "application/pdf",
+            produce_archive=False,
+        )
+        mock_ocr.assert_not_called()
+        assert tesseract_parser.archive_path is None
+        assert tesseract_parser.get_text()
+
+    def test_tagged_pdf_produces_pdfa_archive_without_ocr(
+        self,
+        tesseract_parser: RasterisedDocumentParser,
+        tesseract_samples_dir: Path,
+    ) -> None:
+        """
+        GIVEN:
+            - A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
+            - Mode: auto, produce_archive=True
+        WHEN:
+            - Document is parsed
+        THEN:
+            - OCRmyPDF is expected to run with skip_text (PDF/A conversion only, no OCR); not asserted here
+            - Archive is produced
+            - Text is preserved from the original (only checked as non-empty here)
+        """
+        tesseract_parser.settings.mode = "auto"
+        tesseract_parser.parse(
+            tesseract_samples_dir / "simple-digital.pdf",
+            "application/pdf",
+            produce_archive=True,
+        )
+        assert tesseract_parser.archive_path is not None  # an archive path was set
+        assert tesseract_parser.get_text()
+

 # ---------------------------------------------------------------------------
 # Parse — mixed pages / sidecar
@@ -0,0 +1,25 @@
+"""Tests for paperless.parsers.utils helpers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from paperless.parsers.utils import is_tagged_pdf
+
+SAMPLES = Path(__file__).parent / "samples" / "tesseract"
+
+
+class TestIsTaggedPdf:  # is_tagged_pdf against sample PDFs and unreadable inputs
+    def test_tagged_pdf_returns_true(self) -> None:
+        assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True
+
+    def test_untagged_pdf_returns_false(self) -> None:
+        assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False
+
+    def test_nonexistent_path_returns_false(self) -> None:
+        assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False  # errors yield False, not a raise
+
+    def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
+        bad = tmp_path / "bad.pdf"
+        bad.write_bytes(b"not a pdf")  # garbage bytes; presumably rejected when pikepdf opens the file
+        assert is_tagged_pdf(bad) is False