Adds a tagged-PDF check as well, so auto mode can make a better decision about when to skip OCR

This commit is contained in:
Trenton H
2026-03-27 08:45:20 -07:00
parent d89a86643d
commit 6eb6e352da
7 changed files with 162 additions and 9 deletions

View File

@@ -858,8 +858,8 @@ for display in the web interface.
| Document type | `never` | `auto` (default) | `always` |
| -------------------------- | ------- | -------------------------- | -------- |
| Scanned image (TIFF, JPEG) | No | **Yes** | Yes |
| Image-based PDF | No | **Yes** (short/no text) | Yes |
| Born-digital PDF | No | No (has embedded text) | Yes |
| Image-based PDF | No | **Yes** (short/no text, untagged) | Yes |
| Born-digital PDF | No | No (tagged or has embedded text) | Yes |
| Plain text, email, HTML | No | No | No |
| DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* |

View File

@@ -57,6 +57,7 @@ from paperless.parsers import ParserProtocol
from paperless.parsers.registry import get_parser_registry
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
from paperless.parsers.utils import extract_pdf_text
from paperless.parsers.utils import is_tagged_pdf
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -140,6 +141,8 @@ def should_produce_archive(
if mime_type.startswith("image/"):
return True
if mime_type == "application/pdf":
if is_tagged_pdf(document_path):
return False
text = extract_pdf_text(document_path)
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
return False

View File

@@ -3,13 +3,16 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from documents.consumer import should_produce_archive
if TYPE_CHECKING:
from pytest_mock import MockerFixture
def _parser_instance(
*,
@@ -144,14 +147,43 @@ class TestShouldProduceArchive:
)
def test_auto_pdf_archive_decision(
self,
mocker: MockerFixture,
settings,
extracted_text: str | None,
expected: bool, # noqa: FBT001
) -> None:
settings.ARCHIVE_FILE_GENERATION = "auto"
mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
parser = _parser_instance(can_produce=True, requires_rendition=False)
with patch("documents.consumer.extract_pdf_text", return_value=extracted_text):
assert (
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
is expected
)
assert (
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
is expected
)
def test_tagged_pdf_skips_archive_in_auto_mode(
    self,
    mocker: MockerFixture,
    settings,
) -> None:
    """In auto mode, a PDF reported as tagged (e.g. a Word export) yields no archive."""
    settings.ARCHIVE_FILE_GENERATION = "auto"
    # Force the tagged-PDF probe to answer "yes" so no real file is needed.
    mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
    parser = _parser_instance(can_produce=True, requires_rendition=False)
    pdf_path = Path("/tmp/doc.pdf")

    decision = should_produce_archive(parser, "application/pdf", pdf_path)

    assert decision is False
def test_tagged_pdf_does_not_call_pdftotext(
    self,
    mocker: MockerFixture,
    settings,
) -> None:
    """A tagged PDF takes the fast path: the text extractor is never invoked."""
    settings.ARCHIVE_FILE_GENERATION = "auto"
    mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
    # Replace the extractor with a mock so any call to it can be detected.
    extract_spy = mocker.patch("documents.consumer.extract_pdf_text")
    pdf_path = Path("/tmp/doc.pdf")
    parser = _parser_instance(can_produce=True, requires_rendition=False)

    should_produce_archive(parser, "application/pdf", pdf_path)

    extract_spy.assert_not_called()

View File

@@ -23,6 +23,7 @@ from paperless.models import CleanChoices
from paperless.models import ModeChoices
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
from paperless.parsers.utils import extract_pdf_text
from paperless.parsers.utils import is_tagged_pdf
from paperless.parsers.utils import read_file_handle_unicode_errors
from paperless.version import __full_version_str__
@@ -441,7 +442,7 @@ class RasterisedDocumentParser:
if mime_type == "application/pdf":
text_original = self.extract_text(None, document_path)
original_has_text = (
original_has_text = is_tagged_pdf(document_path, log=self.log) or (
text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
)
else:

View File

@@ -24,6 +24,45 @@ logger = logging.getLogger("paperless.parsers.utils")
PDF_TEXT_MIN_LENGTH = 50
def is_tagged_pdf(
    path: Path,
    log: logging.Logger | None = None,
) -> bool:
    """Report whether a PDF marks itself as tagged (a born-digital indicator).

    A tagged PDF carries a ``/MarkInfo`` dictionary in its document root
    whose ``/Marked`` entry is true; files exported from Word or LibreOffice
    are typical examples.  Such a document has a logical structure and
    embedded text, so running OCR on it is unnecessary and archive
    generation can be skipped.

    Reference:
    https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449

    Parameters
    ----------
    path:
        Absolute path to the PDF file.
    log:
        Logger that receives the warning when the file cannot be inspected.
        The module-level logger is used when omitted.

    Returns
    -------
    bool
        ``True`` when the PDF is tagged; ``False`` when it is not, or when
        any error occurs while opening or reading the file.
    """
    import pikepdf

    active_log = logger if not log else log
    try:
        with pikepdf.open(path) as document:
            mark_info = document.Root.get("/MarkInfo")
            # Resolve the flag while the file is still open.
            tagged = mark_info is not None and bool(
                mark_info.get("/Marked", False),
            )
    except Exception:
        # Best-effort probe: any failure is logged and treated as "not tagged".
        active_log.warning(
            "Could not check PDF tag status for %s",
            path,
            exc_info=True,
        )
        return False
    return tagged
def extract_pdf_text(
path: Path,
log: logging.Logger | None = None,

View File

@@ -851,6 +851,59 @@ class TestSkipArchive:
else:
assert tesseract_parser.archive_path is None
def test_tagged_pdf_skips_ocr_in_auto_mode(
    self,
    mocker: MockerFixture,
    tesseract_parser: RasterisedDocumentParser,
    tesseract_samples_dir: Path,
) -> None:
    """
    GIVEN:
        - A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
        - Mode: auto, produce_archive=False
    WHEN:
        - Document is parsed
    THEN:
        - OCRmyPDF is not invoked (tagged implies original_has_text=True)
        - Text is extracted from the original via pdftotext
        - No archive is produced
    """
    # Stand in for the OCR entry point so any invocation can be detected.
    ocr_spy = mocker.patch("ocrmypdf.ocr")
    tesseract_parser.settings.mode = "auto"
    sample_pdf = tesseract_samples_dir / "simple-digital.pdf"

    tesseract_parser.parse(
        sample_pdf,
        "application/pdf",
        produce_archive=False,
    )

    ocr_spy.assert_not_called()
    assert tesseract_parser.archive_path is None
    assert tesseract_parser.get_text()
def test_tagged_pdf_produces_pdfa_archive_without_ocr(
    self,
    tesseract_parser: RasterisedDocumentParser,
    tesseract_samples_dir: Path,
) -> None:
    """
    GIVEN:
        - A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true)
        - Mode: auto, produce_archive=True
    WHEN:
        - Document is parsed
    THEN:
        - OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR)
        - Archive is produced
        - Text is preserved from the original
    """
    sample_pdf = tesseract_samples_dir / "simple-digital.pdf"
    tesseract_parser.settings.mode = "auto"

    tesseract_parser.parse(
        sample_pdf,
        "application/pdf",
        produce_archive=True,
    )

    # An explicitly requested archive is still generated for a tagged PDF.
    assert tesseract_parser.archive_path is not None
    assert tesseract_parser.get_text()
# ---------------------------------------------------------------------------
# Parse — mixed pages / sidecar

View File

@@ -0,0 +1,25 @@
"""Tests for paperless.parsers.utils helpers."""
from __future__ import annotations
from pathlib import Path
from paperless.parsers.utils import is_tagged_pdf
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
class TestIsTaggedPdf:
    """Checks ``is_tagged_pdf`` against sample PDFs and unreadable inputs."""

    def test_tagged_pdf_returns_true(self) -> None:
        """The ``simple-digital.pdf`` sample is reported as tagged."""
        tagged_sample = SAMPLES / "simple-digital.pdf"
        assert is_tagged_pdf(tagged_sample) is True

    def test_untagged_pdf_returns_false(self) -> None:
        """The ``multi-page-images.pdf`` sample is reported as not tagged."""
        untagged_sample = SAMPLES / "multi-page-images.pdf"
        assert is_tagged_pdf(untagged_sample) is False

    def test_nonexistent_path_returns_false(self) -> None:
        """A path that does not exist yields ``False`` rather than raising."""
        missing = Path("/nonexistent/file.pdf")
        assert is_tagged_pdf(missing) is False

    def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
        """A file that is not a valid PDF yields ``False`` rather than raising."""
        corrupt = tmp_path / "bad.pdf"
        corrupt.write_bytes(b"not a pdf")
        assert is_tagged_pdf(corrupt) is False