Also adds a tagged-PDF check, so auto mode can make an even better decision about when to skip OCR

This commit is contained in:
Trenton H
2026-03-27 08:45:20 -07:00
parent d89a86643d
commit 6eb6e352da
7 changed files with 162 additions and 9 deletions
+3
View File
@@ -57,6 +57,7 @@ from paperless.parsers import ParserProtocol
from paperless.parsers.registry import get_parser_registry
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
from paperless.parsers.utils import extract_pdf_text
from paperless.parsers.utils import is_tagged_pdf
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -140,6 +141,8 @@ def should_produce_archive(
if mime_type.startswith("image/"):
return True
if mime_type == "application/pdf":
if is_tagged_pdf(document_path):
return False
text = extract_pdf_text(document_path)
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
return False
+38 -6
View File
@@ -3,13 +3,16 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from documents.consumer import should_produce_archive
if TYPE_CHECKING:
from pytest_mock import MockerFixture
def _parser_instance(
*,
@@ -144,14 +147,43 @@ class TestShouldProduceArchive:
)
def test_auto_pdf_archive_decision(
    self,
    mocker: MockerFixture,
    settings,
    extracted_text: str | None,
    expected: bool,  # noqa: FBT001
) -> None:
    """Check the auto-mode archive decision for a PDF reported as untagged.

    ``extracted_text`` and ``expected`` are supplied by the parametrize
    decorator on this test. ``is_tagged_pdf`` is patched to return False so
    that the patched ``extract_pdf_text`` result is what reaches the
    text-length check in ``should_produce_archive``.
    """
    settings.ARCHIVE_FILE_GENERATION = "auto"
    # Both helpers are patched where documents.consumer looks them up, so the
    # test never touches a real file at /tmp/doc.pdf.
    mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
    mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
    parser = _parser_instance(can_produce=True, requires_rendition=False)
    # A single assertion is enough: the previous body also wrapped an identical
    # assertion in a redundant ``with patch(...)`` of the same target.
    assert (
        should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
        is expected
    )
def test_tagged_pdf_skips_archive_in_auto_mode(
    self,
    mocker: MockerFixture,
    settings,
) -> None:
    """A PDF reported as tagged yields no archive in auto mode.

    ``is_tagged_pdf`` is patched to return True; the decision must then be
    exactly ``False``.
    """
    mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
    settings.ARCHIVE_FILE_GENERATION = "auto"

    pdf_path = Path("/tmp/doc.pdf")
    archive_parser = _parser_instance(can_produce=True, requires_rendition=False)
    decision = should_produce_archive(archive_parser, "application/pdf", pdf_path)

    assert decision is False
def test_tagged_pdf_does_not_call_pdftotext(
    self,
    mocker: MockerFixture,
    settings,
) -> None:
    """When the PDF is reported as tagged, text extraction is never invoked."""
    settings.ARCHIVE_FILE_GENERATION = "auto"
    # Patch extraction first and keep the mock so its call record can be checked.
    extract_spy = mocker.patch("documents.consumer.extract_pdf_text")
    mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)

    pdf_path = Path("/tmp/doc.pdf")
    archive_parser = _parser_instance(can_produce=True, requires_rendition=False)
    should_produce_archive(archive_parser, "application/pdf", pdf_path)

    extract_spy.assert_not_called()