mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-10 15:49:45 +00:00
Adds a tagged PDF check as well, for an even better decision to skip OCR in auto mode
This commit is contained in:
@@ -57,6 +57,7 @@ from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
@@ -140,6 +141,8 @@ def should_produce_archive(
|
||||
if mime_type.startswith("image/"):
|
||||
return True
|
||||
if mime_type == "application/pdf":
|
||||
if is_tagged_pdf(document_path):
|
||||
return False
|
||||
text = extract_pdf_text(document_path)
|
||||
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
|
||||
return False
|
||||
|
||||
@@ -3,13 +3,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from documents.consumer import should_produce_archive
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
def _parser_instance(
|
||||
*,
|
||||
@@ -144,14 +147,43 @@ class TestShouldProduceArchive:
|
||||
)
|
||||
def test_auto_pdf_archive_decision(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
extracted_text: str | None,
|
||||
expected: bool, # noqa: FBT001
|
||||
) -> None:
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=False)
|
||||
mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text)
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
with patch("documents.consumer.extract_pdf_text", return_value=extracted_text):
|
||||
assert (
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
is expected
|
||||
)
|
||||
assert (
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
is expected
|
||||
)
|
||||
|
||||
def test_tagged_pdf_skips_archive_in_auto_mode(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
) -> None:
|
||||
"""Tagged PDFs (e.g. Word exports) are treated as born-digital regardless of text length."""
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
assert (
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
is False
|
||||
)
|
||||
|
||||
def test_tagged_pdf_does_not_call_pdftotext(
|
||||
self,
|
||||
mocker: MockerFixture,
|
||||
settings,
|
||||
) -> None:
|
||||
"""When a PDF is tagged, pdftotext is not invoked (fast path)."""
|
||||
settings.ARCHIVE_FILE_GENERATION = "auto"
|
||||
mocker.patch("documents.consumer.is_tagged_pdf", return_value=True)
|
||||
mock_extract = mocker.patch("documents.consumer.extract_pdf_text")
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf"))
|
||||
mock_extract.assert_not_called()
|
||||
|
||||
Reference in New Issue
Block a user