mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-29 04:12:45 +00:00
refactor: consolidate pdftotext utility and archive-decision logic
- Add extract_pdf_text() and PDF_TEXT_MIN_LENGTH to paperless/parsers/utils.py, eliminating duplicate pdftotext call sites in consumer.py and tesseract.py - Rename _should_produce_archive → should_produce_archive (now public, imported by both consumer.py and tasks.py) - update_document_content_maybe_archive_file now calls should_produce_archive, honouring ARCHIVE_FILE_GENERATION the same as the consumer pipeline - Fallback OCR path sets archive_path when produce_archive=True; update test_with_form_redo_produces_no_archive to use produce_archive=False Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
@@ -56,6 +55,8 @@ from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
@@ -108,40 +109,12 @@ class ConsumerStatusShortMessage(StrEnum):
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
_VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK = 50
|
||||
|
||||
|
||||
def _extract_text_for_archive_check(path: Path) -> str | None:
|
||||
"""Run pdftotext on *path* and return the text, or None on any failure.
|
||||
|
||||
Used only for the ARCHIVE_FILE_GENERATION=auto born-digital detection.
|
||||
"""
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
out_path = Path(tmpdir) / "text.txt"
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(path),
|
||||
str(out_path),
|
||||
],
|
||||
logger=logging.getLogger(__name__),
|
||||
)
|
||||
return out_path.read_text(encoding="utf-8", errors="replace") or None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _should_produce_archive(
|
||||
def should_produce_archive(
|
||||
parser: "ParserProtocol",
|
||||
mime_type: str,
|
||||
working_copy: Path,
|
||||
document_path: Path,
|
||||
) -> bool:
|
||||
"""Return True if the consumer should request a PDF/A archive from the parser.
|
||||
"""Return True if a PDF/A archive should be produced for this document.
|
||||
|
||||
IMPORTANT: *parser* must be an instantiated parser, not the class.
|
||||
``requires_pdf_rendition`` and ``can_produce_archive`` are instance
|
||||
@@ -167,8 +140,8 @@ def _should_produce_archive(
|
||||
if mime_type.startswith("image/"):
|
||||
return True
|
||||
if mime_type == "application/pdf":
|
||||
text = _extract_text_for_archive_check(working_copy)
|
||||
return text is None or len(text) <= _VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK
|
||||
text = extract_pdf_text(document_path)
|
||||
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
|
||||
return False
|
||||
|
||||
|
||||
@@ -507,7 +480,7 @@ class ConsumerPlugin(
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
should_produce_archive = _should_produce_archive(
|
||||
produce_archive = should_produce_archive(
|
||||
document_parser,
|
||||
mime_type,
|
||||
self.working_copy,
|
||||
@@ -515,7 +488,7 @@ class ConsumerPlugin(
|
||||
document_parser.parse(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
produce_archive=should_produce_archive,
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
|
||||
@@ -35,6 +35,7 @@ from documents.consumer import AsnCheckPlugin
|
||||
from documents.consumer import ConsumerPlugin
|
||||
from documents.consumer import ConsumerPreflightPlugin
|
||||
from documents.consumer import WorkflowTriggerPlugin
|
||||
from documents.consumer import should_produce_archive
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.double_sided import CollatePlugin
|
||||
@@ -321,7 +322,16 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
parser.configure(ParserContext())
|
||||
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type)
|
||||
produce_archive = should_produce_archive(
|
||||
parser,
|
||||
mime_type,
|
||||
document.source_path,
|
||||
)
|
||||
parser.parse(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
produce_archive=produce_archive,
|
||||
)
|
||||
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
|
||||
|
||||
@@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes
|
||||
CONSUMER_TAG_BARCODE_SPLIT=True,
|
||||
CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
|
||||
CELERY_TASK_ALWAYS_EAGER=True,
|
||||
OCR_MODE="skip",
|
||||
OCR_MODE="auto",
|
||||
)
|
||||
def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Tests for _should_produce_archive() and _extract_text_for_archive_check()."""
|
||||
"""Tests for should_produce_archive()."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -9,7 +9,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.consumer import _should_produce_archive
|
||||
from documents.consumer import should_produce_archive
|
||||
|
||||
|
||||
def _parser_instance(
|
||||
@@ -57,7 +57,7 @@ class TestShouldProduceArchive:
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="never")
|
||||
def test_never_setting_returns_false(self) -> None:
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"application/pdf",
|
||||
Path("/tmp/doc.pdf"),
|
||||
@@ -67,7 +67,7 @@ class TestShouldProduceArchive:
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
||||
def test_always_setting_returns_true(self) -> None:
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"application/pdf",
|
||||
Path("/tmp/doc.pdf"),
|
||||
@@ -78,7 +78,7 @@ class TestShouldProduceArchive:
|
||||
def test_requires_pdf_rendition_overrides_never(self) -> None:
|
||||
"""requires_pdf_rendition=True forces archive even when setting is never."""
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=True)
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"application/pdf",
|
||||
Path("/tmp/doc.pdf"),
|
||||
@@ -89,14 +89,14 @@ class TestShouldProduceArchive:
|
||||
def test_cannot_produce_archive_overrides_always(self) -> None:
|
||||
"""can_produce_archive=False prevents archive even when setting is always."""
|
||||
parser = _parser_instance(can_produce=False, requires_rendition=False)
|
||||
result = _should_produce_archive(parser, "text/plain", Path("/tmp/doc.txt"))
|
||||
result = should_produce_archive(parser, "text/plain", Path("/tmp/doc.txt"))
|
||||
assert result is False
|
||||
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="auto")
|
||||
def test_auto_image_returns_true(self) -> None:
|
||||
"""auto mode: image/* MIME types always produce archive (scanned doc)."""
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
result = _should_produce_archive(parser, "image/tiff", Path("/tmp/scan.tiff"))
|
||||
result = should_produce_archive(parser, "image/tiff", Path("/tmp/scan.tiff"))
|
||||
assert result is True
|
||||
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="auto")
|
||||
@@ -105,10 +105,10 @@ class TestShouldProduceArchive:
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
long_text = "This is a born-digital PDF with lots of text content. " * 10
|
||||
with patch(
|
||||
"documents.consumer._extract_text_for_archive_check",
|
||||
"documents.consumer.extract_pdf_text",
|
||||
return_value=long_text,
|
||||
):
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"application/pdf",
|
||||
Path("/tmp/doc.pdf"),
|
||||
@@ -120,10 +120,10 @@ class TestShouldProduceArchive:
|
||||
"""auto mode: PDF where pdftotext returns None (scanned) produces archive."""
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
with patch(
|
||||
"documents.consumer._extract_text_for_archive_check",
|
||||
"documents.consumer.extract_pdf_text",
|
||||
return_value=None,
|
||||
):
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"application/pdf",
|
||||
Path("/tmp/scan.pdf"),
|
||||
@@ -135,10 +135,10 @@ class TestShouldProduceArchive:
|
||||
"""auto mode: PDF with very short text (<=50 chars) is treated as scanned."""
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
with patch(
|
||||
"documents.consumer._extract_text_for_archive_check",
|
||||
"documents.consumer.extract_pdf_text",
|
||||
return_value="tiny",
|
||||
):
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"application/pdf",
|
||||
Path("/tmp/scan.pdf"),
|
||||
@@ -149,7 +149,7 @@ class TestShouldProduceArchive:
|
||||
def test_auto_non_pdf_non_image_returns_false(self) -> None:
|
||||
"""auto mode: other MIME types (e.g. email) don't produce archive by default."""
|
||||
parser = _parser_instance(can_produce=True, requires_rendition=False)
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"message/rfc822",
|
||||
Path("/tmp/email.eml"),
|
||||
@@ -160,7 +160,7 @@ class TestShouldProduceArchive:
|
||||
def test_requires_rendition_with_can_produce_false_returns_true(self) -> None:
|
||||
"""requires_pdf_rendition=True always wins, even if can_produce_archive=False."""
|
||||
parser = _parser_instance(can_produce=False, requires_rendition=True)
|
||||
result = _should_produce_archive(
|
||||
result = should_produce_archive(
|
||||
parser,
|
||||
"application/pdf",
|
||||
Path("/tmp/doc.pdf"),
|
||||
|
||||
@@ -27,7 +27,10 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf"
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@override_settings(
|
||||
FILENAME_FORMAT="{correspondent}/{title}",
|
||||
ARCHIVE_FILE_GENERATION="always",
|
||||
)
|
||||
class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
def make_models(self):
|
||||
return Document.objects.create(
|
||||
|
||||
@@ -232,6 +232,7 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertEqual(Document.global_objects.count(), 0)
|
||||
|
||||
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||
def test_update_content_maybe_archive_file(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -21,6 +21,8 @@ from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
|
||||
from paperless.parsers.utils import extract_pdf_text
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
@@ -250,36 +252,7 @@ class RasterisedDocumentParser:
|
||||
if not Path(pdf_file).is_file():
|
||||
return None
|
||||
|
||||
try:
|
||||
text = None
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w+",
|
||||
dir=self.tempdir,
|
||||
) as tmp:
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(pdf_file),
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
except Exception:
|
||||
# If pdftotext fails, fall back to OCR.
|
||||
self.log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
# probably not a PDF file.
|
||||
return None
|
||||
return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log))
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
@@ -465,12 +438,11 @@ class RasterisedDocumentParser:
|
||||
) -> None:
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
text_original = self.extract_text(None, document_path)
|
||||
original_has_text = (
|
||||
text_original is not None and len(text_original) > VALID_TEXT_LENGTH
|
||||
text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
|
||||
)
|
||||
else:
|
||||
text_original = None
|
||||
@@ -612,6 +584,8 @@ class RasterisedDocumentParser:
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path_fallback
|
||||
except Exception as e:
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
|
||||
|
||||
@@ -10,15 +10,65 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
# Minimum character count for a PDF to be considered "born-digital" (has real text).
|
||||
# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
|
||||
PDF_TEXT_MIN_LENGTH = 50
|
||||
|
||||
|
||||
def extract_pdf_text(
|
||||
path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> str | None:
|
||||
"""Run pdftotext on *path* and return the extracted text, or None on failure.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger for warnings. Falls back to the module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or ``None`` if pdftotext fails or the file is not a PDF.
|
||||
"""
|
||||
from documents.utils import run_subprocess
|
||||
|
||||
_log = log or logger
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
out_path = Path(tmpdir) / "text.txt"
|
||||
run_subprocess(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
str(path),
|
||||
str(out_path),
|
||||
],
|
||||
logger=_log,
|
||||
)
|
||||
text = read_file_handle_unicode_errors(out_path, log=_log)
|
||||
return text or None
|
||||
except Exception:
|
||||
_log.warning(
|
||||
"Error while getting text from PDF document with pdftotext",
|
||||
exc_info=True,
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def read_file_handle_unicode_errors(
|
||||
filepath: Path,
|
||||
|
||||
@@ -408,7 +408,7 @@ class TestParsePdf:
|
||||
["Please enter your name in here:", "This is a PDF document with a form."],
|
||||
)
|
||||
|
||||
def test_with_form_redo_produces_no_archive(
|
||||
def test_with_form_redo_no_archive_when_not_requested(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
@@ -417,6 +417,7 @@ class TestParsePdf:
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "with-form.pdf",
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
|
||||
Reference in New Issue
Block a user