From e45f81db395f8677cb286ae4dbf8d128a09a3b69 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:40:02 -0700 Subject: [PATCH] refactor: consolidate pdftotext utility and archive-decision logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add extract_pdf_text() and PDF_TEXT_MIN_LENGTH to paperless/parsers/utils.py, eliminating duplicate pdftotext call sites in consumer.py and tesseract.py - Rename _should_produce_archive → should_produce_archive (now public, imported by both consumer.py and tasks.py) - update_document_content_maybe_archive_file now calls should_produce_archive, honouring ARCHIVE_FILE_GENERATION the same as the consumer pipeline - Fallback OCR path sets archive_path when produce_archive=True; update test_with_form_redo_produces_no_archive to use produce_archive=False Co-Authored-By: Claude Sonnet 4.6 --- src/documents/consumer.py | 45 ++++------------ src/documents/tasks.py | 12 ++++- src/documents/tests/test_barcodes.py | 2 +- src/documents/tests/test_consumer_archive.py | 30 +++++------ src/documents/tests/test_management.py | 5 +- src/documents/tests/test_tasks.py | 1 + src/paperless/parsers/tesseract.py | 38 +++---------- src/paperless/parsers/utils.py | 54 ++++++++++++++++++- .../tests/parsers/test_tesseract_parser.py | 3 +- 9 files changed, 101 insertions(+), 89 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index c5a4c6204..a80594409 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,6 +1,5 @@ import datetime import hashlib -import logging import os import shutil import tempfile @@ -56,6 +55,8 @@ from paperless.models import ArchiveFileGenerationChoices from paperless.parsers import ParserContext from paperless.parsers import ParserProtocol from paperless.parsers.registry import get_parser_registry +from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH +from paperless.parsers.utils import extract_pdf_text LOGGING_NAME: Final[str] = "paperless.consumer" @@ -108,40 +109,12 @@ class ConsumerStatusShortMessage(StrEnum): FAILED = "failed" -_VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK = 50 - - -def _extract_text_for_archive_check(path: Path) -> str | None: - """Run pdftotext on *path* and return the text, or None on any failure. - - Used only for the ARCHIVE_FILE_GENERATION=auto born-digital detection. - """ - try: - with tempfile.TemporaryDirectory() as tmpdir: - out_path = Path(tmpdir) / "text.txt" - run_subprocess( - [ - "pdftotext", - "-q", - "-layout", - "-enc", - "UTF-8", - str(path), - str(out_path), - ], - logger=logging.getLogger(__name__), - ) - return out_path.read_text(encoding="utf-8", errors="replace") or None - except Exception: - return None - - -def _should_produce_archive( +def should_produce_archive( parser: "ParserProtocol", mime_type: str, - working_copy: Path, + document_path: Path, ) -> bool: - """Return True if the consumer should request a PDF/A archive from the parser. + """Return True if a PDF/A archive should be produced for this document. IMPORTANT: *parser* must be an instantiated parser, not the class. ``requires_pdf_rendition`` and ``can_produce_archive`` are instance @@ -167,8 +140,8 @@ def _should_produce_archive( if mime_type.startswith("image/"): return True if mime_type == "application/pdf": - text = _extract_text_for_archive_check(working_copy) - return text is None or len(text) <= _VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK + text = extract_pdf_text(document_path) + return text is None or len(text) <= PDF_TEXT_MIN_LENGTH return False @@ -507,7 +480,7 @@ class ConsumerPlugin( ) self.log.debug(f"Parsing {self.filename}...") - should_produce_archive = _should_produce_archive( + produce_archive = should_produce_archive( document_parser, mime_type, self.working_copy, @@ -515,7 +488,7 @@ class ConsumerPlugin( document_parser.parse( self.working_copy, mime_type, - produce_archive=should_produce_archive, + produce_archive=produce_archive, ) self.log.debug(f"Generating thumbnail for {self.filename}...") diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 20351a49d..8b5db781c 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -35,6 +35,7 @@ from documents.consumer import AsnCheckPlugin from documents.consumer import ConsumerPlugin from documents.consumer import ConsumerPreflightPlugin from documents.consumer import WorkflowTriggerPlugin +from documents.consumer import should_produce_archive from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.double_sided import CollatePlugin @@ -321,7 +322,16 @@ def update_document_content_maybe_archive_file(document_id) -> None: parser.configure(ParserContext()) try: - parser.parse(document.source_path, mime_type) + produce_archive = should_produce_archive( + parser, + mime_type, + document.source_path, + ) + parser.parse( + document.source_path, + mime_type, + produce_archive=produce_archive, + ) thumbnail = parser.get_thumbnail(document.source_path, mime_type) diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index b1847f2b4..4d8da62a3 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes CONSUMER_TAG_BARCODE_SPLIT=True, CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"}, CELERY_TASK_ALWAYS_EAGER=True, - OCR_MODE="skip", + OCR_MODE="auto", ) def test_consume_barcode_file_tag_split_and_assignment(self) -> None: """ diff --git a/src/documents/tests/test_consumer_archive.py b/src/documents/tests/test_consumer_archive.py index e3e829685..0af416322 100644 --- a/src/documents/tests/test_consumer_archive.py +++ b/src/documents/tests/test_consumer_archive.py @@ -1,4 +1,4 @@ -"""Tests for _should_produce_archive() and _extract_text_for_archive_check().""" +"""Tests for should_produce_archive().""" from __future__ import annotations @@ -9,7 +9,7 @@ from unittest.mock import patch import pytest from django.test import override_settings -from documents.consumer import _should_produce_archive +from documents.consumer import should_produce_archive def _parser_instance( @@ -57,7 +57,7 @@ class TestShouldProduceArchive: @override_settings(ARCHIVE_FILE_GENERATION="never") def test_never_setting_returns_false(self) -> None: parser = _parser_instance(can_produce=True, requires_rendition=False) - result = _should_produce_archive( + result = should_produce_archive( parser, "application/pdf", Path("/tmp/doc.pdf"), @@ -67,7 +67,7 @@ class TestShouldProduceArchive: @override_settings(ARCHIVE_FILE_GENERATION="always") def test_always_setting_returns_true(self) -> None: parser = _parser_instance(can_produce=True, requires_rendition=False) - result = _should_produce_archive( + result = should_produce_archive( parser, "application/pdf", Path("/tmp/doc.pdf"), @@ -78,7 +78,7 @@ class TestShouldProduceArchive: def test_requires_pdf_rendition_overrides_never(self) -> None: """requires_pdf_rendition=True forces archive even when setting is never.""" parser = _parser_instance(can_produce=True, requires_rendition=True) - result = _should_produce_archive( + result = should_produce_archive( parser, "application/pdf", Path("/tmp/doc.pdf"), @@ -89,14 +89,14 @@ class TestShouldProduceArchive: def test_cannot_produce_archive_overrides_always(self) -> None: """can_produce_archive=False prevents archive even when setting is always.""" parser = _parser_instance(can_produce=False, requires_rendition=False) - result = _should_produce_archive(parser, "text/plain", Path("/tmp/doc.txt")) + result = should_produce_archive(parser, "text/plain", Path("/tmp/doc.txt")) assert result is False @override_settings(ARCHIVE_FILE_GENERATION="auto") def test_auto_image_returns_true(self) -> None: """auto mode: image/* MIME types always produce archive (scanned doc).""" parser = _parser_instance(can_produce=True, requires_rendition=False) - result = _should_produce_archive(parser, "image/tiff", Path("/tmp/scan.tiff")) + result = should_produce_archive(parser, "image/tiff", Path("/tmp/scan.tiff")) assert result is True @override_settings(ARCHIVE_FILE_GENERATION="auto") @@ -105,10 +105,10 @@ class TestShouldProduceArchive: parser = _parser_instance(can_produce=True, requires_rendition=False) long_text = "This is a born-digital PDF with lots of text content. " * 10 with patch( - "documents.consumer._extract_text_for_archive_check", + "documents.consumer.extract_pdf_text", return_value=long_text, ): - result = _should_produce_archive( + result = should_produce_archive( parser, "application/pdf", Path("/tmp/doc.pdf"), @@ -120,10 +120,10 @@ class TestShouldProduceArchive: """auto mode: PDF where pdftotext returns None (scanned) produces archive.""" parser = _parser_instance(can_produce=True, requires_rendition=False) with patch( - "documents.consumer._extract_text_for_archive_check", + "documents.consumer.extract_pdf_text", return_value=None, ): - result = _should_produce_archive( + result = should_produce_archive( parser, "application/pdf", Path("/tmp/scan.pdf"), @@ -135,10 +135,10 @@ class TestShouldProduceArchive: """auto mode: PDF with very short text (<=50 chars) is treated as scanned.""" parser = _parser_instance(can_produce=True, requires_rendition=False) with patch( - "documents.consumer._extract_text_for_archive_check", + "documents.consumer.extract_pdf_text", return_value="tiny", ): - result = _should_produce_archive( + result = should_produce_archive( parser, "application/pdf", Path("/tmp/scan.pdf"), @@ -149,7 +149,7 @@ class TestShouldProduceArchive: def test_auto_non_pdf_non_image_returns_false(self) -> None: """auto mode: other MIME types (e.g. email) don't produce archive by default.""" parser = _parser_instance(can_produce=True, requires_rendition=False) - result = _should_produce_archive( + result = should_produce_archive( parser, "message/rfc822", Path("/tmp/email.eml"), @@ -160,7 +160,7 @@ class TestShouldProduceArchive: def test_requires_rendition_with_can_produce_false_returns_true(self) -> None: """requires_pdf_rendition=True always wins, even if can_produce_archive=False.""" parser = _parser_instance(can_produce=False, requires_rendition=True) - result = _should_produce_archive( + result = should_produce_archive( parser, "application/pdf", Path("/tmp/doc.pdf"), diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index 7719d21dd..1624b9a46 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -27,7 +27,10 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf" @pytest.mark.management -@override_settings(FILENAME_FORMAT="{correspondent}/{title}") +@override_settings( + FILENAME_FORMAT="{correspondent}/{title}", + ARCHIVE_FILE_GENERATION="always", +) class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def make_models(self): return Document.objects.create( diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index 37f1e6fed..5d16146f7 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -232,6 +232,7 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertEqual(Document.global_objects.count(), 0) +@override_settings(ARCHIVE_FILE_GENERATION="always") class TestUpdateContent(DirectoriesMixin, TestCase): def test_update_content_maybe_archive_file(self) -> None: """ diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index 5c344fe9f..566acd92e 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -21,6 +21,8 @@ from documents.utils import run_subprocess from paperless.config import OcrConfig from paperless.models import CleanChoices from paperless.models import ModeChoices +from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH +from paperless.parsers.utils import extract_pdf_text from paperless.parsers.utils import read_file_handle_unicode_errors from paperless.version import __full_version_str__ @@ -250,36 +252,7 @@ class RasterisedDocumentParser: if not Path(pdf_file).is_file(): return None - try: - text = None - with tempfile.NamedTemporaryFile( - mode="w+", - dir=self.tempdir, - ) as tmp: - run_subprocess( - [ - "pdftotext", - "-q", - "-layout", - "-enc", - "UTF-8", - str(pdf_file), - tmp.name, - ], - logger=self.log, - ) - text = read_file_handle_unicode_errors(Path(tmp.name)) - - return post_process_text(text) - - except Exception: - # If pdftotext fails, fall back to OCR. - self.log.warning( - "Error while getting text from PDF document with pdftotext", - exc_info=True, - ) - # probably not a PDF file. - return None + return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log)) def construct_ocrmypdf_parameters( self, @@ -465,12 +438,11 @@ class RasterisedDocumentParser: ) -> None: # This forces tesseract to use one core per page. os.environ["OMP_THREAD_LIMIT"] = "1" - VALID_TEXT_LENGTH = 50 if mime_type == "application/pdf": text_original = self.extract_text(None, document_path) original_has_text = ( - text_original is not None and len(text_original) > VALID_TEXT_LENGTH + text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH ) else: text_original = None @@ -612,6 +584,8 @@ class RasterisedDocumentParser: sidecar_file_fallback, archive_path_fallback, ) + if produce_archive: + self.archive_path = archive_path_fallback except Exception as e: raise ParseError(f"{e.__class__.__name__}: {e!s}") from e diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py index 2e6ce7061..68dded3b7 100644 --- a/src/paperless/parsers/utils.py +++ b/src/paperless/parsers/utils.py @@ -10,15 +10,65 @@ from __future__ import annotations import logging import re +import tempfile +from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: - from pathlib import Path - from paperless.parsers import MetadataEntry logger = logging.getLogger("paperless.parsers.utils") +# Minimum character count for a PDF to be considered "born-digital" (has real text). +# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision). +PDF_TEXT_MIN_LENGTH = 50 + + +def extract_pdf_text( + path: Path, + log: logging.Logger | None = None, +) -> str | None: + """Run pdftotext on *path* and return the extracted text, or None on failure. + + Parameters + ---------- + path: + Absolute path to the PDF file. + log: + Logger for warnings. Falls back to the module-level logger when omitted. + + Returns + ------- + str | None + Extracted text, or ``None`` if pdftotext fails or the file is not a PDF. + """ + from documents.utils import run_subprocess + + _log = log or logger + try: + with tempfile.TemporaryDirectory() as tmpdir: + out_path = Path(tmpdir) / "text.txt" + run_subprocess( + [ + "pdftotext", + "-q", + "-layout", + "-enc", + "UTF-8", + str(path), + str(out_path), + ], + logger=_log, + ) + text = read_file_handle_unicode_errors(out_path, log=_log) + return text or None + except Exception: + _log.warning( + "Error while getting text from PDF document with pdftotext", + exc_info=True, + ) + return None + def read_file_handle_unicode_errors( filepath: Path, diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py index bae3ca58b..565361185 100644 --- a/src/paperless/tests/parsers/test_tesseract_parser.py +++ b/src/paperless/tests/parsers/test_tesseract_parser.py @@ -408,7 +408,7 @@ class TestParsePdf: ["Please enter your name in here:", "This is a PDF document with a form."], ) - def test_with_form_redo_produces_no_archive( + def test_with_form_redo_no_archive_when_not_requested( self, tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, @@ -417,6 +417,7 @@ class TestParsePdf: tesseract_parser.parse( tesseract_samples_dir / "with-form.pdf", "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None assert_ordered_substrings(