From e45f81db395f8677cb286ae4dbf8d128a09a3b69 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 26 Mar 2026 15:40:02 -0700
Subject: [PATCH] refactor: consolidate pdftotext utility and archive-decision
 logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add extract_pdf_text() and PDF_TEXT_MIN_LENGTH to paperless/parsers/utils.py,
  eliminating duplicate pdftotext call sites in consumer.py and tesseract.py
- Rename _should_produce_archive → should_produce_archive (now public, imported
  by both consumer.py and tasks.py)
- update_document_content_maybe_archive_file now calls should_produce_archive,
  honouring ARCHIVE_FILE_GENERATION the same as the consumer pipeline
- Fallback OCR path sets archive_path when produce_archive=True; update
  test_with_form_redo_produces_no_archive to use produce_archive=False

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/documents/consumer.py                     | 45 ++++------------
 src/documents/tasks.py                        | 12 ++++-
 src/documents/tests/test_barcodes.py          |  2 +-
 src/documents/tests/test_consumer_archive.py  | 30 +++++------
 src/documents/tests/test_management.py        |  5 +-
 src/documents/tests/test_tasks.py             |  1 +
 src/paperless/parsers/tesseract.py            | 38 +++----------
 src/paperless/parsers/utils.py                | 54 ++++++++++++++++++-
 .../tests/parsers/test_tesseract_parser.py    |  3 +-
 9 files changed, 101 insertions(+), 89 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index c5a4c6204..a80594409 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,6 +1,5 @@
 import datetime
 import hashlib
-import logging
 import os
 import shutil
 import tempfile
@@ -56,6 +55,8 @@ from paperless.models import ArchiveFileGenerationChoices
 from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.registry import get_parser_registry
+from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
+from paperless.parsers.utils import extract_pdf_text
 
 LOGGING_NAME: Final[str] = "paperless.consumer"
 
@@ -108,40 +109,12 @@ class ConsumerStatusShortMessage(StrEnum):
     FAILED = "failed"
 
 
-_VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK = 50
-
-
-def _extract_text_for_archive_check(path: Path) -> str | None:
-    """Run pdftotext on *path* and return the text, or None on any failure.
-
-    Used only for the ARCHIVE_FILE_GENERATION=auto born-digital detection.
-    """
-    try:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            out_path = Path(tmpdir) / "text.txt"
-            run_subprocess(
-                [
-                    "pdftotext",
-                    "-q",
-                    "-layout",
-                    "-enc",
-                    "UTF-8",
-                    str(path),
-                    str(out_path),
-                ],
-                logger=logging.getLogger(__name__),
-            )
-            return out_path.read_text(encoding="utf-8", errors="replace") or None
-    except Exception:
-        return None
-
-
-def _should_produce_archive(
+def should_produce_archive(
     parser: "ParserProtocol",
     mime_type: str,
-    working_copy: Path,
+    document_path: Path,
 ) -> bool:
-    """Return True if the consumer should request a PDF/A archive from the parser.
+    """Return True if a PDF/A archive should be produced for this document.
 
     IMPORTANT: *parser* must be an instantiated parser, not the class.
     ``requires_pdf_rendition`` and ``can_produce_archive`` are instance
@@ -167,8 +140,8 @@ def _should_produce_archive(
     if mime_type.startswith("image/"):
         return True
     if mime_type == "application/pdf":
-        text = _extract_text_for_archive_check(working_copy)
-        return text is None or len(text) <= _VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK
+        text = extract_pdf_text(document_path)
+        return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
     return False
 
 
@@ -507,7 +480,7 @@ class ConsumerPlugin(
                     )
                     self.log.debug(f"Parsing {self.filename}...")
 
-                    should_produce_archive = _should_produce_archive(
+                    produce_archive = should_produce_archive(
                         document_parser,
                         mime_type,
                         self.working_copy,
@@ -515,7 +488,7 @@ class ConsumerPlugin(
                     document_parser.parse(
                         self.working_copy,
                         mime_type,
-                        produce_archive=should_produce_archive,
+                        produce_archive=produce_archive,
                     )
 
                     self.log.debug(f"Generating thumbnail for {self.filename}...")
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 20351a49d..8b5db781c 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -35,6 +35,7 @@ from documents.consumer import AsnCheckPlugin
 from documents.consumer import ConsumerPlugin
 from documents.consumer import ConsumerPreflightPlugin
 from documents.consumer import WorkflowTriggerPlugin
+from documents.consumer import should_produce_archive
 from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
 from documents.double_sided import CollatePlugin
@@ -321,7 +322,16 @@ def update_document_content_maybe_archive_file(document_id) -> None:
         parser.configure(ParserContext())
 
         try:
-            parser.parse(document.source_path, mime_type)
+            produce_archive = should_produce_archive(
+                parser,
+                mime_type,
+                document.source_path,
+            )
+            parser.parse(
+                document.source_path,
+                mime_type,
+                produce_archive=produce_archive,
+            )
 
             thumbnail = parser.get_thumbnail(document.source_path, mime_type)
 
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py
index b1847f2b4..4d8da62a3 100644
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes
         CONSUMER_TAG_BARCODE_SPLIT=True,
         CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"},
         CELERY_TASK_ALWAYS_EAGER=True,
-        OCR_MODE="skip",
+        OCR_MODE="auto",
     )
     def test_consume_barcode_file_tag_split_and_assignment(self) -> None:
         """
diff --git a/src/documents/tests/test_consumer_archive.py b/src/documents/tests/test_consumer_archive.py
index e3e829685..0af416322 100644
--- a/src/documents/tests/test_consumer_archive.py
+++ b/src/documents/tests/test_consumer_archive.py
@@ -1,4 +1,4 @@
-"""Tests for _should_produce_archive() and _extract_text_for_archive_check()."""
+"""Tests for should_produce_archive()."""
 
 from __future__ import annotations
 
@@ -9,7 +9,7 @@ from unittest.mock import patch
 import pytest
 from django.test import override_settings
 
-from documents.consumer import _should_produce_archive
+from documents.consumer import should_produce_archive
 
 
 def _parser_instance(
@@ -57,7 +57,7 @@ class TestShouldProduceArchive:
     @override_settings(ARCHIVE_FILE_GENERATION="never")
     def test_never_setting_returns_false(self) -> None:
         parser = _parser_instance(can_produce=True, requires_rendition=False)
-        result = _should_produce_archive(
+        result = should_produce_archive(
             parser,
             "application/pdf",
             Path("/tmp/doc.pdf"),
@@ -67,7 +67,7 @@ class TestShouldProduceArchive:
     @override_settings(ARCHIVE_FILE_GENERATION="always")
     def test_always_setting_returns_true(self) -> None:
         parser = _parser_instance(can_produce=True, requires_rendition=False)
-        result = _should_produce_archive(
+        result = should_produce_archive(
             parser,
             "application/pdf",
             Path("/tmp/doc.pdf"),
@@ -78,7 +78,7 @@ class TestShouldProduceArchive:
     def test_requires_pdf_rendition_overrides_never(self) -> None:
         """requires_pdf_rendition=True forces archive even when setting is never."""
         parser = _parser_instance(can_produce=True, requires_rendition=True)
-        result = _should_produce_archive(
+        result = should_produce_archive(
             parser,
             "application/pdf",
             Path("/tmp/doc.pdf"),
@@ -89,14 +89,14 @@ class TestShouldProduceArchive:
     def test_cannot_produce_archive_overrides_always(self) -> None:
         """can_produce_archive=False prevents archive even when setting is always."""
         parser = _parser_instance(can_produce=False, requires_rendition=False)
-        result = _should_produce_archive(parser, "text/plain", Path("/tmp/doc.txt"))
+        result = should_produce_archive(parser, "text/plain", Path("/tmp/doc.txt"))
         assert result is False
 
     @override_settings(ARCHIVE_FILE_GENERATION="auto")
     def test_auto_image_returns_true(self) -> None:
         """auto mode: image/* MIME types always produce archive (scanned doc)."""
         parser = _parser_instance(can_produce=True, requires_rendition=False)
-        result = _should_produce_archive(parser, "image/tiff", Path("/tmp/scan.tiff"))
+        result = should_produce_archive(parser, "image/tiff", Path("/tmp/scan.tiff"))
         assert result is True
 
     @override_settings(ARCHIVE_FILE_GENERATION="auto")
@@ -105,10 +105,10 @@ class TestShouldProduceArchive:
         parser = _parser_instance(can_produce=True, requires_rendition=False)
         long_text = "This is a born-digital PDF with lots of text content. " * 10
         with patch(
-            "documents.consumer._extract_text_for_archive_check",
+            "documents.consumer.extract_pdf_text",
             return_value=long_text,
         ):
-            result = _should_produce_archive(
+            result = should_produce_archive(
                 parser,
                 "application/pdf",
                 Path("/tmp/doc.pdf"),
@@ -120,10 +120,10 @@ class TestShouldProduceArchive:
         """auto mode: PDF where pdftotext returns None (scanned) produces archive."""
         parser = _parser_instance(can_produce=True, requires_rendition=False)
         with patch(
-            "documents.consumer._extract_text_for_archive_check",
+            "documents.consumer.extract_pdf_text",
             return_value=None,
         ):
-            result = _should_produce_archive(
+            result = should_produce_archive(
                 parser,
                 "application/pdf",
                 Path("/tmp/scan.pdf"),
@@ -135,10 +135,10 @@ class TestShouldProduceArchive:
         """auto mode: PDF with very short text (<=50 chars) is treated as scanned."""
         parser = _parser_instance(can_produce=True, requires_rendition=False)
         with patch(
-            "documents.consumer._extract_text_for_archive_check",
+            "documents.consumer.extract_pdf_text",
             return_value="tiny",
         ):
-            result = _should_produce_archive(
+            result = should_produce_archive(
                 parser,
                 "application/pdf",
                 Path("/tmp/scan.pdf"),
@@ -149,7 +149,7 @@ class TestShouldProduceArchive:
     def test_auto_non_pdf_non_image_returns_false(self) -> None:
         """auto mode: other MIME types (e.g. email) don't produce archive by default."""
         parser = _parser_instance(can_produce=True, requires_rendition=False)
-        result = _should_produce_archive(
+        result = should_produce_archive(
             parser,
             "message/rfc822",
             Path("/tmp/email.eml"),
@@ -160,7 +160,7 @@ class TestShouldProduceArchive:
     def test_requires_rendition_with_can_produce_false_returns_true(self) -> None:
         """requires_pdf_rendition=True always wins, even if can_produce_archive=False."""
         parser = _parser_instance(can_produce=False, requires_rendition=True)
-        result = _should_produce_archive(
+        result = should_produce_archive(
             parser,
             "application/pdf",
             Path("/tmp/doc.pdf"),
diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py
index 7719d21dd..1624b9a46 100644
--- a/src/documents/tests/test_management.py
+++ b/src/documents/tests/test_management.py
@@ -27,7 +27,10 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf"
 
 
 @pytest.mark.management
-@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
+@override_settings(
+    FILENAME_FORMAT="{correspondent}/{title}",
+    ARCHIVE_FILE_GENERATION="always",
+)
 class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
     def make_models(self):
         return Document.objects.create(
diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py
index 37f1e6fed..5d16146f7 100644
--- a/src/documents/tests/test_tasks.py
+++ b/src/documents/tests/test_tasks.py
@@ -232,6 +232,7 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         self.assertEqual(Document.global_objects.count(), 0)
 
 
+@override_settings(ARCHIVE_FILE_GENERATION="always")
 class TestUpdateContent(DirectoriesMixin, TestCase):
     def test_update_content_maybe_archive_file(self) -> None:
         """
diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py
index 5c344fe9f..566acd92e 100644
--- a/src/paperless/parsers/tesseract.py
+++ b/src/paperless/parsers/tesseract.py
@@ -21,6 +21,8 @@ from documents.utils import run_subprocess
 from paperless.config import OcrConfig
 from paperless.models import CleanChoices
 from paperless.models import ModeChoices
+from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH
+from paperless.parsers.utils import extract_pdf_text
 from paperless.parsers.utils import read_file_handle_unicode_errors
 from paperless.version import __full_version_str__
 
@@ -250,36 +252,7 @@ class RasterisedDocumentParser:
         if not Path(pdf_file).is_file():
             return None
 
-        try:
-            text = None
-            with tempfile.NamedTemporaryFile(
-                mode="w+",
-                dir=self.tempdir,
-            ) as tmp:
-                run_subprocess(
-                    [
-                        "pdftotext",
-                        "-q",
-                        "-layout",
-                        "-enc",
-                        "UTF-8",
-                        str(pdf_file),
-                        tmp.name,
-                    ],
-                    logger=self.log,
-                )
-                text = read_file_handle_unicode_errors(Path(tmp.name))
-
-            return post_process_text(text)
-
-        except Exception:
-            #  If pdftotext fails, fall back to OCR.
-            self.log.warning(
-                "Error while getting text from PDF document with pdftotext",
-                exc_info=True,
-            )
-            # probably not a PDF file.
-            return None
+        return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log))
 
     def construct_ocrmypdf_parameters(
         self,
@@ -465,12 +438,11 @@ class RasterisedDocumentParser:
     ) -> None:
         # This forces tesseract to use one core per page.
         os.environ["OMP_THREAD_LIMIT"] = "1"
-        VALID_TEXT_LENGTH = 50
 
         if mime_type == "application/pdf":
             text_original = self.extract_text(None, document_path)
             original_has_text = (
-                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
+                text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH
             )
         else:
             text_original = None
@@ -612,6 +584,8 @@ class RasterisedDocumentParser:
                     sidecar_file_fallback,
                     archive_path_fallback,
                 )
+                if produce_archive:
+                    self.archive_path = archive_path_fallback
             except Exception as e:
                 raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
 
diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py
index 2e6ce7061..68dded3b7 100644
--- a/src/paperless/parsers/utils.py
+++ b/src/paperless/parsers/utils.py
@@ -10,15 +10,65 @@ from __future__ import annotations
 
 import logging
 import re
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from pathlib import Path
-
     from paperless.parsers import MetadataEntry
 
 logger = logging.getLogger("paperless.parsers.utils")
 
+# Minimum character count for a PDF to be considered "born-digital" (has real text).
+# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision).
+PDF_TEXT_MIN_LENGTH = 50
+
+
+def extract_pdf_text(
+    path: Path,
+    log: logging.Logger | None = None,
+) -> str | None:
+    """Run pdftotext on *path* and return the extracted text, or None on failure.
+
+    Parameters
+    ----------
+    path:
+        Absolute path to the PDF file.
+    log:
+        Logger for warnings.  Falls back to the module-level logger when omitted.
+
+    Returns
+    -------
+    str | None
+        Extracted text, or ``None`` if pdftotext fails or the file is not a PDF.
+    """
+    from documents.utils import run_subprocess
+
+    _log = log or logger
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            out_path = Path(tmpdir) / "text.txt"
+            run_subprocess(
+                [
+                    "pdftotext",
+                    "-q",
+                    "-layout",
+                    "-enc",
+                    "UTF-8",
+                    str(path),
+                    str(out_path),
+                ],
+                logger=_log,
+            )
+            text = read_file_handle_unicode_errors(out_path, log=_log)
+            return text or None
+    except Exception:
+        _log.warning(
+            "Error while getting text from PDF document with pdftotext",
+            exc_info=True,
+        )
+        return None
+
 
 def read_file_handle_unicode_errors(
     filepath: Path,
diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py
index bae3ca58b..565361185 100644
--- a/src/paperless/tests/parsers/test_tesseract_parser.py
+++ b/src/paperless/tests/parsers/test_tesseract_parser.py
@@ -408,7 +408,7 @@ class TestParsePdf:
             ["Please enter your name in here:", "This is a PDF document with a form."],
         )
 
-    def test_with_form_redo_produces_no_archive(
+    def test_with_form_redo_no_archive_when_not_requested(
         self,
         tesseract_parser: RasterisedDocumentParser,
         tesseract_samples_dir: Path,
@@ -417,6 +417,7 @@ class TestParsePdf:
         tesseract_parser.parse(
             tesseract_samples_dir / "with-form.pdf",
             "application/pdf",
+            produce_archive=False,
         )
         assert tesseract_parser.archive_path is None
         assert_ordered_substrings(