From dc74e2176f1b559b421c13f1edb09ae4405e71eb Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 26 Mar 2026 14:27:42 -0700
Subject: [PATCH] feat: compute produce_archive from ARCHIVE_FILE_GENERATION,
 pass to parser

Add _extract_text_for_archive_check() and _should_produce_archive() helper
functions to documents/consumer.py. These decide whether the parser should
produce a PDF/A archive, based on the ARCHIVE_FILE_GENERATION setting
(always/never/auto), parser capabilities (can_produce_archive,
requires_pdf_rendition), the MIME type, and, in auto mode, pdftotext-based
born-digital detection.

Update the parse() call site to compute and pass the produce_archive=... kwarg.
Add 10 unit tests in test_consumer_archive.py. In test_consumer.py, update the
mail-parser assertion to expect produce_archive=True, and relax the two
pre-consume tests that asserted run_subprocess call counts, since pdftotext
may now also be invoked during auto-mode archive detection.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/documents/consumer.py                    |  78 ++++++++-
 src/documents/tests/test_consumer.py         |  21 ++-
 src/documents/tests/test_consumer_archive.py | 168 +++++++++++++++++++
 3 files changed, 263 insertions(+), 4 deletions(-)
 create mode 100644 src/documents/tests/test_consumer_archive.py

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 8f7efb0a2..c5a4c6204 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,5 +1,6 @@
 import datetime
 import hashlib
+import logging
 import os
 import shutil
 import tempfile
@@ -50,6 +51,8 @@ from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.config import OcrConfig
+from paperless.models import ArchiveFileGenerationChoices
 from paperless.parsers import ParserContext
 from paperless.parsers import ParserProtocol
 from paperless.parsers.registry import get_parser_registry
@@ -105,6 +108,70 @@ class ConsumerStatusShortMessage(StrEnum):
     FAILED = "failed"
 
 
+_VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK = 50  # max chars of text in a "scanned" PDF
+
+
+def _extract_text_for_archive_check(path: Path) -> str | None:
+    """Return pdftotext's stripped output for *path*, or None if empty or on failure.
+
+    Used for ARCHIVE_FILE_GENERATION=auto; whitespace-only output counts as empty.
+    """
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dest = Path(tmpdir) / "text.txt"
+            run_subprocess(
+                [
+                    "pdftotext",
+                    "-q",
+                    "-layout",
+                    "-enc",
+                    "UTF-8",
+                    str(path),
+                    str(dest),
+                ],
+                logger=logging.getLogger(__name__),
+            )
+            return dest.read_text(encoding="utf-8", errors="replace").strip() or None
+    except Exception:
+        return None
+
+
+def _should_produce_archive(
+    parser: "ParserProtocol",
+    mime_type: str,
+    working_copy: Path,
+) -> bool:
+    """Decide whether the consumer should ask the parser for a PDF/A archive.
+
+    NOTE: pass a parser *instance*, never the parser class. Both
+    ``requires_pdf_rendition`` and ``can_produce_archive`` are ``@property``
+    attributes, so reading them off the class yields the property object,
+    which is always truthy.
+    """
+    # Without a PDF rendition the frontend could not show this format at all.
+    if parser.requires_pdf_rendition:
+        return True
+
+    # Some parsers (e.g. TextDocumentParser) have no archive to offer.
+    if not parser.can_produce_archive:
+        return False
+
+    mode = OcrConfig().archive_file_generation
+    if mode == ArchiveFileGenerationChoices.ALWAYS:
+        return True
+    if mode == ArchiveFileGenerationChoices.NEVER:
+        return False
+
+    # auto: scans and images get an archive, born-digital PDFs do not.
+    if mime_type == "application/pdf":
+        extracted = _extract_text_for_archive_check(working_copy)
+        if extracted is None:
+            return True
+        return len(extracted) <= _VALID_TEXT_LENGTH_FOR_ARCHIVE_CHECK
+    # Every other type is archived only when it is an image.
+    return mime_type.startswith("image/")
+
+
 class ConsumerPluginMixin:
     if TYPE_CHECKING:
         from logging import Logger
@@ -440,7 +507,16 @@ class ConsumerPlugin(
                     )
                     self.log.debug(f"Parsing {self.filename}...")
 
-                    document_parser.parse(self.working_copy, mime_type)
+                    should_produce_archive = _should_produce_archive(
+                        document_parser,
+                        mime_type,
+                        self.working_copy,
+                    )
+                    document_parser.parse(
+                        self.working_copy,
+                        mime_type,
+                        produce_archive=should_produce_archive,
+                    )
 
                     self.log.debug(f"Generating thumbnail for {self.filename}...")
                     self._send_progress(
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index df4c7d9c4..279e4c1b0 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1126,6 +1126,7 @@ class TestConsumer(
             mock_mail_parser_parse.assert_called_once_with(
                 consumer.working_copy,
                 "message/rfc822",
+                produce_archive=True,
             )
 
 
@@ -1273,7 +1274,14 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
     def test_no_pre_consume_script(self, m) -> None:
         with self.get_consumer(self.test_file) as c:
             c.run()
-            m.assert_not_called()
+            # Verify no pre-consume script subprocess was invoked
+            # (run_subprocess may still be called by _extract_text_for_archive_check)
+            script_calls = [
+                call
+                for call in m.call_args_list
+                if call.args and call.args[0] and call.args[0][0] not in ("pdftotext",)
+            ]
+            self.assertEqual(script_calls, [])
 
     @mock.patch("documents.consumer.run_subprocess")
     @override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
@@ -1289,9 +1297,16 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
                 with self.get_consumer(self.test_file) as c:
                     c.run()
 
-                    m.assert_called_once()
+                    self.assertTrue(m.called)
 
-                    args, _ = m.call_args
+                    # Find the call that invoked the pre-consume script
+                    # (run_subprocess may also be called by _extract_text_for_archive_check)
+                    script_call = next(
+                        call
+                        for call in m.call_args_list
+                        if call.args and call.args[0] and call.args[0][0] == script.name
+                    )
+                    args, _ = script_call
 
                     command = args[0]
                     environment = args[1]
diff --git a/src/documents/tests/test_consumer_archive.py b/src/documents/tests/test_consumer_archive.py
new file mode 100644
index 000000000..e3e829685
--- /dev/null
+++ b/src/documents/tests/test_consumer_archive.py
@@ -0,0 +1,168 @@
+"""Tests for _should_produce_archive() (text extraction is patched out)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+import pytest
+from django.test import override_settings
+
+from documents.consumer import _should_produce_archive
+
+
+def _parser_instance(
+    *,
+    can_produce: bool = True,
+    requires_rendition: bool = False,
+) -> MagicMock:
+    """Build a MagicMock standing in for a parser instance with the given flags."""
+    return MagicMock(
+        can_produce_archive=can_produce,
+        requires_pdf_rendition=requires_rendition,
+    )
+
+
+@pytest.fixture()
+def null_app_config(mocker) -> MagicMock:
+    """Mock ApplicationConfiguration, all fields None, to defer to Django settings."""
+    return mocker.MagicMock(
+        output_type=None,
+        pages=None,
+        language=None,
+        mode=None,
+        archive_file_generation=None,  # tests set ARCHIVE_FILE_GENERATION instead
+        image_dpi=None,
+        unpaper_clean=None,
+        deskew=None,
+        rotate_pages=None,
+        rotate_pages_threshold=None,
+        max_image_pixels=None,
+        color_conversion_strategy=None,
+        user_args=None,
+    )
+
+
+@pytest.fixture(autouse=True)
+def patch_app_config(mocker, null_app_config):
+    """Autouse: serve ``null_app_config`` from BaseConfig._get_config_instance."""
+    # Patched by dotted path, so nothing from paperless.config is imported here.
+    target = "paperless.config.BaseConfig._get_config_instance"
+    config_getter = mocker.patch(target)
+    config_getter.return_value = null_app_config
+
+
+class TestShouldProduceArchive:  # exercises documents.consumer._should_produce_archive
+    @override_settings(ARCHIVE_FILE_GENERATION="never")
+    def test_never_setting_returns_false(self) -> None:
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        result = _should_produce_archive(
+            parser,
+            "application/pdf",
+            Path("/tmp/doc.pdf"),
+        )
+        assert result is False  # "never" holds even for a capable parser
+
+    @override_settings(ARCHIVE_FILE_GENERATION="always")
+    def test_always_setting_returns_true(self) -> None:
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        result = _should_produce_archive(
+            parser,
+            "application/pdf",
+            Path("/tmp/doc.pdf"),
+        )
+        assert result is True  # "always" returns before any pdftotext check
+
+    @override_settings(ARCHIVE_FILE_GENERATION="never")
+    def test_requires_pdf_rendition_overrides_never(self) -> None:
+        """A parser that requires a PDF rendition gets an archive despite "never"."""
+        parser = _parser_instance(can_produce=True, requires_rendition=True)
+        result = _should_produce_archive(
+            parser,
+            "application/pdf",
+            Path("/tmp/doc.pdf"),
+        )
+        assert result is True
+
+    @override_settings(ARCHIVE_FILE_GENERATION="always")
+    def test_cannot_produce_archive_overrides_always(self) -> None:
+        """A parser that cannot produce archives is skipped despite "always"."""
+        parser = _parser_instance(can_produce=False, requires_rendition=False)
+        result = _should_produce_archive(parser, "text/plain", Path("/tmp/doc.txt"))
+        assert result is False
+
+    @override_settings(ARCHIVE_FILE_GENERATION="auto")
+    def test_auto_image_returns_true(self) -> None:
+        """auto mode: an image/* MIME type yields an archive, with no text check."""
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        result = _should_produce_archive(parser, "image/tiff", Path("/tmp/scan.tiff"))
+        assert result is True
+
+    @override_settings(ARCHIVE_FILE_GENERATION="auto")
+    def test_auto_born_digital_pdf_returns_false(self) -> None:
+        """auto mode: a PDF whose extracted text is long (born-digital) is skipped."""
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        long_text = "This is a born-digital PDF with lots of text content. " * 10
+        with patch(
+            "documents.consumer._extract_text_for_archive_check",
+            return_value=long_text,
+        ):
+            result = _should_produce_archive(
+                parser,
+                "application/pdf",
+                Path("/tmp/doc.pdf"),
+            )
+        assert result is False
+
+    @override_settings(ARCHIVE_FILE_GENERATION="auto")
+    def test_auto_scanned_pdf_no_text_returns_true(self) -> None:
+        """auto mode: a PDF whose text extraction returns None gets an archive."""
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        with patch(
+            "documents.consumer._extract_text_for_archive_check",
+            return_value=None,
+        ):
+            result = _should_produce_archive(
+                parser,
+                "application/pdf",
+                Path("/tmp/scan.pdf"),
+            )
+        assert result is True
+
+    @override_settings(ARCHIVE_FILE_GENERATION="auto")
+    def test_auto_pdf_short_text_returns_true(self) -> None:
+        """auto mode: a PDF with at most 50 chars of text is handled like a scan."""
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        with patch(
+            "documents.consumer._extract_text_for_archive_check",
+            return_value="tiny",
+        ):
+            result = _should_produce_archive(
+                parser,
+                "application/pdf",
+                Path("/tmp/scan.pdf"),
+            )
+        assert result is True
+
+    @override_settings(ARCHIVE_FILE_GENERATION="auto")
+    def test_auto_non_pdf_non_image_returns_false(self) -> None:
+        """auto mode: types that are neither PDF nor image (e.g. email) are skipped."""
+        parser = _parser_instance(can_produce=True, requires_rendition=False)
+        result = _should_produce_archive(
+            parser,
+            "message/rfc822",
+            Path("/tmp/email.eml"),
+        )
+        assert result is False
+
+    @override_settings(ARCHIVE_FILE_GENERATION="always")
+    def test_requires_rendition_with_can_produce_false_returns_true(self) -> None:
+        """requires_pdf_rendition is checked first, so it beats can_produce_archive."""
+        parser = _parser_instance(can_produce=False, requires_rendition=True)
+        result = _should_produce_archive(
+            parser,
+            "application/pdf",
+            Path("/tmp/doc.pdf"),
+        )
+        assert result is True