Add debug-level logging explaining why an archive is or is not produced and why OCR is run or skipped

2026-07-02 02:04:19 +00:00 · 2026-04-03 09:16:00 -07:00
parent 8115332cc9
commit 91c77c42f0
2 changed files with 57 additions and 1 deletions
@@ -1,4 +1,5 @@
 import datetime
+import logging
 import os
 import shutil
 import tempfile
@@ -114,6 +115,7 @@ def should_produce_archive(
    parser: "ParserProtocol",
    mime_type: str,
    document_path: Path,
+    log: logging.Logger | None = None,
 ) -> bool:
    """Return True if a PDF/A archive should be produced for this document.

@@ -122,29 +124,58 @@ def should_produce_archive(
    ``@property`` methods — accessing them on the class returns the descriptor
    (always truthy).
    """
+    _log = log or logging.getLogger(LOGGING_NAME)
+
    # Must produce a PDF so the frontend can display the original format at all.
    if parser.requires_pdf_rendition:
+        _log.debug("Archive: yes — parser requires PDF rendition for frontend display")
        return True

    # Parser cannot produce an archive (e.g. TextDocumentParser).
    if not parser.can_produce_archive:
+        _log.debug("Archive: no — parser cannot produce archives")
        return False

    generation = OcrConfig().archive_file_generation

    if generation == ArchiveFileGenerationChoices.ALWAYS:
+        _log.debug("Archive: yes — ARCHIVE_FILE_GENERATION=always")
        return True
    if generation == ArchiveFileGenerationChoices.NEVER:
+        _log.debug("Archive: no — ARCHIVE_FILE_GENERATION=never")
        return False

    # auto: produce archives for scanned/image documents; skip for born-digital PDFs.
    if mime_type.startswith("image/"):
+        _log.debug("Archive: yes — image document, ARCHIVE_FILE_GENERATION=auto")
        return True
    if mime_type == "application/pdf":
        if is_tagged_pdf(document_path):
+            _log.debug(
+                "Archive: no — born-digital PDF (structure tags detected),"
+                " ARCHIVE_FILE_GENERATION=auto",
+            )
            return False
        text = extract_pdf_text(document_path)
-        return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
+        if text is None or len(text) <= PDF_TEXT_MIN_LENGTH:
+            _log.debug(
+                "Archive: yes — scanned PDF (text_length=%d ≤ %d),"
+                " ARCHIVE_FILE_GENERATION=auto",
+                len(text) if text else 0,
+                PDF_TEXT_MIN_LENGTH,
+            )
+            return True
+        _log.debug(
+            "Archive: no — born-digital PDF (text_length=%d > %d),"
+            " ARCHIVE_FILE_GENERATION=auto",
+            len(text),
+            PDF_TEXT_MIN_LENGTH,
+        )
+        return False
+    _log.debug(
+        "Archive: no — MIME type %r not eligible for auto archive generation",
+        mime_type,
+    )
    return False


@@ -485,6 +516,7 @@ class ConsumerPlugin(
                        document_parser,
                        mime_type,
                        self.working_copy,
+                        self.log,
                    )
                    document_parser.parse(
                        self.working_copy,
@@ -474,12 +474,28 @@ class RasterisedDocumentParser:
            text_original = None
            original_has_text = False

+        self.log.debug(
+            "Text detection: original_has_text=%s (text_length=%d, mode=%s, produce_archive=%s)",
+            original_has_text,
+            len(text_original) if text_original else 0,
+            self.settings.mode,
+            produce_archive,
+        )
+
        # --- OCR_MODE=off: never invoke OCR engine ---
        if self.settings.mode == ModeChoices.OFF:
            if not produce_archive:
+                self.log.debug(
+                    "OCR: skipped — OCR_MODE=off, no archive requested;"
+                    " returning pdftotext content only",
+                )
                self.text = text_original or ""
                return
            if self.is_image(mime_type):
+                self.log.debug(
+                    "OCR: skipped — OCR_MODE=off, image input;"
+                    " converting to PDF/A without OCR",
+                )
                try:
                    self.archive_path = self._convert_image_to_pdfa(
                        document_path,
@@ -532,6 +548,14 @@ class RasterisedDocumentParser:
        # auto mode with existing text: PDF/A conversion only (no OCR).
        skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text

+        if skip_text:
+            self.log.debug(
+                "OCR strategy: PDF/A conversion only (skip_text)"
+                " — OCR_MODE=auto, document already has text",
+            )
+        else:
+            self.log.debug("OCR strategy: full OCR — OCR_MODE=%s", self.settings.mode)
+
        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,