Add debug-level logging explaining why an archive is (or is not) produced and why OCR is run or skipped

This commit is contained in:
Trenton H
2026-04-03 09:16:00 -07:00
parent 8115332cc9
commit 91c77c42f0
2 changed files with 57 additions and 1 deletions
+33 -1
View File
@@ -1,4 +1,5 @@
import datetime
import logging
import os
import shutil
import tempfile
@@ -114,6 +115,7 @@ def should_produce_archive(
parser: "ParserProtocol",
mime_type: str,
document_path: Path,
log: logging.Logger | None = None,
) -> bool:
"""Return True if a PDF/A archive should be produced for this document.
@@ -122,29 +124,58 @@ def should_produce_archive(
``@property`` methods — accessing them on the class returns the descriptor
(always truthy).
"""
_log = log or logging.getLogger(LOGGING_NAME)
# Must produce a PDF so the frontend can display the original format at all.
if parser.requires_pdf_rendition:
_log.debug("Archive: yes — parser requires PDF rendition for frontend display")
return True
# Parser cannot produce an archive (e.g. TextDocumentParser).
if not parser.can_produce_archive:
_log.debug("Archive: no — parser cannot produce archives")
return False
generation = OcrConfig().archive_file_generation
if generation == ArchiveFileGenerationChoices.ALWAYS:
_log.debug("Archive: yes — ARCHIVE_FILE_GENERATION=always")
return True
if generation == ArchiveFileGenerationChoices.NEVER:
_log.debug("Archive: no — ARCHIVE_FILE_GENERATION=never")
return False
# auto: produce archives for scanned/image documents; skip for born-digital PDFs.
if mime_type.startswith("image/"):
_log.debug("Archive: yes — image document, ARCHIVE_FILE_GENERATION=auto")
return True
if mime_type == "application/pdf":
if is_tagged_pdf(document_path):
_log.debug(
"Archive: no — born-digital PDF (structure tags detected),"
" ARCHIVE_FILE_GENERATION=auto",
)
return False
text = extract_pdf_text(document_path)
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
if text is None or len(text) <= PDF_TEXT_MIN_LENGTH:
_log.debug(
"Archive: yes — scanned PDF (text_length=%d <= %d),"
" ARCHIVE_FILE_GENERATION=auto",
len(text) if text else 0,
PDF_TEXT_MIN_LENGTH,
)
return True
_log.debug(
"Archive: no — born-digital PDF (text_length=%d > %d),"
" ARCHIVE_FILE_GENERATION=auto",
len(text),
PDF_TEXT_MIN_LENGTH,
)
return False
_log.debug(
"Archive: no — MIME type %r not eligible for auto archive generation",
mime_type,
)
return False
@@ -485,6 +516,7 @@ class ConsumerPlugin(
document_parser,
mime_type,
self.working_copy,
self.log,
)
document_parser.parse(
self.working_copy,
+24
View File
@@ -474,12 +474,28 @@ class RasterisedDocumentParser:
text_original = None
original_has_text = False
self.log.debug(
"Text detection: original_has_text=%s (text_length=%d, mode=%s, produce_archive=%s)",
original_has_text,
len(text_original) if text_original else 0,
self.settings.mode,
produce_archive,
)
# --- OCR_MODE=off: never invoke OCR engine ---
if self.settings.mode == ModeChoices.OFF:
if not produce_archive:
self.log.debug(
"OCR: skipped — OCR_MODE=off, no archive requested;"
" returning pdftotext content only",
)
self.text = text_original or ""
return
if self.is_image(mime_type):
self.log.debug(
"OCR: skipped — OCR_MODE=off, image input;"
" converting to PDF/A without OCR",
)
try:
self.archive_path = self._convert_image_to_pdfa(
document_path,
@@ -532,6 +548,14 @@ class RasterisedDocumentParser:
# auto mode with existing text: PDF/A conversion only (no OCR).
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
if skip_text:
self.log.debug(
"OCR strategy: PDF/A conversion only (skip_text)"
" — OCR_MODE=auto, document already has text",
)
else:
self.log.debug("OCR strategy: full OCR — OCR_MODE=%s", self.settings.mode)
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,