mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-12 17:45:24 +00:00
Add debug-level logging explaining why an archive is or is not produced and why OCR is run or skipped
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
@@ -114,6 +115,7 @@ def should_produce_archive(
|
||||
parser: "ParserProtocol",
|
||||
mime_type: str,
|
||||
document_path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> bool:
|
||||
"""Return True if a PDF/A archive should be produced for this document.
|
||||
|
||||
@@ -122,29 +124,58 @@ def should_produce_archive(
|
||||
``@property`` methods — accessing them on the class returns the descriptor
|
||||
(always truthy).
|
||||
"""
|
||||
_log = log or logging.getLogger(LOGGING_NAME)
|
||||
|
||||
# Must produce a PDF so the frontend can display the original format at all.
|
||||
if parser.requires_pdf_rendition:
|
||||
_log.debug("Archive: yes — parser requires PDF rendition for frontend display")
|
||||
return True
|
||||
|
||||
# Parser cannot produce an archive (e.g. TextDocumentParser).
|
||||
if not parser.can_produce_archive:
|
||||
_log.debug("Archive: no — parser cannot produce archives")
|
||||
return False
|
||||
|
||||
generation = OcrConfig().archive_file_generation
|
||||
|
||||
if generation == ArchiveFileGenerationChoices.ALWAYS:
|
||||
_log.debug("Archive: yes — ARCHIVE_FILE_GENERATION=always")
|
||||
return True
|
||||
if generation == ArchiveFileGenerationChoices.NEVER:
|
||||
_log.debug("Archive: no — ARCHIVE_FILE_GENERATION=never")
|
||||
return False
|
||||
|
||||
# auto: produce archives for scanned/image documents; skip for born-digital PDFs.
|
||||
if mime_type.startswith("image/"):
|
||||
_log.debug("Archive: yes — image document, ARCHIVE_FILE_GENERATION=auto")
|
||||
return True
|
||||
if mime_type == "application/pdf":
|
||||
if is_tagged_pdf(document_path):
|
||||
_log.debug(
|
||||
"Archive: no — born-digital PDF (structure tags detected),"
|
||||
" ARCHIVE_FILE_GENERATION=auto",
|
||||
)
|
||||
return False
|
||||
text = extract_pdf_text(document_path)
|
||||
return text is None or len(text) <= PDF_TEXT_MIN_LENGTH
|
||||
if text is None or len(text) <= PDF_TEXT_MIN_LENGTH:
|
||||
_log.debug(
|
||||
"Archive: yes — scanned PDF (text_length=%d ≤ %d),"
|
||||
" ARCHIVE_FILE_GENERATION=auto",
|
||||
len(text) if text else 0,
|
||||
PDF_TEXT_MIN_LENGTH,
|
||||
)
|
||||
return True
|
||||
_log.debug(
|
||||
"Archive: no — born-digital PDF (text_length=%d > %d),"
|
||||
" ARCHIVE_FILE_GENERATION=auto",
|
||||
len(text),
|
||||
PDF_TEXT_MIN_LENGTH,
|
||||
)
|
||||
return False
|
||||
_log.debug(
|
||||
"Archive: no — MIME type %r not eligible for auto archive generation",
|
||||
mime_type,
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
@@ -485,6 +516,7 @@ class ConsumerPlugin(
|
||||
document_parser,
|
||||
mime_type,
|
||||
self.working_copy,
|
||||
self.log,
|
||||
)
|
||||
document_parser.parse(
|
||||
self.working_copy,
|
||||
|
||||
@@ -474,12 +474,28 @@ class RasterisedDocumentParser:
|
||||
text_original = None
|
||||
original_has_text = False
|
||||
|
||||
self.log.debug(
|
||||
"Text detection: original_has_text=%s (text_length=%d, mode=%s, produce_archive=%s)",
|
||||
original_has_text,
|
||||
len(text_original) if text_original else 0,
|
||||
self.settings.mode,
|
||||
produce_archive,
|
||||
)
|
||||
|
||||
# --- OCR_MODE=off: never invoke OCR engine ---
|
||||
if self.settings.mode == ModeChoices.OFF:
|
||||
if not produce_archive:
|
||||
self.log.debug(
|
||||
"OCR: skipped — OCR_MODE=off, no archive requested;"
|
||||
" returning pdftotext content only",
|
||||
)
|
||||
self.text = text_original or ""
|
||||
return
|
||||
if self.is_image(mime_type):
|
||||
self.log.debug(
|
||||
"OCR: skipped — OCR_MODE=off, image input;"
|
||||
" converting to PDF/A without OCR",
|
||||
)
|
||||
try:
|
||||
self.archive_path = self._convert_image_to_pdfa(
|
||||
document_path,
|
||||
@@ -532,6 +548,14 @@ class RasterisedDocumentParser:
|
||||
# auto mode with existing text: PDF/A conversion only (no OCR).
|
||||
skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text
|
||||
|
||||
if skip_text:
|
||||
self.log.debug(
|
||||
"OCR strategy: PDF/A conversion only (skip_text)"
|
||||
" — OCR_MODE=auto, document already has text",
|
||||
)
|
||||
else:
|
||||
self.log.debug("OCR strategy: full OCR — OCR_MODE=%s", self.settings.mode)
|
||||
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
document_path,
|
||||
mime_type,
|
||||
|
||||
Reference in New Issue
Block a user