diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 14f7904a7..424e22ce2 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,4 +1,5 @@ import datetime +import logging import os import shutil import tempfile @@ -114,6 +115,7 @@ def should_produce_archive( parser: "ParserProtocol", mime_type: str, document_path: Path, + log: logging.Logger | None = None, ) -> bool: """Return True if a PDF/A archive should be produced for this document. @@ -122,29 +124,58 @@ def should_produce_archive( ``@property`` methods — accessing them on the class returns the descriptor (always truthy). """ + _log = log or logging.getLogger(LOGGING_NAME) + # Must produce a PDF so the frontend can display the original format at all. if parser.requires_pdf_rendition: + _log.debug("Archive: yes — parser requires PDF rendition for frontend display") return True # Parser cannot produce an archive (e.g. TextDocumentParser). if not parser.can_produce_archive: + _log.debug("Archive: no — parser cannot produce archives") return False generation = OcrConfig().archive_file_generation if generation == ArchiveFileGenerationChoices.ALWAYS: + _log.debug("Archive: yes — ARCHIVE_FILE_GENERATION=always") return True if generation == ArchiveFileGenerationChoices.NEVER: + _log.debug("Archive: no — ARCHIVE_FILE_GENERATION=never") return False # auto: produce archives for scanned/image documents; skip for born-digital PDFs. if mime_type.startswith("image/"): + _log.debug("Archive: yes — image document, ARCHIVE_FILE_GENERATION=auto") return True if mime_type == "application/pdf": if is_tagged_pdf(document_path): + _log.debug( + "Archive: no — born-digital PDF (structure tags detected)," + " ARCHIVE_FILE_GENERATION=auto", + ) return False text = extract_pdf_text(document_path) - return text is None or len(text) <= PDF_TEXT_MIN_LENGTH + if text is None or len(text) <= PDF_TEXT_MIN_LENGTH: + _log.debug( + "Archive: yes — scanned PDF (text_length=%d ≤ %d)," + " ARCHIVE_FILE_GENERATION=auto", + len(text) if text else 0, + PDF_TEXT_MIN_LENGTH, + ) + return True + _log.debug( + "Archive: no — born-digital PDF (text_length=%d > %d)," + " ARCHIVE_FILE_GENERATION=auto", + len(text), + PDF_TEXT_MIN_LENGTH, + ) + return False + _log.debug( + "Archive: no — MIME type %r not eligible for auto archive generation", + mime_type, + ) return False @@ -485,6 +516,7 @@ class ConsumerPlugin( document_parser, mime_type, self.working_copy, + self.log, ) document_parser.parse( self.working_copy, diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index e1f33c3af..b451520db 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -474,12 +474,28 @@ class RasterisedDocumentParser: text_original = None original_has_text = False + self.log.debug( + "Text detection: original_has_text=%s (text_length=%d, mode=%s, produce_archive=%s)", + original_has_text, + len(text_original) if text_original else 0, + self.settings.mode, + produce_archive, + ) + # --- OCR_MODE=off: never invoke OCR engine --- if self.settings.mode == ModeChoices.OFF: if not produce_archive: + self.log.debug( + "OCR: skipped — OCR_MODE=off, no archive requested;" + " returning pdftotext content only", + ) self.text = text_original or "" return if self.is_image(mime_type): + self.log.debug( + "OCR: skipped — OCR_MODE=off, image input;" + " converting to PDF/A without OCR", + ) try: self.archive_path = self._convert_image_to_pdfa( document_path, @@ -532,6 +548,14 @@ class RasterisedDocumentParser: # auto mode with existing text: PDF/A conversion only (no OCR). skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text + if skip_text: + self.log.debug( + "OCR strategy: PDF/A conversion only (skip_text)" + " — OCR_MODE=auto, document already has text", + ) + else: + self.log.debug("OCR strategy: full OCR — OCR_MODE=%s", self.settings.mode) + args = self.construct_ocrmypdf_parameters( document_path, mime_type,