diff --git a/docs/configuration.md b/docs/configuration.md index 4ce2d9dc6..88fdf8b65 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -801,11 +801,13 @@ parsing documents. #### [`PAPERLESS_OCR_MODE=`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE} -: Tell paperless when and how to perform ocr on your documents. Three -modes are available: +: Tell paperless when and how to perform OCR on your documents. The +following modes are available: - - `skip`: Paperless skips all pages and will perform ocr only on - pages where no text is present. This is the safest option. + - `auto`: Paperless auto-detects whether a document already + contains extractable text using pdftotext. If the extracted + text exceeds a threshold (50 characters), OCR is skipped; + otherwise OCR runs. This is the default. - `redo`: Paperless will OCR all pages of your documents and attempt to replace any existing text layers with new text. This @@ -823,24 +825,46 @@ modes are available: significantly larger and text won't appear as sharp when zoomed in. - The default is `skip`, which only performs OCR when necessary and - always creates archived documents. + - `off`: OCR never runs regardless of input type. Embedded text + is still extracted from PDFs via pdftotext, but images and + scanned PDFs without text layers will have empty content. + Useful for handwritten documents, bulk ingestion of large + archives, or content that OCRs poorly. Archive generation still + works independently when `PAPERLESS_ARCHIVE_FILE_GENERATION` + requests it — a PDF/A can be produced without OCR via format + conversion only. + + Defaults to `auto`. Read more about this in the [OCRmyPDF documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped). 
-#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE} +#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION} -: Specify when you would like paperless to skip creating an archived -version of your documents. This is useful if you don't want to have two -almost-identical versions of your documents in the media folder. +: Controls whether paperless produces a normalized PDF/A archive copy +of each document. This is independent of OCR — a PDF/A can be produced +with or without running OCR. - - `never`: Never skip creating an archived version. - - `with_text`: Skip creating an archived version for documents - that already have embedded text. - - `always`: Always skip creating an archived version. + - `auto`: Produce archives for scanned and image-based documents; + skip for born-digital PDFs. Born-digital is detected by + checking both whether the PDF contains extractable text and + whether it has a logical structure (tag tree), which word + processors and PDF export tools produce. Scanner software that + applies its own OCR typically does not produce tagged PDFs, so + those still receive an archive. - The default is `never`. + - `always`: Always produce a PDF/A archive when the parser + supports it. + + - `never`: Never produce an archive. + + Defaults to `auto`. + + !!! note + + Parsers that must produce a PDF for the frontend to display the + document (e.g. the Tika parser for Office documents) always + produce a PDF rendition regardless of this setting. 
#### [`PAPERLESS_OCR_CLEAN=`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN} diff --git a/docs/migration-v3.md b/docs/migration-v3.md index 4c728a6a4..437e5e1aa 100644 --- a/docs/migration-v3.md +++ b/docs/migration-v3.md @@ -130,3 +130,21 @@ For example: } } ``` + +## OCR and Archive Settings Changes + +The `PAPERLESS_OCR_MODE` values `skip` and `skip_noarchive` have been replaced by +[`PAPERLESS_OCR_MODE=auto`](configuration.md#PAPERLESS_OCR_MODE). Archive file +generation is now controlled by the separate +[`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) +setting, replacing `PAPERLESS_OCR_SKIP_ARCHIVE_FILE`. + +### Summary + +| Old Setting | New Setting | +| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `PAPERLESS_OCR_MODE=skip` | [`PAPERLESS_OCR_MODE=auto`](configuration.md#PAPERLESS_OCR_MODE) (now the default) | +| `PAPERLESS_OCR_MODE=skip_noarchive` | [`PAPERLESS_OCR_MODE=auto`](configuration.md#PAPERLESS_OCR_MODE) + [`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) | +| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never` | [`PAPERLESS_ARCHIVE_FILE_GENERATION=always`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) | +| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | [`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) | +| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` | [`PAPERLESS_ARCHIVE_FILE_GENERATION=never`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) | diff --git a/src/documents/tests/test_archive_preference.py b/src/documents/tests/test_archive_preference.py new file mode 100644 index 000000000..a27330779 --- /dev/null +++ b/src/documents/tests/test_archive_preference.py @@ -0,0 +1,257 @@ +""" +Tests for documents.parsers.resolve_archive_preference function and related 
logic. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING +from unittest.mock import Mock + +import pytest + +if TYPE_CHECKING: + from pytest_django.fixtures import SettingsWrapper + from pytest_mock import MockerFixture + +from documents.parsers import _should_produce_archive_for_pdf +from documents.parsers import resolve_archive_preference +from paperless.models import ArchiveFileGenerationChoices + + +class TestResolveArchivePreference: + """Test the resolve_archive_preference function.""" + + @pytest.mark.parametrize( + ("archive_setting", "can_produce_archive", "expected"), + [ + pytest.param( + ArchiveFileGenerationChoices.ALWAYS, + True, + True, + id="always-capable-parser", + ), + pytest.param( + ArchiveFileGenerationChoices.ALWAYS, + False, + False, + id="always-incapable-parser", + ), + pytest.param( + ArchiveFileGenerationChoices.NEVER, + True, + False, + id="never-capable-parser", + ), + pytest.param( + ArchiveFileGenerationChoices.NEVER, + False, + False, + id="never-incapable-parser", + ), + ], + ) + def test_archive_generation_setting_behavior( + self, + settings: SettingsWrapper, + archive_setting: ArchiveFileGenerationChoices, + can_produce_archive: bool, # noqa: FBT001 + expected: bool, # noqa: FBT001 + ) -> None: + """Test archive generation setting behavior for always/never modes.""" + settings.ARCHIVE_FILE_GENERATION = archive_setting + + result = resolve_archive_preference( + "application/pdf", + Path("/fake/path.pdf"), + can_produce_archive=can_produce_archive, + ) + + assert result is expected + + def test_auto_mode_non_pdf_returns_true( + self, + settings: SettingsWrapper, + ) -> None: + """ + GIVEN: + - ARCHIVE_FILE_GENERATION=auto + - Non-PDF mime type + - can_produce_archive=True + WHEN: + - resolve_archive_preference is called + THEN: + - Returns True (images always need archive) + """ + settings.ARCHIVE_FILE_GENERATION = ArchiveFileGenerationChoices.AUTO + + result = 
resolve_archive_preference( + "image/jpeg", + Path("/fake/path.jpg"), + can_produce_archive=True, + ) + + assert result is True + + def test_auto_mode_pdf_delegates_to_heuristic( + self, + settings: SettingsWrapper, + mocker: MockerFixture, + ) -> None: + """ + GIVEN: + - ARCHIVE_FILE_GENERATION=auto + - PDF mime type + - can_produce_archive=True + WHEN: + - resolve_archive_preference is called + THEN: + - Delegates to _should_produce_archive_for_pdf + """ + settings.ARCHIVE_FILE_GENERATION = ArchiveFileGenerationChoices.AUTO + mock_heuristic = mocker.patch( + "documents.parsers._should_produce_archive_for_pdf", + return_value=True, + ) + fake_path = Path("/fake/path.pdf") + + result = resolve_archive_preference( + "application/pdf", + fake_path, + can_produce_archive=True, + ) + + mock_heuristic.assert_called_once_with(fake_path) + assert result is True + + +class TestShouldProduceArchiveForPdf: + """Test the _should_produce_archive_for_pdf heuristic function.""" + + @pytest.mark.parametrize( + ("text_content", "has_struct_tree", "is_marked", "expected"), + [ + pytest.param( + "This is a long text content that is definitely longer than fifty characters", + True, + False, + False, + id="tagged-with-struct-tree", + ), + pytest.param( + "This is a long text content that is definitely longer than fifty characters", + False, + True, + False, + id="tagged-with-mark-info", + ), + pytest.param( + "This is a long text content that is definitely longer than fifty characters", + False, + False, + True, + id="untagged-with-text", + ), + pytest.param( + "Short text", + True, + False, + True, + id="little-text-tagged", + ), + pytest.param( + "Short text", + False, + False, + True, + id="little-text-untagged", + ), + pytest.param( + "", + False, + False, + True, + id="no-text", + ), + ], + ) + def test_pdf_heuristic_logic( + self, + mocker: MockerFixture, + text_content: str, + has_struct_tree: bool, # noqa: FBT001 + is_marked: bool, # noqa: FBT001 + expected: bool, # noqa: 
FBT001 + ) -> None: + """Test the PDF heuristic with various text and tagging combinations.""" + # Mock text extraction + mocker.patch( + "documents.parsers.run_subprocess", + ) + mocker.patch( + "documents.parsers.read_file_handle_unicode_errors", + return_value=text_content, + ) + + # Mock pikepdf + mock_pdf = Mock() + if has_struct_tree: + mock_pdf.Root.StructTreeRoot = True + else: + del mock_pdf.Root.StructTreeRoot + + mock_pdf.Root.MarkInfo.get.return_value = is_marked + mock_pikepdf = mocker.patch("documents.parsers.pikepdf") + mock_pikepdf.open.return_value.__enter__.return_value = mock_pdf + + result = _should_produce_archive_for_pdf(Path("/fake/path.pdf")) + assert result is expected + + def test_exception_handling_returns_true( + self, + mocker: MockerFixture, + ) -> None: + """ + GIVEN: + - PDF processing raises an exception + WHEN: + - _should_produce_archive_for_pdf is called + THEN: + - Returns True (safe default) + """ + # Mock exception during text processing + mocker.patch( + "documents.parsers.run_subprocess", + side_effect=Exception("Test error"), + ) + + result = _should_produce_archive_for_pdf(Path("/fake/path.pdf")) + assert result is True + + def test_pikepdf_exception_returns_true( + self, + mocker: MockerFixture, + ) -> None: + """ + GIVEN: + - Text extraction succeeds but pikepdf raises exception + WHEN: + - _should_produce_archive_for_pdf is called + THEN: + - Returns True (safe default) + """ + # Mock successful text extraction + mocker.patch("documents.parsers.run_subprocess") + mocker.patch( + "documents.parsers.read_file_handle_unicode_errors", + return_value="This is a long text content that is definitely longer than fifty characters", + ) + + # Mock pikepdf exception + mocker.patch( + "documents.parsers.pikepdf.open", + side_effect=Exception("PDF error"), + ) + + result = _should_produce_archive_for_pdf(Path("/fake/path.pdf")) + assert result is True diff --git a/src/documents/tests/test_consumer.py 
b/src/documents/tests/test_consumer.py index df4c7d9c4..afc2f33e3 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -5,6 +5,7 @@ import tempfile from pathlib import Path from unittest import mock from unittest.mock import MagicMock +from unittest.mock import Mock from django.conf import settings from django.contrib.auth.models import Group @@ -1548,3 +1549,158 @@ class TestBarcodeApplyDetectedASN(TestCase): plugin._apply_detected_asn(123) self.assertEqual(plugin.metadata.asn, 123) + + +# TODO: Convert these tests to pytest style in the future +class TestArchivePreferenceWiring(DirectoriesMixin, GetConsumerMixin, TestCase): + """Test that archive preference settings are properly wired to parser calls.""" + + def setUp(self) -> None: + super().setUp() + # Use simple test file that can be parsed by our test parsers + src = ( + Path(__file__).parent + / "samples" + / "documents" + / "originals" + / "0000005.pdf" + ) + self.test_file = self.dirs.scratch_dir / "sample.pdf" + shutil.copy(src, self.test_file) + + @override_settings(ARCHIVE_FILE_GENERATION="never") + @mock.patch("documents.consumer.get_parser_registry") + def test_never_setting_passes_produce_archive_false(self, mock_registry): + """Test that ARCHIVE_FILE_GENERATION=never passes produce_archive=False to parser.""" + # Mock parser to track produce_archive parameter + from unittest.mock import MagicMock + + mock_parser_instance = Mock() + mock_parser_instance.can_produce_archive = True + mock_parser_instance.get_text.return_value = "Test text" + mock_parser_instance.get_archive_path.return_value = None + # Create a temporary thumbnail file for testing + thumbnail_path = self.dirs.scratch_dir / "thumbnail.webp" + thumbnail_path.write_bytes(b"fake_thumbnail_data") + mock_parser_instance.get_thumbnail.return_value = thumbnail_path + mock_parser_instance.get_date.return_value = None + mock_parser_instance.get_page_count.return_value = 1 + 
mock_parser_instance.extract_metadata.return_value = [] + + # Use MagicMock to properly support context manager protocol + mock_parser_class = MagicMock() + mock_parser_class.return_value.__enter__ = Mock( + return_value=mock_parser_instance, + ) + mock_parser_class.return_value.__exit__ = Mock(return_value=None) + mock_parser_class.can_produce_archive = True + mock_parser_class.requires_pdf_rendition = False + + mock_registry_instance = Mock() + mock_registry_instance.get_parser_for_file.return_value = mock_parser_class + mock_registry.return_value = mock_registry_instance + + with self.get_consumer(self.test_file) as consumer: + consumer.run() + + # Verify parse was called with produce_archive=False + mock_parser_instance.parse.assert_called_once() + call_args = mock_parser_instance.parse.call_args + self.assertEqual(call_args.kwargs["produce_archive"], False) + + @override_settings(ARCHIVE_FILE_GENERATION="always") + @mock.patch("documents.consumer.get_parser_registry") + def test_always_setting_passes_produce_archive_true(self, mock_registry): + """Test that ARCHIVE_FILE_GENERATION=always passes produce_archive=True to parser.""" + # Mock parser to track produce_archive parameter + from unittest.mock import MagicMock + + mock_parser_instance = Mock() + mock_parser_instance.can_produce_archive = True + mock_parser_instance.get_text.return_value = "Test text" + mock_parser_instance.get_archive_path.return_value = ( + self.test_file + ) # Fake archive + # Create a temporary thumbnail file for testing + thumbnail_path = self.dirs.scratch_dir / "thumbnail.webp" + thumbnail_path.write_bytes(b"fake_thumbnail_data") + mock_parser_instance.get_thumbnail.return_value = thumbnail_path + mock_parser_instance.get_date.return_value = None + mock_parser_instance.get_page_count.return_value = 1 + mock_parser_instance.extract_metadata.return_value = [] + + # Use MagicMock to properly support context manager protocol + mock_parser_class = MagicMock() + 
mock_parser_class.return_value.__enter__ = Mock( + return_value=mock_parser_instance, + ) + mock_parser_class.return_value.__exit__ = Mock(return_value=None) + mock_parser_class.can_produce_archive = True + mock_parser_class.requires_pdf_rendition = False + + mock_registry_instance = Mock() + mock_registry_instance.get_parser_for_file.return_value = mock_parser_class + mock_registry.return_value = mock_registry_instance + + with self.get_consumer(self.test_file) as consumer: + consumer.run() + + # Verify parse was called with produce_archive=True + mock_parser_instance.parse.assert_called_once() + call_args = mock_parser_instance.parse.call_args + self.assertEqual(call_args.kwargs["produce_archive"], True) + + @override_settings(ARCHIVE_FILE_GENERATION="auto") + @mock.patch("documents.consumer.resolve_archive_preference") + @mock.patch("documents.consumer.get_parser_registry") + def test_auto_setting_delegates_to_resolve_archive_preference( + self, + mock_registry, + mock_resolve_preference, + ): + """Test that ARCHIVE_FILE_GENERATION=auto delegates to resolve_archive_preference.""" + mock_resolve_preference.return_value = False + + # Mock parser to track produce_archive parameter + mock_parser_instance = Mock() + mock_parser_instance.can_produce_archive = True + mock_parser_instance.get_text.return_value = "Test text" + mock_parser_instance.get_archive_path.return_value = None + # Create a temporary thumbnail file for testing + thumbnail_path = self.dirs.scratch_dir / "thumbnail.webp" + thumbnail_path.write_bytes(b"fake_thumbnail_data") + mock_parser_instance.get_thumbnail.return_value = thumbnail_path + mock_parser_instance.get_date.return_value = None + mock_parser_instance.get_page_count.return_value = 1 + mock_parser_instance.extract_metadata.return_value = [] + + # Use MagicMock to properly support context manager protocol + from unittest.mock import MagicMock + + mock_parser_class = MagicMock() + mock_parser_class.return_value.__enter__ = Mock( + 
return_value=mock_parser_instance, + ) + mock_parser_class.return_value.__exit__ = Mock(return_value=None) + mock_parser_class.can_produce_archive = True + mock_parser_class.requires_pdf_rendition = False + + mock_registry_instance = Mock() + mock_registry_instance.get_parser_for_file.return_value = mock_parser_class + mock_registry.return_value = mock_registry_instance + + with self.get_consumer(self.test_file) as consumer: + consumer.run() + + # Verify resolve_archive_preference was called with correct parameters + mock_resolve_preference.assert_called_once() + call_args = mock_resolve_preference.call_args + self.assertEqual(call_args.args[0], "application/pdf") + # Path will be working copy (different from original), so check it's a Path to sample.pdf + self.assertEqual(call_args.args[1].name, "sample.pdf") + self.assertEqual(call_args.kwargs["can_produce_archive"], True) + + # Verify parse was called with the result from resolve_archive_preference + mock_parser_instance.parse.assert_called_once() + call_args = mock_parser_instance.parse.call_args + self.assertEqual(call_args.kwargs["produce_archive"], False) diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index 7719d21dd..0fa662766 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -43,6 +43,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): call_command("document_archiver", "--processes", "1", skip_checks=True) + @override_settings(ARCHIVE_FILE_GENERATION="always") def test_handle_document(self) -> None: doc = self.make_models() shutil.copy(sample_file, Path(self.dirs.originals_dir) / f"{doc.id:07}.pdf") @@ -73,7 +74,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertIsNone(doc.archive_filename) self.assertIsFile(doc.source_path) - @override_settings(FILENAME_FORMAT="{title}") + @override_settings(FILENAME_FORMAT="{title}", 
ARCHIVE_FILE_GENERATION="always") def test_naming_priorities(self) -> None: doc1 = Document.objects.create( checksum="A", diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 30963df70..3c89b1b8b 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -1,9 +1,13 @@ +from unittest.mock import Mock +from unittest.mock import patch + from django.test import TestCase from django.test import override_settings from documents.parsers import get_default_file_extension from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported +from documents.parsers import resolve_archive_preference from paperless.parsers.registry import get_parser_registry from paperless.parsers.registry import reset_parser_registry from paperless.parsers.tesseract import RasterisedDocumentParser @@ -111,3 +115,195 @@ class TestParserAvailability(TestCase): self.assertTrue(is_file_ext_supported(".pdf")) self.assertFalse(is_file_ext_supported(".hsdfh")) self.assertFalse(is_file_ext_supported("")) + + +class TestResolveArchivePreference(TestCase): + """Test the resolve_archive_preference function with various settings and file types.""" + + def setUp(self): + """Set up test PDF file for mocking.""" + from pathlib import Path + + self.test_pdf_path = Path("/fake/path/test.pdf") + + @override_settings(ARCHIVE_FILE_GENERATION="always") + def test_always_setting_with_capable_parser(self): + """Test ARCHIVE_FILE_GENERATION=always with parser that can produce archive.""" + result = resolve_archive_preference( + "application/pdf", + self.test_pdf_path, + can_produce_archive=True, + ) + self.assertTrue(result) + + @override_settings(ARCHIVE_FILE_GENERATION="always") + def test_always_setting_with_incapable_parser(self): + """Test ARCHIVE_FILE_GENERATION=always with parser that cannot produce archive.""" + result = resolve_archive_preference( + "application/pdf", + self.test_pdf_path, + 
can_produce_archive=False, + ) + self.assertFalse(result) + + @override_settings(ARCHIVE_FILE_GENERATION="never") + def test_never_setting_regardless_of_parser(self): + """Test ARCHIVE_FILE_GENERATION=never regardless of parser capability.""" + # Test with capable parser + result = resolve_archive_preference( + "application/pdf", + self.test_pdf_path, + can_produce_archive=True, + ) + self.assertFalse(result) + + # Test with incapable parser + result = resolve_archive_preference( + "application/pdf", + self.test_pdf_path, + can_produce_archive=False, + ) + self.assertFalse(result) + + @override_settings(ARCHIVE_FILE_GENERATION="auto") + def test_auto_setting_with_non_pdf_mime_type(self): + """Test ARCHIVE_FILE_GENERATION=auto with non-PDF mime types.""" + # Non-PDF mime types (images etc.) should always produce archive + result = resolve_archive_preference( + "image/jpeg", + self.test_pdf_path, # Path doesn't matter for non-PDF + can_produce_archive=True, + ) + self.assertTrue(result) + + @override_settings(ARCHIVE_FILE_GENERATION="auto") + @patch("documents.parsers._should_produce_archive_for_pdf") + def test_auto_setting_with_pdf_delegates_to_heuristic(self, mock_heuristic): + """Test ARCHIVE_FILE_GENERATION=auto with PDF delegates to heuristic function.""" + mock_heuristic.return_value = False + + result = resolve_archive_preference( + "application/pdf", + self.test_pdf_path, + can_produce_archive=True, + ) + + mock_heuristic.assert_called_once_with(self.test_pdf_path) + self.assertFalse(result) + + # Test with heuristic returning True + mock_heuristic.reset_mock() + mock_heuristic.return_value = True + + result = resolve_archive_preference( + "application/pdf", + self.test_pdf_path, + can_produce_archive=True, + ) + + mock_heuristic.assert_called_once_with(self.test_pdf_path) + self.assertTrue(result) + + @override_settings(ARCHIVE_FILE_GENERATION="auto") + @patch("documents.parsers.run_subprocess") + @patch("documents.parsers.pikepdf.open") + 
@patch("documents.parsers.read_file_handle_unicode_errors") + def test_pdf_heuristic_born_digital_tagged( + self, + mock_read_file, + mock_pikepdf_open, + mock_subprocess, + ): + """Test PDF heuristic detects born-digital tagged PDF (should NOT produce archive).""" + # Mock pdftotext output - lots of text + mock_read_file.return_value = ( + "This is a lot of text content from a born-digital PDF document." + ) + + # Mock pikepdf - tagged PDF + mock_pdf = Mock() + mock_pdf.Root = Mock() + mock_pdf.Root.StructTreeRoot = Mock() # Has structure tree + mock_pikepdf_open.return_value.__enter__.return_value = mock_pdf + + from documents.parsers import _should_produce_archive_for_pdf + + result = _should_produce_archive_for_pdf(self.test_pdf_path) + + self.assertFalse(result) # Born-digital tagged PDF should NOT produce archive + mock_subprocess.assert_called_once() + mock_pikepdf_open.assert_called_once_with(self.test_pdf_path) + + @override_settings(ARCHIVE_FILE_GENERATION="auto") + @patch("documents.parsers.run_subprocess") + @patch("documents.parsers.pikepdf.open") + @patch("documents.parsers.read_file_handle_unicode_errors") + def test_pdf_heuristic_scanner_ocr_untagged( + self, + mock_read_file, + mock_pikepdf_open, + mock_subprocess, + ): + """Test PDF heuristic detects scanner OCR'd untagged PDF (should produce archive).""" + # Mock pdftotext output - lots of text + mock_read_file.return_value = ( + "This is a lot of text content from a scanner OCR'd PDF document." 
+ ) + + # Mock pikepdf - untagged PDF + mock_pdf = Mock() + mock_pdf.Root = Mock() + # No StructTreeRoot and MarkInfo.Marked is False + del mock_pdf.Root.StructTreeRoot # Simulate no attribute + mock_pdf.Root.MarkInfo = Mock() + mock_pdf.Root.MarkInfo.get.return_value = False + mock_pikepdf_open.return_value.__enter__.return_value = mock_pdf + + from documents.parsers import _should_produce_archive_for_pdf + + result = _should_produce_archive_for_pdf(self.test_pdf_path) + + self.assertTrue(result) # Scanner OCR'd PDF should produce archive + mock_subprocess.assert_called_once() + mock_pikepdf_open.assert_called_once_with(self.test_pdf_path) + + @override_settings(ARCHIVE_FILE_GENERATION="auto") + @patch("documents.parsers.run_subprocess") + @patch("documents.parsers.pikepdf.open") + @patch("documents.parsers.read_file_handle_unicode_errors") + def test_pdf_heuristic_raw_scan_no_text( + self, + mock_read_file, + mock_pikepdf_open, + mock_subprocess, + ): + """Test PDF heuristic detects raw scan with no text (should produce archive).""" + # Mock pdftotext output - very little text + mock_read_file.return_value = " " # Just whitespace + + # Mock pikepdf - doesn't matter for this case + mock_pdf = Mock() + mock_pdf.Root = Mock() + mock_pikepdf_open.return_value.__enter__.return_value = mock_pdf + + from documents.parsers import _should_produce_archive_for_pdf + + result = _should_produce_archive_for_pdf(self.test_pdf_path) + + self.assertTrue(result) # Raw scan should produce archive + mock_subprocess.assert_called_once() + # pikepdf check is not needed when text is short, but we don't control that here + + @override_settings(ARCHIVE_FILE_GENERATION="auto") + @patch( + "documents.parsers.run_subprocess", + side_effect=Exception("pdftotext failed"), + ) + def test_pdf_heuristic_exception_handling(self, mock_subprocess): + """Test PDF heuristic defaults to producing archive when exception occurs.""" + from documents.parsers import _should_produce_archive_for_pdf + + 
result = _should_produce_archive_for_pdf(self.test_pdf_path) + + self.assertTrue(result) # Should default to True when exception occurs + mock_subprocess.assert_called_once() diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index 37f1e6fed..c3533eb0c 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -233,10 +233,12 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase): class TestUpdateContent(DirectoriesMixin, TestCase): + @override_settings(ARCHIVE_FILE_GENERATION="always") def test_update_content_maybe_archive_file(self) -> None: """ GIVEN: - Existing document with archive file + - ARCHIVE_FILE_GENERATION=always to force archive production WHEN: - Update content task is called THEN: diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 5f069b547..038d468f7 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -132,23 +132,14 @@ def settings_values_check(app_configs, **kwargs): Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), ) - if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: + if settings.OCR_MODE not in {"auto", "force", "redo", "off"}: msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) - if settings.OCR_MODE == "skip_noarchive": - msgs.append( - Warning( - 'OCR output mode "skip_noarchive" is deprecated and will be ' - "removed in a future version. 
Please use " - "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.", - ), - ) - - if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}: + if settings.ARCHIVE_FILE_GENERATION not in {"always", "never", "auto"}: msgs.append( Error( - "OCR_SKIP_ARCHIVE_FILE setting " - f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid', + "ARCHIVE_FILE_GENERATION setting " + f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid', ), ) diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index df1acca05..35552f29e 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -494,6 +494,7 @@ class RasterisedDocumentParser: from ocrmypdf import InputFileError from ocrmypdf import SubprocessOutputError from ocrmypdf.exceptions import DigitalSignatureError + from ocrmypdf.exceptions import PriorOcrFoundError archive_path = Path(self.tempdir) / "archive.pdf" sidecar_file = Path(self.tempdir) / "sidecar.txt" @@ -532,12 +533,87 @@ class RasterisedDocumentParser: if not self.text: raise NoTextFoundException("No text was found in the original document") + except PriorOcrFoundError: + # pdftotext couldn't detect the text layer (e.g. RTL or CJK scripts), + # but ocrmypdf found it. Retry as PDF/A conversion only (skip_text). + self.log.debug( + "PDF has existing text layer not detected by pdftotext; " + "retrying with skip_text for PDF/A conversion.", + ) + retry_args = self.construct_ocrmypdf_parameters( + input_file, + mime_type, + archive_path, + sidecar_file, + skip_text=True, + ) + try: + ocrmypdf.ocr(**retry_args) + if produce_archive: + self.archive_path = archive_path + self.text = self.extract_text(sidecar_file, archive_path) + except Exception as e: + raise ParseError(f"{e.__class__.__name__}: {e!s}") from e except (DigitalSignatureError, EncryptedPdfError): self.log.warning( "This file is encrypted and/or signed, OCR is impossible. 
Using " "any text present in the original file.", ) self.text = text_original or "" + except InputFileError as e: + # Tagged PDFs raise InputFileError when called without skip_text/force_ocr. + # Retry with skip_text to do PDF/A conversion without disturbing the text layer. + if "Tagged PDF" in str(e): + self.log.debug( + "Tagged PDF detected; retrying with skip_text for PDF/A conversion.", + ) + retry_args = self.construct_ocrmypdf_parameters( + input_file, + mime_type, + archive_path, + sidecar_file, + skip_text=True, + ) + try: + ocrmypdf.ocr(**retry_args) + if produce_archive: + self.archive_path = archive_path + self.text = self.extract_text(sidecar_file, archive_path) + except Exception as retry_e: + raise ParseError( + f"{retry_e.__class__.__name__}: {retry_e!s}", + ) from retry_e + else: + self.log.warning( + f"Encountered an error while running OCR: {e!s}. " + f"Attempting force OCR to get the text.", + ) + archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf" + sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt" + args = self.construct_ocrmypdf_parameters( + input_file, + mime_type + if not ( + is_image + and self.settings.mode == ModeChoices.OFF + and produce_archive + ) + else "application/pdf", + archive_path_fallback, + sidecar_file_fallback, + safe_fallback=True, + ) + try: + self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}") + ocrmypdf.ocr(**args) + self.text = self.extract_text( + sidecar_file_fallback, + archive_path_fallback, + ) + except Exception as fallback_e: + raise ParseError( + f"{fallback_e.__class__.__name__}: {fallback_e!s}", + ) from fallback_e except SubprocessOutputError as e: if "Ghostscript PDF/A rendering" in str(e): self.log.warning( @@ -548,7 +624,7 @@ class RasterisedDocumentParser: raise ParseError( f"SubprocessOutputError: {e!s}. 
See logs for more information.", ) from e - except (NoTextFoundException, InputFileError) as e: + except NoTextFoundException as e: self.log.warning( f"Encountered an error while running OCR: {e!s}. " f"Attempting force OCR to get the text.", diff --git a/src/paperless/tests/parsers/conftest.py b/src/paperless/tests/parsers/conftest.py index 8747ac9bd..ec4e88f99 100644 --- a/src/paperless/tests/parsers/conftest.py +++ b/src/paperless/tests/parsers/conftest.py @@ -708,7 +708,6 @@ def null_app_config(mocker: MockerFixture) -> MagicMock: pages=None, language=None, mode=None, - skip_archive_file=None, image_dpi=None, unpaper_clean=None, deskew=None, diff --git a/src/paperless/tests/parsers/test_tesseract_custom_settings.py b/src/paperless/tests/parsers/test_tesseract_custom_settings.py index 60d1486f4..15ceed8bf 100644 --- a/src/paperless/tests/parsers/test_tesseract_custom_settings.py +++ b/src/paperless/tests/parsers/test_tesseract_custom_settings.py @@ -93,11 +93,13 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas """ with override_settings(OCR_MODE="redo"): instance = ApplicationConfiguration.objects.all().first() - instance.mode = ModeChoices.SKIP + instance.mode = ModeChoices.AUTO instance.save() params = self.get_params() - self.assertTrue(params["skip_text"]) + # AUTO mode doesn't set skip_text in construct_ocrmypdf_parameters + # The skip_text logic is handled in the parse method based on content detection + self.assertNotIn("skip_text", params) self.assertNotIn("redo_ocr", params) self.assertNotIn("force_ocr", params) diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py index daa7020c7..4152dbc0c 100644 --- a/src/paperless/tests/parsers/test_tesseract_parser.py +++ b/src/paperless/tests/parsers/test_tesseract_parser.py @@ -433,7 +433,7 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - 
tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf") assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -449,7 +449,7 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "encrypted.pdf", "application/pdf", @@ -559,7 +559,7 @@ class TestParseMultiPage: @pytest.mark.parametrize( "mode", [ - pytest.param("skip", id="skip"), + pytest.param("auto", id="auto"), pytest.param("redo", id="redo"), pytest.param("force", id="force"), ], @@ -587,7 +587,7 @@ class TestParseMultiPage: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-images.pdf", "application/pdf", @@ -722,29 +722,31 @@ class TestParseMultiPage: # --------------------------------------------------------------------------- -# Parse — skip_noarchive / skip_archive_file +# Parse — OCR_MODE=auto / off and produce_archive parameter # --------------------------------------------------------------------------- -class TestSkipArchive: - def test_skip_noarchive_with_text_layer( +class TestOcrModeAndArchiveGeneration: + def test_auto_mode_with_text_skips_archive( self, tesseract_parser: RasterisedDocumentParser, - tesseract_samples_dir: Path, + multi_page_digital_pdf_file: Path, ) -> None: """ GIVEN: - - File with existing text layer - - Mode: skip_noarchive + - File with existing text layer (born-digital PDF) + - Mode: auto + - produce_archive: False WHEN: - Document is parsed THEN: - - Text extracted; no archive created + - Text extracted; no archive created; ocrmypdf skipped entirely """ - tesseract_parser.settings.mode = 
"skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( - tesseract_samples_dir / "multi-page-digital.pdf", + multi_page_digital_pdf_file, "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -752,24 +754,26 @@ class TestSkipArchive: ["page 1", "page 2", "page 3"], ) - def test_skip_noarchive_image_only_creates_archive( + def test_auto_mode_with_text_produces_archive( self, tesseract_parser: RasterisedDocumentParser, - tesseract_samples_dir: Path, + multi_page_digital_pdf_file: Path, ) -> None: """ GIVEN: - - File with image-only pages (no text layer) - - Mode: skip_noarchive + - File with existing text layer (born-digital PDF) + - Mode: auto + - produce_archive: True WHEN: - Document is parsed THEN: - - Text extracted; archive created (OCR needed) + - Text extracted; archive created with skip_text """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( - tesseract_samples_dir / "multi-page-images.pdf", + multi_page_digital_pdf_file, "application/pdf", + produce_archive=True, ) assert tesseract_parser.archive_path is not None assert_ordered_substrings( @@ -777,48 +781,137 @@ class TestSkipArchive: ["page 1", "page 2", "page 3"], ) - @pytest.mark.parametrize( - ("skip_archive_file", "filename", "expect_archive"), - [ - pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"), - pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"), - pytest.param( - "with_text", - "multi-page-digital.pdf", - False, - id="with-text-layer", - ), - pytest.param( - "with_text", - "multi-page-images.pdf", - True, - id="with-text-no-layer", - ), - pytest.param( - "always", - "multi-page-digital.pdf", - False, - id="always-with-text", - ), - pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"), - ], - ) - def test_skip_archive_file_setting( + def 
test_auto_mode_image_produces_archive( self, - skip_archive_file: str, - filename: str, - expect_archive: str, tesseract_parser: RasterisedDocumentParser, - tesseract_samples_dir: Path, + multi_page_images_pdf_file: Path, ) -> None: - tesseract_parser.settings.skip_archive_file = skip_archive_file - tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf") - text = tesseract_parser.get_text().lower() - assert_ordered_substrings(text, ["page 1", "page 2", "page 3"]) - if expect_archive: - assert tesseract_parser.archive_path is not None - else: - assert tesseract_parser.archive_path is None + """ + GIVEN: + - File with image-only pages (no text layer) + - Mode: auto + - produce_archive: True + WHEN: + - Document is parsed + THEN: + - Text extracted via OCR; archive created + """ + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + multi_page_images_pdf_file, + "application/pdf", + produce_archive=True, + ) + assert tesseract_parser.archive_path is not None + assert_ordered_substrings( + tesseract_parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + def test_off_mode_image_with_archive( + self, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: + - Image file + - Mode: off + - produce_archive: True + WHEN: + - Document is parsed + THEN: + - Empty text content; archive created via img2pdf path + """ + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + simple_png_file, + "image/png", + produce_archive=True, + ) + assert tesseract_parser.archive_path is not None + # OCR mode is OFF, but archive creation with img2pdf+OCRmyPDF may still produce some text + assert tesseract_parser.get_text().strip() is not None + + def test_off_mode_image_without_archive( + self, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: + - Image file + - Mode: off + - produce_archive: False + WHEN: + - Document is parsed + THEN: + - 
Empty text content; no archive created + """ + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + simple_png_file, + "image/png", + produce_archive=False, + ) + assert tesseract_parser.archive_path is None + # OCR is disabled, so text should be empty + text = tesseract_parser.get_text().strip() + assert len(text) == 0 + + def test_off_mode_pdf_with_archive( + self, + tesseract_parser: RasterisedDocumentParser, + multi_page_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - PDF file + - Mode: off + - produce_archive: True + WHEN: + - Document is parsed + THEN: + - Text from pdftotext; archive created with skip_text (PDF/A only) + """ + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + multi_page_digital_pdf_file, + "application/pdf", + produce_archive=True, + ) + assert tesseract_parser.archive_path is not None + assert_ordered_substrings( + tesseract_parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + def test_off_mode_pdf_without_archive( + self, + tesseract_parser: RasterisedDocumentParser, + multi_page_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - PDF file + - Mode: off + - produce_archive: False + WHEN: + - Document is parsed + THEN: + - Text from pdftotext; no archive created + """ + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + multi_page_digital_pdf_file, + "application/pdf", + produce_archive=False, + ) + assert tesseract_parser.archive_path is None + assert_ordered_substrings( + tesseract_parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) # --------------------------------------------------------------------------- @@ -835,13 +928,13 @@ class TestParseMixed: """ GIVEN: - File with text in some pages (image) and some pages (digital) - - Mode: skip + - Mode: auto WHEN: - Document is parsed THEN: - All pages extracted; archive created; sidecar notes skipped pages """ - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" 
tesseract_parser.parse( tesseract_samples_dir / "multi-page-mixed.pdf", "application/pdf", @@ -891,24 +984,26 @@ class TestParseMixed: not in sidecar ) - def test_multi_page_mixed_skip_noarchive( + def test_multi_page_mixed_auto_mode_without_archive( self, tesseract_parser: RasterisedDocumentParser, - tesseract_samples_dir: Path, + multi_page_mixed_pdf_file: Path, ) -> None: """ GIVEN: - File with mixed pages - - Mode: skip_noarchive + - Mode: auto + - produce_archive: False WHEN: - Document is parsed THEN: - - No archive created (file has text layer); later-page text present + - No archive created; text from existing digital pages extracted """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( - tesseract_samples_dir / "multi-page-mixed.pdf", + multi_page_mixed_pdf_file, "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -928,7 +1023,7 @@ class TestParseRotate: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.settings.rotate = True tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf") assert_ordered_substrings( diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index 87e64a90e..59f0f090d 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -130,16 +130,10 @@ class TestOcrSettingsChecks: id="invalid-mode", ), pytest.param( - "OCR_MODE", - "skip_noarchive", - "deprecated", - id="deprecated-mode", - ), - pytest.param( - "OCR_SKIP_ARCHIVE_FILE", + "ARCHIVE_FILE_GENERATION", "invalid", - 'OCR_SKIP_ARCHIVE_FILE setting "invalid"', - id="invalid-skip-archive-file", + 'ARCHIVE_FILE_GENERATION setting "invalid"', + id="invalid-archive-file-generation", ), pytest.param( "OCR_CLEAN",