mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-27 03:12:45 +00:00
fix: update tests for OCR/archive decoupling
- Add @override_settings(ARCHIVE_FILE_GENERATION="always") to tests that
expect archive files to always be produced (test_handle_document,
test_naming_priorities, test_update_content_maybe_archive_file)
- Add test_archive_preference.py with 14 unit tests covering
resolve_archive_preference() and _should_produce_archive_for_pdf()
- Update tesseract parser: add PriorOcrFoundError retry path and split
InputFileError handling (tagged PDF -> skip_text retry vs force-OCR
fallback for other input errors)
- Update checks.py: validate ARCHIVE_FILE_GENERATION, update OCR_MODE
valid set to {auto, force, redo, off}
- Update docs: PAPERLESS_OCR_MODE and PAPERLESS_ARCHIVE_FILE_GENERATION
docs and migration guide for v3
- Update tesseract parser tests, consumer tests, and parsers tests to
reflect new settings and behaviour
This commit is contained in:
@@ -801,11 +801,13 @@ parsing documents.
|
||||
|
||||
#### [`PAPERLESS_OCR_MODE=<mode>`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE}
|
||||
|
||||
: Tell paperless when and how to perform ocr on your documents. Three
|
||||
modes are available:
|
||||
: Tell paperless when and how to perform OCR on your documents. The
|
||||
following modes are available:
|
||||
|
||||
- `skip`: Paperless skips all pages and will perform ocr only on
|
||||
pages where no text is present. This is the safest option.
|
||||
- `auto`: Paperless auto-detects whether a document already
|
||||
contains extractable text using pdftotext. If the extracted
|
||||
text exceeds a threshold (50 characters), OCR is skipped;
|
||||
otherwise OCR runs. This is the default.
|
||||
|
||||
- `redo`: Paperless will OCR all pages of your documents and
|
||||
attempt to replace any existing text layers with new text. This
|
||||
@@ -823,24 +825,46 @@ modes are available:
|
||||
significantly larger and text won't appear as sharp when zoomed
|
||||
in.
|
||||
|
||||
The default is `skip`, which only performs OCR when necessary and
|
||||
always creates archived documents.
|
||||
- `off`: OCR never runs regardless of input type. Embedded text
|
||||
is still extracted from PDFs via pdftotext, but images and
|
||||
scanned PDFs without text layers will have empty content.
|
||||
Useful for handwritten documents, bulk ingestion of large
|
||||
archives, or content that OCRs poorly. Archive generation still
|
||||
works independently when `PAPERLESS_ARCHIVE_FILE_GENERATION`
|
||||
requests it — a PDF/A can be produced without OCR via format
|
||||
conversion only.
|
||||
|
||||
Defaults to `auto`.
|
||||
|
||||
Read more about this in the [OCRmyPDF
|
||||
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
||||
|
||||
#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE}
|
||||
#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=<mode>`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION}
|
||||
|
||||
: Specify when you would like paperless to skip creating an archived
|
||||
version of your documents. This is useful if you don't want to have two
|
||||
almost-identical versions of your documents in the media folder.
|
||||
: Controls whether paperless produces a normalized PDF/A archive copy
|
||||
of each document. This is independent of OCR — a PDF/A can be produced
|
||||
with or without running OCR.
|
||||
|
||||
- `never`: Never skip creating an archived version.
|
||||
- `with_text`: Skip creating an archived version for documents
|
||||
that already have embedded text.
|
||||
- `always`: Always skip creating an archived version.
|
||||
- `auto`: Produce archives for scanned and image-based documents;
|
||||
skip for born-digital PDFs. Born-digital is detected by
|
||||
checking both whether the PDF contains extractable text and
|
||||
whether it has a logical structure (tag tree), which word
|
||||
processors and PDF export tools produce. Scanner software that
|
||||
applies its own OCR typically does not produce tagged PDFs, so
|
||||
those still receive an archive.
|
||||
|
||||
The default is `never`.
|
||||
- `always`: Always produce a PDF/A archive when the parser
|
||||
supports it.
|
||||
|
||||
- `never`: Never produce an archive.
|
||||
|
||||
Defaults to `auto`.
|
||||
|
||||
!!! note
|
||||
|
||||
Parsers that must produce a PDF for the frontend to display the
|
||||
document (e.g. the Tika parser for Office documents) always
|
||||
produce a PDF rendition regardless of this setting.
|
||||
|
||||
#### [`PAPERLESS_OCR_CLEAN=<mode>`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN}
|
||||
|
||||
|
||||
@@ -130,3 +130,21 @@ For example:
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## OCR and Archive Settings Changes

The `PAPERLESS_OCR_MODE` values `skip` and `skip_noarchive` have been replaced by
[`PAPERLESS_OCR_MODE=auto`](configuration.md#PAPERLESS_OCR_MODE). Archive file
generation is now controlled by the separate
[`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)
setting, replacing `PAPERLESS_OCR_SKIP_ARCHIVE_FILE`.

### Summary

| Old Setting                                 | New Setting                                                                                                                                                        |
| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `PAPERLESS_OCR_MODE=skip`                   | [`PAPERLESS_OCR_MODE=auto`](configuration.md#PAPERLESS_OCR_MODE) (now the default)                                                                                 |
| `PAPERLESS_OCR_MODE=skip_noarchive`         | [`PAPERLESS_OCR_MODE=auto`](configuration.md#PAPERLESS_OCR_MODE) + [`PAPERLESS_ARCHIVE_FILE_GENERATION=never`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) |
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never`     | [`PAPERLESS_ARCHIVE_FILE_GENERATION=always`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)                                                                   |
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | [`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)                                                                     |
| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always`    | [`PAPERLESS_ARCHIVE_FILE_GENERATION=never`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION)                                                                    |
|
||||
|
||||
257
src/documents/tests/test_archive_preference.py
Normal file
257
src/documents/tests/test_archive_preference.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
Tests for documents.parsers.resolve_archive_preference function and related logic.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from documents.parsers import _should_produce_archive_for_pdf
|
||||
from documents.parsers import resolve_archive_preference
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
|
||||
|
||||
class TestResolveArchivePreference:
    """Exercise resolve_archive_preference under each ARCHIVE_FILE_GENERATION mode."""

    @pytest.mark.parametrize(
        ("archive_setting", "can_produce_archive", "expected"),
        [
            (ArchiveFileGenerationChoices.ALWAYS, True, True),
            (ArchiveFileGenerationChoices.ALWAYS, False, False),
            (ArchiveFileGenerationChoices.NEVER, True, False),
            (ArchiveFileGenerationChoices.NEVER, False, False),
        ],
        ids=[
            "always-capable-parser",
            "always-incapable-parser",
            "never-capable-parser",
            "never-incapable-parser",
        ],
    )
    def test_archive_generation_setting_behavior(
        self,
        settings: SettingsWrapper,
        archive_setting: ArchiveFileGenerationChoices,
        can_produce_archive: bool,  # noqa: FBT001
        expected: bool,  # noqa: FBT001
    ) -> None:
        """
        GIVEN:
            - ARCHIVE_FILE_GENERATION is either always or never
            - A PDF handled by a parser that can or cannot produce an archive
        WHEN:
            - resolve_archive_preference is called
        THEN:
            - The decision matches the expected value for that combination
        """
        settings.ARCHIVE_FILE_GENERATION = archive_setting
        pdf_path = Path("/fake/path.pdf")

        decision = resolve_archive_preference(
            "application/pdf",
            pdf_path,
            can_produce_archive=can_produce_archive,
        )

        assert decision is expected

    def test_auto_mode_non_pdf_returns_true(
        self,
        settings: SettingsWrapper,
    ) -> None:
        """
        GIVEN:
            - ARCHIVE_FILE_GENERATION=auto
            - An image (non-PDF) mime type
            - A parser with can_produce_archive=True
        WHEN:
            - resolve_archive_preference is called
        THEN:
            - True is returned (images always need an archive)
        """
        settings.ARCHIVE_FILE_GENERATION = ArchiveFileGenerationChoices.AUTO
        image_path = Path("/fake/path.jpg")

        decision = resolve_archive_preference(
            "image/jpeg",
            image_path,
            can_produce_archive=True,
        )

        assert decision is True

    def test_auto_mode_pdf_delegates_to_heuristic(
        self,
        settings: SettingsWrapper,
        mocker: MockerFixture,
    ) -> None:
        """
        GIVEN:
            - ARCHIVE_FILE_GENERATION=auto
            - A PDF mime type
            - A parser with can_produce_archive=True
        WHEN:
            - resolve_archive_preference is called
        THEN:
            - _should_produce_archive_for_pdf is consulted exactly once with
              the document path and its answer is returned
        """
        settings.ARCHIVE_FILE_GENERATION = ArchiveFileGenerationChoices.AUTO
        fake_path = Path("/fake/path.pdf")
        heuristic = mocker.patch(
            "documents.parsers._should_produce_archive_for_pdf",
            return_value=True,
        )

        decision = resolve_archive_preference(
            "application/pdf",
            fake_path,
            can_produce_archive=True,
        )

        heuristic.assert_called_once_with(fake_path)
        assert decision is True
|
||||
|
||||
|
||||
class TestShouldProduceArchiveForPdf:
    """Exercise the _should_produce_archive_for_pdf heuristic with mocked tooling."""

    # Shared sample of extracted text that is longer than fifty characters.
    LONG_TEXT = (
        "This is a long text content that is definitely longer than fifty characters"
    )

    @pytest.mark.parametrize(
        ("text_content", "has_struct_tree", "is_marked", "expected"),
        [
            (LONG_TEXT, True, False, False),
            (LONG_TEXT, False, True, False),
            (LONG_TEXT, False, False, True),
            ("Short text", True, False, True),
            ("Short text", False, False, True),
            ("", False, False, True),
        ],
        ids=[
            "tagged-with-struct-tree",
            "tagged-with-mark-info",
            "untagged-with-text",
            "little-text-tagged",
            "little-text-untagged",
            "no-text",
        ],
    )
    def test_pdf_heuristic_logic(
        self,
        mocker: MockerFixture,
        text_content: str,
        has_struct_tree: bool,  # noqa: FBT001
        is_marked: bool,  # noqa: FBT001
        expected: bool,  # noqa: FBT001
    ) -> None:
        """
        GIVEN:
            - Mocked text extraction returning text_content
            - A mocked pikepdf document with or without StructTreeRoot and
              with MarkInfo.get() returning is_marked
        WHEN:
            - _should_produce_archive_for_pdf is called
        THEN:
            - The result matches the expected value for that combination
        """
        # Text extraction: the subprocess is a no-op, the file read is canned.
        mocker.patch("documents.parsers.run_subprocess")
        mocker.patch(
            "documents.parsers.read_file_handle_unicode_errors",
            return_value=text_content,
        )

        # pikepdf: build the catalog double first, then hand it out via open().
        pdf_double = Mock()
        catalog = pdf_double.Root
        if not has_struct_tree:
            # Deleting the attribute makes later access raise AttributeError.
            del catalog.StructTreeRoot
        else:
            catalog.StructTreeRoot = True
        catalog.MarkInfo.get.return_value = is_marked

        pikepdf_double = mocker.patch("documents.parsers.pikepdf")
        pikepdf_double.open.return_value.__enter__.return_value = pdf_double

        verdict = _should_produce_archive_for_pdf(Path("/fake/path.pdf"))

        assert verdict is expected

    def test_exception_handling_returns_true(
        self,
        mocker: MockerFixture,
    ) -> None:
        """
        GIVEN:
            - The text extraction subprocess raises an exception
        WHEN:
            - _should_produce_archive_for_pdf is called
        THEN:
            - True is returned (safe default)
        """
        mocker.patch(
            "documents.parsers.run_subprocess",
            side_effect=Exception("Test error"),
        )

        verdict = _should_produce_archive_for_pdf(Path("/fake/path.pdf"))

        assert verdict is True

    def test_pikepdf_exception_returns_true(
        self,
        mocker: MockerFixture,
    ) -> None:
        """
        GIVEN:
            - Text extraction succeeds with plenty of text
            - pikepdf.open raises an exception
        WHEN:
            - _should_produce_archive_for_pdf is called
        THEN:
            - True is returned (safe default)
        """
        # Text extraction succeeds.
        mocker.patch("documents.parsers.run_subprocess")
        mocker.patch(
            "documents.parsers.read_file_handle_unicode_errors",
            return_value=self.LONG_TEXT,
        )
        # Opening the PDF fails.
        mocker.patch(
            "documents.parsers.pikepdf.open",
            side_effect=Exception("PDF error"),
        )

        verdict = _should_produce_archive_for_pdf(Path("/fake/path.pdf"))

        assert verdict is True
|
||||
@@ -5,6 +5,7 @@ import tempfile
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import Mock
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import Group
|
||||
@@ -1548,3 +1549,158 @@ class TestBarcodeApplyDetectedASN(TestCase):
|
||||
|
||||
plugin._apply_detected_asn(123)
|
||||
self.assertEqual(plugin.metadata.asn, 123)
|
||||
|
||||
|
||||
# TODO: Convert these tests to pytest style in the future
|
||||
class TestArchivePreferenceWiring(DirectoriesMixin, GetConsumerMixin, TestCase):
    """Test that archive preference settings are properly wired to parser calls."""

    def setUp(self) -> None:
        """Copy a sample PDF into the scratch directory for the consumer to process."""
        super().setUp()
        # Use simple test file that can be parsed by our test parsers
        src = (
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000005.pdf"
        )
        self.test_file = self.dirs.scratch_dir / "sample.pdf"
        shutil.copy(src, self.test_file)

    def _install_mock_parser(self, mock_registry, *, archive_path=None) -> Mock:
        """
        Wire a mock parser into the patched parser registry.

        Args:
            mock_registry: The patch object replacing
                ``documents.consumer.get_parser_registry``.
            archive_path: Value the parser reports from ``get_archive_path()``;
                ``None`` means no archive was produced.

        Returns:
            The mock parser instance, so callers can inspect its ``parse`` call.
        """
        # The thumbnail must be a real file on disk, so write a placeholder.
        thumbnail_path = self.dirs.scratch_dir / "thumbnail.webp"
        thumbnail_path.write_bytes(b"fake_thumbnail_data")

        mock_parser_instance = Mock()
        mock_parser_instance.can_produce_archive = True
        mock_parser_instance.get_text.return_value = "Test text"
        mock_parser_instance.get_archive_path.return_value = archive_path
        mock_parser_instance.get_thumbnail.return_value = thumbnail_path
        mock_parser_instance.get_date.return_value = None
        mock_parser_instance.get_page_count.return_value = 1
        mock_parser_instance.extract_metadata.return_value = []

        # Use MagicMock to properly support context manager protocol
        mock_parser_class = MagicMock()
        mock_parser_class.return_value.__enter__ = Mock(
            return_value=mock_parser_instance,
        )
        mock_parser_class.return_value.__exit__ = Mock(return_value=None)
        mock_parser_class.can_produce_archive = True
        mock_parser_class.requires_pdf_rendition = False

        mock_registry_instance = Mock()
        mock_registry_instance.get_parser_for_file.return_value = mock_parser_class
        mock_registry.return_value = mock_registry_instance

        return mock_parser_instance

    @override_settings(ARCHIVE_FILE_GENERATION="never")
    @mock.patch("documents.consumer.get_parser_registry")
    def test_never_setting_passes_produce_archive_false(self, mock_registry):
        """Test that ARCHIVE_FILE_GENERATION=never passes produce_archive=False to parser."""
        mock_parser_instance = self._install_mock_parser(mock_registry)

        with self.get_consumer(self.test_file) as consumer:
            consumer.run()

        # Verify parse was called with produce_archive=False
        mock_parser_instance.parse.assert_called_once()
        call_args = mock_parser_instance.parse.call_args
        self.assertIs(call_args.kwargs["produce_archive"], False)

    @override_settings(ARCHIVE_FILE_GENERATION="always")
    @mock.patch("documents.consumer.get_parser_registry")
    def test_always_setting_passes_produce_archive_true(self, mock_registry):
        """Test that ARCHIVE_FILE_GENERATION=always passes produce_archive=True to parser."""
        mock_parser_instance = self._install_mock_parser(
            mock_registry,
            archive_path=self.test_file,  # Fake archive
        )

        with self.get_consumer(self.test_file) as consumer:
            consumer.run()

        # Verify parse was called with produce_archive=True
        mock_parser_instance.parse.assert_called_once()
        call_args = mock_parser_instance.parse.call_args
        self.assertIs(call_args.kwargs["produce_archive"], True)

    @override_settings(ARCHIVE_FILE_GENERATION="auto")
    @mock.patch("documents.consumer.resolve_archive_preference")
    @mock.patch("documents.consumer.get_parser_registry")
    def test_auto_setting_delegates_to_resolve_archive_preference(
        self,
        mock_registry,
        mock_resolve_preference,
    ):
        """Test that ARCHIVE_FILE_GENERATION=auto delegates to resolve_archive_preference."""
        mock_resolve_preference.return_value = False
        mock_parser_instance = self._install_mock_parser(mock_registry)

        with self.get_consumer(self.test_file) as consumer:
            consumer.run()

        # Verify resolve_archive_preference was called with correct parameters
        mock_resolve_preference.assert_called_once()
        call_args = mock_resolve_preference.call_args
        self.assertEqual(call_args.args[0], "application/pdf")
        # Path will be working copy (different from original), so check it's a Path to sample.pdf
        self.assertEqual(call_args.args[1].name, "sample.pdf")
        self.assertIs(call_args.kwargs["can_produce_archive"], True)

        # Verify parse was called with the result from resolve_archive_preference
        mock_parser_instance.parse.assert_called_once()
        call_args = mock_parser_instance.parse.call_args
        self.assertIs(call_args.kwargs["produce_archive"], False)
|
||||
|
||||
@@ -43,6 +43,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
call_command("document_archiver", "--processes", "1", skip_checks=True)
|
||||
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
||||
def test_handle_document(self) -> None:
|
||||
doc = self.make_models()
|
||||
shutil.copy(sample_file, Path(self.dirs.originals_dir) / f"{doc.id:07}.pdf")
|
||||
@@ -73,7 +74,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertIsNone(doc.archive_filename)
|
||||
self.assertIsFile(doc.source_path)
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
@override_settings(FILENAME_FORMAT="{title}", ARCHIVE_FILE_GENERATION="always")
|
||||
def test_naming_priorities(self) -> None:
|
||||
doc1 = Document.objects.create(
|
||||
checksum="A",
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
from unittest.mock import Mock
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from documents.parsers import resolve_archive_preference
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.registry import reset_parser_registry
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
@@ -111,3 +115,195 @@ class TestParserAvailability(TestCase):
|
||||
self.assertTrue(is_file_ext_supported(".pdf"))
|
||||
self.assertFalse(is_file_ext_supported(".hsdfh"))
|
||||
self.assertFalse(is_file_ext_supported(""))
|
||||
|
||||
|
||||
class TestResolveArchivePreference(TestCase):
    """Cover resolve_archive_preference and the PDF heuristic it relies on."""

    def setUp(self):
        """Provide a fake PDF path; the tests never read it from disk."""
        from pathlib import Path

        self.test_pdf_path = Path("/fake/path/test.pdf")

    @override_settings(ARCHIVE_FILE_GENERATION="always")
    def test_always_setting_with_capable_parser(self):
        """With ARCHIVE_FILE_GENERATION=always, a capable parser yields a truthy result."""
        self.assertTrue(
            resolve_archive_preference(
                "application/pdf",
                self.test_pdf_path,
                can_produce_archive=True,
            ),
        )

    @override_settings(ARCHIVE_FILE_GENERATION="always")
    def test_always_setting_with_incapable_parser(self):
        """With ARCHIVE_FILE_GENERATION=always, an incapable parser yields a falsy result."""
        self.assertFalse(
            resolve_archive_preference(
                "application/pdf",
                self.test_pdf_path,
                can_produce_archive=False,
            ),
        )

    @override_settings(ARCHIVE_FILE_GENERATION="never")
    def test_never_setting_regardless_of_parser(self):
        """With ARCHIVE_FILE_GENERATION=never, parser capability makes no difference."""
        # Capable parser first, then incapable parser.
        for capable in (True, False):
            decision = resolve_archive_preference(
                "application/pdf",
                self.test_pdf_path,
                can_produce_archive=capable,
            )
            self.assertFalse(decision)

    @override_settings(ARCHIVE_FILE_GENERATION="auto")
    def test_auto_setting_with_non_pdf_mime_type(self):
        """With ARCHIVE_FILE_GENERATION=auto, a non-PDF mime type yields a truthy result."""
        # Non-PDF mime types (images etc.) should always produce archive
        decision = resolve_archive_preference(
            "image/jpeg",
            self.test_pdf_path,  # Path doesn't matter for non-PDF
            can_produce_archive=True,
        )
        self.assertTrue(decision)

    @override_settings(ARCHIVE_FILE_GENERATION="auto")
    @patch("documents.parsers._should_produce_archive_for_pdf")
    def test_auto_setting_with_pdf_delegates_to_heuristic(self, mock_heuristic):
        """With ARCHIVE_FILE_GENERATION=auto, PDFs are decided by the heuristic function."""
        # First pass: the heuristic says "no archive".
        mock_heuristic.return_value = False
        decision = resolve_archive_preference(
            "application/pdf",
            self.test_pdf_path,
            can_produce_archive=True,
        )
        mock_heuristic.assert_called_once_with(self.test_pdf_path)
        self.assertFalse(decision)

        # Second pass: clear the call history and flip the heuristic's answer.
        mock_heuristic.reset_mock()
        mock_heuristic.return_value = True
        decision = resolve_archive_preference(
            "application/pdf",
            self.test_pdf_path,
            can_produce_archive=True,
        )
        mock_heuristic.assert_called_once_with(self.test_pdf_path)
        self.assertTrue(decision)

    @override_settings(ARCHIVE_FILE_GENERATION="auto")
    @patch("documents.parsers.run_subprocess")
    @patch("documents.parsers.pikepdf.open")
    @patch("documents.parsers.read_file_handle_unicode_errors")
    def test_pdf_heuristic_born_digital_tagged(
        self,
        mock_read_file,
        mock_pikepdf_open,
        mock_subprocess,
    ):
        """A text-rich PDF with a structure tree should NOT produce an archive."""
        from documents.parsers import _should_produce_archive_for_pdf

        # Mocked text extraction output - lots of text
        mock_read_file.return_value = (
            "This is a lot of text content from a born-digital PDF document."
        )

        # Mocked pikepdf document - tagged PDF with a structure tree
        catalog = Mock()
        catalog.StructTreeRoot = Mock()
        mock_pikepdf_open.return_value.__enter__.return_value = Mock(Root=catalog)

        verdict = _should_produce_archive_for_pdf(self.test_pdf_path)

        # Born-digital tagged PDF should NOT produce archive
        self.assertFalse(verdict)
        mock_subprocess.assert_called_once()
        mock_pikepdf_open.assert_called_once_with(self.test_pdf_path)

    @override_settings(ARCHIVE_FILE_GENERATION="auto")
    @patch("documents.parsers.run_subprocess")
    @patch("documents.parsers.pikepdf.open")
    @patch("documents.parsers.read_file_handle_unicode_errors")
    def test_pdf_heuristic_scanner_ocr_untagged(
        self,
        mock_read_file,
        mock_pikepdf_open,
        mock_subprocess,
    ):
        """A text-rich PDF without tagging (scanner OCR) should produce an archive."""
        from documents.parsers import _should_produce_archive_for_pdf

        # Mocked text extraction output - lots of text
        mock_read_file.return_value = (
            "This is a lot of text content from a scanner OCR'd PDF document."
        )

        # Mocked pikepdf document - untagged: no StructTreeRoot attribute and
        # MarkInfo.get() reports False
        catalog = Mock()
        del catalog.StructTreeRoot  # Simulate no attribute
        catalog.MarkInfo = Mock()
        catalog.MarkInfo.get.return_value = False
        mock_pikepdf_open.return_value.__enter__.return_value = Mock(Root=catalog)

        verdict = _should_produce_archive_for_pdf(self.test_pdf_path)

        # Scanner OCR'd PDF should produce archive
        self.assertTrue(verdict)
        mock_subprocess.assert_called_once()
        mock_pikepdf_open.assert_called_once_with(self.test_pdf_path)

    @override_settings(ARCHIVE_FILE_GENERATION="auto")
    @patch("documents.parsers.run_subprocess")
    @patch("documents.parsers.pikepdf.open")
    @patch("documents.parsers.read_file_handle_unicode_errors")
    def test_pdf_heuristic_raw_scan_no_text(
        self,
        mock_read_file,
        mock_pikepdf_open,
        mock_subprocess,
    ):
        """A PDF whose extracted text is only whitespace should produce an archive."""
        from documents.parsers import _should_produce_archive_for_pdf

        # Mocked text extraction output - just whitespace
        mock_read_file.return_value = "   "

        # Mocked pikepdf document - its details don't matter for this case
        mock_pikepdf_open.return_value.__enter__.return_value = Mock(Root=Mock())

        verdict = _should_produce_archive_for_pdf(self.test_pdf_path)

        # Raw scan should produce archive
        self.assertTrue(verdict)
        mock_subprocess.assert_called_once()
        # pikepdf check is not needed when text is short, but we don't control that here

    @override_settings(ARCHIVE_FILE_GENERATION="auto")
    @patch(
        "documents.parsers.run_subprocess",
        side_effect=Exception("pdftotext failed"),
    )
    def test_pdf_heuristic_exception_handling(self, mock_subprocess):
        """When text extraction raises, the heuristic falls back to producing an archive."""
        from documents.parsers import _should_produce_archive_for_pdf

        verdict = _should_produce_archive_for_pdf(self.test_pdf_path)

        # Should default to True when exception occurs
        self.assertTrue(verdict)
        mock_subprocess.assert_called_once()
|
||||
|
||||
@@ -233,10 +233,12 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
|
||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||
@override_settings(ARCHIVE_FILE_GENERATION="always")
|
||||
def test_update_content_maybe_archive_file(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Existing document with archive file
|
||||
- ARCHIVE_FILE_GENERATION=always to force archive production
|
||||
WHEN:
|
||||
- Update content task is called
|
||||
THEN:
|
||||
|
||||
@@ -132,23 +132,14 @@ def settings_values_check(app_configs, **kwargs):
|
||||
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
||||
)
|
||||
|
||||
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
|
||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||
|
||||
if settings.OCR_MODE == "skip_noarchive":
|
||||
msgs.append(
|
||||
Warning(
|
||||
'OCR output mode "skip_noarchive" is deprecated and will be '
|
||||
"removed in a future version. Please use "
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
|
||||
),
|
||||
)
|
||||
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
|
||||
if settings.ARCHIVE_FILE_GENERATION not in {"always", "never", "auto"}:
|
||||
msgs.append(
|
||||
Error(
|
||||
"OCR_SKIP_ARCHIVE_FILE setting "
|
||||
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
|
||||
"ARCHIVE_FILE_GENERATION setting "
|
||||
f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid',
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -494,6 +494,7 @@ class RasterisedDocumentParser:
|
||||
from ocrmypdf import InputFileError
|
||||
from ocrmypdf import SubprocessOutputError
|
||||
from ocrmypdf.exceptions import DigitalSignatureError
|
||||
from ocrmypdf.exceptions import PriorOcrFoundError
|
||||
|
||||
archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
sidecar_file = Path(self.tempdir) / "sidecar.txt"
|
||||
@@ -532,12 +533,87 @@ class RasterisedDocumentParser:
|
||||
|
||||
if not self.text:
|
||||
raise NoTextFoundException("No text was found in the original document")
|
||||
except PriorOcrFoundError:
|
||||
# pdftotext couldn't detect the text layer (e.g. RTL or CJK scripts),
|
||||
# but ocrmypdf found it. Retry as PDF/A conversion only (skip_text).
|
||||
self.log.debug(
|
||||
"PDF has existing text layer not detected by pdftotext; "
|
||||
"retrying with skip_text for PDF/A conversion.",
|
||||
)
|
||||
retry_args = self.construct_ocrmypdf_parameters(
|
||||
input_file,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=True,
|
||||
)
|
||||
try:
|
||||
ocrmypdf.ocr(**retry_args)
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
except Exception as e:
|
||||
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
|
||||
except (DigitalSignatureError, EncryptedPdfError):
|
||||
self.log.warning(
|
||||
"This file is encrypted and/or signed, OCR is impossible. Using "
|
||||
"any text present in the original file.",
|
||||
)
|
||||
self.text = text_original or ""
|
||||
except InputFileError as e:
|
||||
# Tagged PDFs raise InputFileError when called without skip_text/force_ocr.
|
||||
# Retry with skip_text to do PDF/A conversion without disturbing the text layer.
|
||||
if "Tagged PDF" in str(e):
|
||||
self.log.debug(
|
||||
"Tagged PDF detected; retrying with skip_text for PDF/A conversion.",
|
||||
)
|
||||
retry_args = self.construct_ocrmypdf_parameters(
|
||||
input_file,
|
||||
mime_type,
|
||||
archive_path,
|
||||
sidecar_file,
|
||||
skip_text=True,
|
||||
)
|
||||
try:
|
||||
ocrmypdf.ocr(**retry_args)
|
||||
if produce_archive:
|
||||
self.archive_path = archive_path
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
except Exception as retry_e:
|
||||
raise ParseError(
|
||||
f"{retry_e.__class__.__name__}: {retry_e!s}",
|
||||
) from retry_e
|
||||
else:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
)
|
||||
archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf"
|
||||
sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt"
|
||||
args = self.construct_ocrmypdf_parameters(
|
||||
input_file,
|
||||
mime_type
|
||||
if not (
|
||||
is_image
|
||||
and self.settings.mode == ModeChoices.OFF
|
||||
and produce_archive
|
||||
)
|
||||
else "application/pdf",
|
||||
archive_path_fallback,
|
||||
sidecar_file_fallback,
|
||||
safe_fallback=True,
|
||||
)
|
||||
try:
|
||||
self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
self.text = self.extract_text(
|
||||
sidecar_file_fallback,
|
||||
archive_path_fallback,
|
||||
)
|
||||
except Exception as fallback_e:
|
||||
raise ParseError(
|
||||
f"{fallback_e.__class__.__name__}: {fallback_e!s}",
|
||||
) from fallback_e
|
||||
except SubprocessOutputError as e:
|
||||
if "Ghostscript PDF/A rendering" in str(e):
|
||||
self.log.warning(
|
||||
@@ -548,7 +624,7 @@ class RasterisedDocumentParser:
|
||||
raise ParseError(
|
||||
f"SubprocessOutputError: {e!s}. See logs for more information.",
|
||||
) from e
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
except NoTextFoundException as e:
|
||||
self.log.warning(
|
||||
f"Encountered an error while running OCR: {e!s}. "
|
||||
f"Attempting force OCR to get the text.",
|
||||
|
||||
@@ -708,7 +708,6 @@ def null_app_config(mocker: MockerFixture) -> MagicMock:
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
skip_archive_file=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
|
||||
@@ -93,11 +93,13 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
"""
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.SKIP
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
self.assertTrue(params["skip_text"])
|
||||
# AUTO mode doesn't set skip_text in construct_ocrmypdf_parameters
|
||||
# The skip_text logic is handled in the parse method based on content detection
|
||||
self.assertNotIn("skip_text", params)
|
||||
self.assertNotIn("redo_ocr", params)
|
||||
self.assertNotIn("force_ocr", params)
|
||||
|
||||
|
||||
@@ -433,7 +433,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -449,7 +449,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "encrypted.pdf",
|
||||
"application/pdf",
|
||||
@@ -559,7 +559,7 @@ class TestParseMultiPage:
|
||||
@pytest.mark.parametrize(
|
||||
"mode",
|
||||
[
|
||||
pytest.param("skip", id="skip"),
|
||||
pytest.param("auto", id="auto"),
|
||||
pytest.param("redo", id="redo"),
|
||||
pytest.param("force", id="force"),
|
||||
],
|
||||
@@ -587,7 +587,7 @@ class TestParseMultiPage:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -722,29 +722,31 @@ class TestParseMultiPage:
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parse — skip_noarchive / skip_archive_file
|
||||
# Parse — OCR_MODE=auto / off and produce_archive parameter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSkipArchive:
|
||||
def test_skip_noarchive_with_text_layer(
|
||||
class TestOcrModeAndArchiveGeneration:
|
||||
def test_auto_mode_with_text_skips_archive(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
multi_page_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- Mode: skip_noarchive
|
||||
- File with existing text layer (born-digital PDF)
|
||||
- Mode: auto
|
||||
- produce_archive: False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; no archive created
|
||||
- Text extracted; no archive created; ocrmypdf skipped entirely
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
multi_page_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -752,24 +754,26 @@ class TestSkipArchive:
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_skip_noarchive_image_only_creates_archive(
|
||||
def test_auto_mode_with_text_produces_archive(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
multi_page_digital_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with image-only pages (no text layer)
|
||||
- Mode: skip_noarchive
|
||||
- File with existing text layer (born-digital PDF)
|
||||
- Mode: auto
|
||||
- produce_archive: True
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; archive created (OCR needed)
|
||||
- Text extracted; archive created with skip_text
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
multi_page_digital_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert_ordered_substrings(
|
||||
@@ -777,48 +781,137 @@ class TestSkipArchive:
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("skip_archive_file", "filename", "expect_archive"),
|
||||
[
|
||||
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
|
||||
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="with-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
"multi-page-images.pdf",
|
||||
True,
|
||||
id="with-text-no-layer",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="always-with-text",
|
||||
),
|
||||
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
|
||||
],
|
||||
)
|
||||
def test_skip_archive_file_setting(
|
||||
def test_auto_mode_image_produces_archive(
|
||||
self,
|
||||
skip_archive_file: str,
|
||||
filename: str,
|
||||
expect_archive: str,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
multi_page_images_pdf_file: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.skip_archive_file = skip_archive_file
|
||||
tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf")
|
||||
text = tesseract_parser.get_text().lower()
|
||||
assert_ordered_substrings(text, ["page 1", "page 2", "page 3"])
|
||||
if expect_archive:
|
||||
assert tesseract_parser.archive_path is not None
|
||||
else:
|
||||
assert tesseract_parser.archive_path is None
|
||||
"""
|
||||
GIVEN:
|
||||
- File with image-only pages (no text layer)
|
||||
- Mode: auto
|
||||
- produce_archive: True
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted via OCR; archive created
|
||||
"""
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
multi_page_images_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=True,
|
||||
)
|
||||
assert tesseract_parser.archive_path is not None
|
||||
assert_ordered_substrings(
|
||||
tesseract_parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
def test_off_mode_image_with_archive(
    self,
    tesseract_parser: RasterisedDocumentParser,
    simple_png_file: Path,
) -> None:
    """
    GIVEN:
        - Image file
        - Mode: off
        - produce_archive: True
    WHEN:
        - Document is parsed
    THEN:
        - Archive created; text content is set (it may or may not be empty)
    """
    tesseract_parser.settings.mode = "off"
    tesseract_parser.parse(
        simple_png_file,
        "image/png",
        produce_archive=True,
    )
    assert tesseract_parser.archive_path is not None
    # OCR mode is OFF, but archive creation with img2pdf+OCRmyPDF may still
    # produce some text, so the content itself is not pinned here.
    # Check the raw get_text() value: str.strip() never returns None, so
    # asserting on the stripped result could never fail.
    text = tesseract_parser.get_text()
    assert text is not None
|
||||
|
||||
def test_off_mode_image_without_archive(
    self,
    tesseract_parser: RasterisedDocumentParser,
    simple_png_file: Path,
) -> None:
    """
    GIVEN:
        - An image file
        - OCR mode set to "off"
        - produce_archive=False
    WHEN:
        - The parser processes the file
    THEN:
        - No archive path is set and the stripped text is empty
    """
    tesseract_parser.settings.mode = "off"
    tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False)

    assert tesseract_parser.archive_path is None

    # With OCR disabled nothing but whitespace may remain in the text.
    stripped = tesseract_parser.get_text().strip()
    assert stripped == ""
|
||||
|
||||
def test_off_mode_pdf_with_archive(
    self,
    tesseract_parser: RasterisedDocumentParser,
    multi_page_digital_pdf_file: Path,
) -> None:
    """
    GIVEN:
        - A PDF file
        - OCR mode set to "off"
        - produce_archive=True
    WHEN:
        - The parser processes the file
    THEN:
        - An archive path is set and the text of all three pages
          appears in order
    """
    tesseract_parser.settings.mode = "off"
    tesseract_parser.parse(
        multi_page_digital_pdf_file,
        "application/pdf",
        produce_archive=True,
    )

    assert tesseract_parser.archive_path is not None

    # Compare case-insensitively; page markers must appear in page order.
    extracted = tesseract_parser.get_text().lower()
    expected_pages = ["page 1", "page 2", "page 3"]
    assert_ordered_substrings(extracted, expected_pages)
|
||||
|
||||
def test_off_mode_pdf_without_archive(
    self,
    tesseract_parser: RasterisedDocumentParser,
    multi_page_digital_pdf_file: Path,
) -> None:
    """
    GIVEN:
        - A PDF file
        - OCR mode set to "off"
        - produce_archive=False
    WHEN:
        - The parser processes the file
    THEN:
        - No archive path is set and the text of all three pages
          appears in order
    """
    tesseract_parser.settings.mode = "off"
    tesseract_parser.parse(
        multi_page_digital_pdf_file,
        "application/pdf",
        produce_archive=False,
    )

    assert tesseract_parser.archive_path is None

    # Compare case-insensitively; page markers must appear in page order.
    extracted = tesseract_parser.get_text().lower()
    expected_pages = ["page 1", "page 2", "page 3"]
    assert_ordered_substrings(extracted, expected_pages)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -835,13 +928,13 @@ class TestParseMixed:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text in some pages (image) and some pages (digital)
|
||||
- Mode: skip
|
||||
- Mode: auto
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- All pages extracted; archive created; sidecar notes skipped pages
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
@@ -891,24 +984,26 @@ class TestParseMixed:
|
||||
not in sidecar
|
||||
)
|
||||
|
||||
def test_multi_page_mixed_skip_noarchive(
|
||||
def test_multi_page_mixed_auto_mode_without_archive(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
multi_page_mixed_pdf_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with mixed pages
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto
|
||||
- produce_archive: False
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- No archive created (file has text layer); later-page text present
|
||||
- No archive created; text from existing digital pages extracted
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
multi_page_mixed_pdf_file,
|
||||
"application/pdf",
|
||||
produce_archive=False,
|
||||
)
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -928,7 +1023,7 @@ class TestParseRotate:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.rotate = True
|
||||
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
|
||||
assert_ordered_substrings(
|
||||
|
||||
@@ -130,16 +130,10 @@ class TestOcrSettingsChecks:
|
||||
id="invalid-mode",
|
||||
),
|
||||
pytest.param(
|
||||
"OCR_MODE",
|
||||
"skip_noarchive",
|
||||
"deprecated",
|
||||
id="deprecated-mode",
|
||||
),
|
||||
pytest.param(
|
||||
"OCR_SKIP_ARCHIVE_FILE",
|
||||
"ARCHIVE_FILE_GENERATION",
|
||||
"invalid",
|
||||
'OCR_SKIP_ARCHIVE_FILE setting "invalid"',
|
||||
id="invalid-skip-archive-file",
|
||||
'ARCHIVE_FILE_GENERATION setting "invalid"',
|
||||
id="invalid-archive-file-generation",
|
||||
),
|
||||
pytest.param(
|
||||
"OCR_CLEAN",
|
||||
|
||||
Reference in New Issue
Block a user