From c232d443fa5d377e5aa6809b9dd6052c62734de6 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 6 Apr 2026 15:50:21 -0700 Subject: [PATCH 1/3] Breaking: Decouple OCR control from archive file control (#12448) Co-authored-by: Claude Sonnet 4.6 Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> --- .gitignore | 1 + docs/configuration.md | 68 ++- docs/migration-v3.md | 63 ++- docs/setup.md | 7 +- docs/usage.md | 6 +- src-ui/src/app/data/paperless-config.ts | 18 +- src/documents/consumer.py | 88 +++- src/documents/tasks.py | 12 +- src/documents/tests/test_api_app_config.py | 2 +- src/documents/tests/test_barcodes.py | 2 +- src/documents/tests/test_consumer.py | 41 +- src/documents/tests/test_consumer_archive.py | 189 ++++++++ src/documents/tests/test_management.py | 5 +- src/documents/tests/test_tasks.py | 1 + src/paperless/checks.py | 69 ++- src/paperless/config.py | 21 +- .../0008_replace_skip_archive_file.py | 90 ++++ src/paperless/models.py | 20 +- src/paperless/parsers/tesseract.py | 306 ++++++++---- src/paperless/parsers/utils.py | 94 +++- src/paperless/settings/__init__.py | 19 +- src/paperless/tests/parsers/conftest.py | 2 +- .../parsers/test_convert_image_to_pdfa.py | 141 ++++++ .../tests/parsers/test_parse_modes.py | 440 ++++++++++++++++++ .../parsers/test_tesseract_custom_settings.py | 26 +- .../tests/parsers/test_tesseract_parser.py | 183 ++++++-- src/paperless/tests/test_checks.py | 8 +- src/paperless/tests/test_checks_v3.py | 64 +++ ...est_migration_replace_skip_archive_file.py | 89 ++++ src/paperless/tests/test_ocr_config.py | 66 +++ src/paperless/tests/test_parser_utils.py | 25 + 31 files changed, 1933 insertions(+), 233 deletions(-) create mode 100644 src/documents/tests/test_consumer_archive.py create mode 100644 src/paperless/migrations/0008_replace_skip_archive_file.py create mode 100644 src/paperless/tests/parsers/test_convert_image_to_pdfa.py create mode 100644 
src/paperless/tests/parsers/test_parse_modes.py create mode 100644 src/paperless/tests/test_checks_v3.py create mode 100644 src/paperless/tests/test_migration_replace_skip_archive_file.py create mode 100644 src/paperless/tests/test_ocr_config.py create mode 100644 src/paperless/tests/test_parser_utils.py diff --git a/.gitignore b/.gitignore index c607f922d..da74df3cf 100644 --- a/.gitignore +++ b/.gitignore @@ -111,3 +111,4 @@ celerybeat-schedule* # ignore pnpm package store folder created when setting up the devcontainer .pnpm-store/ +.worktrees diff --git a/docs/configuration.md b/docs/configuration.md index 7156b3553..f8927a575 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -821,11 +821,14 @@ parsing documents. #### [`PAPERLESS_OCR_MODE=`](#PAPERLESS_OCR_MODE) {#PAPERLESS_OCR_MODE} -: Tell paperless when and how to perform ocr on your documents. Three +: Tell paperless when and how to perform ocr on your documents. Four modes are available: - - `skip`: Paperless skips all pages and will perform ocr only on - pages where no text is present. This is the safest option. + - `auto` (default): Paperless detects whether a document already + has embedded text via pdftotext. If sufficient text is found, + OCR is skipped for that document (`--skip-text`). If no text is + present, OCR runs normally. This is the safest option for mixed + document collections. - `redo`: Paperless will OCR all pages of your documents and attempt to replace any existing text layers with new text. This @@ -843,24 +846,59 @@ modes are available: significantly larger and text won't appear as sharp when zoomed in. - The default is `skip`, which only performs OCR when necessary and - always creates archived documents. + - `off`: Paperless never invokes the OCR engine. For PDFs, text + is extracted via pdftotext only. For image documents, text will + be empty. Archive file generation still works via format + conversion (no Tesseract or Ghostscript required). 
- Read more about this in the [OCRmyPDF + The default is `auto`. + + For the `auto`, `redo`, and `force` modes, read more about OCR + behaviour in the [OCRmyPDF documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped). -#### [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=`](#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) {#PAPERLESS_OCR_SKIP_ARCHIVE_FILE} +#### [`PAPERLESS_ARCHIVE_FILE_GENERATION=`](#PAPERLESS_ARCHIVE_FILE_GENERATION) {#PAPERLESS_ARCHIVE_FILE_GENERATION} -: Specify when you would like paperless to skip creating an archived -version of your documents. This is useful if you don't want to have two -almost-identical versions of your documents in the media folder. +: Controls when paperless creates a PDF/A archive version of your +documents. Archive files are stored alongside the original and are used +for display in the web interface. - - `never`: Never skip creating an archived version. - - `with_text`: Skip creating an archived version for documents - that already have embedded text. - - `always`: Always skip creating an archived version. + - `auto` (default): Produce archives for scanned or image-based + documents. Skip archive generation for born-digital PDFs that + already contain embedded text. This is the recommended setting + for mixed document collections. + - `always`: Always produce a PDF/A archive when the parser + supports it, regardless of whether the document already has + text. + - `never`: Never produce an archive. Only the original file is + stored. Saves disk space but the web viewer will display the + original file directly. - The default is `never`. 
+ **Behaviour by file type and mode** (`auto` column shows the default): + + | Document type | `never` | `auto` (default) | `always` | + | -------------------------- | ------- | -------------------------- | -------- | + | Scanned image (TIFF, JPEG) | No | **Yes** | Yes | + | Image-based PDF | No | **Yes** (short/no text, untagged) | Yes | + | Born-digital PDF | No | No (tagged or has embedded text) | Yes | + | Plain text, email, HTML | No | No | No | + | DOCX / ODT (via Tika) | Yes\* | Yes\* | Yes\* | + + \* Tika always produces a PDF rendition for display; this counts as + the archive regardless of the setting. + + !!! note + + This setting applies to the built-in Tesseract parser. Parsers + that must always convert documents to PDF for display (e.g. DOCX, + ODT via Tika) will produce a PDF regardless of this setting. + + !!! note + + The **remote OCR parser** (Azure AI) always produces a searchable + PDF and stores it as the archive copy, regardless of this setting. + `ARCHIVE_FILE_GENERATION=never` has no effect when the remote + parser handles a document. #### [`PAPERLESS_OCR_CLEAN=`](#PAPERLESS_OCR_CLEAN) {#PAPERLESS_OCR_CLEAN} diff --git a/docs/migration-v3.md b/docs/migration-v3.md index afbc83186..b428defeb 100644 --- a/docs/migration-v3.md +++ b/docs/migration-v3.md @@ -123,7 +123,68 @@ Multiple options are combined in a single value: PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10" ``` -## Search Index (Whoosh -> Tantivy) +## OCR and Archive File Generation Settings + +The settings that control OCR behaviour and archive file generation have been redesigned. The old settings that coupled these two concerns together are **removed** — old values are not silently honoured; a startup warning is logged if any removed variable is still set in your environment. 
+ +### Removed settings + +| Removed Setting | Replacement | +| ------------------------------------------- | --------------------------------------------------------------------- | +| `PAPERLESS_OCR_MODE=skip` | `PAPERLESS_OCR_MODE=auto` (new default) | +| `PAPERLESS_OCR_MODE=skip_noarchive` | `PAPERLESS_OCR_MODE=auto` + `PAPERLESS_ARCHIVE_FILE_GENERATION=never` | +| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never` | `PAPERLESS_ARCHIVE_FILE_GENERATION=always` | +| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text` | `PAPERLESS_ARCHIVE_FILE_GENERATION=auto` (new default) | +| `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always` | `PAPERLESS_ARCHIVE_FILE_GENERATION=never` | + +### What changed and why + +Previously, `OCR_MODE` conflated two independent concerns: whether to run OCR and whether to produce an archive. `skip` meant "skip OCR if text exists, but always produce an archive". `skip_noarchive` meant "skip OCR if text exists, and also skip the archive". This made it impossible to, for example, disable OCR entirely while still producing archives. + +The new settings are independent: + +- [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) controls OCR: `auto` (default), `force`, `redo`, `off`. +- [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) controls archive production: `auto` (default), `always`, `never`. + +### Database configuration + +If you changed OCR settings via the admin UI (ApplicationConfiguration), the database values are **migrated automatically** during the upgrade. `mode` values (`skip` / `skip_noarchive`) are mapped to their new equivalents and `skip_archive_file` values are converted to the new `archive_file_generation` field. After upgrading, review the OCR settings in the admin UI to confirm the migrated values match your intent. + +### Action Required + +Remove any `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` variable from your environment. 
If you relied on `OCR_MODE=skip` or `OCR_MODE=skip_noarchive`, update accordingly: + +```bash +# v2: skip OCR when text present, always archive +PAPERLESS_OCR_MODE=skip +# v3: equivalent (auto is the new default) +# No change needed — auto is the default + +# v2: skip OCR when text present, skip archive too +PAPERLESS_OCR_MODE=skip_noarchive +# v3: equivalent +PAPERLESS_OCR_MODE=auto +PAPERLESS_ARCHIVE_FILE_GENERATION=never + +# v2: always skip archive +PAPERLESS_OCR_SKIP_ARCHIVE_FILE=always +# v3: equivalent +PAPERLESS_ARCHIVE_FILE_GENERATION=never + +# v2: skip archive only for born-digital docs +PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text +# v3: equivalent (auto is the new default) +PAPERLESS_ARCHIVE_FILE_GENERATION=auto +``` + +### Remote OCR parser + +If you use the **remote OCR parser** (Azure AI), note that it always produces a +searchable PDF and stores it as the archive copy. `ARCHIVE_FILE_GENERATION=never` +has no effect for documents handled by the remote parser — the archive is produced +unconditionally by the remote engine. + +## Search Index (Whoosh -> Tantivy) The full-text search backend has been replaced with [Tantivy](https://github.com/quickwit-oss/tantivy). The index format is incompatible with Whoosh, so **the search index is automatically rebuilt from diff --git a/docs/setup.md b/docs/setup.md index 3b84fd729..5580dde92 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -633,12 +633,11 @@ hardware, but a few settings can improve performance: consumption, so you might want to lower these settings (example: 2 workers and 1 thread to always have some computing power left for other tasks). -- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `skip` and consider +- Keep [`PAPERLESS_OCR_MODE`](configuration.md#PAPERLESS_OCR_MODE) at its default value `auto` and consider OCRing your documents before feeding them into Paperless. Some scanners are able to do this! 
-- Set [`PAPERLESS_OCR_SKIP_ARCHIVE_FILE`](configuration.md#PAPERLESS_OCR_SKIP_ARCHIVE_FILE) to `with_text` to skip archive - file generation for already OCRed documents, or `always` to skip it - for all documents. +- Set [`PAPERLESS_ARCHIVE_FILE_GENERATION`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) to `never` to skip archive + file generation entirely, saving disk space at the cost of in-browser PDF/A viewing. - If you want to perform OCR on the device, consider using `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use less memory at the expense of slightly worse OCR results. diff --git a/docs/usage.md b/docs/usage.md index 4e2def93b..412900df9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -134,9 +134,9 @@ following operations on your documents: !!! tip This process can be configured to fit your needs. If you don't want - paperless to create archived versions for digital documents, you can - configure that by configuring - `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the + paperless to create archived versions for born-digital documents, set + [`PAPERLESS_ARCHIVE_FILE_GENERATION=auto`](configuration.md#PAPERLESS_ARCHIVE_FILE_GENERATION) + (the default). To skip archives entirely, use `never`. Please read the [relevant section in the documentation](configuration.md#ocr). !!! 
note diff --git a/src-ui/src/app/data/paperless-config.ts b/src-ui/src/app/data/paperless-config.ts index ce4faff81..38af21e3c 100644 --- a/src-ui/src/app/data/paperless-config.ts +++ b/src-ui/src/app/data/paperless-config.ts @@ -11,16 +11,16 @@ export enum OutputTypeConfig { } export enum ModeConfig { - SKIP = 'skip', - REDO = 'redo', + AUTO = 'auto', FORCE = 'force', - SKIP_NO_ARCHIVE = 'skip_noarchive', + REDO = 'redo', + OFF = 'off', } export enum ArchiveFileConfig { - NEVER = 'never', - WITH_TEXT = 'with_text', + AUTO = 'auto', ALWAYS = 'always', + NEVER = 'never', } export enum CleanConfig { @@ -115,11 +115,11 @@ export const PaperlessConfigOptions: ConfigOption[] = [ category: ConfigCategory.OCR, }, { - key: 'skip_archive_file', - title: $localize`Skip Archive File`, + key: 'archive_file_generation', + title: $localize`Archive File Generation`, type: ConfigOptionType.Select, choices: mapToItems(ArchiveFileConfig), - config_key: 'PAPERLESS_OCR_SKIP_ARCHIVE_FILE', + config_key: 'PAPERLESS_ARCHIVE_FILE_GENERATION', category: ConfigCategory.OCR, }, { @@ -337,7 +337,7 @@ export interface PaperlessConfig extends ObjectWithId { pages: number language: string mode: ModeConfig - skip_archive_file: ArchiveFileConfig + archive_file_generation: ArchiveFileConfig image_dpi: number unpaper_clean: CleanConfig deskew: boolean diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f68fa0685..8035f3857 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,4 +1,5 @@ import datetime +import logging import os import shutil import tempfile @@ -50,9 +51,14 @@ from documents.utils import compute_checksum from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.config import OcrConfig +from paperless.models import ArchiveFileGenerationChoices from paperless.parsers import ParserContext from paperless.parsers import ParserProtocol from 
paperless.parsers.registry import get_parser_registry +from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH +from paperless.parsers.utils import extract_pdf_text +from paperless.parsers.utils import is_tagged_pdf LOGGING_NAME: Final[str] = "paperless.consumer" @@ -105,6 +111,74 @@ class ConsumerStatusShortMessage(StrEnum): FAILED = "failed" +def should_produce_archive( + parser: "ParserProtocol", + mime_type: str, + document_path: Path, + log: logging.Logger | None = None, +) -> bool: + """Return True if a PDF/A archive should be produced for this document. + + IMPORTANT: *parser* must be an instantiated parser, not the class. + ``requires_pdf_rendition`` and ``can_produce_archive`` are instance + ``@property`` methods — accessing them on the class returns the descriptor + (always truthy). + """ + _log = log or logging.getLogger(LOGGING_NAME) + + # Must produce a PDF so the frontend can display the original format at all. + if parser.requires_pdf_rendition: + _log.debug("Archive: yes — parser requires PDF rendition for frontend display") + return True + + # Parser cannot produce an archive (e.g. TextDocumentParser). + if not parser.can_produce_archive: + _log.debug("Archive: no — parser cannot produce archives") + return False + + generation = OcrConfig().archive_file_generation + + if generation == ArchiveFileGenerationChoices.ALWAYS: + _log.debug("Archive: yes — ARCHIVE_FILE_GENERATION=always") + return True + if generation == ArchiveFileGenerationChoices.NEVER: + _log.debug("Archive: no — ARCHIVE_FILE_GENERATION=never") + return False + + # auto: produce archives for scanned/image documents; skip for born-digital PDFs. 
+ if mime_type.startswith("image/"): + _log.debug("Archive: yes — image document, ARCHIVE_FILE_GENERATION=auto") + return True + if mime_type == "application/pdf": + if is_tagged_pdf(document_path): + _log.debug( + "Archive: no — born-digital PDF (structure tags detected)," + " ARCHIVE_FILE_GENERATION=auto", + ) + return False + text = extract_pdf_text(document_path) + if text is None or len(text) <= PDF_TEXT_MIN_LENGTH: + _log.debug( + "Archive: yes — scanned PDF (text_length=%d ≤ %d)," + " ARCHIVE_FILE_GENERATION=auto", + len(text) if text else 0, + PDF_TEXT_MIN_LENGTH, + ) + return True + _log.debug( + "Archive: no — born-digital PDF (text_length=%d > %d)," + " ARCHIVE_FILE_GENERATION=auto", + len(text), + PDF_TEXT_MIN_LENGTH, + ) + return False + _log.debug( + "Archive: no — MIME type %r not eligible for auto archive generation", + mime_type, + ) + return False + + class ConsumerPluginMixin: if TYPE_CHECKING: from logging import Logger @@ -436,7 +510,17 @@ class ConsumerPlugin( ) self.log.debug(f"Parsing {self.filename}...") - document_parser.parse(self.working_copy, mime_type) + produce_archive = should_produce_archive( + document_parser, + mime_type, + self.working_copy, + self.log, + ) + document_parser.parse( + self.working_copy, + mime_type, + produce_archive=produce_archive, + ) self.log.debug(f"Generating thumbnail for {self.filename}...") self._send_progress( @@ -785,7 +869,7 @@ class ConsumerPlugin( return document - def apply_overrides(self, document) -> None: + def apply_overrides(self, document: Document) -> None: if self.metadata.correspondent_id: document.correspondent = Correspondent.objects.get( pk=self.metadata.correspondent_id, diff --git a/src/documents/tasks.py b/src/documents/tasks.py index c40b1ff3f..57c819492 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -30,6 +30,7 @@ from documents.consumer import AsnCheckPlugin from documents.consumer import ConsumerPlugin from documents.consumer import ConsumerPreflightPlugin 
from documents.consumer import WorkflowTriggerPlugin +from documents.consumer import should_produce_archive from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.double_sided import CollatePlugin @@ -311,7 +312,16 @@ def update_document_content_maybe_archive_file(document_id) -> None: parser.configure(ParserContext()) try: - parser.parse(document.source_path, mime_type) + produce_archive = should_produce_archive( + parser, + mime_type, + document.source_path, + ) + parser.parse( + document.source_path, + mime_type, + produce_archive=produce_archive, + ) thumbnail = parser.get_thumbnail(document.source_path, mime_type) diff --git a/src/documents/tests/test_api_app_config.py b/src/documents/tests/test_api_app_config.py index b946bc1bf..e7b599cbb 100644 --- a/src/documents/tests/test_api_app_config.py +++ b/src/documents/tests/test_api_app_config.py @@ -46,7 +46,7 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase): "pages": None, "language": None, "mode": None, - "skip_archive_file": None, + "archive_file_generation": None, "image_dpi": None, "unpaper_clean": None, "deskew": None, diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index b1847f2b4..4d8da62a3 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -1020,7 +1020,7 @@ class TestTagBarcode(DirectoriesMixin, SampleDirMixin, GetReaderPluginMixin, Tes CONSUMER_TAG_BARCODE_SPLIT=True, CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"}, CELERY_TASK_ALWAYS_EAGER=True, - OCR_MODE="skip", + OCR_MODE="auto", ) def test_consume_barcode_file_tag_split_and_assignment(self) -> None: """ diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 165b27e0b..0ff415a5f 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -230,7 +230,11 @@ class TestConsumer( shutil.copy(src, dst) return 
dst - @override_settings(FILENAME_FORMAT=None, TIME_ZONE="America/Chicago") + @override_settings( + FILENAME_FORMAT=None, + TIME_ZONE="America/Chicago", + ARCHIVE_FILE_GENERATION="always", + ) def testNormalOperation(self) -> None: filename = self.get_test_file() @@ -629,7 +633,10 @@ class TestConsumer( # Database empty self.assertEqual(Document.objects.all().count(), 0) - @override_settings(FILENAME_FORMAT="{correspondent}/{title}") + @override_settings( + FILENAME_FORMAT="{correspondent}/{title}", + ARCHIVE_FILE_GENERATION="always", + ) def testFilenameHandling(self) -> None: with self.get_consumer( self.get_test_file(), @@ -646,7 +653,7 @@ class TestConsumer( self._assert_first_last_send_progress() @mock.patch("documents.consumer.generate_unique_filename") - @override_settings(FILENAME_FORMAT="{pk}") + @override_settings(FILENAME_FORMAT="{pk}", ARCHIVE_FILE_GENERATION="always") def testFilenameHandlingFallsBackWhenGeneratedPathExceedsDbLimit(self, m): m.side_effect = lambda doc, archive_filename=False: Path( ("a" * 1100 + ".pdf") if not archive_filename else ("b" * 1100 + ".pdf"), @@ -673,7 +680,10 @@ class TestConsumer( self._assert_first_last_send_progress() - @override_settings(FILENAME_FORMAT="{correspondent}/{title}") + @override_settings( + FILENAME_FORMAT="{correspondent}/{title}", + ARCHIVE_FILE_GENERATION="always", + ) @mock.patch("documents.signals.handlers.generate_unique_filename") def testFilenameHandlingUnstableFormat(self, m) -> None: filenames = ["this", "that", "now this", "i cannot decide"] @@ -1021,7 +1031,7 @@ class TestConsumer( self.assertEqual(Document.objects.count(), 2) self._assert_first_last_send_progress() - @override_settings(FILENAME_FORMAT="{title}") + @override_settings(FILENAME_FORMAT="{title}", ARCHIVE_FILE_GENERATION="always") @mock.patch("documents.consumer.get_parser_registry") def test_similar_filenames(self, m) -> None: shutil.copy( @@ -1132,6 +1142,7 @@ class TestConsumer( mock_mail_parser_parse.assert_called_once_with( 
consumer.working_copy, "message/rfc822", + produce_archive=True, ) @@ -1279,7 +1290,14 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase): def test_no_pre_consume_script(self, m) -> None: with self.get_consumer(self.test_file) as c: c.run() - m.assert_not_called() + # Verify no pre-consume script subprocess was invoked + # (run_subprocess may still be called by _extract_text_for_archive_check) + script_calls = [ + call + for call in m.call_args_list + if call.args and call.args[0] and call.args[0][0] not in ("pdftotext",) + ] + self.assertEqual(script_calls, []) @mock.patch("documents.consumer.run_subprocess") @override_settings(PRE_CONSUME_SCRIPT="does-not-exist") @@ -1295,9 +1313,16 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase): with self.get_consumer(self.test_file) as c: c.run() - m.assert_called_once() + self.assertTrue(m.called) - args, _ = m.call_args + # Find the call that invoked the pre-consume script + # (run_subprocess may also be called by _extract_text_for_archive_check) + script_call = next( + call + for call in m.call_args_list + if call.args and call.args[0] and call.args[0][0] == script.name + ) + args, _ = script_call command = args[0] environment = args[1] diff --git a/src/documents/tests/test_consumer_archive.py b/src/documents/tests/test_consumer_archive.py new file mode 100644 index 000000000..265bd7bc6 --- /dev/null +++ b/src/documents/tests/test_consumer_archive.py @@ -0,0 +1,189 @@ +"""Tests for should_produce_archive().""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING +from unittest.mock import MagicMock + +import pytest + +from documents.consumer import should_produce_archive + +if TYPE_CHECKING: + from pytest_mock import MockerFixture + + +def _parser_instance( + *, + can_produce: bool = True, + requires_rendition: bool = False, +) -> MagicMock: + """Return a mock parser instance with the given capability flags.""" + instance = 
MagicMock() + instance.can_produce_archive = can_produce + instance.requires_pdf_rendition = requires_rendition + return instance + + +@pytest.fixture() +def null_app_config(mocker) -> MagicMock: + """Mock ApplicationConfiguration with all fields None → falls back to Django settings.""" + return mocker.MagicMock( + output_type=None, + pages=None, + language=None, + mode=None, + archive_file_generation=None, + image_dpi=None, + unpaper_clean=None, + deskew=None, + rotate_pages=None, + rotate_pages_threshold=None, + max_image_pixels=None, + color_conversion_strategy=None, + user_args=None, + ) + + +@pytest.fixture(autouse=True) +def patch_app_config(mocker, null_app_config): + """Patch BaseConfig._get_config_instance for all tests in this module.""" + mocker.patch( + "paperless.config.BaseConfig._get_config_instance", + return_value=null_app_config, + ) + + +class TestShouldProduceArchive: + @pytest.mark.parametrize( + ("generation", "can_produce", "requires_rendition", "mime", "expected"), + [ + pytest.param( + "never", + True, + False, + "application/pdf", + False, + id="never-returns-false", + ), + pytest.param( + "always", + True, + False, + "application/pdf", + True, + id="always-returns-true", + ), + pytest.param( + "never", + True, + True, + "application/pdf", + True, + id="requires-rendition-overrides-never", + ), + pytest.param( + "always", + False, + False, + "text/plain", + False, + id="cannot-produce-overrides-always", + ), + pytest.param( + "always", + False, + True, + "application/pdf", + True, + id="requires-rendition-wins-even-if-cannot-produce", + ), + pytest.param( + "auto", + True, + False, + "image/tiff", + True, + id="auto-image-returns-true", + ), + pytest.param( + "auto", + True, + False, + "message/rfc822", + False, + id="auto-non-pdf-non-image-returns-false", + ), + ], + ) + def test_generation_setting( + self, + settings, + generation: str, + can_produce: bool, # noqa: FBT001 + requires_rendition: bool, # noqa: FBT001 + mime: str, + 
expected: bool, # noqa: FBT001 + ) -> None: + settings.ARCHIVE_FILE_GENERATION = generation + parser = _parser_instance( + can_produce=can_produce, + requires_rendition=requires_rendition, + ) + assert should_produce_archive(parser, mime, Path("/tmp/doc")) is expected + + @pytest.mark.parametrize( + ("extracted_text", "expected"), + [ + pytest.param( + "This is a born-digital PDF with lots of text content. " * 10, + False, + id="born-digital-long-text-skips-archive", + ), + pytest.param(None, True, id="no-text-scanned-produces-archive"), + pytest.param("tiny", True, id="short-text-treated-as-scanned"), + ], + ) + def test_auto_pdf_archive_decision( + self, + mocker: MockerFixture, + settings, + extracted_text: str | None, + expected: bool, # noqa: FBT001 + ) -> None: + settings.ARCHIVE_FILE_GENERATION = "auto" + mocker.patch("documents.consumer.is_tagged_pdf", return_value=False) + mocker.patch("documents.consumer.extract_pdf_text", return_value=extracted_text) + parser = _parser_instance(can_produce=True, requires_rendition=False) + assert ( + should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf")) + is expected + ) + + def test_tagged_pdf_skips_archive_in_auto_mode( + self, + mocker: MockerFixture, + settings, + ) -> None: + """Tagged PDFs (e.g. 
Word exports) are treated as born-digital regardless of text length.""" + settings.ARCHIVE_FILE_GENERATION = "auto" + mocker.patch("documents.consumer.is_tagged_pdf", return_value=True) + parser = _parser_instance(can_produce=True, requires_rendition=False) + assert ( + should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf")) + is False + ) + + def test_tagged_pdf_does_not_call_pdftotext( + self, + mocker: MockerFixture, + settings, + ) -> None: + """When a PDF is tagged, pdftotext is not invoked (fast path).""" + settings.ARCHIVE_FILE_GENERATION = "auto" + mocker.patch("documents.consumer.is_tagged_pdf", return_value=True) + mock_extract = mocker.patch("documents.consumer.extract_pdf_text") + parser = _parser_instance(can_produce=True, requires_rendition=False) + should_produce_archive(parser, "application/pdf", Path("/tmp/doc.pdf")) + mock_extract.assert_not_called() diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index 6ea4431fd..72476d403 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -27,7 +27,10 @@ sample_file: Path = Path(__file__).parent / "samples" / "simple.pdf" @pytest.mark.management -@override_settings(FILENAME_FORMAT="{correspondent}/{title}") +@override_settings( + FILENAME_FORMAT="{correspondent}/{title}", + ARCHIVE_FILE_GENERATION="always", +) class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def make_models(self): return Document.objects.create( diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index 9fb9ddbc6..4502423b3 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -213,6 +213,7 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertEqual(Document.global_objects.count(), 0) +@override_settings(ARCHIVE_FILE_GENERATION="always") class TestUpdateContent(DirectoriesMixin, TestCase): def 
test_update_content_maybe_archive_file(self) -> None: """ diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 5f069b547..fbcae320a 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -5,6 +5,7 @@ import shutil import stat import subprocess from pathlib import Path +from typing import Any from django.conf import settings from django.core.checks import Error @@ -22,7 +23,7 @@ writeable_hint = ( ) -def path_check(var, directory: Path) -> list[Error]: +def path_check(var: str, directory: Path) -> list[Error]: messages: list[Error] = [] if directory: if not directory.is_dir(): @@ -59,7 +60,7 @@ def path_check(var, directory: Path) -> list[Error]: @register() -def paths_check(app_configs, **kwargs) -> list[Error]: +def paths_check(app_configs: Any, **kwargs: Any) -> list[Error]: """ Check the various paths for existence, readability and writeability """ @@ -73,7 +74,7 @@ def paths_check(app_configs, **kwargs) -> list[Error]: @register() -def binaries_check(app_configs, **kwargs): +def binaries_check(app_configs: Any, **kwargs: Any) -> list[Error]: """ Paperless requires the existence of a few binaries, so we do some checks for those here. 
@@ -93,7 +94,7 @@ def binaries_check(app_configs, **kwargs): @register() -def debug_mode_check(app_configs, **kwargs): +def debug_mode_check(app_configs: Any, **kwargs: Any) -> list[Warning]: if settings.DEBUG: return [ Warning( @@ -109,7 +110,7 @@ def debug_mode_check(app_configs, **kwargs): @register() -def settings_values_check(app_configs, **kwargs): +def settings_values_check(app_configs: Any, **kwargs: Any) -> list[Error | Warning]: """ Validates at least some of the user provided settings """ @@ -132,23 +133,14 @@ def settings_values_check(app_configs, **kwargs): Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), ) - if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: + if settings.OCR_MODE not in {"auto", "force", "redo", "off"}: msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) - if settings.OCR_MODE == "skip_noarchive": - msgs.append( - Warning( - 'OCR output mode "skip_noarchive" is deprecated and will be ' - "removed in a future version. 
Please use " - "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.", - ), - ) - - if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}: + if settings.ARCHIVE_FILE_GENERATION not in {"auto", "always", "never"}: msgs.append( Error( - "OCR_SKIP_ARCHIVE_FILE setting " - f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid', + "PAPERLESS_ARCHIVE_FILE_GENERATION setting " + f'"{settings.ARCHIVE_FILE_GENERATION}" is not valid', ), ) @@ -191,7 +183,7 @@ def settings_values_check(app_configs, **kwargs): @register() -def audit_log_check(app_configs, **kwargs): +def audit_log_check(app_configs: Any, **kwargs: Any) -> list[Error]: db_conn = connections["default"] all_tables = db_conn.introspection.table_names() result = [] @@ -303,7 +295,42 @@ def check_deprecated_db_settings( @register() -def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]: +def check_deprecated_v2_ocr_env_vars( + app_configs: object, + **kwargs: object, +) -> list[Warning]: + """Warn when deprecated v2 OCR environment variables are set. + + Users upgrading from v2 may still have these in their environment or + config files, where they are now silently ignored. + """ + warnings: list[Warning] = [] + + if os.environ.get("PAPERLESS_OCR_SKIP_ARCHIVE_FILE"): + warnings.append( + Warning( + "PAPERLESS_OCR_SKIP_ARCHIVE_FILE is set but has no effect. " + "Use PAPERLESS_ARCHIVE_FILE_GENERATION=never/always/auto instead.", + id="paperless.W002", + ), + ) + + ocr_mode = os.environ.get("PAPERLESS_OCR_MODE", "") + if ocr_mode in {"skip", "skip_noarchive"}: + warnings.append( + Warning( + f"PAPERLESS_OCR_MODE={ocr_mode!r} is not a valid value. 
" + f"Use PAPERLESS_OCR_MODE=auto (and PAPERLESS_ARCHIVE_FILE_GENERATION=never " + f"if you used skip_noarchive) instead.", + id="paperless.W003", + ), + ) + + return warnings + + +@register() +def check_remote_parser_configured(app_configs: Any, **kwargs: Any) -> list[Error]: if settings.REMOTE_OCR_ENGINE == "azureai" and not ( settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY ): @@ -329,7 +356,7 @@ def get_tesseract_langs(): @register() -def check_default_language_available(app_configs, **kwargs): +def check_default_language_available(app_configs: Any, **kwargs: Any) -> list[Error]: errs = [] if not settings.OCR_LANGUAGE: diff --git a/src/paperless/config.py b/src/paperless/config.py index fd2c1620d..8363cfb1f 100644 --- a/src/paperless/config.py +++ b/src/paperless/config.py @@ -4,6 +4,11 @@ import json from django.conf import settings from paperless.models import ApplicationConfiguration +from paperless.models import ArchiveFileGenerationChoices +from paperless.models import CleanChoices +from paperless.models import ColorConvertChoices +from paperless.models import ModeChoices +from paperless.models import OutputTypeChoices @dataclasses.dataclass @@ -28,7 +33,7 @@ class OutputTypeConfig(BaseConfig): Almost all parsers care about the chosen PDF output format """ - output_type: str = dataclasses.field(init=False) + output_type: OutputTypeChoices = dataclasses.field(init=False) def __post_init__(self) -> None: app_config = self._get_config_instance() @@ -45,15 +50,17 @@ class OcrConfig(OutputTypeConfig): pages: int | None = dataclasses.field(init=False) language: str = dataclasses.field(init=False) - mode: str = dataclasses.field(init=False) - skip_archive_file: str = dataclasses.field(init=False) + mode: ModeChoices = dataclasses.field(init=False) + archive_file_generation: ArchiveFileGenerationChoices = dataclasses.field( + init=False, + ) image_dpi: int | None = dataclasses.field(init=False) - clean: str = dataclasses.field(init=False) + clean: 
CleanChoices = dataclasses.field(init=False) deskew: bool = dataclasses.field(init=False) rotate: bool = dataclasses.field(init=False) rotate_threshold: float = dataclasses.field(init=False) max_image_pixel: float | None = dataclasses.field(init=False) - color_conversion_strategy: str = dataclasses.field(init=False) + color_conversion_strategy: ColorConvertChoices = dataclasses.field(init=False) user_args: dict[str, str] | None = dataclasses.field(init=False) def __post_init__(self) -> None: @@ -64,8 +71,8 @@ class OcrConfig(OutputTypeConfig): self.pages = app_config.pages or settings.OCR_PAGES self.language = app_config.language or settings.OCR_LANGUAGE self.mode = app_config.mode or settings.OCR_MODE - self.skip_archive_file = ( - app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE + self.archive_file_generation = ( + app_config.archive_file_generation or settings.ARCHIVE_FILE_GENERATION ) self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI self.clean = app_config.unpaper_clean or settings.OCR_CLEAN diff --git a/src/paperless/migrations/0008_replace_skip_archive_file.py b/src/paperless/migrations/0008_replace_skip_archive_file.py new file mode 100644 index 000000000..b6c4fbbee --- /dev/null +++ b/src/paperless/migrations/0008_replace_skip_archive_file.py @@ -0,0 +1,90 @@ +# Generated by Django 5.2.12 on 2026-03-26 20:31 + +from django.db import migrations +from django.db import models + +_MODE_MAP = { + "skip": "auto", + "redo": "redo", + "force": "force", + "skip_noarchive": "auto", +} + +_ARCHIVE_MAP = { + # never skip -> always generate + "never": "always", + # skip when text present -> auto + "with_text": "auto", + # always skip -> never generate + "always": "never", +} + + +def migrate_old_values(apps, schema_editor): + ApplicationConfiguration = apps.get_model("paperless", "ApplicationConfiguration") + for config in ApplicationConfiguration.objects.all(): + old_mode = config.mode + old_skip = config.skip_archive_file + + # Map the old 
mode value + if old_mode in _MODE_MAP: + config.mode = _MODE_MAP[old_mode] + + # Map skip_archive_file -> archive_file_generation + if old_skip in _ARCHIVE_MAP: + config.archive_file_generation = _ARCHIVE_MAP[old_skip] + + # skip_noarchive implied no archive file; set that if the user + # didn't already have an explicit skip_archive_file preference + if old_mode == "skip_noarchive" and old_skip is None: + config.archive_file_generation = "never" + + config.save() + + +class Migration(migrations.Migration): + dependencies = [ + ("paperless", "0007_optimize_integer_field_sizes"), + ] + + operations = [ + # 1. Update mode choices in-place (old values still in the column) + migrations.AlterField( + model_name="applicationconfiguration", + name="mode", + field=models.CharField( + blank=True, + choices=[ + ("auto", "auto"), + ("force", "force"), + ("redo", "redo"), + ("off", "off"), + ], + max_length=16, + null=True, + verbose_name="Sets the OCR mode", + ), + ), + # 2. Add the new field + migrations.AddField( + model_name="applicationconfiguration", + name="archive_file_generation", + field=models.CharField( + blank=True, + choices=[("auto", "auto"), ("always", "always"), ("never", "never")], + max_length=8, + null=True, + verbose_name="Controls archive file generation", + ), + ), + # 3. Migrate data from old values to new + migrations.RunPython( + migrate_old_values, + migrations.RunPython.noop, + ), + # 4. 
Drop the old field + migrations.RemoveField( + model_name="applicationconfiguration", + name="skip_archive_file", + ), + ] diff --git a/src/paperless/models.py b/src/paperless/models.py index c67f16b03..192e429d4 100644 --- a/src/paperless/models.py +++ b/src/paperless/models.py @@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices): and our own custom setting """ - SKIP = ("skip", _("skip")) - REDO = ("redo", _("redo")) + AUTO = ("auto", _("auto")) FORCE = ("force", _("force")) - SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive")) + REDO = ("redo", _("redo")) + OFF = ("off", _("off")) -class ArchiveFileChoices(models.TextChoices): +class ArchiveFileGenerationChoices(models.TextChoices): """ Settings to control creation of an archive PDF file """ - NEVER = ("never", _("never")) - WITH_TEXT = ("with_text", _("with_text")) + AUTO = ("auto", _("auto")) ALWAYS = ("always", _("always")) + NEVER = ("never", _("never")) class CleanChoices(models.TextChoices): @@ -126,12 +126,12 @@ class ApplicationConfiguration(AbstractSingletonModel): choices=ModeChoices.choices, ) - skip_archive_file = models.CharField( - verbose_name=_("Controls the generation of an archive file"), + archive_file_generation = models.CharField( + verbose_name=_("Controls archive file generation"), null=True, blank=True, - max_length=16, - choices=ArchiveFileChoices.choices, + max_length=8, + choices=ArchiveFileGenerationChoices.choices, ) image_dpi = models.PositiveSmallIntegerField( diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index 99cff36aa..e19922dd3 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib.resources import logging import os import re @@ -8,6 +9,8 @@ import tempfile from pathlib import Path from typing import TYPE_CHECKING from typing import Any +from typing import Final +from typing import NoReturn from typing import Self from 
django.conf import settings @@ -15,12 +18,16 @@ from PIL import Image from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf +from documents.utils import copy_file_with_basic_stats from documents.utils import maybe_override_pixel_limit from documents.utils import run_subprocess from paperless.config import OcrConfig -from paperless.models import ArchiveFileChoices from paperless.models import CleanChoices from paperless.models import ModeChoices +from paperless.models import OutputTypeChoices +from paperless.parsers.utils import PDF_TEXT_MIN_LENGTH +from paperless.parsers.utils import extract_pdf_text +from paperless.parsers.utils import is_tagged_pdf from paperless.parsers.utils import read_file_handle_unicode_errors from paperless.version import __full_version_str__ @@ -33,7 +40,11 @@ if TYPE_CHECKING: logger = logging.getLogger("paperless.parsing.tesseract") -_SUPPORTED_MIME_TYPES: dict[str, str] = { +_SRGB_ICC_DATA: Final[bytes] = ( + importlib.resources.files("ocrmypdf.data").joinpath("sRGB.icc").read_bytes() +) + +_SUPPORTED_MIME_TYPES: Final[dict[str, str]] = { "application/pdf": ".pdf", "image/jpeg": ".jpg", "image/png": ".png", @@ -99,7 +110,7 @@ class RasterisedDocumentParser: # Lifecycle # ------------------------------------------------------------------ - def __init__(self, logging_group: object = None) -> None: + def __init__(self, logging_group: object | None = None) -> None: settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) self.tempdir = Path( tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR), @@ -233,7 +244,7 @@ class RasterisedDocumentParser: if ( sidecar_file is not None and sidecar_file.is_file() - and self.settings.mode != "redo" + and self.settings.mode != ModeChoices.REDO ): text = read_file_handle_unicode_errors(sidecar_file) @@ -250,36 +261,7 @@ class RasterisedDocumentParser: if not Path(pdf_file).is_file(): return None - try: - text = None - with tempfile.NamedTemporaryFile( - 
mode="w+", - dir=self.tempdir, - ) as tmp: - run_subprocess( - [ - "pdftotext", - "-q", - "-layout", - "-enc", - "UTF-8", - str(pdf_file), - tmp.name, - ], - logger=self.log, - ) - text = read_file_handle_unicode_errors(Path(tmp.name)) - - return post_process_text(text) - - except Exception: - # If pdftotext fails, fall back to OCR. - self.log.warning( - "Error while getting text from PDF document with pdftotext", - exc_info=True, - ) - # probably not a PDF file. - return None + return post_process_text(extract_pdf_text(Path(pdf_file), log=self.log)) def construct_ocrmypdf_parameters( self, @@ -289,6 +271,7 @@ class RasterisedDocumentParser: sidecar_file: Path, *, safe_fallback: bool = False, + skip_text: bool = False, ) -> dict[str, Any]: ocrmypdf_args: dict[str, Any] = { "input_file_or_options": input_file, @@ -307,15 +290,14 @@ class RasterisedDocumentParser: self.settings.color_conversion_strategy ) - if self.settings.mode == ModeChoices.FORCE or safe_fallback: + if safe_fallback or self.settings.mode == ModeChoices.FORCE: ocrmypdf_args["force_ocr"] = True - elif self.settings.mode in { - ModeChoices.SKIP, - ModeChoices.SKIP_NO_ARCHIVE, - }: - ocrmypdf_args["skip_text"] = True elif self.settings.mode == ModeChoices.REDO: ocrmypdf_args["redo_ocr"] = True + elif skip_text or self.settings.mode == ModeChoices.OFF: + ocrmypdf_args["skip_text"] = True + elif self.settings.mode == ModeChoices.AUTO: + pass # no extra flag: normal OCR (text not found case) else: # pragma: no cover raise ParseError(f"Invalid ocr mode: {self.settings.mode}") @@ -400,6 +382,115 @@ class RasterisedDocumentParser: return ocrmypdf_args + def _convert_image_to_pdfa(self, document_path: Path) -> Path: + """Convert an image to a PDF/A-2b file without invoking the OCR engine. + + Uses img2pdf for the initial image->PDF wrapping, then pikepdf to stamp + PDF/A-2b conformance metadata. + + No Tesseract and no Ghostscript are invoked. 
+ """ + import img2pdf + import pikepdf + + plain_pdf_path = Path(self.tempdir) / "image_plain.pdf" + try: + convert_kwargs: dict = {} + if self.settings.image_dpi is not None: + convert_kwargs["layout_fun"] = img2pdf.get_fixed_dpi_layout_fun( + (self.settings.image_dpi, self.settings.image_dpi), + ) + plain_pdf_path.write_bytes( + img2pdf.convert(str(document_path), **convert_kwargs), + ) + except Exception as e: + raise ParseError( + f"img2pdf conversion failed for {document_path}: {e!s}", + ) from e + + pdfa_path = Path(self.tempdir) / "archive.pdf" + try: + with pikepdf.open(plain_pdf_path) as pdf: + cs = pdf.make_stream(_SRGB_ICC_DATA) + cs["/N"] = 3 + output_intent = pikepdf.Dictionary( + Type=pikepdf.Name("/OutputIntent"), + S=pikepdf.Name("/GTS_PDFA1"), + OutputConditionIdentifier=pikepdf.String("sRGB"), + DestOutputProfile=cs, + ) + pdf.Root["/OutputIntents"] = pdf.make_indirect( + pikepdf.Array([output_intent]), + ) + meta = pdf.open_metadata(set_pikepdf_as_editor=False) + meta["pdfaid:part"] = "2" + meta["pdfaid:conformance"] = "B" + pdf.save(pdfa_path) + except Exception as e: + self.log.warning( + f"PDF/A metadata stamping failed ({e!s}); falling back to plain PDF.", + ) + pdfa_path.write_bytes(plain_pdf_path.read_bytes()) + + return pdfa_path + + def _convert_pdf_to_pdfa( + self, + input_path: Path, + output_path: Path, + ) -> None: + """Convert a PDF to PDF/A using Ghostscript directly, without OCR. + + Respects the user's output_type, color_conversion_strategy, and + continue_on_soft_render_error settings. 
+ """ + from ocrmypdf._exec.ghostscript import generate_pdfa + from ocrmypdf.pdfa import generate_pdfa_ps + + output_type = self.settings.output_type + if output_type == OutputTypeChoices.PDF: + # No PDF/A requested — just copy the original + copy_file_with_basic_stats(input_path, output_path) + return + + # Map output_type to pdfa_part: pdfa→2, pdfa-1→1, pdfa-2→2, pdfa-3→3 + pdfa_part = "2" if output_type == "pdfa" else output_type.split("-")[-1] + + pdfmark = Path(self.tempdir) / "pdfa.ps" + generate_pdfa_ps(pdfmark) + + color_strategy = self.settings.color_conversion_strategy or "RGB" + + self.log.debug( + "Converting PDF to PDF/A-%s via Ghostscript (no OCR): %s", + pdfa_part, + input_path, + ) + + generate_pdfa( + pdf_pages=[pdfmark, input_path], + output_file=output_path, + compression="auto", + color_conversion_strategy=color_strategy, + pdfa_part=pdfa_part, + ) + + def _handle_subprocess_output_error(self, e: Exception) -> NoReturn: + """Log context for Ghostscript failures and raise ParseError. + + Called from the SubprocessOutputError handlers in parse() to avoid + duplicating the Ghostscript hint and re-raise logic. + """ + if "Ghostscript PDF/A rendering" in str(e): + self.log.warning( + "Ghostscript PDF/A rendering failed, consider setting " + "PAPERLESS_OCR_USER_ARGS: " + "'{\"continue_on_soft_render_error\": true}'", + ) + raise ParseError( + f"SubprocessOutputError: {e!s}. See logs for more information.", + ) from e + def parse( self, document_path: Path, @@ -409,57 +500,107 @@ class RasterisedDocumentParser: ) -> None: # This forces tesseract to use one core per page. 
os.environ["OMP_THREAD_LIMIT"] = "1" - VALID_TEXT_LENGTH = 50 - - if mime_type == "application/pdf": - text_original = self.extract_text(None, document_path) - original_has_text = ( - text_original is not None and len(text_original) > VALID_TEXT_LENGTH - ) - else: - text_original = None - original_has_text = False - - # If the original has text, and the user doesn't want an archive, - # we're done here - skip_archive_for_text = ( - self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE - or self.settings.skip_archive_file - in { - ArchiveFileChoices.WITH_TEXT, - ArchiveFileChoices.ALWAYS, - } - ) - if skip_archive_for_text and original_has_text: - self.log.debug("Document has text, skipping OCRmyPDF entirely.") - self.text = text_original - return - - # Either no text was in the original or there should be an archive - # file created, so OCR the file and create an archive with any - # text located via OCR import ocrmypdf from ocrmypdf import EncryptedPdfError from ocrmypdf import InputFileError from ocrmypdf import SubprocessOutputError from ocrmypdf.exceptions import DigitalSignatureError + from ocrmypdf.exceptions import PriorOcrFoundError + if mime_type == "application/pdf": + text_original = self.extract_text(None, document_path) + original_has_text = is_tagged_pdf(document_path, log=self.log) or ( + text_original is not None and len(text_original) > PDF_TEXT_MIN_LENGTH + ) + else: + text_original = None + original_has_text = False + + self.log.debug( + "Text detection: original_has_text=%s (text_length=%d, mode=%s, produce_archive=%s)", + original_has_text, + len(text_original) if text_original else 0, + self.settings.mode, + produce_archive, + ) + + # --- OCR_MODE=off: never invoke OCR engine --- + if self.settings.mode == ModeChoices.OFF: + if not produce_archive: + self.log.debug( + "OCR: skipped — OCR_MODE=off, no archive requested;" + " returning pdftotext content only", + ) + self.text = text_original or "" + return + if self.is_image(mime_type): + 
self.log.debug( + "OCR: skipped — OCR_MODE=off, image input;" + " converting to PDF/A without OCR", + ) + try: + self.archive_path = self._convert_image_to_pdfa( + document_path, + ) + self.text = "" + except Exception as e: + raise ParseError( + f"Image to PDF/A conversion failed: {e!s}", + ) from e + return + # PDFs in off mode: PDF/A conversion via Ghostscript, no OCR + archive_path = Path(self.tempdir) / "archive.pdf" + try: + self._convert_pdf_to_pdfa(document_path, archive_path) + self.archive_path = archive_path + self.text = text_original or "" + except SubprocessOutputError as e: + self._handle_subprocess_output_error(e) + except Exception as e: + raise ParseError(f"{e.__class__.__name__}: {e!s}") from e + return + + # --- OCR_MODE=auto: skip ocrmypdf entirely if text exists and no archive needed --- + if ( + self.settings.mode == ModeChoices.AUTO + and original_has_text + and not produce_archive + ): + self.log.debug( + "Document has text and no archive requested; skipping OCRmyPDF entirely.", + ) + self.text = text_original + return + + # --- All other paths: run ocrmypdf --- archive_path = Path(self.tempdir) / "archive.pdf" sidecar_file = Path(self.tempdir) / "sidecar.txt" + # auto mode with existing text: PDF/A conversion only (no OCR). 
+ skip_text = self.settings.mode == ModeChoices.AUTO and original_has_text + + if skip_text: + self.log.debug( + "OCR strategy: PDF/A conversion only (skip_text)" + " — OCR_MODE=auto, document already has text", + ) + else: + self.log.debug("OCR strategy: full OCR — OCR_MODE=%s", self.settings.mode) + args = self.construct_ocrmypdf_parameters( document_path, mime_type, archive_path, sidecar_file, + skip_text=skip_text, ) try: self.log.debug(f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS: + if produce_archive: self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) @@ -474,16 +615,8 @@ class RasterisedDocumentParser: if original_has_text: self.text = text_original except SubprocessOutputError as e: - if "Ghostscript PDF/A rendering" in str(e): - self.log.warning( - "Ghostscript PDF/A rendering failed, consider setting " - "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'", - ) - - raise ParseError( - f"SubprocessOutputError: {e!s}. See logs for more information.", - ) from e - except (NoTextFoundException, InputFileError) as e: + self._handle_subprocess_output_error(e) + except (NoTextFoundException, InputFileError, PriorOcrFoundError) as e: self.log.warning( f"Encountered an error while running OCR: {e!s}. " f"Attempting force OCR to get the text.", @@ -492,8 +625,6 @@ class RasterisedDocumentParser: archive_path_fallback = Path(self.tempdir) / "archive-fallback.pdf" sidecar_file_fallback = Path(self.tempdir) / "sidecar-fallback.txt" - # Attempt to run OCR with safe settings. - args = self.construct_ocrmypdf_parameters( document_path, mime_type, @@ -505,25 +636,18 @@ class RasterisedDocumentParser: try: self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - - # Don't return the archived file here, since this file - # is bigger and blurry due to --force-ocr. 
- self.text = self.extract_text( sidecar_file_fallback, archive_path_fallback, ) - + if produce_archive: + self.archive_path = archive_path_fallback except Exception as e: - # If this fails, we have a serious issue at hand. raise ParseError(f"{e.__class__.__name__}: {e!s}") from e except Exception as e: - # Anything else is probably serious. raise ParseError(f"{e.__class__.__name__}: {e!s}") from e - # As a last resort, if we still don't have any text for any reason, - # try to extract the text from the original document. if not self.text: if original_has_text: self.text = text_original diff --git a/src/paperless/parsers/utils.py b/src/paperless/parsers/utils.py index 2e6ce7061..8cc4630bf 100644 --- a/src/paperless/parsers/utils.py +++ b/src/paperless/parsers/utils.py @@ -10,15 +10,105 @@ from __future__ import annotations import logging import re +import tempfile +from pathlib import Path from typing import TYPE_CHECKING +from typing import Final if TYPE_CHECKING: - from pathlib import Path - from paperless.parsers import MetadataEntry logger = logging.getLogger("paperless.parsers.utils") +# Minimum character count for a PDF to be considered "born-digital" (has real text). +# Used by both the consumer (archive decision) and the tesseract parser (skip-OCR decision). +PDF_TEXT_MIN_LENGTH: Final[int] = 50 + + +def is_tagged_pdf( + path: Path, + log: logging.Logger | None = None, +) -> bool: + """Return True if the PDF declares itself as tagged (born-digital indicator). + + Tagged PDFs (e.g. exported from Word or LibreOffice) have ``/MarkInfo`` + with ``/Marked true`` in the document root. This is a reliable signal + that the document has a logical structure and embedded text — running OCR + on it is unnecessary and archive generation can be skipped. + + https://github.com/ocrmypdf/OCRmyPDF/blob/4e974ebd465a5921b2e79004f098f5d203010282/src/ocrmypdf/pdfinfo/info.py#L449 + + Parameters + ---------- + path: + Absolute path to the PDF file. + log: + Logger for warnings. 
Falls back to the module-level logger when omitted. + + Returns + ------- + bool + ``True`` when the PDF is tagged, ``False`` otherwise or on any error. + """ + import pikepdf + + _log = log or logger + try: + with pikepdf.open(path) as pdf: + mark_info = pdf.Root.get("/MarkInfo") + if mark_info is None: + return False + return bool(mark_info.get("/Marked", False)) + except Exception: + _log.warning("Could not check PDF tag status for %s", path, exc_info=True) + return False + + +def extract_pdf_text( + path: Path, + log: logging.Logger | None = None, +) -> str | None: + """Run pdftotext on *path* and return the extracted text, or None on failure. + + Parameters + ---------- + path: + Absolute path to the PDF file. + log: + Logger for warnings. Falls back to the module-level logger when omitted. + + Returns + ------- + str | None + Extracted text, or ``None`` if pdftotext fails or the file is not a PDF. + """ + from documents.utils import run_subprocess + + _log = log or logger + try: + with tempfile.TemporaryDirectory() as tmpdir: + out_path = Path(tmpdir) / "text.txt" + run_subprocess( + [ + "pdftotext", + "-q", + "-layout", + "-enc", + "UTF-8", + str(path), + str(out_path), + ], + logger=_log, + ) + text = read_file_handle_unicode_errors(out_path, log=_log) + return text or None + except Exception: + _log.warning( + "Error while getting text from PDF document with pdftotext", + exc_info=True, + ) + return None + def read_file_handle_unicode_errors( filepath: Path, diff --git a/src/paperless/settings/__init__.py b/src/paperless/settings/__init__.py index fac1391b3..bace016cf 100644 --- a/src/paperless/settings/__init__.py +++ b/src/paperless/settings/__init__.py @@ -889,10 +889,23 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") # OCRmyPDF --output-type options are available. OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") -# skip. 
redo, force -OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") +if os.environ.get("PAPERLESS_OCR_MODE", "") in ( + "skip", + "skip_noarchive", +): # pragma: no cover + OCR_MODE = "auto" +else: + OCR_MODE = get_choice_from_env( + "PAPERLESS_OCR_MODE", + {"auto", "force", "redo", "off"}, + default="auto", + ) -OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never") +ARCHIVE_FILE_GENERATION = get_choice_from_env( + "PAPERLESS_ARCHIVE_FILE_GENERATION", + {"auto", "always", "never"}, + default="auto", +) OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI") diff --git a/src/paperless/tests/parsers/conftest.py b/src/paperless/tests/parsers/conftest.py index 8747ac9bd..843ffdb88 100644 --- a/src/paperless/tests/parsers/conftest.py +++ b/src/paperless/tests/parsers/conftest.py @@ -708,7 +708,7 @@ def null_app_config(mocker: MockerFixture) -> MagicMock: pages=None, language=None, mode=None, - skip_archive_file=None, + archive_file_generation=None, image_dpi=None, unpaper_clean=None, deskew=None, diff --git a/src/paperless/tests/parsers/test_convert_image_to_pdfa.py b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py new file mode 100644 index 000000000..615900a25 --- /dev/null +++ b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py @@ -0,0 +1,141 @@ +""" +Tests for RasterisedDocumentParser._convert_image_to_pdfa. + +The method converts an image to a PDF/A-2b file using img2pdf (wrapping) +then pikepdf (PDF/A metadata stamping), with a fallback to plain PDF when +pikepdf stamping fails. No Tesseract or Ghostscript is invoked. + +These are unit/integration tests: img2pdf and pikepdf run for real; only +error-path branches mock the respective library call. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import img2pdf +import magic +import pikepdf +import pytest + +from documents.parsers import ParseError + +if TYPE_CHECKING: + from pytest_mock import MockerFixture + + from paperless.parsers.tesseract import RasterisedDocumentParser + + +class TestConvertImageToPdfa: + """_convert_image_to_pdfa: output shape, error paths, DPI handling.""" + + def test_valid_png_produces_pdf_bytes( + self, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: a valid PNG with DPI metadata + WHEN: _convert_image_to_pdfa is called + THEN: the returned file is non-empty and begins with the PDF magic bytes + """ + result = tesseract_parser._convert_image_to_pdfa(simple_png_file) + + assert result.exists() + assert magic.from_file(str(result), mime=True) == "application/pdf" + + def test_output_path_is_archive_pdf_in_tempdir( + self, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: any valid image + WHEN: _convert_image_to_pdfa is called + THEN: the returned path is exactly archive.pdf inside the parser's tempdir + """ + result = tesseract_parser._convert_image_to_pdfa(simple_png_file) + + assert result == Path(tesseract_parser.tempdir) / "archive.pdf" + + def test_img2pdf_failure_raises_parse_error( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: img2pdf.convert raises an exception + WHEN: _convert_image_to_pdfa is called + THEN: a ParseError is raised that mentions "img2pdf conversion failed" + """ + mocker.patch.object(img2pdf, "convert", side_effect=Exception("boom")) + + with pytest.raises(ParseError, match="img2pdf conversion failed"): + tesseract_parser._convert_image_to_pdfa(simple_png_file) + + def test_pikepdf_stamping_failure_falls_back_to_plain_pdf( + self, + mocker: MockerFixture, + tesseract_parser:
RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: pikepdf.open raises during PDF/A metadata stamping + WHEN: _convert_image_to_pdfa is called + THEN: no exception is raised and the returned file is still a valid PDF + (plain PDF bytes are used as fallback) + """ + mocker.patch.object(pikepdf, "open", side_effect=Exception("pikepdf boom")) + + result = tesseract_parser._convert_image_to_pdfa(simple_png_file) + + assert result.exists() + assert magic.from_file(str(result), mime=True) == "application/pdf" + + def test_image_dpi_setting_applies_fixed_dpi_layout( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_no_dpi_png_file: Path, + ) -> None: + """ + GIVEN: parser.settings.image_dpi = 150 + WHEN: _convert_image_to_pdfa is called with a no-DPI PNG + THEN: img2pdf.get_fixed_dpi_layout_fun is called with (150, 150) + and the output is still a valid PDF + """ + spy = mocker.patch.object( + img2pdf, + "get_fixed_dpi_layout_fun", + wraps=img2pdf.get_fixed_dpi_layout_fun, + ) + tesseract_parser.settings.image_dpi = 150 + + result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file) + + spy.assert_called_once_with((150, 150)) + assert magic.from_file(str(result), mime=True) == "application/pdf" + + def test_no_image_dpi_setting_skips_fixed_dpi_layout( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: parser.settings.image_dpi is None (default) + WHEN: _convert_image_to_pdfa is called + THEN: img2pdf.get_fixed_dpi_layout_fun is never called + """ + spy = mocker.patch.object( + img2pdf, + "get_fixed_dpi_layout_fun", + wraps=img2pdf.get_fixed_dpi_layout_fun, + ) + tesseract_parser.settings.image_dpi = None + + tesseract_parser._convert_image_to_pdfa(simple_png_file) + + spy.assert_not_called() diff --git a/src/paperless/tests/parsers/test_parse_modes.py b/src/paperless/tests/parsers/test_parse_modes.py new file mode 
100644 index 000000000..bf95cdaf8 --- /dev/null +++ b/src/paperless/tests/parsers/test_parse_modes.py @@ -0,0 +1,440 @@ +""" +Focused tests for RasterisedDocumentParser.parse() mode behaviour. + +These tests mock ``ocrmypdf.ocr`` so they run without a real Tesseract/OCRmyPDF +installation and execute quickly. The intent is to verify the *control flow* +introduced by the ``produce_archive`` flag and the ``OCR_MODE=auto/off`` logic, +not to test OCRmyPDF itself. + +Fixtures are pulled from conftest.py in this package. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from pytest_mock import MockerFixture + + from paperless.parsers.tesseract import RasterisedDocumentParser + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_LONG_TEXT = "This is a test document with enough text. " * 5 # >50 chars +_SHORT_TEXT = "Hi." 
# <50 chars + + +def _make_extract_text(text: str | None): + """Return a side_effect function for ``extract_text`` that returns *text*.""" + + def _extract(sidecar_file, pdf_file): + return text + + return _extract + + +# --------------------------------------------------------------------------- +# AUTO mode — PDF with sufficient text layer +# --------------------------------------------------------------------------- + + +class TestAutoModeWithText: + """AUTO mode, original PDF has detectable text (>50 chars).""" + + def test_auto_text_no_archive_skips_ocrmypdf( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=False + - PDF with text > VALID_TEXT_LENGTH + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr is NOT called (early return path) + - archive_path remains None + - text is set from the original + """ + # Patch extract_text to return long text (simulating detectable text layer) + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() == _LONG_TEXT + + def test_auto_text_with_archive_calls_ocrmypdf_skip_text( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=True + - PDF with text > VALID_TEXT_LENGTH + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr IS called with skip_text=True + - archive_path is set + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = 
"auto" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=True, + ) + + mock_ocr.assert_called_once() + call_kwargs = mock_ocr.call_args.kwargs + assert call_kwargs.get("skip_text") is True + assert "force_ocr" not in call_kwargs + assert "redo_ocr" not in call_kwargs + assert tesseract_parser.archive_path is not None + + +# --------------------------------------------------------------------------- +# AUTO mode — PDF without text layer (or too short) +# --------------------------------------------------------------------------- + + +class TestAutoModeNoText: + """AUTO mode, original PDF has no detectable text (<= 50 chars).""" + + def test_auto_no_text_with_archive_calls_ocrmypdf_no_extra_flag( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + multi_page_images_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=True + - PDF with no text (or text <= VALID_TEXT_LENGTH) + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr IS called WITHOUT skip_text/force_ocr/redo_ocr + - archive_path is set (since produce_archive=True) + """ + # Return "no text" for the original; return real text for archive + extract_call_count = 0 + + def _extract_side(sidecar_file, pdf_file): + nonlocal extract_call_count + extract_call_count += 1 + if extract_call_count == 1: + return None # original has no text + return _LONG_TEXT # text from archive after OCR + + mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + multi_page_images_pdf_file, + "application/pdf", + produce_archive=True, + ) + + mock_ocr.assert_called_once() + call_kwargs = mock_ocr.call_args.kwargs + assert "skip_text" not in call_kwargs + assert "force_ocr" not in call_kwargs + assert "redo_ocr" not in call_kwargs + assert tesseract_parser.archive_path is not None + + def 
test_auto_no_text_no_archive_calls_ocrmypdf( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + multi_page_images_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=False + - PDF with no text + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr IS called (no early return since no text detected) + - archive_path is NOT set (produce_archive=False) + """ + extract_call_count = 0 + + def _extract_side(sidecar_file, pdf_file): + nonlocal extract_call_count + extract_call_count += 1 + if extract_call_count == 1: + return None + return _LONG_TEXT + + mocker.patch.object(tesseract_parser, "extract_text", side_effect=_extract_side) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + multi_page_images_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_called_once() + assert tesseract_parser.archive_path is None + + +# --------------------------------------------------------------------------- +# OFF mode — PDF +# --------------------------------------------------------------------------- + + +class TestOffModePdf: + """OCR_MODE=off, document is a PDF.""" + + def test_off_no_archive_returns_pdftotext( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, produce_archive=False + - PDF with text + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr is NOT called + - archive_path is None + - text comes from pdftotext (extract_text) + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() == 
_LONG_TEXT + + def test_off_with_archive_uses_ghostscript_not_ocr( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, produce_archive=True + - PDF document + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr is NOT called + - Ghostscript generate_pdfa IS called (PDF/A conversion without OCR) + - archive_path is set + - text comes from pdftotext, not OCR + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + mock_gs = mocker.patch( + "ocrmypdf._exec.ghostscript.generate_pdfa", + ) + mocker.patch("ocrmypdf.pdfa.generate_pdfa_ps") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=True, + ) + + mock_ocr.assert_not_called() + mock_gs.assert_called_once() + assert tesseract_parser.archive_path is not None + assert tesseract_parser.get_text() == _LONG_TEXT + + +# --------------------------------------------------------------------------- +# OFF mode — image +# --------------------------------------------------------------------------- + + +class TestOffModeImage: + """OCR_MODE=off, document is an image (PNG).""" + + def test_off_image_no_archive_no_ocrmypdf( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, produce_archive=False + - Image document (PNG) + WHEN: + - parse() is called + THEN: + - ocrmypdf.ocr is NOT called + - archive_path is None + - text is empty string (images have no text layer) + """ + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse(simple_png_file, "image/png", produce_archive=False) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() == "" + + def 
test_off_image_with_archive_uses_img2pdf_path( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: + - OFF mode, produce_archive=True + - Image document (PNG) + WHEN: + - parse() is called + THEN: + - _convert_image_to_pdfa() is called instead of ocrmypdf.ocr + - archive_path is set to the returned path + - text is empty string + """ + fake_archive = Path("/tmp/fake-archive.pdf") + mock_convert = mocker.patch.object( + tesseract_parser, + "_convert_image_to_pdfa", + return_value=fake_archive, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "off" + tesseract_parser.parse(simple_png_file, "image/png", produce_archive=True) + + mock_convert.assert_called_once_with(simple_png_file) + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path == fake_archive + assert tesseract_parser.get_text() == "" + + +# --------------------------------------------------------------------------- +# produce_archive=False never sets archive_path for FORCE / REDO / AUTO modes +# --------------------------------------------------------------------------- + + +class TestProduceArchiveFalse: + """Verify produce_archive=False never results in an archive regardless of mode.""" + + @pytest.mark.parametrize("mode", ["force", "redo"]) + def test_produce_archive_false_force_redo_modes( + self, + mode: str, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + multi_page_images_pdf_file: Path, + ) -> None: + """ + GIVEN: + - FORCE or REDO mode, produce_archive=False + - Any PDF + WHEN: + - parse() is called (ocrmypdf mocked to succeed) + THEN: + - archive_path is NOT set even though ocrmypdf ran + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = mode + tesseract_parser.parse( + multi_page_images_pdf_file, + "application/pdf", + 
produce_archive=False, + ) + + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() is not None + + def test_produce_archive_false_auto_with_text( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_digital_pdf_file: Path, + ) -> None: + """ + GIVEN: + - AUTO mode, produce_archive=False + - PDF with text > VALID_TEXT_LENGTH + WHEN: + - parse() is called + THEN: + - ocrmypdf is skipped entirely (early return) + - archive_path is None + """ + mocker.patch.object( + tesseract_parser, + "extract_text", + return_value=_LONG_TEXT, + ) + mock_ocr = mocker.patch("ocrmypdf.ocr") + + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + simple_digital_pdf_file, + "application/pdf", + produce_archive=False, + ) + + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None diff --git a/src/paperless/tests/parsers/test_tesseract_custom_settings.py b/src/paperless/tests/parsers/test_tesseract_custom_settings.py index 9f3afacb6..06111173d 100644 --- a/src/paperless/tests/parsers/test_tesseract_custom_settings.py +++ b/src/paperless/tests/parsers/test_tesseract_custom_settings.py @@ -94,15 +94,35 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas WHEN: - OCR parameters are constructed THEN: - - Configuration from database is utilized + - Configuration from database is utilized (AUTO mode with skip_text=True + triggers skip_text; AUTO mode alone does not add any extra flag) """ + # AUTO mode with skip_text=True explicitly passed: skip_text is set with override_settings(OCR_MODE="redo"): instance = ApplicationConfiguration.objects.all().first() - instance.mode = ModeChoices.SKIP + instance.mode = ModeChoices.AUTO + instance.save() + + params = RasterisedDocumentParser(None).construct_ocrmypdf_parameters( + input_file="input.pdf", + output_file="output.pdf", + sidecar_file="sidecar.txt", + mime_type="application/pdf", + safe_fallback=False, + 
skip_text=True, + ) + self.assertTrue(params["skip_text"]) + self.assertNotIn("redo_ocr", params) + self.assertNotIn("force_ocr", params) + + # AUTO mode alone (no skip_text): no extra OCR flag is set + with override_settings(OCR_MODE="redo"): + instance = ApplicationConfiguration.objects.all().first() + instance.mode = ModeChoices.AUTO instance.save() params = self.get_params() - self.assertTrue(params["skip_text"]) + self.assertNotIn("skip_text", params) self.assertNotIn("redo_ocr", params) self.assertNotIn("force_ocr", params) diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py index daa7020c7..bb8d986b4 100644 --- a/src/paperless/tests/parsers/test_tesseract_parser.py +++ b/src/paperless/tests/parsers/test_tesseract_parser.py @@ -370,15 +370,26 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: + """ + GIVEN: + - Multi-page digital PDF with sufficient text layer + - Default settings (mode=auto, produce_archive=True) + WHEN: + - Document is parsed + THEN: + - Archive is created (AUTO mode + text present + produce_archive=True + → PDF/A conversion via skip_text) + - Text is extracted + """ tesseract_parser.parse( - tesseract_samples_dir / "simple-digital.pdf", + tesseract_samples_dir / "multi-page-digital.pdf", "application/pdf", ) assert tesseract_parser.archive_path is not None assert tesseract_parser.archive_path.is_file() assert_ordered_substrings( - tesseract_parser.get_text(), - ["This is a test document."], + tesseract_parser.get_text().lower(), + ["page 1", "page 2", "page 3"], ) def test_with_form_default( @@ -397,7 +408,7 @@ class TestParsePdf: ["Please enter your name in here:", "This is a PDF document with a form."], ) - def test_with_form_redo_produces_no_archive( + def test_with_form_redo_no_archive_when_not_requested( self, tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, @@ -406,6 +417,7 @@ class 
TestParsePdf: tesseract_parser.parse( tesseract_samples_dir / "with-form.pdf", "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -433,7 +445,7 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf") assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -449,7 +461,7 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "encrypted.pdf", "application/pdf", @@ -559,7 +571,7 @@ class TestParseMultiPage: @pytest.mark.parametrize( "mode", [ - pytest.param("skip", id="skip"), + pytest.param("auto", id="auto"), pytest.param("redo", id="redo"), pytest.param("force", id="force"), ], @@ -587,7 +599,7 @@ class TestParseMultiPage: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-images.pdf", "application/pdf", @@ -735,16 +747,18 @@ class TestSkipArchive: """ GIVEN: - File with existing text layer - - Mode: skip_noarchive + - Mode: auto, produce_archive=False WHEN: - Document is parsed THEN: - - Text extracted; no archive created + - Text extracted from original; no archive created (text exists + + produce_archive=False skips OCRmyPDF entirely) """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-digital.pdf", "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None 
assert_ordered_substrings( @@ -760,13 +774,13 @@ class TestSkipArchive: """ GIVEN: - File with image-only pages (no text layer) - - Mode: skip_noarchive + - Mode: auto, produce_archive=True (default) WHEN: - Document is parsed THEN: - - Text extracted; archive created (OCR needed) + - Text extracted; archive created (OCR needed, no existing text) """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-images.pdf", "application/pdf", @@ -778,41 +792,58 @@ class TestSkipArchive: ) @pytest.mark.parametrize( - ("skip_archive_file", "filename", "expect_archive"), + ("produce_archive", "filename", "expect_archive"), [ - pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"), - pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"), pytest.param( - "with_text", + True, "multi-page-digital.pdf", - False, - id="with-text-layer", + True, + id="produce-archive-with-text", ), pytest.param( - "with_text", + True, "multi-page-images.pdf", True, - id="with-text-no-layer", + id="produce-archive-no-text", ), pytest.param( - "always", + False, "multi-page-digital.pdf", - False, - id="always-with-text", + False, + id="no-archive-with-text-layer", + ), + pytest.param( + False, + "multi-page-images.pdf", + False, + id="no-archive-no-text-layer", ), - pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"), ], ) - def test_skip_archive_file_setting( + def test_produce_archive_flag( self, - skip_archive_file: str, + produce_archive: bool, # noqa: FBT001 filename: str, - expect_archive: str, + expect_archive: bool, # noqa: FBT001 tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.skip_archive_file = skip_archive_file - tesseract_parser.parse(tesseract_samples_dir / filename, "application/pdf") + """ + GIVEN: + - Various PDFs (with and without text layers) + - produce_archive flag set to
True or False + WHEN: + - Document is parsed + THEN: + - archive_path is set if and only if produce_archive=True + - Text is always extracted + """ + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + tesseract_samples_dir / filename, + "application/pdf", + produce_archive=produce_archive, + ) text = tesseract_parser.get_text().lower() assert_ordered_substrings(text, ["page 1", "page 2", "page 3"]) if expect_archive: @@ -820,6 +851,59 @@ class TestSkipArchive: else: assert tesseract_parser.archive_path is None + def test_tagged_pdf_skips_ocr_in_auto_mode( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + tesseract_samples_dir: Path, + ) -> None: + """ + GIVEN: + - A tagged PDF (e.g. exported from Word, /MarkInfo /Marked true) + - Mode: auto, produce_archive=False + WHEN: + - Document is parsed + THEN: + - OCRmyPDF is not invoked (tagged ⇒ original_has_text=True) + - Text is extracted from the original via pdftotext + - No archive is produced + """ + tesseract_parser.settings.mode = "auto" + mock_ocr = mocker.patch("ocrmypdf.ocr") + tesseract_parser.parse( + tesseract_samples_dir / "simple-digital.pdf", + "application/pdf", + produce_archive=False, + ) + mock_ocr.assert_not_called() + assert tesseract_parser.archive_path is None + assert tesseract_parser.get_text() + + def test_tagged_pdf_produces_pdfa_archive_without_ocr( + self, + tesseract_parser: RasterisedDocumentParser, + tesseract_samples_dir: Path, + ) -> None: + """ + GIVEN: + - A tagged PDF (e.g. 
exported from Word, /MarkInfo /Marked true) + - Mode: auto, produce_archive=True + WHEN: + - Document is parsed + THEN: + - OCRmyPDF runs with skip_text (PDF/A conversion only, no OCR) + - Archive is produced + - Text is preserved from the original + """ + tesseract_parser.settings.mode = "auto" + tesseract_parser.parse( + tesseract_samples_dir / "simple-digital.pdf", + "application/pdf", + produce_archive=True, + ) + assert tesseract_parser.archive_path is not None + assert tesseract_parser.get_text() + # --------------------------------------------------------------------------- # Parse — mixed pages / sidecar @@ -835,13 +919,13 @@ class TestParseMixed: """ GIVEN: - File with text in some pages (image) and some pages (digital) - - Mode: skip + - Mode: auto (skip_text), produce_archive=True (default) WHEN: - Document is parsed THEN: - All pages extracted; archive created; sidecar notes skipped pages """ - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-mixed.pdf", "application/pdf", @@ -898,17 +982,18 @@ class TestParseMixed: ) -> None: """ GIVEN: - - File with mixed pages - - Mode: skip_noarchive + - File with mixed pages (some with text, some image-only) + - Mode: auto, produce_archive=False WHEN: - Document is parsed THEN: - - No archive created (file has text layer); later-page text present + - No archive created (produce_archive=False); text from text layer present """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-mixed.pdf", "application/pdf", + produce_archive=False, ) assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -923,12 +1008,12 @@ class TestParseMixed: class TestParseRotate: - def test_rotate_skip_mode( + def test_rotate_auto_mode( self, tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: -
tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.settings.rotate = True tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf") assert_ordered_substrings( @@ -955,12 +1040,19 @@ class TestParseRtl: ) -> None: """ GIVEN: - - PDF with RTL Arabic text + - PDF with RTL Arabic text in its text layer (short: 18 chars) + - mode=off, produce_archive=True: PDF/A conversion via skip_text, no OCR engine WHEN: - Document is parsed THEN: - - Arabic content is extracted (normalised for bidi) + - Arabic content is extracted from the PDF text layer (normalised for bidi) + + Note: The RTL PDF has a short text layer (< VALID_TEXT_LENGTH=50) so AUTO mode + would attempt full OCR, which fails due to PriorOcrFoundError and falls back to + force-ocr with English Tesseract (producing garbage). Using mode="off" forces + skip_text=True so the Arabic text layer is preserved through PDF/A conversion. """ + tesseract_parser.settings.mode = "off" tesseract_parser.parse( tesseract_samples_dir / "rtl-test.pdf", "application/pdf", @@ -971,7 +1063,8 @@ class TestParseRtl: if unicodedata.category(ch) != "Cf" and not ch.isspace() ) assert "ةرازو" in normalised - assert any(token in normalised for token in ("ةیلخادلا", "الاخليد")) + # pdftotext uses Arabic Yeh (U+064A) where ocrmypdf used Farsi Yeh (U+06CC) + assert any(token in normalised for token in ("ةیلخادلا", "الاخليد", "ةيلخادال")) # --------------------------------------------------------------------------- @@ -1023,11 +1116,11 @@ class TestOcrmypdfParameters: assert ("clean" in params) == expected_clean assert ("clean_final" in params) == expected_clean_final - def test_clean_final_skip_mode( + def test_clean_final_auto_mode( self, make_tesseract_parser: MakeTesseractParser, ) -> None: - with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser: + with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser: params = 
parser.construct_ocrmypdf_parameters("", "", "", "") assert params["clean_final"] is True assert "clean" not in params @@ -1044,9 +1137,9 @@ class TestOcrmypdfParameters: @pytest.mark.parametrize( ("ocr_mode", "ocr_deskew", "expect_deskew"), [ - pytest.param("skip", True, True, id="skip-deskew-on"), + pytest.param("auto", True, True, id="auto-deskew-on"), pytest.param("redo", True, False, id="redo-deskew-off"), - pytest.param("skip", False, False, id="skip-no-deskew"), + pytest.param("auto", False, False, id="auto-no-deskew"), ], ) def test_deskew_option( diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index 87e64a90e..c4935c1f1 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -132,13 +132,13 @@ class TestOcrSettingsChecks: pytest.param( "OCR_MODE", "skip_noarchive", - "deprecated", - id="deprecated-mode", + 'OCR output mode "skip_noarchive"', + id="deprecated-mode-now-invalid", ), pytest.param( - "OCR_SKIP_ARCHIVE_FILE", + "ARCHIVE_FILE_GENERATION", "invalid", - 'OCR_SKIP_ARCHIVE_FILE setting "invalid"', + 'PAPERLESS_ARCHIVE_FILE_GENERATION setting "invalid"', id="invalid-skip-archive-file", ), pytest.param( diff --git a/src/paperless/tests/test_checks_v3.py b/src/paperless/tests/test_checks_v3.py new file mode 100644 index 000000000..a87a19727 --- /dev/null +++ b/src/paperless/tests/test_checks_v3.py @@ -0,0 +1,64 @@ +"""Tests for v3 system checks: deprecated v2 OCR env var warnings.""" + +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import pytest + +from paperless.checks import check_deprecated_v2_ocr_env_vars + +if TYPE_CHECKING: + from pytest_mock import MockerFixture + + +class TestDeprecatedV2OcrEnvVarWarnings: + def test_no_deprecated_vars_returns_empty(self, mocker: MockerFixture) -> None: + """No warnings when neither deprecated variable is set.""" + mocker.patch.dict(os.environ, {"PAPERLESS_OCR_MODE": "auto"}, clear=True) + result = 
check_deprecated_v2_ocr_env_vars(None) + assert result == [] + + @pytest.mark.parametrize( + ("env_var", "env_value", "expected_id", "expected_fragment"), + [ + pytest.param( + "PAPERLESS_OCR_SKIP_ARCHIVE_FILE", + "always", + "paperless.W002", + "PAPERLESS_OCR_SKIP_ARCHIVE_FILE", + id="skip-archive-file-warns", + ), + pytest.param( + "PAPERLESS_OCR_MODE", + "skip", + "paperless.W003", + "skip", + id="ocr-mode-skip-warns", + ), + pytest.param( + "PAPERLESS_OCR_MODE", + "skip_noarchive", + "paperless.W003", + "skip_noarchive", + id="ocr-mode-skip-noarchive-warns", + ), + ], + ) + def test_deprecated_var_produces_one_warning( + self, + mocker: MockerFixture, + env_var: str, + env_value: str, + expected_id: str, + expected_fragment: str, + ) -> None: + """Each deprecated setting in isolation produces exactly one warning.""" + mocker.patch.dict(os.environ, {env_var: env_value}, clear=True) + result = check_deprecated_v2_ocr_env_vars(None) + + assert len(result) == 1 + warning = result[0] + assert warning.id == expected_id + assert expected_fragment in warning.msg diff --git a/src/paperless/tests/test_migration_replace_skip_archive_file.py b/src/paperless/tests/test_migration_replace_skip_archive_file.py new file mode 100644 index 000000000..cb31ffb35 --- /dev/null +++ b/src/paperless/tests/test_migration_replace_skip_archive_file.py @@ -0,0 +1,89 @@ +from documents.tests.utils import TestMigrations + + +class TestMigrateSkipArchiveFile(TestMigrations): + migrate_from = "0007_optimize_integer_field_sizes" + migrate_to = "0008_replace_skip_archive_file" + + def setUpBeforeMigration(self, apps): + ApplicationConfiguration = apps.get_model( + "paperless", + "ApplicationConfiguration", + ) + ApplicationConfiguration.objects.all().delete() + ApplicationConfiguration.objects.create( + pk=1, + mode="skip", + skip_archive_file="always", + ) + ApplicationConfiguration.objects.create( + pk=2, + mode="redo", + skip_archive_file="with_text", + ) + 
ApplicationConfiguration.objects.create( + pk=3, + mode="force", + skip_archive_file="never", + ) + ApplicationConfiguration.objects.create( + pk=4, + mode="skip_noarchive", + skip_archive_file=None, + ) + ApplicationConfiguration.objects.create( + pk=5, + mode="skip_noarchive", + skip_archive_file="never", + ) + ApplicationConfiguration.objects.create(pk=6, mode=None, skip_archive_file=None) + + def _get_config(self, pk): + ApplicationConfiguration = self.apps.get_model( + "paperless", + "ApplicationConfiguration", + ) + return ApplicationConfiguration.objects.get(pk=pk) + + def test_skip_mapped_to_auto(self): + config = self._get_config(1) + assert config.mode == "auto" + + def test_skip_archive_always_mapped_to_never(self): + config = self._get_config(1) + assert config.archive_file_generation == "never" + + def test_redo_unchanged(self): + config = self._get_config(2) + assert config.mode == "redo" + + def test_skip_archive_with_text_mapped_to_auto(self): + config = self._get_config(2) + assert config.archive_file_generation == "auto" + + def test_force_unchanged(self): + config = self._get_config(3) + assert config.mode == "force" + + def test_skip_archive_never_mapped_to_always(self): + config = self._get_config(3) + assert config.archive_file_generation == "always" + + def test_skip_noarchive_mapped_to_auto(self): + config = self._get_config(4) + assert config.mode == "auto" + + def test_skip_noarchive_implies_archive_never(self): + config = self._get_config(4) + assert config.archive_file_generation == "never" + + def test_skip_noarchive_explicit_skip_archive_takes_precedence(self): + """skip_archive_file=never maps to always, not overridden by skip_noarchive.""" + config = self._get_config(5) + assert config.mode == "auto" + assert config.archive_file_generation == "always" + + def test_null_values_remain_null(self): + config = self._get_config(6) + assert config.mode is None + assert config.archive_file_generation is None diff --git 
a/src/paperless/tests/test_ocr_config.py b/src/paperless/tests/test_ocr_config.py new file mode 100644 index 000000000..12204b903 --- /dev/null +++ b/src/paperless/tests/test_ocr_config.py @@ -0,0 +1,66 @@ +"""Tests for OcrConfig archive_file_generation field behavior.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from django.test import override_settings + +from paperless.config import OcrConfig + +if TYPE_CHECKING: + from unittest.mock import MagicMock + + +@pytest.fixture() +def null_app_config(mocker) -> MagicMock: + """Mock ApplicationConfiguration with all fields None → falls back to Django settings.""" + return mocker.MagicMock( + output_type=None, + pages=None, + language=None, + mode=None, + archive_file_generation=None, + image_dpi=None, + unpaper_clean=None, + deskew=None, + rotate_pages=None, + rotate_pages_threshold=None, + max_image_pixels=None, + color_conversion_strategy=None, + user_args=None, + ) + + +@pytest.fixture() +def make_ocr_config(mocker, null_app_config): + mocker.patch( + "paperless.config.BaseConfig._get_config_instance", + return_value=null_app_config, + ) + + def _make(**django_settings_overrides): + with override_settings(**django_settings_overrides): + return OcrConfig() + + return _make + + +class TestOcrConfigArchiveFileGeneration: + def test_auto_from_settings(self, make_ocr_config) -> None: + cfg = make_ocr_config(OCR_MODE="auto", ARCHIVE_FILE_GENERATION="auto") + assert cfg.archive_file_generation == "auto" + + def test_always_from_settings(self, make_ocr_config) -> None: + cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always") + assert cfg.archive_file_generation == "always" + + def test_never_from_settings(self, make_ocr_config) -> None: + cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="never") + assert cfg.archive_file_generation == "never" + + def test_db_value_overrides_setting(self, make_ocr_config, null_app_config) -> None: + null_app_config.archive_file_generation = 
"never" + cfg = make_ocr_config(ARCHIVE_FILE_GENERATION="always") + assert cfg.archive_file_generation == "never" diff --git a/src/paperless/tests/test_parser_utils.py b/src/paperless/tests/test_parser_utils.py new file mode 100644 index 000000000..ca6d9e6fe --- /dev/null +++ b/src/paperless/tests/test_parser_utils.py @@ -0,0 +1,25 @@ +"""Tests for paperless.parsers.utils helpers.""" + +from __future__ import annotations + +from pathlib import Path + +from paperless.parsers.utils import is_tagged_pdf + +SAMPLES = Path(__file__).parent / "samples" / "tesseract" + + +class TestIsTaggedPdf: + def test_tagged_pdf_returns_true(self) -> None: + assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True + + def test_untagged_pdf_returns_false(self) -> None: + assert is_tagged_pdf(SAMPLES / "multi-page-images.pdf") is False + + def test_nonexistent_path_returns_false(self) -> None: + assert is_tagged_pdf(Path("/nonexistent/file.pdf")) is False + + def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None: + bad = tmp_path / "bad.pdf" + bad.write_bytes(b"not a pdf") + assert is_tagged_pdf(bad) is False From 51c59746a770a88b87a69c475a65a62f7ab74573 Mon Sep 17 00:00:00 2001 From: GitHub Actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 6 Apr 2026 22:51:57 +0000 Subject: [PATCH 2/3] Auto translate strings --- src-ui/messages.xlf | 4 ++-- src/locale/en_US/LC_MESSAGES/django.po | 24 ++++++++++-------------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src-ui/messages.xlf b/src-ui/messages.xlf index f30605a4e..8587b2a90 100644 --- a/src-ui/messages.xlf +++ b/src-ui/messages.xlf @@ -10456,8 +10456,8 @@ 111 - - Skip Archive File + + Archive File Generation src/app/data/paperless-config.ts 119 diff --git a/src/locale/en_US/LC_MESSAGES/django.po b/src/locale/en_US/LC_MESSAGES/django.po index 03fdcc6e1..eb3a243f3 100644 --- a/src/locale/en_US/LC_MESSAGES/django.po +++ b/src/locale/en_US/LC_MESSAGES/django.po @@ -2,7 +2,7 @@ msgid "" 
msgstr "" "Project-Id-Version: paperless-ngx\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2026-04-03 20:54+0000\n" +"POT-Creation-Date: 2026-04-06 22:51+0000\n" "PO-Revision-Date: 2022-02-17 04:17\n" "Last-Translator: \n" "Language-Team: English\n" @@ -1666,32 +1666,28 @@ msgstr "" msgid "pdfa-3" msgstr "" -#: paperless/models.py:39 -msgid "skip" +#: paperless/models.py:39 paperless/models.py:50 +msgid "auto" msgstr "" #: paperless/models.py:40 -msgid "redo" -msgstr "" - -#: paperless/models.py:41 msgid "force" msgstr "" -#: paperless/models.py:42 -msgid "skip_noarchive" +#: paperless/models.py:41 +msgid "redo" msgstr "" -#: paperless/models.py:50 -msgid "never" +#: paperless/models.py:42 +msgid "off" msgstr "" #: paperless/models.py:51 -msgid "with_text" +msgid "always" msgstr "" #: paperless/models.py:52 -msgid "always" +msgid "never" msgstr "" #: paperless/models.py:60 @@ -1755,7 +1751,7 @@ msgid "Sets the OCR mode" msgstr "" #: paperless/models.py:130 -msgid "Controls the generation of an archive file" +msgid "Controls archive file generation" msgstr "" #: paperless/models.py:138 From a5fe88d2a1696b8561e2857b0753537d22574c44 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 6 Apr 2026 16:03:29 -0700 Subject: [PATCH 3/3] Chore: Resolves some zizmor reported code scan findings (#12516) Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> --- .github/workflows/ci-backend.yml | 7 +++ .github/workflows/ci-docker.yml | 6 ++- .github/workflows/ci-docs.yml | 5 +- .github/workflows/ci-frontend.yml | 14 ++++++ .github/workflows/ci-lint.yml | 2 + .github/workflows/ci-release.yml | 29 ++++++++--- .github/workflows/ci-static-analysis.yml | 10 +++- .github/workflows/cleanup-tags.yml | 1 + .github/workflows/crowdin.yml | 3 ++ .github/workflows/repo-maintenance.yml | 17 +++++-- .github/zizmor.yml | 61 ++++++++++++++++++++++++ 11 files changed, 139 insertions(+), 16 deletions(-) create mode 100644 
.github/zizmor.yml diff --git a/.github/workflows/ci-backend.yml b/.github/workflows/ci-backend.yml index cff139e8c..2a52b84f0 100644 --- a/.github/workflows/ci-backend.yml +++ b/.github/workflows/ci-backend.yml @@ -13,10 +13,13 @@ concurrency: env: DEFAULT_UV_VERSION: "0.10.x" NLTK_DATA: "/usr/share/nltk_data" +permissions: {} jobs: changes: name: Detect Backend Changes runs-on: ubuntu-slim + permissions: + contents: read outputs: backend_changed: ${{ steps.force.outputs.run_all == 'true' || steps.filter.outputs.backend == 'true' }} steps: @@ -66,6 +69,8 @@ jobs: if: needs.changes.outputs.backend_changed == 'true' name: "Python ${{ matrix.python-version }}" runs-on: ubuntu-24.04 + permissions: + contents: read strategy: matrix: python-version: ['3.11', '3.12', '3.13', '3.14'] @@ -143,6 +148,8 @@ jobs: if: needs.changes.outputs.backend_changed == 'true' name: Check project typing runs-on: ubuntu-24.04 + permissions: + contents: read env: DEFAULT_PYTHON: "3.12" steps: diff --git a/.github/workflows/ci-docker.yml b/.github/workflows/ci-docker.yml index 43b79728d..aa754a258 100644 --- a/.github/workflows/ci-docker.yml +++ b/.github/workflows/ci-docker.yml @@ -89,7 +89,7 @@ jobs: push_external="true" ;; esac - case "${{ github.ref }}" in + case "${GITHUB_REF}" in refs/tags/v*|*beta.rc*) push_external="true" ;; @@ -230,8 +230,10 @@ jobs: docker buildx imagetools create ${tags} ${digests} - name: Inspect image + env: + FIRST_TAG: ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }} run: | - docker buildx imagetools inspect ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }} + docker buildx imagetools inspect "${FIRST_TAG}" - name: Copy to Docker Hub if: needs.build-arch.outputs.push-external == 'true' env: diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index a598a3c9d..00b6e00d6 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -10,8 +10,6 @@ concurrency: cancel-in-progress: true permissions: contents: 
read - pages: write - id-token: write env: DEFAULT_UV_VERSION: "0.10.x" DEFAULT_PYTHON_VERSION: "3.12" @@ -105,6 +103,9 @@ jobs: needs: [changes, build] if: github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.changes.outputs.docs_changed == 'true' runs-on: ubuntu-24.04 + permissions: + pages: write + id-token: write environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} diff --git a/.github/workflows/ci-frontend.yml b/.github/workflows/ci-frontend.yml index 9d4e23a1a..8a8ff6574 100644 --- a/.github/workflows/ci-frontend.yml +++ b/.github/workflows/ci-frontend.yml @@ -10,10 +10,13 @@ on: concurrency: group: frontend-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +permissions: {} jobs: changes: name: Detect Frontend Changes runs-on: ubuntu-slim + permissions: + contents: read outputs: frontend_changed: ${{ steps.force.outputs.run_all == 'true' || steps.filter.outputs.frontend == 'true' }} steps: @@ -21,6 +24,7 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: Decide run mode id: force run: | @@ -59,6 +63,8 @@ jobs: if: needs.changes.outputs.frontend_changed == 'true' name: Install Dependencies runs-on: ubuntu-24.04 + permissions: + contents: read steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -89,6 +95,8 @@ jobs: needs: [changes, install-dependencies] if: needs.changes.outputs.frontend_changed == 'true' runs-on: ubuntu-24.04 + permissions: + contents: read steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -120,6 +128,8 @@ jobs: needs: [changes, install-dependencies] if: needs.changes.outputs.frontend_changed == 'true' runs-on: ubuntu-24.04 + permissions: + contents: read strategy: fail-fast: false matrix: @@ -169,6 +179,8 @@ jobs: needs: [changes, install-dependencies] if: 
needs.changes.outputs.frontend_changed == 'true' runs-on: ubuntu-24.04 + permissions: + contents: read container: mcr.microsoft.com/playwright:v1.58.2-noble env: PLAYWRIGHT_BROWSERS_PATH: /ms-playwright @@ -212,6 +224,8 @@ jobs: needs: [changes, unit-tests, e2e-tests] if: needs.changes.outputs.frontend_changed == 'true' runs-on: ubuntu-24.04 + permissions: + contents: read steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml index 314250719..74a1c1ef8 100644 --- a/.github/workflows/ci-lint.yml +++ b/.github/workflows/ci-lint.yml @@ -9,6 +9,8 @@ on: concurrency: group: lint-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +permissions: + contents: read jobs: lint: name: Linting via prek diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml index 030e3bcad..eb7d7473d 100644 --- a/.github/workflows/ci-release.yml +++ b/.github/workflows/ci-release.yml @@ -10,10 +10,14 @@ concurrency: env: DEFAULT_UV_VERSION: "0.10.x" DEFAULT_PYTHON_VERSION: "3.12" +permissions: {} jobs: wait-for-docker: name: Wait for Docker Build runs-on: ubuntu-24.04 + permissions: + checks: read + statuses: read steps: - name: Wait for Docker build uses: lewagon/wait-on-check-action@74049309dfeff245fe8009a0137eacf28136cb3c # v1.5.0 @@ -26,6 +30,8 @@ jobs: name: Build Release needs: wait-for-docker runs-on: ubuntu-24.04 + permissions: + contents: read steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -40,8 +46,7 @@ jobs: uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0 with: node-version: 24.x - cache: 'pnpm' - cache-dependency-path: 'src-ui/pnpm-lock.yaml' + package-manager-cache: false - name: Install frontend dependencies run: cd src-ui && pnpm install - name: Build frontend @@ -56,7 +61,7 @@ jobs: uses: 
astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1 with: version: ${{ env.DEFAULT_UV_VERSION }} - enable-cache: true + enable-cache: false python-version: ${{ steps.setup-python.outputs.python-version }} - name: Install Python dependencies run: | @@ -129,6 +134,9 @@ jobs: name: Publish Release needs: build-release runs-on: ubuntu-24.04 + permissions: + contents: write + pull-requests: write outputs: prerelease: ${{ steps.get-version.outputs.prerelease }} changelog: ${{ steps.create-release.outputs.body }} @@ -141,9 +149,11 @@ jobs: path: ./ - name: Get version info id: get-version + env: + REF_NAME: ${{ github.ref_name }} run: | - echo "version=${{ github.ref_name }}" >> $GITHUB_OUTPUT - if [[ "${{ github.ref_name }}" == *"-beta.rc"* ]]; then + echo "version=${REF_NAME}" >> $GITHUB_OUTPUT + if [[ "${REF_NAME}" == *"-beta.rc"* ]]; then echo "prerelease=true" >> $GITHUB_OUTPUT else echo "prerelease=false" >> $GITHUB_OUTPUT @@ -176,6 +186,9 @@ jobs: needs: publish-release if: needs.publish-release.outputs.prerelease == 'false' runs-on: ubuntu-24.04 + permissions: + contents: write + pull-requests: write steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -191,15 +204,17 @@ jobs: uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1 with: version: ${{ env.DEFAULT_UV_VERSION }} - enable-cache: true + enable-cache: false python-version: ${{ env.DEFAULT_PYTHON_VERSION }} - name: Update changelog working-directory: docs + env: + CHANGELOG: ${{ needs.publish-release.outputs.changelog }} run: | git branch ${{ needs.publish-release.outputs.version }}-changelog git checkout ${{ needs.publish-release.outputs.version }}-changelog - echo -e "# Changelog\n\n${{ needs.publish-release.outputs.changelog }}\n" > changelog-new.md + printf '# Changelog\n\n%s\n' "${CHANGELOG}" > changelog-new.md echo "Manually linking usernames" sed -i -r 's|@([a-zA-Z0-9_]+) \(\[#|[@\1](https://github.com/\1) ([#|g' 
changelog-new.md diff --git a/.github/workflows/ci-static-analysis.yml b/.github/workflows/ci-static-analysis.yml index 99388354a..23da803b8 100644 --- a/.github/workflows/ci-static-analysis.yml +++ b/.github/workflows/ci-static-analysis.yml @@ -33,10 +33,18 @@ jobs: container: image: semgrep/semgrep:1.155.0@sha256:cc869c685dcc0fe497c86258da9f205397d8108e56d21a86082ea4886e52784d if: github.actor != 'dependabot[bot]' + permissions: + contents: read + security-events: write steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Run Semgrep - run: semgrep scan --config auto + run: semgrep scan --config auto --sarif-output results.sarif + - name: Upload results to GitHub code scanning + uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + if: always() + with: + sarif_file: results.sarif diff --git a/.github/workflows/cleanup-tags.yml b/.github/workflows/cleanup-tags.yml index 426554777..24895ffaa 100644 --- a/.github/workflows/cleanup-tags.yml +++ b/.github/workflows/cleanup-tags.yml @@ -12,6 +12,7 @@ on: concurrency: group: registry-tags-cleanup cancel-in-progress: false +permissions: {} jobs: cleanup-images: name: Cleanup Image Tags for ${{ matrix.primary-name }} diff --git a/.github/workflows/crowdin.yml b/.github/workflows/crowdin.yml index 29b4be02f..559e83917 100644 --- a/.github/workflows/crowdin.yml +++ b/.github/workflows/crowdin.yml @@ -6,6 +6,9 @@ on: push: paths: ['src/locale/**', 'src-ui/messages.xlf', 'src-ui/src/locale/**'] branches: [dev] +permissions: + contents: write + pull-requests: write jobs: synchronize-with-crowdin: name: Crowdin Sync diff --git a/.github/workflows/repo-maintenance.yml b/.github/workflows/repo-maintenance.yml index 1d4903193..42c5e66ab 100644 --- a/.github/workflows/repo-maintenance.yml +++ b/.github/workflows/repo-maintenance.yml @@ -3,10 +3,6 @@ on: schedule: - cron: '0 3 * * *' workflow_dispatch: 
-permissions: - issues: write - pull-requests: write - discussions: write concurrency: group: lock jobs: @@ -14,6 +10,9 @@ jobs: name: 'Stale' if: github.repository_owner == 'paperless-ngx' runs-on: ubuntu-24.04 + permissions: + issues: write + pull-requests: write steps: - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 with: @@ -36,6 +35,10 @@ jobs: name: 'Lock Old Threads' if: github.repository_owner == 'paperless-ngx' runs-on: ubuntu-24.04 + permissions: + issues: write + pull-requests: write + discussions: write steps: - uses: dessant/lock-threads@7266a7ce5c1df01b1c6db85bf8cd86c737dadbe7 # v6.0.0 with: @@ -56,6 +59,8 @@ jobs: name: 'Close Answered Discussions' if: github.repository_owner == 'paperless-ngx' runs-on: ubuntu-24.04 + permissions: + discussions: write steps: - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: @@ -113,6 +118,8 @@ jobs: name: 'Close Outdated Discussions' if: github.repository_owner == 'paperless-ngx' runs-on: ubuntu-24.04 + permissions: + discussions: write steps: - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: @@ -205,6 +212,8 @@ jobs: name: 'Close Unsupported Feature Requests' if: github.repository_owner == 'paperless-ngx' runs-on: ubuntu-24.04 + permissions: + discussions: write steps: - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: diff --git a/.github/zizmor.yml b/.github/zizmor.yml new file mode 100644 index 000000000..f45e6bbd4 --- /dev/null +++ b/.github/zizmor.yml @@ -0,0 +1,61 @@ +rules: + template-injection: + ignore: + # github.event_name is a GitHub-internal constant (push/pull_request/etc.), + # not attacker-controllable. + - ci-backend.yml:35 + - ci-docker.yml:74 + - ci-docs.yml:33 + - ci-frontend.yml:32 + # github.event.repository.default_branch refers to the target repo's setting, + # which only admins can change; not influenced by fork PR authors. 
+ - ci-backend.yml:47 + - ci-docs.yml:45 + - ci-frontend.yml:44 + # steps.setup-python.outputs.python-version is always a semver string (e.g. "3.12.0") + # produced by actions/setup-python from a hardcoded env var input. + - ci-backend.yml:106 + - ci-backend.yml:121 + - ci-backend.yml:169 + - ci-docs.yml:88 + - ci-docs.yml:92 + - ci-release.yml:69 + - ci-release.yml:78 + - ci-release.yml:90 + - ci-release.yml:96 + - ci-release.yml:229 + # needs.*.result is always one of: success/failure/cancelled/skipped. + - ci-backend.yml:211 + - ci-backend.yml:212 + - ci-backend.yml:216 + - ci-docs.yml:131 + - ci-docs.yml:132 + - ci-frontend.yml:259 + - ci-frontend.yml:260 + - ci-frontend.yml:264 + - ci-frontend.yml:269 + - ci-frontend.yml:274 + - ci-frontend.yml:279 + # needs.changes.outputs.* is always "true" or "false". + - ci-backend.yml:206 + - ci-docs.yml:126 + - ci-frontend.yml:254 + # steps.build.outputs.digest is always a SHA256 digest (sha256:[a-f0-9]{64}). + - ci-docker.yml:152 + # needs.publish-release.outputs.version is the git tag name (e.g. v2.14.0); + # only maintainers can push tags upstream, and the tag pattern excludes + # shell metacharacters. Used in git commands and github-script JS, not eval. + - ci-release.yml:215 + - ci-release.yml:216 + - ci-release.yml:231 + - ci-release.yml:237 + - ci-release.yml:245 + - ci-release.yml:248 + dangerous-triggers: + ignore: + # Both workflows use pull_request_target solely to label/comment on fork PRs + # (requires write-back access unavailable to pull_request). Neither workflow + # checks out PR code or executes anything from the fork — only reads PR + # metadata via context/API. Permissions are scoped to pull-requests: write. + - pr-bot.yml:2 + - project-actions.yml:2