feat!: replace ModeChoices and ArchiveFileChoices with new v3 enums

- Replace ModeChoices (SKIP/SKIP_NO_ARCHIVE/REDO/FORCE) with new values: AUTO, FORCE, REDO, OFF
- Remove ArchiveFileChoices entirely; add ArchiveFileGenerationChoices with AUTO, ALWAYS, NEVER values
- Update checks.py valid sets and default settings to use new enum values
- Update tesseract parser to use new enum comparisons; AUTO mode maps to skip_text behavior; FORCE/REDO bypass archive-skip early-exit
- Update all affected tests to use new valid mode/archive string values

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 19:59:46 +00:00 · 2026-03-26 12:50:43 -07:00
parent 338cadf284
commit cd653959d6
7 changed files with 79 additions and 75 deletions
@@ -132,19 +132,10 @@ def settings_values_check(app_configs, **kwargs):
                Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
            )

-        if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
+        if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
            msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))

-        if settings.OCR_MODE == "skip_noarchive":
-            msgs.append(
-                Warning(
-                    'OCR output mode "skip_noarchive" is deprecated and will be '
-                    "removed in a future version. Please use "
-                    "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
-                ),
-            )
-
-        if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
+        if settings.OCR_SKIP_ARCHIVE_FILE not in {"auto", "always", "never"}:
            msgs.append(
                Error(
                    "OCR_SKIP_ARCHIVE_FILE setting "
@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
    and our own custom setting
    """

-    SKIP = ("skip", _("skip"))
-    REDO = ("redo", _("redo"))
+    AUTO = ("auto", _("auto"))
    FORCE = ("force", _("force"))
-    SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
+    REDO = ("redo", _("redo"))
+    OFF = ("off", _("off"))


-class ArchiveFileChoices(models.TextChoices):
+class ArchiveFileGenerationChoices(models.TextChoices):
    """
    Settings to control creation of an archive PDF file
    """

-    NEVER = ("never", _("never"))
-    WITH_TEXT = ("with_text", _("with_text"))
+    AUTO = ("auto", _("auto"))
    ALWAYS = ("always", _("always"))
+    NEVER = ("never", _("never"))


 class CleanChoices(models.TextChoices):
@@ -131,7 +131,7 @@ class ApplicationConfiguration(AbstractSingletonModel):
        null=True,
        blank=True,
        max_length=16,
-        choices=ArchiveFileChoices.choices,
+        choices=ArchiveFileGenerationChoices.choices,
    )

    image_dpi = models.PositiveSmallIntegerField(
@@ -18,7 +18,7 @@ from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
 from documents.utils import run_subprocess
 from paperless.config import OcrConfig
-from paperless.models import ArchiveFileChoices
+from paperless.models import ArchiveFileGenerationChoices
 from paperless.models import CleanChoices
 from paperless.models import ModeChoices
 from paperless.parsers.utils import read_file_handle_unicode_errors
@@ -309,10 +309,7 @@ class RasterisedDocumentParser:

        if self.settings.mode == ModeChoices.FORCE or safe_fallback:
            ocrmypdf_args["force_ocr"] = True
-        elif self.settings.mode in {
-            ModeChoices.SKIP,
-            ModeChoices.SKIP_NO_ARCHIVE,
-        }:
+        elif self.settings.mode == ModeChoices.AUTO:
            ocrmypdf_args["skip_text"] = True
        elif self.settings.mode == ModeChoices.REDO:
            ocrmypdf_args["redo_ocr"] = True
@@ -421,15 +418,14 @@ class RasterisedDocumentParser:
            original_has_text = False

        # If the original has text, and the user doesn't want an archive,
-        # we're done here
-        skip_archive_for_text = (
-            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
-            or self.settings.skip_archive_file
-            in {
-                ArchiveFileChoices.WITH_TEXT,
-                ArchiveFileChoices.ALWAYS,
-            }
-        )
+        # we're done here (but not when force/redo mode is explicitly requested)
+        skip_archive_for_text = self.settings.mode not in {
+            ModeChoices.FORCE,
+            ModeChoices.REDO,
+        } and self.settings.skip_archive_file in {
+            ArchiveFileGenerationChoices.NEVER,
+            ArchiveFileGenerationChoices.AUTO,
+        }
        if skip_archive_for_text and original_has_text:
            self.log.debug("Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
@@ -459,7 +455,7 @@ class RasterisedDocumentParser:
            self.log.debug(f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

-            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
+            if self.settings.skip_archive_file != ArchiveFileGenerationChoices.NEVER:
                self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)
@@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings
 from paperless.settings.custom import parse_ignore_dates
 from paperless.settings.custom import parse_redis_url
 from paperless.settings.parsers import get_bool_from_env
+from paperless.settings.parsers import get_choice_from_env
 from paperless.settings.parsers import get_float_from_env
 from paperless.settings.parsers import get_int_from_env
 from paperless.settings.parsers import get_list_from_env
@@ -874,10 +875,17 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # OCRmyPDF --output-type options are available.
 OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")

-# skip. redo, force
-OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
+OCR_MODE = get_choice_from_env(
+    "PAPERLESS_OCR_MODE",
+    {"auto", "force", "redo", "off"},
+    default="auto",
+)

-OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
+OCR_SKIP_ARCHIVE_FILE = get_choice_from_env(
+    "PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
+    {"auto", "always", "never"},
+    default="auto",
+)

 OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")

@@ -93,7 +93,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
        """
        with override_settings(OCR_MODE="redo"):
            instance = ApplicationConfiguration.objects.all().first()
-            instance.mode = ModeChoices.SKIP
+            instance.mode = ModeChoices.AUTO
            instance.save()

            params = self.get_params()
@@ -386,6 +386,7 @@ class TestParsePdf:
        tesseract_parser: RasterisedDocumentParser,
        tesseract_samples_dir: Path,
    ) -> None:
+        tesseract_parser.settings.skip_archive_file = "always"
        tesseract_parser.parse(
            tesseract_samples_dir / "with-form.pdf",
            "application/pdf",
@@ -433,7 +434,7 @@ class TestParsePdf:
        tesseract_parser: RasterisedDocumentParser,
        tesseract_samples_dir: Path,
    ) -> None:
-        tesseract_parser.settings.mode = "skip"
+        tesseract_parser.settings.mode = "auto"
        tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
        assert tesseract_parser.archive_path is None
        assert_ordered_substrings(
@@ -449,7 +450,7 @@ class TestParsePdf:
        tesseract_parser: RasterisedDocumentParser,
        tesseract_samples_dir: Path,
    ) -> None:
-        tesseract_parser.settings.mode = "skip"
+        tesseract_parser.settings.mode = "auto"
        tesseract_parser.parse(
            tesseract_samples_dir / "encrypted.pdf",
            "application/pdf",
@@ -545,6 +546,7 @@ class TestParseMultiPage:
        tesseract_parser: RasterisedDocumentParser,
        tesseract_samples_dir: Path,
    ) -> None:
+        tesseract_parser.settings.skip_archive_file = "always"
        tesseract_parser.parse(
            tesseract_samples_dir / "multi-page-digital.pdf",
            "application/pdf",
@@ -559,7 +561,7 @@ class TestParseMultiPage:
    @pytest.mark.parametrize(
        "mode",
        [
-            pytest.param("skip", id="skip"),
+            pytest.param("auto", id="auto"),
            pytest.param("redo", id="redo"),
            pytest.param("force", id="force"),
        ],
@@ -572,6 +574,7 @@ class TestParseMultiPage:
    ) -> None:
        tesseract_parser.settings.pages = 2
        tesseract_parser.settings.mode = mode
+        tesseract_parser.settings.skip_archive_file = "always"
        tesseract_parser.parse(
            tesseract_samples_dir / "multi-page-digital.pdf",
            "application/pdf",
@@ -587,7 +590,7 @@ class TestParseMultiPage:
        tesseract_parser: RasterisedDocumentParser,
        tesseract_samples_dir: Path,
    ) -> None:
-        tesseract_parser.settings.mode = "skip"
+        tesseract_parser.settings.mode = "auto"
        tesseract_parser.parse(
            tesseract_samples_dir / "multi-page-images.pdf",
            "application/pdf",
@@ -735,13 +738,13 @@ class TestSkipArchive:
        """
        GIVEN:
            - File with existing text layer
-            - Mode: skip_noarchive
+            - Mode: auto, skip_archive_file: auto
        WHEN:
            - Document is parsed
        THEN:
-            - Text extracted; no archive created
+            - Text extracted; no archive created (text exists, auto skips OCR)
        """
-        tesseract_parser.settings.mode = "skip_noarchive"
+        tesseract_parser.settings.mode = "auto"
        tesseract_parser.parse(
            tesseract_samples_dir / "multi-page-digital.pdf",
            "application/pdf",
@@ -760,13 +763,13 @@ class TestSkipArchive:
        """
        GIVEN:
            - File with image-only pages (no text layer)
-            - Mode: skip_noarchive
+            - Mode: auto, skip_archive_file: auto
        WHEN:
            - Document is parsed
        THEN:
-            - Text extracted; archive created (OCR needed)
+            - Text extracted; archive created (OCR needed, no existing text)
        """
-        tesseract_parser.settings.mode = "skip_noarchive"
+        tesseract_parser.settings.mode = "auto"
        tesseract_parser.parse(
            tesseract_samples_dir / "multi-page-images.pdf",
            "application/pdf",
@@ -780,27 +783,32 @@ class TestSkipArchive:
    @pytest.mark.parametrize(
        ("skip_archive_file", "filename", "expect_archive"),
        [
-            pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
-            pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
-            pytest.param(
-                "with_text",
-                "multi-page-digital.pdf",
-                False,
-                id="with-text-layer",
-            ),
-            pytest.param(
-                "with_text",
-                "multi-page-images.pdf",
-                True,
-                id="with-text-no-layer",
-            ),
            pytest.param(
                "always",
                "multi-page-digital.pdf",
-                False,
+                True,
                id="always-with-text",
            ),
-            pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
+            pytest.param("always", "multi-page-images.pdf", True, id="always-no-text"),
+            pytest.param(
+                "auto",
+                "multi-page-digital.pdf",
+                False,
+                id="auto-with-text-layer",
+            ),
+            pytest.param(
+                "auto",
+                "multi-page-images.pdf",
+                True,
+                id="auto-no-text-layer",
+            ),
+            pytest.param(
+                "never",
+                "multi-page-digital.pdf",
+                False,
+                id="never-with-text",
+            ),
+            pytest.param("never", "multi-page-images.pdf", False, id="never-no-text"),
        ],
    )
    def test_skip_archive_file_setting(
@@ -835,13 +843,14 @@ class TestParseMixed:
        """
        GIVEN:
            - File with text in some pages (image) and some pages (digital)
-            - Mode: skip
+            - Mode: auto (skip_text), skip_archive_file: always
        WHEN:
            - Document is parsed
        THEN:
            - All pages extracted; archive created; sidecar notes skipped pages
        """
-        tesseract_parser.settings.mode = "skip"
+        tesseract_parser.settings.mode = "auto"
+        tesseract_parser.settings.skip_archive_file = "always"
        tesseract_parser.parse(
            tesseract_samples_dir / "multi-page-mixed.pdf",
            "application/pdf",
@@ -899,13 +908,13 @@ class TestParseMixed:
        """
        GIVEN:
            - File with mixed pages
-            - Mode: skip_noarchive
+            - Mode: auto, skip_archive_file: auto
        WHEN:
            - Document is parsed
        THEN:
            - No archive created (file has text layer); later-page text present
        """
-        tesseract_parser.settings.mode = "skip_noarchive"
+        tesseract_parser.settings.mode = "auto"
        tesseract_parser.parse(
            tesseract_samples_dir / "multi-page-mixed.pdf",
            "application/pdf",
@@ -923,12 +932,12 @@ class TestParseMixed:


 class TestParseRotate:
-    def test_rotate_skip_mode(
+    def test_rotate_auto_mode(
        self,
        tesseract_parser: RasterisedDocumentParser,
        tesseract_samples_dir: Path,
    ) -> None:
-        tesseract_parser.settings.mode = "skip"
+        tesseract_parser.settings.mode = "auto"
        tesseract_parser.settings.rotate = True
        tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
        assert_ordered_substrings(
@@ -1023,11 +1032,11 @@ class TestOcrmypdfParameters:
        assert ("clean" in params) == expected_clean
        assert ("clean_final" in params) == expected_clean_final

-    def test_clean_final_skip_mode(
+    def test_clean_final_auto_mode(
        self,
        make_tesseract_parser: MakeTesseractParser,
    ) -> None:
-        with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
+        with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
        assert params["clean_final"] is True
        assert "clean" not in params
@@ -1044,9 +1053,9 @@ class TestOcrmypdfParameters:
    @pytest.mark.parametrize(
        ("ocr_mode", "ocr_deskew", "expect_deskew"),
        [
-            pytest.param("skip", True, True, id="skip-deskew-on"),
+            pytest.param("auto", True, True, id="auto-deskew-on"),
            pytest.param("redo", True, False, id="redo-deskew-off"),
-            pytest.param("skip", False, False, id="skip-no-deskew"),
+            pytest.param("auto", False, False, id="auto-no-deskew"),
        ],
    )
    def test_deskew_option(
@@ -132,8 +132,8 @@ class TestOcrSettingsChecks:
            pytest.param(
                "OCR_MODE",
                "skip_noarchive",
-                "deprecated",
-                id="deprecated-mode",
+                'OCR output mode "skip_noarchive"',
+                id="deprecated-mode-now-invalid",
            ),
            pytest.param(
                "OCR_SKIP_ARCHIVE_FILE",