feat!: replace ModeChoices and ArchiveFileChoices with new v3 enums

- Replace ModeChoices (SKIP/SKIP_NO_ARCHIVE/REDO/FORCE) with new values:
  AUTO, FORCE, REDO, OFF
- Remove ArchiveFileChoices entirely; add ArchiveFileGenerationChoices
  with AUTO, ALWAYS, NEVER values
- Update checks.py valid sets and default settings to use new enum values
- Update tesseract parser to use new enum comparisons; AUTO mode maps to
  skip_text behavior; FORCE/REDO bypass archive-skip early-exit
- Update all affected tests to use new valid mode/archive string values

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-26 12:50:43 -07:00
parent 338cadf284
commit cd653959d6
7 changed files with 79 additions and 75 deletions
+2 -11
View File
@@ -132,19 +132,10 @@ def settings_values_check(app_configs, **kwargs):
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
)
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_MODE == "skip_noarchive":
msgs.append(
Warning(
'OCR output mode "skip_noarchive" is deprecated and will be '
"removed in a future version. Please use "
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
),
)
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
if settings.OCR_SKIP_ARCHIVE_FILE not in {"auto", "always", "never"}:
msgs.append(
Error(
"OCR_SKIP_ARCHIVE_FILE setting "
+7 -7
View File
@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
and our own custom setting
"""
SKIP = ("skip", _("skip"))
REDO = ("redo", _("redo"))
AUTO = ("auto", _("auto"))
FORCE = ("force", _("force"))
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
REDO = ("redo", _("redo"))
OFF = ("off", _("off"))
class ArchiveFileChoices(models.TextChoices):
class ArchiveFileGenerationChoices(models.TextChoices):
"""
Settings to control creation of an archive PDF file
"""
NEVER = ("never", _("never"))
WITH_TEXT = ("with_text", _("with_text"))
AUTO = ("auto", _("auto"))
ALWAYS = ("always", _("always"))
NEVER = ("never", _("never"))
class CleanChoices(models.TextChoices):
@@ -131,7 +131,7 @@ class ApplicationConfiguration(AbstractSingletonModel):
null=True,
blank=True,
max_length=16,
choices=ArchiveFileChoices.choices,
choices=ArchiveFileGenerationChoices.choices,
)
image_dpi = models.PositiveSmallIntegerField(
+11 -15
View File
@@ -18,7 +18,7 @@ from documents.parsers import make_thumbnail_from_pdf
from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import ArchiveFileGenerationChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
from paperless.parsers.utils import read_file_handle_unicode_errors
@@ -309,10 +309,7 @@ class RasterisedDocumentParser:
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif self.settings.mode in {
ModeChoices.SKIP,
ModeChoices.SKIP_NO_ARCHIVE,
}:
elif self.settings.mode == ModeChoices.AUTO:
ocrmypdf_args["skip_text"] = True
elif self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
@@ -421,15 +418,14 @@ class RasterisedDocumentParser:
original_has_text = False
# If the original has text, and the user doesn't want an archive,
# we're done here
skip_archive_for_text = (
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
or self.settings.skip_archive_file
in {
ArchiveFileChoices.WITH_TEXT,
ArchiveFileChoices.ALWAYS,
}
)
# we're done here (but not when force/redo mode is explicitly requested)
skip_archive_for_text = self.settings.mode not in {
ModeChoices.FORCE,
ModeChoices.REDO,
} and self.settings.skip_archive_file in {
ArchiveFileGenerationChoices.NEVER,
ArchiveFileGenerationChoices.AUTO,
}
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
@@ -459,7 +455,7 @@ class RasterisedDocumentParser:
self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
if self.settings.skip_archive_file != ArchiveFileGenerationChoices.NEVER:
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
+11 -3
View File
@@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings
from paperless.settings.custom import parse_ignore_dates
from paperless.settings.custom import parse_redis_url
from paperless.settings.parsers import get_bool_from_env
from paperless.settings.parsers import get_choice_from_env
from paperless.settings.parsers import get_float_from_env
from paperless.settings.parsers import get_int_from_env
from paperless.settings.parsers import get_list_from_env
@@ -874,10 +875,17 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# OCRmyPDF --output-type options are available.
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# auto, force, redo, off
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_MODE = get_choice_from_env(
"PAPERLESS_OCR_MODE",
{"auto", "force", "redo", "off"},
default="auto",
)
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_SKIP_ARCHIVE_FILE = get_choice_from_env(
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
{"auto", "always", "never"},
default="auto",
)
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")
@@ -93,7 +93,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
"""
with override_settings(OCR_MODE="redo"):
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.SKIP
instance.mode = ModeChoices.AUTO
instance.save()
params = self.get_params()
@@ -386,6 +386,7 @@ class TestParsePdf:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.skip_archive_file = "always"
tesseract_parser.parse(
tesseract_samples_dir / "with-form.pdf",
"application/pdf",
@@ -433,7 +434,7 @@ class TestParsePdf:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "skip"
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
assert tesseract_parser.archive_path is None
assert_ordered_substrings(
@@ -449,7 +450,7 @@ class TestParsePdf:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "skip"
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "encrypted.pdf",
"application/pdf",
@@ -545,6 +546,7 @@ class TestParseMultiPage:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.skip_archive_file = "always"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-digital.pdf",
"application/pdf",
@@ -559,7 +561,7 @@ class TestParseMultiPage:
@pytest.mark.parametrize(
"mode",
[
pytest.param("skip", id="skip"),
pytest.param("auto", id="auto"),
pytest.param("redo", id="redo"),
pytest.param("force", id="force"),
],
@@ -572,6 +574,7 @@ class TestParseMultiPage:
) -> None:
tesseract_parser.settings.pages = 2
tesseract_parser.settings.mode = mode
tesseract_parser.settings.skip_archive_file = "always"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-digital.pdf",
"application/pdf",
@@ -587,7 +590,7 @@ class TestParseMultiPage:
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "skip"
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-images.pdf",
"application/pdf",
@@ -735,13 +738,13 @@ class TestSkipArchive:
"""
GIVEN:
- File with existing text layer
- Mode: skip_noarchive
- Mode: auto, skip_archive_file: auto
WHEN:
- Document is parsed
THEN:
- Text extracted; no archive created
- Text extracted; no archive created (text exists, auto skips OCR)
"""
tesseract_parser.settings.mode = "skip_noarchive"
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-digital.pdf",
"application/pdf",
@@ -760,13 +763,13 @@ class TestSkipArchive:
"""
GIVEN:
- File with image-only pages (no text layer)
- Mode: skip_noarchive
- Mode: auto, skip_archive_file: auto
WHEN:
- Document is parsed
THEN:
- Text extracted; archive created (OCR needed)
- Text extracted; archive created (OCR needed, no existing text)
"""
tesseract_parser.settings.mode = "skip_noarchive"
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-images.pdf",
"application/pdf",
@@ -780,27 +783,32 @@ class TestSkipArchive:
@pytest.mark.parametrize(
("skip_archive_file", "filename", "expect_archive"),
[
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
pytest.param(
"with_text",
"multi-page-digital.pdf",
False,
id="with-text-layer",
),
pytest.param(
"with_text",
"multi-page-images.pdf",
True,
id="with-text-no-layer",
),
pytest.param(
"always",
"multi-page-digital.pdf",
False,
True,
id="always-with-text",
),
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
pytest.param("always", "multi-page-images.pdf", True, id="always-no-text"),
pytest.param(
"auto",
"multi-page-digital.pdf",
False,
id="auto-with-text-layer",
),
pytest.param(
"auto",
"multi-page-images.pdf",
True,
id="auto-no-text-layer",
),
pytest.param(
"never",
"multi-page-digital.pdf",
False,
id="never-with-text",
),
pytest.param("never", "multi-page-images.pdf", False, id="never-no-text"),
],
)
def test_skip_archive_file_setting(
@@ -835,13 +843,14 @@ class TestParseMixed:
"""
GIVEN:
- File with text in some pages (image) and some pages (digital)
- Mode: skip
- Mode: auto (skip_text), skip_archive_file: always
WHEN:
- Document is parsed
THEN:
- All pages extracted; archive created; sidecar notes skipped pages
"""
tesseract_parser.settings.mode = "skip"
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.skip_archive_file = "always"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-mixed.pdf",
"application/pdf",
@@ -899,13 +908,13 @@ class TestParseMixed:
"""
GIVEN:
- File with mixed pages
- Mode: skip_noarchive
- Mode: auto, skip_archive_file: auto
WHEN:
- Document is parsed
THEN:
- No archive created (file has text layer); later-page text present
"""
tesseract_parser.settings.mode = "skip_noarchive"
tesseract_parser.settings.mode = "auto"
tesseract_parser.parse(
tesseract_samples_dir / "multi-page-mixed.pdf",
"application/pdf",
@@ -923,12 +932,12 @@ class TestParseMixed:
class TestParseRotate:
def test_rotate_skip_mode(
def test_rotate_auto_mode(
self,
tesseract_parser: RasterisedDocumentParser,
tesseract_samples_dir: Path,
) -> None:
tesseract_parser.settings.mode = "skip"
tesseract_parser.settings.mode = "auto"
tesseract_parser.settings.rotate = True
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
assert_ordered_substrings(
@@ -1023,11 +1032,11 @@ class TestOcrmypdfParameters:
assert ("clean" in params) == expected_clean
assert ("clean_final" in params) == expected_clean_final
def test_clean_final_skip_mode(
def test_clean_final_auto_mode(
self,
make_tesseract_parser: MakeTesseractParser,
) -> None:
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
params = parser.construct_ocrmypdf_parameters("", "", "", "")
assert params["clean_final"] is True
assert "clean" not in params
@@ -1044,9 +1053,9 @@ class TestOcrmypdfParameters:
@pytest.mark.parametrize(
("ocr_mode", "ocr_deskew", "expect_deskew"),
[
pytest.param("skip", True, True, id="skip-deskew-on"),
pytest.param("auto", True, True, id="auto-deskew-on"),
pytest.param("redo", True, False, id="redo-deskew-off"),
pytest.param("skip", False, False, id="skip-no-deskew"),
pytest.param("auto", False, False, id="auto-no-deskew"),
],
)
def test_deskew_option(
+2 -2
View File
@@ -132,8 +132,8 @@ class TestOcrSettingsChecks:
pytest.param(
"OCR_MODE",
"skip_noarchive",
"deprecated",
id="deprecated-mode",
'OCR output mode "skip_noarchive"',
id="deprecated-mode-now-invalid",
),
pytest.param(
"OCR_SKIP_ARCHIVE_FILE",