mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-12 01:25:23 +00:00
feat!: replace ModeChoices and ArchiveFileChoices with new v3 enums
- Replace ModeChoices (SKIP/SKIP_NO_ARCHIVE/REDO/FORCE) with new values: AUTO, FORCE, REDO, OFF
- Remove ArchiveFileChoices entirely; add ArchiveFileGenerationChoices with AUTO, ALWAYS, NEVER values
- Update checks.py valid sets and default settings to use new enum values
- Update tesseract parser to use new enum comparisons; AUTO mode maps to skip_text behavior; FORCE/REDO bypass archive-skip early-exit
- Update all affected tests to use new valid mode/archive string values

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+2
-11
@@ -132,19 +132,10 @@ def settings_values_check(app_configs, **kwargs):
|
||||
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
||||
)
|
||||
|
||||
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||
if settings.OCR_MODE not in {"auto", "force", "redo", "off"}:
|
||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||
|
||||
if settings.OCR_MODE == "skip_noarchive":
|
||||
msgs.append(
|
||||
Warning(
|
||||
'OCR output mode "skip_noarchive" is deprecated and will be '
|
||||
"removed in a future version. Please use "
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
|
||||
),
|
||||
)
|
||||
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE not in {"auto", "always", "never"}:
|
||||
msgs.append(
|
||||
Error(
|
||||
"OCR_SKIP_ARCHIVE_FILE setting "
|
||||
|
||||
@@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices):
|
||||
and our own custom setting
|
||||
"""
|
||||
|
||||
SKIP = ("skip", _("skip"))
|
||||
REDO = ("redo", _("redo"))
|
||||
AUTO = ("auto", _("auto"))
|
||||
FORCE = ("force", _("force"))
|
||||
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
|
||||
REDO = ("redo", _("redo"))
|
||||
OFF = ("off", _("off"))
|
||||
|
||||
|
||||
class ArchiveFileChoices(models.TextChoices):
|
||||
class ArchiveFileGenerationChoices(models.TextChoices):
|
||||
"""
|
||||
Settings to control creation of an archive PDF file
|
||||
"""
|
||||
|
||||
NEVER = ("never", _("never"))
|
||||
WITH_TEXT = ("with_text", _("with_text"))
|
||||
AUTO = ("auto", _("auto"))
|
||||
ALWAYS = ("always", _("always"))
|
||||
NEVER = ("never", _("never"))
|
||||
|
||||
|
||||
class CleanChoices(models.TextChoices):
|
||||
@@ -131,7 +131,7 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=16,
|
||||
choices=ArchiveFileChoices.choices,
|
||||
choices=ArchiveFileGenerationChoices.choices,
|
||||
)
|
||||
|
||||
image_dpi = models.PositiveSmallIntegerField(
|
||||
|
||||
@@ -18,7 +18,7 @@ from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
@@ -309,10 +309,7 @@ class RasterisedDocumentParser:
|
||||
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.settings.mode in {
|
||||
ModeChoices.SKIP,
|
||||
ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
elif self.settings.mode == ModeChoices.AUTO:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
@@ -421,15 +418,14 @@ class RasterisedDocumentParser:
|
||||
original_has_text = False
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.settings.skip_archive_file
|
||||
in {
|
||||
ArchiveFileChoices.WITH_TEXT,
|
||||
ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
# we're done here (but not when force/redo mode is explicitly requested)
|
||||
skip_archive_for_text = self.settings.mode not in {
|
||||
ModeChoices.FORCE,
|
||||
ModeChoices.REDO,
|
||||
} and self.settings.skip_archive_file in {
|
||||
ArchiveFileGenerationChoices.NEVER,
|
||||
ArchiveFileGenerationChoices.AUTO,
|
||||
}
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
@@ -459,7 +455,7 @@ class RasterisedDocumentParser:
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||
if self.settings.skip_archive_file != ArchiveFileGenerationChoices.NEVER:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
|
||||
@@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings
|
||||
from paperless.settings.custom import parse_ignore_dates
|
||||
from paperless.settings.custom import parse_redis_url
|
||||
from paperless.settings.parsers import get_bool_from_env
|
||||
from paperless.settings.parsers import get_choice_from_env
|
||||
from paperless.settings.parsers import get_float_from_env
|
||||
from paperless.settings.parsers import get_int_from_env
|
||||
from paperless.settings.parsers import get_list_from_env
|
||||
@@ -874,10 +875,17 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
# OCRmyPDF --output-type options are available.
|
||||
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
||||
|
||||
# skip. redo, force
|
||||
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||
OCR_MODE = get_choice_from_env(
|
||||
"PAPERLESS_OCR_MODE",
|
||||
{"auto", "force", "redo", "off"},
|
||||
default="auto",
|
||||
)
|
||||
|
||||
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
|
||||
OCR_SKIP_ARCHIVE_FILE = get_choice_from_env(
|
||||
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE",
|
||||
{"auto", "always", "never"},
|
||||
default="auto",
|
||||
)
|
||||
|
||||
OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI")
|
||||
|
||||
|
||||
@@ -93,7 +93,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
"""
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.SKIP
|
||||
instance.mode = ModeChoices.AUTO
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
|
||||
@@ -386,6 +386,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.skip_archive_file = "always"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "with-form.pdf",
|
||||
"application/pdf",
|
||||
@@ -433,7 +434,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf")
|
||||
assert tesseract_parser.archive_path is None
|
||||
assert_ordered_substrings(
|
||||
@@ -449,7 +450,7 @@ class TestParsePdf:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "encrypted.pdf",
|
||||
"application/pdf",
|
||||
@@ -545,6 +546,7 @@ class TestParseMultiPage:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.skip_archive_file = "always"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
@@ -559,7 +561,7 @@ class TestParseMultiPage:
|
||||
@pytest.mark.parametrize(
|
||||
"mode",
|
||||
[
|
||||
pytest.param("skip", id="skip"),
|
||||
pytest.param("auto", id="auto"),
|
||||
pytest.param("redo", id="redo"),
|
||||
pytest.param("force", id="force"),
|
||||
],
|
||||
@@ -572,6 +574,7 @@ class TestParseMultiPage:
|
||||
) -> None:
|
||||
tesseract_parser.settings.pages = 2
|
||||
tesseract_parser.settings.mode = mode
|
||||
tesseract_parser.settings.skip_archive_file = "always"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
@@ -587,7 +590,7 @@ class TestParseMultiPage:
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -735,13 +738,13 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; no archive created
|
||||
- Text extracted; no archive created (text exists, auto skips OCR)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-digital.pdf",
|
||||
"application/pdf",
|
||||
@@ -760,13 +763,13 @@ class TestSkipArchive:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with image-only pages (no text layer)
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text extracted; archive created (OCR needed)
|
||||
- Text extracted; archive created (OCR needed, no existing text)
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-images.pdf",
|
||||
"application/pdf",
|
||||
@@ -780,27 +783,32 @@ class TestSkipArchive:
|
||||
@pytest.mark.parametrize(
|
||||
("skip_archive_file", "filename", "expect_archive"),
|
||||
[
|
||||
pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"),
|
||||
pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="with-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
"with_text",
|
||||
"multi-page-images.pdf",
|
||||
True,
|
||||
id="with-text-no-layer",
|
||||
),
|
||||
pytest.param(
|
||||
"always",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
True,
|
||||
id="always-with-text",
|
||||
),
|
||||
pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"),
|
||||
pytest.param("always", "multi-page-images.pdf", True, id="always-no-text"),
|
||||
pytest.param(
|
||||
"auto",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="auto-with-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
"auto",
|
||||
"multi-page-images.pdf",
|
||||
True,
|
||||
id="auto-no-text-layer",
|
||||
),
|
||||
pytest.param(
|
||||
"never",
|
||||
"multi-page-digital.pdf",
|
||||
False,
|
||||
id="never-with-text",
|
||||
),
|
||||
pytest.param("never", "multi-page-images.pdf", False, id="never-no-text"),
|
||||
],
|
||||
)
|
||||
def test_skip_archive_file_setting(
|
||||
@@ -835,13 +843,14 @@ class TestParseMixed:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text in some pages (image) and some pages (digital)
|
||||
- Mode: skip
|
||||
- Mode: auto (skip_text), skip_archive_file: always
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- All pages extracted; archive created; sidecar notes skipped pages
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.skip_archive_file = "always"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
@@ -899,13 +908,13 @@ class TestParseMixed:
|
||||
"""
|
||||
GIVEN:
|
||||
- File with mixed pages
|
||||
- Mode: skip_noarchive
|
||||
- Mode: auto, skip_archive_file: auto
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- No archive created (file has text layer); later-page text present
|
||||
"""
|
||||
tesseract_parser.settings.mode = "skip_noarchive"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.parse(
|
||||
tesseract_samples_dir / "multi-page-mixed.pdf",
|
||||
"application/pdf",
|
||||
@@ -923,12 +932,12 @@ class TestParseMixed:
|
||||
|
||||
|
||||
class TestParseRotate:
|
||||
def test_rotate_skip_mode(
|
||||
def test_rotate_auto_mode(
|
||||
self,
|
||||
tesseract_parser: RasterisedDocumentParser,
|
||||
tesseract_samples_dir: Path,
|
||||
) -> None:
|
||||
tesseract_parser.settings.mode = "skip"
|
||||
tesseract_parser.settings.mode = "auto"
|
||||
tesseract_parser.settings.rotate = True
|
||||
tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf")
|
||||
assert_ordered_substrings(
|
||||
@@ -1023,11 +1032,11 @@ class TestOcrmypdfParameters:
|
||||
assert ("clean" in params) == expected_clean
|
||||
assert ("clean_final" in params) == expected_clean_final
|
||||
|
||||
def test_clean_final_skip_mode(
|
||||
def test_clean_final_auto_mode(
|
||||
self,
|
||||
make_tesseract_parser: MakeTesseractParser,
|
||||
) -> None:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser:
|
||||
with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser:
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
assert params["clean_final"] is True
|
||||
assert "clean" not in params
|
||||
@@ -1044,9 +1053,9 @@ class TestOcrmypdfParameters:
|
||||
@pytest.mark.parametrize(
|
||||
("ocr_mode", "ocr_deskew", "expect_deskew"),
|
||||
[
|
||||
pytest.param("skip", True, True, id="skip-deskew-on"),
|
||||
pytest.param("auto", True, True, id="auto-deskew-on"),
|
||||
pytest.param("redo", True, False, id="redo-deskew-off"),
|
||||
pytest.param("skip", False, False, id="skip-no-deskew"),
|
||||
pytest.param("auto", False, False, id="auto-no-deskew"),
|
||||
],
|
||||
)
|
||||
def test_deskew_option(
|
||||
|
||||
@@ -132,8 +132,8 @@ class TestOcrSettingsChecks:
|
||||
pytest.param(
|
||||
"OCR_MODE",
|
||||
"skip_noarchive",
|
||||
"deprecated",
|
||||
id="deprecated-mode",
|
||||
'OCR output mode "skip_noarchive"',
|
||||
id="deprecated-mode-now-invalid",
|
||||
),
|
||||
pytest.param(
|
||||
"OCR_SKIP_ARCHIVE_FILE",
|
||||
|
||||
Reference in New Issue
Block a user