diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 5f069b547..e8ced2a76 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -132,19 +132,10 @@ def settings_values_check(app_configs, **kwargs): Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), ) - if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: + if settings.OCR_MODE not in {"auto", "force", "redo", "off"}: msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) - if settings.OCR_MODE == "skip_noarchive": - msgs.append( - Warning( - 'OCR output mode "skip_noarchive" is deprecated and will be ' - "removed in a future version. Please use " - "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.", - ), - ) - - if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}: + if settings.OCR_SKIP_ARCHIVE_FILE not in {"auto", "always", "never"}: msgs.append( Error( "OCR_SKIP_ARCHIVE_FILE setting " diff --git a/src/paperless/models.py b/src/paperless/models.py index c67f16b03..1e5a58bb2 100644 --- a/src/paperless/models.py +++ b/src/paperless/models.py @@ -36,20 +36,20 @@ class ModeChoices(models.TextChoices): and our own custom setting """ - SKIP = ("skip", _("skip")) - REDO = ("redo", _("redo")) + AUTO = ("auto", _("auto")) FORCE = ("force", _("force")) - SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive")) + REDO = ("redo", _("redo")) + OFF = ("off", _("off")) -class ArchiveFileChoices(models.TextChoices): +class ArchiveFileGenerationChoices(models.TextChoices): """ Settings to control creation of an archive PDF file """ - NEVER = ("never", _("never")) - WITH_TEXT = ("with_text", _("with_text")) + AUTO = ("auto", _("auto")) ALWAYS = ("always", _("always")) + NEVER = ("never", _("never")) class CleanChoices(models.TextChoices): @@ -131,7 +131,7 @@ class ApplicationConfiguration(AbstractSingletonModel): null=True, blank=True, max_length=16, - choices=ArchiveFileChoices.choices, + choices=ArchiveFileGenerationChoices.choices, ) image_dpi = models.PositiveSmallIntegerField( diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index 99cff36aa..5379cc1a5 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -18,7 +18,7 @@ from documents.parsers import make_thumbnail_from_pdf from documents.utils import maybe_override_pixel_limit from documents.utils import run_subprocess from paperless.config import OcrConfig -from paperless.models import ArchiveFileChoices +from paperless.models import ArchiveFileGenerationChoices from paperless.models import CleanChoices from paperless.models import ModeChoices from paperless.parsers.utils import read_file_handle_unicode_errors @@ -309,10 +309,7 @@ class RasterisedDocumentParser: if self.settings.mode == ModeChoices.FORCE or safe_fallback: ocrmypdf_args["force_ocr"] = True - elif self.settings.mode in { - ModeChoices.SKIP, - ModeChoices.SKIP_NO_ARCHIVE, - }: + elif self.settings.mode == ModeChoices.AUTO: ocrmypdf_args["skip_text"] = True elif self.settings.mode == ModeChoices.REDO: ocrmypdf_args["redo_ocr"] = True @@ -421,15 +418,14 @@ class RasterisedDocumentParser: original_has_text = False # If the original has text, and the user doesn't want an archive, - # we're done here - skip_archive_for_text = ( - self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE - or self.settings.skip_archive_file - in { - ArchiveFileChoices.WITH_TEXT, - ArchiveFileChoices.ALWAYS, - } - ) + # we're done here (but not when force/redo mode is explicitly requested) + skip_archive_for_text = self.settings.mode not in { + ModeChoices.FORCE, + ModeChoices.REDO, + } and self.settings.skip_archive_file in { + ArchiveFileGenerationChoices.NEVER, + ArchiveFileGenerationChoices.AUTO, + } if skip_archive_for_text and original_has_text: self.log.debug("Document has text, skipping OCRmyPDF entirely.") self.text = text_original @@ -459,7 +455,7 @@ class RasterisedDocumentParser: self.log.debug(f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS: + if self.settings.skip_archive_file != ArchiveFileGenerationChoices.NEVER: self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless/settings/__init__.py b/src/paperless/settings/__init__.py index 1c33db7c6..e0bd0ac94 100644 --- a/src/paperless/settings/__init__.py +++ b/src/paperless/settings/__init__.py @@ -21,6 +21,7 @@ from paperless.settings.custom import parse_hosting_settings from paperless.settings.custom import parse_ignore_dates from paperless.settings.custom import parse_redis_url from paperless.settings.parsers import get_bool_from_env +from paperless.settings.parsers import get_choice_from_env from paperless.settings.parsers import get_float_from_env from paperless.settings.parsers import get_int_from_env from paperless.settings.parsers import get_list_from_env @@ -874,10 +875,17 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") # OCRmyPDF --output-type options are available. OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") -# skip. redo, force -OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") +OCR_MODE = get_choice_from_env( + "PAPERLESS_OCR_MODE", + {"auto", "force", "redo", "off"}, + default="auto", +) -OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never") +OCR_SKIP_ARCHIVE_FILE = get_choice_from_env( + "PAPERLESS_OCR_SKIP_ARCHIVE_FILE", + {"auto", "always", "never"}, + default="auto", +) OCR_IMAGE_DPI = get_int_from_env("PAPERLESS_OCR_IMAGE_DPI") diff --git a/src/paperless/tests/parsers/test_tesseract_custom_settings.py b/src/paperless/tests/parsers/test_tesseract_custom_settings.py index 60d1486f4..bade65ef1 100644 --- a/src/paperless/tests/parsers/test_tesseract_custom_settings.py +++ b/src/paperless/tests/parsers/test_tesseract_custom_settings.py @@ -93,7 +93,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas """ with override_settings(OCR_MODE="redo"): instance = ApplicationConfiguration.objects.all().first() - instance.mode = ModeChoices.SKIP + instance.mode = ModeChoices.AUTO instance.save() params = self.get_params() diff --git a/src/paperless/tests/parsers/test_tesseract_parser.py b/src/paperless/tests/parsers/test_tesseract_parser.py index daa7020c7..2be76ad4b 100644 --- a/src/paperless/tests/parsers/test_tesseract_parser.py +++ b/src/paperless/tests/parsers/test_tesseract_parser.py @@ -386,6 +386,7 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: + tesseract_parser.settings.skip_archive_file = "always" tesseract_parser.parse( tesseract_samples_dir / "with-form.pdf", "application/pdf", @@ -433,7 +434,7 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse(tesseract_samples_dir / "signed.pdf", "application/pdf") assert tesseract_parser.archive_path is None assert_ordered_substrings( @@ -449,7 +450,7 @@ class TestParsePdf: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "encrypted.pdf", "application/pdf", @@ -545,6 +546,7 @@ class TestParseMultiPage: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: + tesseract_parser.settings.skip_archive_file = "always" tesseract_parser.parse( tesseract_samples_dir / "multi-page-digital.pdf", "application/pdf", @@ -559,7 +561,7 @@ class TestParseMultiPage: @pytest.mark.parametrize( "mode", [ - pytest.param("skip", id="skip"), + pytest.param("auto", id="auto"), pytest.param("redo", id="redo"), pytest.param("force", id="force"), ], @@ -572,6 +574,7 @@ class TestParseMultiPage: ) -> None: tesseract_parser.settings.pages = 2 tesseract_parser.settings.mode = mode + tesseract_parser.settings.skip_archive_file = "always" tesseract_parser.parse( tesseract_samples_dir / "multi-page-digital.pdf", "application/pdf", @@ -587,7 +590,7 @@ class TestParseMultiPage: tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-images.pdf", "application/pdf", @@ -735,13 +738,13 @@ class TestSkipArchive: """ GIVEN: - File with existing text layer - - Mode: skip_noarchive + - Mode: auto, skip_archive_file: auto WHEN: - Document is parsed THEN: - - Text extracted; no archive created + - Text extracted; no archive created (text exists, auto skips OCR) """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-digital.pdf", "application/pdf", @@ -760,13 +763,13 @@ class TestSkipArchive: """ GIVEN: - File with image-only pages (no text layer) - - Mode: skip_noarchive + - Mode: auto, skip_archive_file: auto WHEN: - Document is parsed THEN: - - Text extracted; archive created (OCR needed) + - Text extracted; archive created (OCR needed, no existing text) """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-images.pdf", "application/pdf", @@ -780,27 +783,32 @@ class TestSkipArchive: @pytest.mark.parametrize( ("skip_archive_file", "filename", "expect_archive"), [ - pytest.param("never", "multi-page-digital.pdf", True, id="never-with-text"), - pytest.param("never", "multi-page-images.pdf", True, id="never-no-text"), - pytest.param( - "with_text", - "multi-page-digital.pdf", - False, - id="with-text-layer", - ), - pytest.param( - "with_text", - "multi-page-images.pdf", - True, - id="with-text-no-layer", - ), pytest.param( "always", "multi-page-digital.pdf", - False, + True, id="always-with-text", ), - pytest.param("always", "multi-page-images.pdf", False, id="always-no-text"), + pytest.param("always", "multi-page-images.pdf", True, id="always-no-text"), + pytest.param( + "auto", + "multi-page-digital.pdf", + False, + id="auto-with-text-layer", + ), + pytest.param( + "auto", + "multi-page-images.pdf", + True, + id="auto-no-text-layer", + ), + pytest.param( + "never", + "multi-page-digital.pdf", + False, + id="never-with-text", + ), + pytest.param("never", "multi-page-images.pdf", False, id="never-no-text"), ], ) def test_skip_archive_file_setting( @@ -835,13 +843,14 @@ class TestParseMixed: """ GIVEN: - File with text in some pages (image) and some pages (digital) - - Mode: skip + - Mode: auto (skip_text), skip_archive_file: always WHEN: - Document is parsed THEN: - All pages extracted; archive created; sidecar notes skipped pages """ - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" + tesseract_parser.settings.skip_archive_file = "always" tesseract_parser.parse( tesseract_samples_dir / "multi-page-mixed.pdf", "application/pdf", @@ -899,13 +908,13 @@ class TestParseMixed: """ GIVEN: - File with mixed pages - - Mode: skip_noarchive + - Mode: auto, skip_archive_file: auto WHEN: - Document is parsed THEN: - No archive created (file has text layer); later-page text present """ - tesseract_parser.settings.mode = "skip_noarchive" + tesseract_parser.settings.mode = "auto" tesseract_parser.parse( tesseract_samples_dir / "multi-page-mixed.pdf", "application/pdf", @@ -923,12 +932,12 @@ class TestParseMixed: class TestParseRotate: - def test_rotate_skip_mode( + def test_rotate_auto_mode( self, tesseract_parser: RasterisedDocumentParser, tesseract_samples_dir: Path, ) -> None: - tesseract_parser.settings.mode = "skip" + tesseract_parser.settings.mode = "auto" tesseract_parser.settings.rotate = True tesseract_parser.parse(tesseract_samples_dir / "rotated.pdf", "application/pdf") assert_ordered_substrings( @@ -1023,11 +1032,11 @@ class TestOcrmypdfParameters: assert ("clean" in params) == expected_clean assert ("clean_final" in params) == expected_clean_final - def test_clean_final_skip_mode( + def test_clean_final_auto_mode( self, make_tesseract_parser: MakeTesseractParser, ) -> None: - with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="skip") as parser: + with make_tesseract_parser(OCR_CLEAN="clean-final", OCR_MODE="auto") as parser: params = parser.construct_ocrmypdf_parameters("", "", "", "") assert params["clean_final"] is True assert "clean" not in params @@ -1044,9 +1053,9 @@ class TestOcrmypdfParameters: @pytest.mark.parametrize( ("ocr_mode", "ocr_deskew", "expect_deskew"), [ - pytest.param("skip", True, True, id="skip-deskew-on"), + pytest.param("auto", True, True, id="auto-deskew-on"), pytest.param("redo", True, False, id="redo-deskew-off"), - pytest.param("skip", False, False, id="skip-no-deskew"), + pytest.param("auto", False, False, id="auto-no-deskew"), ], ) def test_deskew_option( diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index 87e64a90e..561d61430 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -132,8 +132,8 @@ class TestOcrSettingsChecks: pytest.param( "OCR_MODE", "skip_noarchive", - "deprecated", - id="deprecated-mode", + 'OCR output mode "skip_noarchive"', + id="deprecated-mode-now-invalid", ), pytest.param( "OCR_SKIP_ARCHIVE_FILE",