Merge branch 'dev' into feature-archive-ocr-decoupling

2026-05-23 06:55:23 +00:00 · 2026-04-03 08:17:09 -07:00
parent 33c41dd2e7 64debc87a5
commit c3be765761
22 changed files with 726 additions and 228 deletions
@@ -1,5 +1,5 @@
 import re
-import shutil
+import warnings
 from pathlib import Path
 from unittest import mock

@@ -366,8 +366,7 @@ class TestClassifier(DirectoriesMixin, TestCase):

        self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])

-    @mock.patch("documents.classifier.pickle.load")
-    def test_load_corrupt_file(self, patched_pickle_load: mock.MagicMock) -> None:
+    def test_load_corrupt_file(self) -> None:
        """
        GIVEN:
            - Corrupted classifier pickle file
@@ -378,36 +377,116 @@ class TestClassifier(DirectoriesMixin, TestCase):
        """
        self.generate_train_and_save()

-        # First load is the schema version,allow it
-        patched_pickle_load.side_effect = [DocumentClassifier.FORMAT_VERSION, OSError()]
+        # Overwrite with garbage: long enough to hold an HMAC, but not validly signed
+        Path(settings.MODEL_FILE).write_bytes(b"\x00" * 64)

        with self.assertRaises(ClassifierModelCorruptError):
            self.classifier.load()
-            patched_pickle_load.assert_called()
-
-        patched_pickle_load.reset_mock()
-        patched_pickle_load.side_effect = [
-            DocumentClassifier.FORMAT_VERSION,
-            ClassifierModelCorruptError(),
-        ]

        self.assertIsNone(load_classifier())
-        patched_pickle_load.assert_called()
+
+    def test_load_corrupt_pickle_valid_hmac(self) -> None:
+        """
+        GIVEN:
+            - A classifier file with valid HMAC but unparsable pickle data
+        WHEN:
+            - An attempt is made to load the classifier
+        THEN:
+            - The ClassifierModelCorruptError is raised
+        """
+        garbage_data = b"this is not valid pickle data"
+        signature = DocumentClassifier._compute_hmac(garbage_data)
+        Path(settings.MODEL_FILE).write_bytes(signature + garbage_data)
+
+        with self.assertRaises(ClassifierModelCorruptError):
+            self.classifier.load()
+
+    def test_load_tampered_file(self) -> None:
+        """
+        GIVEN:
+            - A classifier model file whose data has been modified
+        WHEN:
+            - An attempt is made to load the classifier
+        THEN:
+            - The ClassifierModelCorruptError is raised due to HMAC mismatch
+        """
+        self.generate_train_and_save()
+
+        raw = Path(settings.MODEL_FILE).read_bytes()
+        # Invert the first byte of the data portion (after the 32-byte HMAC)
+        tampered = raw[:32] + bytes([raw[32] ^ 0xFF]) + raw[33:]
+        Path(settings.MODEL_FILE).write_bytes(tampered)
+
+        with self.assertRaises(ClassifierModelCorruptError):
+            self.classifier.load()
+
+    def test_load_wrong_secret_key(self) -> None:
+        """
+        GIVEN:
+            - A classifier model file signed with a different SECRET_KEY
+        WHEN:
+            - An attempt is made to load the classifier
+        THEN:
+            - The ClassifierModelCorruptError is raised due to HMAC mismatch
+        """
+        self.generate_train_and_save()
+
+        with override_settings(SECRET_KEY="different-secret-key"):
+            with self.assertRaises(ClassifierModelCorruptError):
+                self.classifier.load()
+
+    def test_load_truncated_file(self) -> None:
+        """
+        GIVEN:
+            - A classifier model file that is too short to contain an HMAC
+        WHEN:
+            - An attempt is made to load the classifier
+        THEN:
+            - The ClassifierModelCorruptError is raised
+        """
+        Path(settings.MODEL_FILE).write_bytes(b"\x00" * 16)
+
+        with self.assertRaises(ClassifierModelCorruptError):
+            self.classifier.load()

    def test_load_new_scikit_learn_version(self) -> None:
        """
        GIVEN:
-            - classifier pickle file created with a different scikit-learn version
+            - Loading the classifier pickle file triggers an InconsistentVersionWarning
        WHEN:
            - An attempt is made to load the classifier
        THEN:
-            - The classifier reports the warning was captured and processed
+            - IncompatibleClassifierVersionError is raised
        """
-        # TODO: This wasn't testing the warning anymore, as the schema changed
-        # but as it was implemented, it would require installing an old version
-        # rebuilding the file and committing that.  Not developer friendly
-        # Need to rethink how to pass the load through to a file with a single
-        # old model?
+        from sklearn.exceptions import InconsistentVersionWarning
+
+        self.generate_train_and_save()
+
+        fake_warning = warnings.WarningMessage(
+            message=InconsistentVersionWarning(
+                estimator_name="MLPClassifier",
+                current_sklearn_version="1.0",
+                original_sklearn_version="0.9",
+            ),
+            category=InconsistentVersionWarning,
+            filename="",
+            lineno=0,
+        )
+
+        real_catch_warnings = warnings.catch_warnings
+
+        class PatchedCatchWarnings(real_catch_warnings):
+            def __enter__(self):
+                w = super().__enter__()
+                w.append(fake_warning)
+                return w
+
+        with mock.patch(
+            "documents.classifier.warnings.catch_warnings",
+            PatchedCatchWarnings,
+        ):
+            with self.assertRaises(IncompatibleClassifierVersionError):
+                self.classifier.load()

    def test_one_correspondent_predict(self) -> None:
        c1 = Correspondent.objects.create(
@@ -685,17 +764,6 @@ class TestClassifier(DirectoriesMixin, TestCase):
        self.assertIsNone(load_classifier())
        self.assertTrue(Path(settings.MODEL_FILE).exists())

-    def test_load_old_classifier_version(self) -> None:
-        shutil.copy(
-            Path(__file__).parent / "data" / "v1.17.4.model.pickle",
-            self.dirs.scratch_dir,
-        )
-        with override_settings(
-            MODEL_FILE=self.dirs.scratch_dir / "v1.17.4.model.pickle",
-        ):
-            classifier = load_classifier()
-            self.assertIsNone(classifier)
-
    @mock.patch("documents.classifier.DocumentClassifier.load")
    def test_load_classifier_raise_exception(self, mock_load) -> None:
        Path(settings.MODEL_FILE).touch()
@@ -0,0 +1,128 @@
+import pytest
+import regex
+from pytest_mock import MockerFixture
+
+from documents.regex import safe_regex_finditer
+from documents.regex import safe_regex_match
+from documents.regex import safe_regex_search
+from documents.regex import safe_regex_sub
+from documents.regex import validate_regex_pattern
+
+
+class TestValidateRegexPattern:
+    def test_valid_pattern(self):
+        validate_regex_pattern(r"\d+")
+
+    def test_invalid_pattern_raises(self):
+        with pytest.raises(ValueError):
+            validate_regex_pattern(r"[invalid")
+
+
+class TestSafeRegexSearchAndMatch:
+    """Tests for safe_regex_search and safe_regex_match (same contract)."""
+
+    @pytest.mark.parametrize(
+        ("func", "pattern", "text", "expected_group"),
+        [
+            pytest.param(
+                safe_regex_search,
+                r"\d+",
+                "abc123def",
+                "123",
+                id="search-match-found",
+            ),
+            pytest.param(
+                safe_regex_match,
+                r"\d+",
+                "123abc",
+                "123",
+                id="match-match-found",
+            ),
+        ],
+    )
+    def test_match_found(self, func, pattern, text, expected_group):
+        result = func(pattern, text)
+        assert result is not None
+        assert result.group() == expected_group
+
+    @pytest.mark.parametrize(
+        ("func", "pattern", "text"),
+        [
+            pytest.param(safe_regex_search, r"\d+", "abcdef", id="search-no-match"),
+            pytest.param(safe_regex_match, r"\d+", "abc123", id="match-no-match"),
+        ],
+    )
+    def test_no_match(self, func, pattern, text):
+        assert func(pattern, text) is None
+
+    @pytest.mark.parametrize(
+        "func",
+        [
+            pytest.param(safe_regex_search, id="search"),
+            pytest.param(safe_regex_match, id="match"),
+        ],
+    )
+    def test_invalid_pattern_returns_none(self, func):
+        assert func(r"[invalid", "test") is None
+
+    @pytest.mark.parametrize(
+        "func",
+        [
+            pytest.param(safe_regex_search, id="search"),
+            pytest.param(safe_regex_match, id="match"),
+        ],
+    )
+    def test_flags_respected(self, func):
+        assert func(r"abc", "ABC", flags=regex.IGNORECASE) is not None
+
+    @pytest.mark.parametrize(
+        ("func", "method_name"),
+        [
+            pytest.param(safe_regex_search, "search", id="search"),
+            pytest.param(safe_regex_match, "match", id="match"),
+        ],
+    )
+    def test_timeout_returns_none(self, func, method_name, mocker: MockerFixture):
+        mock_compile = mocker.patch("documents.regex.regex.compile")
+        getattr(mock_compile.return_value, method_name).side_effect = TimeoutError
+        assert func(r"\d+", "test") is None
+
+
+class TestSafeRegexSub:
+    @pytest.mark.parametrize(
+        ("pattern", "repl", "text", "expected"),
+        [
+            pytest.param(r"\d+", "NUM", "abc123def456", "abcNUMdefNUM", id="basic-sub"),
+            pytest.param(r"\d+", "NUM", "abcdef", "abcdef", id="no-match"),
+            pytest.param(r"abc", "X", "ABC", "X", id="flags"),
+        ],
+    )
+    def test_substitution(self, pattern, repl, text, expected):
+        flags = regex.IGNORECASE if pattern == r"abc" else 0
+        result = safe_regex_sub(pattern, repl, text, flags=flags)
+        assert result == expected
+
+    def test_invalid_pattern_returns_none(self):
+        assert safe_regex_sub(r"[invalid", "x", "test") is None
+
+    def test_timeout_returns_none(self, mocker: MockerFixture):
+        mock_compile = mocker.patch("documents.regex.regex.compile")
+        mock_compile.return_value.sub.side_effect = TimeoutError
+        assert safe_regex_sub(r"\d+", "X", "test") is None
+
+
+class TestSafeRegexFinditer:
+    def test_yields_matches(self):
+        pattern = regex.compile(r"\d+")
+        matches = list(safe_regex_finditer(pattern, "a1b22c333"))
+        assert [m.group() for m in matches] == ["1", "22", "333"]
+
+    def test_no_matches(self):
+        pattern = regex.compile(r"\d+")
+        assert list(safe_regex_finditer(pattern, "abcdef")) == []
+
+    def test_timeout_stops_iteration(self, mocker: MockerFixture):
+        mock_pattern = mocker.MagicMock()
+        mock_pattern.finditer.side_effect = TimeoutError
+        mock_pattern.pattern = r"\d+"
+        assert list(safe_regex_finditer(mock_pattern, "test")) == []
@@ -31,6 +31,11 @@ from paperless.models import ApplicationConfiguration


 class TestViews(DirectoriesMixin, TestCase):
+    @classmethod
+    def setUpTestData(cls) -> None:
+        super().setUpTestData()
+        ApplicationConfiguration.objects.get_or_create()
+
    def setUp(self) -> None:
        self.user = User.objects.create_user("testuser")
        super().setUp()