mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-23 06:55:23 +00:00
Merge branch 'dev' into feature-archive-ocr-decoupling
This commit is contained in:
Binary file not shown.
@@ -1,5 +1,5 @@
|
||||
import re
|
||||
import shutil
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
@@ -366,8 +366,7 @@ class TestClassifier(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
|
||||
|
||||
@mock.patch("documents.classifier.pickle.load")
|
||||
def test_load_corrupt_file(self, patched_pickle_load: mock.MagicMock) -> None:
|
||||
def test_load_corrupt_file(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Corrupted classifier pickle file
|
||||
@@ -378,36 +377,116 @@ class TestClassifier(DirectoriesMixin, TestCase):
|
||||
"""
|
||||
self.generate_train_and_save()
|
||||
|
||||
# First load is the schema version,allow it
|
||||
patched_pickle_load.side_effect = [DocumentClassifier.FORMAT_VERSION, OSError()]
|
||||
# Write garbage data (valid HMAC length but invalid content)
|
||||
Path(settings.MODEL_FILE).write_bytes(b"\x00" * 64)
|
||||
|
||||
with self.assertRaises(ClassifierModelCorruptError):
|
||||
self.classifier.load()
|
||||
patched_pickle_load.assert_called()
|
||||
|
||||
patched_pickle_load.reset_mock()
|
||||
patched_pickle_load.side_effect = [
|
||||
DocumentClassifier.FORMAT_VERSION,
|
||||
ClassifierModelCorruptError(),
|
||||
]
|
||||
|
||||
self.assertIsNone(load_classifier())
|
||||
patched_pickle_load.assert_called()
|
||||
|
||||
def test_load_corrupt_pickle_valid_hmac(self) -> None:
    """
    GIVEN:
        - A model file whose signature matches its payload, but whose
          payload cannot be unpickled
    WHEN:
        - The classifier is loaded from that file
    THEN:
        - ClassifierModelCorruptError is raised
    """
    payload = b"this is not valid pickle data"
    # Sign the junk payload with the classifier's own helper so the file is
    # laid out as signature followed by data.
    signed_blob = DocumentClassifier._compute_hmac(payload) + payload
    Path(settings.MODEL_FILE).write_bytes(signed_blob)

    self.assertRaises(ClassifierModelCorruptError, self.classifier.load)
|
||||
|
||||
def test_load_tampered_file(self) -> None:
    """
    GIVEN:
        - A saved classifier model file whose data portion was altered
          after it was written
    WHEN:
        - The classifier is loaded from that file
    THEN:
        - ClassifierModelCorruptError is raised due to HMAC mismatch
    """
    self.generate_train_and_save()

    model_path = Path(settings.MODEL_FILE)
    contents = bytearray(model_path.read_bytes())
    # The first 32 bytes are the HMAC; invert the first data byte after it.
    contents[32] ^= 0xFF
    model_path.write_bytes(bytes(contents))

    with self.assertRaises(ClassifierModelCorruptError):
        self.classifier.load()
|
||||
|
||||
def test_load_wrong_secret_key(self) -> None:
    """
    GIVEN:
        - A classifier model file saved under one SECRET_KEY
    WHEN:
        - The classifier is loaded while a different SECRET_KEY is active
    THEN:
        - ClassifierModelCorruptError is raised due to HMAC mismatch
    """
    self.generate_train_and_save()

    # The settings override is entered first and left last, so the load
    # happens entirely under the replacement key.
    other_key = override_settings(SECRET_KEY="different-secret-key")
    with other_key, self.assertRaises(ClassifierModelCorruptError):
        self.classifier.load()
|
||||
|
||||
def test_load_truncated_file(self) -> None:
    """
    GIVEN:
        - A classifier model file that is too short to contain an HMAC
    WHEN:
        - The classifier is loaded from that file
    THEN:
        - ClassifierModelCorruptError is raised
    """
    # Sixteen zero bytes.
    too_short = bytes(16)
    Path(settings.MODEL_FILE).write_bytes(too_short)

    self.assertRaises(ClassifierModelCorruptError, self.classifier.load)
|
||||
|
||||
def test_load_new_scikit_learn_version(self) -> None:
    """
    GIVEN:
        - classifier pickle file created with a different scikit-learn version
        - classifier pickle file triggers an InconsistentVersionWarning
    WHEN:
        - An attempt is made to load the classifier
    THEN:
        - The classifier reports the warning was captured and processed
        - IncompatibleClassifierVersionError is raised
    """
    # TODO: A genuine old-version model file would require installing an old
    # scikit-learn, rebuilding the file and committing it, which is not
    # developer friendly. Until there is a better way to pass the load
    # through to a file with a single old model, the warning is injected
    # synthetically below.
    from sklearn.exceptions import InconsistentVersionWarning

    self.generate_train_and_save()

    version_mismatch = InconsistentVersionWarning(
        estimator_name="MLPClassifier",
        current_sklearn_version="1.0",
        original_sklearn_version="0.9",
    )
    injected = warnings.WarningMessage(
        message=version_mismatch,
        category=InconsistentVersionWarning,
        filename="",
        lineno=0,
    )

    # Subclass the real context manager (bound before patching) so that the
    # object returned on entry already contains the synthetic warning.
    # NOTE(review): assumes the classifier enters catch_warnings with
    # record=True, otherwise __enter__ returns None -- confirm.
    class _CatchWithInjectedWarning(warnings.catch_warnings):
        def __enter__(self):
            recorded = super().__enter__()
            recorded.append(injected)
            return recorded

    patched_catch = mock.patch(
        "documents.classifier.warnings.catch_warnings",
        _CatchWithInjectedWarning,
    )
    with patched_catch, self.assertRaises(IncompatibleClassifierVersionError):
        self.classifier.load()
|
||||
|
||||
def test_one_correspondent_predict(self) -> None:
|
||||
c1 = Correspondent.objects.create(
|
||||
@@ -685,17 +764,6 @@ class TestClassifier(DirectoriesMixin, TestCase):
|
||||
self.assertIsNone(load_classifier())
|
||||
self.assertTrue(Path(settings.MODEL_FILE).exists())
|
||||
|
||||
def test_load_old_classifier_version(self) -> None:
    """
    GIVEN:
        - The bundled v1.17.4 model pickle copied into the scratch directory
    WHEN:
        - load_classifier() runs with MODEL_FILE pointing at that copy
    THEN:
        - No classifier is returned
    """
    fixture_name = "v1.17.4.model.pickle"
    fixture_path = Path(__file__).parent / "data" / fixture_name
    shutil.copy(fixture_path, self.dirs.scratch_dir)

    with override_settings(MODEL_FILE=self.dirs.scratch_dir / fixture_name):
        self.assertIsNone(load_classifier())
|
||||
|
||||
@mock.patch("documents.classifier.DocumentClassifier.load")
|
||||
def test_load_classifier_raise_exception(self, mock_load) -> None:
|
||||
Path(settings.MODEL_FILE).touch()
|
||||
|
||||
@@ -0,0 +1,128 @@
|
||||
import pytest
|
||||
import regex
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from documents.regex import safe_regex_finditer
|
||||
from documents.regex import safe_regex_match
|
||||
from documents.regex import safe_regex_search
|
||||
from documents.regex import safe_regex_sub
|
||||
from documents.regex import validate_regex_pattern
|
||||
|
||||
|
||||
class TestValidateRegexPattern:
    """Tests for validate_regex_pattern."""

    def test_valid_pattern(self):
        """A well-formed pattern is accepted without raising."""
        well_formed = r"\d+"
        validate_regex_pattern(well_formed)

    def test_invalid_pattern_raises(self):
        """An unterminated character class is rejected with ValueError."""
        malformed = r"[invalid"
        with pytest.raises(ValueError):
            validate_regex_pattern(malformed)
|
||||
|
||||
|
||||
class TestSafeRegexSearchAndMatch:
    """Tests for safe_regex_search and safe_regex_match (same contract)."""

    # Both helpers under test, shared by the single-argument parametrizations.
    _BOTH_HELPERS = [
        pytest.param(safe_regex_search, id="search"),
        pytest.param(safe_regex_match, id="match"),
    ]

    @pytest.mark.parametrize(
        ("func", "pattern", "text", "expected_group"),
        [
            pytest.param(
                safe_regex_search,
                r"\d+",
                "abc123def",
                "123",
                id="search-match-found",
            ),
            pytest.param(
                safe_regex_match,
                r"\d+",
                "123abc",
                "123",
                id="match-match-found",
            ),
        ],
    )
    def test_match_found(self, func, pattern, text, expected_group):
        """A hit yields a match object whose group is the expected text."""
        hit = func(pattern, text)
        assert hit is not None
        assert hit.group() == expected_group

    @pytest.mark.parametrize(
        ("func", "pattern", "text"),
        [
            pytest.param(safe_regex_search, r"\d+", "abcdef", id="search-no-match"),
            pytest.param(safe_regex_match, r"\d+", "abc123", id="match-no-match"),
        ],
    )
    def test_no_match(self, func, pattern, text):
        """Text the pattern does not hit yields None."""
        outcome = func(pattern, text)
        assert outcome is None

    @pytest.mark.parametrize("func", _BOTH_HELPERS)
    def test_invalid_pattern_returns_none(self, func):
        """A malformed pattern yields None rather than an exception."""
        outcome = func(r"[invalid", "test")
        assert outcome is None

    @pytest.mark.parametrize("func", _BOTH_HELPERS)
    def test_flags_respected(self, func):
        """IGNORECASE passed via flags lets a lowercase pattern hit uppercase text."""
        outcome = func(r"abc", "ABC", flags=regex.IGNORECASE)
        assert outcome is not None

    @pytest.mark.parametrize(
        ("func", "method_name"),
        [
            pytest.param(safe_regex_search, "search", id="search"),
            pytest.param(safe_regex_match, "match", id="match"),
        ],
    )
    def test_timeout_returns_none(self, func, method_name, mocker: MockerFixture):
        """A TimeoutError from the compiled pattern's method yields None."""
        compiled = mocker.patch("documents.regex.regex.compile").return_value
        getattr(compiled, method_name).side_effect = TimeoutError
        assert func(r"\d+", "test") is None
|
||||
|
||||
|
||||
class TestSafeRegexSub:
    """Tests for safe_regex_sub."""

    @pytest.mark.parametrize(
        ("pattern", "repl", "text", "flags", "expected"),
        [
            pytest.param(
                r"\d+",
                "NUM",
                "abc123def456",
                0,
                "abcNUMdefNUM",
                id="basic-sub",
            ),
            pytest.param(r"\d+", "NUM", "abcdef", 0, "abcdef", id="no-match"),
            pytest.param(r"abc", "X", "ABC", regex.IGNORECASE, "X", id="flags"),
        ],
    )
    def test_substitution(self, pattern, repl, text, flags, expected):
        """Substitution returns the expected text for each case.

        The flags are part of each case's parameters rather than being
        derived from the pattern inside the test, so adding a new case can
        never pick up IGNORECASE by accident.
        """
        result = safe_regex_sub(pattern, repl, text, flags=flags)
        assert result == expected

    def test_invalid_pattern_returns_none(self):
        """A malformed pattern yields None rather than an exception."""
        assert safe_regex_sub(r"[invalid", "x", "test") is None

    def test_timeout_returns_none(self, mocker: MockerFixture):
        """A TimeoutError from the compiled pattern's sub() yields None."""
        mock_compile = mocker.patch("documents.regex.regex.compile")
        mock_compile.return_value.sub.side_effect = TimeoutError
        assert safe_regex_sub(r"\d+", "X", "test") is None
|
||||
|
||||
|
||||
class TestSafeRegexFinditer:
    """Tests for safe_regex_finditer."""

    def test_yields_matches(self):
        """Every digit run in the text is yielded, in order."""
        digits = regex.compile(r"\d+")
        found = [hit.group() for hit in safe_regex_finditer(digits, "a1b22c333")]
        assert found == ["1", "22", "333"]

    def test_no_matches(self):
        """Text without digits yields nothing."""
        digits = regex.compile(r"\d+")
        found = list(safe_regex_finditer(digits, "abcdef"))
        assert found == []

    def test_timeout_stops_iteration(self, mocker: MockerFixture):
        """A TimeoutError from finditer() yields an empty sequence."""
        timing_out = mocker.MagicMock()
        timing_out.pattern = r"\d+"
        timing_out.finditer.side_effect = TimeoutError
        found = list(safe_regex_finditer(timing_out, "test"))
        assert found == []
|
||||
@@ -31,6 +31,11 @@ from paperless.models import ApplicationConfiguration
|
||||
|
||||
|
||||
class TestViews(DirectoriesMixin, TestCase):
|
||||
@classmethod
def setUpTestData(cls) -> None:
    """Run the parent class-level setup, then make sure an ApplicationConfiguration row exists."""
    super().setUpTestData()
    # Fetches the existing row, or creates one if there is none.
    ApplicationConfiguration.objects.get_or_create()
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.user = User.objects.create_user("testuser")
|
||||
super().setUp()
|
||||
|
||||
Reference in New Issue
Block a user