Security: Improve overall security in a few ways (#12501)

- Make sure we're always using regex with timeouts for user controlled data
- Adds rate limiting to the token endpoint (configurable)
- Signs the classifier pickle file with the SECRET_KEY and refuse to load one which doesn't verify.
- Require the user to set a secret key, instead of falling back to our old hard coded one
This commit is contained in:
Trenton H
2026-04-02 15:30:26 -07:00
committed by GitHub
parent 376af81b9c
commit dda05a7c00
14 changed files with 443 additions and 110 deletions

View File

@@ -1,5 +1,5 @@
import re
import shutil
import warnings
from pathlib import Path
from unittest import mock
@@ -366,8 +366,7 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
@mock.patch("documents.classifier.pickle.load")
def test_load_corrupt_file(self, patched_pickle_load: mock.MagicMock) -> None:
def test_load_corrupt_file(self) -> None:
"""
GIVEN:
- Corrupted classifier pickle file
@@ -378,36 +377,116 @@ class TestClassifier(DirectoriesMixin, TestCase):
"""
self.generate_train_and_save()
# First load is the schema version,allow it
patched_pickle_load.side_effect = [DocumentClassifier.FORMAT_VERSION, OSError()]
# Write garbage data (valid HMAC length but invalid content)
Path(settings.MODEL_FILE).write_bytes(b"\x00" * 64)
with self.assertRaises(ClassifierModelCorruptError):
self.classifier.load()
patched_pickle_load.assert_called()
patched_pickle_load.reset_mock()
patched_pickle_load.side_effect = [
DocumentClassifier.FORMAT_VERSION,
ClassifierModelCorruptError(),
]
self.assertIsNone(load_classifier())
patched_pickle_load.assert_called()
def test_load_corrupt_pickle_valid_hmac(self) -> None:
"""
GIVEN:
- A classifier file with valid HMAC but unparsable pickle data
WHEN:
- An attempt is made to load the classifier
THEN:
- The ClassifierModelCorruptError is raised
"""
garbage_data = b"this is not valid pickle data"
signature = DocumentClassifier._compute_hmac(garbage_data)
Path(settings.MODEL_FILE).write_bytes(signature + garbage_data)
with self.assertRaises(ClassifierModelCorruptError):
self.classifier.load()
def test_load_tampered_file(self) -> None:
"""
GIVEN:
- A classifier model file whose data has been modified
WHEN:
- An attempt is made to load the classifier
THEN:
- The ClassifierModelCorruptError is raised due to HMAC mismatch
"""
self.generate_train_and_save()
raw = Path(settings.MODEL_FILE).read_bytes()
# Flip a byte in the data portion (after the 32-byte HMAC)
tampered = raw[:32] + bytes([raw[32] ^ 0xFF]) + raw[33:]
Path(settings.MODEL_FILE).write_bytes(tampered)
with self.assertRaises(ClassifierModelCorruptError):
self.classifier.load()
def test_load_wrong_secret_key(self) -> None:
"""
GIVEN:
- A classifier model file signed with a different SECRET_KEY
WHEN:
- An attempt is made to load the classifier
THEN:
- The ClassifierModelCorruptError is raised due to HMAC mismatch
"""
self.generate_train_and_save()
with override_settings(SECRET_KEY="different-secret-key"):
with self.assertRaises(ClassifierModelCorruptError):
self.classifier.load()
def test_load_truncated_file(self) -> None:
"""
GIVEN:
- A classifier model file that is too short to contain an HMAC
WHEN:
- An attempt is made to load the classifier
THEN:
- The ClassifierModelCorruptError is raised
"""
Path(settings.MODEL_FILE).write_bytes(b"\x00" * 16)
with self.assertRaises(ClassifierModelCorruptError):
self.classifier.load()
def test_load_new_scikit_learn_version(self) -> None:
"""
GIVEN:
- classifier pickle file created with a different scikit-learn version
- classifier pickle file triggers an InconsistentVersionWarning
WHEN:
- An attempt is made to load the classifier
THEN:
- The classifier reports the warning was captured and processed
- IncompatibleClassifierVersionError is raised
"""
# TODO: This wasn't testing the warning anymore, as the schema changed
# but as it was implemented, it would require installing an old version
# rebuilding the file and committing that. Not developer friendly
# Need to rethink how to pass the load through to a file with a single
# old model?
from sklearn.exceptions import InconsistentVersionWarning
self.generate_train_and_save()
fake_warning = warnings.WarningMessage(
message=InconsistentVersionWarning(
estimator_name="MLPClassifier",
current_sklearn_version="1.0",
original_sklearn_version="0.9",
),
category=InconsistentVersionWarning,
filename="",
lineno=0,
)
real_catch_warnings = warnings.catch_warnings
class PatchedCatchWarnings(real_catch_warnings):
def __enter__(self):
w = super().__enter__()
w.append(fake_warning)
return w
with mock.patch(
"documents.classifier.warnings.catch_warnings",
PatchedCatchWarnings,
):
with self.assertRaises(IncompatibleClassifierVersionError):
self.classifier.load()
def test_one_correspondent_predict(self) -> None:
c1 = Correspondent.objects.create(
@@ -685,17 +764,6 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.assertIsNone(load_classifier())
self.assertTrue(Path(settings.MODEL_FILE).exists())
def test_load_old_classifier_version(self) -> None:
shutil.copy(
Path(__file__).parent / "data" / "v1.17.4.model.pickle",
self.dirs.scratch_dir,
)
with override_settings(
MODEL_FILE=self.dirs.scratch_dir / "v1.17.4.model.pickle",
):
classifier = load_classifier()
self.assertIsNone(classifier)
@mock.patch("documents.classifier.DocumentClassifier.load")
def test_load_classifier_raise_exception(self, mock_load) -> None:
Path(settings.MODEL_FILE).touch()