Performance: Classifier performance optimizations (#10363)

This commit is contained in:
Antoine Mérino
2025-08-06 22:00:11 +02:00
committed by GitHub
parent 6dca4daea5
commit 1bee1495cf
9 changed files with 395 additions and 70 deletions
+106 -19
View File
@@ -21,7 +21,7 @@ from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
def dummy_preprocess(content: str):
def dummy_preprocess(content: str, **kwargs):
"""
Simpler, faster pre-processing for testing purposes
"""
@@ -223,24 +223,47 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.generate_test_data()
self.classifier.train()
self.assertEqual(
self.classifier.predict_correspondent(self.doc1.content),
self.c1.pk,
)
self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
self.assertListEqual(
self.classifier.predict_tags(self.doc1.content),
[self.t1.pk],
)
self.assertListEqual(
self.classifier.predict_tags(self.doc2.content),
[self.t1.pk, self.t3.pk],
)
self.assertEqual(
self.classifier.predict_document_type(self.doc1.content),
self.dt.pk,
)
self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
with (
mock.patch.object(
self.classifier.data_vectorizer,
"transform",
wraps=self.classifier.data_vectorizer.transform,
) as mock_transform,
mock.patch.object(
self.classifier,
"preprocess_content",
wraps=self.classifier.preprocess_content,
) as mock_preprocess_content,
):
self.assertEqual(
self.classifier.predict_correspondent(self.doc1.content),
self.c1.pk,
)
self.assertEqual(
self.classifier.predict_correspondent(self.doc2.content),
None,
)
self.assertListEqual(
self.classifier.predict_tags(self.doc1.content),
[self.t1.pk],
)
self.assertListEqual(
self.classifier.predict_tags(self.doc2.content),
[self.t1.pk, self.t3.pk],
)
self.assertEqual(
self.classifier.predict_document_type(self.doc1.content),
self.dt.pk,
)
self.assertEqual(
self.classifier.predict_document_type(self.doc2.content),
None,
)
# Check that the classifier vectorized content and text preprocessing has been cached
# It should be called once per document (doc1 and doc2)
self.assertEqual(mock_preprocess_content.call_count, 2)
self.assertEqual(mock_transform.call_count, 2)
def test_no_retrain_if_no_change(self):
"""
@@ -694,3 +717,67 @@ class TestClassifier(DirectoriesMixin, TestCase):
mock_load.side_effect = Exception()
with self.assertRaises(Exception):
load_classifier(raise_exception=True)
def test_preprocess_content():
"""
GIVEN:
- Advanced text processing is enabled (default)
WHEN:
- Classifier preprocesses a document's content
THEN:
- Processed content matches the expected output (stemmed words)
"""
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
content = f.read()
with (Path(__file__).parent / "samples" / "preprocessed_content_advanced.txt").open(
"r",
) as f:
expected_preprocess_content = f.read().rstrip()
classifier = DocumentClassifier()
result = classifier.preprocess_content(content)
assert result == expected_preprocess_content
def test_preprocess_content_nltk_disabled():
"""
GIVEN:
- Advanced text processing is disabled
WHEN:
- Classifier preprocesses a document's content
THEN:
- Processed content matches the expected output (unstemmed words)
"""
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
content = f.read()
with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
"r",
) as f:
expected_preprocess_content = f.read().rstrip()
classifier = DocumentClassifier()
with mock.patch("documents.classifier.ADVANCED_TEXT_PROCESSING_ENABLED", new=False):
result = classifier.preprocess_content(content)
assert result == expected_preprocess_content
def test_preprocess_content_nltk_load_fail(mocker):
"""
GIVEN:
- NLTK stop words fail to load
WHEN:
- Classifier preprocesses a document's content
THEN:
- Processed content matches the expected output (unstemmed words)
"""
_module = mocker.MagicMock(name="nltk_corpus_mock")
_module.stopwords.words.side_effect = AttributeError()
mocker.patch.dict("sys.modules", {"nltk.corpus": _module})
classifier = DocumentClassifier()
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
content = f.read()
with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
"r",
) as f:
expected_preprocess_content = f.read().rstrip()
result = classifier.preprocess_content(content)
assert result == expected_preprocess_content