mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-02 11:49:44 +00:00
Performance: Classifier performance optimizations (#10363)
This commit is contained in:
@@ -21,7 +21,7 @@ from documents.models import Tag
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
def dummy_preprocess(content: str):
|
||||
def dummy_preprocess(content: str, **kwargs):
|
||||
"""
|
||||
Simpler, faster pre-processing for testing purposes
|
||||
"""
|
||||
@@ -223,24 +223,47 @@ class TestClassifier(DirectoriesMixin, TestCase):
|
||||
self.generate_test_data()
|
||||
self.classifier.train()
|
||||
|
||||
self.assertEqual(
|
||||
self.classifier.predict_correspondent(self.doc1.content),
|
||||
self.c1.pk,
|
||||
)
|
||||
self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
|
||||
self.assertListEqual(
|
||||
self.classifier.predict_tags(self.doc1.content),
|
||||
[self.t1.pk],
|
||||
)
|
||||
self.assertListEqual(
|
||||
self.classifier.predict_tags(self.doc2.content),
|
||||
[self.t1.pk, self.t3.pk],
|
||||
)
|
||||
self.assertEqual(
|
||||
self.classifier.predict_document_type(self.doc1.content),
|
||||
self.dt.pk,
|
||||
)
|
||||
self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
|
||||
with (
|
||||
mock.patch.object(
|
||||
self.classifier.data_vectorizer,
|
||||
"transform",
|
||||
wraps=self.classifier.data_vectorizer.transform,
|
||||
) as mock_transform,
|
||||
mock.patch.object(
|
||||
self.classifier,
|
||||
"preprocess_content",
|
||||
wraps=self.classifier.preprocess_content,
|
||||
) as mock_preprocess_content,
|
||||
):
|
||||
self.assertEqual(
|
||||
self.classifier.predict_correspondent(self.doc1.content),
|
||||
self.c1.pk,
|
||||
)
|
||||
self.assertEqual(
|
||||
self.classifier.predict_correspondent(self.doc2.content),
|
||||
None,
|
||||
)
|
||||
self.assertListEqual(
|
||||
self.classifier.predict_tags(self.doc1.content),
|
||||
[self.t1.pk],
|
||||
)
|
||||
self.assertListEqual(
|
||||
self.classifier.predict_tags(self.doc2.content),
|
||||
[self.t1.pk, self.t3.pk],
|
||||
)
|
||||
self.assertEqual(
|
||||
self.classifier.predict_document_type(self.doc1.content),
|
||||
self.dt.pk,
|
||||
)
|
||||
self.assertEqual(
|
||||
self.classifier.predict_document_type(self.doc2.content),
|
||||
None,
|
||||
)
|
||||
|
||||
# Check that the classifier vectorized content and text preprocessing has been cached
|
||||
# It should be called once per document (doc1 and doc2)
|
||||
self.assertEqual(mock_preprocess_content.call_count, 2)
|
||||
self.assertEqual(mock_transform.call_count, 2)
|
||||
|
||||
def test_no_retrain_if_no_change(self):
|
||||
"""
|
||||
@@ -694,3 +717,67 @@ class TestClassifier(DirectoriesMixin, TestCase):
|
||||
mock_load.side_effect = Exception()
|
||||
with self.assertRaises(Exception):
|
||||
load_classifier(raise_exception=True)
|
||||
|
||||
|
||||
def test_preprocess_content():
|
||||
"""
|
||||
GIVEN:
|
||||
- Advanced text processing is enabled (default)
|
||||
WHEN:
|
||||
- Classifier preprocesses a document's content
|
||||
THEN:
|
||||
- Processed content matches the expected output (stemmed words)
|
||||
"""
|
||||
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
|
||||
content = f.read()
|
||||
with (Path(__file__).parent / "samples" / "preprocessed_content_advanced.txt").open(
|
||||
"r",
|
||||
) as f:
|
||||
expected_preprocess_content = f.read().rstrip()
|
||||
classifier = DocumentClassifier()
|
||||
result = classifier.preprocess_content(content)
|
||||
assert result == expected_preprocess_content
|
||||
|
||||
|
||||
def test_preprocess_content_nltk_disabled():
|
||||
"""
|
||||
GIVEN:
|
||||
- Advanced text processing is disabled
|
||||
WHEN:
|
||||
- Classifier preprocesses a document's content
|
||||
THEN:
|
||||
- Processed content matches the expected output (unstemmed words)
|
||||
"""
|
||||
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
|
||||
content = f.read()
|
||||
with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
|
||||
"r",
|
||||
) as f:
|
||||
expected_preprocess_content = f.read().rstrip()
|
||||
classifier = DocumentClassifier()
|
||||
with mock.patch("documents.classifier.ADVANCED_TEXT_PROCESSING_ENABLED", new=False):
|
||||
result = classifier.preprocess_content(content)
|
||||
assert result == expected_preprocess_content
|
||||
|
||||
|
||||
def test_preprocess_content_nltk_load_fail(mocker):
|
||||
"""
|
||||
GIVEN:
|
||||
- NLTK stop words fail to load
|
||||
WHEN:
|
||||
- Classifier preprocesses a document's content
|
||||
THEN:
|
||||
- Processed content matches the expected output (unstemmed words)
|
||||
"""
|
||||
_module = mocker.MagicMock(name="nltk_corpus_mock")
|
||||
_module.stopwords.words.side_effect = AttributeError()
|
||||
mocker.patch.dict("sys.modules", {"nltk.corpus": _module})
|
||||
classifier = DocumentClassifier()
|
||||
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
|
||||
content = f.read()
|
||||
with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
|
||||
"r",
|
||||
) as f:
|
||||
expected_preprocess_content = f.read().rstrip()
|
||||
result = classifier.preprocess_content(content)
|
||||
assert result == expected_preprocess_content
|
||||
|
||||
Reference in New Issue
Block a user