From 1fefd506b7a86480f813f83030b4e41bafe80d4f Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 8 Apr 2026 09:38:53 -0700 Subject: [PATCH] perf: eliminate second document queryset scan in classifier train() Capture doc.content during the label extraction loop so the document queryset is iterated exactly once per training run. Previously CountVectorizer.fit_transform() consumed a content_generator() that re-evaluated the same docs_queryset, causing a second full table scan. At 5k docs this wasted ~2.4 s and doubled DB I/O on every train. Remove content_generator(); replace with a generator expression over the in-memory doc_contents list collected during Step 1. Co-Authored-By: Claude Sonnet 4.6 --- src/documents/classifier.py | 16 +-- .../tests/test_classifier_single_pass.py | 134 ++++++++++++++++++ 2 files changed, 140 insertions(+), 10 deletions(-) create mode 100644 src/documents/tests/test_classifier_single_pass.py diff --git a/src/documents/classifier.py b/src/documents/classifier.py index d285800ff..a9c3319ab 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -11,7 +11,6 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: from collections.abc import Callable - from collections.abc import Iterator from datetime import datetime from numpy import ndarray @@ -304,12 +303,15 @@ class DocumentClassifier: labels_correspondent = [] labels_document_type = [] labels_storage_path = [] + doc_contents: list[str] = [] - # Step 1: Extract and preprocess training data from the database. + # Step 1: Extract labels and capture content in a single pass. logger.debug("Gathering data from database...") notify(f"Gathering data from {docs_queryset.count()} document(s)...") hasher = sha256() for doc in docs_queryset: + doc_contents.append(doc.content) + y = -1 dt = doc.document_type if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO: @@ -369,13 +371,6 @@ class DocumentClassifier: logger.debug("Vectorizing data...") notify("Vectorizing document content...") - def content_generator() -> Iterator[str]: - """ - Generates the content for documents, but once at a time - """ - for doc in docs_queryset: - yield self.preprocess_content(doc.content, shared_cache=False) - self.data_vectorizer = CountVectorizer( analyzer="word", ngram_range=(1, 2), @@ -383,7 +378,8 @@ class DocumentClassifier: ) data_vectorized: ndarray = self.data_vectorizer.fit_transform( - content_generator(), + self.preprocess_content(content, shared_cache=False) + for content in doc_contents ) # See the notes here: diff --git a/src/documents/tests/test_classifier_single_pass.py b/src/documents/tests/test_classifier_single_pass.py new file mode 100644 index 000000000..1d6b239ba --- /dev/null +++ b/src/documents/tests/test_classifier_single_pass.py @@ -0,0 +1,134 @@ +""" +Phase 2 — Single queryset pass in DocumentClassifier.train() + +The document queryset must be iterated exactly once: during the label +extraction loop, which now also captures doc.content for vectorization. +The previous content_generator() caused a second full table scan. +""" + +from __future__ import annotations + +from unittest import mock + +import pytest +from django.db.models.query import QuerySet + +from documents.classifier import DocumentClassifier +from documents.models import Correspondent +from documents.models import Document +from documents.models import DocumentType +from documents.models import MatchingModel +from documents.models import StoragePath +from documents.models import Tag + +# --------------------------------------------------------------------------- +# Fixtures (mirrors test_classifier_train_skip.py) +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def classifier_settings(settings, tmp_path): + settings.MODEL_FILE = tmp_path / "model.pickle" + return settings + + +@pytest.fixture() +def classifier(classifier_settings): + return DocumentClassifier() + + +@pytest.fixture() +def label_corpus(classifier_settings): + c_auto = Correspondent.objects.create( + name="Auto Corp", + matching_algorithm=MatchingModel.MATCH_AUTO, + ) + dt_auto = DocumentType.objects.create( + name="Invoice", + matching_algorithm=MatchingModel.MATCH_AUTO, + ) + t_auto = Tag.objects.create( + name="finance", + matching_algorithm=MatchingModel.MATCH_AUTO, + ) + sp_auto = StoragePath.objects.create( + name="Finance Path", + path="finance/{correspondent}", + matching_algorithm=MatchingModel.MATCH_AUTO, + ) + + doc_a = Document.objects.create( + title="Invoice A", + content="quarterly invoice payment tax financial statement revenue", + correspondent=c_auto, + document_type=dt_auto, + storage_path=sp_auto, + checksum="aaa", + mime_type="application/pdf", + filename="invoice_a.pdf", + ) + doc_a.tags.set([t_auto]) + + doc_b = Document.objects.create( + title="Invoice B", + content="monthly invoice billing statement account balance due", + correspondent=c_auto, + document_type=dt_auto, + storage_path=sp_auto, + checksum="bbb", + mime_type="application/pdf", + filename="invoice_b.pdf", + ) + doc_b.tags.set([t_auto]) + + doc_c = Document.objects.create( + title="Notes", + content="meeting notes agenda discussion summary action items follow", + checksum="ccc", + mime_type="application/pdf", + filename="notes_c.pdf", + ) + + return {"doc_a": doc_a, "doc_b": doc_b, "doc_c": doc_c} + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.django_db() +class TestSingleQuerysetPass: + def test_train_iterates_document_queryset_once(self, classifier, label_corpus): + """ + train() must iterate the Document queryset exactly once. + + Before Phase 2 there were two iterations: one in the label extraction + loop and a second inside content_generator() for CountVectorizer. + After Phase 2 content is captured during the label loop; the second + iteration is eliminated. + """ + original_iter = QuerySet.__iter__ + doc_iter_count = 0 + + def counting_iter(qs): + nonlocal doc_iter_count + if qs.model is Document: + doc_iter_count += 1 + return original_iter(qs) + + with mock.patch.object(QuerySet, "__iter__", counting_iter): + classifier.train() + + assert doc_iter_count == 1, ( + f"Expected 1 Document queryset iteration, got {doc_iter_count}. " + "content_generator() may still be re-fetching from the DB." + ) + + def test_train_result_unchanged(self, classifier, label_corpus): + """ + Collapsing to a single pass must not change what the classifier learns: + a second train() with no changes still returns False. + """ + assert classifier.train() is True + assert classifier.train() is False