perf: eliminate second document queryset scan in classifier train()

Capture doc.content during the label extraction loop so the document
queryset is iterated exactly once per training run.

Previously CountVectorizer.fit_transform() consumed a content_generator()
that re-evaluated the same docs_queryset, causing a second full table
scan. At 5k docs this wasted ~2.4 s and doubled DB I/O on every train.

Remove content_generator(); replace with a generator expression over
the in-memory doc_contents list collected during Step 1.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-04-08 09:38:53 -07:00
parent 68b866aeee
commit 1fefd506b7
2 changed files with 140 additions and 10 deletions

View File

@@ -11,7 +11,6 @@ from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
from numpy import ndarray
@@ -304,12 +303,15 @@ class DocumentClassifier:
labels_correspondent = []
labels_document_type = []
labels_storage_path = []
doc_contents: list[str] = []
# Step 1: Extract and preprocess training data from the database.
# Step 1: Extract labels and capture content in a single pass.
logger.debug("Gathering data from database...")
notify(f"Gathering data from {docs_queryset.count()} document(s)...")
hasher = sha256()
for doc in docs_queryset:
doc_contents.append(doc.content)
y = -1
dt = doc.document_type
if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
@@ -369,13 +371,6 @@ class DocumentClassifier:
logger.debug("Vectorizing data...")
notify("Vectorizing document content...")
def content_generator() -> Iterator[str]:
"""
Generates the content for documents, but once at a time
"""
for doc in docs_queryset:
yield self.preprocess_content(doc.content, shared_cache=False)
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1, 2),
@@ -383,7 +378,8 @@ class DocumentClassifier:
)
data_vectorized: ndarray = self.data_vectorizer.fit_transform(
content_generator(),
self.preprocess_content(content, shared_cache=False)
for content in doc_contents
)
# See the notes here:

View File

@@ -0,0 +1,134 @@
"""
Phase 2 — Single queryset pass in DocumentClassifier.train()
The document queryset must be iterated exactly once: during the label
extraction loop, which now also captures doc.content for vectorization.
The previous content_generator() caused a second full table scan.
"""
from __future__ import annotations
from unittest import mock
import pytest
from django.db.models.query import QuerySet
from documents.classifier import DocumentClassifier
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
from documents.models import Tag
# ---------------------------------------------------------------------------
# Fixtures (mirrors test_classifier_train_skip.py)
# ---------------------------------------------------------------------------
@pytest.fixture()
def classifier_settings(settings, tmp_path):
    """Redirect MODEL_FILE into a per-test temp dir so training never touches a real model."""
    model_file = tmp_path / "model.pickle"
    settings.MODEL_FILE = model_file
    return settings
@pytest.fixture()
def classifier(classifier_settings):
    """Return a fresh, untrained DocumentClassifier bound to the temporary model file."""
    untrained = DocumentClassifier()
    return untrained
@pytest.fixture()
def label_corpus(classifier_settings):
    """
    Build the minimal corpus train() needs: one auto-matching correspondent,
    document type, tag, and storage path, plus three documents — two fully
    labelled invoices and one unlabelled notes document.
    """
    correspondent = Correspondent.objects.create(
        name="Auto Corp",
        matching_algorithm=MatchingModel.MATCH_AUTO,
    )
    doc_type = DocumentType.objects.create(
        name="Invoice",
        matching_algorithm=MatchingModel.MATCH_AUTO,
    )
    tag = Tag.objects.create(
        name="finance",
        matching_algorithm=MatchingModel.MATCH_AUTO,
    )
    storage_path = StoragePath.objects.create(
        name="Finance Path",
        path="finance/{correspondent}",
        matching_algorithm=MatchingModel.MATCH_AUTO,
    )

    # (title, content, checksum, filename) for the two labelled invoices.
    invoice_specs = [
        (
            "Invoice A",
            "quarterly invoice payment tax financial statement revenue",
            "aaa",
            "invoice_a.pdf",
        ),
        (
            "Invoice B",
            "monthly invoice billing statement account balance due",
            "bbb",
            "invoice_b.pdf",
        ),
    ]
    invoices = []
    for title, content, checksum, filename in invoice_specs:
        doc = Document.objects.create(
            title=title,
            content=content,
            correspondent=correspondent,
            document_type=doc_type,
            storage_path=storage_path,
            checksum=checksum,
            mime_type="application/pdf",
            filename=filename,
        )
        doc.tags.set([tag])
        invoices.append(doc)

    # Unlabelled document: exercises the y == -1 path in the label loop.
    unlabelled = Document.objects.create(
        title="Notes",
        content="meeting notes agenda discussion summary action items follow",
        checksum="ccc",
        mime_type="application/pdf",
        filename="notes_c.pdf",
    )
    return {"doc_a": invoices[0], "doc_b": invoices[1], "doc_c": unlabelled}
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
@pytest.mark.django_db()
class TestSingleQuerysetPass:
    def test_train_iterates_document_queryset_once(self, classifier, label_corpus):
        """
        train() must walk the Document queryset exactly once.

        Pre-Phase-2 there were two walks: the label-extraction loop plus a
        second one inside content_generator() feeding CountVectorizer. With
        content captured during the label loop, only one remains.
        """
        real_iter = QuerySet.__iter__
        # Mutable cell so the replacement __iter__ can record hits without
        # needing nonlocal.
        passes = {"documents": 0}

        def tracking_iter(qs):
            if qs.model is Document:
                passes["documents"] += 1
            return real_iter(qs)

        with mock.patch.object(QuerySet, "__iter__", tracking_iter):
            classifier.train()

        doc_iter_count = passes["documents"]
        assert doc_iter_count == 1, (
            f"Expected 1 Document queryset iteration, got {doc_iter_count}. "
            "content_generator() may still be re-fetching from the DB."
        )

    def test_train_result_unchanged(self, classifier, label_corpus):
        """
        The single-pass refactor must not alter learning: an immediate
        retrain with identical data still reports no change (False).
        """
        first_outcome = classifier.train()
        assert first_outcome is True
        second_outcome = classifier.train()
        assert second_outcome is False