feat(profiling): add matching pipeline profiling

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-04-11 14:30:33 -07:00
parent 6ba1acd7d3
commit e64b9a4cfd

284
test_matching_profile.py Normal file
View File

@@ -0,0 +1,284 @@
"""
Matching pipeline profiling.
Run with:
uv run pytest ../test_matching_profile.py \
-m profiling --override-ini="addopts=" -s -v
Corpus: 1 document + 50 correspondents, 100 tags, 25 doc types, 20 storage
paths. Labels are spread across all seven matching algorithms
(NONE, ANY, ALL, LITERAL, REGEX, FUZZY, AUTO).
Classifier is passed as None -- MATCH_AUTO models skip prediction gracefully,
which is correct for isolating the ORM query and Python-side evaluation cost.
Scenarios
---------
TestMatchingPipelineProfile
- test_match_correspondents 50 correspondents, algorithm mix
- test_match_tags 100 tags
- test_match_document_types 25 doc types
- test_match_storage_paths 20 storage paths
- test_full_match_sequence all four in order (cumulative consumption cost)
- test_algorithm_breakdown each MATCH_* algorithm in isolation
"""
from __future__ import annotations
import random
import pytest
from faker import Faker
from profiling import profile_block
from documents.matching import match_correspondents
from documents.matching import match_document_types
from documents.matching import match_storage_paths
from documents.matching import match_tags
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
from documents.models import Tag
# Applies the ``profiling`` and ``django_db`` markers to every test in this module.
pytestmark = [pytest.mark.profiling, pytest.mark.django_db]
# Corpus sizes: number of matching models of each kind created by the fixture.
NUM_CORRESPONDENTS = 50
NUM_TAGS = 100
NUM_DOC_TYPES = 25
NUM_STORAGE_PATHS = 20
# Seed handed to both Faker and ``random`` when the corpus is generated.
SEED = 42
# Algorithm distribution across labels (cycles through in order)
_ALGORITHMS = [
    MatchingModel.MATCH_NONE,
    MatchingModel.MATCH_ANY,
    MatchingModel.MATCH_ALL,
    MatchingModel.MATCH_LITERAL,
    MatchingModel.MATCH_REGEX,
    MatchingModel.MATCH_FUZZY,
    MatchingModel.MATCH_AUTO,
]
def _algo(i: int) -> int:
    """Return the matching algorithm assigned to the label at index ``i``.

    Indices wrap around ``_ALGORITHMS``, so consecutive labels cycle through
    every algorithm in the order the list declares them.
    """
    slot = i % len(_ALGORITHMS)
    return _ALGORITHMS[slot]
# ---------------------------------------------------------------------------
# Module-scoped corpus fixture
# ---------------------------------------------------------------------------
@pytest.fixture(scope="module")
def module_db(django_db_setup, django_db_blocker):
    """Keep database access unblocked for the lifetime of this module.

    The fixture yields nothing; it only holds the ``django_db_blocker``
    unblock context open until the module's tests have finished.
    """
    unblocked = django_db_blocker.unblock()
    with unblocked:
        yield
def _random_match_text(fake: Faker, algo: int) -> str:
    """Return a ``match`` value suited to ``algo``, drawn from ``fake``.

    MATCH_NONE and MATCH_AUTO get an empty string, MATCH_REGEX gets a word
    wrapped in word-boundary anchors, and every other algorithm gets a bare
    word.
    """
    if algo in (MatchingModel.MATCH_NONE, MatchingModel.MATCH_AUTO):
        return ""
    word = fake.word()
    if algo == MatchingModel.MATCH_REGEX:
        # NOTE: the first draw is intentionally discarded for REGEX. Removing
        # it would shift every later Faker draw and change the seeded corpus.
        return r"\b" + fake.word() + r"\b"
    return word


def _create_matching_models(fake: Faker, model_cls, count: int, build_fields):
    """Create ``count`` rows of ``model_cls`` with cycling matching algorithms.

    ``build_fields(i)`` returns the model-specific keyword arguments (name,
    and path for storage paths) for the row at index ``i``. It is called
    after the match text has been drawn so the order of Faker draws is
    match text first, then the name fields.

    Returns the list of created model instances.
    """
    created = []
    for i in range(count):
        algo = _algo(i)
        match_text = _random_match_text(fake, algo)
        created.append(
            model_cls.objects.create(
                **build_fields(i),
                matching_algorithm=algo,
                match=match_text,
            ),
        )
    return created


@pytest.fixture(scope="module")
def matching_corpus(module_db):
    """
    1 document with realistic content + dense matching model sets.
    Classifier=None so MATCH_AUTO models are simply skipped.

    Yields a dict with a single key, ``"doc"``, holding the created
    ``Document``. On exit -- including when setup itself fails partway --
    all documents, correspondents, tags, document types and storage paths
    are deleted.
    """
    fake = Faker()
    Faker.seed(SEED)
    random.seed(SEED)
    # Setup lives inside the ``try`` because code after a fixture's ``yield``
    # is not run when the fixture raises before yielding; ``finally`` makes
    # sure partially created rows are still removed.
    try:
        # ---- matching models -----------------------------------------------
        print(f"\n[setup] Creating {NUM_CORRESPONDENTS} correspondents...")  # noqa: T201
        correspondents = _create_matching_models(
            fake,
            Correspondent,
            NUM_CORRESPONDENTS,
            lambda i: {"name": f"mcorp-{i}-{fake.company()}"[:128]},
        )
        print(f"[setup] Creating {NUM_TAGS} tags...")  # noqa: T201
        tags = _create_matching_models(
            fake,
            Tag,
            NUM_TAGS,
            lambda i: {"name": f"mtag-{i}-{fake.word()}"[:100]},
        )
        print(f"[setup] Creating {NUM_DOC_TYPES} doc types...")  # noqa: T201
        _create_matching_models(
            fake,
            DocumentType,
            NUM_DOC_TYPES,
            lambda i: {"name": f"mtype-{i}-{fake.word()}"[:128]},
        )
        print(f"[setup] Creating {NUM_STORAGE_PATHS} storage paths...")  # noqa: T201
        _create_matching_models(
            fake,
            StoragePath,
            NUM_STORAGE_PATHS,
            lambda i: {
                "name": f"mpath-{i}-{fake.word()}",
                "path": f"{fake.word()}/{{title}}",
            },
        )
        # ---- document with diverse content ----------------------------------
        doc = Document.objects.create(
            title="quarterly invoice payment tax financial statement",
            content=" ".join(fake.paragraph(nb_sentences=5) for _ in range(3)),
            checksum="MATCHPROF0001",
        )
        auto_correspondents = sum(
            1
            for c in correspondents
            if c.matching_algorithm == MatchingModel.MATCH_AUTO
        )
        auto_tags = sum(
            1 for t in tags if t.matching_algorithm == MatchingModel.MATCH_AUTO
        )
        print(f"[setup] Document pk={doc.pk}, content length={len(doc.content)} chars")  # noqa: T201
        print(  # noqa: T201
            f" Correspondents: {NUM_CORRESPONDENTS} "
            f"({auto_correspondents} AUTO)",
        )
        print(  # noqa: T201
            f" Tags: {NUM_TAGS} "
            f"({auto_tags} AUTO)",
        )
        yield {"doc": doc}
    finally:
        # Teardown
        print("\n[teardown] Removing matching corpus...")  # noqa: T201
        Document.objects.all().delete()
        Correspondent.objects.all().delete()
        Tag.objects.all().delete()
        DocumentType.objects.all().delete()
        StoragePath.objects.all().delete()
# ---------------------------------------------------------------------------
# TestMatchingPipelineProfile
# ---------------------------------------------------------------------------
class TestMatchingPipelineProfile:
    """Profiling scenarios for the per-document ``match_*()`` functions.

    Every scenario runs against the single document supplied by the
    ``matching_corpus`` fixture and passes ``classifier=None``.
    """

    @pytest.fixture(autouse=True)
    def _setup(self, matching_corpus):
        """Expose the corpus document to each test as ``self.doc``."""
        corpus_doc = matching_corpus["doc"]
        self.doc = corpus_doc

    def test_match_correspondents(self):
        """Profile ``match_correspondents()`` and print the match count."""
        label = (
            f"match_correspondents() [{NUM_CORRESPONDENTS} correspondents, mixed algorithms]"
        )
        with profile_block(label):
            matched = match_correspondents(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_match_tags(self):
        """Profile ``match_tags()`` and print the match count."""
        label = f"match_tags() [{NUM_TAGS} tags, mixed algorithms]"
        with profile_block(label):
            matched = match_tags(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_match_document_types(self):
        """Profile ``match_document_types()`` and print the match count."""
        label = f"match_document_types() [{NUM_DOC_TYPES} types, mixed algorithms]"
        with profile_block(label):
            matched = match_document_types(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_match_storage_paths(self):
        """Profile ``match_storage_paths()`` and print the match count."""
        label = f"match_storage_paths() [{NUM_STORAGE_PATHS} paths, mixed algorithms]"
        with profile_block(label):
            matched = match_storage_paths(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_full_match_sequence(self):
        """Profile the four ``match_*()`` calls back to back in one block."""
        label = "full match sequence: correspondents + doc_types + tags + storage_paths"
        with profile_block(label):
            match_correspondents(self.doc, classifier=None)
            match_document_types(self.doc, classifier=None)
            match_tags(self.doc, classifier=None)
            match_storage_paths(self.doc, classifier=None)

    def test_algorithm_breakdown(self):
        """Time ``matches()`` for one unsaved correspondent per algorithm.

        MATCH_AUTO is not part of the table. Each algorithm is called 1000
        times and the mean cost per call is printed in microseconds.
        """
        import time

        from documents.matching import matches

        fake = Faker()
        labelled_algorithms = {
            MatchingModel.MATCH_NONE: "MATCH_NONE",
            MatchingModel.MATCH_ANY: "MATCH_ANY",
            MatchingModel.MATCH_ALL: "MATCH_ALL",
            MatchingModel.MATCH_LITERAL: "MATCH_LITERAL",
            MatchingModel.MATCH_REGEX: "MATCH_REGEX",
            MatchingModel.MATCH_FUZZY: "MATCH_FUZZY",
        }
        document = self.doc
        # Time 1000 iterations to get stable microsecond readings
        iterations = 1_000
        print()  # noqa: T201
        for algorithm, label in labelled_algorithms.items():
            if algorithm == MatchingModel.MATCH_NONE:
                pattern = ""
            else:
                pattern = fake.word()
            if algorithm == MatchingModel.MATCH_REGEX:
                pattern = r"\b" + fake.word() + r"\b"
            candidate = Correspondent(
                name=f"algo-test-{label}",
                matching_algorithm=algorithm,
                match=pattern,
            )
            started = time.perf_counter()
            for _ in range(iterations):
                matches(candidate, document)
            elapsed = time.perf_counter() - started
            micros_per_call = elapsed / iterations * 1_000_000
            print(  # noqa: T201
                f" {label:<20s} {micros_per_call:8.2f} us/call (match={pattern[:20]!r})",
            )