mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-12 02:58:52 +00:00
feat(profiling): add matching pipeline profiling
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
284
test_matching_profile.py
Normal file
284
test_matching_profile.py
Normal file
@@ -0,0 +1,284 @@
|
||||
"""
|
||||
Matching pipeline profiling.
|
||||
|
||||
Run with:
|
||||
uv run pytest ../test_matching_profile.py \
|
||||
-m profiling --override-ini="addopts=" -s -v
|
||||
|
||||
Corpus: 1 document + 50 correspondents, 100 tags, 25 doc types, 20 storage
|
||||
paths. Labels are spread across all seven matching algorithms
|
||||
(NONE, ANY, ALL, LITERAL, REGEX, FUZZY, AUTO).
|
||||
|
||||
Classifier is passed as None -- MATCH_AUTO models skip prediction gracefully,
|
||||
which is correct for isolating the ORM query and Python-side evaluation cost.
|
||||
|
||||
Scenarios
|
||||
---------
|
||||
TestMatchingPipelineProfile
|
||||
- test_match_correspondents 50 correspondents, algorithm mix
|
||||
- test_match_tags 100 tags
|
||||
- test_match_document_types 25 doc types
|
||||
- test_match_storage_paths 20 storage paths
|
||||
- test_full_match_sequence all four in order (cumulative consumption cost)
|
||||
- test_algorithm_breakdown each MATCH_* algorithm in isolation
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
from faker import Faker
|
||||
from profiling import profile_block
|
||||
|
||||
from documents.matching import match_correspondents
|
||||
from documents.matching import match_document_types
|
||||
from documents.matching import match_storage_paths
|
||||
from documents.matching import match_tags
|
||||
from documents.models import Correspondent
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import MatchingModel
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
|
||||
# Mark every test in this module as profiling-only and database-backed.
pytestmark = [pytest.mark.profiling, pytest.mark.django_db]

# Corpus sizes -- sized to resemble a mid-sized real-world install.
NUM_CORRESPONDENTS = 50
NUM_TAGS = 100
NUM_DOC_TYPES = 25
NUM_STORAGE_PATHS = 20
# Fixed seed so Faker/random output is reproducible across profiling runs.
SEED = 42

# Algorithm distribution across labels (cycles through in order)
_ALGORITHMS = [
    MatchingModel.MATCH_NONE,
    MatchingModel.MATCH_ANY,
    MatchingModel.MATCH_ALL,
    MatchingModel.MATCH_LITERAL,
    MatchingModel.MATCH_REGEX,
    MatchingModel.MATCH_FUZZY,
    MatchingModel.MATCH_AUTO,
]
|
||||
|
||||
|
||||
def _algo(i: int) -> int:
    """Return the matching algorithm for label index *i*, cycling _ALGORITHMS."""
    cycle_length = len(_ALGORITHMS)
    return _ALGORITHMS[i % cycle_length]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-scoped corpus fixture
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def module_db(django_db_setup, django_db_blocker):
    """Unlock the DB for the whole module (module-scoped).

    The standard ``django_db`` mark only grants function-scoped access; the
    module-scoped corpus fixture below must touch the database during its own
    (module-scoped) setup and teardown, so the blocker is unblocked here for
    the lifetime of the module.
    """
    with django_db_blocker.unblock():
        yield
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def matching_corpus(module_db):
    """
    1 document with realistic content + dense matching model sets.

    Builds NUM_CORRESPONDENTS correspondents, NUM_TAGS tags, NUM_DOC_TYPES
    document types and NUM_STORAGE_PATHS storage paths, cycling every
    collection through the algorithms in _ALGORITHMS via _algo().
    Classifier=None so MATCH_AUTO models are simply skipped.

    Yields:
        dict: ``{"doc": Document}`` -- the single document to match against.
    """
    fake = Faker()
    Faker.seed(SEED)
    random.seed(SEED)

    def _match_text(algo: int) -> str:
        # Shared by all four label loops below (was copy-pasted four times).
        # NONE/AUTO models never consult their match string; REGEX gets a
        # word-boundary pattern; every other algorithm takes a plain word.
        if algo in (MatchingModel.MATCH_NONE, MatchingModel.MATCH_AUTO):
            return ""
        if algo == MatchingModel.MATCH_REGEX:
            return r"\b" + fake.word() + r"\b"
        return fake.word()

    # ---- matching models ---------------------------------------------------
    print(f"\n[setup] Creating {NUM_CORRESPONDENTS} correspondents...")  # noqa: T201
    correspondents = []
    for i in range(NUM_CORRESPONDENTS):
        algo = _algo(i)
        correspondents.append(
            Correspondent.objects.create(
                name=f"mcorp-{i}-{fake.company()}"[:128],
                matching_algorithm=algo,
                match=_match_text(algo),
            ),
        )

    print(f"[setup] Creating {NUM_TAGS} tags...")  # noqa: T201
    tags = []
    for i in range(NUM_TAGS):
        algo = _algo(i)
        tags.append(
            Tag.objects.create(
                name=f"mtag-{i}-{fake.word()}"[:100],
                matching_algorithm=algo,
                match=_match_text(algo),
            ),
        )

    print(f"[setup] Creating {NUM_DOC_TYPES} doc types...")  # noqa: T201
    doc_types = []
    for i in range(NUM_DOC_TYPES):
        algo = _algo(i)
        doc_types.append(
            DocumentType.objects.create(
                name=f"mtype-{i}-{fake.word()}"[:128],
                matching_algorithm=algo,
                match=_match_text(algo),
            ),
        )

    print(f"[setup] Creating {NUM_STORAGE_PATHS} storage paths...")  # noqa: T201
    storage_paths = []
    for i in range(NUM_STORAGE_PATHS):
        algo = _algo(i)
        storage_paths.append(
            StoragePath.objects.create(
                name=f"mpath-{i}-{fake.word()}",
                path=f"{fake.word()}/{{title}}",
                matching_algorithm=algo,
                match=_match_text(algo),
            ),
        )

    # ---- document with diverse content ------------------------------------
    doc = Document.objects.create(
        title="quarterly invoice payment tax financial statement",
        content=" ".join(fake.paragraph(nb_sentences=5) for _ in range(3)),
        checksum="MATCHPROF0001",
    )

    print(f"[setup] Document pk={doc.pk}, content length={len(doc.content)} chars")  # noqa: T201
    print(  # noqa: T201
        f"  Correspondents: {NUM_CORRESPONDENTS} "
        f"({sum(1 for c in correspondents if c.matching_algorithm == MatchingModel.MATCH_AUTO)} AUTO)",
    )
    print(  # noqa: T201
        f"  Tags: {NUM_TAGS} "
        f"({sum(1 for t in tags if t.matching_algorithm == MatchingModel.MATCH_AUTO)} AUTO)",
    )

    yield {"doc": doc}

    # Teardown: wipe everything this module created so later modules start clean.
    print("\n[teardown] Removing matching corpus...")  # noqa: T201
    Document.objects.all().delete()
    Correspondent.objects.all().delete()
    Tag.objects.all().delete()
    DocumentType.objects.all().delete()
    StoragePath.objects.all().delete()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TestMatchingPipelineProfile
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMatchingPipelineProfile:
    """Profile the matching functions called per document during consumption."""

    @pytest.fixture(autouse=True)
    def _setup(self, matching_corpus):
        # Bind the shared, module-scoped document for every test method.
        self.doc = matching_corpus["doc"]

    def test_match_correspondents(self):
        """50 correspondents, algorithm mix. Query count + time."""
        with profile_block(
            f"match_correspondents() [{NUM_CORRESPONDENTS} correspondents, mixed algorithms]",
        ):
            result = match_correspondents(self.doc, classifier=None)
            print(f" -> {len(result)} matched")  # noqa: T201

    def test_match_tags(self):
        """100 tags -- densest set in real installs."""
        with profile_block(f"match_tags() [{NUM_TAGS} tags, mixed algorithms]"):
            result = match_tags(self.doc, classifier=None)
            print(f" -> {len(result)} matched")  # noqa: T201

    def test_match_document_types(self):
        """25 doc types."""
        with profile_block(
            f"match_document_types() [{NUM_DOC_TYPES} types, mixed algorithms]",
        ):
            result = match_document_types(self.doc, classifier=None)
            print(f" -> {len(result)} matched")  # noqa: T201

    def test_match_storage_paths(self):
        """20 storage paths."""
        with profile_block(
            f"match_storage_paths() [{NUM_STORAGE_PATHS} paths, mixed algorithms]",
        ):
            result = match_storage_paths(self.doc, classifier=None)
            print(f" -> {len(result)} matched")  # noqa: T201

    def test_full_match_sequence(self):
        """All four match_*() calls in order -- cumulative cost per document consumed."""
        with profile_block(
            "full match sequence: correspondents + doc_types + tags + storage_paths",
        ):
            match_correspondents(self.doc, classifier=None)
            match_document_types(self.doc, classifier=None)
            match_tags(self.doc, classifier=None)
            match_storage_paths(self.doc, classifier=None)

    def test_algorithm_breakdown(self):
        """Create one correspondent per algorithm and time each independently.

        MATCH_AUTO is deliberately absent from the table: it needs a trained
        classifier, and this test profiles matches() without one.
        """
        import time

        from documents.matching import matches

        fake = Faker()
        # Seed like the corpus fixture does, so the profiled match text is
        # reproducible run-to-run (unseeded Faker would vary the inputs and
        # make timings harder to compare across runs).
        Faker.seed(SEED)
        algo_names = {
            MatchingModel.MATCH_NONE: "MATCH_NONE",
            MatchingModel.MATCH_ANY: "MATCH_ANY",
            MatchingModel.MATCH_ALL: "MATCH_ALL",
            MatchingModel.MATCH_LITERAL: "MATCH_LITERAL",
            MatchingModel.MATCH_REGEX: "MATCH_REGEX",
            MatchingModel.MATCH_FUZZY: "MATCH_FUZZY",
        }
        doc = self.doc
        print()  # noqa: T201

        for algo, name in algo_names.items():
            match_text = fake.word() if algo != MatchingModel.MATCH_NONE else ""
            if algo == MatchingModel.MATCH_REGEX:
                match_text = r"\b" + fake.word() + r"\b"
            # Unsaved instance is enough: matches() only reads model fields,
            # so no DB round-trip pollutes the per-call timing.
            model = Correspondent(
                name=f"algo-test-{name}",
                matching_algorithm=algo,
                match=match_text,
            )
            # Time 1000 iterations to get stable microsecond readings
            runs = 1_000
            t0 = time.perf_counter()
            for _ in range(runs):
                matches(model, doc)
            us_per_call = (time.perf_counter() - t0) / runs * 1_000_000
            print(  # noqa: T201
                f" {name:<20s} {us_per_call:8.2f} us/call (match={match_text[:20]!r})",
            )
|
||||
Reference in New Issue
Block a user