refactor(profiling): use shared profile_cpu in backend search test

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-10 07:39:50 +00:00 · 2026-04-11 13:46:43 -07:00
parent 23449cda17
commit a1a3520a8c
1 changed files with 346 additions and 0 deletions
@@ -0,0 +1,346 @@
+# ruff: noqa: T201
+"""
+cProfile-based search pipeline profiling with a 20k-document dataset.
+
+Run with:
+    uv run pytest src/documents/tests/search/test_backend_profile.py \
+        -m profiling --override-ini="addopts=" -s -v
+
+Each scenario prints:
+  - Wall time for the operation
+  - cProfile stats sorted by cumulative time (top 25 callers)
+
+This is a developer tool, not a correctness test.  Nothing here should
+fail unless the code is broken.
+"""
+
+from __future__ import annotations
+
+import random
+import time
+from typing import TYPE_CHECKING
+
+import pytest
+from profiling import profile_cpu
+
+from documents.models import Document
+from documents.search._backend import TantivyBackend
+from documents.search._backend import reset_backend
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+# transaction=False (default): tests roll back, but the module-scoped fixture
+# commits its data outside the test transaction so it remains visible throughout.
+pytestmark = [pytest.mark.profiling, pytest.mark.django_db]
+
+# ---------------------------------------------------------------------------
+# Dataset constants
+# ---------------------------------------------------------------------------
+NUM_DOCS = 20_000
+SEED = 42
+
+# Terms and their approximate match rates across the corpus.
+# "rechnung"    -> ~70% of docs  (~14 000)
+# "mahnung"     -> ~20% of docs  (~4 000)
+# "kontonummer" -> ~5%  of docs  (~1 000)
+# "rarewort"    -> ~1%  of docs  (~200)
+COMMON_TERM = "rechnung"
+MEDIUM_TERM = "mahnung"
+RARE_TERM = "kontonummer"
+VERY_RARE_TERM = "rarewort"
+
+PAGE_SIZE = 25
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+_FILLER_WORDS = [
+    "dokument",  # codespell:ignore
+    "seite",
+    "datum",
+    "betrag",
+    "nummer",
+    "konto",
+    "firma",
+    "vertrag",
+    "lieferant",
+    "bestellung",
+    "steuer",
+    "mwst",
+    "leistung",
+    "auftrag",
+    "zahlung",
+]
+
+
+def _build_content(rng: random.Random) -> str:
+    """Return a short paragraph with terms embedded at the desired rates."""
+    words = rng.choices(_FILLER_WORDS, k=15)
+    if rng.random() < 0.70:
+        words.append(COMMON_TERM)
+    if rng.random() < 0.20:
+        words.append(MEDIUM_TERM)
+    if rng.random() < 0.05:
+        words.append(RARE_TERM)
+    if rng.random() < 0.01:
+        words.append(VERY_RARE_TERM)
+    rng.shuffle(words)
+    return " ".join(words)
+
+
+def _time(fn, *, label: str, runs: int = 3):
+    """Run *fn()* several times and report min/avg/max wall time (no cProfile)."""
+    times = []
+    result = None
+    for _ in range(runs):
+        t0 = time.perf_counter()
+        result = fn()
+        times.append(time.perf_counter() - t0)
+    mn, avg, mx = min(times), sum(times) / len(times), max(times)
+    print(
+        f"  {label}: min={mn * 1000:.1f}ms  avg={avg * 1000:.1f}ms  max={mx * 1000:.1f}ms  (n={runs})",
+    )
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def module_db(django_db_setup, django_db_blocker):
+    """Unlock the DB for the whole module (module-scoped)."""
+    with django_db_blocker.unblock():
+        yield
+
+
+@pytest.fixture(scope="module")
+def large_backend(tmp_path_factory, module_db) -> TantivyBackend:
+    """
+    Build a 20 000-document DB + on-disk Tantivy index, shared across all
+    profiling scenarios in this module.  Teardown deletes all documents.
+    """
+    index_path: Path = tmp_path_factory.mktemp("tantivy_profile")
+
+    # ---- 1. Bulk-create Document rows ----------------------------------------
+    rng = random.Random(SEED)
+    docs = [
+        Document(
+            title=f"Document {i:05d}",
+            content=_build_content(rng),
+            checksum=f"{i:064x}",
+            pk=i + 1,
+        )
+        for i in range(NUM_DOCS)
+    ]
+    t0 = time.perf_counter()
+    Document.objects.bulk_create(docs, batch_size=1_000)
+    db_time = time.perf_counter() - t0
+    print(f"\n[setup] bulk_create {NUM_DOCS} docs: {db_time:.2f}s")
+
+    # ---- 2. Build Tantivy index -----------------------------------------------
+    backend = TantivyBackend(path=index_path)
+    backend.open()
+
+    t0 = time.perf_counter()
+    with backend.batch_update() as batch:
+        for doc in Document.objects.iterator(chunk_size=500):
+            batch.add_or_update(doc)
+    idx_time = time.perf_counter() - t0
+    print(f"[setup] index {NUM_DOCS} docs: {idx_time:.2f}s")
+
+    # ---- 3. Report corpus stats -----------------------------------------------
+    for term in (COMMON_TERM, MEDIUM_TERM, RARE_TERM, VERY_RARE_TERM):
+        count = len(backend.search_ids(term, user=None))
+        print(f"[setup]   '{term}' -> {count} hits")
+
+    yield backend
+
+    # ---- Teardown ------------------------------------------------------------
+    backend.close()
+    reset_backend()
+    Document.objects.all().delete()
+
+
+# ---------------------------------------------------------------------------
+# Profiling tests — each scenario is a separate function so pytest can run
+# them individually or all together with -m profiling.
+# ---------------------------------------------------------------------------
+
+
+class TestSearchIdsProfile:
+    """Profile backend.search_ids() — pure Tantivy, no DB."""
+
+    def test_search_ids_large(self, large_backend: TantivyBackend):
+        """~14 000 hits: how long does Tantivy take to collect all IDs?"""
+        profile_cpu(
+            lambda: large_backend.search_ids(COMMON_TERM, user=None),
+            label=f"search_ids('{COMMON_TERM}')  [large result set ~14k]",
+        )
+
+    def test_search_ids_medium(self, large_backend: TantivyBackend):
+        """~4 000 hits."""
+        profile_cpu(
+            lambda: large_backend.search_ids(MEDIUM_TERM, user=None),
+            label=f"search_ids('{MEDIUM_TERM}')  [medium result set ~4k]",
+        )
+
+    def test_search_ids_rare(self, large_backend: TantivyBackend):
+        """~1 000 hits."""
+        profile_cpu(
+            lambda: large_backend.search_ids(RARE_TERM, user=None),
+            label=f"search_ids('{RARE_TERM}')  [rare result set ~1k]",
+        )
+
+
+class TestIntersectAndOrderProfile:
+    """
+    Profile the DB intersection step: filter(pk__in=search_ids).
+    This is the 'intersect_and_order' logic from views.py.
+    """
+
+    def test_intersect_large(self, large_backend: TantivyBackend):
+        """Intersect 14k Tantivy IDs with all 20k ORM-visible docs."""
+        all_ids = large_backend.search_ids(COMMON_TERM, user=None)
+        qs = Document.objects.all()
+
+        print(f"\n  Tantivy returned {len(all_ids)} IDs")
+
+        profile_cpu(
+            lambda: list(qs.filter(pk__in=all_ids).values_list("pk", flat=True)),
+            label=f"filter(pk__in={len(all_ids)} ids)  [large, use_tantivy_sort=True path]",
+        )
+
+        # Also time it a few times to get stable numbers
+        print()
+        _time(
+            lambda: list(qs.filter(pk__in=all_ids).values_list("pk", flat=True)),
+            label=f"filter(pk__in={len(all_ids)}) repeated",
+        )
+
+    def test_intersect_rare(self, large_backend: TantivyBackend):
+        """Intersect ~1k Tantivy IDs — the happy path."""
+        all_ids = large_backend.search_ids(RARE_TERM, user=None)
+        qs = Document.objects.all()
+
+        print(f"\n  Tantivy returned {len(all_ids)} IDs")
+
+        profile_cpu(
+            lambda: list(qs.filter(pk__in=all_ids).values_list("pk", flat=True)),
+            label=f"filter(pk__in={len(all_ids)} ids)  [rare, use_tantivy_sort=True path]",
+        )
+
+
+class TestHighlightHitsProfile:
+    """Profile backend.highlight_hits() — per-doc Tantivy lookups with BM25 scoring."""
+
+    def test_highlight_page1(self, large_backend: TantivyBackend):
+        """25-doc highlight for page 1 (rank_start=1)."""
+        all_ids = large_backend.search_ids(COMMON_TERM, user=None)
+        page_ids = all_ids[:PAGE_SIZE]
+
+        profile_cpu(
+            lambda: large_backend.highlight_hits(
+                COMMON_TERM,
+                page_ids,
+                rank_start=1,
+            ),
+            label=f"highlight_hits page 1  (ids {all_ids[0]}..{all_ids[PAGE_SIZE - 1]})",
+        )
+
+    def test_highlight_page_middle(self, large_backend: TantivyBackend):
+        """25-doc highlight for a mid-corpus page (rank_start=page_offset+1)."""
+        all_ids = large_backend.search_ids(COMMON_TERM, user=None)
+        mid = len(all_ids) // 2
+        page_ids = all_ids[mid : mid + PAGE_SIZE]
+        page_offset = mid
+
+        profile_cpu(
+            lambda: large_backend.highlight_hits(
+                COMMON_TERM,
+                page_ids,
+                rank_start=page_offset + 1,
+            ),
+            label=f"highlight_hits page ~{mid // PAGE_SIZE}  (offset {page_offset})",
+        )
+
+    def test_highlight_repeated(self, large_backend: TantivyBackend):
+        """Multiple runs of page-1 highlight to see variance."""
+        all_ids = large_backend.search_ids(COMMON_TERM, user=None)
+        page_ids = all_ids[:PAGE_SIZE]
+
+        print()
+        _time(
+            lambda: large_backend.highlight_hits(COMMON_TERM, page_ids, rank_start=1),
+            label="highlight_hits page 1",
+            runs=5,
+        )
+
+
+class TestFullPipelineProfile:
+    """
+    Profile the combined pipeline as it runs in views.py:
+      search_ids -> filter(pk__in) -> highlight_hits
+    """
+
+    def _run_pipeline(
+        self,
+        backend: TantivyBackend,
+        term: str,
+        page: int = 1,
+    ):
+        all_ids = backend.search_ids(term, user=None)
+        qs = Document.objects.all()
+        visible_ids = set(qs.filter(pk__in=all_ids).values_list("pk", flat=True))
+        ordered_ids = [i for i in all_ids if i in visible_ids]
+
+        page_offset = (page - 1) * PAGE_SIZE
+        page_ids = ordered_ids[page_offset : page_offset + PAGE_SIZE]
+        hits = backend.highlight_hits(
+            term,
+            page_ids,
+            rank_start=page_offset + 1,
+        )
+        return ordered_ids, hits
+
+    def test_pipeline_large_page1(self, large_backend: TantivyBackend):
+        """Full pipeline: large result set, page 1."""
+        ordered_ids, hits = profile_cpu(
+            lambda: self._run_pipeline(large_backend, COMMON_TERM, page=1),
+            label=f"full pipeline  '{COMMON_TERM}'  page 1",
+        )[0]
+        print(f"  -> {len(ordered_ids)} total results, {len(hits)} hits on page")
+
+    def test_pipeline_large_page5(self, large_backend: TantivyBackend):
+        """Full pipeline: large result set, page 5."""
+        ordered_ids, hits = profile_cpu(
+            lambda: self._run_pipeline(large_backend, COMMON_TERM, page=5),
+            label=f"full pipeline  '{COMMON_TERM}'  page 5",
+        )[0]
+        print(f"  -> {len(ordered_ids)} total results, {len(hits)} hits on page")
+
+    def test_pipeline_rare(self, large_backend: TantivyBackend):
+        """Full pipeline: rare term, page 1 (fast path)."""
+        ordered_ids, hits = profile_cpu(
+            lambda: self._run_pipeline(large_backend, RARE_TERM, page=1),
+            label=f"full pipeline  '{RARE_TERM}'  page 1",
+        )[0]
+        print(f"  -> {len(ordered_ids)} total results, {len(hits)} hits on page")
+
+    def test_pipeline_repeated(self, large_backend: TantivyBackend):
+        """Repeated runs to get stable timing (no cProfile overhead)."""
+        print()
+        for term, label in [
+            (COMMON_TERM, f"'{COMMON_TERM}' (large)"),
+            (MEDIUM_TERM, f"'{MEDIUM_TERM}' (medium)"),
+            (RARE_TERM, f"'{RARE_TERM}'  (rare)"),
+        ]:
+            _time(
+                lambda t=term: self._run_pipeline(large_backend, t, page=1),
+                label=f"full pipeline {label} page 1",
+                runs=3,
+            )