feat(profiling): add document list API and selection_data profiling

Adds test_doclist_profile.py with 8 profiling tests covering the /api/documents/ list path (ORM ordering, page sizes, single-doc detail, cProfile) and _get_selection_data_for_queryset in isolation and via API. Also registers the 'profiling' pytest marker in pyproject.toml. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-26 23:34:17 +00:00 · 2026-04-11 14:13:17 -07:00
parent 24b754b44c
commit d006b79fd1
2 changed files with 293 additions and 0 deletions
@@ -312,6 +312,7 @@ markers = [
  "date_parsing: Tests which cover date parsing from content or filename",
  "management: Tests which cover management commands/functionality",
  "search: Tests for the Tantivy search backend",
+  "profiling: Performance profiling tests — print measurements, no assertions",
 ]

 [tool.pytest_env]
@@ -0,0 +1,292 @@
+"""
+Document list API profiling — no search, pure ORM path.
+
+Run with:
+    uv run pytest ../test_doclist_profile.py \
+        -m profiling --override-ini="addopts=" -s -v
+
+Corpus: 5 000 documents, 30 correspondents, 20 doc types, 80 tags,
+        ~500 notes (10 %), 10 custom fields with instances on ~50 % of docs.
+
+Scenarios
+---------
+TestDocListProfile
+  - test_list_default_ordering     GET /api/documents/ created desc, page 1, page_size=25
+  - test_list_title_ordering       same with ordering=title
+  - test_list_page_size_comparison page_size=10 / 25 / 100 in sequence
+  - test_list_detail_fields        GET /api/documents/{id}/ — single document serializer cost
+  - test_list_cpu_profile          cProfile of one list request
+
+TestSelectionDataProfile
+  - test_selection_data_unfiltered  _get_selection_data_for_queryset(all docs) in isolation
+  - test_selection_data_via_api     GET /api/documents/?include_selection_data=true
+  - test_selection_data_filtered    filtered vs unfiltered COUNT query comparison
+"""
+
+from __future__ import annotations
+
+import datetime
+import random
+import time
+
+import pytest
+from django.contrib.auth.models import User
+from faker import Faker
+from profiling import profile_block
+from profiling import profile_cpu
+from rest_framework.test import APIClient
+
+from documents.models import Correspondent
+from documents.models import CustomField
+from documents.models import CustomFieldInstance
+from documents.models import Document
+from documents.models import DocumentType
+from documents.models import Note
+from documents.models import Tag
+from documents.views import DocumentViewSet
+
+pytestmark = [pytest.mark.profiling, pytest.mark.django_db]
+
+# ---------------------------------------------------------------------------
+# Corpus parameters
+# ---------------------------------------------------------------------------
+
+NUM_DOCS = 5_000
+NUM_CORRESPONDENTS = 30
+NUM_DOC_TYPES = 20
+NUM_TAGS = 80
+NOTE_FRACTION = 0.10
+CUSTOM_FIELD_COUNT = 10
+CUSTOM_FIELD_FRACTION = 0.50
+PAGE_SIZE = 25
+SEED = 42
+
+
+# ---------------------------------------------------------------------------
+# Module-scoped corpus fixture
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def module_db(django_db_setup, django_db_blocker):
+    with django_db_blocker.unblock():
+        yield
+
+
+@pytest.fixture(scope="module")
+def doclist_corpus(module_db):
+    """
+    Build a 5 000-document corpus with tags, notes, custom fields, correspondents,
+    doc types, and storage paths.  All objects are deleted on teardown.
+    """
+    fake = Faker()
+    Faker.seed(SEED)
+    rng = random.Random(SEED)
+
+    print(f"\n[setup] Creating {NUM_CORRESPONDENTS} correspondents...")  # noqa: T201
+    correspondents = [
+        Correspondent.objects.create(name=f"dlcorp-{i}-{fake.company()}"[:128])
+        for i in range(NUM_CORRESPONDENTS)
+    ]
+
+    print(f"[setup] Creating {NUM_DOC_TYPES} doc types...")  # noqa: T201
+    doc_types = [
+        DocumentType.objects.create(name=f"dltype-{i}-{fake.word()}"[:128])
+        for i in range(NUM_DOC_TYPES)
+    ]
+
+    print(f"[setup] Creating {NUM_TAGS} tags...")  # noqa: T201
+    tags = [
+        Tag.objects.create(name=f"dltag-{i}-{fake.word()}"[:100])
+        for i in range(NUM_TAGS)
+    ]
+
+    print(f"[setup] Creating {CUSTOM_FIELD_COUNT} custom fields...")  # noqa: T201
+    custom_fields = [
+        CustomField.objects.create(
+            name=f"Field {i}",
+            data_type=CustomField.FieldDataType.STRING,
+        )
+        for i in range(CUSTOM_FIELD_COUNT)
+    ]
+
+    note_user = User.objects.create_user(username="doclistnoteuser", password="x")
+    owner = User.objects.create_superuser(username="doclistowner", password="admin")
+
+    print(f"[setup] Building {NUM_DOCS} document rows...")  # noqa: T201
+    base_date = datetime.date(2018, 1, 1)
+    raw_docs = []
+    for i in range(NUM_DOCS):
+        day_offset = rng.randint(0, 6 * 365)
+        raw_docs.append(
+            Document(
+                title=fake.sentence(nb_words=rng.randint(3, 8)).rstrip("."),
+                content="\n\n".join(
+                    fake.paragraph(nb_sentences=rng.randint(2, 5))
+                    for _ in range(rng.randint(1, 3))
+                ),
+                checksum=f"DL{i:07d}",
+                correspondent=rng.choice(correspondents + [None] * 5),
+                document_type=rng.choice(doc_types + [None] * 4),
+                created=base_date + datetime.timedelta(days=day_offset),
+                owner=owner if rng.random() < 0.8 else None,
+            ),
+        )
+    t0 = time.perf_counter()
+    documents = Document.objects.bulk_create(raw_docs)
+    print(f"[setup] bulk_create {NUM_DOCS} docs: {time.perf_counter() - t0:.2f}s")  # noqa: T201
+
+    t0 = time.perf_counter()
+    for doc in documents:
+        k = rng.randint(0, 5)
+        if k:
+            doc.tags.add(*rng.sample(tags, k))
+    print(f"[setup] tag M2M assignments: {time.perf_counter() - t0:.2f}s")  # noqa: T201
+
+    note_docs = rng.sample(documents, int(NUM_DOCS * NOTE_FRACTION))
+    Note.objects.bulk_create(
+        [
+            Note(
+                document=doc,
+                note=fake.sentence(nb_words=rng.randint(4, 15)),
+                user=note_user,
+            )
+            for doc in note_docs
+        ],
+    )
+
+    cf_docs = rng.sample(documents, int(NUM_DOCS * CUSTOM_FIELD_FRACTION))
+    CustomFieldInstance.objects.bulk_create(
+        [
+            CustomFieldInstance(
+                document=doc,
+                field=rng.choice(custom_fields),
+                value_text=fake.word(),
+            )
+            for doc in cf_docs
+        ],
+    )
+
+    first_doc_pk = documents[0].pk
+
+    yield {"owner": owner, "first_doc_pk": first_doc_pk, "tags": tags}
+
+    print("\n[teardown] Removing doclist corpus...")  # noqa: T201
+    Document.objects.all().delete()
+    Correspondent.objects.all().delete()
+    DocumentType.objects.all().delete()
+    Tag.objects.all().delete()
+    CustomField.objects.all().delete()
+    User.objects.filter(username__in=["doclistnoteuser", "doclistowner"]).delete()
+
+
+# ---------------------------------------------------------------------------
+# TestDocListProfile
+# ---------------------------------------------------------------------------
+
+
+class TestDocListProfile:
+    """Profile GET /api/documents/ — pure ORM path, no Tantivy."""
+
+    @pytest.fixture(autouse=True)
+    def _client(self, doclist_corpus):
+        owner = doclist_corpus["owner"]
+        self.client = APIClient()
+        self.client.force_authenticate(user=owner)
+        self.first_doc_pk = doclist_corpus["first_doc_pk"]
+
+    def test_list_default_ordering(self):
+        """GET /api/documents/ default ordering (-created), page 1, page_size=25."""
+        with profile_block(
+            f"GET /api/documents/ default ordering  [page_size={PAGE_SIZE}]",
+        ):
+            response = self.client.get(
+                f"/api/documents/?page=1&page_size={PAGE_SIZE}",
+            )
+        assert response.status_code == 200
+
+    def test_list_title_ordering(self):
+        """GET /api/documents/ ordered by title — tests ORM sort path."""
+        with profile_block(
+            f"GET /api/documents/?ordering=title  [page_size={PAGE_SIZE}]",
+        ):
+            response = self.client.get(
+                f"/api/documents/?ordering=title&page=1&page_size={PAGE_SIZE}",
+            )
+        assert response.status_code == 200
+
+    def test_list_page_size_comparison(self):
+        """Compare serializer cost at page_size=10, 25, 100."""
+        for page_size in [10, 25, 100]:
+            with profile_block(f"GET /api/documents/  [page_size={page_size}]"):
+                response = self.client.get(
+                    f"/api/documents/?page=1&page_size={page_size}",
+                )
+            assert response.status_code == 200
+
+    def test_list_detail_fields(self):
+        """GET /api/documents/{id}/ — per-doc serializer cost with all relations."""
+        pk = self.first_doc_pk
+        with profile_block(f"GET /api/documents/{pk}/ — single doc serializer"):
+            response = self.client.get(f"/api/documents/{pk}/")
+        assert response.status_code == 200
+
+    def test_list_cpu_profile(self):
+        """cProfile of one list request — surfaces hot frames in serializer."""
+        profile_cpu(
+            lambda: self.client.get(
+                f"/api/documents/?page=1&page_size={PAGE_SIZE}",
+            ),
+            label=f"GET /api/documents/ cProfile  [page_size={PAGE_SIZE}]",
+            top=30,
+        )
+
+
+# ---------------------------------------------------------------------------
+# TestSelectionDataProfile
+# ---------------------------------------------------------------------------
+
+
+class TestSelectionDataProfile:
+    """Profile _get_selection_data_for_queryset — the 5+ COUNT queries per request."""
+
+    @pytest.fixture(autouse=True)
+    def _setup(self, doclist_corpus):
+        owner = doclist_corpus["owner"]
+        self.client = APIClient()
+        self.client.force_authenticate(user=owner)
+        self.tags = doclist_corpus["tags"]
+
+    def test_selection_data_unfiltered(self):
+        """Call _get_selection_data_for_queryset(all docs) directly — COUNT queries in isolation."""
+        viewset = DocumentViewSet()
+        qs = Document.objects.all()
+
+        with profile_block("_get_selection_data_for_queryset(all docs) — direct call"):
+            viewset._get_selection_data_for_queryset(qs)
+
+    def test_selection_data_via_api(self):
+        """Full API round-trip with include_selection_data=true."""
+        with profile_block(
+            f"GET /api/documents/?include_selection_data=true  [page_size={PAGE_SIZE}]",
+        ):
+            response = self.client.get(
+                f"/api/documents/?page=1&page_size={PAGE_SIZE}&include_selection_data=true",
+            )
+        assert response.status_code == 200
+        assert "selection_data" in response.data
+
+    def test_selection_data_filtered(self):
+        """selection_data on a tag-filtered queryset — filtered COUNT vs unfiltered."""
+        tag = self.tags[0]
+        viewset = DocumentViewSet()
+        filtered_qs = Document.objects.filter(tags=tag)
+        unfiltered_qs = Document.objects.all()
+
+        print(f"\n  Tag '{tag.name}' matches {filtered_qs.count()} docs")  # noqa: T201
+
+        with profile_block("_get_selection_data_for_queryset(unfiltered)"):
+            viewset._get_selection_data_for_queryset(unfiltered_qs)
+
+        with profile_block("_get_selection_data_for_queryset(filtered by tag)"):
+            viewset._get_selection_data_for_queryset(filtered_qs)