mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-26 23:34:17 +00:00
feat(profiling): add document list API and selection_data profiling
Adds test_doclist_profile.py with 8 profiling tests covering the /api/documents/ list path (ORM ordering, page sizes, single-doc detail, cProfile) and _get_selection_data_for_queryset in isolation and via API. Also registers the 'profiling' pytest marker in pyproject.toml. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -312,6 +312,7 @@ markers = [
|
||||
"date_parsing: Tests which cover date parsing from content or filename",
|
||||
"management: Tests which cover management commands/functionality",
|
||||
"search: Tests for the Tantivy search backend",
|
||||
"profiling: Performance profiling tests — print measurements, no assertions",
|
||||
]
|
||||
|
||||
[tool.pytest_env]
|
||||
|
||||
@@ -0,0 +1,292 @@
|
||||
"""
|
||||
Document list API profiling — no search, pure ORM path.
|
||||
|
||||
Run with:
|
||||
uv run pytest ../test_doclist_profile.py \
|
||||
-m profiling --override-ini="addopts=" -s -v
|
||||
|
||||
Corpus: 5 000 documents, 30 correspondents, 20 doc types, 80 tags,
|
||||
~500 notes (10 %), 10 custom fields with instances on ~50 % of docs.
|
||||
|
||||
Scenarios
|
||||
---------
|
||||
TestDocListProfile
|
||||
- test_list_default_ordering GET /api/documents/ created desc, page 1, page_size=25
|
||||
- test_list_title_ordering same with ordering=title
|
||||
- test_list_page_size_comparison page_size=10 / 25 / 100 in sequence
|
||||
- test_list_detail_fields GET /api/documents/{id}/ — single document serializer cost
|
||||
- test_list_cpu_profile cProfile of one list request
|
||||
|
||||
TestSelectionDataProfile
|
||||
- test_selection_data_unfiltered _get_selection_data_for_queryset(all docs) in isolation
|
||||
- test_selection_data_via_api GET /api/documents/?include_selection_data=true
|
||||
- test_selection_data_filtered filtered vs unfiltered COUNT query comparison
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import random
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from django.contrib.auth.models import User
|
||||
from faker import Faker
|
||||
from profiling import profile_block
|
||||
from profiling import profile_cpu
|
||||
from rest_framework.test import APIClient
|
||||
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import Note
|
||||
from documents.models import Tag
|
||||
from documents.views import DocumentViewSet
|
||||
|
||||
# Module-wide marks: every test here carries the "profiling" marker and the
# pytest-django "django_db" marker.
pytestmark = [pytest.mark.profiling, pytest.mark.django_db]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Corpus parameters
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
NUM_DOCS = 5_000  # documents inserted by the corpus fixture
NUM_CORRESPONDENTS = 30  # correspondents created before the documents
NUM_DOC_TYPES = 20  # document types created before the documents
NUM_TAGS = 80  # tags available for per-document assignment
NOTE_FRACTION = 0.10  # share of documents that receive one note
CUSTOM_FIELD_COUNT = 10  # custom field definitions created
CUSTOM_FIELD_FRACTION = 0.50  # share of documents that receive one custom field instance
PAGE_SIZE = 25  # page_size query parameter used by the list requests
SEED = 42  # seeds both Faker and the local random.Random instance
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-scoped corpus fixture
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def module_db(django_db_setup, django_db_blocker):
    """
    Keep database access unblocked for the lifetime of the requesting module.

    ``django_db_setup`` is requested so it is resolved before the blocker is
    lifted; the ``with`` block is left once the module's tests have finished.
    """
    unblocked = django_db_blocker.unblock()
    with unblocked:
        yield
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def doclist_corpus(module_db):
    """
    Build the profiling corpus once per module and remove it afterwards.

    Creates NUM_CORRESPONDENTS correspondents, NUM_DOC_TYPES document types,
    NUM_TAGS tags, CUSTOM_FIELD_COUNT string custom fields, two users and
    NUM_DOCS documents.  Each document receives between 0 and 5 tags;
    NOTE_FRACTION of the documents get one note and CUSTOM_FIELD_FRACTION get
    one custom field instance.  No storage paths are created.

    Yields a dict with the keys ``owner`` (the superuser that owns roughly
    80 % of the documents), ``first_doc_pk`` and ``tags``.

    Teardown deletes every Document, Correspondent, DocumentType, Tag and
    CustomField row (not only the ones created here) plus the two users.
    """
    fake = Faker()
    Faker.seed(SEED)
    rng = random.Random(SEED)

    print(f"\n[setup] Creating {NUM_CORRESPONDENTS} correspondents...")  # noqa: T201
    correspondents = [
        Correspondent.objects.create(name=f"dlcorp-{i}-{fake.company()}"[:128])
        for i in range(NUM_CORRESPONDENTS)
    ]

    print(f"[setup] Creating {NUM_DOC_TYPES} doc types...")  # noqa: T201
    doc_types = [
        DocumentType.objects.create(name=f"dltype-{i}-{fake.word()}"[:128])
        for i in range(NUM_DOC_TYPES)
    ]

    print(f"[setup] Creating {NUM_TAGS} tags...")  # noqa: T201
    tags = [
        Tag.objects.create(name=f"dltag-{i}-{fake.word()}"[:100])
        for i in range(NUM_TAGS)
    ]

    print(f"[setup] Creating {CUSTOM_FIELD_COUNT} custom fields...")  # noqa: T201
    custom_fields = [
        CustomField.objects.create(
            name=f"Field {i}",
            data_type=CustomField.FieldDataType.STRING,
        )
        for i in range(CUSTOM_FIELD_COUNT)
    ]

    note_user = User.objects.create_user(username="doclistnoteuser", password="x")
    owner = User.objects.create_superuser(username="doclistowner", password="admin")

    # Candidate pools are identical for every document, so build them once
    # instead of concatenating two fresh lists per document.  The None entries
    # leave some documents without a correspondent / document type.
    correspondent_pool = correspondents + [None] * 5
    doc_type_pool = doc_types + [None] * 4

    print(f"[setup] Building {NUM_DOCS} document rows...")  # noqa: T201
    base_date = datetime.date(2018, 1, 1)
    raw_docs = []
    for i in range(NUM_DOCS):
        # The order of rng/fake calls below determines the generated corpus;
        # keep it stable so runs stay comparable.
        day_offset = rng.randint(0, 6 * 365)
        raw_docs.append(
            Document(
                title=fake.sentence(nb_words=rng.randint(3, 8)).rstrip("."),
                content="\n\n".join(
                    fake.paragraph(nb_sentences=rng.randint(2, 5))
                    for _ in range(rng.randint(1, 3))
                ),
                checksum=f"DL{i:07d}",
                correspondent=rng.choice(correspondent_pool),
                document_type=rng.choice(doc_type_pool),
                created=base_date + datetime.timedelta(days=day_offset),
                owner=owner if rng.random() < 0.8 else None,
            ),
        )
    t0 = time.perf_counter()
    documents = Document.objects.bulk_create(raw_docs)
    print(f"[setup] bulk_create {NUM_DOCS} docs: {time.perf_counter() - t0:.2f}s")  # noqa: T201

    t0 = time.perf_counter()
    for doc in documents:
        k = rng.randint(0, 5)
        if k:
            doc.tags.add(*rng.sample(tags, k))
    print(f"[setup] tag M2M assignments: {time.perf_counter() - t0:.2f}s")  # noqa: T201

    note_docs = rng.sample(documents, int(NUM_DOCS * NOTE_FRACTION))
    Note.objects.bulk_create(
        [
            Note(
                document=doc,
                note=fake.sentence(nb_words=rng.randint(4, 15)),
                user=note_user,
            )
            for doc in note_docs
        ],
    )

    cf_docs = rng.sample(documents, int(NUM_DOCS * CUSTOM_FIELD_FRACTION))
    CustomFieldInstance.objects.bulk_create(
        [
            CustomFieldInstance(
                document=doc,
                field=rng.choice(custom_fields),
                value_text=fake.word(),
            )
            for doc in cf_docs
        ],
    )

    first_doc_pk = documents[0].pk

    yield {"owner": owner, "first_doc_pk": first_doc_pk, "tags": tags}

    print("\n[teardown] Removing doclist corpus...")  # noqa: T201
    # NOTE(review): if Document's default manager soft-deletes, rows may
    # survive this call - confirm whether a hard delete is needed here.
    Document.objects.all().delete()
    Correspondent.objects.all().delete()
    DocumentType.objects.all().delete()
    Tag.objects.all().delete()
    CustomField.objects.all().delete()
    User.objects.filter(username__in=["doclistnoteuser", "doclistowner"]).delete()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TestDocListProfile
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDocListProfile:
    """Profile GET /api/documents/ — pure ORM path, no Tantivy."""

    @pytest.fixture(autouse=True)
    def _client(self, doclist_corpus):
        """Give each test an API client authenticated as the corpus owner."""
        owner = doclist_corpus["owner"]
        self.client = APIClient()
        self.client.force_authenticate(user=owner)
        self.first_doc_pk = doclist_corpus["first_doc_pk"]

    def test_list_default_ordering(self):
        """GET /api/documents/ default ordering (-created), page 1, page_size=25."""
        with profile_block(
            f"GET /api/documents/ default ordering [page_size={PAGE_SIZE}]",
        ):
            response = self.client.get(
                f"/api/documents/?page=1&page_size={PAGE_SIZE}",
            )
        assert response.status_code == 200

    def test_list_title_ordering(self):
        """GET /api/documents/ ordered by title — tests ORM sort path."""
        with profile_block(
            f"GET /api/documents/?ordering=title [page_size={PAGE_SIZE}]",
        ):
            response = self.client.get(
                f"/api/documents/?ordering=title&page=1&page_size={PAGE_SIZE}",
            )
        assert response.status_code == 200

    def test_list_page_size_comparison(self):
        """Compare serializer cost at page_size=10, 25, 100."""
        for page_size in [10, 25, 100]:
            with profile_block(f"GET /api/documents/ [page_size={page_size}]"):
                response = self.client.get(
                    f"/api/documents/?page=1&page_size={page_size}",
                )
            # Checked per page size so a failing request cannot hide behind
            # a later successful one.
            assert response.status_code == 200

    def test_list_detail_fields(self):
        """GET /api/documents/{id}/ — per-doc serializer cost with all relations."""
        pk = self.first_doc_pk
        with profile_block(f"GET /api/documents/{pk}/ — single doc serializer"):
            response = self.client.get(f"/api/documents/{pk}/")
        assert response.status_code == 200

    def test_list_cpu_profile(self):
        """cProfile of one list request — surfaces hot frames in serializer."""
        responses = []

        def request():
            # Record the response so the test can verify it profiled a
            # successful request; still return it, as the original lambda did.
            response = self.client.get(
                f"/api/documents/?page=1&page_size={PAGE_SIZE}",
            )
            responses.append(response)
            return response

        profile_cpu(
            request,
            label=f"GET /api/documents/ cProfile [page_size={PAGE_SIZE}]",
            top=30,
        )
        # Same success guard as the other list tests: profiling an error
        # response would produce misleading numbers.
        assert responses, "profile_cpu did not invoke the request callable"
        assert all(r.status_code == 200 for r in responses)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TestSelectionDataProfile
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSelectionDataProfile:
    """Profile _get_selection_data_for_queryset — the 5+ COUNT queries per request."""

    @pytest.fixture(autouse=True)
    def _setup(self, doclist_corpus):
        """Attach an owner-authenticated API client and the corpus tags to the test."""
        api_client = APIClient()
        api_client.force_authenticate(user=doclist_corpus["owner"])
        self.client = api_client
        self.tags = doclist_corpus["tags"]

    def test_selection_data_unfiltered(self):
        """Time a direct _get_selection_data_for_queryset call over all documents."""
        view = DocumentViewSet()
        all_docs = Document.objects.all()

        with profile_block("_get_selection_data_for_queryset(all docs) — direct call"):
            view._get_selection_data_for_queryset(all_docs)

    def test_selection_data_via_api(self):
        """Time the list endpoint with include_selection_data=true and check the payload."""
        url = f"/api/documents/?page=1&page_size={PAGE_SIZE}&include_selection_data=true"
        label = (
            f"GET /api/documents/?include_selection_data=true [page_size={PAGE_SIZE}]"
        )
        with profile_block(label):
            response = self.client.get(url)
        assert response.status_code == 200
        assert "selection_data" in response.data

    def test_selection_data_filtered(self):
        """Time selection data for all documents, then for documents with the first tag."""
        tag = self.tags[0]
        view = DocumentViewSet()
        tagged_docs = Document.objects.filter(tags=tag)
        all_docs = Document.objects.all()

        match_count = tagged_docs.count()
        print(f"\n Tag '{tag.name}' matches {match_count} docs")  # noqa: T201

        with profile_block("_get_selection_data_for_queryset(unfiltered)"):
            view._get_selection_data_for_queryset(all_docs)

        with profile_block("_get_selection_data_for_queryset(filtered by tag)"):
            view._get_selection_data_for_queryset(tagged_docs)
|
||||
Reference in New Issue
Block a user