diff --git a/test_backend_profile.py b/test_backend_profile.py index e700c1091..696e84bda 100644 --- a/test_backend_profile.py +++ b/test_backend_profile.py @@ -3,7 +3,7 @@ cProfile-based search pipeline profiling with a 20k-document dataset. Run with: - uv run pytest src/documents/tests/search/test_backend_profile.py \ + uv run pytest ../test_backend_profile.py \ -m profiling --override-ini="addopts=" -s -v Each scenario prints: diff --git a/test_classifier_profile.py b/test_classifier_profile.py index be95b2603..c87f89be8 100644 --- a/test_classifier_profile.py +++ b/test_classifier_profile.py @@ -3,7 +3,7 @@ cProfile + tracemalloc classifier profiling test. Run with: - uv run pytest src/documents/tests/test_classifier_profile.py \ + uv run pytest ../test_classifier_profile.py \ -m profiling --override-ini="addopts=" -s -v Corpus: 5 000 documents, 40 correspondents (25 AUTO), 25 doc types (15 AUTO), diff --git a/test_search_profiling.py b/test_search_profiling.py new file mode 100644 index 000000000..07c218fd2 --- /dev/null +++ b/test_search_profiling.py @@ -0,0 +1,273 @@ +""" +Search performance profiling tests. + +Run explicitly — excluded from the normal test suite: + + uv run pytest -m profiling -s -p no:xdist --override-ini="addopts=" -v + +The ``-s`` flag is required to see profile_block() output. +The ``-p no:xdist`` flag disables parallel execution for accurate measurements. + +Corpus: 5 000 documents generated deterministically from a fixed Faker seed, +with realistic variety: 30 correspondents, 15 document types, 50 tags, ~500 +notes spread across ~10 % of documents. 
from __future__ import annotations

import random

import pytest
from django.contrib.auth.models import User
from faker import Faker
from profiling import profile_block
from rest_framework.test import APIClient

from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import Tag
from documents.search import get_backend
from documents.search import reset_backend
from documents.search._backend import SearchMode

pytestmark = [pytest.mark.profiling, pytest.mark.search, pytest.mark.django_db]

# ---------------------------------------------------------------------------
# Corpus parameters
# ---------------------------------------------------------------------------

DOC_COUNT = 5_000
SEED = 42
NUM_CORRESPONDENTS = 30
NUM_DOC_TYPES = 15
NUM_TAGS = 50
NOTE_FRACTION = 0.10  # ~500 documents get a note
PAGE_SIZE = 25


def _build_corpus(rng: random.Random, fake: Faker) -> None:
    """
    Insert the full corpus into the database and index it.

    Document rows are written with a single ``bulk_create``; the M2M tag
    links and the notes are then attached one document at a time.  The
    whole corpus is indexed with a single ``backend.rebuild()`` call.

    Args:
        rng: Source of randomness for structural choices (creation dates,
            sentence/paragraph counts, and which correspondent, document
            type, tags and notes each document receives).
        fake: Faker instance that produces all generated text.

    The corpus is defined by the order in which values are drawn from
    ``rng`` and ``fake``; statements that consume randomness must not be
    reordered if the corpus is to stay comparable between runs.
    """
    import datetime

    # ---- lookup objects -------------------------------------------------
    correspondents = [
        Correspondent.objects.create(name=f"profcorp-{i}-{fake.company()}"[:128])
        for i in range(NUM_CORRESPONDENTS)
    ]
    doc_types = [
        DocumentType.objects.create(name=f"proftype-{i}-{fake.word()}"[:128])
        for i in range(NUM_DOC_TYPES)
    ]
    tags = [
        Tag.objects.create(name=f"proftag-{i}-{fake.word()}"[:100])
        for i in range(NUM_TAGS)
    ]
    note_user = User.objects.create_user(username="profnoteuser", password="x")

    # Candidate pools padded with None so that a share of the documents has
    # no correspondent / no document type.  Built once instead of once per
    # document; length and element order are unchanged, so rng.choice()
    # picks exactly what it picked before.
    correspondent_pool = correspondents + [None] * 8
    doc_type_pool = doc_types + [None] * 4

    # ---- bulk-create documents ------------------------------------------
    base_date = datetime.date(2018, 1, 1)
    raw_docs = []
    for i in range(DOC_COUNT):
        day_offset = rng.randint(0, 6 * 365)
        created = base_date + datetime.timedelta(days=day_offset)
        raw_docs.append(
            Document(
                title=fake.sentence(nb_words=rng.randint(3, 9)).rstrip("."),
                content="\n\n".join(
                    fake.paragraph(nb_sentences=rng.randint(3, 7))
                    for _ in range(rng.randint(2, 5))
                ),
                checksum=f"PROF{i:07d}",
                correspondent=rng.choice(correspondent_pool),
                document_type=rng.choice(doc_type_pool),
                created=created,
            ),
        )
    # The returned instances are used below for M2M adds and note creation.
    documents = Document.objects.bulk_create(raw_docs)

    # ---- tags (M2M, post-bulk) ------------------------------------------
    for doc in documents:
        k = rng.randint(0, 5)
        if k:
            doc.tags.add(*rng.sample(tags, k))

    # ---- notes on ~10 % of docs -----------------------------------------
    note_docs = rng.sample(documents, int(DOC_COUNT * NOTE_FRACTION))
    for doc in note_docs:
        Note.objects.create(
            document=doc,
            note=fake.sentence(nb_words=rng.randint(6, 20)),
            user=note_user,
        )

    # ---- build Tantivy index --------------------------------------------
    backend = get_backend()
    # Related objects are loaded up front so rebuild() does not have to
    # fetch them per document.
    qs = Document.objects.select_related(
        "correspondent",
        "document_type",
        "storage_path",
        "owner",
    ).prefetch_related("tags", "notes__user", "custom_fields__field")
    backend.rebuild(qs)
class TestSearchProfiling:
    """
    Performance profiling for the Tantivy search backend and DRF API layer.

    Each test builds a fresh 5 000-document corpus (via the autouse
    ``_setup`` fixture), exercises one hot path, and prints
    profile_block() measurements to stdout.  The backend tests make no
    correctness assertions; the API tests only check that the request
    succeeded.  The goal is to surface hot spots and track regressions.
    """

    @pytest.fixture(autouse=True)
    def _setup(self, tmp_path, settings):
        """
        Point the search index at a per-test temp dir and build the corpus.

        Sets ``self.user`` (superuser) and ``self.client`` (authenticated
        ``APIClient``) for the API tests.
        """
        index_dir = tmp_path / "index"
        index_dir.mkdir()
        settings.INDEX_DIR = index_dir

        # NOTE(review): reset_backend() presumably discards the cached
        # backend so the next get_backend() honours settings.INDEX_DIR —
        # confirm against documents.search.
        reset_backend()
        try:
            rng = random.Random(SEED)
            fake = Faker()
            # Seed this instance only.  Faker.seed() would seed the RNG
            # shared by every Faker instance in the process, leaking state
            # to other tests and making the corpus depend on whatever else
            # draws from that shared RNG.
            fake.seed_instance(SEED)

            self.user = User.objects.create_superuser(
                username="profiler",
                password="admin",
            )
            self.client = APIClient()
            self.client.force_authenticate(user=self.user)

            _build_corpus(rng, fake)
            yield
        finally:
            # Runs even when setup fails before the yield, so a backend
            # bound to this test's temp dir is never left behind.
            reset_backend()

    # -- 1. Backend: search_ids relevance ---------------------------------

    def test_profile_search_ids_relevance(self):
        """Profile: search_ids() with relevance ordering across several queries."""
        backend = get_backend()
        queries = [
            "invoice payment",
            "annual report",
            "bank statement",
            "contract agreement",
            "receipt",
        ]
        with profile_block(f"search_ids — relevance ({len(queries)} queries)"):
            for q in queries:
                backend.search_ids(q, user=None)

    # -- 2. Backend: search_ids with Tantivy-native sort ------------------

    def test_profile_search_ids_sorted(self):
        """Profile: search_ids() sorted by a Tantivy fast field (created)."""
        backend = get_backend()
        with profile_block("search_ids — sorted by created (asc + desc)"):
            backend.search_ids(
                "the",
                user=None,
                sort_field="created",
                sort_reverse=False,
            )
            backend.search_ids(
                "the",
                user=None,
                sort_field="created",
                sort_reverse=True,
            )

    # -- 3. Backend: highlight_hits for a page of 25 ----------------------

    def test_profile_highlight_hits(self):
        """Profile: highlight_hits() for a page of up to 25 documents."""
        backend = get_backend()
        # The id lookup happens outside the profiled block on purpose:
        # only highlight_hits() is being measured here.
        all_ids = backend.search_ids("report", user=None)
        page_ids = all_ids[:PAGE_SIZE]
        with profile_block(f"highlight_hits — {len(page_ids)} docs"):
            backend.highlight_hits("report", page_ids)

    # -- 4. Backend: autocomplete -----------------------------------------

    def test_profile_autocomplete(self):
        """Profile: autocomplete() with eight common prefixes."""
        backend = get_backend()
        prefixes = ["inv", "pay", "con", "rep", "sta", "acc", "doc", "fin"]
        with profile_block(f"autocomplete — {len(prefixes)} prefixes"):
            for prefix in prefixes:
                backend.autocomplete(prefix, limit=10)

    # -- 5. Backend: simple-mode search (TEXT and TITLE) ------------------

    def test_profile_search_ids_simple_modes(self):
        """Profile: search_ids() in TEXT and TITLE simple-search modes."""
        backend = get_backend()
        queries = ["invoice 2023", "annual report", "bank statement"]
        with profile_block(
            f"search_ids — TEXT + TITLE modes ({len(queries)} queries each)",
        ):
            for q in queries:
                backend.search_ids(q, user=None, search_mode=SearchMode.TEXT)
                backend.search_ids(q, user=None, search_mode=SearchMode.TITLE)

    # -- 6. API: full round-trip, relevance + page 1 ----------------------

    def test_profile_api_relevance_search(self):
        """Profile: full API search round-trip, relevance order, page 1."""
        with profile_block(
            f"API /documents/?query=… relevance (page 1, page_size={PAGE_SIZE})",
        ):
            response = self.client.get(
                f"/api/documents/?query=invoice+payment&page=1&page_size={PAGE_SIZE}",
            )
        assert response.status_code == 200

    # -- 7. API: full round-trip, ORM-ordered (title) ---------------------

    def test_profile_api_orm_sorted_search(self):
        """Profile: full API search round-trip with ORM-delegated sort (title)."""
        with profile_block("API /documents/?query=…&ordering=title"):
            response = self.client.get(
                f"/api/documents/?query=report&ordering=title&page=1&page_size={PAGE_SIZE}",
            )
        assert response.status_code == 200

    # -- 8. API: full round-trip, score sort ------------------------------

    def test_profile_api_score_sort(self):
        """Profile: full API search with ordering=-score (relevance, preserve order)."""
        with profile_block("API /documents/?query=…&ordering=-score"):
            response = self.client.get(
                f"/api/documents/?query=statement&ordering=-score&page=1&page_size={PAGE_SIZE}",
            )
        assert response.status_code == 200

    # -- 9. API: full round-trip, with selection_data ---------------------

    def test_profile_api_with_selection_data(self):
        """Profile: full API search including include_selection_data=true."""
        with profile_block("API /documents/?query=…&include_selection_data=true"):
            response = self.client.get(
                f"/api/documents/?query=contract&page=1&page_size={PAGE_SIZE}"
                "&include_selection_data=true",
            )
        assert response.status_code == 200
        assert "selection_data" in response.data

    # -- 10. API: paginated (page 2) --------------------------------------

    def test_profile_api_page_2(self):
        """Profile: full API search, page 2 — exercises page offset arithmetic."""
        with profile_block(f"API /documents/?query=…&page=2&page_size={PAGE_SIZE}"):
            response = self.client.get(
                f"/api/documents/?query=the&page=2&page_size={PAGE_SIZE}",
            )
        assert response.status_code == 200