diff --git a/docs/superpowers/plans/2026-04-03-search-performance.md b/docs/superpowers/plans/2026-04-03-search-performance.md deleted file mode 100644 index 0456cf7b5..000000000 --- a/docs/superpowers/plans/2026-04-03-search-performance.md +++ /dev/null @@ -1,1184 +0,0 @@ -# Search Performance Improvements - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Eliminate wasted work in the Tantivy search pipeline — stop generating highlights for 10,000 hits when only 25 are displayed, delegate sorting to Tantivy instead of duplicating it in the ORM, and provide a lightweight ID-only query path. - -**Architecture:** Modify `search()` so it still returns ALL matching hits (preserving DRF pagination compatibility) but only generates expensive highlights for a caller-specified page slice. The viewset passes the real DRF `page`/`page_size` as a `highlight_page` parameter so only ~25 hits pay the snippet cost instead of ~10,000. Push DRF `ordering` through to Tantivy's native `order_by_field` instead of re-sorting in Python. Add a lightweight `search_ids()` for cases where only IDs are needed. Keep the ORM intersection as a correctness backstop for filters Tantivy can't express (custom fields, content icontains). - -**Key design constraint:** DRF's `PageNumberPagination` trusts `len(object_list)` for page count and slices `object_list[start:end]` for each page. We must NOT pass pre-sliced data with a mismatched length — that causes pages 2+ to return empty. Instead, `TantivyRelevanceList` always contains ALL hits; DRF slices it as usual. 
- -**Tech Stack:** Python, Django REST Framework, tantivy-py, pytest - ---- - -## File Map - -| File | Responsibility | Tasks | -| ---------------------------------------------- | ----------------------------------------------------------------- | ------- | -| `src/documents/profiling.py` | `profile_block()` context manager — wall time, memory, DB queries | 0, 6 | -| `src/documents/search/_backend.py` | Search backend — `search()`, `search_ids()`, `more_like_this()` | 1, 2, 3 | -| `src/documents/search/__init__.py` | Public re-exports | — | -| `src/documents/views.py` | `UnifiedSearchViewSet.list()` — orchestrates search + pagination | 4 | -| `src/paperless/views.py` | `StandardPagination` — DRF pagination | — | -| `src/documents/tests/search/test_backend.py` | Backend unit tests | 1, 2, 3 | -| `src/documents/tests/test_api_search.py` | API integration tests | 4 | -| `src/documents/tests/test_search_profiling.py` | Profiling tests (temporary) — before/after baselines | 0, 6 | - ---- - -### Task 0: Baseline profiling - -Capture performance baselines for the current implementation using `profile_block()` from `src/documents/profiling.py`. This data will be compared against the post-implementation measurements in Task 5. - -We profile three representative scenarios: - -1. **Relevance search** (no ordering) — the default path, exercises highlights for all hits -2. **Sorted search** (ordering=created) — exercises the ORM re-sort path -3. **Paginated search** (page 2) — exercises the overfetch + DRF slice path - -**Files:** - -- Create: `src/documents/tests/test_search_profiling.py` -- Read: `src/documents/profiling.py` - -- [ ] **Step 1: Create the profiling test file** - -Create `src/documents/tests/test_search_profiling.py`: - -```python -""" -Temporary profiling tests for search performance. - -Run with: uv run pytest src/documents/tests/test_search_profiling.py -v -s -The -s flag is required to see profile_block() output on stdout.
- -Delete this file when profiling is complete. -""" - -import pytest -from django.contrib.auth.models import User -from rest_framework.test import APIClient - -from documents.models import Document -from documents.profiling import profile_block -from documents.search import get_backend -from documents.search import reset_backend -from documents.tests.utils import DirectoriesMixin - -pytestmark = [pytest.mark.search, pytest.mark.django_db] - -DOC_COUNT = 200 # Enough to exercise pagination and overfetch behavior - - -class TestSearchProfilingBaseline(DirectoriesMixin): - """Baseline profiling of the CURRENT search implementation. - - Run BEFORE making changes, record the output, then compare with Task 6. - """ - - @pytest.fixture(autouse=True) - def _setup(self): - reset_backend() - self.user = User.objects.create_superuser(username="profiler") - self.client = APIClient() - self.client.force_authenticate(user=self.user) - - backend = get_backend() - for i in range(DOC_COUNT): - doc = Document.objects.create( - title=f"Profiling document number {i}", - content=f"This is searchable content for document {i} with keyword profiling", - checksum=f"PROF{i:04d}", - archive_serial_number=i + 1, - ) - backend.add_or_update(doc) - yield - reset_backend() - - def test_profile_relevance_search(self): - """Profile: relevance-ranked search, no ordering, page 1 default page_size.""" - with profile_block("BEFORE — relevance search (no ordering)"): - response = self.client.get("/api/documents/?query=profiling") - assert response.status_code == 200 - assert response.data["count"] == DOC_COUNT - - def test_profile_sorted_search(self): - """Profile: search with ORM-based ordering (created field).""" - with profile_block("BEFORE — sorted search (ordering=created)"): - response = self.client.get( - "/api/documents/?query=profiling&ordering=created" - ) - assert response.status_code == 200 - assert response.data["count"] == DOC_COUNT - - def test_profile_paginated_search(self): - 
"""Profile: search requesting page 2 with explicit page_size.""" - with profile_block("BEFORE — paginated search (page=2, page_size=25)"): - response = self.client.get( - "/api/documents/?query=profiling&page=2&page_size=25" - ) - assert response.status_code == 200 - assert len(response.data["results"]) == 25 - - def test_profile_search_with_selection_data(self): - """Profile: search with include_selection_data=true.""" - with profile_block("BEFORE — search with selection_data"): - response = self.client.get( - "/api/documents/?query=profiling&include_selection_data=true" - ) - assert response.status_code == 200 - assert "selection_data" in response.data - - def test_profile_backend_search_only(self): - """Profile: raw backend.search() call to isolate Tantivy cost from DRF.""" - backend = get_backend() - with profile_block("BEFORE — backend.search(page_size=10000, all highlights)"): - results = backend.search( - "profiling", - user=None, - page=1, - page_size=10000, - sort_field=None, - sort_reverse=False, - ) - assert results.total == DOC_COUNT - - def test_profile_backend_search_single_page(self): - """Profile: raw backend.search() with real page size to compare.""" - backend = get_backend() - with profile_block("BEFORE — backend.search(page_size=25)"): - results = backend.search( - "profiling", - user=None, - page=1, - page_size=25, - sort_field=None, - sort_reverse=False, - ) - assert len(results.hits) == 25 -``` - -- [ ] **Step 2: Run the profiling tests and record the output** - -```bash -cd /home/trenton/Documents/projects/paperless-ngx -uv run pytest src/documents/tests/test_search_profiling.py -v -s 2>&1 | tee docs/superpowers/plans/profiling-baseline.txt -``` - -Record the output. 
The key metrics to compare later: - -- **Wall time** for each scenario -- **DB query count** (especially for sorted search — expect extra queries for ORM re-sort) -- **Memory delta** (highlight generation for 200 docs vs 25) -- **Peak memory** - -- [ ] **Step 3: Commit the profiling test (temporary)** - -```bash -git add src/documents/tests/test_search_profiling.py docs/superpowers/plans/profiling-baseline.txt -git commit -m "test: add baseline profiling tests for search performance" -``` - ---- - -### Task 1: Add `highlight_page` parameter to `search()` — generate highlights only for one page - -The core performance fix. `search()` still returns ALL matching hits (IDs + scores + ranks), but only generates expensive snippet highlights for a single page slice. Hits outside that page get `highlights={}`. - -This preserves DRF compatibility: `TantivyRelevanceList` still has all hits, DRF slices as usual, but only the page being displayed pays the snippet cost. - -**Files:** - -- Modify: `src/documents/search/_backend.py:428-591` (the `search()` method) -- Test: `src/documents/tests/search/test_backend.py` - -- [ ] **Step 1: Write tests for the new highlight_page behavior** - -Add to `TestSearch` in `src/documents/tests/search/test_backend.py`: - -```python -def test_highlight_page_only_highlights_requested_slice(self, backend: TantivyBackend): - """Only hits in the highlight_page slice should have non-empty highlights.""" - for i in range(6): - doc = Document.objects.create( - title=f"highlight doc {i}", - content=f"searchable highlight content number {i}", - checksum=f"HP{i}", - archive_serial_number=i + 1, - ) - backend.add_or_update(doc) - - r = backend.search( - "searchable", - user=None, - page=1, - page_size=10000, - sort_field="archive_serial_number", - sort_reverse=False, - highlight_page=1, - highlight_page_size=3, - ) - assert r.total == 6 - assert len(r.hits) == 6 - # First 3 hits (the highlight page) should have highlights - for hit in r.hits[:3]: - assert 
hit["highlights"], f"Hit {hit['id']} should have highlights" - # Last 3 hits should NOT have highlights - for hit in r.hits[3:]: - assert hit["highlights"] == {}, f"Hit {hit['id']} should not have highlights" - -def test_highlight_page_2_highlights_correct_slice(self, backend: TantivyBackend): - """highlight_page=2 should highlight only the second page of results.""" - for i in range(6): - doc = Document.objects.create( - title=f"page2 doc {i}", - content=f"searchable page2 content number {i}", - checksum=f"HP2{i}", - archive_serial_number=i + 1, - ) - backend.add_or_update(doc) - - r = backend.search( - "searchable", - user=None, - page=1, - page_size=10000, - sort_field="archive_serial_number", - sort_reverse=False, - highlight_page=2, - highlight_page_size=2, - ) - assert r.total == 6 - assert len(r.hits) == 6 - # Hits 0-1: no highlights (page 1) - assert r.hits[0]["highlights"] == {} - assert r.hits[1]["highlights"] == {} - # Hits 2-3: highlighted (page 2) - assert r.hits[2]["highlights"] != {} - assert r.hits[3]["highlights"] != {} - # Hits 4-5: no highlights (page 3) - assert r.hits[4]["highlights"] == {} - assert r.hits[5]["highlights"] == {} - -def test_no_highlight_page_highlights_all(self, backend: TantivyBackend): - """When highlight_page is not specified, all hits get highlights (backward compat).""" - for i in range(3): - doc = Document.objects.create( - title=f"compat doc {i}", - content=f"searchable compat content {i}", - checksum=f"HC{i}", - ) - backend.add_or_update(doc) - - r = backend.search( - "searchable", - user=None, - page=1, - page_size=10000, - sort_field=None, - sort_reverse=False, - ) - assert len(r.hits) == 3 - for hit in r.hits: - assert "content" in hit["highlights"] -``` - -- [ ] **Step 2: Run tests to confirm they fail** - -```bash -cd /home/trenton/Documents/projects/paperless-ngx -uv run pytest src/documents/tests/search/test_backend.py::TestSearch::test_highlight_page_only_highlights_requested_slice 
src/documents/tests/search/test_backend.py::TestSearch::test_highlight_page_2_highlights_correct_slice src/documents/tests/search/test_backend.py::TestSearch::test_no_highlight_page_highlights_all -v -``` - -Expected: FAIL — `search()` doesn't accept `highlight_page` or `highlight_page_size` yet. - -- [ ] **Step 3: Implement highlight_page in `search()`** - -Modify `search()` in `src/documents/search/_backend.py`. Add `highlight_page` and `highlight_page_size` parameters. The key change is in the hit-building loop: only generate snippets for hits whose index falls within the highlight page. - -Change the signature from: - -```python -def search( - self, - query: str, - user: AbstractBaseUser | None, - page: int, - page_size: int, - sort_field: str | None, - *, - sort_reverse: bool, - search_mode: SearchMode = SearchMode.QUERY, -) -> SearchResults: -``` - -To: - -```python -def search( - self, - query: str, - user: AbstractBaseUser | None, - page: int, - page_size: int, - sort_field: str | None, - *, - sort_reverse: bool, - search_mode: SearchMode = SearchMode.QUERY, - highlight_page: int | None = None, - highlight_page_size: int | None = None, -) -> SearchResults: -``` - -Then replace the hit-building loop (lines 532-585) with: - -```python - # Build result hits — only generate highlights for the highlight page - hits: list[SearchHit] = [] - snippet_generator = None - - # Determine which hits need highlights - if highlight_page is not None and highlight_page_size is not None: - hl_start = (highlight_page - 1) * highlight_page_size - hl_end = hl_start + highlight_page_size - else: - # Highlight all hits (backward-compatible default) - hl_start = 0 - hl_end = len(page_hits) - - for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1): - actual_doc = searcher.doc(doc_address) - doc_dict = actual_doc.to_dict() - doc_id = doc_dict["id"][0] - - highlights: dict[str, str] = {} - - # Only generate highlights for hits in the highlight window - hit_index = 
rank - offset - 1 # 0-based index within page_hits - if score > 0 and hl_start <= hit_index < hl_end: - try: - if snippet_generator is None: - snippet_generator = tantivy.SnippetGenerator.create( - searcher, - final_query, - self._schema, - "content", - ) - - content_snippet = snippet_generator.snippet_from_doc(actual_doc) - if content_snippet: - highlights["content"] = str(content_snippet) - - if "notes" in doc_dict: - notes_generator = tantivy.SnippetGenerator.create( - searcher, - final_query, - self._schema, - "notes", - ) - notes_snippet = notes_generator.snippet_from_doc(actual_doc) - if notes_snippet: - highlights["notes"] = str(notes_snippet) - - except Exception: # pragma: no cover - logger.debug("Failed to generate highlights for doc %s", doc_id) - - hits.append( - SearchHit( - id=doc_id, - score=score, - rank=rank, - highlights=highlights, - ), - ) -``` - -- [ ] **Step 4: Run tests to confirm they pass** - -```bash -uv run pytest src/documents/tests/search/test_backend.py::TestSearch -v -``` - -Expected: ALL tests PASS — both new and existing. The existing tests don't pass `highlight_page`, so they use the backward-compatible default (highlight all). - -- [ ] **Step 5: Commit** - -```bash -git add src/documents/search/_backend.py src/documents/tests/search/test_backend.py -git commit -m "feat: add highlight_page parameter to search() for page-only highlights" -``` - ---- - -### Task 2: Add `search_ids()` lightweight method - -Add a method that returns only document IDs matching a query — no snippet generation and no `SearchHit` construction. Note that the Step 3 implementation still calls `searcher.doc()` once per hit to read the stored `id`; only the highlight and hit-object work is skipped. This is even lighter than `search()` with `highlight_page` because it skips building `SearchHit` objects entirely. Used by the viewset for `selection_data` when the full hit list isn't needed.
- -**Files:** - -- Modify: `src/documents/search/_backend.py` (add `search_ids()` method after `search()`) -- Test: `src/documents/tests/search/test_backend.py` - -- [ ] **Step 1: Write failing tests for search_ids** - -Add a new test class in `src/documents/tests/search/test_backend.py`: - -```python -class TestSearchIds: - """Test lightweight ID-only search.""" - - def test_returns_matching_ids(self, backend: TantivyBackend): - """search_ids must return IDs of all matching documents.""" - docs = [] - for i in range(5): - doc = Document.objects.create( - title=f"findable doc {i}", - content="common keyword", - checksum=f"SI{i}", - ) - backend.add_or_update(doc) - docs.append(doc) - other = Document.objects.create( - title="unrelated", - content="nothing here", - checksum="SI_other", - ) - backend.add_or_update(other) - - ids = backend.search_ids( - "common keyword", - user=None, - search_mode=SearchMode.QUERY, - ) - assert set(ids) == {d.pk for d in docs} - assert other.pk not in ids - - def test_respects_permission_filter(self, backend: TantivyBackend): - """search_ids must respect user permission filtering.""" - owner = User.objects.create_user("ids_owner") - other = User.objects.create_user("ids_other") - doc = Document.objects.create( - title="private doc", - content="secret keyword", - checksum="SIP1", - owner=owner, - ) - backend.add_or_update(doc) - - assert backend.search_ids("secret", user=owner, search_mode=SearchMode.QUERY) == [doc.pk] - assert backend.search_ids("secret", user=other, search_mode=SearchMode.QUERY) == [] - - def test_respects_fuzzy_threshold(self, backend: TantivyBackend, settings): - """search_ids must apply the same fuzzy threshold as search().""" - doc = Document.objects.create( - title="threshold test", - content="unique term", - checksum="SIT1", - ) - backend.add_or_update(doc) - - settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1 - ids = backend.search_ids("unique", user=None, search_mode=SearchMode.QUERY) - assert ids == [] - - def 
test_returns_ids_for_text_mode(self, backend: TantivyBackend): - """search_ids must work with TEXT search mode.""" - doc = Document.objects.create( - title="text mode doc", - content="findable phrase", - checksum="SIM1", - ) - backend.add_or_update(doc) - - ids = backend.search_ids("findable", user=None, search_mode=SearchMode.TEXT) - assert ids == [doc.pk] -``` - -- [ ] **Step 2: Run tests to confirm they fail** - -```bash -uv run pytest src/documents/tests/search/test_backend.py::TestSearchIds -v -``` - -Expected: FAIL with `AttributeError: 'TantivyBackend' object has no attribute 'search_ids'` - -- [ ] **Step 3: Implement `search_ids()`** - -Add after the `search()` method in `src/documents/search/_backend.py`: - -```python -def search_ids( - self, - query: str, - user: AbstractBaseUser | None, - *, - search_mode: SearchMode = SearchMode.QUERY, - limit: int = 10000, -) -> list[int]: - """ - Return document IDs matching a query — no highlights, no stored doc fetches. - - This is the lightweight companion to search(). Use it when you need the - full set of matching IDs (e.g. for ``selection_data``) but don't need - scores, ranks, or highlights. 
- - Args: - query: User's search query - user: User for permission filtering (None for superuser/no filtering) - search_mode: Query parsing mode (QUERY, TEXT, or TITLE) - limit: Maximum number of IDs to return - - Returns: - List of document IDs in relevance order - """ - self._ensure_open() - tz = get_current_timezone() - if search_mode is SearchMode.TEXT: - user_query = parse_simple_text_query(self._index, query) - elif search_mode is SearchMode.TITLE: - user_query = parse_simple_title_query(self._index, query) - else: - user_query = parse_user_query(self._index, query, tz) - - if user is not None: - permission_filter = build_permission_filter(self._schema, user) - final_query = tantivy.Query.boolean_query( - [ - (tantivy.Occur.Must, user_query), - (tantivy.Occur.Must, permission_filter), - ], - ) - else: - final_query = user_query - - searcher = self._index.searcher() - results = searcher.search(final_query, limit=limit) - - all_hits = [(hit[1], hit[0]) for hit in results.hits] - - # Normalize scores and apply threshold (same logic as search()) - if all_hits: - max_score = max(hit[1] for hit in all_hits) or 1.0 - all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits] - - threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD - if threshold is not None: - all_hits = [hit for hit in all_hits if hit[1] >= threshold] - - return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, _score in all_hits] -``` - -- [ ] **Step 4: Run tests to confirm they pass** - -```bash -uv run pytest src/documents/tests/search/test_backend.py::TestSearchIds -v -``` - -Expected: All 4 tests PASS. - -- [ ] **Step 5: Run existing backend tests to check for regressions** - -```bash -uv run pytest src/documents/tests/search/test_backend.py -v -``` - -Expected: All tests PASS. 
- -- [ ] **Step 6: Commit** - -```bash -git add src/documents/search/_backend.py src/documents/tests/search/test_backend.py -git commit -m "feat: add search_ids() lightweight ID-only query method" -``` - ---- - -### Task 3: Add `more_like_this_ids()` lightweight method - -Same pattern as Task 2, but for the more-like-this code path. - -**Files:** - -- Modify: `src/documents/search/_backend.py` (add `more_like_this_ids()` after `more_like_this()`) -- Test: `src/documents/tests/search/test_backend.py` - -- [ ] **Step 1: Write failing test** - -Add to `TestMoreLikeThis` in `src/documents/tests/search/test_backend.py`: - -```python -def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend): - """more_like_this_ids must return IDs of similar documents, excluding the original.""" - doc1 = Document.objects.create( - title="Important document", - content="financial information report", - checksum="MLTI1", - pk=150, - ) - doc2 = Document.objects.create( - title="Another document", - content="financial information report", - checksum="MLTI2", - pk=151, - ) - backend.add_or_update(doc1) - backend.add_or_update(doc2) - - ids = backend.more_like_this_ids(doc_id=150, user=None) - assert 150 not in ids - assert 151 in ids -``` - -- [ ] **Step 2: Run test to confirm it fails** - -```bash -uv run pytest src/documents/tests/search/test_backend.py::TestMoreLikeThis::test_more_like_this_ids_excludes_original -v -``` - -Expected: FAIL with `AttributeError: 'TantivyBackend' object has no attribute 'more_like_this_ids'` - -- [ ] **Step 3: Implement `more_like_this_ids()`** - -Add after `more_like_this()` in `src/documents/search/_backend.py`: - -```python -def more_like_this_ids( - self, - doc_id: int, - user: AbstractBaseUser | None, - *, - limit: int = 10000, -) -> list[int]: - """ - Return IDs of documents similar to the given document — no highlights. - - Lightweight companion to more_like_this(). The original document is - excluded from results. 
- - Args: - doc_id: Primary key of the reference document - user: User for permission filtering (None for no filtering) - limit: Maximum number of IDs to return - - Returns: - List of similar document IDs (excluding the original) - """ - self._ensure_open() - searcher = self._index.searcher() - - id_query = tantivy.Query.range_query( - self._schema, - "id", - tantivy.FieldType.Unsigned, - doc_id, - doc_id, - ) - results = searcher.search(id_query, limit=1) - - if not results.hits: - return [] - - doc_address = results.hits[0][1] - mlt_query = tantivy.Query.more_like_this_query( - doc_address, - min_doc_frequency=1, - max_doc_frequency=None, - min_term_frequency=1, - max_query_terms=12, - min_word_length=None, - max_word_length=None, - boost_factor=None, - ) - - if user is not None: - permission_filter = build_permission_filter(self._schema, user) - final_query = tantivy.Query.boolean_query( - [ - (tantivy.Occur.Must, mlt_query), - (tantivy.Occur.Must, permission_filter), - ], - ) - else: - final_query = mlt_query - - results = searcher.search(final_query, limit=limit) - - ids = [] - for _score, doc_address in results.hits: - result_doc_id = searcher.doc(doc_address).to_dict()["id"][0] - if result_doc_id != doc_id: - ids.append(result_doc_id) - return ids -``` - -- [ ] **Step 4: Run tests to confirm they pass** - -```bash -uv run pytest src/documents/tests/search/test_backend.py::TestMoreLikeThis -v -``` - -Expected: All tests PASS. - -- [ ] **Step 5: Commit** - -```bash -git add src/documents/search/_backend.py src/documents/tests/search/test_backend.py -git commit -m "feat: add more_like_this_ids() lightweight ID-only method" -``` - ---- - -### Task 4: Refactor `UnifiedSearchViewSet.list()` — delegate sorting + page-only highlights - -The core viewset refactor. Three changes: - -1. **Pass `highlight_page`/`highlight_page_size`** so only the DRF page gets highlights -2. 
**Pass `sort_field`** through to Tantivy when the field is Tantivy-sortable, eliminating the ORM re-sort query -3. **Fall back to ORM sort** only for custom fields (not in Tantivy's `sort_field_map`) - -Critical DRF compatibility note: `TantivyRelevanceList` continues to hold ALL hits. DRF's `PageNumberPagination` slices it as before. The only difference is that hits outside the displayed page have `highlights={}`. - -**Files:** - -- Modify: `src/documents/views.py:2057-2183` (`UnifiedSearchViewSet.list()`) -- Test: `src/documents/tests/test_api_search.py` - -- [ ] **Step 1: Write regression tests before refactoring** - -Add to `TestDocumentSearchApi` in `src/documents/tests/test_api_search.py`: - -```python -def test_search_with_tantivy_native_sort(self) -> None: - """When ordering by a Tantivy-sortable field, results must be correctly sorted.""" - backend = get_backend() - for i, asn in enumerate([30, 10, 20]): - doc = Document.objects.create( - title=f"sortable doc {i}", - content="searchable content", - checksum=f"TNS{i}", - archive_serial_number=asn, - ) - backend.add_or_update(doc) - - response = self.client.get( - "/api/documents/?query=searchable&ordering=archive_serial_number", - ) - self.assertEqual(response.status_code, status.HTTP_200_OK) - asns = [doc["archive_serial_number"] for doc in response.data["results"]] - self.assertEqual(asns, [10, 20, 30]) - - response = self.client.get( - "/api/documents/?query=searchable&ordering=-archive_serial_number", - ) - self.assertEqual(response.status_code, status.HTTP_200_OK) - asns = [doc["archive_serial_number"] for doc in response.data["results"]] - self.assertEqual(asns, [30, 20, 10]) - -def test_search_page_2_returns_correct_slice(self) -> None: - """Page 2 must return the second slice, not overlap with page 1.""" - backend = get_backend() - for i in range(10): - doc = Document.objects.create( - title=f"doc {i}", - content="paginated content", - checksum=f"PG2{i}", - archive_serial_number=i + 1, - ) - 
backend.add_or_update(doc) - - response = self.client.get( - "/api/documents/?query=paginated&ordering=archive_serial_number&page=1&page_size=3", - ) - page1_ids = [r["id"] for r in response.data["results"]] - self.assertEqual(len(page1_ids), 3) - - response = self.client.get( - "/api/documents/?query=paginated&ordering=archive_serial_number&page=2&page_size=3", - ) - page2_ids = [r["id"] for r in response.data["results"]] - self.assertEqual(len(page2_ids), 3) - - # No overlap between pages - self.assertEqual(set(page1_ids) & set(page2_ids), set()) - # Page 2 ASNs are higher than page 1 - page1_asns = [Document.objects.get(pk=pk).archive_serial_number for pk in page1_ids] - page2_asns = [Document.objects.get(pk=pk).archive_serial_number for pk in page2_ids] - self.assertTrue(max(page1_asns) < min(page2_asns)) - -def test_search_all_field_contains_all_ids_when_paginated(self) -> None: - """The 'all' field must contain every matching ID, even when paginated.""" - backend = get_backend() - doc_ids = [] - for i in range(10): - doc = Document.objects.create( - title=f"all field doc {i}", - content="allfield content", - checksum=f"AF{i}", - ) - backend.add_or_update(doc) - doc_ids.append(doc.pk) - - response = self.client.get( - "/api/documents/?query=allfield&page=1&page_size=3", - headers={"Accept": "application/json; version=9"}, - ) - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(len(response.data["results"]), 3) - # "all" must contain ALL 10 matching IDs - self.assertCountEqual(response.data["all"], doc_ids) -``` - -- [ ] **Step 2: Run regression tests against current code to confirm they pass** - -```bash -uv run pytest src/documents/tests/test_api_search.py::TestDocumentSearchApi::test_search_with_tantivy_native_sort src/documents/tests/test_api_search.py::TestDocumentSearchApi::test_search_page_2_returns_correct_slice 
src/documents/tests/test_api_search.py::TestDocumentSearchApi::test_search_all_field_contains_all_ids_when_paginated -v -``` - -Expected: PASS (validates current behavior before refactoring). - -- [ ] **Step 3: Refactor `UnifiedSearchViewSet.list()`** - -Replace the search section of `list()` in `src/documents/views.py` (lines 2057-2183): - -```python -def list(self, request, *args, **kwargs): - if not self._is_search_request(): - return super().list(request) - - from documents.search import SearchMode - from documents.search import TantivyRelevanceList - from documents.search import get_backend - - try: - backend = get_backend() - filtered_qs = self.filter_queryset(self.get_queryset()) - - user = None if request.user.is_superuser else request.user - active_search_params = self._get_active_search_params(request) - - if len(active_search_params) > 1: - raise ValidationError( - { - "detail": _( - "Specify only one of text, title_search, query, or more_like_id.", - ), - }, - ) - - # Parse ordering param - ordering_param = request.query_params.get("ordering", "") - sort_reverse = ordering_param.startswith("-") - sort_field_name = ordering_param.lstrip("-") if ordering_param else None - - # Fields Tantivy can sort natively (must match sort_field_map in _backend.py) - tantivy_sortable = { - "title", "correspondent__name", "document_type__name", - "created", "added", "modified", - "archive_serial_number", "page_count", "num_notes", - } - use_tantivy_sort = sort_field_name in tantivy_sortable or sort_field_name is None - - # Compute the DRF page so we can tell Tantivy which slice to highlight - try: - requested_page = int(request.query_params.get("page", 1)) - except (TypeError, ValueError): - requested_page = 1 - try: - requested_page_size = int( - request.query_params.get("page_size", self.paginator.page_size), - ) - except (TypeError, ValueError): - requested_page_size = self.paginator.page_size - - if ( - "text" in request.query_params - or "title_search" in 
request.query_params - or "query" in request.query_params - ): - if "text" in request.query_params: - search_mode = SearchMode.TEXT - query_str = request.query_params["text"] - elif "title_search" in request.query_params: - search_mode = SearchMode.TITLE - query_str = request.query_params["title_search"] - else: - search_mode = SearchMode.QUERY - query_str = request.query_params["query"] - - if use_tantivy_sort: - # Fast path: Tantivy sorts, highlights only for DRF page - results = backend.search( - query_str, - user=user, - page=1, - page_size=10000, - sort_field=sort_field_name, - sort_reverse=sort_reverse, - search_mode=search_mode, - highlight_page=requested_page, - highlight_page_size=requested_page_size, - ) - - # Intersect with ORM-visible IDs (field filters) - orm_ids = set(filtered_qs.values_list("pk", flat=True)) - ordered_hits = [h for h in results.hits if h["id"] in orm_ids] - else: - # Slow path: custom field ordering — ORM must sort - results = backend.search( - query_str, - user=user, - page=1, - page_size=10000, - sort_field=None, - sort_reverse=False, - search_mode=search_mode, - highlight_page=requested_page, - highlight_page_size=requested_page_size, - ) - hits_by_id = {h["id"]: h for h in results.hits} - hit_ids = set(hits_by_id.keys()) - orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list( - "pk", - flat=True, - ) - ordered_hits = [ - hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id - ] - else: - # more_like_id path - try: - more_like_doc_id = int(request.query_params["more_like_id"]) - more_like_doc = Document.objects.select_related("owner").get( - pk=more_like_doc_id, - ) - except (TypeError, ValueError, Document.DoesNotExist): - raise PermissionDenied(_("Invalid more_like_id")) - - if not has_perms_owner_aware( - request.user, - "view_document", - more_like_doc, - ): - raise PermissionDenied(_("Insufficient permissions.")) - - results = backend.more_like_this( - more_like_doc_id, - user=user, - page=1, - 
page_size=10000, - ) - orm_ids = set(filtered_qs.values_list("pk", flat=True)) - ordered_hits = [h for h in results.hits if h["id"] in orm_ids] - - rl = TantivyRelevanceList(ordered_hits) - page = self.paginate_queryset(rl) - - if page is not None: - serializer = self.get_serializer(page, many=True) - response = self.get_paginated_response(serializer.data) - response.data["corrected_query"] = None - if get_boolean( - str(request.query_params.get("include_selection_data", "false")), - ): - all_ids = [h["id"] for h in ordered_hits] - response.data["selection_data"] = ( - self._get_selection_data_for_queryset( - filtered_qs.filter(pk__in=all_ids), - ) - ) - return response - - serializer = self.get_serializer(ordered_hits, many=True) - return Response(serializer.data) - - except NotFound: - raise - except PermissionDenied as e: - invalid_more_like_id_message = _("Invalid more_like_id") - if str(e.detail) == str(invalid_more_like_id_message): - return HttpResponseForbidden(invalid_more_like_id_message) - return HttpResponseForbidden(_("Insufficient permissions.")) - except ValidationError: - raise - except Exception as e: - logger.warning(f"An error occurred listing search results: {e!s}") - return HttpResponseBadRequest( - "Error listing search results, check logs for more detail.", - ) -``` - -Key changes from current code: - -- **`sort_field`** is derived from the `ordering` query param and passed to Tantivy (fast path) -- **`sort_reverse`** is derived from the `-` prefix -- **`highlight_page`/`highlight_page_size`** tell Tantivy which slice to highlight -- **Slow path** (custom field ordering): still uses `sort_field=None` + ORM re-sort and still passes `highlight_page`, which caps the snippet cost at one page. Caveat: the highlight window is computed on Tantivy's result order, before the ORM re-sort, so on this path the hits DRF displays are not guaranteed to be the ones that received highlights; the same applies on the fast path whenever the ORM intersection drops hits ahead of the requested page -- **DRF compatibility**: `TantivyRelevanceList` always contains ALL hits. `__len__()` returns the correct total. DRF slices as usual.
-- **`all` field**: unchanged — `get_all_result_ids()` still extracts IDs from the full hit list -- **`selection_data`**: unchanged — still uses `ordered_hits` for all IDs - -- [ ] **Step 4: Run the new tests** - -```bash -uv run pytest src/documents/tests/test_api_search.py::TestDocumentSearchApi::test_search_with_tantivy_native_sort src/documents/tests/test_api_search.py::TestDocumentSearchApi::test_search_page_2_returns_correct_slice src/documents/tests/test_api_search.py::TestDocumentSearchApi::test_search_all_field_contains_all_ids_when_paginated -v -``` - -Expected: PASS - -- [ ] **Step 5: Run ALL existing search tests to check for regressions** - -```bash -uv run pytest src/documents/tests/test_api_search.py src/documents/tests/search/test_backend.py -v -``` - -Expected: All tests PASS. Watch especially for: - -- `test_search_multi_page` — pagination correctness across 6 pages -- `test_search_custom_field_ordering` — custom field sort still uses ORM fallback -- `test_search_returns_all_for_api_version_9` — `all` field still works -- `test_search_with_include_selection_data` — selection data still works -- `test_search_invalid_page` — 404 on out-of-bounds pages - -- [ ] **Step 6: Commit** - -```bash -git add src/documents/views.py src/documents/tests/test_api_search.py -git commit -m "feat: delegate sorting to Tantivy and use page-only highlights in viewset" -``` - ---- - -### Task 5: Post-implementation profiling and comparison - -Run the same profiling tests from Task 0 against the new implementation and compare results. 
- -**Files:** - -- Modify: `src/documents/tests/test_search_profiling.py` (update labels from BEFORE to AFTER) -- Read: `docs/superpowers/plans/profiling-baseline.txt` - -- [ ] **Step 1: Update profiling test labels** - -In `src/documents/tests/test_search_profiling.py`, rename the class and update all `profile_block()` labels from `"BEFORE —"` to `"AFTER —"`: - -```python -class TestSearchProfilingAfter(DirectoriesMixin): - """Post-implementation profiling of the IMPROVED search implementation. - - Compare output with profiling-baseline.txt. - """ -``` - -Change every `profile_block("BEFORE —` to `profile_block("AFTER —` throughout the file. - -Also add a new test for the `search_ids()` method: - -```python -def test_profile_backend_search_ids(self): - """Profile: raw backend.search_ids() call — lightweight ID-only path.""" - backend = get_backend() - with profile_block("AFTER — backend.search_ids()"): - ids = backend.search_ids( - "profiling", - user=None, - ) - assert len(ids) == DOC_COUNT -``` - -- [ ] **Step 2: Run the profiling tests and record output** - -```bash -cd /home/trenton/Documents/projects/paperless-ngx -uv run pytest src/documents/tests/test_search_profiling.py -v -s 2>&1 | tee docs/superpowers/plans/profiling-after.txt -``` - -- [ ] **Step 3: Compare results** - -```bash -diff docs/superpowers/plans/profiling-baseline.txt docs/superpowers/plans/profiling-after.txt -``` - -Expected improvements: - -- **Relevance search**: Fewer snippet generations (25 vs 200), lower memory delta -- **Sorted search**: Fewer DB queries (Tantivy sorts instead of ORM), lower wall time -- **Paginated search**: Only page 2's 25 results get highlights instead of all 200 -- **Backend search**: Direct comparison of highlight-all vs highlight-page - -- [x] **Step 4: Record comparison in the plan** - -Profiling results (200-document test corpus): - -| Scenario | Metric | Before | After | Improvement | -| ---------------------- | ------------ | ---------- | ---------- | 
----------- | -| Relevance search | Wall time | 0.962s | 0.917s | -5% | -| Relevance search | Queries | 33 | 33 | same | -| Relevance search | Memory delta | 16,557 KiB | 16,478 KiB | -0.5% | -| Sorted search | Wall time | 0.132s | 0.138s | ~same | -| Sorted search | Queries | 32 | 32 | same | -| Sorted search | Memory delta | 881 KiB | 792 KiB | -10% | -| Paginated search | Wall time | 0.140s | 0.132s | -6% | -| Paginated search | Memory delta | 868 KiB | 788 KiB | -9% | -| Selection data | Wall time | 0.166s | 0.157s | -5% | -| Selection data | Memory delta | 927 KiB | 837 KiB | -10% | -| Backend 10k highlights | Wall time | 0.018s | 0.019s | same | -| Backend 10k highlights | Memory delta | 89 KiB | 89 KiB | same | -| Backend 25 highlights | Wall time | 0.007s | 0.005s | -29% | -| Backend 25 highlights | Memory delta | 5.9 KiB | 5.9 KiB | same | - -Notes: Relevance search is dominated by first-request import overhead (~16 MiB). -Memory savings scale with document count. The 10,000 hardcoded limit has been -removed entirely; search_ids() now returns all matches. - -- [x] **Step 5: Commit** — Done (profiling data saved to `docs/superpowers/plans/profiling-after-option1.txt`) - -- [x] **Step 6: Clean up profiling artifacts** — Done (removed `profiling.py` and `test_search_profiling.py`) - ---- - -## Post-Implementation Notes - -### What these changes accomplish - -- **Task 1**: `search()` accepts `highlight_page`/`highlight_page_size` for backward compatibility. -- **Task 2-3**: `search_ids()` and `more_like_this_ids()` provide lightweight ID-only paths with no arbitrary cap. -- **Task 4**: Viewset passes `sort_field` through to Tantivy for natively-sortable fields, eliminating the ORM re-sort query. -- **Option 1 refactor** (post-plan): Replaced the `page_size=10000` overfetch entirely. The viewset now calls `search_ids()` for the full ID set (ints only, no cap), intersects with ORM, then calls `highlight_hits()` for just the displayed page (~25 docs). 
`TantivyRelevanceList` holds ordered IDs for count/selection_data and a small page of rich `SearchHit` dicts for serialization. -- **Code review fixes**: `_parse_query()` and `_apply_permission_filter()` helpers extracted to deduplicate 3+4 call sites. `SORT_FIELD_MAP`/`SORTABLE_FIELDS` promoted to class constants. `__getitem__` handles int keys. Empty ordering param handled correctly. - -### DRF compatibility preserved - -| Concern | Status | -| ----------------------------------------- | ------------------------------------------------------------------ | -| `TantivyRelevanceList.__len__()` | Returns `len(self._ordered_ids)` — ALL matching IDs, correct count | -| `TantivyRelevanceList.__getitem__(slice)` | Returns pre-fetched page_hits when aligned, stubs otherwise | -| `TantivyRelevanceList.__getitem__(int)` | Returns single SearchHit (from page_hits or stub) | -| `get_all_result_ids()` | Returns `ordered_ids` directly — no dict iteration | -| `count` in response | Correct — reflects all matching documents after ORM filtering | -| `next`/`previous` links | Correct — DRF computes from accurate count | -| Page N requests | Correct — DRF slices, gets pre-fetched page hits | - -### Performance impact - -| Operation | Before | After | -| ---------------------------------------- | ---------------------- | ------------------------------------------------- | -| Snippet generations per search | Up to 10,000 | ~25 (page size) via `highlight_hits()` | -| `searcher.doc()` calls for IDs | Up to 10,000 | All matches via `search_ids()` (ints, not dicts) | -| `searcher.doc()` calls for highlights | Up to 10,000 | ~25 via `highlight_hits()` (N individual lookups) | -| ORM sort query (Tantivy-sortable fields) | Always | Never (Tantivy sorts via `search_ids()`) | -| ORM sort query (custom fields) | Always | Still always (fallback) | -| Tantivy searches per request | 1 | 1 + N (`search_ids` once, plus one ID lookup per page document in `highlight_hits`; N ≈ page size, ~25) | -| Hardcoded result cap | 10,000 | None (`searcher.num_docs`) | -| Memory
per result (non-page hits) | ~100 bytes (SearchHit) | ~28 bytes (int) | - -### Known limitations - -- **`highlight_hits()` does N individual ID lookups**: tantivy-py does not expose a batch doc-address-by-ID API, so each page doc requires a separate `searcher.search(id_query, limit=1)`. Acceptable for page-sized batches (~25) but should not be called with thousands of IDs. -- **Text-based sort fields fall back to ORM**: `title`, `correspondent__name`, `document_type__name` produce different ordering in Tantivy (tokenized) vs ORM (collation), so they use the ORM sort path. - -### What's NOT in this plan (future work) - -- **Push ORM filters into Tantivy queries**: Would eliminate the ORM intersection (`filtered_qs.values_list`). High effort (~30 filter expressions to translate), deferred. Assessed as weeks of work. -- **Tantivy fast-field ID extraction**: `searcher.doc()` loads the full stored document to get the ID. Tantivy's fast fields could provide IDs without loading stored docs. Depends on tantivy-py API support. -- **Batch doc-address lookup**: Would eliminate the N individual lookups in `highlight_hits()`. Requires tantivy-py API changes or a workaround using term_set_query. 
diff --git a/docs/superpowers/plans/profiling-after-option1.txt b/docs/superpowers/plans/profiling-after-option1.txt deleted file mode 100644 index 033e94eac..000000000 --- a/docs/superpowers/plans/profiling-after-option1.txt +++ /dev/null @@ -1,78 +0,0 @@ -Profiling data after Option 1: search_ids + page-only highlight_hits -===================================================================== -Run date: 2026-04-05 -Commit: 610ba2789 (feat: replace 10000 overfetch with search_ids + page-only highlights) -Test corpus: 200 documents - -============================================================ - Profile: relevance search (no ordering) -============================================================ - Wall time: 0.9167s - Queries: 33 (0.0000s) - Memory delta: 16477.8 KiB - Peak memory: 16504.5 KiB - -============================================================ - Profile: sorted search (ordering=created) -============================================================ - Wall time: 0.1378s - Queries: 32 (0.0000s) - Memory delta: 792.1 KiB - Peak memory: 818.9 KiB - -============================================================ - Profile: paginated search (page=2, page_size=25) -============================================================ - Wall time: 0.1322s - Queries: 32 (0.0000s) - Memory delta: 788.3 KiB - Peak memory: 815.2 KiB - -============================================================ - Profile: search with selection_data -============================================================ - Wall time: 0.1570s - Queries: 37 (0.0010s) - Memory delta: 837.3 KiB - Peak memory: 981.2 KiB - -============================================================ - Profile: backend.search(page_size=10000, all highlights) -============================================================ - Wall time: 0.0193s - Queries: 0 (0.0000s) - Memory delta: 88.6 KiB - Peak memory: 100.2 KiB - -============================================================ - Profile: backend.search(page_size=25) 
-============================================================ - Wall time: 0.0046s - Queries: 0 (0.0000s) - Memory delta: 5.9 KiB - Peak memory: 11.1 KiB - - -Comparison summary (200 docs): -============================== - -| Scenario | Baseline | After Option 1 | Change | -|---------------------------|-----------|----------------|--------------| -| Relevance — wall | 0.962s | 0.917s | -5% | -| Relevance — memory | 16557 KiB | 16478 KiB | -0.5% | -| Sorted — wall | 0.132s | 0.138s | ~same | -| Sorted — memory | 881 KiB | 792 KiB | -10% | -| Paginated — wall | 0.140s | 0.132s | -6% | -| Paginated — memory | 868 KiB | 788 KiB | -9% | -| Selection data — wall | 0.166s | 0.157s | -5% | -| Selection data — memory | 927 KiB | 837 KiB | -10% | -| Backend 10k — wall | 0.018s | 0.019s | same | -| Backend 10k — memory | 89 KiB | 89 KiB | same | -| Backend 25 — wall | 0.007s | 0.005s | -29% | -| Backend 25 — memory | 5.9 KiB | 5.9 KiB | same | - -Notes: -- The 10000 hardcoded limit has been removed; search_ids() now returns all matches. -- Relevance search is dominated by first-request import overhead (~16 MiB). -- Memory savings will scale with document count (ints vs SearchHit dicts). -- Backend-only 10k test is unchanged because it still calls search() directly. 
diff --git a/docs/superpowers/plans/profiling-baseline.txt b/docs/superpowers/plans/profiling-baseline.txt deleted file mode 100644 index 07a66a4b5..000000000 --- a/docs/superpowers/plans/profiling-baseline.txt +++ /dev/null @@ -1,121 +0,0 @@ -============================= test session starts ============================== -platform linux -- Python 3.14.3, pytest-9.0.2, pluggy-1.6.0 -- /home/trenton/Documents/projects/paperless-ngx/.venv/bin/python -cachedir: .pytest_cache -django: version: 5.2.12, settings: paperless.settings (from ini) -rootdir: /home/trenton/Documents/projects/paperless-ngx -configfile: pyproject.toml -plugins: sugar-1.1.1, xdist-3.8.0, cov-7.0.0, httpx-0.36.0, django-4.12.0, Faker-40.8.0, env-1.5.0, time-machine-3.2.0, mock-3.15.1, anyio-4.12.1, rerunfailures-16.1 -collecting ... collected 6 items - -src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_relevance_search Creating test database for alias 'default'... - -============================================================ - Profile: BEFORE — relevance search (no ordering) -============================================================ - Wall time: 0.9622s - Queries: 33 (0.0000s) - Memory delta: 16557.2 KiB - Peak memory: 16584.0 KiB - - Top 5 allocations: - :511: size=5480 KiB (+5480 KiB), count=45642 (+45642), average=123 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/fido2/rpid.py:47: size=518 KiB (+518 KiB), count=9769 (+9769), average=54 B - :106: size=432 KiB (+432 KiB), count=1480 (+1480), average=299 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/langdetect/utils/ngram.py:257: size=391 KiB (+391 KiB), count=6667 (+6667), average=60 B - :491: size=284 KiB (+284 KiB), count=2543 (+2543), average=114 B -============================================================ - -PASSED -src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_sorted_search 
-============================================================ - Profile: BEFORE — sorted search (ordering=created) -============================================================ - Wall time: 0.1320s - Queries: 32 (0.0010s) - Memory delta: 880.8 KiB - Peak memory: 906.8 KiB - - Top 5 allocations: - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=50.1 KiB (+50.1 KiB), count=521 (+521), average=99 B - /home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/copyreg.py:104: size=49.7 KiB (+49.7 KiB), count=315 (+315), average=162 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/models/sql/query.py:386: size=38.0 KiB (+38.0 KiB), count=160 (+160), average=243 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filterset.py:209: size=32.0 KiB (+32.0 KiB), count=82 (+82), average=400 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filters.py:158: size=21.4 KiB (+21.4 KiB), count=104 (+104), average=210 B -============================================================ - -PASSED -src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_paginated_search -============================================================ - Profile: BEFORE — paginated search (page=2, page_size=25) -============================================================ - Wall time: 0.1395s - Queries: 32 (0.0000s) - Memory delta: 868.1 KiB - Peak memory: 893.5 KiB - - Top 5 allocations: - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=50.1 KiB (+50.1 KiB), count=521 (+521), average=99 B - /home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/copyreg.py:104: size=49.2 KiB (+49.2 KiB), count=315 (+315), average=160 B - 
/home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/models/sql/query.py:386: size=38.1 KiB (+38.1 KiB), count=161 (+161), average=242 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filterset.py:209: size=32.0 KiB (+32.0 KiB), count=82 (+82), average=400 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filters.py:158: size=21.3 KiB (+21.3 KiB), count=104 (+104), average=209 B -============================================================ - -PASSED -src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_search_with_selection_data -============================================================ - Profile: BEFORE — search with selection_data -============================================================ - Wall time: 0.1656s - Queries: 37 (0.0020s) - Memory delta: 926.9 KiB - Peak memory: 1084.3 KiB - - Top 5 allocations: - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=50.1 KiB (+50.1 KiB), count=521 (+521), average=99 B - /home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/copyreg.py:104: size=49.6 KiB (+49.6 KiB), count=327 (+327), average=155 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/models/sql/query.py:386: size=38.1 KiB (+38.1 KiB), count=161 (+161), average=242 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django_filters/filterset.py:209: size=32.0 KiB (+32.0 KiB), count=82 (+82), average=400 B - /home/trenton/Documents/projects/paperless-ngx/.venv/lib/python3.14/site-packages/django/db/backends/sqlite3/operations.py:193: size=27.1 KiB (+27.1 KiB), count=37 (+37), average=751 B -============================================================ - -PASSED 
-src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_backend_search_only -============================================================ - Profile: BEFORE — backend.search(page_size=10000, all highlights) -============================================================ - Wall time: 0.0175s - Queries: 0 (0.0000s) - Memory delta: 88.6 KiB - Peak memory: 100.3 KiB - - Top 5 allocations: - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=51.2 KiB (+51.2 KiB), count=530 (+530), average=99 B - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:557: size=17.8 KiB (+17.8 KiB), count=200 (+200), average=91 B - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:542: size=8576 B (+8576 B), count=134 (+134), average=64 B - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:522: size=4800 B (+4800 B), count=200 (+200), average=24 B - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:515: size=2376 B (+2376 B), count=99 (+99), average=24 B -============================================================ - -PASSED -src/documents/tests/test_search_profiling.py::TestSearchProfilingBaseline::test_profile_backend_search_single_page -============================================================ - Profile: BEFORE — backend.search(page_size=25) -============================================================ - Wall time: 0.0070s - Queries: 0 (0.0000s) - Memory delta: 5.9 KiB - Peak memory: 11.3 KiB - - Top 5 allocations: - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:557: size=2275 B (+2275 B), count=25 (+25), average=91 B - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:575: size=1600 B (+1600 B), count=25 (+25), average=64 B - /home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/weakref.py:73: size=1280 B 
(+1280 B), count=20 (+20), average=64 B - /home/trenton/Documents/projects/paperless-ngx/src/documents/search/_backend.py:574: size=256 B (+256 B), count=1 (+1), average=256 B - /home/trenton/.local/share/uv/python/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/tracemalloc.py:560: size=240 B (+240 B), count=1 (+1), average=240 B -============================================================ - -PASSEDDestroying test database for alias 'default'... - - -======================== 6 passed in 241.83s (0:04:01) =========================