Stores profiling stuff for later

Using a draft release of tantivy-py, fixes up all the TODO locations with the new API calls
Oops, it should be dark
2026-04-19 22:39:27 +00:00 · 2026-04-19 14:10:11 -07:00 · 2026-04-19 13:14:17 -07:00 · 2026-04-18 16:02:09 -07:00
10 changed files with 494 additions and 165 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,7 +74,7 @@ dependencies = [
  "scikit-learn~=1.8.0",
  "sentence-transformers>=4.1",
  "setproctitle~=1.3.4",
-  "tantivy>=0.25.1",
+  "tantivy @ git+https://github.com/quickwit-oss/tantivy-py.git",
  "tika-client~=0.11.0",
  "torch~=2.11.0",
  "watchfiles>=1.1.1",
--- a/src-ui/src/app/components/common/dates-dropdown/dates-dropdown.component.html
+++ b/src-ui/src/app/components/common/dates-dropdown/dates-dropdown.component.html
@@ -86,7 +86,7 @@
        <div class="selected-icon">
          @if (addedRelativeDate) {
            <a class="text-light focus-variants" href="javascript:void(0)" (click)="clearAddedRelativeDate()">
-              <i-bs width="1em" height="1em" name="check" class="variant-unfocused text-primary"></i-bs>
+              <i-bs width="1em" height="1em" name="check" class="variant-unfocused text-dark"></i-bs>
              <i-bs width="1em" height="1em" name="x" class="variant-focused text-primary"></i-bs>
            </a>
          }
--- a/src/documents/filters.py
+++ b/src/documents/filters.py
@@ -514,8 +514,9 @@ class CustomFieldQueryParser:
        value_field_name = CustomFieldInstance.get_value_field_name(
            custom_field.data_type,
        )
-        if custom_field.data_type == CustomField.FieldDataType.MONETARY and (
-            op in self.EXPR_BY_CATEGORY["arithmetic"] or op in {"exact", "in"}
+        if (
+            custom_field.data_type == CustomField.FieldDataType.MONETARY
+            and op in self.EXPR_BY_CATEGORY["arithmetic"]
        ):
            value_field_name = "value_monetary_amount"
        has_field = Q(custom_fields__field=custom_field)
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -221,24 +221,9 @@ class WriteBatch:
        self._writer.add_document(doc)

    def remove(self, doc_id: int) -> None:
-        """
-        Remove a document from the batch by its primary key.
-
-        Uses range_query instead of term_query to work around a tantivy-py bug
-        where Python integers are inferred as i64, producing Terms that never
-        match u64 fields.
-
-        TODO: Replace with term_query("id", doc_id) once
-        https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
-        """
+        """Remove a document from the batch by its primary key."""
        self._writer.delete_documents_by_query(
-            tantivy.Query.range_query(
-                self._backend._schema,
-                "id",
-                tantivy.FieldType.Unsigned,
-                doc_id,
-                doc_id,
-            ),
+            tantivy.Query.term_query(self._backend._schema, "id", doc_id),
        )


@@ -525,15 +510,6 @@ class TantivyBackend:
        Use this when you already know which documents to display (from
        search_ids + ORM filtering) and just need highlight data.

-        Note: Each doc_id requires an individual index lookup because tantivy-py
-        does not yet expose a batch fast-field read API. This is acceptable for
-        page-sized batches (typically 25 docs) but should not be called with
-        thousands of IDs.
-
-        TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands,
-        the per-doc range_query lookups here can be replaced with a single
-        collect_u64_fast_field("id", doc_addresses) call.
-
        Args:
            query: The search query (used for snippet generation)
            doc_ids: Ordered list of document IDs to generate hits for
@@ -567,32 +543,42 @@ class TantivyBackend:
            notes_text_query = user_query

        searcher = self._index.searcher()
+
+        # Fetch all requested docs in a single search: user_query MUST match
+        # and the doc ID MUST be one of the requested IDs (OR of term_queries).
+        id_filter = tantivy.Query.boolean_query(
+            [
+                (
+                    tantivy.Occur.Should,
+                    tantivy.Query.term_query(self._schema, "id", did),
+                )
+                for did in doc_ids
+            ],
+        )
+        batch_query = tantivy.Query.boolean_query(
+            [
+                (tantivy.Occur.Must, user_query),
+                (tantivy.Occur.Must, id_filter),
+            ],
+        )
+        batch_results = searcher.search(batch_query, limit=len(doc_ids))
+
+        result_addrs = [addr for _score, addr in batch_results.hits]
+        result_ids = searcher.fast_field_values("id", result_addrs)
+        addr_by_id: dict[int, tuple[float, tantivy.DocAddress]] = {
+            doc_id: (score, addr)
+            for (score, addr), doc_id in zip(batch_results.hits, result_ids)
+        }
+
        snippet_generator = None
        notes_snippet_generator = None
        hits: list[SearchHit] = []

        for rank, doc_id in enumerate(doc_ids, start=rank_start):
-            # Look up document by ID, scoring against the user query so that
-            # the returned SearchHit carries a real BM25 relevance score.
-            id_query = tantivy.Query.range_query(
-                self._schema,
-                "id",
-                tantivy.FieldType.Unsigned,
-                doc_id,
-                doc_id,
-            )
-            scored_query = tantivy.Query.boolean_query(
-                [
-                    (tantivy.Occur.Must, user_query),
-                    (tantivy.Occur.Must, id_query),
-                ],
-            )
-            results = searcher.search(scored_query, limit=1)
-
-            if not results.hits:
+            if doc_id not in addr_by_id:
                continue

-            score, doc_address = results.hits[0]
+            score, doc_address = addr_by_id[doc_id]
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()

@@ -697,10 +683,7 @@ class TantivyBackend:
            if threshold is not None:
                all_hits = [hit for hit in all_hits if hit[1] >= threshold]

-        # TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once
-        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates
-        # one stored-doc fetch per result (~80% reduction in search_ids latency).
-        return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
+        return searcher.fast_field_values("id", [doc_addr for doc_addr, *_ in all_hits])

    def autocomplete(
        self,
@@ -817,13 +800,7 @@ class TantivyBackend:
        self._ensure_open()
        searcher = self._index.searcher()

-        id_query = tantivy.Query.range_query(
-            self._schema,
-            "id",
-            tantivy.FieldType.Unsigned,
-            doc_id,
-            doc_id,
-        )
+        id_query = tantivy.Query.term_query(self._schema, "id", doc_id)
        results = searcher.search(id_query, limit=1)

        if not results.hits:
@@ -847,14 +824,9 @@ class TantivyBackend:
        # Fetch one extra to account for excluding the original document
        results = searcher.search(final_query, limit=effective_limit + 1)

-        # TODO: Replace with collect_u64_fast_field("id", addrs) once
-        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands.
-        ids = []
-        for _score, doc_address in results.hits:
-            result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
-            if result_doc_id != doc_id:
-                ids.append(result_doc_id)
-
+        addrs = [addr for _score, addr in results.hits]
+        all_ids = searcher.fast_field_values("id", addrs)
+        ids = [rid for rid in all_ids if rid != doc_id]
        return ids[:limit] if limit is not None else ids

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -372,9 +372,6 @@ def normalize_query(query: str) -> str:
        raise ValueError("Query too complex to process (normalization timed out)")


-_MAX_U64 = 2**64 - 1  # u64 max — used as inclusive upper bound for "any owner" range
-
-
 def build_permission_filter(
    schema: tantivy.Schema,
    user: AbstractBaseUser,
@@ -394,48 +391,16 @@ def build_permission_filter(

    Returns:
        Tantivy query that filters results to visible documents
-
-    Implementation Notes:
-        - Uses range_query instead of term_query for owner_id/viewer_id to work
-          around a tantivy-py bug where Python ints are inferred as i64, causing
-          term_query to return no hits on u64 fields.
-          TODO: Replace with term_query once
-          https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
-
-        - Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check
-          because exists_query is not yet available in tantivy-py 0.25.
-          TODO: Replace with exists_query("owner_id") once that is exposed in
-          a tantivy-py release.
-
-        - Uses disjunction_max_query to combine permission clauses with OR logic
    """
-    owner_any = tantivy.Query.range_query(
-        schema,
-        "owner_id",
-        tantivy.FieldType.Unsigned,
-        1,
-        _MAX_U64,
-    )
+    owner_any = tantivy.Query.exists_query("owner_id")
    no_owner = tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Must, tantivy.Query.all_query()),
            (tantivy.Occur.MustNot, owner_any),
        ],
    )
-    owned = tantivy.Query.range_query(
-        schema,
-        "owner_id",
-        tantivy.FieldType.Unsigned,
-        user.pk,
-        user.pk,
-    )
-    shared = tantivy.Query.range_query(
-        schema,
-        "viewer_id",
-        tantivy.FieldType.Unsigned,
-        user.pk,
-        user.pk,
-    )
+    owned = tantivy.Query.term_query(schema, "owner_id", user.pk)
+    shared = tantivy.Query.term_query(schema, "viewer_id", user.pk)
    return tantivy.Query.disjunction_max_query([no_owner, owned, shared])


--- a/src/documents/tests/test_admin.py
+++ b/src/documents/tests/test_admin.py
@@ -24,13 +24,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase):
        backend = get_backend()
        searcher = backend._index.searcher()
        results = searcher.search(
-            tantivy.Query.range_query(
-                backend._schema,
-                "id",
-                tantivy.FieldType.Unsigned,
-                doc.pk,
-                doc.pk,
-            ),
+            tantivy.Query.term_query(backend._schema, "id", doc.pk),
            limit=1,
        )
        if results.hits:
--- a/src/documents/tests/test_api_filter_by_custom_fields.py
+++ b/src/documents/tests/test_api_filter_by_custom_fields.py
@@ -453,32 +453,6 @@ class TestCustomFieldsSearch(DirectoriesMixin, APITestCase):
            ),
        )

-    def test_exact_monetary(self) -> None:
-        # "exact" should match by numeric amount, ignoring currency code prefix.
-        self._assert_query_match_predicate(
-            ["monetary_field", "exact", "100"],
-            lambda document: (
-                "monetary_field" in document
-                and document["monetary_field"] == "USD100.00"
-            ),
-        )
-        self._assert_query_match_predicate(
-            ["monetary_field", "exact", "101"],
-            lambda document: (
-                "monetary_field" in document and document["monetary_field"] == "101.00"
-            ),
-        )
-
-    def test_in_monetary(self) -> None:
-        # "in" should match by numeric amount, ignoring currency code prefix.
-        self._assert_query_match_predicate(
-            ["monetary_field", "in", ["100", "50"]],
-            lambda document: (
-                "monetary_field" in document
-                and document["monetary_field"] in {"USD100.00", "EUR50.00"}
-            ),
-        )
-
    # ==========================================================#
    # Subset check (document link field only)                   #
    # ==========================================================#
--- a/src/documents/tests/test_search_profiling.py
+++ b/src/documents/tests/test_search_profiling.py
@@ -0,0 +1,273 @@
+"""
+Search performance profiling tests.
+
+Run explicitly — excluded from the normal test suite:
+
+    uv run pytest -m profiling -s -p no:xdist --override-ini="addopts=" -v
+
+The ``-s`` flag is required to see profile_block() output.
+The ``-p no:xdist`` flag disables parallel execution for accurate measurements.
+
+Corpus: 5 000 documents generated deterministically from a fixed Faker seed,
+with realistic variety: 30 correspondents, 15 document types, 50 tags, ~500
+notes spread across ~10 % of documents.
+"""
+
+from __future__ import annotations
+
+import random
+
+import pytest
+from django.contrib.auth.models import User
+from faker import Faker
+from rest_framework.test import APIClient
+
+from documents.models import Correspondent
+from documents.models import Document
+from documents.models import DocumentType
+from documents.models import Note
+from documents.models import Tag
+from documents.search import get_backend
+from documents.search import reset_backend
+from documents.search._backend import SearchMode
+from profiling import profile_block
+
+pytestmark = [pytest.mark.profiling, pytest.mark.search, pytest.mark.django_db]
+
+# ---------------------------------------------------------------------------
+# Corpus parameters
+# ---------------------------------------------------------------------------
+
+DOC_COUNT = 5_000
+SEED = 42
+NUM_CORRESPONDENTS = 30
+NUM_DOC_TYPES = 15
+NUM_TAGS = 50
+NOTE_FRACTION = 0.10  # ~500 documents get a note
+PAGE_SIZE = 25
+
+
+def _build_corpus(rng: random.Random, fake: Faker) -> None:
+    """
+    Insert the full corpus into the database and index it.
+
+    Uses bulk_create for the Document rows (fast) then handles the M2M tag
+    relationships and notes individually.  Indexes the full corpus with a
+    single backend.rebuild() call.
+    """
+    import datetime
+
+    # ---- lookup objects -------------------------------------------------
+    correspondents = [
+        Correspondent.objects.create(name=f"profcorp-{i}-{fake.company()}"[:128])
+        for i in range(NUM_CORRESPONDENTS)
+    ]
+    doc_types = [
+        DocumentType.objects.create(name=f"proftype-{i}-{fake.word()}"[:128])
+        for i in range(NUM_DOC_TYPES)
+    ]
+    tags = [
+        Tag.objects.create(name=f"proftag-{i}-{fake.word()}"[:100])
+        for i in range(NUM_TAGS)
+    ]
+    note_user = User.objects.create_user(username="profnoteuser", password="x")
+
+    # ---- bulk-create documents ------------------------------------------
+    base_date = datetime.date(2018, 1, 1)
+    raw_docs = []
+    for i in range(DOC_COUNT):
+        day_offset = rng.randint(0, 6 * 365)
+        created = base_date + datetime.timedelta(days=day_offset)
+        raw_docs.append(
+            Document(
+                title=fake.sentence(nb_words=rng.randint(3, 9)).rstrip("."),
+                content="\n\n".join(
+                    fake.paragraph(nb_sentences=rng.randint(3, 7))
+                    for _ in range(rng.randint(2, 5))
+                ),
+                checksum=f"PROF{i:07d}",
+                correspondent=rng.choice(correspondents + [None] * 8),
+                document_type=rng.choice(doc_types + [None] * 4),
+                created=created,
+            ),
+        )
+    documents = Document.objects.bulk_create(raw_docs)
+
+    # ---- tags (M2M, post-bulk) ------------------------------------------
+    for doc in documents:
+        k = rng.randint(0, 5)
+        if k:
+            doc.tags.add(*rng.sample(tags, k))
+
+    # ---- notes on ~10 % of docs -----------------------------------------
+    note_docs = rng.sample(documents, int(DOC_COUNT * NOTE_FRACTION))
+    for doc in note_docs:
+        Note.objects.create(
+            document=doc,
+            note=fake.sentence(nb_words=rng.randint(6, 20)),
+            user=note_user,
+        )
+
+    # ---- build Tantivy index --------------------------------------------
+    backend = get_backend()
+    qs = Document.objects.select_related(
+        "correspondent",
+        "document_type",
+        "storage_path",
+        "owner",
+    ).prefetch_related("tags", "notes__user", "custom_fields__field")
+    backend.rebuild(qs)
+
+
+class TestSearchProfiling:
+    """
+    Performance profiling for the Tantivy search backend and DRF API layer.
+
+    Each test builds a fresh 5 000-document corpus, exercises one hot path,
+    and prints profile_block() measurements to stdout.  Only basic sanity
+    checks are asserted — the goal is to surface hot spots and track regressions.
+    """
+
+    @pytest.fixture(autouse=True)
+    def _setup(self, tmp_path, settings):
+        index_dir = tmp_path / "index"
+        index_dir.mkdir()
+        settings.INDEX_DIR = index_dir
+
+        reset_backend()
+        rng = random.Random(SEED)
+        fake = Faker()
+        Faker.seed(SEED)
+
+        self.user = User.objects.create_superuser(
+            username="profiler",
+            password="admin",
+        )
+        self.client = APIClient()
+        self.client.force_authenticate(user=self.user)
+
+        _build_corpus(rng, fake)
+        yield
+        reset_backend()
+
+    # -- 1. Backend: search_ids relevance ---------------------------------
+
+    def test_profile_search_ids_relevance(self):
+        """Profile: search_ids() with relevance ordering across several queries."""
+        backend = get_backend()
+        queries = [
+            "invoice payment",
+            "annual report",
+            "bank statement",
+            "contract agreement",
+            "receipt",
+        ]
+        with profile_block(f"search_ids — relevance ({len(queries)} queries)"):
+            for q in queries:
+                backend.search_ids(q, user=None)
+
+    # -- 2. Backend: search_ids with Tantivy-native sort ------------------
+
+    def test_profile_search_ids_sorted(self):
+        """Profile: search_ids() sorted by a Tantivy fast field (created)."""
+        backend = get_backend()
+        with profile_block("search_ids — sorted by created (asc + desc)"):
+            backend.search_ids(
+                "the",
+                user=None,
+                sort_field="created",
+                sort_reverse=False,
+            )
+            backend.search_ids(
+                "the",
+                user=None,
+                sort_field="created",
+                sort_reverse=True,
+            )
+
+    # -- 3. Backend: highlight_hits for a page of 25 ----------------------
+
+    def test_profile_highlight_hits(self):
+        """Profile: highlight_hits() for a 25-document page."""
+        backend = get_backend()
+        all_ids = backend.search_ids("report", user=None)
+        page_ids = all_ids[:PAGE_SIZE]
+        with profile_block(f"highlight_hits — {len(page_ids)} docs"):
+            backend.highlight_hits("report", page_ids)
+
+    # -- 4. Backend: autocomplete -----------------------------------------
+
+    def test_profile_autocomplete(self):
+        """Profile: autocomplete() with eight common prefixes."""
+        backend = get_backend()
+        prefixes = ["inv", "pay", "con", "rep", "sta", "acc", "doc", "fin"]
+        with profile_block(f"autocomplete — {len(prefixes)} prefixes"):
+            for prefix in prefixes:
+                backend.autocomplete(prefix, limit=10)
+
+    # -- 5. Backend: simple-mode search (TEXT and TITLE) ------------------
+
+    def test_profile_search_ids_simple_modes(self):
+        """Profile: search_ids() in TEXT and TITLE simple-search modes."""
+        backend = get_backend()
+        queries = ["invoice 2023", "annual report", "bank statement"]
+        with profile_block(
+            f"search_ids — TEXT + TITLE modes ({len(queries)} queries each)",
+        ):
+            for q in queries:
+                backend.search_ids(q, user=None, search_mode=SearchMode.TEXT)
+                backend.search_ids(q, user=None, search_mode=SearchMode.TITLE)
+
+    # -- 6. API: full round-trip, relevance + page 1 ----------------------
+
+    def test_profile_api_relevance_search(self):
+        """Profile: full API search round-trip, relevance order, page 1."""
+        with profile_block(
+            f"API /documents/?query=… relevance (page 1, page_size={PAGE_SIZE})",
+        ):
+            response = self.client.get(
+                f"/api/documents/?query=invoice+payment&page=1&page_size={PAGE_SIZE}",
+            )
+        assert response.status_code == 200
+
+    # -- 7. API: full round-trip, ORM-ordered (title) ---------------------
+
+    def test_profile_api_orm_sorted_search(self):
+        """Profile: full API search round-trip with ORM-delegated sort (title)."""
+        with profile_block("API /documents/?query=…&ordering=title"):
+            response = self.client.get(
+                f"/api/documents/?query=report&ordering=title&page=1&page_size={PAGE_SIZE}",
+            )
+        assert response.status_code == 200
+
+    # -- 8. API: full round-trip, score sort ------------------------------
+
+    def test_profile_api_score_sort(self):
+        """Profile: full API search with ordering=-score (relevance, preserve order)."""
+        with profile_block("API /documents/?query=…&ordering=-score"):
+            response = self.client.get(
+                f"/api/documents/?query=statement&ordering=-score&page=1&page_size={PAGE_SIZE}",
+            )
+        assert response.status_code == 200
+
+    # -- 9. API: full round-trip, with selection_data ---------------------
+
+    def test_profile_api_with_selection_data(self):
+        """Profile: full API search including include_selection_data=true."""
+        with profile_block("API /documents/?query=…&include_selection_data=true"):
+            response = self.client.get(
+                f"/api/documents/?query=contract&page=1&page_size={PAGE_SIZE}"
+                "&include_selection_data=true",
+            )
+        assert response.status_code == 200
+        assert "selection_data" in response.data
+
+    # -- 10. API: paginated (page 2) --------------------------------------
+
+    def test_profile_api_page_2(self):
+        """Profile: full API search, page 2 — exercises page offset arithmetic."""
+        with profile_block(f"API /documents/?query=…&page=2&page_size={PAGE_SIZE}"):
+            response = self.client.get(
+                f"/api/documents/?query=the&page=2&page_size={PAGE_SIZE}",
+            )
+        assert response.status_code == 200
--- a/src/profiling.py
+++ b/src/profiling.py
@@ -0,0 +1,173 @@
+"""
+Temporary profiling utilities for comparing implementations.
+
+Usage in a management command or shell::
+
+    from profiling import profile_block, profile_cpu, measure_memory
+
+    with profile_block("new check_sanity"):
+        messages = check_sanity()
+
+    with profile_block("old check_sanity"):
+        messages = check_sanity_old()
+
+Drop this file when done.
+"""
+
+from __future__ import annotations
+
+import resource
+import tracemalloc
+from collections.abc import Callable  # noqa: TC003
+from collections.abc import Generator  # noqa: TC003
+from contextlib import contextmanager
+from time import perf_counter
+from typing import Any
+
+from django.db import connection
+from django.db import reset_queries
+from django.test.utils import override_settings
+
+
+def _rss_kib() -> int:
+    """Return process RSS in KiB (Linux: current VmRSS; fallback: peak RSS via getrusage)."""
+    try:
+        with open("/proc/self/status") as f:
+            for line in f:
+                if line.startswith("VmRSS:"):
+                    return int(line.split()[1])
+    except OSError:
+        pass
+    # getrusage reports in KB on Linux, bytes on macOS
+    import sys
+
+    ru = resource.getrusage(resource.RUSAGE_SELF)
+    return ru.ru_maxrss if sys.platform != "darwin" else ru.ru_maxrss // 1024
+
+
+@contextmanager
+def profile_block(label: str = "block") -> Generator[None, None, None]:
+    """Profile memory, wall time, and DB queries for a code block.
+
+    Prints a summary to stdout on normal exit (nothing if the block raises).
+    Requires no external packages. Temporarily enables DEBUG for Django's query log.
+    Reports both Python-level (tracemalloc) and process-level (RSS) memory.
+    """
+    rss_before = _rss_kib()
+    tracemalloc.start()
+    snapshot_before = tracemalloc.take_snapshot()
+
+    with override_settings(DEBUG=True):
+        reset_queries()
+        start = perf_counter()
+
+        yield
+
+        elapsed = perf_counter() - start
+        queries = list(connection.queries)
+
+    snapshot_after = tracemalloc.take_snapshot()
+    _, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+    rss_after = _rss_kib()
+
+    # Compare snapshots for top allocations
+    stats = snapshot_after.compare_to(snapshot_before, "lineno")
+
+    query_time = sum(float(q["time"]) for q in queries)
+    mem_diff = sum(s.size_diff for s in stats)
+
+    print(f"\n{'=' * 60}")  # noqa: T201
+    print(f"  Profile: {label}")  # noqa: T201
+    print(f"{'=' * 60}")  # noqa: T201
+    print(f"  Wall time:    {elapsed:.4f}s")  # noqa: T201
+    print(f"  Queries:      {len(queries)} ({query_time:.4f}s)")  # noqa: T201
+    print(
+        f"  RSS delta:    {rss_after - rss_before:+d} KiB  (before={rss_before} KiB, after={rss_after} KiB)",
+    )
+    print(f"  Py mem delta: {mem_diff / 1024:.1f} KiB  (tracemalloc — Python only)")  # noqa: T201
+    print(f"  Py peak:      {peak / 1024:.1f} KiB")  # noqa: T201
+    print("\n  Top 5 allocations:")  # noqa: T201
+    for stat in stats[:5]:
+        print(f"    {stat}")  # noqa: T201
+    print(f"{'=' * 60}\n")  # noqa: T201
+
+
+def profile_cpu(
+    fn: Callable[[], Any],
+    *,
+    label: str,
+    top: int = 30,
+    sort: str = "cumtime",
+) -> tuple[Any, float]:
+    """Run *fn()* under cProfile, print stats, return (result, elapsed_s).
+
+    Args:
+        fn: Zero-argument callable to profile.
+        label: Human-readable label printed in the header.
+        top: Number of cProfile rows to print.
+        sort: cProfile sort key (default: cumulative time).
+
+    Returns:
+        ``(result, elapsed_s)`` where *result* is the return value of *fn()*.
+    """
+    import cProfile
+    import io
+    import pstats
+
+    pr = cProfile.Profile()
+    t0 = perf_counter()
+    pr.enable()
+    result = fn()
+    pr.disable()
+    elapsed = perf_counter() - t0
+
+    buf = io.StringIO()
+    ps = pstats.Stats(pr, stream=buf).sort_stats(sort)
+    ps.print_stats(top)
+
+    print(f"\n{'=' * 72}")  # noqa: T201
+    print(f"  {label}")  # noqa: T201
+    print(f"  wall time: {elapsed * 1000:.1f} ms")  # noqa: T201
+    print(f"{'=' * 72}")  # noqa: T201
+    print(buf.getvalue())  # noqa: T201
+
+    return result, elapsed
+
+
+def measure_memory(fn: Callable[[], Any], *, label: str) -> tuple[Any, float, float]:
+    """Run *fn()* under tracemalloc, print allocation report.
+
+    Args:
+        fn: Zero-argument callable to profile.
+        label: Human-readable label printed in the header.
+
+    Returns:
+        ``(result, peak_kib, delta_kib)``.
+    """
+    tracemalloc.start()
+    snapshot_before = tracemalloc.take_snapshot()
+    t0 = perf_counter()
+    result = fn()
+    elapsed = perf_counter() - t0
+    snapshot_after = tracemalloc.take_snapshot()
+    _, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    stats = snapshot_after.compare_to(snapshot_before, "lineno")
+    delta_kib = sum(s.size_diff for s in stats) / 1024
+
+    print(f"\n{'=' * 72}")  # noqa: T201
+    print(f"  [memory] {label}")  # noqa: T201
+    print(f"  wall time:    {elapsed * 1000:.1f} ms")  # noqa: T201
+    print(f"  memory delta: {delta_kib:+.1f} KiB")  # noqa: T201
+    print(f"  peak traced:  {peak / 1024:.1f} KiB")  # noqa: T201
+    print(f"{'=' * 72}")  # noqa: T201
+    print("  Top allocation sites (by size_diff):")  # noqa: T201
+    for stat in stats[:20]:
+        if stat.size_diff != 0:
+            print(  # noqa: T201
+                f"    {stat.size_diff / 1024:+8.1f} KiB  {stat.traceback.format()[0]}",
+            )
+
+    return result, peak / 1024, delta_kib
--- a/uv.lock
+++ b/uv.lock
@@ -3071,7 +3071,7 @@ requires-dist = [
    { name = "scikit-learn", specifier = "~=1.8.0" },
    { name = "sentence-transformers", specifier = ">=4.1" },
    { name = "setproctitle", specifier = "~=1.3.4" },
-    { name = "tantivy", specifier = ">=0.25.1" },
+    { name = "tantivy", git = "https://github.com/quickwit-oss/tantivy-py.git" },
    { name = "tika-client", specifier = "~=0.11.0" },
    { name = "torch", specifier = "~=2.11.0", index = "https://download.pytorch.org/whl/cpu" },
    { name = "watchfiles", specifier = ">=1.1.1" },
@@ -4675,31 +4675,8 @@ wheels = [

 [[package]]
 name = "tantivy"
-version = "0.25.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/1b/f9/0cd3955d155d3e3ef74b864769514dd191e5dacba9f0beb7af2d914942ce/tantivy-0.25.1.tar.gz", hash = "sha256:68a3314699a7d18fcf338b52bae8ce46a97dde1128a3e47e33fa4db7f71f265e", size = 75120, upload-time = "2025-12-02T11:57:12.997Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4e/7a/8a277f377e8a151fc0e71d4ffc1114aefb6e5e1c7dd609fed0955cf34ed8/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:d363d7b4207d3a5aa7f0d212420df35bed18bdb6bae26a2a8bd57428388b7c29", size = 7637033, upload-time = "2025-12-02T11:56:18.104Z" },
-    { url = "https://files.pythonhosted.org/packages/71/31/8b4acdedfc9f9a2d04b1340d07eef5213d6f151d1e18da0cb423e5f090d2/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8f4389cf1d889a1df7c5a3195806b4b56c37cee10d8a26faaa0dea35a867b5ff", size = 3932180, upload-time = "2025-12-02T11:56:19.833Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/dc/3e8499c21b4b9795e8f2fc54c68ce5b92905aaeadadaa56ecfa9180b11b1/tantivy-0.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99864c09fc54652c3c2486cdf13f86cdc8200f4b481569cb291e095ca5d496e5", size = 4197620, upload-time = "2025-12-02T11:56:21.496Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/8e/f2ce62fffc811eb62bead92c7b23c2e218f817cbd54c4f3b802e03ba1438/tantivy-0.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05abf37ddbc5063c575548be0d62931629c086bff7a5a1b67cf5a8f5ebf4cd8c", size = 4183794, upload-time = "2025-12-02T11:56:23.215Z" },
-    { url = "https://files.pythonhosted.org/packages/41/e7/6849c713ed0996c7628324c60512c4882006f0a62145e56c624a93407f90/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:90fd919e5f611809f746560ecf36eb9be824dec62e21ae17a27243759edb9aa1", size = 7621494, upload-time = "2025-12-02T11:56:27.069Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/22/c3d8294600dc6e7fa350daef9ff337d3c06e132b81df727de9f7a50c692a/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:4613c7cf6c23f3a97989819690a0f956d799354957de7a204abcc60083cebe02", size = 3925219, upload-time = "2025-12-02T11:56:29.403Z" },
-    { url = "https://files.pythonhosted.org/packages/41/fc/cbb1df71dd44c9110eff4eaaeda9d44f2d06182fe0452193be20ddfba93f/tantivy-0.25.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c477bd20b4df804d57dfc5033431bef27cde605695ae141b03abbf6ebc069129", size = 4198699, upload-time = "2025-12-02T11:56:31.359Z" },
-    { url = "https://files.pythonhosted.org/packages/47/4d/71abb78b774073c3ce12a4faa4351a9d910a71ffa3659526affba163873d/tantivy-0.25.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9b1a1ba1113c523c7ff7b10f282d6c4074006f7ef8d71e1d973d51bf7291ddb", size = 4183585, upload-time = "2025-12-02T11:56:33.317Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/25/73cfbcf1a8ea49be6c42817431cac46b70a119fe64da903fcc2d92b5b511/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f51ff7196c6f31719202080ed8372d5e3d51e92c749c032fb8234f012e99744c", size = 7622530, upload-time = "2025-12-02T11:56:36.839Z" },
-    { url = "https://files.pythonhosted.org/packages/12/c8/c0d7591cdf4f7e7a9fc4da786d1ca8cd1aacffaa2be16ea6d401a8e4a566/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:550e63321bfcacc003859f2fa29c1e8e56450807b3c9a501c1add27cfb9236d9", size = 3925637, upload-time = "2025-12-02T11:56:38.425Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/09/bedfc223bffec7641b417dd7ab071134b2ef8f8550e9b1fb6014657ef52e/tantivy-0.25.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fde31cc8d6e122faf7902aeea32bc008a429a6e8904e34d3468126a3ec01b016", size = 4197322, upload-time = "2025-12-02T11:56:40.411Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/f1/1fa5183500c8042200c9f2b840d34f5bbcfb434a1ee750e7132262d2a5c9/tantivy-0.25.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b11bd5a518b0be645320b47af8493f6a40c4f3234313e37adcf4534a564d27dd", size = 4183143, upload-time = "2025-12-02T11:56:42.048Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/2f/581519492226f97d23bd0adc95dad991ebeaa73ea6abc8bff389a3096d9a/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:dae99e75b7eaa9bf5bd16ab106b416370f08c135aed0e117d62a3201cd1ffe36", size = 7610316, upload-time = "2025-12-02T11:56:45.927Z" },
-    { url = "https://files.pythonhosted.org/packages/91/40/5d7bc315ab9e6a22c5572656e8ada1c836cfa96dccf533377504fbc3c9d9/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:506e9533c5ef4d3df43bad64ffecc0aa97c76e361ea610815dc3a20a9d6b30b3", size = 3919882, upload-time = "2025-12-02T11:56:48.469Z" },
-    { url = "https://files.pythonhosted.org/packages/02/b9/e0ef2f57a6a72444cb66c2ffbc310ab33ffaace275f1c4b0319d84ea3f18/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dbd4f8f264dacbcc9dee542832da2173fd53deaaea03f082d95214f8b5ed6bc", size = 4196031, upload-time = "2025-12-02T11:56:50.151Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/02/bf3f8cacfd08642e14a73f7956a3fb95d58119132c98c121b9065a1f8615/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:824c643ccb640dd9e35e00c5d5054ddf3323f56fe4219d57d428a9eeea13d22c", size = 4183437, upload-time = "2025-12-02T11:56:51.818Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/44/9f1d67aa5030f7eebc966c863d1316a510a971dd8bb45651df4acdfae9ed/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:7f5d29ae85dd0f23df8d15b3e7b341d4f9eb5a446bbb9640df48ac1f6d9e0c6c", size = 7623723, upload-time = "2025-12-02T11:56:55.066Z" },
-    { url = "https://files.pythonhosted.org/packages/db/30/6e085bd3ed9d12da3c91c185854abd70f9dfd35fb36a75ea98428d42c30b/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:f2d2938fb69a74fc1bb36edfaf7f0d1596fa1264db0f377bda2195c58bcb6245", size = 3926243, upload-time = "2025-12-02T11:56:57.058Z" },
-    { url = "https://files.pythonhosted.org/packages/32/f5/a00d65433430f51718e5cc6938df571765d7c4e03aedec5aef4ab567aa9b/tantivy-0.25.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f5ff124c4802558e627091e780b362ca944169736caba5a372eef39a79d0ae0", size = 4207186, upload-time = "2025-12-02T11:56:58.803Z" },
-    { url = "https://files.pythonhosted.org/packages/19/63/61bdb12fc95f2a7f77bd419a5149bfa9f28caa76cb569bf2b6b06e1d033e/tantivy-0.25.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43b80ef62a340416139c93d19264e5f808da48e04f9305f1092b8ed22be0a5be", size = 4187312, upload-time = "2025-12-02T11:57:00.595Z" },
-]
+version = "0.26.0"
+source = { git = "https://github.com/quickwit-oss/tantivy-py.git#fa1a1985b96001929fc1cafcdd9dc94e56658b2a" }

 [[package]]
 name = "tenacity"
Author	SHA1	Message	Date
Trenton Holmes	eb86ed617e	Stores profiling stuff for later	2026-04-19 14:10:11 -07:00
Trenton Holmes	9b9554a158	Using a draft release of tanvity-py, fixes up all the TODO locations with the new API calls	2026-04-19 13:14:17 -07:00
shamoon	8edbc70dbc	Oops, it should be dark	2026-04-18 16:02:09 -07:00