Perf: stream manifest parsing with ijson in document_importer

Replace bulk json.load of the full manifest (which materializes the entire JSON array into memory) with incremental ijson streaming. Eliminates self.manifest entirely — records are never all in memory at once. - Add ijson>=3.2 dependency - New module-level iter_manifest_records() generator - load_manifest_files() collects paths only; no parsing at load time - check_manifest_validity() streams without accumulating records - decrypt_secret_fields() streams each manifest to a .decrypted.json temp file record-by-record; temp files cleaned up after file copy - _import_files_from_manifest() collects only document records (small fraction of manifest) for the tqdm progress bar Measured on 200 docs + 200 CustomFieldInstances: - Streaming validation: peak memory 3081 KiB -> 333 KiB (89% reduction) - Stream-decrypt to file: peak memory 3081 KiB -> 549 KiB (82% reduction) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-08 23:55:24 +00:00 · 2026-03-06 15:15:33 -08:00
parent f5c0c21922
commit dfd370700a
6 changed files with 322 additions and 51 deletions
@@ -0,0 +1,71 @@
+"""
+Temporary profiling utilities for comparing implementations.
+
+Usage in a management command or shell::
+
+    from documents.profiling import profile_block
+
+    with profile_block("new check_sanity"):
+        messages = check_sanity()
+
+    with profile_block("old check_sanity"):
+        messages = check_sanity_old()
+
+Drop this file when done.
+"""
+
+from __future__ import annotations
+
+import tracemalloc
+from contextlib import contextmanager
+from time import perf_counter
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+from django.db import connection
+from django.db import reset_queries
+from django.test.utils import override_settings
+
+
+@contextmanager
+def profile_block(label: str = "block") -> Generator[None, None, None]:
+    """Profile memory, wall time, and DB queries for a code block.
+
+    Prints a summary to stdout on exit. Requires no external packages.
+    Enables DEBUG temporarily to capture Django's query log.
+    """
+    tracemalloc.start()
+    snapshot_before = tracemalloc.take_snapshot()
+
+    with override_settings(DEBUG=True):
+        reset_queries()
+        start = perf_counter()
+
+        yield
+
+        elapsed = perf_counter() - start
+        queries = list(connection.queries)
+
+    snapshot_after = tracemalloc.take_snapshot()
+    _, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    # Compare snapshots for top allocations
+    stats = snapshot_after.compare_to(snapshot_before, "lineno")
+
+    query_time = sum(float(q["time"]) for q in queries)
+    mem_diff = sum(s.size_diff for s in stats)
+
+    print(f"\n{'=' * 60}")  # noqa: T201
+    print(f"  Profile: {label}")  # noqa: T201
+    print(f"{'=' * 60}")  # noqa: T201
+    print(f"  Wall time:    {elapsed:.4f}s")  # noqa: T201
+    print(f"  Queries:      {len(queries)} ({query_time:.4f}s)")  # noqa: T201
+    print(f"  Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
+    print(f"  Peak memory:  {peak / 1024:.1f} KiB")  # noqa: T201
+    print("\n  Top 5 allocations:")  # noqa: T201
+    for stat in stats[:5]:
+        print(f"    {stat}")  # noqa: T201
+    print(f"{'=' * 60}\n")  # noqa: T201