Files
paperless-ngx/src/documents/profiling.py
Trenton H dfd370700a Perf: stream manifest parsing with ijson in document_importer
Replace bulk json.load of the full manifest (which materializes the
entire JSON array into memory) with incremental ijson streaming.
Eliminates self.manifest entirely — records are never all in memory
at once.

- Add ijson>=3.2 dependency
- New module-level iter_manifest_records() generator
- load_manifest_files() collects paths only; no parsing at load time
- check_manifest_validity() streams without accumulating records
- decrypt_secret_fields() streams each manifest to a .decrypted.json
  temp file record-by-record; temp files cleaned up after file copy
- _import_files_from_manifest() collects only document records (small
  fraction of manifest) for the tqdm progress bar

Measured on 200 docs + 200 CustomFieldInstances:
- Streaming validation: peak memory 3081 KiB -> 333 KiB (89% reduction)
- Stream-decrypt to file: peak memory 3081 KiB -> 549 KiB (82% reduction)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-07 13:20:29 -08:00

72 lines
2.1 KiB
Python

"""
Temporary profiling utilities for comparing implementations.

Usage in a management command or shell::

    from documents.profiling import profile_block

    with profile_block("new check_sanity"):
        messages = check_sanity()

    with profile_block("old check_sanity"):
        messages = check_sanity_old()

Drop this file when done.
"""
from __future__ import annotations
import tracemalloc
from contextlib import contextmanager
from time import perf_counter
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Generator
from django.db import connection
from django.db import reset_queries
from django.test.utils import override_settings
@contextmanager
def profile_block(label: str = "block") -> Generator[None, None, None]:
    """Profile memory, wall time, and DB queries for a code block.

    Prints a summary to stdout when the block exits normally. Requires no
    external packages. Enables DEBUG temporarily to capture Django's query log.

    If the wrapped block raises, the exception propagates unchanged and no
    summary is printed, but any tracemalloc session started here is still
    stopped so tracing overhead does not leak into the rest of the process.
    A tracemalloc session that was already running before entry is left
    running on exit; in that case the reported peak is whatever
    ``tracemalloc.get_traced_memory()`` reports for that enclosing session.

    Args:
        label: Name shown in the summary header.

    Yields:
        None. The profiled code runs while the generator is suspended.
    """
    # Only stop tracing on exit if this call is the one that started it;
    # otherwise we would silently end a session owned by the caller.
    started_tracing = not tracemalloc.is_tracing()
    if started_tracing:
        tracemalloc.start()

    try:
        snapshot_before = tracemalloc.take_snapshot()

        with override_settings(DEBUG=True):
            reset_queries()
            start = perf_counter()

            yield

            elapsed = perf_counter() - start
            # Capture the query log before the DEBUG override is lifted.
            queries = list(connection.queries)

        snapshot_after = tracemalloc.take_snapshot()
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Runs on both success and failure of the profiled block.
        if started_tracing:
            tracemalloc.stop()

    # Compare snapshots for top allocations
    stats = snapshot_after.compare_to(snapshot_before, "lineno")

    query_time = sum(float(q["time"]) for q in queries)
    mem_diff = sum(s.size_diff for s in stats)

    print(f"\n{'=' * 60}")  # noqa: T201
    print(f" Profile: {label}")  # noqa: T201
    print(f"{'=' * 60}")  # noqa: T201
    print(f" Wall time: {elapsed:.4f}s")  # noqa: T201
    print(f" Queries: {len(queries)} ({query_time:.4f}s)")  # noqa: T201
    print(f" Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
    print(f" Peak memory: {peak / 1024:.1f} KiB")  # noqa: T201
    print("\n Top 5 allocations:")  # noqa: T201
    for stat in stats[:5]:
        print(f" {stat}")  # noqa: T201
    print(f"{'=' * 60}\n")  # noqa: T201