mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-17 22:45:58 +00:00
Replace bulk json.load of the full manifest (which materializes the entire JSON array into memory) with incremental ijson streaming. Eliminates self.manifest entirely — records are never all in memory at once. - Add ijson>=3.2 dependency - New module-level iter_manifest_records() generator - load_manifest_files() collects paths only; no parsing at load time - check_manifest_validity() streams without accumulating records - decrypt_secret_fields() streams each manifest to a .decrypted.json temp file record-by-record; temp files cleaned up after file copy - _import_files_from_manifest() collects only document records (small fraction of manifest) for the tqdm progress bar Measured on 200 docs + 200 CustomFieldInstances: - Streaming validation: peak memory 3081 KiB -> 333 KiB (89% reduction) - Stream-decrypt to file: peak memory 3081 KiB -> 549 KiB (82% reduction) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
72 lines
2.1 KiB
Python
72 lines
2.1 KiB
Python
"""
|
|
Temporary profiling utilities for comparing implementations.
|
|
|
|
Usage in a management command or shell::
|
|
|
|
from documents.profiling import profile_block
|
|
|
|
with profile_block("new check_sanity"):
|
|
messages = check_sanity()
|
|
|
|
with profile_block("old check_sanity"):
|
|
messages = check_sanity_old()
|
|
|
|
Drop this file when done.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import tracemalloc
|
|
from contextlib import contextmanager
|
|
from time import perf_counter
|
|
from typing import TYPE_CHECKING
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Generator
|
|
|
|
from django.db import connection
|
|
from django.db import reset_queries
|
|
from django.test.utils import override_settings
|
|
|
|
|
|
@contextmanager
def profile_block(label: str = "block") -> Generator[None, None, None]:
    """Profile memory, wall time, and DB queries for a code block.

    Prints a summary to stdout on exit. Requires no external packages.
    Enables DEBUG temporarily to capture Django's query log.

    Args:
        label: Name shown in the header of the printed summary.

    Yields:
        None. The body of the ``with`` statement runs at the yield point.

    Notes:
        * If the profiled block raises, the exception propagates unchanged
          and no summary is printed, but tracemalloc is still shut down
          (when this call was the one that started it).
        * If tracemalloc was already tracing on entry (e.g. nested
          ``profile_block`` calls), it is left running on exit so the outer
          session is not torn down. The reported peak then covers that
          whole outer tracing session, not just this block.
        * Only queries recorded on ``django.db.connection`` are counted.
    """
    # Remember whether we own the tracing session, so we never stop one that
    # somebody else (an enclosing profile_block, PYTHONTRACEMALLOC, ...) started.
    started_tracing = not tracemalloc.is_tracing()
    if started_tracing:
        tracemalloc.start()

    try:
        snapshot_before = tracemalloc.take_snapshot()

        # connection.queries is only populated while settings.DEBUG is true.
        with override_settings(DEBUG=True):
            reset_queries()
            start = perf_counter()

            yield

            elapsed = perf_counter() - start
            # Copy the log before DEBUG is restored on leaving the block.
            queries = list(connection.queries)

        snapshot_after = tracemalloc.take_snapshot()
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Runs on both success and failure: tracing must not stay enabled
        # after an exception in the profiled code.
        if started_tracing:
            tracemalloc.stop()

    # Compare snapshots for top allocations
    stats = snapshot_after.compare_to(snapshot_before, "lineno")

    query_time = sum(float(q["time"]) for q in queries)
    mem_diff = sum(s.size_diff for s in stats)

    print(f"\n{'=' * 60}")  # noqa: T201
    print(f" Profile: {label}")  # noqa: T201
    print(f"{'=' * 60}")  # noqa: T201
    print(f" Wall time: {elapsed:.4f}s")  # noqa: T201
    print(f" Queries: {len(queries)} ({query_time:.4f}s)")  # noqa: T201
    print(f" Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
    print(f" Peak memory: {peak / 1024:.1f} KiB")  # noqa: T201
    print("\n Top 5 allocations:")  # noqa: T201
    for stat in stats[:5]:
        print(f" {stat}")  # noqa: T201
    print(f"{'=' * 60}\n")  # noqa: T201
|