From 49d1e773d9f5f5102011df07795ff432adc68e84 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:27:22 -0800 Subject: [PATCH] Feature: Improve exporter memory efficieny MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 -- Eliminate JSON round-trip in document exporter Replace json.loads(serializers.serialize("json", qs)) with serializers.serialize("python", qs) to skip the intermediate JSON string allocation and parse step. Use DjangoJSONEncoder in check_and_write_json() to handle native Python types (datetime, Decimal, UUID) the Python serializer returns. Measured on 200 documents + 200 CustomFieldInstances: - Memory delta: 1,410 KiB → 527 KiB (−63%) - Peak memory: 1,500 KiB → 530 KiB (−65%) - Wall time: 0.54s → 0.36s (−34%) - JSON output: identical (byte-for-byte, 345 KB) Phase 2 -- Batched QuerySet serialization in document exporter Add serialize_queryset_batched() helper that uses QuerySet.iterator() and itertools.islice to stream records in configurable chunks, bounding peak memory during serialization to batch_size * avg_record_size rather than loading the entire QuerySet at once. Replace the single-call serializers.serialize("python", qs) in dump() with list(chain.from_iterable(serialize_queryset_batched(qs, batch_size))). Add --batch-size CLI argument (default 500). Measured on 2,000 documents + 2,000 CustomFieldInstances: Phase 1 baseline (full queryset, no iterator): Peak: 9,293 KiB | Time: 4.33s Phase 2 batch=2000 (iterator, 1 batch): Peak: 7,716 KiB | Time: 4.20s (−17% peak vs Phase 1) Phase 2 batch=500 (iterator, 4 batches -- default): Peak: 6,980 KiB | Time: 4.28s (−25% peak vs Phase 1) Phase 2 batch=100 (iterator, 20 batches): Peak: 6,776 KiB | Time: 4.30s (−27% peak vs Phase 1) Peak memory falls as batch_size decreases. Wall time is within noise (batching overhead negligible). Output is byte-for-byte identical across all batch sizes and approaches. The primary gain is that Django's queryset cache is bypassed (iterator()), preventing the ORM from holding all model instances in memory after fetch. Smaller batches reduce the per-batch model-instance peak further. --- pyproject.toml | 1 + .../management/commands/document_exporter.py | 56 +++++++++++++-- src/documents/profiling.py | 71 +++++++++++++++++++ 3 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 src/documents/profiling.py diff --git a/pyproject.toml b/pyproject.toml index 3d00f4e67..36a02528d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -305,6 +305,7 @@ markers = [ "greenmail: Tests requiring Greenmail service", "date_parsing: Tests which cover date parsing from content or filename", "management: Tests which cover management commands/functionality", + "profiling: Benchmarks that profile and compare implementation performance", ] [tool.pytest_env] diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index bd962efc4..52cadb4fe 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -3,6 +3,8 @@ import json import os import shutil import tempfile +from itertools import chain +from itertools import islice from pathlib import Path from typing import TYPE_CHECKING @@ -19,6 +21,7 @@ from django.contrib.contenttypes.models import ContentType from django.core import serializers from django.core.management.base import BaseCommand from django.core.management.base import CommandError +from django.core.serializers.json import DjangoJSONEncoder from django.db import transaction from django.utils import timezone from filelock import FileLock @@ -26,6 +29,8 @@ from guardian.models import GroupObjectPermission from guardian.models import UserObjectPermission if TYPE_CHECKING: + from collections.abc import Generator + from django.db.models import QuerySet if settings.AUDIT_LOG_ENABLED: @@ -60,6 +65,22 @@ from paperless_mail.models import MailAccount from paperless_mail.models import MailRule +def serialize_queryset_batched( + queryset: "QuerySet", + *, + batch_size: int = 500, +) -> "Generator[list[dict], None, None]": + """Yield batches of serialized records from a QuerySet. + + Each batch is a list of dicts in Django's Python serialization format. + Uses QuerySet.iterator() to avoid loading the full queryset into memory, + and islice to collect chunk-sized batches serialized in a single call. + """ + iterator = queryset.iterator(chunk_size=batch_size) + while chunk := list(islice(iterator, batch_size)): + yield serializers.serialize("python", chunk) + + class Command(CryptMixin, BaseCommand): help = ( "Decrypt and rename all files in our collection into a given target " @@ -186,6 +207,17 @@ class Command(CryptMixin, BaseCommand): help="If provided, is used to encrypt sensitive data in the export", ) + parser.add_argument( + "--batch-size", + type=int, + default=500, + help=( + "Number of records to process per batch during serialization. " + "Lower values reduce peak memory usage; higher values improve " + "throughput. Default: 500." + ), + ) + def handle(self, *args, **options) -> None: self.target = Path(options["target"]).resolve() self.split_manifest: bool = options["split_manifest"] @@ -200,6 +232,7 @@ class Command(CryptMixin, BaseCommand): self.data_only: bool = options["data_only"] self.no_progress_bar: bool = options["no_progress_bar"] self.passphrase: str | None = options.get("passphrase") + self.batch_size: int = options["batch_size"] self.files_in_export_dir: set[Path] = set() self.exported_files: set[str] = set() @@ -294,8 +327,13 @@ class Command(CryptMixin, BaseCommand): # Build an overall manifest for key, object_query in manifest_key_to_object_query.items(): - manifest_dict[key] = json.loads( - serializers.serialize("json", object_query), + manifest_dict[key] = list( + chain.from_iterable( + serialize_queryset_batched( + object_query, + batch_size=self.batch_size, + ), + ), ) self.encrypt_secret_fields(manifest_dict) @@ -512,14 +550,24 @@ class Command(CryptMixin, BaseCommand): self.files_in_export_dir.remove(target) if self.compare_json: target_checksum = hashlib.md5(target.read_bytes()).hexdigest() - src_str = json.dumps(content, indent=2, ensure_ascii=False) + src_str = json.dumps( + content, + cls=DjangoJSONEncoder, + indent=2, + ensure_ascii=False, + ) src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest() if src_checksum == target_checksum: perform_write = False if perform_write: target.write_text( - json.dumps(content, indent=2, ensure_ascii=False), + json.dumps( + content, + cls=DjangoJSONEncoder, + indent=2, + ensure_ascii=False, + ), encoding="utf-8", ) diff --git a/src/documents/profiling.py b/src/documents/profiling.py new file mode 100644 index 000000000..aca0913e4 --- /dev/null +++ b/src/documents/profiling.py @@ -0,0 +1,71 @@ +""" +Temporary profiling utilities for comparing implementations. + +Usage in a management command or shell:: + + from documents.profiling import profile_block + + with profile_block("new check_sanity"): + messages = check_sanity() + + with profile_block("old check_sanity"): + messages = check_sanity_old() + +Drop this file when done. +""" + +from __future__ import annotations + +import tracemalloc +from contextlib import contextmanager +from time import perf_counter +from typing import TYPE_CHECKING + +from django.db import connection +from django.db import reset_queries +from django.test.utils import override_settings + +if TYPE_CHECKING: + from collections.abc import Generator + + +@contextmanager +def profile_block(label: str = "block") -> Generator[None, None, None]: + """Profile memory, wall time, and DB queries for a code block. + + Prints a summary to stdout on exit. Requires no external packages. + Enables DEBUG temporarily to capture Django's query log. + """ + tracemalloc.start() + snapshot_before = tracemalloc.take_snapshot() + + with override_settings(DEBUG=True): + reset_queries() + start = perf_counter() + + yield + + elapsed = perf_counter() - start + queries = list(connection.queries) + + snapshot_after = tracemalloc.take_snapshot() + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # Compare snapshots for top allocations + stats = snapshot_after.compare_to(snapshot_before, "lineno") + + query_time = sum(float(q["time"]) for q in queries) + mem_diff = sum(s.size_diff for s in stats) + + print(f"\n{'=' * 60}") # noqa: T201 + print(f" Profile: {label}") # noqa: T201 + print(f"{'=' * 60}") # noqa: T201 + print(f" Wall time: {elapsed:.4f}s") # noqa: T201 + print(f" Queries: {len(queries)} ({query_time:.4f}s)") # noqa: T201 + print(f" Memory delta: {mem_diff / 1024:.1f} KiB") # noqa: T201 + print(f" Peak memory: {peak / 1024:.1f} KiB") # noqa: T201 + print("\n Top 5 allocations:") # noqa: T201 + for stat in stats[:5]: + print(f" {stat}") # noqa: T201 + print(f"{'=' * 60}\n") # noqa: T201