Feature: Improve exporter memory efficieny

Phase 1 -- Eliminate JSON round-trip in document exporter Replace json.loads(serializers.serialize("json", qs)) with serializers.serialize("python", qs) to skip the intermediate JSON string allocation and parse step. Use DjangoJSONEncoder in check_and_write_json() to handle native Python types (datetime, Decimal, UUID) the Python serializer returns. Measured on 200 documents + 200 CustomFieldInstances: - Memory delta: 1,410 KiB → 527 KiB (−63%) - Peak memory: 1,500 KiB → 530 KiB (−65%) - Wall time: 0.54s → 0.36s (−34%) - JSON output: identical (byte-for-byte, 345 KB) Phase 2 -- Batched QuerySet serialization in document exporter Add serialize_queryset_batched() helper that uses QuerySet.iterator() and itertools.islice to stream records in configurable chunks, bounding peak memory during serialization to batch_size * avg_record_size rather than loading the entire QuerySet at once. Replace the single-call serializers.serialize("python", qs) in dump() with list(chain.from_iterable(serialize_queryset_batched(qs, batch_size))). Add --batch-size CLI argument (default 500). Measured on 2,000 documents + 2,000 CustomFieldInstances: Phase 1 baseline (full queryset, no iterator): Peak: 9,293 KiB | Time: 4.33s Phase 2 batch=2000 (iterator, 1 batch): Peak: 7,716 KiB | Time: 4.20s (−17% peak vs Phase 1) Phase 2 batch=500 (iterator, 4 batches -- default): Peak: 6,980 KiB | Time: 4.28s (−25% peak vs Phase 1) Phase 2 batch=100 (iterator, 20 batches): Peak: 6,776 KiB | Time: 4.30s (−27% peak vs Phase 1) Peak memory falls as batch_size decreases. Wall time is within noise (batching overhead negligible). Output is byte-for-byte identical across all batch sizes and approaches. The primary gain is that Django's queryset cache is bypassed (iterator()), preventing the ORM from holding all model instances in memory after fetch. Smaller batches reduce the per-batch model-instance peak further.
2026-03-04 08:16:25 +00:00 · 2026-03-03 19:29:29 -08:00
3 changed files with 124 additions and 4 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -305,6 +305,7 @@ markers = [
  "greenmail: Tests requiring Greenmail service",
  "date_parsing: Tests which cover date parsing from content or filename",
  "management: Tests which cover management commands/functionality",
+  "profiling: Benchmarks that profile and compare implementation performance",
 ]

 [tool.pytest_env]
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -3,6 +3,8 @@ import json
 import os
 import shutil
 import tempfile
+from itertools import chain
+from itertools import islice
 from pathlib import Path
 from typing import TYPE_CHECKING

@@ -19,6 +21,7 @@ from django.contrib.contenttypes.models import ContentType
 from django.core import serializers
 from django.core.management.base import BaseCommand
 from django.core.management.base import CommandError
+from django.core.serializers.json import DjangoJSONEncoder
 from django.db import transaction
 from django.utils import timezone
 from filelock import FileLock
@@ -26,6 +29,8 @@ from guardian.models import GroupObjectPermission
 from guardian.models import UserObjectPermission

 if TYPE_CHECKING:
+    from collections.abc import Generator
+
    from django.db.models import QuerySet

 if settings.AUDIT_LOG_ENABLED:
@@ -60,6 +65,22 @@ from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule


+def serialize_queryset_batched(
+    queryset: "QuerySet",
+    *,
+    batch_size: int = 500,
+) -> "Generator[list[dict], None, None]":
+    """Yield batches of serialized records from a QuerySet.
+
+    Each batch is a list of dicts in Django's Python serialization format.
+    Uses QuerySet.iterator() to avoid loading the full queryset into memory,
+    and islice to collect chunk-sized batches serialized in a single call.
+    """
+    iterator = queryset.iterator(chunk_size=batch_size)
+    while chunk := list(islice(iterator, batch_size)):
+        yield serializers.serialize("python", chunk)
+
+
 class Command(CryptMixin, BaseCommand):
    help = (
        "Decrypt and rename all files in our collection into a given target "
@@ -186,6 +207,17 @@ class Command(CryptMixin, BaseCommand):
            help="If provided, is used to encrypt sensitive data in the export",
        )

+        parser.add_argument(
+            "--batch-size",
+            type=int,
+            default=500,
+            help=(
+                "Number of records to process per batch during serialization. "
+                "Lower values reduce peak memory usage; higher values improve "
+                "throughput. Default: 500."
+            ),
+        )
+
    def handle(self, *args, **options) -> None:
        self.target = Path(options["target"]).resolve()
        self.split_manifest: bool = options["split_manifest"]
@@ -200,6 +232,7 @@ class Command(CryptMixin, BaseCommand):
        self.data_only: bool = options["data_only"]
        self.no_progress_bar: bool = options["no_progress_bar"]
        self.passphrase: str | None = options.get("passphrase")
+        self.batch_size: int = options["batch_size"]

        self.files_in_export_dir: set[Path] = set()
        self.exported_files: set[str] = set()
@@ -294,8 +327,13 @@ class Command(CryptMixin, BaseCommand):

            # Build an overall manifest
            for key, object_query in manifest_key_to_object_query.items():
-                manifest_dict[key] = json.loads(
-                    serializers.serialize("json", object_query),
+                manifest_dict[key] = list(
+                    chain.from_iterable(
+                        serialize_queryset_batched(
+                            object_query,
+                            batch_size=self.batch_size,
+                        ),
+                    ),
                )

            self.encrypt_secret_fields(manifest_dict)
@@ -512,14 +550,24 @@ class Command(CryptMixin, BaseCommand):
            self.files_in_export_dir.remove(target)
            if self.compare_json:
                target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
-                src_str = json.dumps(content, indent=2, ensure_ascii=False)
+                src_str = json.dumps(
+                    content,
+                    cls=DjangoJSONEncoder,
+                    indent=2,
+                    ensure_ascii=False,
+                )
                src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
                if src_checksum == target_checksum:
                    perform_write = False

        if perform_write:
            target.write_text(
-                json.dumps(content, indent=2, ensure_ascii=False),
+                json.dumps(
+                    content,
+                    cls=DjangoJSONEncoder,
+                    indent=2,
+                    ensure_ascii=False,
+                ),
                encoding="utf-8",
            )

--- a/src/documents/profiling.py
+++ b/src/documents/profiling.py
@@ -0,0 +1,71 @@
+"""
+Temporary profiling utilities for comparing implementations.
+
+Usage in a management command or shell::
+
+    from documents.profiling import profile_block
+
+    with profile_block("new check_sanity"):
+        messages = check_sanity()
+
+    with profile_block("old check_sanity"):
+        messages = check_sanity_old()
+
+Drop this file when done.
+"""
+
+from __future__ import annotations
+
+import tracemalloc
+from contextlib import contextmanager
+from time import perf_counter
+from typing import TYPE_CHECKING
+
+from django.db import connection
+from django.db import reset_queries
+from django.test.utils import override_settings
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+@contextmanager
+def profile_block(label: str = "block") -> Generator[None, None, None]:
+    """Profile memory, wall time, and DB queries for a code block.
+
+    Prints a summary to stdout on exit. Requires no external packages.
+    Enables DEBUG temporarily to capture Django's query log.
+    """
+    tracemalloc.start()
+    snapshot_before = tracemalloc.take_snapshot()
+
+    with override_settings(DEBUG=True):
+        reset_queries()
+        start = perf_counter()
+
+        yield
+
+        elapsed = perf_counter() - start
+        queries = list(connection.queries)
+
+    snapshot_after = tracemalloc.take_snapshot()
+    _, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    # Compare snapshots for top allocations
+    stats = snapshot_after.compare_to(snapshot_before, "lineno")
+
+    query_time = sum(float(q["time"]) for q in queries)
+    mem_diff = sum(s.size_diff for s in stats)
+
+    print(f"\n{'=' * 60}")  # noqa: T201
+    print(f"  Profile: {label}")  # noqa: T201
+    print(f"{'=' * 60}")  # noqa: T201
+    print(f"  Wall time:    {elapsed:.4f}s")  # noqa: T201
+    print(f"  Queries:      {len(queries)} ({query_time:.4f}s)")  # noqa: T201
+    print(f"  Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
+    print(f"  Peak memory:  {peak / 1024:.1f} KiB")  # noqa: T201
+    print("\n  Top 5 allocations:")  # noqa: T201
+    for stat in stats[:5]:
+        print(f"    {stat}")  # noqa: T201
+    print(f"{'=' * 60}\n")  # noqa: T201