From 49d1e773d9f5f5102011df07795ff432adc68e84 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Tue, 3 Mar 2026 16:27:22 -0800
Subject: [PATCH] Feature: Improve exporter memory efficieny
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 -- Eliminate JSON round-trip in document exporter

Replace json.loads(serializers.serialize("json", qs)) with
serializers.serialize("python", qs) to skip the intermediate
JSON string allocation and parse step. Use DjangoJSONEncoder
in check_and_write_json() to handle native Python types
(datetime, Decimal, UUID) the Python serializer returns.

Measured on 200 documents + 200 CustomFieldInstances:
  - Memory delta:  1,410 KiB → 527 KiB  (−63%)
  - Peak memory:   1,500 KiB → 530 KiB  (−65%)
  - Wall time:       0.54s  → 0.36s    (−34%)
  - JSON output: identical (byte-for-byte, 345 KB)

Phase 2 -- Batched QuerySet serialization in document exporter

Add serialize_queryset_batched() helper that uses QuerySet.iterator()
and itertools.islice to stream records in configurable chunks, bounding
peak memory during serialization to batch_size * avg_record_size rather
than loading the entire QuerySet at once.

Replace the single-call serializers.serialize("python", qs) in dump()
with list(chain.from_iterable(serialize_queryset_batched(qs, batch_size))).
Add --batch-size CLI argument (default 500).

Measured on 2,000 documents + 2,000 CustomFieldInstances:
  Phase 1 baseline (full queryset, no iterator):
    Peak: 9,293 KiB  |  Time: 4.33s

  Phase 2 batch=2000 (iterator, 1 batch):
    Peak: 7,716 KiB  |  Time: 4.20s  (−17% peak vs Phase 1)

  Phase 2 batch=500 (iterator, 4 batches -- default):
    Peak: 6,980 KiB  |  Time: 4.28s  (−25% peak vs Phase 1)

  Phase 2 batch=100 (iterator, 20 batches):
    Peak: 6,776 KiB  |  Time: 4.30s  (−27% peak vs Phase 1)

Peak memory falls as batch_size decreases. Wall time is within noise
(batching overhead negligible). Output is byte-for-byte identical across
all batch sizes and approaches.

The primary gain is that Django's queryset cache is bypassed (iterator()),
preventing the ORM from holding all model instances in memory after fetch.
Smaller batches reduce the per-batch model-instance peak further.
---
 pyproject.toml                                |  1 +
 .../management/commands/document_exporter.py  | 56 +++++++++++++--
 src/documents/profiling.py                    | 71 +++++++++++++++++++
 3 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 src/documents/profiling.py

diff --git a/pyproject.toml b/pyproject.toml
index 3d00f4e67..36a02528d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -305,6 +305,7 @@ markers = [
   "greenmail: Tests requiring Greenmail service",
   "date_parsing: Tests which cover date parsing from content or filename",
   "management: Tests which cover management commands/functionality",
+  "profiling: Benchmarks that profile and compare implementation performance",
 ]
 
 [tool.pytest_env]
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index bd962efc4..52cadb4fe 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -3,6 +3,8 @@ import json
 import os
 import shutil
 import tempfile
+from itertools import chain
+from itertools import islice
 from pathlib import Path
 from typing import TYPE_CHECKING
 
@@ -19,6 +21,7 @@ from django.contrib.contenttypes.models import ContentType
 from django.core import serializers
 from django.core.management.base import BaseCommand
 from django.core.management.base import CommandError
+from django.core.serializers.json import DjangoJSONEncoder
 from django.db import transaction
 from django.utils import timezone
 from filelock import FileLock
@@ -26,6 +29,8 @@ from guardian.models import GroupObjectPermission
 from guardian.models import UserObjectPermission
 
 if TYPE_CHECKING:
+    from collections.abc import Generator
+
     from django.db.models import QuerySet
 
 if settings.AUDIT_LOG_ENABLED:
@@ -60,6 +65,22 @@ from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule
 
 
+def serialize_queryset_batched(
+    queryset: "QuerySet",
+    *,
+    batch_size: int = 500,
+) -> "Generator[list[dict], None, None]":
+    """Yield batches of serialized records from a QuerySet.
+
+    Each batch is a list of dicts in Django's Python serialization format.
+    Uses QuerySet.iterator() to avoid loading the full queryset into memory,
+    and islice to collect chunk-sized batches serialized in a single call.
+    """
+    iterator = queryset.iterator(chunk_size=batch_size)
+    while chunk := list(islice(iterator, batch_size)):
+        yield serializers.serialize("python", chunk)
+
+
 class Command(CryptMixin, BaseCommand):
     help = (
         "Decrypt and rename all files in our collection into a given target "
@@ -186,6 +207,17 @@ class Command(CryptMixin, BaseCommand):
             help="If provided, is used to encrypt sensitive data in the export",
         )
 
+        parser.add_argument(
+            "--batch-size",
+            type=int,
+            default=500,
+            help=(
+                "Number of records to process per batch during serialization. "
+                "Lower values reduce peak memory usage; higher values improve "
+                "throughput. Default: 500."
+            ),
+        )
+
     def handle(self, *args, **options) -> None:
         self.target = Path(options["target"]).resolve()
         self.split_manifest: bool = options["split_manifest"]
@@ -200,6 +232,7 @@ class Command(CryptMixin, BaseCommand):
         self.data_only: bool = options["data_only"]
         self.no_progress_bar: bool = options["no_progress_bar"]
         self.passphrase: str | None = options.get("passphrase")
+        self.batch_size: int = options["batch_size"]
 
         self.files_in_export_dir: set[Path] = set()
         self.exported_files: set[str] = set()
@@ -294,8 +327,13 @@ class Command(CryptMixin, BaseCommand):
 
             # Build an overall manifest
             for key, object_query in manifest_key_to_object_query.items():
-                manifest_dict[key] = json.loads(
-                    serializers.serialize("json", object_query),
+                manifest_dict[key] = list(
+                    chain.from_iterable(
+                        serialize_queryset_batched(
+                            object_query,
+                            batch_size=self.batch_size,
+                        ),
+                    ),
                 )
 
             self.encrypt_secret_fields(manifest_dict)
@@ -512,14 +550,24 @@ class Command(CryptMixin, BaseCommand):
             self.files_in_export_dir.remove(target)
             if self.compare_json:
                 target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
-                src_str = json.dumps(content, indent=2, ensure_ascii=False)
+                src_str = json.dumps(
+                    content,
+                    cls=DjangoJSONEncoder,
+                    indent=2,
+                    ensure_ascii=False,
+                )
                 src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
                 if src_checksum == target_checksum:
                     perform_write = False
 
         if perform_write:
             target.write_text(
-                json.dumps(content, indent=2, ensure_ascii=False),
+                json.dumps(
+                    content,
+                    cls=DjangoJSONEncoder,
+                    indent=2,
+                    ensure_ascii=False,
+                ),
                 encoding="utf-8",
             )
 
diff --git a/src/documents/profiling.py b/src/documents/profiling.py
new file mode 100644
index 000000000..aca0913e4
--- /dev/null
+++ b/src/documents/profiling.py
@@ -0,0 +1,71 @@
+"""
+Temporary profiling utilities for comparing implementations.
+
+Usage in a management command or shell::
+
+    from documents.profiling import profile_block
+
+    with profile_block("new check_sanity"):
+        messages = check_sanity()
+
+    with profile_block("old check_sanity"):
+        messages = check_sanity_old()
+
+Drop this file when done.
+"""
+
+from __future__ import annotations
+
+import tracemalloc
+from contextlib import contextmanager
+from time import perf_counter
+from typing import TYPE_CHECKING
+
+from django.db import connection
+from django.db import reset_queries
+from django.test.utils import override_settings
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+@contextmanager
+def profile_block(label: str = "block") -> Generator[None, None, None]:
+    """Profile memory, wall time, and DB queries for a code block.
+
+    Prints a summary to stdout on exit. Requires no external packages.
+    Enables DEBUG temporarily to capture Django's query log.
+    """
+    tracemalloc.start()
+    snapshot_before = tracemalloc.take_snapshot()
+
+    with override_settings(DEBUG=True):
+        reset_queries()
+        start = perf_counter()
+
+        yield
+
+        elapsed = perf_counter() - start
+        queries = list(connection.queries)
+
+    snapshot_after = tracemalloc.take_snapshot()
+    _, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    # Compare snapshots for top allocations
+    stats = snapshot_after.compare_to(snapshot_before, "lineno")
+
+    query_time = sum(float(q["time"]) for q in queries)
+    mem_diff = sum(s.size_diff for s in stats)
+
+    print(f"\n{'=' * 60}")  # noqa: T201
+    print(f"  Profile: {label}")  # noqa: T201
+    print(f"{'=' * 60}")  # noqa: T201
+    print(f"  Wall time:    {elapsed:.4f}s")  # noqa: T201
+    print(f"  Queries:      {len(queries)} ({query_time:.4f}s)")  # noqa: T201
+    print(f"  Memory delta: {mem_diff / 1024:.1f} KiB")  # noqa: T201
+    print(f"  Peak memory:  {peak / 1024:.1f} KiB")  # noqa: T201
+    print("\n  Top 5 allocations:")  # noqa: T201
+    for stat in stats[:5]:
+        print(f"    {stat}")  # noqa: T201
+    print(f"{'=' * 60}\n")  # noqa: T201