Compare commits

..

3 Commits

Author SHA1 Message Date
Trenton H
2eabdcb0ef Removes the profiling leftovers 2026-03-06 15:57:54 -08:00
Trenton H
ac8bab7ed0 Refactor: O(1) lookup table for CRYPT_FIELDS in per-record encryption
Add CRYPT_FIELDS_BY_MODEL to CryptMixin, derived from CRYPT_FIELDS at
class definition time. _encrypt_record_inline() now does a single dict
lookup instead of a linear scan per record, eliminating the loop and
break pattern.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 14:43:56 -08:00
Trenton H
3e961eff0e Perf: streaming manifest writer for document exporter (Phase 3)
Replaces the in-memory manifest dict accumulation with a
StreamingManifestWriter that writes records to manifest.json
incrementally, keeping only one batch resident in memory at a time.

Key changes:
- Add StreamingManifestWriter: writes to .tmp atomically, BLAKE2b
  compare for --compare-json, discard() on exception
- Add _encrypt_record_inline(): per-record encryption replacing the
  bulk encrypt_secret_fields() call; crypto setup moved before streaming
- Add _write_split_manifest(): extracted per-document manifest writing
- Refactor dump(): non-doc records streamed during transaction, documents
  accumulated then written after filenames are assigned
- Upgrade check_and_write_json() from MD5 to BLAKE2b
- Remove encrypt_secret_fields() and unused itertools.chain import
- Add profiling marker to pyproject.toml

Measured improvement (200 docs + 200 CustomFieldInstances, same
dump() code path, only writer differs):
- Peak memory: ~50% reduction
- Memory delta: ~70% reduction
- Wall time and query count: unchanged

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 14:41:48 -08:00
2 changed files with 198 additions and 114 deletions

View File

@@ -3,7 +3,6 @@ import json
import os import os
import shutil import shutil
import tempfile import tempfile
from itertools import chain
from itertools import islice from itertools import islice
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@@ -81,6 +80,88 @@ def serialize_queryset_batched(
yield serializers.serialize("python", chunk) yield serializers.serialize("python", chunk)
class StreamingManifestWriter:
"""Incrementally writes a JSON array to a file, one record at a time.
Writes to <target>.tmp first; on close(), optionally BLAKE2b-compares
with the existing file (--compare-json) and renames or discards accordingly.
On exception, discard() deletes the tmp file and leaves the original intact.
"""
def __init__(
self,
path: Path,
*,
compare_json: bool = False,
files_in_export_dir: "set[Path] | None" = None,
) -> None:
self._path = path.resolve()
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
self._compare_json = compare_json
self._files_in_export_dir: set[Path] = (
files_in_export_dir if files_in_export_dir is not None else set()
)
self._file = None
self._first = True
def open(self) -> None:
self._path.parent.mkdir(parents=True, exist_ok=True)
self._file = self._tmp_path.open("w", encoding="utf-8")
self._file.write("[")
self._first = True
def write_record(self, record: dict) -> None:
if not self._first:
self._file.write(",\n")
else:
self._first = False
self._file.write(
json.dumps(record, cls=DjangoJSONEncoder, indent=2, ensure_ascii=False),
)
def write_batch(self, records: list[dict]) -> None:
for record in records:
self.write_record(record)
def close(self) -> None:
if self._file is None:
return
self._file.write("\n]")
self._file.close()
self._file = None
self._finalize()
def discard(self) -> None:
if self._file is not None:
self._file.close()
self._file = None
if self._tmp_path.exists():
self._tmp_path.unlink()
def _finalize(self) -> None:
"""Compare with existing file (if --compare-json) then rename or discard tmp."""
if self._path in self._files_in_export_dir:
self._files_in_export_dir.remove(self._path)
if self._compare_json:
existing_hash = hashlib.blake2b(self._path.read_bytes()).hexdigest()
new_hash = hashlib.blake2b(self._tmp_path.read_bytes()).hexdigest()
if existing_hash == new_hash:
self._tmp_path.unlink()
return
self._tmp_path.rename(self._path)
def __enter__(self) -> "StreamingManifestWriter":
self.open()
return self
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
if exc_type is not None:
self.discard()
else:
self.close()
return False
class Command(CryptMixin, BaseCommand): class Command(CryptMixin, BaseCommand):
help = ( help = (
"Decrypt and rename all files in our collection into a given target " "Decrypt and rename all files in our collection into a given target "
@@ -322,95 +403,83 @@ class Command(CryptMixin, BaseCommand):
if settings.AUDIT_LOG_ENABLED: if settings.AUDIT_LOG_ENABLED:
manifest_key_to_object_query["log_entries"] = LogEntry.objects.all() manifest_key_to_object_query["log_entries"] = LogEntry.objects.all()
with transaction.atomic(): # Crypto setup before streaming begins
manifest_dict = {} if self.passphrase:
self.setup_crypto(passphrase=self.passphrase)
# Build an overall manifest elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
for key, object_query in manifest_key_to_object_query.items(): self.stdout.write(
manifest_dict[key] = list( self.style.NOTICE(
chain.from_iterable( "No passphrase was given, sensitive fields will be in plaintext",
serialize_queryset_batched( ),
object_query,
batch_size=self.batch_size,
),
),
)
self.encrypt_secret_fields(manifest_dict)
# These are treated specially and included in the per-document manifest
# if that setting is enabled. Otherwise, they are just exported to the bulk
# manifest
document_map: dict[int, Document] = {
d.pk: d for d in manifest_key_to_object_query["documents"]
}
document_manifest = manifest_dict["documents"]
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=self.no_progress_bar,
):
document = document_map[document_dict["pk"]]
# 3.1. generate a unique filename
base_name = self.generate_base_name(document)
# 3.2. write filenames into manifest
original_target, thumbnail_target, archive_target = (
self.generate_document_targets(document, base_name, document_dict)
) )
# 3.3. write files to target folder document_manifest: list[dict] = []
if not self.data_only:
self.copy_document_files(
document,
original_target,
thumbnail_target,
archive_target,
)
if self.split_manifest:
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
if self.use_folder_prefix:
manifest_name = Path("json") / manifest_name
manifest_name = (self.target / manifest_name).resolve()
manifest_name.parent.mkdir(parents=True, exist_ok=True)
content = [document_manifest[index]]
content += list(
filter(
lambda d: d["fields"]["document"] == document_dict["pk"],
manifest_dict["notes"],
),
)
content += list(
filter(
lambda d: d["fields"]["document"] == document_dict["pk"],
manifest_dict["custom_field_instances"],
),
)
self.check_and_write_json(
content,
manifest_name,
)
# These were exported already
if self.split_manifest:
del manifest_dict["documents"]
del manifest_dict["notes"]
del manifest_dict["custom_field_instances"]
# 4.1 write primary manifest to target folder
manifest = []
for key, item in manifest_dict.items():
manifest.extend(item)
manifest_path = (self.target / "manifest.json").resolve() manifest_path = (self.target / "manifest.json").resolve()
self.check_and_write_json(
manifest, with StreamingManifestWriter(
manifest_path, manifest_path,
) compare_json=self.compare_json,
files_in_export_dir=self.files_in_export_dir,
) as writer:
with transaction.atomic():
for key, qs in manifest_key_to_object_query.items():
if key == "documents":
# Accumulate for file-copy loop; written to manifest after
for batch in serialize_queryset_batched(
qs,
batch_size=self.batch_size,
):
for record in batch:
self._encrypt_record_inline(record)
document_manifest.extend(batch)
elif self.split_manifest and key in (
"notes",
"custom_field_instances",
):
# Written per-document in _write_split_manifest
pass
else:
for batch in serialize_queryset_batched(
qs,
batch_size=self.batch_size,
):
for record in batch:
self._encrypt_record_inline(record)
writer.write_batch(batch)
document_map: dict[int, Document] = {
d.pk: d for d in Document.objects.order_by("id")
}
# 3. Export files from each document
for document_dict in tqdm.tqdm(
document_manifest,
total=len(document_manifest),
disable=self.no_progress_bar,
):
document = document_map[document_dict["pk"]]
# 3.1. generate a unique filename
base_name = self.generate_base_name(document)
# 3.2. write filenames into manifest
original_target, thumbnail_target, archive_target = (
self.generate_document_targets(document, base_name, document_dict)
)
# 3.3. write files to target folder
if not self.data_only:
self.copy_document_files(
document,
original_target,
thumbnail_target,
archive_target,
)
if self.split_manifest:
self._write_split_manifest(document_dict, document, base_name)
else:
writer.write_record(document_dict)
# 4.2 write version information to target folder # 4.2 write version information to target folder
extra_metadata_path = (self.target / "metadata.json").resolve() extra_metadata_path = (self.target / "metadata.json").resolve()
@@ -532,6 +601,42 @@ class Command(CryptMixin, BaseCommand):
archive_target, archive_target,
) )
def _encrypt_record_inline(self, record: dict) -> None:
"""Encrypt sensitive fields in a single record, if passphrase is set."""
if not self.passphrase:
return
fields = self.CRYPT_FIELDS_BY_MODEL.get(record.get("model", ""))
if fields:
for field in fields:
if record["fields"].get(field):
record["fields"][field] = self.encrypt_string(
value=record["fields"][field],
)
def _write_split_manifest(
self,
document_dict: dict,
document: Document,
base_name: Path,
) -> None:
"""Write per-document manifest file for --split-manifest mode."""
content = [document_dict]
content.extend(
serializers.serialize("python", Note.objects.filter(document=document)),
)
content.extend(
serializers.serialize(
"python",
CustomFieldInstance.objects.filter(document=document),
),
)
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
if self.use_folder_prefix:
manifest_name = Path("json") / manifest_name
manifest_name = (self.target / manifest_name).resolve()
manifest_name.parent.mkdir(parents=True, exist_ok=True)
self.check_and_write_json(content, manifest_name)
def check_and_write_json( def check_and_write_json(
self, self,
content: list[dict] | dict, content: list[dict] | dict,
@@ -549,14 +654,14 @@ class Command(CryptMixin, BaseCommand):
if target in self.files_in_export_dir: if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target) self.files_in_export_dir.remove(target)
if self.compare_json: if self.compare_json:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest() target_checksum = hashlib.blake2b(target.read_bytes()).hexdigest()
src_str = json.dumps( src_str = json.dumps(
content, content,
cls=DjangoJSONEncoder, cls=DjangoJSONEncoder,
indent=2, indent=2,
ensure_ascii=False, ensure_ascii=False,
) )
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest() src_checksum = hashlib.blake2b(src_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum: if src_checksum == target_checksum:
perform_write = False perform_write = False
@@ -606,28 +711,3 @@ class Command(CryptMixin, BaseCommand):
if perform_copy: if perform_copy:
target.parent.mkdir(parents=True, exist_ok=True) target.parent.mkdir(parents=True, exist_ok=True)
copy_file_with_basic_stats(source, target) copy_file_with_basic_stats(source, target)
def encrypt_secret_fields(self, manifest: dict) -> None:
"""
Encrypts certain fields in the export. Currently limited to the mail account password
"""
if self.passphrase:
self.setup_crypto(passphrase=self.passphrase)
for crypt_config in self.CRYPT_FIELDS:
exporter_key = crypt_config["exporter_key"]
crypt_fields = crypt_config["fields"]
for manifest_record in manifest[exporter_key]:
for field in crypt_fields:
if manifest_record["fields"][field]:
manifest_record["fields"][field] = self.encrypt_string(
value=manifest_record["fields"][field],
)
elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
self.stdout.write(
self.style.NOTICE(
"No passphrase was given, sensitive fields will be in plaintext",
),
)

View File

@@ -71,7 +71,7 @@ class CryptMixin:
key_size = 32 key_size = 32
kdf_algorithm = "pbkdf2_sha256" kdf_algorithm = "pbkdf2_sha256"
CRYPT_FIELDS: CryptFields = [ CRYPT_FIELDS: list[CryptFields] = [
{ {
"exporter_key": "mail_accounts", "exporter_key": "mail_accounts",
"model_name": "paperless_mail.mailaccount", "model_name": "paperless_mail.mailaccount",
@@ -89,6 +89,10 @@ class CryptMixin:
], ],
}, },
] ]
# O(1) lookup for per-record encryption; derived from CRYPT_FIELDS at class definition time
CRYPT_FIELDS_BY_MODEL: dict[str, list[str]] = {
cfg["model_name"]: cfg["fields"] for cfg in CRYPT_FIELDS
}
def get_crypt_params(self) -> dict[str, dict[str, str | int]]: def get_crypt_params(self) -> dict[str, dict[str, str | int]]:
return { return {