Refactor: Write zip exports directly into ZipFile instead of temp dir

Replace the temp-dir + shutil.make_archive() workaround with direct
zipfile.ZipFile writes. Document files are added via zf.write() and
JSON manifests via zf.writestr()/StringIO buffering, eliminating the
double-I/O and 2x disk usage of the previous approach.

Key changes:
- Removed tempfile.TemporaryDirectory and shutil.make_archive() from handle()
- ZipFile opened on a .tmp path; renamed to final .zip atomically on success;
  .tmp cleaned up on failure
- StreamingManifestWriter: zip mode buffers manifest in io.StringIO and
  writes to zip atomically on close() (zipfile allows only one open write
  handle at a time)
- check_and_copy(): zip mode calls zf.write(source, arcname=...) directly
- check_and_write_json(): zip mode calls zf.writestr(arcname, ...) directly
- files_in_export_dir scan skipped in zip mode (always fresh write)
- --compare-checksums and --compare-json emit warnings when used with --zip
- --delete in zip mode removes pre-existing files from target dir, skipping
  the in-progress .tmp and any prior .zip
- Added tests: atomicity on failure, no SCRATCH_DIR usage

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-09 09:13:12 -07:00
parent fdd5e3ecb2
commit f875863fe0
2 changed files with 144 additions and 35 deletions

View File

@@ -1,8 +1,9 @@
import hashlib
import io
import json
import os
import shutil
import tempfile
import zipfile
from itertools import islice
from pathlib import Path
from typing import TYPE_CHECKING
@@ -97,6 +98,8 @@ class StreamingManifestWriter:
*,
compare_json: bool = False,
files_in_export_dir: "set[Path] | None" = None,
zip_file: "zipfile.ZipFile | None" = None,
zip_arcname: str | None = None,
) -> None:
self._path = path.resolve()
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
@@ -104,12 +107,20 @@ class StreamingManifestWriter:
self._files_in_export_dir: set[Path] = (
files_in_export_dir if files_in_export_dir is not None else set()
)
self._zip_file = zip_file
self._zip_arcname = zip_arcname
self._zip_mode = zip_file is not None
self._file = None
self._first = True
def open(self) -> None:
self._path.parent.mkdir(parents=True, exist_ok=True)
self._file = self._tmp_path.open("w", encoding="utf-8")
if self._zip_mode:
# zipfile only allows one open write handle at a time, so buffer
# the manifest in memory and write it atomically on close()
self._file = io.StringIO()
else:
self._path.parent.mkdir(parents=True, exist_ok=True)
self._file = self._tmp_path.open("w", encoding="utf-8")
self._file.write("[")
self._first = True
@@ -130,15 +141,18 @@ class StreamingManifestWriter:
if self._file is None:
return
self._file.write("\n]")
if self._zip_mode:
self._zip_file.writestr(self._zip_arcname, self._file.getvalue())
self._file.close()
self._file = None
self._finalize()
if not self._zip_mode:
self._finalize()
def discard(self) -> None:
if self._file is not None:
self._file.close()
self._file = None
if self._tmp_path.exists():
if not self._zip_mode and self._tmp_path.exists():
self._tmp_path.unlink()
def _finalize(self) -> None:
@@ -315,18 +329,12 @@ class Command(CryptMixin, PaperlessCommand):
self.files_in_export_dir: set[Path] = set()
self.exported_files: set[str] = set()
self.zip_file: zipfile.ZipFile | None = None
# If zipping, save the original target for later and
# get a temporary directory for the target instead
temp_dir = None
self.original_target = self.target
if self.zip_export:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
temp_dir = tempfile.TemporaryDirectory(
dir=settings.SCRATCH_DIR,
prefix="paperless-export",
)
self.target = Path(temp_dir.name).resolve()
zip_name = options["zip_name"]
self.zip_path = (self.target / zip_name).with_suffix(".zip")
self.zip_tmp_path = self.zip_path.parent / (self.zip_path.name + ".tmp")
if not self.target.exists():
raise CommandError("That path doesn't exist")
@@ -337,30 +345,53 @@ class Command(CryptMixin, PaperlessCommand):
if not os.access(self.target, os.W_OK):
raise CommandError("That path doesn't appear to be writable")
if self.zip_export:
if self.compare_checksums:
self.stdout.write(
self.style.WARNING(
"--compare-checksums is ignored when --zip is used",
),
)
if self.compare_json:
self.stdout.write(
self.style.WARNING(
"--compare-json is ignored when --zip is used",
),
)
try:
# Prevent any ongoing changes in the documents
with FileLock(settings.MEDIA_LOCK):
self.dump()
# We've written everything to the temporary directory in this case,
# now make an archive in the original target, with all files stored
if self.zip_export and temp_dir is not None:
shutil.make_archive(
self.original_target / options["zip_name"],
format="zip",
root_dir=temp_dir.name,
if self.zip_export:
self.zip_file = zipfile.ZipFile(
self.zip_tmp_path,
"w",
compression=zipfile.ZIP_DEFLATED,
allowZip64=True,
)
self.dump()
if self.zip_export and self.zip_file is not None:
self.zip_file.close()
self.zip_file = None
self.zip_tmp_path.rename(self.zip_path)
finally:
# Always cleanup the temporary directory, if one was created
if self.zip_export and temp_dir is not None:
temp_dir.cleanup()
# Ensure zip_file is closed and the incomplete .tmp is removed on failure
if self.zip_file is not None:
self.zip_file.close()
self.zip_file = None
if self.zip_export and self.zip_tmp_path.exists():
self.zip_tmp_path.unlink()
def dump(self) -> None:
# 1. Take a snapshot of what files exist in the current export folder
for x in self.target.glob("**/*"):
if x.is_file():
self.files_in_export_dir.add(x.resolve())
# (skipped in zip mode — always write fresh, no skip/compare logic applies)
if not self.zip_export:
for x in self.target.glob("**/*"):
if x.is_file():
self.files_in_export_dir.add(x.resolve())
# 2. Create manifest, containing all correspondents, types, tags, storage paths
# note, documents and ui_settings
@@ -421,6 +452,8 @@ class Command(CryptMixin, PaperlessCommand):
manifest_path,
compare_json=self.compare_json,
files_in_export_dir=self.files_in_export_dir,
zip_file=self.zip_file,
zip_arcname="manifest.json",
) as writer:
with transaction.atomic():
for key, qs in manifest_key_to_object_query.items():
@@ -539,8 +572,12 @@ class Command(CryptMixin, PaperlessCommand):
self.target,
)
else:
# 5. Remove anything in the original location (before moving the zip)
for item in self.original_target.glob("*"):
# 5. Remove pre-existing files/dirs from target, keeping the
# in-progress zip (.tmp) and any prior zip at the final path
skip = {self.zip_path.resolve(), self.zip_tmp_path.resolve()}
for item in self.target.glob("*"):
if item.resolve() in skip:
continue
if item.is_dir():
shutil.rmtree(item)
else:
@@ -710,7 +747,8 @@ class Command(CryptMixin, PaperlessCommand):
if self.use_folder_prefix:
manifest_name = Path("json") / manifest_name
manifest_name = (self.target / manifest_name).resolve()
manifest_name.parent.mkdir(parents=True, exist_ok=True)
if not self.zip_export:
manifest_name.parent.mkdir(parents=True, exist_ok=True)
self.check_and_write_json(content, manifest_name)
def check_and_write_json(
@@ -725,6 +763,19 @@ class Command(CryptMixin, PaperlessCommand):
This preserves the file timestamps when no changes are made.
"""
if self.zip_export:
arcname = str(target.resolve().relative_to(self.target))
self.zip_file.writestr(
arcname,
json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
),
)
return
target = target.resolve()
perform_write = True
if target in self.files_in_export_dir:
@@ -763,6 +814,11 @@ class Command(CryptMixin, PaperlessCommand):
the source attributes
"""
if self.zip_export:
arcname = str(target.resolve().relative_to(self.target))
self.zip_file.write(source, arcname=arcname)
return
target = target.resolve()
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)

View File

@@ -615,8 +615,8 @@ class TestExportImport(
self.assertIsFile(expected_file)
with ZipFile(expected_file) as zip:
# Extras are from the directories, which also appear in the listing
self.assertEqual(len(zip.namelist()), 14)
# Direct ZipFile writing doesn't add directory entries (unlike shutil.make_archive)
self.assertEqual(len(zip.namelist()), 11)
self.assertIn("manifest.json", zip.namelist())
self.assertIn("metadata.json", zip.namelist())
@@ -666,6 +666,59 @@ class TestExportImport(
self.assertIn("manifest.json", zip.namelist())
self.assertIn("metadata.json", zip.namelist())
def test_export_zip_atomic_on_failure(self) -> None:
"""
GIVEN:
- Request to export documents to zipfile
WHEN:
- Export raises an exception mid-way
THEN:
- No .zip file is written at the final path
- The .tmp file is cleaned up
"""
args = ["document_exporter", self.target, "--zip"]
with mock.patch.object(
document_exporter.Command,
"dump",
side_effect=RuntimeError("simulated failure"),
):
with self.assertRaises(RuntimeError):
call_command(*args)
expected_zip = self.target / f"export-{timezone.localdate().isoformat()}.zip"
expected_tmp = (
self.target / f"export-{timezone.localdate().isoformat()}.zip.tmp"
)
self.assertIsNotFile(expected_zip)
self.assertIsNotFile(expected_tmp)
def test_export_zip_no_scratch_dir(self) -> None:
"""
GIVEN:
- Request to export documents to zipfile
WHEN:
- Documents are exported
THEN:
- No files are written under SCRATCH_DIR during the export
(the old workaround used a temp dir there)
"""
from django.conf import settings
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
shutil.copytree(
Path(__file__).parent / "samples" / "documents",
Path(self.dirs.media_dir) / "documents",
)
scratch_before = set(settings.SCRATCH_DIR.glob("paperless-export*"))
args = ["document_exporter", self.target, "--zip"]
call_command(*args)
scratch_after = set(settings.SCRATCH_DIR.glob("paperless-export*"))
self.assertEqual(scratch_before, scratch_after)
def test_export_target_not_exists(self) -> None:
"""
GIVEN: