mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-09 18:51:25 +00:00
Compare commits
2 Commits
drop-api-v
...
feature-di
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6b0f3e93cd | ||
|
|
882ae7a3d8 |
@@ -1,8 +1,9 @@
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import zipfile
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -93,6 +94,8 @@ class StreamingManifestWriter:
|
||||
*,
|
||||
compare_json: bool = False,
|
||||
files_in_export_dir: "set[Path] | None" = None,
|
||||
zip_file: "zipfile.ZipFile | None" = None,
|
||||
zip_arcname: str | None = None,
|
||||
) -> None:
|
||||
self._path = path.resolve()
|
||||
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
|
||||
@@ -100,12 +103,20 @@ class StreamingManifestWriter:
|
||||
self._files_in_export_dir: set[Path] = (
|
||||
files_in_export_dir if files_in_export_dir is not None else set()
|
||||
)
|
||||
self._zip_file = zip_file
|
||||
self._zip_arcname = zip_arcname
|
||||
self._zip_mode = zip_file is not None
|
||||
self._file = None
|
||||
self._first = True
|
||||
|
||||
def open(self) -> None:
|
||||
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||
self._file = self._tmp_path.open("w", encoding="utf-8")
|
||||
if self._zip_mode:
|
||||
# zipfile only allows one open write handle at a time, so buffer
|
||||
# the manifest in memory and write it atomically on close()
|
||||
self._file = io.StringIO()
|
||||
else:
|
||||
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||
self._file = self._tmp_path.open("w", encoding="utf-8")
|
||||
self._file.write("[")
|
||||
self._first = True
|
||||
|
||||
@@ -126,15 +137,18 @@ class StreamingManifestWriter:
|
||||
if self._file is None:
|
||||
return
|
||||
self._file.write("\n]")
|
||||
if self._zip_mode:
|
||||
self._zip_file.writestr(self._zip_arcname, self._file.getvalue())
|
||||
self._file.close()
|
||||
self._file = None
|
||||
self._finalize()
|
||||
if not self._zip_mode:
|
||||
self._finalize()
|
||||
|
||||
def discard(self) -> None:
|
||||
if self._file is not None:
|
||||
self._file.close()
|
||||
self._file = None
|
||||
if self._tmp_path.exists():
|
||||
if not self._zip_mode and self._tmp_path.exists():
|
||||
self._tmp_path.unlink()
|
||||
|
||||
def _finalize(self) -> None:
|
||||
@@ -311,18 +325,13 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
|
||||
self.files_in_export_dir: set[Path] = set()
|
||||
self.exported_files: set[str] = set()
|
||||
self.zip_file: zipfile.ZipFile | None = None
|
||||
self._zip_dirs: set[str] = set()
|
||||
|
||||
# If zipping, save the original target for later and
|
||||
# get a temporary directory for the target instead
|
||||
temp_dir = None
|
||||
self.original_target = self.target
|
||||
if self.zip_export:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
temp_dir = tempfile.TemporaryDirectory(
|
||||
dir=settings.SCRATCH_DIR,
|
||||
prefix="paperless-export",
|
||||
)
|
||||
self.target = Path(temp_dir.name).resolve()
|
||||
zip_name = options["zip_name"]
|
||||
self.zip_path = (self.target / zip_name).with_suffix(".zip")
|
||||
self.zip_tmp_path = self.zip_path.parent / (self.zip_path.name + ".tmp")
|
||||
|
||||
if not self.target.exists():
|
||||
raise CommandError("That path doesn't exist")
|
||||
@@ -333,30 +342,53 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
if not os.access(self.target, os.W_OK):
|
||||
raise CommandError("That path doesn't appear to be writable")
|
||||
|
||||
if self.zip_export:
|
||||
if self.compare_checksums:
|
||||
self.stdout.write(
|
||||
self.style.WARNING(
|
||||
"--compare-checksums is ignored when --zip is used",
|
||||
),
|
||||
)
|
||||
if self.compare_json:
|
||||
self.stdout.write(
|
||||
self.style.WARNING(
|
||||
"--compare-json is ignored when --zip is used",
|
||||
),
|
||||
)
|
||||
|
||||
try:
|
||||
# Prevent any ongoing changes in the documents
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
self.dump()
|
||||
|
||||
# We've written everything to the temporary directory in this case,
|
||||
# now make an archive in the original target, with all files stored
|
||||
if self.zip_export and temp_dir is not None:
|
||||
shutil.make_archive(
|
||||
self.original_target / options["zip_name"],
|
||||
format="zip",
|
||||
root_dir=temp_dir.name,
|
||||
if self.zip_export:
|
||||
self.zip_file = zipfile.ZipFile(
|
||||
self.zip_tmp_path,
|
||||
"w",
|
||||
compression=zipfile.ZIP_DEFLATED,
|
||||
allowZip64=True,
|
||||
)
|
||||
|
||||
self.dump()
|
||||
|
||||
if self.zip_export and self.zip_file is not None:
|
||||
self.zip_file.close()
|
||||
self.zip_file = None
|
||||
self.zip_tmp_path.rename(self.zip_path)
|
||||
|
||||
finally:
|
||||
# Always cleanup the temporary directory, if one was created
|
||||
if self.zip_export and temp_dir is not None:
|
||||
temp_dir.cleanup()
|
||||
# Ensure zip_file is closed and the incomplete .tmp is removed on failure
|
||||
if self.zip_file is not None:
|
||||
self.zip_file.close()
|
||||
self.zip_file = None
|
||||
if self.zip_export and self.zip_tmp_path.exists():
|
||||
self.zip_tmp_path.unlink()
|
||||
|
||||
def dump(self) -> None:
|
||||
# 1. Take a snapshot of what files exist in the current export folder
|
||||
for x in self.target.glob("**/*"):
|
||||
if x.is_file():
|
||||
self.files_in_export_dir.add(x.resolve())
|
||||
# (skipped in zip mode — always write fresh, no skip/compare logic applies)
|
||||
if not self.zip_export:
|
||||
for x in self.target.glob("**/*"):
|
||||
if x.is_file():
|
||||
self.files_in_export_dir.add(x.resolve())
|
||||
|
||||
# 2. Create manifest, containing all correspondents, types, tags, storage paths
|
||||
# note, documents and ui_settings
|
||||
@@ -414,6 +446,8 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
manifest_path,
|
||||
compare_json=self.compare_json,
|
||||
files_in_export_dir=self.files_in_export_dir,
|
||||
zip_file=self.zip_file,
|
||||
zip_arcname="manifest.json",
|
||||
) as writer:
|
||||
with transaction.atomic():
|
||||
for key, qs in manifest_key_to_object_query.items():
|
||||
@@ -504,8 +538,12 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
self.target,
|
||||
)
|
||||
else:
|
||||
# 5. Remove anything in the original location (before moving the zip)
|
||||
for item in self.original_target.glob("*"):
|
||||
# 5. Remove pre-existing files/dirs from target, keeping the
|
||||
# in-progress zip (.tmp) and any prior zip at the final path
|
||||
skip = {self.zip_path.resolve(), self.zip_tmp_path.resolve()}
|
||||
for item in self.target.glob("*"):
|
||||
if item.resolve() in skip:
|
||||
continue
|
||||
if item.is_dir():
|
||||
shutil.rmtree(item)
|
||||
else:
|
||||
@@ -630,9 +668,23 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
if self.use_folder_prefix:
|
||||
manifest_name = Path("json") / manifest_name
|
||||
manifest_name = (self.target / manifest_name).resolve()
|
||||
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
||||
if not self.zip_export:
|
||||
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
||||
self.check_and_write_json(content, manifest_name)
|
||||
|
||||
def _ensure_zip_dirs(self, arcname: str) -> None:
|
||||
"""Write directory marker entries for all parent directories of arcname.
|
||||
|
||||
Some zip viewers only show folder structure when explicit directory
|
||||
entries exist, so we add them to avoid confusing users.
|
||||
"""
|
||||
parts = Path(arcname).parts[:-1]
|
||||
for i in range(len(parts)):
|
||||
dir_arc = "/".join(parts[: i + 1]) + "/"
|
||||
if dir_arc not in self._zip_dirs:
|
||||
self._zip_dirs.add(dir_arc)
|
||||
self.zip_file.mkdir(dir_arc)
|
||||
|
||||
def check_and_write_json(
|
||||
self,
|
||||
content: list[dict] | dict,
|
||||
@@ -645,6 +697,20 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
This preserves the file timestamps when no changes are made.
|
||||
"""
|
||||
|
||||
if self.zip_export:
|
||||
arcname = str(target.resolve().relative_to(self.target))
|
||||
self._ensure_zip_dirs(arcname)
|
||||
self.zip_file.writestr(
|
||||
arcname,
|
||||
json.dumps(
|
||||
content,
|
||||
cls=DjangoJSONEncoder,
|
||||
indent=2,
|
||||
ensure_ascii=False,
|
||||
),
|
||||
)
|
||||
return
|
||||
|
||||
target = target.resolve()
|
||||
perform_write = True
|
||||
if target in self.files_in_export_dir:
|
||||
@@ -683,6 +749,12 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
the source attributes
|
||||
"""
|
||||
|
||||
if self.zip_export:
|
||||
arcname = str(target.resolve().relative_to(self.target))
|
||||
self._ensure_zip_dirs(arcname)
|
||||
self.zip_file.write(source, arcname=arcname)
|
||||
return
|
||||
|
||||
target = target.resolve()
|
||||
if target in self.files_in_export_dir:
|
||||
self.files_in_export_dir.remove(target)
|
||||
|
||||
@@ -505,7 +505,7 @@ class TestExportImport(
|
||||
self.assertIsFile(expected_file)
|
||||
|
||||
with ZipFile(expected_file) as zip:
|
||||
# Extras are from the directories, which also appear in the listing
|
||||
# 11 files + 3 directory marker entries for the subdirectory structure
|
||||
self.assertEqual(len(zip.namelist()), 14)
|
||||
self.assertIn("manifest.json", zip.namelist())
|
||||
self.assertIn("metadata.json", zip.namelist())
|
||||
@@ -557,6 +557,59 @@ class TestExportImport(
|
||||
self.assertIn("manifest.json", zip.namelist())
|
||||
self.assertIn("metadata.json", zip.namelist())
|
||||
|
||||
def test_export_zip_atomic_on_failure(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Request to export documents to zipfile
|
||||
WHEN:
|
||||
- Export raises an exception mid-way
|
||||
THEN:
|
||||
- No .zip file is written at the final path
|
||||
- The .tmp file is cleaned up
|
||||
"""
|
||||
args = ["document_exporter", self.target, "--zip"]
|
||||
|
||||
with mock.patch.object(
|
||||
document_exporter.Command,
|
||||
"dump",
|
||||
side_effect=RuntimeError("simulated failure"),
|
||||
):
|
||||
with self.assertRaises(RuntimeError):
|
||||
call_command(*args)
|
||||
|
||||
expected_zip = self.target / f"export-{timezone.localdate().isoformat()}.zip"
|
||||
expected_tmp = (
|
||||
self.target / f"export-{timezone.localdate().isoformat()}.zip.tmp"
|
||||
)
|
||||
self.assertIsNotFile(expected_zip)
|
||||
self.assertIsNotFile(expected_tmp)
|
||||
|
||||
def test_export_zip_no_scratch_dir(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Request to export documents to zipfile
|
||||
WHEN:
|
||||
- Documents are exported
|
||||
THEN:
|
||||
- No files are written under SCRATCH_DIR during the export
|
||||
(the old workaround used a temp dir there)
|
||||
"""
|
||||
from django.conf import settings
|
||||
|
||||
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
|
||||
shutil.copytree(
|
||||
Path(__file__).parent / "samples" / "documents",
|
||||
Path(self.dirs.media_dir) / "documents",
|
||||
)
|
||||
|
||||
scratch_before = set(settings.SCRATCH_DIR.glob("paperless-export*"))
|
||||
|
||||
args = ["document_exporter", self.target, "--zip"]
|
||||
call_command(*args)
|
||||
|
||||
scratch_after = set(settings.SCRATCH_DIR.glob("paperless-export*"))
|
||||
self.assertEqual(scratch_before, scratch_after)
|
||||
|
||||
def test_export_target_not_exists(self) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
|
||||
Reference in New Issue
Block a user