mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-10 03:01:23 +00:00
Compare commits
2 Commits
dev
...
feature-di
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6b0f3e93cd | ||
|
|
882ae7a3d8 |
@@ -1,8 +1,9 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import zipfile
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
@@ -93,6 +94,8 @@ class StreamingManifestWriter:
|
|||||||
*,
|
*,
|
||||||
compare_json: bool = False,
|
compare_json: bool = False,
|
||||||
files_in_export_dir: "set[Path] | None" = None,
|
files_in_export_dir: "set[Path] | None" = None,
|
||||||
|
zip_file: "zipfile.ZipFile | None" = None,
|
||||||
|
zip_arcname: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._path = path.resolve()
|
self._path = path.resolve()
|
||||||
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
|
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
|
||||||
@@ -100,12 +103,20 @@ class StreamingManifestWriter:
|
|||||||
self._files_in_export_dir: set[Path] = (
|
self._files_in_export_dir: set[Path] = (
|
||||||
files_in_export_dir if files_in_export_dir is not None else set()
|
files_in_export_dir if files_in_export_dir is not None else set()
|
||||||
)
|
)
|
||||||
|
self._zip_file = zip_file
|
||||||
|
self._zip_arcname = zip_arcname
|
||||||
|
self._zip_mode = zip_file is not None
|
||||||
self._file = None
|
self._file = None
|
||||||
self._first = True
|
self._first = True
|
||||||
|
|
||||||
def open(self) -> None:
|
def open(self) -> None:
|
||||||
self._path.parent.mkdir(parents=True, exist_ok=True)
|
if self._zip_mode:
|
||||||
self._file = self._tmp_path.open("w", encoding="utf-8")
|
# zipfile only allows one open write handle at a time, so buffer
|
||||||
|
# the manifest in memory and write it atomically on close()
|
||||||
|
self._file = io.StringIO()
|
||||||
|
else:
|
||||||
|
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._file = self._tmp_path.open("w", encoding="utf-8")
|
||||||
self._file.write("[")
|
self._file.write("[")
|
||||||
self._first = True
|
self._first = True
|
||||||
|
|
||||||
@@ -126,15 +137,18 @@ class StreamingManifestWriter:
|
|||||||
if self._file is None:
|
if self._file is None:
|
||||||
return
|
return
|
||||||
self._file.write("\n]")
|
self._file.write("\n]")
|
||||||
|
if self._zip_mode:
|
||||||
|
self._zip_file.writestr(self._zip_arcname, self._file.getvalue())
|
||||||
self._file.close()
|
self._file.close()
|
||||||
self._file = None
|
self._file = None
|
||||||
self._finalize()
|
if not self._zip_mode:
|
||||||
|
self._finalize()
|
||||||
|
|
||||||
def discard(self) -> None:
|
def discard(self) -> None:
|
||||||
if self._file is not None:
|
if self._file is not None:
|
||||||
self._file.close()
|
self._file.close()
|
||||||
self._file = None
|
self._file = None
|
||||||
if self._tmp_path.exists():
|
if not self._zip_mode and self._tmp_path.exists():
|
||||||
self._tmp_path.unlink()
|
self._tmp_path.unlink()
|
||||||
|
|
||||||
def _finalize(self) -> None:
|
def _finalize(self) -> None:
|
||||||
@@ -311,18 +325,13 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
|
|
||||||
self.files_in_export_dir: set[Path] = set()
|
self.files_in_export_dir: set[Path] = set()
|
||||||
self.exported_files: set[str] = set()
|
self.exported_files: set[str] = set()
|
||||||
|
self.zip_file: zipfile.ZipFile | None = None
|
||||||
|
self._zip_dirs: set[str] = set()
|
||||||
|
|
||||||
# If zipping, save the original target for later and
|
|
||||||
# get a temporary directory for the target instead
|
|
||||||
temp_dir = None
|
|
||||||
self.original_target = self.target
|
|
||||||
if self.zip_export:
|
if self.zip_export:
|
||||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
zip_name = options["zip_name"]
|
||||||
temp_dir = tempfile.TemporaryDirectory(
|
self.zip_path = (self.target / zip_name).with_suffix(".zip")
|
||||||
dir=settings.SCRATCH_DIR,
|
self.zip_tmp_path = self.zip_path.parent / (self.zip_path.name + ".tmp")
|
||||||
prefix="paperless-export",
|
|
||||||
)
|
|
||||||
self.target = Path(temp_dir.name).resolve()
|
|
||||||
|
|
||||||
if not self.target.exists():
|
if not self.target.exists():
|
||||||
raise CommandError("That path doesn't exist")
|
raise CommandError("That path doesn't exist")
|
||||||
@@ -333,30 +342,53 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
if not os.access(self.target, os.W_OK):
|
if not os.access(self.target, os.W_OK):
|
||||||
raise CommandError("That path doesn't appear to be writable")
|
raise CommandError("That path doesn't appear to be writable")
|
||||||
|
|
||||||
|
if self.zip_export:
|
||||||
|
if self.compare_checksums:
|
||||||
|
self.stdout.write(
|
||||||
|
self.style.WARNING(
|
||||||
|
"--compare-checksums is ignored when --zip is used",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
if self.compare_json:
|
||||||
|
self.stdout.write(
|
||||||
|
self.style.WARNING(
|
||||||
|
"--compare-json is ignored when --zip is used",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Prevent any ongoing changes in the documents
|
# Prevent any ongoing changes in the documents
|
||||||
with FileLock(settings.MEDIA_LOCK):
|
with FileLock(settings.MEDIA_LOCK):
|
||||||
self.dump()
|
if self.zip_export:
|
||||||
|
self.zip_file = zipfile.ZipFile(
|
||||||
# We've written everything to the temporary directory in this case,
|
self.zip_tmp_path,
|
||||||
# now make an archive in the original target, with all files stored
|
"w",
|
||||||
if self.zip_export and temp_dir is not None:
|
compression=zipfile.ZIP_DEFLATED,
|
||||||
shutil.make_archive(
|
allowZip64=True,
|
||||||
self.original_target / options["zip_name"],
|
|
||||||
format="zip",
|
|
||||||
root_dir=temp_dir.name,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.dump()
|
||||||
|
|
||||||
|
if self.zip_export and self.zip_file is not None:
|
||||||
|
self.zip_file.close()
|
||||||
|
self.zip_file = None
|
||||||
|
self.zip_tmp_path.rename(self.zip_path)
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
# Always cleanup the temporary directory, if one was created
|
# Ensure zip_file is closed and the incomplete .tmp is removed on failure
|
||||||
if self.zip_export and temp_dir is not None:
|
if self.zip_file is not None:
|
||||||
temp_dir.cleanup()
|
self.zip_file.close()
|
||||||
|
self.zip_file = None
|
||||||
|
if self.zip_export and self.zip_tmp_path.exists():
|
||||||
|
self.zip_tmp_path.unlink()
|
||||||
|
|
||||||
def dump(self) -> None:
|
def dump(self) -> None:
|
||||||
# 1. Take a snapshot of what files exist in the current export folder
|
# 1. Take a snapshot of what files exist in the current export folder
|
||||||
for x in self.target.glob("**/*"):
|
# (skipped in zip mode — always write fresh, no skip/compare logic applies)
|
||||||
if x.is_file():
|
if not self.zip_export:
|
||||||
self.files_in_export_dir.add(x.resolve())
|
for x in self.target.glob("**/*"):
|
||||||
|
if x.is_file():
|
||||||
|
self.files_in_export_dir.add(x.resolve())
|
||||||
|
|
||||||
# 2. Create manifest, containing all correspondents, types, tags, storage paths
|
# 2. Create manifest, containing all correspondents, types, tags, storage paths
|
||||||
# note, documents and ui_settings
|
# note, documents and ui_settings
|
||||||
@@ -414,6 +446,8 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
manifest_path,
|
manifest_path,
|
||||||
compare_json=self.compare_json,
|
compare_json=self.compare_json,
|
||||||
files_in_export_dir=self.files_in_export_dir,
|
files_in_export_dir=self.files_in_export_dir,
|
||||||
|
zip_file=self.zip_file,
|
||||||
|
zip_arcname="manifest.json",
|
||||||
) as writer:
|
) as writer:
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
for key, qs in manifest_key_to_object_query.items():
|
for key, qs in manifest_key_to_object_query.items():
|
||||||
@@ -504,8 +538,12 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
self.target,
|
self.target,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# 5. Remove anything in the original location (before moving the zip)
|
# 5. Remove pre-existing files/dirs from target, keeping the
|
||||||
for item in self.original_target.glob("*"):
|
# in-progress zip (.tmp) and any prior zip at the final path
|
||||||
|
skip = {self.zip_path.resolve(), self.zip_tmp_path.resolve()}
|
||||||
|
for item in self.target.glob("*"):
|
||||||
|
if item.resolve() in skip:
|
||||||
|
continue
|
||||||
if item.is_dir():
|
if item.is_dir():
|
||||||
shutil.rmtree(item)
|
shutil.rmtree(item)
|
||||||
else:
|
else:
|
||||||
@@ -630,9 +668,23 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
if self.use_folder_prefix:
|
if self.use_folder_prefix:
|
||||||
manifest_name = Path("json") / manifest_name
|
manifest_name = Path("json") / manifest_name
|
||||||
manifest_name = (self.target / manifest_name).resolve()
|
manifest_name = (self.target / manifest_name).resolve()
|
||||||
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
if not self.zip_export:
|
||||||
|
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
||||||
self.check_and_write_json(content, manifest_name)
|
self.check_and_write_json(content, manifest_name)
|
||||||
|
|
||||||
|
def _ensure_zip_dirs(self, arcname: str) -> None:
    """Write directory marker entries for all parent directories of arcname.

    Some zip viewers only show folder structure when explicit directory
    entries exist, so each ancestor directory of ``arcname`` gets one entry
    (for example ``a/`` and ``a/b/`` for ``a/b/c.txt``). Directories already
    present in ``self._zip_dirs`` are skipped so no entry is written twice.

    Args:
        arcname: Archive-relative name of the file about to be written.
            A name without any parent directory results in no entries.
    """
    dir_arc = ""
    for part in Path(arcname).parts[:-1]:
        # Grow the prefix one component at a time: "a/", then "a/b/", ...
        dir_arc += f"{part}/"
        if dir_arc in self._zip_dirs:
            continue
        if hasattr(self.zip_file, "mkdir"):
            self.zip_file.mkdir(dir_arc)
        else:
            # ZipFile.mkdir() was only added in Python 3.11; a name with a
            # trailing slash passed to writestr() is stored as a directory.
            self.zip_file.writestr(dir_arc, b"")
        # Record the directory only after the entry was actually written,
        # so a failed write is not remembered as done.
        self._zip_dirs.add(dir_arc)
|
||||||
|
|
||||||
def check_and_write_json(
|
def check_and_write_json(
|
||||||
self,
|
self,
|
||||||
content: list[dict] | dict,
|
content: list[dict] | dict,
|
||||||
@@ -645,6 +697,20 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
This preserves the file timestamps when no changes are made.
|
This preserves the file timestamps when no changes are made.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if self.zip_export:
|
||||||
|
arcname = str(target.resolve().relative_to(self.target))
|
||||||
|
self._ensure_zip_dirs(arcname)
|
||||||
|
self.zip_file.writestr(
|
||||||
|
arcname,
|
||||||
|
json.dumps(
|
||||||
|
content,
|
||||||
|
cls=DjangoJSONEncoder,
|
||||||
|
indent=2,
|
||||||
|
ensure_ascii=False,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
target = target.resolve()
|
target = target.resolve()
|
||||||
perform_write = True
|
perform_write = True
|
||||||
if target in self.files_in_export_dir:
|
if target in self.files_in_export_dir:
|
||||||
@@ -683,6 +749,12 @@ class Command(CryptMixin, PaperlessCommand):
|
|||||||
the source attributes
|
the source attributes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if self.zip_export:
|
||||||
|
arcname = str(target.resolve().relative_to(self.target))
|
||||||
|
self._ensure_zip_dirs(arcname)
|
||||||
|
self.zip_file.write(source, arcname=arcname)
|
||||||
|
return
|
||||||
|
|
||||||
target = target.resolve()
|
target = target.resolve()
|
||||||
if target in self.files_in_export_dir:
|
if target in self.files_in_export_dir:
|
||||||
self.files_in_export_dir.remove(target)
|
self.files_in_export_dir.remove(target)
|
||||||
|
|||||||
@@ -505,7 +505,7 @@ class TestExportImport(
|
|||||||
self.assertIsFile(expected_file)
|
self.assertIsFile(expected_file)
|
||||||
|
|
||||||
with ZipFile(expected_file) as zip:
|
with ZipFile(expected_file) as zip:
|
||||||
# Extras are from the directories, which also appear in the listing
|
# 11 files + 3 directory marker entries for the subdirectory structure
|
||||||
self.assertEqual(len(zip.namelist()), 14)
|
self.assertEqual(len(zip.namelist()), 14)
|
||||||
self.assertIn("manifest.json", zip.namelist())
|
self.assertIn("manifest.json", zip.namelist())
|
||||||
self.assertIn("metadata.json", zip.namelist())
|
self.assertIn("metadata.json", zip.namelist())
|
||||||
@@ -557,6 +557,59 @@ class TestExportImport(
|
|||||||
self.assertIn("manifest.json", zip.namelist())
|
self.assertIn("manifest.json", zip.namelist())
|
||||||
self.assertIn("metadata.json", zip.namelist())
|
self.assertIn("metadata.json", zip.namelist())
|
||||||
|
|
||||||
|
def test_export_zip_atomic_on_failure(self) -> None:
    """
    GIVEN:
        - A zip export has been requested
    WHEN:
        - The dump step raises an exception part-way through
    THEN:
        - The final .zip path is never created
        - The in-progress .tmp archive is removed again
    """
    failing_dump = mock.patch.object(
        document_exporter.Command,
        "dump",
        side_effect=RuntimeError("simulated failure"),
    )

    # The patch is entered first and left last, same as the nested form
    with failing_dump, self.assertRaises(RuntimeError):
        call_command("document_exporter", self.target, "--zip")

    expected_zip = self.target / f"export-{timezone.localdate().isoformat()}.zip"
    expected_tmp = self.target / f"export-{timezone.localdate().isoformat()}.zip.tmp"

    for leftover in (expected_zip, expected_tmp):
        self.assertIsNotFile(leftover)
|
||||||
|
|
||||||
|
def test_export_zip_no_scratch_dir(self) -> None:
    """
    GIVEN:
        - A zip export has been requested
    WHEN:
        - The export runs to completion
    THEN:
        - SCRATCH_DIR holds the same "paperless-export*" entries afterwards
          as before (the old workaround staged the export in a temp dir there)
    """
    from django.conf import settings

    # Replace the media documents with the bundled sample documents
    media_documents = Path(self.dirs.media_dir) / "documents"
    sample_documents = Path(__file__).parent / "samples" / "documents"
    shutil.rmtree(media_documents)
    shutil.copytree(sample_documents, media_documents)

    def scratch_entries() -> set:
        return set(settings.SCRATCH_DIR.glob("paperless-export*"))

    scratch_before = scratch_entries()

    call_command("document_exporter", self.target, "--zip")

    scratch_after = scratch_entries()
    self.assertEqual(scratch_before, scratch_after)
|
||||||
|
|
||||||
def test_export_target_not_exists(self) -> None:
|
def test_export_target_not_exists(self) -> None:
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
|
|||||||
Reference in New Issue
Block a user