Compare commits

..

1 Commits

5 changed files with 117 additions and 182 deletions

View File

@@ -241,3 +241,66 @@ For example:
}
}
```
## Consume Script Positional Arguments Removed
Pre- and post-consumption scripts no longer receive positional arguments. All information is
now passed exclusively via environment variables, which have been available since v1.8.0.
### Pre-consumption script
Previously, the original file path was passed as `$1`. It is now available only via the
`DOCUMENT_SOURCE_PATH` environment variable.
**Before:**
```bash
#!/usr/bin/env bash
# $1 was the original file path
process_document "$1"
```
**After:**
```bash
#!/usr/bin/env bash
process_document "${DOCUMENT_SOURCE_PATH}"
```
### Post-consumption script
Previously, document metadata was passed as positional arguments `$1` through `$8`:
| Argument | Environment Variable Equivalent |
| -------- | ------------------------------- |
| `$1` | `DOCUMENT_ID` |
| `$2` | `DOCUMENT_FILE_NAME` |
| `$3` | `DOCUMENT_SOURCE_PATH` |
| `$4` | `DOCUMENT_THUMBNAIL_PATH` |
| `$5` | `DOCUMENT_DOWNLOAD_URL` |
| `$6` | `DOCUMENT_THUMBNAIL_URL` |
| `$7` | `DOCUMENT_CORRESPONDENT` |
| `$8` | `DOCUMENT_TAGS` |
**Before:**
```bash
#!/usr/bin/env bash
DOCUMENT_ID=$1
CORRESPONDENT=$7
TAGS=$8
```
**After:**
```bash
#!/usr/bin/env bash
# Use environment variables directly
echo "Document ${DOCUMENT_ID} from ${DOCUMENT_CORRESPONDENT} tagged: ${DOCUMENT_TAGS}"
```
### Action Required
Update any pre- or post-consumption scripts that read positional arguments (`$1`, `$2`, etc.)
to use the corresponding environment variables instead. Since environment variables have been
available since v1.8.0, updated scripts also remain compatible with older releases.

View File

@@ -313,7 +313,6 @@ class ConsumerPlugin(
run_subprocess(
[
settings.PRE_CONSUME_SCRIPT,
original_file_path,
],
script_env,
self.log,
@@ -383,14 +382,6 @@ class ConsumerPlugin(
run_subprocess(
[
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list("name", flat=True))),
],
script_env,
self.log,

View File

@@ -1,9 +1,8 @@
import hashlib
import io
import json
import os
import shutil
import zipfile
import tempfile
from itertools import islice
from pathlib import Path
from typing import TYPE_CHECKING
@@ -98,8 +97,6 @@ class StreamingManifestWriter:
*,
compare_json: bool = False,
files_in_export_dir: "set[Path] | None" = None,
zip_file: "zipfile.ZipFile | None" = None,
zip_arcname: str | None = None,
) -> None:
self._path = path.resolve()
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
@@ -107,20 +104,12 @@ class StreamingManifestWriter:
self._files_in_export_dir: set[Path] = (
files_in_export_dir if files_in_export_dir is not None else set()
)
self._zip_file = zip_file
self._zip_arcname = zip_arcname
self._zip_mode = zip_file is not None
self._file = None
self._first = True
def open(self) -> None:
if self._zip_mode:
# zipfile only allows one open write handle at a time, so buffer
# the manifest in memory and write it atomically on close()
self._file = io.StringIO()
else:
self._path.parent.mkdir(parents=True, exist_ok=True)
self._file = self._tmp_path.open("w", encoding="utf-8")
self._path.parent.mkdir(parents=True, exist_ok=True)
self._file = self._tmp_path.open("w", encoding="utf-8")
self._file.write("[")
self._first = True
@@ -141,18 +130,15 @@ class StreamingManifestWriter:
if self._file is None:
return
self._file.write("\n]")
if self._zip_mode:
self._zip_file.writestr(self._zip_arcname, self._file.getvalue())
self._file.close()
self._file = None
if not self._zip_mode:
self._finalize()
self._finalize()
def discard(self) -> None:
if self._file is not None:
self._file.close()
self._file = None
if not self._zip_mode and self._tmp_path.exists():
if self._tmp_path.exists():
self._tmp_path.unlink()
def _finalize(self) -> None:
@@ -329,13 +315,18 @@ class Command(CryptMixin, PaperlessCommand):
self.files_in_export_dir: set[Path] = set()
self.exported_files: set[str] = set()
self.zip_file: zipfile.ZipFile | None = None
self._zip_dirs: set[str] = set()
# If zipping, save the original target for later and
# get a temporary directory for the target instead
temp_dir = None
self.original_target = self.target
if self.zip_export:
zip_name = options["zip_name"]
self.zip_path = (self.target / zip_name).with_suffix(".zip")
self.zip_tmp_path = self.zip_path.parent / (self.zip_path.name + ".tmp")
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
temp_dir = tempfile.TemporaryDirectory(
dir=settings.SCRATCH_DIR,
prefix="paperless-export",
)
self.target = Path(temp_dir.name).resolve()
if not self.target.exists():
raise CommandError("That path doesn't exist")
@@ -346,53 +337,30 @@ class Command(CryptMixin, PaperlessCommand):
if not os.access(self.target, os.W_OK):
raise CommandError("That path doesn't appear to be writable")
if self.zip_export:
if self.compare_checksums:
self.stdout.write(
self.style.WARNING(
"--compare-checksums is ignored when --zip is used",
),
)
if self.compare_json:
self.stdout.write(
self.style.WARNING(
"--compare-json is ignored when --zip is used",
),
)
try:
# Prevent any ongoing changes in the documents
with FileLock(settings.MEDIA_LOCK):
if self.zip_export:
self.zip_file = zipfile.ZipFile(
self.zip_tmp_path,
"w",
compression=zipfile.ZIP_DEFLATED,
allowZip64=True,
)
self.dump()
if self.zip_file is not None:
self.zip_file.close()
self.zip_file = None
self.zip_tmp_path.rename(self.zip_path)
# We've written everything to the temporary directory in this case,
# now make an archive in the original target, with all files stored
if self.zip_export and temp_dir is not None:
shutil.make_archive(
self.original_target / options["zip_name"],
format="zip",
root_dir=temp_dir.name,
)
finally:
# Ensure zip_file is closed and the incomplete .tmp is removed on failure
if self.zip_file is not None:
self.zip_file.close()
self.zip_file = None
if self.zip_export and self.zip_tmp_path.exists():
self.zip_tmp_path.unlink()
# Always cleanup the temporary directory, if one was created
if self.zip_export and temp_dir is not None:
temp_dir.cleanup()
def dump(self) -> None:
# 1. Take a snapshot of what files exist in the current export folder
# (skipped in zip mode — always write fresh, no skip/compare logic applies)
if not self.zip_export:
for x in self.target.glob("**/*"):
if x.is_file():
self.files_in_export_dir.add(x.resolve())
for x in self.target.glob("**/*"):
if x.is_file():
self.files_in_export_dir.add(x.resolve())
# 2. Create manifest, containing all correspondents, types, tags, storage paths
# note, documents and ui_settings
@@ -453,8 +421,6 @@ class Command(CryptMixin, PaperlessCommand):
manifest_path,
compare_json=self.compare_json,
files_in_export_dir=self.files_in_export_dir,
zip_file=self.zip_file,
zip_arcname="manifest.json",
) as writer:
with transaction.atomic():
for key, qs in manifest_key_to_object_query.items():
@@ -573,12 +539,8 @@ class Command(CryptMixin, PaperlessCommand):
self.target,
)
else:
# 5. Remove pre-existing files/dirs from target, keeping the
# in-progress zip (.tmp) and any prior zip at the final path
skip = {self.zip_path.resolve(), self.zip_tmp_path.resolve()}
for item in self.target.glob("*"):
if item.resolve() in skip:
continue
# 5. Remove anything in the original location (before moving the zip)
for item in self.original_target.glob("*"):
if item.is_dir():
shutil.rmtree(item)
else:
@@ -748,23 +710,9 @@ class Command(CryptMixin, PaperlessCommand):
if self.use_folder_prefix:
manifest_name = Path("json") / manifest_name
manifest_name = (self.target / manifest_name).resolve()
if not self.zip_export:
manifest_name.parent.mkdir(parents=True, exist_ok=True)
manifest_name.parent.mkdir(parents=True, exist_ok=True)
self.check_and_write_json(content, manifest_name)
def _ensure_zip_dirs(self, arcname: str) -> None:
"""Write directory marker entries for all parent directories of arcname.
Some zip viewers only show folder structure when explicit directory
entries exist, so we add them to avoid confusing users.
"""
parts = Path(arcname).parts[:-1]
for i in range(len(parts)):
dir_arc = "/".join(parts[: i + 1]) + "/"
if dir_arc not in self._zip_dirs:
self._zip_dirs.add(dir_arc)
self.zip_file.mkdir(dir_arc)
def check_and_write_json(
self,
content: list[dict] | dict,
@@ -777,38 +725,32 @@ class Command(CryptMixin, PaperlessCommand):
This preserves the file timestamps when no changes are made.
"""
if self.zip_export:
arcname = str(target.resolve().relative_to(self.target))
self._ensure_zip_dirs(arcname)
self.zip_file.writestr(
arcname,
target = target.resolve()
perform_write = True
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)
if self.compare_json:
target_checksum = hashlib.blake2b(target.read_bytes()).hexdigest()
src_str = json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
)
src_checksum = hashlib.blake2b(src_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum:
perform_write = False
if perform_write:
target.write_text(
json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
),
encoding="utf-8",
)
return
target = target.resolve()
json_str = json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
)
perform_write = True
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)
if self.compare_json:
target_checksum = hashlib.blake2b(target.read_bytes()).hexdigest()
src_checksum = hashlib.blake2b(json_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum:
perform_write = False
if perform_write:
target.write_text(json_str, encoding="utf-8")
def check_and_copy(
self,
@@ -821,12 +763,6 @@ class Command(CryptMixin, PaperlessCommand):
the source attributes
"""
if self.zip_export:
arcname = str(target.resolve().relative_to(self.target))
self._ensure_zip_dirs(arcname)
self.zip_file.write(source, arcname=arcname)
return
target = target.resolve()
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)

View File

@@ -1328,7 +1328,7 @@ class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
environment = args[1]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], str(self.test_file))
self.assertEqual(len(command), 1)
subset = {
"DOCUMENT_SOURCE_PATH": str(c.input_doc.original_file),
@@ -1478,11 +1478,7 @@ class PostConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
environment = args[1]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], str(doc.pk))
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])
self.assertEqual(len(command), 1)
subset = {
"DOCUMENT_ID": str(doc.pk),

View File

@@ -615,7 +615,7 @@ class TestExportImport(
self.assertIsFile(expected_file)
with ZipFile(expected_file) as zip:
# 11 files + 3 directory marker entries for the subdirectory structure
# Extras are from the directories, which also appear in the listing
self.assertEqual(len(zip.namelist()), 14)
self.assertIn("manifest.json", zip.namelist())
self.assertIn("metadata.json", zip.namelist())
@@ -666,57 +666,6 @@ class TestExportImport(
self.assertIn("manifest.json", zip.namelist())
self.assertIn("metadata.json", zip.namelist())
def test_export_zip_atomic_on_failure(self) -> None:
"""
GIVEN:
- Request to export documents to zipfile
WHEN:
- Export raises an exception mid-way
THEN:
- No .zip file is written at the final path
- The .tmp file is cleaned up
"""
args = ["document_exporter", self.target, "--zip"]
with mock.patch.object(
document_exporter.Command,
"dump",
side_effect=RuntimeError("simulated failure"),
):
with self.assertRaises(RuntimeError):
call_command(*args)
expected_zip = self.target / f"export-{timezone.localdate().isoformat()}.zip"
expected_tmp = (
self.target / f"export-{timezone.localdate().isoformat()}.zip.tmp"
)
self.assertIsNotFile(expected_zip)
self.assertIsNotFile(expected_tmp)
def test_export_zip_no_scratch_dir(self) -> None:
"""
GIVEN:
- Request to export documents to zipfile
WHEN:
- Documents are exported
THEN:
- No files are written under SCRATCH_DIR during the export
(the old workaround used a temp dir there)
"""
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
shutil.copytree(
Path(__file__).parent / "samples" / "documents",
Path(self.dirs.media_dir) / "documents",
)
scratch_before = set(settings.SCRATCH_DIR.glob("paperless-export*"))
args = ["document_exporter", self.target, "--zip"]
call_command(*args)
scratch_after = set(settings.SCRATCH_DIR.glob("paperless-export*"))
self.assertEqual(scratch_before, scratch_after)
def test_export_target_not_exists(self) -> None:
"""
GIVEN: