mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-07 09:41:22 +00:00
Compare commits
3 Commits
feature-im
...
feature-fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2eabdcb0ef | ||
|
|
ac8bab7ed0 | ||
|
|
3e961eff0e |
@@ -3,7 +3,6 @@ import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from itertools import chain
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -81,6 +80,88 @@ def serialize_queryset_batched(
|
||||
yield serializers.serialize("python", chunk)
|
||||
|
||||
|
||||
class StreamingManifestWriter:
|
||||
"""Incrementally writes a JSON array to a file, one record at a time.
|
||||
|
||||
Writes to <target>.tmp first; on close(), optionally BLAKE2b-compares
|
||||
with the existing file (--compare-json) and renames or discards accordingly.
|
||||
On exception, discard() deletes the tmp file and leaves the original intact.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: Path,
|
||||
*,
|
||||
compare_json: bool = False,
|
||||
files_in_export_dir: "set[Path] | None" = None,
|
||||
) -> None:
|
||||
self._path = path.resolve()
|
||||
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
|
||||
self._compare_json = compare_json
|
||||
self._files_in_export_dir: set[Path] = (
|
||||
files_in_export_dir if files_in_export_dir is not None else set()
|
||||
)
|
||||
self._file = None
|
||||
self._first = True
|
||||
|
||||
def open(self) -> None:
|
||||
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||
self._file = self._tmp_path.open("w", encoding="utf-8")
|
||||
self._file.write("[")
|
||||
self._first = True
|
||||
|
||||
def write_record(self, record: dict) -> None:
|
||||
if not self._first:
|
||||
self._file.write(",\n")
|
||||
else:
|
||||
self._first = False
|
||||
self._file.write(
|
||||
json.dumps(record, cls=DjangoJSONEncoder, indent=2, ensure_ascii=False),
|
||||
)
|
||||
|
||||
def write_batch(self, records: list[dict]) -> None:
|
||||
for record in records:
|
||||
self.write_record(record)
|
||||
|
||||
def close(self) -> None:
|
||||
if self._file is None:
|
||||
return
|
||||
self._file.write("\n]")
|
||||
self._file.close()
|
||||
self._file = None
|
||||
self._finalize()
|
||||
|
||||
def discard(self) -> None:
|
||||
if self._file is not None:
|
||||
self._file.close()
|
||||
self._file = None
|
||||
if self._tmp_path.exists():
|
||||
self._tmp_path.unlink()
|
||||
|
||||
def _finalize(self) -> None:
|
||||
"""Compare with existing file (if --compare-json) then rename or discard tmp."""
|
||||
if self._path in self._files_in_export_dir:
|
||||
self._files_in_export_dir.remove(self._path)
|
||||
if self._compare_json:
|
||||
existing_hash = hashlib.blake2b(self._path.read_bytes()).hexdigest()
|
||||
new_hash = hashlib.blake2b(self._tmp_path.read_bytes()).hexdigest()
|
||||
if existing_hash == new_hash:
|
||||
self._tmp_path.unlink()
|
||||
return
|
||||
self._tmp_path.rename(self._path)
|
||||
|
||||
def __enter__(self) -> "StreamingManifestWriter":
|
||||
self.open()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||
if exc_type is not None:
|
||||
self.discard()
|
||||
else:
|
||||
self.close()
|
||||
return False
|
||||
|
||||
|
||||
class Command(CryptMixin, BaseCommand):
|
||||
help = (
|
||||
"Decrypt and rename all files in our collection into a given target "
|
||||
@@ -322,95 +403,83 @@ class Command(CryptMixin, BaseCommand):
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
manifest_key_to_object_query["log_entries"] = LogEntry.objects.all()
|
||||
|
||||
with transaction.atomic():
|
||||
manifest_dict = {}
|
||||
|
||||
# Build an overall manifest
|
||||
for key, object_query in manifest_key_to_object_query.items():
|
||||
manifest_dict[key] = list(
|
||||
chain.from_iterable(
|
||||
serialize_queryset_batched(
|
||||
object_query,
|
||||
batch_size=self.batch_size,
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
self.encrypt_secret_fields(manifest_dict)
|
||||
|
||||
# These are treated specially and included in the per-document manifest
|
||||
# if that setting is enabled. Otherwise, they are just exported to the bulk
|
||||
# manifest
|
||||
document_map: dict[int, Document] = {
|
||||
d.pk: d for d in manifest_key_to_object_query["documents"]
|
||||
}
|
||||
document_manifest = manifest_dict["documents"]
|
||||
|
||||
# 3. Export files from each document
|
||||
for index, document_dict in tqdm.tqdm(
|
||||
enumerate(document_manifest),
|
||||
total=len(document_manifest),
|
||||
disable=self.no_progress_bar,
|
||||
):
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
# 3.1. generate a unique filename
|
||||
base_name = self.generate_base_name(document)
|
||||
|
||||
# 3.2. write filenames into manifest
|
||||
original_target, thumbnail_target, archive_target = (
|
||||
self.generate_document_targets(document, base_name, document_dict)
|
||||
# Crypto setup before streaming begins
|
||||
if self.passphrase:
|
||||
self.setup_crypto(passphrase=self.passphrase)
|
||||
elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
|
||||
self.stdout.write(
|
||||
self.style.NOTICE(
|
||||
"No passphrase was given, sensitive fields will be in plaintext",
|
||||
),
|
||||
)
|
||||
|
||||
# 3.3. write files to target folder
|
||||
if not self.data_only:
|
||||
self.copy_document_files(
|
||||
document,
|
||||
original_target,
|
||||
thumbnail_target,
|
||||
archive_target,
|
||||
)
|
||||
|
||||
if self.split_manifest:
|
||||
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
|
||||
if self.use_folder_prefix:
|
||||
manifest_name = Path("json") / manifest_name
|
||||
manifest_name = (self.target / manifest_name).resolve()
|
||||
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
||||
content = [document_manifest[index]]
|
||||
content += list(
|
||||
filter(
|
||||
lambda d: d["fields"]["document"] == document_dict["pk"],
|
||||
manifest_dict["notes"],
|
||||
),
|
||||
)
|
||||
content += list(
|
||||
filter(
|
||||
lambda d: d["fields"]["document"] == document_dict["pk"],
|
||||
manifest_dict["custom_field_instances"],
|
||||
),
|
||||
)
|
||||
|
||||
self.check_and_write_json(
|
||||
content,
|
||||
manifest_name,
|
||||
)
|
||||
|
||||
# These were exported already
|
||||
if self.split_manifest:
|
||||
del manifest_dict["documents"]
|
||||
del manifest_dict["notes"]
|
||||
del manifest_dict["custom_field_instances"]
|
||||
|
||||
# 4.1 write primary manifest to target folder
|
||||
manifest = []
|
||||
for key, item in manifest_dict.items():
|
||||
manifest.extend(item)
|
||||
document_manifest: list[dict] = []
|
||||
manifest_path = (self.target / "manifest.json").resolve()
|
||||
self.check_and_write_json(
|
||||
manifest,
|
||||
|
||||
with StreamingManifestWriter(
|
||||
manifest_path,
|
||||
)
|
||||
compare_json=self.compare_json,
|
||||
files_in_export_dir=self.files_in_export_dir,
|
||||
) as writer:
|
||||
with transaction.atomic():
|
||||
for key, qs in manifest_key_to_object_query.items():
|
||||
if key == "documents":
|
||||
# Accumulate for file-copy loop; written to manifest after
|
||||
for batch in serialize_queryset_batched(
|
||||
qs,
|
||||
batch_size=self.batch_size,
|
||||
):
|
||||
for record in batch:
|
||||
self._encrypt_record_inline(record)
|
||||
document_manifest.extend(batch)
|
||||
elif self.split_manifest and key in (
|
||||
"notes",
|
||||
"custom_field_instances",
|
||||
):
|
||||
# Written per-document in _write_split_manifest
|
||||
pass
|
||||
else:
|
||||
for batch in serialize_queryset_batched(
|
||||
qs,
|
||||
batch_size=self.batch_size,
|
||||
):
|
||||
for record in batch:
|
||||
self._encrypt_record_inline(record)
|
||||
writer.write_batch(batch)
|
||||
|
||||
document_map: dict[int, Document] = {
|
||||
d.pk: d for d in Document.objects.order_by("id")
|
||||
}
|
||||
|
||||
# 3. Export files from each document
|
||||
for document_dict in tqdm.tqdm(
|
||||
document_manifest,
|
||||
total=len(document_manifest),
|
||||
disable=self.no_progress_bar,
|
||||
):
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
# 3.1. generate a unique filename
|
||||
base_name = self.generate_base_name(document)
|
||||
|
||||
# 3.2. write filenames into manifest
|
||||
original_target, thumbnail_target, archive_target = (
|
||||
self.generate_document_targets(document, base_name, document_dict)
|
||||
)
|
||||
|
||||
# 3.3. write files to target folder
|
||||
if not self.data_only:
|
||||
self.copy_document_files(
|
||||
document,
|
||||
original_target,
|
||||
thumbnail_target,
|
||||
archive_target,
|
||||
)
|
||||
|
||||
if self.split_manifest:
|
||||
self._write_split_manifest(document_dict, document, base_name)
|
||||
else:
|
||||
writer.write_record(document_dict)
|
||||
|
||||
# 4.2 write version information to target folder
|
||||
extra_metadata_path = (self.target / "metadata.json").resolve()
|
||||
@@ -532,6 +601,42 @@ class Command(CryptMixin, BaseCommand):
|
||||
archive_target,
|
||||
)
|
||||
|
||||
def _encrypt_record_inline(self, record: dict) -> None:
|
||||
"""Encrypt sensitive fields in a single record, if passphrase is set."""
|
||||
if not self.passphrase:
|
||||
return
|
||||
fields = self.CRYPT_FIELDS_BY_MODEL.get(record.get("model", ""))
|
||||
if fields:
|
||||
for field in fields:
|
||||
if record["fields"].get(field):
|
||||
record["fields"][field] = self.encrypt_string(
|
||||
value=record["fields"][field],
|
||||
)
|
||||
|
||||
def _write_split_manifest(
|
||||
self,
|
||||
document_dict: dict,
|
||||
document: Document,
|
||||
base_name: Path,
|
||||
) -> None:
|
||||
"""Write per-document manifest file for --split-manifest mode."""
|
||||
content = [document_dict]
|
||||
content.extend(
|
||||
serializers.serialize("python", Note.objects.filter(document=document)),
|
||||
)
|
||||
content.extend(
|
||||
serializers.serialize(
|
||||
"python",
|
||||
CustomFieldInstance.objects.filter(document=document),
|
||||
),
|
||||
)
|
||||
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
|
||||
if self.use_folder_prefix:
|
||||
manifest_name = Path("json") / manifest_name
|
||||
manifest_name = (self.target / manifest_name).resolve()
|
||||
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
||||
self.check_and_write_json(content, manifest_name)
|
||||
|
||||
def check_and_write_json(
|
||||
self,
|
||||
content: list[dict] | dict,
|
||||
@@ -549,14 +654,14 @@ class Command(CryptMixin, BaseCommand):
|
||||
if target in self.files_in_export_dir:
|
||||
self.files_in_export_dir.remove(target)
|
||||
if self.compare_json:
|
||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
||||
target_checksum = hashlib.blake2b(target.read_bytes()).hexdigest()
|
||||
src_str = json.dumps(
|
||||
content,
|
||||
cls=DjangoJSONEncoder,
|
||||
indent=2,
|
||||
ensure_ascii=False,
|
||||
)
|
||||
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
|
||||
src_checksum = hashlib.blake2b(src_str.encode("utf-8")).hexdigest()
|
||||
if src_checksum == target_checksum:
|
||||
perform_write = False
|
||||
|
||||
@@ -606,28 +711,3 @@ class Command(CryptMixin, BaseCommand):
|
||||
if perform_copy:
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
copy_file_with_basic_stats(source, target)
|
||||
|
||||
def encrypt_secret_fields(self, manifest: dict) -> None:
|
||||
"""
|
||||
Encrypts certain fields in the export. Currently limited to the mail account password
|
||||
"""
|
||||
|
||||
if self.passphrase:
|
||||
self.setup_crypto(passphrase=self.passphrase)
|
||||
|
||||
for crypt_config in self.CRYPT_FIELDS:
|
||||
exporter_key = crypt_config["exporter_key"]
|
||||
crypt_fields = crypt_config["fields"]
|
||||
for manifest_record in manifest[exporter_key]:
|
||||
for field in crypt_fields:
|
||||
if manifest_record["fields"][field]:
|
||||
manifest_record["fields"][field] = self.encrypt_string(
|
||||
value=manifest_record["fields"][field],
|
||||
)
|
||||
|
||||
elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
|
||||
self.stdout.write(
|
||||
self.style.NOTICE(
|
||||
"No passphrase was given, sensitive fields will be in plaintext",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -71,7 +71,7 @@ class CryptMixin:
|
||||
key_size = 32
|
||||
kdf_algorithm = "pbkdf2_sha256"
|
||||
|
||||
CRYPT_FIELDS: CryptFields = [
|
||||
CRYPT_FIELDS: list[CryptFields] = [
|
||||
{
|
||||
"exporter_key": "mail_accounts",
|
||||
"model_name": "paperless_mail.mailaccount",
|
||||
@@ -89,6 +89,10 @@ class CryptMixin:
|
||||
],
|
||||
},
|
||||
]
|
||||
# O(1) lookup for per-record encryption; derived from CRYPT_FIELDS at class definition time
|
||||
CRYPT_FIELDS_BY_MODEL: dict[str, list[str]] = {
|
||||
cfg["model_name"]: cfg["fields"] for cfg in CRYPT_FIELDS
|
||||
}
|
||||
|
||||
def get_crypt_params(self) -> dict[str, dict[str, str | int]]:
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user