mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-07 09:41:22 +00:00
Compare commits
3 Commits
dev
...
feature-fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2eabdcb0ef | ||
|
|
ac8bab7ed0 | ||
|
|
3e961eff0e |
@@ -3,7 +3,6 @@ import json
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from itertools import chain
|
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
@@ -81,6 +80,88 @@ def serialize_queryset_batched(
|
|||||||
yield serializers.serialize("python", chunk)
|
yield serializers.serialize("python", chunk)
|
||||||
|
|
||||||
|
|
||||||
|
class StreamingManifestWriter:
|
||||||
|
"""Incrementally writes a JSON array to a file, one record at a time.
|
||||||
|
|
||||||
|
Writes to <target>.tmp first; on close(), optionally BLAKE2b-compares
|
||||||
|
with the existing file (--compare-json) and renames or discards accordingly.
|
||||||
|
On exception, discard() deletes the tmp file and leaves the original intact.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
path: Path,
|
||||||
|
*,
|
||||||
|
compare_json: bool = False,
|
||||||
|
files_in_export_dir: "set[Path] | None" = None,
|
||||||
|
) -> None:
|
||||||
|
self._path = path.resolve()
|
||||||
|
self._tmp_path = self._path.with_suffix(self._path.suffix + ".tmp")
|
||||||
|
self._compare_json = compare_json
|
||||||
|
self._files_in_export_dir: set[Path] = (
|
||||||
|
files_in_export_dir if files_in_export_dir is not None else set()
|
||||||
|
)
|
||||||
|
self._file = None
|
||||||
|
self._first = True
|
||||||
|
|
||||||
|
def open(self) -> None:
|
||||||
|
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._file = self._tmp_path.open("w", encoding="utf-8")
|
||||||
|
self._file.write("[")
|
||||||
|
self._first = True
|
||||||
|
|
||||||
|
def write_record(self, record: dict) -> None:
|
||||||
|
if not self._first:
|
||||||
|
self._file.write(",\n")
|
||||||
|
else:
|
||||||
|
self._first = False
|
||||||
|
self._file.write(
|
||||||
|
json.dumps(record, cls=DjangoJSONEncoder, indent=2, ensure_ascii=False),
|
||||||
|
)
|
||||||
|
|
||||||
|
def write_batch(self, records: list[dict]) -> None:
|
||||||
|
for record in records:
|
||||||
|
self.write_record(record)
|
||||||
|
|
||||||
|
def close(self) -> None:
|
||||||
|
if self._file is None:
|
||||||
|
return
|
||||||
|
self._file.write("\n]")
|
||||||
|
self._file.close()
|
||||||
|
self._file = None
|
||||||
|
self._finalize()
|
||||||
|
|
||||||
|
def discard(self) -> None:
|
||||||
|
if self._file is not None:
|
||||||
|
self._file.close()
|
||||||
|
self._file = None
|
||||||
|
if self._tmp_path.exists():
|
||||||
|
self._tmp_path.unlink()
|
||||||
|
|
||||||
|
def _finalize(self) -> None:
|
||||||
|
"""Compare with existing file (if --compare-json) then rename or discard tmp."""
|
||||||
|
if self._path in self._files_in_export_dir:
|
||||||
|
self._files_in_export_dir.remove(self._path)
|
||||||
|
if self._compare_json:
|
||||||
|
existing_hash = hashlib.blake2b(self._path.read_bytes()).hexdigest()
|
||||||
|
new_hash = hashlib.blake2b(self._tmp_path.read_bytes()).hexdigest()
|
||||||
|
if existing_hash == new_hash:
|
||||||
|
self._tmp_path.unlink()
|
||||||
|
return
|
||||||
|
self._tmp_path.rename(self._path)
|
||||||
|
|
||||||
|
def __enter__(self) -> "StreamingManifestWriter":
|
||||||
|
self.open()
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||||
|
if exc_type is not None:
|
||||||
|
self.discard()
|
||||||
|
else:
|
||||||
|
self.close()
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class Command(CryptMixin, BaseCommand):
|
class Command(CryptMixin, BaseCommand):
|
||||||
help = (
|
help = (
|
||||||
"Decrypt and rename all files in our collection into a given target "
|
"Decrypt and rename all files in our collection into a given target "
|
||||||
@@ -322,95 +403,83 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
if settings.AUDIT_LOG_ENABLED:
|
if settings.AUDIT_LOG_ENABLED:
|
||||||
manifest_key_to_object_query["log_entries"] = LogEntry.objects.all()
|
manifest_key_to_object_query["log_entries"] = LogEntry.objects.all()
|
||||||
|
|
||||||
with transaction.atomic():
|
# Crypto setup before streaming begins
|
||||||
manifest_dict = {}
|
if self.passphrase:
|
||||||
|
self.setup_crypto(passphrase=self.passphrase)
|
||||||
# Build an overall manifest
|
elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
|
||||||
for key, object_query in manifest_key_to_object_query.items():
|
self.stdout.write(
|
||||||
manifest_dict[key] = list(
|
self.style.NOTICE(
|
||||||
chain.from_iterable(
|
"No passphrase was given, sensitive fields will be in plaintext",
|
||||||
serialize_queryset_batched(
|
),
|
||||||
object_query,
|
|
||||||
batch_size=self.batch_size,
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.encrypt_secret_fields(manifest_dict)
|
|
||||||
|
|
||||||
# These are treated specially and included in the per-document manifest
|
|
||||||
# if that setting is enabled. Otherwise, they are just exported to the bulk
|
|
||||||
# manifest
|
|
||||||
document_map: dict[int, Document] = {
|
|
||||||
d.pk: d for d in manifest_key_to_object_query["documents"]
|
|
||||||
}
|
|
||||||
document_manifest = manifest_dict["documents"]
|
|
||||||
|
|
||||||
# 3. Export files from each document
|
|
||||||
for index, document_dict in tqdm.tqdm(
|
|
||||||
enumerate(document_manifest),
|
|
||||||
total=len(document_manifest),
|
|
||||||
disable=self.no_progress_bar,
|
|
||||||
):
|
|
||||||
document = document_map[document_dict["pk"]]
|
|
||||||
|
|
||||||
# 3.1. generate a unique filename
|
|
||||||
base_name = self.generate_base_name(document)
|
|
||||||
|
|
||||||
# 3.2. write filenames into manifest
|
|
||||||
original_target, thumbnail_target, archive_target = (
|
|
||||||
self.generate_document_targets(document, base_name, document_dict)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# 3.3. write files to target folder
|
document_manifest: list[dict] = []
|
||||||
if not self.data_only:
|
|
||||||
self.copy_document_files(
|
|
||||||
document,
|
|
||||||
original_target,
|
|
||||||
thumbnail_target,
|
|
||||||
archive_target,
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.split_manifest:
|
|
||||||
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
|
|
||||||
if self.use_folder_prefix:
|
|
||||||
manifest_name = Path("json") / manifest_name
|
|
||||||
manifest_name = (self.target / manifest_name).resolve()
|
|
||||||
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
content = [document_manifest[index]]
|
|
||||||
content += list(
|
|
||||||
filter(
|
|
||||||
lambda d: d["fields"]["document"] == document_dict["pk"],
|
|
||||||
manifest_dict["notes"],
|
|
||||||
),
|
|
||||||
)
|
|
||||||
content += list(
|
|
||||||
filter(
|
|
||||||
lambda d: d["fields"]["document"] == document_dict["pk"],
|
|
||||||
manifest_dict["custom_field_instances"],
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.check_and_write_json(
|
|
||||||
content,
|
|
||||||
manifest_name,
|
|
||||||
)
|
|
||||||
|
|
||||||
# These were exported already
|
|
||||||
if self.split_manifest:
|
|
||||||
del manifest_dict["documents"]
|
|
||||||
del manifest_dict["notes"]
|
|
||||||
del manifest_dict["custom_field_instances"]
|
|
||||||
|
|
||||||
# 4.1 write primary manifest to target folder
|
|
||||||
manifest = []
|
|
||||||
for key, item in manifest_dict.items():
|
|
||||||
manifest.extend(item)
|
|
||||||
manifest_path = (self.target / "manifest.json").resolve()
|
manifest_path = (self.target / "manifest.json").resolve()
|
||||||
self.check_and_write_json(
|
|
||||||
manifest,
|
with StreamingManifestWriter(
|
||||||
manifest_path,
|
manifest_path,
|
||||||
)
|
compare_json=self.compare_json,
|
||||||
|
files_in_export_dir=self.files_in_export_dir,
|
||||||
|
) as writer:
|
||||||
|
with transaction.atomic():
|
||||||
|
for key, qs in manifest_key_to_object_query.items():
|
||||||
|
if key == "documents":
|
||||||
|
# Accumulate for file-copy loop; written to manifest after
|
||||||
|
for batch in serialize_queryset_batched(
|
||||||
|
qs,
|
||||||
|
batch_size=self.batch_size,
|
||||||
|
):
|
||||||
|
for record in batch:
|
||||||
|
self._encrypt_record_inline(record)
|
||||||
|
document_manifest.extend(batch)
|
||||||
|
elif self.split_manifest and key in (
|
||||||
|
"notes",
|
||||||
|
"custom_field_instances",
|
||||||
|
):
|
||||||
|
# Written per-document in _write_split_manifest
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
for batch in serialize_queryset_batched(
|
||||||
|
qs,
|
||||||
|
batch_size=self.batch_size,
|
||||||
|
):
|
||||||
|
for record in batch:
|
||||||
|
self._encrypt_record_inline(record)
|
||||||
|
writer.write_batch(batch)
|
||||||
|
|
||||||
|
document_map: dict[int, Document] = {
|
||||||
|
d.pk: d for d in Document.objects.order_by("id")
|
||||||
|
}
|
||||||
|
|
||||||
|
# 3. Export files from each document
|
||||||
|
for document_dict in tqdm.tqdm(
|
||||||
|
document_manifest,
|
||||||
|
total=len(document_manifest),
|
||||||
|
disable=self.no_progress_bar,
|
||||||
|
):
|
||||||
|
document = document_map[document_dict["pk"]]
|
||||||
|
|
||||||
|
# 3.1. generate a unique filename
|
||||||
|
base_name = self.generate_base_name(document)
|
||||||
|
|
||||||
|
# 3.2. write filenames into manifest
|
||||||
|
original_target, thumbnail_target, archive_target = (
|
||||||
|
self.generate_document_targets(document, base_name, document_dict)
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3.3. write files to target folder
|
||||||
|
if not self.data_only:
|
||||||
|
self.copy_document_files(
|
||||||
|
document,
|
||||||
|
original_target,
|
||||||
|
thumbnail_target,
|
||||||
|
archive_target,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.split_manifest:
|
||||||
|
self._write_split_manifest(document_dict, document, base_name)
|
||||||
|
else:
|
||||||
|
writer.write_record(document_dict)
|
||||||
|
|
||||||
# 4.2 write version information to target folder
|
# 4.2 write version information to target folder
|
||||||
extra_metadata_path = (self.target / "metadata.json").resolve()
|
extra_metadata_path = (self.target / "metadata.json").resolve()
|
||||||
@@ -532,6 +601,42 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
archive_target,
|
archive_target,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _encrypt_record_inline(self, record: dict) -> None:
|
||||||
|
"""Encrypt sensitive fields in a single record, if passphrase is set."""
|
||||||
|
if not self.passphrase:
|
||||||
|
return
|
||||||
|
fields = self.CRYPT_FIELDS_BY_MODEL.get(record.get("model", ""))
|
||||||
|
if fields:
|
||||||
|
for field in fields:
|
||||||
|
if record["fields"].get(field):
|
||||||
|
record["fields"][field] = self.encrypt_string(
|
||||||
|
value=record["fields"][field],
|
||||||
|
)
|
||||||
|
|
||||||
|
def _write_split_manifest(
|
||||||
|
self,
|
||||||
|
document_dict: dict,
|
||||||
|
document: Document,
|
||||||
|
base_name: Path,
|
||||||
|
) -> None:
|
||||||
|
"""Write per-document manifest file for --split-manifest mode."""
|
||||||
|
content = [document_dict]
|
||||||
|
content.extend(
|
||||||
|
serializers.serialize("python", Note.objects.filter(document=document)),
|
||||||
|
)
|
||||||
|
content.extend(
|
||||||
|
serializers.serialize(
|
||||||
|
"python",
|
||||||
|
CustomFieldInstance.objects.filter(document=document),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
|
||||||
|
if self.use_folder_prefix:
|
||||||
|
manifest_name = Path("json") / manifest_name
|
||||||
|
manifest_name = (self.target / manifest_name).resolve()
|
||||||
|
manifest_name.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.check_and_write_json(content, manifest_name)
|
||||||
|
|
||||||
def check_and_write_json(
|
def check_and_write_json(
|
||||||
self,
|
self,
|
||||||
content: list[dict] | dict,
|
content: list[dict] | dict,
|
||||||
@@ -549,14 +654,14 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
if target in self.files_in_export_dir:
|
if target in self.files_in_export_dir:
|
||||||
self.files_in_export_dir.remove(target)
|
self.files_in_export_dir.remove(target)
|
||||||
if self.compare_json:
|
if self.compare_json:
|
||||||
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
|
target_checksum = hashlib.blake2b(target.read_bytes()).hexdigest()
|
||||||
src_str = json.dumps(
|
src_str = json.dumps(
|
||||||
content,
|
content,
|
||||||
cls=DjangoJSONEncoder,
|
cls=DjangoJSONEncoder,
|
||||||
indent=2,
|
indent=2,
|
||||||
ensure_ascii=False,
|
ensure_ascii=False,
|
||||||
)
|
)
|
||||||
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
|
src_checksum = hashlib.blake2b(src_str.encode("utf-8")).hexdigest()
|
||||||
if src_checksum == target_checksum:
|
if src_checksum == target_checksum:
|
||||||
perform_write = False
|
perform_write = False
|
||||||
|
|
||||||
@@ -606,28 +711,3 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
if perform_copy:
|
if perform_copy:
|
||||||
target.parent.mkdir(parents=True, exist_ok=True)
|
target.parent.mkdir(parents=True, exist_ok=True)
|
||||||
copy_file_with_basic_stats(source, target)
|
copy_file_with_basic_stats(source, target)
|
||||||
|
|
||||||
def encrypt_secret_fields(self, manifest: dict) -> None:
|
|
||||||
"""
|
|
||||||
Encrypts certain fields in the export. Currently limited to the mail account password
|
|
||||||
"""
|
|
||||||
|
|
||||||
if self.passphrase:
|
|
||||||
self.setup_crypto(passphrase=self.passphrase)
|
|
||||||
|
|
||||||
for crypt_config in self.CRYPT_FIELDS:
|
|
||||||
exporter_key = crypt_config["exporter_key"]
|
|
||||||
crypt_fields = crypt_config["fields"]
|
|
||||||
for manifest_record in manifest[exporter_key]:
|
|
||||||
for field in crypt_fields:
|
|
||||||
if manifest_record["fields"][field]:
|
|
||||||
manifest_record["fields"][field] = self.encrypt_string(
|
|
||||||
value=manifest_record["fields"][field],
|
|
||||||
)
|
|
||||||
|
|
||||||
elif MailAccount.objects.count() > 0 or SocialToken.objects.count() > 0:
|
|
||||||
self.stdout.write(
|
|
||||||
self.style.NOTICE(
|
|
||||||
"No passphrase was given, sensitive fields will be in plaintext",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ class CryptMixin:
|
|||||||
key_size = 32
|
key_size = 32
|
||||||
kdf_algorithm = "pbkdf2_sha256"
|
kdf_algorithm = "pbkdf2_sha256"
|
||||||
|
|
||||||
CRYPT_FIELDS: CryptFields = [
|
CRYPT_FIELDS: list[CryptFields] = [
|
||||||
{
|
{
|
||||||
"exporter_key": "mail_accounts",
|
"exporter_key": "mail_accounts",
|
||||||
"model_name": "paperless_mail.mailaccount",
|
"model_name": "paperless_mail.mailaccount",
|
||||||
@@ -89,6 +89,10 @@ class CryptMixin:
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
# O(1) lookup for per-record encryption; derived from CRYPT_FIELDS at class definition time
|
||||||
|
CRYPT_FIELDS_BY_MODEL: dict[str, list[str]] = {
|
||||||
|
cfg["model_name"]: cfg["fields"] for cfg in CRYPT_FIELDS
|
||||||
|
}
|
||||||
|
|
||||||
def get_crypt_params(self) -> dict[str, dict[str, str | int]]:
|
def get_crypt_params(self) -> dict[str, dict[str, str | int]]:
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user