Compare commits

...

3 Commits

Author SHA1 Message Date
Trenton H
db1e4ce432 Refactor: add explicit supports_progress_bar and supports_multiprocessing to all PaperlessCommand subclasses
Each management command now explicitly declares both class attributes
rather than relying on defaults, making intent unambiguous at a glance.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-05 15:05:15 -08:00
Trenton H
0b3cdd6934 Refactor: migrate exporter/importer from tqdm to PaperlessCommand.track()
Replace direct tqdm usage in document_exporter and document_importer with
the PaperlessCommand base class and its track() method, which is backed by
Rich and handles --no-progress-bar automatically. Also removes the unused
ProgressBarMixin from mixins.py.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-05 08:56:55 -08:00
Trenton H
a9cb89c633 Enhancement: Improve exporter memory efficiency (#12236)
Phase 1 -- Eliminate JSON round-trip in document exporter

Replace json.loads(serializers.serialize("json", qs)) with
serializers.serialize("python", qs) to skip the intermediate
JSON string allocation and parse step. Use DjangoJSONEncoder
in check_and_write_json() to handle native Python types
(datetime, Decimal, UUID) the Python serializer returns.

Phase 2 -- Batched QuerySet serialization in document exporter

Add serialize_queryset_batched() helper that uses QuerySet.iterator()
and itertools.islice to stream records in configurable chunks, bounding
peak memory during serialization to batch_size * avg_record_size rather
than loading the entire QuerySet at once.
2026-03-04 14:54:20 -08:00
13 changed files with 93 additions and 52 deletions

View File

@@ -304,7 +304,7 @@ class PaperlessCommand(RichCommand):
Progress output is directed to stderr to match the convention that
progress bars are transient UI feedback, not command output. This
mirrors tqdm's default behavior and prevents progress bar rendering
mirrors the convention that progress bars are transient UI feedback and prevents progress bar rendering
from interfering with stdout-based assertions in tests or piped
command output.

View File

@@ -17,6 +17,7 @@ class Command(PaperlessCommand):
"modified) after their initial import."
)
supports_progress_bar = True
supports_multiprocessing = True
def add_arguments(self, parser):

View File

@@ -3,10 +3,11 @@ import json
import os
import shutil
import tempfile
from itertools import chain
from itertools import islice
from pathlib import Path
from typing import TYPE_CHECKING
import tqdm
from allauth.mfa.models import Authenticator
from allauth.socialaccount.models import SocialAccount
from allauth.socialaccount.models import SocialApp
@@ -17,8 +18,8 @@ from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.json import DjangoJSONEncoder
from django.db import transaction
from django.utils import timezone
from filelock import FileLock
@@ -26,6 +27,8 @@ from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
if TYPE_CHECKING:
from collections.abc import Generator
from django.db.models import QuerySet
if settings.AUDIT_LOG_ENABLED:
@@ -33,6 +36,7 @@ if settings.AUDIT_LOG_ENABLED:
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
from documents.management.commands.base import PaperlessCommand
from documents.management.commands.mixins import CryptMixin
from documents.models import Correspondent
from documents.models import CustomField
@@ -60,14 +64,34 @@ from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
class Command(CryptMixin, BaseCommand):
def serialize_queryset_batched(
queryset: "QuerySet",
*,
batch_size: int = 500,
) -> "Generator[list[dict], None, None]":
"""Yield batches of serialized records from a QuerySet.
Each batch is a list of dicts in Django's Python serialization format.
Uses QuerySet.iterator() to avoid loading the full queryset into memory,
and islice to collect chunk-sized batches serialized in a single call.
"""
iterator = queryset.iterator(chunk_size=batch_size)
while chunk := list(islice(iterator, batch_size)):
yield serializers.serialize("python", chunk)
class Command(CryptMixin, PaperlessCommand):
help = (
"Decrypt and rename all files in our collection into a given target "
"directory. And include a manifest file containing document data for "
"easy import."
)
supports_progress_bar = True
supports_multiprocessing = False
def add_arguments(self, parser) -> None:
super().add_arguments(parser)
parser.add_argument("target")
parser.add_argument(
@@ -175,15 +199,19 @@ class Command(CryptMixin, BaseCommand):
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
"--passphrase",
help="If provided, is used to encrypt sensitive data in the export",
)
parser.add_argument(
"--passphrase",
help="If provided, is used to encrypt sensitive data in the export",
"--batch-size",
type=int,
default=500,
help=(
"Number of records to process per batch during serialization. "
"Lower values reduce peak memory usage; higher values improve "
"throughput. Default: 500."
),
)
def handle(self, *args, **options) -> None:
@@ -198,8 +226,8 @@ class Command(CryptMixin, BaseCommand):
self.no_thumbnail: bool = options["no_thumbnail"]
self.zip_export: bool = options["zip"]
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.passphrase: str | None = options.get("passphrase")
self.batch_size: int = options["batch_size"]
self.files_in_export_dir: set[Path] = set()
self.exported_files: set[str] = set()
@@ -294,8 +322,13 @@ class Command(CryptMixin, BaseCommand):
# Build an overall manifest
for key, object_query in manifest_key_to_object_query.items():
manifest_dict[key] = json.loads(
serializers.serialize("json", object_query),
manifest_dict[key] = list(
chain.from_iterable(
serialize_queryset_batched(
object_query,
batch_size=self.batch_size,
),
),
)
self.encrypt_secret_fields(manifest_dict)
@@ -309,10 +342,12 @@ class Command(CryptMixin, BaseCommand):
document_manifest = manifest_dict["documents"]
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=self.no_progress_bar,
for index, document_dict in enumerate(
self.track(
document_manifest,
description="Exporting documents...",
total=len(document_manifest),
),
):
document = document_map[document_dict["pk"]]
@@ -512,14 +547,24 @@ class Command(CryptMixin, BaseCommand):
self.files_in_export_dir.remove(target)
if self.compare_json:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
src_str = json.dumps(content, indent=2, ensure_ascii=False)
src_str = json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
)
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum:
perform_write = False
if perform_write:
target.write_text(
json.dumps(content, indent=2, ensure_ascii=False),
json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
),
encoding="utf-8",
)

View File

@@ -40,6 +40,7 @@ def _process_and_match(work: _WorkPackage) -> _WorkResult:
class Command(PaperlessCommand):
help = "Searches for documents where the content almost matches"
supports_progress_bar = True
supports_multiprocessing = True
def add_arguments(self, parser):

View File

@@ -8,14 +8,12 @@ from pathlib import Path
from zipfile import ZipFile
from zipfile import is_zipfile
import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db import IntegrityError
@@ -25,6 +23,7 @@ from django.db.models.signals import post_save
from filelock import FileLock
from documents.file_handling import create_source_path_directory
from documents.management.commands.base import PaperlessCommand
from documents.management.commands.mixins import CryptMixin
from documents.models import Correspondent
from documents.models import CustomField
@@ -57,21 +56,18 @@ def disable_signal(sig, receiver, sender, *, weak: bool | None = None) -> Genera
sig.connect(receiver=receiver, sender=sender, **kwargs)
class Command(CryptMixin, BaseCommand):
class Command(CryptMixin, PaperlessCommand):
help = (
"Using a manifest.json file, load the data from there, and import the "
"documents it refers to."
)
def add_arguments(self, parser) -> None:
parser.add_argument("source")
supports_progress_bar = True
supports_multiprocessing = False
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def add_arguments(self, parser) -> None:
super().add_arguments(parser)
parser.add_argument("source")
parser.add_argument(
"--data-only",
@@ -231,7 +227,6 @@ class Command(CryptMixin, BaseCommand):
self.source = Path(options["source"]).resolve()
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.passphrase: str | None = options.get("passphrase")
self.version: str | None = None
self.salt: str | None = None
@@ -365,7 +360,7 @@ class Command(CryptMixin, BaseCommand):
filter(lambda r: r["model"] == "documents.document", self.manifest),
)
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
for record in self.track(manifest_documents, description="Copying files..."):
document = Document.objects.get(pk=record["pk"])
doc_file = record[EXPORTER_FILE_NAME]

View File

@@ -8,6 +8,9 @@ from documents.tasks import index_reindex
class Command(PaperlessCommand):
help = "Manages the document index."
supports_progress_bar = True
supports_multiprocessing = False
def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument("command", choices=["reindex", "optimize"])

View File

@@ -7,6 +7,9 @@ from documents.tasks import llmindex_index
class Command(PaperlessCommand):
help = "Manages the LLM-based vector index for Paperless."
supports_progress_bar = True
supports_multiprocessing = False
def add_arguments(self, parser: Any) -> None:
super().add_arguments(parser)
parser.add_argument("command", choices=["rebuild", "update"])

View File

@@ -7,6 +7,9 @@ from documents.models import Document
class Command(PaperlessCommand):
help = "Rename all documents"
supports_progress_bar = True
supports_multiprocessing = False
def handle(self, *args, **options):
for document in self.track(Document.objects.all(), description="Renaming..."):
post_save.send(Document, instance=document, created=False)

View File

@@ -180,6 +180,9 @@ class Command(PaperlessCommand):
"modified) after their initial import."
)
supports_progress_bar = True
supports_multiprocessing = False
def add_arguments(self, parser) -> None:
super().add_arguments(parser)
parser.add_argument("-c", "--correspondent", default=False, action="store_true")

View File

@@ -24,6 +24,9 @@ _LEVEL_STYLE: dict[int, tuple[str, str]] = {
class Command(PaperlessCommand):
help = "This command checks your document archive for issues."
supports_progress_bar = True
supports_multiprocessing = False
def _render_results(self, messages: SanityCheckMessages) -> None:
"""Render sanity check results as a Rich table."""

View File

@@ -36,6 +36,7 @@ def _process_document(doc_id: int) -> None:
class Command(PaperlessCommand):
help = "This will regenerate the thumbnails for all documents."
supports_progress_bar = True
supports_multiprocessing = True
def add_arguments(self, parser) -> None:

View File

@@ -1,6 +1,5 @@
import base64
import os
from argparse import ArgumentParser
from typing import TypedDict
from cryptography.fernet import Fernet
@@ -21,25 +20,6 @@ class CryptFields(TypedDict):
fields: list[str]
class ProgressBarMixin:
"""
Many commands use a progress bar, which can be disabled
via this class
"""
def add_argument_progress_bar_mixin(self, parser: ArgumentParser) -> None:
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def handle_progress_bar_mixin(self, *args, **options) -> None:
self.no_progress_bar = options["no_progress_bar"]
self.use_progress_bar = not self.no_progress_bar
class CryptMixin:
"""
Fully based on:

View File

@@ -9,6 +9,9 @@ class Command(PaperlessCommand):
help = "Prunes the audit logs of objects that no longer exist."
supports_progress_bar = True
supports_multiprocessing = False
def handle(self, *args, **options):
with transaction.atomic():
for log_entry in self.track(