Compare commits

..

3 Commits

31 changed files with 962 additions and 2029 deletions

View File

@@ -30,7 +30,7 @@ RUN set -eux \
# Purpose: Installs s6-overlay and rootfs
# Comments:
# - Don't leave anything extra in here either
FROM ghcr.io/astral-sh/uv:0.10.7-python3.12-trixie-slim AS s6-overlay-base
FROM ghcr.io/astral-sh/uv:0.10.5-python3.12-trixie-slim AS s6-overlay-base
WORKDIR /usr/src/s6

View File

@@ -262,6 +262,10 @@ your files differently, you can do that by adjusting the
or using [storage paths (see below)](#storage-paths). Paperless adds the
correct file extension e.g. `.pdf`, `.jpg` automatically.
When a document has file versions, each version uses the same naming rules and
storage path resolution as any other document file, with an added version suffix
such as `_v1`, `_v2`, etc.
This variable allows you to configure the filename (folders are allowed)
using placeholders. For example, configuring this to
@@ -353,6 +357,8 @@ If paperless detects that two documents share the same filename,
paperless will automatically append `_01`, `_02`, etc to the filename.
This happens if all the placeholders in a filename evaluate to the same
value.
For versioned files, this counter is appended after the version suffix
(for example `statement_v2_01.pdf`).
If there are any errors in the placeholders included in `PAPERLESS_FILENAME_FORMAT`,
paperless will fall back to using the default naming scheme instead.

View File

@@ -95,6 +95,7 @@ Think of versions as **file history** for a document.
- Versions track the underlying file and extracted text content (OCR/text).
- Metadata such as tags, correspondent, document type, storage path and custom fields stay on the "root" document.
- Version files follow normal filename formatting (including storage paths) and add a `_vN` suffix (for example `_v1`, `_v2`).
- By default, search and document content use the latest version.
- In document detail, selecting a version switches the preview, file metadata and content (and download etc buttons) to that version.
- Deleting a non-root version keeps metadata and falls back to the latest remaining version.

View File

@@ -111,7 +111,6 @@ docs = [
testing = [
"daphne",
"factory-boy~=3.3.1",
"faker~=40.5.1",
"imagehash",
"pytest~=9.0.0",
"pytest-cov~=7.0.0",
@@ -305,7 +304,6 @@ markers = [
"greenmail: Tests requiring Greenmail service",
"date_parsing: Tests which cover date parsing from content or filename",
"management: Tests which cover management commands/functionality",
"profiling: Benchmarks that profile and compare implementation performance",
]
[tool.pytest_env]

View File

@@ -1238,8 +1238,8 @@
<context context-type="linenumber">82</context>
</context-group>
</trans-unit>
<trans-unit id="7860582931776068318" datatype="html">
<source>Add document version</source>
<trans-unit id="8035757452478567832" datatype="html">
<source>Update existing document</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/settings/settings.component.html</context>
<context context-type="linenumber">280</context>
@@ -8411,8 +8411,8 @@
<context context-type="linenumber">832</context>
</context-group>
</trans-unit>
<trans-unit id="5203024009814367559" datatype="html">
<source>This operation will add rotated versions of the <x id="PH" equiv-text="this.list.selected.size"/> document(s).</source>
<trans-unit id="6390006284731990222" datatype="html">
<source>This operation will permanently rotate the original version of <x id="PH" equiv-text="this.list.selected.size"/> document(s).</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/bulk-editor/bulk-editor.component.ts</context>
<context context-type="linenumber">833</context>

View File

@@ -277,7 +277,7 @@
<div class="col">
<select class="form-select" formControlName="pdfEditorDefaultEditMode">
<option [ngValue]="PdfEditorEditMode.Create" i18n>Create new document(s)</option>
<option [ngValue]="PdfEditorEditMode.Update" i18n>Add document version</option>
<option [ngValue]="PdfEditorEditMode.Update" i18n>Update existing document</option>
</select>
</div>
</div>

View File

@@ -84,7 +84,7 @@
<input type="radio" class="btn-check" [(ngModel)]="editMode" [value]="PdfEditorEditMode.Update" id="editModeUpdate" name="editmode" [disabled]="hasSplit()">
<label for="editModeUpdate" class="btn btn-outline-primary btn-sm">
<i-bs name="pencil"></i-bs>
<span class="form-check-label ms-2" i18n>Add document version</span>
<span class="form-check-label ms-2" i18n>Update existing document</span>
</label>
</div>
@if (editMode === PdfEditorEditMode.Create) {

View File

@@ -830,7 +830,7 @@ export class BulkEditorComponent
})
const rotateDialog = modal.componentInstance as RotateConfirmDialogComponent
rotateDialog.title = $localize`Rotate confirm`
rotateDialog.messageBold = $localize`This operation will add rotated versions of the ${this.list.selected.size} document(s).`
rotateDialog.messageBold = $localize`This operation will permanently rotate the original version of ${this.list.selected.size} document(s).`
rotateDialog.btnClass = 'btn-danger'
rotateDialog.btnCaption = $localize`Proceed`
rotateDialog.documentID = Array.from(this.list.selected)[0]

View File

@@ -11,6 +11,7 @@ import magic
from django.conf import settings
from django.contrib.auth.models import User
from django.db import transaction
from django.db.models import Max
from django.db.models import Q
from django.utils import timezone
from filelock import FileLock
@@ -123,22 +124,6 @@ class ConsumerPluginMixin:
self.filename = self.metadata.filename or self.input_doc.original_file.name
if input_doc.root_document_id:
self.log.debug(
f"Document root document id: {input_doc.root_document_id}",
)
root_document = Document.objects.get(pk=input_doc.root_document_id)
version_index = Document.objects.filter(root_document=root_document).count()
filename_path = Path(self.filename)
if filename_path.suffix:
self.filename = str(
filename_path.with_name(
f"{filename_path.stem}_v{version_index}{filename_path.suffix}",
),
)
else:
self.filename = f"{self.filename}_v{version_index}"
def _send_progress(
self,
current_progress: int,
@@ -193,9 +178,19 @@ class ConsumerPlugin(
mime_type: str,
) -> Document:
self.log.debug("Saving record for updated version to database")
version_doc = Document.objects.get(pk=root_doc.pk)
root_doc_frozen = Document.objects.select_for_update().get(pk=root_doc.pk)
next_version_index = (
Document.global_objects.filter(
root_document_id=root_doc_frozen.pk,
).aggregate(
max_index=Max("version_index"),
)["max_index"]
or 0
)
version_doc = Document.objects.get(pk=root_doc_frozen.pk)
setattr(version_doc, "pk", None)
version_doc.root_document = root_doc
version_doc.root_document = root_doc_frozen
version_doc.version_index = next_version_index + 1
file_for_checksum = (
self.unmodified_original
if self.unmodified_original is not None
@@ -208,7 +203,6 @@ class ConsumerPlugin(
version_doc.page_count = page_count
version_doc.mime_type = mime_type
version_doc.original_filename = self.filename
version_doc.storage_path = root_doc.storage_path
# Clear unique file path fields so they can be generated uniquely later
version_doc.filename = None
version_doc.archive_filename = None

View File

@@ -128,11 +128,18 @@ def generate_filename(
counter=0,
archive_filename=False,
) -> Path:
# version docs use the root document for formatting, just with a suffix
context_doc = doc if doc.root_document_id is None else doc.root_document
version_suffix = (
f"_v{doc.version_index}"
if doc.root_document_id is not None and doc.version_index is not None
else ""
)
base_path: Path | None = None
# Determine the source of the format string
if doc.storage_path is not None:
filename_format = doc.storage_path.path
if context_doc.storage_path is not None:
filename_format = context_doc.storage_path.path
elif settings.FILENAME_FORMAT is not None:
# Maybe convert old to new style
filename_format = convert_format_str_to_template_format(
@@ -143,7 +150,7 @@ def generate_filename(
# If we have one, render it
if filename_format is not None:
rendered_path: str | None = format_filename(doc, filename_format)
rendered_path: str | None = format_filename(context_doc, filename_format)
if rendered_path:
base_path = Path(rendered_path)
@@ -157,7 +164,7 @@ def generate_filename(
base_filename = base_path.name
# Build the final filename with counter and filetype
final_filename = f"{base_filename}{counter_str}{filetype_str}"
final_filename = f"{base_filename}{version_suffix}{counter_str}{filetype_str}"
# If we have a directory component, include it
if str(directory) != ".":
@@ -166,7 +173,9 @@ def generate_filename(
full_path = Path(final_filename)
else:
# No template, use document ID
final_filename = f"{doc.pk:07}{counter_str}{filetype_str}"
final_filename = (
f"{context_doc.pk:07}{version_suffix}{counter_str}{filetype_str}"
)
full_path = Path(final_filename)
return full_path

View File

@@ -6,14 +6,11 @@ Provides automatic progress bar and multiprocessing support with minimal boilerp
from __future__ import annotations
import logging
import os
from collections.abc import Callable
from collections.abc import Iterable
from collections.abc import Sized
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import as_completed
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Any
@@ -25,11 +22,7 @@ from django import db
from django.core.management import CommandError
from django.db.models import QuerySet
from django_rich.management import RichCommand
from rich import box
from rich.console import Console
from rich.console import Group
from rich.console import RenderableType
from rich.live import Live
from rich.progress import BarColumn
from rich.progress import MofNCompleteColumn
from rich.progress import Progress
@@ -37,11 +30,11 @@ from rich.progress import SpinnerColumn
from rich.progress import TextColumn
from rich.progress import TimeElapsedColumn
from rich.progress import TimeRemainingColumn
from rich.table import Table
from rich.text import Text
if TYPE_CHECKING:
from collections.abc import Callable
from collections.abc import Generator
from collections.abc import Iterable
from collections.abc import Sequence
from django.core.management import CommandParser
@@ -50,78 +43,6 @@ T = TypeVar("T")
R = TypeVar("R")
@dataclass(slots=True, frozen=True)
class _BufferedRecord:
level: int
name: str
message: str
class BufferingLogHandler(logging.Handler):
"""Captures log records during a command run for deferred rendering.
Attach to a logger before a long operation and call ``render()``
afterwards to emit the buffered records via Rich, optionally filtered
by minimum level.
"""
def __init__(self) -> None:
super().__init__()
self._records: list[_BufferedRecord] = []
def emit(self, record: logging.LogRecord) -> None:
self._records.append(
_BufferedRecord(
level=record.levelno,
name=record.name,
message=self.format(record),
),
)
def render(
self,
console: Console,
*,
min_level: int = logging.DEBUG,
title: str = "Log Output",
) -> None:
records = [r for r in self._records if r.level >= min_level]
if not records:
return
table = Table(
title=title,
show_header=True,
header_style="bold",
show_lines=False,
box=box.SIMPLE,
)
table.add_column("Level", style="bold", width=8)
table.add_column("Logger", style="dim")
table.add_column("Message", no_wrap=False)
_level_styles: dict[int, str] = {
logging.DEBUG: "dim",
logging.INFO: "cyan",
logging.WARNING: "yellow",
logging.ERROR: "red",
logging.CRITICAL: "bold red",
}
for record in records:
style = _level_styles.get(record.level, "")
table.add_row(
Text(logging.getLevelName(record.level), style=style),
record.name,
record.message,
)
console.print(table)
def clear(self) -> None:
self._records.clear()
@dataclass(frozen=True, slots=True)
class ProcessResult(Generic[T, R]):
"""
@@ -170,23 +91,6 @@ class PaperlessCommand(RichCommand):
for result in self.process_parallel(process_doc, ids):
if result.error:
self.console.print(f"[red]Failed: {result.error}[/red]")
class Command(PaperlessCommand):
help = "Import documents with live stats"
def handle(self, *args, **options):
stats = ImportStats()
def render_stats() -> Table:
... # build Rich Table from stats
for item in self.track_with_stats(
items,
description="Importing...",
stats_renderer=render_stats,
):
result = import_item(item)
stats.imported += 1
"""
supports_progress_bar: ClassVar[bool] = True
@@ -224,11 +128,13 @@ class PaperlessCommand(RichCommand):
This is called by Django's command infrastructure after argument parsing
but before handle(). We use it to set instance attributes from options.
"""
# Set progress bar state
if self.supports_progress_bar:
self.no_progress_bar = options.get("no_progress_bar", False)
else:
self.no_progress_bar = True
# Set multiprocessing state
if self.supports_multiprocessing:
self.process_count = options.get("processes", 1)
if self.process_count < 1:
@@ -238,69 +144,9 @@ class PaperlessCommand(RichCommand):
return super().execute(*args, **options)
@contextmanager
def buffered_logging(
self,
*logger_names: str,
level: int = logging.DEBUG,
) -> Generator[BufferingLogHandler, None, None]:
"""Context manager that captures log output from named loggers.
Installs a ``BufferingLogHandler`` on each named logger for the
duration of the block, suppressing propagation to avoid interleaving
with the Rich live display. The handler is removed on exit regardless
of whether an exception occurred.
Usage::
with self.buffered_logging("paperless", "documents") as log_buf:
# ... run progress loop ...
if options["verbose"]:
log_buf.render(self.console)
"""
handler = BufferingLogHandler()
handler.setFormatter(logging.Formatter("%(message)s"))
loggers: list[logging.Logger] = []
original_propagate: dict[str, bool] = {}
for name in logger_names:
log = logging.getLogger(name)
log.addHandler(handler)
original_propagate[name] = log.propagate
log.propagate = False
loggers.append(log)
try:
yield handler
finally:
for log in loggers:
log.removeHandler(handler)
log.propagate = original_propagate[log.name]
@staticmethod
def _progress_columns() -> tuple[Any, ...]:
"""
Return the standard set of progress bar columns.
Extracted so both _create_progress (standalone) and track_with_stats
(inside Live) use identical column configuration without duplication.
"""
return (
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
MofNCompleteColumn(),
TimeElapsedColumn(),
TimeRemainingColumn(),
)
def _create_progress(self, description: str) -> Progress:
"""
Create a standalone Progress instance with its own stderr Console.
Use this for track(). For track_with_stats(), Progress is created
directly inside a Live context instead.
Create a configured Progress instance.
Progress output is directed to stderr to match the convention that
progress bars are transient UI feedback, not command output. This
@@ -315,7 +161,12 @@ class PaperlessCommand(RichCommand):
A Progress instance configured with appropriate columns.
"""
return Progress(
*self._progress_columns(),
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
MofNCompleteColumn(),
TimeElapsedColumn(),
TimeRemainingColumn(),
console=Console(stderr=True),
transient=False,
)
@@ -371,6 +222,7 @@ class PaperlessCommand(RichCommand):
yield from iterable
return
# Attempt to determine total if not provided
if total is None:
total = self._get_iterable_length(iterable)
@@ -380,87 +232,6 @@ class PaperlessCommand(RichCommand):
yield item
progress.advance(task_id)
def track_with_stats(
self,
iterable: Iterable[T],
*,
description: str = "Processing...",
stats_renderer: Callable[[], RenderableType],
total: int | None = None,
) -> Generator[T, None, None]:
"""
Iterate over items with a progress bar and a live-updating stats display.
The progress bar and stats renderable are combined in a single Live
context, so the stats panel re-renders in place below the progress bar
after each item is processed.
Respects --no-progress-bar flag. When disabled, yields items without
any display (stats are still updated by the caller's loop body, so
they will be accurate for any post-loop summary the caller prints).
Args:
iterable: The items to iterate over.
description: Text to display alongside the progress bar.
stats_renderer: Zero-argument callable that returns a Rich
renderable. Called after each item to refresh the display.
The caller typically closes over a mutable dataclass and
rebuilds a Table from it on each call.
total: Total number of items. If None, attempts to determine
automatically via .count() (for querysets) or len().
Yields:
Items from the iterable.
Example:
@dataclass
class Stats:
processed: int = 0
failed: int = 0
stats = Stats()
def render_stats() -> Table:
table = Table(box=None)
table.add_column("Processed")
table.add_column("Failed")
table.add_row(str(stats.processed), str(stats.failed))
return table
for item in self.track_with_stats(
items,
description="Importing...",
stats_renderer=render_stats,
):
try:
import_item(item)
stats.processed += 1
except Exception:
stats.failed += 1
"""
if self.no_progress_bar:
yield from iterable
return
if total is None:
total = self._get_iterable_length(iterable)
stderr_console = Console(stderr=True)
# Progress is created without its own console so Live controls rendering.
progress = Progress(*self._progress_columns())
task_id = progress.add_task(description, total=total)
with Live(
Group(progress, stats_renderer()),
console=stderr_console,
refresh_per_second=4,
) as live:
for item in iterable:
yield item
progress.advance(task_id)
live.update(Group(progress, stats_renderer()))
def process_parallel(
self,
fn: Callable[[T], R],
@@ -498,7 +269,7 @@ class PaperlessCommand(RichCommand):
total = len(items)
if self.process_count == 1:
# Sequential execution in main process - critical for testing, so we don't fork in fork, etc
# Sequential execution in main process - critical for testing
yield from self._process_sequential(fn, items, description, total)
else:
# Parallel execution with ProcessPoolExecutor
@@ -527,7 +298,6 @@ class PaperlessCommand(RichCommand):
total: int,
) -> Generator[ProcessResult[T, R], None, None]:
"""Process items in parallel using ProcessPoolExecutor."""
# Close database connections before forking - required for PostgreSQL
db.connections.close_all()

View File

@@ -3,8 +3,6 @@ import json
import os
import shutil
import tempfile
from itertools import chain
from itertools import islice
from pathlib import Path
from typing import TYPE_CHECKING
@@ -21,7 +19,6 @@ from django.contrib.contenttypes.models import ContentType
from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.json import DjangoJSONEncoder
from django.db import transaction
from django.utils import timezone
from filelock import FileLock
@@ -29,8 +26,6 @@ from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
if TYPE_CHECKING:
from collections.abc import Generator
from django.db.models import QuerySet
if settings.AUDIT_LOG_ENABLED:
@@ -65,22 +60,6 @@ from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
def serialize_queryset_batched(
queryset: "QuerySet",
*,
batch_size: int = 500,
) -> "Generator[list[dict], None, None]":
"""Yield batches of serialized records from a QuerySet.
Each batch is a list of dicts in Django's Python serialization format.
Uses QuerySet.iterator() to avoid loading the full queryset into memory,
and islice to collect chunk-sized batches serialized in a single call.
"""
iterator = queryset.iterator(chunk_size=batch_size)
while chunk := list(islice(iterator, batch_size)):
yield serializers.serialize("python", chunk)
class Command(CryptMixin, BaseCommand):
help = (
"Decrypt and rename all files in our collection into a given target "
@@ -207,17 +186,6 @@ class Command(CryptMixin, BaseCommand):
help="If provided, is used to encrypt sensitive data in the export",
)
parser.add_argument(
"--batch-size",
type=int,
default=500,
help=(
"Number of records to process per batch during serialization. "
"Lower values reduce peak memory usage; higher values improve "
"throughput. Default: 500."
),
)
def handle(self, *args, **options) -> None:
self.target = Path(options["target"]).resolve()
self.split_manifest: bool = options["split_manifest"]
@@ -232,7 +200,6 @@ class Command(CryptMixin, BaseCommand):
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.passphrase: str | None = options.get("passphrase")
self.batch_size: int = options["batch_size"]
self.files_in_export_dir: set[Path] = set()
self.exported_files: set[str] = set()
@@ -327,13 +294,8 @@ class Command(CryptMixin, BaseCommand):
# Build an overall manifest
for key, object_query in manifest_key_to_object_query.items():
manifest_dict[key] = list(
chain.from_iterable(
serialize_queryset_batched(
object_query,
batch_size=self.batch_size,
),
),
manifest_dict[key] = json.loads(
serializers.serialize("json", object_query),
)
self.encrypt_secret_fields(manifest_dict)
@@ -550,24 +512,14 @@ class Command(CryptMixin, BaseCommand):
self.files_in_export_dir.remove(target)
if self.compare_json:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
src_str = json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
)
src_str = json.dumps(content, indent=2, ensure_ascii=False)
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum:
perform_write = False
if perform_write:
target.write_text(
json.dumps(
content,
cls=DjangoJSONEncoder,
indent=2,
ensure_ascii=False,
),
json.dumps(content, indent=2, ensure_ascii=False),
encoding="utf-8",
)

View File

@@ -1,12 +1,4 @@
from __future__ import annotations
import logging
from dataclasses import dataclass
from dataclasses import field
from typing import TYPE_CHECKING
from rich.table import Table
from rich.text import Text
from documents.classifier import load_classifier
from documents.management.commands.base import PaperlessCommand
@@ -16,162 +8,9 @@ from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags
if TYPE_CHECKING:
from rich.console import RenderableType
from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
logger = logging.getLogger("paperless.management.retagger")
@dataclass(slots=True)
class RetaggerStats:
"""Cumulative counters updated as the retagger processes documents.
Mutable by design -- fields are incremented in the processing loop.
slots=True reduces per-instance memory overhead and speeds attribute access.
"""
correspondents: int = 0
document_types: int = 0
tags_added: int = 0
tags_removed: int = 0
storage_paths: int = 0
documents_processed: int = 0
@dataclass(slots=True)
class DocumentSuggestion:
"""Buffered classifier suggestions for a single document (suggest mode only).
Mutable by design -- fields are assigned incrementally as each setter runs.
"""
document: Document
correspondent: Correspondent | None = None
document_type: DocumentType | None = None
tags_to_add: frozenset[Tag] = field(default_factory=frozenset)
tags_to_remove: frozenset[Tag] = field(default_factory=frozenset)
storage_path: StoragePath | None = None
@property
def has_suggestions(self) -> bool:
return bool(
self.correspondent is not None
or self.document_type is not None
or self.tags_to_add
or self.tags_to_remove
or self.storage_path is not None,
)
def _build_stats_table(stats: RetaggerStats, *, suggest: bool) -> Table:
"""
Build the live-updating stats table shown below the progress bar.
In suggest mode the labels read "would set / would add" to make clear
that nothing has been written to the database.
"""
table = Table(box=None, padding=(0, 2), show_header=True, header_style="bold")
table.add_column("Documents")
table.add_column("Correspondents")
table.add_column("Doc Types")
table.add_column("Tags (+)")
table.add_column("Tags (-)")
table.add_column("Storage Paths")
verb = "would set" if suggest else "set"
table.add_row(
str(stats.documents_processed),
f"{stats.correspondents} {verb}",
f"{stats.document_types} {verb}",
f"+{stats.tags_added}",
f"-{stats.tags_removed}",
f"{stats.storage_paths} {verb}",
)
return table
def _build_suggestion_table(
suggestions: list[DocumentSuggestion],
base_url: str | None,
) -> Table:
"""
Build the final suggestion table printed after the progress bar completes.
Only documents with at least one suggestion are included.
"""
table = Table(
title="Suggested Changes",
show_header=True,
header_style="bold cyan",
show_lines=True,
)
table.add_column("Document", style="bold", no_wrap=False, min_width=20)
table.add_column("Correspondent")
table.add_column("Doc Type")
table.add_column("Tags")
table.add_column("Storage Path")
for suggestion in suggestions:
if not suggestion.has_suggestions:
continue
doc = suggestion.document
if base_url:
doc_cell = Text()
doc_cell.append(str(doc))
doc_cell.append(f"\n{base_url}/documents/{doc.pk}", style="dim")
else:
doc_cell = Text(f"{doc} [{doc.pk}]")
tag_parts: list[str] = []
for tag in sorted(suggestion.tags_to_add, key=lambda t: t.name):
tag_parts.append(f"[green]+{tag.name}[/green]")
for tag in sorted(suggestion.tags_to_remove, key=lambda t: t.name):
tag_parts.append(f"[red]-{tag.name}[/red]")
tag_cell = Text.from_markup(", ".join(tag_parts)) if tag_parts else Text("-")
table.add_row(
doc_cell,
str(suggestion.correspondent) if suggestion.correspondent else "-",
str(suggestion.document_type) if suggestion.document_type else "-",
tag_cell,
str(suggestion.storage_path) if suggestion.storage_path else "-",
)
return table
def _build_summary_table(stats: RetaggerStats) -> Table:
"""Build the final applied-changes summary table."""
table = Table(
title="Retagger Summary",
show_header=True,
header_style="bold cyan",
)
table.add_column("Metric", style="bold")
table.add_column("Count", justify="right")
table.add_row("Documents processed", str(stats.documents_processed))
table.add_row("Correspondents set", str(stats.correspondents))
table.add_row("Document types set", str(stats.document_types))
table.add_row("Tags added", str(stats.tags_added))
table.add_row("Tags removed", str(stats.tags_removed))
table.add_row("Storage paths set", str(stats.storage_paths))
return table
class Command(PaperlessCommand):
help = (
"Using the current classification model, assigns correspondents, tags "
@@ -180,7 +19,7 @@ class Command(PaperlessCommand):
"modified) after their initial import."
)
def add_arguments(self, parser) -> None:
def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument("-c", "--correspondent", default=False, action="store_true")
parser.add_argument("-T", "--tags", default=False, action="store_true")
@@ -192,9 +31,9 @@ class Command(PaperlessCommand):
default=False,
action="store_true",
help=(
"By default this command will not try to assign a correspondent "
"if more than one matches the document. Use this flag to pick "
"the first match instead."
"By default this command won't try to assign a correspondent "
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds."
),
)
parser.add_argument(
@@ -203,140 +42,91 @@ class Command(PaperlessCommand):
default=False,
action="store_true",
help=(
"Overwrite any previously set correspondent, document type, and "
"remove tags that no longer match due to changed rules."
"If set, the document retagger will overwrite any previously "
"set correspondent, document and remove correspondents, types "
"and tags that do not match anymore due to changed rules."
),
)
parser.add_argument(
"--suggest",
default=False,
action="store_true",
help="Show what would be changed without applying anything.",
help="Return the suggestion, don't change anything.",
)
parser.add_argument(
"--base-url",
help="Base URL used to build document links in suggest output.",
help="The base URL to use to build the link to the documents.",
)
parser.add_argument(
"--id-range",
help="Restrict retagging to documents within this ID range (inclusive).",
help="A range of document ids on which the retagging should be applied.",
nargs=2,
type=int,
)
def handle(self, *args, **options) -> None:
suggest: bool = options["suggest"]
overwrite: bool = options["overwrite"]
use_first: bool = options["use_first"]
base_url: str | None = options["base_url"]
do_correspondent: bool = options["correspondent"]
do_document_type: bool = options["document_type"]
do_tags: bool = options["tags"]
do_storage_path: bool = options["storage_path"]
if not any([do_correspondent, do_document_type, do_tags, do_storage_path]):
self.console.print(
"[yellow]No classifier targets specified. "
"Use -c, -T, -t, or -s to select what to retag.[/yellow]",
)
return
def handle(self, *args, **options):
if options["inbox_only"]:
queryset = Document.objects.filter(tags__is_inbox_tag=True)
else:
queryset = Document.objects.all()
if options["id_range"]:
lo, hi = options["id_range"]
queryset = queryset.filter(id__range=(lo, hi))
queryset = queryset.filter(
id__range=(options["id_range"][0], options["id_range"][1]),
)
documents = queryset.distinct()
classifier = load_classifier()
stats = RetaggerStats()
suggestions: list[DocumentSuggestion] = []
for document in self.track(documents, description="Retagging..."):
if options["correspondent"]:
set_correspondent(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
def render_stats() -> RenderableType:
return _build_stats_table(stats, suggest=suggest)
if options["document_type"]:
set_document_type(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
with self.buffered_logging(
"paperless",
"paperless.handlers",
"documents",
) as log_buf:
for document in self.track_with_stats(
documents,
description="Retagging...",
stats_renderer=render_stats,
):
suggestion = DocumentSuggestion(document=document)
if options["tags"]:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if do_correspondent:
correspondent = set_correspondent(
None,
document,
classifier=classifier,
replace=overwrite,
use_first=use_first,
dry_run=suggest,
)
if correspondent is not None:
stats.correspondents += 1
suggestion.correspondent = correspondent
if do_document_type:
document_type = set_document_type(
None,
document,
classifier=classifier,
replace=overwrite,
use_first=use_first,
dry_run=suggest,
)
if document_type is not None:
stats.document_types += 1
suggestion.document_type = document_type
if do_tags:
tags_to_add, tags_to_remove = set_tags(
None,
document,
classifier=classifier,
replace=overwrite,
dry_run=suggest,
)
stats.tags_added += len(tags_to_add)
stats.tags_removed += len(tags_to_remove)
suggestion.tags_to_add = frozenset(tags_to_add)
suggestion.tags_to_remove = frozenset(tags_to_remove)
if do_storage_path:
storage_path = set_storage_path(
None,
document,
classifier=classifier,
replace=overwrite,
use_first=use_first,
dry_run=suggest,
)
if storage_path is not None:
stats.storage_paths += 1
suggestion.storage_path = storage_path
stats.documents_processed += 1
if suggest:
suggestions.append(suggestion)
# Post-loop output
if suggest:
visible = [s for s in suggestions if s.has_suggestions]
if visible:
self.console.print(_build_suggestion_table(visible, base_url))
else:
self.console.print("[green]No changes suggested.[/green]")
else:
self.console.print(_build_summary_table(stats))
log_buf.render(self.console, min_level=logging.INFO, title="Retagger Log")
if options["storage_path"]:
set_storage_path(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)

View File

@@ -1,4 +1,4 @@
# Generated by Django 5.2.11 on 2026-03-03 16:27
# Generated by Django 5.2.7 on 2026-01-15 22:08
import datetime
@@ -21,207 +21,6 @@ class Migration(migrations.Migration):
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
replaces = [
("documents", "0001_initial"),
("documents", "0002_auto_20151226_1316"),
("documents", "0003_sender"),
("documents", "0004_auto_20160114_1844"),
(
"documents",
"0004_auto_20160114_1844_squashed_0011_auto_20160303_1929",
),
("documents", "0005_auto_20160123_0313"),
("documents", "0006_auto_20160123_0430"),
("documents", "0007_auto_20160126_2114"),
("documents", "0008_document_file_type"),
("documents", "0009_auto_20160214_0040"),
("documents", "0010_log"),
("documents", "0011_auto_20160303_1929"),
("documents", "0012_auto_20160305_0040"),
("documents", "0013_auto_20160325_2111"),
("documents", "0014_document_checksum"),
("documents", "0015_add_insensitive_to_match"),
(
"documents",
"0015_add_insensitive_to_match_squashed_0018_auto_20170715_1712",
),
("documents", "0016_auto_20170325_1558"),
("documents", "0017_auto_20170512_0507"),
("documents", "0018_auto_20170715_1712"),
("documents", "0019_add_consumer_user"),
("documents", "0020_document_added"),
("documents", "0021_document_storage_type"),
("documents", "0022_auto_20181007_1420"),
("documents", "0023_document_current_filename"),
("documents", "1000_update_paperless_all"),
("documents", "1001_auto_20201109_1636"),
("documents", "1002_auto_20201111_1105"),
("documents", "1003_mime_types"),
("documents", "1004_sanity_check_schedule"),
("documents", "1005_checksums"),
("documents", "1006_auto_20201208_2209"),
(
"documents",
"1006_auto_20201208_2209_squashed_1011_auto_20210101_2340",
),
("documents", "1007_savedview_savedviewfilterrule"),
("documents", "1008_auto_20201216_1736"),
("documents", "1009_auto_20201216_2005"),
("documents", "1010_auto_20210101_2159"),
("documents", "1011_auto_20210101_2340"),
("documents", "1012_fix_archive_files"),
("documents", "1013_migrate_tag_colour"),
("documents", "1014_auto_20210228_1614"),
("documents", "1015_remove_null_characters"),
("documents", "1016_auto_20210317_1351"),
(
"documents",
"1016_auto_20210317_1351_squashed_1020_merge_20220518_1839",
),
("documents", "1017_alter_savedviewfilterrule_rule_type"),
("documents", "1018_alter_savedviewfilterrule_value"),
("documents", "1019_storagepath_document_storage_path"),
("documents", "1019_uisettings"),
("documents", "1020_merge_20220518_1839"),
("documents", "1021_webp_thumbnail_conversion"),
("documents", "1022_paperlesstask"),
(
"documents",
"1022_paperlesstask_squashed_1036_alter_savedviewfilterrule_rule_type",
),
("documents", "1023_add_comments"),
("documents", "1024_document_original_filename"),
("documents", "1025_alter_savedviewfilterrule_rule_type"),
("documents", "1026_transition_to_celery"),
(
"documents",
"1027_remove_paperlesstask_attempted_task_and_more",
),
(
"documents",
"1028_remove_paperlesstask_task_args_and_more",
),
("documents", "1029_alter_document_archive_serial_number"),
("documents", "1030_alter_paperlesstask_task_file_name"),
(
"documents",
"1031_remove_savedview_user_correspondent_owner_and_more",
),
(
"documents",
"1032_alter_correspondent_matching_algorithm_and_more",
),
(
"documents",
"1033_alter_documenttype_options_alter_tag_options_and_more",
),
("documents", "1034_alter_savedviewfilterrule_rule_type"),
("documents", "1035_rename_comment_note"),
("documents", "1036_alter_savedviewfilterrule_rule_type"),
("documents", "1037_webp_encrypted_thumbnail_conversion"),
("documents", "1038_sharelink"),
("documents", "1039_consumptiontemplate"),
(
"documents",
"1040_customfield_customfieldinstance_and_more",
),
("documents", "1041_alter_consumptiontemplate_sources"),
(
"documents",
"1042_consumptiontemplate_assign_custom_fields_and_more",
),
("documents", "1043_alter_savedviewfilterrule_rule_type"),
(
"documents",
"1044_workflow_workflowaction_workflowtrigger_and_more",
),
(
"documents",
"1045_alter_customfieldinstance_value_monetary",
),
(
"documents",
"1045_alter_customfieldinstance_value_monetary_squashed_1049_document_deleted_at_document_restored_at",
),
(
"documents",
"1046_workflowaction_remove_all_correspondents_and_more",
),
("documents", "1047_savedview_display_mode_and_more"),
("documents", "1048_alter_savedviewfilterrule_rule_type"),
(
"documents",
"1049_document_deleted_at_document_restored_at",
),
("documents", "1050_customfield_extra_data_and_more"),
(
"documents",
"1051_alter_correspondent_owner_alter_document_owner_and_more",
),
("documents", "1052_document_transaction_id"),
("documents", "1053_document_page_count"),
(
"documents",
"1054_customfieldinstance_value_monetary_amount_and_more",
),
("documents", "1055_alter_storagepath_path"),
(
"documents",
"1056_customfieldinstance_deleted_at_and_more",
),
("documents", "1057_paperlesstask_owner"),
(
"documents",
"1058_workflowtrigger_schedule_date_custom_field_and_more",
),
(
"documents",
"1059_workflowactionemail_workflowactionwebhook_and_more",
),
(
"documents",
"1060_alter_customfieldinstance_value_select",
),
("documents", "1061_workflowactionwebhook_as_json"),
("documents", "1062_alter_savedviewfilterrule_rule_type"),
(
"documents",
"1063_paperlesstask_type_alter_paperlesstask_task_name_and_more",
),
("documents", "1064_delete_log"),
(
"documents",
"1065_workflowaction_assign_custom_fields_values",
),
(
"documents",
"1066_alter_workflowtrigger_schedule_offset_days",
),
("documents", "1067_alter_document_created"),
("documents", "1068_alter_document_created"),
(
"documents",
"1069_workflowtrigger_filter_has_storage_path_and_more",
),
(
"documents",
"1070_customfieldinstance_value_long_text_and_more",
),
(
"documents",
"1071_tag_tn_ancestors_count_tag_tn_ancestors_pks_and_more",
),
(
"documents",
"1072_workflowtrigger_filter_custom_field_query_and_more",
),
("documents", "1073_migrate_workflow_title_jinja"),
(
"documents",
"1074_workflowrun_deleted_at_workflowrun_restored_at_and_more",
),
]
operations = [
migrations.CreateModel(
name="WorkflowActionEmail",
@@ -386,6 +185,70 @@ class Migration(migrations.Migration):
"abstract": False,
},
),
migrations.CreateModel(
name="CustomField",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
editable=False,
verbose_name="created",
),
),
("name", models.CharField(max_length=128)),
(
"data_type",
models.CharField(
choices=[
("string", "String"),
("url", "URL"),
("date", "Date"),
("boolean", "Boolean"),
("integer", "Integer"),
("float", "Float"),
("monetary", "Monetary"),
("documentlink", "Document Link"),
("select", "Select"),
("longtext", "Long Text"),
],
editable=False,
max_length=50,
verbose_name="data type",
),
),
(
"extra_data",
models.JSONField(
blank=True,
help_text="Extra data for the custom field, such as select options",
null=True,
verbose_name="extra data",
),
),
],
options={
"verbose_name": "custom field",
"verbose_name_plural": "custom fields",
"ordering": ("created",),
"constraints": [
models.UniqueConstraint(
fields=("name",),
name="documents_customfield_unique_name",
),
],
},
),
migrations.CreateModel(
name="DocumentType",
fields=[
@@ -870,6 +733,17 @@ class Migration(migrations.Migration):
verbose_name="correspondent",
),
),
(
"owner",
models.ForeignKey(
blank=True,
default=None,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
),
),
(
"document_type",
models.ForeignKey(
@@ -893,14 +767,12 @@ class Migration(migrations.Migration):
),
),
(
"owner",
models.ForeignKey(
"tags",
models.ManyToManyField(
blank=True,
default=None,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
to=settings.AUTH_USER_MODEL,
verbose_name="owner",
related_name="documents",
to="documents.tag",
verbose_name="tags",
),
),
],
@@ -910,140 +782,6 @@ class Migration(migrations.Migration):
"ordering": ("-created",),
},
),
migrations.AddField(
model_name="document",
name="tags",
field=models.ManyToManyField(
blank=True,
related_name="documents",
to="documents.tag",
verbose_name="tags",
),
),
migrations.CreateModel(
name="Note",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("deleted_at", models.DateTimeField(blank=True, null=True)),
("restored_at", models.DateTimeField(blank=True, null=True)),
("transaction_id", models.UUIDField(blank=True, null=True)),
(
"note",
models.TextField(
blank=True,
help_text="Note for the document",
verbose_name="content",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
verbose_name="created",
),
),
(
"document",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="notes",
to="documents.document",
verbose_name="document",
),
),
(
"user",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="notes",
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
],
options={
"verbose_name": "note",
"verbose_name_plural": "notes",
"ordering": ("created",),
},
),
migrations.CreateModel(
name="CustomField",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
editable=False,
verbose_name="created",
),
),
("name", models.CharField(max_length=128)),
(
"data_type",
models.CharField(
choices=[
("string", "String"),
("url", "URL"),
("date", "Date"),
("boolean", "Boolean"),
("integer", "Integer"),
("float", "Float"),
("monetary", "Monetary"),
("documentlink", "Document Link"),
("select", "Select"),
("longtext", "Long Text"),
],
editable=False,
max_length=50,
verbose_name="data type",
),
),
(
"extra_data",
models.JSONField(
blank=True,
help_text="Extra data for the custom field, such as select options",
null=True,
verbose_name="extra data",
),
),
],
options={
"verbose_name": "custom field",
"verbose_name_plural": "custom fields",
"ordering": ("created",),
"constraints": [
models.UniqueConstraint(
fields=("name",),
name="documents_customfield_unique_name",
),
],
},
),
migrations.CreateModel(
name="CustomFieldInstance",
fields=[
@@ -1142,6 +880,66 @@ class Migration(migrations.Migration):
"ordering": ("created",),
},
),
migrations.CreateModel(
name="Note",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("deleted_at", models.DateTimeField(blank=True, null=True)),
("restored_at", models.DateTimeField(blank=True, null=True)),
("transaction_id", models.UUIDField(blank=True, null=True)),
(
"note",
models.TextField(
blank=True,
help_text="Note for the document",
verbose_name="content",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
verbose_name="created",
),
),
(
"document",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="notes",
to="documents.document",
verbose_name="document",
),
),
(
"user",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="notes",
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
],
options={
"verbose_name": "note",
"verbose_name_plural": "notes",
"ordering": ("created",),
},
),
migrations.CreateModel(
name="PaperlessTask",
fields=[
@@ -1188,6 +986,7 @@ class Migration(migrations.Migration):
("train_classifier", "Train Classifier"),
("check_sanity", "Check Sanity"),
("index_optimize", "Index Optimize"),
("llmindex_update", "LLM Index Update"),
],
help_text="Name of the task that was run",
max_length=255,
@@ -1581,7 +1380,6 @@ class Migration(migrations.Migration):
verbose_name="Workflow Action Type",
),
),
("order", models.PositiveIntegerField(default=0, verbose_name="order")),
(
"assign_title",
models.TextField(

View File

@@ -1,4 +1,4 @@
# Generated by Django 5.2.11 on 2026-03-03 16:27
# Generated by Django 5.2.9 on 2026-01-20 18:46
import django.db.models.deletion
from django.db import migrations
@@ -9,14 +9,8 @@ class Migration(migrations.Migration):
initial = True
dependencies = [
("documents", "0001_squashed"),
("paperless_mail", "0001_squashed"),
]
# This migration needs a "replaces", but it doesn't matter which.
# Chose the last 2.20.x migration
replaces = [
("documents", "1075_workflowaction_order"),
("documents", "0001_initial"),
("paperless_mail", "0001_initial"),
]
operations = [

View File

@@ -6,7 +6,7 @@ from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0002_squashed"),
("documents", "0002_initial"),
]
operations = [

View File

@@ -1,30 +0,0 @@
# Generated by Django 5.2.11 on 2026-03-03 16:42
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0013_document_root_document"),
]
operations = [
migrations.AlterField(
model_name="paperlesstask",
name="task_name",
field=models.CharField(
choices=[
("consume_file", "Consume File"),
("train_classifier", "Train Classifier"),
("check_sanity", "Check Sanity"),
("index_optimize", "Index Optimize"),
("llmindex_update", "LLM Index Update"),
],
help_text="Name of the task that was run",
max_length=255,
null=True,
verbose_name="Task Name",
),
),
]

View File

@@ -0,0 +1,37 @@
# Generated by Django 5.2.11 on 2026-03-02 17:48
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0013_document_root_document"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AddField(
model_name="document",
name="version_index",
field=models.PositiveIntegerField(
blank=True,
db_index=True,
help_text="Index of this version within the root document.",
null=True,
verbose_name="version index",
),
),
migrations.AddConstraint(
model_name="document",
constraint=models.UniqueConstraint(
condition=models.Q(
("root_document__isnull", False),
("version_index__isnull", False),
),
fields=("root_document", "version_index"),
name="documents_document_root_version_index_uniq",
),
),
]

View File

@@ -75,7 +75,7 @@ class MatchingModel(ModelWithOwner):
is_insensitive = models.BooleanField(_("is insensitive"), default=True)
class Meta(ModelWithOwner.Meta):
class Meta:
abstract = True
ordering = ("name",)
constraints = [
@@ -317,6 +317,14 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
verbose_name=_("root document for this version"),
)
version_index = models.PositiveIntegerField(
_("version index"),
blank=True,
null=True,
db_index=True,
help_text=_("Index of this version within the root document."),
)
version_label = models.CharField(
_("version label"),
max_length=64,
@@ -329,6 +337,16 @@ class Document(SoftDeleteModel, ModelWithOwner): # type: ignore[django-manager-
ordering = ("-created",)
verbose_name = _("document")
verbose_name_plural = _("documents")
constraints = [
models.UniqueConstraint(
fields=["root_document", "version_index"],
condition=models.Q(
root_document__isnull=False,
version_index__isnull=False,
),
name="documents_document_root_version_index_uniq",
),
]
def __str__(self) -> str:
created = self.created.isoformat()

View File

@@ -1,71 +0,0 @@
"""
Temporary profiling utilities for comparing implementations.
Usage in a management command or shell::
from documents.profiling import profile_block
with profile_block("new check_sanity"):
messages = check_sanity()
with profile_block("old check_sanity"):
messages = check_sanity_old()
Drop this file when done.
"""
from __future__ import annotations
import tracemalloc
from contextlib import contextmanager
from time import perf_counter
from typing import TYPE_CHECKING
from django.db import connection
from django.db import reset_queries
from django.test.utils import override_settings
if TYPE_CHECKING:
from collections.abc import Generator
@contextmanager
def profile_block(label: str = "block") -> Generator[None, None, None]:
"""Profile memory, wall time, and DB queries for a code block.
Prints a summary to stdout on exit. Requires no external packages.
Enables DEBUG temporarily to capture Django's query log.
"""
tracemalloc.start()
snapshot_before = tracemalloc.take_snapshot()
with override_settings(DEBUG=True):
reset_queries()
start = perf_counter()
yield
elapsed = perf_counter() - start
queries = list(connection.queries)
snapshot_after = tracemalloc.take_snapshot()
_, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
# Compare snapshots for top allocations
stats = snapshot_after.compare_to(snapshot_before, "lineno")
query_time = sum(float(q["time"]) for q in queries)
mem_diff = sum(s.size_diff for s in stats)
print(f"\n{'=' * 60}") # noqa: T201
print(f" Profile: {label}") # noqa: T201
print(f"{'=' * 60}") # noqa: T201
print(f" Wall time: {elapsed:.4f}s") # noqa: T201
print(f" Queries: {len(queries)} ({query_time:.4f}s)") # noqa: T201
print(f" Memory delta: {mem_diff / 1024:.1f} KiB") # noqa: T201
print(f" Peak memory: {peak / 1024:.1f} KiB") # noqa: T201
print("\n Top 5 allocations:") # noqa: T201
for stat in stats[:5]:
print(f" {stat}") # noqa: T201
print(f"{'=' * 60}\n") # noqa: T201

View File

@@ -4,7 +4,6 @@ import logging
import shutil
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Any
from celery import shared_task
from celery import states
@@ -33,14 +32,12 @@ from documents.file_handling import create_source_path_directory
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import StoragePath
from documents.models import Tag
from documents.models import UiSettings
from documents.models import Workflow
@@ -84,41 +81,47 @@ def add_inbox_tags(sender, document: Document, logging_group=None, **kwargs) ->
document.add_nested_tags(inbox_tags)
def _suggestion_printer(
stdout,
style_func,
suggestion_type: str,
document: Document,
selected: MatchingModel,
base_url: str | None = None,
) -> None:
"""
Smaller helper to reduce duplication when just outputting suggestions to the console
"""
doc_str = str(document)
if base_url is not None:
stdout.write(style_func.SUCCESS(doc_str))
stdout.write(style_func.SUCCESS(f"{base_url}/documents/{document.pk}"))
else:
stdout.write(style_func.SUCCESS(f"{doc_str} [{document.pk}]"))
stdout.write(f"Suggest {suggestion_type}: {selected}")
def set_correspondent(
sender: object,
sender,
document: Document,
*,
logging_group: object = None,
logging_group=None,
classifier: DocumentClassifier | None = None,
replace: bool = False,
use_first: bool = True,
dry_run: bool = False,
**kwargs: Any,
) -> Correspondent | None:
"""
Assign a correspondent to a document based on classifier results.
Args:
document: The document to classify.
logging_group: Optional logging group for structured log output.
classifier: The trained classifier. If None, only rule-based matching runs.
replace: If True, overwrite an existing correspondent assignment.
use_first: If True, pick the first match when multiple correspondents
match. If False, skip assignment when multiple match.
dry_run: If True, compute and return the selection without saving.
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
Returns:
The correspondent that was (or would be) assigned, or None if no match
was found or assignment was skipped.
"""
replace=False,
use_first=True,
suggest=False,
base_url=None,
stdout=None,
style_func=None,
**kwargs,
) -> None:
if document.correspondent and not replace:
return None
return
potential_correspondents = matching.match_correspondents(document, classifier)
potential_count = len(potential_correspondents)
selected = potential_correspondents[0] if potential_correspondents else None
if potential_count > 1:
if use_first:
logger.debug(
@@ -132,53 +135,49 @@ def set_correspondent(
f"not assigning any correspondent",
extra={"group": logging_group},
)
return None
return
if (selected or replace) and not dry_run:
logger.info(
f"Assigning correspondent {selected} to {document}",
extra={"group": logging_group},
)
document.correspondent = selected
document.save(update_fields=("correspondent",))
if selected or replace:
if suggest:
_suggestion_printer(
stdout,
style_func,
"correspondent",
document,
selected,
base_url,
)
else:
logger.info(
f"Assigning correspondent {selected} to {document}",
extra={"group": logging_group},
)
return selected
document.correspondent = selected
document.save(update_fields=("correspondent",))
def set_document_type(
sender: object,
sender,
document: Document,
*,
logging_group: object = None,
logging_group=None,
classifier: DocumentClassifier | None = None,
replace: bool = False,
use_first: bool = True,
dry_run: bool = False,
**kwargs: Any,
) -> DocumentType | None:
"""
Assign a document type to a document based on classifier results.
Args:
document: The document to classify.
logging_group: Optional logging group for structured log output.
classifier: The trained classifier. If None, only rule-based matching runs.
replace: If True, overwrite an existing document type assignment.
use_first: If True, pick the first match when multiple types match.
If False, skip assignment when multiple match.
dry_run: If True, compute and return the selection without saving.
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
Returns:
The document type that was (or would be) assigned, or None if no match
was found or assignment was skipped.
"""
replace=False,
use_first=True,
suggest=False,
base_url=None,
stdout=None,
style_func=None,
**kwargs,
) -> None:
if document.document_type and not replace:
return None
return
potential_document_types = matching.match_document_types(document, classifier)
potential_count = len(potential_document_types)
selected = potential_document_types[0] if potential_document_types else None
potential_document_type = matching.match_document_types(document, classifier)
potential_count = len(potential_document_type)
selected = potential_document_type[0] if potential_document_type else None
if potential_count > 1:
if use_first:
@@ -193,64 +192,42 @@ def set_document_type(
f"not assigning any document type",
extra={"group": logging_group},
)
return None
return
if (selected or replace) and not dry_run:
logger.info(
f"Assigning document type {selected} to {document}",
extra={"group": logging_group},
)
document.document_type = selected
document.save(update_fields=("document_type",))
if selected or replace:
if suggest:
_suggestion_printer(
stdout,
style_func,
"document type",
document,
selected,
base_url,
)
else:
logger.info(
f"Assigning document type {selected} to {document}",
extra={"group": logging_group},
)
return selected
document.document_type = selected
document.save(update_fields=("document_type",))
def set_tags(
sender: object,
sender,
document: Document,
*,
logging_group: object = None,
logging_group=None,
classifier: DocumentClassifier | None = None,
replace: bool = False,
dry_run: bool = False,
**kwargs: Any,
) -> tuple[set[Tag], set[Tag]]:
"""
Assign tags to a document based on classifier results.
When replace=True, existing auto-matched and rule-matched tags are removed
before applying the new set (inbox tags and manually-added tags are preserved).
Args:
document: The document to classify.
logging_group: Optional logging group for structured log output.
classifier: The trained classifier. If None, only rule-based matching runs.
replace: If True, remove existing classifier-managed tags before applying
new ones. Inbox tags and manually-added tags are always preserved.
dry_run: If True, compute what would change without saving anything.
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
Returns:
A two-tuple of (tags_added, tags_removed). In non-replace mode,
tags_removed is always an empty set. In dry_run mode, neither set
is applied to the database.
"""
# Compute which tags would be removed under replace mode.
# The filter mirrors the .delete() call below: keep inbox tags and
# manually-added tags (match="" and not auto-matched).
replace=False,
suggest=False,
base_url=None,
stdout=None,
style_func=None,
**kwargs,
) -> None:
if replace:
tags_to_remove: set[Tag] = set(
document.tags.exclude(
is_inbox_tag=True,
).exclude(
Q(match="") & ~Q(matching_algorithm=Tag.MATCH_AUTO),
),
)
else:
tags_to_remove = set()
if replace and not dry_run:
Document.tags.through.objects.filter(document=document).exclude(
Q(tag__is_inbox_tag=True),
).exclude(
@@ -258,53 +235,65 @@ def set_tags(
).delete()
current_tags = set(document.tags.all())
matched_tags = matching.match_tags(document, classifier)
tags_to_add = set(matched_tags) - current_tags
if tags_to_add and not dry_run:
matched_tags = matching.match_tags(document, classifier)
relevant_tags = set(matched_tags) - current_tags
if suggest:
extra_tags = current_tags - set(matched_tags)
extra_tags = [
t for t in extra_tags if t.matching_algorithm == MatchingModel.MATCH_AUTO
]
if not relevant_tags and not extra_tags:
return
doc_str = style_func.SUCCESS(str(document))
if base_url:
stdout.write(doc_str)
stdout.write(f"{base_url}/documents/{document.pk}")
else:
stdout.write(doc_str + style_func.SUCCESS(f" [{document.pk}]"))
if relevant_tags:
stdout.write("Suggest tags: " + ", ".join([t.name for t in relevant_tags]))
if extra_tags:
stdout.write("Extra tags: " + ", ".join([t.name for t in extra_tags]))
else:
if not relevant_tags:
return
message = 'Tagging "{}" with "{}"'
logger.info(
f'Tagging "{document}" with "{", ".join(t.name for t in tags_to_add)}"',
message.format(document, ", ".join([t.name for t in relevant_tags])),
extra={"group": logging_group},
)
document.add_nested_tags(tags_to_add)
return tags_to_add, tags_to_remove
document.add_nested_tags(relevant_tags)
def set_storage_path(
sender: object,
sender,
document: Document,
*,
logging_group: object = None,
logging_group=None,
classifier: DocumentClassifier | None = None,
replace: bool = False,
use_first: bool = True,
dry_run: bool = False,
**kwargs: Any,
) -> StoragePath | None:
"""
Assign a storage path to a document based on classifier results.
Args:
document: The document to classify.
logging_group: Optional logging group for structured log output.
classifier: The trained classifier. If None, only rule-based matching runs.
replace: If True, overwrite an existing storage path assignment.
use_first: If True, pick the first match when multiple paths match.
If False, skip assignment when multiple match.
dry_run: If True, compute and return the selection without saving.
**kwargs: Absorbed for Django signal compatibility (e.g. sender, signal).
Returns:
The storage path that was (or would be) assigned, or None if no match
was found or assignment was skipped.
"""
replace=False,
use_first=True,
suggest=False,
base_url=None,
stdout=None,
style_func=None,
**kwargs,
) -> None:
if document.storage_path and not replace:
return None
return
potential_storage_paths = matching.match_storage_paths(document, classifier)
potential_count = len(potential_storage_paths)
selected = potential_storage_paths[0] if potential_storage_paths else None
potential_storage_path = matching.match_storage_paths(
document,
classifier,
)
potential_count = len(potential_storage_path)
selected = potential_storage_path[0] if potential_storage_path else None
if potential_count > 1:
if use_first:
@@ -319,17 +308,26 @@ def set_storage_path(
f"not assigning any storage directory",
extra={"group": logging_group},
)
return None
return
if (selected or replace) and not dry_run:
logger.info(
f"Assigning storage path {selected} to {document}",
extra={"group": logging_group},
)
document.storage_path = selected
document.save(update_fields=("storage_path",))
if selected or replace:
if suggest:
_suggestion_printer(
stdout,
style_func,
"storage directory",
document,
selected,
base_url,
)
else:
logger.info(
f"Assigning storage path {selected} to {document}",
extra={"group": logging_group},
)
return selected
document.storage_path = selected
document.save(update_fields=("storage_path",))
# see empty_trash in documents/tasks.py for signal handling
@@ -598,6 +596,16 @@ def update_filename_and_move_files(
root=settings.ARCHIVE_DIR,
)
# Keep version files in sync with root
if instance.root_document_id is None:
for version_doc in Document.objects.filter(root_document_id=instance.pk).only(
"pk",
):
update_filename_and_move_files(
Document,
version_doc,
)
@shared_task
def process_cf_select_update(custom_field: CustomField) -> None:

View File

@@ -114,14 +114,3 @@ def authenticated_rest_api_client(rest_api_client: APIClient):
user = UserModel.objects.create_user(username="testuser", password="password")
rest_api_client.force_authenticate(user=user)
yield rest_api_client
@pytest.fixture(scope="session", autouse=True)
def faker_session_locale():
"""Set Faker locale for reproducibility."""
return "en_US"
@pytest.fixture(scope="session", autouse=True)
def faker_seed():
return 12345

View File

@@ -1,67 +1,17 @@
"""
Factory-boy factories for documents app models.
"""
from __future__ import annotations
import factory
from factory import Faker
from factory.django import DjangoModelFactory
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
from documents.models import Tag
class CorrespondentFactory(DjangoModelFactory):
class Meta:
model = Correspondent
name = factory.Sequence(lambda n: f"{factory.Faker('company')} {n}")
match = ""
matching_algorithm = MatchingModel.MATCH_NONE
class DocumentTypeFactory(DjangoModelFactory):
class Meta:
model = DocumentType
name = factory.Sequence(lambda n: f"{factory.Faker('bs')} {n}")
match = ""
matching_algorithm = MatchingModel.MATCH_NONE
class TagFactory(DjangoModelFactory):
class Meta:
model = Tag
name = factory.Sequence(lambda n: f"{factory.Faker('word')} {n}")
match = ""
matching_algorithm = MatchingModel.MATCH_NONE
is_inbox_tag = False
class StoragePathFactory(DjangoModelFactory):
class Meta:
model = StoragePath
name = factory.Sequence(
lambda n: f"{factory.Faker('file_path', depth=2, extension='')} {n}",
)
path = factory.LazyAttribute(lambda o: f"{o.name}/{{title}}")
match = ""
matching_algorithm = MatchingModel.MATCH_NONE
name = Faker("name")
class DocumentFactory(DjangoModelFactory):
class Meta:
model = Document
title = factory.Faker("sentence", nb_words=4)
checksum = factory.Faker("md5")
content = factory.Faker("paragraph")
correspondent = None
document_type = None
storage_path = None

View File

@@ -699,6 +699,13 @@ class TestConsumer(
self.assertIsNotNone(root_doc)
assert root_doc is not None
root_storage_path = StoragePath.objects.create(
name="version-root-path",
path="root/{{title}}",
)
root_doc.storage_path = root_storage_path
root_doc.save()
actor = User.objects.create_user(
username="actor",
email="actor@example.com",
@@ -735,7 +742,7 @@ class TestConsumer(
)
consumer.setup()
try:
self.assertTrue(consumer.filename.endswith("_v0.pdf"))
self.assertEqual(consumer.filename, version_file.name)
consumer.run()
finally:
consumer.cleanup()
@@ -745,8 +752,9 @@ class TestConsumer(
version = versions.first()
assert version is not None
assert version.original_filename is not None
self.assertEqual(version.version_index, 1)
self.assertEqual(version.version_label, "v2")
self.assertTrue(version.original_filename.endswith("_v0.pdf"))
self.assertEqual(version.original_filename, version_file.name)
self.assertTrue(bool(version.content))
@override_settings(AUDIT_LOG_ENABLED=True)
@@ -795,7 +803,7 @@ class TestConsumer(
)
consumer.setup()
try:
self.assertEqual(consumer.filename, "valid_pdf_version-upload_v0")
self.assertEqual(consumer.filename, "valid_pdf_version-upload")
consumer.run()
finally:
consumer.cleanup()
@@ -805,9 +813,67 @@ class TestConsumer(
)
self.assertIsNotNone(version)
assert version is not None
self.assertEqual(version.original_filename, "valid_pdf_version-upload_v0")
self.assertEqual(version.version_index, 1)
self.assertEqual(version.original_filename, "valid_pdf_version-upload")
self.assertTrue(bool(version.content))
@override_settings(AUDIT_LOG_ENABLED=True)
@mock.patch("documents.consumer.load_classifier")
def test_consume_version_index_monotonic_after_version_deletion(self, m) -> None:
m.return_value = MagicMock()
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
root_doc = Document.objects.first()
self.assertIsNotNone(root_doc)
assert root_doc is not None
def consume_version(version_file: Path) -> Document:
status = DummyProgressManager(version_file.name, None)
overrides = DocumentMetadataOverrides()
doc = ConsumableDocument(
DocumentSource.ApiUpload,
original_file=version_file,
root_document_id=root_doc.pk,
)
preflight = ConsumerPreflightPlugin(
doc,
overrides,
status, # type: ignore[arg-type]
self.dirs.scratch_dir,
"task-id",
)
preflight.setup()
preflight.run()
consumer = ConsumerPlugin(
doc,
overrides,
status, # type: ignore[arg-type]
self.dirs.scratch_dir,
"task-id",
)
consumer.setup()
try:
consumer.run()
finally:
consumer.cleanup()
version = (
Document.objects.filter(root_document=root_doc).order_by("-id").first()
)
assert version is not None
return version
v1 = consume_version(self.get_test_file2())
self.assertEqual(v1.version_index, 1)
v1.delete()
# The next version should have version_index 2, even though version_index 1 was deleted
v2 = consume_version(self.get_test_file())
self.assertEqual(v2.version_index, 2)
@mock.patch("documents.consumer.load_classifier")
def testClassifyDocument(self, m) -> None:
correspondent = Correspondent.objects.create(

View File

@@ -77,6 +77,58 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
settings.ORIGINALS_DIR / "test" / "test.pdf",
)
@override_settings(FILENAME_FORMAT=None)
def test_root_storage_path_change_updates_version_files(self) -> None:
old_storage_path = StoragePath.objects.create(
name="old-path",
path="old/{{title}}",
)
new_storage_path = StoragePath.objects.create(
name="new-path",
path="new/{{title}}",
)
root_doc = Document.objects.create(
title="rootdoc",
mime_type="application/pdf",
checksum="root-checksum",
storage_path=old_storage_path,
)
version_doc = Document.objects.create(
title="version-title",
mime_type="application/pdf",
checksum="version-checksum",
root_document=root_doc,
version_index=1,
)
Document.objects.filter(pk=root_doc.pk).update(
filename=generate_filename(root_doc),
)
Document.objects.filter(pk=version_doc.pk).update(
filename=generate_filename(version_doc),
)
root_doc.refresh_from_db()
version_doc.refresh_from_db()
create_source_path_directory(root_doc.source_path)
Path(root_doc.source_path).touch()
create_source_path_directory(version_doc.source_path)
Path(version_doc.source_path).touch()
root_doc.storage_path = new_storage_path
root_doc.save()
root_doc.refresh_from_db()
version_doc.refresh_from_db()
self.assertEqual(root_doc.filename, "new/rootdoc.pdf")
self.assertEqual(version_doc.filename, "new/rootdoc_v1.pdf")
self.assertIsFile(root_doc.source_path)
self.assertIsFile(version_doc.source_path)
self.assertIsNotFile(settings.ORIGINALS_DIR / "old" / "rootdoc.pdf")
self.assertIsNotFile(settings.ORIGINALS_DIR / "old" / "rootdoc_v1.pdf")
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self) -> None:
document = Document()
@@ -1222,6 +1274,94 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
Path("logs.pdf"),
)
@override_settings(FILENAME_FORMAT="{title}")
def test_version_index_suffix_for_template_filename(self) -> None:
root_doc = Document.objects.create(
title="the_doc",
mime_type="application/pdf",
checksum="root-checksum",
)
version_doc = Document.objects.create(
title="the_doc",
mime_type="application/pdf",
checksum="version-checksum",
root_document=root_doc,
version_index=1,
)
self.assertEqual(generate_filename(version_doc), Path("the_doc_v1.pdf"))
self.assertEqual(
generate_filename(version_doc, counter=1),
Path("the_doc_v1_01.pdf"),
)
@override_settings(FILENAME_FORMAT=None)
def test_version_index_suffix_for_default_filename(self) -> None:
root_doc = Document.objects.create(
title="root",
mime_type="text/plain",
checksum="root-checksum",
)
version_doc = Document.objects.create(
title="root",
mime_type="text/plain",
checksum="version-checksum",
root_document=root_doc,
version_index=2,
)
self.assertEqual(
generate_filename(version_doc),
Path(f"{root_doc.pk:07d}_v2.txt"),
)
self.assertEqual(
generate_filename(version_doc, archive_filename=True),
Path(f"{root_doc.pk:07d}_v2.pdf"),
)
@override_settings(FILENAME_FORMAT="{original_name}")
def test_version_index_suffix_with_original_name_placeholder(self) -> None:
root_doc = Document.objects.create(
title="root",
mime_type="application/pdf",
checksum="root-checksum",
original_filename="root-upload.pdf",
)
version_doc = Document.objects.create(
title="root",
mime_type="application/pdf",
checksum="version-checksum",
root_document=root_doc,
version_index=1,
original_filename="version-upload.pdf",
)
self.assertEqual(generate_filename(version_doc), Path("root-upload_v1.pdf"))
def test_version_index_suffix_with_storage_path(self) -> None:
storage_path = StoragePath.objects.create(
name="vtest",
path="folder/{{title}}",
)
root_doc = Document.objects.create(
title="storage_doc",
mime_type="application/pdf",
checksum="root-checksum",
storage_path=storage_path,
)
version_doc = Document.objects.create(
title="version_title_should_not_be_used",
mime_type="application/pdf",
checksum="version-checksum",
root_document=root_doc,
version_index=3,
)
self.assertEqual(
generate_filename(version_doc),
Path("folder/storage_doc_v3.pdf"),
)
@override_settings(
FILENAME_FORMAT="XX{correspondent}/{title}",
FILENAME_FORMAT_REMOVE_NONE=True,

View File

@@ -1,442 +1,298 @@
"""
Tests for the document_retagger management command.
"""
from __future__ import annotations
import pytest
from django.core.management import call_command
from django.core.management.base import CommandError
from django.test import TestCase
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
from documents.models import Tag
from documents.tests.factories import CorrespondentFactory
from documents.tests.factories import DocumentFactory
from documents.tests.factories import DocumentTypeFactory
from documents.tests.factories import StoragePathFactory
from documents.tests.factories import TagFactory
from documents.tests.utils import DirectoriesMixin
# ---------------------------------------------------------------------------
# Module-level type aliases
# ---------------------------------------------------------------------------
StoragePathTuple = tuple[StoragePath, StoragePath, StoragePath]
TagTuple = tuple[Tag, Tag, Tag, Tag, Tag]
CorrespondentTuple = tuple[Correspondent, Correspondent]
DocumentTypeTuple = tuple[DocumentType, DocumentType]
DocumentTuple = tuple[Document, Document, Document, Document]
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture()
def storage_paths(db) -> StoragePathTuple:
"""Three storage paths with varying match rules."""
sp1 = StoragePathFactory(
path="{created_data}/{title}",
match="auto document",
matching_algorithm=MatchingModel.MATCH_LITERAL,
)
sp2 = StoragePathFactory(
path="{title}",
match="^first|^unrelated",
matching_algorithm=MatchingModel.MATCH_REGEX,
)
sp3 = StoragePathFactory(
path="{title}",
match="^blah",
matching_algorithm=MatchingModel.MATCH_REGEX,
)
return sp1, sp2, sp3
@pytest.fixture()
def tags(db) -> TagTuple:
"""Tags covering the common matching scenarios."""
tag_first = TagFactory(match="first", matching_algorithm=Tag.MATCH_ANY)
tag_second = TagFactory(match="second", matching_algorithm=Tag.MATCH_ANY)
tag_inbox = TagFactory(is_inbox_tag=True)
tag_no_match = TagFactory()
tag_auto = TagFactory(matching_algorithm=Tag.MATCH_AUTO)
return tag_first, tag_second, tag_inbox, tag_no_match, tag_auto
@pytest.fixture()
def correspondents(db) -> CorrespondentTuple:
"""Two correspondents matching 'first' and 'second' content."""
c_first = CorrespondentFactory(
match="first",
matching_algorithm=MatchingModel.MATCH_ANY,
)
c_second = CorrespondentFactory(
match="second",
matching_algorithm=MatchingModel.MATCH_ANY,
)
return c_first, c_second
@pytest.fixture()
def document_types(db) -> DocumentTypeTuple:
"""Two document types matching 'first' and 'second' content."""
dt_first = DocumentTypeFactory(
match="first",
matching_algorithm=MatchingModel.MATCH_ANY,
)
dt_second = DocumentTypeFactory(
match="second",
matching_algorithm=MatchingModel.MATCH_ANY,
)
return dt_first, dt_second
@pytest.fixture()
def documents(storage_paths: StoragePathTuple, tags: TagTuple) -> DocumentTuple:
"""Four documents with varied content used across most retagger tests."""
_, _, sp3 = storage_paths
_, _, tag_inbox, tag_no_match, tag_auto = tags
d1 = DocumentFactory(checksum="A", title="A", content="first document")
d2 = DocumentFactory(checksum="B", title="B", content="second document")
d3 = DocumentFactory(
checksum="C",
title="C",
content="unrelated document",
storage_path=sp3,
)
d4 = DocumentFactory(checksum="D", title="D", content="auto document")
d3.tags.add(tag_inbox, tag_no_match)
d4.tags.add(tag_auto)
return d1, d2, d3, d4
def _get_docs() -> DocumentTuple:
return (
Document.objects.get(title="A"),
Document.objects.get(title="B"),
Document.objects.get(title="C"),
Document.objects.get(title="D"),
)
# ---------------------------------------------------------------------------
# Tag assignment
# ---------------------------------------------------------------------------
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerTags(DirectoriesMixin):
@pytest.mark.usefixtures("documents")
def test_add_tags(self, tags: TagTuple) -> None:
tag_first, tag_second, *_ = tags
class TestRetagger(DirectoriesMixin, TestCase):
def make_models(self) -> None:
self.sp1 = StoragePath.objects.create(
name="dummy a",
path="{created_data}/{title}",
match="auto document",
matching_algorithm=StoragePath.MATCH_LITERAL,
)
self.sp2 = StoragePath.objects.create(
name="dummy b",
path="{title}",
match="^first|^unrelated",
matching_algorithm=StoragePath.MATCH_REGEX,
)
self.sp3 = StoragePath.objects.create(
name="dummy c",
path="{title}",
match="^blah",
matching_algorithm=StoragePath.MATCH_REGEX,
)
self.d1 = Document.objects.create(
checksum="A",
title="A",
content="first document",
)
self.d2 = Document.objects.create(
checksum="B",
title="B",
content="second document",
)
self.d3 = Document.objects.create(
checksum="C",
title="C",
content="unrelated document",
storage_path=self.sp3,
)
self.d4 = Document.objects.create(
checksum="D",
title="D",
content="auto document",
)
self.tag_first = Tag.objects.create(
name="tag1",
match="first",
matching_algorithm=Tag.MATCH_ANY,
)
self.tag_second = Tag.objects.create(
name="tag2",
match="second",
matching_algorithm=Tag.MATCH_ANY,
)
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
self.tag_no_match = Tag.objects.create(name="test2")
self.tag_auto = Tag.objects.create(
name="tagauto",
matching_algorithm=Tag.MATCH_AUTO,
)
self.d3.tags.add(self.tag_inbox)
self.d3.tags.add(self.tag_no_match)
self.d4.tags.add(self.tag_auto)
self.correspondent_first = Correspondent.objects.create(
name="c1",
match="first",
matching_algorithm=Correspondent.MATCH_ANY,
)
self.correspondent_second = Correspondent.objects.create(
name="c2",
match="second",
matching_algorithm=Correspondent.MATCH_ANY,
)
self.doctype_first = DocumentType.objects.create(
name="dt1",
match="first",
matching_algorithm=DocumentType.MATCH_ANY,
)
self.doctype_second = DocumentType.objects.create(
name="dt2",
match="second",
matching_algorithm=DocumentType.MATCH_ANY,
)
def get_updated_docs(self):
return (
Document.objects.get(title="A"),
Document.objects.get(title="B"),
Document.objects.get(title="C"),
Document.objects.get(title="D"),
)
def setUp(self) -> None:
super().setUp()
self.make_models()
def test_add_tags(self) -> None:
call_command("document_retagger", "--tags")
d_first, d_second, d_unrelated, d_auto = _get_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
assert d_first.tags.count() == 1
assert d_second.tags.count() == 1
assert d_unrelated.tags.count() == 2
assert d_auto.tags.count() == 1
assert d_first.tags.first() == tag_first
assert d_second.tags.first() == tag_second
self.assertEqual(d_first.tags.count(), 1)
self.assertEqual(d_second.tags.count(), 1)
self.assertEqual(d_unrelated.tags.count(), 2)
self.assertEqual(d_auto.tags.count(), 1)
def test_overwrite_removes_stale_tags_and_preserves_inbox(
self,
documents: DocumentTuple,
tags: TagTuple,
) -> None:
d1, *_ = documents
tag_first, tag_second, tag_inbox, tag_no_match, _ = tags
d1.tags.add(tag_second)
self.assertEqual(d_first.tags.first(), self.tag_first)
self.assertEqual(d_second.tags.first(), self.tag_second)
def test_add_type(self) -> None:
call_command("document_retagger", "--document_type")
d_first, d_second, _, _ = self.get_updated_docs()
self.assertEqual(d_first.document_type, self.doctype_first)
self.assertEqual(d_second.document_type, self.doctype_second)
def test_add_correspondent(self) -> None:
call_command("document_retagger", "--correspondent")
d_first, d_second, _, _ = self.get_updated_docs()
self.assertEqual(d_first.correspondent, self.correspondent_first)
self.assertEqual(d_second.correspondent, self.correspondent_second)
def test_overwrite_preserve_inbox(self) -> None:
self.d1.tags.add(self.tag_second)
call_command("document_retagger", "--tags", "--overwrite")
d_first, d_second, d_unrelated, d_auto = _get_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
assert Tag.objects.filter(id=tag_second.id).exists()
assert list(d_first.tags.values_list("id", flat=True)) == [tag_first.id]
assert list(d_second.tags.values_list("id", flat=True)) == [tag_second.id]
assert set(d_unrelated.tags.values_list("id", flat=True)) == {
tag_inbox.id,
tag_no_match.id,
}
assert d_auto.tags.count() == 0
self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))
@pytest.mark.usefixtures("documents")
@pytest.mark.parametrize(
"extra_args",
[
pytest.param([], id="no_base_url"),
pytest.param(["--base-url=http://localhost"], id="with_base_url"),
],
)
def test_suggest_does_not_apply_tags(self, extra_args: list[str]) -> None:
call_command("document_retagger", "--tags", "--suggest", *extra_args)
d_first, d_second, _, d_auto = _get_docs()
assert d_first.tags.count() == 0
assert d_second.tags.count() == 0
assert d_auto.tags.count() == 1
# ---------------------------------------------------------------------------
# Document type assignment
# ---------------------------------------------------------------------------
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerDocumentType(DirectoriesMixin):
@pytest.mark.usefixtures("documents")
def test_add_type(self, document_types: DocumentTypeTuple) -> None:
dt_first, dt_second = document_types
call_command("document_retagger", "--document_type")
d_first, d_second, _, _ = _get_docs()
assert d_first.document_type == dt_first
assert d_second.document_type == dt_second
@pytest.mark.usefixtures("documents", "document_types")
@pytest.mark.parametrize(
"extra_args",
[
pytest.param([], id="no_base_url"),
pytest.param(["--base-url=http://localhost"], id="with_base_url"),
],
)
def test_suggest_does_not_apply_document_type(self, extra_args: list[str]) -> None:
call_command("document_retagger", "--document_type", "--suggest", *extra_args)
d_first, d_second, _, _ = _get_docs()
assert d_first.document_type is None
assert d_second.document_type is None
@pytest.mark.parametrize(
("use_first_flag", "expects_assignment"),
[
pytest.param(["--use-first"], True, id="use_first_assigns_first_match"),
pytest.param([], False, id="no_use_first_skips_ambiguous_match"),
],
)
def test_use_first_with_multiple_matches(
self,
use_first_flag: list[str],
*,
expects_assignment: bool,
) -> None:
DocumentTypeFactory(
match="ambiguous",
matching_algorithm=MatchingModel.MATCH_ANY,
self.assertCountEqual(
[tag.id for tag in d_first.tags.all()],
[self.tag_first.id],
)
DocumentTypeFactory(
match="ambiguous",
matching_algorithm=MatchingModel.MATCH_ANY,
self.assertCountEqual(
[tag.id for tag in d_second.tags.all()],
[self.tag_second.id],
)
doc = DocumentFactory(content="ambiguous content")
call_command("document_retagger", "--document_type", *use_first_flag)
doc.refresh_from_db()
assert (doc.document_type is not None) is expects_assignment
# ---------------------------------------------------------------------------
# Correspondent assignment
# ---------------------------------------------------------------------------
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerCorrespondent(DirectoriesMixin):
@pytest.mark.usefixtures("documents")
def test_add_correspondent(self, correspondents: CorrespondentTuple) -> None:
c_first, c_second = correspondents
call_command("document_retagger", "--correspondent")
d_first, d_second, _, _ = _get_docs()
assert d_first.correspondent == c_first
assert d_second.correspondent == c_second
@pytest.mark.usefixtures("documents", "correspondents")
@pytest.mark.parametrize(
"extra_args",
[
pytest.param([], id="no_base_url"),
pytest.param(["--base-url=http://localhost"], id="with_base_url"),
],
)
def test_suggest_does_not_apply_correspondent(self, extra_args: list[str]) -> None:
call_command("document_retagger", "--correspondent", "--suggest", *extra_args)
d_first, d_second, _, _ = _get_docs()
assert d_first.correspondent is None
assert d_second.correspondent is None
@pytest.mark.parametrize(
("use_first_flag", "expects_assignment"),
[
pytest.param(["--use-first"], True, id="use_first_assigns_first_match"),
pytest.param([], False, id="no_use_first_skips_ambiguous_match"),
],
)
def test_use_first_with_multiple_matches(
self,
use_first_flag: list[str],
*,
expects_assignment: bool,
) -> None:
CorrespondentFactory(
match="ambiguous",
matching_algorithm=MatchingModel.MATCH_ANY,
self.assertCountEqual(
[tag.id for tag in d_unrelated.tags.all()],
[self.tag_inbox.id, self.tag_no_match.id],
)
CorrespondentFactory(
match="ambiguous",
matching_algorithm=MatchingModel.MATCH_ANY,
self.assertEqual(d_auto.tags.count(), 0)
def test_add_tags_suggest(self) -> None:
call_command("document_retagger", "--tags", "--suggest")
d_first, d_second, _, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 0)
self.assertEqual(d_second.tags.count(), 0)
self.assertEqual(d_auto.tags.count(), 1)
def test_add_type_suggest(self) -> None:
call_command("document_retagger", "--document_type", "--suggest")
d_first, d_second, _, _ = self.get_updated_docs()
self.assertIsNone(d_first.document_type)
self.assertIsNone(d_second.document_type)
def test_add_correspondent_suggest(self) -> None:
call_command("document_retagger", "--correspondent", "--suggest")
d_first, d_second, _, _ = self.get_updated_docs()
self.assertIsNone(d_first.correspondent)
self.assertIsNone(d_second.correspondent)
def test_add_tags_suggest_url(self) -> None:
call_command(
"document_retagger",
"--tags",
"--suggest",
"--base-url=http://localhost",
)
doc = DocumentFactory(content="ambiguous content")
d_first, d_second, _, d_auto = self.get_updated_docs()
call_command("document_retagger", "--correspondent", *use_first_flag)
self.assertEqual(d_first.tags.count(), 0)
self.assertEqual(d_second.tags.count(), 0)
self.assertEqual(d_auto.tags.count(), 1)
doc.refresh_from_db()
assert (doc.correspondent is not None) is expects_assignment
def test_add_type_suggest_url(self) -> None:
call_command(
"document_retagger",
"--document_type",
"--suggest",
"--base-url=http://localhost",
)
d_first, d_second, _, _ = self.get_updated_docs()
self.assertIsNone(d_first.document_type)
self.assertIsNone(d_second.document_type)
# ---------------------------------------------------------------------------
# Storage path assignment
# ---------------------------------------------------------------------------
def test_add_correspondent_suggest_url(self) -> None:
call_command(
"document_retagger",
"--correspondent",
"--suggest",
"--base-url=http://localhost",
)
d_first, d_second, _, _ = self.get_updated_docs()
self.assertIsNone(d_first.correspondent)
self.assertIsNone(d_second.correspondent)
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerStoragePath(DirectoriesMixin):
@pytest.mark.usefixtures("documents")
def test_add_storage_path(self, storage_paths: StoragePathTuple) -> None:
def test_add_storage_path(self) -> None:
"""
GIVEN documents matching various storage path rules
WHEN document_retagger --storage_path is called
THEN matching documents get the correct path; existing path is unchanged
GIVEN:
- 2 storage paths with documents which match them
- 1 document which matches but has a storage path
WHEN:
- document retagger is called
THEN:
- Matching document's storage paths updated
- Non-matching documents have no storage path
- Existing storage patch left unchanged
"""
sp1, sp2, sp3 = storage_paths
call_command("document_retagger", "--storage_path")
d_first, d_second, d_unrelated, d_auto = _get_docs()
call_command(
"document_retagger",
"--storage_path",
)
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
assert d_first.storage_path == sp2
assert d_auto.storage_path == sp1
assert d_second.storage_path is None
assert d_unrelated.storage_path == sp3
self.assertEqual(d_first.storage_path, self.sp2)
self.assertEqual(d_auto.storage_path, self.sp1)
self.assertIsNone(d_second.storage_path)
self.assertEqual(d_unrelated.storage_path, self.sp3)
@pytest.mark.usefixtures("documents")
def test_overwrite_storage_path(self, storage_paths: StoragePathTuple) -> None:
def test_overwrite_storage_path(self) -> None:
"""
GIVEN a document with an existing storage path that matches a different rule
WHEN document_retagger --storage_path --overwrite is called
THEN the existing path is replaced by the newly matched path
GIVEN:
- 2 storage paths with documents which match them
- 1 document which matches but has a storage path
WHEN:
- document retagger is called with overwrite
THEN:
- Matching document's storage paths updated
- Non-matching documents have no storage path
- Existing storage patch overwritten
"""
sp1, sp2, _ = storage_paths
call_command("document_retagger", "--storage_path", "--overwrite")
d_first, d_second, d_unrelated, d_auto = _get_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
assert d_first.storage_path == sp2
assert d_auto.storage_path == sp1
assert d_second.storage_path is None
assert d_unrelated.storage_path == sp2
self.assertEqual(d_first.storage_path, self.sp2)
self.assertEqual(d_auto.storage_path, self.sp1)
self.assertIsNone(d_second.storage_path)
self.assertEqual(d_unrelated.storage_path, self.sp2)
@pytest.mark.parametrize(
("use_first_flag", "expects_assignment"),
[
pytest.param(["--use-first"], True, id="use_first_assigns_first_match"),
pytest.param([], False, id="no_use_first_skips_ambiguous_match"),
],
)
def test_use_first_with_multiple_matches(
self,
use_first_flag: list[str],
*,
expects_assignment: bool,
) -> None:
StoragePathFactory(
match="ambiguous",
matching_algorithm=MatchingModel.MATCH_ANY,
def test_id_range_parameter(self) -> None:
commandOutput = ""
Document.objects.create(
checksum="E",
title="E",
content="NOT the first document",
)
StoragePathFactory(
match="ambiguous",
matching_algorithm=MatchingModel.MATCH_ANY,
)
doc = DocumentFactory(content="ambiguous content")
call_command("document_retagger", "--tags", "--id-range", "1", "2")
# The retagger shouldn`t apply the 'first' tag to our new document
self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 1)
call_command("document_retagger", "--storage_path", *use_first_flag)
try:
commandOutput = call_command("document_retagger", "--tags", "--id-range")
except CommandError:
# Just ignore the error
None
self.assertIn(commandOutput, "Error: argument --id-range: expected 2 arguments")
doc.refresh_from_db()
assert (doc.storage_path is not None) is expects_assignment
try:
commandOutput = call_command(
"document_retagger",
"--tags",
"--id-range",
"a",
"b",
)
except CommandError:
# Just ignore the error
None
self.assertIn(commandOutput, "error: argument --id-range: invalid int value:")
# ---------------------------------------------------------------------------
# ID range filtering
# ---------------------------------------------------------------------------
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerIdRange(DirectoriesMixin):
@pytest.mark.usefixtures("documents")
@pytest.mark.parametrize(
("id_range_args", "expected_count"),
[
pytest.param(["1", "2"], 1, id="narrow_range_limits_scope"),
pytest.param(["1", "9999"], 2, id="wide_range_tags_all_matches"),
],
)
def test_id_range_limits_scope(
self,
tags: TagTuple,
id_range_args: list[str],
expected_count: int,
) -> None:
DocumentFactory(content="NOT the first document")
call_command("document_retagger", "--tags", "--id-range", *id_range_args)
tag_first, *_ = tags
assert Document.objects.filter(tags__id=tag_first.id).count() == expected_count
@pytest.mark.usefixtures("documents")
@pytest.mark.parametrize(
"args",
[
pytest.param(["--tags", "--id-range"], id="missing_both_values"),
pytest.param(["--tags", "--id-range", "a", "b"], id="non_integer_values"),
],
)
def test_id_range_invalid_arguments_raise(self, args: list[str]) -> None:
with pytest.raises((CommandError, SystemExit)):
call_command("document_retagger", *args)
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
@pytest.mark.management
@pytest.mark.django_db
class TestRetaggerEdgeCases(DirectoriesMixin):
@pytest.mark.usefixtures("documents")
def test_no_targets_exits_cleanly(self) -> None:
"""Calling the retagger with no classifier targets should not raise."""
call_command("document_retagger")
@pytest.mark.usefixtures("documents")
def test_inbox_only_skips_non_inbox_documents(self) -> None:
"""--inbox-only must restrict processing to documents with an inbox tag."""
call_command("document_retagger", "--tags", "--inbox-only")
d_first, _, d_unrelated, _ = _get_docs()
assert d_first.tags.count() == 0
assert d_unrelated.tags.count() == 2
call_command("document_retagger", "--tags", "--id-range", "1", "9999")
# Now we should have 2 documents
self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 2)

View File

@@ -204,61 +204,6 @@ def audit_log_check(app_configs, **kwargs):
return result
@register()
def check_v3_minimum_upgrade_version(
app_configs: object,
**kwargs: object,
) -> list[Error]:
"""Enforce that upgrades to v3 must start from v2.20.9.
v3 squashes all prior migrations into 0001_squashed and 0002_squashed.
If a user skips v2.20.9, the data migration in 1075_workflowaction_order
never runs and the squash may apply schema changes against an incomplete
database state.
"""
from django.db import DatabaseError
from django.db import OperationalError
try:
all_tables = connections["default"].introspection.table_names()
if "django_migrations" not in all_tables:
return []
with connections["default"].cursor() as cursor:
cursor.execute(
"SELECT name FROM django_migrations WHERE app = %s",
["documents"],
)
applied: set[str] = {row[0] for row in cursor.fetchall()}
if not applied:
return []
# Already in a valid v3 state
if {"0001_squashed", "0002_squashed"} & applied:
return []
# On v2.20.9 exactly — squash will pick up cleanly from here
if "1075_workflowaction_order" in applied:
return []
except (DatabaseError, OperationalError):
return []
return [
Error(
"Cannot upgrade to Paperless-ngx v3 from this version.",
hint=(
"Upgrading to v3 can only be performed from v2.20.9."
"Please upgrade to v2.20.9, run migrations, then upgrade to v3."
"See https://docs.paperless-ngx.com/setup/#upgrading for details."
),
id="paperless.E002",
),
]
@register()
def check_deprecated_db_settings(
app_configs: object,

View File

@@ -3,7 +3,6 @@ from pathlib import Path
from unittest import mock
import pytest
from django.core.checks import Error
from django.core.checks import Warning
from django.test import TestCase
from django.test import override_settings
@@ -14,7 +13,6 @@ from documents.tests.utils import FileSystemAssertsMixin
from paperless.checks import audit_log_check
from paperless.checks import binaries_check
from paperless.checks import check_deprecated_db_settings
from paperless.checks import check_v3_minimum_upgrade_version
from paperless.checks import debug_mode_check
from paperless.checks import paths_check
from paperless.checks import settings_values_check
@@ -397,240 +395,3 @@ class TestDeprecatedDbSettings:
assert len(result) == 1
assert "PAPERLESS_DBSSLCERT" in result[0].msg
class TestV3MinimumUpgradeVersionCheck:
"""Test suite for check_v3_minimum_upgrade_version system check."""
@pytest.fixture
def build_conn_mock(self, mocker: MockerFixture):
"""Factory fixture that builds a connections['default'] mock.
Usage::
conn = build_conn_mock(tables=["django_migrations"], applied=["1075_..."])
"""
def _build(tables: list[str], applied: list[str]) -> mock.MagicMock:
conn = mocker.MagicMock()
conn.introspection.table_names.return_value = tables
cursor = conn.cursor.return_value.__enter__.return_value
cursor.fetchall.return_value = [(name,) for name in applied]
return conn
return _build
def test_no_migrations_table_fresh_install(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- No django_migrations table exists in the database
WHEN:
- The v3 upgrade check runs
THEN:
- No errors are reported (fresh install, nothing to enforce)
"""
mocker.patch.dict(
"paperless.checks.connections",
{"default": build_conn_mock([], [])},
)
assert check_v3_minimum_upgrade_version(None) == []
def test_no_documents_migrations_fresh_install(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- django_migrations table exists but has no documents app rows
WHEN:
- The v3 upgrade check runs
THEN:
- No errors are reported (fresh install, nothing to enforce)
"""
mocker.patch.dict(
"paperless.checks.connections",
{"default": build_conn_mock(["django_migrations"], [])},
)
assert check_v3_minimum_upgrade_version(None) == []
def test_v3_state_with_0001_squashed(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- 0001_squashed is recorded in django_migrations
WHEN:
- The v3 upgrade check runs
THEN:
- No errors are reported (DB is already in a valid v3 state)
"""
mocker.patch.dict(
"paperless.checks.connections",
{
"default": build_conn_mock(
["django_migrations"],
["0001_squashed", "0002_squashed", "0003_workflowaction_order"],
),
},
)
assert check_v3_minimum_upgrade_version(None) == []
def test_v3_state_with_0002_squashed_only(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- Only 0002_squashed is recorded in django_migrations
WHEN:
- The v3 upgrade check runs
THEN:
- No errors are reported (0002_squashed alone confirms a valid v3 state)
"""
mocker.patch.dict(
"paperless.checks.connections",
{"default": build_conn_mock(["django_migrations"], ["0002_squashed"])},
)
assert check_v3_minimum_upgrade_version(None) == []
def test_v2_20_9_state_ready_to_upgrade(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- 1075_workflowaction_order (the last v2.20.9 migration) is in the DB
WHEN:
- The v3 upgrade check runs
THEN:
- No errors are reported (squash will pick up cleanly from this state)
"""
mocker.patch.dict(
"paperless.checks.connections",
{
"default": build_conn_mock(
["django_migrations"],
[
"1074_workflowrun_deleted_at_workflowrun_restored_at_and_more",
"1075_workflowaction_order",
],
),
},
)
assert check_v3_minimum_upgrade_version(None) == []
def test_v2_20_8_raises_error(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- 1074 (last v2.20.8 migration) is applied but 1075 is not
WHEN:
- The v3 upgrade check runs
THEN:
- An Error with id paperless.E002 is returned
"""
mocker.patch.dict(
"paperless.checks.connections",
{
"default": build_conn_mock(
["django_migrations"],
["1074_workflowrun_deleted_at_workflowrun_restored_at_and_more"],
),
},
)
result = check_v3_minimum_upgrade_version(None)
assert len(result) == 1
assert isinstance(result[0], Error)
assert result[0].id == "paperless.E002"
def test_very_old_version_raises_error(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- Only old migrations (well below v2.20.9) are applied
WHEN:
- The v3 upgrade check runs
THEN:
- An Error with id paperless.E002 is returned
"""
mocker.patch.dict(
"paperless.checks.connections",
{
"default": build_conn_mock(
["django_migrations"],
["1000_update_paperless_all", "1022_paperlesstask"],
),
},
)
result = check_v3_minimum_upgrade_version(None)
assert len(result) == 1
assert isinstance(result[0], Error)
assert result[0].id == "paperless.E002"
def test_error_hint_mentions_v2_20_9(
self,
mocker: MockerFixture,
build_conn_mock,
) -> None:
"""
GIVEN:
- DB is on an old v2 version (pre-v2.20.9)
WHEN:
- The v3 upgrade check runs
THEN:
- The error hint explicitly references v2.20.9 so users know what to do
"""
mocker.patch.dict(
"paperless.checks.connections",
{"default": build_conn_mock(["django_migrations"], ["1022_paperlesstask"])},
)
result = check_v3_minimum_upgrade_version(None)
assert len(result) == 1
assert "v2.20.9" in result[0].hint
def test_db_error_is_swallowed(self, mocker: MockerFixture) -> None:
"""
GIVEN:
- A DatabaseError is raised when querying the DB
WHEN:
- The v3 upgrade check runs
THEN:
- No exception propagates and an empty list is returned
"""
from django.db import DatabaseError
conn = mocker.MagicMock()
conn.introspection.table_names.side_effect = DatabaseError("connection refused")
mocker.patch.dict("paperless.checks.connections", {"default": conn})
assert check_v3_minimum_upgrade_version(None) == []
def test_operational_error_is_swallowed(self, mocker: MockerFixture) -> None:
"""
GIVEN:
- An OperationalError is raised when querying the DB
WHEN:
- The v3 upgrade check runs
THEN:
- No exception propagates and an empty list is returned
"""
from django.db import OperationalError
conn = mocker.MagicMock()
conn.introspection.table_names.side_effect = OperationalError("DB unavailable")
mocker.patch.dict("paperless.checks.connections", {"default": conn})
assert check_v3_minimum_upgrade_version(None) == []

View File

@@ -1,4 +1,4 @@
# Generated by Django 5.2.11 on 2026-03-03 16:27
# Generated by Django 5.2.9 on 2026-01-20 18:46
import django.db.models.deletion
import django.utils.timezone
@@ -15,50 +15,6 @@ class Migration(migrations.Migration):
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
replaces = [
("paperless_mail", "0001_initial"),
("paperless_mail", "0001_initial_squashed_0009_mailrule_assign_tags"),
("paperless_mail", "0002_auto_20201117_1334"),
("paperless_mail", "0003_auto_20201118_1940"),
("paperless_mail", "0004_mailrule_order"),
("paperless_mail", "0005_help_texts"),
("paperless_mail", "0006_auto_20210101_2340"),
("paperless_mail", "0007_auto_20210106_0138"),
("paperless_mail", "0008_auto_20210516_0940"),
("paperless_mail", "0009_alter_mailrule_action_alter_mailrule_folder"),
("paperless_mail", "0009_mailrule_assign_tags"),
("paperless_mail", "0010_auto_20220311_1602"),
("paperless_mail", "0011_remove_mailrule_assign_tag"),
(
"paperless_mail",
"0011_remove_mailrule_assign_tag_squashed_0024_alter_mailrule_name_and_more",
),
("paperless_mail", "0012_alter_mailrule_assign_tags"),
("paperless_mail", "0013_merge_20220412_1051"),
("paperless_mail", "0014_alter_mailrule_action"),
("paperless_mail", "0015_alter_mailrule_action"),
("paperless_mail", "0016_mailrule_consumption_scope"),
("paperless_mail", "0017_mailaccount_owner_mailrule_owner"),
("paperless_mail", "0018_processedmail"),
("paperless_mail", "0019_mailrule_filter_to"),
("paperless_mail", "0020_mailaccount_is_token"),
("paperless_mail", "0021_alter_mailaccount_password"),
("paperless_mail", "0022_mailrule_assign_owner_from_rule_and_more"),
("paperless_mail", "0023_remove_mailrule_filter_attachment_filename_and_more"),
("paperless_mail", "0024_alter_mailrule_name_and_more"),
(
"paperless_mail",
"0025_alter_mailaccount_owner_alter_mailrule_owner_and_more",
),
("paperless_mail", "0026_mailrule_enabled"),
(
"paperless_mail",
"0027_mailaccount_expiration_mailaccount_account_type_and_more",
),
("paperless_mail", "0028_alter_mailaccount_password_and_more"),
("paperless_mail", "0029_mailrule_pdf_layout"),
]
operations = [
migrations.CreateModel(
name="MailAccount",

View File

@@ -6,7 +6,7 @@ from django.db import models
class Migration(migrations.Migration):
dependencies = [
("paperless_mail", "0001_squashed"),
("paperless_mail", "0001_initial"),
]
operations = [

10
uv.lock generated
View File

@@ -1342,11 +1342,11 @@ wheels = [
[[package]]
name = "faker"
version = "40.5.1"
version = "40.1.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/03/2a/96fff3edcb10f6505143448a4b91535f77b74865cec45be52690ee280443/faker-40.5.1.tar.gz", hash = "sha256:70222361cd82aa10cb86066d1a4e8f47f2bcdc919615c412045a69c4e6da0cd3", size = 1952684, upload-time = "2026-02-23T21:34:38.362Z" }
sdist = { url = "https://files.pythonhosted.org/packages/5e/77/1c3ff07b6739b9a1d23ca01ec0a90a309a33b78e345a3eb52f9ce9240e36/faker-40.1.2.tar.gz", hash = "sha256:b76a68163aa5f171d260fc24827a8349bc1db672f6a665359e8d0095e8135d30", size = 1949802, upload-time = "2026-01-13T20:51:49.917Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4d/a9/1eed4db92d0aec2f9bfdf1faae0ab0418b5e121dda5701f118a7a4f0cd6a/faker-40.5.1-py3-none-any.whl", hash = "sha256:c69640c1e13bad49b4bcebcbf1b52f9f1a872b6ea186c248ada34d798f1661bf", size = 1987053, upload-time = "2026-02-23T21:34:36.418Z" },
{ url = "https://files.pythonhosted.org/packages/46/ec/91a434c8a53d40c3598966621dea9c50512bec6ce8e76fa1751015e74cef/faker-40.1.2-py3-none-any.whl", hash = "sha256:93503165c165d330260e4379fd6dc07c94da90c611ed3191a0174d2ab9966a42", size = 1985633, upload-time = "2026-01-13T20:51:47.982Z" },
]
[[package]]
@@ -3121,7 +3121,6 @@ webserver = [
dev = [
{ name = "daphne", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "factory-boy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "faker", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "imagehash", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "prek", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3146,7 +3145,6 @@ lint = [
testing = [
{ name = "daphne", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "factory-boy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "faker", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "imagehash", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pytest-cov", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3259,7 +3257,6 @@ provides-extras = ["mariadb", "postgres", "webserver"]
dev = [
{ name = "daphne" },
{ name = "factory-boy", specifier = "~=3.3.1" },
{ name = "faker", specifier = "~=40.5.1" },
{ name = "imagehash" },
{ name = "prek", specifier = "~=0.3.0" },
{ name = "pytest", specifier = "~=9.0.0" },
@@ -3282,7 +3279,6 @@ lint = [
testing = [
{ name = "daphne" },
{ name = "factory-boy", specifier = "~=3.3.1" },
{ name = "faker", specifier = "~=40.5.1" },
{ name = "imagehash" },
{ name = "pytest", specifier = "~=9.0.0" },
{ name = "pytest-cov", specifier = "~=7.0.0" },