mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-13 12:41:23 +00:00
Compare commits
1 Commits
feature-cl
...
feature-dr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
17c628b686 |
@@ -72,6 +72,30 @@ PAPERLESS_DBHOST: postgres
|
||||
|
||||
See [`PAPERLESS_DBENGINE`](configuration.md#PAPERLESS_DBENGINE) for accepted values.
|
||||
|
||||
## Management Command: `--skip-checks` Removed
|
||||
|
||||
The `--skip-checks` flag has been removed from all Paperless-ngx management commands
|
||||
(`document_exporter`, `document_importer`, `document_retagger`, `document_archiver`,
|
||||
`document_thumbnails`, `document_index`, `document_renamer`, `document_sanity_checker`,
|
||||
`document_fuzzy_match`, and others).
|
||||
|
||||
These commands now set `requires_system_checks = []` internally, which both skips
|
||||
redundant checks at runtime (they are already run as a dedicated step during Docker
|
||||
startup via `init-system-checks`) and removes `--skip-checks` from the argument parser.
|
||||
|
||||
#### Action Required
|
||||
|
||||
Remove `--skip-checks` from any scripts, cron jobs, or automation that invokes
|
||||
these commands:
|
||||
|
||||
```bash
|
||||
# v2
|
||||
document_exporter /backup --skip-checks
|
||||
|
||||
# v3
|
||||
document_exporter /backup
|
||||
```
|
||||
|
||||
## Database Advanced Options
|
||||
|
||||
The individual SSL, timeout, and pooling variables have been removed in favor of a
|
||||
|
||||
@@ -9,7 +9,6 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
|
||||
@@ -192,12 +191,7 @@ class DocumentClassifier:
|
||||
|
||||
target_file_temp.rename(target_file)
|
||||
|
||||
def train(
|
||||
self,
|
||||
status_callback: Callable[[str], None] | None = None,
|
||||
) -> bool:
|
||||
notify = status_callback if status_callback is not None else lambda _: None
|
||||
|
||||
def train(self) -> bool:
|
||||
# Get non-inbox documents
|
||||
docs_queryset = (
|
||||
Document.objects.exclude(
|
||||
@@ -219,7 +213,6 @@ class DocumentClassifier:
|
||||
|
||||
# Step 1: Extract and preprocess training data from the database.
|
||||
logger.debug("Gathering data from database...")
|
||||
notify(f"Gathering data from {docs_queryset.count()} document(s)...")
|
||||
hasher = sha256()
|
||||
for doc in docs_queryset:
|
||||
y = -1
|
||||
@@ -297,7 +290,6 @@ class DocumentClassifier:
|
||||
|
||||
# Step 2: vectorize data
|
||||
logger.debug("Vectorizing data...")
|
||||
notify("Vectorizing document content...")
|
||||
|
||||
def content_generator() -> Iterator[str]:
|
||||
"""
|
||||
@@ -324,7 +316,6 @@ class DocumentClassifier:
|
||||
# Step 3: train the classifiers
|
||||
if num_tags > 0:
|
||||
logger.debug("Training tags classifier...")
|
||||
notify(f"Training tags classifier ({num_tags} tag(s))...")
|
||||
|
||||
if num_tags == 1:
|
||||
# Special case where only one tag has auto:
|
||||
@@ -348,9 +339,6 @@ class DocumentClassifier:
|
||||
|
||||
if num_correspondents > 0:
|
||||
logger.debug("Training correspondent classifier...")
|
||||
notify(
|
||||
f"Training correspondent classifier ({num_correspondents} correspondent(s))...",
|
||||
)
|
||||
self.correspondent_classifier = MLPClassifier(tol=0.01)
|
||||
self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
|
||||
else:
|
||||
@@ -361,9 +349,6 @@ class DocumentClassifier:
|
||||
|
||||
if num_document_types > 0:
|
||||
logger.debug("Training document type classifier...")
|
||||
notify(
|
||||
f"Training document type classifier ({num_document_types} type(s))...",
|
||||
)
|
||||
self.document_type_classifier = MLPClassifier(tol=0.01)
|
||||
self.document_type_classifier.fit(data_vectorized, labels_document_type)
|
||||
else:
|
||||
@@ -376,7 +361,6 @@ class DocumentClassifier:
|
||||
logger.debug(
|
||||
"Training storage paths classifier...",
|
||||
)
|
||||
notify(f"Training storage path classifier ({num_storage_paths} path(s))...")
|
||||
self.storage_path_classifier = MLPClassifier(tol=0.01)
|
||||
self.storage_path_classifier.fit(
|
||||
data_vectorized,
|
||||
|
||||
@@ -151,6 +151,10 @@ class PaperlessCommand(RichCommand):
|
||||
supports_progress_bar: Adds --no-progress-bar argument (default: True)
|
||||
supports_multiprocessing: Adds --processes argument (default: False)
|
||||
|
||||
System checks are skipped by default (requires_system_checks = []) because
|
||||
these commands run post-startup where checks have already been performed by
|
||||
the application server. Subclasses that genuinely need checks can override.
|
||||
|
||||
Example usage:
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
@@ -189,6 +193,8 @@ class PaperlessCommand(RichCommand):
|
||||
stats.imported += 1
|
||||
"""
|
||||
|
||||
requires_system_checks: ClassVar[list] = []
|
||||
|
||||
supports_progress_bar: ClassVar[bool] = True
|
||||
supports_multiprocessing: ClassVar[bool] = False
|
||||
|
||||
|
||||
@@ -1,29 +1,13 @@
|
||||
from __future__ import annotations
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
import time
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.tasks import train_classifier
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
class Command(BaseCommand):
|
||||
help = (
|
||||
"Trains the classifier on your data and saves the resulting models to a "
|
||||
"file. The document consumer will then automatically use this new model."
|
||||
)
|
||||
supports_progress_bar = False
|
||||
supports_multiprocessing = False
|
||||
|
||||
def handle(self, *args, **options) -> None:
|
||||
start = time.monotonic()
|
||||
|
||||
with self.buffered_logging("paperless.tasks"):
|
||||
train_classifier(
|
||||
scheduled=False,
|
||||
status_callback=lambda msg: self.console.print(f" {msg}"),
|
||||
)
|
||||
|
||||
elapsed = time.monotonic() - start
|
||||
self.console.print(
|
||||
f"[green]✓[/green] Classifier training complete ({elapsed:.1f}s)",
|
||||
)
|
||||
def handle(self, *args, **options):
|
||||
train_classifier(scheduled=False)
|
||||
|
||||
@@ -205,7 +205,7 @@ class Command(CryptMixin, PaperlessCommand):
|
||||
ContentType.objects.all().delete()
|
||||
Permission.objects.all().delete()
|
||||
for manifest_path in self.manifest_paths:
|
||||
call_command("loaddata", manifest_path)
|
||||
call_command("loaddata", manifest_path, skip_checks=True)
|
||||
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
|
||||
self.stdout.write(self.style.ERROR("Database import failed"))
|
||||
if (
|
||||
|
||||
@@ -100,11 +100,7 @@ def index_reindex(*, iter_wrapper: IterWrapper[Document] = _identity) -> None:
|
||||
|
||||
|
||||
@shared_task
|
||||
def train_classifier(
|
||||
*,
|
||||
scheduled=True,
|
||||
status_callback: Callable[[str], None] | None = None,
|
||||
) -> None:
|
||||
def train_classifier(*, scheduled=True) -> None:
|
||||
task = PaperlessTask.objects.create(
|
||||
type=PaperlessTask.TaskType.SCHEDULED_TASK
|
||||
if scheduled
|
||||
@@ -140,7 +136,7 @@ def train_classifier(
|
||||
classifier = DocumentClassifier()
|
||||
|
||||
try:
|
||||
if classifier.train(status_callback=status_callback):
|
||||
if classifier.train():
|
||||
logger.info(
|
||||
f"Saving updated classifier model to {settings.MODEL_FILE}...",
|
||||
)
|
||||
|
||||
@@ -12,7 +12,12 @@ class TestApiSchema(APITestCase):
|
||||
Test that the schema is valid
|
||||
"""
|
||||
try:
|
||||
call_command("spectacular", "--validate", "--fail-on-warn")
|
||||
call_command(
|
||||
"spectacular",
|
||||
"--validate",
|
||||
"--fail-on-warn",
|
||||
skip_checks=True,
|
||||
)
|
||||
except CommandError as e:
|
||||
self.fail(f"Schema validation failed: {e}")
|
||||
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import filecmp
|
||||
import shutil
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
@@ -14,9 +11,6 @@ from django.core.management import call_command
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from documents.file_handling import generate_filename
|
||||
from documents.models import Document
|
||||
from documents.tasks import update_document_content_maybe_archive_file
|
||||
@@ -141,32 +135,14 @@ class TestRenamer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
class TestCreateClassifier:
|
||||
def test_create_classifier(self, mocker: MockerFixture) -> None:
|
||||
m = mocker.patch(
|
||||
"documents.management.commands.document_create_classifier.train_classifier",
|
||||
)
|
||||
class TestCreateClassifier(TestCase):
|
||||
@mock.patch(
|
||||
"documents.management.commands.document_create_classifier.train_classifier",
|
||||
)
|
||||
def test_create_classifier(self, m) -> None:
|
||||
call_command("document_create_classifier", skip_checks=True)
|
||||
|
||||
call_command("document_create_classifier", "--skip-checks")
|
||||
|
||||
m.assert_called_once_with(scheduled=False, status_callback=mocker.ANY)
|
||||
assert callable(m.call_args.kwargs["status_callback"])
|
||||
|
||||
def test_create_classifier_callback_output(self, mocker: MockerFixture) -> None:
|
||||
"""Callback passed to train_classifier writes each phase message to the console."""
|
||||
m = mocker.patch(
|
||||
"documents.management.commands.document_create_classifier.train_classifier",
|
||||
)
|
||||
|
||||
def invoke_callback(**kwargs):
|
||||
kwargs["status_callback"]("Vectorizing document content...")
|
||||
|
||||
m.side_effect = invoke_callback
|
||||
|
||||
stdout = StringIO()
|
||||
call_command("document_create_classifier", "--skip-checks", stdout=stdout)
|
||||
|
||||
assert "Vectorizing document content..." in stdout.getvalue()
|
||||
m.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.management
|
||||
@@ -176,7 +152,7 @@ class TestConvertMariaDBUUID(TestCase):
|
||||
m.alter_field.return_value = None
|
||||
|
||||
stdout = StringIO()
|
||||
call_command("convert_mariadb_uuid", stdout=stdout)
|
||||
call_command("convert_mariadb_uuid", stdout=stdout, skip_checks=True)
|
||||
|
||||
m.assert_called_once()
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ class TestManageSuperUser(DirectoriesMixin, TestCase):
|
||||
"--no-color",
|
||||
stdout=out,
|
||||
stderr=StringIO(),
|
||||
skip_checks=True,
|
||||
)
|
||||
return out.getvalue()
|
||||
|
||||
|
||||
@@ -1665,7 +1665,7 @@ class TestManagementCommand(TestCase):
|
||||
"paperless_mail.management.commands.mail_fetcher.tasks.process_mail_accounts",
|
||||
)
|
||||
def test_mail_fetcher(self, m) -> None:
|
||||
call_command("mail_fetcher")
|
||||
call_command("mail_fetcher", skip_checks=True)
|
||||
|
||||
m.assert_called_once()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user