mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-24 00:49:27 +00:00
* refactor: switch consumer and callers to ParserRegistry (Phase 4) Replace all Django signal-based parser discovery with direct registry calls. Removes `_parser_cleanup`, `parser_is_new_style` shims, and all old-style isinstance checks. All parser instantiation now uses the `with parser_class() as parser:` context manager pattern. - documents/parsers.py: delegate to get_parser_registry(); drop lru_cache - documents/consumer.py: use registry + context manager; remove shims - documents/tasks.py: same pattern - documents/management/commands/document_thumbnails.py: same pattern - documents/views.py: get_metadata uses context manager - documents/checks.py: use get_parser_registry().all_parsers() - paperless/parsers/registry.py: add all_parsers() public method - tests: update mocks to target documents.consumer.get_parser_class_for_mime_type Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: drop get_parser_class_for_mime_type; callers use registry directly All callers now call get_parser_registry().get_parser_for_file() with the actual filename and path, enabling score() to use file extension hints. The MIME-only helper is removed. - consumer.py: passes self.filename + self.working_copy - tasks.py: passes document.original_filename + document.source_path - document_thumbnails.py: same pattern - views.py: passes Path(file).name + Path(file) - parsers.py: internal helpers inline the registry call with filename="" - test_parsers.py: drop TestParserDiscovery (was testing mock behavior); TestParserAvailability uses registry directly - test_consumer.py: mocks switch to documents.consumer.get_parser_registry Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: remove document_consumer_declaration signal infrastructure Remove the document_consumer_declaration signal that was previously used for parser registration. Each parser app no longer connects to this signal, and the signal declaration itself has been removed from documents/signals. 
Changes: - Remove document_consumer_declaration from documents/signals/__init__.py - Remove ready() methods and signal imports from all parser app configs - Delete signal shim files (signals.py) from all parser apps: - paperless_tesseract/signals.py - paperless_text/signals.py - paperless_tika/signals.py - paperless_mail/signals.py - paperless_remote/signals.py Parser discovery now happens exclusively through the ParserRegistry system introduced in the previous refactor phases. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: remove empty paperless_text and paperless_tika Django apps After parser classes were moved to paperless/parsers/ in the plugin refactor, these Django apps contained only empty AppConfig classes with no models, views, tasks, migrations, or other functionality. - Remove paperless_text and paperless_tika from INSTALLED_APPS - Delete empty app directories entirely - Update pyproject.toml test exclusions - Clean stale mypy baseline entries for moved parser files paperless_remote app is retained as it contains meaningful system checks for Azure AI configuration. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Moves the checks and tests to the main application and removes the old applications * Adds a comment to satisy Sonar * refactor: remove automatic log_summary() call from get_parser_registry() The summary was logged once per process, causing it to appear repeatedly during Docker startup (management commands, web server, each Celery worker subprocess). External parsers are already announced individually at INFO when discovered; the full summary is redundant noise. log_summary() is retained on ParserRegistry for manual/debug use. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Cleans up the duplicate test file/fixture * Fixes a race condition where webserver threads could race to populate the registry --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
362 lines
11 KiB
Python
362 lines
11 KiB
Python
import grp
|
|
import os
|
|
import pwd
|
|
import shutil
|
|
import stat
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
from django.conf import settings
|
|
from django.core.checks import Error
|
|
from django.core.checks import Tags
|
|
from django.core.checks import Warning
|
|
from django.core.checks import register
|
|
from django.db import connections
|
|
|
|
# Message and hint templates for the directory checks in this module.
# Each "{}" placeholder is filled in with str.format() by the caller.
exists_message = "{} is set but doesn't exist."
exists_hint = "Create a directory at {}"
writeable_message = "{} is not writeable"
writeable_hint = (
    "Set the permissions of {} to be writeable "
    "by the user running the Paperless services"
)
|
|
|
|
|
|
def path_check(var, directory: Path) -> list[Error]:
    """
    Verify that a configured directory exists and can be written to.

    :param var: Name of the setting being checked; only used in the messages.
    :param directory: Directory to check. A falsy value skips the check.
    :return: A list holding at most one Error; empty when the directory is
        usable or when no directory was given.
    """
    messages: list[Error] = []
    if not directory:
        return messages

    if not directory.is_dir():
        messages.append(
            Error(exists_message.format(var), exists_hint.format(directory)),
        )
        return messages

    # Probe writeability by actually creating a file. The PID in the name
    # presumably keeps concurrently starting processes from colliding.
    test_file: Path = directory / f"__paperless_write_test_{os.getpid()}__"
    try:
        with test_file.open("w"):
            pass
    except PermissionError:
        dir_stat: os.stat_result = directory.stat()
        dir_mode: str = stat.filemode(dir_stat.st_mode)
        # The owning uid/gid may have no passwd/group entry; the lookups then
        # raise KeyError. Fall back to the numeric id so the check still
        # reports the unwritable directory.
        try:
            dir_owner: str = pwd.getpwuid(dir_stat.st_uid).pw_name
        except KeyError:
            dir_owner = str(dir_stat.st_uid)
        try:
            dir_group: str = grp.getgrgid(dir_stat.st_gid).gr_name
        except KeyError:
            dir_group = str(dir_stat.st_gid)
        messages.append(
            Error(
                writeable_message.format(var),
                writeable_hint.format(
                    f"\n{dir_mode} {dir_owner} {dir_group} {directory}\n",
                ),
            ),
        )
    finally:
        try:
            if test_file.is_file():
                test_file.unlink()
        except OSError:
            # Skip cleanup if we can't access the file — expected in permission
            # tests. PermissionError is a subclass of OSError.
            pass

    return messages
|
|
|
|
|
|
@register()
def paths_check(app_configs, **kwargs) -> list[Error]:
    """
    Run path_check() for the data, empty-trash, media and consumption
    directories and return all resulting messages in that order.
    """
    # (name shown in messages, attribute on django settings)
    checked_settings = (
        ("PAPERLESS_DATA_DIR", "DATA_DIR"),
        ("PAPERLESS_EMPTY_TRASH_DIR", "EMPTY_TRASH_DIR"),
        ("PAPERLESS_MEDIA_ROOT", "MEDIA_ROOT"),
        ("PAPERLESS_CONSUMPTION_DIR", "CONSUMPTION_DIR"),
    )

    collected: list[Error] = []
    for env_name, attr_name in checked_settings:
        collected.extend(path_check(env_name, getattr(settings, attr_name)))
    return collected
|
|
|
|
|
|
@register()
def binaries_check(app_configs, **kwargs):
    """
    Warn about each external program Paperless needs but cannot locate.

    The configured convert binary, "tesseract" and "gs" are looked up with
    shutil.which(); one Warning is returned per program that is not found.
    """
    missing_message = "Paperless can't find {}. Without it, consumption is impossible."
    missing_hint = "Either it's not in your ${PATH} or it's not installed."

    required = (settings.CONVERT_BINARY, "tesseract", "gs")

    return [
        Warning(missing_message.format(program), missing_hint)
        for program in required
        if shutil.which(program) is None
    ]
|
|
|
|
|
|
@register()
def debug_mode_check(app_configs, **kwargs):
    """
    Return a single Warning when settings.DEBUG is enabled, otherwise an
    empty list.
    """
    if not settings.DEBUG:
        return []

    debug_warning = Warning(
        "DEBUG mode is enabled. Disable Debug mode. This is a serious "
        "security issue, since it puts security overrides in place "
        "which are meant to be only used during development. This "
        "also means that paperless will tell anyone various "
        "debugging information when something goes wrong.",
    )
    return [debug_warning]
|
|
|
|
|
|
@register()
def settings_values_check(app_configs, **kwargs):
    """
    Validate a subset of the user supplied settings.

    Covers the OCR related options, the configured time zone and the e-mail
    certificate file. The findings of the three groups are returned as one
    list, in that order.
    """

    def _ocrmypdf_settings_check():
        """
        Compare the OCR settings that are handed to ocrmypdf against the
        values accepted here. See "ocrmypdf --help" for the valid inputs.
        """
        valid_output_types = {"pdfa", "pdf", "pdfa-1", "pdfa-2", "pdfa-3"}
        valid_modes = {"force", "skip", "redo", "skip_noarchive"}
        valid_skip_archive = {"never", "with_text", "always"}
        valid_clean_modes = {"clean", "clean-final", "none"}

        findings = []

        if settings.OCR_OUTPUT_TYPE not in valid_output_types:
            findings.append(
                Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
            )

        if settings.OCR_MODE not in valid_modes:
            findings.append(
                Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'),
            )

        # Still accepted above, but flagged as deprecated
        if settings.OCR_MODE == "skip_noarchive":
            deprecation = Warning(
                'OCR output mode "skip_noarchive" is deprecated and will be '
                "removed in a future version. Please use "
                "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
            )
            findings.append(deprecation)

        if settings.OCR_SKIP_ARCHIVE_FILE not in valid_skip_archive:
            findings.append(
                Error(
                    "OCR_SKIP_ARCHIVE_FILE setting "
                    f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
                ),
            )

        if settings.OCR_CLEAN not in valid_clean_modes:
            findings.append(
                Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'),
            )

        return findings

    def _timezone_validate():
        """
        Check that settings.TIME_ZONE is one of the zones known to zoneinfo.
        """
        import zoneinfo

        if settings.TIME_ZONE in zoneinfo.available_timezones():
            return []
        return [
            Error(f'Timezone "{settings.TIME_ZONE}" is not a valid timezone'),
        ]

    def _email_certificate_validate():
        """
        Check that the e-mail certificate, when one is configured, is a file.
        """
        # Existence checks
        if (
            settings.EMAIL_CERTIFICATE_FILE is None
            or settings.EMAIL_CERTIFICATE_FILE.is_file()
        ):
            return []
        return [
            Error(
                f"Email cert {settings.EMAIL_CERTIFICATE_FILE} is not a file",
            ),
        ]

    return [
        *_ocrmypdf_settings_check(),
        *_timezone_validate(),
        *_email_certificate_validate(),
    ]
|
|
|
|
|
|
@register()
def audit_log_check(app_configs, **kwargs):
    """
    Warn when the default database contains an "auditlog_logentry" table
    while settings.AUDIT_LOG_ENABLED is off.
    """
    table_names = connections["default"].introspection.table_names()

    if "auditlog_logentry" not in table_names or settings.AUDIT_LOG_ENABLED:
        return []

    return [
        Warning("auditlog table was found but audit log is disabled."),
    ]
|
|
|
|
|
|
@register(Tags.compatibility)
def check_v3_minimum_upgrade_version(
    app_configs: object,
    **kwargs: object,
) -> list[Error]:
    """
    Enforce that upgrades to v3 must start from v2.20.10.

    v3 squashes all prior migrations into 0001_squashed and 0002_squashed.
    If a user skips v2.20.10, the data migration in 1075_workflowaction_order
    never runs and the squash may apply schema changes against an incomplete
    database state.

    Returns an empty list when the migration table is missing, when no
    "documents" migrations are recorded, when the database is already on a
    squashed migration or on 1075_workflowaction_order, or when the database
    cannot be queried. Otherwise returns a single Error (paperless.E002).
    """
    from django.db import DatabaseError
    from django.db import OperationalError

    try:
        all_tables = connections["default"].introspection.table_names()

        # No migration bookkeeping table at all: nothing to compare against
        if "django_migrations" not in all_tables:
            return []

        with connections["default"].cursor() as cursor:
            cursor.execute(
                "SELECT name FROM django_migrations WHERE app = %s",
                ["documents"],
            )
            applied: set[str] = {row[0] for row in cursor.fetchall()}

        # No migrations recorded for the documents app
        if not applied:
            return []

        # Already in a valid v3 state
        if {"0001_squashed", "0002_squashed"} & applied:
            return []

        # On v2.20.10 exactly — squash will pick up cleanly from here
        if "1075_workflowaction_order" in applied:
            return []

    except (DatabaseError, OperationalError):
        # The database could not be inspected; this check reports nothing
        return []

    return [
        Error(
            "Cannot upgrade to Paperless-ngx v3 from this version.",
            hint=(
                # Adjacent literals are concatenated verbatim, so each
                # sentence needs its own trailing space.
                "Upgrading to v3 can only be performed from v2.20.10. "
                "Please upgrade to v2.20.10, run migrations, then upgrade to v3. "
                "See https://docs.paperless-ngx.com/setup/#upgrading for details."
            ),
            id="paperless.E002",
        ),
    ]
|
|
|
|
|
|
@register()
def check_deprecated_db_settings(
    app_configs: object,
    **kwargs: object,
) -> list[Warning]:
    """Report legacy database environment variables that are still set.

    Each known legacy variable is mapped to the option name to use inside
    PAPERLESS_DB_OPTIONS. One Warning (paperless.W001) is returned for every
    such variable that has a non-empty value in the environment.
    """
    # legacy environment variable -> option name inside PAPERLESS_DB_OPTIONS
    replacements: dict[str, str] = {
        "PAPERLESS_DB_TIMEOUT": "timeout",
        "PAPERLESS_DB_POOLSIZE": "pool.min_size / pool.max_size",
        "PAPERLESS_DBSSLMODE": "sslmode",
        "PAPERLESS_DBSSLROOTCERT": "sslrootcert",
        "PAPERLESS_DBSSLCERT": "sslcert",
        "PAPERLESS_DBSSLKEY": "sslkey",
    }

    return [
        Warning(
            f"Deprecated environment variable: {var_name}",
            hint=(
                f"{var_name} is no longer supported and will be removed in v3.2. "
                f"Set the equivalent option via PAPERLESS_DB_OPTIONS instead. "
                f'Example: PAPERLESS_DB_OPTIONS=\'{{"{db_option_key}": "<value>"}}\'. '
                "See https://docs.paperless-ngx.com/migration/ for the full reference."
            ),
            id="paperless.W001",
        )
        for var_name, db_option_key in replacements.items()
        if os.getenv(var_name)
    ]
|
|
|
|
|
|
@register()
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
    """
    Return an Error when the remote OCR engine is "azureai" but the endpoint
    or the API key setting is empty; otherwise return an empty list.
    """
    if settings.REMOTE_OCR_ENGINE != "azureai":
        return []

    if settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY:
        return []

    missing_config = Error(
        "Azure AI remote parser requires endpoint and API key to be configured.",
    )
    return [missing_config]
|
|
|
|
|
|
def get_tesseract_langs():
    """
    Return the entries printed by ``tesseract --list-langs``.

    :return: One stripped string per output line after the first (header)
        line; an empty list when no tesseract binary is found on the PATH.
    """
    tesseract = shutil.which("tesseract")
    if tesseract is None:
        # subprocess.run() raises TypeError when handed None as the executable
        return []

    proc = subprocess.run(
        [tesseract, "--list-langs"],
        capture_output=True,
    )

    # Decode bytes to string, split on newlines, trim out the header
    proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]

    return [x.strip() for x in proc_lines]
|
|
|
|
|
|
@register()
def check_default_language_available(app_configs, **kwargs):
    """
    Check the languages named in settings.OCR_LANGUAGE against the ones
    tesseract reports.

    Returns a single Warning when no language is configured. Otherwise, if a
    tesseract binary is found, returns one Error per "+"-separated language
    that get_tesseract_langs() does not list.
    """
    if not settings.OCR_LANGUAGE:
        return [
            Warning(
                "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
                "This means that tesseract will fallback to english.",
            ),
        ]

    # binaries_check in paperless will check and report if this doesn't exist
    # So skip trying to do anything here and let that handle missing binaries
    if shutil.which("tesseract") is None:
        return []

    available = get_tesseract_langs()
    requested = [part.strip() for part in settings.OCR_LANGUAGE.split("+")]

    return [
        Error(
            f"The selected ocr language {lang} is "
            f"not installed. Paperless cannot OCR your documents "
            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
        )
        for lang in requested
        if lang not in available
    ]
|