mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-21 15:29:26 +00:00
* refactor: switch consumer and callers to ParserRegistry (Phase 4) Replace all Django signal-based parser discovery with direct registry calls. Removes `_parser_cleanup`, `parser_is_new_style` shims, and all old-style isinstance checks. All parser instantiation now uses the `with parser_class() as parser:` context manager pattern. - documents/parsers.py: delegate to get_parser_registry(); drop lru_cache - documents/consumer.py: use registry + context manager; remove shims - documents/tasks.py: same pattern - documents/management/commands/document_thumbnails.py: same pattern - documents/views.py: get_metadata uses context manager - documents/checks.py: use get_parser_registry().all_parsers() - paperless/parsers/registry.py: add all_parsers() public method - tests: update mocks to target documents.consumer.get_parser_class_for_mime_type Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: drop get_parser_class_for_mime_type; callers use registry directly All callers now call get_parser_registry().get_parser_for_file() with the actual filename and path, enabling score() to use file extension hints. The MIME-only helper is removed. - consumer.py: passes self.filename + self.working_copy - tasks.py: passes document.original_filename + document.source_path - document_thumbnails.py: same pattern - views.py: passes Path(file).name + Path(file) - parsers.py: internal helpers inline the registry call with filename="" - test_parsers.py: drop TestParserDiscovery (was testing mock behavior); TestParserAvailability uses registry directly - test_consumer.py: mocks switch to documents.consumer.get_parser_registry Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: remove document_consumer_declaration signal infrastructure Remove the document_consumer_declaration signal that was previously used for parser registration. Each parser app no longer connects to this signal, and the signal declaration itself has been removed from documents/signals. 
Changes: - Remove document_consumer_declaration from documents/signals/__init__.py - Remove ready() methods and signal imports from all parser app configs - Delete signal shim files (signals.py) from all parser apps: - paperless_tesseract/signals.py - paperless_text/signals.py - paperless_tika/signals.py - paperless_mail/signals.py - paperless_remote/signals.py Parser discovery now happens exclusively through the ParserRegistry system introduced in the previous refactor phases. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: remove empty paperless_text and paperless_tika Django apps After parser classes were moved to paperless/parsers/ in the plugin refactor, these Django apps contained only empty AppConfig classes with no models, views, tasks, migrations, or other functionality. - Remove paperless_text and paperless_tika from INSTALLED_APPS - Delete empty app directories entirely - Update pyproject.toml test exclusions - Clean stale mypy baseline entries for moved parser files paperless_remote app is retained as it contains meaningful system checks for Azure AI configuration. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Moves the checks and tests to the main application and removes the old applications * Adds a comment to satisfy Sonar * refactor: remove automatic log_summary() call from get_parser_registry() The summary was logged once per process, causing it to appear repeatedly during Docker startup (management commands, web server, each Celery worker subprocess). External parsers are already announced individually at INFO when discovered; the full summary is redundant noise. log_summary() is retained on ParserRegistry for manual/debug use. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Cleans up the duplicate test file/fixture * Fixes a race condition where webserver threads could race to populate the registry --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
277 lines
9.0 KiB
Python
277 lines
9.0 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
import mimetypes
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING
|
|
|
|
from django.conf import settings
|
|
|
|
from documents.loggers import LoggingMixin
|
|
from documents.utils import copy_file_with_basic_stats
|
|
from documents.utils import run_subprocess
|
|
from paperless.parsers.registry import get_parser_registry
|
|
|
|
if TYPE_CHECKING:
|
|
import datetime
|
|
|
|
logger = logging.getLogger("paperless.parsing")
|
|
|
|
|
|
def is_mime_type_supported(mime_type: str) -> bool:
    """
    Report whether any registered parser claims support for *mime_type*.
    """
    # An empty filename is passed since only the mime type matters here.
    parser = get_parser_registry().get_parser_for_file(mime_type, "")
    return parser is not None
|
|
|
|
|
|
def get_default_file_extension(mime_type: str) -> str:
    """
    Return the default file extension for *mime_type*, or an empty
    string if it could not be determined.

    The parser registry is consulted first; failing that, the stdlib's
    mimetypes table provides a best-effort guess.
    """
    parser_class = get_parser_registry().get_parser_for_file(mime_type, "")
    if parser_class is not None:
        mapping = parser_class.supported_mime_types()
        if mime_type in mapping:
            return mapping[mime_type]

    # Fall back to whatever the standard library knows about.
    return mimetypes.guess_extension(mime_type) or ""
|
|
|
|
|
|
def is_file_ext_supported(ext: str) -> bool:
    """
    Return True if the file extension is supported, False otherwise.

    TODO: Investigate why this really exists, why not use mimetype
    """
    # Empty/falsy extensions are never supported.
    if not ext:
        return False
    return ext.lower() in get_supported_file_extensions()
|
|
|
|
|
|
def get_supported_file_extensions() -> set[str]:
    """
    Collect every file extension handled by any registered parser.
    """
    extensions: set[str] = set()
    for parser_class in get_parser_registry().all_parsers():
        for mime_type, default_ext in parser_class.supported_mime_types().items():
            # All extensions the stdlib associates with this mime type...
            extensions |= set(mimetypes.guess_all_extensions(mime_type))
            # ...plus the parser's own default, since the stdlib table can
            # lag behind (e.g. image/webp on Python < 3.11).
            extensions.add(default_ext)

    return extensions
|
|
|
|
|
|
def run_convert(
    input_file,
    output_file,
    *,
    density=None,
    scale=None,
    alpha=None,
    strip=False,
    trim=False,
    type=None,
    depth=None,
    auto_orient=False,
    use_cropbox=False,
    extra=None,
    logging_group=None,
) -> None:
    """
    Run ImageMagick's ``convert`` binary on *input_file*, writing the
    result to *output_file*.

    Each keyword argument maps to the convert switch of the same name;
    falsy values omit the switch entirely.

    NOTE(review): ``extra`` is accepted but currently unused by this
    function — kept for interface compatibility; confirm before removing.

    Raises:
        ParseError: if convert exits non-zero or cannot be executed.
    """
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        # MAGICK_MEMORY_LIMIT sets the maximum amount of RAM the pixel cache can use.
        # MAGICK_MAP_LIMIT sets the maximum amount of memory-mapped I/O allowed.
        #
        # For large-format documents ImageMagick will hit the RAM limit and
        # immediately try to "map" the remaining data. If MAGICK_MAP_LIMIT isn't
        # also set, the process may trigger an OOM kill because the default
        # system/policy map limit is often too restrictive for these massive bitmaps.
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
        environment["MAGICK_MAP_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    args = [settings.CONVERT_BINARY]
    args += ["-density", str(density)] if density else []
    args += ["-scale", str(scale)] if scale else []
    args += ["-alpha", str(alpha)] if alpha else []
    args += ["-strip"] if strip else []
    args += ["-trim"] if trim else []
    args += ["-type", str(type)] if type else []
    args += ["-depth", str(depth)] if depth else []
    args += ["-auto-orient"] if auto_orient else []
    args += ["-define", "pdf:use-cropbox=true"] if use_cropbox else []
    args += [str(input_file), str(output_file)]

    # Lazy %-style args: the join only happens if DEBUG logging is enabled.
    logger.debug("Execute: %s", " ".join(args), extra={"group": logging_group})

    try:
        run_subprocess(args, environment, logger)
    except subprocess.CalledProcessError as e:
        raise ParseError(f"Convert failed at {args}") from e
    except Exception as e:  # pragma: no cover
        raise ParseError("Unknown error running convert") from e
|
|
|
|
|
|
def get_default_thumbnail() -> Path:
    """
    Return the absolute path of the generic fallback thumbnail image.
    """
    resource = Path(__file__).parent / "resources" / "document.webp"
    return resource.resolve()
|
|
|
|
|
|
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> Path:
    """
    Generate a PDF thumbnail via Ghostscript after ImageMagick failed:
    rasterize the first page to PNG, then convert that PNG to WebP.
    If Ghostscript also fails, return a copy of the generic default
    thumbnail instead.
    """
    webp_path: Path = Path(temp_dir) / "convert_gs.webp"

    # if convert fails, fall back to extracting
    # the first PDF page as a PNG using Ghostscript
    logger.warning(
        "Thumbnail generation with ImageMagick failed, falling back "
        "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
        extra={"group": logging_group},
    )
    # Ghostscript doesn't handle WebP outputs
    png_path: Path = Path(temp_dir) / "gs_out.png"
    gs_cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", png_path, in_path]

    try:
        try:
            run_subprocess(gs_cmd, logger=logger)
        except subprocess.CalledProcessError as e:
            raise ParseError(f"Thumbnail (gs) failed at {gs_cmd}") from e
        # then run convert on the output from gs to make WebP
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            input_file=png_path,
            output_file=webp_path,
            logging_group=logging_group,
        )
        return webp_path
    except ParseError as e:
        logger.error(f"Unable to make thumbnail with Ghostscript: {e}")
        # The caller might expect a generated thumbnail that can be moved,
        # so we need to copy it before it gets moved.
        # https://github.com/paperless-ngx/paperless-ngx/issues/3631
        fallback_path: Path = Path(temp_dir) / "document.webp"
        copy_file_with_basic_stats(get_default_thumbnail(), fallback_path)
        return fallback_path
|
|
|
|
|
|
def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -> Path:
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.
    """
    thumb_path: Path = temp_dir / "convert.webp"

    # Run convert to get a decent thumbnail
    try:
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            use_cropbox=True,
            # "[0]" tells ImageMagick to use only the first PDF page
            input_file=f"{in_path}[0]",
            output_file=str(thumb_path),
            logging_group=logging_group,
        )
    except ParseError as e:
        logger.error(f"Unable to make thumbnail with convert: {e}")
        # ImageMagick failed; try the Ghostscript-based fallback instead.
        thumb_path = make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group)

    return thumb_path
|
|
|
|
|
|
class ParseError(Exception):
    """Raised when a document could not be parsed or converted."""
|
|
|
|
|
|
class DocumentParser(LoggingMixin):
    """
    Subclass this to make your own parser. Have a look at
    `paperless_tesseract.parsers` for inspiration.
    """

    # Logger name used by LoggingMixin for all parser log output.
    logging_name = "paperless.parsing"

    def __init__(self, logging_group, progress_callback=None) -> None:
        """
        Set up per-instance state: a private scratch directory under
        SCRATCH_DIR, the parser's settings object, and empty result
        slots (text, date, archive path).

        :param logging_group: identifier used to correlate log records
            belonging to one processing run.
        :param progress_callback: optional callable invoked as
            ``callback(current, max)`` to report parsing progress.
        """
        super().__init__()
        self.renew_logging_group()
        self.logging_group = logging_group
        self.settings = self.get_settings()
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        # Private scratch directory for this parser; removed in cleanup().
        self.tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )

        # Results populated by parse(); read back via the get_* accessors.
        self.archive_path = None
        self.text = None
        self.date: datetime.datetime | None = None
        self.progress_callback = progress_callback

    def progress(self, current_progress, max_progress) -> None:
        """
        Forward progress information to the callback, if one was supplied.
        """
        if self.progress_callback:
            self.progress_callback(current_progress, max_progress)

    def get_settings(self): # pragma: no cover
        """
        A parser must implement this
        """
        raise NotImplementedError

    def read_file_handle_unicode_errors(self, filepath: Path) -> str:
        """
        Helper utility for reading from a file, and handling a problem with its
        unicode, falling back to ignoring the error to remove the invalid bytes
        """
        try:
            text = filepath.read_text(encoding="utf-8")
        except UnicodeDecodeError as e:
            # Re-read as bytes and replace undecodable sequences instead of failing.
            self.log.warning(f"Unicode error during text reading, continuing: {e}")
            text = filepath.read_bytes().decode("utf-8", errors="replace")
        return text

    def extract_metadata(self, document_path, mime_type):
        """
        Return a list of metadata entries for the document.

        The base implementation has none to offer; subclasses may
        override to surface format-specific metadata.
        """
        return []

    def get_page_count(self, document_path, mime_type) -> None:
        """
        Return the document's page count, if determinable.

        The base implementation always returns None; subclasses that can
        count pages are expected to override this.
        """
        return None

    def parse(self, document_path, mime_type, file_name=None):
        """
        Parse the document. Subclasses must implement this; results are
        exposed via the get_* accessors on this class.
        """
        raise NotImplementedError

    def get_archive_path(self):
        # Archived version produced during parsing, or None if the
        # parser did not create one.
        return self.archive_path

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError

    def get_text(self):
        # Extracted text content; None until parsing has populated it.
        return self.text

    def get_date(self) -> datetime.datetime | None:
        # Document date detected during parsing, if any.
        return self.date

    def cleanup(self) -> None:
        """
        Remove this parser's scratch directory and everything inside it.
        """
        self.log.debug(f"Deleting directory {self.tempdir}")
        shutil.rmtree(self.tempdir)