Compare commits
11 Commits
feature-ma
...
feature-re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b6fa3ed6eb | ||
|
|
b0bb31654f | ||
|
|
5c339f7f60 | ||
|
|
2098a11eb1 | ||
|
|
af8a8e791b | ||
|
|
8d4163bef3 | ||
|
|
e9e1d4ccca | ||
|
|
c955ba7d07 | ||
|
|
7028bb2163 | ||
|
|
5d4d87764c | ||
|
|
75dce7f19f |
@@ -256,7 +256,7 @@ lint.isort.force-single-line = true
|
|||||||
[tool.codespell]
|
[tool.codespell]
|
||||||
write-changes = true
|
write-changes = true
|
||||||
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
|
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
|
||||||
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
|
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"
|
||||||
|
|
||||||
[tool.pytest]
|
[tool.pytest]
|
||||||
minversion = "9.0"
|
minversion = "9.0"
|
||||||
|
|||||||
@@ -51,9 +51,10 @@ from documents.templating.workflows import parse_w_workflow_placeholders
|
|||||||
from documents.utils import copy_basic_file_stats
|
from documents.utils import copy_basic_file_stats
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||||
|
|
||||||
@@ -68,7 +69,10 @@ def _parser_cleanup(parser: DocumentParser) -> None:
|
|||||||
|
|
||||||
TODO(stumpylog): Remove me in the future
|
TODO(stumpylog): Remove me in the future
|
||||||
"""
|
"""
|
||||||
if isinstance(parser, (MailDocumentParser, TextDocumentParser, TikaDocumentParser)):
|
if isinstance(
|
||||||
|
parser,
|
||||||
|
(TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
|
||||||
|
):
|
||||||
parser.__exit__(None, None, None)
|
parser.__exit__(None, None, None)
|
||||||
else:
|
else:
|
||||||
parser.cleanup()
|
parser.cleanup()
|
||||||
@@ -452,7 +456,10 @@ class ConsumerPlugin(
|
|||||||
# New-style parsers use __enter__/__exit__ for resource management.
|
# New-style parsers use __enter__/__exit__ for resource management.
|
||||||
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
||||||
# TODO(stumpylog): Remove me in the future
|
# TODO(stumpylog): Remove me in the future
|
||||||
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
if isinstance(
|
||||||
|
document_parser,
|
||||||
|
(TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
|
||||||
|
):
|
||||||
document_parser.__enter__()
|
document_parser.__enter__()
|
||||||
|
|
||||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||||
@@ -477,12 +484,17 @@ class ConsumerPlugin(
|
|||||||
isinstance(document_parser, MailDocumentParser)
|
isinstance(document_parser, MailDocumentParser)
|
||||||
and self.input_doc.mailrule_id
|
and self.input_doc.mailrule_id
|
||||||
):
|
):
|
||||||
document_parser.mailrule_id = self.input_doc.mailrule_id
|
document_parser.parse(
|
||||||
if isinstance(
|
self.working_copy,
|
||||||
|
mime_type,
|
||||||
|
self.filename,
|
||||||
|
self.input_doc.mailrule_id,
|
||||||
|
)
|
||||||
|
elif isinstance(
|
||||||
document_parser,
|
document_parser,
|
||||||
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
|
(TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
|
||||||
):
|
):
|
||||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
# TODO(stumpylog): Remove me in the future
|
||||||
document_parser.parse(self.working_copy, mime_type)
|
document_parser.parse(self.working_copy, mime_type)
|
||||||
else:
|
else:
|
||||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
document_parser.parse(self.working_copy, mime_type, self.filename)
|
||||||
@@ -496,9 +508,9 @@ class ConsumerPlugin(
|
|||||||
)
|
)
|
||||||
if isinstance(
|
if isinstance(
|
||||||
document_parser,
|
document_parser,
|
||||||
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
|
(TextDocumentParser, RemoteDocumentParser, TikaDocumentParser),
|
||||||
):
|
):
|
||||||
# TODO(stumpylog): Remove me in the future when all parsers use new protocol
|
# TODO(stumpylog): Remove me in the future
|
||||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||||
else:
|
else:
|
||||||
thumbnail = document_parser.get_thumbnail(
|
thumbnail = document_parser.get_thumbnail(
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ from documents.tests.utils import DummyProgressManager
|
|||||||
from documents.tests.utils import FileSystemAssertsMixin
|
from documents.tests.utils import FileSystemAssertsMixin
|
||||||
from documents.tests.utils import GetConsumerMixin
|
from documents.tests.utils import GetConsumerMixin
|
||||||
from paperless_mail.models import MailRule
|
from paperless_mail.models import MailRule
|
||||||
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class _BaseTestParser(DocumentParser):
|
class _BaseTestParser(DocumentParser):
|
||||||
@@ -1090,7 +1091,7 @@ class TestConsumer(
|
|||||||
self.assertEqual(command[1], "--replace-input")
|
self.assertEqual(command[1], "--replace-input")
|
||||||
|
|
||||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
@mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||||
def test_mail_parser_receives_mailrule(
|
def test_mail_parser_receives_mailrule(
|
||||||
self,
|
self,
|
||||||
@@ -1106,13 +1107,11 @@ class TestConsumer(
|
|||||||
THEN:
|
THEN:
|
||||||
- The mail parser should receive the mail rule
|
- The mail parser should receive the mail rule
|
||||||
"""
|
"""
|
||||||
from paperless_mail.signals import get_parser as mail_get_parser
|
|
||||||
|
|
||||||
mock_consumer_declaration_send.return_value = [
|
mock_consumer_declaration_send.return_value = [
|
||||||
(
|
(
|
||||||
None,
|
None,
|
||||||
{
|
{
|
||||||
"parser": mail_get_parser,
|
"parser": MailDocumentParser,
|
||||||
"mime_types": {"message/rfc822": ".eml"},
|
"mime_types": {"message/rfc822": ".eml"},
|
||||||
"weight": 0,
|
"weight": 0,
|
||||||
},
|
},
|
||||||
@@ -1124,10 +1123,9 @@ class TestConsumer(
|
|||||||
with self.get_consumer(
|
with self.get_consumer(
|
||||||
filepath=(
|
filepath=(
|
||||||
Path(__file__).parent.parent.parent
|
Path(__file__).parent.parent.parent
|
||||||
/ Path("paperless")
|
/ Path("paperless_mail")
|
||||||
/ Path("tests")
|
/ Path("tests")
|
||||||
/ Path("samples")
|
/ Path("samples")
|
||||||
/ Path("mail")
|
|
||||||
).resolve()
|
).resolve()
|
||||||
/ "html.eml",
|
/ "html.eml",
|
||||||
source=DocumentSource.MailFetch,
|
source=DocumentSource.MailFetch,
|
||||||
@@ -1138,10 +1136,12 @@ class TestConsumer(
|
|||||||
ConsumerError,
|
ConsumerError,
|
||||||
):
|
):
|
||||||
consumer.run()
|
consumer.run()
|
||||||
mock_mail_parser_parse.assert_called_once_with(
|
mock_mail_parser_parse.assert_called_once_with(
|
||||||
consumer.working_copy,
|
consumer.working_copy,
|
||||||
"message/rfc822",
|
"message/rfc822",
|
||||||
)
|
file_name="sample.pdf",
|
||||||
|
mailrule=mock_mailrule_get.return_value,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||||
|
|||||||
@@ -193,13 +193,13 @@ class ParserRegistry:
|
|||||||
that log output is predictable; scoring determines which parser wins
|
that log output is predictable; scoring determines which parser wins
|
||||||
at runtime regardless of registration order.
|
at runtime regardless of registration order.
|
||||||
"""
|
"""
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
self.register_builtin(TextDocumentParser)
|
self.register_builtin(TextDocumentParser)
|
||||||
|
self.register_builtin(RemoteDocumentParser)
|
||||||
self.register_builtin(TikaDocumentParser)
|
self.register_builtin(TikaDocumentParser)
|
||||||
self.register_builtin(MailDocumentParser)
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Discovery
|
# Discovery
|
||||||
|
|||||||
429
src/paperless/parsers/remote.py
Normal file
@@ -0,0 +1,429 @@
|
|||||||
|
"""
|
||||||
|
Built-in remote-OCR document parser.
|
||||||
|
|
||||||
|
Handles documents by sending them to a configured remote OCR engine
|
||||||
|
(currently Azure AI Vision / Document Intelligence) and retrieving both
|
||||||
|
the extracted text and a searchable PDF with an embedded text layer.
|
||||||
|
|
||||||
|
When no engine is configured, ``score()`` returns ``None`` so the parser
|
||||||
|
is effectively invisible to the registry — the tesseract parser handles
|
||||||
|
these MIME types instead.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsing.remote")
|
||||||
|
|
||||||
|
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||||
|
"application/pdf": ".pdf",
|
||||||
|
"image/png": ".png",
|
||||||
|
"image/jpeg": ".jpg",
|
||||||
|
"image/tiff": ".tiff",
|
||||||
|
"image/bmp": ".bmp",
|
||||||
|
"image/gif": ".gif",
|
||||||
|
"image/webp": ".webp",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteEngineConfig:
    """Holds and validates the remote OCR engine configuration.

    Parameters
    ----------
    engine:
        Identifier of the remote engine (currently only ``"azureai"`` is
        recognised), or ``None`` when no engine is configured.
    api_key:
        Credential for the remote engine, or ``None`` when unset.
    endpoint:
        Service endpoint URL, or ``None`` when unset.  Required for the
        ``"azureai"`` engine.
    """

    def __init__(
        self,
        engine: str | None,
        api_key: str | None = None,
        endpoint: str | None = None,
    ) -> None:
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

    def engine_is_valid(self) -> bool:
        """Return True when the engine is known and fully configured.

        Empty strings are treated the same as ``None``: a blank API key or
        endpoint (e.g. from an environment variable that is set but empty)
        cannot authenticate or be contacted, so the configuration is
        reported as invalid rather than failing later on every request.
        """
        if self.engine not in ("azureai",):
            return False
        if not self.api_key:
            return False
        # Azure additionally needs the resource endpoint to talk to.
        if self.engine == "azureai" and not self.endpoint:
            return False
        return True
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteDocumentParser:
    """Parse documents via a remote OCR API (currently Azure AI Vision).

    This parser sends documents to a remote engine that returns both
    extracted text and a searchable PDF with an embedded text layer.
    It does not depend on Tesseract or ocrmypdf.

    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Semantic version string, kept in sync with Paperless-ngx releases.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """

    name: str = "Paperless-ngx Remote OCR Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return the MIME types this parser can handle.

        The full set is always returned regardless of whether a remote
        engine is configured.  The ``score()`` method handles the
        "am I active?" logic by returning ``None`` when not configured.

        Returns
        -------
        dict[str, str]
            Mapping of MIME type to preferred file extension.
        """
        return _SUPPORTED_MIME_TYPES

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return the priority score for handling this file, or None.

        Returns ``None`` when no valid remote engine is configured,
        making the parser invisible to the registry for this file.
        When configured, returns 20 — higher than the Tesseract parser's
        default of 10 — so the remote engine takes priority.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension.
        path:
            Optional filesystem path.  Not inspected by this parser.

        Returns
        -------
        int | None
            20 when the remote engine is configured and the MIME type is
            supported, otherwise None.
        """
        config = RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )
        if not config.engine_is_valid():
            return None
        if mime_type not in _SUPPORTED_MIME_TYPES:
            return None
        return 20

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        Returns
        -------
        bool
            Always True — the remote engine always returns a PDF with an
            embedded text layer that serves as the archive copy.
        """
        return True

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        Returns
        -------
        bool
            Always False — all supported originals are displayable by
            the browser (PDF) or handled via the archive copy (images).
        """
        return False

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def __init__(self, logging_group: object = None) -> None:
        """Create a private scratch directory and reset the result state.

        Parameters
        ----------
        logging_group:
            Opaque value stored on the instance and forwarded to the
            thumbnail helper in ``get_thumbnail``.
        """
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self._tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        self._logging_group = logging_group
        self._text: str | None = None
        self._archive_path: Path | None = None

    def __enter__(self) -> Self:
        """Return the parser itself; resources are acquired in ``__init__``."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Remove the scratch directory; exceptions are never suppressed."""
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        shutil.rmtree(self._tempdir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Send the document to the remote engine and store results.

        Parameters
        ----------
        document_path:
            Absolute path to the document file to parse.
        mime_type:
            Detected MIME type of the document.
        produce_archive:
            Ignored — the remote engine always returns a searchable PDF,
            which is stored as the archive copy regardless of this flag.
        """
        config = RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

        if not config.engine_is_valid():
            logger.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self._text = ""
            return

        if config.engine == "azureai":
            self._text = self._azure_ai_vision_parse(document_path, config)

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str | None:
        """Return the plain-text content extracted during parse."""
        return self._text

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Always None — the remote parser does not detect dates.
        """
        return None

    def get_archive_path(self) -> Path | None:
        """Return the path to the generated archive PDF, or None."""
        return self._archive_path

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Generate a thumbnail image for the document.

        Uses the archive PDF produced by the remote engine when available,
        otherwise falls back to the original document path (PDF inputs).

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail inside the temp directory.
        """
        # make_thumbnail_from_pdf lives in documents.parsers for now;
        # it will move to paperless.parsers.utils when the tesseract
        # parser is migrated in a later phase.
        from documents.parsers import make_thumbnail_from_pdf

        return make_thumbnail_from_pdf(
            self._archive_path or document_path,
            self._tempdir,
            self._logging_group,
        )

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in a PDF document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        int | None
            Page count for PDF inputs, or ``None`` for other MIME types.
        """
        if mime_type != "application/pdf":
            return None

        from paperless.parsers.utils import get_page_count_for_pdf

        return get_page_count_for_pdf(document_path, log=logger)

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata from the document.

        Delegates to the shared pikepdf-based extractor for PDF files.
        Returns ``[]`` for all other MIME types.

        Parameters
        ----------
        document_path:
            Absolute path to the file to extract metadata from.
        mime_type:
            MIME type of the file.  May be ``"application/pdf"`` when
            called for the archive version of an image original.

        Returns
        -------
        list[MetadataEntry]
            Zero or more metadata entries.
        """
        if mime_type != "application/pdf":
            return []

        from paperless.parsers.utils import extract_pdf_metadata

        return extract_pdf_metadata(document_path, log=logger)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _azure_ai_vision_parse(
        self,
        file: Path,
        config: RemoteEngineConfig,
    ) -> str | None:
        """Send ``file`` to Azure AI Document Intelligence and return text.

        Downloads the searchable PDF output from Azure and, once the
        download has completed, stores its location in
        ``self._archive_path``.  Returns the extracted text content, or
        ``None`` on failure (the error is logged).  On failure
        ``self._archive_path`` is reset to ``None`` so callers never see a
        path to a missing or partially written archive.

        Parameters
        ----------
        file:
            Absolute path to the document to analyse.
        config:
            Validated remote engine configuration.

        Returns
        -------
        str | None
            Extracted text, or None if the Azure call failed.
        """
        if TYPE_CHECKING:
            # Callers must have already validated config via engine_is_valid():
            # engine_is_valid() asserts api_key is not None and (for azureai)
            # endpoint is not None, so these casts are provably safe.
            assert config.endpoint is not None
            assert config.api_key is not None

        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=config.endpoint,
            credential=AzureKeyCredential(config.api_key),
        )

        # Download into a local name first; the instance attribute is only
        # updated once the whole PDF has been written successfully.
        archive_path = self._tempdir / "archive.pdf"

        try:
            with file.open("rb") as f:
                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
                poller = client.begin_analyze_document(
                    model_id="prebuilt-read",
                    body=analyze_request,
                    output_content_format=DocumentContentFormat.TEXT,
                    output=[AnalyzeOutputOption.PDF],
                    content_type="application/json",
                )

            poller.wait()
            result_id = poller.details["operation_id"]
            result = poller.result()

            with archive_path.open("wb") as f:
                for chunk in client.get_analyze_result_pdf(
                    model_id="prebuilt-read",
                    result_id=result_id,
                ):
                    f.write(chunk)

            self._archive_path = archive_path
            return result.content

        except Exception as e:
            logger.error("Azure AI Vision parsing failed: %s", e)
            # Do not advertise an archive that is missing or incomplete;
            # any partial file is removed with the temp dir in __exit__.
            self._archive_path = None

        finally:
            client.close()

        return None
|
||||||
130
src/paperless/parsers/utils.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
"""
|
||||||
|
Shared utilities for Paperless-ngx document parsers.
|
||||||
|
|
||||||
|
Functions here are format-neutral helpers that multiple parsers need.
|
||||||
|
Keeping them here avoids parsers inheriting from each other just to
|
||||||
|
share implementation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsers.utils")
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_count_for_pdf(
    document_path: Path,
    log: logging.Logger | None = None,
) -> int | None:
    """Count the pages of a PDF file with pikepdf.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger that receives the warning emitted on failure.  The
        module-level logger is used when this is omitted.

    Returns
    -------
    int | None
        Number of pages, or ``None`` when pikepdf raised while opening or
        reading the file (the error is logged as a warning).
    """
    import pikepdf

    active_log = log or logger

    try:
        with pikepdf.Pdf.open(document_path) as document:
            page_total = len(document.pages)
    except Exception as exc:
        active_log.warning(
            "Unable to determine PDF page count for %s: %s",
            document_path,
            exc,
        )
        return None
    return page_total
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_metadata(
    document_path: Path,
    log: logging.Logger | None = None,
) -> list[MetadataEntry]:
    """Extract XMP/PDF metadata from a PDF file using pikepdf.

    Reads all XMP metadata entries from the document and returns them as a
    list of ``MetadataEntry`` dicts.  Any failure to open the file or read
    a specific key is logged and skipped.  The PDF is closed again before
    the function returns.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger to use for warnings and debug messages.  Falls back to the
        module-level logger when omitted.

    Returns
    -------
    list[MetadataEntry]
        Zero or more metadata entries.  Returns ``[]`` if the file cannot
        be opened or contains no readable XMP metadata.
    """
    import pikepdf

    from paperless.parsers import MetadataEntry

    _log = log or logger
    result: list[MetadataEntry] = []
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    try:
        pdf = pikepdf.open(document_path)
    except Exception as e:
        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
        return []

    # Use the Pdf as a context manager so it is closed on every exit path
    # instead of staying open until garbage collection.
    with pdf:
        try:
            meta = pdf.open_metadata()
        except Exception as e:
            _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
            return []

        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join(str(e) for e in value)
            value = str(value)

            try:
                # Keys look like "{namespace-uri}local-name".
                m = namespace_pattern.match(key)
                if m is None:
                    continue

                namespace = m.group(1)
                key_value = m.group(2)

                try:
                    namespace.encode("utf-8")
                    key_value.encode("utf-8")
                except UnicodeEncodeError as enc_err:
                    _log.debug("Skipping metadata key %s: %s", key, enc_err)
                    continue

                result.append(
                    MetadataEntry(
                        namespace=namespace,
                        prefix=meta.REVERSE_NS[namespace],
                        key=key_value,
                        value=value,
                    ),
                )
            except Exception as e:
                _log.warning(
                    "Error reading metadata key %s value %s: %s",
                    key,
                    value,
                    e,
                )

    return result
|
||||||
@@ -10,7 +10,7 @@ from typing import TYPE_CHECKING
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
@@ -18,6 +18,8 @@ if TYPE_CHECKING:
|
|||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Text parser sample files
|
# Text parser sample files
|
||||||
@@ -78,6 +80,92 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
|||||||
yield parser
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser sample files
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def remote_samples_dir(samples_dir: Path) -> Path:
    """Locate the directory holding the remote parser sample files.

    Returns
    -------
    Path
        ``<samples_dir>/remote/``
    """
    return samples_dir.joinpath("remote")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def sample_pdf_file(remote_samples_dir: Path) -> Path:
    """Locate the simple digital PDF used by the remote parser tests.

    Returns
    -------
    Path
        Path to ``remote/simple-digital.pdf``.
    """
    return remote_samples_dir.joinpath("simple-digital.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser instance
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
    """Provide a RemoteDocumentParser for the duration of a single test.

    The parser is used as a context manager, so it is exited once the test
    that requested the fixture has finished.

    Yields
    ------
    RemoteDocumentParser
        The object produced by entering the parser's context manager.
    """
    parser_cm = RemoteDocumentParser()
    with parser_cm as parser:
        yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser settings helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Point the Django settings at a (fake) Azure AI remote OCR engine.

    Assigns test values to ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY`` and
    ``REMOTE_OCR_ENDPOINT`` on the pytest-django ``settings`` fixture, which
    undoes the changes when the test ends.

    Returns
    -------
    SettingsWrapper
        The same settings object, so callers can apply further overrides.
    """
    overrides = {
        "REMOTE_OCR_ENGINE": "azureai",
        "REMOTE_OCR_API_KEY": "test-api-key",
        "REMOTE_OCR_ENDPOINT": "https://test.cognitiveservices.azure.com",
    }
    for setting_name, setting_value in overrides.items():
        setattr(settings, setting_name, setting_value)
    return settings
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Clear every remote OCR setting so that no engine is configured.

    Returns
    -------
    SettingsWrapper
        The same settings object with the three remote OCR values set to
        ``None``.
    """
    for setting_name in (
        "REMOTE_OCR_ENGINE",
        "REMOTE_OCR_API_KEY",
        "REMOTE_OCR_ENDPOINT",
    ):
        setattr(settings, setting_name, None)
    return settings
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Tika parser sample files
|
# Tika parser sample files
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -159,166 +247,3 @@ def tika_parser() -> Generator[TikaDocumentParser, None, None]:
|
|||||||
"""
|
"""
|
||||||
with TikaDocumentParser() as parser:
|
with TikaDocumentParser() as parser:
|
||||||
yield parser
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Mail parser sample files
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def mail_samples_dir(samples_dir: Path) -> Path:
    """Locate the directory that holds the mail parser sample files.

    Returns
    -------
    Path
        The ``mail`` sub-directory of ``samples_dir``.
    """
    return samples_dir.joinpath("mail")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def broken_email_file(mail_samples_dir: Path) -> Path:
    """Locate the broken/malformed EML sample.

    Returns
    -------
    Path
        ``broken.eml`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("broken.eml")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def simple_txt_email_file(mail_samples_dir: Path) -> Path:
    """Locate the plain-text email sample.

    Returns
    -------
    Path
        ``simple_text.eml`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("simple_text.eml")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
    """Locate the expected PDF rendition of the plain-text email.

    Returns
    -------
    Path
        ``simple_text.eml.pdf`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("simple_text.eml.pdf")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
    """Locate the expected thumbnail of the plain-text email.

    Returns
    -------
    Path
        ``simple_text.eml.pdf.webp`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("simple_text.eml.pdf.webp")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def html_email_file(mail_samples_dir: Path) -> Path:
    """Locate the HTML email sample.

    Returns
    -------
    Path
        ``html.eml`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("html.eml")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def html_email_pdf_file(mail_samples_dir: Path) -> Path:
    """Locate the expected PDF rendition of the HTML email.

    Returns
    -------
    Path
        ``html.eml.pdf`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("html.eml.pdf")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
    """Locate the expected thumbnail of the HTML email.

    Returns
    -------
    Path
        ``html.eml.pdf.webp`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("html.eml.pdf.webp")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def html_email_html_file(mail_samples_dir: Path) -> Path:
    """Locate the HTML body belonging to the HTML email sample.

    Returns
    -------
    Path
        ``html.eml.html`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("html.eml.html")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def merged_pdf_first(mail_samples_dir: Path) -> Path:
    """Locate the first PDF used by the PDF-merge tests.

    Returns
    -------
    Path
        ``first.pdf`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("first.pdf")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def merged_pdf_second(mail_samples_dir: Path) -> Path:
    """Locate the second PDF used by the PDF-merge tests.

    Returns
    -------
    Path
        ``second.pdf`` inside ``mail_samples_dir``.
    """
    return mail_samples_dir.joinpath("second.pdf")
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Mail parser instance
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
def mail_parser() -> Generator[MailDocumentParser, None, None]:
    """Provide a MailDocumentParser for the duration of a single test.

    The parser is used as a context manager, so it is exited once the test
    that requested the fixture has finished.

    Yields
    ------
    MailDocumentParser
        The object produced by entering the parser's context manager.
    """
    parser_cm = MailDocumentParser()
    with parser_cm as parser:
        yield parser
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def nginx_base_url() -> Generator[str, None, None]:
    """Yield the base URL of the nginx HTTP server the tests expect to be alive."""
    base_url = "http://localhost:8080"
    yield base_url
|
|
||||||
|
|||||||
490
src/paperless/tests/parsers/test_remote_parser.py
Normal file
@@ -0,0 +1,490 @@
|
|||||||
|
"""
|
||||||
|
Tests for paperless.parsers.remote.RemoteDocumentParser.
|
||||||
|
|
||||||
|
All tests use the context-manager protocol for parser lifecycle.
|
||||||
|
|
||||||
|
Fixture layout
|
||||||
|
--------------
|
||||||
|
make_azure_mock — factory (defined here; specific to this module)
|
||||||
|
azure_client — composes azure_settings + make_azure_mock + patch;
|
||||||
|
use when a test needs the client to succeed
|
||||||
|
failing_azure_client
|
||||||
|
— composes azure_settings + patch with RuntimeError;
|
||||||
|
use when a test needs the client to fail
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Callable
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-local fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
|
||||||
|
_DEFAULT_TEXT = "Extracted text."
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def make_azure_mock() -> Callable[..., Mock]:
    """Return a factory that builds a mock Azure DocumentIntelligenceClient.

    The factory takes one optional argument, the text the mocked analysis
    result should contain; it defaults to ``_DEFAULT_TEXT``. Because the
    argument is optional, the factory is annotated as ``Callable[..., Mock]``
    rather than ``Callable[[str], Mock]``, which would wrongly declare the
    argument as required.

    Usage::

        mock_client = make_azure_mock()            # default extracted text
        mock_client = make_azure_mock("My text.")  # custom extracted text

    Returns
    -------
    Callable[..., Mock]
        Factory producing a fresh, pre-configured mock client on every call.
    """

    def _factory(text: str = _DEFAULT_TEXT) -> Mock:
        # The poller stands in for the object returned by
        # ``begin_analyze_document``; its result carries the extracted text.
        mock_poller = Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        mock_poller.result.return_value.content = text

        mock_client = Mock()
        mock_client.begin_analyze_document.return_value = mock_poller
        # The PDF download is mocked as an iterable of byte chunks.
        mock_client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
        return mock_client

    return _factory
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def azure_client(
    azure_settings: SettingsWrapper,
    make_azure_mock: Callable[..., Mock],
    mocker: MockerFixture,
) -> Mock:
    """Patch the Azure DI client with a succeeding mock and return the instance.

    Requesting this fixture also applies ``azure_settings``, so tests using it
    do not need ``@pytest.mark.usefixtures("azure_settings")`` as well.

    ``make_azure_mock`` is annotated as ``Callable[..., Mock]`` because it is
    called here without arguments.

    Returns
    -------
    Mock
        The mock instance that the patched client class returns when called.
    """
    mock_client = make_azure_mock()
    # Patch the class so that constructing a client yields the prepared mock.
    mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
    return mock_client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def failing_azure_client(
    azure_settings: SettingsWrapper,
    mocker: MockerFixture,
) -> Mock:
    """Patch the Azure DI client so that starting an analysis raises RuntimeError.

    Requesting this fixture also applies ``azure_settings``. The mock instance
    is returned so tests can assert on calls such as ``close()``.
    """
    broken_client = Mock(
        **{"begin_analyze_document.side_effect": RuntimeError("network failure")},
    )
    mocker.patch(_AZURE_CLIENT_TARGET, return_value=broken_client)
    return broken_client
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Protocol contract
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserProtocol:
    """Check RemoteDocumentParser against the ParserProtocol contract."""

    def test_isinstance_satisfies_protocol(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        """A parser instance passes the ``isinstance`` check for the protocol."""
        assert isinstance(remote_parser, ParserProtocol)

    def test_class_attributes_present(self) -> None:
        """Each descriptive class attribute is a non-empty string."""
        for attribute in ("name", "version", "author", "url"):
            value = getattr(RemoteDocumentParser, attribute)
            assert isinstance(value, str) and value
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# supported_mime_types
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserSupportedMimeTypes:
    """supported_mime_types() returns the full set whatever the configuration."""

    def test_returns_dict(self) -> None:
        """The return value is a plain ``dict``."""
        assert isinstance(RemoteDocumentParser.supported_mime_types(), dict)

    def test_includes_all_expected_types(self) -> None:
        """The keys are exactly the seven expected MIME types."""
        supported = set(RemoteDocumentParser.supported_mime_types())
        assert supported == {
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        }

    @pytest.mark.usefixtures("no_engine_settings")
    def test_returns_full_set_when_not_configured(self) -> None:
        """
        GIVEN: No remote engine is configured
        WHEN: supported_mime_types() is called
        THEN: The full MIME type dict is still returned (score() handles activation)
        """
        assert len(RemoteDocumentParser.supported_mime_types()) == 7
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# score()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserScore:
    """score() carries the activation logic: None when unconfigured, 20 when active."""

    @pytest.mark.usefixtures("azure_settings")
    @pytest.mark.parametrize(
        "mime_type",
        [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ],
        ids=["pdf", "png", "jpeg", "tiff", "bmp", "gif", "webp"],
    )
    def test_score_returns_20_when_configured(self, mime_type: str) -> None:
        """Every supported MIME type scores 20 with a fully configured engine."""
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") == 20

    @pytest.mark.usefixtures("no_engine_settings")
    @pytest.mark.parametrize(
        "mime_type",
        ["application/pdf", "image/png", "image/jpeg"],
        ids=["pdf", "png", "jpeg"],
    )
    def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
        """Without any engine configured the parser declines the file."""
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") is None

    def test_score_returns_none_when_api_key_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        """An engine and endpoint without an API key is not enough."""
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = None
        settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"

        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None

    def test_score_returns_none_when_endpoint_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        """An engine and API key without an endpoint is not enough."""
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = "key"
        settings.REMOTE_OCR_ENDPOINT = None

        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None

    @pytest.mark.usefixtures("azure_settings")
    def test_score_returns_none_for_unsupported_mime_type(self) -> None:
        """A MIME type outside the supported set is declined even when configured."""
        assert RemoteDocumentParser.score("text/plain", "doc.txt") is None

    @pytest.mark.usefixtures("azure_settings")
    def test_score_higher_than_tesseract_default(self) -> None:
        """Remote parser (20) outranks the tesseract default (10) when configured."""
        pdf_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
        assert pdf_score is not None
        assert pdf_score > 10
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserProperties:
    """Fixed capability flags exposed by a parser instance."""

    def test_can_produce_archive_is_true(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        """``can_produce_archive`` is exactly ``True``."""
        flag = remote_parser.can_produce_archive
        assert flag is True

    def test_requires_pdf_rendition_is_false(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        """``requires_pdf_rendition`` is exactly ``False``."""
        flag = remote_parser.requires_pdf_rendition
        assert flag is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserLifecycle:
    """The context manager removes the parser's temporary directory on exit."""

    def test_context_manager_cleans_up_tempdir(self) -> None:
        """The temp dir exists inside the ``with`` block and is gone after it."""
        with RemoteDocumentParser() as parser:
            scratch_dir = parser._tempdir
            assert scratch_dir.exists()
        assert not scratch_dir.exists()

    def test_context_manager_cleans_up_after_exception(self) -> None:
        """The temp dir is removed even when the ``with`` body raises."""
        scratch_dir: Path | None = None
        with pytest.raises(RuntimeError), RemoteDocumentParser() as parser:
            scratch_dir = parser._tempdir
            raise RuntimeError("boom")
        assert scratch_dir is not None
        assert not scratch_dir.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse() — happy path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserParse:
    """parse() and its getters with a succeeding client or no engine at all."""

    def test_parse_returns_text_from_azure(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """The text reported by the mocked client is what get_text() returns."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        extracted = remote_parser.get_text()
        assert extracted == _DEFAULT_TEXT

    def test_parse_sets_archive_path(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """After parsing, the archive path points at an existing ``.pdf`` file."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        archive_path = remote_parser.get_archive_path()
        assert archive_path is not None
        assert archive_path.exists()
        assert archive_path.suffix == ".pdf"

    def test_parse_closes_client_on_success(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """The client's close() is called exactly once on the happy path."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        azure_client.close.assert_called_once()

    @pytest.mark.usefixtures("no_engine_settings")
    def test_parse_sets_empty_text_when_not_configured(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """Without an engine, parse() yields empty text and no archive."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        assert remote_parser.get_text() == ""
        assert remote_parser.get_archive_path() is None

    def test_get_text_none_before_parse(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        """get_text() is ``None`` until parse() has been called."""
        text_before_parse = remote_parser.get_text()
        assert text_before_parse is None

    def test_get_date_always_none(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """get_date() is ``None`` even after a successful parse."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        detected_date = remote_parser.get_date()
        assert detected_date is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse() — Azure failure path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserParseError:
    """parse() when the Azure client raises while starting the analysis."""

    def test_parse_returns_none_on_azure_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
    ) -> None:
        """No text is available after the client has failed."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        text_after_failure = remote_parser.get_text()
        assert text_after_failure is None

    def test_parse_closes_client_on_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
    ) -> None:
        """The client's close() is still called exactly once after a failure."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        failing_azure_client.close.assert_called_once()

    def test_parse_logs_error_on_azure_failure(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
        mocker: MockerFixture,
    ) -> None:
        """Exactly one error is logged and its message names the failure."""
        patched_logger = mocker.patch("paperless.parsers.remote.logger")

        remote_parser.parse(sample_pdf_file, "application/pdf")

        patched_logger.error.assert_called_once()
        logged_message = patched_logger.error.call_args.args[0]
        assert "Azure AI Vision parsing failed" in logged_message
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# get_page_count()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserPageCount:
    """get_page_count() for PDFs, non-PDF MIME types and unreadable PDFs."""

    def test_page_count_for_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """A valid PDF yields an integer count of at least one page."""
        pages = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
        assert isinstance(pages, int)
        assert pages >= 1

    def test_page_count_returns_none_for_image_mime(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """An image MIME type yields ``None``."""
        pages = remote_parser.get_page_count(sample_pdf_file, "image/png")
        assert pages is None

    def test_page_count_returns_none_for_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        """A file that is not really a PDF yields ``None``."""
        broken_pdf = tmp_path / "bad.pdf"
        broken_pdf.write_bytes(b"not a pdf at all")

        pages = remote_parser.get_page_count(broken_pdf, "application/pdf")
        assert pages is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# extract_metadata()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserMetadata:
    """extract_metadata() for PDFs, non-PDF MIME types and unreadable PDFs."""

    def test_extract_metadata_non_pdf_returns_empty(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """A non-PDF MIME type yields an empty list."""
        entries = remote_parser.extract_metadata(sample_pdf_file, "image/png")
        assert entries == []

    def test_extract_metadata_pdf_returns_list(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """A PDF yields a list."""
        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
        assert isinstance(entries, list)

    def test_extract_metadata_pdf_entries_have_required_keys(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """Every entry carries the four expected keys and a string value."""
        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
        for entry in entries:
            for required_key in ("namespace", "prefix", "key", "value"):
                assert required_key in entry
            assert isinstance(entry["value"], str)

    def test_extract_metadata_does_not_raise_on_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        """A file that is not really a PDF yields an empty list, not an error."""
        broken_pdf = tmp_path / "bad.pdf"
        broken_pdf.write_bytes(b"not a pdf at all")

        entries = remote_parser.extract_metadata(broken_pdf, "application/pdf")
        assert entries == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Registry integration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserRegistry:
    """How the parser registry exposes RemoteDocumentParser."""

    def test_registered_in_defaults(self) -> None:
        """register_defaults() adds the remote parser to the built-ins."""
        from paperless.parsers.registry import ParserRegistry

        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()

        assert RemoteDocumentParser in fresh_registry._builtins

    @pytest.mark.usefixtures("azure_settings")
    def test_get_parser_returns_remote_when_configured(self) -> None:
        """With Azure configured, the shared registry picks the remote parser for PDFs."""
        from paperless.parsers.registry import get_parser_registry

        chosen = get_parser_registry().get_parser_for_file(
            "application/pdf",
            "doc.pdf",
        )

        assert chosen is RemoteDocumentParser

    @pytest.mark.usefixtures("no_engine_settings")
    def test_get_parser_returns_none_for_pdf_when_not_configured(self) -> None:
        """With no tesseract parser registered yet, PDF has no handler if remote is off."""
        from paperless.parsers.registry import ParserRegistry

        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()
        chosen = fresh_registry.get_parser_for_file("application/pdf", "doc.pdf")

        assert chosen is None
|
||||||
@@ -1,26 +1,6 @@
|
|||||||
"""
|
|
||||||
Built-in mail document parser.
|
|
||||||
|
|
||||||
Handles message/rfc822 (EML) MIME type by:
|
|
||||||
- Parsing the email using imap_tools
|
|
||||||
- Generating a PDF via Gotenberg (for display and archive)
|
|
||||||
- Extracting text via Tika for HTML content
|
|
||||||
- Extracting metadata from email headers
|
|
||||||
|
|
||||||
The parser always produces a PDF because EML files cannot be rendered
|
|
||||||
natively in a browser (requires_pdf_rendition=True).
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
import re
|
||||||
import shutil
|
|
||||||
import tempfile
|
|
||||||
from html import escape
|
from html import escape
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
from typing import Self
|
|
||||||
|
|
||||||
from bleach import clean
|
from bleach import clean
|
||||||
from bleach import linkify
|
from bleach import linkify
|
||||||
@@ -39,353 +19,65 @@ from imap_tools import MailAttachment
|
|||||||
from imap_tools import MailMessage
|
from imap_tools import MailMessage
|
||||||
from tika_client import TikaClient
|
from tika_client import TikaClient
|
||||||
|
|
||||||
|
from documents.parsers import DocumentParser
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import make_thumbnail_from_pdf
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
from paperless.models import OutputTypeChoices
|
from paperless.models import OutputTypeChoices
|
||||||
from paperless.version import __full_version_str__
|
|
||||||
from paperless_mail.models import MailRule
|
from paperless_mail.models import MailRule
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
import datetime
|
|
||||||
from types import TracebackType
|
|
||||||
|
|
||||||
from paperless.parsers import MetadataEntry
|
class MailDocumentParser(DocumentParser):
|
||||||
|
"""
|
||||||
logger = logging.getLogger("paperless.parsing.mail")
|
This parser uses imap_tools to parse .eml files, generates pdf using
|
||||||
|
Gotenberg and sends the html part to a Tika server for text extraction.
|
||||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
|
||||||
"message/rfc822": ".eml",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class MailDocumentParser:
|
|
||||||
"""Parse .eml email files for Paperless-ngx.
|
|
||||||
|
|
||||||
Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
|
|
||||||
and sends the HTML part to a Tika server for text extraction. Because
|
|
||||||
EML files cannot be rendered natively in a browser, the parser always
|
|
||||||
produces a PDF rendition (requires_pdf_rendition=True).
|
|
||||||
|
|
||||||
The mailrule_id instance attribute may be set by the consumer before
|
|
||||||
calling parse() to apply mail-rule-specific PDF layout options:
|
|
||||||
|
|
||||||
parser.mailrule_id = rule.pk
|
|
||||||
parser.parse(path, mime_type)
|
|
||||||
|
|
||||||
Class attributes
|
|
||||||
----------------
|
|
||||||
name : str
|
|
||||||
Human-readable parser name.
|
|
||||||
version : str
|
|
||||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
|
||||||
author : str
|
|
||||||
Maintainer name.
|
|
||||||
url : str
|
|
||||||
Issue tracker / source URL.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
name: str = "Paperless-ngx Mail Parser"
|
logging_name = "paperless.parsing.mail"
|
||||||
version: str = __full_version_str__
|
|
||||||
author: str = "Paperless-ngx Contributors"
|
|
||||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
|
||||||
# Class methods
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def supported_mime_types(cls) -> dict[str, str]:
|
|
||||||
"""Return the MIME types this parser handles.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
dict[str, str]
|
|
||||||
Mapping of MIME type to preferred file extension.
|
|
||||||
"""
|
"""
|
||||||
return _SUPPORTED_MIME_TYPES
|
Converts our requested PDF/A output into the Gotenberg API
|
||||||
|
format
|
||||||
@classmethod
|
|
||||||
def score(
|
|
||||||
cls,
|
|
||||||
mime_type: str,
|
|
||||||
filename: str,
|
|
||||||
path: Path | None = None,
|
|
||||||
) -> int | None:
|
|
||||||
"""Return the priority score for handling this file.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
mime_type:
|
|
||||||
Detected MIME type of the file.
|
|
||||||
filename:
|
|
||||||
Original filename including extension.
|
|
||||||
path:
|
|
||||||
Optional filesystem path. Not inspected by this parser.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
int | None
|
|
||||||
20 if the MIME type is supported (higher than the default 10 to
|
|
||||||
give the mail parser clear priority), otherwise None.
|
|
||||||
"""
|
"""
|
||||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
if settings.OCR_OUTPUT_TYPE in {
|
||||||
return 20
|
OutputTypeChoices.PDF_A,
|
||||||
|
OutputTypeChoices.PDF_A2,
|
||||||
|
}:
|
||||||
|
return PdfAFormat.A2b
|
||||||
|
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
|
||||||
|
self.log.warning(
|
||||||
|
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||||
|
)
|
||||||
|
return PdfAFormat.A2b
|
||||||
|
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
|
||||||
|
return PdfAFormat.A3b
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Properties
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
@property
|
|
||||||
def can_produce_archive(self) -> bool:
|
|
||||||
"""Whether this parser can produce a searchable PDF archive copy.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
bool
|
|
||||||
Always False — the mail parser produces a display PDF
|
|
||||||
(requires_pdf_rendition=True), not an optional OCR archive.
|
|
||||||
"""
|
|
||||||
return False
|
|
||||||
|
|
||||||
@property
|
|
||||||
def requires_pdf_rendition(self) -> bool:
|
|
||||||
"""Whether the parser must produce a PDF for the frontend to display.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
bool
|
|
||||||
Always True — EML files cannot be rendered natively in a browser,
|
|
||||||
so a PDF conversion is always required for display.
|
|
||||||
"""
|
|
||||||
return True
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Lifecycle
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def __init__(self, logging_group: object = None) -> None:
|
|
||||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
self._tempdir = Path(
|
|
||||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
|
||||||
)
|
|
||||||
self._text: str | None = None
|
|
||||||
self._date: datetime.datetime | None = None
|
|
||||||
self._archive_path: Path | None = None
|
|
||||||
self.mailrule_id: int | None = None
|
|
||||||
|
|
||||||
def __enter__(self) -> Self:
|
|
||||||
return self
|
|
||||||
|
|
||||||
def __exit__(
|
|
||||||
self,
|
|
||||||
exc_type: type[BaseException] | None,
|
|
||||||
exc_val: BaseException | None,
|
|
||||||
exc_tb: TracebackType | None,
|
|
||||||
) -> None:
|
|
||||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
|
||||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Core parsing interface
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def parse(
|
|
||||||
self,
|
|
||||||
document_path: Path,
|
|
||||||
mime_type: str,
|
|
||||||
*,
|
|
||||||
produce_archive: bool = True,
|
|
||||||
) -> None:
|
|
||||||
"""Parse the given .eml into formatted text and a PDF archive.
|
|
||||||
|
|
||||||
The consumer may set ``self.mailrule_id`` before calling this method
|
|
||||||
to apply mail-rule-specific PDF layout options. The ``produce_archive``
|
|
||||||
flag is accepted for protocol compatibility but is always honoured —
|
|
||||||
the mail parser always produces a PDF since EML files cannot be
|
|
||||||
displayed natively.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
document_path:
|
|
||||||
Absolute path to the .eml file.
|
|
||||||
mime_type:
|
|
||||||
Detected MIME type of the document (should be "message/rfc822").
|
|
||||||
produce_archive:
|
|
||||||
Accepted for protocol compatibility. The PDF rendition is always
|
|
||||||
produced since EML files cannot be displayed natively in a browser.
|
|
||||||
|
|
||||||
Raises
|
|
||||||
------
|
|
||||||
documents.parsers.ParseError
|
|
||||||
If the file cannot be parsed or PDF generation fails.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def strip_text(text: str) -> str:
|
|
||||||
"""Reduces the spacing of the given text string."""
|
|
||||||
text = re.sub(r"\s+", " ", text)
|
|
||||||
text = re.sub(r"(\n *)+", "\n", text)
|
|
||||||
return text.strip()
|
|
||||||
|
|
||||||
def build_formatted_text(mail_message: MailMessage) -> str:
|
|
||||||
"""Constructs a formatted string based on the given email."""
|
|
||||||
fmt_text = f"Subject: {mail_message.subject}\n\n"
|
|
||||||
fmt_text += f"From: {mail_message.from_values.full}\n\n"
|
|
||||||
to_list = [address.full for address in mail_message.to_values]
|
|
||||||
fmt_text += f"To: {', '.join(to_list)}\n\n"
|
|
||||||
if mail_message.cc_values:
|
|
||||||
fmt_text += (
|
|
||||||
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
|
|
||||||
)
|
|
||||||
if mail_message.bcc_values:
|
|
||||||
fmt_text += (
|
|
||||||
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
|
|
||||||
)
|
|
||||||
if mail_message.attachments:
|
|
||||||
att = []
|
|
||||||
for a in mail.attachments:
|
|
||||||
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
|
|
||||||
att.append(
|
|
||||||
f"{a.filename} ({attachment_size})",
|
|
||||||
)
|
|
||||||
fmt_text += f"Attachments: {', '.join(att)}\n\n"
|
|
||||||
|
|
||||||
if mail.html:
|
|
||||||
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
|
|
||||||
|
|
||||||
fmt_text += f"\n\n{strip_text(mail.text)}"
|
|
||||||
|
|
||||||
return fmt_text
|
|
||||||
|
|
||||||
logger.debug("Parsing file %s into an email", document_path.name)
|
|
||||||
mail = self.parse_file_to_message(document_path)
|
|
||||||
|
|
||||||
logger.debug("Building formatted text from email")
|
|
||||||
self._text = build_formatted_text(mail)
|
|
||||||
|
|
||||||
if is_naive(mail.date):
|
|
||||||
self._date = make_aware(mail.date)
|
|
||||||
else:
|
|
||||||
self._date = mail.date
|
|
||||||
|
|
||||||
logger.debug("Creating a PDF from the email")
|
|
||||||
if self.mailrule_id:
|
|
||||||
rule = MailRule.objects.get(pk=self.mailrule_id)
|
|
||||||
self._archive_path = self.generate_pdf(mail, rule.pdf_layout)
|
|
||||||
else:
|
|
||||||
self._archive_path = self.generate_pdf(mail)
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Result accessors
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def get_text(self) -> str | None:
|
|
||||||
"""Return the plain-text content extracted during parse.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
str | None
|
|
||||||
Extracted text, or None if parse has not been called yet.
|
|
||||||
"""
|
|
||||||
return self._text
|
|
||||||
|
|
||||||
def get_date(self) -> datetime.datetime | None:
|
|
||||||
"""Return the document date detected during parse.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
datetime.datetime | None
|
|
||||||
Date from the email headers, or None if not detected.
|
|
||||||
"""
|
|
||||||
return self._date
|
|
||||||
|
|
||||||
def get_archive_path(self) -> Path | None:
|
|
||||||
"""Return the path to the generated archive PDF, or None.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path | None
|
|
||||||
Path to the PDF produced by Gotenberg, or None if parse has not
|
|
||||||
been called yet.
|
|
||||||
"""
|
|
||||||
return self._archive_path
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Thumbnail and metadata
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def get_thumbnail(
|
def get_thumbnail(
|
||||||
self,
|
self,
|
||||||
document_path: Path,
|
document_path: Path,
|
||||||
mime_type: str,
|
mime_type: str,
|
||||||
file_name: str | None = None,
|
file_name=None,
|
||||||
) -> Path:
|
) -> Path:
|
||||||
"""Generate a thumbnail from the PDF rendition of the email.
|
if not self.archive_path:
|
||||||
|
self.archive_path = self.generate_pdf(
|
||||||
Converts the document to PDF first if not already done.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
document_path:
|
|
||||||
Absolute path to the source document.
|
|
||||||
mime_type:
|
|
||||||
Detected MIME type of the document.
|
|
||||||
file_name:
|
|
||||||
Kept for backward compatibility; not used.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path
|
|
||||||
Path to the generated WebP thumbnail inside the temporary directory.
|
|
||||||
"""
|
|
||||||
if not self._archive_path:
|
|
||||||
self._archive_path = self.generate_pdf(
|
|
||||||
self.parse_file_to_message(document_path),
|
self.parse_file_to_message(document_path),
|
||||||
)
|
)
|
||||||
|
|
||||||
return make_thumbnail_from_pdf(
|
return make_thumbnail_from_pdf(
|
||||||
self._archive_path,
|
self.archive_path,
|
||||||
self._tempdir,
|
self.tempdir,
|
||||||
|
self.logging_group,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_page_count(
|
def extract_metadata(self, document_path: Path, mime_type: str):
|
||||||
self,
|
result = []
|
||||||
document_path: Path,
|
|
||||||
mime_type: str,
|
|
||||||
) -> int | None:
|
|
||||||
"""Return the number of pages in the document.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
int | None
|
|
||||||
Always None — page count is not available for email files.
|
|
||||||
"""
|
|
||||||
return None
|
|
||||||
|
|
||||||
def extract_metadata(
|
|
||||||
self,
|
|
||||||
document_path: Path,
|
|
||||||
mime_type: str,
|
|
||||||
) -> list[MetadataEntry]:
|
|
||||||
"""Extract metadata from the email headers.
|
|
||||||
|
|
||||||
Returns email headers as metadata entries with prefix "header",
|
|
||||||
plus summary entries for attachments and date.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
list[MetadataEntry]
|
|
||||||
Sorted list of metadata entries, or ``[]`` on parse failure.
|
|
||||||
"""
|
|
||||||
result: list[MetadataEntry] = []
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mail = self.parse_file_to_message(document_path)
|
mail = self.parse_file_to_message(document_path)
|
||||||
except ParseError as e:
|
except ParseError as e:
|
||||||
logger.warning(
|
self.log.warning(
|
||||||
"Error while fetching document metadata for %s: %s",
|
f"Error while fetching document metadata for {document_path}: {e}",
|
||||||
document_path,
|
|
||||||
e,
|
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -394,7 +86,7 @@ class MailDocumentParser:
|
|||||||
try:
|
try:
|
||||||
value.encode("utf-8")
|
value.encode("utf-8")
|
||||||
except UnicodeEncodeError as e: # pragma: no cover
|
except UnicodeEncodeError as e: # pragma: no cover
|
||||||
logger.debug("Skipping header %s: %s", key, e)
|
self.log.debug(f"Skipping header {key}: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
result.append(
|
result.append(
|
||||||
@@ -431,44 +123,81 @@ class MailDocumentParser:
|
|||||||
result.sort(key=lambda item: (item["prefix"], item["key"]))
|
result.sort(key=lambda item: (item["prefix"], item["key"]))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
def parse(
|
||||||
# Email-specific methods
|
self,
|
||||||
# ------------------------------------------------------------------
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
file_name=None,
|
||||||
|
mailrule_id: int | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Parses the given .eml into formatted text, based on the decoded email.
|
||||||
|
|
||||||
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
|
"""
|
||||||
"""Convert the OCR output type setting to a Gotenberg PdfAFormat."""
|
|
||||||
if settings.OCR_OUTPUT_TYPE in {
|
def strip_text(text: str):
|
||||||
OutputTypeChoices.PDF_A,
|
"""
|
||||||
OutputTypeChoices.PDF_A2,
|
Reduces the spacing of the given text string
|
||||||
}:
|
"""
|
||||||
return PdfAFormat.A2b
|
text = re.sub(r"\s+", " ", text)
|
||||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
|
text = re.sub(r"(\n *)+", "\n", text)
|
||||||
logger.warning(
|
return text.strip()
|
||||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
|
||||||
)
|
def build_formatted_text(mail_message: MailMessage) -> str:
|
||||||
return PdfAFormat.A2b
|
"""
|
||||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
|
Constructs a formatted string, based on the given email. Basically tries
|
||||||
return PdfAFormat.A3b
|
to get most of the email content, included front matter, into a nice string
|
||||||
return None
|
"""
|
||||||
|
fmt_text = f"Subject: {mail_message.subject}\n\n"
|
||||||
|
fmt_text += f"From: {mail_message.from_values.full}\n\n"
|
||||||
|
to_list = [address.full for address in mail_message.to_values]
|
||||||
|
fmt_text += f"To: {', '.join(to_list)}\n\n"
|
||||||
|
if mail_message.cc_values:
|
||||||
|
fmt_text += (
|
||||||
|
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
|
||||||
|
)
|
||||||
|
if mail_message.bcc_values:
|
||||||
|
fmt_text += (
|
||||||
|
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
|
||||||
|
)
|
||||||
|
if mail_message.attachments:
|
||||||
|
att = []
|
||||||
|
for a in mail.attachments:
|
||||||
|
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
|
||||||
|
att.append(
|
||||||
|
f"{a.filename} ({attachment_size})",
|
||||||
|
)
|
||||||
|
fmt_text += f"Attachments: {', '.join(att)}\n\n"
|
||||||
|
|
||||||
|
if mail.html:
|
||||||
|
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
|
||||||
|
|
||||||
|
fmt_text += f"\n\n{strip_text(mail.text)}"
|
||||||
|
|
||||||
|
return fmt_text
|
||||||
|
|
||||||
|
self.log.debug(f"Parsing file {document_path.name} into an email")
|
||||||
|
mail = self.parse_file_to_message(document_path)
|
||||||
|
|
||||||
|
self.log.debug("Building formatted text from email")
|
||||||
|
self.text = build_formatted_text(mail)
|
||||||
|
|
||||||
|
if is_naive(mail.date):
|
||||||
|
self.date = make_aware(mail.date)
|
||||||
|
else:
|
||||||
|
self.date = mail.date
|
||||||
|
|
||||||
|
self.log.debug("Creating a PDF from the email")
|
||||||
|
if mailrule_id:
|
||||||
|
rule = MailRule.objects.get(pk=mailrule_id)
|
||||||
|
self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
|
||||||
|
else:
|
||||||
|
self.archive_path = self.generate_pdf(mail)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_file_to_message(filepath: Path) -> MailMessage:
|
def parse_file_to_message(filepath: Path) -> MailMessage:
|
||||||
"""Parse the given .eml file into a MailMessage object.
|
"""
|
||||||
|
Parses the given .eml file into a MailMessage object
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
filepath:
|
|
||||||
Path to the .eml file.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
MailMessage
|
|
||||||
Parsed mail message.
|
|
||||||
|
|
||||||
Raises
|
|
||||||
------
|
|
||||||
documents.parsers.ParseError
|
|
||||||
If the file cannot be parsed or is missing required fields.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
with filepath.open("rb") as eml:
|
with filepath.open("rb") as eml:
|
||||||
@@ -484,25 +213,8 @@ class MailDocumentParser:
|
|||||||
|
|
||||||
return parsed
|
return parsed
|
||||||
|
|
||||||
def tika_parse(self, html: str) -> str:
|
def tika_parse(self, html: str):
|
||||||
"""Send HTML content to the Tika server for text extraction.
|
self.log.info("Sending content to Tika server")
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
html:
|
|
||||||
HTML string to parse.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
str
|
|
||||||
Extracted plain text.
|
|
||||||
|
|
||||||
Raises
|
|
||||||
------
|
|
||||||
documents.parsers.ParseError
|
|
||||||
If the Tika server cannot be reached or returns an error.
|
|
||||||
"""
|
|
||||||
logger.info("Sending content to Tika server")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
||||||
@@ -522,32 +234,16 @@ class MailDocumentParser:
|
|||||||
mail_message: MailMessage,
|
mail_message: MailMessage,
|
||||||
pdf_layout: MailRule.PdfLayout | None = None,
|
pdf_layout: MailRule.PdfLayout | None = None,
|
||||||
) -> Path:
|
) -> Path:
|
||||||
"""Generate a PDF from the email message.
|
archive_path = Path(self.tempdir) / "merged.pdf"
|
||||||
|
|
||||||
Creates separate PDFs for the email body and HTML content, then
|
|
||||||
merges them according to the requested layout.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
mail_message:
|
|
||||||
Parsed email message.
|
|
||||||
pdf_layout:
|
|
||||||
Layout option for the PDF. Falls back to the
|
|
||||||
EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path
|
|
||||||
Path to the generated PDF inside the temporary directory.
|
|
||||||
"""
|
|
||||||
archive_path = Path(self._tempdir) / "merged.pdf"
|
|
||||||
|
|
||||||
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
|
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
|
||||||
|
|
||||||
pdf_layout = pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
|
pdf_layout = (
|
||||||
|
pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
|
||||||
|
) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
|
||||||
|
|
||||||
# If no HTML content, create the PDF from the message.
|
# If no HTML content, create the PDF from the message
|
||||||
# Otherwise, create 2 PDFs and merge them with Gotenberg.
|
# Otherwise, create 2 PDFs and merge them with Gotenberg
|
||||||
if not mail_message.html:
|
if not mail_message.html:
|
||||||
archive_path.write_bytes(mail_pdf_file.read_bytes())
|
archive_path.write_bytes(mail_pdf_file.read_bytes())
|
||||||
else:
|
else:
|
||||||
@@ -556,7 +252,7 @@ class MailDocumentParser:
|
|||||||
mail_message.attachments,
|
mail_message.attachments,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug("Merging email text and HTML content into single PDF")
|
self.log.debug("Merging email text and HTML content into single PDF")
|
||||||
|
|
||||||
with (
|
with (
|
||||||
GotenbergClient(
|
GotenbergClient(
|
||||||
@@ -591,21 +287,15 @@ class MailDocumentParser:
|
|||||||
return archive_path
|
return archive_path
|
||||||
|
|
||||||
def mail_to_html(self, mail: MailMessage) -> Path:
|
def mail_to_html(self, mail: MailMessage) -> Path:
|
||||||
"""Convert the given email into an HTML file using a template.
|
"""
|
||||||
|
Converts the given email into an HTML file, formatted
|
||||||
Parameters
|
based on the given template
|
||||||
----------
|
|
||||||
mail:
|
|
||||||
Parsed mail message.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path
|
|
||||||
Path to the rendered HTML file inside the temporary directory.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def clean_html(text: str) -> str:
|
def clean_html(text: str) -> str:
|
||||||
"""Attempt to clean, escape, and linkify the given HTML string."""
|
"""
|
||||||
|
Attempts to clean, escape and linkify the given HTML string
|
||||||
|
"""
|
||||||
if isinstance(text, list):
|
if isinstance(text, list):
|
||||||
text = "\n".join([str(e) for e in text])
|
text = "\n".join([str(e) for e in text])
|
||||||
if not isinstance(text, str):
|
if not isinstance(text, str):
|
||||||
@@ -650,37 +340,19 @@ class MailDocumentParser:
|
|||||||
|
|
||||||
from django.template.loader import render_to_string
|
from django.template.loader import render_to_string
|
||||||
|
|
||||||
html_file = Path(self._tempdir) / "email_as_html.html"
|
html_file = Path(self.tempdir) / "email_as_html.html"
|
||||||
html_file.write_text(render_to_string("email_msg_template.html", context=data))
|
html_file.write_text(render_to_string("email_msg_template.html", context=data))
|
||||||
|
|
||||||
return html_file
|
return html_file
|
||||||
|
|
||||||
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
|
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
|
||||||
"""Create a PDF from the email body using an HTML template and Gotenberg.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
mail:
|
|
||||||
Parsed mail message.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path
|
|
||||||
Path to the generated PDF inside the temporary directory.
|
|
||||||
|
|
||||||
Raises
|
|
||||||
------
|
|
||||||
documents.parsers.ParseError
|
|
||||||
If Gotenberg returns an error.
|
|
||||||
"""
|
"""
|
||||||
logger.info("Converting mail to PDF")
|
Creates a PDF based on the given email, using the email's values in a
|
||||||
|
an HTML template
|
||||||
|
"""
|
||||||
|
self.log.info("Converting mail to PDF")
|
||||||
|
|
||||||
css_file = (
|
css_file = Path(__file__).parent / "templates" / "output.css"
|
||||||
Path(__file__).parent.parent.parent
|
|
||||||
/ "paperless_mail"
|
|
||||||
/ "templates"
|
|
||||||
/ "output.css"
|
|
||||||
)
|
|
||||||
email_html_file = self.mail_to_html(mail)
|
email_html_file = self.mail_to_html(mail)
|
||||||
|
|
||||||
with (
|
with (
|
||||||
@@ -716,7 +388,7 @@ class MailDocumentParser:
|
|||||||
f"Error while converting email to PDF: {err}",
|
f"Error while converting email to PDF: {err}",
|
||||||
) from err
|
) from err
|
||||||
|
|
||||||
email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
|
email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
|
||||||
email_as_pdf_file.write_bytes(response.content)
|
email_as_pdf_file.write_bytes(response.content)
|
||||||
|
|
||||||
return email_as_pdf_file
|
return email_as_pdf_file
|
||||||
@@ -726,27 +398,11 @@ class MailDocumentParser:
|
|||||||
orig_html: str,
|
orig_html: str,
|
||||||
attachments: list[MailAttachment],
|
attachments: list[MailAttachment],
|
||||||
) -> Path:
|
) -> Path:
|
||||||
"""Generate a PDF from the HTML content of the email.
|
"""
|
||||||
|
Generates a PDF file based on the HTML and attachments of the email
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
orig_html:
|
|
||||||
Raw HTML string from the email body.
|
|
||||||
attachments:
|
|
||||||
List of email attachments (used as inline resources).
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Path
|
|
||||||
Path to the generated PDF inside the temporary directory.
|
|
||||||
|
|
||||||
Raises
|
|
||||||
------
|
|
||||||
documents.parsers.ParseError
|
|
||||||
If Gotenberg returns an error.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def clean_html_script(text: str) -> str:
|
def clean_html_script(text: str):
|
||||||
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
|
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
|
||||||
text = compiled_open.sub("<div hidden ", text)
|
text = compiled_open.sub("<div hidden ", text)
|
||||||
|
|
||||||
@@ -754,9 +410,9 @@ class MailDocumentParser:
|
|||||||
text = compiled_close.sub("</div", text)
|
text = compiled_close.sub("</div", text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
logger.info("Converting message html to PDF")
|
self.log.info("Converting message html to PDF")
|
||||||
|
|
||||||
tempdir = Path(self._tempdir)
|
tempdir = Path(self.tempdir)
|
||||||
|
|
||||||
html_clean = clean_html_script(orig_html)
|
html_clean = clean_html_script(orig_html)
|
||||||
html_clean_file = tempdir / "index.html"
|
html_clean_file = tempdir / "index.html"
|
||||||
@@ -817,3 +473,9 @@ class MailDocumentParser:
|
|||||||
html_pdf = tempdir / "html.pdf"
|
html_pdf = tempdir / "html.pdf"
|
||||||
html_pdf.write_bytes(response.content)
|
html_pdf.write_bytes(response.content)
|
||||||
return html_pdf
|
return html_pdf
|
||||||
|
|
||||||
|
def get_settings(self) -> None:
|
||||||
|
"""
|
||||||
|
This parser does not implement additional settings yet
|
||||||
|
"""
|
||||||
|
return None
|
||||||
@@ -1,12 +1,7 @@
|
|||||||
def get_parser(*args, **kwargs):
|
def get_parser(*args, **kwargs):
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
# MailDocumentParser accepts no constructor args in the new-style protocol.
|
return MailDocumentParser(*args, **kwargs)
|
||||||
# Pop legacy args that arrive from the signal-based consumer path.
|
|
||||||
# Phase 4 will replace this signal path with the ParserRegistry.
|
|
||||||
kwargs.pop("logging_group", None)
|
|
||||||
kwargs.pop("progress_callback", None)
|
|
||||||
return MailDocumentParser()
|
|
||||||
|
|
||||||
|
|
||||||
def mail_consumer_declaration(sender, **kwargs):
|
def mail_consumer_declaration(sender, **kwargs):
|
||||||
|
|||||||
@@ -1,9 +1,71 @@
|
|||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from paperless_mail.mail import MailAccountHandler
|
from paperless_mail.mail import MailAccountHandler
|
||||||
from paperless_mail.models import MailAccount
|
from paperless_mail.models import MailAccount
|
||||||
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def sample_dir() -> Path:
|
||||||
|
return (Path(__file__).parent / Path("samples")).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def broken_email_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "broken.eml"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_txt_email_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "simple_text.eml"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "simple_text.eml.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "simple_text.eml.pdf.webp"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def html_email_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "html.eml"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def html_email_pdf_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "html.eml.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def html_email_thumbnail_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "html.eml.pdf.webp"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def html_email_html_file(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "html.eml.html"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def merged_pdf_first(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "first.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def merged_pdf_second(sample_dir: Path) -> Path:
|
||||||
|
return sample_dir / "second.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def mail_parser() -> MailDocumentParser:
|
||||||
|
return MailDocumentParser(logging_group=None)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
@@ -27,3 +89,11 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
|
|||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def mail_account_handler() -> MailAccountHandler:
|
def mail_account_handler() -> MailAccountHandler:
|
||||||
return MailAccountHandler()
|
return MailAccountHandler()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def nginx_base_url() -> Generator[str, None, None]:
|
||||||
|
"""
|
||||||
|
The base URL for the nginx HTTP server we expect to be alive
|
||||||
|
"""
|
||||||
|
yield "http://localhost:8080"
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 6.0 KiB After Width: | Height: | Size: 6.0 KiB |
|
Before Width: | Height: | Size: 2.8 KiB After Width: | Height: | Size: 2.8 KiB |
|
Before Width: | Height: | Size: 6.9 KiB After Width: | Height: | Size: 6.9 KiB |
|
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.2 KiB |
@@ -12,7 +12,7 @@ from pytest_httpx import HTTPXMock
|
|||||||
from pytest_mock import MockerFixture
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class TestEmailFileParsing:
|
class TestEmailFileParsing:
|
||||||
@@ -24,7 +24,7 @@ class TestEmailFileParsing:
|
|||||||
def test_parse_error_missing_file(
|
def test_parse_error_missing_file(
|
||||||
self,
|
self,
|
||||||
mail_parser: MailDocumentParser,
|
mail_parser: MailDocumentParser,
|
||||||
mail_samples_dir: Path,
|
sample_dir: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@@ -35,7 +35,7 @@ class TestEmailFileParsing:
|
|||||||
- An Exception is thrown
|
- An Exception is thrown
|
||||||
"""
|
"""
|
||||||
# Check if exception is raised when parsing fails.
|
# Check if exception is raised when parsing fails.
|
||||||
test_file = mail_samples_dir / "doesntexist.eml"
|
test_file = sample_dir / "doesntexist.eml"
|
||||||
|
|
||||||
assert not test_file.exists()
|
assert not test_file.exists()
|
||||||
|
|
||||||
@@ -246,12 +246,12 @@ class TestEmailThumbnailGenerate:
|
|||||||
"""
|
"""
|
||||||
mocked_return = "Passing the return value through.."
|
mocked_return = "Passing the return value through.."
|
||||||
mock_make_thumbnail_from_pdf = mocker.patch(
|
mock_make_thumbnail_from_pdf = mocker.patch(
|
||||||
"paperless.parsers.mail.make_thumbnail_from_pdf",
|
"paperless_mail.parsers.make_thumbnail_from_pdf",
|
||||||
)
|
)
|
||||||
mock_make_thumbnail_from_pdf.return_value = mocked_return
|
mock_make_thumbnail_from_pdf.return_value = mocked_return
|
||||||
|
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
mock_generate_pdf.return_value = "Mocked return value.."
|
mock_generate_pdf.return_value = "Mocked return value.."
|
||||||
|
|
||||||
@@ -260,7 +260,8 @@ class TestEmailThumbnailGenerate:
|
|||||||
mock_generate_pdf.assert_called_once()
|
mock_generate_pdf.assert_called_once()
|
||||||
mock_make_thumbnail_from_pdf.assert_called_once_with(
|
mock_make_thumbnail_from_pdf.assert_called_once_with(
|
||||||
"Mocked return value..",
|
"Mocked return value..",
|
||||||
mail_parser._tempdir,
|
mail_parser.tempdir,
|
||||||
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert mocked_return == thumb
|
assert mocked_return == thumb
|
||||||
@@ -372,7 +373,7 @@ class TestParser:
|
|||||||
"""
|
"""
|
||||||
# Validate parsing returns the expected results
|
# Validate parsing returns the expected results
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
|
|
||||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||||
@@ -384,7 +385,7 @@ class TestParser:
|
|||||||
"BCC: fdf@fvf.de\n\n"
|
"BCC: fdf@fvf.de\n\n"
|
||||||
"\n\nThis is just a simple Text Mail."
|
"\n\nThis is just a simple Text Mail."
|
||||||
)
|
)
|
||||||
assert text_expected == mail_parser.get_text()
|
assert text_expected == mail_parser.text
|
||||||
assert (
|
assert (
|
||||||
datetime.datetime(
|
datetime.datetime(
|
||||||
2022,
|
2022,
|
||||||
@@ -395,7 +396,7 @@ class TestParser:
|
|||||||
43,
|
43,
|
||||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||||
)
|
)
|
||||||
== mail_parser.get_date()
|
== mail_parser.date
|
||||||
)
|
)
|
||||||
|
|
||||||
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
|
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
|
||||||
@@ -418,7 +419,7 @@ class TestParser:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Validate parsing returns the expected results
|
# Validate parsing returns the expected results
|
||||||
@@ -442,7 +443,7 @@ class TestParser:
|
|||||||
mail_parser.parse(html_email_file, "message/rfc822")
|
mail_parser.parse(html_email_file, "message/rfc822")
|
||||||
|
|
||||||
mock_generate_pdf.assert_called_once()
|
mock_generate_pdf.assert_called_once()
|
||||||
assert text_expected == mail_parser.get_text()
|
assert text_expected == mail_parser.text
|
||||||
assert (
|
assert (
|
||||||
datetime.datetime(
|
datetime.datetime(
|
||||||
2022,
|
2022,
|
||||||
@@ -453,7 +454,7 @@ class TestParser:
|
|||||||
19,
|
19,
|
||||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||||
)
|
)
|
||||||
== mail_parser.get_date()
|
== mail_parser.date
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_generate_pdf_parse_error(
|
def test_generate_pdf_parse_error(
|
||||||
@@ -500,7 +501,7 @@ class TestParser:
|
|||||||
|
|
||||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||||
|
|
||||||
assert mail_parser.get_archive_path() is not None
|
assert mail_parser.archive_path is not None
|
||||||
|
|
||||||
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
|
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
|
||||||
def test_generate_pdf_html_email(
|
def test_generate_pdf_html_email(
|
||||||
@@ -541,7 +542,7 @@ class TestParser:
|
|||||||
)
|
)
|
||||||
mail_parser.parse(html_email_file, "message/rfc822")
|
mail_parser.parse(html_email_file, "message/rfc822")
|
||||||
|
|
||||||
assert mail_parser.get_archive_path() is not None
|
assert mail_parser.archive_path is not None
|
||||||
|
|
||||||
def test_generate_pdf_html_email_html_to_pdf_failure(
|
def test_generate_pdf_html_email_html_to_pdf_failure(
|
||||||
self,
|
self,
|
||||||
@@ -711,10 +712,10 @@ class TestParser:
|
|||||||
|
|
||||||
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
|
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
|
||||||
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
|
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
|
||||||
mail_parser.mailrule_id = 1
|
|
||||||
mail_parser.parse(
|
mail_parser.parse(
|
||||||
document_path=html_email_file,
|
document_path=html_email_file,
|
||||||
mime_type="message/rfc822",
|
mime_type="message/rfc822",
|
||||||
|
mailrule_id=1,
|
||||||
)
|
)
|
||||||
args, _ = mock_merge_route.call_args
|
args, _ = mock_merge_route.call_args
|
||||||
assert len(args[0]) == expected_calls
|
assert len(args[0]) == expected_calls
|
||||||
@@ -11,7 +11,7 @@ from PIL import Image
|
|||||||
from pytest_mock import MockerFixture
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
from documents.tests.utils import util_call_with_backoff
|
from documents.tests.utils import util_call_with_backoff
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
|
|
||||||
def extract_text(pdf_path: Path) -> str:
|
def extract_text(pdf_path: Path) -> str:
|
||||||
@@ -159,7 +159,7 @@ class TestParserLive:
|
|||||||
- The returned thumbnail image file shall match the expected hash
|
- The returned thumbnail image file shall match the expected hash
|
||||||
"""
|
"""
|
||||||
mock_generate_pdf = mocker.patch(
|
mock_generate_pdf = mocker.patch(
|
||||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||||
)
|
)
|
||||||
mock_generate_pdf.return_value = simple_txt_email_pdf_file
|
mock_generate_pdf.return_value = simple_txt_email_pdf_file
|
||||||
|
|
||||||
@@ -216,10 +216,10 @@ class TestParserLive:
|
|||||||
- The merged PDF shall contain text from both source PDFs
|
- The merged PDF shall contain text from both source PDFs
|
||||||
"""
|
"""
|
||||||
mock_generate_pdf_from_html = mocker.patch(
|
mock_generate_pdf_from_html = mocker.patch(
|
||||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
|
||||||
)
|
)
|
||||||
mock_generate_pdf_from_mail = mocker.patch(
|
mock_generate_pdf_from_mail = mocker.patch(
|
||||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
|
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
|
||||||
)
|
)
|
||||||
mock_generate_pdf_from_mail.return_value = merged_pdf_first
|
mock_generate_pdf_from_mail.return_value = merged_pdf_first
|
||||||
mock_generate_pdf_from_html.return_value = merged_pdf_second
|
mock_generate_pdf_from_html.return_value = merged_pdf_second
|
||||||
@@ -1,118 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
|
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteEngineConfig:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
engine: str,
|
|
||||||
api_key: str | None = None,
|
|
||||||
endpoint: str | None = None,
|
|
||||||
):
|
|
||||||
self.engine = engine
|
|
||||||
self.api_key = api_key
|
|
||||||
self.endpoint = endpoint
|
|
||||||
|
|
||||||
def engine_is_valid(self):
|
|
||||||
valid = self.engine in ["azureai"] and self.api_key is not None
|
|
||||||
if self.engine == "azureai":
|
|
||||||
valid = valid and self.endpoint is not None
|
|
||||||
return valid
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
|
||||||
"""
|
|
||||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
|
||||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
|
||||||
"""
|
|
||||||
|
|
||||||
logging_name = "paperless.parsing.remote"
|
|
||||||
|
|
||||||
def get_settings(self) -> RemoteEngineConfig:
|
|
||||||
"""
|
|
||||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
|
||||||
"""
|
|
||||||
return RemoteEngineConfig(
|
|
||||||
engine=settings.REMOTE_OCR_ENGINE,
|
|
||||||
api_key=settings.REMOTE_OCR_API_KEY,
|
|
||||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
|
||||||
)
|
|
||||||
|
|
||||||
def supported_mime_types(self):
|
|
||||||
if self.settings.engine_is_valid():
|
|
||||||
return {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/png": ".png",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
"image/tiff": ".tiff",
|
|
||||||
"image/bmp": ".bmp",
|
|
||||||
"image/gif": ".gif",
|
|
||||||
"image/webp": ".webp",
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def azure_ai_vision_parse(
|
|
||||||
self,
|
|
||||||
file: Path,
|
|
||||||
) -> str | None:
|
|
||||||
"""
|
|
||||||
Uses Azure AI Vision to parse the document and return the text content.
|
|
||||||
It requests a searchable PDF output with embedded text.
|
|
||||||
The PDF is saved to the archive_path attribute.
|
|
||||||
Returns the text content extracted from the document.
|
|
||||||
If the parsing fails, it returns None.
|
|
||||||
"""
|
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
|
||||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
|
||||||
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
|
||||||
from azure.ai.documentintelligence.models import DocumentContentFormat
|
|
||||||
from azure.core.credentials import AzureKeyCredential
|
|
||||||
|
|
||||||
client = DocumentIntelligenceClient(
|
|
||||||
endpoint=self.settings.endpoint,
|
|
||||||
credential=AzureKeyCredential(self.settings.api_key),
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
with file.open("rb") as f:
|
|
||||||
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
|
||||||
poller = client.begin_analyze_document(
|
|
||||||
model_id="prebuilt-read",
|
|
||||||
body=analyze_request,
|
|
||||||
output_content_format=DocumentContentFormat.TEXT,
|
|
||||||
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
|
|
||||||
content_type="application/json",
|
|
||||||
)
|
|
||||||
|
|
||||||
poller.wait()
|
|
||||||
result_id = poller.details["operation_id"]
|
|
||||||
result = poller.result()
|
|
||||||
|
|
||||||
# Download the PDF with embedded text
|
|
||||||
self.archive_path = self.tempdir / "archive.pdf"
|
|
||||||
with self.archive_path.open("wb") as f:
|
|
||||||
for chunk in client.get_analyze_result_pdf(
|
|
||||||
model_id="prebuilt-read",
|
|
||||||
result_id=result_id,
|
|
||||||
):
|
|
||||||
f.write(chunk)
|
|
||||||
return result.content
|
|
||||||
except Exception as e:
|
|
||||||
self.log.error(f"Azure AI Vision parsing failed: {e}")
|
|
||||||
finally:
|
|
||||||
client.close()
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
|
||||||
if not self.settings.engine_is_valid():
|
|
||||||
self.log.warning(
|
|
||||||
"No valid remote parser engine is configured, content will be empty.",
|
|
||||||
)
|
|
||||||
self.text = ""
|
|
||||||
elif self.settings.engine == "azureai":
|
|
||||||
self.text = self.azure_ai_vision_parse(document_path)
|
|
||||||
@@ -1,16 +1,36 @@
|
|||||||
def get_parser(*args, **kwargs):
|
from __future__ import annotations
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
|
||||||
|
# The new RemoteDocumentParser does not accept the progress_callback
|
||||||
|
# kwarg injected by the old signal-based consumer. logging_group is
|
||||||
|
# forwarded as a positional arg.
|
||||||
|
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||||
|
kwargs.pop("progress_callback", None)
|
||||||
return RemoteDocumentParser(*args, **kwargs)
|
return RemoteDocumentParser(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def get_supported_mime_types():
|
def get_supported_mime_types() -> dict[str, str]:
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
from django.conf import settings
|
||||||
|
|
||||||
return RemoteDocumentParser(None).supported_mime_types()
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.remote import RemoteEngineConfig
|
||||||
|
|
||||||
|
config = RemoteEngineConfig(
|
||||||
|
engine=settings.REMOTE_OCR_ENGINE,
|
||||||
|
api_key=settings.REMOTE_OCR_API_KEY,
|
||||||
|
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||||
|
)
|
||||||
|
if not config.engine_is_valid():
|
||||||
|
return {}
|
||||||
|
return RemoteDocumentParser.supported_mime_types()
|
||||||
|
|
||||||
|
|
||||||
def remote_consumer_declaration(sender, **kwargs):
|
def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"parser": get_parser,
|
"parser": get_parser,
|
||||||
"weight": 5,
|
"weight": 5,
|
||||||
|
|||||||
@@ -1,131 +0,0 @@
|
|||||||
import uuid
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.test import TestCase
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from documents.tests.utils import DirectoriesMixin
|
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
|
||||||
from paperless_remote.signals import get_parser
|
|
||||||
|
|
||||||
|
|
||||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|
||||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
|
||||||
|
|
||||||
def assertContainsStrings(self, content: str, strings: list[str]) -> None:
|
|
||||||
# Asserts that all strings appear in content, in the given order.
|
|
||||||
indices = []
|
|
||||||
for s in strings:
|
|
||||||
if s in content:
|
|
||||||
indices.append(content.index(s))
|
|
||||||
else:
|
|
||||||
self.fail(f"'{s}' is not in '{content}'")
|
|
||||||
self.assertListEqual(indices, sorted(indices))
|
|
||||||
|
|
||||||
@mock.patch("paperless_tesseract.parsers.run_subprocess")
|
|
||||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
|
||||||
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
|
|
||||||
# Arrange mock Azure client
|
|
||||||
mock_client = mock.Mock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
|
|
||||||
# Simulate poller result and its `.details`
|
|
||||||
mock_poller = mock.Mock()
|
|
||||||
mock_poller.wait.return_value = None
|
|
||||||
mock_poller.details = {"operation_id": "fake-op-id"}
|
|
||||||
mock_client.begin_analyze_document.return_value = mock_poller
|
|
||||||
mock_poller.result.return_value.content = "This is a test document."
|
|
||||||
|
|
||||||
# Return dummy PDF bytes
|
|
||||||
mock_client.get_analyze_result_pdf.return_value = [
|
|
||||||
b"%PDF-",
|
|
||||||
b"1.7 ",
|
|
||||||
b"FAKEPDF",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Simulate pdftotext by writing dummy text to sidecar file
|
|
||||||
def fake_run(cmd, *args, **kwargs) -> None:
|
|
||||||
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
|
|
||||||
f.write("This is a test document.")
|
|
||||||
|
|
||||||
mock_subprocess.side_effect = fake_run
|
|
||||||
|
|
||||||
with override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="somekey",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
):
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
parser.parse(
|
|
||||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
|
||||||
"application/pdf",
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertContainsStrings(
|
|
||||||
parser.text.strip(),
|
|
||||||
["This is a test document."],
|
|
||||||
)
|
|
||||||
|
|
||||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
|
||||||
def test_get_text_with_azure_error_logged_and_returns_none(
|
|
||||||
self,
|
|
||||||
mock_client_cls,
|
|
||||||
) -> None:
|
|
||||||
mock_client = mock.Mock()
|
|
||||||
mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
|
|
||||||
with override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="somekey",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
):
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
with mock.patch.object(parser.log, "error") as mock_log_error:
|
|
||||||
parser.parse(
|
|
||||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
|
||||||
"application/pdf",
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertIsNone(parser.text)
|
|
||||||
mock_client.begin_analyze_document.assert_called_once()
|
|
||||||
mock_client.close.assert_called_once()
|
|
||||||
mock_log_error.assert_called_once()
|
|
||||||
self.assertIn(
|
|
||||||
"Azure AI Vision parsing failed",
|
|
||||||
mock_log_error.call_args[0][0],
|
|
||||||
)
|
|
||||||
|
|
||||||
@override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="key",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
)
|
|
||||||
def test_supported_mime_types_valid_config(self) -> None:
|
|
||||||
parser = RemoteDocumentParser(uuid.uuid4())
|
|
||||||
expected_types = {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/png": ".png",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
"image/tiff": ".tiff",
|
|
||||||
"image/bmp": ".bmp",
|
|
||||||
"image/gif": ".gif",
|
|
||||||
"image/webp": ".webp",
|
|
||||||
}
|
|
||||||
self.assertEqual(parser.supported_mime_types(), expected_types)
|
|
||||||
|
|
||||||
def test_supported_mime_types_invalid_config(self) -> None:
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
self.assertEqual(parser.supported_mime_types(), {})
|
|
||||||
|
|
||||||
@override_settings(
|
|
||||||
REMOTE_OCR_ENGINE=None,
|
|
||||||
REMOTE_OCR_API_KEY=None,
|
|
||||||
REMOTE_OCR_ENDPOINT=None,
|
|
||||||
)
|
|
||||||
def test_parse_with_invalid_config(self) -> None:
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
|
|
||||||
self.assertEqual(parser.text, "")
|
|
||||||
@@ -1,4 +1,9 @@
|
|||||||
def get_parser(*args, **kwargs):
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
# TextDocumentParser accepts logging_group for constructor compatibility but
|
# TextDocumentParser accepts logging_group for constructor compatibility but
|
||||||
@@ -9,10 +14,10 @@ def get_parser(*args, **kwargs):
|
|||||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||||
kwargs.pop("logging_group", None)
|
kwargs.pop("logging_group", None)
|
||||||
kwargs.pop("progress_callback", None)
|
kwargs.pop("progress_callback", None)
|
||||||
return TextDocumentParser()
|
return TextDocumentParser(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def text_consumer_declaration(sender, **kwargs):
|
def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"parser": get_parser,
|
"parser": get_parser,
|
||||||
"weight": 10,
|
"weight": 10,
|
||||||
|
|||||||