mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-14 13:11:24 +00:00
Compare commits
8 Commits
dependabot
...
feature-re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2098a11eb1 | ||
|
|
af8a8e791b | ||
|
|
8d4163bef3 | ||
|
|
e9e1d4ccca | ||
|
|
c955ba7d07 | ||
|
|
7028bb2163 | ||
|
|
5d4d87764c | ||
|
|
75dce7f19f |
@@ -51,6 +51,7 @@ from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
@@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
|
||||
|
||||
TODO(stumpylog): Remove me in the future
|
||||
"""
|
||||
if isinstance(parser, TextDocumentParser):
|
||||
if isinstance(parser, (TextDocumentParser, RemoteDocumentParser)):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
@@ -476,7 +477,10 @@ class ConsumerPlugin(
|
||||
self.filename,
|
||||
self.input_doc.mailrule_id,
|
||||
)
|
||||
elif isinstance(document_parser, TextDocumentParser):
|
||||
elif isinstance(
|
||||
document_parser,
|
||||
(TextDocumentParser, RemoteDocumentParser),
|
||||
):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
else:
|
||||
@@ -489,7 +493,7 @@ class ConsumerPlugin(
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
if isinstance(document_parser, TextDocumentParser):
|
||||
if isinstance(document_parser, (TextDocumentParser, RemoteDocumentParser)):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
else:
|
||||
|
||||
@@ -193,9 +193,11 @@ class ParserRegistry:
|
||||
that log output is predictable; scoring determines which parser wins
|
||||
at runtime regardless of registration order.
|
||||
"""
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
self.register_builtin(TextDocumentParser)
|
||||
self.register_builtin(RemoteDocumentParser)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Discovery
|
||||
|
||||
429
src/paperless/parsers/remote.py
Normal file
429
src/paperless/parsers/remote.py
Normal file
@@ -0,0 +1,429 @@
|
||||
"""
|
||||
Built-in remote-OCR document parser.
|
||||
|
||||
Handles documents by sending them to a configured remote OCR engine
|
||||
(currently Azure AI Vision / Document Intelligence) and retrieving both
|
||||
the extracted text and a searchable PDF with an embedded text layer.
|
||||
|
||||
When no engine is configured, ``score()`` returns ``None`` so the parser
|
||||
is effectively invisible to the registry — the tesseract parser handles
|
||||
these MIME types instead.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.remote")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/tiff": ".tiff",
|
||||
"image/bmp": ".bmp",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
|
||||
|
||||
class RemoteEngineConfig:
    """Holds and validates the remote OCR engine configuration.

    Parameters
    ----------
    engine:
        Identifier of the remote engine (currently only ``"azureai"`` is
        recognised), or ``None`` when no engine is configured.
    api_key:
        Credential used to authenticate against the remote engine.
    endpoint:
        Base URL of the remote engine. Required for ``"azureai"``.
    """

    # Engines this configuration object knows how to validate.
    _KNOWN_ENGINES: tuple[str, ...] = ("azureai",)

    def __init__(
        self,
        engine: str | None,
        api_key: str | None = None,
        endpoint: str | None = None,
    ) -> None:
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

    def engine_is_valid(self) -> bool:
        """Return True when the engine is known and fully configured.

        A value that is ``None`` *or* an empty string counts as missing:
        an environment variable that is set but left blank yields ``""``,
        which can never be a usable API key or endpoint.

        Returns
        -------
        bool
            True only if the engine is recognised, an API key is present,
            and (for ``"azureai"``) an endpoint is present.
        """
        if self.engine not in self._KNOWN_ENGINES:
            return False
        if not self.api_key:
            return False
        # Azure additionally needs the resource endpoint to build a client.
        if self.engine == "azureai" and not self.endpoint:
            return False
        return True
|
||||
|
||||
|
||||
class RemoteDocumentParser:
    """Parse documents via a remote OCR API (currently Azure AI Vision).

    This parser sends documents to a remote engine that returns both
    extracted text and a searchable PDF with an embedded text layer.
    It does not depend on Tesseract or ocrmypdf.

    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Semantic version string, kept in sync with Paperless-ngx releases.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """

    name: str = "Paperless-ngx Remote OCR Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return the MIME types this parser can handle.

        The full set is always returned regardless of whether a remote
        engine is configured. The ``score()`` method handles the
        "am I active?" logic by returning ``None`` when not configured.

        Returns
        -------
        dict[str, str]
            Mapping of MIME type to preferred file extension.
        """
        return _SUPPORTED_MIME_TYPES

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return the priority score for handling this file, or None.

        Returns ``None`` when no valid remote engine is configured,
        making the parser invisible to the registry for this file.
        When configured, returns 20 — higher than the Tesseract parser's
        default of 10 — so the remote engine takes priority.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension.
        path:
            Optional filesystem path. Not inspected by this parser.

        Returns
        -------
        int | None
            20 when the remote engine is configured and the MIME type is
            supported, otherwise None.
        """
        config = RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )
        if not config.engine_is_valid():
            return None
        if mime_type not in _SUPPORTED_MIME_TYPES:
            return None
        return 20

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        Returns
        -------
        bool
            Always True — the remote engine always returns a PDF with an
            embedded text layer that serves as the archive copy.
        """
        return True

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        Returns
        -------
        bool
            Always False — all supported originals are displayable by
            the browser (PDF) or handled via the archive copy (images).
        """
        return False

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def __init__(self, logging_group: object = None) -> None:
        """Create a private scratch directory and reset the parse results.

        Parameters
        ----------
        logging_group:
            Opaque value stored and later forwarded to the thumbnail
            helper; not interpreted by this class.
        """
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self._tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        self._logging_group = logging_group
        self._text: str | None = None
        self._archive_path: Path | None = None

    def __enter__(self) -> Self:
        """Return the parser itself for use in a ``with`` statement."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Remove the scratch directory; never suppresses exceptions."""
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        shutil.rmtree(self._tempdir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Send the document to the remote engine and store results.

        Parameters
        ----------
        document_path:
            Absolute path to the document file to parse.
        mime_type:
            Detected MIME type of the document.
        produce_archive:
            Ignored — the remote engine always returns a searchable PDF,
            which is stored as the archive copy regardless of this flag.
        """
        config = RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

        if not config.engine_is_valid():
            logger.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self._text = ""
            return

        if config.engine == "azureai":
            self._text = self._azure_ai_vision_parse(document_path, config)

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str | None:
        """Return the plain-text content extracted during parse."""
        return self._text

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Always None — the remote parser does not detect dates.
        """
        return None

    def get_archive_path(self) -> Path | None:
        """Return the path to the generated archive PDF, or None.

        The path is only set once the archive PDF has been written in
        full, so a non-None value never refers to a partial download.
        """
        return self._archive_path

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Generate a thumbnail image for the document.

        Uses the archive PDF produced by the remote engine when available,
        otherwise falls back to the original document path (PDF inputs).

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail inside the temp directory.
        """
        # make_thumbnail_from_pdf lives in documents.parsers for now;
        # it will move to paperless.parsers.utils when the tesseract
        # parser is migrated in a later phase.
        from documents.parsers import make_thumbnail_from_pdf

        return make_thumbnail_from_pdf(
            self._archive_path or document_path,
            self._tempdir,
            self._logging_group,
        )

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in a PDF document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        int | None
            Page count for PDF inputs, or ``None`` for other MIME types.
        """
        if mime_type != "application/pdf":
            return None

        from paperless.parsers.utils import get_page_count_for_pdf

        return get_page_count_for_pdf(document_path, log=logger)

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata from the document.

        Delegates to the shared pikepdf-based extractor for PDF files.
        Returns ``[]`` for all other MIME types.

        Parameters
        ----------
        document_path:
            Absolute path to the file to extract metadata from.
        mime_type:
            MIME type of the file. May be ``"application/pdf"`` when
            called for the archive version of an image original.

        Returns
        -------
        list[MetadataEntry]
            Zero or more metadata entries.
        """
        if mime_type != "application/pdf":
            return []

        from paperless.parsers.utils import extract_pdf_metadata

        return extract_pdf_metadata(document_path, log=logger)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _azure_ai_vision_parse(
        self,
        file: Path,
        config: RemoteEngineConfig,
    ) -> str | None:
        """Send ``file`` to Azure AI Document Intelligence and return text.

        Downloads the searchable PDF output from Azure and, once the
        download has been written completely, stores its location in
        ``self._archive_path``. If anything fails, the error is logged,
        ``self._archive_path`` is left as ``None`` and ``None`` is returned,
        so callers never see a path to a missing or truncated archive.

        Parameters
        ----------
        file:
            Absolute path to the document to analyse.
        config:
            Validated remote engine configuration.

        Returns
        -------
        str | None
            Extracted text, or None if the Azure call failed.
        """
        if TYPE_CHECKING:
            # Callers must have already validated config via engine_is_valid():
            # it guarantees api_key and (for azureai) endpoint are set, so
            # these narrowing asserts are for the type checker only.
            assert config.endpoint is not None
            assert config.api_key is not None

        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=config.endpoint,
            credential=AzureKeyCredential(config.api_key),
        )

        # Forget any archive from an earlier attempt; it is only published
        # again after this run has fully succeeded.
        self._archive_path = None
        archive_path = self._tempdir / "archive.pdf"

        try:
            with file.open("rb") as f:
                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())

            poller = client.begin_analyze_document(
                model_id="prebuilt-read",
                body=analyze_request,
                output_content_format=DocumentContentFormat.TEXT,
                output=[AnalyzeOutputOption.PDF],
                content_type="application/json",
            )

            poller.wait()
            result_id = poller.details["operation_id"]
            result = poller.result()

            with archive_path.open("wb") as f:
                for chunk in client.get_analyze_result_pdf(
                    model_id="prebuilt-read",
                    result_id=result_id,
                ):
                    f.write(chunk)

            # Publish the archive only now that it is complete on disk.
            self._archive_path = archive_path
            return result.content

        except Exception as e:
            logger.error("Azure AI Vision parsing failed: %s", e)
            # A partial file may remain in the scratch directory; it is
            # removed with the directory in __exit__ and is never exposed.
            self._archive_path = None

        finally:
            client.close()

        return None
|
||||
130
src/paperless/parsers/utils.py
Normal file
130
src/paperless/parsers/utils.py
Normal file
@@ -0,0 +1,130 @@
|
||||
"""
|
||||
Shared utilities for Paperless-ngx document parsers.
|
||||
|
||||
Functions here are format-neutral helpers that multiple parsers need.
|
||||
Keeping them here avoids parsers inheriting from each other just to
|
||||
share implementation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
|
||||
def get_page_count_for_pdf(
    document_path: Path,
    log: logging.Logger | None = None,
) -> int | None:
    """Count the pages of a PDF file with pikepdf.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger that receives the warning on failure. The module-level
        logger is used when this is omitted.

    Returns
    -------
    int | None
        Number of pages, or ``None`` when opening or reading the file
        raised an exception (a warning is logged in that case).
    """
    import pikepdf

    active_log = log or logger

    try:
        pdf = pikepdf.Pdf.open(document_path)
        with pdf:
            page_total = len(pdf.pages)
    except Exception as err:
        active_log.warning(
            "Unable to determine PDF page count for %s: %s",
            document_path,
            err,
        )
        return None

    return page_total
|
||||
|
||||
|
||||
def extract_pdf_metadata(
    document_path: Path,
    log: logging.Logger | None = None,
) -> list[MetadataEntry]:
    """Extract XMP/PDF metadata from a PDF file using pikepdf.

    Reads all XMP metadata entries from the document and returns them as a
    list of ``MetadataEntry`` dicts. Failure to open the file or its
    metadata is logged and yields ``[]``; failure on an individual key is
    logged and that key is skipped. The PDF is always closed before the
    function returns.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger to use for warnings and debug messages. Falls back to the
        module-level logger when omitted.

    Returns
    -------
    list[MetadataEntry]
        Zero or more metadata entries. Returns ``[]`` if the file cannot
        be opened or contains no readable XMP metadata.
    """
    import pikepdf

    from paperless.parsers import MetadataEntry

    _log = log or logger
    result: list[MetadataEntry] = []
    # Keys look like "{namespace-uri}local-name"; split them into the two parts.
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    try:
        pdf = pikepdf.open(document_path)
    except Exception as e:
        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
        return []

    # The context manager closes the underlying file on every exit path;
    # previously the handle stayed open until garbage collection.
    with pdf:
        try:
            meta = pdf.open_metadata()
        except Exception as e:
            _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
            return []

        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join(str(e) for e in value)
            value = str(value)

            try:
                m = namespace_pattern.match(key)
                if m is None:
                    # Not a namespaced key; nothing to report.
                    continue

                namespace = m.group(1)
                key_value = m.group(2)

                try:
                    # Skip keys that cannot be represented as UTF-8.
                    namespace.encode("utf-8")
                    key_value.encode("utf-8")
                except UnicodeEncodeError as enc_err:
                    _log.debug("Skipping metadata key %s: %s", key, enc_err)
                    continue

                result.append(
                    MetadataEntry(
                        namespace=namespace,
                        prefix=meta.REVERSE_NS[namespace],
                        key=key_value,
                        value=value,
                    ),
                )
            except Exception as e:
                _log.warning(
                    "Error reading metadata key %s value %s: %s",
                    key,
                    value,
                    e,
                )

    return result
|
||||
@@ -10,12 +10,15 @@ from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Text parser sample files
|
||||
@@ -74,3 +77,89 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
||||
"""
|
||||
with TextDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def remote_samples_dir(samples_dir: Path) -> Path:
    """Directory that holds the sample files for the remote parser.

    Returns
    -------
    Path
        ``<samples_dir>/remote/``
    """
    return samples_dir.joinpath("remote")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def sample_pdf_file(remote_samples_dir: Path) -> Path:
    """Location of the simple digital PDF used by the remote parser tests.

    Returns
    -------
    Path
        Path to ``remote/simple-digital.pdf`` below the samples directory.
    """
    return remote_samples_dir.joinpath("simple-digital.pdf")
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
    """Provide a RemoteDocumentParser inside its context manager.

    The parser is entered before the test and exited afterwards, which
    removes its temporary directory.

    Yields
    ------
    RemoteDocumentParser
        A ready-to-use parser instance.
    """
    with RemoteDocumentParser() as active_parser:
        yield active_parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser settings helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Point the Django settings at a valid Azure AI OCR engine.

    Overrides ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY`` and
    ``REMOTE_OCR_ENDPOINT`` with test values. pytest-django restores the
    previous values automatically once the test has finished.

    Returns
    -------
    SettingsWrapper
        The modified settings object (for chaining further overrides).
    """
    overrides = {
        "REMOTE_OCR_ENGINE": "azureai",
        "REMOTE_OCR_API_KEY": "test-api-key",
        "REMOTE_OCR_ENDPOINT": "https://test.cognitiveservices.azure.com",
    }
    for setting_name, setting_value in overrides.items():
        setattr(settings, setting_name, setting_value)
    return settings
|
||||
|
||||
|
||||
@pytest.fixture()
def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Clear every remote OCR setting so that no engine is configured.

    Returns
    -------
    SettingsWrapper
        The modified settings object.
    """
    for setting_name in (
        "REMOTE_OCR_ENGINE",
        "REMOTE_OCR_API_KEY",
        "REMOTE_OCR_ENDPOINT",
    ):
        setattr(settings, setting_name, None)
    return settings
|
||||
|
||||
490
src/paperless/tests/parsers/test_remote_parser.py
Normal file
490
src/paperless/tests/parsers/test_remote_parser.py
Normal file
@@ -0,0 +1,490 @@
|
||||
"""
|
||||
Tests for paperless.parsers.remote.RemoteDocumentParser.
|
||||
|
||||
All tests use the context-manager protocol for parser lifecycle.
|
||||
|
||||
Fixture layout
|
||||
--------------
|
||||
make_azure_mock — factory (defined here; specific to this module)
|
||||
azure_client — composes azure_settings + make_azure_mock + patch;
|
||||
use when a test needs the client to succeed
|
||||
failing_azure_client
|
||||
— composes azure_settings + patch with RuntimeError;
|
||||
use when a test needs the client to fail
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-local fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
|
||||
_DEFAULT_TEXT = "Extracted text."
|
||||
|
||||
|
||||
@pytest.fixture()
def make_azure_mock() -> Callable[[str], Mock]:
    """Provide a factory for mock Azure DocumentIntelligenceClient objects.

    Usage::

        mock_client = make_azure_mock()            # default extracted text
        mock_client = make_azure_mock("My text.")  # custom extracted text
    """

    def _factory(text: str = _DEFAULT_TEXT) -> Mock:
        # Poller handed back by begin_analyze_document().
        poller = Mock()
        poller.wait.return_value = None
        poller.details = {"operation_id": "fake-op-id"}
        poller.result.return_value.content = text

        # Client exposing the analyse call and the PDF download.
        client = Mock()
        client.begin_analyze_document.return_value = poller
        client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
        return client

    return _factory
|
||||
|
||||
|
||||
@pytest.fixture()
def azure_client(
    azure_settings: SettingsWrapper,
    make_azure_mock: Callable[[str], Mock],
    mocker: MockerFixture,
) -> Mock:
    """Replace the Azure DI client class with one that yields a working mock.

    Requesting this fixture also applies ``azure_settings``, so tests do not
    need an extra ``@pytest.mark.usefixtures("azure_settings")``. The mock
    client instance is returned for assertions.
    """
    succeeding_client = make_azure_mock()
    mocker.patch(_AZURE_CLIENT_TARGET, return_value=succeeding_client)
    return succeeding_client
|
||||
|
||||
|
||||
@pytest.fixture()
def failing_azure_client(
    azure_settings: SettingsWrapper,
    mocker: MockerFixture,
) -> Mock:
    """Replace the Azure DI client class with one whose analysis call fails.

    ``begin_analyze_document`` on the mock raises ``RuntimeError``.
    Requesting this fixture also applies ``azure_settings``. The mock
    instance is returned so tests can assert on calls such as ``close()``.
    """
    broken_client = Mock()
    broken_client.begin_analyze_document.side_effect = RuntimeError("network failure")
    mocker.patch(_AZURE_CLIENT_TARGET, return_value=broken_client)
    return broken_client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Protocol contract
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserProtocol:
    """Check RemoteDocumentParser against the ParserProtocol contract."""

    def test_isinstance_satisfies_protocol(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        assert isinstance(remote_parser, ParserProtocol)

    def test_class_attributes_present(self) -> None:
        # Each descriptive class attribute must be a non-empty string.
        for attr_name in ("name", "version", "author", "url"):
            attr_value = getattr(RemoteDocumentParser, attr_name)
            assert isinstance(attr_value, str) and attr_value
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# supported_mime_types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserSupportedMimeTypes:
    """supported_mime_types() reports the full set whatever the configuration."""

    def test_returns_dict(self) -> None:
        assert isinstance(RemoteDocumentParser.supported_mime_types(), dict)

    def test_includes_all_expected_types(self) -> None:
        expected = {
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        }
        reported = set(RemoteDocumentParser.supported_mime_types().keys())
        assert expected == reported

    @pytest.mark.usefixtures("no_engine_settings")
    def test_returns_full_set_when_not_configured(self) -> None:
        """
        GIVEN: No remote engine is configured
        WHEN: supported_mime_types() is called
        THEN: The full MIME type dict is still returned (score() handles activation)
        """
        assert len(RemoteDocumentParser.supported_mime_types()) == 7
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# score()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserScore:
    """score() carries the activation logic: None if unconfigured, 20 if active."""

    @pytest.mark.usefixtures("azure_settings")
    @pytest.mark.parametrize(
        "mime_type",
        [
            pytest.param("application/pdf", id="pdf"),
            pytest.param("image/png", id="png"),
            pytest.param("image/jpeg", id="jpeg"),
            pytest.param("image/tiff", id="tiff"),
            pytest.param("image/bmp", id="bmp"),
            pytest.param("image/gif", id="gif"),
            pytest.param("image/webp", id="webp"),
        ],
    )
    def test_score_returns_20_when_configured(self, mime_type: str) -> None:
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") == 20

    @pytest.mark.usefixtures("no_engine_settings")
    @pytest.mark.parametrize(
        "mime_type",
        [
            pytest.param("application/pdf", id="pdf"),
            pytest.param("image/png", id="png"),
            pytest.param("image/jpeg", id="jpeg"),
        ],
    )
    def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") is None

    def test_score_returns_none_when_api_key_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        # Engine and endpoint are set; only the API key is absent.
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = None
        settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None

    def test_score_returns_none_when_endpoint_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        # Engine and API key are set; only the endpoint is absent.
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = "key"
        settings.REMOTE_OCR_ENDPOINT = None
        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None

    @pytest.mark.usefixtures("azure_settings")
    def test_score_returns_none_for_unsupported_mime_type(self) -> None:
        assert RemoteDocumentParser.score("text/plain", "doc.txt") is None

    @pytest.mark.usefixtures("azure_settings")
    def test_score_higher_than_tesseract_default(self) -> None:
        """Remote parser (20) outranks the tesseract default (10) when configured."""
        remote_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
        assert remote_score is not None and remote_score > 10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Properties
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserProperties:
    """Pins the capability flags exposed by a RemoteDocumentParser instance."""

    def test_can_produce_archive_is_true(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        """The parser advertises that it can produce an archive file."""
        flag = remote_parser.can_produce_archive
        assert flag is True

    def test_requires_pdf_rendition_is_false(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        """The parser advertises that it does not need a PDF rendition."""
        flag = remote_parser.requires_pdf_rendition
        assert flag is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserLifecycle:
    """Context-manager behaviour: the parser's temp directory must not outlive it."""

    def test_context_manager_cleans_up_tempdir(self) -> None:
        """The temp directory exists inside the ``with`` block and is gone after it."""
        with RemoteDocumentParser() as parser:
            scratch_dir = parser._tempdir
            assert scratch_dir.exists()

        assert not scratch_dir.exists()

    def test_context_manager_cleans_up_after_exception(self) -> None:
        """An exception raised inside the block must still trigger cleanup."""
        scratch_dir: Path | None = None

        with pytest.raises(RuntimeError), RemoteDocumentParser() as parser:
            scratch_dir = parser._tempdir
            raise RuntimeError("boom")

        # The assignment above must have run before the exception fired.
        assert scratch_dir is not None
        assert not scratch_dir.exists()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse() — happy path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserParse:
    """Happy-path expectations for parse() and the accessors read after it.

    Several tests request the ``azure_client`` fixture without using it
    directly; presumably it installs the mocked Azure client (defined
    outside this section -- confirm in conftest).
    """

    def test_parse_returns_text_from_azure(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """After parse(), get_text() equals the default mocked text."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        extracted = remote_parser.get_text()
        assert extracted == _DEFAULT_TEXT

    def test_parse_sets_archive_path(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """After parse(), the archive path points at an existing ``.pdf`` file."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        archive_file = remote_parser.get_archive_path()
        assert archive_file is not None
        assert archive_file.exists()
        assert archive_file.suffix == ".pdf"

    def test_parse_closes_client_on_success(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """The client's close() is called exactly once on a successful parse."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        azure_client.close.assert_called_once_with()

    @pytest.mark.usefixtures("no_engine_settings")
    def test_parse_sets_empty_text_when_not_configured(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """Without a configured engine, text is empty and no archive is produced."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        text_after = remote_parser.get_text()
        archive_after = remote_parser.get_archive_path()
        assert text_after == ""
        assert archive_after is None

    def test_get_text_none_before_parse(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        """get_text() is None on a parser that has not parsed anything yet."""
        text_before = remote_parser.get_text()
        assert text_before is None

    def test_get_date_always_none(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_client: Mock,
    ) -> None:
        """get_date() is None even after a successful parse."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        detected_date = remote_parser.get_date()
        assert detected_date is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse() — Azure failure path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserParseError:
    """Expectations for parse() when the ``failing_azure_client`` fixture is active."""

    def test_parse_returns_none_on_azure_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
    ) -> None:
        """A failed Azure call leaves get_text() at None."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        text_after = remote_parser.get_text()
        assert text_after is None

    def test_parse_closes_client_on_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
    ) -> None:
        """The client's close() is still called exactly once on failure."""
        remote_parser.parse(sample_pdf_file, "application/pdf")

        failing_azure_client.close.assert_called_once_with()

    def test_parse_logs_error_on_azure_failure(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        failing_azure_client: Mock,
        mocker: MockerFixture,
    ) -> None:
        """Exactly one error is logged, and its message names the failure."""
        patched_logger = mocker.patch("paperless.parsers.remote.logger")

        remote_parser.parse(sample_pdf_file, "application/pdf")

        patched_logger.error.assert_called_once()
        # First positional argument of the single error() call is the message.
        logged_message = patched_logger.error.call_args.args[0]
        assert "Azure AI Vision parsing failed" in logged_message
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_page_count()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserPageCount:
    """Expectations for get_page_count() across valid, non-PDF and broken inputs."""

    def test_page_count_for_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """A real PDF yields an integer page count of at least one."""
        pages = remote_parser.get_page_count(sample_pdf_file, "application/pdf")

        assert isinstance(pages, int)
        assert pages >= 1

    def test_page_count_returns_none_for_image_mime(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """An image MIME type yields None, whatever the file contents are."""
        pages = remote_parser.get_page_count(sample_pdf_file, "image/png")

        assert pages is None

    def test_page_count_returns_none_for_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        """A file that is not a parseable PDF yields None."""
        broken_pdf = tmp_path / "bad.pdf"
        broken_pdf.write_bytes(b"not a pdf at all")

        pages = remote_parser.get_page_count(broken_pdf, "application/pdf")

        assert pages is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# extract_metadata()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserMetadata:
    """Expectations for extract_metadata() on PDF, non-PDF and broken inputs."""

    def test_extract_metadata_non_pdf_returns_empty(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """A non-PDF MIME type yields an empty list."""
        metadata = remote_parser.extract_metadata(sample_pdf_file, "image/png")

        assert metadata == []

    def test_extract_metadata_pdf_returns_list(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """A PDF yields a list (its contents are checked separately)."""
        metadata = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")

        assert isinstance(metadata, list)

    def test_extract_metadata_pdf_entries_have_required_keys(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """Every returned entry carries the four expected keys and a string value."""
        metadata = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")

        # NOTE: passes vacuously when the sample PDF yields no entries.
        for item in metadata:
            for required_key in ("namespace", "prefix", "key", "value"):
                assert required_key in item
            assert isinstance(item["value"], str)

    def test_extract_metadata_does_not_raise_on_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        """A broken PDF yields an empty list rather than an exception."""
        broken_pdf = tmp_path / "bad.pdf"
        broken_pdf.write_bytes(b"not a pdf at all")

        metadata = remote_parser.extract_metadata(broken_pdf, "application/pdf")

        assert metadata == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserRegistry:
    """Checks how RemoteDocumentParser is wired into the parser registry."""

    def test_registered_in_defaults(self) -> None:
        """register_defaults() adds the remote parser to the built-in parsers."""
        from paperless.parsers.registry import ParserRegistry

        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()

        assert RemoteDocumentParser in fresh_registry._builtins

    @pytest.mark.usefixtures("azure_settings")
    def test_get_parser_returns_remote_when_configured(self) -> None:
        """With Azure settings active, the shared registry picks the remote parser for PDFs."""
        from paperless.parsers.registry import get_parser_registry

        chosen = get_parser_registry().get_parser_for_file(
            "application/pdf",
            "doc.pdf",
        )

        assert chosen is RemoteDocumentParser

    @pytest.mark.usefixtures("no_engine_settings")
    def test_get_parser_returns_none_for_pdf_when_not_configured(self) -> None:
        """With no tesseract parser registered yet, PDF has no handler if remote is off."""
        from paperless.parsers.registry import ParserRegistry

        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()

        chosen = fresh_registry.get_parser_for_file("application/pdf", "doc.pdf")

        assert chosen is None
|
||||
@@ -1,118 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class RemoteEngineConfig:
|
||||
def __init__(
|
||||
self,
|
||||
engine: str,
|
||||
api_key: str | None = None,
|
||||
endpoint: str | None = None,
|
||||
):
|
||||
self.engine = engine
|
||||
self.api_key = api_key
|
||||
self.endpoint = endpoint
|
||||
|
||||
def engine_is_valid(self):
|
||||
valid = self.engine in ["azureai"] and self.api_key is not None
|
||||
if self.engine == "azureai":
|
||||
valid = valid and self.endpoint is not None
|
||||
return valid
|
||||
|
||||
|
||||
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
    as this is the only service that provides a remote OCR API with text-embedded PDF output.

    NOTE(review): ``self.settings``, ``self.tempdir``, ``self.log``, ``self.text``
    and ``self.archive_path`` are not defined in this class; presumably they
    come from the base parser -- confirm there.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Returns the configuration for the remote OCR engine, loaded from Django settings.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

    def supported_mime_types(self):
        """
        Returns the MIME type -> file extension map handled by this parser,
        or an empty dict when the engine configuration is not valid.
        """
        if self.settings.engine_is_valid():
            return {
                "application/pdf": ".pdf",
                "image/png": ".png",
                "image/jpeg": ".jpg",
                "image/tiff": ".tiff",
                "image/bmp": ".bmp",
                "image/gif": ".gif",
                "image/webp": ".webp",
            }
        else:
            return {}

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Uses Azure AI Vision to parse the document and return the text content.
        It requests a searchable PDF output with embedded text.
        The PDF is saved to the archive_path attribute, but only once it has
        been downloaded completely.
        Returns the text content extracted from the document.
        If the parsing fails, the error is logged, archive_path is left
        untouched and None is returned.
        """
        # Imported lazily so the Azure SDK is only needed when this engine runs.
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )

        try:
            with file.open("rb") as f:
                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
                poller = client.begin_analyze_document(
                    model_id="prebuilt-read",
                    body=analyze_request,
                    output_content_format=DocumentContentFormat.TEXT,
                    output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                    content_type="application/json",
                )

            poller.wait()
            result_id = poller.details["operation_id"]
            result = poller.result()

            # Download the PDF with embedded text into a local path first.
            archive_path = self.tempdir / "archive.pdf"
            with archive_path.open("wb") as f:
                for chunk in client.get_analyze_result_pdf(
                    model_id="prebuilt-read",
                    result_id=result_id,
                ):
                    f.write(chunk)

            # Publish the archive only after the download finished, so a
            # failure part-way through never exposes a truncated file.
            self.archive_path = archive_path
            return result.content
        except Exception as e:
            # Deliberate best-effort: failures are logged and reported as None.
            self.log.error(f"Azure AI Vision parsing failed: {e}")
        finally:
            client.close()

        return None

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Populates ``self.text`` for the given document.

        With no valid engine configured, a warning is logged and the text is
        set to the empty string. With the "azureai" engine, the text is
        whatever azure_ai_vision_parse() returns (None on failure).
        ``mime_type`` and ``file_name`` are accepted but not used here.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
        elif self.settings.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)
|
||||
@@ -1,16 +1,36 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
|
||||
# The new RemoteDocumentParser does not accept the progress_callback
|
||||
# kwarg injected by the old signal-based consumer. logging_group is
|
||||
# forwarded as a positional arg.
|
||||
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||
kwargs.pop("progress_callback", None)
|
||||
return RemoteDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def get_supported_mime_types():
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
def get_supported_mime_types() -> dict[str, str]:
|
||||
from django.conf import settings
|
||||
|
||||
return RemoteDocumentParser(None).supported_mime_types()
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.remote import RemoteEngineConfig
|
||||
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
if not config.engine_is_valid():
|
||||
return {}
|
||||
return RemoteDocumentParser.supported_mime_types()
|
||||
|
||||
|
||||
def remote_consumer_declaration(sender, **kwargs):
|
||||
def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 5,
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
from paperless_remote.signals import get_parser
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for the legacy RemoteDocumentParser obtained through get_parser()."""

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content: str, strings: list[str]) -> None:
        # Asserts that all strings appear in content, in the given order.
        # Order is judged by the first occurrence of each string.
        positions = []
        for needle in strings:
            if needle not in content:
                self.fail(f"'{needle}' is not in '{content}'")
            positions.append(content.index(needle))
        self.assertListEqual(positions, sorted(positions))

    @mock.patch("paperless_tesseract.parsers.run_subprocess")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
        """A successful (mocked) Azure round-trip exposes the returned text."""
        # Poller stand-in: wait() is a no-op, details/result carry canned data.
        poller = mock.Mock()
        poller.wait.return_value = None
        poller.details = {"operation_id": "fake-op-id"}
        poller.result.return_value.content = "This is a test document."

        # Client stand-in: returns the poller and dummy PDF bytes in chunks.
        client = mock.Mock()
        client.begin_analyze_document.return_value = poller
        client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]
        mock_client_cls.return_value = client

        # Simulate pdftotext by writing dummy text to the sidecar file,
        # taken to be the last element of the command line.
        def fake_run(cmd, *args, **kwargs) -> None:
            Path(cmd[-1]).write_text("This is a test document.", encoding="utf-8")

        mock_subprocess.side_effect = fake_run

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            sample = self.SAMPLE_FILES / "simple-digital.pdf"
            parser.parse(sample, "application/pdf")

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure_error_logged_and_returns_none(
        self,
        mock_client_cls,
    ) -> None:
        """A failing Azure call is logged once, the client is closed, text is None."""
        client = mock.Mock()
        client.begin_analyze_document.side_effect = RuntimeError("fail")
        mock_client_cls.return_value = client

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            sample = self.SAMPLE_FILES / "simple-digital.pdf"
            with mock.patch.object(parser.log, "error") as error_spy:
                parser.parse(sample, "application/pdf")

            self.assertIsNone(parser.text)
            client.begin_analyze_document.assert_called_once()
            client.close.assert_called_once()
            error_spy.assert_called_once()
            # First positional argument of the error() call is the message.
            self.assertIn(
                "Azure AI Vision parsing failed",
                error_spy.call_args.args[0],
            )

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="key",
        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
    )
    def test_supported_mime_types_valid_config(self) -> None:
        """A complete Azure configuration exposes the full MIME type map."""
        parser = RemoteDocumentParser(uuid.uuid4())

        self.assertEqual(
            parser.supported_mime_types(),
            {
                "application/pdf": ".pdf",
                "image/png": ".png",
                "image/jpeg": ".jpg",
                "image/tiff": ".tiff",
                "image/bmp": ".bmp",
                "image/gif": ".gif",
                "image/webp": ".webp",
            },
        )

    def test_supported_mime_types_invalid_config(self) -> None:
        """Under the default test settings, no MIME types are offered."""
        parser = get_parser(uuid.uuid4())

        self.assertEqual(parser.supported_mime_types(), {})

    @override_settings(
        REMOTE_OCR_ENGINE=None,
        REMOTE_OCR_API_KEY=None,
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_parse_with_invalid_config(self) -> None:
        """With nothing configured, parse() leaves the text as an empty string."""
        parser = get_parser(uuid.uuid4())
        sample = self.SAMPLE_FILES / "simple-digital.pdf"

        parser.parse(sample, "application/pdf")

        self.assertEqual(parser.text, "")
|
||||
@@ -1,16 +1,20 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
# The new TextDocumentParser does not accept the legacy logging_group /
|
||||
# progress_callback kwargs injected by the old signal-based consumer.
|
||||
# These are dropped here; Phase 4 will replace this signal path with the
|
||||
# new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
# The new TextDocumentParser does not accept the progress_callback
|
||||
# kwarg injected by the old signal-based consumer. logging_group is
|
||||
# forwarded as a positional arg.
|
||||
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||
kwargs.pop("progress_callback", None)
|
||||
return TextDocumentParser()
|
||||
return TextDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def text_consumer_declaration(sender, **kwargs):
|
||||
def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 10,
|
||||
|
||||
Reference in New Issue
Block a user