Feature: migrate RemoteDocumentParser to ParserProtocol interface

Rewrites the remote OCR parser to the new plugin system contract:

- `supported_mime_types()` is now a classmethod that always returns the
  full set of 7 MIME types; the old instance-method hack (returning {}
  when unconfigured) is removed
- `score()` classmethod returns None when no remote engine is configured
  (making the parser invisible to the registry), and 20 when active —
  higher than the tesseract default of 10 so the remote engine takes
  priority when both are available
- No longer inherits from RasterisedDocumentParser; inherits no parser
  class at all — just implements the protocol directly
- `can_produce_archive = True`; `requires_pdf_rendition = False`
- `_azure_ai_vision_parse()` takes explicit config arg; API client
  created and closed within the method
- `get_page_count()` returns the PDF page count for application/pdf
  (and None for every other MIME type), delegating to the new
  `get_page_count_for_pdf()` utility
- `extract_metadata()` delegates to `extract_pdf_metadata()` for PDFs;
  returns [] for all other MIME types

New files:
- `src/paperless/parsers/utils.py` — shared `extract_pdf_metadata()` and
  `get_page_count_for_pdf()` utilities (pikepdf-based); both the remote
  and tesseract parsers will use these going forward
- `src/paperless/tests/parsers/test_remote_parser.py` — 42 pytest-style
  tests using pytest-django `settings` and pytest-mock `mocker` fixtures
- `src/paperless/tests/parsers/conftest.py` — remote parser instance,
  sample-file, and settings-helper fixtures

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-13 11:52:11 -07:00
parent 75dce7f19f
commit 5d4d87764c
5 changed files with 1058 additions and 164 deletions
+2
View File
@@ -193,9 +193,11 @@ class ParserRegistry:
that log output is predictable; scoring determines which parser wins
at runtime regardless of registration order.
"""
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.text import TextDocumentParser
self.register_builtin(TextDocumentParser)
self.register_builtin(RemoteDocumentParser)
# ------------------------------------------------------------------
# Discovery
+356 -52
View File
@@ -1,70 +1,381 @@
"""
Built-in remote-OCR document parser.
Handles documents by sending them to a configured remote OCR engine
(currently Azure AI Vision / Document Intelligence) and retrieving both
the extracted text and a searchable PDF with an embedded text layer.
When no engine is configured, ``score()`` returns ``None`` so the parser
is effectively invisible to the registry — the tesseract parser handles
these MIME types instead.
"""
from __future__ import annotations
import logging
import shutil
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless.version import __full_version_str__
if TYPE_CHECKING:
import datetime
from types import TracebackType
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsing.remote")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
class RemoteEngineConfig:
    """Holds and validates the remote OCR engine configuration.

    Parameters
    ----------
    engine:
        Identifier of the remote engine (only ``"azureai"`` is recognised),
        or ``None`` when no engine is configured.
    api_key:
        API key for the engine, or ``None`` when unset.
    endpoint:
        Service endpoint URL, or ``None`` when unset.
    """

    def __init__(
        self,
        engine: str | None,
        api_key: str | None = None,
        endpoint: str | None = None,
    ) -> None:
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

    def engine_is_valid(self) -> bool:
        """Return True when the engine is known and fully configured.

        An engine is valid when it is one of the recognised identifiers and
        an API key is present.  The ``azureai`` engine additionally requires
        an endpoint.
        """
        return (
            self.engine in ("azureai",)
            and self.api_key is not None
            and not (self.engine == "azureai" and self.endpoint is None)
        )
class RemoteDocumentParser(RasterisedDocumentParser):
"""
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
as this is the only service that provides a remote OCR API with text-embedded PDF output.
class RemoteDocumentParser:
"""Parse documents via a remote OCR API (currently Azure AI Vision).
This parser sends documents to a remote engine that returns both
extracted text and a searchable PDF with an embedded text layer.
It does not depend on Tesseract or ocrmypdf.
Class attributes
----------------
name : str
Human-readable parser name.
version : str
Semantic version string, kept in sync with Paperless-ngx releases.
author : str
Maintainer name.
url : str
Issue tracker / source URL.
"""
logging_name = "paperless.parsing.remote"
name: str = "Paperless-ngx Remote OCR Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
def get_settings(self) -> RemoteEngineConfig:
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
    """Return the MIME types this parser can handle.

    The full set is always returned regardless of whether a remote
    engine is configured.  The ``score()`` method handles the
    "am I active?" logic by returning ``None`` when not configured.

    Returns
    -------
    dict[str, str]
        Mapping of MIME type to preferred file extension.  A fresh copy
        is returned on every call.
    """
    # Hand out a copy so a caller mutating the result cannot corrupt the
    # module-level table that ``score()`` also consults.
    return dict(_SUPPORTED_MIME_TYPES)
@classmethod
def score(
    cls,
    mime_type: str,
    filename: str,
    path: Path | None = None,
) -> int | None:
    """Return the priority score for handling this file, or None.

    Returns ``None`` when no valid remote engine is configured,
    making the parser invisible to the registry for this file.
    When configured, returns 20 — higher than the Tesseract parser's
    default of 10 — so the remote engine takes priority.

    Parameters
    ----------
    mime_type:
        Detected MIME type of the file.
    filename:
        Original filename including extension.  Not inspected here.
    path:
        Optional filesystem path.  Not inspected by this parser.

    Returns
    -------
    int | None
        20 when the remote engine is configured and the MIME type is
        supported, otherwise None.
    """
    # Settings are read on every call rather than cached.
    config = RemoteEngineConfig(
        engine=settings.REMOTE_OCR_ENGINE,
        api_key=settings.REMOTE_OCR_API_KEY,
        endpoint=settings.REMOTE_OCR_ENDPOINT,
    )
    if not config.engine_is_valid():
        return None
    if mime_type not in _SUPPORTED_MIME_TYPES:
        return None
    return 20
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
@property
def can_produce_archive(self) -> bool:
    """Report that this parser can produce a searchable PDF archive copy.

    Returns
    -------
    bool
        Always True.
    """
    archive_supported = True
    return archive_supported
@property
def requires_pdf_rendition(self) -> bool:
    """Report whether a PDF rendition must be produced for display.

    Returns
    -------
    bool
        Always False.
    """
    rendition_needed = False
    return rendition_needed
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
    """Create a private scratch directory and reset the parse results.

    Parameters
    ----------
    logging_group:
        Opaque value stored on the instance and later passed to the
        thumbnail helper.
    """
    scratch_root = settings.SCRATCH_DIR
    scratch_root.mkdir(parents=True, exist_ok=True)
    workdir = tempfile.mkdtemp(prefix="paperless-", dir=scratch_root)

    self._tempdir = Path(workdir)
    self._logging_group = logging_group
    # Results are filled in by parse(); None means "not parsed yet".
    self._text: str | None = None
    self._archive_path: Path | None = None
def __enter__(self) -> Self:
    """Enter the context manager and hand back this parser instance."""
    parser = self
    return parser
def __exit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: TracebackType | None,
) -> None:
    """Remove the parser's temporary directory when leaving the context.

    Returns ``None``, so an exception raised inside the ``with`` block is
    not suppressed.  Errors while deleting the directory are ignored.
    """
    scratch = self._tempdir
    logger.debug("Cleaning up temporary directory %s", scratch)
    shutil.rmtree(scratch, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def parse(
    self,
    document_path: Path,
    mime_type: str,
    *,
    produce_archive: bool = True,
) -> None:
    """Send the document to the remote engine and store results.

    The extracted text is stored for ``get_text()`` and the searchable PDF
    (if any) for ``get_archive_path()``.  When no valid engine is
    configured the text is set to ``""`` and a warning is logged.

    Parameters
    ----------
    document_path:
        Absolute path to the document file to parse.
    mime_type:
        Detected MIME type of the document.
    produce_archive:
        Ignored — the remote engine always returns a searchable PDF,
        which is stored as the archive copy regardless of this flag.
    """
    config = RemoteEngineConfig(
        engine=settings.REMOTE_OCR_ENGINE,
        api_key=settings.REMOTE_OCR_API_KEY,
        endpoint=settings.REMOTE_OCR_ENDPOINT,
    )

    if not config.engine_is_valid():
        logger.warning(
            "No valid remote parser engine is configured, content will be empty.",
        )
        self._text = ""
        return

    if config.engine == "azureai":
        self._text = self._azure_ai_vision_parse(document_path, config)
        if self._text is None:
            # The helper reports failure by returning None, but it assigns
            # the archive path before the PDF download starts.  Clear it so
            # callers never receive a missing or partially written archive.
            self._archive_path = None
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
    """Return the text stored by ``parse()``, or None if nothing is stored."""
    extracted = self._text
    return extracted
def get_date(self) -> datetime.datetime | None:
    """Return the document date, which this parser never determines.

    Returns
    -------
    datetime.datetime | None
        Always None.
    """
    detected_date = None
    return detected_date
def get_archive_path(self) -> Path | None:
    """Return the stored archive PDF path, or None when there is none."""
    archive = self._archive_path
    return archive
# ------------------------------------------------------------------
# Thumbnail and metadata
# ------------------------------------------------------------------
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
    """Generate a thumbnail image for the document.

    The archive PDF is used as the thumbnail source when one has been
    stored; otherwise ``document_path`` is used.

    Parameters
    ----------
    document_path:
        Absolute path to the source document.
    mime_type:
        Detected MIME type of the document.  Not inspected here.

    Returns
    -------
    Path
        Whatever path ``make_thumbnail_from_pdf`` returns.
    """
    # make_thumbnail_from_pdf lives in documents.parsers for now;
    # it will move to paperless.parsers.utils when the tesseract
    # parser is migrated in a later phase.
    from documents.parsers import make_thumbnail_from_pdf

    thumbnail_source = self._archive_path or document_path
    workdir = self._tempdir
    group = self._logging_group
    return make_thumbnail_from_pdf(thumbnail_source, workdir, group)
def get_page_count(
    self,
    document_path: Path,
    mime_type: str,
) -> int | None:
    """Return the number of pages in a PDF document.

    Parameters
    ----------
    document_path:
        Absolute path to the source document.
    mime_type:
        Detected MIME type of the document.

    Returns
    -------
    int | None
        The result of ``get_page_count_for_pdf`` for PDF inputs, or
        ``None`` for any other MIME type.
    """
    if mime_type == "application/pdf":
        # Imported lazily, only on the PDF path.
        from paperless.parsers.utils import get_page_count_for_pdf

        return get_page_count_for_pdf(document_path, log=logger)

    return None
def extract_metadata(
    self,
    document_path: Path,
    mime_type: str,
) -> list[MetadataEntry]:
    """Extract format-specific metadata from the document.

    PDF files are handed to the shared ``extract_pdf_metadata`` helper;
    every other MIME type yields an empty list.

    Parameters
    ----------
    document_path:
        Absolute path to the file to extract metadata from.
    mime_type:
        MIME type of the file.  May be ``"application/pdf"`` when
        called for the archive version of an image original.

    Returns
    -------
    list[MetadataEntry]
        Zero or more metadata entries.
    """
    if mime_type == "application/pdf":
        # Imported lazily, only on the PDF path.
        from paperless.parsers.utils import extract_pdf_metadata

        return extract_pdf_metadata(document_path, log=logger)

    return []
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _azure_ai_vision_parse(
self,
file: Path,
config: RemoteEngineConfig,
) -> str | None:
"""
Uses Azure AI Vision to parse the document and return the text content.
It requests a searchable PDF output with embedded text.
The PDF is saved to the archive_path attribute.
Returns the text content extracted from the document.
If the parsing fails, it returns None.
"""Send ``file`` to Azure AI Document Intelligence and return text.
Downloads the searchable PDF output from Azure and stores it at
``self._archive_path``. Returns the extracted text content, or
``None`` on failure (the error is logged).
Parameters
----------
file:
Absolute path to the document to analyse.
config:
Validated remote engine configuration.
Returns
-------
str | None
Extracted text, or None if the Azure call failed.
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
@@ -73,8 +384,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
endpoint=self.settings.endpoint,
credential=AzureKeyCredential(self.settings.api_key),
endpoint=config.endpoint,
credential=AzureKeyCredential(config.api_key),
)
try:
@@ -84,7 +395,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
model_id="prebuilt-read",
body=analyze_request,
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
output=[AnalyzeOutputOption.PDF],
content_type="application/json",
)
@@ -92,27 +403,20 @@ class RemoteDocumentParser(RasterisedDocumentParser):
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = self.tempdir / "archive.pdf"
with self.archive_path.open("wb") as f:
self._archive_path = self._tempdir / "archive.pdf"
with self._archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
return result.content
except Exception as e:
self.log.error(f"Azure AI Vision parsing failed: {e}")
logger.error("Azure AI Vision parsing failed: %s", e)
finally:
client.close()
return None
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():
self.log.warning(
"No valid remote parser engine is configured, content will be empty.",
)
self.text = ""
elif self.settings.engine == "azureai":
self.text = self.azure_ai_vision_parse(document_path)
+130
View File
@@ -0,0 +1,130 @@
"""
Shared utilities for Paperless-ngx document parsers.
Functions here are format-neutral helpers that multiple parsers need.
Keeping them here avoids parsers inheriting from each other just to
share implementation.
"""
from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from pathlib import Path
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsers.utils")
def get_page_count_for_pdf(
    document_path: Path,
    log: logging.Logger | None = None,
) -> int | None:
    """Count the pages of a PDF file with pikepdf.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger that receives the warning on failure.  The module-level
        logger is used when omitted.

    Returns
    -------
    int | None
        Number of pages, or ``None`` when opening or reading the file
        raised an exception (a warning is logged in that case).
    """
    import pikepdf

    active_log = log if log else logger

    try:
        with pikepdf.Pdf.open(document_path) as document:
            page_total = len(document.pages)
    except Exception as exc:
        active_log.warning(
            "Unable to determine PDF page count for %s: %s",
            document_path,
            exc,
        )
        return None

    return page_total
def extract_pdf_metadata(
    document_path: Path,
    log: logging.Logger | None = None,
) -> list[MetadataEntry]:
    """Extract XMP/PDF metadata from a PDF file using pikepdf.

    Reads the XMP metadata entries from the document and returns them as a
    list of ``MetadataEntry`` items.  A failure to open the file or its
    metadata is logged and yields ``[]``; a failure while processing an
    individual key is logged and that key is skipped.  The PDF handle is
    always closed before returning.

    Parameters
    ----------
    document_path:
        Absolute path to the PDF file.
    log:
        Logger to use for warnings and debug messages.  Falls back to the
        module-level logger when omitted.

    Returns
    -------
    list[MetadataEntry]
        Zero or more metadata entries.  Returns ``[]`` if the file cannot
        be opened or contains no readable XMP metadata.
    """
    import pikepdf

    from paperless.parsers import MetadataEntry

    _log = log or logger
    result: list[MetadataEntry] = []
    # Keys look like "{namespace-uri}local-name"; split them into both parts.
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    try:
        pdf = pikepdf.open(document_path)
    except Exception as e:
        _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
        return []

    # Previously the handle was never closed and stayed open until garbage
    # collection; the context manager releases it on every exit path.
    with pdf:
        try:
            meta = pdf.open_metadata()
        except Exception as e:
            _log.warning("Could not open PDF metadata for %s: %s", document_path, e)
            return []

        for key, value in meta.items():
            # Multi-valued entries are flattened to one space-separated string.
            if isinstance(value, list):
                value = " ".join(str(e) for e in value)
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                if m is None:
                    # Key is not in "{namespace}name" form; nothing to record.
                    continue
                namespace = m.group(1)
                key_value = m.group(2)
                try:
                    namespace.encode("utf-8")
                    key_value.encode("utf-8")
                except UnicodeEncodeError as enc_err:
                    _log.debug("Skipping metadata key %s: %s", key, enc_err)
                    continue
                result.append(
                    MetadataEntry(
                        namespace=namespace,
                        prefix=meta.REVERSE_NS[namespace],
                        key=key_value,
                        value=value,
                    ),
                )
            except Exception as e:
                # E.g. a namespace missing from REVERSE_NS; skip just this key.
                _log.warning(
                    "Error reading metadata key %s value %s: %s",
                    key,
                    value,
                    e,
                )

    return result
+89
View File
@@ -10,12 +10,15 @@ from typing import TYPE_CHECKING
import pytest
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.text import TextDocumentParser
if TYPE_CHECKING:
from collections.abc import Generator
from pathlib import Path
from pytest_django.fixtures import SettingsWrapper
# ------------------------------------------------------------------
# Text parser sample files
@@ -74,3 +77,89 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
"""
with TextDocumentParser() as parser:
yield parser
# ------------------------------------------------------------------
# Remote parser sample files
# ------------------------------------------------------------------
@pytest.fixture(scope="session")
def remote_samples_dir(samples_dir: Path) -> Path:
    """Locate the directory holding the remote parser sample files.

    Returns
    -------
    Path
        ``<samples_dir>/remote/``
    """
    return samples_dir.joinpath("remote")
@pytest.fixture(scope="session")
def sample_pdf_file(remote_samples_dir: Path) -> Path:
    """Locate the simple digital PDF sample used by the remote parser tests.

    Returns
    -------
    Path
        Path to ``remote/simple-digital.pdf``.
    """
    return remote_samples_dir.joinpath("simple-digital.pdf")
# ------------------------------------------------------------------
# Remote parser instance
# ------------------------------------------------------------------
@pytest.fixture()
def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
    """Provide a RemoteDocumentParser for the duration of one test.

    The parser is used as a context manager, so its exit handler runs
    once the test has finished.

    Yields
    ------
    RemoteDocumentParser
        A ready-to-use parser instance.
    """
    parser_cm = RemoteDocumentParser()
    with parser_cm as active_parser:
        yield active_parser
# ------------------------------------------------------------------
# Remote parser settings helpers
# ------------------------------------------------------------------
@pytest.fixture()
def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Point the Django settings at a fully configured Azure AI OCR engine.

    Assigns test values to ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY``
    and ``REMOTE_OCR_ENDPOINT`` on the pytest-django ``settings`` fixture.

    Returns
    -------
    SettingsWrapper
        The modified settings object (for chaining further overrides).
    """
    overrides = {
        "REMOTE_OCR_ENGINE": "azureai",
        "REMOTE_OCR_API_KEY": "test-api-key",
        "REMOTE_OCR_ENDPOINT": "https://test.cognitiveservices.azure.com",
    }
    for option, value in overrides.items():
        setattr(settings, option, value)
    return settings
@pytest.fixture()
def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
    """Clear every remote OCR option so that no engine is configured.

    Returns
    -------
    SettingsWrapper
        The modified settings object.
    """
    for option in (
        "REMOTE_OCR_ENGINE",
        "REMOTE_OCR_API_KEY",
        "REMOTE_OCR_ENDPOINT",
    ):
        setattr(settings, option, None)
    return settings
+481 -112
View File
@@ -1,131 +1,500 @@
import uuid
from pathlib import Path
from unittest import mock
"""
Tests for paperless.parsers.remote.RemoteDocumentParser.
from django.test import TestCase
from django.test import override_settings
from paperless_remote.parsers import RemoteDocumentParser
All tests use the context-manager protocol for parser lifecycle. The Azure
AI client is always mocked via the ``mocker`` fixture so no real network
calls are made. Django settings are overridden via the pytest-django
``settings`` fixture (or the ``azure_settings`` / ``no_engine_settings``
helpers defined in conftest.py).
"""
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.signals import get_parser
from __future__ import annotations
from typing import TYPE_CHECKING
from unittest.mock import Mock
import pytest
from paperless.parsers import ParserProtocol
from paperless.parsers.remote import RemoteDocumentParser
if TYPE_CHECKING:
from pathlib import Path
from pytest_django.fixtures import SettingsWrapper
from pytest_mock import MockerFixture
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def assertContainsStrings(self, content: str, strings: list[str]) -> None:
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
@mock.patch("paperless_tesseract.parsers.run_subprocess")
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
# Arrange mock Azure client
mock_client = mock.Mock()
mock_client_cls.return_value = mock_client
def _make_azure_mock(text: str = "Extracted text.") -> Mock:
    """Build a stand-in for the Azure DocumentIntelligenceClient.

    The returned mock's ``begin_analyze_document()`` yields a poller whose
    ``result().content`` is ``text``, and ``get_analyze_result_pdf()``
    yields a single chunk of fake PDF bytes.
    """
    poller = Mock()
    poller.configure_mock(
        **{
            "wait.return_value": None,
            "result.return_value.content": text,
        },
    )
    poller.details = {"operation_id": "fake-op-id"}

    client = Mock()
    client.begin_analyze_document.return_value = poller
    client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
    return client
# Simulate poller result and its `.details`
mock_poller = mock.Mock()
mock_poller.wait.return_value = None
mock_poller.details = {"operation_id": "fake-op-id"}
mock_client.begin_analyze_document.return_value = mock_poller
mock_poller.result.return_value.content = "This is a test document."
# Return dummy PDF bytes
mock_client.get_analyze_result_pdf.return_value = [
b"%PDF-",
b"1.7 ",
b"FAKEPDF",
]
# ---------------------------------------------------------------------------
# Protocol contract
# ---------------------------------------------------------------------------
# Simulate pdftotext by writing dummy text to sidecar file
def fake_run(cmd, *args, **kwargs) -> None:
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
f.write("This is a test document.")
mock_subprocess.side_effect = fake_run
class TestRemoteParserProtocol:
"""Verify that RemoteDocumentParser satisfies the ParserProtocol contract."""
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertContainsStrings(
parser.text.strip(),
["This is a test document."],
)
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure_error_logged_and_returns_none(
def test_isinstance_satisfies_protocol(
self,
mock_client_cls,
remote_parser: RemoteDocumentParser,
) -> None:
mock_client = mock.Mock()
mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
mock_client_cls.return_value = mock_client
assert isinstance(remote_parser, ParserProtocol)
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
with mock.patch.object(parser.log, "error") as mock_log_error:
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
def test_class_attributes_present(self) -> None:
assert isinstance(RemoteDocumentParser.name, str) and RemoteDocumentParser.name
assert (
isinstance(RemoteDocumentParser.version, str)
and RemoteDocumentParser.version
)
assert (
isinstance(RemoteDocumentParser.author, str) and RemoteDocumentParser.author
)
assert isinstance(RemoteDocumentParser.url, str) and RemoteDocumentParser.url
self.assertIsNone(parser.text)
mock_client.begin_analyze_document.assert_called_once()
mock_client.close.assert_called_once()
mock_log_error.assert_called_once()
self.assertIn(
"Azure AI Vision parsing failed",
mock_log_error.call_args[0][0],
# ---------------------------------------------------------------------------
# supported_mime_types
# ---------------------------------------------------------------------------
class TestRemoteParserSupportedMimeTypes:
    """supported_mime_types() always returns the full set regardless of config."""

    def test_returns_dict(self) -> None:
        assert isinstance(RemoteDocumentParser.supported_mime_types(), dict)

    def test_includes_all_expected_types(self) -> None:
        advertised = set(RemoteDocumentParser.supported_mime_types().keys())
        assert advertised == {
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        }

    def test_returns_full_set_when_not_configured(
        self,
        no_engine_settings: SettingsWrapper,
    ) -> None:
        """
        GIVEN: No remote engine is configured
        WHEN: supported_mime_types() is called
        THEN: The full MIME type dict is still returned (score() handles activation)
        """
        assert len(RemoteDocumentParser.supported_mime_types()) == 7
# ---------------------------------------------------------------------------
# score()
# ---------------------------------------------------------------------------
class TestRemoteParserScore:
    """score() encodes the activation logic: None when unconfigured, 20 when active."""

    @pytest.mark.parametrize(
        "mime_type",
        [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ],
    )
    def test_score_returns_20_when_configured(
        self,
        azure_settings: SettingsWrapper,
        mime_type: str,
    ) -> None:
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") == 20

    @pytest.mark.parametrize(
        "mime_type",
        ["application/pdf", "image/png", "image/jpeg"],
    )
    def test_score_returns_none_when_no_engine(
        self,
        no_engine_settings: SettingsWrapper,
        mime_type: str,
    ) -> None:
        assert RemoteDocumentParser.score(mime_type, "doc.pdf") is None

    def test_score_returns_none_when_api_key_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        # Engine and endpoint are present; only the key is missing.
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = None
        settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"

        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None

    def test_score_returns_none_when_endpoint_missing(
        self,
        settings: SettingsWrapper,
    ) -> None:
        # Engine and key are present; only the endpoint is missing.
        settings.REMOTE_OCR_ENGINE = "azureai"
        settings.REMOTE_OCR_API_KEY = "key"
        settings.REMOTE_OCR_ENDPOINT = None

        assert RemoteDocumentParser.score("application/pdf", "doc.pdf") is None

    def test_score_returns_none_for_unsupported_mime_type(
        self,
        azure_settings: SettingsWrapper,
    ) -> None:
        assert RemoteDocumentParser.score("text/plain", "doc.txt") is None

    def test_score_higher_than_tesseract_default(
        self,
        azure_settings: SettingsWrapper,
    ) -> None:
        """Remote parser (20) outranks the tesseract default (10) when configured."""
        remote_score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
        assert remote_score is not None
        assert remote_score > 10
# ---------------------------------------------------------------------------
# Properties
# ---------------------------------------------------------------------------
class TestRemoteParserProperties:
    """The capability properties report fixed values."""

    def test_can_produce_archive_is_true(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        produces_archive = remote_parser.can_produce_archive
        assert produces_archive is True

    def test_requires_pdf_rendition_is_false(
        self,
        remote_parser: RemoteDocumentParser,
    ) -> None:
        needs_rendition = remote_parser.requires_pdf_rendition
        assert needs_rendition is False
# ---------------------------------------------------------------------------
# Lifecycle
# ---------------------------------------------------------------------------
class TestRemoteParserLifecycle:
    """The context manager removes the parser's temporary directory on exit."""

    def test_context_manager_cleans_up_tempdir(self) -> None:
        with RemoteDocumentParser() as parser:
            scratch = parser._tempdir
            assert scratch.exists()

        # Leaving the block must have removed the directory.
        assert not scratch.exists()

    def test_context_manager_cleans_up_after_exception(self) -> None:
        scratch: Path | None = None

        with pytest.raises(RuntimeError):
            with RemoteDocumentParser() as parser:
                scratch = parser._tempdir
                raise RuntimeError("boom")

        # Cleanup must also happen when the body raised.
        assert scratch is not None
        assert not scratch.exists()
# ---------------------------------------------------------------------------
# parse() — happy path with Azure mock
# ---------------------------------------------------------------------------
class TestRemoteParserParse:
def test_parse_returns_text_from_azure(
self,
remote_parser: RemoteDocumentParser,
sample_pdf_file: Path,
azure_settings: SettingsWrapper,
mocker: MockerFixture,
) -> None:
mock_client = _make_azure_mock("Hello from Azure.")
mocker.patch(
"azure.ai.documentintelligence.DocumentIntelligenceClient",
return_value=mock_client,
)
@override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="key",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
)
def test_supported_mime_types_valid_config(self) -> None:
parser = RemoteDocumentParser(uuid.uuid4())
expected_types = {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
self.assertEqual(parser.supported_mime_types(), expected_types)
remote_parser.parse(sample_pdf_file, "application/pdf")
def test_supported_mime_types_invalid_config(self) -> None:
parser = get_parser(uuid.uuid4())
self.assertEqual(parser.supported_mime_types(), {})
assert remote_parser.get_text() == "Hello from Azure."
@override_settings(
REMOTE_OCR_ENGINE=None,
REMOTE_OCR_API_KEY=None,
REMOTE_OCR_ENDPOINT=None,
)
def test_parse_with_invalid_config(self) -> None:
parser = get_parser(uuid.uuid4())
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
self.assertEqual(parser.text, "")
def test_parse_sets_archive_path(
self,
remote_parser: RemoteDocumentParser,
sample_pdf_file: Path,
azure_settings: SettingsWrapper,
mocker: MockerFixture,
) -> None:
mock_client = _make_azure_mock()
mocker.patch(
"azure.ai.documentintelligence.DocumentIntelligenceClient",
return_value=mock_client,
)
remote_parser.parse(sample_pdf_file, "application/pdf")
archive = remote_parser.get_archive_path()
assert archive is not None
assert archive.exists()
assert archive.suffix == ".pdf"
def test_parse_closes_client_on_success(
self,
remote_parser: RemoteDocumentParser,
sample_pdf_file: Path,
azure_settings: SettingsWrapper,
mocker: MockerFixture,
) -> None:
mock_client = _make_azure_mock()
mocker.patch(
"azure.ai.documentintelligence.DocumentIntelligenceClient",
return_value=mock_client,
)
remote_parser.parse(sample_pdf_file, "application/pdf")
mock_client.close.assert_called_once()
def test_parse_sets_empty_text_when_not_configured(
    self,
    remote_parser: RemoteDocumentParser,
    sample_pdf_file: Path,
    no_engine_settings: SettingsWrapper,
) -> None:
    """Under the no-engine settings fixture, parse() gives "" text and no archive."""
    remote_parser.parse(sample_pdf_file, "application/pdf")

    extracted_text = remote_parser.get_text()
    assert extracted_text == ""

    archive_path = remote_parser.get_archive_path()
    assert archive_path is None
def test_get_text_none_before_parse(
    self,
    remote_parser: RemoteDocumentParser,
) -> None:
    """``get_text()`` is None on a parser that has not had parse() called here."""
    text_before_parse = remote_parser.get_text()
    assert text_before_parse is None
def test_get_date_always_none(
    self,
    remote_parser: RemoteDocumentParser,
    sample_pdf_file: Path,
    azure_settings: SettingsWrapper,
    mocker: MockerFixture,
) -> None:
    """``get_date()`` is None even after a parse with the patched Azure client."""
    mocker.patch(
        "azure.ai.documentintelligence.DocumentIntelligenceClient",
        return_value=_make_azure_mock(),
    )

    remote_parser.parse(sample_pdf_file, "application/pdf")

    detected_date = remote_parser.get_date()
    assert detected_date is None
# ---------------------------------------------------------------------------
# parse() — Azure failure path
# ---------------------------------------------------------------------------
class TestRemoteParserParseError:
    """parse() behaviour when the patched Azure client raises during analysis."""

    @staticmethod
    def _install_failing_client(mocker: MockerFixture) -> Mock:
        """Patch the Azure client class with a mock whose analyze call raises.

        Returns the mock so the caller can inspect the calls made on it.
        """
        failing_client = Mock()
        failing_client.begin_analyze_document.side_effect = RuntimeError(
            "network failure",
        )
        mocker.patch(
            "azure.ai.documentintelligence.DocumentIntelligenceClient",
            return_value=failing_client,
        )
        return failing_client

    def test_parse_returns_none_on_azure_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_settings: SettingsWrapper,
        mocker: MockerFixture,
    ) -> None:
        """When the analyze call raises, ``get_text()`` is None after parse()."""
        self._install_failing_client(mocker)

        remote_parser.parse(sample_pdf_file, "application/pdf")

        text_after_failure = remote_parser.get_text()
        assert text_after_failure is None

    def test_parse_closes_client_on_error(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_settings: SettingsWrapper,
        mocker: MockerFixture,
    ) -> None:
        """The client's ``close()`` is called exactly once even when analysis raises."""
        failing_client = self._install_failing_client(mocker)

        remote_parser.parse(sample_pdf_file, "application/pdf")

        failing_client.close.assert_called_once()

    def test_parse_logs_error_on_azure_failure(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
        azure_settings: SettingsWrapper,
        mocker: MockerFixture,
    ) -> None:
        """A failed analysis logs one error whose message names the Azure failure."""
        self._install_failing_client(mocker)
        # Patch the module-level logger so the error call can be inspected.
        patched_logger = mocker.patch("paperless.parsers.remote.logger")

        remote_parser.parse(sample_pdf_file, "application/pdf")

        patched_logger.error.assert_called_once()
        # call_args[0] holds the positional arguments; the first is the message.
        logged_message = patched_logger.error.call_args[0][0]
        assert "Azure AI Vision parsing failed" in logged_message
# ---------------------------------------------------------------------------
# get_page_count()
# ---------------------------------------------------------------------------
class TestRemoteParserPageCount:
    """get_page_count() for a real PDF, an image MIME type, and a corrupt PDF."""

    def test_page_count_for_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """The sample PDF yields an integer page count of at least one."""
        page_count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")

        assert isinstance(page_count, int)
        assert page_count >= 1

    def test_page_count_returns_none_for_image_mime(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """Passing "image/png" as the MIME type yields None, even for a PDF file."""
        page_count = remote_parser.get_page_count(sample_pdf_file, "image/png")

        assert page_count is None

    def test_page_count_returns_none_for_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        """A file with non-PDF bytes yields None instead of raising."""
        corrupt_file = tmp_path / "bad.pdf"
        corrupt_file.write_bytes(b"not a pdf at all")

        page_count = remote_parser.get_page_count(corrupt_file, "application/pdf")

        assert page_count is None
# ---------------------------------------------------------------------------
# extract_metadata()
# ---------------------------------------------------------------------------
class TestRemoteParserMetadata:
    """extract_metadata() for non-PDF MIME types, real PDFs, and corrupt PDFs."""

    def test_extract_metadata_non_pdf_returns_empty(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """Passing "image/png" as the MIME type yields an empty list."""
        entries = remote_parser.extract_metadata(sample_pdf_file, "image/png")

        assert entries == []

    def test_extract_metadata_pdf_returns_list(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """The sample PDF yields a list."""
        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")

        assert isinstance(entries, list)

    def test_extract_metadata_pdf_entries_have_required_keys(
        self,
        remote_parser: RemoteDocumentParser,
        sample_pdf_file: Path,
    ) -> None:
        """Each entry has namespace, prefix, key, and a string value."""
        entries = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")

        # NOTE(review): this loop passes vacuously if no entries are returned.
        for entry in entries:
            for required_key in ("namespace", "prefix", "key", "value"):
                assert required_key in entry
            assert isinstance(entry["value"], str)

    def test_extract_metadata_does_not_raise_on_invalid_pdf(
        self,
        remote_parser: RemoteDocumentParser,
        tmp_path: Path,
    ) -> None:
        """A file with non-PDF bytes yields an empty list instead of raising."""
        corrupt_file = tmp_path / "bad.pdf"
        corrupt_file.write_bytes(b"not a pdf at all")

        entries = remote_parser.extract_metadata(corrupt_file, "application/pdf")

        assert entries == []
# ---------------------------------------------------------------------------
# Registry integration
# ---------------------------------------------------------------------------
class TestRemoteParserRegistry:
    """How the parser registry exposes and selects ``RemoteDocumentParser``."""

    def test_registered_in_defaults(self) -> None:
        """``register_defaults()`` places the remote parser among the builtins."""
        from paperless.parsers.registry import ParserRegistry

        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()

        assert RemoteDocumentParser in fresh_registry._builtins

    def test_get_parser_returns_remote_when_configured(
        self,
        azure_settings: SettingsWrapper,
    ) -> None:
        """Under the Azure settings fixture, the shared registry picks the remote parser for a PDF."""
        from paperless.parsers.registry import get_parser_registry

        shared_registry = get_parser_registry()
        chosen_cls = shared_registry.get_parser_for_file("application/pdf", "doc.pdf")

        assert chosen_cls is RemoteDocumentParser

    def test_get_parser_returns_none_for_pdf_when_not_configured(
        self,
        no_engine_settings: SettingsWrapper,
    ) -> None:
        """With remote off and no tesseract parser registered yet, a PDF has no handler."""
        from paperless.parsers.registry import ParserRegistry

        fresh_registry = ParserRegistry()
        fresh_registry.register_defaults()
        chosen_cls = fresh_registry.get_parser_for_file("application/pdf", "doc.pdf")

        assert chosen_cls is None