mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-14 05:01:24 +00:00
Compare commits
9 Commits
feature-ve
...
feature-re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2098a11eb1 | ||
|
|
af8a8e791b | ||
|
|
8d4163bef3 | ||
|
|
e9e1d4ccca | ||
|
|
c955ba7d07 | ||
|
|
7028bb2163 | ||
|
|
5d4d87764c | ||
|
|
75dce7f19f | ||
|
|
365ff99934 |
@@ -60,7 +60,7 @@ dependencies = [
|
|||||||
"llama-index-llms-openai>=0.6.13",
|
"llama-index-llms-openai>=0.6.13",
|
||||||
"llama-index-vector-stores-faiss>=0.5.2",
|
"llama-index-vector-stores-faiss>=0.5.2",
|
||||||
"nltk~=3.9.1",
|
"nltk~=3.9.1",
|
||||||
"ocrmypdf~=16.13.0",
|
"ocrmypdf~=17.3.0",
|
||||||
"openai>=1.76",
|
"openai>=1.76",
|
||||||
"pathvalidate~=3.3.1",
|
"pathvalidate~=3.3.1",
|
||||||
"pdf2image~=1.17.0",
|
"pdf2image~=1.17.0",
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ from documents.templating.workflows import parse_w_workflow_placeholders
|
|||||||
from documents.utils import copy_basic_file_stats
|
from documents.utils import copy_basic_file_stats
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless_mail.parsers import MailDocumentParser
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
@@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
|
|||||||
|
|
||||||
TODO(stumpylog): Remove me in the future
|
TODO(stumpylog): Remove me in the future
|
||||||
"""
|
"""
|
||||||
if isinstance(parser, TextDocumentParser):
|
if isinstance(parser, (TextDocumentParser, RemoteDocumentParser)):
|
||||||
parser.__exit__(None, None, None)
|
parser.__exit__(None, None, None)
|
||||||
else:
|
else:
|
||||||
parser.cleanup()
|
parser.cleanup()
|
||||||
@@ -476,7 +477,10 @@ class ConsumerPlugin(
|
|||||||
self.filename,
|
self.filename,
|
||||||
self.input_doc.mailrule_id,
|
self.input_doc.mailrule_id,
|
||||||
)
|
)
|
||||||
elif isinstance(document_parser, TextDocumentParser):
|
elif isinstance(
|
||||||
|
document_parser,
|
||||||
|
(TextDocumentParser, RemoteDocumentParser),
|
||||||
|
):
|
||||||
# TODO(stumpylog): Remove me in the future
|
# TODO(stumpylog): Remove me in the future
|
||||||
document_parser.parse(self.working_copy, mime_type)
|
document_parser.parse(self.working_copy, mime_type)
|
||||||
else:
|
else:
|
||||||
@@ -489,7 +493,7 @@ class ConsumerPlugin(
|
|||||||
ProgressStatusOptions.WORKING,
|
ProgressStatusOptions.WORKING,
|
||||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||||
)
|
)
|
||||||
if isinstance(document_parser, TextDocumentParser):
|
if isinstance(document_parser, (TextDocumentParser, RemoteDocumentParser)):
|
||||||
# TODO(stumpylog): Remove me in the future
|
# TODO(stumpylog): Remove me in the future
|
||||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -193,9 +193,11 @@ class ParserRegistry:
|
|||||||
that log output is predictable; scoring determines which parser wins
|
that log output is predictable; scoring determines which parser wins
|
||||||
at runtime regardless of registration order.
|
at runtime regardless of registration order.
|
||||||
"""
|
"""
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
self.register_builtin(TextDocumentParser)
|
self.register_builtin(TextDocumentParser)
|
||||||
|
self.register_builtin(RemoteDocumentParser)
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Discovery
|
# Discovery
|
||||||
|
|||||||
429
src/paperless/parsers/remote.py
Normal file
429
src/paperless/parsers/remote.py
Normal file
@@ -0,0 +1,429 @@
|
|||||||
|
"""
|
||||||
|
Built-in remote-OCR document parser.
|
||||||
|
|
||||||
|
Handles documents by sending them to a configured remote OCR engine
|
||||||
|
(currently Azure AI Vision / Document Intelligence) and retrieving both
|
||||||
|
the extracted text and a searchable PDF with an embedded text layer.
|
||||||
|
|
||||||
|
When no engine is configured, ``score()`` returns ``None`` so the parser
|
||||||
|
is effectively invisible to the registry — the tesseract parser handles
|
||||||
|
these MIME types instead.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsing.remote")
|
||||||
|
|
||||||
|
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||||
|
"application/pdf": ".pdf",
|
||||||
|
"image/png": ".png",
|
||||||
|
"image/jpeg": ".jpg",
|
||||||
|
"image/tiff": ".tiff",
|
||||||
|
"image/bmp": ".bmp",
|
||||||
|
"image/gif": ".gif",
|
||||||
|
"image/webp": ".webp",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteEngineConfig:
|
||||||
|
"""Holds and validates the remote OCR engine configuration."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
engine: str | None,
|
||||||
|
api_key: str | None = None,
|
||||||
|
endpoint: str | None = None,
|
||||||
|
) -> None:
|
||||||
|
self.engine = engine
|
||||||
|
self.api_key = api_key
|
||||||
|
self.endpoint = endpoint
|
||||||
|
|
||||||
|
def engine_is_valid(self) -> bool:
|
||||||
|
"""Return True when the engine is known and fully configured."""
|
||||||
|
return (
|
||||||
|
self.engine in ("azureai",)
|
||||||
|
and self.api_key is not None
|
||||||
|
and not (self.engine == "azureai" and self.endpoint is None)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteDocumentParser:
|
||||||
|
"""Parse documents via a remote OCR API (currently Azure AI Vision).
|
||||||
|
|
||||||
|
This parser sends documents to a remote engine that returns both
|
||||||
|
extracted text and a searchable PDF with an embedded text layer.
|
||||||
|
It does not depend on Tesseract or ocrmypdf.
|
||||||
|
|
||||||
|
Class attributes
|
||||||
|
----------------
|
||||||
|
name : str
|
||||||
|
Human-readable parser name.
|
||||||
|
version : str
|
||||||
|
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||||
|
author : str
|
||||||
|
Maintainer name.
|
||||||
|
url : str
|
||||||
|
Issue tracker / source URL.
|
||||||
|
"""
|
||||||
|
|
||||||
|
name: str = "Paperless-ngx Remote OCR Parser"
|
||||||
|
version: str = __full_version_str__
|
||||||
|
author: str = "Paperless-ngx Contributors"
|
||||||
|
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Class methods
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
|
"""Return the MIME types this parser can handle.
|
||||||
|
|
||||||
|
The full set is always returned regardless of whether a remote
|
||||||
|
engine is configured. The ``score()`` method handles the
|
||||||
|
"am I active?" logic by returning ``None`` when not configured.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
dict[str, str]
|
||||||
|
Mapping of MIME type to preferred file extension.
|
||||||
|
"""
|
||||||
|
return _SUPPORTED_MIME_TYPES
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(
|
||||||
|
cls,
|
||||||
|
mime_type: str,
|
||||||
|
filename: str,
|
||||||
|
path: Path | None = None,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return the priority score for handling this file, or None.
|
||||||
|
|
||||||
|
Returns ``None`` when no valid remote engine is configured,
|
||||||
|
making the parser invisible to the registry for this file.
|
||||||
|
When configured, returns 20 — higher than the Tesseract parser's
|
||||||
|
default of 10 — so the remote engine takes priority.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the file.
|
||||||
|
filename:
|
||||||
|
Original filename including extension.
|
||||||
|
path:
|
||||||
|
Optional filesystem path. Not inspected by this parser.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
20 when the remote engine is configured and the MIME type is
|
||||||
|
supported, otherwise None.
|
||||||
|
"""
|
||||||
|
config = RemoteEngineConfig(
|
||||||
|
engine=settings.REMOTE_OCR_ENGINE,
|
||||||
|
api_key=settings.REMOTE_OCR_API_KEY,
|
||||||
|
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||||
|
)
|
||||||
|
if not config.engine_is_valid():
|
||||||
|
return None
|
||||||
|
if mime_type not in _SUPPORTED_MIME_TYPES:
|
||||||
|
return None
|
||||||
|
return 20
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
"""Whether this parser can produce a searchable PDF archive copy.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
Always True — the remote engine always returns a PDF with an
|
||||||
|
embedded text layer that serves as the archive copy.
|
||||||
|
"""
|
||||||
|
return True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
"""Whether the parser must produce a PDF for the frontend to display.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
Always False — all supported originals are displayable by
|
||||||
|
the browser (PDF) or handled via the archive copy (images).
|
||||||
|
"""
|
||||||
|
return False
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._tempdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||||
|
)
|
||||||
|
self._logging_group = logging_group
|
||||||
|
self._text: str | None = None
|
||||||
|
self._archive_path: Path | None = None
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: type[BaseException] | None,
|
||||||
|
exc_val: BaseException | None,
|
||||||
|
exc_tb: TracebackType | None,
|
||||||
|
) -> None:
|
||||||
|
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||||
|
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Core parsing interface
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def parse(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
*,
|
||||||
|
produce_archive: bool = True,
|
||||||
|
) -> None:
|
||||||
|
"""Send the document to the remote engine and store results.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the document file to parse.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
produce_archive:
|
||||||
|
Ignored — the remote engine always returns a searchable PDF,
|
||||||
|
which is stored as the archive copy regardless of this flag.
|
||||||
|
"""
|
||||||
|
config = RemoteEngineConfig(
|
||||||
|
engine=settings.REMOTE_OCR_ENGINE,
|
||||||
|
api_key=settings.REMOTE_OCR_API_KEY,
|
||||||
|
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not config.engine_is_valid():
|
||||||
|
logger.warning(
|
||||||
|
"No valid remote parser engine is configured, content will be empty.",
|
||||||
|
)
|
||||||
|
self._text = ""
|
||||||
|
return
|
||||||
|
|
||||||
|
if config.engine == "azureai":
|
||||||
|
self._text = self._azure_ai_vision_parse(document_path, config)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Result accessors
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
"""Return the plain-text content extracted during parse."""
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
def get_date(self) -> datetime.datetime | None:
|
||||||
|
"""Return the document date detected during parse.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
datetime.datetime | None
|
||||||
|
Always None — the remote parser does not detect dates.
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
"""Return the path to the generated archive PDF, or None."""
|
||||||
|
return self._archive_path
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Thumbnail and metadata
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
"""Generate a thumbnail image for the document.
|
||||||
|
|
||||||
|
Uses the archive PDF produced by the remote engine when available,
|
||||||
|
otherwise falls back to the original document path (PDF inputs).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Path to the generated WebP thumbnail inside the temp directory.
|
||||||
|
"""
|
||||||
|
# make_thumbnail_from_pdf lives in documents.parsers for now;
|
||||||
|
# it will move to paperless.parsers.utils when the tesseract
|
||||||
|
# parser is migrated in a later phase.
|
||||||
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
|
|
||||||
|
return make_thumbnail_from_pdf(
|
||||||
|
self._archive_path or document_path,
|
||||||
|
self._tempdir,
|
||||||
|
self._logging_group,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_page_count(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return the number of pages in a PDF document.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
Page count for PDF inputs, or ``None`` for other MIME types.
|
||||||
|
"""
|
||||||
|
if mime_type != "application/pdf":
|
||||||
|
return None
|
||||||
|
|
||||||
|
from paperless.parsers.utils import get_page_count_for_pdf
|
||||||
|
|
||||||
|
return get_page_count_for_pdf(document_path, log=logger)
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> list[MetadataEntry]:
|
||||||
|
"""Extract format-specific metadata from the document.
|
||||||
|
|
||||||
|
Delegates to the shared pikepdf-based extractor for PDF files.
|
||||||
|
Returns ``[]`` for all other MIME types.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the file to extract metadata from.
|
||||||
|
mime_type:
|
||||||
|
MIME type of the file. May be ``"application/pdf"`` when
|
||||||
|
called for the archive version of an image original.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[MetadataEntry]
|
||||||
|
Zero or more metadata entries.
|
||||||
|
"""
|
||||||
|
if mime_type != "application/pdf":
|
||||||
|
return []
|
||||||
|
|
||||||
|
from paperless.parsers.utils import extract_pdf_metadata
|
||||||
|
|
||||||
|
return extract_pdf_metadata(document_path, log=logger)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Private helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _azure_ai_vision_parse(
|
||||||
|
self,
|
||||||
|
file: Path,
|
||||||
|
config: RemoteEngineConfig,
|
||||||
|
) -> str | None:
|
||||||
|
"""Send ``file`` to Azure AI Document Intelligence and return text.
|
||||||
|
|
||||||
|
Downloads the searchable PDF output from Azure and stores it at
|
||||||
|
``self._archive_path``. Returns the extracted text content, or
|
||||||
|
``None`` on failure (the error is logged).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
file:
|
||||||
|
Absolute path to the document to analyse.
|
||||||
|
config:
|
||||||
|
Validated remote engine configuration.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str | None
|
||||||
|
Extracted text, or None if the Azure call failed.
|
||||||
|
"""
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
# Callers must have already validated config via engine_is_valid():
|
||||||
|
# engine_is_valid() asserts api_key is not None and (for azureai)
|
||||||
|
# endpoint is not None, so these casts are provably safe.
|
||||||
|
assert config.endpoint is not None
|
||||||
|
assert config.api_key is not None
|
||||||
|
|
||||||
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||||
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||||
|
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
||||||
|
from azure.ai.documentintelligence.models import DocumentContentFormat
|
||||||
|
from azure.core.credentials import AzureKeyCredential
|
||||||
|
|
||||||
|
client = DocumentIntelligenceClient(
|
||||||
|
endpoint=config.endpoint,
|
||||||
|
credential=AzureKeyCredential(config.api_key),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with file.open("rb") as f:
|
||||||
|
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
||||||
|
poller = client.begin_analyze_document(
|
||||||
|
model_id="prebuilt-read",
|
||||||
|
body=analyze_request,
|
||||||
|
output_content_format=DocumentContentFormat.TEXT,
|
||||||
|
output=[AnalyzeOutputOption.PDF],
|
||||||
|
content_type="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
poller.wait()
|
||||||
|
result_id = poller.details["operation_id"]
|
||||||
|
result = poller.result()
|
||||||
|
|
||||||
|
self._archive_path = self._tempdir / "archive.pdf"
|
||||||
|
with self._archive_path.open("wb") as f:
|
||||||
|
for chunk in client.get_analyze_result_pdf(
|
||||||
|
model_id="prebuilt-read",
|
||||||
|
result_id=result_id,
|
||||||
|
):
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
return result.content
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Azure AI Vision parsing failed: %s", e)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
client.close()
|
||||||
|
|
||||||
|
return None
|
||||||
130
src/paperless/parsers/utils.py
Normal file
130
src/paperless/parsers/utils.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
"""
|
||||||
|
Shared utilities for Paperless-ngx document parsers.
|
||||||
|
|
||||||
|
Functions here are format-neutral helpers that multiple parsers need.
|
||||||
|
Keeping them here avoids parsers inheriting from each other just to
|
||||||
|
share implementation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsers.utils")
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_count_for_pdf(
|
||||||
|
document_path: Path,
|
||||||
|
log: logging.Logger | None = None,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return the number of pages in a PDF file using pikepdf.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the PDF file.
|
||||||
|
log:
|
||||||
|
Logger to use for warnings. Falls back to the module-level logger
|
||||||
|
when omitted.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
Page count, or ``None`` if the file cannot be opened or is not a
|
||||||
|
valid PDF.
|
||||||
|
"""
|
||||||
|
import pikepdf
|
||||||
|
|
||||||
|
_log = log or logger
|
||||||
|
|
||||||
|
try:
|
||||||
|
with pikepdf.Pdf.open(document_path) as pdf:
|
||||||
|
return len(pdf.pages)
|
||||||
|
except Exception as e:
|
||||||
|
_log.warning("Unable to determine PDF page count for %s: %s", document_path, e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_metadata(
|
||||||
|
document_path: Path,
|
||||||
|
log: logging.Logger | None = None,
|
||||||
|
) -> list[MetadataEntry]:
|
||||||
|
"""Extract XMP/PDF metadata from a PDF file using pikepdf.
|
||||||
|
|
||||||
|
Reads all XMP metadata entries from the document and returns them as a
|
||||||
|
list of ``MetadataEntry`` dicts. The method never raises — any failure
|
||||||
|
to open the file or read a specific key is logged and skipped.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the PDF file.
|
||||||
|
log:
|
||||||
|
Logger to use for warnings and debug messages. Falls back to the
|
||||||
|
module-level logger when omitted.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[MetadataEntry]
|
||||||
|
Zero or more metadata entries. Returns ``[]`` if the file cannot
|
||||||
|
be opened or contains no readable XMP metadata.
|
||||||
|
"""
|
||||||
|
import pikepdf
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
|
_log = log or logger
|
||||||
|
result: list[MetadataEntry] = []
|
||||||
|
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||||
|
|
||||||
|
try:
|
||||||
|
pdf = pikepdf.open(document_path)
|
||||||
|
meta = pdf.open_metadata()
|
||||||
|
except Exception as e:
|
||||||
|
_log.warning("Could not open PDF metadata for %s: %s", document_path, e)
|
||||||
|
return []
|
||||||
|
|
||||||
|
for key, value in meta.items():
|
||||||
|
if isinstance(value, list):
|
||||||
|
value = " ".join(str(e) for e in value)
|
||||||
|
value = str(value)
|
||||||
|
|
||||||
|
try:
|
||||||
|
m = namespace_pattern.match(key)
|
||||||
|
if m is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
namespace = m.group(1)
|
||||||
|
key_value = m.group(2)
|
||||||
|
|
||||||
|
try:
|
||||||
|
namespace.encode("utf-8")
|
||||||
|
key_value.encode("utf-8")
|
||||||
|
except UnicodeEncodeError as enc_err:
|
||||||
|
_log.debug("Skipping metadata key %s: %s", key, enc_err)
|
||||||
|
continue
|
||||||
|
|
||||||
|
result.append(
|
||||||
|
MetadataEntry(
|
||||||
|
namespace=namespace,
|
||||||
|
prefix=meta.REVERSE_NS[namespace],
|
||||||
|
key=key_value,
|
||||||
|
value=value,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
_log.warning(
|
||||||
|
"Error reading metadata key %s value %s: %s",
|
||||||
|
key,
|
||||||
|
value,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
@@ -10,12 +10,15 @@ from typing import TYPE_CHECKING
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Text parser sample files
|
# Text parser sample files
|
||||||
@@ -74,3 +77,89 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
|||||||
"""
|
"""
|
||||||
with TextDocumentParser() as parser:
|
with TextDocumentParser() as parser:
|
||||||
yield parser
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser sample files
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def remote_samples_dir(samples_dir: Path) -> Path:
|
||||||
|
"""Absolute path to the remote parser sample files directory.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
``<samples_dir>/remote/``
|
||||||
|
"""
|
||||||
|
return samples_dir / "remote"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def sample_pdf_file(remote_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a simple digital PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``remote/simple-digital.pdf``.
|
||||||
|
"""
|
||||||
|
return remote_samples_dir / "simple-digital.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser instance
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
|
||||||
|
"""Yield a RemoteDocumentParser and clean up its temporary directory afterwards.
|
||||||
|
|
||||||
|
Yields
|
||||||
|
------
|
||||||
|
RemoteDocumentParser
|
||||||
|
A ready-to-use parser instance.
|
||||||
|
"""
|
||||||
|
with RemoteDocumentParser() as parser:
|
||||||
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Remote parser settings helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
|
||||||
|
"""Configure Django settings for a valid Azure AI OCR engine.
|
||||||
|
|
||||||
|
Sets ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY``, and
|
||||||
|
``REMOTE_OCR_ENDPOINT`` to test values. Settings are restored
|
||||||
|
automatically after the test by pytest-django.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
SettingsWrapper
|
||||||
|
The modified settings object (for chaining further overrides).
|
||||||
|
"""
|
||||||
|
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||||
|
settings.REMOTE_OCR_API_KEY = "test-api-key"
|
||||||
|
settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
|
||||||
|
return settings
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
|
||||||
|
"""Configure Django settings with no remote engine configured.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
SettingsWrapper
|
||||||
|
The modified settings object.
|
||||||
|
"""
|
||||||
|
settings.REMOTE_OCR_ENGINE = None
|
||||||
|
settings.REMOTE_OCR_API_KEY = None
|
||||||
|
settings.REMOTE_OCR_ENDPOINT = None
|
||||||
|
return settings
|
||||||
|
|||||||
490
src/paperless/tests/parsers/test_remote_parser.py
Normal file
490
src/paperless/tests/parsers/test_remote_parser.py
Normal file
@@ -0,0 +1,490 @@
|
|||||||
|
"""
|
||||||
|
Tests for paperless.parsers.remote.RemoteDocumentParser.
|
||||||
|
|
||||||
|
All tests use the context-manager protocol for parser lifecycle.
|
||||||
|
|
||||||
|
Fixture layout
|
||||||
|
--------------
|
||||||
|
make_azure_mock — factory (defined here; specific to this module)
|
||||||
|
azure_client — composes azure_settings + make_azure_mock + patch;
|
||||||
|
use when a test needs the client to succeed
|
||||||
|
failing_azure_client
|
||||||
|
— composes azure_settings + patch with RuntimeError;
|
||||||
|
use when a test needs the client to fail
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Callable
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-local fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
|
||||||
|
_DEFAULT_TEXT = "Extracted text."
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def make_azure_mock() -> Callable[[str], Mock]:
|
||||||
|
"""Return a factory that builds a mock Azure DocumentIntelligenceClient.
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
mock_client = make_azure_mock() # default extracted text
|
||||||
|
mock_client = make_azure_mock("My text.") # custom extracted text
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _factory(text: str = _DEFAULT_TEXT) -> Mock:
|
||||||
|
mock_client = Mock()
|
||||||
|
mock_poller = Mock()
|
||||||
|
mock_poller.wait.return_value = None
|
||||||
|
mock_poller.details = {"operation_id": "fake-op-id"}
|
||||||
|
mock_poller.result.return_value.content = text
|
||||||
|
mock_client.begin_analyze_document.return_value = mock_poller
|
||||||
|
mock_client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
return _factory
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def azure_client(
|
||||||
|
azure_settings: SettingsWrapper,
|
||||||
|
make_azure_mock: Callable[[str], Mock],
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> Mock:
|
||||||
|
"""Patch the Azure DI client with a succeeding mock and return the instance.
|
||||||
|
|
||||||
|
Implicitly applies ``azure_settings`` so tests using this fixture do not
|
||||||
|
also need ``@pytest.mark.usefixtures("azure_settings")``.
|
||||||
|
"""
|
||||||
|
mock_client = make_azure_mock()
|
||||||
|
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def failing_azure_client(
|
||||||
|
azure_settings: SettingsWrapper,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> Mock:
|
||||||
|
"""Patch the Azure DI client to raise RuntimeError on every call.
|
||||||
|
|
||||||
|
Implicitly applies ``azure_settings``. Returns the mock instance so
|
||||||
|
tests can assert on calls such as ``close()``.
|
||||||
|
"""
|
||||||
|
mock_client = Mock()
|
||||||
|
mock_client.begin_analyze_document.side_effect = RuntimeError("network failure")
|
||||||
|
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Protocol contract
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserProtocol:
|
||||||
|
"""Verify that RemoteDocumentParser satisfies the ParserProtocol contract."""
|
||||||
|
|
||||||
|
def test_isinstance_satisfies_protocol(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert isinstance(remote_parser, ParserProtocol)
|
||||||
|
|
||||||
|
def test_class_attributes_present(self) -> None:
|
||||||
|
assert isinstance(RemoteDocumentParser.name, str) and RemoteDocumentParser.name
|
||||||
|
assert (
|
||||||
|
isinstance(RemoteDocumentParser.version, str)
|
||||||
|
and RemoteDocumentParser.version
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
isinstance(RemoteDocumentParser.author, str) and RemoteDocumentParser.author
|
||||||
|
)
|
||||||
|
assert isinstance(RemoteDocumentParser.url, str) and RemoteDocumentParser.url
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# supported_mime_types
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserSupportedMimeTypes:
|
||||||
|
"""supported_mime_types() always returns the full set regardless of config."""
|
||||||
|
|
||||||
|
def test_returns_dict(self) -> None:
|
||||||
|
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||||
|
assert isinstance(mime_types, dict)
|
||||||
|
|
||||||
|
def test_includes_all_expected_types(self) -> None:
|
||||||
|
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||||
|
expected = {
|
||||||
|
"application/pdf",
|
||||||
|
"image/png",
|
||||||
|
"image/jpeg",
|
||||||
|
"image/tiff",
|
||||||
|
"image/bmp",
|
||||||
|
"image/gif",
|
||||||
|
"image/webp",
|
||||||
|
}
|
||||||
|
assert expected == set(mime_types.keys())
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
def test_returns_full_set_when_not_configured(self) -> None:
|
||||||
|
"""
|
||||||
|
GIVEN: No remote engine is configured
|
||||||
|
WHEN: supported_mime_types() is called
|
||||||
|
THEN: The full MIME type dict is still returned (score() handles activation)
|
||||||
|
"""
|
||||||
|
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||||
|
assert len(mime_types) == 7
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# score()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserScore:
|
||||||
|
"""score() encodes the activation logic: None when unconfigured, 20 when active."""
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"mime_type",
|
||||||
|
[
|
||||||
|
pytest.param("application/pdf", id="pdf"),
|
||||||
|
pytest.param("image/png", id="png"),
|
||||||
|
pytest.param("image/jpeg", id="jpeg"),
|
||||||
|
pytest.param("image/tiff", id="tiff"),
|
||||||
|
pytest.param("image/bmp", id="bmp"),
|
||||||
|
pytest.param("image/gif", id="gif"),
|
||||||
|
pytest.param("image/webp", id="webp"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_score_returns_20_when_configured(self, mime_type: str) -> None:
|
||||||
|
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||||
|
assert result == 20
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"mime_type",
|
||||||
|
[
|
||||||
|
pytest.param("application/pdf", id="pdf"),
|
||||||
|
pytest.param("image/png", id="png"),
|
||||||
|
pytest.param("image/jpeg", id="jpeg"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
|
||||||
|
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_score_returns_none_when_api_key_missing(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||||
|
settings.REMOTE_OCR_API_KEY = None
|
||||||
|
settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
|
||||||
|
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_score_returns_none_when_endpoint_missing(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||||
|
settings.REMOTE_OCR_API_KEY = "key"
|
||||||
|
settings.REMOTE_OCR_ENDPOINT = None
|
||||||
|
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
def test_score_returns_none_for_unsupported_mime_type(self) -> None:
|
||||||
|
result = RemoteDocumentParser.score("text/plain", "doc.txt")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
def test_score_higher_than_tesseract_default(self) -> None:
|
||||||
|
"""Remote parser (20) outranks the tesseract default (10) when configured."""
|
||||||
|
score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert score is not None and score > 10
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserProperties:
|
||||||
|
def test_can_produce_archive_is_true(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert remote_parser.can_produce_archive is True
|
||||||
|
|
||||||
|
def test_requires_pdf_rendition_is_false(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert remote_parser.requires_pdf_rendition is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserLifecycle:
|
||||||
|
def test_context_manager_cleans_up_tempdir(self) -> None:
|
||||||
|
with RemoteDocumentParser() as parser:
|
||||||
|
tempdir = parser._tempdir
|
||||||
|
assert tempdir.exists()
|
||||||
|
assert not tempdir.exists()
|
||||||
|
|
||||||
|
def test_context_manager_cleans_up_after_exception(self) -> None:
|
||||||
|
tempdir: Path | None = None
|
||||||
|
with pytest.raises(RuntimeError):
|
||||||
|
with RemoteDocumentParser() as parser:
|
||||||
|
tempdir = parser._tempdir
|
||||||
|
raise RuntimeError("boom")
|
||||||
|
assert tempdir is not None
|
||||||
|
assert not tempdir.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse() — happy path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserParse:
|
||||||
|
def test_parse_returns_text_from_azure(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_text() == _DEFAULT_TEXT
|
||||||
|
|
||||||
|
def test_parse_sets_archive_path(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
archive = remote_parser.get_archive_path()
|
||||||
|
assert archive is not None
|
||||||
|
assert archive.exists()
|
||||||
|
assert archive.suffix == ".pdf"
|
||||||
|
|
||||||
|
def test_parse_closes_client_on_success(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
azure_client.close.assert_called_once()
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
def test_parse_sets_empty_text_when_not_configured(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_text() == ""
|
||||||
|
assert remote_parser.get_archive_path() is None
|
||||||
|
|
||||||
|
def test_get_text_none_before_parse(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
) -> None:
|
||||||
|
assert remote_parser.get_text() is None
|
||||||
|
|
||||||
|
def test_get_date_always_none(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_date() is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse() — Azure failure path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserParseError:
|
||||||
|
def test_parse_returns_none_on_azure_error(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
failing_azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
assert remote_parser.get_text() is None
|
||||||
|
|
||||||
|
def test_parse_closes_client_on_error(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
failing_azure_client: Mock,
|
||||||
|
) -> None:
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
failing_azure_client.close.assert_called_once()
|
||||||
|
|
||||||
|
def test_parse_logs_error_on_azure_failure(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
failing_azure_client: Mock,
|
||||||
|
mocker: MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
||||||
|
|
||||||
|
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||||
|
|
||||||
|
mock_log.error.assert_called_once()
|
||||||
|
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# get_page_count()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserPageCount:
|
||||||
|
def test_page_count_for_pdf(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||||
|
assert isinstance(count, int)
|
||||||
|
assert count >= 1
|
||||||
|
|
||||||
|
def test_page_count_returns_none_for_image_mime(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
count = remote_parser.get_page_count(sample_pdf_file, "image/png")
|
||||||
|
assert count is None
|
||||||
|
|
||||||
|
def test_page_count_returns_none_for_invalid_pdf(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
tmp_path: Path,
|
||||||
|
) -> None:
|
||||||
|
bad_pdf = tmp_path / "bad.pdf"
|
||||||
|
bad_pdf.write_bytes(b"not a pdf at all")
|
||||||
|
count = remote_parser.get_page_count(bad_pdf, "application/pdf")
|
||||||
|
assert count is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# extract_metadata()
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserMetadata:
|
||||||
|
def test_extract_metadata_non_pdf_returns_empty(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
result = remote_parser.extract_metadata(sample_pdf_file, "image/png")
|
||||||
|
assert result == []
|
||||||
|
|
||||||
|
def test_extract_metadata_pdf_returns_list(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||||
|
assert isinstance(result, list)
|
||||||
|
|
||||||
|
def test_extract_metadata_pdf_entries_have_required_keys(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
sample_pdf_file: Path,
|
||||||
|
) -> None:
|
||||||
|
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||||
|
for entry in result:
|
||||||
|
assert "namespace" in entry
|
||||||
|
assert "prefix" in entry
|
||||||
|
assert "key" in entry
|
||||||
|
assert "value" in entry
|
||||||
|
assert isinstance(entry["value"], str)
|
||||||
|
|
||||||
|
def test_extract_metadata_does_not_raise_on_invalid_pdf(
|
||||||
|
self,
|
||||||
|
remote_parser: RemoteDocumentParser,
|
||||||
|
tmp_path: Path,
|
||||||
|
) -> None:
|
||||||
|
bad_pdf = tmp_path / "bad.pdf"
|
||||||
|
bad_pdf.write_bytes(b"not a pdf at all")
|
||||||
|
result = remote_parser.extract_metadata(bad_pdf, "application/pdf")
|
||||||
|
assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Registry integration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestRemoteParserRegistry:
|
||||||
|
def test_registered_in_defaults(self) -> None:
|
||||||
|
from paperless.parsers.registry import ParserRegistry
|
||||||
|
|
||||||
|
registry = ParserRegistry()
|
||||||
|
registry.register_defaults()
|
||||||
|
|
||||||
|
assert RemoteDocumentParser in registry._builtins
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("azure_settings")
|
||||||
|
def test_get_parser_returns_remote_when_configured(self) -> None:
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
|
registry = get_parser_registry()
|
||||||
|
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||||
|
|
||||||
|
assert parser_cls is RemoteDocumentParser
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
|
def test_get_parser_returns_none_for_pdf_when_not_configured(self) -> None:
|
||||||
|
"""With no tesseract parser registered yet, PDF has no handler if remote is off."""
|
||||||
|
from paperless.parsers.registry import ParserRegistry
|
||||||
|
|
||||||
|
registry = ParserRegistry()
|
||||||
|
registry.register_defaults()
|
||||||
|
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||||
|
|
||||||
|
assert parser_cls is None
|
||||||
@@ -1,118 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
|
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteEngineConfig:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
engine: str,
|
|
||||||
api_key: str | None = None,
|
|
||||||
endpoint: str | None = None,
|
|
||||||
):
|
|
||||||
self.engine = engine
|
|
||||||
self.api_key = api_key
|
|
||||||
self.endpoint = endpoint
|
|
||||||
|
|
||||||
def engine_is_valid(self):
|
|
||||||
valid = self.engine in ["azureai"] and self.api_key is not None
|
|
||||||
if self.engine == "azureai":
|
|
||||||
valid = valid and self.endpoint is not None
|
|
||||||
return valid
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
|
||||||
"""
|
|
||||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
|
||||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
|
||||||
"""
|
|
||||||
|
|
||||||
logging_name = "paperless.parsing.remote"
|
|
||||||
|
|
||||||
def get_settings(self) -> RemoteEngineConfig:
|
|
||||||
"""
|
|
||||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
|
||||||
"""
|
|
||||||
return RemoteEngineConfig(
|
|
||||||
engine=settings.REMOTE_OCR_ENGINE,
|
|
||||||
api_key=settings.REMOTE_OCR_API_KEY,
|
|
||||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
|
||||||
)
|
|
||||||
|
|
||||||
def supported_mime_types(self):
|
|
||||||
if self.settings.engine_is_valid():
|
|
||||||
return {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/png": ".png",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
"image/tiff": ".tiff",
|
|
||||||
"image/bmp": ".bmp",
|
|
||||||
"image/gif": ".gif",
|
|
||||||
"image/webp": ".webp",
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def azure_ai_vision_parse(
|
|
||||||
self,
|
|
||||||
file: Path,
|
|
||||||
) -> str | None:
|
|
||||||
"""
|
|
||||||
Uses Azure AI Vision to parse the document and return the text content.
|
|
||||||
It requests a searchable PDF output with embedded text.
|
|
||||||
The PDF is saved to the archive_path attribute.
|
|
||||||
Returns the text content extracted from the document.
|
|
||||||
If the parsing fails, it returns None.
|
|
||||||
"""
|
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
|
||||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
|
||||||
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
|
||||||
from azure.ai.documentintelligence.models import DocumentContentFormat
|
|
||||||
from azure.core.credentials import AzureKeyCredential
|
|
||||||
|
|
||||||
client = DocumentIntelligenceClient(
|
|
||||||
endpoint=self.settings.endpoint,
|
|
||||||
credential=AzureKeyCredential(self.settings.api_key),
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
with file.open("rb") as f:
|
|
||||||
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
|
||||||
poller = client.begin_analyze_document(
|
|
||||||
model_id="prebuilt-read",
|
|
||||||
body=analyze_request,
|
|
||||||
output_content_format=DocumentContentFormat.TEXT,
|
|
||||||
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
|
|
||||||
content_type="application/json",
|
|
||||||
)
|
|
||||||
|
|
||||||
poller.wait()
|
|
||||||
result_id = poller.details["operation_id"]
|
|
||||||
result = poller.result()
|
|
||||||
|
|
||||||
# Download the PDF with embedded text
|
|
||||||
self.archive_path = self.tempdir / "archive.pdf"
|
|
||||||
with self.archive_path.open("wb") as f:
|
|
||||||
for chunk in client.get_analyze_result_pdf(
|
|
||||||
model_id="prebuilt-read",
|
|
||||||
result_id=result_id,
|
|
||||||
):
|
|
||||||
f.write(chunk)
|
|
||||||
return result.content
|
|
||||||
except Exception as e:
|
|
||||||
self.log.error(f"Azure AI Vision parsing failed: {e}")
|
|
||||||
finally:
|
|
||||||
client.close()
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
|
||||||
if not self.settings.engine_is_valid():
|
|
||||||
self.log.warning(
|
|
||||||
"No valid remote parser engine is configured, content will be empty.",
|
|
||||||
)
|
|
||||||
self.text = ""
|
|
||||||
elif self.settings.engine == "azureai":
|
|
||||||
self.text = self.azure_ai_vision_parse(document_path)
|
|
||||||
@@ -1,16 +1,36 @@
|
|||||||
def get_parser(*args, **kwargs):
|
from __future__ import annotations
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
|
||||||
|
# The new RemoteDocumentParser does not accept the progress_callback
|
||||||
|
# kwarg injected by the old signal-based consumer. logging_group is
|
||||||
|
# forwarded as a positional arg.
|
||||||
|
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||||
|
kwargs.pop("progress_callback", None)
|
||||||
return RemoteDocumentParser(*args, **kwargs)
|
return RemoteDocumentParser(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def get_supported_mime_types():
|
def get_supported_mime_types() -> dict[str, str]:
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
from django.conf import settings
|
||||||
|
|
||||||
return RemoteDocumentParser(None).supported_mime_types()
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.remote import RemoteEngineConfig
|
||||||
|
|
||||||
|
config = RemoteEngineConfig(
|
||||||
|
engine=settings.REMOTE_OCR_ENGINE,
|
||||||
|
api_key=settings.REMOTE_OCR_API_KEY,
|
||||||
|
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||||
|
)
|
||||||
|
if not config.engine_is_valid():
|
||||||
|
return {}
|
||||||
|
return RemoteDocumentParser.supported_mime_types()
|
||||||
|
|
||||||
|
|
||||||
def remote_consumer_declaration(sender, **kwargs):
|
def remote_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"parser": get_parser,
|
"parser": get_parser,
|
||||||
"weight": 5,
|
"weight": 5,
|
||||||
|
|||||||
@@ -1,131 +0,0 @@
|
|||||||
import uuid
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.test import TestCase
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from documents.tests.utils import DirectoriesMixin
|
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
|
||||||
from paperless_remote.signals import get_parser
|
|
||||||
|
|
||||||
|
|
||||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|
||||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
|
||||||
|
|
||||||
def assertContainsStrings(self, content: str, strings: list[str]) -> None:
|
|
||||||
# Asserts that all strings appear in content, in the given order.
|
|
||||||
indices = []
|
|
||||||
for s in strings:
|
|
||||||
if s in content:
|
|
||||||
indices.append(content.index(s))
|
|
||||||
else:
|
|
||||||
self.fail(f"'{s}' is not in '{content}'")
|
|
||||||
self.assertListEqual(indices, sorted(indices))
|
|
||||||
|
|
||||||
@mock.patch("paperless_tesseract.parsers.run_subprocess")
|
|
||||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
|
||||||
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess) -> None:
|
|
||||||
# Arrange mock Azure client
|
|
||||||
mock_client = mock.Mock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
|
|
||||||
# Simulate poller result and its `.details`
|
|
||||||
mock_poller = mock.Mock()
|
|
||||||
mock_poller.wait.return_value = None
|
|
||||||
mock_poller.details = {"operation_id": "fake-op-id"}
|
|
||||||
mock_client.begin_analyze_document.return_value = mock_poller
|
|
||||||
mock_poller.result.return_value.content = "This is a test document."
|
|
||||||
|
|
||||||
# Return dummy PDF bytes
|
|
||||||
mock_client.get_analyze_result_pdf.return_value = [
|
|
||||||
b"%PDF-",
|
|
||||||
b"1.7 ",
|
|
||||||
b"FAKEPDF",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Simulate pdftotext by writing dummy text to sidecar file
|
|
||||||
def fake_run(cmd, *args, **kwargs) -> None:
|
|
||||||
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
|
|
||||||
f.write("This is a test document.")
|
|
||||||
|
|
||||||
mock_subprocess.side_effect = fake_run
|
|
||||||
|
|
||||||
with override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="somekey",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
):
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
parser.parse(
|
|
||||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
|
||||||
"application/pdf",
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertContainsStrings(
|
|
||||||
parser.text.strip(),
|
|
||||||
["This is a test document."],
|
|
||||||
)
|
|
||||||
|
|
||||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
|
||||||
def test_get_text_with_azure_error_logged_and_returns_none(
|
|
||||||
self,
|
|
||||||
mock_client_cls,
|
|
||||||
) -> None:
|
|
||||||
mock_client = mock.Mock()
|
|
||||||
mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
|
|
||||||
with override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="somekey",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
):
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
with mock.patch.object(parser.log, "error") as mock_log_error:
|
|
||||||
parser.parse(
|
|
||||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
|
||||||
"application/pdf",
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertIsNone(parser.text)
|
|
||||||
mock_client.begin_analyze_document.assert_called_once()
|
|
||||||
mock_client.close.assert_called_once()
|
|
||||||
mock_log_error.assert_called_once()
|
|
||||||
self.assertIn(
|
|
||||||
"Azure AI Vision parsing failed",
|
|
||||||
mock_log_error.call_args[0][0],
|
|
||||||
)
|
|
||||||
|
|
||||||
@override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="key",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
)
|
|
||||||
def test_supported_mime_types_valid_config(self) -> None:
|
|
||||||
parser = RemoteDocumentParser(uuid.uuid4())
|
|
||||||
expected_types = {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/png": ".png",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
"image/tiff": ".tiff",
|
|
||||||
"image/bmp": ".bmp",
|
|
||||||
"image/gif": ".gif",
|
|
||||||
"image/webp": ".webp",
|
|
||||||
}
|
|
||||||
self.assertEqual(parser.supported_mime_types(), expected_types)
|
|
||||||
|
|
||||||
def test_supported_mime_types_invalid_config(self) -> None:
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
self.assertEqual(parser.supported_mime_types(), {})
|
|
||||||
|
|
||||||
@override_settings(
|
|
||||||
REMOTE_OCR_ENGINE=None,
|
|
||||||
REMOTE_OCR_API_KEY=None,
|
|
||||||
REMOTE_OCR_ENDPOINT=None,
|
|
||||||
)
|
|
||||||
def test_parse_with_invalid_config(self) -> None:
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
|
|
||||||
self.assertEqual(parser.text, "")
|
|
||||||
@@ -221,7 +221,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
assert isinstance(self.settings, OcrConfig)
|
assert isinstance(self.settings, OcrConfig)
|
||||||
ocrmypdf_args = {
|
ocrmypdf_args = {
|
||||||
"input_file": input_file,
|
"input_file_or_options": input_file,
|
||||||
"output_file": output_file,
|
"output_file": output_file,
|
||||||
# need to use threads, since this will be run in daemonized
|
# need to use threads, since this will be run in daemonized
|
||||||
# processes via the task library.
|
# processes via the task library.
|
||||||
@@ -285,7 +285,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"for compatibility with img2pdf",
|
"for compatibility with img2pdf",
|
||||||
)
|
)
|
||||||
# Replace the input file with the non-alpha
|
# Replace the input file with the non-alpha
|
||||||
ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
|
ocrmypdf_args["input_file_or_options"] = self.remove_alpha(input_file)
|
||||||
|
|
||||||
if dpi:
|
if dpi:
|
||||||
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
||||||
|
|||||||
@@ -778,7 +778,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
safe_fallback=False,
|
safe_fallback=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(params["input_file"], "input.pdf")
|
self.assertEqual(params["input_file_or_options"], "input.pdf")
|
||||||
self.assertEqual(params["output_file"], "output.pdf")
|
self.assertEqual(params["output_file"], "output.pdf")
|
||||||
self.assertEqual(params["sidecar"], "sidecar.txt")
|
self.assertEqual(params["sidecar"], "sidecar.txt")
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,20 @@
|
|||||||
def get_parser(*args, **kwargs):
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
# The new TextDocumentParser does not accept the legacy logging_group /
|
# The new TextDocumentParser does not accept the progress_callback
|
||||||
# progress_callback kwargs injected by the old signal-based consumer.
|
# kwarg injected by the old signal-based consumer. logging_group is
|
||||||
# These are dropped here; Phase 4 will replace this signal path with the
|
# forwarded as a positional arg.
|
||||||
# new ParserRegistry so the shim can be removed at that point.
|
# Phase 4 will replace this signal path with the new ParserRegistry.
|
||||||
kwargs.pop("logging_group", None)
|
|
||||||
kwargs.pop("progress_callback", None)
|
kwargs.pop("progress_callback", None)
|
||||||
return TextDocumentParser()
|
return TextDocumentParser(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def text_consumer_declaration(sender, **kwargs):
|
def text_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"parser": get_parser,
|
"parser": get_parser,
|
||||||
"weight": 10,
|
"weight": 10,
|
||||||
|
|||||||
115
uv.lock
generated
115
uv.lock
generated
@@ -831,6 +831,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/2d/82/e5d2c1c67d19841e9edc74954c827444ae826978499bde3dfc1d007c8c11/deepmerge-2.0-py3-none-any.whl", hash = "sha256:6de9ce507115cff0bed95ff0ce9ecc31088ef50cbdf09bc90a09349a318b3d00", size = 13475, upload-time = "2024-08-30T05:31:48.659Z" },
|
{ url = "https://files.pythonhosted.org/packages/2d/82/e5d2c1c67d19841e9edc74954c827444ae826978499bde3dfc1d007c8c11/deepmerge-2.0-py3-none-any.whl", hash = "sha256:6de9ce507115cff0bed95ff0ce9ecc31088ef50cbdf09bc90a09349a318b3d00", size = 13475, upload-time = "2024-08-30T05:31:48.659Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "defusedxml"
|
||||||
|
version = "0.7.1"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "deprecated"
|
name = "deprecated"
|
||||||
version = "1.3.1"
|
version = "1.3.1"
|
||||||
@@ -1283,6 +1292,59 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/a6/ff/ee2f67c0ff146ec98b5df1df637b2bc2d17beeb05df9f427a67bd7a7d79c/flower-2.0.1-py2.py3-none-any.whl", hash = "sha256:9db2c621eeefbc844c8dd88be64aef61e84e2deb29b271e02ab2b5b9f01068e2", size = 383553, upload-time = "2023-08-13T14:37:41.552Z" },
|
{ url = "https://files.pythonhosted.org/packages/a6/ff/ee2f67c0ff146ec98b5df1df637b2bc2d17beeb05df9f427a67bd7a7d79c/flower-2.0.1-py2.py3-none-any.whl", hash = "sha256:9db2c621eeefbc844c8dd88be64aef61e84e2deb29b271e02ab2b5b9f01068e2", size = 383553, upload-time = "2023-08-13T14:37:41.552Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fonttools"
|
||||||
|
version = "4.62.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/5a/96/686339e0fda8142b7ebed39af53f4a5694602a729662f42a6209e3be91d0/fonttools-4.62.0.tar.gz", hash = "sha256:0dc477c12b8076b4eb9af2e440421b0433ffa9e1dcb39e0640a6c94665ed1098", size = 3579521, upload-time = "2026-03-09T16:50:06.217Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e4/33/63d79ca41020dd460b51f1e0f58ad1ff0a36b7bcbdf8f3971d52836581e9/fonttools-4.62.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:196cafef9aeec5258425bd31a4e9a414b2ee0d1557bca184d7923d3d3bcd90f9", size = 2870816, upload-time = "2026-03-09T16:48:32.39Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c0/7a/9aeec114bc9fc00d757a41f092f7107863d372e684a5b5724c043654477c/fonttools-4.62.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:153afc3012ff8761b1733e8fbe5d98623409774c44ffd88fbcb780e240c11d13", size = 2416127, upload-time = "2026-03-09T16:48:34.627Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/5a/71/12cfd8ae0478b7158ffa8850786781f67e73c00fd897ef9d053415c5f88b/fonttools-4.62.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:13b663fb197334de84db790353d59da2a7288fd14e9be329f5debc63ec0500a5", size = 5100678, upload-time = "2026-03-09T16:48:36.454Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/8a/d7/8e4845993ee233c2023d11babe9b3dae7d30333da1d792eeccebcb77baab/fonttools-4.62.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:591220d5333264b1df0d3285adbdfe2af4f6a45bbf9ca2b485f97c9f577c49ff", size = 5070859, upload-time = "2026-03-09T16:48:38.786Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ae/a0/287ae04cd883a52e7bb1d92dfc4997dcffb54173761c751106845fa9e316/fonttools-4.62.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:579f35c121528a50c96bf6fcb6a393e81e7f896d4326bf40e379f1c971603db9", size = 5076689, upload-time = "2026-03-09T16:48:41.886Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/6d/4e/a2377ad26c36fcd3e671a1c316ea5ed83107de1588e2d897a98349363bc7/fonttools-4.62.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:44956b003151d5a289eba6c71fe590d63509267c37e26de1766ba15d9c589582", size = 5202053, upload-time = "2026-03-09T16:48:43.867Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ab/9d/7ad1ffc080619f67d0b1e0fa6a0578f0be077404f13fd8e448d1616a94a3/fonttools-4.62.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:22bde4dc12a9e09b5ced77f3b5053d96cf10c4976c6ac0dee293418ef289d221", size = 2870004, upload-time = "2026-03-09T16:48:50.837Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/4d/8b/ba59069a490f61b737e064c3129453dbd28ee38e81d56af0d04d7e6b4de4/fonttools-4.62.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7199c73b326bad892f1cb53ffdd002128bfd58a89b8f662204fbf1daf8d62e85", size = 2414662, upload-time = "2026-03-09T16:48:53.295Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/8c/8c/c52a4310de58deeac7e9ea800892aec09b00bb3eb0c53265b31ec02be115/fonttools-4.62.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d732938633681d6e2324e601b79e93f7f72395ec8681f9cdae5a8c08bc167e72", size = 5032975, upload-time = "2026-03-09T16:48:55.718Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/0b/a1/d16318232964d786907b9b3613b8409f74cf0be2da400854509d3a864e43/fonttools-4.62.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:31a804c16d76038cc4e3826e07678efb0a02dc4f15396ea8e07088adbfb2578e", size = 4988544, upload-time = "2026-03-09T16:48:57.715Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/b2/8d/7e745ca3e65852adc5e52a83dc213fe1b07d61cb5b394970fcd4b1199d1e/fonttools-4.62.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:090e74ac86e68c20150e665ef8e7e0c20cb9f8b395302c9419fa2e4d332c3b51", size = 4971296, upload-time = "2026-03-09T16:48:59.678Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e6/d4/b717a4874175146029ca1517e85474b1af80c9d9a306fc3161e71485eea5/fonttools-4.62.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8f086120e8be9e99ca1288aa5ce519833f93fe0ec6ebad2380c1dee18781f0b5", size = 5122503, upload-time = "2026-03-09T16:49:02.464Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/82/c7/985c1670aa6d82ef270f04cde11394c168f2002700353bd2bde405e59b8f/fonttools-4.62.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:274c8b8a87e439faf565d3bcd3f9f9e31bca7740755776a4a90a4bfeaa722efa", size = 2864929, upload-time = "2026-03-09T16:49:09.331Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c1/dc/c409c8ceec0d3119e9ab0b7b1a2e3c76d1f4d66e4a9db5c59e6b7652e7df/fonttools-4.62.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:93e27131a5a0ae82aaadcffe309b1bae195f6711689722af026862bede05c07c", size = 2412586, upload-time = "2026-03-09T16:49:11.378Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/5f/ac/8e300dbf7b4d135287c261ffd92ede02d9f48f0d2db14665fbc8b059588a/fonttools-4.62.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83c6524c5b93bad9c2939d88e619fedc62e913c19e673f25d5ab74e7a5d074e5", size = 5013708, upload-time = "2026-03-09T16:49:14.063Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/fb/bc/60d93477b653eeb1ddf5f9ec34be689b79234d82dbdded269ac0252715b8/fonttools-4.62.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:106aec9226f9498fc5345125ff7200842c01eda273ae038f5049b0916907acee", size = 4964355, upload-time = "2026-03-09T16:49:16.515Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/cb/eb/6dc62bcc3c3598c28a3ecb77e69018869c3e109bd83031d4973c059d318b/fonttools-4.62.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:15d86b96c79013320f13bc1b15f94789edb376c0a2d22fb6088f33637e8dfcbc", size = 4953472, upload-time = "2026-03-09T16:49:18.494Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/82/b3/3af7592d9b254b7b7fec018135f8776bfa0d1ad335476c2791b1334dc5e4/fonttools-4.62.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f16c07e5250d5d71d0f990a59460bc5620c3cc456121f2cfb5b60475699905f", size = 5094701, upload-time = "2026-03-09T16:49:21.67Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/1a/64/61f69298aa6e7c363dcf00dd6371a654676900abe27d1effd1a74b43e5d0/fonttools-4.62.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:4fa5a9c716e2f75ef34b5a5c2ca0ee4848d795daa7e6792bf30fd4abf8993449", size = 2864222, upload-time = "2026-03-09T16:49:28.285Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c6/57/6b08756fe4455336b1fe160ab3c11fccc90768ccb6ee03fb0b45851aace4/fonttools-4.62.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:625f5cbeb0b8f4e42343eaeb4bc2786718ddd84760a2f5e55fdd3db049047c00", size = 2410674, upload-time = "2026-03-09T16:49:30.504Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/6f/86/db65b63bb1b824b63e602e9be21b18741ddc99bcf5a7850f9181159ae107/fonttools-4.62.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6247e58b96b982709cd569a91a2ba935d406dccf17b6aa615afaed37ac3856aa", size = 4999387, upload-time = "2026-03-09T16:49:32.593Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/86/c8/c6669e42d2f4efd60d38a3252cebbb28851f968890efb2b9b15f9d1092b0/fonttools-4.62.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:840632ea9c1eab7b7f01c369e408c0721c287dfd7500ab937398430689852fd1", size = 4912506, upload-time = "2026-03-09T16:49:34.927Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/2e/49/0ae552aa098edd0ec548413fbf818f52ceb70535016215094a5ce9bf8f70/fonttools-4.62.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:28a9ea2a7467a816d1bec22658b0cce4443ac60abac3e293bdee78beb74588f3", size = 4951202, upload-time = "2026-03-09T16:49:37.1Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/71/65/ae38fc8a4cea6f162d74cf11f58e9aeef1baa7d0e3d1376dabd336c129e5/fonttools-4.62.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5ae611294f768d413949fd12693a8cba0e6332fbc1e07aba60121be35eac68d0", size = 5060758, upload-time = "2026-03-09T16:49:39.464Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f8/65/f47f9b3db1ec156a1f222f1089ba076b2cc9ee1d024a8b0a60c54258517e/fonttools-4.62.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:0361a7d41d86937f1f752717c19f719d0fde064d3011038f9f19bdf5fc2f5c95", size = 2947079, upload-time = "2026-03-09T16:49:46.471Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/52/73/bc62e5058a0c22cf02b1e0169ef0c3ca6c3247216d719f95bead3c05a991/fonttools-4.62.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4108c12773b3c97aa592311557c405d5b4fc03db2b969ed928fcf68e7b3c887", size = 2448802, upload-time = "2026-03-09T16:49:48.328Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/2b/df/bfaa0e845884935355670e6e68f137185ab87295f8bc838db575e4a66064/fonttools-4.62.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b448075f32708e8fb377fe7687f769a5f51a027172c591ba9a58693631b077a8", size = 5137378, upload-time = "2026-03-09T16:49:50.223Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/32/32/04f616979a18b48b52e634988b93d847b6346260faf85ecccaf7e2e9057f/fonttools-4.62.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e5f1fa8cc9f1a56a3e33ee6b954d6d9235e6b9d11eb7a6c9dfe2c2f829dc24db", size = 4920714, upload-time = "2026-03-09T16:49:53.172Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/3b/2e/274e16689c1dfee5c68302cd7c444213cfddd23cf4620374419625037ec6/fonttools-4.62.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f8c8ea812f82db1e884b9cdb663080453e28f0f9a1f5027a5adb59c4cc8d38d1", size = 5016012, upload-time = "2026-03-09T16:49:55.762Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/7f/0c/b08117270626e7117ac2f89d732fdd4386ec37d2ab3a944462d29e6f89a1/fonttools-4.62.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:03c6068adfdc67c565d217e92386b1cdd951abd4240d65180cec62fa74ba31b2", size = 5042766, upload-time = "2026-03-09T16:49:57.726Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/9c/57/c2487c281dde03abb2dec244fd67059b8d118bd30a653cbf69e94084cb23/fonttools-4.62.0-py3-none-any.whl", hash = "sha256:75064f19a10c50c74b336aa5ebe7b1f89fd0fb5255807bfd4b0c6317098f4af3", size = 1152427, upload-time = "2026-03-09T16:50:04.074Z" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fpdf2"
|
||||||
|
version = "2.8.7"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "defusedxml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
{ name = "fonttools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
{ name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/27/f2/72feae0b2827ed38013e4307b14f95bf0b3d124adfef4d38a7d57533f7be/fpdf2-2.8.7.tar.gz", hash = "sha256:7060ccee5a9c7ab0a271fb765a36a23639f83ef8996c34e3d46af0a17ede57f9", size = 362351, upload-time = "2026-02-28T05:39:16.456Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/66/0a/cf50ecffa1e3747ed9380a3adfc829259f1f86b3fdbd9e505af789003141/fpdf2-2.8.7-py3-none-any.whl", hash = "sha256:d391fc508a3ce02fc43a577c830cda4fe6f37646f2d143d489839940932fbc19", size = 327056, upload-time = "2026-02-28T05:39:14.619Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "frozenlist"
|
name = "frozenlist"
|
||||||
version = "1.8.0"
|
version = "1.8.0"
|
||||||
@@ -2722,10 +2784,11 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ocrmypdf"
|
name = "ocrmypdf"
|
||||||
version = "16.13.0"
|
version = "17.3.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "deprecation", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "deprecation", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
{ name = "fpdf2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "img2pdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "img2pdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "packaging", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "packaging", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "pdfminer-six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "pdfminer-six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
@@ -2733,11 +2796,14 @@ dependencies = [
|
|||||||
{ name = "pikepdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "pikepdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "pillow", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "pluggy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "pluggy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
{ name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
{ name = "pypdfium2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
{ name = "uharfbuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/8c/52/be1aaece0703a736757d8957c0d4f19c37561054169b501eb0e7132f15e5/ocrmypdf-16.13.0.tar.gz", hash = "sha256:29d37e915234ce717374863a9cc5dd32d29e063dfe60c51380dda71254c88248", size = 7042247, upload-time = "2025-12-24T07:58:35.86Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/fa/fe/60bdc79529be1ad8b151d426ed2020d5ac90328c54e9ba92bd808e1535c1/ocrmypdf-17.3.0.tar.gz", hash = "sha256:4022f13aad3f405e330056a07aa8bd63714b48b414693831b56e2cf2c325f52d", size = 7378015, upload-time = "2026-02-21T09:30:07.207Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/41/b1/e2e7ad98de0d3ee05b44dbc3f78ccb158a620f3add82d00c85490120e7f2/ocrmypdf-16.13.0-py3-none-any.whl", hash = "sha256:fad8a6f7cc52cdc6225095c401a1766c778c47efe9f1e854ae4dc64a550a3d37", size = 165377, upload-time = "2025-12-24T07:58:33.925Z" },
|
{ url = "https://files.pythonhosted.org/packages/3d/b1/b7ae057a1bcb1495067ee3c4d48c1ce5fc66addd9492307c5a0ff799a7f2/ocrmypdf-17.3.0-py3-none-any.whl", hash = "sha256:c8882e7864954d3db6bcee49cc9f261b65bff66b7e5925eb68a1c281f41cad23", size = 488130, upload-time = "2026-02-21T09:30:05.236Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2978,7 +3044,7 @@ requires-dist = [
|
|||||||
{ name = "llama-index-vector-stores-faiss", specifier = ">=0.5.2" },
|
{ name = "llama-index-vector-stores-faiss", specifier = ">=0.5.2" },
|
||||||
{ name = "mysqlclient", marker = "extra == 'mariadb'", specifier = "~=2.2.7" },
|
{ name = "mysqlclient", marker = "extra == 'mariadb'", specifier = "~=2.2.7" },
|
||||||
{ name = "nltk", specifier = "~=3.9.1" },
|
{ name = "nltk", specifier = "~=3.9.1" },
|
||||||
{ name = "ocrmypdf", specifier = "~=16.13.0" },
|
{ name = "ocrmypdf", specifier = "~=17.3.0" },
|
||||||
{ name = "openai", specifier = ">=1.76" },
|
{ name = "openai", specifier = ">=1.76" },
|
||||||
{ name = "pathvalidate", specifier = "~=3.3.1" },
|
{ name = "pathvalidate", specifier = "~=3.3.1" },
|
||||||
{ name = "pdf2image", specifier = "~=1.17.0" },
|
{ name = "pdf2image", specifier = "~=1.17.0" },
|
||||||
@@ -3655,6 +3721,30 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/d1/81/ef2b1dfd1862567d573a4fdbc9f969067621764fbb74338496840a1d2977/pyopenssl-25.3.0-py3-none-any.whl", hash = "sha256:1fda6fc034d5e3d179d39e59c1895c9faeaf40a79de5fc4cbbfbe0d36f4a77b6", size = 57268, upload-time = "2025-09-17T00:32:19.474Z" },
|
{ url = "https://files.pythonhosted.org/packages/d1/81/ef2b1dfd1862567d573a4fdbc9f969067621764fbb74338496840a1d2977/pyopenssl-25.3.0-py3-none-any.whl", hash = "sha256:1fda6fc034d5e3d179d39e59c1895c9faeaf40a79de5fc4cbbfbe0d36f4a77b6", size = 57268, upload-time = "2025-09-17T00:32:19.474Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pypdfium2"
|
||||||
|
version = "5.6.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/3b/01/be763b9081c7eb823196e7d13d9c145bf75ac43f3c1466de81c21c24b381/pypdfium2-5.6.0.tar.gz", hash = "sha256:bcb9368acfe3547054698abbdae68ba0cbd2d3bda8e8ee437e061deef061976d", size = 270714, upload-time = "2026-03-08T01:05:06.5Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/6e/f6/9f9e190fe0e5a6b86b82f83bd8b5d3490348766062381140ca5cad8e00b1/pypdfium2-5.6.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e468c38997573f0e86f03273c2c1fbdea999de52ba43fee96acaa2f6b2ad35f7", size = 3412541, upload-time = "2026-03-08T01:04:25.45Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ee/8d/e57492cb2228ba56ed57de1ff044c8ac114b46905f8b1445c33299ba0488/pypdfium2-5.6.0-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:ad3abddc5805424f962e383253ccad6a0d1d2ebd86afa9a9e1b9ca659773cd0d", size = 3592320, upload-time = "2026-03-08T01:04:27.509Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f9/8a/8ab82e33e9c551494cbe1526ea250ca8cc4e9e98d6a4fc6b6f8d959aa1d1/pypdfium2-5.6.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6b5eb9eae5c45076395454522ca26add72ba8bd1fe473e1e4721aa58521470c", size = 3596450, upload-time = "2026-03-08T01:04:29.183Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f5/b5/602a792282312ccb158cc63849528079d94b0a11efdc61f2a359edfb41e9/pypdfium2-5.6.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:258624da8ef45cdc426e11b33e9d83f9fb723c1c201c6e0f4ab5a85966c6b876", size = 3325442, upload-time = "2026-03-08T01:04:30.886Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/81/1f/9e48ec05ed8d19d736c2d1f23c1bd0f20673f02ef846a2576c69e237f15d/pypdfium2-5.6.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9367451c8a00931d6612db0822525a18c06f649d562cd323a719e46ac19c9bb", size = 3727434, upload-time = "2026-03-08T01:04:33.619Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/33/90/0efd020928b4edbd65f4f3c2af0c84e20b43a3ada8fa6d04f999a97afe7a/pypdfium2-5.6.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a757869f891eac1cc1372e38a4aa01adac8abc8fe2a8a4e2ebf50595e3bf5937", size = 4139029, upload-time = "2026-03-08T01:04:36.08Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ff/49/a640b288a48dab1752281dd9b72c0679fccea107874e80a65a606b00efa9/pypdfium2-5.6.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:515be355222cc57ae9e62cd5c7c350b8e0c863efc539f80c7d75e2811ba45cb6", size = 3646387, upload-time = "2026-03-08T01:04:38.151Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/b0/3b/a344c19c01021eeb5d830c102e4fc9b1602f19c04aa7d11abbe2d188fd8e/pypdfium2-5.6.0-py3-none-manylinux_2_27_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1c4753c7caf7d004211d7f57a21f10d127f5e0e5510a14d24bc073e7220a3ea", size = 3097212, upload-time = "2026-03-08T01:04:40.776Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/50/96/e48e13789ace22aeb9b7510904a1b1493ec588196e11bbacc122da330b3d/pypdfium2-5.6.0-py3-none-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c49729090281fdd85775fb8912c10bd19e99178efaa98f145ab06e7ce68554d2", size = 2965026, upload-time = "2026-03-08T01:04:42.857Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/cb/06/3100e44d4935f73af8f5d633d3bd40f0d36d606027085a0ef1f0566a6320/pypdfium2-5.6.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a4a1749a8d4afd62924a8d95cfa4f2e26fc32957ce34ac3b674be6f127ed252e", size = 4131431, upload-time = "2026-03-08T01:04:44.982Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/64/ef/d8df63569ce9a66c8496057782eb8af78e0d28667922d62ec958434e3d4b/pypdfium2-5.6.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:36469ebd0fdffb7130ce45ed9c44f8232d91571c89eb851bd1633c64b6f6114f", size = 3747469, upload-time = "2026-03-08T01:04:46.702Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/a6/47/fd2c6a67a49fade1acd719fbd11f7c375e7219912923ef2de0ea0ac1544e/pypdfium2-5.6.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9da900df09be3cf546b637a127a7b6428fb22d705951d731269e25fd3adef457", size = 4337578, upload-time = "2026-03-08T01:04:49.007Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/6b/f5/836c83e54b01e09478c4d6bf4912651d6053c932250fcee953f5c72d8e4a/pypdfium2-5.6.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:45fccd5622233c5ec91a885770ae7dd4004d4320ac05a4ad8fa03a66dea40244", size = 4376104, upload-time = "2026-03-08T01:04:51.04Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/6e/7f/b940b6a1664daf8f9bad87c6c99b84effa3611615b8708d10392dc33036c/pypdfium2-5.6.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:282dc030e767cd61bd0299f9d581052b91188e2b87561489057a8e7963e7e0cb", size = 3929824, upload-time = "2026-03-08T01:04:53.544Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/88/79/00267d92a6a58c229e364d474f5698efe446e0c7f4f152f58d0138715e99/pypdfium2-5.6.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:a1c1dfe950382c76a7bba1ba160ec5e40df8dd26b04a1124ae268fda55bc4cbe", size = 4270201, upload-time = "2026-03-08T01:04:55.81Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e1/ab/b127f38aba41746bdf9ace15ba08411d7ef6ecba1326d529ba414eb1ed50/pypdfium2-5.6.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:43b0341ca6feb6c92e4b7a9eb4813e5466f5f5e8b6baeb14df0a94d5f312c00b", size = 4180793, upload-time = "2026-03-08T01:04:57.961Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyrefly"
|
name = "pyrefly"
|
||||||
version = "0.55.0"
|
version = "0.55.0"
|
||||||
@@ -5143,6 +5233,23 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/b1/5e/512aeb40fd819f4660d00f96f5c7371ee36fc8c6b605128c5ee59e0b28c6/u_msgpack_python-2.8.0-py2.py3-none-any.whl", hash = "sha256:1d853d33e78b72c4228a2025b4db28cda81214076e5b0422ed0ae1b1b2bb586a", size = 10590, upload-time = "2023-05-18T09:28:10.323Z" },
|
{ url = "https://files.pythonhosted.org/packages/b1/5e/512aeb40fd819f4660d00f96f5c7371ee36fc8c6b605128c5ee59e0b28c6/u_msgpack_python-2.8.0-py2.py3-none-any.whl", hash = "sha256:1d853d33e78b72c4228a2025b4db28cda81214076e5b0422ed0ae1b1b2bb586a", size = 10590, upload-time = "2023-05-18T09:28:10.323Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "uharfbuzz"
|
||||||
|
version = "0.53.3"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/1c/8d/7c82298bfa5c96f018541661bc2ccdf90dfe397bb2724db46725bf495466/uharfbuzz-0.53.3.tar.gz", hash = "sha256:9a87175c14d1361322ce2a3504e63c6b66062934a5edf47266aed5b33416806c", size = 1714488, upload-time = "2026-01-24T13:10:43.693Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/51/88/5df9337adb60d7b1ad150b162bbc5c56d783d15546714085d92b9531f8f3/uharfbuzz-0.53.3-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:d977e41a501d9e8af3f2c329d75031037ee79634bc29ca3872e9115c44e67d25", size = 2722639, upload-time = "2026-01-24T13:10:22.436Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/39/c4/8b4b050e77d6cb9a84af509e5796734f0e687bd02ad11757a581bd6f197d/uharfbuzz-0.53.3-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21d512c94aa992691aaf5b433deaca7e51f4ea54c68b99f535974073364f806f", size = 1647506, upload-time = "2026-01-24T13:10:24.16Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/30/ff/8e7cf78d525604f3e0a43b9468263fcf2acb5d208a3979c3bfa8dc61112d/uharfbuzz-0.53.3-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dca9a2e071c0c59ba8f382356f31a2518ac3dc7cc77e4f3519defc454c5b9a97", size = 1706448, upload-time = "2026-01-24T13:10:25.729Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/9b/a0/739471cdd52723ecc9fc80f36fb92c706a87265dc258521c1b14d99414f7/uharfbuzz-0.53.3-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1191a74ddcf18ec721161b6b33a8ab31b0c6a2b15c6724a9b663127bf7f07d2e", size = 2664628, upload-time = "2026-01-24T13:10:27.814Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ae/4a/63a81e9eef922b9f26bd948b518b73704d01a8d8e83324b2f99084ab7af0/uharfbuzz-0.53.3-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35ec3b600b3f63e7659792f9bd43e1ffb389d3d2aac8285f269d11efbe04787d", size = 2757384, upload-time = "2026-01-24T13:10:29.669Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e2/d2/27be1201488323d0ff0c99fb966a0522b2736f79bd5a5b7b99526fca3d98/uharfbuzz-0.53.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6f0ad2812303d2c7ccff596fd6c9d5629874f3a83f30255e11639c9b7ba4e89d", size = 1335822, upload-time = "2026-01-24T13:10:34.774Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/70/99/53e39bcd4dec5981eb70a6a76285a862c8a76b80cd52e8f40fe51adab032/uharfbuzz-0.53.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:757d9ed1841912e8f229319f335cf7dd25a2fd377e444bda9deb720617192e12", size = 1237560, upload-time = "2026-01-24T13:10:36.971Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/aa/2b/04d8cde466acfe70373d4f489da5c6eab0aba07d50442dd21217cb0fd167/uharfbuzz-0.53.3-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d3a0b824811bd1be129356818e6cdbf0e4b056bb60aa9a5eb270bff9d21f24c", size = 1497923, upload-time = "2026-01-24T13:10:38.743Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f3/01/a250521491bc995609275e0062c552b16f437a3ce15de83250176245093e/uharfbuzz-0.53.3-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9211d798b2921a99b8c34e810676137f66372d3b5447765b72d969bdfa6abe6a", size = 1556794, upload-time = "2026-01-24T13:10:40.262Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ujson"
|
name = "ujson"
|
||||||
version = "5.11.0"
|
version = "5.11.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user