mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-14 05:01:24 +00:00
Compare commits
11 Commits
dev
...
feature-ti
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b8069d24b1 | ||
|
|
da06dd2c09 | ||
|
|
bc01e000ad | ||
|
|
23b051b2ee | ||
|
|
644a0f3c6b | ||
|
|
dcf4402b15 | ||
|
|
89d00247f6 | ||
|
|
c16bcb7fef | ||
|
|
d0b95f2cda | ||
|
|
2b33617262 | ||
|
|
0a9c67e9b1 |
3
.github/dependabot.yml
vendored
3
.github/dependabot.yml
vendored
@@ -157,6 +157,9 @@ updates:
|
|||||||
postgres:
|
postgres:
|
||||||
patterns:
|
patterns:
|
||||||
- "docker.io/library/postgres*"
|
- "docker.io/library/postgres*"
|
||||||
|
greenmail:
|
||||||
|
patterns:
|
||||||
|
- "docker.io/greenmail*"
|
||||||
- package-ecosystem: "pre-commit" # See documentation for possible values
|
- package-ecosystem: "pre-commit" # See documentation for possible values
|
||||||
directory: "/" # Location of package manifests
|
directory: "/" # Location of package manifests
|
||||||
schedule:
|
schedule:
|
||||||
|
|||||||
@@ -18,13 +18,13 @@ services:
|
|||||||
- "--log-level=warn"
|
- "--log-level=warn"
|
||||||
- "--log-format=text"
|
- "--log-format=text"
|
||||||
tika:
|
tika:
|
||||||
image: docker.io/apache/tika:latest
|
image: docker.io/apache/tika:3.2.3.0
|
||||||
hostname: tika
|
hostname: tika
|
||||||
container_name: tika
|
container_name: tika
|
||||||
network_mode: host
|
network_mode: host
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
greenmail:
|
greenmail:
|
||||||
image: greenmail/standalone:2.1.8
|
image: docker.io/greenmail/standalone:2.1.8
|
||||||
hostname: greenmail
|
hostname: greenmail
|
||||||
container_name: greenmail
|
container_name: greenmail
|
||||||
environment:
|
environment:
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ from documents.utils import copy_basic_file_stats
|
|||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
from paperless_mail.parsers import MailDocumentParser
|
from paperless_mail.parsers import MailDocumentParser
|
||||||
|
|
||||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||||
@@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
|
|||||||
|
|
||||||
TODO(stumpylog): Remove me in the future
|
TODO(stumpylog): Remove me in the future
|
||||||
"""
|
"""
|
||||||
if isinstance(parser, TextDocumentParser):
|
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
|
||||||
parser.__exit__(None, None, None)
|
parser.__exit__(None, None, None)
|
||||||
else:
|
else:
|
||||||
parser.cleanup()
|
parser.cleanup()
|
||||||
@@ -448,6 +449,12 @@ class ConsumerPlugin(
|
|||||||
progress_callback=progress_callback,
|
progress_callback=progress_callback,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# New-style parsers use __enter__/__exit__ for resource management.
|
||||||
|
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
||||||
|
# TODO(stumpylog): Remove me in the future
|
||||||
|
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||||
|
document_parser.__enter__()
|
||||||
|
|
||||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||||
|
|
||||||
# Parse the document. This may take some time.
|
# Parse the document. This may take some time.
|
||||||
@@ -476,7 +483,7 @@ class ConsumerPlugin(
|
|||||||
self.filename,
|
self.filename,
|
||||||
self.input_doc.mailrule_id,
|
self.input_doc.mailrule_id,
|
||||||
)
|
)
|
||||||
elif isinstance(document_parser, TextDocumentParser):
|
elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||||
# TODO(stumpylog): Remove me in the future
|
# TODO(stumpylog): Remove me in the future
|
||||||
document_parser.parse(self.working_copy, mime_type)
|
document_parser.parse(self.working_copy, mime_type)
|
||||||
else:
|
else:
|
||||||
@@ -489,7 +496,7 @@ class ConsumerPlugin(
|
|||||||
ProgressStatusOptions.WORKING,
|
ProgressStatusOptions.WORKING,
|
||||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||||
)
|
)
|
||||||
if isinstance(document_parser, TextDocumentParser):
|
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||||
# TODO(stumpylog): Remove me in the future
|
# TODO(stumpylog): Remove me in the future
|
||||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type
|
|||||||
from documents.parsers import get_supported_file_extensions
|
from documents.parsers import get_supported_file_extensions
|
||||||
from documents.parsers import is_file_ext_supported
|
from documents.parsers import is_file_ext_supported
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class TestParserDiscovery(TestCase):
|
class TestParserDiscovery(TestCase):
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import tempfile
|
|||||||
import zipfile
|
import zipfile
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections import deque
|
from collections import deque
|
||||||
|
from contextlib import nullcontext
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from time import mktime
|
from time import mktime
|
||||||
@@ -225,6 +226,7 @@ from paperless.celery import app as celery_app
|
|||||||
from paperless.config import AIConfig
|
from paperless.config import AIConfig
|
||||||
from paperless.config import GeneralConfig
|
from paperless.config import GeneralConfig
|
||||||
from paperless.models import ApplicationConfiguration
|
from paperless.models import ApplicationConfiguration
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
from paperless.serialisers import GroupSerializer
|
from paperless.serialisers import GroupSerializer
|
||||||
from paperless.serialisers import UserSerializer
|
from paperless.serialisers import UserSerializer
|
||||||
from paperless.views import StandardPagination
|
from paperless.views import StandardPagination
|
||||||
@@ -1084,9 +1086,11 @@ class DocumentViewSet(
|
|||||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||||
if parser_class:
|
if parser_class:
|
||||||
parser = parser_class(progress_callback=None, logging_group=None)
|
parser = parser_class(progress_callback=None, logging_group=None)
|
||||||
|
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return parser.extract_metadata(file, mime_type)
|
with cm:
|
||||||
|
return parser.extract_metadata(file, mime_type)
|
||||||
except Exception: # pragma: no cover
|
except Exception: # pragma: no cover
|
||||||
logger.exception(f"Issue getting metadata for {file}")
|
logger.exception(f"Issue getting metadata for {file}")
|
||||||
# TODO: cover GPG errors, remove later.
|
# TODO: cover GPG errors, remove later.
|
||||||
|
|||||||
@@ -194,8 +194,10 @@ class ParserRegistry:
|
|||||||
at runtime regardless of registration order.
|
at runtime regardless of registration order.
|
||||||
"""
|
"""
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
self.register_builtin(TextDocumentParser)
|
self.register_builtin(TextDocumentParser)
|
||||||
|
self.register_builtin(TikaDocumentParser)
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Discovery
|
# Discovery
|
||||||
|
|||||||
440
src/paperless/parsers/tika.py
Normal file
440
src/paperless/parsers/tika.py
Normal file
@@ -0,0 +1,440 @@
|
|||||||
|
"""
Built-in Tika document parser.

Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
sending them to an Apache Tika server for text extraction and a Gotenberg
server for PDF conversion. Because the source formats cannot be rendered by
a browser natively, the parser always produces a PDF rendition for display.
"""

from __future__ import annotations

import logging
import shutil
import tempfile
from contextlib import ExitStack
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self

import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient

from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
from paperless.version import __full_version_str__

if TYPE_CHECKING:
    import datetime
    from types import TracebackType

    from paperless.parsers import MetadataEntry

logger = logging.getLogger("paperless.parsing.tika")

# MIME type -> preferred file extension for every format this parser accepts.
_SUPPORTED_MIME_TYPES: dict[str, str] = {
    "application/msword": ".doc",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.ms-excel": ".xls",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-powerpoint": ".ppt",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
    "application/vnd.oasis.opendocument.presentation": ".odp",
    "application/vnd.oasis.opendocument.spreadsheet": ".ods",
    "application/vnd.oasis.opendocument.text": ".odt",
    "application/vnd.oasis.opendocument.graphics": ".odg",
    "text/rtf": ".rtf",
}


class TikaDocumentParser:
    """Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.

    Text extraction is handled by the Tika server. PDF conversion for display
    is handled by Gotenberg (LibreOffice route). Because the source formats
    cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
    True and the PDF is always produced regardless of the ``produce_archive``
    flag passed to ``parse``.

    Both ``TikaClient`` and ``GotenbergClient`` are opened once in
    ``__enter__`` via an ``ExitStack`` and shared across ``parse``,
    ``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
    ``ExitStack.close()`` in ``__exit__``. The parser must always be used
    as a context manager.

    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Semantic version string, kept in sync with Paperless-ngx releases.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """

    name: str = "Paperless-ngx Tika Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return the MIME types this parser handles.

        Returns
        -------
        dict[str, str]
            Mapping of MIME type to preferred file extension. A shallow copy
            is returned so callers cannot mutate the module-level table.
        """
        return dict(_SUPPORTED_MIME_TYPES)

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return the priority score for handling this file.

        Returns ``None`` when Tika integration is disabled so the registry
        skips this parser entirely.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension.
        path:
            Optional filesystem path. Not inspected by this parser.

        Returns
        -------
        int | None
            10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
        """
        if not settings.TIKA_ENABLED:
            return None
        if mime_type in _SUPPORTED_MIME_TYPES:
            return 10
        return None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        Returns
        -------
        bool
            Always False — Tika produces a display PDF, not an OCR archive.
        """
        return False

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        Returns
        -------
        bool
            Always True — Office formats cannot be rendered natively in a
            browser, so a PDF conversion is always required for display.
        """
        return True

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def __init__(self, logging_group: object = None) -> None:
        # Keep the logging group for parity with other parsers; previously it
        # was accepted but silently discarded.
        self.logging_group = logging_group
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self._tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        self._text: str | None = None
        self._date: datetime.datetime | None = None
        self._archive_path: Path | None = None
        self._exit_stack = ExitStack()
        self._tika_client: TikaClient | None = None
        self._gotenberg_client: GotenbergClient | None = None

    def __enter__(self) -> Self:
        # Both HTTP clients live for the whole context; the ExitStack closes
        # them in reverse order on __exit__.
        self._tika_client = self._exit_stack.enter_context(
            TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ),
        )
        self._gotenberg_client = self._exit_stack.enter_context(
            GotenbergClient(
                host=settings.TIKA_GOTENBERG_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ),
        )
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        self._exit_stack.close()
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        shutil.rmtree(self._tempdir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Send the document to Tika for text extraction and Gotenberg for PDF.

        Because ``requires_pdf_rendition`` is True the PDF conversion is
        always performed — the ``produce_archive`` flag is intentionally
        ignored.

        Parameters
        ----------
        document_path:
            Absolute path to the document file to parse.
        mime_type:
            Detected MIME type of the document.
        produce_archive:
            Accepted for protocol compatibility but ignored; the PDF rendition
            is always produced since the source format cannot be displayed
            natively in the browser.

        Raises
        ------
        documents.parsers.ParseError
            If Tika or Gotenberg returns an error.
        """
        if TYPE_CHECKING:
            assert self._tika_client is not None

        logger.info("Sending %s to Tika server", document_path)

        try:
            try:
                parsed = self._tika_client.tika.as_text.from_file(
                    document_path,
                    mime_type,
                )
            except httpx.HTTPStatusError as err:
                # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                # Tika fails with some files as multi-part form data
                if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
                    parsed = self._tika_client.tika.as_text.from_buffer(
                        document_path.read_bytes(),
                        mime_type,
                    )
                else:  # pragma: no cover
                    raise
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

        self._text = parsed.content
        if self._text is not None:
            self._text = self._text.strip()

        self._date = parsed.created
        if self._date is not None and timezone.is_naive(self._date):
            self._date = timezone.make_aware(self._date)

        # Always convert — requires_pdf_rendition=True means the browser
        # cannot display the source format natively.
        self._archive_path = self._convert_to_pdf(document_path)

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str | None:
        """Return the plain-text content extracted during parse.

        Returns
        -------
        str | None
            Extracted text, or None if parse has not been called yet.
        """
        return self._text

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Creation date from Tika metadata, or None if not detected.
        """
        return self._date

    def get_archive_path(self) -> Path | None:
        """Return the path to the generated PDF rendition, or None.

        Returns
        -------
        Path | None
            Path to the PDF produced by Gotenberg, or None if parse has not
            been called yet.
        """
        return self._archive_path

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Generate a thumbnail from the PDF rendition of the document.

        Converts the document to PDF first if not already done.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail inside the temporary directory.
        """
        if self._archive_path is None:
            self._archive_path = self._convert_to_pdf(document_path)
        return make_thumbnail_from_pdf(self._archive_path, self._tempdir)

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in the document.

        Returns
        -------
        int | None
            Always None — page count is not available from Tika.
        """
        return None

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata via the Tika metadata endpoint.

        Returns
        -------
        list[MetadataEntry]
            All key/value pairs returned by Tika, or ``[]`` on error.
        """
        if TYPE_CHECKING:
            assert self._tika_client is not None

        try:
            parsed = self._tika_client.metadata.from_file(document_path, mime_type)
            return [
                {
                    "namespace": "",
                    "prefix": "",
                    "key": key,
                    "value": parsed.data[key],
                }
                for key in parsed.data
            ]
        except Exception as e:
            # Metadata is best-effort; log and return an empty list rather
            # than failing the request.
            logger.warning(
                "Error while fetching document metadata for %s: %s",
                document_path,
                e,
            )
            return []

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _convert_to_pdf(self, document_path: Path) -> Path:
        """Convert the document to PDF using Gotenberg's LibreOffice route.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.

        Returns
        -------
        Path
            Path to the generated PDF inside the temporary directory.

        Raises
        ------
        documents.parsers.ParseError
            If Gotenberg returns an error.
        """
        if TYPE_CHECKING:
            assert self._gotenberg_client is not None

        pdf_path = self._tempdir / "convert.pdf"

        logger.info("Converting %s to PDF as %s", document_path, pdf_path)

        with self._gotenberg_client.libre_office.to_pdf() as route:
            # Set the output format of the resulting PDF.
            # OutputTypeConfig reads the database-stored ApplicationConfiguration
            # first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
            output_type = OutputTypeConfig().output_type
            if output_type in {
                OutputTypeChoices.PDF_A,
                OutputTypeChoices.PDF_A2,
            }:
                route.pdf_format(PdfAFormat.A2b)
            elif output_type == OutputTypeChoices.PDF_A1:
                logger.warning(
                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                )
                route.pdf_format(PdfAFormat.A2b)
            elif output_type == OutputTypeChoices.PDF_A3:
                route.pdf_format(PdfAFormat.A3b)

            route.convert(document_path)

            try:
                response = route.run()
                pdf_path.write_bytes(response.content)
                return pdf_path
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err
|
||||||
@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
@@ -74,3 +75,86 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
|||||||
"""
|
"""
|
||||||
with TextDocumentParser() as parser:
|
with TextDocumentParser() as parser:
|
||||||
yield parser
|
yield parser
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
# Tika parser sample files
# ------------------------------------------------------------------


@pytest.fixture(scope="session")
def tika_samples_dir(samples_dir: Path) -> Path:
    """Absolute path to the Tika parser sample files directory.

    Returns
    -------
    Path
        ``<samples_dir>/tika/``
    """
    return samples_dir / "tika"


@pytest.fixture(scope="session")
def sample_odt_file(tika_samples_dir: Path) -> Path:
    """Path to a sample ODT file.

    Returns
    -------
    Path
        Absolute path to ``tika/sample.odt``.
    """
    return tika_samples_dir / "sample.odt"


@pytest.fixture(scope="session")
def sample_docx_file(tika_samples_dir: Path) -> Path:
    """Path to a sample DOCX file.

    Returns
    -------
    Path
        Absolute path to ``tika/sample.docx``.
    """
    return tika_samples_dir / "sample.docx"


@pytest.fixture(scope="session")
def sample_doc_file(tika_samples_dir: Path) -> Path:
    """Path to a sample DOC file.

    Returns
    -------
    Path
        Absolute path to ``tika/sample.doc``.
    """
    return tika_samples_dir / "sample.doc"


@pytest.fixture(scope="session")
def sample_broken_odt(tika_samples_dir: Path) -> Path:
    """Path to a broken ODT file that triggers the multi-part fallback.

    Returns
    -------
    Path
        Absolute path to ``tika/multi-part-broken.odt``.
    """
    return tika_samples_dir / "multi-part-broken.odt"


# ------------------------------------------------------------------
# Tika parser instance
# ------------------------------------------------------------------


@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """Yield a TikaDocumentParser and clean up its temporary directory afterwards.

    Yields
    ------
    TikaDocumentParser
        A ready-to-use parser instance.
    """
    # The parser's __exit__ closes its HTTP clients and removes its scratch
    # directory, so the context manager is the whole cleanup story.
    with TikaDocumentParser() as parser:
        yield parser
|
|||||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from documents.tests.utils import util_call_with_backoff
|
from documents.tests.utils import util_call_with_backoff
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
@@ -42,14 +42,15 @@ class TestTikaParserAgainstServer:
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
tika_parser.text
|
tika_parser.get_text()
|
||||||
== "This is an ODT test document, created September 14, 2022"
|
== "This is an ODT test document, created September 14, 2022"
|
||||||
)
|
)
|
||||||
assert tika_parser.archive_path is not None
|
archive = tika_parser.get_archive_path()
|
||||||
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
|
assert archive is not None
|
||||||
|
assert b"PDF-" in archive.read_bytes()[:10]
|
||||||
|
|
||||||
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
|
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
|
||||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||||
|
|
||||||
def test_basic_parse_docx(
|
def test_basic_parse_docx(
|
||||||
self,
|
self,
|
||||||
@@ -74,14 +75,15 @@ class TestTikaParserAgainstServer:
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
tika_parser.text
|
tika_parser.get_text()
|
||||||
== "This is an DOCX test document, also made September 14, 2022"
|
== "This is an DOCX test document, also made September 14, 2022"
|
||||||
)
|
)
|
||||||
assert tika_parser.archive_path is not None
|
archive = tika_parser.get_archive_path()
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
assert archive is not None
|
||||||
|
with archive.open("rb") as f:
|
||||||
assert b"PDF-" in f.read()[:10]
|
assert b"PDF-" in f.read()[:10]
|
||||||
|
|
||||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||||
|
|
||||||
def test_basic_parse_doc(
|
def test_basic_parse_doc(
|
||||||
self,
|
self,
|
||||||
@@ -102,13 +104,12 @@ class TestTikaParserAgainstServer:
|
|||||||
[sample_doc_file, "application/msword"],
|
[sample_doc_file, "application/msword"],
|
||||||
)
|
)
|
||||||
|
|
||||||
assert tika_parser.text is not None
|
text = tika_parser.get_text()
|
||||||
assert (
|
assert text is not None
|
||||||
"This is a test document, saved in the older .doc format"
|
assert "This is a test document, saved in the older .doc format" in text
|
||||||
in tika_parser.text
|
archive = tika_parser.get_archive_path()
|
||||||
)
|
assert archive is not None
|
||||||
assert tika_parser.archive_path is not None
|
with archive.open("rb") as f:
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
|
||||||
assert b"PDF-" in f.read()[:10]
|
assert b"PDF-" in f.read()[:10]
|
||||||
|
|
||||||
def test_tika_fails_multi_part(
|
def test_tika_fails_multi_part(
|
||||||
@@ -133,6 +134,7 @@ class TestTikaParserAgainstServer:
|
|||||||
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
|
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
|
||||||
)
|
)
|
||||||
|
|
||||||
assert tika_parser.archive_path is not None
|
archive = tika_parser.get_archive_path()
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
assert archive is not None
|
||||||
|
with archive.open("rb") as f:
|
||||||
assert b"PDF-" in f.read()[:10]
|
assert b"PDF-" in f.read()[:10]
|
||||||
@@ -9,7 +9,56 @@ from pytest_django.fixtures import SettingsWrapper
|
|||||||
from pytest_httpx import HTTPXMock
|
from pytest_httpx import HTTPXMock
|
||||||
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class TestTikaParserRegistryInterface:
|
||||||
|
"""Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
|
||||||
|
|
||||||
|
def test_satisfies_parser_protocol(self) -> None:
|
||||||
|
assert isinstance(TikaDocumentParser(), ParserProtocol)
|
||||||
|
|
||||||
|
def test_supported_mime_types_is_classmethod(self) -> None:
|
||||||
|
mime_types = TikaDocumentParser.supported_mime_types()
|
||||||
|
assert isinstance(mime_types, dict)
|
||||||
|
assert len(mime_types) > 0
|
||||||
|
|
||||||
|
def test_score_returns_none_when_tika_disabled(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.TIKA_ENABLED = False
|
||||||
|
result = TikaDocumentParser.score(
|
||||||
|
"application/vnd.oasis.opendocument.text",
|
||||||
|
"sample.odt",
|
||||||
|
)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_score_returns_int_when_tika_enabled(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.TIKA_ENABLED = True
|
||||||
|
result = TikaDocumentParser.score(
|
||||||
|
"application/vnd.oasis.opendocument.text",
|
||||||
|
"sample.odt",
|
||||||
|
)
|
||||||
|
assert isinstance(result, int)
|
||||||
|
|
||||||
|
def test_score_returns_none_for_unsupported_mime(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
) -> None:
|
||||||
|
settings.TIKA_ENABLED = True
|
||||||
|
result = TikaDocumentParser.score("application/pdf", "doc.pdf")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_can_produce_archive_is_false(self) -> None:
|
||||||
|
assert TikaDocumentParser().can_produce_archive is False
|
||||||
|
|
||||||
|
def test_requires_pdf_rendition_is_true(self) -> None:
|
||||||
|
assert TikaDocumentParser().requires_pdf_rendition is True
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db()
|
@pytest.mark.django_db()
|
||||||
@@ -36,12 +85,12 @@ class TestTikaParser:
|
|||||||
|
|
||||||
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
|
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
|
||||||
|
|
||||||
assert tika_parser.text == "the content"
|
assert tika_parser.get_text() == "the content"
|
||||||
assert tika_parser.archive_path is not None
|
assert tika_parser.get_archive_path() is not None
|
||||||
with Path(tika_parser.archive_path).open("rb") as f:
|
with Path(tika_parser.get_archive_path()).open("rb") as f:
|
||||||
assert f.read() == b"PDF document"
|
assert f.read() == b"PDF document"
|
||||||
|
|
||||||
assert tika_parser.date == datetime.datetime(
|
assert tika_parser.get_date() == datetime.datetime(
|
||||||
2020,
|
2020,
|
||||||
11,
|
11,
|
||||||
21,
|
21,
|
||||||
@@ -89,7 +138,7 @@ class TestTikaParser:
|
|||||||
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||||
|
|
||||||
with pytest.raises(ParseError):
|
with pytest.raises(ParseError):
|
||||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
tika_parser._convert_to_pdf(sample_odt_file)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("setting_value", "expected_form_value"),
|
("setting_value", "expected_form_value"),
|
||||||
@@ -106,7 +155,6 @@ class TestTikaParser:
|
|||||||
expected_form_value: str,
|
expected_form_value: str,
|
||||||
httpx_mock: HTTPXMock,
|
httpx_mock: HTTPXMock,
|
||||||
settings: SettingsWrapper,
|
settings: SettingsWrapper,
|
||||||
tika_parser: TikaDocumentParser,
|
|
||||||
sample_odt_file: Path,
|
sample_odt_file: Path,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -117,6 +165,8 @@ class TestTikaParser:
|
|||||||
THEN:
|
THEN:
|
||||||
- Request to Gotenberg contains the expected PDF/A format string
|
- Request to Gotenberg contains the expected PDF/A format string
|
||||||
"""
|
"""
|
||||||
|
# Parser must be created after the setting is changed so that
|
||||||
|
# OutputTypeConfig reads the correct value at __init__ time.
|
||||||
settings.OCR_OUTPUT_TYPE = setting_value
|
settings.OCR_OUTPUT_TYPE = setting_value
|
||||||
httpx_mock.add_response(
|
httpx_mock.add_response(
|
||||||
status_code=codes.OK,
|
status_code=codes.OK,
|
||||||
@@ -124,7 +174,8 @@ class TestTikaParser:
|
|||||||
method="POST",
|
method="POST",
|
||||||
)
|
)
|
||||||
|
|
||||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
with TikaDocumentParser() as parser:
|
||||||
|
parser._convert_to_pdf(sample_odt_file)
|
||||||
|
|
||||||
request = httpx_mock.get_request()
|
request = httpx_mock.get_request()
|
||||||
|
|
||||||
@@ -1,10 +1,12 @@
|
|||||||
def get_parser(*args, **kwargs):
|
def get_parser(*args, **kwargs):
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
# The new TextDocumentParser does not accept the legacy logging_group /
|
# TextDocumentParser accepts logging_group for constructor compatibility but
|
||||||
# progress_callback kwargs injected by the old signal-based consumer.
|
# does not store or use it (no legacy DocumentParser base class).
|
||||||
# These are dropped here; Phase 4 will replace this signal path with the
|
# progress_callback is also not used. Both may arrive as a positional arg
|
||||||
# new ParserRegistry so the shim can be removed at that point.
|
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||||
|
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||||
|
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||||
kwargs.pop("logging_group", None)
|
kwargs.pop("logging_group", None)
|
||||||
kwargs.pop("progress_callback", None)
|
kwargs.pop("progress_callback", None)
|
||||||
return TextDocumentParser()
|
return TextDocumentParser()
|
||||||
|
|||||||
@@ -1,136 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
from django.conf import settings
|
|
||||||
from django.utils import timezone
|
|
||||||
from gotenberg_client import GotenbergClient
|
|
||||||
from gotenberg_client.options import PdfAFormat
|
|
||||||
from tika_client import TikaClient
|
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import ParseError
|
|
||||||
from documents.parsers import make_thumbnail_from_pdf
|
|
||||||
from paperless.config import OutputTypeConfig
|
|
||||||
from paperless.models import OutputTypeChoices
|
|
||||||
|
|
||||||
|
|
||||||
class TikaDocumentParser(DocumentParser):
|
|
||||||
"""
|
|
||||||
This parser sends documents to a local tika server
|
|
||||||
"""
|
|
||||||
|
|
||||||
logging_name = "paperless.parsing.tika"
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
|
||||||
if not self.archive_path:
|
|
||||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
|
||||||
|
|
||||||
return make_thumbnail_from_pdf(
|
|
||||||
self.archive_path,
|
|
||||||
self.tempdir,
|
|
||||||
self.logging_group,
|
|
||||||
)
|
|
||||||
|
|
||||||
def extract_metadata(self, document_path, mime_type):
|
|
||||||
try:
|
|
||||||
with TikaClient(
|
|
||||||
tika_url=settings.TIKA_ENDPOINT,
|
|
||||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
|
||||||
) as client:
|
|
||||||
parsed = client.metadata.from_file(document_path, mime_type)
|
|
||||||
return [
|
|
||||||
{
|
|
||||||
"namespace": "",
|
|
||||||
"prefix": "",
|
|
||||||
"key": key,
|
|
||||||
"value": parsed.data[key],
|
|
||||||
}
|
|
||||||
for key in parsed.data
|
|
||||||
]
|
|
||||||
except Exception as e:
|
|
||||||
self.log.warning(
|
|
||||||
f"Error while fetching document metadata for {document_path}: {e}",
|
|
||||||
)
|
|
||||||
return []
|
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
|
|
||||||
self.log.info(f"Sending {document_path} to Tika server")
|
|
||||||
|
|
||||||
try:
|
|
||||||
with TikaClient(
|
|
||||||
tika_url=settings.TIKA_ENDPOINT,
|
|
||||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
|
||||||
) as client:
|
|
||||||
try:
|
|
||||||
parsed = client.tika.as_text.from_file(document_path, mime_type)
|
|
||||||
except httpx.HTTPStatusError as err:
|
|
||||||
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
|
|
||||||
# Tika fails with some files as multi-part form data
|
|
||||||
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
|
||||||
parsed = client.tika.as_text.from_buffer(
|
|
||||||
document_path.read_bytes(),
|
|
||||||
mime_type,
|
|
||||||
)
|
|
||||||
else: # pragma: no cover
|
|
||||||
raise
|
|
||||||
except Exception as err:
|
|
||||||
raise ParseError(
|
|
||||||
f"Could not parse {document_path} with tika server at "
|
|
||||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
|
||||||
) from err
|
|
||||||
|
|
||||||
self.text = parsed.content
|
|
||||||
if self.text is not None:
|
|
||||||
self.text = self.text.strip()
|
|
||||||
|
|
||||||
self.date = parsed.created
|
|
||||||
if self.date is not None and timezone.is_naive(self.date):
|
|
||||||
self.date = timezone.make_aware(self.date)
|
|
||||||
|
|
||||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
|
||||||
|
|
||||||
def convert_to_pdf(self, document_path: Path, file_name):
|
|
||||||
pdf_path = Path(self.tempdir) / "convert.pdf"
|
|
||||||
|
|
||||||
self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
|
|
||||||
|
|
||||||
with (
|
|
||||||
GotenbergClient(
|
|
||||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
|
||||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
|
||||||
) as client,
|
|
||||||
client.libre_office.to_pdf() as route,
|
|
||||||
):
|
|
||||||
# Set the output format of the resulting PDF
|
|
||||||
if settings.OCR_OUTPUT_TYPE in {
|
|
||||||
OutputTypeChoices.PDF_A,
|
|
||||||
OutputTypeChoices.PDF_A2,
|
|
||||||
}:
|
|
||||||
route.pdf_format(PdfAFormat.A2b)
|
|
||||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
|
|
||||||
self.log.warning(
|
|
||||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
|
||||||
)
|
|
||||||
route.pdf_format(PdfAFormat.A2b)
|
|
||||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
|
|
||||||
route.pdf_format(PdfAFormat.A3b)
|
|
||||||
|
|
||||||
route.convert(document_path)
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = route.run()
|
|
||||||
|
|
||||||
pdf_path.write_bytes(response.content)
|
|
||||||
|
|
||||||
return pdf_path
|
|
||||||
|
|
||||||
except Exception as err:
|
|
||||||
raise ParseError(
|
|
||||||
f"Error while converting document to PDF: {err}",
|
|
||||||
) from err
|
|
||||||
|
|
||||||
def get_settings(self) -> OutputTypeConfig:
|
|
||||||
"""
|
|
||||||
This parser only uses the PDF output type configuration currently
|
|
||||||
"""
|
|
||||||
return OutputTypeConfig()
|
|
||||||
@@ -1,7 +1,15 @@
|
|||||||
def get_parser(*args, **kwargs):
|
def get_parser(*args, **kwargs):
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
return TikaDocumentParser(*args, **kwargs)
|
# TikaDocumentParser accepts logging_group for constructor compatibility but
|
||||||
|
# does not store or use it (no legacy DocumentParser base class).
|
||||||
|
# progress_callback is also not used. Both may arrive as a positional arg
|
||||||
|
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||||
|
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||||
|
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||||
|
kwargs.pop("logging_group", None)
|
||||||
|
kwargs.pop("progress_callback", None)
|
||||||
|
return TikaDocumentParser()
|
||||||
|
|
||||||
|
|
||||||
def tika_consumer_declaration(sender, **kwargs):
|
def tika_consumer_declaration(sender, **kwargs):
|
||||||
|
|||||||
@@ -1,41 +0,0 @@
|
|||||||
from collections.abc import Generator
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from paperless_tika.parsers import TikaDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
|
|
||||||
try:
|
|
||||||
parser = TikaDocumentParser(logging_group=None)
|
|
||||||
yield parser
|
|
||||||
finally:
|
|
||||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
|
||||||
parser.cleanup()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def sample_dir() -> Path:
|
|
||||||
return (Path(__file__).parent / Path("samples")).resolve()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def sample_odt_file(sample_dir: Path) -> Path:
|
|
||||||
return sample_dir / "sample.odt"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def sample_docx_file(sample_dir: Path) -> Path:
|
|
||||||
return sample_dir / "sample.docx"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def sample_doc_file(sample_dir: Path) -> Path:
|
|
||||||
return sample_dir / "sample.doc"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def sample_broken_odt(sample_dir: Path) -> Path:
|
|
||||||
return sample_dir / "multi-part-broken.odt"
|
|
||||||
Reference in New Issue
Block a user