mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-10 03:01:23 +00:00
Compare commits
13 Commits
dependabot
...
feature-pa
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7eb417e796 | ||
|
|
8c08362ebc | ||
|
|
c37ab946e1 | ||
|
|
82068303d0 | ||
|
|
cc8e9a7108 | ||
|
|
1870f69053 | ||
|
|
053d590cb8 | ||
|
|
987aa363dc | ||
|
|
b8f63026f7 | ||
|
|
3a232f0c8f | ||
|
|
404ef6b40d | ||
|
|
8c40491034 | ||
|
|
0f6bdaf5de |
@@ -1,6 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from celery import Celery
|
from celery import Celery
|
||||||
|
from celery.signals import worker_process_init
|
||||||
|
|
||||||
# Set the default Django settings module for the 'celery' program.
|
# Set the default Django settings module for the 'celery' program.
|
||||||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
|
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
|
||||||
@@ -15,3 +16,18 @@ app.config_from_object("django.conf:settings", namespace="CELERY")
|
|||||||
|
|
||||||
# Load task modules from all registered Django apps.
|
# Load task modules from all registered Django apps.
|
||||||
app.autodiscover_tasks()
|
app.autodiscover_tasks()
|
||||||
|
|
||||||
|
|
||||||
|
@worker_process_init.connect
def on_worker_process_init(**kwargs) -> None:
    """Eagerly register the built-in parsers in every Celery worker process.

    Connected to Celery's ``worker_process_init`` signal.  Only the built-in
    parsers are registered here; no entrypoint discovery is performed, so a
    worker can start consuming documents right away.  Loading third-party
    parsers is left to the first ``get_parser_registry()`` call made from
    inside a task, which keeps this handler well inside the 4-second budget
    that ``worker_process_init`` handlers are given.

    Parameters
    ----------
    **kwargs:
        Keyword arguments supplied by the signal dispatcher; unused.
    """
    # Local import: resolved when the signal fires, not when this module is
    # first imported.
    from paperless.parsers import registry as parser_registry

    parser_registry.init_builtin_parsers()
|
||||||
|
|||||||
379
src/paperless/parsers/__init__.py
Normal file
379
src/paperless/parsers/__init__.py
Normal file
@@ -0,0 +1,379 @@
|
|||||||
|
"""
|
||||||
|
Public interface for the Paperless-ngx parser plugin system.
|
||||||
|
|
||||||
|
This module defines ParserProtocol — the structural contract that every
|
||||||
|
document parser must satisfy, whether it is a built-in parser shipped with
|
||||||
|
Paperless-ngx or a third-party parser installed via a Python entrypoint.
|
||||||
|
|
||||||
|
Phase 1/2 scope: only the Protocol is defined here. The transitional
|
||||||
|
DocumentParser ABC (Phase 3) and concrete built-in parsers (Phase 3+) will
|
||||||
|
be added in later phases, so there are intentionally no imports of parser
|
||||||
|
implementations here.
|
||||||
|
|
||||||
|
Usage example (third-party parser)::
|
||||||
|
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
|
||||||
|
class MyParser:
|
||||||
|
name = "my-parser"
|
||||||
|
version = "1.0.0"
|
||||||
|
author = "Acme Corp"
|
||||||
|
url = "https://example.com/my-parser"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
|
return {"application/x-my-format": ".myf"}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(cls, mime_type, filename, path=None):
|
||||||
|
return 10
|
||||||
|
|
||||||
|
# … implement remaining protocol methods …
|
||||||
|
|
||||||
|
assert isinstance(MyParser(), ParserProtocol)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Protocol
|
||||||
|
from typing import Self
|
||||||
|
from typing import TypedDict
|
||||||
|
from typing import runtime_checkable
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MetadataEntry",
|
||||||
|
"ParserProtocol",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataEntry(TypedDict):
    """One metadata field pulled out of a document.

    Every key is mandatory.  Values are always strings: a parser is expected
    to serialise dates, integers, lists and the like itself before handing
    the entry back.
    """

    namespace: str
    """Namespace URI the field belongs to (e.g. 'http://ns.adobe.com/pdf/1.3/')."""

    prefix: str
    """Customary short prefix for the namespace (e.g. 'pdf', 'xmp', 'dc')."""

    key: str
    """Name of the field inside its namespace (e.g. 'Author', 'CreateDate')."""

    value: str
    """The field value, rendered as a string."""
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
|
||||||
|
class ParserProtocol(Protocol):
|
||||||
|
"""Structural contract for all Paperless-ngx document parsers.
|
||||||
|
|
||||||
|
Both built-in parsers and third-party plugins (discovered via the
|
||||||
|
"paperless_ngx.parsers" entrypoint group) must satisfy this Protocol.
|
||||||
|
Because it is decorated with runtime_checkable, isinstance(obj,
|
||||||
|
ParserProtocol) works at runtime based on method presence, which is
|
||||||
|
useful for validation in ParserRegistry.discover.
|
||||||
|
|
||||||
|
Parsers must expose four string attributes at the class level so the
|
||||||
|
registry can log attribution information without instantiating the parser:
|
||||||
|
|
||||||
|
name : str
|
||||||
|
Human-readable parser name (e.g. "Tesseract OCR").
|
||||||
|
version : str
|
||||||
|
Semantic version string (e.g. "1.2.3").
|
||||||
|
author : str
|
||||||
|
Author or organisation name.
|
||||||
|
url : str
|
||||||
|
URL for documentation, source code, or issue tracker.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Class-level identity (checked by the registry, not Protocol methods)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
name: str
|
||||||
|
version: str
|
||||||
|
author: str
|
||||||
|
url: str
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Class methods
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
|
"""Return a mapping of supported MIME types to preferred file extensions.
|
||||||
|
|
||||||
|
The keys are MIME type strings (e.g. "application/pdf"), and the
|
||||||
|
values are the preferred file extension including the leading dot
|
||||||
|
(e.g. ".pdf"). The registry uses this mapping both to decide whether
|
||||||
|
a parser is a candidate for a given file and to determine the default
|
||||||
|
extension when creating archive copies.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
dict[str, str]
|
||||||
|
{mime_type: extension} mapping — may be empty if the parser
|
||||||
|
has been temporarily disabled.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def score(
|
||||||
|
cls,
|
||||||
|
mime_type: str,
|
||||||
|
filename: str,
|
||||||
|
path: Path | None = None,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return a priority score for handling this file, or None to decline.
|
||||||
|
|
||||||
|
The registry calls this after confirming that the MIME type is in
|
||||||
|
supported_mime_types. Parsers may inspect filename and optionally
|
||||||
|
the file at path to refine their confidence level.
|
||||||
|
|
||||||
|
A higher score wins. Return None to explicitly decline handling a file
|
||||||
|
even though the MIME type is listed as supported (e.g. when a feature
|
||||||
|
flag is disabled, or a required service is not configured).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
mime_type:
|
||||||
|
The detected MIME type of the file to be parsed.
|
||||||
|
filename:
|
||||||
|
The original filename, including extension.
|
||||||
|
path:
|
||||||
|
Optional filesystem path to the file. Parsers that need to
|
||||||
|
inspect file content (e.g. magic-byte sniffing) may use this.
|
||||||
|
May be None when scoring happens before the file is available locally.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
Priority score (higher wins), or None to decline.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Properties
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
"""Whether this parser can produce a searchable PDF archive copy.
|
||||||
|
|
||||||
|
If True, the consumption pipeline may request an archive version when
|
||||||
|
processing the document, subject to the ARCHIVE_FILE_GENERATION
|
||||||
|
setting. If False, only thumbnail and text extraction are performed.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_pdf_rendition(self) -> bool:
|
||||||
|
"""Whether the parser must produce a PDF for the frontend to display.
|
||||||
|
|
||||||
|
True for formats the browser cannot display natively (e.g. DOCX, ODT).
|
||||||
|
When True, the pipeline always stores the PDF output regardless of the
|
||||||
|
ARCHIVE_FILE_GENERATION setting, since the original format cannot be
|
||||||
|
shown to the user.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Core parsing interface
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def parse(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
*,
|
||||||
|
produce_archive: bool = True,
|
||||||
|
) -> None:
|
||||||
|
"""Parse document_path and populate internal state.
|
||||||
|
|
||||||
|
After a successful call, callers retrieve results via get_text,
|
||||||
|
get_date, and get_archive_path.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the document file to parse.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
produce_archive:
|
||||||
|
When True (the default) and can_produce_archive is also True,
|
||||||
|
the parser should produce a searchable PDF at the path returned
|
||||||
|
by get_archive_path. Pass False when only text extraction and
|
||||||
|
thumbnail generation are required and disk I/O should be minimised.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
documents.parsers.ParseError
|
||||||
|
If parsing fails for any reason.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Result accessors
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
"""Return the plain-text content extracted during parse.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str | None
|
||||||
|
Extracted text, or None if no text could be found.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
def get_date(self) -> datetime.datetime | None:
|
||||||
|
"""Return the document date detected during parse.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
datetime.datetime | None
|
||||||
|
Detected document date, or None if no date was found.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
"""Return the path to the generated archive PDF, or None.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path | None
|
||||||
|
Path to the searchable PDF archive, or None if no archive was
|
||||||
|
produced (e.g. because produce_archive=False or the parser does
|
||||||
|
not support archive generation).
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Thumbnail and metadata
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
|
"""Generate and return the path to a thumbnail image for the document.
|
||||||
|
|
||||||
|
May be called independently of parse. The returned path must point to
|
||||||
|
an existing WebP image file inside the parser's temporary working
|
||||||
|
directory.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Path to the generated thumbnail image (WebP format preferred).
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
def get_page_count(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> int | None:
|
||||||
|
"""Return the number of pages in the document, if determinable.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the source document.
|
||||||
|
mime_type:
|
||||||
|
Detected MIME type of the document.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
int | None
|
||||||
|
Page count, or None if the parser cannot determine it.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> list[MetadataEntry]:
|
||||||
|
"""Extract format-specific metadata from the document.
|
||||||
|
|
||||||
|
Called by the API view layer on demand — not during the consumption
|
||||||
|
pipeline. Results are returned to the frontend for per-file display.
|
||||||
|
|
||||||
|
For documents with an archive version, this method is called twice:
|
||||||
|
once for the original file (with its native MIME type) and once for
|
||||||
|
the archive file (with ``"application/pdf"``). Parsers that produce
|
||||||
|
archives should handle both cases.
|
||||||
|
|
||||||
|
Implementations must not raise. A failure to read metadata is not
|
||||||
|
fatal — log a warning and return whatever partial results were
|
||||||
|
collected, or ``[]`` if none.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
document_path:
|
||||||
|
Absolute path to the file to extract metadata from.
|
||||||
|
mime_type:
|
||||||
|
MIME type of the file at ``document_path``. May be
|
||||||
|
``"application/pdf"`` when called for the archive version.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list[MetadataEntry]
|
||||||
|
Zero or more metadata entries. Returns ``[]`` if no metadata
|
||||||
|
could be extracted or the format does not support it.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Context manager
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
"""Enter the parser context, returning the parser instance.
|
||||||
|
|
||||||
|
Implementations should perform any resource allocation here if not
|
||||||
|
done in __init__ (e.g. creating API clients or temp directories).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Self
|
||||||
|
The parser instance itself.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: type[BaseException] | None,
|
||||||
|
exc_val: BaseException | None,
|
||||||
|
exc_tb: TracebackType | None,
|
||||||
|
) -> None:
|
||||||
|
"""Exit the parser context and release all resources.
|
||||||
|
|
||||||
|
Implementations must clean up all temporary files and other resources
|
||||||
|
regardless of whether an exception occurred.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
exc_type:
|
||||||
|
The exception class, or None if no exception was raised.
|
||||||
|
exc_val:
|
||||||
|
The exception instance, or None.
|
||||||
|
exc_tb:
|
||||||
|
The traceback, or None.
|
||||||
|
"""
|
||||||
|
...
|
||||||
365
src/paperless/parsers/registry.py
Normal file
365
src/paperless/parsers/registry.py
Normal file
@@ -0,0 +1,365 @@
|
|||||||
|
"""
|
||||||
|
Singleton registry that tracks all document parsers available to
|
||||||
|
Paperless-ngx — both built-ins shipped with the application and third-party
|
||||||
|
plugins installed via Python entrypoints.
|
||||||
|
|
||||||
|
Public surface
|
||||||
|
--------------
|
||||||
|
get_parser_registry
|
||||||
|
Lazy-initialise and return the shared ParserRegistry. This is the primary
|
||||||
|
entry point for production code.
|
||||||
|
|
||||||
|
init_builtin_parsers
|
||||||
|
Register built-in parsers only, without entrypoint discovery. Safe to
|
||||||
|
call from Celery worker_process_init where importing all entrypoints
|
||||||
|
would be wasteful or cause side effects.
|
||||||
|
|
||||||
|
reset_parser_registry
|
||||||
|
Reset module-level state. For tests only.
|
||||||
|
|
||||||
|
Entrypoint group
|
||||||
|
----------------
|
||||||
|
Third-party parsers must advertise themselves under the
|
||||||
|
"paperless_ngx.parsers" entrypoint group in their pyproject.toml::
|
||||||
|
|
||||||
|
[project.entry-points."paperless_ngx.parsers"]
|
||||||
|
my_parser = "my_package.parsers:MyParser"
|
||||||
|
|
||||||
|
The loaded class must expose the following attributes at the class level
|
||||||
|
(not just on instances) for the registry to accept it:
|
||||||
|
name, version, author, url, supported_mime_types (callable), score (callable).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from importlib.metadata import entry_points
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsers.registry")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-level singleton state
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Shared registry instance; stays None until one of the accessor functions
# in this module creates it.
_registry: ParserRegistry | None = None
# Whether entrypoint discovery has already been run on the shared registry.
_discovery_complete: bool = False

# Attribute names that every registered external parser class must expose.
_REQUIRED_ATTRS: tuple[str, ...] = (
    # Identity / attribution fields.
    "name",
    "version",
    "author",
    "url",
    # Class-level hooks the registry calls when resolving a parser.
    "supported_mime_types",
    "score",
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-level accessor functions
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser_registry() -> ParserRegistry:
    """Return the shared ParserRegistry instance, creating it on demand.

    If no registry exists yet, this function:

    1. Creates a new ParserRegistry.
    2. Calls register_defaults to install built-in parsers.

    If entrypoint discovery has not run yet (including when the registry was
    created earlier without discovery), it then:

    3. Calls discover to load third-party plugins via importlib.metadata
       entrypoints.
    4. Calls log_summary to emit a startup summary.

    Later calls return the same instance immediately.

    Returns
    -------
    ParserRegistry
        The shared registry singleton.
    """
    global _registry, _discovery_complete

    if _registry is None:
        # Build into a local and publish only once the defaults are in place.
        # If register_defaults() raises, the module-level singleton stays
        # None, so the next call retries instead of silently returning a
        # registry that has no built-in parsers.
        registry = ParserRegistry()
        registry.register_defaults()
        _registry = registry

    if not _discovery_complete:
        _registry.discover()
        _registry.log_summary()
        # Set last, so a failure above leaves discovery to be retried.
        _discovery_complete = True

    return _registry
|
||||||
|
|
||||||
|
|
||||||
|
def init_builtin_parsers() -> None:
    """Register built-in parsers without performing entrypoint discovery.

    Intended for use in Celery worker_process_init handlers where importing
    all installed entrypoints would be wasteful, slow, or could produce
    undesirable side effects.  Entrypoint discovery (third-party plugins) is
    deliberately not performed, so the summary logged here is produced
    before any third-party parser has been looked for.

    Safe to call multiple times — once the shared registry exists,
    subsequent calls are no-ops.

    Returns
    -------
    None
    """
    global _registry

    if _registry is not None:
        return

    # Build into a local and publish only once the defaults are in place.
    # If register_defaults() raises, the module-level singleton stays None,
    # so a later call retries instead of leaving behind a registry that has
    # no built-in parsers.
    registry = ParserRegistry()
    registry.register_defaults()
    _registry = registry
    registry.log_summary()
|
||||||
|
|
||||||
|
|
||||||
|
def reset_parser_registry() -> None:
    """Put the module-level registry state back to its initial values.

    Both _registry and _discovery_complete are cleared, so the next
    get_parser_registry call rebuilds everything from scratch.

    FOR TESTS ONLY.  Never call this from production code: resetting the
    registry mid-request forces every later parser lookup back through
    discovery, which is expensive and may have unexpected side effects in
    multi-threaded environments.

    Returns
    -------
    None
    """
    global _registry, _discovery_complete

    _registry, _discovery_complete = None, False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Registry class
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class ParserRegistry:
    """Registry that maps MIME types to the best available parser class.

    Parsers are partitioned into two lists:

    _builtins
        Parser classes registered via register_builtin (populated by
        register_defaults).

    _external
        Parser classes loaded from installed Python entrypoints via discover.

    When resolving a parser for a file, external parsers are evaluated
    alongside built-in parsers using a uniform scoring mechanism.  Both lists
    are iterated together; the class with the highest score wins.  If an
    external parser wins, its attribution details are logged so users can
    identify which third-party package handled their document.
    """

    # Required attributes that must also be callable: get_parser_for_file
    # calls both of them on every candidate class.
    _CALLABLE_ATTRS: tuple[str, ...] = ("supported_mime_types", "score")

    def __init__(self) -> None:
        self._external: list[type[ParserProtocol]] = []
        self._builtins: list[type[ParserProtocol]] = []

    # ------------------------------------------------------------------
    # Registration
    # ------------------------------------------------------------------

    def register_builtin(self, parser_class: type[ParserProtocol]) -> None:
        """Register a built-in parser class.

        The class is appended to the _builtins list without validation.
        Built-in parsers are never overridden by external parsers; instead,
        scoring determines which parser wins for any given file.

        Parameters
        ----------
        parser_class:
            The parser class to register.  Must satisfy ParserProtocol.
        """
        self._builtins.append(parser_class)

    def register_defaults(self) -> None:
        """Register the built-in parsers that ship with Paperless-ngx.

        Each parser that has been migrated to the new ParserProtocol
        interface is registered here.  Parsers are added in ascending weight
        order so that log output is predictable; scoring determines which
        parser wins at runtime regardless of registration order.
        """
        # Local import: resolved only when the defaults are registered.
        from paperless.parsers.text import TextDocumentParser

        self.register_builtin(TextDocumentParser)

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------

    @classmethod
    def _non_callable_attrs(cls, parser_class: type) -> list[str]:
        """Return the names in _CALLABLE_ATTRS that are not callable on the class.

        An attribute that is absent counts as not callable.
        """
        return [
            attr
            for attr in cls._CALLABLE_ATTRS
            if not callable(getattr(parser_class, attr, None))
        ]

    def discover(self) -> None:
        """Load third-party parsers from the "paperless_ngx.parsers" entrypoint group.

        For each advertised entrypoint the method:

        1. Calls ep.load() to import the class.
        2. Validates that the class exposes all required attributes, and
           that supported_mime_types and score are callable.
        3. On success, appends the class to _external and logs an info
           message.
        4. On failure (import error, missing attributes, or a non-callable
           hook), logs an appropriate warning/error and continues to the
           next entrypoint.

        Errors during discovery of a single parser do not prevent other
        parsers from being loaded.

        Returns
        -------
        None
        """
        eps = entry_points(group="paperless_ngx.parsers")

        for ep in eps:
            try:
                parser_class = ep.load()
            except Exception:
                # Plugin code is third-party: whatever it raises on import
                # must not take the other parsers down with it.
                logger.exception(
                    "Failed to load parser entrypoint '%s' — skipping.",
                    ep.name,
                )
                continue

            missing = [
                attr for attr in _REQUIRED_ATTRS if not hasattr(parser_class, attr)
            ]
            if missing:
                logger.warning(
                    "Parser loaded from entrypoint '%s' is missing required "
                    "attributes %r — skipping.",
                    ep.name,
                    missing,
                )
                continue

            # Presence alone is not enough: get_parser_for_file calls these
            # hooks, so a non-callable value would only fail later, while a
            # document is being resolved.
            not_callable = self._non_callable_attrs(parser_class)
            if not_callable:
                logger.warning(
                    "Parser loaded from entrypoint '%s' has required "
                    "attributes %r that are not callable — skipping.",
                    ep.name,
                    not_callable,
                )
                continue

            self._external.append(parser_class)
            logger.info(
                "Loaded third-party parser '%s' v%s by %s (entrypoint: '%s').",
                parser_class.name,
                parser_class.version,
                parser_class.author,
                ep.name,
            )

    # ------------------------------------------------------------------
    # Summary logging
    # ------------------------------------------------------------------

    def log_summary(self) -> None:
        """Log a startup summary of all registered parsers.

        Built-in parsers are listed first, followed by any external parsers
        discovered from entrypoints.  If the external list is empty, a short
        informational message is logged instead of an empty list.

        Returns
        -------
        None
        """
        logger.info(
            "Built-in parsers (%d):",
            len(self._builtins),
        )
        for cls in self._builtins:
            # getattr with a fallback: built-ins are registered without
            # validation, so the identity attributes may be absent.
            logger.info(
                "  [built-in] %s v%s — %s",
                getattr(cls, "name", repr(cls)),
                getattr(cls, "version", "unknown"),
                getattr(cls, "url", "built-in"),
            )

        if not self._external:
            logger.info("No third-party parsers discovered.")
            return

        logger.info(
            "Third-party parsers (%d):",
            len(self._external),
        )
        for cls in self._external:
            logger.info(
                "  [external] %s v%s by %s — report issues at %s",
                getattr(cls, "name", repr(cls)),
                getattr(cls, "version", "unknown"),
                getattr(cls, "author", "unknown"),
                getattr(cls, "url", "unknown"),
            )

    # ------------------------------------------------------------------
    # Parser resolution
    # ------------------------------------------------------------------

    def get_parser_for_file(
        self,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> type[ParserProtocol] | None:
        """Return the best parser class for the given file, or None.

        All registered parsers (external first, then built-ins) are
        evaluated against the file.  A parser is eligible if mime_type
        appears in the dict returned by its supported_mime_types
        classmethod, and its score classmethod returns a non-None integer.

        The parser with the highest score wins.  When two parsers return the
        same score, the one that appears earlier in the evaluation order
        wins (external parsers are evaluated before built-ins, giving
        third-party packages a chance to override defaults at equal
        priority).

        When an external parser is selected, its identity is logged at INFO
        level so operators can trace which package handled a document.

        Parameters
        ----------
        mime_type:
            The detected MIME type of the file.
        filename:
            The original filename, including extension.
        path:
            Optional filesystem path to the file.  Forwarded to each
            parser's score method.

        Returns
        -------
        type[ParserProtocol] | None
            The winning parser class, or None if no parser can handle the
            file.
        """
        best_score: int | None = None
        best_parser: type[ParserProtocol] | None = None

        # External parsers are placed first so that, at equal scores, an
        # external parser wins over a built-in (first-seen policy).
        for parser_class in (*self._external, *self._builtins):
            if mime_type not in parser_class.supported_mime_types():
                continue

            score = parser_class.score(mime_type, filename, path)
            if score is None:
                continue

            # Strict ">" keeps the first parser seen when scores tie.
            if best_score is None or score > best_score:
                best_score = score
                best_parser = parser_class

        if best_parser is not None and best_parser in self._external:
            logger.info(
                "Document handled by third-party parser '%s' v%s — %s",
                getattr(best_parser, "name", repr(best_parser)),
                getattr(best_parser, "version", "unknown"),
                getattr(best_parser, "url", "unknown"),
            )

        return best_parser
|
||||||
320
src/paperless/parsers/text.py
Normal file
320
src/paperless/parsers/text.py
Normal file
@@ -0,0 +1,320 @@
|
|||||||
|
"""
|
||||||
|
Built-in plain-text document parser.
|
||||||
|
|
||||||
|
Handles text/plain, text/csv, and application/csv MIME types by reading the
|
||||||
|
file content directly. Thumbnails are generated by rendering a page-sized
|
||||||
|
WebP image from the first 100,000 characters using Pillow.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from PIL import Image
|
||||||
|
from PIL import ImageDraw
|
||||||
|
from PIL import ImageFont
|
||||||
|
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
|
||||||
|
# Logger used by the text parser for cleanup and decode-fallback messages.
logger = logging.getLogger("paperless.parsing.text")

# MIME type -> preferred file extension for every format the text parser
# accepts.  Both CSV MIME spellings map to the same extension.
_SUPPORTED_MIME_TYPES: dict[str, str] = {
    "text/plain": ".txt",
    "text/csv": ".csv",
    "application/csv": ".csv",
}
|
||||||
|
|
||||||
|
|
||||||
|
class TextDocumentParser:
    """Parse plain-text documents (txt, csv) for Paperless-ngx.

    This parser reads the file content directly as UTF-8 text and renders a
    simple thumbnail using Pillow.  It does not perform OCR and does not
    produce a searchable PDF archive copy.

    Each instance creates its own scratch directory below
    ``settings.SCRATCH_DIR``; use the parser as a context manager so that
    the directory is removed again on exit.

    Class attributes
    ----------------
    name : str
        Human-readable parser name.
    version : str
        Semantic version string, kept in sync with Paperless-ngx releases.
    author : str
        Maintainer name.
    url : str
        Issue tracker / source URL.
    """

    name: str = "Paperless-ngx Text Parser"
    version: str = __full_version_str__
    author: str = "Paperless-ngx Contributors"
    url: str = "https://github.com/paperless-ngx/paperless-ngx"

    # ------------------------------------------------------------------
    # Class methods
    # ------------------------------------------------------------------

    @classmethod
    def supported_mime_types(cls) -> dict[str, str]:
        """Return the MIME types this parser handles.

        Returns
        -------
        dict[str, str]
            Mapping of MIME type to preferred file extension.  A fresh copy
            is returned on every call so callers cannot mutate the
            module-level table that ``score`` relies on.
        """
        return dict(_SUPPORTED_MIME_TYPES)

    @classmethod
    def score(
        cls,
        mime_type: str,
        filename: str,
        path: Path | None = None,
    ) -> int | None:
        """Return the priority score for handling this file.

        Parameters
        ----------
        mime_type:
            Detected MIME type of the file.
        filename:
            Original filename including extension.  Not inspected by this
            parser.
        path:
            Optional filesystem path.  Not inspected by this parser.

        Returns
        -------
        int | None
            10 if the MIME type is supported, otherwise None.
        """
        if mime_type in _SUPPORTED_MIME_TYPES:
            return 10
        return None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def can_produce_archive(self) -> bool:
        """Whether this parser can produce a searchable PDF archive copy.

        Returns
        -------
        bool
            Always False — the text parser does not produce a PDF archive.
        """
        return False

    @property
    def requires_pdf_rendition(self) -> bool:
        """Whether the parser must produce a PDF for the frontend to display.

        Returns
        -------
        bool
            Always False — plain text files are displayable as-is.
        """
        return False

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def __init__(self, logging_group: object = None) -> None:
        """Create the parser and its private scratch directory.

        Parameters
        ----------
        logging_group:
            Accepted for call compatibility; this parser does not use it.
        """
        # Make sure the scratch root exists before asking for a unique
        # sub-directory inside it.
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
        self._tempdir = Path(
            tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
        )
        # Populated by parse(); stays None until then.
        self._text: str | None = None

    def __enter__(self) -> Self:
        """Return the parser itself for use inside a ``with`` block."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Remove the scratch directory; never suppresses an exception."""
        logger.debug("Cleaning up temporary directory %s", self._tempdir)
        # Best-effort removal: a failure to delete must not mask the
        # exception (if any) that is leaving the ``with`` block.
        shutil.rmtree(self._tempdir, ignore_errors=True)

    # ------------------------------------------------------------------
    # Core parsing interface
    # ------------------------------------------------------------------

    def parse(
        self,
        document_path: Path,
        mime_type: str,
        *,
        produce_archive: bool = True,
    ) -> None:
        """Read the document and store its text content.

        Parameters
        ----------
        document_path:
            Absolute path to the text file.
        mime_type:
            Detected MIME type of the document.  Not inspected.
        produce_archive:
            Ignored — this parser never produces a PDF archive.

        Raises
        ------
        OSError
            If the file cannot be opened or read.  The error is propagated
            unchanged; only invalid UTF-8 is handled (by replacement).
        """
        self._text = self._read_text(document_path)

    # ------------------------------------------------------------------
    # Result accessors
    # ------------------------------------------------------------------

    def get_text(self) -> str | None:
        """Return the plain-text content extracted during parse.

        Returns
        -------
        str | None
            Extracted text, or None if parse has not been called yet.
        """
        return self._text

    def get_date(self) -> datetime.datetime | None:
        """Return the document date detected during parse.

        Returns
        -------
        datetime.datetime | None
            Always None — the text parser does not detect dates.
        """
        return None

    def get_archive_path(self) -> Path | None:
        """Return the path to a generated archive PDF, or None.

        Returns
        -------
        Path | None
            Always None — the text parser does not produce a PDF archive.
        """
        return None

    # ------------------------------------------------------------------
    # Thumbnail and metadata
    # ------------------------------------------------------------------

    def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
        """Render the first portion of the document as a WebP thumbnail.

        The text is drawn in black on a white 500x700 canvas.  Files larger
        than 50 MiB are not opened at all; a placeholder message is drawn
        instead.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.  Not inspected.

        Returns
        -------
        Path
            Path to the generated WebP thumbnail inside the temporary directory.
        """
        max_chars = 100_000
        file_size_limit = 50 * 1024 * 1024

        # Normalise once so stat() and open() are guaranteed to act on the
        # same Path object even if a caller hands in a plain string.
        source = Path(document_path)

        if source.stat().st_size > file_size_limit:
            text = "[File too large to preview]"
        else:
            # errors="replace" keeps thumbnailing alive for non-UTF-8 input.
            with source.open("r", encoding="utf-8", errors="replace") as f:
                text = f.read(max_chars)

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.Layout.BASIC,
        )
        draw.multiline_text((5, 5), text, font=font, fill="black", spacing=4)

        out_path = self._tempdir / "thumb.webp"
        img.save(out_path, format="WEBP")

        return out_path

    def get_page_count(
        self,
        document_path: Path,
        mime_type: str,
    ) -> int | None:
        """Return the number of pages in the document.

        Parameters
        ----------
        document_path:
            Absolute path to the source document.
        mime_type:
            Detected MIME type of the document.

        Returns
        -------
        int | None
            Always None — page count is not meaningful for plain text.
        """
        return None

    def extract_metadata(
        self,
        document_path: Path,
        mime_type: str,
    ) -> list[MetadataEntry]:
        """Extract format-specific metadata from the document.

        Returns
        -------
        list[MetadataEntry]
            Always ``[]`` — plain text files carry no structured metadata.
        """
        return []

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _read_text(self, filepath: Path) -> str:
        """Read file content, replacing invalid UTF-8 bytes rather than failing.

        Parameters
        ----------
        filepath:
            Path to the file to read.

        Returns
        -------
        str
            File content as a string.  Invalid byte sequences appear as the
            Unicode replacement character (U+FFFD).
        """
        try:
            return filepath.read_text(encoding="utf-8")
        except UnicodeDecodeError as exc:
            logger.warning(
                "Unicode error reading %s, replacing bad bytes: %s",
                filepath,
                exc,
            )
            # Re-read in text mode (not read_bytes().decode()) so the
            # fallback goes through the same text-mode newline handling as
            # the strict path above and as get_thumbnail().
            return filepath.read_text(encoding="utf-8", errors="replace")
|
||||||
48
src/paperless/tests/conftest.py
Normal file
48
src/paperless/tests/conftest.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
"""
|
||||||
|
Fixtures defined here are available to every test module under
|
||||||
|
src/paperless/tests/ (including sub-packages such as parsers/).
|
||||||
|
|
||||||
|
Session-scoped fixtures for the shared samples directory live here so
|
||||||
|
sub-package conftest files can reference them without duplicating path logic.
|
||||||
|
Parser-specific fixtures (concrete parser instances, format-specific sample
|
||||||
|
files) live in paperless/tests/parsers/conftest.py.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers.registry import reset_parser_registry
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def samples_dir() -> Path:
    """Resolve the root directory holding the shared parser sample files.

    The directory is the ``samples`` folder located next to this conftest
    module.  Sub-package conftest files build format-specific paths on top
    of it, e.g. ``samples_dir / "text" / "test.txt"``.

    Returns
    -------
    Path
        Absolute, resolved path to the sample documents directory.
    """
    here = Path(__file__).parent
    return here.joinpath("samples").resolve()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def clean_registry() -> Generator[None, None, None]:
    """Give every test a pristine parser registry.

    The registry is reset once during setup and once more during teardown,
    so state created through get_parser_registry() or
    init_builtin_parsers() never leaks from one test into the next.
    """
    # Setup: drop anything a previous test (or import) left behind.
    reset_parser_registry()

    yield

    # Teardown: drop whatever this test registered.
    reset_parser_registry()
|
||||||
0
src/paperless/tests/parsers/__init__.py
Normal file
0
src/paperless/tests/parsers/__init__.py
Normal file
76
src/paperless/tests/parsers/conftest.py
Normal file
76
src/paperless/tests/parsers/conftest.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""
|
||||||
|
Parser fixtures that are used across multiple test modules in this package
|
||||||
|
are defined here. Format-specific sample-file fixtures are grouped by parser
|
||||||
|
so it is easy to see which files belong to which test module.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Generator
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Text parser sample files
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def text_samples_dir(samples_dir: Path) -> Path:
    """Locate the directory that holds the text parser sample files.

    Returns
    -------
    Path
        ``<samples_dir>/text/``
    """
    return samples_dir.joinpath("text")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def sample_txt_file(text_samples_dir: Path) -> Path:
    """Locate the well-formed UTF-8 plain-text sample.

    Returns
    -------
    Path
        Absolute path to ``text/test.txt``.
    """
    return text_samples_dir.joinpath("test.txt")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def malformed_txt_file(text_samples_dir: Path) -> Path:
    """Locate the sample text file that contains invalid UTF-8 bytes.

    Returns
    -------
    Path
        Absolute path to ``text/decode_error.txt``.
    """
    return text_samples_dir.joinpath("decode_error.txt")
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Text parser instance
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
    """Provide a TextDocumentParser whose scratch directory is cleaned up.

    The parser is entered as a context manager for the duration of the
    test; leaving the ``with`` block on teardown removes its temporary
    directory.

    Yields
    ------
    TextDocumentParser
        A ready-to-use parser instance.
    """
    with TextDocumentParser() as instance:
        yield instance
|
||||||
256
src/paperless/tests/parsers/test_text_parser.py
Normal file
256
src/paperless/tests/parsers/test_text_parser.py
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
"""
|
||||||
|
Tests for paperless.parsers.text.TextDocumentParser.
|
||||||
|
|
||||||
|
All tests use the context-manager protocol for parser lifecycle. Sample
|
||||||
|
files are provided by session-scoped fixtures defined in conftest.py.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextParserProtocol:
    """Check TextDocumentParser against the ParserProtocol contract."""

    def test_isinstance_satisfies_protocol(
        self,
        text_parser: TextDocumentParser,
    ) -> None:
        """A parser instance passes the runtime Protocol isinstance check."""
        assert isinstance(text_parser, ParserProtocol)

    def test_class_attributes_present(self) -> None:
        """name, version, author and url are all non-empty strings."""
        for attribute in ("name", "version", "author", "url"):
            value = getattr(TextDocumentParser, attribute)
            assert isinstance(value, str) and value

    def test_supported_mime_types_returns_dict(self) -> None:
        """The advertised MIME types come back as a dict with all three keys."""
        supported = TextDocumentParser.supported_mime_types()

        assert isinstance(supported, dict)
        for mime_type in ("text/plain", "text/csv", "application/csv"):
            assert mime_type in supported

    @pytest.mark.parametrize(
        ("mime_type", "expected"),
        [
            ("text/plain", 10),
            ("text/csv", 10),
            ("application/csv", 10),
            ("application/pdf", None),
            ("image/png", None),
        ],
    )
    def test_score(self, mime_type: str, expected: int | None) -> None:
        """Supported MIME types score 10; everything else scores None."""
        actual = TextDocumentParser.score(mime_type, "file.txt")

        assert actual == expected

    def test_can_produce_archive_is_false(
        self,
        text_parser: TextDocumentParser,
    ) -> None:
        """The text parser never offers a PDF archive."""
        flag = text_parser.can_produce_archive

        assert flag is False

    def test_requires_pdf_rendition_is_false(
        self,
        text_parser: TextDocumentParser,
    ) -> None:
        """The text parser never requires a PDF rendition."""
        flag = text_parser.requires_pdf_rendition

        assert flag is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextParserLifecycle:
    """Check the context-manager protocol and scratch-directory cleanup."""

    def test_context_manager_cleans_up_tempdir(self) -> None:
        """The scratch directory exists inside the block and is gone after."""
        with TextDocumentParser() as parser:
            scratch = parser._tempdir
            assert scratch.exists()

        assert not scratch.exists()

    def test_context_manager_cleans_up_after_exception(self) -> None:
        """Cleanup still happens when the block is left via an exception."""
        scratch: Path | None = None

        # Both managers in one statement: the parser is exited first, then
        # pytest.raises sees (and swallows) the RuntimeError.
        with pytest.raises(RuntimeError), TextDocumentParser() as parser:
            scratch = parser._tempdir
            raise RuntimeError("boom")

        assert scratch is not None
        assert not scratch.exists()
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextParserParse:
    """Exercise parse() together with the result accessors."""

    def test_parse_valid_utf8(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """A well-formed UTF-8 file is returned verbatim by get_text()."""
        text_parser.parse(sample_txt_file, "text/plain")
        extracted = text_parser.get_text()

        assert extracted == "This is a test file.\n"

    def test_parse_returns_none_for_archive_path(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """No archive path is available even after a successful parse."""
        text_parser.parse(sample_txt_file, "text/plain")
        archive = text_parser.get_archive_path()

        assert archive is None

    def test_parse_returns_none_for_date(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """No date is detected even after a successful parse."""
        text_parser.parse(sample_txt_file, "text/plain")
        detected = text_parser.get_date()

        assert detected is None

    def test_parse_invalid_utf8_bytes_replaced(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ) -> None:
        """
        GIVEN:
            - A text file containing invalid UTF-8 byte sequences
        WHEN:
            - The file is parsed
        THEN:
            - Parsing succeeds
            - Invalid bytes are replaced with the Unicode replacement character
        """
        text_parser.parse(malformed_txt_file, "text/plain")
        extracted = text_parser.get_text()

        assert extracted == "Pantothens\ufffdure\n"

    def test_get_text_none_before_parse(
        self,
        text_parser: TextDocumentParser,
    ) -> None:
        """get_text() reports None until parse() has been called."""
        extracted = text_parser.get_text()

        assert extracted is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextParserThumbnail:
    """Verify thumbnail generation."""

    def test_thumbnail_exists_and_is_file(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """A thumbnail for a small sample is written to disk as a file."""
        thumb = text_parser.get_thumbnail(sample_txt_file, "text/plain")

        assert thumb.exists()
        assert thumb.is_file()

    def test_thumbnail_large_file_does_not_read_all(
        self,
        text_parser: TextDocumentParser,
        tmp_path: Path,
    ) -> None:
        """
        GIVEN:
            - A text file larger than 50 MB
        WHEN:
            - A thumbnail is requested
        THEN:
            - The thumbnail is generated without loading the full file
        """
        # tmp_path keeps the fixture file out of the shared system temp
        # directory; the finally-block below removes the 51 MiB file right
        # away, even when writing it fails part-way through.
        large_file = tmp_path / "large.txt"
        one_mib = "A" * (1024 * 1024)

        try:
            # Write in 1 MiB chunks instead of materialising a single
            # 51 MiB string in memory.
            with large_file.open("w", encoding="utf-8") as handle:
                for _ in range(51):
                    handle.write(one_mib)

            thumb = text_parser.get_thumbnail(large_file, "text/plain")

            assert thumb.exists()
            assert thumb.is_file()
        finally:
            large_file.unlink(missing_ok=True)

    def test_get_page_count_returns_none(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """Plain text has no page count."""
        assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextParserMetadata:
    """Exercise extract_metadata()."""

    def test_extract_metadata_returns_empty_list(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """No metadata entries are reported for a plain-text file."""
        entries = text_parser.extract_metadata(sample_txt_file, "text/plain")

        assert entries == []

    def test_extract_metadata_returns_list_type(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """The return value is a real list, not merely something falsy."""
        entries = text_parser.extract_metadata(sample_txt_file, "text/plain")

        assert isinstance(entries, list)

    def test_extract_metadata_ignores_mime_type(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """extract_metadata returns [] regardless of the mime_type argument."""
        as_pdf = text_parser.extract_metadata(sample_txt_file, "application/pdf")
        assert as_pdf == []

        as_csv = text_parser.extract_metadata(sample_txt_file, "text/csv")
        assert as_csv == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextParserRegistry:
    """Check that TextDocumentParser is part of the default registry."""

    def test_registered_in_defaults(self) -> None:
        """register_defaults() on a fresh registry adds the text parser."""
        from paperless.parsers.registry import ParserRegistry

        fresh = ParserRegistry()
        fresh.register_defaults()

        assert TextDocumentParser in fresh._builtins

    def test_get_parser_for_text_plain(self) -> None:
        """text/plain resolves to the text parser via the shared registry."""
        from paperless.parsers.registry import get_parser_registry

        chosen = get_parser_registry().get_parser_for_file(
            "text/plain",
            "doc.txt",
        )

        assert chosen is TextDocumentParser

    def test_get_parser_for_text_csv(self) -> None:
        """text/csv resolves to the text parser via the shared registry."""
        from paperless.parsers.registry import get_parser_registry

        chosen = get_parser_registry().get_parser_for_file(
            "text/csv",
            "data.csv",
        )

        assert chosen is TextDocumentParser

    def test_get_parser_for_unknown_type_returns_none(self) -> None:
        """A MIME type nobody claims yields no parser class."""
        from paperless.parsers.registry import get_parser_registry

        chosen = get_parser_registry().get_parser_for_file(
            "application/pdf",
            "doc.pdf",
        )

        assert chosen is None
|
||||||
1
src/paperless/tests/samples/text/decode_error.txt
Normal file
1
src/paperless/tests/samples/text/decode_error.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Pantothensäure
|
||||||
1
src/paperless/tests/samples/text/test.txt
Normal file
1
src/paperless/tests/samples/text/test.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
This is a test file.
|
||||||
710
src/paperless/tests/test_registry.py
Normal file
710
src/paperless/tests/test_registry.py
Normal file
@@ -0,0 +1,710 @@
|
|||||||
|
"""
|
||||||
|
Tests for :mod:`paperless.parsers` (ParserProtocol) and
|
||||||
|
:mod:`paperless.parsers.registry` (ParserRegistry + module-level helpers).
|
||||||
|
|
||||||
|
All tests use pytest-style functions/classes — no unittest.TestCase.
|
||||||
|
The ``clean_registry`` fixture ensures complete isolation between tests by
|
||||||
|
resetting the module-level singleton before and after every test.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from importlib.metadata import EntryPoint
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless.parsers import ParserProtocol
|
||||||
|
from paperless.parsers.registry import ParserRegistry
|
||||||
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
from paperless.parsers.registry import init_builtin_parsers
|
||||||
|
from paperless.parsers.registry import reset_parser_registry
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def dummy_parser_cls() -> type:
    """Build a minimal class that fully satisfies :class:`ParserProtocol`.

    GIVEN: A need to exercise registry and Protocol logic with a minimal
           but complete parser.
    WHEN:  A test requests this fixture.
    THEN:  A class with all required attributes and methods is returned.
    """

    class DummyParser:
        # Descriptive attributes required of every parser.
        name = "dummy-parser"
        version = "0.1.0"
        author = "Test Author"
        url = "https://example.com/dummy-parser"

        # -- class-level capability queries ---------------------------

        @classmethod
        def supported_mime_types(cls) -> dict[str, str]:
            return {"text/plain": ".txt"}

        @classmethod
        def score(
            cls,
            mime_type: str,
            filename: str,
            path: Path | None = None,
        ) -> int | None:
            # Same score for every input.
            return 10

        # -- instance properties --------------------------------------

        @property
        def can_produce_archive(self) -> bool:
            return False

        @property
        def requires_pdf_rendition(self) -> bool:
            return False

        # -- parsing and result accessors (all inert) -----------------

        def parse(
            self,
            document_path: Path,
            mime_type: str,
            *,
            produce_archive: bool = True,
        ) -> None:
            return None

        def get_text(self) -> str | None:
            return None

        def get_date(self) -> None:
            return None

        def get_archive_path(self) -> Path | None:
            return None

        def get_thumbnail(
            self,
            document_path: Path,
            mime_type: str,
        ) -> Path:
            return Path("/tmp/thumbnail.webp")

        def get_page_count(
            self,
            document_path: Path,
            mime_type: str,
        ) -> int | None:
            return None

        def extract_metadata(
            self,
            document_path: Path,
            mime_type: str,
        ) -> list:
            return []

        # -- context-manager protocol ---------------------------------

        def __enter__(self) -> Self:
            return self

        def __exit__(self, exc_type, exc_val, exc_tb) -> None:
            return None

    return DummyParser
|
||||||
|
|
||||||
|
|
||||||
|
class TestParserProtocol:
    """Exercise runtime isinstance() checks against ParserProtocol."""

    def test_compliant_class_instance_passes_isinstance(
        self,
        dummy_parser_cls: type,
    ) -> None:
        """
        GIVEN: A class that implements every method required by ParserProtocol.
        WHEN:  isinstance() is called with the Protocol.
        THEN:  The check passes (returns True).
        """
        candidate = dummy_parser_cls()

        assert isinstance(candidate, ParserProtocol)

    def test_non_compliant_class_instance_fails_isinstance(self) -> None:
        """
        GIVEN: A plain class with no parser-related methods.
        WHEN:  isinstance() is called with ParserProtocol.
        THEN:  The check fails (returns False).
        """

        class Unrelated:
            pass

        candidate = Unrelated()

        assert not isinstance(candidate, ParserProtocol)

    @pytest.mark.parametrize(
        "missing_method",
        [
            pytest.param("parse", id="missing-parse"),
            pytest.param("get_text", id="missing-get_text"),
            pytest.param("get_thumbnail", id="missing-get_thumbnail"),
            pytest.param("__enter__", id="missing-__enter__"),
            pytest.param("__exit__", id="missing-__exit__"),
        ],
    )
    def test_partial_compliant_fails_isinstance(
        self,
        dummy_parser_cls: type,
        missing_method: str,
    ) -> None:
        """
        GIVEN: A class that satisfies ParserProtocol except for one method.
        WHEN:  isinstance() is called with ParserProtocol.
        THEN:  The check fails because the Protocol is not fully satisfied.
        """
        # Derive a subclass in which the chosen method is shadowed by None,
        # so the attribute is still present but no longer callable.
        overrides = {missing_method: None}
        partial_cls = type("PartialParser", (dummy_parser_cls,), overrides)

        candidate = partial_cls()

        assert not isinstance(candidate, ParserProtocol)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRegistrySingleton:
    """Exercise the module-level singleton lifecycle helpers."""

    def test_get_parser_registry_returns_instance(self) -> None:
        """
        GIVEN: No registry has been created yet.
        WHEN:  get_parser_registry() is called.
        THEN:  A ParserRegistry instance is returned.
        """
        assert isinstance(get_parser_registry(), ParserRegistry)

    def test_get_parser_registry_same_instance_on_repeated_calls(self) -> None:
        """
        GIVEN: A registry instance was created by a prior call.
        WHEN:  get_parser_registry() is called a second time.
        THEN:  The exact same object (identity) is returned.
        """
        initial = get_parser_registry()
        repeated = get_parser_registry()

        assert initial is repeated

    def test_reset_parser_registry_gives_fresh_instance(self) -> None:
        """
        GIVEN: A registry instance already exists.
        WHEN:  reset_parser_registry() is called and then get_parser_registry()
               is called again.
        THEN:  A new, distinct registry instance is returned.
        """
        before_reset = get_parser_registry()

        reset_parser_registry()
        after_reset = get_parser_registry()

        assert before_reset is not after_reset

    def test_init_builtin_parsers_does_not_run_discover(
        self,
        monkeypatch: pytest.MonkeyPatch,
    ) -> None:
        """
        GIVEN: discover() would raise an exception if called.
        WHEN:  init_builtin_parsers() is called.
        THEN:  No exception is raised, confirming discover() was not invoked.
        """

        def _fail_if_called(self) -> None:
            message = "discover() must not be called from init_builtin_parsers"
            raise RuntimeError(message)

        monkeypatch.setattr(ParserRegistry, "discover", _fail_if_called)

        # Completing without a RuntimeError is the assertion.
        init_builtin_parsers()

    def test_init_builtin_parsers_idempotent(self) -> None:
        """
        GIVEN: init_builtin_parsers() has already been called once.
        WHEN:  init_builtin_parsers() is called a second time.
        THEN:  No error is raised and the same registry instance is reused.
        """
        # Look at the module attribute directly so the check does not go
        # through get_parser_registry().
        import paperless.parsers.registry as reg_module

        init_builtin_parsers()
        after_first_call = reg_module._registry

        init_builtin_parsers()
        after_second_call = reg_module._registry

        assert after_second_call is after_first_call
|
||||||
|
|
||||||
|
|
||||||
|
class TestParserRegistryGetParserForFile:
    """Cover how get_parser_for_file() chooses a parser class (or None)."""

    def test_returns_none_when_no_parsers_registered(self) -> None:
        """
        GIVEN: An empty registry.
        WHEN: get_parser_for_file() is asked about a MIME type.
        THEN: The lookup yields None.
        """
        empty_registry = ParserRegistry()
        assert empty_registry.get_parser_for_file("text/plain", "doc.txt") is None

    def test_returns_none_for_unsupported_mime_type(
        self,
        dummy_parser_cls: type,
    ) -> None:
        """
        GIVEN: A registry whose only parser handles 'text/plain'.
        WHEN: get_parser_for_file() is asked about 'application/pdf'.
        THEN: The lookup yields None.
        """
        reg = ParserRegistry()
        reg.register_builtin(dummy_parser_cls)
        assert reg.get_parser_for_file("application/pdf", "file.pdf") is None

    def test_returns_parser_for_supported_mime_type(
        self,
        dummy_parser_cls: type,
    ) -> None:
        """
        GIVEN: A registry holding a parser for 'text/plain'.
        WHEN: get_parser_for_file() is asked about 'text/plain'.
        THEN: That registered parser class comes back.
        """
        reg = ParserRegistry()
        reg.register_builtin(dummy_parser_cls)
        chosen = reg.get_parser_for_file("text/plain", "readme.txt")
        assert chosen is dummy_parser_cls

    def test_highest_score_wins(self) -> None:
        """
        GIVEN: Two 'text/plain' parsers scoring 5 and 20 respectively.
        WHEN: get_parser_for_file() is asked about 'text/plain'.
        THEN: The one scoring 20 is selected.
        """

        class LowScoreParser:
            name = "low"
            version = "1.0"
            author = "A"
            url = "https://example.com/low"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return 5

        class HighScoreParser:
            name = "high"
            version = "1.0"
            author = "B"
            url = "https://example.com/high"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return 20

        reg = ParserRegistry()
        # The lower-scoring parser is registered first.
        reg.register_builtin(LowScoreParser)
        reg.register_builtin(HighScoreParser)
        chosen = reg.get_parser_for_file("text/plain", "readme.txt")
        assert chosen is HighScoreParser

    def test_parser_returning_none_score_is_skipped(self) -> None:
        """
        GIVEN: The only parser answers score() with None for the file.
        WHEN: get_parser_for_file() is called.
        THEN: The parser is passed over; with no other candidate, None results.
        """

        class DecliningParser:
            name = "declining"
            version = "1.0"
            author = "A"
            url = "https://example.com"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                # A None score means "I do not want this file".
                return None

        reg = ParserRegistry()
        reg.register_builtin(DecliningParser)
        assert reg.get_parser_for_file("text/plain", "readme.txt") is None

    def test_all_parsers_decline_returns_none(self) -> None:
        """
        GIVEN: Every candidate (built-in and external) scores None.
        WHEN: get_parser_for_file() is called.
        THEN: None is returned.
        """

        class AlwaysDeclines:
            name = "declines"
            version = "1.0"
            author = "A"
            url = "https://example.com"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return None

        reg = ParserRegistry()
        # The same declining class fills both the built-in and external slots.
        reg.register_builtin(AlwaysDeclines)
        reg._external.append(AlwaysDeclines)
        assert reg.get_parser_for_file("text/plain", "file.txt") is None

    def test_external_parser_beats_builtin_same_score(self) -> None:
        """
        GIVEN: One external and one built-in parser, each scoring 10.
        WHEN: get_parser_for_file() is called.
        THEN: The external parser is chosen, since externals are evaluated
              first and ties go to the first parser seen.
        """

        class BuiltinParser:
            name = "builtin"
            version = "1.0"
            author = "Core"
            url = "https://example.com/builtin"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return 10

        class ExternalParser:
            name = "external"
            version = "2.0"
            author = "Third Party"
            url = "https://example.com/external"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return 10

        reg = ParserRegistry()
        reg.register_builtin(BuiltinParser)
        reg._external.append(ExternalParser)
        chosen = reg.get_parser_for_file("text/plain", "file.txt")
        assert chosen is ExternalParser

    def test_builtin_wins_when_external_declines(self) -> None:
        """
        GIVEN: An external parser scoring None alongside a built-in
               scoring 5.
        WHEN: get_parser_for_file() is called.
        THEN: The built-in parser is selected.
        """

        class DecliningExternal:
            name = "declining-external"
            version = "1.0"
            author = "Third Party"
            url = "https://example.com/declining"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return None

        class AcceptingBuiltin:
            name = "accepting-builtin"
            version = "1.0"
            author = "Core"
            url = "https://example.com/accepting"

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return 5

        reg = ParserRegistry()
        reg.register_builtin(AcceptingBuiltin)
        reg._external.append(DecliningExternal)
        chosen = reg.get_parser_for_file("text/plain", "file.txt")
        assert chosen is AcceptingBuiltin
|
||||||
|
|
||||||
|
|
||||||
|
class TestDiscover:
    """Verify entrypoint discovery in ParserRegistry.discover()."""

    @staticmethod
    def _discover_with(entrypoints: list) -> "ParserRegistry":
        """Run discover() on a fresh registry with entry_points() patched to return *entrypoints*."""
        registry = ParserRegistry()
        with patch(
            "paperless.parsers.registry.entry_points",
            return_value=entrypoints,
        ):
            registry.discover()
        return registry

    def test_discover_with_no_entrypoints(self) -> None:
        """
        GIVEN: No entrypoints are registered under 'paperless_ngx.parsers'.
        WHEN: discover() is called.
        THEN: _external remains empty and no errors are raised.
        """
        registry = self._discover_with([])

        assert registry._external == []

    def test_discover_adds_valid_external_parser(self) -> None:
        """
        GIVEN: One valid entrypoint whose loaded class has all required attrs.
        WHEN: discover() is called.
        THEN: The class is appended to _external.
        """

        class ValidExternal:
            name = "valid-external"
            version = "3.0.0"
            author = "Someone"
            url = "https://example.com/valid"

            @classmethod
            def supported_mime_types(cls):
                return {"application/pdf": ".pdf"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return 5

        mock_ep = MagicMock(spec=EntryPoint)
        mock_ep.name = "valid_external"
        mock_ep.load.return_value = ValidExternal

        registry = self._discover_with([mock_ep])

        assert ValidExternal in registry._external

    def test_discover_skips_entrypoint_with_load_error(
        self,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """
        GIVEN: An entrypoint whose load() method raises ImportError.
        WHEN: discover() is called.
        THEN: The entrypoint is skipped, an error is logged, and _external
              remains empty.
        """
        mock_ep = MagicMock(spec=EntryPoint)
        mock_ep.name = "broken_ep"
        mock_ep.load.side_effect = ImportError("missing dependency")

        with caplog.at_level(logging.ERROR, logger="paperless.parsers.registry"):
            registry = self._discover_with([mock_ep])

        assert registry._external == []
        assert any(
            "broken_ep" in record.message
            for record in caplog.records
            if record.levelno >= logging.ERROR
        )

    def test_discover_skips_entrypoint_with_missing_attrs(
        self,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """
        GIVEN: A class loaded from an entrypoint that is missing the 'score'
               attribute.
        WHEN: discover() is called.
        THEN: The entrypoint is skipped, a warning is logged, and _external
              remains empty.
        """

        class MissingScore:
            name = "missing-score"
            version = "1.0"
            author = "Someone"
            url = "https://example.com"

            # 'score' classmethod is intentionally absent.

            @classmethod
            def supported_mime_types(cls):
                return {"text/plain": ".txt"}

        mock_ep = MagicMock(spec=EntryPoint)
        mock_ep.name = "missing_score_ep"
        mock_ep.load.return_value = MissingScore

        with caplog.at_level(logging.WARNING, logger="paperless.parsers.registry"):
            registry = self._discover_with([mock_ep])

        assert registry._external == []
        assert any(
            "missing_score_ep" in record.message
            for record in caplog.records
            if record.levelno >= logging.WARNING
        )

    def test_discover_logs_loaded_parser_info(
        self,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """
        GIVEN: A valid entrypoint that loads successfully.
        WHEN: discover() is called.
        THEN: An INFO log message is emitted containing the parser name,
              version, author, and entrypoint name.
        """

        class LoggableParser:
            # The name must not be a substring of the entrypoint name or the
            # URL; otherwise the name assertion below could pass without the
            # parser name ever being logged.
            name = "loggable-parser"
            version = "4.2.0"
            author = "Log Tester"
            url = "https://example.com/loggable"

            @classmethod
            def supported_mime_types(cls):
                return {"image/png": ".png"}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return 1

        mock_ep = MagicMock(spec=EntryPoint)
        mock_ep.name = "loggable_ep"
        mock_ep.load.return_value = LoggableParser

        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
            self._discover_with([mock_ep])

        info_messages = " ".join(
            r.message for r in caplog.records if r.levelno == logging.INFO
        )
        assert "loggable-parser" in info_messages
        assert "4.2.0" in info_messages
        assert "Log Tester" in info_messages
        assert "loggable_ep" in info_messages
|
||||||
|
|
||||||
|
|
||||||
|
class TestLogSummary:
    """Check what ParserRegistry.log_summary() writes to the log."""

    def test_log_summary_with_no_external_parsers(
        self,
        dummy_parser_cls: type,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """
        GIVEN: A registry holding a single built-in parser and no externals.
        WHEN: log_summary() is called.
        THEN: The built-in parser's name shows up in the captured log text.
        """
        reg = ParserRegistry()
        reg.register_builtin(dummy_parser_cls)

        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
            reg.log_summary()

        combined = " ".join(record.message for record in caplog.records)
        assert dummy_parser_cls.name in combined

    def test_log_summary_with_external_parsers(
        self,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """
        GIVEN: A registry holding a single external parser.
        WHEN: log_summary() is called.
        THEN: The captured log text contains that parser's name, version,
              author, and url.
        """

        class ExtParser:
            name = "ext-parser"
            version = "9.9.9"
            author = "Ext Corp"
            url = "https://ext.example.com"

            @classmethod
            def supported_mime_types(cls):
                return {}

            @classmethod
            def score(cls, mime_type, filename, path=None):
                return None

        reg = ParserRegistry()
        reg._external.append(ExtParser)

        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
            reg.log_summary()

        combined = " ".join(record.message for record in caplog.records)
        for expected in (
            "ext-parser",
            "9.9.9",
            "Ext Corp",
            "https://ext.example.com",
        ):
            assert expected in combined

    def test_log_summary_logs_no_third_party_message_when_none(
        self,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """
        GIVEN: A registry without any external parsers.
        WHEN: log_summary() is called.
        THEN: The log contains the text 'No third-party parsers discovered.'.
        """
        reg = ParserRegistry()

        with caplog.at_level(logging.INFO, logger="paperless.parsers.registry"):
            reg.log_summary()

        combined = " ".join(record.message for record in caplog.records)
        assert "No third-party parsers discovered." in combined
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
from PIL import Image
|
|
||||||
from PIL import ImageDraw
|
|
||||||
from PIL import ImageFont
|
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class TextDocumentParser(DocumentParser):
    """
    Parser for plain-text documents (.txt, .md, or .csv) that consumes the
    file content directly.
    """

    logging_name = "paperless.parsing.text"

    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
        """
        Draw the beginning of the document's text onto a white 500x700 image
        and return the path of the resulting WebP file inside ``self.tempdir``.
        """
        # Limits that keep the preview from loading a whole large file.
        max_chars = 100_000
        file_size_limit = 50 * 1024 * 1024

        if document_path.stat().st_size <= file_size_limit:
            # Undecodable bytes are replaced rather than raising.
            with Path(document_path).open(
                "r",
                encoding="utf-8",
                errors="replace",
            ) as handle:
                text = handle.read(max_chars)
        else:
            text = "[File too large to preview]"

        canvas = Image.new("RGB", (500, 700), color="white")
        pen = ImageDraw.Draw(canvas)
        thumbnail_font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.Layout.BASIC,
        )
        pen.multiline_text(
            (5, 5),
            text,
            font=thumbnail_font,
            fill="black",
            spacing=4,
        )

        out_path = self.tempdir / "thumb.webp"
        canvas.save(out_path, format="WEBP")
        return out_path

    def parse(self, document_path, mime_type, file_name=None) -> None:
        """Store the document's text, as read by the base-class helper, on ``self.text``."""
        self.text = self.read_file_handle_unicode_errors(document_path)

    def get_settings(self) -> None:
        """
        No additional settings are implemented for this parser yet.
        """
        return None
|
|
||||||
Reference in New Issue
Block a user