Feat: add MetadataEntry TypedDict and extract_metadata to ParserProtocol

- Define MetadataEntry TypedDict (namespace, prefix, key, value) in
  paperless.parsers and export it from __all__
- Add extract_metadata(document_path, mime_type) -> list[MetadataEntry]
  to ParserProtocol; implementations must not raise
- Implement extract_metadata on TextDocumentParser (returns [])
- Update DummyParser fixture in test_registry to include extract_metadata
  and align parse/get_thumbnail signatures with the current Protocol
- Add TestTextParserMetadata tests covering empty-list return and
  mime_type-agnostic behaviour

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-09 16:07:10 -07:00
parent f7f162424b
commit c96e9f5dc7
4 changed files with 112 additions and 2 deletions

View File

@@ -38,6 +38,7 @@ from __future__ import annotations
from typing import TYPE_CHECKING
from typing import Protocol
from typing import Self
from typing import TypedDict
from typing import runtime_checkable
if TYPE_CHECKING:
@@ -46,10 +47,32 @@ if TYPE_CHECKING:
from types import TracebackType
__all__ = [
"MetadataEntry",
"ParserProtocol",
]
class MetadataEntry(TypedDict):
"""A single metadata field extracted from a document.
All four keys are required. Values are always serialised to strings —
type-specific conversion (dates, integers, lists) is the responsibility
of the parser before returning.
"""
namespace: str
"""URI of the metadata namespace (e.g. 'http://ns.adobe.com/pdf/1.3/')."""
prefix: str
"""Conventional namespace prefix (e.g. 'pdf', 'xmp', 'dc')."""
key: str
"""Field name within the namespace (e.g. 'Author', 'CreateDate')."""
value: str
"""String representation of the field value."""
@runtime_checkable
class ParserProtocol(Protocol):
"""Structural contract for all Paperless-ngx document parsers.
@@ -281,6 +304,41 @@ class ParserProtocol(Protocol):
"""
...
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list[MetadataEntry]:
"""Extract format-specific metadata from the document.
Called by the API view layer on demand — not during the consumption
pipeline. Results are returned to the frontend for per-file display.
For documents with an archive version, this method is called twice:
once for the original file (with its native MIME type) and once for
the archive file (with ``"application/pdf"``). Parsers that produce
archives should handle both cases.
Implementations must not raise. A failure to read metadata is not
fatal — log a warning and return whatever partial results were
collected, or ``[]`` if none.
Parameters
----------
document_path:
Absolute path to the file to extract metadata from.
mime_type:
MIME type of the file at ``document_path``. May be
``"application/pdf"`` when called for the archive version.
Returns
-------
list[MetadataEntry]
Zero or more metadata entries. Returns ``[]`` if no metadata
could be extracted or the format does not support it.
"""
...
# ------------------------------------------------------------------
# Context manager
# ------------------------------------------------------------------

View File

@@ -26,6 +26,8 @@ if TYPE_CHECKING:
import datetime
from types import TracebackType
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsing.text")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
@@ -276,6 +278,20 @@ class TextDocumentParser:
"""
return None
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list[MetadataEntry]:
"""Extract format-specific metadata from the document.
Returns
-------
list[MetadataEntry]
Always ``[]`` — plain text files carry no structured metadata.
"""
return []
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------

View File

@@ -189,6 +189,37 @@ class TestTextParserThumbnail:
assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
class TestTextParserMetadata:
"""Verify extract_metadata behaviour."""
def test_extract_metadata_returns_empty_list(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
result = text_parser.extract_metadata(sample_txt_file, "text/plain")
assert result == []
def test_extract_metadata_returns_list_type(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
result = text_parser.extract_metadata(sample_txt_file, "text/plain")
assert isinstance(result, list)
def test_extract_metadata_ignores_mime_type(
self,
text_parser: TextDocumentParser,
sample_txt_file: Path,
) -> None:
"""extract_metadata returns [] regardless of the mime_type argument."""
assert text_parser.extract_metadata(sample_txt_file, "application/pdf") == []
assert text_parser.extract_metadata(sample_txt_file, "text/csv") == []
class TestTextParserRegistry:
"""Verify that TextDocumentParser is registered by default."""

View File

@@ -66,7 +66,6 @@ def dummy_parser_cls() -> type:
self,
document_path: Path,
mime_type: str,
file_name: str | None = None,
*,
produce_archive: bool = True,
) -> None:
@@ -85,7 +84,6 @@ def dummy_parser_cls() -> type:
self,
document_path: Path,
mime_type: str,
file_name: str | None = None,
) -> Path:
return Path("/tmp/thumbnail.webp")
@@ -96,6 +94,13 @@ def dummy_parser_cls() -> type:
) -> int | None:
return None
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list:
return []
def __enter__(self) -> Self:
return self