Feat: add MetadataEntry TypedDict and extract_metadata to ParserProtocol

- Define MetadataEntry TypedDict (namespace, prefix, key, value) in paperless.parsers and export it from __all__
- Add extract_metadata(document_path, mime_type) -> list[MetadataEntry] to ParserProtocol; implementations must not raise
- Implement extract_metadata on TextDocumentParser (returns [])
- Update DummyParser fixture in test_registry to include extract_metadata and align parse/get_thumbnail signatures with the current Protocol
- Add TestTextParserMetadata tests covering empty-list return and mime_type-agnostic behaviour

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-02 12:45:23 +00:00 · 2026-03-09 16:07:10 -07:00
parent f7f162424b
commit c96e9f5dc7
4 changed files with 112 additions and 2 deletions
@@ -38,6 +38,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 from typing import Protocol
 from typing import Self
+from typing import TypedDict
 from typing import runtime_checkable

 if TYPE_CHECKING:
@@ -46,10 +47,32 @@ if TYPE_CHECKING:
    from types import TracebackType

 __all__ = [
+    "MetadataEntry",
    "ParserProtocol",
 ]


+class MetadataEntry(TypedDict):
+    """A single metadata field extracted from a document.
+
+    All four keys are required and every value is a ``str``. Converting
+    typed source values (dates, integers, lists) to their string form is
+    the responsibility of the parser before returning.
+    """
+
+    namespace: str
+    """URI of the metadata namespace (e.g. 'http://ns.adobe.com/pdf/1.3/')."""
+
+    prefix: str
+    """Conventional namespace prefix (e.g. 'pdf', 'xmp', 'dc')."""
+
+    key: str
+    """Field name within the namespace (e.g. 'Author', 'CreateDate')."""
+
+    value: str
+    """String representation of the field value."""
+
+
@runtime_checkable
 class ParserProtocol(Protocol):
    """Structural contract for all Paperless-ngx document parsers.
@@ -281,6 +304,41 @@ class ParserProtocol(Protocol):
        """
        ...

+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract format-specific metadata from the document.
+
+        Called by the API view layer on demand — not during the consumption
+        pipeline. Results are returned to the frontend for per-file display.
+
+        For documents with an archive version, this method is called twice:
+        once for the original file (with its native MIME type) and once for
+        the archive file (with ``"application/pdf"``). Parsers that produce
+        archives should handle both cases.
+
+        Implementations must not raise. A failure to read metadata is not
+        fatal — log a warning and return whatever partial results were
+        collected, or ``[]`` if none.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the file to extract metadata from.
+        mime_type:
+            MIME type of the file at ``document_path``. This is
+            ``"application/pdf"`` when called for the archive version.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Zero or more metadata entries. Returns ``[]`` if no metadata
+            could be extracted or the format does not support it.
+        """
+        ...
+
    # ------------------------------------------------------------------
    # Context manager
    # ------------------------------------------------------------------
@@ -26,6 +26,8 @@ if TYPE_CHECKING:
    import datetime
    from types import TracebackType

+    from paperless.parsers import MetadataEntry
+
 logger = logging.getLogger("paperless.parsing.text")

 _SUPPORTED_MIME_TYPES: dict[str, str] = {
@@ -276,6 +278,20 @@ class TextDocumentParser:
        """
        return None

+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Return no metadata; both arguments are accepted but ignored.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Always ``[]`` — plain text files carry no structured metadata.
+        """
+        return []
+
    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
@@ -189,6 +189,37 @@ class TestTextParserThumbnail:
        assert text_parser.get_page_count(sample_txt_file, "text/plain") is None


+class TestTextParserMetadata:
+    """Verify that extract_metadata always returns an empty list."""
+
+    def test_extract_metadata_returns_empty_list(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        result = text_parser.extract_metadata(sample_txt_file, "text/plain")
+
+        assert result == []
+
+    def test_extract_metadata_returns_list_type(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        result = text_parser.extract_metadata(sample_txt_file, "text/plain")
+
+        assert isinstance(result, list)
+
+    def test_extract_metadata_ignores_mime_type(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        """extract_metadata returns [] regardless of the mime_type argument."""
+        assert text_parser.extract_metadata(sample_txt_file, "application/pdf") == []
+        assert text_parser.extract_metadata(sample_txt_file, "text/csv") == []
+
+
 class TestTextParserRegistry:
    """Verify that TextDocumentParser is registered by default."""

@@ -66,7 +66,6 @@ def dummy_parser_cls() -> type:
            self,
            document_path: Path,
            mime_type: str,
-            file_name: str | None = None,
            *,
            produce_archive: bool = True,
        ) -> None:
@@ -85,7 +84,6 @@ def dummy_parser_cls() -> type:
            self,
            document_path: Path,
            mime_type: str,
-            file_name: str | None = None,
        ) -> Path:
            return Path("/tmp/thumbnail.webp")

@@ -96,6 +94,13 @@ def dummy_parser_cls() -> type:
        ) -> int | None:
            return None

+        def extract_metadata(
+            self,
+            document_path: Path,
+            mime_type: str,
+        ) -> list:
+            return []  # Stub: the dummy parser reports no metadata.
+
        def __enter__(self) -> Self:
            return self