diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py
index 7be3abadd..ea67ade00 100644
--- a/src/paperless/parsers/__init__.py
+++ b/src/paperless/parsers/__init__.py
@@ -38,6 +38,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 from typing import Protocol
 from typing import Self
+from typing import TypedDict
 from typing import runtime_checkable
 
 if TYPE_CHECKING:
@@ -46,10 +47,32 @@ if TYPE_CHECKING:
     from types import TracebackType
 
 __all__ = [
+    "MetadataEntry",
     "ParserProtocol",
 ]
 
 
+class MetadataEntry(TypedDict):
+    """A single metadata field extracted from a document.
+
+    All four keys are required. Values are always serialised to strings —
+    type-specific conversion (dates, integers, lists) is the responsibility
+    of the parser before returning.
+    """
+
+    namespace: str
+    """URI of the metadata namespace (e.g. 'http://ns.adobe.com/pdf/1.3/')."""
+
+    prefix: str
+    """Conventional namespace prefix (e.g. 'pdf', 'xmp', 'dc')."""
+
+    key: str
+    """Field name within the namespace (e.g. 'Author', 'CreateDate')."""
+
+    value: str
+    """String representation of the field value."""
+
+
 @runtime_checkable
 class ParserProtocol(Protocol):
     """Structural contract for all Paperless-ngx document parsers.
@@ -281,6 +304,41 @@ class ParserProtocol(Protocol):
         """
         ...
 
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Extract format-specific metadata from the document.
+
+        Called by the API view layer on demand — not during the consumption
+        pipeline. Results are returned to the frontend for per-file display.
+
+        For documents with an archive version, this method is called twice:
+        once for the original file (with its native MIME type) and once for
+        the archive file (with ``"application/pdf"``). Parsers that produce
+        archives should handle both cases.
+
+        Implementations must not raise. A failure to read metadata is not
+        fatal — log a warning and return whatever partial results were
+        collected, or ``[]`` if none.
+
+        Parameters
+        ----------
+        document_path:
+            Absolute path to the file to extract metadata from.
+        mime_type:
+            MIME type of the file at ``document_path``. May be
+            ``"application/pdf"`` when called for the archive version.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Zero or more metadata entries. Returns ``[]`` if no metadata
+            could be extracted or the format does not support it.
+        """
+        ...
+
     # ------------------------------------------------------------------
     # Context manager
     # ------------------------------------------------------------------
diff --git a/src/paperless/parsers/text.py b/src/paperless/parsers/text.py
index 9e4d89c85..99d9dab08 100644
--- a/src/paperless/parsers/text.py
+++ b/src/paperless/parsers/text.py
@@ -26,6 +26,8 @@ if TYPE_CHECKING:
     import datetime
     from types import TracebackType
 
+    from paperless.parsers import MetadataEntry
+
 logger = logging.getLogger("paperless.parsing.text")
 
 _SUPPORTED_MIME_TYPES: dict[str, str] = {
@@ -276,6 +278,20 @@ class TextDocumentParser:
         """
         return None
 
+    def extract_metadata(
+        self,
+        document_path: Path,
+        mime_type: str,
+    ) -> list[MetadataEntry]:
+        """Return the (always empty) metadata list for a text document.
+
+        Returns
+        -------
+        list[MetadataEntry]
+            Always ``[]`` — plain text files carry no structured metadata.
+        """
+        return []
+
     # ------------------------------------------------------------------
     # Private helpers
     # ------------------------------------------------------------------
diff --git a/src/paperless/tests/parsers/test_text_parser.py b/src/paperless/tests/parsers/test_text_parser.py
index 0b702eda2..d2f095f5c 100644
--- a/src/paperless/tests/parsers/test_text_parser.py
+++ b/src/paperless/tests/parsers/test_text_parser.py
@@ -189,6 +189,37 @@ class TestTextParserThumbnail:
         assert text_parser.get_page_count(sample_txt_file, "text/plain") is None
 
 
+class TestTextParserMetadata:
+    """Verify that extract_metadata always returns an empty list."""
+
+    def test_extract_metadata_returns_empty_list(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        result = text_parser.extract_metadata(sample_txt_file, "text/plain")
+
+        assert result == []
+
+    def test_extract_metadata_returns_list_type(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        result = text_parser.extract_metadata(sample_txt_file, "text/plain")
+
+        assert isinstance(result, list)
+
+    def test_extract_metadata_ignores_mime_type(
+        self,
+        text_parser: TextDocumentParser,
+        sample_txt_file: Path,
+    ) -> None:
+        """extract_metadata returns [] regardless of the mime_type argument."""
+        assert text_parser.extract_metadata(sample_txt_file, "application/pdf") == []
+        assert text_parser.extract_metadata(sample_txt_file, "text/csv") == []
+
+
 class TestTextParserRegistry:
     """Verify that TextDocumentParser is registered by default."""
 
diff --git a/src/paperless/tests/test_registry.py b/src/paperless/tests/test_registry.py
index 05b0d45a2..66c4baa45 100644
--- a/src/paperless/tests/test_registry.py
+++ b/src/paperless/tests/test_registry.py
@@ -66,7 +66,6 @@ def dummy_parser_cls() -> type:
             self,
             document_path: Path,
             mime_type: str,
-            file_name: str | None = None,
             *,
             produce_archive: bool = True,
         ) -> None:
@@ -85,7 +84,6 @@ def dummy_parser_cls() -> type:
             self,
             document_path: Path,
             mime_type: str,
-            file_name: str | None = None,
         ) -> Path:
             return Path("/tmp/thumbnail.webp")
 
@@ -96,6 +94,13 @@ def dummy_parser_cls() -> type:
         ) -> int | None:
             return None
 
+        def extract_metadata(
+            self,
+            document_path: Path,
+            mime_type: str,
+        ) -> list:
+            return []
+
         def __enter__(self) -> Self:
             return self
 