Report a page count for the mail and Tika parsers when a preceding parse() call has produced an archive PDF; otherwise the page count remains None

2026-06-16 18:34:19 +00:00 · 2026-03-19 11:42:38 -07:00
parent 9e3c93f72d
commit d01513a869
2 changed files with 18 additions and 2 deletions
@@ -358,11 +358,19 @@ class MailDocumentParser:
    ) -> int | None:
        """Return the number of pages in the document.

+        Counts pages in the archive PDF produced by a preceding parse()
+        call.  Returns ``None`` if parse() has not been called yet or if
+        no archive was produced.
+
        Returns
        -------
        int | None
-            Always None — page count is not available for email files.
+            Page count of the archive PDF, or ``None``.
        """
+        if self._archive_path is not None:
+            from paperless.parsers.utils import get_page_count_for_pdf
+
+            return get_page_count_for_pdf(self._archive_path, log=logger)
        return None

    def extract_metadata(
@@ -344,11 +344,19 @@ class TikaDocumentParser:
    ) -> int | None:
        """Return the number of pages in the document.

+        Counts pages in the archive PDF produced by a preceding parse()
+        call.  Returns ``None`` if parse() has not been called yet or if
+        no archive was produced.
+
        Returns
        -------
        int | None
-            Always None — page count is not available from Tika.
+            Page count of the archive PDF, or ``None``.
        """
+        if self._archive_path is not None:
+            from paperless.parsers.utils import get_page_count_for_pdf
+
+            return get_page_count_for_pdf(self._archive_path, log=logger)
        return None

    def extract_metadata(