diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 436408886..c8a740a7e 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1120,12 +1120,14 @@ class TestConsumer( self.assertEqual(command[1], "--replace-input") @mock.patch("paperless_mail.models.MailRule.objects.get") + @mock.patch("paperless.parsers.mail.MailDocumentParser.get_thumbnail") @mock.patch("paperless.parsers.mail.MailDocumentParser.parse") @mock.patch("documents.consumer.get_parser_registry") def test_mail_parser_receives_mailrule( self, mock_get_parser_registry: mock.Mock, mock_mail_parser_parse: mock.Mock, + mock_get_thumbnail: mock.Mock, mock_mailrule_get: mock.Mock, ) -> None: """ @@ -1136,6 +1138,7 @@ class TestConsumer( THEN: - The mail parser should receive the mail rule """ + from documents.parsers import ParseError from paperless.parsers.mail import MailDocumentParser mock_get_parser_registry.return_value.get_parser_for_file.return_value = ( @@ -1144,19 +1147,24 @@ class TestConsumer( mock_mailrule_get.return_value = mock.Mock( pdf_layout=MailRule.PdfLayout.HTML_ONLY, ) + mock_get_thumbnail.side_effect = ParseError("no thumbnail") + + src = ( + Path(__file__).parent.parent.parent + / Path("paperless") + / Path("tests") + / Path("samples") + / Path("mail") + / "html.eml" + ) + dst = self.dirs.scratch_dir / "html.eml" + shutil.copy(src, dst) + with self.get_consumer( - filepath=( - Path(__file__).parent.parent.parent - / Path("paperless") - / Path("tests") - / Path("samples") - / Path("mail") - ).resolve() - / "html.eml", + filepath=dst, source=DocumentSource.MailFetch, mailrule_id=1, ) as consumer: - # fails because no gotenberg with self.assertRaises( ConsumerError, ): diff --git a/src/paperless/parsers/__init__.py b/src/paperless/parsers/__init__.py index c9c1530a5..5ba42f1f1 100644 --- a/src/paperless/parsers/__init__.py +++ b/src/paperless/parsers/__init__.py @@ -281,13 +281,13 @@ class ParserProtocol(Protocol): # Result accessors # ------------------------------------------------------------------ - def get_text(self) -> str | None: + def get_text(self) -> str: """Return the plain-text content extracted during parse. Returns ------- - str | None - Extracted text, or None if no text could be found. + str + Extracted text, or an empty string if no text could be found. """ ... diff --git a/src/paperless/parsers/mail.py b/src/paperless/parsers/mail.py index 9914b2ec6..8188b7933 100644 --- a/src/paperless/parsers/mail.py +++ b/src/paperless/parsers/mail.py @@ -285,15 +285,15 @@ class MailDocumentParser: # Result accessors # ------------------------------------------------------------------ - def get_text(self) -> str | None: + def get_text(self) -> str: """Return the plain-text content extracted during parse. Returns ------- - str | None - Extracted text, or None if parse has not been called yet. + str + Extracted text, or an empty string if no text could be found. """ - return self._text + return self._text or "" def get_date(self) -> datetime.datetime | None: """Return the document date detected during parse. diff --git a/src/paperless/parsers/remote.py b/src/paperless/parsers/remote.py index 10e89649e..c851469aa 100644 --- a/src/paperless/parsers/remote.py +++ b/src/paperless/parsers/remote.py @@ -247,9 +247,9 @@ class RemoteDocumentParser: # Result accessors # ------------------------------------------------------------------ - def get_text(self) -> str | None: + def get_text(self) -> str: """Return the plain-text content extracted during parse.""" - return self._text + return self._text or "" def get_date(self) -> datetime.datetime | None: """Return the document date detected during parse. diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index e19922dd3..2e0d791ea 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -144,8 +144,8 @@ class RasterisedDocumentParser: # Result accessors # ------------------------------------------------------------------ - def get_text(self) -> str | None: - return self.text + def get_text(self) -> str: + return self.text or "" def get_date(self) -> datetime.datetime | None: return self.date diff --git a/src/paperless/parsers/text.py b/src/paperless/parsers/text.py index 00d738995..301c67149 100644 --- a/src/paperless/parsers/text.py +++ b/src/paperless/parsers/text.py @@ -189,15 +189,15 @@ class TextDocumentParser: # Result accessors # ------------------------------------------------------------------ - def get_text(self) -> str | None: + def get_text(self) -> str: """Return the plain-text content extracted during parse. Returns ------- - str | None - Extracted text, or None if parse has not been called yet. + str + Extracted text, or an empty string if no text could be found. """ - return self._text + return self._text or "" def get_date(self) -> datetime.datetime | None: """Return the document date detected during parse. diff --git a/src/paperless/parsers/tika.py b/src/paperless/parsers/tika.py index 674d74fe2..53082c8b4 100644 --- a/src/paperless/parsers/tika.py +++ b/src/paperless/parsers/tika.py @@ -265,9 +265,7 @@ class TikaDocumentParser: f"{settings.TIKA_ENDPOINT}: {err}", ) from err - self._text = parsed.content - if self._text is not None: - self._text = self._text.strip() + self._text = (parsed.content or "").strip() self._date = parsed.created if self._date is not None and timezone.is_naive(self._date): @@ -281,15 +279,15 @@ class TikaDocumentParser: # Result accessors # ------------------------------------------------------------------ - def get_text(self) -> str | None: + def get_text(self) -> str: """Return the plain-text content extracted during parse. Returns ------- - str | None - Extracted text, or None if parse has not been called yet. + str + Extracted text, or an empty string if no text could be found. """ - return self._text + return self._text or "" def get_date(self) -> datetime.datetime | None: """Return the document date detected during parse. diff --git a/src/paperless/tests/parsers/test_remote_parser.py b/src/paperless/tests/parsers/test_remote_parser.py index 892915bb5..b9e038e60 100644 --- a/src/paperless/tests/parsers/test_remote_parser.py +++ b/src/paperless/tests/parsers/test_remote_parser.py @@ -319,11 +319,11 @@ class TestRemoteParserParse: assert remote_parser.get_text() == "" assert remote_parser.get_archive_path() is None - def test_get_text_none_before_parse( + def test_get_text_empty_before_parse( self, remote_parser: RemoteDocumentParser, ) -> None: - assert remote_parser.get_text() is None + assert remote_parser.get_text() == "" def test_get_date_always_none( self, @@ -342,7 +342,7 @@ class TestRemoteParserParse: class TestRemoteParserParseError: - def test_parse_returns_none_on_azure_error( + def test_parse_returns_empty_on_azure_error( self, remote_parser: RemoteDocumentParser, simple_digital_pdf_file: Path, @@ -350,7 +350,7 @@ class TestRemoteParserParseError: ) -> None: remote_parser.parse(simple_digital_pdf_file, "application/pdf") - assert remote_parser.get_text() is None + assert remote_parser.get_text() == "" def test_parse_closes_client_on_error( self, diff --git a/src/paperless/tests/parsers/test_text_parser.py b/src/paperless/tests/parsers/test_text_parser.py index fd2a57857..eb94ef9b5 100644 --- a/src/paperless/tests/parsers/test_text_parser.py +++ b/src/paperless/tests/parsers/test_text_parser.py @@ -138,11 +138,11 @@ class TestTextParserParse: assert text_parser.get_text() == "Pantothens\ufffdure\n" - def test_get_text_none_before_parse( + def test_get_text_empty_before_parse( self, text_parser: TextDocumentParser, ) -> None: - assert text_parser.get_text() is None + assert text_parser.get_text() == "" class TestTextParserThumbnail: diff --git a/src/paperless/tests/test_registry.py b/src/paperless/tests/test_registry.py index 5c2d20d50..a371abc35 100644 --- a/src/paperless/tests/test_registry.py +++ b/src/paperless/tests/test_registry.py @@ -74,8 +74,8 @@ def dummy_parser_cls() -> type: Required to exist, but doesn't need to do anything """ - def get_text(self) -> str | None: - return None + def get_text(self) -> str: + return "" def get_date(self) -> None: return None