Fix: Update parser contract to require empty strings, not None (#12775)

Co-authored-by: stumpylog <797416+stumpylog@users.noreply.github.com>
This commit is contained in:
shamoon
2026-05-11 09:16:21 -07:00
committed by GitHub
parent 1527c347e3
commit 7471fedb43
10 changed files with 45 additions and 39 deletions
+17 -9
View File
@@ -1120,12 +1120,14 @@ class TestConsumer(
self.assertEqual(command[1], "--replace-input")
@mock.patch("paperless_mail.models.MailRule.objects.get")
@mock.patch("paperless.parsers.mail.MailDocumentParser.get_thumbnail")
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
@mock.patch("documents.consumer.get_parser_registry")
def test_mail_parser_receives_mailrule(
self,
mock_get_parser_registry: mock.Mock,
mock_mail_parser_parse: mock.Mock,
mock_get_thumbnail: mock.Mock,
mock_mailrule_get: mock.Mock,
) -> None:
"""
@@ -1136,6 +1138,7 @@ class TestConsumer(
THEN:
- The mail parser should receive the mail rule
"""
from documents.parsers import ParseError
from paperless.parsers.mail import MailDocumentParser
mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
@@ -1144,19 +1147,24 @@ class TestConsumer(
mock_mailrule_get.return_value = mock.Mock(
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
)
mock_get_thumbnail.side_effect = ParseError("no thumbnail")
src = (
Path(__file__).parent.parent.parent
/ Path("paperless")
/ Path("tests")
/ Path("samples")
/ Path("mail")
/ "html.eml"
)
dst = self.dirs.scratch_dir / "html.eml"
shutil.copy(src, dst)
with self.get_consumer(
filepath=(
Path(__file__).parent.parent.parent
/ Path("paperless")
/ Path("tests")
/ Path("samples")
/ Path("mail")
).resolve()
/ "html.eml",
filepath=dst,
source=DocumentSource.MailFetch,
mailrule_id=1,
) as consumer:
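# Expected to fail: get_thumbnail is mocked above to raise ParseError("no thumbnail").
# NOTE(review): this comment previously read "fails because no gotenberg"; with
# both parse and get_thumbnail mocked, Gotenberg is presumably no longer reached — confirm.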
with self.assertRaises(
ConsumerError,
):
+3 -3
View File
@@ -281,13 +281,13 @@ class ParserProtocol(Protocol):
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
def get_text(self) -> str:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if no text could be found.
str
Extracted text, or an empty string if no text could be found.
"""
...
+4 -4
View File
@@ -285,15 +285,15 @@ class MailDocumentParser:
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
def get_text(self) -> str:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if parse has not been called yet.
str
Extracted text, or an empty string if no text could be found.
"""
return self._text
return self._text or ""
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
+2 -2
View File
@@ -247,9 +247,9 @@ class RemoteDocumentParser:
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
def get_text(self) -> str:
"""Return the plain-text content extracted during parse."""
return self._text
return self._text or ""
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
+2 -2
View File
@@ -144,8 +144,8 @@ class RasterisedDocumentParser:
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
return self.text
def get_text(self) -> str:
return self.text or ""
def get_date(self) -> datetime.datetime | None:
return self.date
+4 -4
View File
@@ -189,15 +189,15 @@ class TextDocumentParser:
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
def get_text(self) -> str:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if parse has not been called yet.
str
Extracted text, or an empty string if no text could be found.
"""
return self._text
return self._text or ""
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
+5 -7
View File
@@ -265,9 +265,7 @@ class TikaDocumentParser:
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self._text = parsed.content
if self._text is not None:
self._text = self._text.strip()
self._text = (parsed.content or "").strip()
self._date = parsed.created
if self._date is not None and timezone.is_naive(self._date):
@@ -281,15 +279,15 @@ class TikaDocumentParser:
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
def get_text(self) -> str:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if parse has not been called yet.
str
Extracted text, or an empty string if no text could be found.
"""
return self._text
return self._text or ""
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
@@ -319,11 +319,11 @@ class TestRemoteParserParse:
assert remote_parser.get_text() == ""
assert remote_parser.get_archive_path() is None
def test_get_text_none_before_parse(
def test_get_text_empty_before_parse(
self,
remote_parser: RemoteDocumentParser,
) -> None:
assert remote_parser.get_text() is None
assert remote_parser.get_text() == ""
def test_get_date_always_none(
self,
@@ -342,7 +342,7 @@ class TestRemoteParserParse:
class TestRemoteParserParseError:
def test_parse_returns_none_on_azure_error(
def test_parse_returns_empty_on_azure_error(
self,
remote_parser: RemoteDocumentParser,
simple_digital_pdf_file: Path,
@@ -350,7 +350,7 @@ class TestRemoteParserParseError:
) -> None:
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
assert remote_parser.get_text() is None
assert remote_parser.get_text() == ""
def test_parse_closes_client_on_error(
self,
@@ -138,11 +138,11 @@ class TestTextParserParse:
assert text_parser.get_text() == "Pantothens\ufffdure\n"
def test_get_text_none_before_parse(
def test_get_text_empty_before_parse(
self,
text_parser: TextDocumentParser,
) -> None:
assert text_parser.get_text() is None
assert text_parser.get_text() == ""
class TestTextParserThumbnail:
+2 -2
View File
@@ -74,8 +74,8 @@ def dummy_parser_cls() -> type:
Required to exist, but doesn't need to do anything
"""
def get_text(self) -> str | None:
return None
def get_text(self) -> str:
return ""
def get_date(self) -> None:
return None