mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-17 12:05:24 +00:00
Fix: Update parser contract to require empty strings, not None (#12775)
Co-authored-by: stumpylog <797416+stumpylog@users.noreply.github.com>
This commit is contained in:
@@ -1120,12 +1120,14 @@ class TestConsumer(
|
||||
self.assertEqual(command[1], "--replace-input")
|
||||
|
||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.get_thumbnail")
|
||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_mail_parser_receives_mailrule(
|
||||
self,
|
||||
mock_get_parser_registry: mock.Mock,
|
||||
mock_mail_parser_parse: mock.Mock,
|
||||
mock_get_thumbnail: mock.Mock,
|
||||
mock_mailrule_get: mock.Mock,
|
||||
) -> None:
|
||||
"""
|
||||
@@ -1136,6 +1138,7 @@ class TestConsumer(
|
||||
THEN:
|
||||
- The mail parser should receive the mail rule
|
||||
"""
|
||||
from documents.parsers import ParseError
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
|
||||
@@ -1144,19 +1147,24 @@ class TestConsumer(
|
||||
mock_mailrule_get.return_value = mock.Mock(
|
||||
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
||||
)
|
||||
mock_get_thumbnail.side_effect = ParseError("no thumbnail")
|
||||
|
||||
src = (
|
||||
Path(__file__).parent.parent.parent
|
||||
/ Path("paperless")
|
||||
/ Path("tests")
|
||||
/ Path("samples")
|
||||
/ Path("mail")
|
||||
/ "html.eml"
|
||||
)
|
||||
dst = self.dirs.scratch_dir / "html.eml"
|
||||
shutil.copy(src, dst)
|
||||
|
||||
with self.get_consumer(
|
||||
filepath=(
|
||||
Path(__file__).parent.parent.parent
|
||||
/ Path("paperless")
|
||||
/ Path("tests")
|
||||
/ Path("samples")
|
||||
/ Path("mail")
|
||||
).resolve()
|
||||
/ "html.eml",
|
||||
filepath=dst,
|
||||
source=DocumentSource.MailFetch,
|
||||
mailrule_id=1,
|
||||
) as consumer:
|
||||
# fails because no gotenberg
|
||||
with self.assertRaises(
|
||||
ConsumerError,
|
||||
):
|
||||
|
||||
@@ -281,13 +281,13 @@ class ParserProtocol(Protocol):
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
def get_text(self) -> str:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if no text could be found.
|
||||
str
|
||||
Extracted text, or an empty string if no text could be found.
|
||||
"""
|
||||
...
|
||||
|
||||
|
||||
@@ -285,15 +285,15 @@ class MailDocumentParser:
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
def get_text(self) -> str:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
str
|
||||
Extracted text, or an empty string if no text could be found.
|
||||
"""
|
||||
return self._text
|
||||
return self._text or ""
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
@@ -247,9 +247,9 @@ class RemoteDocumentParser:
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
def get_text(self) -> str:
|
||||
"""Return the plain-text content extracted during parse."""
|
||||
return self._text
|
||||
return self._text or ""
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
@@ -144,8 +144,8 @@ class RasterisedDocumentParser:
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return self.text
|
||||
def get_text(self) -> str:
|
||||
return self.text or ""
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
return self.date
|
||||
|
||||
@@ -189,15 +189,15 @@ class TextDocumentParser:
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
def get_text(self) -> str:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
str
|
||||
Extracted text, or an empty string if no text could be found.
|
||||
"""
|
||||
return self._text
|
||||
return self._text or ""
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
@@ -265,9 +265,7 @@ class TikaDocumentParser:
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
self._text = parsed.content
|
||||
if self._text is not None:
|
||||
self._text = self._text.strip()
|
||||
self._text = (parsed.content or "").strip()
|
||||
|
||||
self._date = parsed.created
|
||||
if self._date is not None and timezone.is_naive(self._date):
|
||||
@@ -281,15 +279,15 @@ class TikaDocumentParser:
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
def get_text(self) -> str:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
str
|
||||
Extracted text, or an empty string if no text could be found.
|
||||
"""
|
||||
return self._text
|
||||
return self._text or ""
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
@@ -319,11 +319,11 @@ class TestRemoteParserParse:
|
||||
assert remote_parser.get_text() == ""
|
||||
assert remote_parser.get_archive_path() is None
|
||||
|
||||
def test_get_text_none_before_parse(
|
||||
def test_get_text_empty_before_parse(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert remote_parser.get_text() is None
|
||||
assert remote_parser.get_text() == ""
|
||||
|
||||
def test_get_date_always_none(
|
||||
self,
|
||||
@@ -342,7 +342,7 @@ class TestRemoteParserParse:
|
||||
|
||||
|
||||
class TestRemoteParserParseError:
|
||||
def test_parse_returns_none_on_azure_error(
|
||||
def test_parse_returns_empty_on_azure_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
simple_digital_pdf_file: Path,
|
||||
@@ -350,7 +350,7 @@ class TestRemoteParserParseError:
|
||||
) -> None:
|
||||
remote_parser.parse(simple_digital_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() is None
|
||||
assert remote_parser.get_text() == ""
|
||||
|
||||
def test_parse_closes_client_on_error(
|
||||
self,
|
||||
|
||||
@@ -138,11 +138,11 @@ class TestTextParserParse:
|
||||
|
||||
assert text_parser.get_text() == "Pantothens\ufffdure\n"
|
||||
|
||||
def test_get_text_none_before_parse(
|
||||
def test_get_text_empty_before_parse(
|
||||
self,
|
||||
text_parser: TextDocumentParser,
|
||||
) -> None:
|
||||
assert text_parser.get_text() is None
|
||||
assert text_parser.get_text() == ""
|
||||
|
||||
|
||||
class TestTextParserThumbnail:
|
||||
|
||||
@@ -74,8 +74,8 @@ def dummy_parser_cls() -> type:
|
||||
Required to exist, but doesn't need to do anything
|
||||
"""
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return None
|
||||
def get_text(self) -> str:
|
||||
return ""
|
||||
|
||||
def get_date(self) -> None:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user