mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-01 09:44:19 +00:00
63 lines
2.2 KiB
Python
63 lines
2.2 KiB
Python
"""Tests for paperless.parsers.utils helpers."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import codecs
|
|
from pathlib import Path
|
|
|
|
from paperless.parsers.utils import is_tagged_pdf
|
|
from paperless.parsers.utils import read_file_handle_unicode_errors
|
|
|
|
# Directory holding the sample documents used by the tests below; it is
# resolved relative to this test module rather than the working directory.
SAMPLES = Path(__file__).parent.joinpath("samples", "tesseract")
|
|
|
|
|
|
class TestReadFileHandleUnicodeErrors:
    """Checks for read_file_handle_unicode_errors on files with varied byte content."""

    def test_plain_utf8(self, tmp_path: Path) -> None:
        """Plain ASCII bytes are returned as the same text."""
        target = tmp_path / "plain.txt"
        target.write_bytes(b"hello world")

        decoded = read_file_handle_unicode_errors(target)

        assert decoded == "hello world"

    def test_utf8_bom(self, tmp_path: Path) -> None:
        """A leading UTF-8 BOM does not appear in the returned text."""
        target = tmp_path / "bom.txt"
        payload = codecs.BOM_UTF8 + b"hello"
        target.write_bytes(payload)

        decoded = read_file_handle_unicode_errors(target)

        assert decoded == "hello"

    def test_utf16_le(self, tmp_path: Path) -> None:
        """BOM-prefixed little-endian UTF-16 content is decoded to its text."""
        target = tmp_path / "utf16le.txt"
        payload = codecs.BOM_UTF16_LE + "hello".encode("utf-16-le")
        target.write_bytes(payload)

        decoded = read_file_handle_unicode_errors(target)

        assert decoded == "hello"

    def test_utf16_be(self, tmp_path: Path) -> None:
        """BOM-prefixed big-endian UTF-16 content is decoded to its text."""
        target = tmp_path / "utf16be.txt"
        payload = codecs.BOM_UTF16_BE + "hello".encode("utf-16-be")
        target.write_bytes(payload)

        decoded = read_file_handle_unicode_errors(target)

        assert decoded == "hello"

    def test_nul_bytes_stripped(self, tmp_path: Path) -> None:
        """An embedded NUL byte is absent from the returned text."""
        target = tmp_path / "null-bytes.txt"
        target.write_bytes(b"foo\x00bar")

        decoded = read_file_handle_unicode_errors(target)

        assert decoded == "foobar"

    def test_invalid_utf8_replaced(self, tmp_path: Path) -> None:
        """Valid text around undecodable bytes survives, and no NUL is produced."""
        target = tmp_path / "bad.txt"
        # 0x80 and 0x81 are continuation bytes with no lead byte, so they are
        # not valid UTF-8 on their own.
        target.write_bytes(b"ok\x80\x81bad")

        result = read_file_handle_unicode_errors(target)

        for fragment in ("ok", "bad"):
            assert fragment in result
        assert "\x00" not in result
|
|
|
|
|
|
class TestIsTaggedPdf:
    """Checks for is_tagged_pdf on sample, missing, and malformed files."""

    def test_tagged_pdf_returns_true(self) -> None:
        """The simple-digital.pdf sample is expected to be reported as tagged."""
        tagged_sample = SAMPLES / "simple-digital.pdf"
        assert is_tagged_pdf(tagged_sample) is True

    def test_untagged_pdf_returns_false(self) -> None:
        """The multi-page-images.pdf sample is expected to be reported as untagged."""
        untagged_sample = SAMPLES / "multi-page-images.pdf"
        assert is_tagged_pdf(untagged_sample) is False

    def test_nonexistent_path_returns_false(self) -> None:
        """A path that does not exist yields False."""
        missing = Path("/nonexistent/file.pdf")
        assert is_tagged_pdf(missing) is False

    def test_corrupt_pdf_returns_false(self, tmp_path: Path) -> None:
        """A file whose bytes are not a PDF yields False."""
        broken = tmp_path / "bad.pdf"
        broken.write_bytes(b"not a pdf")

        outcome = is_tagged_pdf(broken)

        assert outcome is False
|