Fix: Handle UTF-16 and BOM text files better (#12994)

2026-07-01 17:54:25 +00:00 · 2026-06-13 05:35:38 -07:00
parent fb3816486c
commit 92c016ce47
3 changed files with 57 additions and 33 deletions
@@ -2,13 +2,50 @@

 from __future__ import annotations

+import codecs
 from pathlib import Path

 from paperless.parsers.utils import is_tagged_pdf
+from paperless.parsers.utils import read_file_handle_unicode_errors

 SAMPLES = Path(__file__).parent / "samples" / "tesseract"


+class TestReadFileHandleUnicodeErrors:
+    def test_plain_utf8(self, tmp_path: Path) -> None:
+        f = tmp_path / "plain.txt"
+        f.write_bytes(b"hello world")
+        assert read_file_handle_unicode_errors(f) == "hello world"
+
+    def test_utf8_bom(self, tmp_path: Path) -> None:
+        f = tmp_path / "bom.txt"
+        f.write_bytes(codecs.BOM_UTF8 + b"hello")
+        assert read_file_handle_unicode_errors(f) == "hello"
+
+    def test_utf16_le(self, tmp_path: Path) -> None:
+        f = tmp_path / "utf16le.txt"
+        f.write_bytes(codecs.BOM_UTF16_LE + "hello".encode("utf-16-le"))
+        assert read_file_handle_unicode_errors(f) == "hello"
+
+    def test_utf16_be(self, tmp_path: Path) -> None:
+        f = tmp_path / "utf16be.txt"
+        f.write_bytes(codecs.BOM_UTF16_BE + "hello".encode("utf-16-be"))
+        assert read_file_handle_unicode_errors(f) == "hello"
+
+    def test_nul_bytes_stripped(self, tmp_path: Path) -> None:
+        f = tmp_path / "null-bytes.txt"
+        f.write_bytes(b"foo\x00bar")
+        assert read_file_handle_unicode_errors(f) == "foobar"
+
+    def test_invalid_utf8_replaced(self, tmp_path: Path) -> None:
+        f = tmp_path / "bad.txt"
+        f.write_bytes(b"ok\x80\x81bad")
+        result = read_file_handle_unicode_errors(f)
+        assert "ok" in result
+        assert "bad" in result
+        assert "\x00" not in result
+
+
 class TestIsTaggedPdf:
    def test_tagged_pdf_returns_true(self) -> None:
        assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True