Handle UTF-16 and BOM-prefixed text files better

2026-06-06 13:49:44 +00:00 · 2026-06-01 13:51:11 -07:00
parent 2c58d86380
commit 2b19e28317
3 changed files with 57 additions and 33 deletions
@@ -20,6 +20,7 @@ from PIL import Image
 from PIL import ImageDraw
 from PIL import ImageFont

+from paperless.parsers.utils import read_file_handle_unicode_errors
 from paperless.version import __full_version_str__

 if TYPE_CHECKING:
@@ -183,7 +184,7 @@ class TextDocumentParser:
        documents.parsers.ParseError
            If the file cannot be read.
        """
-        self._text = self._read_text(document_path)
+        self._text = read_file_handle_unicode_errors(document_path, log=logger)

    # ------------------------------------------------------------------
    # Result accessors
@@ -295,30 +296,3 @@ class TextDocumentParser:
            Always ``[]`` — plain text files carry no structured metadata.
        """
        return []
-
-    # ------------------------------------------------------------------
-    # Private helpers
-    # ------------------------------------------------------------------
-
-    def _read_text(self, filepath: Path) -> str:
-        """Read file content, replacing invalid UTF-8 bytes rather than failing.
-
-        Parameters
-        ----------
-        filepath:
-            Path to the file to read.
-
-        Returns
-        -------
-        str
-            File content as a string.
-        """
-        try:
-            return filepath.read_text(encoding="utf-8")
-        except UnicodeDecodeError as exc:
-            logger.warning(
-                "Unicode error reading %s, replacing bad bytes: %s",
-                filepath,
-                exc,
-            )
-            return filepath.read_bytes().decode("utf-8", errors="replace")
@@ -8,6 +8,7 @@ share implementation.

 from __future__ import annotations

+import codecs
 import logging
 import re
 import tempfile
@@ -114,7 +115,7 @@ def read_file_handle_unicode_errors(
    filepath: Path,
    log: logging.Logger | None = None,
 ) -> str:
-    """Read a file as UTF-8 text, replacing invalid bytes rather than raising.
+    """Read a file as text, detecting encoding via BOM and stripping NUL bytes.

    Parameters
    ----------
@@ -127,15 +128,27 @@ def read_file_handle_unicode_errors(
    Returns
    -------
    str
-        File content as a string, with any invalid UTF-8 sequences replaced
-        by the Unicode replacement character.
+        File content as a string, with NUL bytes removed so the result is
+        safe to store in PostgreSQL text fields.
    """
    _log = log or logger
+    raw = filepath.read_bytes()
+
+    if raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
+        encoding = "utf-16"
+    elif raw.startswith(codecs.BOM_UTF8):
+        encoding = "utf-8-sig"
+    else:
+        encoding = "utf-8"
+
    try:
-        return filepath.read_text(encoding="utf-8")
+        text = raw.decode(encoding)
    except UnicodeDecodeError as e:
        _log.warning("Unicode error during text reading, continuing: %s", e)
-        return filepath.read_bytes().decode("utf-8", errors="replace")
+        text = raw.decode("utf-8", errors="replace")
+
+    # PostgreSQL rejects NUL (0x00) bytes in text fields
+    return text.replace("\x00", "")


 def get_page_count_for_pdf(
@@ -2,13 +2,50 @@

 from __future__ import annotations

+import codecs
 from pathlib import Path

 from paperless.parsers.utils import is_tagged_pdf
+from paperless.parsers.utils import read_file_handle_unicode_errors

 SAMPLES = Path(__file__).parent / "samples" / "tesseract"


+class TestReadFileHandleUnicodeErrors:
+    def test_plain_utf8(self, tmp_path: Path) -> None:
+        f = tmp_path / "plain.txt"
+        f.write_bytes(b"hello world")
+        assert read_file_handle_unicode_errors(f) == "hello world"
+
+    def test_utf8_bom(self, tmp_path: Path) -> None:
+        f = tmp_path / "bom.txt"
+        f.write_bytes(codecs.BOM_UTF8 + b"hello")
+        assert read_file_handle_unicode_errors(f) == "hello"
+
+    def test_utf16_le(self, tmp_path: Path) -> None:
+        f = tmp_path / "utf16le.txt"
+        f.write_bytes(codecs.BOM_UTF16_LE + "hello".encode("utf-16-le"))
+        assert read_file_handle_unicode_errors(f) == "hello"
+
+    def test_utf16_be(self, tmp_path: Path) -> None:
+        f = tmp_path / "utf16be.txt"
+        f.write_bytes(codecs.BOM_UTF16_BE + "hello".encode("utf-16-be"))
+        assert read_file_handle_unicode_errors(f) == "hello"
+
+    def test_nul_bytes_stripped(self, tmp_path: Path) -> None:
+        f = tmp_path / "null-bytes.txt"
+        f.write_bytes(b"foo\x00bar")
+        assert read_file_handle_unicode_errors(f) == "foobar"
+
+    def test_invalid_utf8_replaced(self, tmp_path: Path) -> None:
+        f = tmp_path / "bad.txt"
+        f.write_bytes(b"ok\x80\x81bad")
+        result = read_file_handle_unicode_errors(f)
+        assert "ok" in result
+        assert "bad" in result
+        assert "\x00" not in result
+
+
 class TestIsTaggedPdf:
    def test_tagged_pdf_returns_true(self) -> None:
        assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True