Handle the UTF 16 and BOM text files better

This commit is contained in:
stumpylog
2026-06-01 13:51:11 -07:00
parent 2c58d86380
commit 2b19e28317
3 changed files with 57 additions and 33 deletions
+2 -28
View File
@@ -20,6 +20,7 @@ from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from paperless.parsers.utils import read_file_handle_unicode_errors
from paperless.version import __full_version_str__
if TYPE_CHECKING:
@@ -183,7 +184,7 @@ class TextDocumentParser:
documents.parsers.ParseError
If the file cannot be read.
"""
self._text = self._read_text(document_path)
self._text = read_file_handle_unicode_errors(document_path, log=logger)
# ------------------------------------------------------------------
# Result accessors
@@ -295,30 +296,3 @@ class TextDocumentParser:
Always ``[]`` — plain text files carry no structured metadata.
"""
return []
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _read_text(self, filepath: Path) -> str:
"""Read file content, replacing invalid UTF-8 bytes rather than failing.
Parameters
----------
filepath:
Path to the file to read.
Returns
-------
str
File content as a string.
"""
try:
return filepath.read_text(encoding="utf-8")
except UnicodeDecodeError as exc:
logger.warning(
"Unicode error reading %s, replacing bad bytes: %s",
filepath,
exc,
)
return filepath.read_bytes().decode("utf-8", errors="replace")
+18 -5
View File
@@ -8,6 +8,7 @@ share implementation.
from __future__ import annotations
import codecs
import logging
import re
import tempfile
@@ -114,7 +115,7 @@ def read_file_handle_unicode_errors(
filepath: Path,
log: logging.Logger | None = None,
) -> str:
"""Read a file as UTF-8 text, replacing invalid bytes rather than raising.
"""Read a file as text, detecting encoding via BOM and stripping NUL bytes.
Parameters
----------
@@ -127,15 +128,27 @@ def read_file_handle_unicode_errors(
Returns
-------
str
File content as a string, with any invalid UTF-8 sequences replaced
by the Unicode replacement character.
File content as a string, with NUL bytes removed so the result is
safe to store in PostgreSQL text fields.
"""
_log = log or logger
raw = filepath.read_bytes()
if raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
encoding = "utf-16"
elif raw.startswith(codecs.BOM_UTF8):
encoding = "utf-8-sig"
else:
encoding = "utf-8"
try:
return filepath.read_text(encoding="utf-8")
text = raw.decode(encoding)
except UnicodeDecodeError as e:
_log.warning("Unicode error during text reading, continuing: %s", e)
return filepath.read_bytes().decode("utf-8", errors="replace")
text = raw.decode("utf-8", errors="replace")
# PostgreSQL rejects NUL (0x00) bytes in text fields
return text.replace("\x00", "")
def get_page_count_for_pdf(
+37
View File
@@ -2,13 +2,50 @@
from __future__ import annotations
import codecs
from pathlib import Path
from paperless.parsers.utils import is_tagged_pdf
from paperless.parsers.utils import read_file_handle_unicode_errors
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
class TestReadFileHandleUnicodeErrors:
def test_plain_utf8(self, tmp_path: Path) -> None:
f = tmp_path / "plain.txt"
f.write_bytes(b"hello world")
assert read_file_handle_unicode_errors(f) == "hello world"
def test_utf8_bom(self, tmp_path: Path) -> None:
f = tmp_path / "bom.txt"
f.write_bytes(codecs.BOM_UTF8 + b"hello")
assert read_file_handle_unicode_errors(f) == "hello"
def test_utf16_le(self, tmp_path: Path) -> None:
f = tmp_path / "utf16le.txt"
f.write_bytes(codecs.BOM_UTF16_LE + "hello".encode("utf-16-le"))
assert read_file_handle_unicode_errors(f) == "hello"
def test_utf16_be(self, tmp_path: Path) -> None:
f = tmp_path / "utf16be.txt"
f.write_bytes(codecs.BOM_UTF16_BE + "hello".encode("utf-16-be"))
assert read_file_handle_unicode_errors(f) == "hello"
def test_nul_bytes_stripped(self, tmp_path: Path) -> None:
f = tmp_path / "null-bytes.txt"
f.write_bytes(b"foo\x00bar")
assert read_file_handle_unicode_errors(f) == "foobar"
def test_invalid_utf8_replaced(self, tmp_path: Path) -> None:
f = tmp_path / "bad.txt"
f.write_bytes(b"ok\x80\x81bad")
result = read_file_handle_unicode_errors(f)
assert "ok" in result
assert "bad" in result
assert "\x00" not in result
class TestIsTaggedPdf:
def test_tagged_pdf_returns_true(self) -> None:
assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True