mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-06 13:49:44 +00:00
Handle UTF-16 and BOM-prefixed text files better
This commit is contained in:
@@ -20,6 +20,7 @@ from PIL import Image
|
||||
from PIL import ImageDraw
|
||||
from PIL import ImageFont
|
||||
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -183,7 +184,7 @@ class TextDocumentParser:
|
||||
documents.parsers.ParseError
|
||||
If the file cannot be read.
|
||||
"""
|
||||
self._text = self._read_text(document_path)
|
||||
self._text = read_file_handle_unicode_errors(document_path, log=logger)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
@@ -295,30 +296,3 @@ class TextDocumentParser:
|
||||
Always ``[]`` — plain text files carry no structured metadata.
|
||||
"""
|
||||
return []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _read_text(self, filepath: Path) -> str:
    """Return the text of *filepath*, tolerating bytes that are not valid UTF-8.

    A strict UTF-8 read is attempted first.  If that raises
    ``UnicodeDecodeError``, a warning is logged and the file's raw bytes
    are decoded again with undecodable sequences replaced.

    Parameters
    ----------
    filepath:
        Path to the file to read.

    Returns
    -------
    str
        File content as a string.
    """
    try:
        # Strict pass: any malformed sequence raises and triggers the fallback.
        with filepath.open(encoding="utf-8") as handle:
            content = handle.read()
    except UnicodeDecodeError as exc:
        logger.warning(
            "Unicode error reading %s, replacing bad bytes: %s",
            filepath,
            exc,
        )
        # Lenient pass: re-read the raw bytes and substitute bad sequences.
        raw = filepath.read_bytes()
        content = raw.decode("utf-8", errors="replace")
    return content
|
||||
|
||||
@@ -8,6 +8,7 @@ share implementation.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import codecs
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
@@ -114,7 +115,7 @@ def read_file_handle_unicode_errors(
|
||||
filepath: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> str:
|
||||
"""Read a file as UTF-8 text, replacing invalid bytes rather than raising.
|
||||
"""Read a file as text, detecting encoding via BOM and stripping NUL bytes.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -127,15 +128,27 @@ def read_file_handle_unicode_errors(
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
File content as a string, with any invalid UTF-8 sequences replaced
|
||||
by the Unicode replacement character.
|
||||
File content as a string, with NUL bytes removed so the result is
|
||||
safe to store in PostgreSQL text fields.
|
||||
"""
|
||||
_log = log or logger
|
||||
raw = filepath.read_bytes()
|
||||
|
||||
if raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
|
||||
encoding = "utf-16"
|
||||
elif raw.startswith(codecs.BOM_UTF8):
|
||||
encoding = "utf-8-sig"
|
||||
else:
|
||||
encoding = "utf-8"
|
||||
|
||||
try:
|
||||
return filepath.read_text(encoding="utf-8")
|
||||
text = raw.decode(encoding)
|
||||
except UnicodeDecodeError as e:
|
||||
_log.warning("Unicode error during text reading, continuing: %s", e)
|
||||
return filepath.read_bytes().decode("utf-8", errors="replace")
|
||||
text = raw.decode("utf-8", errors="replace")
|
||||
|
||||
# PostgreSQL rejects NUL (0x00) bytes in text fields
|
||||
return text.replace("\x00", "")
|
||||
|
||||
|
||||
def get_page_count_for_pdf(
|
||||
|
||||
@@ -2,13 +2,50 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import codecs
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers.utils import is_tagged_pdf
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
|
||||
SAMPLES = Path(__file__).parent / "samples" / "tesseract"
|
||||
|
||||
|
||||
class TestReadFileHandleUnicodeErrors:
    """Tests for ``read_file_handle_unicode_errors``.

    Each test writes a small byte payload into ``tmp_path`` and checks the
    decoded string returned by the helper.
    """

    def test_plain_utf8(self, tmp_path: Path) -> None:
        """Plain ASCII/UTF-8 content without a BOM is returned unchanged."""
        f = tmp_path / "plain.txt"
        f.write_bytes(b"hello world")
        assert read_file_handle_unicode_errors(f) == "hello world"

    def test_utf8_bom(self, tmp_path: Path) -> None:
        """A leading UTF-8 BOM is not part of the returned text."""
        f = tmp_path / "bom.txt"
        f.write_bytes(codecs.BOM_UTF8 + b"hello")
        assert read_file_handle_unicode_errors(f) == "hello"

    def test_utf16_le(self, tmp_path: Path) -> None:
        """Little-endian UTF-16 with a BOM is decoded to the original text."""
        f = tmp_path / "utf16le.txt"
        f.write_bytes(codecs.BOM_UTF16_LE + "hello".encode("utf-16-le"))
        assert read_file_handle_unicode_errors(f) == "hello"

    def test_utf16_be(self, tmp_path: Path) -> None:
        """Big-endian UTF-16 with a BOM is decoded to the original text."""
        f = tmp_path / "utf16be.txt"
        f.write_bytes(codecs.BOM_UTF16_BE + "hello".encode("utf-16-be"))
        assert read_file_handle_unicode_errors(f) == "hello"

    def test_nul_bytes_stripped(self, tmp_path: Path) -> None:
        """NUL characters are removed from the returned text."""
        f = tmp_path / "null-bytes.txt"
        f.write_bytes(b"foo\x00bar")
        assert read_file_handle_unicode_errors(f) == "foobar"

    def test_invalid_utf8_replaced(self, tmp_path: Path) -> None:
        """Invalid UTF-8 bytes become U+FFFD; surrounding text is preserved."""
        f = tmp_path / "bad.txt"
        f.write_bytes(b"ok\x80\x81bad")
        result = read_file_handle_unicode_errors(f)
        # 0x80 and 0x81 are stray continuation bytes; with errors="replace"
        # each one is substituted by its own replacement character.  Pinning
        # the exact output proves the bytes were replaced, not dropped.
        assert result == "ok\ufffd\ufffdbad"
|
||||
|
||||
|
||||
class TestIsTaggedPdf:
|
||||
def test_tagged_pdf_returns_true(self) -> None:
|
||||
assert is_tagged_pdf(SAMPLES / "simple-digital.pdf") is True
|
||||
|
||||
Reference in New Issue
Block a user