mirror of
https://github.com/domainaware/parsedmarc.git
synced 2026-05-19 10:25:26 +00:00
Update code formatting
This commit is contained in:
@@ -17,7 +17,10 @@ Generated by GPT-5 Use at your own risk.
|
||||
# UTF-8 scanning
|
||||
# -------------------------
|
||||
|
||||
def scan_line_for_utf8_errors(line_bytes: bytes, line_no: int, base_offset: int, context: int):
|
||||
|
||||
def scan_line_for_utf8_errors(
|
||||
line_bytes: bytes, line_no: int, base_offset: int, context: int
|
||||
):
|
||||
"""
|
||||
Scan one line of raw bytes for UTF-8 decoding errors.
|
||||
Returns a list of dicts describing each error.
|
||||
@@ -37,17 +40,19 @@ def scan_line_for_utf8_errors(line_bytes: bytes, line_no: int, base_offset: int,
|
||||
start_ctx = max(0, abs_index_in_line - context)
|
||||
end_ctx = min(len(line_bytes), abs_index_in_line + 1 + context)
|
||||
ctx_bytes = line_bytes[start_ctx:end_ctx]
|
||||
bad_byte = line_bytes[abs_index_in_line:abs_index_in_line+1]
|
||||
bad_byte = line_bytes[abs_index_in_line : abs_index_in_line + 1]
|
||||
col = abs_index_in_line + 1 # 1-based byte column
|
||||
|
||||
results.append({
|
||||
"line": line_no,
|
||||
"column": col,
|
||||
"abs_offset": abs_offset,
|
||||
"bad_byte_hex": bad_byte.hex(),
|
||||
"context_hex": ctx_bytes.hex(),
|
||||
"context_preview": ctx_bytes.decode("utf-8", errors="replace"),
|
||||
})
|
||||
results.append(
|
||||
{
|
||||
"line": line_no,
|
||||
"column": col,
|
||||
"abs_offset": abs_offset,
|
||||
"bad_byte_hex": bad_byte.hex(),
|
||||
"context_hex": ctx_bytes.hex(),
|
||||
"context_preview": ctx_bytes.decode("utf-8", errors="replace"),
|
||||
}
|
||||
)
|
||||
# Move past the offending byte and continue
|
||||
pos = abs_index_in_line + 1
|
||||
return results
|
||||
@@ -68,8 +73,10 @@ def scan_file_for_utf8_errors(path: str, context: int, limit: int):
|
||||
results = scan_line_for_utf8_errors(line, line_no, total_offset, context)
|
||||
for r in results:
|
||||
errors_found += 1
|
||||
print(f"[ERROR {errors_found}] Line {r['line']}, Column {r['column']}, "
|
||||
f"Absolute byte offset {r['abs_offset']}")
|
||||
print(
|
||||
f"[ERROR {errors_found}] Line {r['line']}, Column {r['column']}, "
|
||||
f"Absolute byte offset {r['abs_offset']}"
|
||||
)
|
||||
print(f" Bad byte: 0x{r['bad_byte_hex']}")
|
||||
print(f" Context (hex): {r['context_hex']}")
|
||||
print(f" Context (preview): {r['context_preview']}")
|
||||
@@ -90,6 +97,7 @@ def scan_file_for_utf8_errors(path: str, context: int, limit: int):
|
||||
# Whole-file conversion
|
||||
# -------------------------
|
||||
|
||||
|
||||
def detect_encoding_text(path: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Use charset-normalizer to detect file encoding.
|
||||
@@ -98,7 +106,10 @@ def detect_encoding_text(path: str) -> Tuple[str, str]:
|
||||
try:
|
||||
from charset_normalizer import from_path
|
||||
except ImportError:
|
||||
print("Please install charset-normalizer: pip install charset-normalizer", file=sys.stderr)
|
||||
print(
|
||||
"Please install charset-normalizer: pip install charset-normalizer",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(4)
|
||||
|
||||
matches = from_path(path)
|
||||
@@ -155,6 +166,7 @@ def verify_utf8_file(path: str) -> Tuple[bool, str]:
|
||||
# Targeted single-byte fixer
|
||||
# -------------------------
|
||||
|
||||
|
||||
def iter_lines_with_offsets(b: bytes):
|
||||
"""
|
||||
Yield (line_bytes, line_start_abs_offset). Preserves LF/CRLF/CR in bytes.
|
||||
@@ -162,7 +174,7 @@ def iter_lines_with_offsets(b: bytes):
|
||||
start = 0
|
||||
for i, byte in enumerate(b):
|
||||
if byte == 0x0A: # LF
|
||||
yield b[start:i+1], start
|
||||
yield b[start : i + 1], start
|
||||
start = i + 1
|
||||
if start < len(b):
|
||||
yield b[start:], start
|
||||
@@ -192,10 +204,12 @@ def repair_mixed_utf8_line(line: bytes, base_offset: int, fallback_chain: List[s
|
||||
except UnicodeDecodeError as e:
|
||||
# Append the valid prefix before the error
|
||||
if e.start > 0:
|
||||
out_fragments.append(line[pos:pos + e.start].decode("utf-8", errors="strict"))
|
||||
out_fragments.append(
|
||||
line[pos : pos + e.start].decode("utf-8", errors="strict")
|
||||
)
|
||||
|
||||
bad_index = pos + e.start # absolute index in 'line'
|
||||
bad_slice = line[bad_index:bad_index + 1] # FIX EXACTLY ONE BYTE
|
||||
bad_slice = line[bad_index : bad_index + 1] # FIX EXACTLY ONE BYTE
|
||||
|
||||
# Decode that single byte using the first working fallback
|
||||
decoded = None
|
||||
@@ -216,15 +230,17 @@ def repair_mixed_utf8_line(line: bytes, base_offset: int, fallback_chain: List[s
|
||||
|
||||
# Log the fix
|
||||
col_1based = bad_index + 1 # byte-based column
|
||||
fixes.append({
|
||||
"line_base_offset": base_offset,
|
||||
"line": None, # caller fills line number
|
||||
"column": col_1based,
|
||||
"abs_offset": base_offset + bad_index,
|
||||
"bad_bytes_hex": bad_slice.hex(),
|
||||
"used_encoding": used_enc,
|
||||
"replacement_preview": decoded
|
||||
})
|
||||
fixes.append(
|
||||
{
|
||||
"line_base_offset": base_offset,
|
||||
"line": None, # caller fills line number
|
||||
"column": col_1based,
|
||||
"abs_offset": base_offset + bad_index,
|
||||
"bad_bytes_hex": bad_slice.hex(),
|
||||
"used_encoding": used_enc,
|
||||
"replacement_preview": decoded,
|
||||
}
|
||||
)
|
||||
|
||||
# Advance exactly one byte past the offending byte and continue
|
||||
pos = bad_index + 1
|
||||
@@ -232,7 +248,13 @@ def repair_mixed_utf8_line(line: bytes, base_offset: int, fallback_chain: List[s
|
||||
return "".join(out_fragments), fixes
|
||||
|
||||
|
||||
def targeted_fix_to_utf8(src_path: str, out_path: str, fallback_chain: List[str], dry_run: bool, max_fixes: int):
|
||||
def targeted_fix_to_utf8(
|
||||
src_path: str,
|
||||
out_path: str,
|
||||
fallback_chain: List[str],
|
||||
dry_run: bool,
|
||||
max_fixes: int,
|
||||
):
|
||||
with open(src_path, "rb") as fb:
|
||||
data = fb.read()
|
||||
|
||||
@@ -261,10 +283,12 @@ def targeted_fix_to_utf8(src_path: str, out_path: str, fallback_chain: List[str]
|
||||
# Log fixes
|
||||
for f in fixes:
|
||||
total_fixes += 1
|
||||
print(f"[FIX {total_fixes}] Line {f['line']}, Column {f['column']}, Abs offset {f['abs_offset']}")
|
||||
print(
|
||||
f"[FIX {total_fixes}] Line {f['line']}, Column {f['column']}, Abs offset {f['abs_offset']}"
|
||||
)
|
||||
print(f" Bad bytes: 0x{f['bad_bytes_hex']}")
|
||||
print(f" Used encoding: {f['used_encoding']}")
|
||||
preview = f['replacement_preview'].replace("\r", "\\r").replace("\n", "\\n")
|
||||
preview = f["replacement_preview"].replace("\r", "\\r").replace("\n", "\\n")
|
||||
if len(preview) > 40:
|
||||
preview = preview[:40] + "…"
|
||||
print(f" Replacement preview: {preview}")
|
||||
@@ -299,6 +323,7 @@ def targeted_fix_to_utf8(src_path: str, out_path: str, fallback_chain: List[str]
|
||||
# CLI
|
||||
# -------------------------
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(
|
||||
description=(
|
||||
@@ -310,26 +335,60 @@ def main():
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
)
|
||||
ap.add_argument("path", help="Path to the CSV/text file")
|
||||
ap.add_argument("--context", type=int, default=20, help="Bytes of context to show around errors (default: 20)")
|
||||
ap.add_argument("--limit", type=int, default=100, help="Max errors to report during scan (0 = unlimited)")
|
||||
ap.add_argument("--skip-scan", action="store_true", help="Skip initial scan for speed")
|
||||
ap.add_argument(
|
||||
"--context",
|
||||
type=int,
|
||||
default=20,
|
||||
help="Bytes of context to show around errors (default: 20)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--limit",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Max errors to report during scan (0 = unlimited)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--skip-scan", action="store_true", help="Skip initial scan for speed"
|
||||
)
|
||||
|
||||
# Whole-file convert
|
||||
ap.add_argument("--convert", action="store_true",
|
||||
help="Convert entire file to UTF-8 using auto/forced encoding "
|
||||
"(in-place by default; creates '<input>.bak').")
|
||||
ap.add_argument("--encoding", help="Force source encoding for --convert or first fallback for --fix")
|
||||
ap.add_argument("--output", help="Write to this path instead of in-place (no .bak is created in that case)")
|
||||
ap.add_argument(
|
||||
"--convert",
|
||||
action="store_true",
|
||||
help="Convert entire file to UTF-8 using auto/forced encoding "
|
||||
"(in-place by default; creates '<input>.bak').",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--encoding",
|
||||
help="Force source encoding for --convert or first fallback for --fix",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--output",
|
||||
help="Write to this path instead of in-place (no .bak is created in that case)",
|
||||
)
|
||||
|
||||
# Targeted fix
|
||||
ap.add_argument("--fix", action="store_true",
|
||||
help="Fix only invalid byte(s) via fallback encodings "
|
||||
"(in-place by default; creates '<input>.bak').")
|
||||
ap.add_argument("--fallbacks",
|
||||
help="Comma-separated fallback encodings (default: cp1252,iso-8859-1,iso-8859-15)")
|
||||
ap.add_argument("--dry-run", action="store_true",
|
||||
help="(fix) Print fixes but do not write or create a .bak")
|
||||
ap.add_argument("--max-fixes", type=int, default=0, help="(fix) Stop after N fixes (0 = unlimited)")
|
||||
ap.add_argument(
|
||||
"--fix",
|
||||
action="store_true",
|
||||
help="Fix only invalid byte(s) via fallback encodings "
|
||||
"(in-place by default; creates '<input>.bak').",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--fallbacks",
|
||||
help="Comma-separated fallback encodings (default: cp1252,iso-8859-1,iso-8859-15)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="(fix) Print fixes but do not write or create a .bak",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--max-fixes",
|
||||
type=int,
|
||||
default=0,
|
||||
help="(fix) Stop after N fixes (0 = unlimited)",
|
||||
)
|
||||
|
||||
args = ap.parse_args()
|
||||
path = args.path
|
||||
|
||||
Reference in New Issue
Block a user