diff --git a/src/paperless_ai/embedding.py b/src/paperless_ai/embedding.py
index e03ddae72..7ef841e4b 100644
--- a/src/paperless_ai/embedding.py
+++ b/src/paperless_ai/embedding.py
@@ -1,4 +1,5 @@
 import json
+import re
 from typing import TYPE_CHECKING
 
 from django.conf import settings
@@ -14,6 +15,9 @@ from paperless.config import AIConfig
 from paperless.models import LLMEmbeddingBackend
 from paperless.network import validate_outbound_http_url
 
+OCR_LEADER_REGEX = re.compile(r"[._\-\u00b7]{4,}")  # 4+ of . _ - or U+00B7 in a row
+HORIZONTAL_WHITESPACE_REGEX = re.compile(r"[ \t\u00a0]+")  # space/tab/NBSP, no "\n"
+
 
 def get_embedding_model() -> "BaseEmbedding":
     config = AIConfig()
@@ -100,6 +104,11 @@ def get_embedding_dim() -> int:
     return dim
 
 
+def _normalize_llm_index_text(text: str) -> str:
+    """Replace OCR leader runs with a space, then collapse horizontal whitespace."""
+    return HORIZONTAL_WHITESPACE_REGEX.sub(" ", OCR_LEADER_REGEX.sub(" ", text))
+
+
 def build_llm_index_text(doc: Document) -> str:
     lines = [
         f"Title: {doc.title}",
@@ -121,4 +130,4 @@ def build_llm_index_text(doc: Document) -> str:
     lines.append("\nContent:\n")
     lines.append(doc.content or "")
 
-    return "\n".join(lines)
+    return _normalize_llm_index_text("\n".join(lines))
diff --git a/src/paperless_ai/tests/test_embedding.py b/src/paperless_ai/tests/test_embedding.py
index 6ee1994b8..f8bc98f7d 100644
--- a/src/paperless_ai/tests/test_embedding.py
+++ b/src/paperless_ai/tests/test_embedding.py
@@ -7,6 +7,7 @@ from django.conf import settings
 
 from documents.models import Document
 from paperless.models import LLMEmbeddingBackend
+from paperless_ai.embedding import _normalize_llm_index_text
 from paperless_ai.embedding import build_llm_index_text
 from paperless_ai.embedding import get_embedding_dim
 from paperless_ai.embedding import get_embedding_model
@@ -243,3 +244,27 @@ def test_build_llm_index_text(mock_document):
         assert "Notes: Note1,Note2" in result
         assert "Content:\n\nThis is the document content." in result
         assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result
+
+
+def test_build_llm_index_text_normalizes_ocr_punctuation_runs(mock_document):
+    mock_document.content = (
+        "Introduction ................................................ 7\n"
+        "Hardware Limitation ________________________________________ 9\n"
+        "Keep short punctuation like INV-100 and ellipses..."
+    )
+    # Stub the Note lookup with an empty result for this call.
+    with patch("documents.models.Note.objects.filter", return_value=[]):
+        result = build_llm_index_text(mock_document)
+    # Long leader runs shrink to one space; runs shorter than four chars are kept.
+    assert "Introduction 7" in result
+    assert "Hardware Limitation 9" in result
+    assert "INV-100" in result
+    assert "ellipses..." in result
+
+
+def test_normalize_llm_index_text_collapses_ocr_leaders_without_joining_lines():
+    assert _normalize_llm_index_text("A........B\nC____D----E") == "A B\nC D E"
+
+
+def test_normalize_llm_index_text_collapses_non_breaking_spaces():
+    assert _normalize_llm_index_text("A\u00a0........\u00a0B") == "A B"