Fix (beta): normalize long punctuation chunks to improve embedding (#12848)

2026-07-15 00:14:53 +00:00 · 2026-05-26 09:32:38 -07:00
parent 0a6e0db186
commit da3e845b8b
2 changed files with 35 additions and 1 deletion
@@ -1,4 +1,5 @@
 import json
+import re
 from typing import TYPE_CHECKING

 from django.conf import settings
@@ -14,6 +15,9 @@ from paperless.config import AIConfig
 from paperless.models import LLMEmbeddingBackend
 from paperless.network import validate_outbound_http_url

+OCR_LEADER_REGEX = re.compile(r"[._\-\u00b7]{4,}")
+HORIZONTAL_WHITESPACE_REGEX = re.compile(r"[ \t\u00a0]+")
+

 def get_embedding_model() -> "BaseEmbedding":
    config = AIConfig()
@@ -100,6 +104,11 @@ def get_embedding_dim() -> int:
    return dim


+def _normalize_llm_index_text(text: str) -> str:
+    text = OCR_LEADER_REGEX.sub(" ", text)
+    return HORIZONTAL_WHITESPACE_REGEX.sub(" ", text)
+
+
 def build_llm_index_text(doc: Document) -> str:
    lines = [
        f"Title: {doc.title}",
@@ -121,4 +130,4 @@ def build_llm_index_text(doc: Document) -> str:
    lines.append("\nContent:\n")
    lines.append(doc.content or "")

-    return "\n".join(lines)
+    return _normalize_llm_index_text("\n".join(lines))
@@ -7,6 +7,7 @@ from django.conf import settings

 from documents.models import Document
 from paperless.models import LLMEmbeddingBackend
+from paperless_ai.embedding import _normalize_llm_index_text
 from paperless_ai.embedding import build_llm_index_text
 from paperless_ai.embedding import get_embedding_dim
 from paperless_ai.embedding import get_embedding_model
@@ -243,3 +244,27 @@ def test_build_llm_index_text(mock_document):
        assert "Notes: Note1,Note2" in result
        assert "Content:\n\nThis is the document content." in result
        assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result
+
+
+def test_build_llm_index_text_normalizes_ocr_punctuation_runs(mock_document):
+    mock_document.content = (
+        "Introduction ................................................ 7\n"
+        "Hardware Limitation ________________________________________ 9\n"
+        "Keep short punctuation like INV-100 and ellipses..."
+    )
+
+    with patch("documents.models.Note.objects.filter", return_value=[]):
+        result = build_llm_index_text(mock_document)
+
+    assert "Introduction 7" in result
+    assert "Hardware Limitation 9" in result
+    assert "INV-100" in result
+    assert "ellipses..." in result
+
+
+def test_normalize_llm_index_text_collapses_ocr_leaders_without_joining_lines():
+    assert _normalize_llm_index_text("A........B\nC____D----E") == "A B\nC D E"
+
+
+def test_normalize_llm_index_text_collapses_non_breaking_spaces():
+    assert _normalize_llm_index_text("A\u00a0........\u00a0B") == "A B"