Fix (beta): normalize long punctuation chunks to improve embedding (#12848)

This commit is contained in:
shamoon
2026-05-26 09:32:38 -07:00
committed by GitHub
parent 0a6e0db186
commit da3e845b8b
2 changed files with 35 additions and 1 deletions
+10 -1
View File
@@ -1,4 +1,5 @@
import json
import re
from typing import TYPE_CHECKING
from django.conf import settings
@@ -14,6 +15,9 @@ from paperless.config import AIConfig
from paperless.models import LLMEmbeddingBackend
from paperless.network import validate_outbound_http_url
# Matches a run of four or more consecutive "leader" characters: ".", "_",
# "-", or the middle dot (U+00B7). Shorter runs (e.g. a three-dot ellipsis or
# a single hyphen) do not match.
OCR_LEADER_REGEX = re.compile(r"[._\-\u00b7]{4,}")
# Matches one or more spaces, tabs, or non-breaking spaces (U+00A0). Newlines
# are not in the character class, so line breaks are never matched.
HORIZONTAL_WHITESPACE_REGEX = re.compile(r"[ \t\u00a0]+")
def get_embedding_model() -> "BaseEmbedding":
config = AIConfig()
@@ -100,6 +104,11 @@ def get_embedding_dim() -> int:
return dim
def _normalize_llm_index_text(text: str) -> str:
    """Strip long punctuation leader runs and collapse horizontal whitespace.

    Two substitutions are applied in order:

    1. Every match of ``OCR_LEADER_REGEX`` is replaced by a single space.
    2. Every match of ``HORIZONTAL_WHITESPACE_REGEX`` is replaced by a single
       space, which also merges the spaces introduced by step 1 with any
       neighbouring whitespace.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    without_leaders = OCR_LEADER_REGEX.sub(" ", text)
    collapsed = HORIZONTAL_WHITESPACE_REGEX.sub(" ", without_leaders)
    return collapsed
def build_llm_index_text(doc: Document) -> str:
lines = [
f"Title: {doc.title}",
@@ -121,4 +130,4 @@ def build_llm_index_text(doc: Document) -> str:
lines.append("\nContent:\n")
lines.append(doc.content or "")
return "\n".join(lines)
return _normalize_llm_index_text("\n".join(lines))
+25
View File
@@ -7,6 +7,7 @@ from django.conf import settings
from documents.models import Document
from paperless.models import LLMEmbeddingBackend
from paperless_ai.embedding import _normalize_llm_index_text
from paperless_ai.embedding import build_llm_index_text
from paperless_ai.embedding import get_embedding_dim
from paperless_ai.embedding import get_embedding_model
@@ -243,3 +244,27 @@ def test_build_llm_index_text(mock_document):
assert "Notes: Note1,Note2" in result
assert "Content:\n\nThis is the document content." in result
assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result
def test_build_llm_index_text_normalizes_ocr_punctuation_runs(mock_document):
    """Long dot/underscore runs in the content collapse; short punctuation stays.

    The document content holds two table-of-contents style lines with long
    leader runs plus a line with a hyphenated identifier and a three-dot
    ellipsis, which must both survive unchanged.
    """
    mock_document.content = (
        "Introduction ................................................ 7\n"
        "Hardware Limitation ________________________________________ 9\n"
        "Keep short punctuation like INV-100 and ellipses..."
    )

    # Notes lookup is patched to return no notes for this document.
    with patch("documents.models.Note.objects.filter", return_value=[]):
        result = build_llm_index_text(mock_document)

    expected_fragments = (
        "Introduction 7",
        "Hardware Limitation 9",
        "INV-100",
        "ellipses...",
    )
    for fragment in expected_fragments:
        assert fragment in result
def test_normalize_llm_index_text_collapses_ocr_leaders_without_joining_lines():
    """Leader runs become a single space while the newline between lines is kept."""
    raw = "A........B\nC____D----E"
    normalized = _normalize_llm_index_text(raw)
    assert normalized == "A B\nC D E"
def test_normalize_llm_index_text_collapses_non_breaking_spaces():
    """Non-breaking spaces around a leader run collapse into one plain space."""
    raw = "A\u00a0........\u00a0B"
    normalized = _normalize_llm_index_text(raw)
    assert normalized == "A B"