mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-30 18:35:30 +00:00
Fix (beta): normalize long punctuation chunks to improve embedding (#12848)
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import json
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
@@ -14,6 +15,9 @@ from paperless.config import AIConfig
|
||||
from paperless.models import LLMEmbeddingBackend
|
||||
from paperless.network import validate_outbound_http_url
|
||||
|
||||
OCR_LEADER_REGEX = re.compile(r"[._\-\u00b7]{4,}")
|
||||
HORIZONTAL_WHITESPACE_REGEX = re.compile(r"[ \t\u00a0]+")
|
||||
|
||||
|
||||
def get_embedding_model() -> "BaseEmbedding":
|
||||
config = AIConfig()
|
||||
@@ -100,6 +104,11 @@ def get_embedding_dim() -> int:
|
||||
return dim
|
||||
|
||||
|
||||
def _normalize_llm_index_text(text: str) -> str:
|
||||
text = OCR_LEADER_REGEX.sub(" ", text)
|
||||
return HORIZONTAL_WHITESPACE_REGEX.sub(" ", text)
|
||||
|
||||
|
||||
def build_llm_index_text(doc: Document) -> str:
|
||||
lines = [
|
||||
f"Title: {doc.title}",
|
||||
@@ -121,4 +130,4 @@ def build_llm_index_text(doc: Document) -> str:
|
||||
lines.append("\nContent:\n")
|
||||
lines.append(doc.content or "")
|
||||
|
||||
return "\n".join(lines)
|
||||
return _normalize_llm_index_text("\n".join(lines))
|
||||
|
||||
@@ -7,6 +7,7 @@ from django.conf import settings
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.models import LLMEmbeddingBackend
|
||||
from paperless_ai.embedding import _normalize_llm_index_text
|
||||
from paperless_ai.embedding import build_llm_index_text
|
||||
from paperless_ai.embedding import get_embedding_dim
|
||||
from paperless_ai.embedding import get_embedding_model
|
||||
@@ -243,3 +244,27 @@ def test_build_llm_index_text(mock_document):
|
||||
assert "Notes: Note1,Note2" in result
|
||||
assert "Content:\n\nThis is the document content." in result
|
||||
assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result
|
||||
|
||||
|
||||
def test_build_llm_index_text_normalizes_ocr_punctuation_runs(mock_document):
|
||||
mock_document.content = (
|
||||
"Introduction ................................................ 7\n"
|
||||
"Hardware Limitation ________________________________________ 9\n"
|
||||
"Keep short punctuation like INV-100 and ellipses..."
|
||||
)
|
||||
|
||||
with patch("documents.models.Note.objects.filter", return_value=[]):
|
||||
result = build_llm_index_text(mock_document)
|
||||
|
||||
assert "Introduction 7" in result
|
||||
assert "Hardware Limitation 9" in result
|
||||
assert "INV-100" in result
|
||||
assert "ellipses..." in result
|
||||
|
||||
|
||||
def test_normalize_llm_index_text_collapses_ocr_leaders_without_joining_lines():
|
||||
assert _normalize_llm_index_text("A........B\nC____D----E") == "A B\nC D E"
|
||||
|
||||
|
||||
def test_normalize_llm_index_text_collapses_non_breaking_spaces():
|
||||
assert _normalize_llm_index_text("A\u00a0........\u00a0B") == "A B"
|
||||
|
||||
Reference in New Issue
Block a user