Normalize OCR leader runs and horizontal whitespace in the LLM index text.

src/paperless_ai/embedding.py
  * OCR_LEADER_REGEX matches a run of four or more characters taken from
    '.', '_', '-' and U+00B7 (middle dot); the characters may be mixed
    within one run. Runs of three or fewer are not matched.
  * HORIZONTAL_WHITESPACE_REGEX matches a run of one or more spaces, tabs
    or U+00A0 (no-break space).
  * _normalize_llm_index_text(text) first replaces every leader run with a
    single space, then replaces every horizontal-whitespace run with a
    single space. "\n" is in neither character class, so line breaks are
    left in place.
  * build_llm_index_text() now passes the joined text (metadata lines and
    document content alike) through _normalize_llm_index_text() before
    returning it.

src/paperless_ai/tests/test_embedding.py
  * Adds tests covering: table-of-contents style dot/underscore leaders
    collapsing to one space, short punctuation ("INV-100", "ellipses...")
    being kept, lines not being joined, and no-break spaces being collapsed.

diff --git a/src/paperless_ai/embedding.py b/src/paperless_ai/embedding.py
index e03ddae72..7ef841e4b 100644
--- a/src/paperless_ai/embedding.py
+++ b/src/paperless_ai/embedding.py
@@ -1,4 +1,5 @@
 import json
+import re
 from typing import TYPE_CHECKING
 
 from django.conf import settings
@@ -14,6 +15,9 @@ from paperless.config import AIConfig
 from paperless.models import LLMEmbeddingBackend
 from paperless.network import validate_outbound_http_url
 
+OCR_LEADER_REGEX = re.compile(r"[._\-\u00b7]{4,}")
+HORIZONTAL_WHITESPACE_REGEX = re.compile(r"[ \t\u00a0]+")
+
 
 def get_embedding_model() -> "BaseEmbedding":
     config = AIConfig()
@@ -100,6 +104,11 @@ def get_embedding_dim() -> int:
     return dim
 
 
+def _normalize_llm_index_text(text: str) -> str:
+    text = OCR_LEADER_REGEX.sub(" ", text)
+    return HORIZONTAL_WHITESPACE_REGEX.sub(" ", text)
+
+
 def build_llm_index_text(doc: Document) -> str:
     lines = [
         f"Title: {doc.title}",
@@ -121,4 +130,4 @@ def build_llm_index_text(doc: Document) -> str:
     lines.append("\nContent:\n")
     lines.append(doc.content or "")
 
-    return "\n".join(lines)
+    return _normalize_llm_index_text("\n".join(lines))
diff --git a/src/paperless_ai/tests/test_embedding.py b/src/paperless_ai/tests/test_embedding.py
index 6ee1994b8..f8bc98f7d 100644
--- a/src/paperless_ai/tests/test_embedding.py
+++ b/src/paperless_ai/tests/test_embedding.py
@@ -7,6 +7,7 @@ from django.conf import settings
 
 from documents.models import Document
 from paperless.models import LLMEmbeddingBackend
+from paperless_ai.embedding import _normalize_llm_index_text
 from paperless_ai.embedding import build_llm_index_text
 from paperless_ai.embedding import get_embedding_dim
 from paperless_ai.embedding import get_embedding_model
@@ -243,3 +244,27 @@ def test_build_llm_index_text(mock_document):
     assert "Notes: Note1,Note2" in result
     assert "Content:\n\nThis is the document content." in result
     assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result
+
+
+def test_build_llm_index_text_normalizes_ocr_punctuation_runs(mock_document):
+    mock_document.content = (
+        "Introduction ................................................ 7\n"
+        "Hardware Limitation ________________________________________ 9\n"
+        "Keep short punctuation like INV-100 and ellipses..."
+    )
+
+    with patch("documents.models.Note.objects.filter", return_value=[]):
+        result = build_llm_index_text(mock_document)
+
+    assert "Introduction 7" in result
+    assert "Hardware Limitation 9" in result
+    assert "INV-100" in result
+    assert "ellipses..." in result
+
+
+def test_normalize_llm_index_text_collapses_ocr_leaders_without_joining_lines():
+    assert _normalize_llm_index_text("A........B\nC____D----E") == "A B\nC D E"
+
+
+def test_normalize_llm_index_text_collapses_non_breaking_spaces():
+    assert _normalize_llm_index_text("A\u00a0........\u00a0B") == "A B"