mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-06 08:08:51 +00:00
feat: add highlight_page/highlight_page_size params to search()
Gate expensive snippet/highlight generation to only the requested slice of hits, allowing the viewset to avoid generating highlights for all 10k results when only 25 are displayed. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -435,6 +435,8 @@ class TantivyBackend:
|
||||
*,
|
||||
sort_reverse: bool,
|
||||
search_mode: SearchMode = SearchMode.QUERY,
|
||||
highlight_page: int | None = None,
|
||||
highlight_page_size: int | None = None,
|
||||
) -> SearchResults:
|
||||
"""
|
||||
Execute a search query against the document index.
|
||||
@@ -533,6 +535,15 @@ class TantivyBackend:
|
||||
hits: list[SearchHit] = []
|
||||
snippet_generator = None
|
||||
|
||||
# Determine which hits need highlights
|
||||
if highlight_page is not None and highlight_page_size is not None:
|
||||
hl_start = (highlight_page - 1) * highlight_page_size
|
||||
hl_end = hl_start + highlight_page_size
|
||||
else:
|
||||
# Highlight all hits (backward-compatible default)
|
||||
hl_start = 0
|
||||
hl_end = len(page_hits)
|
||||
|
||||
for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
|
||||
# Get the actual document from the searcher using the doc address
|
||||
actual_doc = searcher.doc(doc_address)
|
||||
@@ -541,8 +552,9 @@ class TantivyBackend:
|
||||
|
||||
highlights: dict[str, str] = {}
|
||||
|
||||
# Generate highlights if score > 0
|
||||
if score > 0:
|
||||
# Generate highlights if score > 0 and hit is in the highlight window
|
||||
hit_index = rank - offset - 1 # 0-based index within page_hits
|
||||
if score > 0 and hl_start <= hit_index < hl_end:
|
||||
try:
|
||||
if snippet_generator is None:
|
||||
snippet_generator = tantivy.SnippetGenerator.create(
|
||||
|
||||
@@ -428,6 +428,86 @@ class TestSearch:
|
||||
== 0
|
||||
)
|
||||
|
||||
def test_highlight_page_only_highlights_requested_slice(
    self,
    backend: TantivyBackend,
):
    """Only hits inside the highlight_page window carry highlights.

    Six matching documents are indexed and fetched as a single result
    page, with highlight_page=1 and highlight_page_size=3.  The window is
    therefore the 0-based index range [0, 3) within the returned hits, so
    the first three hits must be highlighted and the last three must not.
    """
    for i in range(6):
        doc = Document.objects.create(
            title=f"highlight doc {i}",
            content=f"searchable highlight content number {i}",
            checksum=f"HP{i}",
        )
        backend.add_or_update(doc)

    r = backend.search(
        "searchable",
        user=None,
        page=1,
        page_size=10000,
        sort_field=None,
        sort_reverse=False,
        highlight_page=1,
        highlight_page_size=3,
    )

    assert r.total == 6
    assert len(r.hits) == 6

    # Check positions, not just counts: a count-only assertion would also
    # pass if the backend highlighted the wrong three hits.
    has_highlights = [bool(h["highlights"]) for h in r.hits]
    assert has_highlights == [True, True, True, False, False, False]
|
||||
|
||||
def test_highlight_page_2_highlights_correct_slice(self, backend: TantivyBackend):
    """highlight_page=2 highlights exactly the second highlight page.

    With highlight_page=2 and highlight_page_size=2 the highlight window
    starts at (2 - 1) * 2 = 2 and ends before 4, expressed as 0-based
    indexes into the returned hits.  Hits 2 and 3 must be highlighted;
    hits 0, 1, 4 and 5 must not.
    """
    for i in range(6):
        doc = Document.objects.create(
            title=f"page2 doc {i}",
            content=f"searchable page2 content number {i}",
            checksum=f"HP2{i}",
        )
        backend.add_or_update(doc)

    r = backend.search(
        "searchable",
        user=None,
        page=1,
        page_size=10000,
        sort_field=None,
        sort_reverse=False,
        highlight_page=2,
        highlight_page_size=2,
    )

    assert r.total == 6
    assert len(r.hits) == 6

    # Pin which hits are highlighted so an off-by-one in the window
    # (e.g. highlighting the first page instead) fails the test; counting
    # alone cannot detect that.
    has_highlights = [bool(h["highlights"]) for h in r.hits]
    assert has_highlights == [False, False, True, True, False, False]
|
||||
|
||||
def test_no_highlight_page_highlights_all(self, backend: TantivyBackend):
    """Omitting the highlight window keeps the default: every hit is highlighted."""
    # Index three documents that all contain the query term.
    for idx in range(3):
        backend.add_or_update(
            Document.objects.create(
                title=f"compat doc {idx}",
                content=f"searchable compat content {idx}",
                checksum=f"HC{idx}",
            ),
        )

    # No highlight_page / highlight_page_size arguments are passed here.
    results = backend.search(
        "searchable",
        user=None,
        page=1,
        page_size=10000,
        sort_field=None,
        sort_reverse=False,
    )

    assert len(results.hits) == 3
    assert all("content" in hit["highlights"] for hit in results.hits)
|
||||
|
||||
|
||||
class TestRebuild:
|
||||
"""Test index rebuilding functionality."""
|
||||
|
||||
Reference in New Issue
Block a user