New Crowdin translations by GitHub Action

Auto translate strings
Feature: paginate search highlights and remove 10k document search limit (#12518)
2026-06-28 08:14:17 +00:00 · 2026-04-16 01:00:16 +00:00 · 2026-04-15 23:22:00 +00:00 · 2026-04-15 23:20:31 +00:00 · 2026-04-15 15:26:00 -07:00 · 2026-04-15 20:59:43 +00:00
109 changed files with 188347 additions and 116446 deletions
@@ -101,7 +101,7 @@ and `mariadb`.

 #### [`PAPERLESS_DB_OPTIONS=<options>`](#PAPERLESS_DB_OPTIONS) {#PAPERLESS_DB_OPTIONS}

-: Advanced database connection options as a semicolon-delimited key-value string.
+: Advanced database connection options as a comma-delimited key-value string.
 Keys and values are separated by `=`. Dot-notation produces nested option
 dictionaries; for example, `pool.max_size=20` sets
 `OPTIONS["pool"]["max_size"] = 20`.
@@ -123,18 +123,36 @@ dictionaries; for example, `pool.max_size=20` sets
        to handle all pool connections across all workers:
        `(web_workers + celery_workers) * pool.max_size + safety_margin`.

+    !!! note "SQLite defaults"
+
+        SQLite connections are pre-configured with WAL journal mode, optimised
+        synchronous and cache settings, and a 5-second busy timeout. These defaults
+        suit most deployments. To override `init_command`, use `;` between PRAGMAs
+        within the value and `,` between options:
+
+        ```bash
+        PAPERLESS_DB_OPTIONS="init_command=PRAGMA journal_mode=DELETE;PRAGMA synchronous=FULL,transaction_mode=DEFERRED"
+        ```
+
+    !!! note "MariaDB: READ COMMITTED isolation level"
+
+        MariaDB connections default to `READ COMMITTED` isolation level, which
+        eliminates gap locking and reduces deadlock frequency. If binary logging is
+        enabled on your MariaDB server, this requires `binlog_format=ROW` (the
+        default for most managed MariaDB instances). Statement-based replication is
+        not compatible with `READ COMMITTED`.
+
    **Examples:**

    ```bash title="PostgreSQL: require SSL, set a custom CA certificate, and limit the pool size"
-    PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=5"
+    PAPERLESS_DB_OPTIONS="sslmode=require,sslrootcert=/certs/ca.pem,pool.max_size=5"
    ```

    ```bash title="MariaDB: require SSL with a custom CA certificate"
-    PAPERLESS_DB_OPTIONS="ssl_mode=REQUIRED;ssl.ca=/certs/ca.pem"
+    PAPERLESS_DB_OPTIONS="ssl_mode=REQUIRED,ssl.ca=/certs/ca.pem"
    ```

-    ```bash title="SQLite: set a busy timeout of 30 seconds"
-    # PostgreSQL: set a connection timeout
+    ```bash title="PostgreSQL or MariaDB: set a connection timeout"
    PAPERLESS_DB_OPTIONS="connect_timeout=10"
    ```

@@ -120,7 +120,7 @@ Users with any of the deprecated variables set should migrate to `PAPERLESS_DB_O
 Multiple options are combined in a single value:

 ```bash
-PAPERLESS_DB_OPTIONS="sslmode=require;sslrootcert=/certs/ca.pem;pool.max_size=10"
+PAPERLESS_DB_OPTIONS="sslmode=require,sslrootcert=/certs/ca.pem,pool.max_size=10"
 ```

 ## OCR and Archive File Generation Settings
@@ -43,7 +43,7 @@
        </div>
        <p class="card-text">
          @if (document) {
-            @if (document.__search_hit__ && document.__search_hit__.highlights) {
+            @if (hasSearchHighlights) {
              <span [innerHtml]="document.__search_hit__.highlights"></span>
            }
            @for (highlight of searchNoteHighlights; track highlight) {
@@ -52,7 +52,7 @@
                <span [innerHtml]="highlight"></span>
              </span>
            }
-            @if (!document.__search_hit__?.score) {
+            @if (shouldShowContentFallback) {
              <span class="result-content">{{contentTrimmed}}</span>
            }
          } @else {
@@ -65,7 +65,9 @@
  }
 }

-span ::ng-deep .match {
+.card-text ::ng-deep .match,
+.card-text ::ng-deep b {
+  font-weight: normal;
  color: black;
  background-color: rgb(255, 211, 66);
 }
@@ -127,6 +127,19 @@ describe('DocumentCardLargeComponent', () => {
    expect(component.searchNoteHighlights).toContain('<span>bananas</span>')
  })

+  it('should fall back to document content when a search hit has no highlights', () => {
+    component.document.__search_hit__ = {
+      score: 0.9,
+      rank: 1,
+      highlights: '',
+      note_highlights: null,
+    }
+    fixture.detectChanges()
+
+    expect(fixture.nativeElement.textContent).toContain('Cupcake ipsum')
+    expect(component.shouldShowContentFallback).toBe(true)
+  })
+
  it('should try to close the preview on mouse leave', () => {
    component.popupPreview = {
      close: jest.fn(),
@@ -164,6 +164,17 @@ export class DocumentCardLargeComponent
    )
  }

+  get hasSearchHighlights() {
+    return Boolean(this.document?.__search_hit__?.highlights?.trim()?.length)
+  }
+
+  get shouldShowContentFallback() {
+    return (
+      this.document?.__search_hit__?.score == null ||
+      (!this.hasSearchHighlights && this.searchNoteHighlights.length === 0)
+    )
+  }
+
  get notesEnabled(): boolean {
    return this.settingsService.get(SETTINGS_KEYS.NOTES_ENABLED)
  }
@@ -1,8 +1,12 @@
 import dataclasses
+from itertools import combinations
 from typing import Final

 import rapidfuzz
 from django.core.management import CommandError
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text

 from documents.management.commands.base import PaperlessCommand
 from documents.models import Document
@@ -10,8 +14,11 @@ from documents.models import Document

@dataclasses.dataclass(frozen=True, slots=True)
 class _WorkPackage:
-    first_doc: Document
-    second_doc: Document
+    pk_a: int
+    content_a: str
+    pk_b: int
+    content_b: str
+    score_cutoff: float


@dataclasses.dataclass(frozen=True, slots=True)
@@ -20,21 +27,20 @@ class _WorkResult:
    doc_two_pk: int
    ratio: float

-    def __lt__(self, other: "_WorkResult") -> bool:
-        return self.doc_one_pk < other.doc_one_pk
-

 def _process_and_match(work: _WorkPackage) -> _WorkResult:
    """
-    Does basic processing of document content, gets the basic ratio
-    and returns the result package.
+    Process document content and compute the fuzzy ratio.
+    score_cutoff lets rapidfuzz short-circuit when the score cannot reach the threshold.
    """
-    first_string = rapidfuzz.utils.default_process(work.first_doc.content)
-    second_string = rapidfuzz.utils.default_process(work.second_doc.content)
-
-    match = rapidfuzz.fuzz.ratio(first_string, second_string)
-
-    return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
+    first_string = rapidfuzz.utils.default_process(work.content_a)
+    second_string = rapidfuzz.utils.default_process(work.content_b)
+    ratio = rapidfuzz.fuzz.ratio(
+        first_string,
+        second_string,
+        score_cutoff=work.score_cutoff,
+    )
+    return _WorkResult(work.pk_a, work.pk_b, ratio)


 class Command(PaperlessCommand):
@@ -57,78 +63,169 @@ class Command(PaperlessCommand):
            action="store_true",
            help="If set, one document of matches above the ratio WILL BE DELETED",
        )
+        parser.add_argument(
+            "--yes",
+            default=False,
+            action="store_true",
+            help="Skip the confirmation prompt when used with --delete",
+        )
+
+    def _render_results(
+        self,
+        matches: list[_WorkResult],
+        *,
+        opt_ratio: float,
+        do_delete: bool,
+    ) -> list[int]:
+        """Render match results as a Rich table. Returns list of PKs to delete."""
+        if not matches:
+            self.console.print(
+                Panel(
+                    "[green]No duplicate documents found.[/green]",
+                    title="Fuzzy Match",
+                    border_style="green",
+                ),
+            )
+            return []
+
+        # Fetch titles for matched documents in a single query.
+        all_pks = {pk for m in matches for pk in (m.doc_one_pk, m.doc_two_pk)}
+        titles: dict[int, str] = dict(
+            Document.objects.filter(pk__in=all_pks)
+            .only("pk", "title")
+            .values_list("pk", "title"),
+        )
+
+        table = Table(
+            title=f"Fuzzy Matches (threshold: {opt_ratio:.1f}%)",
+            show_lines=True,
+            title_style="bold",
+        )
+        table.add_column("#", style="dim", width=4, no_wrap=True)
+        table.add_column("Document A", min_width=24)
+        table.add_column("Document B", min_width=24)
+        table.add_column("Similarity", width=11, justify="right")
+
+        maybe_delete_ids: list[int] = []
+
+        for i, match_result in enumerate(matches, 1):
+            pk_a = match_result.doc_one_pk
+            pk_b = match_result.doc_two_pk
+            ratio = match_result.ratio
+
+            if ratio >= 97.0:
+                ratio_style = "bold red"
+            elif ratio >= 92.0:
+                ratio_style = "red"
+            elif ratio >= 88.0:
+                ratio_style = "yellow"
+            else:
+                ratio_style = "dim"
+
+            table.add_row(
+                str(i),
+                f"[dim]#{pk_a}[/dim] {titles.get(pk_a, 'Unknown')}",
+                f"[dim]#{pk_b}[/dim] {titles.get(pk_b, 'Unknown')}",
+                Text(f"{ratio:.1f}%", style=ratio_style),
+            )
+            maybe_delete_ids.append(pk_b)
+
+        self.console.print(table)
+
+        summary = f"Found [bold]{len(matches)}[/bold] matching pair(s)."
+        if do_delete:
+            summary += f" [yellow]{len(maybe_delete_ids)}[/yellow] document(s) will be deleted."
+        self.console.print(summary)
+
+        return maybe_delete_ids

    def handle(self, *args, **options):
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

-        if options["delete"]:
-            self.stdout.write(
-                self.style.WARNING(
-                    "The command is configured to delete documents.  Use with caution",
-                ),
-            )
-
        opt_ratio = options["ratio"]
-        checked_pairs: set[tuple[int, int]] = set()
-        work_pkgs: list[_WorkPackage] = []

        if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
            raise CommandError("The ratio must be between 0 and 100")

-        all_docs = Document.objects.all().order_by("id")
-
-        for first_doc in all_docs:
-            for second_doc in all_docs:
-                if first_doc.pk == second_doc.pk:
-                    continue
-                if first_doc.content.strip() == "" or second_doc.content.strip() == "":
-                    continue
-                doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
-                doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
-                if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
-                    continue
-                checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
-                work_pkgs.append(_WorkPackage(first_doc, second_doc))
-
-        results: list[_WorkResult] = []
-        if self.process_count == 1:
-            for work in self.track(work_pkgs, description="Matching..."):
-                results.append(_process_and_match(work))
-        else:  # pragma: no cover
-            for proc_result in self.process_parallel(
-                _process_and_match,
-                work_pkgs,
-                description="Matching...",
-            ):
-                if proc_result.error:
-                    self.console.print(
-                        f"[red]Failed: {proc_result.error}[/red]",
-                    )
-                elif proc_result.result is not None:
-                    results.append(proc_result.result)
-
-        messages: list[str] = []
-        maybe_delete_ids: list[int] = []
-        for match_result in sorted(results):
-            if match_result.ratio >= opt_ratio:
-                messages.append(
-                    self.style.NOTICE(
-                        f"Document {match_result.doc_one_pk} fuzzy match"
-                        f" to {match_result.doc_two_pk}"
-                        f" (confidence {match_result.ratio:.3f})\n",
-                    ),
-                )
-                maybe_delete_ids.append(match_result.doc_two_pk)
-
-        if len(messages) == 0:
-            messages.append(self.style.SUCCESS("No matches found\n"))
-        self.stdout.writelines(messages)
-
        if options["delete"]:
-            self.stdout.write(
-                self.style.NOTICE(
-                    f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
+            self.console.print(
+                Panel(
+                    "[bold yellow]WARNING:[/bold yellow] This run is configured to delete"
+                    " documents. One document from each matched pair WILL BE PERMANENTLY DELETED.",
+                    title="Delete Mode",
+                    border_style="red",
                ),
            )
-            Document.objects.filter(pk__in=maybe_delete_ids).delete()
+
+        # Load only the fields we need -- avoids fetching title, archive_checksum, etc.
+        slim_docs: list[tuple[int, str]] = list(
+            Document.objects.only("id", "content")
+            .order_by("id")
+            .values_list("id", "content"),
+        )
+
+        # combinations() yields each unique pair exactly once -- no checked_pairs set needed.
+        # total_pairs is a cheap upper bound (blank-content pairs are skipped below); the single-process
+        # path uses it to start the progress bar without materialising all pairs (can be hundreds of thousands).
+        n = len(slim_docs)
+        total_pairs = n * (n - 1) // 2
+
+        def _work_gen():
+            for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2):
+                if ca.strip() and cb.strip():
+                    yield _WorkPackage(pk_a, ca, pk_b, cb, opt_ratio)
+
+        def _iter_matches():
+            if self.process_count == 1:
+                for work in self.track(
+                    _work_gen(),
+                    description="Matching...",
+                    total=total_pairs,
+                ):
+                    result = _process_and_match(work)
+                    if result.ratio >= opt_ratio:
+                        yield result
+            else:  # pragma: no cover
+                work_pkgs = list(_work_gen())
+                for proc_result in self.process_parallel(
+                    _process_and_match,
+                    work_pkgs,
+                    description="Matching...",
+                ):
+                    if proc_result.error:
+                        self.console.print(
+                            f"[red]Failed: {proc_result.error}[/red]",
+                        )
+                    elif (
+                        proc_result.result is not None
+                        and proc_result.result.ratio >= opt_ratio
+                    ):
+                        yield proc_result.result
+
+        matches = sorted(_iter_matches(), key=lambda m: m.ratio, reverse=True)
+        maybe_delete_ids = self._render_results(
+            matches,
+            opt_ratio=opt_ratio,
+            do_delete=options["delete"],
+        )
+
+        if options["delete"] and maybe_delete_ids:
+            confirmed = options["yes"]
+            if not confirmed:
+                self.console.print(
+                    f"\nDelete [bold]{len(maybe_delete_ids)}[/bold] document(s)? "
+                    "[bold]\\[y/N][/bold] ",
+                    end="",
+                )
+                answer = input().strip().lower()
+                confirmed = answer in {"y", "yes"}
+
+            if confirmed:
+                self.console.print(
+                    f"[red]Deleting {len(maybe_delete_ids)} document(s)...[/red]",
+                )
+                Document.objects.filter(pk__in=maybe_delete_ids).delete()
+                self.console.print("[green]Done.[/green]")
+            else:
+                self.console.print("[yellow]Deletion cancelled.[/yellow]")
@@ -1,6 +1,6 @@
+from documents.search._backend import SearchHit
 from documents.search._backend import SearchIndexLockError
 from documents.search._backend import SearchMode
-from documents.search._backend import SearchResults
 from documents.search._backend import TantivyBackend
 from documents.search._backend import TantivyRelevanceList
 from documents.search._backend import WriteBatch
@@ -10,9 +10,9 @@ from documents.search._schema import needs_rebuild
 from documents.search._schema import wipe_index

 __all__ = [
+    "SearchHit",
    "SearchIndexLockError",
    "SearchMode",
-    "SearchResults",
    "TantivyBackend",
    "TantivyRelevanceList",
    "WriteBatch",
@@ -1,9 +1,9 @@
 from __future__ import annotations

 import logging
+import re
 import threading
 from collections import Counter
-from dataclasses import dataclass
 from datetime import UTC
 from datetime import datetime
 from enum import StrEnum
@@ -88,45 +88,63 @@ class SearchHit(TypedDict):
    highlights: dict[str, str]


-@dataclass(frozen=True, slots=True)
-class SearchResults:
-    """
-    Container for search results with pagination metadata.
-
-    Attributes:
-        hits: List of search results with scores and highlights
-        total: Total matching documents across all pages (for pagination)
-        query: Preprocessed query string after date/syntax rewriting
-    """
-
-    hits: list[SearchHit]
-    total: int  # total matching documents (for pagination)
-    query: str  # preprocessed query string
-
-
 class TantivyRelevanceList:
    """
-    DRF-compatible list wrapper for Tantivy search hits.
+    DRF-compatible list wrapper for Tantivy search results.

-    Provides paginated access to search results while storing all hits in memory
-    for efficient ID retrieval. Used by Django REST framework for pagination.
+    Holds a lightweight ordered list of IDs (for pagination count and
+    ``selection_data``) together with a small page of rich ``SearchHit``
+    dicts (for serialization).  DRF's ``PageNumberPagination`` calls
+    ``__len__`` to compute the total page count and ``__getitem__`` to
+    slice the displayed page.

-    Methods:
-        __len__: Returns total hit count for pagination calculations
-        __getitem__: Slices the hit list for page-specific results
-
-    Note: Stores ALL post-filter hits so get_all_result_ids() can return
-    every matching document ID without requiring a second search query.
+    Args:
+        ordered_ids: All matching document IDs in display order.
+        page_hits: Rich SearchHit dicts for the requested DRF page only.
+        page_offset: Index into *ordered_ids* where *page_hits* starts.
    """

-    def __init__(self, hits: list[SearchHit]) -> None:
-        self._hits = hits
+    def __init__(
+        self,
+        ordered_ids: list[int],
+        page_hits: list[SearchHit],
+        page_offset: int = 0,
+    ) -> None:
+        self._ordered_ids = ordered_ids
+        self._page_hits = page_hits
+        self._page_offset = page_offset

    def __len__(self) -> int:
-        return len(self._hits)
+        return len(self._ordered_ids)

-    def __getitem__(self, key: slice) -> list[SearchHit]:
-        return self._hits[key]
+    def __getitem__(self, key: int | slice) -> SearchHit | list[SearchHit]:
+        if isinstance(key, int):
+            idx = key if key >= 0 else len(self._ordered_ids) + key
+            if self._page_offset <= idx < self._page_offset + len(self._page_hits):
+                return self._page_hits[idx - self._page_offset]
+            return SearchHit(
+                id=self._ordered_ids[key],
+                score=0.0,
+                rank=idx + 1,
+                highlights={},
+            )
+        start = key.start or 0
+        stop = key.stop or len(self._ordered_ids)
+        # DRF slices to extract the current page.  If the slice aligns
+        # with our pre-fetched page_hits, return them directly.
+        # We only check start — DRF always slices with stop=start+page_size,
+        # which exceeds page_hits length on the last page.
+        if start == self._page_offset:
+            return self._page_hits[: stop - start]
+        # Fallback: return stub dicts (no highlights).
+        return [
+            SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={})
+            for i, doc_id in enumerate(self._ordered_ids[key])
+        ]
+
+    def get_all_ids(self) -> list[int]:
+        """Return all matching document IDs in display order."""
+        return self._ordered_ids


 class SearchIndexLockError(Exception):
@@ -206,10 +224,13 @@ class WriteBatch:
        """
        Remove a document from the batch by its primary key.

-        Uses range query instead of term query to work around unsigned integer
-        type detection bug in tantivy-py 0.25.
+        Uses range_query instead of term_query to work around a tantivy-py bug
+        where Python integers are inferred as i64, producing Terms that never
+        match u64 fields.
+
+        TODO: Replace with term_query("id", doc_id) once
+        https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
        """
-        # Use range query to work around u64 deletion bug
        self._writer.delete_documents_by_query(
            tantivy.Query.range_query(
                self._backend._schema,
@@ -234,6 +255,34 @@ class TantivyBackend:
    the underlying index directory changes (e.g., during test isolation).
    """

+    # Maps DRF ordering field names to Tantivy index field names.
+    SORT_FIELD_MAP: dict[str, str] = {
+        "title": "title_sort",
+        "correspondent__name": "correspondent_sort",
+        "document_type__name": "type_sort",
+        "created": "created",
+        "added": "added",
+        "modified": "modified",
+        "archive_serial_number": "asn",
+        "page_count": "page_count",
+        "num_notes": "num_notes",
+    }
+
+    # Fields where Tantivy's sort order matches the ORM's sort order.
+    # Text-based fields (title, correspondent__name, document_type__name)
+    # are excluded because Tantivy's tokenized fast fields produce different
+    # ordering than the ORM's collation-based ordering.
+    SORTABLE_FIELDS: frozenset[str] = frozenset(
+        {
+            "created",
+            "added",
+            "modified",
+            "archive_serial_number",
+            "page_count",
+            "num_notes",
+        },
+    )
+
    def __init__(self, path: Path | None = None):
        # path=None → in-memory index (for tests)
        # path=some_dir → on-disk index (for production)
@@ -272,6 +321,36 @@ class TantivyBackend:
        if self._index is None:
            self.open()  # pragma: no cover

+    def _parse_query(
+        self,
+        query: str,
+        search_mode: SearchMode,
+    ) -> tantivy.Query:
+        """Parse a user query string into a Tantivy Query object."""
+        tz = get_current_timezone()
+        if search_mode is SearchMode.TEXT:
+            return parse_simple_text_query(self._index, query)
+        elif search_mode is SearchMode.TITLE:
+            return parse_simple_title_query(self._index, query)
+        else:
+            return parse_user_query(self._index, query, tz)
+
+    def _apply_permission_filter(
+        self,
+        query: tantivy.Query,
+        user: AbstractBaseUser | None,
+    ) -> tantivy.Query:
+        """Wrap a query with a permission filter if the user is not a superuser."""
+        if user is not None:
+            permission_filter = build_permission_filter(self._schema, user)
+            return tantivy.Query.boolean_query(
+                [
+                    (tantivy.Occur.Must, query),
+                    (tantivy.Occur.Must, permission_filter),
+                ],
+            )
+        return query
+
    def _build_tantivy_doc(
        self,
        document: Document,
@@ -326,12 +405,17 @@ class TantivyBackend:
            doc.add_unsigned("tag_id", tag.pk)
            tag_names.append(tag.name)

-        # Notes — JSON for structured queries (notes.user:alice, notes.note:text),
-        # companion text field for default full-text search.
+        # Notes — JSON for structured queries (notes.user:alice, notes.note:text).
+        # notes_text is a plain-text companion for snippet/highlight generation;
+        # tantivy's SnippetGenerator does not support JSON fields.
        num_notes = 0
+        note_texts: list[str] = []
        for note in document.notes.all():
            num_notes += 1
            doc.add_json("notes", {"note": note.note, "user": note.user.username})
+            note_texts.append(note.note)
+        if note_texts:
+            doc.add_text("notes_text", " ".join(note_texts))

        # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y),
        # companion text field for default full-text search.
@@ -425,155 +509,125 @@ class TantivyBackend:
        with self.batch_update(lock_timeout=5.0) as batch:
            batch.remove(doc_id)

-    def search(
+    def highlight_hits(
        self,
        query: str,
-        user: AbstractBaseUser | None,
-        page: int,
-        page_size: int,
-        sort_field: str | None,
+        doc_ids: list[int],
        *,
-        sort_reverse: bool,
        search_mode: SearchMode = SearchMode.QUERY,
-    ) -> SearchResults:
+        rank_start: int = 1,
+    ) -> list[SearchHit]:
        """
-        Execute a search query against the document index.
+        Generate SearchHit dicts with highlights for specific document IDs.

-        Processes the user query through date rewriting, normalization, and
-        permission filtering before executing against Tantivy. Supports both
-        relevance-based and field-based sorting.
+        Unlike search(), this does not execute a ranked query — it looks up
+        each document by ID and generates snippets against the provided query.
+        Use this when you already know which documents to display (from
+        search_ids + ORM filtering) and just need highlight data.

-        QUERY search mode supports natural date keywords, field filters, etc.
-        TITLE search mode treats the query as plain text to search for in title only
-        TEXT search mode treats the query as plain text to search for in title and content
+        Note: Each doc_id requires an individual index lookup because tantivy-py
+        does not yet expose a batch fast-field read API. This is acceptable for
+        page-sized batches (typically 25 docs) but should not be called with
+        thousands of IDs.
+
+        TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands,
+        the per-doc range_query lookups here can be replaced with a single
+        collect_u64_fast_field("id", doc_addresses) call.

        Args:
-            query: User's search query
-            user: User for permission filtering (None for superuser/no filtering)
-            page: Page number (1-indexed) for pagination
-            page_size: Number of results per page
-            sort_field: Field to sort by (None for relevance ranking)
-            sort_reverse: Whether to reverse the sort order
-            search_mode: "query" for advanced Tantivy syntax, "text" for
-                plain-text search over title and content only, "title" for
-                plain-text search over title only
+            query: The search query (used for snippet generation)
+            doc_ids: Ordered list of document IDs to generate hits for
+            search_mode: Query parsing mode (for building the snippet query)
+            rank_start: Starting rank value (1-based absolute position in the
+                full result set; pass ``page_offset + 1`` for paginated calls)

        Returns:
-            SearchResults with hits, total count, and processed query
+            List of SearchHit dicts in the same order as doc_ids
        """
-        self._ensure_open()
-        tz = get_current_timezone()
-        if search_mode is SearchMode.TEXT:
-            user_query = parse_simple_text_query(self._index, query)
-        elif search_mode is SearchMode.TITLE:
-            user_query = parse_simple_title_query(self._index, query)
-        else:
-            user_query = parse_user_query(self._index, query, tz)
+        if not doc_ids:
+            return []

-        # Apply permission filter if user is not None (not superuser)
-        if user is not None:
-            permission_filter = build_permission_filter(self._schema, user)
-            final_query = tantivy.Query.boolean_query(
-                [
-                    (tantivy.Occur.Must, user_query),
-                    (tantivy.Occur.Must, permission_filter),
-                ],
+        self._ensure_open()
+        user_query = self._parse_query(query, search_mode)
+
+        # For notes_text snippet generation, we need a query that targets the
+        # notes_text field directly. user_query may contain JSON-field terms
+        # (e.g. notes.note:urgent) that the SnippetGenerator cannot resolve
+        # against a text field. Strip field:value prefixes so bare terms like
+        # "urgent" are re-parsed against notes_text, producing highlights even
+        # when the original query used structured syntax.
+        bare_query = re.sub(r"\w[\w.]*:", "", query).strip()
+        try:
+            notes_text_query = (
+                self._index.parse_query(bare_query, ["notes_text"])
+                if bare_query
+                else user_query
            )
-        else:
-            final_query = user_query
+        except Exception:
+            notes_text_query = user_query

        searcher = self._index.searcher()
-        offset = (page - 1) * page_size
-
-        # Map sort fields
-        sort_field_map = {
-            "title": "title_sort",
-            "correspondent__name": "correspondent_sort",
-            "document_type__name": "type_sort",
-            "created": "created",
-            "added": "added",
-            "modified": "modified",
-            "archive_serial_number": "asn",
-            "page_count": "page_count",
-            "num_notes": "num_notes",
-        }
-
-        # Perform search
-        if sort_field and sort_field in sort_field_map:
-            mapped_field = sort_field_map[sort_field]
-            results = searcher.search(
-                final_query,
-                limit=offset + page_size,
-                order_by_field=mapped_field,
-                order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
-            )
-            # Field sorting: hits are still (score, DocAddress) tuples; score unused
-            all_hits = [(hit[1], 0.0) for hit in results.hits]
-        else:
-            # Score-based search: hits are (score, DocAddress) tuples
-            results = searcher.search(final_query, limit=offset + page_size)
-            all_hits = [(hit[1], hit[0]) for hit in results.hits]
-
-        total = results.count
-
-        # Normalize scores for score-based searches
-        if not sort_field and all_hits:
-            max_score = max(hit[1] for hit in all_hits) or 1.0
-            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
-
-        # Apply threshold filter if configured (score-based search only)
-        threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
-        if threshold is not None and not sort_field:
-            all_hits = [hit for hit in all_hits if hit[1] >= threshold]
-
-        # Get the page's hits
-        page_hits = all_hits[offset : offset + page_size]
-
-        # Build result hits with highlights
-        hits: list[SearchHit] = []
        snippet_generator = None
        notes_snippet_generator = None
+        hits: list[SearchHit] = []

-        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
-            # Get the actual document from the searcher using the doc address
+        for rank, doc_id in enumerate(doc_ids, start=rank_start):
+            # Look up document by ID, scoring against the user query so that
+            # the returned SearchHit carries a real BM25 relevance score.
+            id_query = tantivy.Query.range_query(
+                self._schema,
+                "id",
+                tantivy.FieldType.Unsigned,
+                doc_id,
+                doc_id,
+            )
+            scored_query = tantivy.Query.boolean_query(
+                [
+                    (tantivy.Occur.Must, user_query),
+                    (tantivy.Occur.Must, id_query),
+                ],
+            )
+            results = searcher.search(scored_query, limit=1)
+
+            if not results.hits:
+                continue
+
+            score, doc_address = results.hits[0]
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()
-            doc_id = doc_dict["id"][0]

            highlights: dict[str, str] = {}
+            try:
+                if snippet_generator is None:
+                    snippet_generator = tantivy.SnippetGenerator.create(
+                        searcher,
+                        user_query,
+                        self._schema,
+                        "content",
+                    )

-            # Generate highlights if score > 0
-            if score > 0:
-                try:
-                    if snippet_generator is None:
-                        snippet_generator = tantivy.SnippetGenerator.create(
+                content_html = snippet_generator.snippet_from_doc(actual_doc).to_html()
+                if content_html:
+                    highlights["content"] = content_html
+
+                if "notes_text" in doc_dict:
+                    # Use notes_text (plain text) for snippet generation — tantivy's
+                    # SnippetGenerator does not support JSON fields.
+                    if notes_snippet_generator is None:
+                        notes_snippet_generator = tantivy.SnippetGenerator.create(
                            searcher,
-                            final_query,
+                            notes_text_query,
                            self._schema,
-                            "content",
+                            "notes_text",
                        )
+                    notes_html = notes_snippet_generator.snippet_from_doc(
+                        actual_doc,
+                    ).to_html()
+                    if notes_html:
+                        highlights["notes"] = notes_html

-                    content_snippet = snippet_generator.snippet_from_doc(actual_doc)
-                    if content_snippet:
-                        highlights["content"] = str(content_snippet)
-
-                    # Try notes highlights
-                    if "notes" in doc_dict:
-                        if notes_snippet_generator is None:
-                            notes_snippet_generator = tantivy.SnippetGenerator.create(
-                                searcher,
-                                final_query,
-                                self._schema,
-                                "notes",
-                            )
-                        notes_snippet = notes_snippet_generator.snippet_from_doc(
-                            actual_doc,
-                        )
-                        if notes_snippet:
-                            highlights["notes"] = str(notes_snippet)
-
-                except Exception:  # pragma: no cover
-                    logger.debug("Failed to generate highlights for doc %s", doc_id)
+            except Exception:  # pragma: no cover
+                logger.debug("Failed to generate highlights for doc %s", doc_id)

            hits.append(
                SearchHit(
@@ -584,11 +638,69 @@ class TantivyBackend:
                ),
            )

-        return SearchResults(
-            hits=hits,
-            total=total,
-            query=query,
-        )
+        return hits
+
+    def search_ids(
+        self,
+        query: str,
+        user: AbstractBaseUser | None,
+        *,
+        sort_field: str | None = None,
+        sort_reverse: bool = False,
+        search_mode: SearchMode = SearchMode.QUERY,
+        limit: int | None = None,
+    ) -> list[int]:
+        """
+        Return document IDs matching a query — no highlights or scores.
+
+        This is the lightweight companion to search(). Use it when you need the
+        full set of matching IDs (e.g. for ``selection_data``) but don't need
+        scores, ranks, or highlights.
+
+        When ``sort_field`` is missing or not a key of ``SORT_FIELD_MAP`` the
+        results are ranked by relevance; in that case scores are normalized
+        against the top hit and ``settings.ADVANCED_FUZZY_SEARCH_THRESHOLD``
+        (when set) drops hits scoring below it. Field-sorted results are never
+        threshold-filtered.
+
+        Args:
+            query: User's search query
+            user: User for permission filtering (None for superuser/no filtering)
+            sort_field: Field to sort by (None for relevance ranking)
+            sort_reverse: Whether to reverse the sort order
+            search_mode: Query parsing mode (QUERY, TEXT, or TITLE)
+            limit: Maximum number of IDs to return (None = all matching docs)
+
+        Returns:
+            List of document IDs in the requested order; an empty list when the
+            effective limit is 0 (``limit=0`` or an empty index)
+        """
+        self._ensure_open()
+        user_query = self._parse_query(query, search_mode)
+        final_query = self._apply_permission_filter(user_query, user)
+
+        searcher = self._index.searcher()
+        effective_limit = limit if limit is not None else searcher.num_docs
+        if effective_limit == 0:
+            # Nothing can be returned (empty index or explicit limit=0).
+            # Short-circuit instead of handing a zero limit to tantivy, whose
+            # top-docs collector is documented to reject it
+            # (NOTE(review): confirm against the pinned tantivy-py version).
+            return []
+
+        if sort_field and sort_field in self.SORT_FIELD_MAP:
+            mapped_field = self.SORT_FIELD_MAP[sort_field]
+            results = searcher.search(
+                final_query,
+                limit=effective_limit,
+                order_by_field=mapped_field,
+                order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc,
+            )
+            # Field-sorted hits carry no relevance score; keep 1-tuples so the
+            # final unpacking below works for both branches.
+            all_hits = [(hit[1],) for hit in results.hits]
+        else:
+            results = searcher.search(final_query, limit=effective_limit)
+            # Reorder each hit from (score, address) to (address, score).
+            all_hits = [(hit[1], hit[0]) for hit in results.hits]
+
+            # Normalize scores and apply threshold (relevance search only)
+            if all_hits:
+                # "or 1.0" avoids dividing by zero when the top score is 0.
+                max_score = max(hit[1] for hit in all_hits) or 1.0
+                all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
+
+            threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD
+            if threshold is not None:
+                all_hits = [hit for hit in all_hits if hit[1] >= threshold]
+
+        # TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once
+        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates
+        # one stored-doc fetch per result (~80% reduction in search_ids latency).
+        return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]

    def autocomplete(
        self,
@@ -603,6 +715,10 @@ class TantivyBackend:
        frequency (how many documents contain each word). Optionally filters
        results to only words from documents visible to the specified user.

+        NOTE: This is the hottest search path (called per keystroke).
+        A future improvement would be to cache results in Redis, keyed by
+        (prefix, user_id), and invalidate on index writes.
+
        Args:
            term: Prefix to match against autocomplete words
            limit: Maximum number of suggestions to return
@@ -613,64 +729,94 @@ class TantivyBackend:
        """
        self._ensure_open()
        normalized_term = ascii_fold(term.lower())
+        if not normalized_term:
+            return []

        searcher = self._index.searcher()

-        # Apply permission filter for non-superusers so autocomplete words
-        # from invisible documents don't leak to other users.
+        # Build a prefix query on autocomplete_word so we only scan docs
+        # containing words that start with the prefix, not the entire index.
+        # tantivy regex is implicitly anchored; .+ avoids the empty-match
+        # error that .* triggers.  We OR with term_query to also match the
+        # exact prefix as a complete word.
+        escaped = re.escape(normalized_term)
+        prefix_query = tantivy.Query.boolean_query(
+            [
+                (
+                    tantivy.Occur.Should,
+                    tantivy.Query.term_query(
+                        self._schema,
+                        "autocomplete_word",
+                        normalized_term,
+                    ),
+                ),
+                (
+                    tantivy.Occur.Should,
+                    tantivy.Query.regex_query(
+                        self._schema,
+                        "autocomplete_word",
+                        f"{escaped}.+",
+                    ),
+                ),
+            ],
+        )
+
+        # Intersect with permission filter so autocomplete words from
+        # invisible documents don't leak to other users.
        if user is not None and not user.is_superuser:
-            base_query = build_permission_filter(self._schema, user)
+            final_query = tantivy.Query.boolean_query(
+                [
+                    (tantivy.Occur.Must, prefix_query),
+                    (tantivy.Occur.Must, build_permission_filter(self._schema, user)),
+                ],
+            )
        else:
-            base_query = tantivy.Query.all_query()
+            final_query = prefix_query

-        results = searcher.search(base_query, limit=10000)
+        results = searcher.search(final_query, limit=searcher.num_docs)

-        # Count how many visible documents each word appears in.
-        # Using Counter (not set) preserves per-word document frequency so
-        # we can rank suggestions by how commonly they occur — the same
-        # signal Whoosh used for Tf/Idf-based autocomplete ordering.
+        # Count how many visible documents each matching word appears in.
        word_counts: Counter[str] = Counter()
        for _score, doc_address in results.hits:
            stored_doc = searcher.doc(doc_address)
            doc_dict = stored_doc.to_dict()
            if "autocomplete_word" in doc_dict:
-                word_counts.update(doc_dict["autocomplete_word"])
+                for word in doc_dict["autocomplete_word"]:
+                    if word.startswith(normalized_term):
+                        word_counts[word] += 1

-        # Filter to prefix matches, sort by document frequency descending;
-        # break ties alphabetically for stable, deterministic output.
+        # Sort by document frequency descending; break ties alphabetically.
        matches = sorted(
-            (w for w in word_counts if w.startswith(normalized_term)),
+            word_counts,
            key=lambda w: (-word_counts[w], w),
        )

        return matches[:limit]

-    def more_like_this(
+    def more_like_this_ids(
        self,
        doc_id: int,
        user: AbstractBaseUser | None,
-        page: int,
-        page_size: int,
-    ) -> SearchResults:
+        *,
+        limit: int | None = None,
+    ) -> list[int]:
        """
-        Find documents similar to the given document using content analysis.
+        Return IDs of documents similar to the given document — no highlights.

-        Uses Tantivy's "more like this" query to find documents with similar
-        content patterns. The original document is excluded from results.
+        Lightweight companion to more_like_this(). The original document is
+        excluded from results.

        Args:
            doc_id: Primary key of the reference document
            user: User for permission filtering (None for no filtering)
-            page: Page number (1-indexed) for pagination
-            page_size: Number of results per page
+            limit: Maximum number of IDs to return (None = all matching docs)

        Returns:
-            SearchResults with similar documents (excluding the original)
+            List of similar document IDs (excluding the original)
        """
        self._ensure_open()
        searcher = self._index.searcher()

-        # First find the document address
        id_query = tantivy.Query.range_query(
            self._schema,
            "id",
@@ -681,13 +827,9 @@ class TantivyBackend:
        results = searcher.search(id_query, limit=1)

        if not results.hits:
-            # Document not found
-            return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}")
+            return []

-        # Extract doc_address from (score, doc_address) tuple
        doc_address = results.hits[0][1]
-
-        # Build more like this query
        mlt_query = tantivy.Query.more_like_this_query(
            doc_address,
            min_doc_frequency=1,
@@ -699,59 +841,21 @@ class TantivyBackend:
            boost_factor=None,
        )

-        # Apply permission filter
-        if user is not None:
-            permission_filter = build_permission_filter(self._schema, user)
-            final_query = tantivy.Query.boolean_query(
-                [
-                    (tantivy.Occur.Must, mlt_query),
-                    (tantivy.Occur.Must, permission_filter),
-                ],
-            )
-        else:
-            final_query = mlt_query
+        final_query = self._apply_permission_filter(mlt_query, user)

-        # Search
-        offset = (page - 1) * page_size
-        results = searcher.search(final_query, limit=offset + page_size)
+        effective_limit = limit if limit is not None else searcher.num_docs
+        # Fetch one extra to account for excluding the original document
+        results = searcher.search(final_query, limit=effective_limit + 1)

-        total = results.count
-        # Convert from (score, doc_address) to (doc_address, score)
-        all_hits = [(hit[1], hit[0]) for hit in results.hits]
+        # TODO: Replace with collect_u64_fast_field("id", addrs) once
+        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands.
+        ids = []
+        for _score, doc_address in results.hits:
+            result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
+            if result_doc_id != doc_id:
+                ids.append(result_doc_id)

-        # Normalize scores
-        if all_hits:
-            max_score = max(hit[1] for hit in all_hits) or 1.0
-            all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits]
-
-        # Get page hits
-        page_hits = all_hits[offset : offset + page_size]
-
-        # Build results
-        hits: list[SearchHit] = []
-        for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1):
-            actual_doc = searcher.doc(doc_address)
-            doc_dict = actual_doc.to_dict()
-            result_doc_id = doc_dict["id"][0]
-
-            # Skip the original document
-            if result_doc_id == doc_id:
-                continue
-
-            hits.append(
-                SearchHit(
-                    id=result_doc_id,
-                    score=score,
-                    rank=rank,
-                    highlights={},  # MLT doesn't generate highlights
-                ),
-            )
-
-        return SearchResults(
-            hits=hits,
-            total=max(0, total - 1),  # Subtract 1 for the original document
-            query=f"more_like:{doc_id}",
-        )
+        return ids[:limit] if limit is not None else ids

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
        """
@@ -396,10 +396,17 @@ def build_permission_filter(
        Tantivy query that filters results to visible documents

    Implementation Notes:
-        - Uses range_query instead of term_query to work around unsigned integer
-          type detection bug in tantivy-py 0.25
-        - Uses boolean_query for "no owner" check since exists_query is not
-          available in tantivy-py 0.25.1 (available in master)
+        - Uses range_query instead of term_query for owner_id/viewer_id to work
+          around a tantivy-py bug where Python ints are inferred as i64, causing
+          term_query to return no hits on u64 fields.
+          TODO: Replace with term_query once
+          https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
+
+        - Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check
+          because exists_query is not yet available in tantivy-py 0.25.
+          TODO: Replace with exists_query("owner_id") once that is exposed in
+          a tantivy-py release.
+
        - Uses disjunction_max_query to combine permission clauses with OR logic
    """
    owner_any = tantivy.Query.range_query(
@@ -72,6 +72,9 @@ def build_schema() -> tantivy.Schema:

    # JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice
    sb.add_json_field("notes", stored=True, tokenizer_name="paperless_text")
+    # Plain-text companion for notes — tantivy's SnippetGenerator does not support
+    # JSON fields, so highlights require a text field with the same content.
+    sb.add_text_field("notes_text", stored=True, tokenizer_name="paperless_text")
    sb.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text")

    for field in (
@@ -33,19 +33,12 @@ class TestWriteBatch:
        except RuntimeError:
            pass

-        r = backend.search(
-            "should survive",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert r.total == 1
+        ids = backend.search_ids("should survive", user=None)
+        assert len(ids) == 1


 class TestSearch:
-    """Test search functionality."""
+    """Test search query parsing and matching via search_ids."""

    def test_text_mode_limits_default_search_to_title_and_content(
        self,
@@ -60,27 +53,20 @@ class TestSearch:
        )
        backend.add_or_update(doc)

-        metadata_only = backend.search(
-            "document_type:invoice",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        assert (
+            len(
+                backend.search_ids(
+                    "document_type:invoice",
+                    user=None,
+                    search_mode=SearchMode.TEXT,
+                ),
+            )
+            == 0
        )
-        assert metadata_only.total == 0
-
-        content_match = backend.search(
-            "monthly",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        assert (
+            len(backend.search_ids("monthly", user=None, search_mode=SearchMode.TEXT))
+            == 1
        )
-        assert content_match.total == 1

    def test_title_mode_limits_default_search_to_title_only(
        self,
@@ -95,27 +81,14 @@ class TestSearch:
        )
        backend.add_or_update(doc)

-        content_only = backend.search(
-            "monthly",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TITLE,
+        assert (
+            len(backend.search_ids("monthly", user=None, search_mode=SearchMode.TITLE))
+            == 0
        )
-        assert content_only.total == 0
-
-        title_match = backend.search(
-            "invoice",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TITLE,
+        assert (
+            len(backend.search_ids("invoice", user=None, search_mode=SearchMode.TITLE))
+            == 1
        )
-        assert title_match.total == 1

    def test_text_mode_matches_partial_term_substrings(
        self,
@@ -130,38 +103,16 @@ class TestSearch:
        )
        backend.add_or_update(doc)

-        prefix_match = backend.search(
-            "pass",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        assert (
+            len(backend.search_ids("pass", user=None, search_mode=SearchMode.TEXT)) == 1
        )
-        assert prefix_match.total == 1
-
-        infix_match = backend.search(
-            "sswo",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        assert (
+            len(backend.search_ids("sswo", user=None, search_mode=SearchMode.TEXT)) == 1
        )
-        assert infix_match.total == 1
-
-        phrase_match = backend.search(
-            "sswo re",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        assert (
+            len(backend.search_ids("sswo re", user=None, search_mode=SearchMode.TEXT))
+            == 1
        )
-        assert phrase_match.total == 1

    def test_text_mode_does_not_match_on_partial_term_overlap(
        self,
@@ -176,16 +127,10 @@ class TestSearch:
        )
        backend.add_or_update(doc)

-        non_match = backend.search(
-            "raptor",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        assert (
+            len(backend.search_ids("raptor", user=None, search_mode=SearchMode.TEXT))
+            == 0
        )
-        assert non_match.total == 0

    def test_text_mode_anchors_later_query_tokens_to_token_starts(
        self,
@@ -214,16 +159,9 @@ class TestSearch:
        backend.add_or_update(prefix_doc)
        backend.add_or_update(false_positive)

-        results = backend.search(
-            "Z-Berichte 6",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        result_ids = set(
+            backend.search_ids("Z-Berichte 6", user=None, search_mode=SearchMode.TEXT),
        )
-        result_ids = {hit["id"] for hit in results.hits}

        assert exact_doc.id in result_ids
        assert prefix_doc.id in result_ids
@@ -242,16 +180,9 @@ class TestSearch:
        )
        backend.add_or_update(doc)

-        no_tokens = backend.search(
-            "!!!",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TEXT,
+        assert (
+            len(backend.search_ids("!!!", user=None, search_mode=SearchMode.TEXT)) == 0
        )
-        assert no_tokens.total == 0

    def test_title_mode_matches_partial_term_substrings(
        self,
@@ -266,59 +197,18 @@ class TestSearch:
        )
        backend.add_or_update(doc)

-        prefix_match = backend.search(
-            "pass",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TITLE,
+        assert (
+            len(backend.search_ids("pass", user=None, search_mode=SearchMode.TITLE))
+            == 1
        )
-        assert prefix_match.total == 1
-
-        infix_match = backend.search(
-            "sswo",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TITLE,
+        assert (
+            len(backend.search_ids("sswo", user=None, search_mode=SearchMode.TITLE))
+            == 1
        )
-        assert infix_match.total == 1
-
-        phrase_match = backend.search(
-            "sswo gu",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-            search_mode=SearchMode.TITLE,
+        assert (
+            len(backend.search_ids("sswo gu", user=None, search_mode=SearchMode.TITLE))
+            == 1
        )
-        assert phrase_match.total == 1
-
-    def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend):
-        """Search scores must be normalized so top hit has score 1.0 for UI consistency."""
-        for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]):
-            doc = Document.objects.create(
-                title=title,
-                content=title,
-                checksum=f"SN{i}",
-                pk=10 + i,
-            )
-            backend.add_or_update(doc)
-        r = backend.search(
-            "bank",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert r.hits[0]["score"] == pytest.approx(1.0)
-        assert all(0.0 <= h["score"] <= 1.0 for h in r.hits)

    def test_sort_field_ascending(self, backend: TantivyBackend):
        """Searching with sort_reverse=False must return results in ascending ASN order."""
@@ -331,16 +221,14 @@ class TestSearch:
            )
            backend.add_or_update(doc)

-        r = backend.search(
+        ids = backend.search_ids(
            "sortable",
            user=None,
-            page=1,
-            page_size=10,
            sort_field="archive_serial_number",
            sort_reverse=False,
        )
-        assert r.total == 3
-        asns = [Document.objects.get(pk=h["id"]).archive_serial_number for h in r.hits]
+        assert len(ids) == 3
+        asns = [Document.objects.get(pk=doc_id).archive_serial_number for doc_id in ids]
        assert asns == [10, 20, 30]

    def test_sort_field_descending(self, backend: TantivyBackend):
@@ -354,79 +242,91 @@ class TestSearch:
            )
            backend.add_or_update(doc)

-        r = backend.search(
+        ids = backend.search_ids(
            "sortable",
            user=None,
-            page=1,
-            page_size=10,
            sort_field="archive_serial_number",
            sort_reverse=True,
        )
-        assert r.total == 3
-        asns = [Document.objects.get(pk=h["id"]).archive_serial_number for h in r.hits]
+        assert len(ids) == 3
+        asns = [Document.objects.get(pk=doc_id).archive_serial_number for doc_id in ids]
        assert asns == [30, 20, 10]

-    def test_fuzzy_threshold_filters_low_score_hits(
-        self,
-        backend: TantivyBackend,
-        settings,
-    ):
-        """When ADVANCED_FUZZY_SEARCH_THRESHOLD exceeds all normalized scores, hits must be filtered out."""
-        doc = Document.objects.create(
-            title="Invoice document",
-            content="financial report",
-            checksum="FT1",
-            pk=120,
-        )
-        backend.add_or_update(doc)

-        # Threshold above 1.0 filters every hit (normalized scores top out at 1.0)
-        settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1
-        r = backend.search(
-            "invoice",
+class TestSearchIds:
+    """Test lightweight ID-only search."""
+
+    def test_returns_matching_ids(self, backend: TantivyBackend):
+        """search_ids must return IDs of all matching documents."""
+        docs = []
+        for i in range(5):
+            doc = Document.objects.create(
+                title=f"findable doc {i}",
+                content="common keyword",
+                checksum=f"SI{i}",
+            )
+            backend.add_or_update(doc)
+            docs.append(doc)
+        other = Document.objects.create(
+            title="unrelated",
+            content="nothing here",
+            checksum="SI_other",
+        )
+        backend.add_or_update(other)
+
+        ids = backend.search_ids(
+            "common keyword",
            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
+            search_mode=SearchMode.QUERY,
        )
-        assert r.hits == []
+        assert set(ids) == {d.pk for d in docs}
+        assert other.pk not in ids

-    def test_owner_filter(self, backend: TantivyBackend):
-        """Document owners can search their private documents; other users cannot access them."""
-        owner = User.objects.create_user("owner")
-        other = User.objects.create_user("other")
+    def test_respects_permission_filter(self, backend: TantivyBackend):
+        """search_ids must respect user permission filtering."""
+        owner = User.objects.create_user("ids_owner")
+        other = User.objects.create_user("ids_other")
        doc = Document.objects.create(
-            title="Private",
-            content="secret",
-            checksum="PF1",
-            pk=20,
+            title="private doc",
+            content="secret keyword",
+            checksum="SIP1",
            owner=owner,
        )
        backend.add_or_update(doc)

+        assert backend.search_ids(
+            "secret",
+            user=owner,
+            search_mode=SearchMode.QUERY,
+        ) == [doc.pk]
        assert (
-            backend.search(
-                "secret",
-                user=owner,
-                page=1,
-                page_size=10,
-                sort_field=None,
-                sort_reverse=False,
-            ).total
-            == 1
+            backend.search_ids("secret", user=other, search_mode=SearchMode.QUERY) == []
        )
-        assert (
-            backend.search(
-                "secret",
-                user=other,
-                page=1,
-                page_size=10,
-                sort_field=None,
-                sort_reverse=False,
-            ).total
-            == 0
+
+    def test_respects_fuzzy_threshold(self, backend: TantivyBackend, settings):
+        """search_ids must apply the same fuzzy threshold as search()."""
+        doc = Document.objects.create(
+            title="threshold test",
+            content="unique term",
+            checksum="SIT1",
         )
+        backend.add_or_update(doc)
+
+        # search_ids divides every relevance score by the top score, so
+        # normalized scores never exceed 1.0; a threshold of 1.1 must therefore
+        # filter out every hit, including the otherwise-matching document.
+        settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1
+        ids = backend.search_ids("unique", user=None, search_mode=SearchMode.QUERY)
+        assert ids == []
+
+    def test_returns_ids_for_text_mode(self, backend: TantivyBackend):
+        """search_ids must work with TEXT search mode."""
+        doc = Document.objects.create(
+            title="text mode doc",
+            content="findable phrase",
+            checksum="SIM1",
+        )
+        backend.add_or_update(doc)
+
+        # "findable" occurs only in the content of the document indexed by this
+        # test, so the result must be exactly that document's primary key.
+        ids = backend.search_ids("findable", user=None, search_mode=SearchMode.TEXT)
+        assert ids == [doc.pk]


 class TestRebuild:
@@ -490,57 +390,26 @@ class TestAutocomplete:
 class TestMoreLikeThis:
    """Test more like this functionality."""

-    def test_excludes_original(self, backend: TantivyBackend):
-        """More like this queries must exclude the reference document from results."""
+    def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend):
+        """more_like_this_ids must return IDs of similar documents, excluding the original."""
        doc1 = Document.objects.create(
            title="Important document",
-            content="financial information",
-            checksum="MLT1",
-            pk=50,
+            content="financial information report",
+            checksum="MLTI1",
+            pk=150,
        )
        doc2 = Document.objects.create(
            title="Another document",
-            content="financial report",
-            checksum="MLT2",
-            pk=51,
+            content="financial information report",
+            checksum="MLTI2",
+            pk=151,
        )
        backend.add_or_update(doc1)
        backend.add_or_update(doc2)

-        results = backend.more_like_this(doc_id=50, user=None, page=1, page_size=10)
-        returned_ids = [hit["id"] for hit in results.hits]
-        assert 50 not in returned_ids  # Original document excluded
-
-    def test_with_user_applies_permission_filter(self, backend: TantivyBackend):
-        """more_like_this with a user must exclude documents that user cannot see."""
-        viewer = User.objects.create_user("mlt_viewer")
-        other = User.objects.create_user("mlt_other")
-        public_doc = Document.objects.create(
-            title="Public financial document",
-            content="quarterly financial analysis report figures",
-            checksum="MLT3",
-            pk=52,
-        )
-        private_doc = Document.objects.create(
-            title="Private financial document",
-            content="quarterly financial analysis report figures",
-            checksum="MLT4",
-            pk=53,
-            owner=other,
-        )
-        backend.add_or_update(public_doc)
-        backend.add_or_update(private_doc)
-
-        results = backend.more_like_this(doc_id=52, user=viewer, page=1, page_size=10)
-        returned_ids = [hit["id"] for hit in results.hits]
-        # private_doc is owned by other, so viewer cannot see it
-        assert 53 not in returned_ids
-
-    def test_document_not_in_index_returns_empty(self, backend: TantivyBackend):
-        """more_like_this for a doc_id absent from the index must return empty results."""
-        results = backend.more_like_this(doc_id=9999, user=None, page=1, page_size=10)
-        assert results.hits == []
-        assert results.total == 0
+        ids = backend.more_like_this_ids(doc_id=150, user=None)
+        assert 150 not in ids
+        assert 151 in ids


 class TestSingleton:
@@ -593,19 +462,10 @@ class TestFieldHandling:
        # Should not raise an exception
        backend.add_or_update(doc)

-        results = backend.search(
-            "test",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert results.total == 1
+        assert len(backend.search_ids("test", user=None)) == 1

    def test_custom_fields_include_name_and_value(self, backend: TantivyBackend):
        """Custom fields must be indexed with both field name and value for structured queries."""
-        # Create a custom field
        field = CustomField.objects.create(
            name="Invoice Number",
            data_type=CustomField.FieldDataType.STRING,
@@ -622,18 +482,9 @@ class TestFieldHandling:
            value_text="INV-2024-001",
        )

-        # Should not raise an exception during indexing
        backend.add_or_update(doc)

-        results = backend.search(
-            "invoice",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert results.total == 1
+        assert len(backend.search_ids("invoice", user=None)) == 1

    def test_select_custom_field_indexes_label_not_id(self, backend: TantivyBackend):
        """SELECT custom fields must index the human-readable label, not the opaque option ID."""
@@ -660,27 +511,8 @@ class TestFieldHandling:
        )
        backend.add_or_update(doc)

-        # Label should be findable
-        results = backend.search(
-            "custom_fields.value:invoice",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert results.total == 1
-
-        # Opaque ID must not appear in the index
-        results = backend.search(
-            "custom_fields.value:opt_abc",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert results.total == 0
+        assert len(backend.search_ids("custom_fields.value:invoice", user=None)) == 1
+        assert len(backend.search_ids("custom_fields.value:opt_abc", user=None)) == 0

    def test_none_custom_field_value_not_indexed(self, backend: TantivyBackend):
        """Custom field instances with no value set must not produce an index entry."""
@@ -702,16 +534,7 @@ class TestFieldHandling:
        )
        backend.add_or_update(doc)

-        # The string "none" must not appear as an indexed value
-        results = backend.search(
-            "custom_fields.value:none",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert results.total == 0
+        assert len(backend.search_ids("custom_fields.value:none", user=None)) == 0

    def test_notes_include_user_information(self, backend: TantivyBackend):
        """Notes must be indexed with user information when available for structured queries."""
@@ -724,32 +547,96 @@ class TestFieldHandling:
        )
        Note.objects.create(document=doc, note="Important note", user=user)

-        # Should not raise an exception during indexing
        backend.add_or_update(doc)

-        # Test basic document search first
-        results = backend.search(
-            "test",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
-        )
-        assert results.total == 1, (
-            f"Expected 1, got {results.total}. Document content should be searchable."
+        ids = backend.search_ids("test", user=None)
+        assert len(ids) == 1, (
+            f"Expected 1, got {len(ids)}. Document content should be searchable."
        )

-        # Test notes search — must use structured JSON syntax now that note
-        # is no longer in DEFAULT_SEARCH_FIELDS
-        results = backend.search(
-            "notes.note:important",
-            user=None,
-            page=1,
-            page_size=10,
-            sort_field=None,
-            sort_reverse=False,
+        ids = backend.search_ids("notes.note:important", user=None)
+        assert len(ids) == 1, (
+            f"Expected 1, got {len(ids)}. Note content should be searchable via notes.note: prefix."
        )
-        assert results.total == 1, (
-            f"Expected 1, got {results.total}. Note content should be searchable via notes.note: prefix."
+
+
+class TestHighlightHits:
+    """Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
+
+    def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
+        """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
+        doc = Document.objects.create(
+            title="Highlight Test",
+            content="The quick brown fox jumps over the lazy dog",
+            checksum="HH1",
+            pk=90,
        )
+        backend.add_or_update(doc)
+
+        hits = backend.highlight_hits("quick", [doc.pk])
+
+        assert len(hits) == 1
+        highlights = hits[0]["highlights"]
+        assert "content" in highlights
+        content_highlight = highlights["content"]
+        assert isinstance(content_highlight, str), (
+            f"Expected str, got {type(content_highlight)}: {content_highlight!r}"
+        )
+        # Tantivy wraps matched terms in <b> tags
+        assert "<b>" in content_highlight, (
+            f"Expected HTML with <b> tags, got: {content_highlight!r}"
+        )
+
+    def test_highlights_notes_returns_html_string(self, backend: TantivyBackend):
+        """Note highlights must be HTML strings via notes_text companion field.
+
+        The notes JSON field does not support tantivy SnippetGenerator; the
+        notes_text plain-text field is used instead.  The query uses the
+        notes.note: prefix so that it targets note content directly and
+        the document matches the query.  The snippet itself is then
+        generated from notes_text, which stores the same text as the
+        notes JSON field.
+        """
+        user = User.objects.create_user("hl_noteuser")
+        doc = Document.objects.create(
+            title="Doc with matching note",
+            content="unrelated content",
+            checksum="HH2",
+            pk=91,
+        )
+        Note.objects.create(document=doc, note="urgent payment required", user=user)
+        backend.add_or_update(doc)
+
+        # Use notes.note: prefix so the document matches the query and the
+        # notes_text snippet generator can produce highlights.
+        hits = backend.highlight_hits("notes.note:urgent", [doc.pk])
+
+        assert len(hits) == 1
+        highlights = hits[0]["highlights"]
+        assert "notes" in highlights
+        note_highlight = highlights["notes"]
+        assert isinstance(note_highlight, str), (
+            f"Expected str, got {type(note_highlight)}: {note_highlight!r}"
+        )
+        assert "<b>" in note_highlight, (
+            f"Expected HTML with <b> tags, got: {note_highlight!r}"
+        )
+
+    def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend):
+        """highlight_hits with no doc IDs must return an empty list."""
+        hits = backend.highlight_hits("anything", [])
+        assert hits == []
+
+    def test_no_highlights_when_no_match(self, backend: TantivyBackend):
+        """Documents not matching the query should not appear in results."""
+        doc = Document.objects.create(
+            title="Unrelated",
+            content="completely different text",
+            checksum="HH3",
+            pk=92,
+        )
+        backend.add_or_update(doc)
+
+        hits = backend.highlight_hits("quick", [doc.pk])
+
+        assert len(hits) == 0
@@ -1503,6 +1503,126 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
            [d2.id, d1.id, d3.id],
        )

+    def test_search_ordering_by_score(self) -> None:
+        """ordering=-score must return results in descending relevance order (best first)."""
+        backend = get_backend()
+        # doc_high has more occurrences of the search term → higher BM25 score
+        doc_low = Document.objects.create(
+            title="score sort low",
+            content="apple",
+            checksum="SCL1",
+        )
+        doc_high = Document.objects.create(
+            title="score sort high",
+            content="apple apple apple apple apple",
+            checksum="SCH1",
+        )
+        backend.add_or_update(doc_low)
+        backend.add_or_update(doc_high)
+
+        # -score = descending = best first (highest score)
+        response = self.client.get("/api/documents/?query=apple&ordering=-score")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        ids = [r["id"] for r in response.data["results"]]
+        self.assertEqual(
+            ids[0],
+            doc_high.id,
+            "Most relevant doc should be first for -score",
+        )
+
+        # score = ascending = worst first (lowest score)
+        response = self.client.get("/api/documents/?query=apple&ordering=score")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        ids = [r["id"] for r in response.data["results"]]
+        self.assertEqual(
+            ids[0],
+            doc_low.id,
+            "Least relevant doc should be first for +score",
+        )
+
+    def test_search_with_tantivy_native_sort(self) -> None:
+        """When ordering by a Tantivy-sortable field, results must be correctly sorted."""
+        backend = get_backend()
+        for i, asn in enumerate([30, 10, 20]):
+            doc = Document.objects.create(
+                title=f"sortable doc {i}",
+                content="searchable content",
+                checksum=f"TNS{i}",
+                archive_serial_number=asn,
+            )
+            backend.add_or_update(doc)
+
+        response = self.client.get(
+            "/api/documents/?query=searchable&ordering=archive_serial_number",
+        )
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        asns = [doc["archive_serial_number"] for doc in response.data["results"]]
+        self.assertEqual(asns, [10, 20, 30])
+
+        response = self.client.get(
+            "/api/documents/?query=searchable&ordering=-archive_serial_number",
+        )
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        asns = [doc["archive_serial_number"] for doc in response.data["results"]]
+        self.assertEqual(asns, [30, 20, 10])
+
+    def test_search_page_2_returns_correct_slice(self) -> None:
+        """Page 2 must return the second slice, not overlap with page 1."""
+        backend = get_backend()
+        for i in range(10):
+            doc = Document.objects.create(
+                title=f"doc {i}",
+                content="paginated content",
+                checksum=f"PG2{i}",
+                archive_serial_number=i + 1,
+            )
+            backend.add_or_update(doc)
+
+        response = self.client.get(
+            "/api/documents/?query=paginated&ordering=archive_serial_number&page=1&page_size=3",
+        )
+        page1_ids = [r["id"] for r in response.data["results"]]
+        self.assertEqual(len(page1_ids), 3)
+
+        response = self.client.get(
+            "/api/documents/?query=paginated&ordering=archive_serial_number&page=2&page_size=3",
+        )
+        page2_ids = [r["id"] for r in response.data["results"]]
+        self.assertEqual(len(page2_ids), 3)
+
+        # No overlap between pages
+        self.assertEqual(set(page1_ids) & set(page2_ids), set())
+        # Page 2 ASNs are higher than page 1
+        page1_asns = [
+            Document.objects.get(pk=pk).archive_serial_number for pk in page1_ids
+        ]
+        page2_asns = [
+            Document.objects.get(pk=pk).archive_serial_number for pk in page2_ids
+        ]
+        self.assertTrue(max(page1_asns) < min(page2_asns))
+
+    def test_search_all_field_contains_all_ids_when_paginated(self) -> None:
+        """The 'all' field must contain every matching ID, even when paginated."""
+        backend = get_backend()
+        doc_ids = []
+        for i in range(10):
+            doc = Document.objects.create(
+                title=f"all field doc {i}",
+                content="allfield content",
+                checksum=f"AF{i}",
+            )
+            backend.add_or_update(doc)
+            doc_ids.append(doc.pk)
+
+        response = self.client.get(
+            "/api/documents/?query=allfield&page=1&page_size=3",
+            headers={"Accept": "application/json; version=9"},
+        )
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertEqual(len(response.data["results"]), 3)
+        # "all" must contain ALL 10 matching IDs
+        self.assertCountEqual(response.data["all"], doc_ids)
+
    @mock.patch("documents.bulk_edit.bulk_update_documents")
    def test_global_search(self, m) -> None:
        """
@@ -1,4 +1,5 @@
 from io import StringIO
+from unittest.mock import patch

 import pytest
 from django.core.management import CommandError
@@ -6,12 +7,11 @@ from django.core.management import call_command
 from django.test import TestCase

 from documents.models import Document
+from documents.tests.factories import DocumentFactory


@pytest.mark.management
 class TestFuzzyMatchCommand(TestCase):
-    MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
-
    def call_command(self, *args, **kwargs):
        stdout = StringIO()
        stderr = StringIO()
@@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase):
            filename="other_test.pdf",
        )
        stdout, _ = self.call_command()
-        self.assertIn("No matches found", stdout)
+        self.assertIn("No duplicate documents found", stdout)

    def test_with_matches(self) -> None:
        """
@@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase):
            filename="other_test.pdf",
        )
        stdout, _ = self.call_command("--processes", "1")
-        self.assertRegex(stdout, self.MSG_REGEX)
+        self.assertIn("Found 1 matching pair(s)", stdout)

    def test_with_3_matches(self) -> None:
        """
@@ -142,10 +142,8 @@ class TestFuzzyMatchCommand(TestCase):
            filename="final_test.pdf",
        )
        stdout, _ = self.call_command("--no-progress-bar", "--processes", "1")
-        lines = [x.strip() for x in stdout.splitlines() if x.strip()]
-        self.assertEqual(len(lines), 3)
-        for line in lines:
-            self.assertRegex(line, self.MSG_REGEX)
+        # 3 docs -> 3 unique pairs; summary confirms count and no duplication
+        self.assertIn("Found 3 matching pair(s)", stdout)

    def test_document_deletion(self) -> None:
        """
@@ -186,22 +184,47 @@ class TestFuzzyMatchCommand(TestCase):

        stdout, _ = self.call_command(
            "--delete",
+            "--yes",
            "--no-progress-bar",
            "--processes",
            "1",
        )

-        self.assertIn(
-            "The command is configured to delete documents.  Use with caution",
-            stdout,
-        )
-        self.assertRegex(stdout, self.MSG_REGEX)
-        self.assertIn("Deleting 1 documents based on ratio matches", stdout)
+        self.assertIn("Delete Mode", stdout)
+        self.assertIn("Found 1 matching pair(s)", stdout)
+        self.assertIn("Deleting 1 document(s)", stdout)

        self.assertEqual(Document.objects.count(), 2)
        self.assertIsNotNone(Document.objects.get(pk=1))
        self.assertIsNotNone(Document.objects.get(pk=2))

+    def test_document_deletion_cancelled(self) -> None:
+        """
+        GIVEN:
+            - 3 documents exist
+            - Document 1 to document 3 has a similarity over 85.0
+        WHEN:
+            - Command is called with --delete but user answers "n" at the prompt
+        THEN:
+            - No documents are deleted
+        """
+        DocumentFactory(content="first document scanned by bob")
+        DocumentFactory(content="second document scanned by alice")
+        DocumentFactory(content="first document scanned by pete")
+
+        self.assertEqual(Document.objects.count(), 3)
+
+        with patch("builtins.input", return_value="n"):
+            stdout, _ = self.call_command(
+                "--delete",
+                "--no-progress-bar",
+                "--processes",
+                "1",
+            )
+
+        self.assertIn("Deletion cancelled", stdout)
+        self.assertEqual(Document.objects.count(), 3)
+
    def test_empty_content(self) -> None:
        """
        GIVEN:
@@ -226,4 +249,30 @@ class TestFuzzyMatchCommand(TestCase):
            filename="other_test.pdf",
        )
        stdout, _ = self.call_command()
-        self.assertIn("No matches found", stdout)
+        self.assertIn("No duplicate documents found", stdout)
+
+
+@pytest.mark.management
+@pytest.mark.django_db
+@pytest.mark.parametrize(
+    ("content_a", "content_b"),
+    [
+        pytest.param("x" * 90 + "y" * 10, "x" * 100, id="yellow-90pct"),  # 88-92%
+        pytest.param("x" * 94 + "y" * 6, "x" * 100, id="red-94pct"),  # 92-97%
+        pytest.param("x" * 99 + "y", "x" * 100, id="bold-red-99pct"),  # ≥97%
+    ],
+)
+def test_similarity_color_band(content_a: str, content_b: str) -> None:
+    """Each parametrized case exercises one color branch in _render_results."""
+    DocumentFactory(content=content_a)
+    DocumentFactory(content=content_b)
+    stdout = StringIO()
+    call_command(
+        "document_fuzzy_match",
+        "--no-progress-bar",
+        "--processes",
+        "1",
+        stdout=stdout,
+        skip_checks=True,
+    )
+    assert "Found 1 matching pair(s)" in stdout.getvalue()
@@ -38,6 +38,7 @@ from django.db.models import Model
 from django.db.models import OuterRef
 from django.db.models import Prefetch
 from django.db.models import Q
+from django.db.models import QuerySet
 from django.db.models import Subquery
 from django.db.models import Sum
 from django.db.models import When
@@ -249,6 +250,13 @@ if settings.AUDIT_LOG_ENABLED:

 logger = logging.getLogger("paperless.api")

+# Crossover point for intersect_and_order: at or below this count use a
+# targeted IN-clause query; above this count fall back to a full-table scan +
+# Python set intersection.  The IN-clause is faster for small result sets but
+# degrades on SQLite with thousands of parameters.  PostgreSQL handles large IN
+# clauses efficiently, so this threshold mainly protects SQLite users.
+_TANTIVY_INTERSECT_THRESHOLD = 5_000
+

 class IndexView(TemplateView):
    template_name = "index.html"
@@ -2077,19 +2085,16 @@ class UnifiedSearchViewSet(DocumentViewSet):
        if not self._is_search_request():
            return super().list(request)

+        from documents.search import SearchHit
        from documents.search import SearchMode
+        from documents.search import TantivyBackend
        from documents.search import TantivyRelevanceList
        from documents.search import get_backend

-        try:
-            backend = get_backend()
-            # ORM-filtered queryset: permissions + field filters + ordering (DRF backends applied)
-            filtered_qs = self.filter_queryset(self.get_queryset())
-
-            user = None if request.user.is_superuser else request.user
-            active_search_params = self._get_active_search_params(request)
-
-            if len(active_search_params) > 1:
+        def parse_search_params() -> tuple[str | None, bool, bool, int, int]:
+            """Validate that only one search param is active; extract ordering and pagination."""
+            active = self._get_active_search_params(request)
+            if len(active) > 1:
                raise ValidationError(
                    {
                        "detail": _(
@@ -2098,73 +2103,161 @@ class UnifiedSearchViewSet(DocumentViewSet):
                    },
                )

-            if (
-                "text" in request.query_params
-                or "title_search" in request.query_params
-                or "query" in request.query_params
-            ):
-                if "text" in request.query_params:
-                    search_mode = SearchMode.TEXT
-                    query_str = request.query_params["text"]
-                elif "title_search" in request.query_params:
-                    search_mode = SearchMode.TITLE
-                    query_str = request.query_params["title_search"]
-                else:
-                    search_mode = SearchMode.QUERY
-                    query_str = request.query_params["query"]
-                results = backend.search(
-                    query_str,
-                    user=user,
-                    page=1,
-                    page_size=10000,
-                    sort_field=None,
-                    sort_reverse=False,
-                    search_mode=search_mode,
-                )
-            else:
-                # more_like_id — validate permission on the seed document first
-                try:
-                    more_like_doc_id = int(request.query_params["more_like_id"])
-                    more_like_doc = Document.objects.select_related("owner").get(
-                        pk=more_like_doc_id,
+            ordering_param = request.query_params.get("ordering", "")
+            sort_reverse = ordering_param.startswith("-")
+            sort_field_name = ordering_param.lstrip("-") or None
+            # "score" means relevance order — Tantivy handles it natively,
+            # so treat it as a Tantivy sort to preserve the ranked order through
+            # the ORM intersection step.
+            use_tantivy_sort = (
+                sort_field_name in TantivyBackend.SORTABLE_FIELDS
+                or sort_field_name is None
+                or sort_field_name == "score"
+            )
+
+            try:
+                page_num = int(request.query_params.get("page", 1))
+            except (TypeError, ValueError):
+                page_num = 1
+            page_size = (
+                self.paginator.get_page_size(request) or self.paginator.page_size
+            )
+
+            return sort_field_name, sort_reverse, use_tantivy_sort, page_num, page_size
+
+        def intersect_and_order(
+            all_ids: list[int],
+            filtered_qs: QuerySet[Document],
+            *,
+            use_tantivy_sort: bool,
+        ) -> list[int]:
+            """Intersect search IDs with ORM-visible IDs, preserving order."""
+            if not all_ids:
+                return []
+            if use_tantivy_sort:
+                if len(all_ids) <= _TANTIVY_INTERSECT_THRESHOLD:
+                    # Small result set: targeted IN-clause avoids a full-table scan.
+                    visible_ids = set(
+                        filtered_qs.filter(pk__in=all_ids).values_list("pk", flat=True),
                    )
-                except (TypeError, ValueError, Document.DoesNotExist):
-                    raise PermissionDenied(_("Invalid more_like_id"))
+                else:
+                    # Large result set: full-table scan + Python intersection is faster
+                    # than a large IN-clause on SQLite.
+                    visible_ids = set(
+                        filtered_qs.values_list("pk", flat=True),
+                    )
+                return [doc_id for doc_id in all_ids if doc_id in visible_ids]
+            return list(
+                filtered_qs.filter(id__in=all_ids).values_list("pk", flat=True),
+            )

-                if not has_perms_owner_aware(
-                    request.user,
-                    "view_document",
-                    more_like_doc,
-                ):
-                    raise PermissionDenied(_("Insufficient permissions."))
-
-                results = backend.more_like_this(
-                    more_like_doc_id,
-                    user=user,
-                    page=1,
-                    page_size=10000,
-                )
-
-            hits_by_id = {h["id"]: h for h in results.hits}
-
-            # Determine sort order: no ordering param -> Tantivy relevance; otherwise -> ORM order
-            ordering_param = request.query_params.get("ordering", "").lstrip("-")
-            if not ordering_param:
-                # Preserve Tantivy relevance order; intersect with ORM-visible IDs
-                orm_ids = set(filtered_qs.values_list("pk", flat=True))
-                ordered_hits = [h for h in results.hits if h["id"] in orm_ids]
+        def run_text_search(
+            backend: TantivyBackend,
+            user: User | None,
+            filtered_qs: QuerySet[Document],
+        ) -> tuple[list[int], list[SearchHit], int]:
+            """Handle text/title/query search: IDs, ORM intersection, page highlights."""
+            if "text" in request.query_params:
+                search_mode = SearchMode.TEXT
+                query_str = request.query_params["text"]
+            elif "title_search" in request.query_params:
+                search_mode = SearchMode.TITLE
+                query_str = request.query_params["title_search"]
            else:
-                # Use ORM ordering (already applied by DocumentsOrderingFilter)
-                hit_ids = set(hits_by_id.keys())
-                orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list(
-                    "pk",
-                    flat=True,
-                )
-                ordered_hits = [
-                    hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id
-                ]
+                search_mode = SearchMode.QUERY
+                query_str = request.query_params["query"]

-            rl = TantivyRelevanceList(ordered_hits)
+            # "score" is not a real Tantivy sort field — it means relevance order,
+            # which is Tantivy's default when no sort field is specified.
+            is_score_sort = sort_field_name == "score"
+            all_ids = backend.search_ids(
+                query_str,
+                user=user,
+                sort_field=(
+                    None if (not use_tantivy_sort or is_score_sort) else sort_field_name
+                ),
+                sort_reverse=sort_reverse,
+                search_mode=search_mode,
+            )
+            ordered_ids = intersect_and_order(
+                all_ids,
+                filtered_qs,
+                use_tantivy_sort=use_tantivy_sort,
+            )
+            # Tantivy returns relevance results best-first (descending score).
+            # ordering=score (ascending, worst-first) requires a reversal.
+            if is_score_sort and not sort_reverse:
+                ordered_ids = list(reversed(ordered_ids))
+
+            page_offset = (page_num - 1) * page_size
+            page_ids = ordered_ids[page_offset : page_offset + page_size]
+            page_hits = backend.highlight_hits(
+                query_str,
+                page_ids,
+                search_mode=search_mode,
+                rank_start=page_offset + 1,
+            )
+            return ordered_ids, page_hits, page_offset
+
+        def run_more_like_this(
+            backend: TantivyBackend,
+            user: User | None,
+            filtered_qs: QuerySet[Document],
+        ) -> tuple[list[int], list[SearchHit], int]:
+            """Handle more_like_id search: permission check, IDs, stub hits."""
+            try:
+                more_like_doc_id = int(request.query_params["more_like_id"])
+                more_like_doc = Document.objects.select_related("owner").get(
+                    pk=more_like_doc_id,
+                )
+            except (TypeError, ValueError, Document.DoesNotExist):
+                raise PermissionDenied(_("Invalid more_like_id"))
+
+            if not has_perms_owner_aware(
+                request.user,
+                "view_document",
+                more_like_doc,
+            ):
+                raise PermissionDenied(_("Insufficient permissions."))
+
+            all_ids = backend.more_like_this_ids(more_like_doc_id, user=user)
+            ordered_ids = intersect_and_order(
+                all_ids,
+                filtered_qs,
+                use_tantivy_sort=True,
+            )
+
+            page_offset = (page_num - 1) * page_size
+            page_ids = ordered_ids[page_offset : page_offset + page_size]
+            page_hits = [
+                SearchHit(id=doc_id, score=0.0, rank=rank, highlights={})
+                for rank, doc_id in enumerate(page_ids, start=page_offset + 1)
+            ]
+            return ordered_ids, page_hits, page_offset
+
+        try:
+            sort_field_name, sort_reverse, use_tantivy_sort, page_num, page_size = (
+                parse_search_params()
+            )
+
+            backend = get_backend()
+            filtered_qs = self.filter_queryset(self.get_queryset())
+            user = None if request.user.is_superuser else request.user
+
+            if "more_like_id" in request.query_params:
+                ordered_ids, page_hits, page_offset = run_more_like_this(
+                    backend,
+                    user,
+                    filtered_qs,
+                )
+            else:
+                ordered_ids, page_hits, page_offset = run_text_search(
+                    backend,
+                    user,
+                    filtered_qs,
+                )
+
+            rl = TantivyRelevanceList(ordered_ids, page_hits, page_offset)
            page = self.paginate_queryset(rl)

            if page is not None:
@@ -2174,15 +2267,18 @@ class UnifiedSearchViewSet(DocumentViewSet):
                if get_boolean(
                    str(request.query_params.get("include_selection_data", "false")),
                ):
-                    all_ids = [h["id"] for h in ordered_hits]
+                    # NOTE: pk__in=ordered_ids generates a large SQL IN clause
+                    # for big result sets.  Acceptable today but may need a temp
+                    # table or chunked approach if selection_data becomes slow
+                    # at scale (tens of thousands of matching documents).
                    response.data["selection_data"] = (
                        self._get_selection_data_for_queryset(
-                            filtered_qs.filter(pk__in=all_ids),
+                            filtered_qs.filter(pk__in=ordered_ids),
                        )
                    )
                return response

-            serializer = self.get_serializer(ordered_hits, many=True)
+            serializer = self.get_serializer(page_hits, many=True)
            return Response(serializer.data)

        except NotFound:
@@ -3088,20 +3184,17 @@ class GlobalSearchView(PassUserMixin):
                docs = all_docs.filter(title__icontains=query)[:OBJECT_LIMIT]
            else:
                user = None if request.user.is_superuser else request.user
-                fts_results = get_backend().search(
+                matching_ids = get_backend().search_ids(
                    query,
                    user=user,
-                    page=1,
-                    page_size=1000,
-                    sort_field=None,
-                    sort_reverse=False,
                    search_mode=SearchMode.TEXT,
+                    limit=OBJECT_LIMIT * 3,
                )
-                docs_by_id = all_docs.in_bulk([hit["id"] for hit in fts_results.hits])
+                docs_by_id = all_docs.in_bulk(matching_ids)
                docs = [
-                    docs_by_id[hit["id"]]
-                    for hit in fts_results.hits
-                    if hit["id"] in docs_by_id
+                    docs_by_id[doc_id]
+                    for doc_id in matching_ids
+                    if doc_id in docs_by_id
                ][:OBJECT_LIMIT]
        saved_views = (
            get_objects_for_user_owner_aware(
@@ -2,7 +2,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: paperless-ngx\n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2026-04-15 18:58+0000\n"
+"POT-Creation-Date: 2026-04-15 23:21+0000\n"
 "PO-Revision-Date: 2022-02-17 04:17\n"
 "Last-Translator: \n"
 "Language-Team: English\n"
@@ -1308,8 +1308,8 @@ msgid "workflow runs"
 msgstr ""

 #: documents/serialisers.py:463 documents/serialisers.py:815
-#: documents/serialisers.py:2547 documents/views.py:2139
-#: documents/views.py:2194 paperless_mail/serialisers.py:143
+#: documents/serialisers.py:2547 documents/views.py:2221
+#: documents/views.py:2290 paperless_mail/serialisers.py:143
 msgid "Insufficient permissions."
 msgstr ""

@@ -1349,7 +1349,7 @@ msgstr ""
 msgid "Duplicate document identifiers are not allowed."
 msgstr ""

-#: documents/serialisers.py:2633 documents/views.py:3811
+#: documents/serialisers.py:2633 documents/views.py:3904
 #, python-format
 msgid "Documents not found: %(ids)s"
 msgstr ""
@@ -1617,28 +1617,28 @@ msgstr ""
 msgid "Unable to parse URI {value}"
 msgstr ""

-#: documents/views.py:2096
+#: documents/views.py:2101
 msgid "Specify only one of text, title_search, query, or more_like_id."
 msgstr ""

-#: documents/views.py:2132 documents/views.py:2191
+#: documents/views.py:2214 documents/views.py:2287
 msgid "Invalid more_like_id"
 msgstr ""

-#: documents/views.py:3823
+#: documents/views.py:3916
 #, python-format
 msgid "Insufficient permissions to share document %(id)s."
 msgstr ""

-#: documents/views.py:3866
+#: documents/views.py:3959
 msgid "Bundle is already being processed."
 msgstr ""

-#: documents/views.py:3923
+#: documents/views.py:4016
 msgid "The share link bundle is still being prepared. Please try again later."
 msgstr ""

-#: documents/views.py:3933
+#: documents/views.py:4026
 msgid "The share link bundle is unavailable."
 msgstr ""

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Crowdin Bot	eae75e8798	New Crowdin translations by GitHub Action	2026-04-16 01:00:16 +00:00
GitHub Actions	dc06b679d3	Auto translate strings	2026-04-15 23:22:00 +00:00
Trenton H	3ffbb8862c	Feature: paginate search highlights and remove 10k document search limit (#12518 ) Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>	2026-04-15 23:20:31 +00:00
Trenton H	21db608d57	Feature: Tune SQLite default settings for increased speed (#12580 )	2026-04-15 15:26:00 -07:00
Trenton H	2fd1a1cf3a	Feature: Document fuzzy match improvements (#12579 ) Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 20:59:43 +00:00