From 3ffbb8862c8ef7f5ae3c86e899399778035ce5ce Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:20:31 -0700 Subject: [PATCH] Feature: paginate search highlights and remove 10k document search limit (#12518) Co-authored-by: Claude Opus 4.6 Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> --- .../document-card-large.component.html | 4 +- .../document-card-large.component.scss | 4 +- .../document-card-large.component.spec.ts | 13 + .../document-card-large.component.ts | 11 + src/documents/search/__init__.py | 4 +- src/documents/search/_backend.py | 586 +++++++++++------- src/documents/search/_query.py | 15 +- src/documents/search/_schema.py | 3 + src/documents/tests/search/test_backend.py | 545 +++++++--------- src/documents/tests/test_api_search.py | 120 ++++ src/documents/views.py | 261 +++++--- src/paperless/views.py | 2 +- 12 files changed, 904 insertions(+), 664 deletions(-) diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html index f0da79260..1c69d0f53 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html @@ -43,7 +43,7 @@

@if (document) { - @if (document.__search_hit__ && document.__search_hit__.highlights) { + @if (hasSearchHighlights) { } @for (highlight of searchNoteHighlights; track highlight) { @@ -52,7 +52,7 @@ } - @if (!document.__search_hit__?.score) { + @if (shouldShowContentFallback) { {{contentTrimmed}} } } @else { diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.scss b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.scss index 75174a680..359346867 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.scss +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.scss @@ -65,7 +65,9 @@ } } -span ::ng-deep .match { +.card-text ::ng-deep .match, +.card-text ::ng-deep b { + font-weight: normal; color: black; background-color: rgb(255, 211, 66); } diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts index 4d62c6a0a..e4e2bca74 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts @@ -127,6 +127,19 @@ describe('DocumentCardLargeComponent', () => { expect(component.searchNoteHighlights).toContain('bananas') }) + it('should fall back to document content when a search hit has no highlights', () => { + component.document.__search_hit__ = { + score: 0.9, + rank: 1, + highlights: '', + note_highlights: null, + } + fixture.detectChanges() + + expect(fixture.nativeElement.textContent).toContain('Cupcake ipsum') + expect(component.shouldShowContentFallback).toBe(true) + }) + it('should try to close the preview on mouse leave', () => { component.popupPreview = { close: jest.fn(), diff --git 
a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts index 74dccfaf3..8f77489ec 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts @@ -164,6 +164,17 @@ export class DocumentCardLargeComponent ) } + get hasSearchHighlights() { + return Boolean(this.document?.__search_hit__?.highlights?.trim()?.length) + } + + get shouldShowContentFallback() { + return ( + this.document?.__search_hit__?.score == null || + (!this.hasSearchHighlights && this.searchNoteHighlights.length === 0) + ) + } + get notesEnabled(): boolean { return this.settingsService.get(SETTINGS_KEYS.NOTES_ENABLED) } diff --git a/src/documents/search/__init__.py b/src/documents/search/__init__.py index a4145d7ef..3f231b9c7 100644 --- a/src/documents/search/__init__.py +++ b/src/documents/search/__init__.py @@ -1,6 +1,6 @@ +from documents.search._backend import SearchHit from documents.search._backend import SearchIndexLockError from documents.search._backend import SearchMode -from documents.search._backend import SearchResults from documents.search._backend import TantivyBackend from documents.search._backend import TantivyRelevanceList from documents.search._backend import WriteBatch @@ -10,9 +10,9 @@ from documents.search._schema import needs_rebuild from documents.search._schema import wipe_index __all__ = [ + "SearchHit", "SearchIndexLockError", "SearchMode", - "SearchResults", "TantivyBackend", "TantivyRelevanceList", "WriteBatch", diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 2005a436f..8d2e974f2 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -1,9 +1,9 @@ from __future__ import annotations import logging +import re import threading from 
collections import Counter -from dataclasses import dataclass from datetime import UTC from datetime import datetime from enum import StrEnum @@ -88,45 +88,63 @@ class SearchHit(TypedDict): highlights: dict[str, str] -@dataclass(frozen=True, slots=True) -class SearchResults: - """ - Container for search results with pagination metadata. - - Attributes: - hits: List of search results with scores and highlights - total: Total matching documents across all pages (for pagination) - query: Preprocessed query string after date/syntax rewriting - """ - - hits: list[SearchHit] - total: int # total matching documents (for pagination) - query: str # preprocessed query string - - class TantivyRelevanceList: """ - DRF-compatible list wrapper for Tantivy search hits. + DRF-compatible list wrapper for Tantivy search results. - Provides paginated access to search results while storing all hits in memory - for efficient ID retrieval. Used by Django REST framework for pagination. + Holds a lightweight ordered list of IDs (for pagination count and + ``selection_data``) together with a small page of rich ``SearchHit`` + dicts (for serialization). DRF's ``PageNumberPagination`` calls + ``__len__`` to compute the total page count and ``__getitem__`` to + slice the displayed page. - Methods: - __len__: Returns total hit count for pagination calculations - __getitem__: Slices the hit list for page-specific results - - Note: Stores ALL post-filter hits so get_all_result_ids() can return - every matching document ID without requiring a second search query. + Args: + ordered_ids: All matching document IDs in display order. + page_hits: Rich SearchHit dicts for the requested DRF page only. + page_offset: Index into *ordered_ids* where *page_hits* starts. 
""" - def __init__(self, hits: list[SearchHit]) -> None: - self._hits = hits + def __init__( + self, + ordered_ids: list[int], + page_hits: list[SearchHit], + page_offset: int = 0, + ) -> None: + self._ordered_ids = ordered_ids + self._page_hits = page_hits + self._page_offset = page_offset def __len__(self) -> int: - return len(self._hits) + return len(self._ordered_ids) - def __getitem__(self, key: slice) -> list[SearchHit]: - return self._hits[key] + def __getitem__(self, key: int | slice) -> SearchHit | list[SearchHit]: + if isinstance(key, int): + idx = key if key >= 0 else len(self._ordered_ids) + key + if self._page_offset <= idx < self._page_offset + len(self._page_hits): + return self._page_hits[idx - self._page_offset] + return SearchHit( + id=self._ordered_ids[key], + score=0.0, + rank=idx + 1, + highlights={}, + ) + start = key.start or 0 + stop = key.stop or len(self._ordered_ids) + # DRF slices to extract the current page. If the slice aligns + # with our pre-fetched page_hits, return them directly. + # We only check start — DRF always slices with stop=start+page_size, + # which exceeds page_hits length on the last page. + if start == self._page_offset: + return self._page_hits[: stop - start] + # Fallback: return stub dicts (no highlights). + return [ + SearchHit(id=doc_id, score=0.0, rank=start + i + 1, highlights={}) + for i, doc_id in enumerate(self._ordered_ids[key]) + ] + + def get_all_ids(self) -> list[int]: + """Return all matching document IDs in display order.""" + return self._ordered_ids class SearchIndexLockError(Exception): @@ -206,10 +224,13 @@ class WriteBatch: """ Remove a document from the batch by its primary key. - Uses range query instead of term query to work around unsigned integer - type detection bug in tantivy-py 0.25. + Uses range_query instead of term_query to work around a tantivy-py bug + where Python integers are inferred as i64, producing Terms that never + match u64 fields. 
+ + TODO: Replace with term_query("id", doc_id) once + https://github.com/quickwit-oss/tantivy-py/pull/642 lands. """ - # Use range query to work around u64 deletion bug self._writer.delete_documents_by_query( tantivy.Query.range_query( self._backend._schema, @@ -234,6 +255,34 @@ class TantivyBackend: the underlying index directory changes (e.g., during test isolation). """ + # Maps DRF ordering field names to Tantivy index field names. + SORT_FIELD_MAP: dict[str, str] = { + "title": "title_sort", + "correspondent__name": "correspondent_sort", + "document_type__name": "type_sort", + "created": "created", + "added": "added", + "modified": "modified", + "archive_serial_number": "asn", + "page_count": "page_count", + "num_notes": "num_notes", + } + + # Fields where Tantivy's sort order matches the ORM's sort order. + # Text-based fields (title, correspondent__name, document_type__name) + # are excluded because Tantivy's tokenized fast fields produce different + # ordering than the ORM's collation-based ordering. 
+ SORTABLE_FIELDS: frozenset[str] = frozenset( + { + "created", + "added", + "modified", + "archive_serial_number", + "page_count", + "num_notes", + }, + ) + def __init__(self, path: Path | None = None): # path=None → in-memory index (for tests) # path=some_dir → on-disk index (for production) @@ -272,6 +321,36 @@ class TantivyBackend: if self._index is None: self.open() # pragma: no cover + def _parse_query( + self, + query: str, + search_mode: SearchMode, + ) -> tantivy.Query: + """Parse a user query string into a Tantivy Query object.""" + tz = get_current_timezone() + if search_mode is SearchMode.TEXT: + return parse_simple_text_query(self._index, query) + elif search_mode is SearchMode.TITLE: + return parse_simple_title_query(self._index, query) + else: + return parse_user_query(self._index, query, tz) + + def _apply_permission_filter( + self, + query: tantivy.Query, + user: AbstractBaseUser | None, + ) -> tantivy.Query: + """Wrap a query with a permission filter if the user is not a superuser.""" + if user is not None: + permission_filter = build_permission_filter(self._schema, user) + return tantivy.Query.boolean_query( + [ + (tantivy.Occur.Must, query), + (tantivy.Occur.Must, permission_filter), + ], + ) + return query + def _build_tantivy_doc( self, document: Document, @@ -326,12 +405,17 @@ class TantivyBackend: doc.add_unsigned("tag_id", tag.pk) tag_names.append(tag.name) - # Notes — JSON for structured queries (notes.user:alice, notes.note:text), - # companion text field for default full-text search. + # Notes — JSON for structured queries (notes.user:alice, notes.note:text). + # notes_text is a plain-text companion for snippet/highlight generation; + # tantivy's SnippetGenerator does not support JSON fields. 
num_notes = 0 + note_texts: list[str] = [] for note in document.notes.all(): num_notes += 1 doc.add_json("notes", {"note": note.note, "user": note.user.username}) + note_texts.append(note.note) + if note_texts: + doc.add_text("notes_text", " ".join(note_texts)) # Custom fields — JSON for structured queries (custom_fields.name:x, custom_fields.value:y), # companion text field for default full-text search. @@ -425,155 +509,125 @@ class TantivyBackend: with self.batch_update(lock_timeout=5.0) as batch: batch.remove(doc_id) - def search( + def highlight_hits( self, query: str, - user: AbstractBaseUser | None, - page: int, - page_size: int, - sort_field: str | None, + doc_ids: list[int], *, - sort_reverse: bool, search_mode: SearchMode = SearchMode.QUERY, - ) -> SearchResults: + rank_start: int = 1, + ) -> list[SearchHit]: """ - Execute a search query against the document index. + Generate SearchHit dicts with highlights for specific document IDs. - Processes the user query through date rewriting, normalization, and - permission filtering before executing against Tantivy. Supports both - relevance-based and field-based sorting. + Unlike search(), this does not execute a ranked query — it looks up + each document by ID and generates snippets against the provided query. + Use this when you already know which documents to display (from + search_ids + ORM filtering) and just need highlight data. - QUERY search mode supports natural date keywords, field filters, etc. - TITLE search mode treats the query as plain text to search for in title only - TEXT search mode treats the query as plain text to search for in title and content + Note: Each doc_id requires an individual index lookup because tantivy-py + does not yet expose a batch fast-field read API. This is acceptable for + page-sized batches (typically 25 docs) but should not be called with + thousands of IDs. 
+ + TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands, + the per-doc range_query lookups here can be replaced with a single + collect_u64_fast_field("id", doc_addresses) call. Args: - query: User's search query - user: User for permission filtering (None for superuser/no filtering) - page: Page number (1-indexed) for pagination - page_size: Number of results per page - sort_field: Field to sort by (None for relevance ranking) - sort_reverse: Whether to reverse the sort order - search_mode: "query" for advanced Tantivy syntax, "text" for - plain-text search over title and content only, "title" for - plain-text search over title only + query: The search query (used for snippet generation) + doc_ids: Ordered list of document IDs to generate hits for + search_mode: Query parsing mode (for building the snippet query) + rank_start: Starting rank value (1-based absolute position in the + full result set; pass ``page_offset + 1`` for paginated calls) Returns: - SearchResults with hits, total count, and processed query + List of SearchHit dicts in the same order as doc_ids """ - self._ensure_open() - tz = get_current_timezone() - if search_mode is SearchMode.TEXT: - user_query = parse_simple_text_query(self._index, query) - elif search_mode is SearchMode.TITLE: - user_query = parse_simple_title_query(self._index, query) - else: - user_query = parse_user_query(self._index, query, tz) + if not doc_ids: + return [] - # Apply permission filter if user is not None (not superuser) - if user is not None: - permission_filter = build_permission_filter(self._schema, user) - final_query = tantivy.Query.boolean_query( - [ - (tantivy.Occur.Must, user_query), - (tantivy.Occur.Must, permission_filter), - ], + self._ensure_open() + user_query = self._parse_query(query, search_mode) + + # For notes_text snippet generation, we need a query that targets the + # notes_text field directly. user_query may contain JSON-field terms + # (e.g. 
notes.note:urgent) that the SnippetGenerator cannot resolve + # against a text field. Strip field:value prefixes so bare terms like + # "urgent" are re-parsed against notes_text, producing highlights even + # when the original query used structured syntax. + bare_query = re.sub(r"\w[\w.]*:", "", query).strip() + try: + notes_text_query = ( + self._index.parse_query(bare_query, ["notes_text"]) + if bare_query + else user_query ) - else: - final_query = user_query + except Exception: + notes_text_query = user_query searcher = self._index.searcher() - offset = (page - 1) * page_size - - # Map sort fields - sort_field_map = { - "title": "title_sort", - "correspondent__name": "correspondent_sort", - "document_type__name": "type_sort", - "created": "created", - "added": "added", - "modified": "modified", - "archive_serial_number": "asn", - "page_count": "page_count", - "num_notes": "num_notes", - } - - # Perform search - if sort_field and sort_field in sort_field_map: - mapped_field = sort_field_map[sort_field] - results = searcher.search( - final_query, - limit=offset + page_size, - order_by_field=mapped_field, - order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc, - ) - # Field sorting: hits are still (score, DocAddress) tuples; score unused - all_hits = [(hit[1], 0.0) for hit in results.hits] - else: - # Score-based search: hits are (score, DocAddress) tuples - results = searcher.search(final_query, limit=offset + page_size) - all_hits = [(hit[1], hit[0]) for hit in results.hits] - - total = results.count - - # Normalize scores for score-based searches - if not sort_field and all_hits: - max_score = max(hit[1] for hit in all_hits) or 1.0 - all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits] - - # Apply threshold filter if configured (score-based search only) - threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD - if threshold is not None and not sort_field: - all_hits = [hit for hit in all_hits if hit[1] >= threshold] - - # Get the page's hits - 
page_hits = all_hits[offset : offset + page_size] - - # Build result hits with highlights - hits: list[SearchHit] = [] snippet_generator = None notes_snippet_generator = None + hits: list[SearchHit] = [] - for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1): - # Get the actual document from the searcher using the doc address + for rank, doc_id in enumerate(doc_ids, start=rank_start): + # Look up document by ID, scoring against the user query so that + # the returned SearchHit carries a real BM25 relevance score. + id_query = tantivy.Query.range_query( + self._schema, + "id", + tantivy.FieldType.Unsigned, + doc_id, + doc_id, + ) + scored_query = tantivy.Query.boolean_query( + [ + (tantivy.Occur.Must, user_query), + (tantivy.Occur.Must, id_query), + ], + ) + results = searcher.search(scored_query, limit=1) + + if not results.hits: + continue + + score, doc_address = results.hits[0] actual_doc = searcher.doc(doc_address) doc_dict = actual_doc.to_dict() - doc_id = doc_dict["id"][0] highlights: dict[str, str] = {} + try: + if snippet_generator is None: + snippet_generator = tantivy.SnippetGenerator.create( + searcher, + user_query, + self._schema, + "content", + ) - # Generate highlights if score > 0 - if score > 0: - try: - if snippet_generator is None: - snippet_generator = tantivy.SnippetGenerator.create( + content_html = snippet_generator.snippet_from_doc(actual_doc).to_html() + if content_html: + highlights["content"] = content_html + + if "notes_text" in doc_dict: + # Use notes_text (plain text) for snippet generation — tantivy's + # SnippetGenerator does not support JSON fields. 
+ if notes_snippet_generator is None: + notes_snippet_generator = tantivy.SnippetGenerator.create( searcher, - final_query, + notes_text_query, self._schema, - "content", + "notes_text", ) + notes_html = notes_snippet_generator.snippet_from_doc( + actual_doc, + ).to_html() + if notes_html: + highlights["notes"] = notes_html - content_snippet = snippet_generator.snippet_from_doc(actual_doc) - if content_snippet: - highlights["content"] = str(content_snippet) - - # Try notes highlights - if "notes" in doc_dict: - if notes_snippet_generator is None: - notes_snippet_generator = tantivy.SnippetGenerator.create( - searcher, - final_query, - self._schema, - "notes", - ) - notes_snippet = notes_snippet_generator.snippet_from_doc( - actual_doc, - ) - if notes_snippet: - highlights["notes"] = str(notes_snippet) - - except Exception: # pragma: no cover - logger.debug("Failed to generate highlights for doc %s", doc_id) + except Exception: # pragma: no cover + logger.debug("Failed to generate highlights for doc %s", doc_id) hits.append( SearchHit( @@ -584,11 +638,69 @@ class TantivyBackend: ), ) - return SearchResults( - hits=hits, - total=total, - query=query, - ) + return hits + + def search_ids( + self, + query: str, + user: AbstractBaseUser | None, + *, + sort_field: str | None = None, + sort_reverse: bool = False, + search_mode: SearchMode = SearchMode.QUERY, + limit: int | None = None, + ) -> list[int]: + """ + Return document IDs matching a query — no highlights or scores. + + This is the lightweight companion to search(). Use it when you need the + full set of matching IDs (e.g. for ``selection_data``) but don't need + scores, ranks, or highlights. 
+ + Args: + query: User's search query + user: User for permission filtering (None for superuser/no filtering) + sort_field: Field to sort by (None for relevance ranking) + sort_reverse: Whether to reverse the sort order + search_mode: Query parsing mode (QUERY, TEXT, or TITLE) + limit: Maximum number of IDs to return (None = all matching docs) + + Returns: + List of document IDs in the requested order + """ + self._ensure_open() + user_query = self._parse_query(query, search_mode) + final_query = self._apply_permission_filter(user_query, user) + + searcher = self._index.searcher() + effective_limit = limit if limit is not None else searcher.num_docs + + if sort_field and sort_field in self.SORT_FIELD_MAP: + mapped_field = self.SORT_FIELD_MAP[sort_field] + results = searcher.search( + final_query, + limit=effective_limit, + order_by_field=mapped_field, + order=tantivy.Order.Desc if sort_reverse else tantivy.Order.Asc, + ) + all_hits = [(hit[1],) for hit in results.hits] + else: + results = searcher.search(final_query, limit=effective_limit) + all_hits = [(hit[1], hit[0]) for hit in results.hits] + + # Normalize scores and apply threshold (relevance search only) + if all_hits: + max_score = max(hit[1] for hit in all_hits) or 1.0 + all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits] + + threshold = settings.ADVANCED_FUZZY_SEARCH_THRESHOLD + if threshold is not None: + all_hits = [hit for hit in all_hits if hit[1] >= threshold] + + # TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once + # https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates + # one stored-doc fetch per result (~80% reduction in search_ids latency). + return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits] def autocomplete( self, @@ -603,6 +715,10 @@ class TantivyBackend: frequency (how many documents contain each word). Optionally filters results to only words from documents visible to the specified user. 
+ NOTE: This is the hottest search path (called per keystroke). + A future improvement would be to cache results in Redis, keyed by + (prefix, user_id), and invalidate on index writes. + Args: term: Prefix to match against autocomplete words limit: Maximum number of suggestions to return @@ -613,64 +729,94 @@ class TantivyBackend: """ self._ensure_open() normalized_term = ascii_fold(term.lower()) + if not normalized_term: + return [] searcher = self._index.searcher() - # Apply permission filter for non-superusers so autocomplete words - # from invisible documents don't leak to other users. + # Build a prefix query on autocomplete_word so we only scan docs + # containing words that start with the prefix, not the entire index. + # tantivy regex is implicitly anchored; .+ avoids the empty-match + # error that .* triggers. We OR with term_query to also match the + # exact prefix as a complete word. + escaped = re.escape(normalized_term) + prefix_query = tantivy.Query.boolean_query( + [ + ( + tantivy.Occur.Should, + tantivy.Query.term_query( + self._schema, + "autocomplete_word", + normalized_term, + ), + ), + ( + tantivy.Occur.Should, + tantivy.Query.regex_query( + self._schema, + "autocomplete_word", + f"{escaped}.+", + ), + ), + ], + ) + + # Intersect with permission filter so autocomplete words from + # invisible documents don't leak to other users. if user is not None and not user.is_superuser: - base_query = build_permission_filter(self._schema, user) + final_query = tantivy.Query.boolean_query( + [ + (tantivy.Occur.Must, prefix_query), + (tantivy.Occur.Must, build_permission_filter(self._schema, user)), + ], + ) else: - base_query = tantivy.Query.all_query() + final_query = prefix_query - results = searcher.search(base_query, limit=10000) + results = searcher.search(final_query, limit=searcher.num_docs) - # Count how many visible documents each word appears in. 
- # Using Counter (not set) preserves per-word document frequency so - # we can rank suggestions by how commonly they occur — the same - # signal Whoosh used for Tf/Idf-based autocomplete ordering. + # Count how many visible documents each matching word appears in. word_counts: Counter[str] = Counter() for _score, doc_address in results.hits: stored_doc = searcher.doc(doc_address) doc_dict = stored_doc.to_dict() if "autocomplete_word" in doc_dict: - word_counts.update(doc_dict["autocomplete_word"]) + for word in doc_dict["autocomplete_word"]: + if word.startswith(normalized_term): + word_counts[word] += 1 - # Filter to prefix matches, sort by document frequency descending; - # break ties alphabetically for stable, deterministic output. + # Sort by document frequency descending; break ties alphabetically. matches = sorted( - (w for w in word_counts if w.startswith(normalized_term)), + word_counts, key=lambda w: (-word_counts[w], w), ) return matches[:limit] - def more_like_this( + def more_like_this_ids( self, doc_id: int, user: AbstractBaseUser | None, - page: int, - page_size: int, - ) -> SearchResults: + *, + limit: int | None = None, + ) -> list[int]: """ - Find documents similar to the given document using content analysis. + Return IDs of documents similar to the given document — no highlights. - Uses Tantivy's "more like this" query to find documents with similar - content patterns. The original document is excluded from results. + Lightweight companion to more_like_this(). The original document is + excluded from results. 
Args: doc_id: Primary key of the reference document user: User for permission filtering (None for no filtering) - page: Page number (1-indexed) for pagination - page_size: Number of results per page + limit: Maximum number of IDs to return (None = all matching docs) Returns: - SearchResults with similar documents (excluding the original) + List of similar document IDs (excluding the original) """ self._ensure_open() searcher = self._index.searcher() - # First find the document address id_query = tantivy.Query.range_query( self._schema, "id", @@ -681,13 +827,9 @@ class TantivyBackend: results = searcher.search(id_query, limit=1) if not results.hits: - # Document not found - return SearchResults(hits=[], total=0, query=f"more_like:{doc_id}") + return [] - # Extract doc_address from (score, doc_address) tuple doc_address = results.hits[0][1] - - # Build more like this query mlt_query = tantivy.Query.more_like_this_query( doc_address, min_doc_frequency=1, @@ -699,59 +841,21 @@ class TantivyBackend: boost_factor=None, ) - # Apply permission filter - if user is not None: - permission_filter = build_permission_filter(self._schema, user) - final_query = tantivy.Query.boolean_query( - [ - (tantivy.Occur.Must, mlt_query), - (tantivy.Occur.Must, permission_filter), - ], - ) - else: - final_query = mlt_query + final_query = self._apply_permission_filter(mlt_query, user) - # Search - offset = (page - 1) * page_size - results = searcher.search(final_query, limit=offset + page_size) + effective_limit = limit if limit is not None else searcher.num_docs + # Fetch one extra to account for excluding the original document + results = searcher.search(final_query, limit=effective_limit + 1) - total = results.count - # Convert from (score, doc_address) to (doc_address, score) - all_hits = [(hit[1], hit[0]) for hit in results.hits] + # TODO: Replace with collect_u64_fast_field("id", addrs) once + # https://github.com/quickwit-oss/tantivy-py/pull/641 lands. 
+ ids = [] + for _score, doc_address in results.hits: + result_doc_id = searcher.doc(doc_address).to_dict()["id"][0] + if result_doc_id != doc_id: + ids.append(result_doc_id) - # Normalize scores - if all_hits: - max_score = max(hit[1] for hit in all_hits) or 1.0 - all_hits = [(hit[0], hit[1] / max_score) for hit in all_hits] - - # Get page hits - page_hits = all_hits[offset : offset + page_size] - - # Build results - hits: list[SearchHit] = [] - for rank, (doc_address, score) in enumerate(page_hits, start=offset + 1): - actual_doc = searcher.doc(doc_address) - doc_dict = actual_doc.to_dict() - result_doc_id = doc_dict["id"][0] - - # Skip the original document - if result_doc_id == doc_id: - continue - - hits.append( - SearchHit( - id=result_doc_id, - score=score, - rank=rank, - highlights={}, # MLT doesn't generate highlights - ), - ) - - return SearchResults( - hits=hits, - total=max(0, total - 1), # Subtract 1 for the original document - query=f"more_like:{doc_id}", - ) + return ids[:limit] if limit is not None else ids def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch: """ diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index b7bcbbe9c..ed0bb4c15 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -396,10 +396,17 @@ def build_permission_filter( Tantivy query that filters results to visible documents Implementation Notes: - - Uses range_query instead of term_query to work around unsigned integer - type detection bug in tantivy-py 0.25 - - Uses boolean_query for "no owner" check since exists_query is not - available in tantivy-py 0.25.1 (available in master) + - Uses range_query instead of term_query for owner_id/viewer_id to work + around a tantivy-py bug where Python ints are inferred as i64, causing + term_query to return no hits on u64 fields. + TODO: Replace with term_query once + https://github.com/quickwit-oss/tantivy-py/pull/642 lands. 
+ + - Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check + because exists_query is not yet available in tantivy-py 0.25. + TODO: Replace with exists_query("owner_id") once that is exposed in + a tantivy-py release. + - Uses disjunction_max_query to combine permission clauses with OR logic """ owner_any = tantivy.Query.range_query( diff --git a/src/documents/search/_schema.py b/src/documents/search/_schema.py index 5e9404235..479c60bc5 100644 --- a/src/documents/search/_schema.py +++ b/src/documents/search/_schema.py @@ -72,6 +72,9 @@ def build_schema() -> tantivy.Schema: # JSON fields — structured queries: notes.user:alice, custom_fields.name:invoice sb.add_json_field("notes", stored=True, tokenizer_name="paperless_text") + # Plain-text companion for notes — tantivy's SnippetGenerator does not support + # JSON fields, so highlights require a text field with the same content. + sb.add_text_field("notes_text", stored=True, tokenizer_name="paperless_text") sb.add_json_field("custom_fields", stored=True, tokenizer_name="paperless_text") for field in ( diff --git a/src/documents/tests/search/test_backend.py b/src/documents/tests/search/test_backend.py index ff9638e63..dd745253b 100644 --- a/src/documents/tests/search/test_backend.py +++ b/src/documents/tests/search/test_backend.py @@ -33,19 +33,12 @@ class TestWriteBatch: except RuntimeError: pass - r = backend.search( - "should survive", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert r.total == 1 + ids = backend.search_ids("should survive", user=None) + assert len(ids) == 1 class TestSearch: - """Test search functionality.""" + """Test search query parsing and matching via search_ids.""" def test_text_mode_limits_default_search_to_title_and_content( self, @@ -60,27 +53,20 @@ class TestSearch: ) backend.add_or_update(doc) - metadata_only = backend.search( - "document_type:invoice", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, 
- search_mode=SearchMode.TEXT, + assert ( + len( + backend.search_ids( + "document_type:invoice", + user=None, + search_mode=SearchMode.TEXT, + ), + ) + == 0 ) - assert metadata_only.total == 0 - - content_match = backend.search( - "monthly", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TEXT, + assert ( + len(backend.search_ids("monthly", user=None, search_mode=SearchMode.TEXT)) + == 1 ) - assert content_match.total == 1 def test_title_mode_limits_default_search_to_title_only( self, @@ -95,27 +81,14 @@ class TestSearch: ) backend.add_or_update(doc) - content_only = backend.search( - "monthly", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TITLE, + assert ( + len(backend.search_ids("monthly", user=None, search_mode=SearchMode.TITLE)) + == 0 ) - assert content_only.total == 0 - - title_match = backend.search( - "invoice", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TITLE, + assert ( + len(backend.search_ids("invoice", user=None, search_mode=SearchMode.TITLE)) + == 1 ) - assert title_match.total == 1 def test_text_mode_matches_partial_term_substrings( self, @@ -130,38 +103,16 @@ class TestSearch: ) backend.add_or_update(doc) - prefix_match = backend.search( - "pass", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TEXT, + assert ( + len(backend.search_ids("pass", user=None, search_mode=SearchMode.TEXT)) == 1 ) - assert prefix_match.total == 1 - - infix_match = backend.search( - "sswo", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TEXT, + assert ( + len(backend.search_ids("sswo", user=None, search_mode=SearchMode.TEXT)) == 1 ) - assert infix_match.total == 1 - - phrase_match = backend.search( - "sswo re", - user=None, - page=1, - page_size=10, - sort_field=None, - 
sort_reverse=False, - search_mode=SearchMode.TEXT, + assert ( + len(backend.search_ids("sswo re", user=None, search_mode=SearchMode.TEXT)) + == 1 ) - assert phrase_match.total == 1 def test_text_mode_does_not_match_on_partial_term_overlap( self, @@ -176,16 +127,10 @@ class TestSearch: ) backend.add_or_update(doc) - non_match = backend.search( - "raptor", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TEXT, + assert ( + len(backend.search_ids("raptor", user=None, search_mode=SearchMode.TEXT)) + == 0 ) - assert non_match.total == 0 def test_text_mode_anchors_later_query_tokens_to_token_starts( self, @@ -214,16 +159,9 @@ class TestSearch: backend.add_or_update(prefix_doc) backend.add_or_update(false_positive) - results = backend.search( - "Z-Berichte 6", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TEXT, + result_ids = set( + backend.search_ids("Z-Berichte 6", user=None, search_mode=SearchMode.TEXT), ) - result_ids = {hit["id"] for hit in results.hits} assert exact_doc.id in result_ids assert prefix_doc.id in result_ids @@ -242,16 +180,9 @@ class TestSearch: ) backend.add_or_update(doc) - no_tokens = backend.search( - "!!!", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TEXT, + assert ( + len(backend.search_ids("!!!", user=None, search_mode=SearchMode.TEXT)) == 0 ) - assert no_tokens.total == 0 def test_title_mode_matches_partial_term_substrings( self, @@ -266,59 +197,18 @@ class TestSearch: ) backend.add_or_update(doc) - prefix_match = backend.search( - "pass", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TITLE, + assert ( + len(backend.search_ids("pass", user=None, search_mode=SearchMode.TITLE)) + == 1 ) - assert prefix_match.total == 1 - - infix_match = backend.search( - "sswo", - user=None, - page=1, - page_size=10, - 
sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TITLE, + assert ( + len(backend.search_ids("sswo", user=None, search_mode=SearchMode.TITLE)) + == 1 ) - assert infix_match.total == 1 - - phrase_match = backend.search( - "sswo gu", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - search_mode=SearchMode.TITLE, + assert ( + len(backend.search_ids("sswo gu", user=None, search_mode=SearchMode.TITLE)) + == 1 ) - assert phrase_match.total == 1 - - def test_scores_normalised_top_hit_is_one(self, backend: TantivyBackend): - """Search scores must be normalized so top hit has score 1.0 for UI consistency.""" - for i, title in enumerate(["bank invoice", "bank statement", "bank receipt"]): - doc = Document.objects.create( - title=title, - content=title, - checksum=f"SN{i}", - pk=10 + i, - ) - backend.add_or_update(doc) - r = backend.search( - "bank", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert r.hits[0]["score"] == pytest.approx(1.0) - assert all(0.0 <= h["score"] <= 1.0 for h in r.hits) def test_sort_field_ascending(self, backend: TantivyBackend): """Searching with sort_reverse=False must return results in ascending ASN order.""" @@ -331,16 +221,14 @@ class TestSearch: ) backend.add_or_update(doc) - r = backend.search( + ids = backend.search_ids( "sortable", user=None, - page=1, - page_size=10, sort_field="archive_serial_number", sort_reverse=False, ) - assert r.total == 3 - asns = [Document.objects.get(pk=h["id"]).archive_serial_number for h in r.hits] + assert len(ids) == 3 + asns = [Document.objects.get(pk=doc_id).archive_serial_number for doc_id in ids] assert asns == [10, 20, 30] def test_sort_field_descending(self, backend: TantivyBackend): @@ -354,79 +242,91 @@ class TestSearch: ) backend.add_or_update(doc) - r = backend.search( + ids = backend.search_ids( "sortable", user=None, - page=1, - page_size=10, sort_field="archive_serial_number", sort_reverse=True, ) - assert 
r.total == 3 - asns = [Document.objects.get(pk=h["id"]).archive_serial_number for h in r.hits] + assert len(ids) == 3 + asns = [Document.objects.get(pk=doc_id).archive_serial_number for doc_id in ids] assert asns == [30, 20, 10] - def test_fuzzy_threshold_filters_low_score_hits( - self, - backend: TantivyBackend, - settings, - ): - """When ADVANCED_FUZZY_SEARCH_THRESHOLD exceeds all normalized scores, hits must be filtered out.""" - doc = Document.objects.create( - title="Invoice document", - content="financial report", - checksum="FT1", - pk=120, - ) - backend.add_or_update(doc) - # Threshold above 1.0 filters every hit (normalized scores top out at 1.0) - settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1 - r = backend.search( - "invoice", +class TestSearchIds: + """Test lightweight ID-only search.""" + + def test_returns_matching_ids(self, backend: TantivyBackend): + """search_ids must return IDs of all matching documents.""" + docs = [] + for i in range(5): + doc = Document.objects.create( + title=f"findable doc {i}", + content="common keyword", + checksum=f"SI{i}", + ) + backend.add_or_update(doc) + docs.append(doc) + other = Document.objects.create( + title="unrelated", + content="nothing here", + checksum="SI_other", + ) + backend.add_or_update(other) + + ids = backend.search_ids( + "common keyword", user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, + search_mode=SearchMode.QUERY, ) - assert r.hits == [] + assert set(ids) == {d.pk for d in docs} + assert other.pk not in ids - def test_owner_filter(self, backend: TantivyBackend): - """Document owners can search their private documents; other users cannot access them.""" - owner = User.objects.create_user("owner") - other = User.objects.create_user("other") + def test_respects_permission_filter(self, backend: TantivyBackend): + """search_ids must respect user permission filtering.""" + owner = User.objects.create_user("ids_owner") + other = User.objects.create_user("ids_other") doc = 
Document.objects.create( - title="Private", - content="secret", - checksum="PF1", - pk=20, + title="private doc", + content="secret keyword", + checksum="SIP1", owner=owner, ) backend.add_or_update(doc) + assert backend.search_ids( + "secret", + user=owner, + search_mode=SearchMode.QUERY, + ) == [doc.pk] assert ( - backend.search( - "secret", - user=owner, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ).total - == 1 + backend.search_ids("secret", user=other, search_mode=SearchMode.QUERY) == [] ) - assert ( - backend.search( - "secret", - user=other, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ).total - == 0 + + def test_respects_fuzzy_threshold(self, backend: TantivyBackend, settings): + """search_ids must apply the same fuzzy threshold as search().""" + doc = Document.objects.create( + title="threshold test", + content="unique term", + checksum="SIT1", ) + backend.add_or_update(doc) + + settings.ADVANCED_FUZZY_SEARCH_THRESHOLD = 1.1 + ids = backend.search_ids("unique", user=None, search_mode=SearchMode.QUERY) + assert ids == [] + + def test_returns_ids_for_text_mode(self, backend: TantivyBackend): + """search_ids must work with TEXT search mode.""" + doc = Document.objects.create( + title="text mode doc", + content="findable phrase", + checksum="SIM1", + ) + backend.add_or_update(doc) + + ids = backend.search_ids("findable", user=None, search_mode=SearchMode.TEXT) + assert ids == [doc.pk] class TestRebuild: @@ -490,57 +390,26 @@ class TestAutocomplete: class TestMoreLikeThis: """Test more like this functionality.""" - def test_excludes_original(self, backend: TantivyBackend): - """More like this queries must exclude the reference document from results.""" + def test_more_like_this_ids_excludes_original(self, backend: TantivyBackend): + """more_like_this_ids must return IDs of similar documents, excluding the original.""" doc1 = Document.objects.create( title="Important document", - content="financial information", - 
checksum="MLT1", - pk=50, + content="financial information report", + checksum="MLTI1", + pk=150, ) doc2 = Document.objects.create( title="Another document", - content="financial report", - checksum="MLT2", - pk=51, + content="financial information report", + checksum="MLTI2", + pk=151, ) backend.add_or_update(doc1) backend.add_or_update(doc2) - results = backend.more_like_this(doc_id=50, user=None, page=1, page_size=10) - returned_ids = [hit["id"] for hit in results.hits] - assert 50 not in returned_ids # Original document excluded - - def test_with_user_applies_permission_filter(self, backend: TantivyBackend): - """more_like_this with a user must exclude documents that user cannot see.""" - viewer = User.objects.create_user("mlt_viewer") - other = User.objects.create_user("mlt_other") - public_doc = Document.objects.create( - title="Public financial document", - content="quarterly financial analysis report figures", - checksum="MLT3", - pk=52, - ) - private_doc = Document.objects.create( - title="Private financial document", - content="quarterly financial analysis report figures", - checksum="MLT4", - pk=53, - owner=other, - ) - backend.add_or_update(public_doc) - backend.add_or_update(private_doc) - - results = backend.more_like_this(doc_id=52, user=viewer, page=1, page_size=10) - returned_ids = [hit["id"] for hit in results.hits] - # private_doc is owned by other, so viewer cannot see it - assert 53 not in returned_ids - - def test_document_not_in_index_returns_empty(self, backend: TantivyBackend): - """more_like_this for a doc_id absent from the index must return empty results.""" - results = backend.more_like_this(doc_id=9999, user=None, page=1, page_size=10) - assert results.hits == [] - assert results.total == 0 + ids = backend.more_like_this_ids(doc_id=150, user=None) + assert 150 not in ids + assert 151 in ids class TestSingleton: @@ -593,19 +462,10 @@ class TestFieldHandling: # Should not raise an exception backend.add_or_update(doc) - results = 
backend.search( - "test", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert results.total == 1 + assert len(backend.search_ids("test", user=None)) == 1 def test_custom_fields_include_name_and_value(self, backend: TantivyBackend): """Custom fields must be indexed with both field name and value for structured queries.""" - # Create a custom field field = CustomField.objects.create( name="Invoice Number", data_type=CustomField.FieldDataType.STRING, @@ -622,18 +482,9 @@ class TestFieldHandling: value_text="INV-2024-001", ) - # Should not raise an exception during indexing backend.add_or_update(doc) - results = backend.search( - "invoice", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert results.total == 1 + assert len(backend.search_ids("invoice", user=None)) == 1 def test_select_custom_field_indexes_label_not_id(self, backend: TantivyBackend): """SELECT custom fields must index the human-readable label, not the opaque option ID.""" @@ -660,27 +511,8 @@ class TestFieldHandling: ) backend.add_or_update(doc) - # Label should be findable - results = backend.search( - "custom_fields.value:invoice", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert results.total == 1 - - # Opaque ID must not appear in the index - results = backend.search( - "custom_fields.value:opt_abc", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert results.total == 0 + assert len(backend.search_ids("custom_fields.value:invoice", user=None)) == 1 + assert len(backend.search_ids("custom_fields.value:opt_abc", user=None)) == 0 def test_none_custom_field_value_not_indexed(self, backend: TantivyBackend): """Custom field instances with no value set must not produce an index entry.""" @@ -702,16 +534,7 @@ class TestFieldHandling: ) backend.add_or_update(doc) - # The string "none" must not appear as an indexed value - results = 
backend.search( - "custom_fields.value:none", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert results.total == 0 + assert len(backend.search_ids("custom_fields.value:none", user=None)) == 0 def test_notes_include_user_information(self, backend: TantivyBackend): """Notes must be indexed with user information when available for structured queries.""" @@ -724,32 +547,96 @@ class TestFieldHandling: ) Note.objects.create(document=doc, note="Important note", user=user) - # Should not raise an exception during indexing backend.add_or_update(doc) - # Test basic document search first - results = backend.search( - "test", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, - ) - assert results.total == 1, ( - f"Expected 1, got {results.total}. Document content should be searchable." + ids = backend.search_ids("test", user=None) + assert len(ids) == 1, ( + f"Expected 1, got {len(ids)}. Document content should be searchable." ) - # Test notes search — must use structured JSON syntax now that note - # is no longer in DEFAULT_SEARCH_FIELDS - results = backend.search( - "notes.note:important", - user=None, - page=1, - page_size=10, - sort_field=None, - sort_reverse=False, + ids = backend.search_ids("notes.note:important", user=None) + assert len(ids) == 1, ( + f"Expected 1, got {len(ids)}. Note content should be searchable via notes.note: prefix." ) - assert results.total == 1, ( - f"Expected 1, got {results.total}. Note content should be searchable via notes.note: prefix." 
+ + 
+
+class TestHighlightHits:
+    """Test highlight_hits returns proper HTML strings, not raw Snippet objects."""
+
+    def test_highlights_content_returns_html_string(self, backend: TantivyBackend):
+        """highlight_hits must return HTML strings (from Snippet.to_html()), not Snippet objects."""
+        doc = Document.objects.create(
+            title="Highlight Test",
+            content="The quick brown fox jumps over the lazy dog",
+            checksum="HH1",
+            pk=90,
+        )
+        backend.add_or_update(doc)
+
+        hits = backend.highlight_hits("quick", [doc.pk])
+
+        assert len(hits) == 1
+        highlights = hits[0]["highlights"]
+        assert "content" in highlights
+        content_highlight = highlights["content"]
+        assert isinstance(content_highlight, str), (
+            f"Expected str, got {type(content_highlight)}: {content_highlight!r}"
+        )
+        # Tantivy wraps matched terms in <b> tags
+        assert "<b>" in content_highlight, (
+            f"Expected HTML with <b> tags, got: {content_highlight!r}"
+        )
+
+    def test_highlights_notes_returns_html_string(self, backend: TantivyBackend):
+        """Note highlights must be HTML strings via notes_text companion field.
+
+        The notes JSON field does not support tantivy's SnippetGenerator,
+        so snippets are generated from the notes_text plain-text companion
+        field, which stores the same content. The query uses the
+        notes.note: prefix so that it targets note content directly, while
+        the highlight snippet itself is produced from the notes_text
+        field that mirrors that content.
+        """
+        user = User.objects.create_user("hl_noteuser")
+        doc = Document.objects.create(
+            title="Doc with matching note",
+            content="unrelated content",
+            checksum="HH2",
+            pk=91,
+        )
+        Note.objects.create(document=doc, note="urgent payment required", user=user)
+        backend.add_or_update(doc)
+
+        # Use notes.note: prefix so the document matches the query and the
+        # notes_text snippet generator can produce highlights. 
+        hits = backend.highlight_hits("notes.note:urgent", [doc.pk])
+
+        assert len(hits) == 1
+        highlights = hits[0]["highlights"]
+        assert "notes" in highlights
+        note_highlight = highlights["notes"]
+        assert isinstance(note_highlight, str), (
+            f"Expected str, got {type(note_highlight)}: {note_highlight!r}"
+        )
+        assert "<b>" in note_highlight, (
+            f"Expected HTML with <b> tags, got: {note_highlight!r}"
+        )
+
+    def test_empty_doc_list_returns_empty_hits(self, backend: TantivyBackend):
+        """highlight_hits with no doc IDs must return an empty list."""
+        hits = backend.highlight_hits("anything", [])
+        assert hits == []
+
+    def test_no_highlights_when_no_match(self, backend: TantivyBackend):
+        """Documents not matching the query should not appear in results."""
+        doc = Document.objects.create(
+            title="Unrelated",
+            content="completely different text",
+            checksum="HH3",
+            pk=92,
+        )
+        backend.add_or_update(doc)
+
+        hits = backend.highlight_hits("quick", [doc.pk])
+
+        assert len(hits) == 0
diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py
index 9e0879e89..85f479010 100644
--- a/src/documents/tests/test_api_search.py
+++ b/src/documents/tests/test_api_search.py
@@ -1503,6 +1503,126 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
             [d2.id, d1.id, d3.id],
         )
 
+    def test_search_ordering_by_score(self) -> None:
+        """ordering=-score must return results in descending relevance order (best first)."""
+        backend = get_backend()
+        # doc_high has more occurrences of the search term → higher BM25 score
+        doc_low = Document.objects.create(
+            title="score sort low",
+            content="apple",
+            checksum="SCL1",
+        )
+        doc_high = Document.objects.create(
+            title="score sort high",
+            content="apple apple apple apple apple",
+            checksum="SCH1",
+        )
+        backend.add_or_update(doc_low)
+        backend.add_or_update(doc_high)
+
+        # -score = descending = best first (highest score)
+        response = self.client.get("/api/documents/?query=apple&ordering=-score")
+ 
self.assertEqual(response.status_code, status.HTTP_200_OK) + ids = [r["id"] for r in response.data["results"]] + self.assertEqual( + ids[0], + doc_high.id, + "Most relevant doc should be first for -score", + ) + + # score = ascending = worst first (lowest score) + response = self.client.get("/api/documents/?query=apple&ordering=score") + self.assertEqual(response.status_code, status.HTTP_200_OK) + ids = [r["id"] for r in response.data["results"]] + self.assertEqual( + ids[0], + doc_low.id, + "Least relevant doc should be first for +score", + ) + + def test_search_with_tantivy_native_sort(self) -> None: + """When ordering by a Tantivy-sortable field, results must be correctly sorted.""" + backend = get_backend() + for i, asn in enumerate([30, 10, 20]): + doc = Document.objects.create( + title=f"sortable doc {i}", + content="searchable content", + checksum=f"TNS{i}", + archive_serial_number=asn, + ) + backend.add_or_update(doc) + + response = self.client.get( + "/api/documents/?query=searchable&ordering=archive_serial_number", + ) + self.assertEqual(response.status_code, status.HTTP_200_OK) + asns = [doc["archive_serial_number"] for doc in response.data["results"]] + self.assertEqual(asns, [10, 20, 30]) + + response = self.client.get( + "/api/documents/?query=searchable&ordering=-archive_serial_number", + ) + self.assertEqual(response.status_code, status.HTTP_200_OK) + asns = [doc["archive_serial_number"] for doc in response.data["results"]] + self.assertEqual(asns, [30, 20, 10]) + + def test_search_page_2_returns_correct_slice(self) -> None: + """Page 2 must return the second slice, not overlap with page 1.""" + backend = get_backend() + for i in range(10): + doc = Document.objects.create( + title=f"doc {i}", + content="paginated content", + checksum=f"PG2{i}", + archive_serial_number=i + 1, + ) + backend.add_or_update(doc) + + response = self.client.get( + "/api/documents/?query=paginated&ordering=archive_serial_number&page=1&page_size=3", + ) + page1_ids = 
[r["id"] for r in response.data["results"]] + self.assertEqual(len(page1_ids), 3) + + response = self.client.get( + "/api/documents/?query=paginated&ordering=archive_serial_number&page=2&page_size=3", + ) + page2_ids = [r["id"] for r in response.data["results"]] + self.assertEqual(len(page2_ids), 3) + + # No overlap between pages + self.assertEqual(set(page1_ids) & set(page2_ids), set()) + # Page 2 ASNs are higher than page 1 + page1_asns = [ + Document.objects.get(pk=pk).archive_serial_number for pk in page1_ids + ] + page2_asns = [ + Document.objects.get(pk=pk).archive_serial_number for pk in page2_ids + ] + self.assertTrue(max(page1_asns) < min(page2_asns)) + + def test_search_all_field_contains_all_ids_when_paginated(self) -> None: + """The 'all' field must contain every matching ID, even when paginated.""" + backend = get_backend() + doc_ids = [] + for i in range(10): + doc = Document.objects.create( + title=f"all field doc {i}", + content="allfield content", + checksum=f"AF{i}", + ) + backend.add_or_update(doc) + doc_ids.append(doc.pk) + + response = self.client.get( + "/api/documents/?query=allfield&page=1&page_size=3", + headers={"Accept": "application/json; version=9"}, + ) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["results"]), 3) + # "all" must contain ALL 10 matching IDs + self.assertCountEqual(response.data["all"], doc_ids) + @mock.patch("documents.bulk_edit.bulk_update_documents") def test_global_search(self, m) -> None: """ diff --git a/src/documents/views.py b/src/documents/views.py index f56725415..c57e43b35 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -38,6 +38,7 @@ from django.db.models import Model from django.db.models import OuterRef from django.db.models import Prefetch from django.db.models import Q +from django.db.models import QuerySet from django.db.models import Subquery from django.db.models import Sum from django.db.models import When @@ -249,6 +250,13 @@ if 
settings.AUDIT_LOG_ENABLED: logger = logging.getLogger("paperless.api") +# Crossover point for intersect_and_order: below this count use a targeted +# IN-clause query; at or above this count fall back to a full-table scan + +# Python set intersection. The IN-clause is faster for small result sets but +# degrades on SQLite with thousands of parameters. PostgreSQL handles large IN +# clauses efficiently, so this threshold mainly protects SQLite users. +_TANTIVY_INTERSECT_THRESHOLD = 5_000 + class IndexView(TemplateView): template_name = "index.html" @@ -2077,19 +2085,16 @@ class UnifiedSearchViewSet(DocumentViewSet): if not self._is_search_request(): return super().list(request) + from documents.search import SearchHit from documents.search import SearchMode + from documents.search import TantivyBackend from documents.search import TantivyRelevanceList from documents.search import get_backend - try: - backend = get_backend() - # ORM-filtered queryset: permissions + field filters + ordering (DRF backends applied) - filtered_qs = self.filter_queryset(self.get_queryset()) - - user = None if request.user.is_superuser else request.user - active_search_params = self._get_active_search_params(request) - - if len(active_search_params) > 1: + def parse_search_params() -> tuple[str | None, bool, bool, int, int]: + """Extract query string, search mode, and ordering from request.""" + active = self._get_active_search_params(request) + if len(active) > 1: raise ValidationError( { "detail": _( @@ -2098,73 +2103,161 @@ class UnifiedSearchViewSet(DocumentViewSet): }, ) - if ( - "text" in request.query_params - or "title_search" in request.query_params - or "query" in request.query_params - ): - if "text" in request.query_params: - search_mode = SearchMode.TEXT - query_str = request.query_params["text"] - elif "title_search" in request.query_params: - search_mode = SearchMode.TITLE - query_str = request.query_params["title_search"] - else: - search_mode = SearchMode.QUERY - query_str 
= request.query_params["query"] - results = backend.search( - query_str, - user=user, - page=1, - page_size=10000, - sort_field=None, - sort_reverse=False, - search_mode=search_mode, - ) - else: - # more_like_id — validate permission on the seed document first - try: - more_like_doc_id = int(request.query_params["more_like_id"]) - more_like_doc = Document.objects.select_related("owner").get( - pk=more_like_doc_id, + ordering_param = request.query_params.get("ordering", "") + sort_reverse = ordering_param.startswith("-") + sort_field_name = ordering_param.lstrip("-") or None + # "score" means relevance order — Tantivy handles it natively, + # so treat it as a Tantivy sort to preserve the ranked order through + # the ORM intersection step. + use_tantivy_sort = ( + sort_field_name in TantivyBackend.SORTABLE_FIELDS + or sort_field_name is None + or sort_field_name == "score" + ) + + try: + page_num = int(request.query_params.get("page", 1)) + except (TypeError, ValueError): + page_num = 1 + page_size = ( + self.paginator.get_page_size(request) or self.paginator.page_size + ) + + return sort_field_name, sort_reverse, use_tantivy_sort, page_num, page_size + + def intersect_and_order( + all_ids: list[int], + filtered_qs: QuerySet[Document], + *, + use_tantivy_sort: bool, + ) -> list[int]: + """Intersect search IDs with ORM-visible IDs, preserving order.""" + if not all_ids: + return [] + if use_tantivy_sort: + if len(all_ids) <= _TANTIVY_INTERSECT_THRESHOLD: + # Small result set: targeted IN-clause avoids a full-table scan. + visible_ids = set( + filtered_qs.filter(pk__in=all_ids).values_list("pk", flat=True), ) - except (TypeError, ValueError, Document.DoesNotExist): - raise PermissionDenied(_("Invalid more_like_id")) + else: + # Large result set: full-table scan + Python intersection is faster + # than a large IN-clause on SQLite. 
+ visible_ids = set( + filtered_qs.values_list("pk", flat=True), + ) + return [doc_id for doc_id in all_ids if doc_id in visible_ids] + return list( + filtered_qs.filter(id__in=all_ids).values_list("pk", flat=True), + ) - if not has_perms_owner_aware( - request.user, - "view_document", - more_like_doc, - ): - raise PermissionDenied(_("Insufficient permissions.")) - - results = backend.more_like_this( - more_like_doc_id, - user=user, - page=1, - page_size=10000, - ) - - hits_by_id = {h["id"]: h for h in results.hits} - - # Determine sort order: no ordering param -> Tantivy relevance; otherwise -> ORM order - ordering_param = request.query_params.get("ordering", "").lstrip("-") - if not ordering_param: - # Preserve Tantivy relevance order; intersect with ORM-visible IDs - orm_ids = set(filtered_qs.values_list("pk", flat=True)) - ordered_hits = [h for h in results.hits if h["id"] in orm_ids] + def run_text_search( + backend: TantivyBackend, + user: User | None, + filtered_qs: QuerySet[Document], + ) -> tuple[list[int], list[SearchHit], int]: + """Handle text/title/query search: IDs, ORM intersection, page highlights.""" + if "text" in request.query_params: + search_mode = SearchMode.TEXT + query_str = request.query_params["text"] + elif "title_search" in request.query_params: + search_mode = SearchMode.TITLE + query_str = request.query_params["title_search"] else: - # Use ORM ordering (already applied by DocumentsOrderingFilter) - hit_ids = set(hits_by_id.keys()) - orm_ordered_ids = filtered_qs.filter(id__in=hit_ids).values_list( - "pk", - flat=True, - ) - ordered_hits = [ - hits_by_id[pk] for pk in orm_ordered_ids if pk in hits_by_id - ] + search_mode = SearchMode.QUERY + query_str = request.query_params["query"] - rl = TantivyRelevanceList(ordered_hits) + # "score" is not a real Tantivy sort field — it means relevance order, + # which is Tantivy's default when no sort field is specified. 
+ is_score_sort = sort_field_name == "score" + all_ids = backend.search_ids( + query_str, + user=user, + sort_field=( + None if (not use_tantivy_sort or is_score_sort) else sort_field_name + ), + sort_reverse=sort_reverse, + search_mode=search_mode, + ) + ordered_ids = intersect_and_order( + all_ids, + filtered_qs, + use_tantivy_sort=use_tantivy_sort, + ) + # Tantivy returns relevance results best-first (descending score). + # ordering=score (ascending, worst-first) requires a reversal. + if is_score_sort and not sort_reverse: + ordered_ids = list(reversed(ordered_ids)) + + page_offset = (page_num - 1) * page_size + page_ids = ordered_ids[page_offset : page_offset + page_size] + page_hits = backend.highlight_hits( + query_str, + page_ids, + search_mode=search_mode, + rank_start=page_offset + 1, + ) + return ordered_ids, page_hits, page_offset + + def run_more_like_this( + backend: TantivyBackend, + user: User | None, + filtered_qs: QuerySet[Document], + ) -> tuple[list[int], list[SearchHit], int]: + """Handle more_like_id search: permission check, IDs, stub hits.""" + try: + more_like_doc_id = int(request.query_params["more_like_id"]) + more_like_doc = Document.objects.select_related("owner").get( + pk=more_like_doc_id, + ) + except (TypeError, ValueError, Document.DoesNotExist): + raise PermissionDenied(_("Invalid more_like_id")) + + if not has_perms_owner_aware( + request.user, + "view_document", + more_like_doc, + ): + raise PermissionDenied(_("Insufficient permissions.")) + + all_ids = backend.more_like_this_ids(more_like_doc_id, user=user) + ordered_ids = intersect_and_order( + all_ids, + filtered_qs, + use_tantivy_sort=True, + ) + + page_offset = (page_num - 1) * page_size + page_ids = ordered_ids[page_offset : page_offset + page_size] + page_hits = [ + SearchHit(id=doc_id, score=0.0, rank=rank, highlights={}) + for rank, doc_id in enumerate(page_ids, start=page_offset + 1) + ] + return ordered_ids, page_hits, page_offset + + try: + sort_field_name, 
sort_reverse, use_tantivy_sort, page_num, page_size = ( + parse_search_params() + ) + + backend = get_backend() + filtered_qs = self.filter_queryset(self.get_queryset()) + user = None if request.user.is_superuser else request.user + + if "more_like_id" in request.query_params: + ordered_ids, page_hits, page_offset = run_more_like_this( + backend, + user, + filtered_qs, + ) + else: + ordered_ids, page_hits, page_offset = run_text_search( + backend, + user, + filtered_qs, + ) + + rl = TantivyRelevanceList(ordered_ids, page_hits, page_offset) page = self.paginate_queryset(rl) if page is not None: @@ -2174,15 +2267,18 @@ class UnifiedSearchViewSet(DocumentViewSet): if get_boolean( str(request.query_params.get("include_selection_data", "false")), ): - all_ids = [h["id"] for h in ordered_hits] + # NOTE: pk__in=ordered_ids generates a large SQL IN clause + # for big result sets. Acceptable today but may need a temp + # table or chunked approach if selection_data becomes slow + # at scale (tens of thousands of matching documents). 
response.data["selection_data"] = ( self._get_selection_data_for_queryset( - filtered_qs.filter(pk__in=all_ids), + filtered_qs.filter(pk__in=ordered_ids), ) ) return response - serializer = self.get_serializer(ordered_hits, many=True) + serializer = self.get_serializer(page_hits, many=True) return Response(serializer.data) except NotFound: @@ -3088,20 +3184,17 @@ class GlobalSearchView(PassUserMixin): docs = all_docs.filter(title__icontains=query)[:OBJECT_LIMIT] else: user = None if request.user.is_superuser else request.user - fts_results = get_backend().search( + matching_ids = get_backend().search_ids( query, user=user, - page=1, - page_size=1000, - sort_field=None, - sort_reverse=False, search_mode=SearchMode.TEXT, + limit=OBJECT_LIMIT * 3, ) - docs_by_id = all_docs.in_bulk([hit["id"] for hit in fts_results.hits]) + docs_by_id = all_docs.in_bulk(matching_ids) docs = [ - docs_by_id[hit["id"]] - for hit in fts_results.hits - if hit["id"] in docs_by_id + docs_by_id[doc_id] + for doc_id in matching_ids + if doc_id in docs_by_id ][:OBJECT_LIMIT] saved_views = ( get_objects_for_user_owner_aware( diff --git a/src/paperless/views.py b/src/paperless/views.py index c9ded4c0d..9c0c99e5a 100644 --- a/src/paperless/views.py +++ b/src/paperless/views.py @@ -90,7 +90,7 @@ class StandardPagination(PageNumberPagination): query = self.page.paginator.object_list if isinstance(query, TantivyRelevanceList): - return [h["id"] for h in query._hits] + return query.get_all_ids() return self.page.paginator.object_list.values_list("pk", flat=True) def get_paginated_response_schema(self, schema):