diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 7081bdc7d..9dcb45c05 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging import re import threading -from collections import Counter from datetime import UTC from datetime import datetime from enum import StrEnum @@ -36,8 +35,10 @@ from documents.utils import identity if TYPE_CHECKING: from pathlib import Path - from django.contrib.auth.base_user import AbstractBaseUser + from django.contrib.auth.base_user import AbstractUser from django.db.models import QuerySet + from tantivy import Index + from tantivy import Schema from documents.models import Document @@ -324,7 +325,7 @@ class TantivyBackend: def _apply_permission_filter( self, query: tantivy.Query, - user: AbstractBaseUser | None, + user: AbstractUser | None, ) -> tantivy.Query: """Wrap a query with a permission filter if the user is not a superuser.""" if user is not None: @@ -633,7 +634,7 @@ class TantivyBackend: def search_ids( self, query: str, - user: AbstractBaseUser | None, + user: AbstractUser | None, *, sort_field: str | None = None, sort_reverse: bool = False, @@ -693,7 +694,7 @@ class TantivyBackend: self, term: str, limit: int, - user: AbstractBaseUser | None = None, + user: AbstractUser | None = None, ) -> list[str]: """ Get autocomplete suggestions for search queries. @@ -719,71 +720,32 @@ class TantivyBackend: if not normalized_term: return [] + if TYPE_CHECKING: + assert self._index is not None + assert isinstance(self._index, Index) + assert isinstance(self._schema, Schema) + searcher = self._index.searcher() - # Build a prefix query on autocomplete_word so we only scan docs - # containing words that start with the prefix, not the entire index. - # tantivy regex is implicitly anchored; .+ avoids the empty-match - # error that .* triggers. We OR with term_query to also match the - # exact prefix as a complete word. - escaped = re.escape(normalized_term) - prefix_query = tantivy.Query.boolean_query( - [ - ( - tantivy.Occur.Should, - tantivy.Query.term_query( - self._schema, - "autocomplete_word", - normalized_term, - ), - ), - ( - tantivy.Occur.Should, - tantivy.Query.regex_query( - self._schema, - "autocomplete_word", - f"{escaped}.+", - ), - ), - ], - ) - + permission_query = None # Intersect with permission filter so autocomplete words from # invisible documents don't leak to other users. if user is not None and not user.is_superuser: - final_query = tantivy.Query.boolean_query( - [ - (tantivy.Occur.Must, prefix_query), - (tantivy.Occur.Must, build_permission_filter(self._schema, user)), - ], - ) - else: - final_query = prefix_query + permission_query = build_permission_filter(self._schema, user) - results = searcher.search(final_query, limit=searcher.num_docs) - - # Count how many visible documents each matching word appears in. - word_counts: Counter[str] = Counter() - for _score, doc_address in results.hits: - stored_doc = searcher.doc(doc_address) - doc_dict = stored_doc.to_dict() - if "autocomplete_word" in doc_dict: - for word in doc_dict["autocomplete_word"]: - if word.startswith(normalized_term): - word_counts[word] += 1 - - # Sort by document frequency descending; break ties alphabetically. - matches = sorted( - word_counts, - key=lambda w: (-word_counts[w], w), + matches = searcher.terms_with_prefix( + "autocomplete_word", + normalized_term, + permission_query, + limit, ) - return matches[:limit] + return [x[0] for x in matches] def more_like_this_ids( self, doc_id: int, - user: AbstractBaseUser | None, + user: AbstractUser | None, *, limit: int | None = None, ) -> list[int]: diff --git a/uv.lock b/uv.lock index 028eb9bf8..31348aa25 100644 --- a/uv.lock +++ b/uv.lock @@ -4661,7 +4661,7 @@ wheels = [ [[package]] name = "tantivy" version = "0.26.0" -source = { git = "https://github.com/quickwit-oss/tantivy-py.git#fa1a1985b96001929fc1cafcdd9dc94e56658b2a" } +source = { git = "https://github.com/quickwit-oss/tantivy-py.git#bf3a0c0d6e18591728121b3b5fa735ab61862110" } [[package]] name = "tenacity" @@ -4896,12 +4896,12 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d75eadcd97fe0dc7cd0eedc4d72152484c19cb2cfe46ce55766c8e129116425f" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:43b35116802c85fb88d99f4a396b8bd4472bfca1dd82e69499e5a4f9b8b4e252" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:442ec9dc78592564fdad69cf0beaa9da2f82ab810ccb4f13903869a90bf3f15d" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cc3a195701bba2239c313ee311487f80f8aaebe9e89b9073dddbcf2f93b5a0ba" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:072a0d6e4865e8b0dc0dbfe6ebed68fae235124222835ef03e5814d414d8c012" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:23ec7789017da9d95b6d543d790814785e6f30905c5443efa8257d1490d73f79" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d75eadcd97fe0dc7cd0eedc4d72152484c19cb2cfe46ce55766c8e129116425f", upload-time = "2026-03-23T15:16:54Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:43b35116802c85fb88d99f4a396b8bd4472bfca1dd82e69499e5a4f9b8b4e252", upload-time = "2026-03-23T15:16:58Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:442ec9dc78592564fdad69cf0beaa9da2f82ab810ccb4f13903869a90bf3f15d", upload-time = "2026-03-23T15:17:02Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cc3a195701bba2239c313ee311487f80f8aaebe9e89b9073dddbcf2f93b5a0ba", upload-time = "2026-03-23T15:17:06Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:072a0d6e4865e8b0dc0dbfe6ebed68fae235124222835ef03e5814d414d8c012", upload-time = "2026-03-23T15:17:10Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:23ec7789017da9d95b6d543d790814785e6f30905c5443efa8257d1490d73f79", upload-time = "2026-03-23T15:17:14Z" }, ] [[package]] @@ -4924,24 +4924,24 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp311-cp311-linux_s390x.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp312-cp312-linux_s390x.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313-linux_s390x.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313t-linux_s390x.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314-linux_s390x.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314t-linux_s390x.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp311-cp311-linux_s390x.whl", upload-time = "2026-03-23T14:58:58Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", upload-time = "2026-03-23T14:59:00Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", upload-time = "2026-03-23T14:59:00Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp312-cp312-linux_s390x.whl", upload-time = "2026-03-23T14:59:01Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", upload-time = "2026-03-23T14:59:02Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", upload-time = "2026-03-23T14:59:03Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313-linux_s390x.whl", upload-time = "2026-03-23T14:59:04Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", upload-time = "2026-03-23T14:59:04Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", upload-time = "2026-03-23T14:59:05Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313t-linux_s390x.whl", upload-time = "2026-03-23T14:59:07Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl", upload-time = "2026-03-23T14:59:07Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", upload-time = "2026-03-23T14:59:07Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314-linux_s390x.whl", upload-time = "2026-03-23T14:59:09Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl", upload-time = "2026-03-23T14:59:10Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl", upload-time = "2026-03-23T14:59:11Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314t-linux_s390x.whl", upload-time = "2026-03-23T14:59:12Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl", upload-time = "2026-03-23T14:59:12Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.11.0%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl", upload-time = "2026-03-23T14:59:13Z" }, ] [[package]]