diff --git a/pyproject.toml b/pyproject.toml index 19dfe3fdc..bfc8c7577 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ dependencies = [ "scikit-learn~=1.8.0", "sentence-transformers>=4.1", "setproctitle~=1.3.4", - "tantivy>=0.25.1", + "tantivy @ git+https://github.com/quickwit-oss/tantivy-py.git", "tika-client~=0.11.0", "torch~=2.11.0", "watchfiles>=1.1.1", diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 46f56f339..7081bdc7d 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -222,24 +222,9 @@ class WriteBatch: self._writer.add_document(doc) def remove(self, doc_id: int) -> None: - """ - Remove a document from the batch by its primary key. - - Uses range_query instead of term_query to work around a tantivy-py bug - where Python integers are inferred as i64, producing Terms that never - match u64 fields. - - TODO: Replace with term_query("id", doc_id) once - https://github.com/quickwit-oss/tantivy-py/pull/642 lands. - """ + """Remove a document from the batch by its primary key.""" self._writer.delete_documents_by_query( - tantivy.Query.range_query( - self._backend._schema, - "id", - tantivy.FieldType.Unsigned, - doc_id, - doc_id, - ), + tantivy.Query.term_query(self._backend._schema, "id", doc_id), ) @@ -526,15 +511,6 @@ class TantivyBackend: Use this when you already know which documents to display (from search_ids + ORM filtering) and just need highlight data. - Note: Each doc_id requires an individual index lookup because tantivy-py - does not yet expose a batch fast-field read API. This is acceptable for - page-sized batches (typically 25 docs) but should not be called with - thousands of IDs. - - TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands, - the per-doc range_query lookups here can be replaced with a single - collect_u64_fast_field("id", doc_addresses) call. - Args: query: The search query (used for snippet generation) doc_ids: Ordered list of document IDs to generate hits for @@ -571,32 +547,42 @@ class TantivyBackend: notes_text_query = user_query searcher = self._index.searcher() + + # Fetch all requested docs in a single search: user_query MUST match + # and exactly the requested IDs MUST match (OR of term_queries). + id_filter = tantivy.Query.boolean_query( + [ + ( + tantivy.Occur.Should, + tantivy.Query.term_query(self._schema, "id", did), + ) + for did in doc_ids + ], + ) + batch_query = tantivy.Query.boolean_query( + [ + (tantivy.Occur.Must, user_query), + (tantivy.Occur.Must, id_filter), + ], + ) + batch_results = searcher.search(batch_query, limit=len(doc_ids)) + + result_addrs = [addr for _score, addr in batch_results.hits] + result_ids = searcher.fast_field_values("id", result_addrs) + addr_by_id: dict[int, tuple[float, tantivy.DocAddress]] = { + doc_id: (score, addr) + for (score, addr), doc_id in zip(batch_results.hits, result_ids) + } + snippet_generator = None notes_snippet_generator = None hits: list[SearchHit] = [] for rank, doc_id in enumerate(doc_ids, start=rank_start): - # Look up document by ID, scoring against the user query so that - # the returned SearchHit carries a real BM25 relevance score. - id_query = tantivy.Query.range_query( - self._schema, - "id", - tantivy.FieldType.Unsigned, - doc_id, - doc_id, - ) - scored_query = tantivy.Query.boolean_query( - [ - (tantivy.Occur.Must, user_query), - (tantivy.Occur.Must, id_query), - ], - ) - results = searcher.search(scored_query, limit=1) - - if not results.hits: + if doc_id not in addr_by_id: continue - score, doc_address = results.hits[0] + score, doc_address = addr_by_id[doc_id] actual_doc = searcher.doc(doc_address) doc_dict = actual_doc.to_dict() @@ -701,10 +687,7 @@ class TantivyBackend: if threshold is not None: all_hits = [hit for hit in all_hits if hit[1] >= threshold] - # TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once - # https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates - # one stored-doc fetch per result (~80% reduction in search_ids latency). - return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits] + return searcher.fast_field_values("id", [doc_addr for doc_addr, *_ in all_hits]) def autocomplete( self, @@ -821,13 +804,7 @@ class TantivyBackend: self._ensure_open() searcher = self._index.searcher() - id_query = tantivy.Query.range_query( - self._schema, - "id", - tantivy.FieldType.Unsigned, - doc_id, - doc_id, - ) + id_query = tantivy.Query.term_query(self._schema, "id", doc_id) results = searcher.search(id_query, limit=1) if not results.hits: @@ -851,14 +828,9 @@ class TantivyBackend: # Fetch one extra to account for excluding the original document results = searcher.search(final_query, limit=effective_limit + 1) - # TODO: Replace with collect_u64_fast_field("id", addrs) once - # https://github.com/quickwit-oss/tantivy-py/pull/641 lands. - ids = [] - for _score, doc_address in results.hits: - result_doc_id = searcher.doc(doc_address).to_dict()["id"][0] - if result_doc_id != doc_id: - ids.append(result_doc_id) - + addrs = [addr for _score, addr in results.hits] + all_ids = searcher.fast_field_values("id", addrs) + ids = [rid for rid in all_ids if rid != doc_id] return ids[:limit] if limit is not None else ids def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch: diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index 59421c763..0fbe4603b 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -410,9 +410,6 @@ def normalize_query(query: str) -> str: raise ValueError("Query too complex to process (normalization timed out)") -_MAX_U64 = 2**64 - 1 # u64 max — used as inclusive upper bound for "any owner" range - - def build_permission_filter( schema: tantivy.Schema, user: AbstractBaseUser, @@ -432,48 +429,16 @@ def build_permission_filter( Returns: Tantivy query that filters results to visible documents - - Implementation Notes: - - Uses range_query instead of term_query for owner_id/viewer_id to work - around a tantivy-py bug where Python ints are inferred as i64, causing - term_query to return no hits on u64 fields. - TODO: Replace with term_query once - https://github.com/quickwit-oss/tantivy-py/pull/642 lands. - - - Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check - because exists_query is not yet available in tantivy-py 0.25. - TODO: Replace with exists_query("owner_id") once that is exposed in - a tantivy-py release. - - - Uses disjunction_max_query to combine permission clauses with OR logic """ - owner_any = tantivy.Query.range_query( - schema, - "owner_id", - tantivy.FieldType.Unsigned, - 1, - _MAX_U64, - ) + owner_any = tantivy.Query.exists_query("owner_id") no_owner = tantivy.Query.boolean_query( [ (tantivy.Occur.Must, tantivy.Query.all_query()), (tantivy.Occur.MustNot, owner_any), ], ) - owned = tantivy.Query.range_query( - schema, - "owner_id", - tantivy.FieldType.Unsigned, - user.pk, - user.pk, - ) - shared = tantivy.Query.range_query( - schema, - "viewer_id", - tantivy.FieldType.Unsigned, - user.pk, - user.pk, - ) + owned = tantivy.Query.term_query(schema, "owner_id", user.pk) + shared = tantivy.Query.term_query(schema, "viewer_id", user.pk) return tantivy.Query.disjunction_max_query([no_owner, owned, shared]) diff --git a/src/documents/tests/test_admin.py b/src/documents/tests/test_admin.py index 533319c2f..79791563b 100644 --- a/src/documents/tests/test_admin.py +++ b/src/documents/tests/test_admin.py @@ -24,13 +24,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase): backend = get_backend() searcher = backend._index.searcher() results = searcher.search( - tantivy.Query.range_query( - backend._schema, - "id", - tantivy.FieldType.Unsigned, - doc.pk, - doc.pk, - ), + tantivy.Query.term_query(backend._schema, "id", doc.pk), limit=1, ) if results.hits: diff --git a/uv.lock b/uv.lock index 1a530a92e..028eb9bf8 100644 --- a/uv.lock +++ b/uv.lock @@ -3056,7 +3056,7 @@ requires-dist = [ { name = "scikit-learn", specifier = "~=1.8.0" }, { name = "sentence-transformers", specifier = ">=4.1" }, { name = "setproctitle", specifier = "~=1.3.4" }, - { name = "tantivy", specifier = ">=0.25.1" }, + { name = "tantivy", git = "https://github.com/quickwit-oss/tantivy-py.git" }, { name = "tika-client", specifier = "~=0.11.0" }, { name = "torch", specifier = "~=2.11.0", index = "https://download.pytorch.org/whl/cpu" }, { name = "watchfiles", specifier = ">=1.1.1" }, @@ -4660,31 +4660,8 @@ wheels = [ [[package]] name = "tantivy" -version = "0.25.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1b/f9/0cd3955d155d3e3ef74b864769514dd191e5dacba9f0beb7af2d914942ce/tantivy-0.25.1.tar.gz", hash = "sha256:68a3314699a7d18fcf338b52bae8ce46a97dde1128a3e47e33fa4db7f71f265e", size = 75120, upload-time = "2025-12-02T11:57:12.997Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/7a/8a277f377e8a151fc0e71d4ffc1114aefb6e5e1c7dd609fed0955cf34ed8/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:d363d7b4207d3a5aa7f0d212420df35bed18bdb6bae26a2a8bd57428388b7c29", size = 7637033, upload-time = "2025-12-02T11:56:18.104Z" }, - { url = "https://files.pythonhosted.org/packages/71/31/8b4acdedfc9f9a2d04b1340d07eef5213d6f151d1e18da0cb423e5f090d2/tantivy-0.25.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8f4389cf1d889a1df7c5a3195806b4b56c37cee10d8a26faaa0dea35a867b5ff", size = 3932180, upload-time = "2025-12-02T11:56:19.833Z" }, - { url = "https://files.pythonhosted.org/packages/2f/dc/3e8499c21b4b9795e8f2fc54c68ce5b92905aaeadadaa56ecfa9180b11b1/tantivy-0.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99864c09fc54652c3c2486cdf13f86cdc8200f4b481569cb291e095ca5d496e5", size = 4197620, upload-time = "2025-12-02T11:56:21.496Z" }, - { url = "https://files.pythonhosted.org/packages/f8/8e/f2ce62fffc811eb62bead92c7b23c2e218f817cbd54c4f3b802e03ba1438/tantivy-0.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05abf37ddbc5063c575548be0d62931629c086bff7a5a1b67cf5a8f5ebf4cd8c", size = 4183794, upload-time = "2025-12-02T11:56:23.215Z" }, - { url = "https://files.pythonhosted.org/packages/41/e7/6849c713ed0996c7628324c60512c4882006f0a62145e56c624a93407f90/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:90fd919e5f611809f746560ecf36eb9be824dec62e21ae17a27243759edb9aa1", size = 7621494, upload-time = "2025-12-02T11:56:27.069Z" }, - { url = "https://files.pythonhosted.org/packages/c5/22/c3d8294600dc6e7fa350daef9ff337d3c06e132b81df727de9f7a50c692a/tantivy-0.25.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:4613c7cf6c23f3a97989819690a0f956d799354957de7a204abcc60083cebe02", size = 3925219, upload-time = "2025-12-02T11:56:29.403Z" }, - { url = "https://files.pythonhosted.org/packages/41/fc/cbb1df71dd44c9110eff4eaaeda9d44f2d06182fe0452193be20ddfba93f/tantivy-0.25.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c477bd20b4df804d57dfc5033431bef27cde605695ae141b03abbf6ebc069129", size = 4198699, upload-time = "2025-12-02T11:56:31.359Z" }, - { url = "https://files.pythonhosted.org/packages/47/4d/71abb78b774073c3ce12a4faa4351a9d910a71ffa3659526affba163873d/tantivy-0.25.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9b1a1ba1113c523c7ff7b10f282d6c4074006f7ef8d71e1d973d51bf7291ddb", size = 4183585, upload-time = "2025-12-02T11:56:33.317Z" }, - { url = "https://files.pythonhosted.org/packages/3d/25/73cfbcf1a8ea49be6c42817431cac46b70a119fe64da903fcc2d92b5b511/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f51ff7196c6f31719202080ed8372d5e3d51e92c749c032fb8234f012e99744c", size = 7622530, upload-time = "2025-12-02T11:56:36.839Z" }, - { url = "https://files.pythonhosted.org/packages/12/c8/c0d7591cdf4f7e7a9fc4da786d1ca8cd1aacffaa2be16ea6d401a8e4a566/tantivy-0.25.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:550e63321bfcacc003859f2fa29c1e8e56450807b3c9a501c1add27cfb9236d9", size = 3925637, upload-time = "2025-12-02T11:56:38.425Z" }, - { url = "https://files.pythonhosted.org/packages/3a/09/bedfc223bffec7641b417dd7ab071134b2ef8f8550e9b1fb6014657ef52e/tantivy-0.25.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fde31cc8d6e122faf7902aeea32bc008a429a6e8904e34d3468126a3ec01b016", size = 4197322, upload-time = "2025-12-02T11:56:40.411Z" }, - { url = "https://files.pythonhosted.org/packages/f5/f1/1fa5183500c8042200c9f2b840d34f5bbcfb434a1ee750e7132262d2a5c9/tantivy-0.25.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b11bd5a518b0be645320b47af8493f6a40c4f3234313e37adcf4534a564d27dd", size = 4183143, upload-time = "2025-12-02T11:56:42.048Z" }, - { url = "https://files.pythonhosted.org/packages/8b/2f/581519492226f97d23bd0adc95dad991ebeaa73ea6abc8bff389a3096d9a/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:dae99e75b7eaa9bf5bd16ab106b416370f08c135aed0e117d62a3201cd1ffe36", size = 7610316, upload-time = "2025-12-02T11:56:45.927Z" }, - { url = "https://files.pythonhosted.org/packages/91/40/5d7bc315ab9e6a22c5572656e8ada1c836cfa96dccf533377504fbc3c9d9/tantivy-0.25.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:506e9533c5ef4d3df43bad64ffecc0aa97c76e361ea610815dc3a20a9d6b30b3", size = 3919882, upload-time = "2025-12-02T11:56:48.469Z" }, - { url = "https://files.pythonhosted.org/packages/02/b9/e0ef2f57a6a72444cb66c2ffbc310ab33ffaace275f1c4b0319d84ea3f18/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dbd4f8f264dacbcc9dee542832da2173fd53deaaea03f082d95214f8b5ed6bc", size = 4196031, upload-time = "2025-12-02T11:56:50.151Z" }, - { url = "https://files.pythonhosted.org/packages/1e/02/bf3f8cacfd08642e14a73f7956a3fb95d58119132c98c121b9065a1f8615/tantivy-0.25.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:824c643ccb640dd9e35e00c5d5054ddf3323f56fe4219d57d428a9eeea13d22c", size = 4183437, upload-time = "2025-12-02T11:56:51.818Z" }, - { url = "https://files.pythonhosted.org/packages/ff/44/9f1d67aa5030f7eebc966c863d1316a510a971dd8bb45651df4acdfae9ed/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:7f5d29ae85dd0f23df8d15b3e7b341d4f9eb5a446bbb9640df48ac1f6d9e0c6c", size = 7623723, upload-time = "2025-12-02T11:56:55.066Z" }, - { url = "https://files.pythonhosted.org/packages/db/30/6e085bd3ed9d12da3c91c185854abd70f9dfd35fb36a75ea98428d42c30b/tantivy-0.25.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:f2d2938fb69a74fc1bb36edfaf7f0d1596fa1264db0f377bda2195c58bcb6245", size = 3926243, upload-time = "2025-12-02T11:56:57.058Z" }, - { url = "https://files.pythonhosted.org/packages/32/f5/a00d65433430f51718e5cc6938df571765d7c4e03aedec5aef4ab567aa9b/tantivy-0.25.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f5ff124c4802558e627091e780b362ca944169736caba5a372eef39a79d0ae0", size = 4207186, upload-time = "2025-12-02T11:56:58.803Z" }, - { url = "https://files.pythonhosted.org/packages/19/63/61bdb12fc95f2a7f77bd419a5149bfa9f28caa76cb569bf2b6b06e1d033e/tantivy-0.25.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43b80ef62a340416139c93d19264e5f808da48e04f9305f1092b8ed22be0a5be", size = 4187312, upload-time = "2025-12-02T11:57:00.595Z" }, -] +version = "0.26.0" +source = { git = "https://github.com/quickwit-oss/tantivy-py.git#fa1a1985b96001929fc1cafcdd9dc94e56658b2a" } [[package]] name = "tenacity"