From e6a334878c256d9a67c2a9ae02f97decd125048e Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Wed, 1 Apr 2026 10:08:27 -0700
Subject: [PATCH] Add simple text search mode and API param

---
 src/documents/search/_backend.py | 20 ++++++++++++++++++--
 src/documents/search/_query.py   | 27 +++++++++++++++++++++++++++
 src/documents/views.py           | 22 +++++++++++++++++++---
 3 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py
index a1bff8a9f..d6a1ad41e 100644
--- a/src/documents/search/_backend.py
+++ b/src/documents/search/_backend.py
@@ -7,6 +7,7 @@ from collections import Counter
 from dataclasses import dataclass
 from datetime import UTC
 from datetime import datetime
+from enum import StrEnum
 from typing import TYPE_CHECKING
 from typing import Self
 from typing import TypedDict
@@ -20,6 +21,7 @@ from django.utils.timezone import get_current_timezone
 from guardian.shortcuts import get_users_with_perms
 
 from documents.search._query import build_permission_filter
+from documents.search._query import parse_simple_text_query
 from documents.search._query import parse_user_query
 from documents.search._schema import _write_sentinels
 from documents.search._schema import build_schema
@@ -45,6 +47,11 @@ _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted
 T = TypeVar("T")
 
 
+class SearchMode(StrEnum):
+    QUERY = "query"
+    TEXT = "text"
+
+
 def _ascii_fold(s: str) -> str:
     """
     Normalize unicode to ASCII equivalent characters for search consistency.
@@ -433,6 +440,7 @@ class TantivyBackend:
         sort_field: str | None,
         *,
         sort_reverse: bool,
+        search_mode: SearchMode = SearchMode.QUERY,
     ) -> SearchResults:
         """
         Execute a search query against the document index.
@@ -441,20 +449,28 @@ class TantivyBackend:
         permission filtering before executing against Tantivy. Supports both
         relevance-based and field-based sorting.
 
+        QUERY search mode supports natural date keywords, field filters, etc.
+        TEXT search mode treats the query as plain text to search for in title and content
+
         Args:
-            query: User's search query (supports natural date keywords, field filters)
+            query: User's search query
             user: User for permission filtering (None for superuser/no filtering)
             page: Page number (1-indexed) for pagination
             page_size: Number of results per page
             sort_field: Field to sort by (None for relevance ranking)
             sort_reverse: Whether to reverse the sort order
+            search_mode: "query" for advanced Tantivy syntax, "text" for
+                plain-text search over title and content only
 
         Returns:
             SearchResults with hits, total count, and processed query
         """
         self._ensure_open()
         tz = get_current_timezone()
-        user_query = parse_user_query(self._index, query, tz)
+        if search_mode == SearchMode.TEXT:
+            user_query = parse_simple_text_query(self._index, query)
+        else:
+            user_query = parse_user_query(self._index, query, tz)
 
         # Apply permission filter if user is not None (not superuser)
         if user is not None:
diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py
index 212df1516..6dbf78ca6 100644
--- a/src/documents/search/_query.py
+++ b/src/documents/search/_query.py
@@ -51,6 +51,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
 )
 # Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
 _DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
+_SIMPLE_QUERY_SPECIAL_CHARS_RE = regex.compile(r'([+\-!(){}\[\]^"~*?:\\/])')
 
 
 def _fmt(dt: datetime) -> str:
@@ -436,6 +437,7 @@ DEFAULT_SEARCH_FIELDS = [
     "document_type",
     "tag",
 ]
+SIMPLE_SEARCH_FIELDS = ["title", "content"]
 _FIELD_BOOSTS = {"title": 2.0}
 
 
@@ -495,3 +497,28 @@ def parse_user_query(
         )
 
     return exact
+
+
+def parse_simple_text_query(
+    index: tantivy.Index,
+    raw_query: str,
+) -> tantivy.Query:
+    """
+    Parse a plain-text query using Tantivy's default parser over title/content.
+
+    Query string is escaped and normalized to be treated as "simple" text query.
+    """
+    # escape special characters that the parser would interpret as syntax
+    query_str = regex.sub(
+        _SIMPLE_QUERY_SPECIAL_CHARS_RE,
+        r"\\\1",
+        raw_query,
+        timeout=_REGEX_TIMEOUT,
+    )
+    # collapse multiple spaces to a single space for cleaner parsing (and to prevent ReDoS on excessive whitespace)
+    query_str = regex.sub(r" {2,}", " ", query_str, timeout=_REGEX_TIMEOUT).strip()
+    return index.parse_query(
+        query_str,
+        SIMPLE_SEARCH_FIELDS,
+        field_boosts=_FIELD_BOOSTS,
+    )
diff --git a/src/documents/views.py b/src/documents/views.py
index 024e846a0..80c83d1d7 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1995,6 +1995,12 @@ class ChatStreamingView(GenericAPIView):
     list=extend_schema(
         description="Document views including search",
         parameters=[
+            OpenApiParameter(
+                name="text",
+                type=OpenApiTypes.STR,
+                location=OpenApiParameter.QUERY,
+                description="Simple text search query string",
+            ),
             OpenApiParameter(
                 name="query",
                 type=OpenApiTypes.STR,
@@ -2033,7 +2039,8 @@ class UnifiedSearchViewSet(DocumentViewSet):
 
     def _is_search_request(self):
         return (
-            "query" in self.request.query_params
+            "text" in self.request.query_params
+            or "query" in self.request.query_params
             or "more_like_id" in self.request.query_params
         )
 
@@ -2043,6 +2050,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
 
         from documents.search import TantivyRelevanceList
         from documents.search import get_backend
+        from documents.search._backend import SearchMode
 
         try:
             backend = get_backend()
@@ -2051,8 +2059,15 @@ class UnifiedSearchViewSet(DocumentViewSet):
 
             user = None if request.user.is_superuser else request.user
 
-            if "query" in request.query_params:
-                query_str = request.query_params["query"]
+            if "text" in request.query_params or "query" in request.query_params:
+                # Take the mode and the query string from the same parameter so
+                # an empty "text" neither raises nor runs "query" in TEXT mode.
+                if "text" in request.query_params:
+                    search_mode = SearchMode.TEXT
+                    query_str = request.query_params["text"]
+                else:
+                    search_mode = SearchMode.QUERY
+                    query_str = request.query_params["query"]
                 results = backend.search(
                     query_str,
                     user=user,
@@ -2060,6 +2075,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
                     page_size=10000,
                     sort_field=None,
                     sort_reverse=False,
+                    search_mode=search_mode,
                 )
             else:
                 # more_like_id — validate permission on the seed document first