Add simple text search mode and API param

This commit is contained in:
shamoon
2026-04-01 10:08:27 -07:00
parent 05c9e21fac
commit e6a334878c
3 changed files with 64 additions and 5 deletions

View File

@@ -7,6 +7,7 @@ from collections import Counter
from dataclasses import dataclass
from datetime import UTC
from datetime import datetime
from enum import StrEnum
from typing import TYPE_CHECKING
from typing import Self
from typing import TypedDict
@@ -20,6 +21,7 @@ from django.utils.timezone import get_current_timezone
from guardian.shortcuts import get_users_with_perms
from documents.search._query import build_permission_filter
from documents.search._query import parse_simple_text_query
from documents.search._query import parse_user_query
from documents.search._schema import _write_sentinels
from documents.search._schema import build_schema
@@ -45,6 +47,11 @@ _AUTOCOMPLETE_REGEX_TIMEOUT = 1.0 # seconds; guards against ReDoS on untrusted
T = TypeVar("T")
class SearchMode(StrEnum):
QUERY = "query"
TEXT = "text"
def _ascii_fold(s: str) -> str:
"""
Normalize unicode to ASCII equivalent characters for search consistency.
@@ -433,6 +440,7 @@ class TantivyBackend:
sort_field: str | None,
*,
sort_reverse: bool,
search_mode: SearchMode = SearchMode.QUERY,
) -> SearchResults:
"""
Execute a search query against the document index.
@@ -441,20 +449,28 @@ class TantivyBackend:
permission filtering before executing against Tantivy. Supports both
relevance-based and field-based sorting.
QUERY search mode supports natural date keywords, field filters, etc.
TEXT search mode treats the query as plain text to search for in title and content
Args:
query: User's search query (supports natural date keywords, field filters)
query: User's search query
user: User for permission filtering (None for superuser/no filtering)
page: Page number (1-indexed) for pagination
page_size: Number of results per page
sort_field: Field to sort by (None for relevance ranking)
sort_reverse: Whether to reverse the sort order
search_mode: "query" for advanced Tantivy syntax, "text" for
plain-text search over title and content only
Returns:
SearchResults with hits, total count, and processed query
"""
self._ensure_open()
tz = get_current_timezone()
user_query = parse_user_query(self._index, query, tz)
if search_mode is SearchMode.TEXT:
user_query = parse_simple_text_query(self._index, query)
else:
user_query = parse_user_query(self._index, query, tz)
# Apply permission filter if user is not None (not superuser)
if user is not None:

View File

@@ -51,6 +51,7 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
)
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
_SIMPLE_QUERY_SPECIAL_CHARS_RE = regex.compile(r'([+\-!(){}\[\]^"~*?:\\/])')
def _fmt(dt: datetime) -> str:
@@ -436,6 +437,7 @@ DEFAULT_SEARCH_FIELDS = [
"document_type",
"tag",
]
SIMPLE_SEARCH_FIELDS = ["title", "content"]
_FIELD_BOOSTS = {"title": 2.0}
@@ -495,3 +497,28 @@ def parse_user_query(
)
return exact
def parse_simple_text_query(
    index: tantivy.Index,
    raw_query: str,
) -> tantivy.Query:
    """
    Build a Tantivy query that treats ``raw_query`` as plain text.

    Each character matched by ``_SIMPLE_QUERY_SPECIAL_CHARS_RE`` is prefixed
    with a backslash (nothing is removed) so the parser does not read it as
    query syntax. Runs of spaces are then collapsed and the result is parsed
    against ``SIMPLE_SEARCH_FIELDS`` using ``_FIELD_BOOSTS``.

    Args:
        index: Tantivy index whose ``parse_query`` is used.
        raw_query: Search text to be matched literally.

    Returns:
        The query produced by ``index.parse_query`` for the escaped text.
    """
    # Backslash-escape, rather than drop, every syntax character.
    escaped = _SIMPLE_QUERY_SPECIAL_CHARS_RE.sub(
        r"\\\1",
        raw_query,
        timeout=_REGEX_TIMEOUT,
    )

    # Squeeze runs of two or more spaces into one; tabs and newlines are left
    # as they are. Leading and trailing whitespace is trimmed afterwards.
    squeezed = regex.sub(r" {2,}", " ", escaped, timeout=_REGEX_TIMEOUT)
    literal_text = squeezed.strip()

    return index.parse_query(
        literal_text,
        SIMPLE_SEARCH_FIELDS,
        field_boosts=_FIELD_BOOSTS,
    )

View File

@@ -1995,6 +1995,12 @@ class ChatStreamingView(GenericAPIView):
list=extend_schema(
description="Document views including search",
parameters=[
OpenApiParameter(
name="text",
type=OpenApiTypes.STR,
location=OpenApiParameter.QUERY,
description="Simple text search query string",
),
OpenApiParameter(
name="query",
type=OpenApiTypes.STR,
@@ -2033,7 +2039,8 @@ class UnifiedSearchViewSet(DocumentViewSet):
def _is_search_request(self):
return (
"query" in self.request.query_params
"text" in self.request.query_params
or "query" in self.request.query_params
or "more_like_id" in self.request.query_params
)
@@ -2043,6 +2050,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
from documents.search import TantivyRelevanceList
from documents.search import get_backend
from documents.search._backend import SearchMode
try:
backend = get_backend()
@@ -2051,8 +2059,15 @@ class UnifiedSearchViewSet(DocumentViewSet):
user = None if request.user.is_superuser else request.user
if "query" in request.query_params:
query_str = request.query_params["query"]
if "text" in request.query_params or "query" in request.query_params:
search_mode = (
SearchMode.TEXT
if "text" in request.query_params
else SearchMode.QUERY
)
query_str = (
request.query_params.get("text") or request.query_params["query"]
)
results = backend.search(
query_str,
user=user,
@@ -2060,6 +2075,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
page_size=10000,
sort_field=None,
sort_reverse=False,
search_mode=search_mode,
)
else:
# more_like_id — validate permission on the seed document first