refactor(search): add docstrings and complete type annotations to all search module functions

- Add descriptive docstrings to all functions in _schema.py, _tokenizer.py, and _query.py
- Complete type annotations for all function parameters and return values
- Fix 8 mypy strict errors in _query.py:
  - Add re.Match[str] type parameters for regex matches
  - Fix "Returning Any" error with str() cast
  - Add type annotations for build_permission_filter() and parse_user_query()
  - Remove lazy imports, move to module top level
- All 29 search module tests continue to pass

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Trenton H
2026-03-29 15:26:07 -07:00
parent 957049c512
commit 0078ef9cd5
3 changed files with 44 additions and 25 deletions
+37 -22
View File
@@ -7,9 +7,14 @@ from datetime import datetime
from datetime import timedelta
from typing import TYPE_CHECKING
import tantivy
from django.conf import settings
if TYPE_CHECKING:
from datetime import tzinfo
from django.contrib.auth.base_user import AbstractBaseUser
_DATE_ONLY_FIELDS = frozenset({"created"})
_DATE_KEYWORDS = frozenset(
@@ -36,10 +41,12 @@ _RELATIVE_RANGE_RE = re.compile(
def _fmt(dt: datetime) -> str:
"""Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries."""
return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
def _iso_range(lo: datetime, hi: datetime) -> str:
    """Format a ``[lo TO hi]`` range string in ISO 8601 for Tantivy query syntax.

    Both bounds are rendered through ``_fmt`` and wrapped in square brackets.
    """
    return f"[{_fmt(lo)} TO {_fmt(hi)}]"
@@ -144,7 +151,9 @@ def _datetime_range(keyword: str, tz: tzinfo) -> str:
def _rewrite_compact_date(query: str) -> str:
def _sub(m: re.Match) -> str:
"""Rewrite Whoosh compact date tokens (14-digit YYYYMMDDHHmmss) to ISO 8601."""
def _sub(m: re.Match[str]) -> str:
raw = m.group(1)
try:
dt = datetime(
@@ -158,13 +167,15 @@ def _rewrite_compact_date(query: str) -> str:
)
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
except ValueError:
return m.group(0)
return str(m.group(0))
return _COMPACT_DATE_RE.sub(_sub, query)
def _rewrite_relative_range(query: str) -> str:
def _sub(m: re.Match) -> str:
"""Rewrite Whoosh relative ranges ([now-7d TO now]) to concrete ISO 8601 UTC boundaries."""
def _sub(m: re.Match[str]) -> str:
now = datetime.now(UTC)
def _offset(s: str | None) -> timedelta:
@@ -198,7 +209,7 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
query = _rewrite_compact_date(query)
query = _rewrite_relative_range(query)
def _replace(m: re.Match) -> str:
def _replace(m: re.Match[str]) -> str:
field, keyword = m.group(1), m.group(2)
if field in _DATE_ONLY_FIELDS:
return f"{field}:{_date_only_range(keyword, tz)}"
@@ -216,7 +227,7 @@ def normalize_query(query: str) -> str:
tag:foo,bar → tag:foo AND tag:bar
"""
def _expand(m: re.Match) -> str:
def _expand(m: re.Match[str]) -> str:
field = m.group(1)
values = [v.strip() for v in m.group(2).split(",") if v.strip()]
return " AND ".join(f"{field}:{v}" for v in values)
@@ -230,7 +241,10 @@ def normalize_query(query: str) -> str:
_MAX_U64 = 2**64 - 1 # u64 max — used as inclusive upper bound for "any owner" range
def build_permission_filter(schema, user):
def build_permission_filter(
schema: tantivy.Schema,
user: AbstractBaseUser,
) -> tantivy.Query:
"""
Returns a Query matching documents visible to user:
- no owner (public) → owner_id field absent (NULL in Django)
@@ -251,36 +265,34 @@ def build_permission_filter(schema, user):
simplify this to MustNot(exists_query("owner_id")) once released.
See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
"""
import tantivy as _tantivy
owner_any = _tantivy.Query.range_query(
owner_any = tantivy.Query.range_query(
schema,
"owner_id",
_tantivy.FieldType.Unsigned,
tantivy.FieldType.Unsigned,
1,
_MAX_U64,
)
no_owner = _tantivy.Query.boolean_query(
no_owner = tantivy.Query.boolean_query(
[
(_tantivy.Occur.Must, _tantivy.Query.all_query()),
(_tantivy.Occur.MustNot, owner_any),
(tantivy.Occur.Must, tantivy.Query.all_query()),
(tantivy.Occur.MustNot, owner_any),
],
)
owned = _tantivy.Query.range_query(
owned = tantivy.Query.range_query(
schema,
"owner_id",
_tantivy.FieldType.Unsigned,
tantivy.FieldType.Unsigned,
user.pk,
user.pk,
)
shared = _tantivy.Query.range_query(
shared = tantivy.Query.range_query(
schema,
"viewer_id",
_tantivy.FieldType.Unsigned,
tantivy.FieldType.Unsigned,
user.pk,
user.pk,
)
return _tantivy.Query.disjunction_max_query([no_owner, owned, shared])
return tantivy.Query.disjunction_max_query([no_owner, owned, shared])
# ── parse_user_query (full pipeline) ─────────────────────────────────────────
@@ -297,8 +309,13 @@ DEFAULT_SEARCH_FIELDS = [
_FIELD_BOOSTS = {"title": 2.0}
def parse_user_query(index, schema, raw_query: str, tz: tzinfo):
from django.conf import settings
def parse_user_query(
index: tantivy.Index,
schema: tantivy.Schema,
raw_query: str,
tz: tzinfo,
) -> tantivy.Query:
"""Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse. Adds fuzzy blend if ADVANCED_FUZZY_SEARCH_THRESHOLD is set."""
query_str = rewrite_natural_date_keywords(raw_query, tz)
query_str = normalize_query(query_str)
@@ -311,8 +328,6 @@ def parse_user_query(index, schema, raw_query: str, tz: tzinfo):
threshold = getattr(settings, "ADVANCED_FUZZY_SEARCH_THRESHOLD", None)
if threshold is not None:
import tantivy
fuzzy = index.parse_query(
query_str,
DEFAULT_SEARCH_FIELDS,
+4
View File
@@ -16,6 +16,7 @@ SCHEMA_VERSION = 1
def build_schema() -> tantivy.Schema:
"""Build the Tantivy schema for the paperless document index."""
sb = tantivy.SchemaBuilder()
sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
@@ -70,6 +71,7 @@ def build_schema() -> tantivy.Schema:
def _needs_rebuild(index_dir: Path) -> bool:
"""Check if the search index needs rebuilding by comparing schema version and language sentinel files."""
version_file = index_dir / ".schema_version"
if not version_file.exists():
return True
@@ -92,6 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool:
def _wipe_index(index_dir: Path) -> None:
"""Delete all children in the index directory to prepare for rebuild."""
for child in list(index_dir.iterdir()):
if child.is_dir():
shutil.rmtree(child)
@@ -100,6 +103,7 @@ def _wipe_index(index_dir: Path) -> None:
def _write_sentinels(index_dir: Path) -> None:
    """Write schema version and language sentinel files so the next index open can skip rebuilding.

    Writes ``SCHEMA_VERSION`` (as text) to ``.schema_version`` and
    ``settings.SEARCH_LANGUAGE`` to ``.schema_language`` inside *index_dir*.
    """
    (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
    (index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)
+3 -3
View File
@@ -67,7 +67,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None:
def _paperless_text(language: str) -> tantivy.TextAnalyzer:
"""simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
builder = (
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
.filter(tantivy.Filter.remove_long(65))
@@ -88,7 +88,7 @@ def _paperless_text(language: str) -> tantivy.TextAnalyzer:
def _simple_analyzer() -> tantivy.TextAnalyzer:
"""simple -> lowercase -> ascii_fold. Used for shadow sort fields."""
"""Tokenizer for shadow sort fields (title_sort, correspondent_sort, type_sort): simple -> lowercase -> ascii_fold."""
return (
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
.filter(tantivy.Filter.lowercase())
@@ -98,7 +98,7 @@ def _simple_analyzer() -> tantivy.TextAnalyzer:
def _bigram_analyzer() -> tantivy.TextAnalyzer:
"""ngram(2,2) -> lowercase. CJK / no-whitespace language support."""
"""Bigram analyzer enabling substring search in CJK / no-whitespace text: ngram(2,2) -> lowercase."""
return (
tantivy.TextAnalyzerBuilder(
tantivy.Tokenizer.ngram(min_gram=2, max_gram=2, prefix_only=False),