refactor(search): add docstrings and complete type annotations to all search module functions

- Add descriptive docstrings to all functions in _schema.py, _tokenizer.py, and _query.py
- Complete type annotations for all function parameters and return values
- Fix 8 mypy strict errors in _query.py:
  - Add re.Match[str] type parameters for regex matches
  - Fix "Returning Any" error with str() cast
  - Add type annotations for build_permission_filter() and parse_user_query()
- Remove lazy imports, move to module top level
- All 29 search module tests continue to pass

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-30 17:24:22 +00:00 · 2026-03-29 15:26:07 -07:00
parent 957049c512
commit 0078ef9cd5
3 changed files with 44 additions and 25 deletions
@@ -7,9 +7,14 @@ from datetime import datetime
 from datetime import timedelta
 from typing import TYPE_CHECKING

+import tantivy
+from django.conf import settings
+
 if TYPE_CHECKING:
    from datetime import tzinfo

+    from django.contrib.auth.base_user import AbstractBaseUser
+
 _DATE_ONLY_FIELDS = frozenset({"created"})

 _DATE_KEYWORDS = frozenset(
@@ -36,10 +41,12 @@ _RELATIVE_RANGE_RE = re.compile(


 def _fmt(dt: datetime) -> str:
+    """Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries."""
    return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")


 def _iso_range(lo: datetime, hi: datetime) -> str:
+    """Format a [lo TO hi] range string in ISO 8601 for Tantivy query syntax."""
    return f"[{_fmt(lo)} TO {_fmt(hi)}]"


@@ -144,7 +151,9 @@ def _datetime_range(keyword: str, tz: tzinfo) -> str:


 def _rewrite_compact_date(query: str) -> str:
-    def _sub(m: re.Match) -> str:
+    """Rewrite Whoosh compact date tokens (14-digit YYYYMMDDHHmmss) to ISO 8601."""
+
+    def _sub(m: re.Match[str]) -> str:
        raw = m.group(1)
        try:
            dt = datetime(
@@ -158,13 +167,15 @@ def _rewrite_compact_date(query: str) -> str:
            )
            return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
-            return m.group(0)
+            return str(m.group(0))

    return _COMPACT_DATE_RE.sub(_sub, query)


 def _rewrite_relative_range(query: str) -> str:
-    def _sub(m: re.Match) -> str:
+    """Rewrite Whoosh relative ranges ([now-7d TO now]) to concrete ISO 8601 UTC boundaries."""
+
+    def _sub(m: re.Match[str]) -> str:
        now = datetime.now(UTC)

        def _offset(s: str | None) -> timedelta:
@@ -198,7 +209,7 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
    query = _rewrite_compact_date(query)
    query = _rewrite_relative_range(query)

-    def _replace(m: re.Match) -> str:
+    def _replace(m: re.Match[str]) -> str:
        field, keyword = m.group(1), m.group(2)
        if field in _DATE_ONLY_FIELDS:
            return f"{field}:{_date_only_range(keyword, tz)}"
@@ -216,7 +227,7 @@ def normalize_query(query: str) -> str:
    tag:foo,bar → tag:foo AND tag:bar
    """

-    def _expand(m: re.Match) -> str:
+    def _expand(m: re.Match[str]) -> str:
        field = m.group(1)
        values = [v.strip() for v in m.group(2).split(",") if v.strip()]
        return " AND ".join(f"{field}:{v}" for v in values)
@@ -230,7 +241,10 @@ def normalize_query(query: str) -> str:
 _MAX_U64 = 2**64 - 1  # u64 max — used as inclusive upper bound for "any owner" range


-def build_permission_filter(schema, user):
+def build_permission_filter(
+    schema: tantivy.Schema,
+    user: AbstractBaseUser,
+) -> tantivy.Query:
    """
    Returns a Query matching documents visible to user:
    - no owner (public)      → owner_id field absent (NULL in Django)
@@ -251,36 +265,34 @@ def build_permission_filter(schema, user):
    simplify this to MustNot(exists_query("owner_id")) once released.
    See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
    """
-    import tantivy as _tantivy
-
-    owner_any = _tantivy.Query.range_query(
+    owner_any = tantivy.Query.range_query(
        schema,
        "owner_id",
-        _tantivy.FieldType.Unsigned,
+        tantivy.FieldType.Unsigned,
        1,
        _MAX_U64,
    )
-    no_owner = _tantivy.Query.boolean_query(
+    no_owner = tantivy.Query.boolean_query(
        [
-            (_tantivy.Occur.Must, _tantivy.Query.all_query()),
-            (_tantivy.Occur.MustNot, owner_any),
+            (tantivy.Occur.Must, tantivy.Query.all_query()),
+            (tantivy.Occur.MustNot, owner_any),
        ],
    )
-    owned = _tantivy.Query.range_query(
+    owned = tantivy.Query.range_query(
        schema,
        "owner_id",
-        _tantivy.FieldType.Unsigned,
+        tantivy.FieldType.Unsigned,
        user.pk,
        user.pk,
    )
-    shared = _tantivy.Query.range_query(
+    shared = tantivy.Query.range_query(
        schema,
        "viewer_id",
-        _tantivy.FieldType.Unsigned,
+        tantivy.FieldType.Unsigned,
        user.pk,
        user.pk,
    )
-    return _tantivy.Query.disjunction_max_query([no_owner, owned, shared])
+    return tantivy.Query.disjunction_max_query([no_owner, owned, shared])


 # ── parse_user_query (full pipeline) ─────────────────────────────────────────
@@ -297,8 +309,13 @@ DEFAULT_SEARCH_FIELDS = [
 _FIELD_BOOSTS = {"title": 2.0}


-def parse_user_query(index, schema, raw_query: str, tz: tzinfo):
-    from django.conf import settings
+def parse_user_query(
+    index: tantivy.Index,
+    schema: tantivy.Schema,
+    raw_query: str,
+    tz: tzinfo,
+) -> tantivy.Query:
+    """Run the full query preprocessing pipeline: date rewriting → normalisation → Tantivy parse. Adds fuzzy blend if ADVANCED_FUZZY_SEARCH_THRESHOLD is set."""

    query_str = rewrite_natural_date_keywords(raw_query, tz)
    query_str = normalize_query(query_str)
@@ -311,8 +328,6 @@ def parse_user_query(index, schema, raw_query: str, tz: tzinfo):

    threshold = getattr(settings, "ADVANCED_FUZZY_SEARCH_THRESHOLD", None)
    if threshold is not None:
-        import tantivy
-
        fuzzy = index.parse_query(
            query_str,
            DEFAULT_SEARCH_FIELDS,
@@ -16,6 +16,7 @@ SCHEMA_VERSION = 1


 def build_schema() -> tantivy.Schema:
+    """Build the Tantivy schema for the paperless document index."""
    sb = tantivy.SchemaBuilder()

    sb.add_unsigned_field("id", stored=True, indexed=True, fast=True)
@@ -70,6 +71,7 @@ def build_schema() -> tantivy.Schema:


 def _needs_rebuild(index_dir: Path) -> bool:
+    """Check if the search index needs rebuilding by comparing schema version and language sentinel files."""
    version_file = index_dir / ".schema_version"
    if not version_file.exists():
        return True
@@ -92,6 +94,7 @@ def _needs_rebuild(index_dir: Path) -> bool:


 def _wipe_index(index_dir: Path) -> None:
+    """Delete all children in the index directory to prepare for rebuild."""
    for child in list(index_dir.iterdir()):
        if child.is_dir():
            shutil.rmtree(child)
@@ -100,6 +103,7 @@ def _wipe_index(index_dir: Path) -> None:


 def _write_sentinels(index_dir: Path) -> None:
+    """Write schema version and language sentinel files so the next index open can skip rebuilding."""
    (index_dir / ".schema_version").write_text(str(SCHEMA_VERSION))
    (index_dir / ".schema_language").write_text(settings.SEARCH_LANGUAGE)

@@ -67,7 +67,7 @@ def register_tokenizers(index: tantivy.Index, language: str) -> None:


 def _paperless_text(language: str) -> tantivy.TextAnalyzer:
-    """simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
+    """Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
    builder = (
        tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
        .filter(tantivy.Filter.remove_long(65))
@@ -88,7 +88,7 @@ def _paperless_text(language: str) -> tantivy.TextAnalyzer:


 def _simple_analyzer() -> tantivy.TextAnalyzer:
-    """simple -> lowercase -> ascii_fold. Used for shadow sort fields."""
+    """Tokenizer for shadow sort fields (title_sort, correspondent_sort, type_sort): simple -> lowercase -> ascii_fold."""
    return (
        tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
        .filter(tantivy.Filter.lowercase())
@@ -98,7 +98,7 @@ def _simple_analyzer() -> tantivy.TextAnalyzer:


 def _bigram_analyzer() -> tantivy.TextAnalyzer:
-    """ngram(2,2) -> lowercase. CJK / no-whitespace language support."""
+    """Enables substring search in CJK text: ngram(2,2) -> lowercase. CJK / no-whitespace language support."""
    return (
        tantivy.TextAnalyzerBuilder(
            tantivy.Tokenizer.ngram(min_gram=2, max_gram=2, prefix_only=False),