refactor(tasks): add docstrings and rename _parse_legacy_result

- Add docstrings to _extract_input_data, _determine_trigger_source, and
  _extract_owner_id explaining what each helper does and why
- Rename _parse_legacy_result -> _parse_consume_result: the function
  parses the strings that consume_file currently returns (consumer.py
  returns "New document id N created" and "It is a duplicate of X (#N)"),
  not legacy data, so the old name was misleading

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
stumpylog
2026-04-15 13:59:02 -07:00
parent 0bc9227d6f
commit 979d8a67f0

View File

@@ -1040,6 +1040,13 @@ def _extract_input_data(
args: tuple,
task_kwargs: dict,
) -> dict:
"""Build the input_data dict stored on the PaperlessTask record.
For consume_file tasks this includes the filename, MIME type, and any
non-null overrides from the DocumentMetadataOverrides object. For
mail_fetch tasks it captures the account_ids list. All other task
types store no input data and return {}.
"""
if task_type == PaperlessTask.TaskType.CONSUME_FILE:
input_doc, overrides = _get_consume_args(args, task_kwargs)
if input_doc is None:
@@ -1075,6 +1082,13 @@ def _determine_trigger_source(
task_kwargs: dict,
headers: dict,
) -> PaperlessTask.TriggerSource:
"""Resolve the TriggerSource for a task being published to the broker.
Priority order:
1. Explicit trigger_source header (set by beat schedule or apply_async callers).
2. For consume_file tasks, the DocumentSource on the input document.
3. MANUAL as the catch-all for all other cases.
"""
# Explicit header takes priority -- covers beat ("scheduled") and system auto-runs ("system")
header_source = headers.get("trigger_source")
if header_source == "scheduled":
@@ -1098,6 +1112,7 @@ def _extract_owner_id(
args: tuple,
task_kwargs: dict,
) -> int | None:
"""Return the owner_id from consume_file overrides, or None for all other task types."""
if task_type != PaperlessTask.TaskType.CONSUME_FILE:
return None
_, overrides = _get_consume_args(args, task_kwargs)
@@ -1106,7 +1121,15 @@ def _extract_owner_id(
return None
def _parse_legacy_result(result: str) -> dict | None:
def _parse_consume_result(result: str) -> dict | None:
"""Parse a consume_file string result into a structured dict.
consume_file returns human-readable strings rather than dicts (e.g.
"Success. New document id 42 created" or "It is a duplicate of foo (#7)").
This function extracts the document ID or duplicate reference so the
result can be stored as structured data on the PaperlessTask record.
Returns None when the string does not match any known pattern.
"""
if match := _re.search(r"New document id (\d+) created", result):
return {"document_id": int(match.group(1))}
if match := _re.search(r"It is a duplicate of .* \(#(\d+)\)", result):
@@ -1210,7 +1233,7 @@ def task_postrun_handler(
result_data = retval
elif isinstance(retval, str):
result_message = retval
result_data = _parse_legacy_result(retval)
result_data = _parse_consume_result(retval)
now = timezone.now()
task_instance = PaperlessTask.objects.filter(task_id=task_id).first()