mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-12 02:58:52 +00:00
232 lines
8.1 KiB
Python
232 lines
8.1 KiB
Python
"""
Workflow trigger matching profiling.

Run with:
    uv run pytest ../test_workflow_profile.py \
        -m profiling --override-ini="addopts=" -s -v

Corpus: 500 documents + correspondents + tags + sets of WorkflowTrigger
objects at 5 and 20 count to allow scaling comparisons.

Scenarios
---------
TestWorkflowMatchingProfile
  - test_existing_document_5_workflows   existing_document_matches_workflow x 5 triggers
  - test_existing_document_20_workflows  same x 20 triggers
  - test_workflow_prefilter              prefilter_documents_by_workflowtrigger on 500 docs
  - test_trigger_type_comparison         compare DOCUMENT_ADDED vs DOCUMENT_UPDATED overhead
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import random
|
|
import time
|
|
|
|
import pytest
|
|
from faker import Faker
|
|
from profiling import profile_block
|
|
|
|
from documents.matching import existing_document_matches_workflow
|
|
from documents.matching import prefilter_documents_by_workflowtrigger
|
|
from documents.models import Correspondent
|
|
from documents.models import Document
|
|
from documents.models import Tag
|
|
from documents.models import Workflow
|
|
from documents.models import WorkflowAction
|
|
from documents.models import WorkflowTrigger
|
|
|
|
# Markers applied to every test in this module: the "profiling" marker used
# to select these tests (-m profiling) and pytest-django's database marker.
pytestmark = [
    pytest.mark.profiling,
    pytest.mark.django_db,
]

# Corpus dimensions used by the workflow_corpus fixture.
NUM_DOCS = 500
NUM_CORRESPONDENTS = 10
NUM_TAGS = 20

# Seed handed to both Faker.seed() and random.Random() during corpus setup.
SEED = 42
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Module-scoped fixture
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.fixture(scope="module")
def module_db(django_db_setup, django_db_blocker):
    """
    Keep database access unblocked for the lifetime of this module.

    Requests ``django_db_setup`` and holds ``django_db_blocker.unblock()``
    open across the ``yield``, so other module-scoped fixtures that depend on
    this one can touch the database.
    """
    db_access = django_db_blocker.unblock()
    with db_access:
        yield
|
|
|
|
|
|
@pytest.fixture(scope="module")
def workflow_corpus(module_db):
    """
    Build the shared profiling corpus once per module and remove it afterwards.

    Creates NUM_CORRESPONDENTS correspondents, NUM_TAGS tags and NUM_DOCS
    documents (each document gets 0-3 randomly sampled tags), followed by three
    sets of workflows. Every workflow holds exactly one trigger and one
    ASSIGNMENT action:

    - 5 DOCUMENT_UPDATED triggers
    - 20 DOCUMENT_UPDATED triggers
    - 5 DOCUMENT_ADDED triggers

    Yields:
        dict with the keys ``doc`` (the first created document),
        ``triggers_5``, ``triggers_20`` and ``triggers_added``.

    Cleanup runs in a ``finally`` clause so that rows written before a setup
    failure are removed as well; without it pytest skips the code after
    ``yield`` whenever the fixture raises before yielding.
    """
    fake = Faker()
    Faker.seed(SEED)
    rng = random.Random(SEED)

    # Running index shared by all _make_triggers calls so that each workflow
    # gets a distinct "wf-profile-<n>" name.
    workflow_count = 0

    try:
        # ---- lookup objects -----------------------------------------------
        print("\n[setup] Creating lookup objects...")  # noqa: T201
        correspondents = [
            Correspondent.objects.create(name=f"wfcorp-{i}-{fake.company()}"[:128])
            for i in range(NUM_CORRESPONDENTS)
        ]
        tags = [
            Tag.objects.create(name=f"wftag-{i}-{fake.word()}"[:100])
            for i in range(NUM_TAGS)
        ]

        # ---- documents ----------------------------------------------------
        print(f"[setup] Building {NUM_DOCS} documents...")  # noqa: T201
        # Three extra None entries let some documents end up without a
        # correspondent; built once instead of once per document.
        correspondent_pool = correspondents + [None] * 3
        raw_docs = [
            Document(
                title=fake.sentence(nb_words=4).rstrip("."),
                content=fake.paragraph(nb_sentences=3),
                checksum=f"WF{i:07d}",
                correspondent=rng.choice(correspondent_pool),
            )
            for i in range(NUM_DOCS)
        ]
        documents = Document.objects.bulk_create(raw_docs, batch_size=500)
        for doc in documents:
            k = rng.randint(0, 3)
            if k:
                doc.tags.add(*rng.sample(tags, k))

        sample_doc = documents[0]
        print(f"[setup] Sample doc pk={sample_doc.pk}")  # noqa: T201

        # ---- build triggers at scale 5 and 20 -----------------------------
        def _make_triggers(n: int, trigger_type: int) -> list[WorkflowTrigger]:
            """Create ``n`` workflows of ``trigger_type`` and return their triggers."""
            nonlocal workflow_count
            triggers = []
            for i in range(n):
                # Every third trigger (i % 3 == 0) filters on a correspondent;
                # the remaining triggers carry no correspondent filter.
                corr = correspondents[i % NUM_CORRESPONDENTS] if i % 3 == 0 else None
                trigger = WorkflowTrigger.objects.create(
                    type=trigger_type,
                    filter_has_correspondent=corr,
                )
                action = WorkflowAction.objects.create(
                    type=WorkflowAction.WorkflowActionType.ASSIGNMENT,
                )
                wf = Workflow.objects.create(name=f"wf-profile-{workflow_count}")
                workflow_count += 1
                wf.triggers.add(trigger)
                wf.actions.add(action)
                triggers.append(trigger)
            return triggers

        print("[setup] Creating workflow triggers...")  # noqa: T201
        triggers_5 = _make_triggers(
            5,
            WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED,
        )
        triggers_20 = _make_triggers(
            20,
            WorkflowTrigger.WorkflowTriggerType.DOCUMENT_UPDATED,
        )
        triggers_added = _make_triggers(
            5,
            WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
        )

        yield {
            "doc": sample_doc,
            "triggers_5": triggers_5,
            "triggers_20": triggers_20,
            "triggers_added": triggers_added,
        }
    finally:
        # Teardown: also reached when setup raised before the yield.
        print("\n[teardown] Removing workflow corpus...")  # noqa: T201
        Workflow.objects.all().delete()
        WorkflowTrigger.objects.all().delete()
        WorkflowAction.objects.all().delete()
        Document.objects.all().delete()
        Correspondent.objects.all().delete()
        Tag.objects.all().delete()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TestWorkflowMatchingProfile
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestWorkflowMatchingProfile:
    """Profiling scenarios for matching workflow triggers against documents."""

    @pytest.fixture(autouse=True)
    def _setup(self, workflow_corpus):
        """Copy the corpus entries onto the test instance as attributes."""
        for key in ("doc", "triggers_5", "triggers_20", "triggers_added"):
            setattr(self, key, workflow_corpus[key])

    def test_existing_document_5_workflows(self):
        """Profile existing_document_matches_workflow over the 5-trigger set."""
        document = self.doc
        trigger_set = self.triggers_5
        block_label = (
            f"existing_document_matches_workflow [{len(trigger_set)} triggers]"
        )

        with profile_block(block_label):
            for candidate in trigger_set:
                existing_document_matches_workflow(document, candidate)

    def test_existing_document_20_workflows(self):
        """
        Profile existing_document_matches_workflow over the 20-trigger set.

        After the profiled pass, each call is timed on its own and the mean
        per-trigger time in microseconds is printed.
        """
        document = self.doc
        trigger_set = self.triggers_20
        block_label = (
            f"existing_document_matches_workflow [{len(trigger_set)} triggers]"
        )

        with profile_block(block_label):
            for candidate in trigger_set:
                existing_document_matches_workflow(document, candidate)

        # Second pass: one wall-clock sample (in microseconds) per trigger.
        samples_us = []
        for candidate in trigger_set:
            started = time.perf_counter()
            existing_document_matches_workflow(document, candidate)
            samples_us.append((time.perf_counter() - started) * 1_000_000)
        mean_us = sum(samples_us) / len(samples_us)
        print(f"\n Per-trigger avg: {mean_us:.1f} us (n={len(samples_us)})")  # noqa: T201

    def test_workflow_prefilter(self):
        """
        Profile prefilter_documents_by_workflowtrigger over all documents.

        Only the first three triggers of the 20-trigger set are used; the
        resulting queryset is counted inside the profiled block so the query
        is actually evaluated.
        """
        all_documents = Document.objects.all()
        print(f"\n Corpus: {all_documents.count()} documents")  # noqa: T201

        for candidate in self.triggers_20[:3]:
            correspondent_id = candidate.filter_has_correspondent_id
            block_label = (
                f"prefilter_documents_by_workflowtrigger [corr={correspondent_id}]"
            )
            with profile_block(block_label):
                filtered = prefilter_documents_by_workflowtrigger(
                    all_documents,
                    candidate,
                )
                # Counting forces the queryset to hit the database.
                matched = filtered.count()
            print(f" -> {matched} docs passed filter")  # noqa: T201

    def test_trigger_type_comparison(self):
        """
        Print the mean per-call time for two trigger sets.

        Runs existing_document_matches_workflow 200 times over the 5
        DOCUMENT_UPDATED triggers and over the 5 DOCUMENT_ADDED triggers and
        reports microseconds per call for each.
        """
        document = self.doc
        repetitions = 200
        scenarios = {
            "DOCUMENT_UPDATED": self.triggers_5,
            "DOCUMENT_ADDED": self.triggers_added,
        }

        for name, trigger_set in scenarios.items():
            started = time.perf_counter()
            for _ in range(repetitions):
                for candidate in trigger_set:
                    existing_document_matches_workflow(document, candidate)
            call_count = repetitions * len(trigger_set)
            per_call_us = (time.perf_counter() - started) / call_count * 1_000_000
            print(  # noqa: T201
                f" {name:<22s} {per_call_us:.2f} us/call "
                f"({call_count} calls, {len(trigger_set)} triggers)",
            )
|