mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-14 12:08:51 +00:00
285 lines
9.8 KiB
Python
285 lines
9.8 KiB
Python
"""
|
|
Matching pipeline profiling.
|
|
|
|
Run with:
|
|
uv run pytest ../test_matching_profile.py \
|
|
-m profiling --override-ini="addopts=" -s -v
|
|
|
|
Corpus: 1 document + 50 correspondents, 100 tags, 25 doc types, 20 storage
|
|
paths. Labels are spread across all seven matching algorithms
|
|
(NONE, ANY, ALL, LITERAL, REGEX, FUZZY, AUTO).
|
|
|
|
Classifier is passed as None -- MATCH_AUTO models skip prediction gracefully,
|
|
which is correct for isolating the ORM query and Python-side evaluation cost.
|
|
|
|
Scenarios
|
|
---------
|
|
TestMatchingPipelineProfile
|
|
- test_match_correspondents 50 correspondents, algorithm mix
|
|
- test_match_tags 100 tags
|
|
- test_match_document_types 25 doc types
|
|
- test_match_storage_paths 20 storage paths
|
|
- test_full_match_sequence all four in order (cumulative consumption cost)
|
|
- test_algorithm_breakdown each MATCH_* algorithm in isolation
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import random
|
|
|
|
import pytest
|
|
from faker import Faker
|
|
from profiling import profile_block
|
|
|
|
from documents.matching import match_correspondents
|
|
from documents.matching import match_document_types
|
|
from documents.matching import match_storage_paths
|
|
from documents.matching import match_tags
|
|
from documents.models import Correspondent
|
|
from documents.models import Document
|
|
from documents.models import DocumentType
|
|
from documents.models import MatchingModel
|
|
from documents.models import StoragePath
|
|
from documents.models import Tag
|
|
|
|
# Applied to every test in this module: the ``profiling`` marker (selected with
# ``-m profiling``) and pytest-django's ``django_db`` marker.
pytestmark = [
    pytest.mark.profiling,
    pytest.mark.django_db,
]

# Corpus dimensions: how many labels of each kind the corpus fixture creates.
NUM_CORRESPONDENTS = 50
NUM_TAGS = 100
NUM_DOC_TYPES = 25
NUM_STORAGE_PATHS = 20

# Seed handed to both Faker and ``random`` when the corpus is built.
SEED = 42
|
|
|
|
# Matching algorithms assigned to labels round-robin (label ``i`` gets entry
# ``i % len(_ALGORITHMS)``); the order below is the assignment order.
_ALGORITHMS = [
    MatchingModel.MATCH_NONE,
    MatchingModel.MATCH_ANY,
    MatchingModel.MATCH_ALL,
    MatchingModel.MATCH_LITERAL,
    MatchingModel.MATCH_REGEX,
    MatchingModel.MATCH_FUZZY,
    MatchingModel.MATCH_AUTO,
]
|
|
|
|
|
|
def _algo(i: int) -> int:
    """Return the matching algorithm for the *i*-th label, cycling through ``_ALGORITHMS``."""
    slot = i % len(_ALGORITHMS)
    return _ALGORITHMS[slot]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Module-scoped corpus fixture
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.fixture(scope="module")
def module_db(django_db_setup, django_db_blocker):
    """Hold database access open for the lifetime of this module.

    Requests ``django_db_setup`` and then enters ``django_db_blocker.unblock()``
    around the ``yield``, so the unblock context is only exited when the
    module-scoped fixture is finalised.
    """
    db_access = django_db_blocker.unblock()
    with db_access:
        yield
|
|
|
|
|
|
@pytest.fixture(scope="module")
def matching_corpus(module_db):
    """
    1 document with realistic content + dense matching model sets.

    Creates NUM_CORRESPONDENTS correspondents, NUM_TAGS tags, NUM_DOC_TYPES
    document types and NUM_STORAGE_PATHS storage paths, assigning each label a
    matching algorithm via ``_algo(i)``. Faker and ``random`` are seeded with
    SEED before any value is drawn.

    Classifier=None so MATCH_AUTO models are simply skipped.

    Yields:
        dict with a single key ``"doc"`` holding the created Document.

    Teardown deletes all Document, Correspondent, Tag, DocumentType and
    StoragePath rows reachable through ``objects``. It runs from a ``finally``
    clause so that a failure part-way through setup is cleaned up as well
    (code after a fixture's ``yield`` is not executed when setup raises).
    """
    fake = Faker()
    Faker.seed(SEED)
    random.seed(SEED)

    def draw_match_text(algo):
        """Draw the ``match`` value for a label that uses *algo*."""
        # NONE and AUTO labels get an empty match string.
        text = (
            fake.word()
            if algo not in (MatchingModel.MATCH_NONE, MatchingModel.MATCH_AUTO)
            else ""
        )
        if algo == MatchingModel.MATCH_REGEX:
            # A second word is drawn on purpose (the first is discarded) so the
            # Faker draw sequence, and therefore the seeded corpus, stays the same.
            text = r"\b" + fake.word() + r"\b"
        return text

    def create_labels(model_cls, count, fields_for):
        """Create *count* rows of *model_cls*; ``fields_for(i)`` supplies the model-specific fields."""
        created = []
        for i in range(count):
            algo = _algo(i)
            # Match text is drawn before the name fields, as the draw order
            # determines which Faker values each field receives.
            match_text = draw_match_text(algo)
            created.append(
                model_cls.objects.create(
                    **fields_for(i),
                    matching_algorithm=algo,
                    match=match_text,
                ),
            )
        return created

    def auto_count(labels):
        """Count the labels whose algorithm is MATCH_AUTO."""
        return sum(
            1 for label in labels if label.matching_algorithm == MatchingModel.MATCH_AUTO
        )

    try:
        # ---- matching models ---------------------------------------------------
        print(f"\n[setup] Creating {NUM_CORRESPONDENTS} correspondents...")  # noqa: T201
        correspondents = create_labels(
            Correspondent,
            NUM_CORRESPONDENTS,
            lambda i: {"name": f"mcorp-{i}-{fake.company()}"[:128]},
        )

        print(f"[setup] Creating {NUM_TAGS} tags...")  # noqa: T201
        tags = create_labels(
            Tag,
            NUM_TAGS,
            lambda i: {"name": f"mtag-{i}-{fake.word()}"[:100]},
        )

        print(f"[setup] Creating {NUM_DOC_TYPES} doc types...")  # noqa: T201
        create_labels(
            DocumentType,
            NUM_DOC_TYPES,
            lambda i: {"name": f"mtype-{i}-{fake.word()}"[:128]},
        )

        print(f"[setup] Creating {NUM_STORAGE_PATHS} storage paths...")  # noqa: T201
        create_labels(
            StoragePath,
            NUM_STORAGE_PATHS,
            # Dict entries are evaluated in order: name word first, then path word.
            lambda i: {
                "name": f"mpath-{i}-{fake.word()}",
                "path": f"{fake.word()}/{{title}}",
            },
        )

        # ---- document with diverse content ------------------------------------
        doc = Document.objects.create(
            title="quarterly invoice payment tax financial statement",
            content=" ".join(fake.paragraph(nb_sentences=5) for _ in range(3)),
            checksum="MATCHPROF0001",
        )

        print(f"[setup] Document pk={doc.pk}, content length={len(doc.content)} chars")  # noqa: T201
        print(  # noqa: T201
            f" Correspondents: {NUM_CORRESPONDENTS} "
            f"({auto_count(correspondents)} AUTO)",
        )
        print(  # noqa: T201
            f" Tags: {NUM_TAGS} "
            f"({auto_count(tags)} AUTO)",
        )

        yield {"doc": doc}
    finally:
        # Teardown
        print("\n[teardown] Removing matching corpus...")  # noqa: T201
        Document.objects.all().delete()
        Correspondent.objects.all().delete()
        Tag.objects.all().delete()
        DocumentType.objects.all().delete()
        StoragePath.objects.all().delete()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TestMatchingPipelineProfile
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestMatchingPipelineProfile:
    """Profiles the match_*() functions that run for each consumed document."""

    @pytest.fixture(autouse=True)
    def _setup(self, matching_corpus):
        # Every test operates on the single document from the corpus fixture.
        self.doc = matching_corpus["doc"]

    def test_match_correspondents(self):
        """Profile match_correspondents() against the correspondent set."""
        label = f"match_correspondents() [{NUM_CORRESPONDENTS} correspondents, mixed algorithms]"
        with profile_block(label):
            matched = match_correspondents(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_match_tags(self):
        """Profile match_tags() against the tag set (the largest label set here)."""
        label = f"match_tags() [{NUM_TAGS} tags, mixed algorithms]"
        with profile_block(label):
            matched = match_tags(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_match_document_types(self):
        """Profile match_document_types() against the document-type set."""
        label = f"match_document_types() [{NUM_DOC_TYPES} types, mixed algorithms]"
        with profile_block(label):
            matched = match_document_types(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_match_storage_paths(self):
        """Profile match_storage_paths() against the storage-path set."""
        label = f"match_storage_paths() [{NUM_STORAGE_PATHS} paths, mixed algorithms]"
        with profile_block(label):
            matched = match_storage_paths(self.doc, classifier=None)
        print(f" -> {len(matched)} matched")  # noqa: T201

    def test_full_match_sequence(self):
        """Profile the four match_*() calls back to back inside one profile block."""
        label = "full match sequence: correspondents + doc_types + tags + storage_paths"
        with profile_block(label):
            match_correspondents(self.doc, classifier=None)
            match_document_types(self.doc, classifier=None)
            match_tags(self.doc, classifier=None)
            match_storage_paths(self.doc, classifier=None)

    def test_algorithm_breakdown(self):
        """Time ``matches()`` on one unsaved Correspondent per listed algorithm."""
        import time

        from documents.matching import matches

        fake = Faker()
        # (algorithm, display name) pairs, timed in this order.
        timed_algorithms = (
            (MatchingModel.MATCH_NONE, "MATCH_NONE"),
            (MatchingModel.MATCH_ANY, "MATCH_ANY"),
            (MatchingModel.MATCH_ALL, "MATCH_ALL"),
            (MatchingModel.MATCH_LITERAL, "MATCH_LITERAL"),
            (MatchingModel.MATCH_REGEX, "MATCH_REGEX"),
            (MatchingModel.MATCH_FUZZY, "MATCH_FUZZY"),
        )
        doc = self.doc
        # Time 1000 iterations to get stable microsecond readings
        iterations = 1_000
        print()  # noqa: T201

        for algo, name in timed_algorithms:
            if algo == MatchingModel.MATCH_NONE:
                match_text = ""
            else:
                match_text = fake.word()
            if algo == MatchingModel.MATCH_REGEX:
                match_text = r"\b" + fake.word() + r"\b"

            # Instantiated only, never saved: matches() is called on it directly.
            model = Correspondent(
                name=f"algo-test-{name}",
                matching_algorithm=algo,
                match=match_text,
            )

            started = time.perf_counter()
            for _ in range(iterations):
                matches(model, doc)
            elapsed = time.perf_counter() - started
            us_per_call = elapsed / iterations * 1_000_000

            print(  # noqa: T201
                f" {name:<20s} {us_per_call:8.2f} us/call (match={match_text[:20]!r})",
            )
|