mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-04-27 10:29:26 +00:00
perf(fuzzy-match): stream work packages to avoid pre-build delay
Previously all N*(N-1)/2 _WorkPackage objects were materialised into a list before matching started, causing a ~30s pause on large corpora before the progress bar appeared. Now pairs are generated lazily via a generator for the single-process path. total_pairs = n*(n-1)//2 is computed cheaply so the progress bar can show accurate progress without reading ahead. The multiprocessing path still materialises the list (process_parallel requires it), but that path is uncommon and already pragma: no cover. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -169,19 +169,28 @@ class Command(PaperlessCommand):
|
||||
)
|
||||
|
||||
# combinations() generates each unique pair exactly once -- no checked_pairs set needed.
|
||||
work_pkgs: list[_WorkPackage] = [
|
||||
_WorkPackage(pk_a, ca, pk_b, cb, opt_ratio)
|
||||
for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2)
|
||||
if ca.strip() and cb.strip()
|
||||
]
|
||||
# The total is computed cheaply so the progress bar can start immediately without
|
||||
# materialising all pairs up front (n*(n-1)/2 can be hundreds of thousands).
|
||||
n = len(slim_docs)
|
||||
total_pairs = n * (n - 1) // 2
|
||||
|
||||
def _work_gen():
|
||||
for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2):
|
||||
if ca.strip() and cb.strip():
|
||||
yield _WorkPackage(pk_a, ca, pk_b, cb, opt_ratio)
|
||||
|
||||
def _iter_matches():
|
||||
if self.process_count == 1:
|
||||
for work in self.track(work_pkgs, description="Matching..."):
|
||||
for work in self.track(
|
||||
_work_gen(),
|
||||
description="Matching...",
|
||||
total=total_pairs,
|
||||
):
|
||||
result = _process_and_match(work)
|
||||
if result.ratio >= opt_ratio:
|
||||
yield result
|
||||
else: # pragma: no cover
|
||||
work_pkgs = list(_work_gen())
|
||||
for proc_result in self.process_parallel(
|
||||
_process_and_match,
|
||||
work_pkgs,
|
||||
|
||||
Reference in New Issue
Block a user