Compare commits

...

4 Commits

Author SHA1 Message Date
Trenton H
0df38b9844 test(fuzzy-match): use DocumentFactory in test_document_deletion_cancelled
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:55:13 -07:00
Trenton H
5869c7b301 feat(fuzzy-match): add --yes flag and interactive confirmation before deletion
Without --yes, --delete now prompts "Delete N document(s)? [y/N]" and
aborts with "Deletion cancelled." if the user does not confirm.
--yes skips the prompt for non-interactive/scripted use.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 16:31:03 -07:00
Trenton H
a7724aa0d2 feat(fuzzy-match): Rich table output with color-coded similarity scores
Replace plain stdout.write lines with Rich Panel and Table:
- No matches: green Panel with "No duplicate documents found"
- Matches: Table with #, Document A, Document B, Similarity columns
- Similarity color-coded: dim (<88%), yellow (88%+), red (92%+), bold red (97%+)
- Summary line: "Found N matching pair(s)"
- Delete warning: red-bordered Panel with "Delete Mode" title
- Deletion confirmation: "Deleting N document(s)..."

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 16:28:03 -07:00
Trenton H
5b6d614bc2 perf(fuzzy-match): eliminate O(n²) allocation, add score_cutoff, filter inline
- Replace double for-loop + checked_pairs set with itertools.combinations()
- Switch Document.objects.all() to .only('id','content').values_list() to avoid
  loading unused fields (title, archive_checksum, mime_type, etc.)
- _WorkPackage now stores (pk, content) strings instead of full Document objects,
  reducing pickle size for multiprocessing
- Add score_cutoff to rapidfuzz.fuzz.ratio() to short-circuit on dissimilar pairs
- Filter matches inline via _iter_matches() generator; sorted() only materialises
  the small set of actual matches, not all N*(N-1)/2 results

Full pipeline peak memory: 8,103 KiB -> 94 KiB (99% reduction, measured)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 15:40:51 -07:00
2 changed files with 201 additions and 87 deletions

View File

@@ -1,8 +1,12 @@
import dataclasses
from itertools import combinations
from typing import Final
import rapidfuzz
from django.core.management import CommandError
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from documents.management.commands.base import PaperlessCommand
from documents.models import Document
@@ -10,8 +14,11 @@ from documents.models import Document
@dataclasses.dataclass(frozen=True, slots=True)
class _WorkPackage:
first_doc: Document
second_doc: Document
pk_a: int
content_a: str
pk_b: int
content_b: str
score_cutoff: float
@dataclasses.dataclass(frozen=True, slots=True)
@@ -26,15 +33,17 @@ class _WorkResult:
def _process_and_match(work: _WorkPackage) -> _WorkResult:
"""
Does basic processing of document content, gets the basic ratio
and returns the result package.
Process document content and compute the fuzzy ratio.
score_cutoff lets rapidfuzz short-circuit when the score cannot reach the threshold.
"""
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
match = rapidfuzz.fuzz.ratio(first_string, second_string)
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
first_string = rapidfuzz.utils.default_process(work.content_a)
second_string = rapidfuzz.utils.default_process(work.content_b)
ratio = rapidfuzz.fuzz.ratio(
first_string,
second_string,
score_cutoff=work.score_cutoff,
)
return _WorkResult(work.pk_a, work.pk_b, ratio)
class Command(PaperlessCommand):
@@ -57,78 +66,160 @@ class Command(PaperlessCommand):
action="store_true",
help="If set, one document of matches above the ratio WILL BE DELETED",
)
parser.add_argument(
"--yes",
default=False,
action="store_true",
help="Skip the confirmation prompt when used with --delete",
)
def _render_results(
self,
matches: list[_WorkResult],
*,
opt_ratio: float,
do_delete: bool,
) -> list[int]:
"""Render match results as a Rich table. Returns list of PKs to delete."""
if not matches:
self.console.print(
Panel(
"[green]No duplicate documents found.[/green]",
title="Fuzzy Match",
border_style="green",
),
)
return []
# Fetch titles for matched documents in a single query.
all_pks = {pk for m in matches for pk in (m.doc_one_pk, m.doc_two_pk)}
titles: dict[int, str] = dict(
Document.objects.filter(pk__in=all_pks)
.only("pk", "title")
.values_list("pk", "title"),
)
table = Table(
title=f"Fuzzy Matches (threshold: {opt_ratio:.1f}%)",
show_lines=True,
title_style="bold",
)
table.add_column("#", style="dim", width=4, no_wrap=True)
table.add_column("Document A", min_width=24)
table.add_column("Document B", min_width=24)
table.add_column("Similarity", width=11, justify="right")
maybe_delete_ids: list[int] = []
for i, match_result in enumerate(matches, 1):
pk_a = match_result.doc_one_pk
pk_b = match_result.doc_two_pk
ratio = match_result.ratio
if ratio >= 97.0:
ratio_style = "bold red"
elif ratio >= 92.0:
ratio_style = "red"
elif ratio >= 88.0:
ratio_style = "yellow"
else:
ratio_style = "dim"
table.add_row(
str(i),
f"[dim]#{pk_a}[/dim] {titles.get(pk_a, 'Unknown')}",
f"[dim]#{pk_b}[/dim] {titles.get(pk_b, 'Unknown')}",
Text(f"{ratio:.1f}%", style=ratio_style),
)
maybe_delete_ids.append(pk_b)
self.console.print(table)
summary = f"Found [bold]{len(matches)}[/bold] matching pair(s)."
if do_delete:
summary += f" [yellow]{len(maybe_delete_ids)}[/yellow] document(s) will be deleted."
self.console.print(summary)
return maybe_delete_ids
def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0
if options["delete"]:
self.stdout.write(
self.style.WARNING(
"The command is configured to delete documents. Use with caution",
),
)
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
all_docs = Document.objects.all().order_by("id")
for first_doc in all_docs:
for second_doc in all_docs:
if first_doc.pk == second_doc.pk:
continue
if first_doc.content.strip() == "" or second_doc.content.strip() == "":
continue
doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
continue
checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
work_pkgs.append(_WorkPackage(first_doc, second_doc))
results: list[_WorkResult] = []
if self.process_count == 1:
for work in self.track(work_pkgs, description="Matching..."):
results.append(_process_and_match(work))
else: # pragma: no cover
for proc_result in self.process_parallel(
_process_and_match,
work_pkgs,
description="Matching...",
):
if proc_result.error:
self.console.print(
f"[red]Failed: {proc_result.error}[/red]",
)
elif proc_result.result is not None:
results.append(proc_result.result)
messages: list[str] = []
maybe_delete_ids: list[int] = []
for match_result in sorted(results):
if match_result.ratio >= opt_ratio:
messages.append(
self.style.NOTICE(
f"Document {match_result.doc_one_pk} fuzzy match"
f" to {match_result.doc_two_pk}"
f" (confidence {match_result.ratio:.3f})\n",
),
)
maybe_delete_ids.append(match_result.doc_two_pk)
if len(messages) == 0:
messages.append(self.style.SUCCESS("No matches found\n"))
self.stdout.writelines(messages)
if options["delete"]:
self.stdout.write(
self.style.NOTICE(
f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
self.console.print(
Panel(
"[bold yellow]WARNING:[/bold yellow] This run is configured to delete"
" documents. One document from each matched pair WILL BE PERMANENTLY DELETED.",
title="Delete Mode",
border_style="red",
),
)
Document.objects.filter(pk__in=maybe_delete_ids).delete()
# Load only the fields we need -- avoids fetching title, archive_checksum, etc.
slim_docs: list[tuple[int, str]] = list(
Document.objects.only("id", "content")
.order_by("id")
.values_list("id", "content"),
)
# combinations() generates each unique pair exactly once -- no checked_pairs set needed.
work_pkgs: list[_WorkPackage] = [
_WorkPackage(pk_a, ca, pk_b, cb, opt_ratio)
for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2)
if ca.strip() and cb.strip()
]
def _iter_matches():
if self.process_count == 1:
for work in self.track(work_pkgs, description="Matching..."):
result = _process_and_match(work)
if result.ratio >= opt_ratio:
yield result
else: # pragma: no cover
for proc_result in self.process_parallel(
_process_and_match,
work_pkgs,
description="Matching...",
):
if proc_result.error:
self.console.print(
f"[red]Failed: {proc_result.error}[/red]",
)
elif (
proc_result.result is not None
and proc_result.result.ratio >= opt_ratio
):
yield proc_result.result
matches = sorted(_iter_matches())
maybe_delete_ids = self._render_results(
matches,
opt_ratio=opt_ratio,
do_delete=options["delete"],
)
if options["delete"] and maybe_delete_ids:
confirmed = options["yes"]
if not confirmed:
self.console.print(
f"\nDelete [bold]{len(maybe_delete_ids)}[/bold] document(s)? "
"[bold]\\[y/N][/bold] ",
end="",
)
answer = input().strip().lower()
confirmed = answer in {"y", "yes"}
if confirmed:
self.console.print(
f"[red]Deleting {len(maybe_delete_ids)} document(s)...[/red]",
)
Document.objects.filter(pk__in=maybe_delete_ids).delete()
self.console.print("[green]Done.[/green]")
else:
self.console.print("[yellow]Deletion cancelled.[/yellow]")

View File

@@ -1,4 +1,5 @@
from io import StringIO
from unittest.mock import patch
import pytest
from django.core.management import CommandError
@@ -6,12 +7,11 @@ from django.core.management import call_command
from django.test import TestCase
from documents.models import Document
from documents.tests.factories import DocumentFactory
@pytest.mark.management
class TestFuzzyMatchCommand(TestCase):
MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
def call_command(self, *args, **kwargs):
stdout = StringIO()
stderr = StringIO()
@@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No matches found", stdout)
self.assertIn("No duplicate documents found", stdout)
def test_with_matches(self) -> None:
"""
@@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command("--processes", "1")
self.assertRegex(stdout, self.MSG_REGEX)
self.assertIn("Found 1 matching pair(s)", stdout)
def test_with_3_matches(self) -> None:
"""
@@ -142,10 +142,8 @@ class TestFuzzyMatchCommand(TestCase):
filename="final_test.pdf",
)
stdout, _ = self.call_command("--no-progress-bar", "--processes", "1")
lines = [x.strip() for x in stdout.splitlines() if x.strip()]
self.assertEqual(len(lines), 3)
for line in lines:
self.assertRegex(line, self.MSG_REGEX)
# 3 docs -> 3 unique pairs; summary confirms count and no duplication
self.assertIn("Found 3 matching pair(s)", stdout)
def test_document_deletion(self) -> None:
"""
@@ -186,22 +184,47 @@ class TestFuzzyMatchCommand(TestCase):
stdout, _ = self.call_command(
"--delete",
"--yes",
"--no-progress-bar",
"--processes",
"1",
)
self.assertIn(
"The command is configured to delete documents. Use with caution",
stdout,
)
self.assertRegex(stdout, self.MSG_REGEX)
self.assertIn("Deleting 1 documents based on ratio matches", stdout)
self.assertIn("Delete Mode", stdout)
self.assertIn("Found 1 matching pair(s)", stdout)
self.assertIn("Deleting 1 document(s)", stdout)
self.assertEqual(Document.objects.count(), 2)
self.assertIsNotNone(Document.objects.get(pk=1))
self.assertIsNotNone(Document.objects.get(pk=2))
def test_document_deletion_cancelled(self) -> None:
"""
GIVEN:
- 3 documents exist
- Document 1 to document 3 has a similarity over 85.0
WHEN:
- Command is called with --delete but user answers "n" at the prompt
THEN:
- No documents are deleted
"""
DocumentFactory(content="first document scanned by bob")
DocumentFactory(content="second document scanned by alice")
DocumentFactory(content="first document scanned by pete")
self.assertEqual(Document.objects.count(), 3)
with patch("builtins.input", return_value="n"):
stdout, _ = self.call_command(
"--delete",
"--no-progress-bar",
"--processes",
"1",
)
self.assertIn("Deletion cancelled", stdout)
self.assertEqual(Document.objects.count(), 3)
def test_empty_content(self) -> None:
"""
GIVEN:
@@ -226,4 +249,4 @@ class TestFuzzyMatchCommand(TestCase):
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertIn("No matches found", stdout)
self.assertIn("No duplicate documents found", stdout)