diff --git a/src/documents/management/commands/document_fuzzy_match.py b/src/documents/management/commands/document_fuzzy_match.py index 60d00f9cc..d5f9b2ab7 100644 --- a/src/documents/management/commands/document_fuzzy_match.py +++ b/src/documents/management/commands/document_fuzzy_match.py @@ -1,8 +1,12 @@ import dataclasses +from itertools import combinations from typing import Final import rapidfuzz from django.core.management import CommandError +from rich.panel import Panel +from rich.table import Table +from rich.text import Text from documents.management.commands.base import PaperlessCommand from documents.models import Document @@ -10,8 +14,11 @@ from documents.models import Document @dataclasses.dataclass(frozen=True, slots=True) class _WorkPackage: - first_doc: Document - second_doc: Document + pk_a: int + content_a: str + pk_b: int + content_b: str + score_cutoff: float @dataclasses.dataclass(frozen=True, slots=True) @@ -20,21 +27,20 @@ class _WorkResult: doc_two_pk: int ratio: float - def __lt__(self, other: "_WorkResult") -> bool: - return self.doc_one_pk < other.doc_one_pk - def _process_and_match(work: _WorkPackage) -> _WorkResult: """ - Does basic processing of document content, gets the basic ratio - and returns the result package. + Process document content and compute the fuzzy ratio. + score_cutoff lets rapidfuzz short-circuit when the score cannot reach the threshold. """ - first_string = rapidfuzz.utils.default_process(work.first_doc.content) - second_string = rapidfuzz.utils.default_process(work.second_doc.content) - - match = rapidfuzz.fuzz.ratio(first_string, second_string) - - return _WorkResult(work.first_doc.pk, work.second_doc.pk, match) + first_string = rapidfuzz.utils.default_process(work.content_a) + second_string = rapidfuzz.utils.default_process(work.content_b) + ratio = rapidfuzz.fuzz.ratio( + first_string, + second_string, + score_cutoff=work.score_cutoff, + ) + return _WorkResult(work.pk_a, work.pk_b, ratio) class Command(PaperlessCommand): @@ -57,78 +63,169 @@ class Command(PaperlessCommand): action="store_true", help="If set, one document of matches above the ratio WILL BE DELETED", ) + parser.add_argument( + "--yes", + default=False, + action="store_true", + help="Skip the confirmation prompt when used with --delete", + ) + + def _render_results( + self, + matches: list[_WorkResult], + *, + opt_ratio: float, + do_delete: bool, + ) -> list[int]: + """Render match results as a Rich table. Returns list of PKs to delete.""" + if not matches: + self.console.print( + Panel( + "[green]No duplicate documents found.[/green]", + title="Fuzzy Match", + border_style="green", + ), + ) + return [] + + # Fetch titles for matched documents in a single query. + all_pks = {pk for m in matches for pk in (m.doc_one_pk, m.doc_two_pk)} + titles: dict[int, str] = dict( + Document.objects.filter(pk__in=all_pks) + .only("pk", "title") + .values_list("pk", "title"), + ) + + table = Table( + title=f"Fuzzy Matches (threshold: {opt_ratio:.1f}%)", + show_lines=True, + title_style="bold", + ) + table.add_column("#", style="dim", width=4, no_wrap=True) + table.add_column("Document A", min_width=24) + table.add_column("Document B", min_width=24) + table.add_column("Similarity", width=11, justify="right") + + maybe_delete_ids: list[int] = [] + + for i, match_result in enumerate(matches, 1): + pk_a = match_result.doc_one_pk + pk_b = match_result.doc_two_pk + ratio = match_result.ratio + + if ratio >= 97.0: + ratio_style = "bold red" + elif ratio >= 92.0: + ratio_style = "red" + elif ratio >= 88.0: + ratio_style = "yellow" + else: + ratio_style = "dim" + + table.add_row( + str(i), + f"[dim]#{pk_a}[/dim] {titles.get(pk_a, 'Unknown')}", + f"[dim]#{pk_b}[/dim] {titles.get(pk_b, 'Unknown')}", + Text(f"{ratio:.1f}%", style=ratio_style), + ) + maybe_delete_ids.append(pk_b) + + self.console.print(table) + + summary = f"Found [bold]{len(matches)}[/bold] matching pair(s)." + if do_delete: + summary += f" [yellow]{len(maybe_delete_ids)}[/yellow] document(s) will be deleted." + self.console.print(summary) + + return maybe_delete_ids def handle(self, *args, **options): RATIO_MIN: Final[float] = 0.0 RATIO_MAX: Final[float] = 100.0 - if options["delete"]: - self.stdout.write( - self.style.WARNING( - "The command is configured to delete documents. Use with caution", - ), - ) - opt_ratio = options["ratio"] - checked_pairs: set[tuple[int, int]] = set() - work_pkgs: list[_WorkPackage] = [] if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX: raise CommandError("The ratio must be between 0 and 100") - all_docs = Document.objects.all().order_by("id") - - for first_doc in all_docs: - for second_doc in all_docs: - if first_doc.pk == second_doc.pk: - continue - if first_doc.content.strip() == "" or second_doc.content.strip() == "": - continue - doc_1_to_doc_2 = (first_doc.pk, second_doc.pk) - doc_2_to_doc_1 = doc_1_to_doc_2[::-1] - if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs: - continue - checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1]) - work_pkgs.append(_WorkPackage(first_doc, second_doc)) - - results: list[_WorkResult] = [] - if self.process_count == 1: - for work in self.track(work_pkgs, description="Matching..."): - results.append(_process_and_match(work)) - else: # pragma: no cover - for proc_result in self.process_parallel( - _process_and_match, - work_pkgs, - description="Matching...", - ): - if proc_result.error: - self.console.print( - f"[red]Failed: {proc_result.error}[/red]", - ) - elif proc_result.result is not None: - results.append(proc_result.result) - - messages: list[str] = [] - maybe_delete_ids: list[int] = [] - for match_result in sorted(results): - if match_result.ratio >= opt_ratio: - messages.append( - self.style.NOTICE( - f"Document {match_result.doc_one_pk} fuzzy match" - f" to {match_result.doc_two_pk}" - f" (confidence {match_result.ratio:.3f})\n", - ), - ) - maybe_delete_ids.append(match_result.doc_two_pk) - - if len(messages) == 0: - messages.append(self.style.SUCCESS("No matches found\n")) - self.stdout.writelines(messages) - if options["delete"]: - self.stdout.write( - self.style.NOTICE( - f"Deleting {len(maybe_delete_ids)} documents based on ratio matches", + self.console.print( + Panel( + "[bold yellow]WARNING:[/bold yellow] This run is configured to delete" + " documents. One document from each matched pair WILL BE PERMANENTLY DELETED.", + title="Delete Mode", + border_style="red", ), ) - Document.objects.filter(pk__in=maybe_delete_ids).delete() + + # Load only the fields we need -- avoids fetching title, archive_checksum, etc. + slim_docs: list[tuple[int, str]] = list( + Document.objects.only("id", "content") + .order_by("id") + .values_list("id", "content"), + ) + + # combinations() generates each unique pair exactly once -- no checked_pairs set needed. + # The total is computed cheaply so the progress bar can start immediately without + # materialising all pairs up front (n*(n-1)/2 can be hundreds of thousands). + n = len(slim_docs) + total_pairs = n * (n - 1) // 2 + + def _work_gen(): + for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2): + if ca.strip() and cb.strip(): + yield _WorkPackage(pk_a, ca, pk_b, cb, opt_ratio) + + def _iter_matches(): + if self.process_count == 1: + for work in self.track( + _work_gen(), + description="Matching...", + total=total_pairs, + ): + result = _process_and_match(work) + if result.ratio >= opt_ratio: + yield result + else: # pragma: no cover + work_pkgs = list(_work_gen()) + for proc_result in self.process_parallel( + _process_and_match, + work_pkgs, + description="Matching...", + ): + if proc_result.error: + self.console.print( + f"[red]Failed: {proc_result.error}[/red]", + ) + elif ( + proc_result.result is not None + and proc_result.result.ratio >= opt_ratio + ): + yield proc_result.result + + matches = sorted(_iter_matches(), key=lambda m: m.ratio, reverse=True) + maybe_delete_ids = self._render_results( + matches, + opt_ratio=opt_ratio, + do_delete=options["delete"], + ) + + if options["delete"] and maybe_delete_ids: + confirmed = options["yes"] + if not confirmed: + self.console.print( + f"\nDelete [bold]{len(maybe_delete_ids)}[/bold] document(s)? " + "[bold]\\[y/N][/bold] ", + end="", + ) + answer = input().strip().lower() + confirmed = answer in {"y", "yes"} + + if confirmed: + self.console.print( + f"[red]Deleting {len(maybe_delete_ids)} document(s)...[/red]", + ) + Document.objects.filter(pk__in=maybe_delete_ids).delete() + self.console.print("[green]Done.[/green]") + else: + self.console.print("[yellow]Deletion cancelled.[/yellow]") diff --git a/src/documents/tests/test_management_fuzzy.py b/src/documents/tests/test_management_fuzzy.py index 7c4acabec..fcd0a039a 100644 --- a/src/documents/tests/test_management_fuzzy.py +++ b/src/documents/tests/test_management_fuzzy.py @@ -1,4 +1,5 @@ from io import StringIO +from unittest.mock import patch import pytest from django.core.management import CommandError @@ -6,12 +7,11 @@ from django.core.management import call_command from django.test import TestCase from documents.models import Document +from documents.tests.factories import DocumentFactory @pytest.mark.management class TestFuzzyMatchCommand(TestCase): - MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)" - def call_command(self, *args, **kwargs): stdout = StringIO() stderr = StringIO() @@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase): filename="other_test.pdf", ) stdout, _ = self.call_command() - self.assertIn("No matches found", stdout) + self.assertIn("No duplicate documents found", stdout) def test_with_matches(self) -> None: """ @@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase): filename="other_test.pdf", ) stdout, _ = self.call_command("--processes", "1") - self.assertRegex(stdout, self.MSG_REGEX) + self.assertIn("Found 1 matching pair(s)", stdout) def test_with_3_matches(self) -> None: """ @@ -142,10 +142,8 @@ class TestFuzzyMatchCommand(TestCase): filename="final_test.pdf", ) stdout, _ = self.call_command("--no-progress-bar", "--processes", "1") - lines = [x.strip() for x in stdout.splitlines() if x.strip()] - self.assertEqual(len(lines), 3) - for line in lines: - self.assertRegex(line, self.MSG_REGEX) + # 3 docs -> 3 unique pairs; summary confirms count and no duplication + self.assertIn("Found 3 matching pair(s)", stdout) def test_document_deletion(self) -> None: """ @@ -186,22 +184,47 @@ class TestFuzzyMatchCommand(TestCase): stdout, _ = self.call_command( "--delete", + "--yes", "--no-progress-bar", "--processes", "1", ) - self.assertIn( - "The command is configured to delete documents. Use with caution", - stdout, - ) - self.assertRegex(stdout, self.MSG_REGEX) - self.assertIn("Deleting 1 documents based on ratio matches", stdout) + self.assertIn("Delete Mode", stdout) + self.assertIn("Found 1 matching pair(s)", stdout) + self.assertIn("Deleting 1 document(s)", stdout) self.assertEqual(Document.objects.count(), 2) self.assertIsNotNone(Document.objects.get(pk=1)) self.assertIsNotNone(Document.objects.get(pk=2)) + def test_document_deletion_cancelled(self) -> None: + """ + GIVEN: + - 3 documents exist + - Document 1 to document 3 has a similarity over 85.0 + WHEN: + - Command is called with --delete but user answers "n" at the prompt + THEN: + - No documents are deleted + """ + DocumentFactory(content="first document scanned by bob") + DocumentFactory(content="second document scanned by alice") + DocumentFactory(content="first document scanned by pete") + + self.assertEqual(Document.objects.count(), 3) + + with patch("builtins.input", return_value="n"): + stdout, _ = self.call_command( + "--delete", + "--no-progress-bar", + "--processes", + "1", + ) + + self.assertIn("Deletion cancelled", stdout) + self.assertEqual(Document.objects.count(), 3) + def test_empty_content(self) -> None: """ GIVEN: @@ -226,4 +249,30 @@ class TestFuzzyMatchCommand(TestCase): filename="other_test.pdf", ) stdout, _ = self.call_command() - self.assertIn("No matches found", stdout) + self.assertIn("No duplicate documents found", stdout) + + +@pytest.mark.management +@pytest.mark.django_db +@pytest.mark.parametrize( + ("content_a", "content_b"), + [ + pytest.param("x" * 90 + "y" * 10, "x" * 100, id="yellow-90pct"), # 88-92% + pytest.param("x" * 94 + "y" * 6, "x" * 100, id="red-94pct"), # 92-97% + pytest.param("x" * 99 + "y", "x" * 100, id="bold-red-99pct"), # ≥97% + ], +) +def test_similarity_color_band(content_a: str, content_b: str) -> None: + """Each parametrized case exercises one color branch in _render_results.""" + DocumentFactory(content=content_a) + DocumentFactory(content=content_b) + stdout = StringIO() + call_command( + "document_fuzzy_match", + "--no-progress-bar", + "--processes", + "1", + stdout=stdout, + skip_checks=True, + ) + assert "Found 1 matching pair(s)" in stdout.getvalue()