Feature: Document fuzzy match improvements (#12579)

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-17 01:14:55 +00:00 · 2026-04-15 20:59:43 +00:00
parent 2b4c1fe20d
commit 2fd1a1cf3a
2 changed files with 236 additions and 90 deletions
@@ -1,8 +1,12 @@
 import dataclasses
+from itertools import combinations
 from typing import Final

 import rapidfuzz
 from django.core.management import CommandError
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text

 from documents.management.commands.base import PaperlessCommand
 from documents.models import Document
@@ -10,8 +14,11 @@ from documents.models import Document

@dataclasses.dataclass(frozen=True, slots=True)
 class _WorkPackage:
-    first_doc: Document
-    second_doc: Document
+    pk_a: int
+    content_a: str
+    pk_b: int
+    content_b: str
+    score_cutoff: float


@dataclasses.dataclass(frozen=True, slots=True)
@@ -20,21 +27,20 @@ class _WorkResult:
    doc_two_pk: int
    ratio: float

-    def __lt__(self, other: "_WorkResult") -> bool:
-        return self.doc_one_pk < other.doc_one_pk
-

 def _process_and_match(work: _WorkPackage) -> _WorkResult:
    """
-    Does basic processing of document content, gets the basic ratio
-    and returns the result package.
+    Process document content and compute the fuzzy ratio.
+    score_cutoff lets rapidfuzz short-circuit, returning 0, when the score cannot reach the threshold.
    """
-    first_string = rapidfuzz.utils.default_process(work.first_doc.content)
-    second_string = rapidfuzz.utils.default_process(work.second_doc.content)
-
-    match = rapidfuzz.fuzz.ratio(first_string, second_string)
-
-    return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
+    first_string = rapidfuzz.utils.default_process(work.content_a)
+    second_string = rapidfuzz.utils.default_process(work.content_b)
+    ratio = rapidfuzz.fuzz.ratio(
+        first_string,
+        second_string,
+        score_cutoff=work.score_cutoff,
+    )
+    return _WorkResult(work.pk_a, work.pk_b, ratio)


 class Command(PaperlessCommand):
@@ -57,78 +63,169 @@ class Command(PaperlessCommand):
            action="store_true",
            help="If set, one document of matches above the ratio WILL BE DELETED",
        )
+        parser.add_argument(
+            "--yes",
+            default=False,
+            action="store_true",
+            help="Skip the confirmation prompt when used with --delete",
+        )
+
+    def _render_results(
+        self,
+        matches: list[_WorkResult],
+        *,
+        opt_ratio: float,
+        do_delete: bool,
+    ) -> list[int]:
+        """Render matches as a Rich table; return each pair's second PK (repeats possible) as deletion candidates."""
+        if not matches:
+            self.console.print(
+                Panel(
+                    "[green]No duplicate documents found.[/green]",
+                    title="Fuzzy Match",
+                    border_style="green",
+                ),
+            )
+            return []
+
+        # Fetch titles for matched documents in a single query.
+        all_pks = {pk for m in matches for pk in (m.doc_one_pk, m.doc_two_pk)}
+        titles: dict[int, str] = dict(
+            Document.objects.filter(pk__in=all_pks)
+            .only("pk", "title")
+            .values_list("pk", "title"),
+        )
+
+        table = Table(
+            title=f"Fuzzy Matches (threshold: {opt_ratio:.1f}%)",
+            show_lines=True,
+            title_style="bold",
+        )
+        table.add_column("#", style="dim", width=4, no_wrap=True)
+        table.add_column("Document A", min_width=24)
+        table.add_column("Document B", min_width=24)
+        table.add_column("Similarity", width=11, justify="right")
+
+        maybe_delete_ids: list[int] = []
+
+        for i, match_result in enumerate(matches, 1):
+            pk_a = match_result.doc_one_pk
+            pk_b = match_result.doc_two_pk
+            ratio = match_result.ratio
+
+            if ratio >= 97.0:
+                ratio_style = "bold red"
+            elif ratio >= 92.0:
+                ratio_style = "red"
+            elif ratio >= 88.0:
+                ratio_style = "yellow"
+            else:
+                ratio_style = "dim"
+
+            table.add_row(
+                str(i),
+                f"[dim]#{pk_a}[/dim] {titles.get(pk_a, 'Unknown')}",
+                f"[dim]#{pk_b}[/dim] {titles.get(pk_b, 'Unknown')}",
+                Text(f"{ratio:.1f}%", style=ratio_style),
+            )
+            maybe_delete_ids.append(pk_b)
+
+        self.console.print(table)
+
+        summary = f"Found [bold]{len(matches)}[/bold] matching pair(s)."
+        if do_delete:
+            summary += f" [yellow]{len(maybe_delete_ids)}[/yellow] document(s) will be deleted."
+        self.console.print(summary)
+
+        return maybe_delete_ids

    def handle(self, *args, **options):
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

-        if options["delete"]:
-            self.stdout.write(
-                self.style.WARNING(
-                    "The command is configured to delete documents.  Use with caution",
-                ),
-            )
-
        opt_ratio = options["ratio"]
-        checked_pairs: set[tuple[int, int]] = set()
-        work_pkgs: list[_WorkPackage] = []

        if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
            raise CommandError("The ratio must be between 0 and 100")

-        all_docs = Document.objects.all().order_by("id")
-
-        for first_doc in all_docs:
-            for second_doc in all_docs:
-                if first_doc.pk == second_doc.pk:
-                    continue
-                if first_doc.content.strip() == "" or second_doc.content.strip() == "":
-                    continue
-                doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
-                doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
-                if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
-                    continue
-                checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
-                work_pkgs.append(_WorkPackage(first_doc, second_doc))
-
-        results: list[_WorkResult] = []
-        if self.process_count == 1:
-            for work in self.track(work_pkgs, description="Matching..."):
-                results.append(_process_and_match(work))
-        else:  # pragma: no cover
-            for proc_result in self.process_parallel(
-                _process_and_match,
-                work_pkgs,
-                description="Matching...",
-            ):
-                if proc_result.error:
-                    self.console.print(
-                        f"[red]Failed: {proc_result.error}[/red]",
-                    )
-                elif proc_result.result is not None:
-                    results.append(proc_result.result)
-
-        messages: list[str] = []
-        maybe_delete_ids: list[int] = []
-        for match_result in sorted(results):
-            if match_result.ratio >= opt_ratio:
-                messages.append(
-                    self.style.NOTICE(
-                        f"Document {match_result.doc_one_pk} fuzzy match"
-                        f" to {match_result.doc_two_pk}"
-                        f" (confidence {match_result.ratio:.3f})\n",
-                    ),
-                )
-                maybe_delete_ids.append(match_result.doc_two_pk)
-
-        if len(messages) == 0:
-            messages.append(self.style.SUCCESS("No matches found\n"))
-        self.stdout.writelines(messages)
-
        if options["delete"]:
-            self.stdout.write(
-                self.style.NOTICE(
-                    f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
+            self.console.print(
+                Panel(
+                    "[bold yellow]WARNING:[/bold yellow] This run is configured to delete"
+                    " documents. One document from each matched pair WILL BE PERMANENTLY DELETED.",
+                    title="Delete Mode",
+                    border_style="red",
                ),
            )
-            Document.objects.filter(pk__in=maybe_delete_ids).delete()
+
+        # Load only the fields we need -- avoids fetching title, archive_checksum, etc.
+        slim_docs: list[tuple[int, str]] = list(
+            Document.objects.only("id", "content")
+            .order_by("id")
+            .values_list("id", "content"),
+        )
+
+        # combinations() generates each unique pair exactly once -- no checked_pairs set needed.
+        # The total (an upper bound: pairs with empty content are skipped) is computed up front so
+        # the single-process path need not materialise all n*(n-1)/2 pairs; the parallel path still does.
+        n = len(slim_docs)
+        total_pairs = n * (n - 1) // 2
+
+        def _work_gen():
+            for (pk_a, ca), (pk_b, cb) in combinations(slim_docs, 2):
+                if ca.strip() and cb.strip():
+                    yield _WorkPackage(pk_a, ca, pk_b, cb, opt_ratio)
+
+        def _iter_matches():
+            if self.process_count == 1:
+                for work in self.track(
+                    _work_gen(),
+                    description="Matching...",
+                    total=total_pairs,
+                ):
+                    result = _process_and_match(work)
+                    if result.ratio >= opt_ratio:
+                        yield result
+            else:  # pragma: no cover
+                work_pkgs = list(_work_gen())
+                for proc_result in self.process_parallel(
+                    _process_and_match,
+                    work_pkgs,
+                    description="Matching...",
+                ):
+                    if proc_result.error:
+                        self.console.print(
+                            f"[red]Failed: {proc_result.error}[/red]",
+                        )
+                    elif (
+                        proc_result.result is not None
+                        and proc_result.result.ratio >= opt_ratio
+                    ):
+                        yield proc_result.result
+
+        matches = sorted(_iter_matches(), key=lambda m: m.ratio, reverse=True)
+        maybe_delete_ids = self._render_results(
+            matches,
+            opt_ratio=opt_ratio,
+            do_delete=options["delete"],
+        )
+
+        if options["delete"] and maybe_delete_ids:
+            confirmed = options["yes"]
+            if not confirmed:
+                self.console.print(
+                    f"\nDelete [bold]{len(maybe_delete_ids)}[/bold] document(s)? "
+                    "[bold]\\[y/N][/bold] ",
+                    end="",
+                )
+                answer = input().strip().lower()
+                confirmed = answer in {"y", "yes"}
+
+            if confirmed:
+                self.console.print(
+                    f"[red]Deleting {len(maybe_delete_ids)} document(s)...[/red]",
+                )
+                Document.objects.filter(pk__in=maybe_delete_ids).delete()
+                self.console.print("[green]Done.[/green]")
+            else:
+                self.console.print("[yellow]Deletion cancelled.[/yellow]")
@@ -1,4 +1,5 @@
 from io import StringIO
+from unittest.mock import patch

 import pytest
 from django.core.management import CommandError
@@ -6,12 +7,11 @@ from django.core.management import call_command
 from django.test import TestCase

 from documents.models import Document
+from documents.tests.factories import DocumentFactory


@pytest.mark.management
 class TestFuzzyMatchCommand(TestCase):
-    MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
-
    def call_command(self, *args, **kwargs):
        stdout = StringIO()
        stderr = StringIO()
@@ -77,7 +77,7 @@ class TestFuzzyMatchCommand(TestCase):
            filename="other_test.pdf",
        )
        stdout, _ = self.call_command()
-        self.assertIn("No matches found", stdout)
+        self.assertIn("No duplicate documents found", stdout)

    def test_with_matches(self) -> None:
        """
@@ -106,7 +106,7 @@ class TestFuzzyMatchCommand(TestCase):
            filename="other_test.pdf",
        )
        stdout, _ = self.call_command("--processes", "1")
-        self.assertRegex(stdout, self.MSG_REGEX)
+        self.assertIn("Found 1 matching pair(s)", stdout)

    def test_with_3_matches(self) -> None:
        """
@@ -142,10 +142,8 @@ class TestFuzzyMatchCommand(TestCase):
            filename="final_test.pdf",
        )
        stdout, _ = self.call_command("--no-progress-bar", "--processes", "1")
-        lines = [x.strip() for x in stdout.splitlines() if x.strip()]
-        self.assertEqual(len(lines), 3)
-        for line in lines:
-            self.assertRegex(line, self.MSG_REGEX)
+        # 3 docs -> 3 unique pairs; summary confirms count and no duplication
+        self.assertIn("Found 3 matching pair(s)", stdout)

    def test_document_deletion(self) -> None:
        """
@@ -186,22 +184,47 @@ class TestFuzzyMatchCommand(TestCase):

        stdout, _ = self.call_command(
            "--delete",
+            "--yes",
            "--no-progress-bar",
            "--processes",
            "1",
        )

-        self.assertIn(
-            "The command is configured to delete documents.  Use with caution",
-            stdout,
-        )
-        self.assertRegex(stdout, self.MSG_REGEX)
-        self.assertIn("Deleting 1 documents based on ratio matches", stdout)
+        self.assertIn("Delete Mode", stdout)
+        self.assertIn("Found 1 matching pair(s)", stdout)
+        self.assertIn("Deleting 1 document(s)", stdout)

        self.assertEqual(Document.objects.count(), 2)
        self.assertIsNotNone(Document.objects.get(pk=1))
        self.assertIsNotNone(Document.objects.get(pk=2))

+    def test_document_deletion_cancelled(self) -> None:
+        """
+        GIVEN:
+            - 3 documents exist
+            - Document 1 to document 3 has a similarity over 85.0
+        WHEN:
+            - Command is called with --delete but user answers "n" at the prompt
+        THEN:
+            - No documents are deleted
+        """
+        DocumentFactory(content="first document scanned by bob")
+        DocumentFactory(content="second document scanned by alice")
+        DocumentFactory(content="first document scanned by pete")
+
+        self.assertEqual(Document.objects.count(), 3)
+
+        with patch("builtins.input", return_value="n"):
+            stdout, _ = self.call_command(
+                "--delete",
+                "--no-progress-bar",
+                "--processes",
+                "1",
+            )
+
+        self.assertIn("Deletion cancelled", stdout)
+        self.assertEqual(Document.objects.count(), 3)
+
    def test_empty_content(self) -> None:
        """
        GIVEN:
@@ -226,4 +249,30 @@ class TestFuzzyMatchCommand(TestCase):
            filename="other_test.pdf",
        )
        stdout, _ = self.call_command()
-        self.assertIn("No matches found", stdout)
+        self.assertIn("No duplicate documents found", stdout)
+
+
+@pytest.mark.management
+@pytest.mark.django_db
+@pytest.mark.parametrize(
+    ("content_a", "content_b"),
+    [
+        pytest.param("x" * 90 + "y" * 10, "x" * 100, id="yellow-90pct"),  # 88-92%
+        pytest.param("x" * 94 + "y" * 6, "x" * 100, id="red-94pct"),  # 92-97%
+        pytest.param("x" * 99 + "y", "x" * 100, id="bold-red-99pct"),  # ≥97%
+    ],
+)
+def test_similarity_color_band(content_a: str, content_b: str) -> None:
+    """Each parametrized case exercises one color branch in _render_results."""
+    DocumentFactory(content=content_a)
+    DocumentFactory(content=content_b)
+    stdout = StringIO()
+    call_command(
+        "document_fuzzy_match",
+        "--no-progress-bar",
+        "--processes",
+        "1",
+        stdout=stdout,
+        skip_checks=True,
+    )
+    assert "Found 1 matching pair(s)" in stdout.getvalue()