From bd86dca57e90501db1c0b952d456bbc470d9cb66 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 19 May 2026 13:52:04 -0700 Subject: [PATCH] Fix: Password removal source file location (#12830) Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> --- src/documents/bulk_edit.py | 13 ++++- src/documents/signals/handlers.py | 14 +++++- src/documents/tests/test_bulk_edit.py | 38 ++++++++++++++ src/documents/tests/test_workflows.py | 71 +++++++++++++-------------- src/documents/workflows/actions.py | 16 +++--- 5 files changed, 105 insertions(+), 47 deletions(-) diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py index de132326a..80a471016 100644 --- a/src/documents/bulk_edit.py +++ b/src/documents/bulk_edit.py @@ -35,6 +35,8 @@ from documents.versioning import get_latest_version_for_root from documents.versioning import get_root_document if TYPE_CHECKING: + from collections.abc import Mapping + from django.contrib.auth.models import User logger: logging.Logger = logging.getLogger("paperless.bulk_edit") @@ -882,6 +884,7 @@ def remove_password( source_mode: SourceMode = SourceModeChoices.LATEST_VERSION, user: User | None = None, trigger_source: PaperlessTask.TriggerSource = PaperlessTask.TriggerSource.WEB_UI, + source_paths_by_id: Mapping[int, Path] | None = None, ) -> Literal["OK"]: """ Remove password protection from PDF documents. @@ -893,9 +896,15 @@ def remove_password( pair = _resolve_root_and_source_doc(doc, source_mode=source_mode) try: logger.info( - f"Attempting password removal from document {doc_ids[0]}", + f"Attempting password removal from document {pair.root_doc.id}", ) - with pikepdf.open(pair.source_doc.source_path, password=password) as pdf: + # The caller may supply an explicit source path (e.g. the staged + # file during consumption, before source_path is populated). + source_path = (source_paths_by_id or {}).get( + doc.id, + pair.source_doc.source_path, + ) + with pikepdf.open(source_path, password=password) as pdf: filepath: Path = ( Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) / f"{pair.root_doc.id}_unprotected.pdf" diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 22b28ac98..6f7628f5c 100644 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -879,6 +879,11 @@ def run_workflows( ) return None + # Track whether the caller supplied original_file. When set explicitly (e.g. by + # run_workflows_added during consumption), it points at the staged file that has + # not yet been moved into its final storage location. This matters for password + # removal, which must read from the staged path rather than document.source_path. + caller_supplied_original_file = original_file is not None if original_file is None: original_file = ( document.source_path if not use_overrides else document.original_file @@ -956,7 +961,14 @@ def run_workflows( original_file, ) elif action.type == WorkflowAction.WorkflowActionType.PASSWORD_REMOVAL: - execute_password_removal_action(action, document, logging_group) + execute_password_removal_action( + action, + document, + logging_group, + source_file=( + original_file if caller_supplied_original_file else None + ), + ) elif action.type == WorkflowAction.WorkflowActionType.MOVE_TO_TRASH: has_move_to_trash_action = True diff --git a/src/documents/tests/test_bulk_edit.py b/src/documents/tests/test_bulk_edit.py index 1b1e7550c..526588c48 100644 --- a/src/documents/tests/test_bulk_edit.py +++ b/src/documents/tests/test_bulk_edit.py @@ -1480,6 +1480,44 @@ class TestPDFActions(DirectoriesMixin, TestCase): self.assertEqual(task_kwargs["input_doc"].root_document_id, doc.id) self.assertIsNotNone(task_kwargs["overrides"]) + @mock.patch("documents.bulk_edit.update_document_content_maybe_archive_file.delay") + @mock.patch("documents.tasks.consume_file.apply_async") + @mock.patch("documents.bulk_edit.tempfile.mkdtemp") + @mock.patch("pikepdf.open") + def test_remove_password_update_document_uses_source_paths( + self, + mock_open, + mock_mkdtemp, + mock_consume_delay, + mock_update_document, + ) -> None: + doc = self.doc1 + source_file = self.dirs.scratch_dir / "consumption-source.pdf" + source_file.write_bytes(b"protected pdf content") + temp_dir = self.dirs.scratch_dir / "remove-password-source-file" + temp_dir.mkdir(parents=True, exist_ok=True) + mock_mkdtemp.return_value = str(temp_dir) + + fake_pdf = mock.MagicMock() + + def save_side_effect(target_path): + Path(target_path).write_bytes(b"new pdf content") + + fake_pdf.save.side_effect = save_side_effect + mock_open.return_value.__enter__.return_value = fake_pdf + + result = bulk_edit.remove_password( + [doc.id], + password="secret", + update_document=True, + source_paths_by_id={doc.id: source_file}, + ) + + self.assertEqual(result, "OK") + mock_open.assert_called_once_with(source_file, password="secret") + mock_update_document.assert_not_called() + mock_consume_delay.assert_called_once() + @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf") @mock.patch("documents.tasks.consume_file.apply_async") @mock.patch("pikepdf.open") diff --git a/src/documents/tests/test_workflows.py b/src/documents/tests/test_workflows.py index 1bfe294d0..243f9609e 100644 --- a/src/documents/tests/test_workflows.py +++ b/src/documents/tests/test_workflows.py @@ -4185,12 +4185,14 @@ class TestWorkflows( password="wrong", update_document=True, user=doc.owner, + source_paths_by_id=None, ), mock.call( [doc.id], password="right", update_document=True, user=doc.owner, + source_paths_by_id=None, ), ], ) @@ -4301,11 +4303,11 @@ class TestWorkflows( title="Protected", ) - with self.captureOnCommitCallbacks(execute=True): - document_consumption_finished.send( - sender=self.__class__, - document=doc, - ) + document_consumption_finished.send( + sender=self.__class__, + document=doc, + original_file=original_file, + ) assert mock_remove_password.call_count == 2 mock_remove_password.assert_has_calls( @@ -4315,73 +4317,70 @@ class TestWorkflows( password="first", update_document=True, user=doc.owner, + source_paths_by_id={doc.id: original_file}, ), mock.call( [doc.id], password="second", update_document=True, user=doc.owner, + source_paths_by_id={doc.id: original_file}, ), ], ) # ensure handler disconnected after first run - with self.captureOnCommitCallbacks(execute=True): - document_consumption_finished.send( - sender=self.__class__, - document=doc, - ) + document_consumption_finished.send( + sender=self.__class__, + document=doc, + ) assert mock_remove_password.call_count == 2 @mock.patch("documents.bulk_edit.remove_password") - def test_password_removal_deferred_until_transaction_commit( + def test_password_removal_document_added_uses_original_file( self, mock_remove_password, ) -> None: """ GIVEN: - - Workflow password removal action triggered during consumption - - document_consumption_finished signal fires inside a transaction + - Workflow password removal action on a DOCUMENT_ADDED trigger + - run_workflows called with an explicit original_file (staged file + from the consumer, before the source path is populated) WHEN: - - Signal fires but transaction has not yet committed + - The workflow runs THEN: - - Password removal is not attempted until the transaction commits + - remove_password is called with source_paths_by_id pointing at the + staged file rather than the not-yet-existing source_path """ - from django.db import transaction - + doc = Document.objects.create( + title="Protected", + checksum="pw-checksum-added", + ) + trigger = WorkflowTrigger.objects.create( + type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED, + ) action = WorkflowAction.objects.create( type=WorkflowAction.WorkflowActionType.PASSWORD_REMOVAL, passwords=["secret"], ) + workflow = Workflow.objects.create(name="Password workflow added") + workflow.triggers.add(trigger) + workflow.actions.add(action) + + mock_remove_password.return_value = "OK" temp_dir = Path(tempfile.mkdtemp()) - original_file = temp_dir / "file.pdf" + original_file = temp_dir / "staged.pdf" original_file.write_bytes(b"pdf content") - consumable = ConsumableDocument( - source=DocumentSource.ApiUpload, - original_file=original_file, - ) - execute_password_removal_action(action, consumable, logging_group=None) - - doc = Document.objects.create( - checksum="pw-checksum-on-commit", - title="Protected", - ) - - with self.captureOnCommitCallbacks(execute=True): - with transaction.atomic(): - document_consumption_finished.send( - sender=self.__class__, - document=doc, - ) - mock_remove_password.assert_not_called() + run_workflows(trigger.type, doc, original_file=original_file) mock_remove_password.assert_called_once_with( [doc.id], password="secret", update_document=True, user=doc.owner, + source_paths_by_id={doc.id: original_file}, ) def test_workflow_trash_action_soft_delete(self) -> None: diff --git a/src/documents/workflows/actions.py b/src/documents/workflows/actions.py index 948642dcb..be0921747 100644 --- a/src/documents/workflows/actions.py +++ b/src/documents/workflows/actions.py @@ -4,7 +4,6 @@ from pathlib import Path from django.conf import settings from django.contrib.auth.models import User -from django.db import transaction from django.utils import timezone from documents.data_models import ConsumableDocument @@ -277,6 +276,7 @@ def execute_password_removal_action( action: WorkflowAction, document: Document | ConsumableDocument, logging_group, + source_file: Path | None = None, ) -> None: """ Try to remove a password from a document using the configured list. @@ -296,15 +296,14 @@ def execute_password_removal_action( # hook the consumption-finished signal to attempt password removal later def handler(sender, **kwargs): consumed_document: Document = kwargs.get("document") - document_consumption_finished.disconnect(handler) if consumed_document is not None: - transaction.on_commit( - lambda: execute_password_removal_action( - action, - consumed_document, - logging_group, - ), + execute_password_removal_action( + action, + consumed_document, + logging_group, + source_file=kwargs.get("original_file"), ) + document_consumption_finished.disconnect(handler) document_consumption_finished.connect(handler, weak=False) return @@ -319,6 +318,7 @@ def execute_password_removal_action( password=password, update_document=True, user=document.owner, + source_paths_by_id={document.id: source_file} if source_file else None, ) logger.info( "Unlocked document %s using workflow action %s",