Merge branch 'dev' into feature-archive-ocr-decoupling

This commit is contained in:
Trenton H
2026-03-27 08:35:25 -07:00
committed by GitHub
13 changed files with 325 additions and 35 deletions
+2 -2
View File
@@ -82,8 +82,8 @@ def sample_doc(
return DocumentFactory(
title="test",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
content="test content",
pk=1,
filename="0000001.pdf",
+1 -1
View File
@@ -60,7 +60,7 @@ class DocumentFactory(DjangoModelFactory):
model = Document
title = factory.Faker("sentence", nb_words=4)
checksum = factory.Faker("md5")
checksum = factory.Faker("sha256")
content = factory.Faker("paragraph")
correspondent = None
document_type = None
+8 -2
View File
@@ -261,8 +261,14 @@ class TestConsumer(
self.assertIsFile(document.archive_path)
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
self.assertEqual(
document.checksum,
"1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
)
self.assertEqual(
document.archive_checksum,
"706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
)
self.assertIsNotFile(filename)
@@ -63,8 +63,8 @@ class TestExportImport(
self.d1 = Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",
@@ -72,21 +72,21 @@ class TestExportImport(
)
self.d2 = Document.objects.create(
content="Content",
checksum="9c9691e51741c1f4f41a20896af31770",
checksum="550d1bae0f746d4f7c6be07054eb20cc2f11988a58ef64ceae45e98f85e92a5b",
title="wow2",
filename="0000002.pdf",
mime_type="application/pdf",
)
self.d3 = Document.objects.create(
content="Content",
checksum="d38d7ed02e988e072caf924e0f3fcb76",
checksum="f1ba6b7ff8548214a75adec228f5468a14fe187f445bc0b9485cbf1c35b15915",
title="wow2",
filename="0000003.pdf",
mime_type="application/pdf",
)
self.d4 = Document.objects.create(
content="Content",
checksum="82186aaa94f0b98697d704b90fd1c072",
checksum="a81b16b6b313cfd7e60eb7b12598d1343b58622b4030cfa19a2724a02e98db1b",
title="wow_dec",
filename="0000004.pdf",
mime_type="application/pdf",
@@ -239,7 +239,7 @@ class TestExportImport(
)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["checksum"])
# Generated field "content_length" should not be exported,
@@ -253,7 +253,7 @@ class TestExportImport(
self.assertIsFile(fname)
with Path(fname).open("rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
checksum = hashlib.sha256(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["archive_checksum"])
elif element["model"] == "documents.note":
@@ -277,8 +277,8 @@ class TestCommandImport(
Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
checksum="1093cf6e32adbd16b06969df09215d42c4a3a8938cc18b39455953f08d1ff2ab",
archive_checksum="706124ecde3c31616992fa979caed17a726b1c9ccdba70e82a4ff796cea97ccf",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",
@@ -0,0 +1,132 @@
import hashlib
import shutil
import tempfile
from pathlib import Path
from django.conf import settings
from django.db import connection
from django.test import override_settings
from documents.tests.utils import TestMigrations
def _sha256(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *data*."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
class TestSha256ChecksumDataMigration(TestMigrations):
    """recompute_checksums correctly updates document checksums from MD5 to SHA256.

    Five documents are created before the migration runs, each covering one
    combination of original/archive file being present, missing, or not
    configured.  The tests then check which checksum columns were rewritten
    with the SHA-256 of the file content and which were left as they were.
    """

    migrate_from = "0015_document_version_index_and_more"
    migrate_to = "0016_sha256_checksums"
    reset_sequences = True

    ORIGINAL_CONTENT = b"original file content for sha256 migration test"
    ARCHIVE_CONTENT = b"archive file content for sha256 migration test"

    def setUpBeforeMigration(self, apps) -> None:
        """Create temp storage directories and the pre-migration documents.

        Every resource acquired here is released through ``addCleanup``
        rather than ``tearDown``: unittest skips ``tearDown`` when ``setUp``
        raises, but it still runs registered cleanups.  That way a failure
        while preparing fixtures (or, presumably, while the base class applies
        the migration during setup -- confirm against ``TestMigrations``)
        cannot leave the settings override active or leak the directories.

        Args:
            apps: App registry used to look up the ``Document`` model to
                create the fixtures with.
        """
        self._originals_dir = Path(tempfile.mkdtemp())
        self.addCleanup(shutil.rmtree, self._originals_dir, ignore_errors=True)
        self._archive_dir = Path(tempfile.mkdtemp())
        self.addCleanup(shutil.rmtree, self._archive_dir, ignore_errors=True)

        self._settings_override = override_settings(
            ORIGINALS_DIR=self._originals_dir,
            ARCHIVE_DIR=self._archive_dir,
        )
        self._settings_override.enable()
        # Registered last so it runs first (cleanups are LIFO): settings are
        # restored before the directories they point at are removed.
        self.addCleanup(self._settings_override.disable)

        Document = apps.get_model("documents", "Document")

        # doc1: original file present, no archive
        (settings.ORIGINALS_DIR / "doc1.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc1_id = Document.objects.create(
            title="Doc 1",
            mime_type="text/plain",
            filename="doc1.txt",
            checksum="a" * 32,
        ).pk

        # doc2: original and archive both present
        (settings.ORIGINALS_DIR / "doc2.txt").write_bytes(self.ORIGINAL_CONTENT)
        (settings.ARCHIVE_DIR / "doc2.pdf").write_bytes(self.ARCHIVE_CONTENT)
        self.doc2_id = Document.objects.create(
            title="Doc 2",
            mime_type="text/plain",
            filename="doc2.txt",
            checksum="b" * 32,
            archive_filename="doc2.pdf",
            archive_checksum="c" * 32,
        ).pk

        # doc3: original file missing — checksum must stay unchanged
        self.doc3_id = Document.objects.create(
            title="Doc 3",
            mime_type="text/plain",
            filename="missing_original.txt",
            checksum="d" * 32,
        ).pk

        # doc4: original present, archive_filename set but archive file missing
        (settings.ORIGINALS_DIR / "doc4.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc4_id = Document.objects.create(
            title="Doc 4",
            mime_type="text/plain",
            filename="doc4.txt",
            checksum="e" * 32,
            archive_filename="missing_archive.pdf",
            archive_checksum="f" * 32,
        ).pk

        # doc5: original present, archive_filename is None — archive_checksum must stay null
        (settings.ORIGINALS_DIR / "doc5.txt").write_bytes(self.ORIGINAL_CONTENT)
        self.doc5_id = Document.objects.create(
            title="Doc 5",
            mime_type="text/plain",
            filename="doc5.txt",
            checksum="0" * 32,
            archive_filename=None,
            archive_checksum=None,
        ).pk

    def _fixture_teardown(self) -> None:
        """Flush fixtures, then clear the SQLite sequence entry for documents."""
        super()._fixture_teardown()
        # Django's SQLite backend returns [] from sequence_reset_sql(), so
        # reset_sequences=True flushes rows but never clears sqlite_sequence.
        # Explicitly delete the entry so subsequent tests start from pk=1.
        if connection.vendor == "sqlite":
            with connection.cursor() as cursor:
                cursor.execute(
                    "DELETE FROM sqlite_sequence WHERE name='documents_document'",
                )

    def _migrated_document(self, pk):
        """Fetch the document with primary key *pk* via the ``self.apps`` registry."""
        Document = self.apps.get_model("documents", "Document")
        return Document.objects.get(pk=pk)

    def test_original_checksum_updated_to_sha256_when_file_exists(self) -> None:
        doc = self._migrated_document(self.doc1_id)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))

    def test_both_checksums_updated_when_original_and_archive_exist(self) -> None:
        doc = self._migrated_document(self.doc2_id)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
        self.assertEqual(doc.archive_checksum, _sha256(self.ARCHIVE_CONTENT))

    def test_checksum_unchanged_when_original_file_missing(self) -> None:
        doc = self._migrated_document(self.doc3_id)
        self.assertEqual(doc.checksum, "d" * 32)

    def test_archive_checksum_unchanged_when_archive_file_missing(self) -> None:
        doc = self._migrated_document(self.doc4_id)
        # Original was updated (file exists)
        self.assertEqual(doc.checksum, _sha256(self.ORIGINAL_CONTENT))
        # Archive was not updated (file missing)
        self.assertEqual(doc.archive_checksum, "f" * 32)

    def test_archive_checksum_stays_null_when_no_archive_filename(self) -> None:
        doc = self._migrated_document(self.doc5_id)
        self.assertIsNone(doc.archive_checksum)