From 2fbf97590348c790927281bcb24bccaf8c022a33 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:09:33 -0700 Subject: [PATCH] Experiments with using magika instead of magic for mime detection --- pyproject.toml | 2 +- src/documents/consumer.py | 6 +-- src/documents/data_models.py | 5 ++- src/documents/serialisers.py | 5 ++- src/documents/tests/test_bulk_edit.py | 25 ++++++++++--- src/documents/tests/test_consumer.py | 37 +++++++++---------- src/documents/views.py | 4 +- src/paperless/mime_detection.py | 13 +++++++ src/paperless/serialisers.py | 4 +- .../parsers/test_convert_image_to_pdfa.py | 8 ++-- src/paperless_mail/mail.py | 17 ++++----- src/paperless_mail/tests/test_mail.py | 13 +++---- 12 files changed, 80 insertions(+), 59 deletions(-) create mode 100644 src/paperless/mime_detection.py diff --git a/pyproject.toml b/pyproject.toml index 19dfe3fdc..cfadade60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ dependencies = [ "llama-index-llms-ollama>=0.9.1", "llama-index-llms-openai>=0.6.13", "llama-index-vector-stores-faiss>=0.5.2", + "magika>=1.0.2", "nltk~=3.9.1", "ocrmypdf~=17.4.0", "openai>=1.76", @@ -66,7 +67,6 @@ dependencies = [ "python-dotenv~=1.2.1", "python-gnupg~=0.5.4", "python-ipware~=3.0.0", - "python-magic~=0.4.27", "rapidfuzz~=3.14.0", "redis[hiredis]~=5.2.1", "regex>=2025.9.18", diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 390ce3e66..9fde76299 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -8,7 +8,6 @@ from pathlib import Path from typing import TYPE_CHECKING from typing import Final -import magic from django.conf import settings from django.contrib.auth.models import User from django.db import transaction @@ -52,6 +51,7 @@ from documents.utils import compute_checksum from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess 
+from paperless import mime_detection from paperless.config import OcrConfig from paperless.models import ArchiveFileGenerationChoices from paperless.parsers import ParserContext @@ -424,7 +424,7 @@ class ConsumerPlugin( # Determine the parser class. - mime_type = magic.from_file(self.working_copy, mime=True) + mime_type = mime_detection.from_file(self.working_copy) self.log.debug(f"Detected mime type: {mime_type}") @@ -446,7 +446,7 @@ class ConsumerPlugin( ], logger=self.log, ) - mime_type = magic.from_file(self.working_copy, mime=True) + mime_type = mime_detection.from_file(self.working_copy) self.log.debug(f"Detected mime type after qpdf: {mime_type}") # Save the original file for later self.unmodified_original = ( diff --git a/src/documents/data_models.py b/src/documents/data_models.py index 6d9e3a187..0cfa5688a 100644 --- a/src/documents/data_models.py +++ b/src/documents/data_models.py @@ -4,10 +4,11 @@ from enum import IntEnum from pathlib import Path from typing import TypedDict -import magic from guardian.shortcuts import get_groups_with_perms from guardian.shortcuts import get_users_with_perms +from paperless import mime_detection + @dataclasses.dataclass class DocumentMetadataOverrides: @@ -184,7 +185,7 @@ class ConsumableDocument: # Get the file type once at init # Note this function isn't called when the object is unpickled - self.mime_type = magic.from_file(self.original_file, mime=True) + self.mime_type = mime_detection.from_file(self.original_file) class ConsumeFileDuplicateResult(TypedDict): diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index e3037eeae..77a94edd8 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -11,7 +11,6 @@ from typing import Any from typing import Literal from typing import TypedDict -import magic from django.conf import settings from django.contrib.auth.models import Group from django.contrib.auth.models import User @@ -49,6 +48,8 @@ from rest_framework.exceptions import 
PermissionDenied from rest_framework.fields import SerializerMethodField from rest_framework.filters import OrderingFilter +from paperless import mime_detection + if settings.AUDIT_LOG_ENABLED: from auditlog.context import set_actor @@ -2159,7 +2160,7 @@ class PostDocumentSerializer(serializers.Serializer): def validate_document(self, document): document_data = document.file.read() - mime_type = magic.from_buffer(document_data, mime=True) + mime_type = mime_detection.from_buffer(document_data) if not is_mime_type_supported(mime_type): if ( diff --git a/src/documents/tests/test_bulk_edit.py b/src/documents/tests/test_bulk_edit.py index 0c44157a5..9537300c1 100644 --- a/src/documents/tests/test_bulk_edit.py +++ b/src/documents/tests/test_bulk_edit.py @@ -1097,7 +1097,10 @@ class TestPDFActions(DirectoriesMixin, TestCase): self.assertIsNotNone(task_kwargs["overrides"]) self.assertEqual(result, "OK") - @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf") + @mock.patch( + "documents.data_models.mime_detection.from_file", + return_value="application/pdf", + ) @mock.patch("documents.tasks.consume_file.apply_async") @mock.patch("pikepdf.open") def test_rotate_explicit_selection_uses_root_source_when_root_selected( @@ -1127,7 +1130,10 @@ class TestPDFActions(DirectoriesMixin, TestCase): @mock.patch("documents.tasks.consume_file.apply_async") @mock.patch("pikepdf.Pdf.save") - @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf") + @mock.patch( + "documents.data_models.mime_detection.from_file", + return_value="application/pdf", + ) def test_delete_pages(self, mock_magic, mock_pdf_save, mock_consume_delay): """ GIVEN: @@ -1151,7 +1157,10 @@ class TestPDFActions(DirectoriesMixin, TestCase): self.assertIsNotNone(task_kwargs["overrides"]) self.assertEqual(result, "OK") - @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf") + @mock.patch( + 
"documents.data_models.mime_detection.from_file", + return_value="application/pdf", + ) @mock.patch("documents.tasks.consume_file.apply_async") @mock.patch("pikepdf.open") def test_delete_pages_explicit_selection_uses_root_source_when_root_selected( @@ -1328,7 +1337,10 @@ class TestPDFActions(DirectoriesMixin, TestCase): ) self.assertIsNotNone(task_kwargs["overrides"]) - @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf") + @mock.patch( + "documents.data_models.mime_detection.from_file", + return_value="application/pdf", + ) @mock.patch("documents.tasks.consume_file.apply_async") @mock.patch("pikepdf.new") @mock.patch("pikepdf.open") @@ -1482,7 +1494,10 @@ class TestPDFActions(DirectoriesMixin, TestCase): self.assertEqual(task_kwargs["input_doc"].root_document_id, doc.id) self.assertIsNotNone(task_kwargs["overrides"]) - @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf") + @mock.patch( + "documents.data_models.mime_detection.from_file", + return_value="application/pdf", + ) @mock.patch("documents.tasks.consume_file.apply_async") @mock.patch("pikepdf.open") def test_remove_password_explicit_selection_uses_root_source_when_root_selected( diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 0ea714fc1..32075a6a3 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -140,28 +140,25 @@ class FaultyGenericExceptionParser(_BaseNewStyleParser): raise Exception("Generic exception.") -def fake_magic_from_file(file, *, mime=False): # NOSONAR - if mime: - filepath = Path(file) - if filepath.name.startswith("invalid_pdf"): - return "application/octet-stream" - if filepath.name.startswith("valid_pdf"): - return "application/pdf" - if filepath.suffix == ".pdf": - return "application/pdf" - elif filepath.suffix == ".png": - return "image/png" - elif filepath.suffix == ".webp": - return "image/webp" - elif filepath.suffix == ".eml": - 
return "message/rfc822" - else: - return "unknown" +def fake_magic_from_file(file): # NOSONAR + filepath = Path(file) + if filepath.name.startswith("invalid_pdf"): + return "application/octet-stream" + if filepath.name.startswith("valid_pdf"): + return "application/pdf" + if filepath.suffix == ".pdf": + return "application/pdf" + elif filepath.suffix == ".png": + return "image/png" + elif filepath.suffix == ".webp": + return "image/webp" + elif filepath.suffix == ".eml": + return "message/rfc822" else: - return "A verbose string that describes the contents of the file" + return "unknown" -@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) +@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file) class TestConsumer( DirectoriesMixin, FileSystemAssertsMixin, @@ -1146,7 +1143,7 @@ class TestConsumer( ) -@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) +@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file) class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase): def setUp(self) -> None: super().setUp() diff --git a/src/documents/views.py b/src/documents/views.py index 217550634..483fc22a2 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -20,7 +20,6 @@ from urllib.parse import quote from urllib.parse import urlparse import httpx -import magic import pathvalidate from django.conf import settings from django.contrib.auth.models import Group @@ -226,6 +225,7 @@ from documents.versioning import get_latest_version_for_root from documents.versioning import get_request_version_param from documents.versioning import get_root_document from documents.versioning import resolve_requested_version_for_root +from paperless import mime_detection from paperless import version from paperless.celery import app as celery_app from paperless.config import AIConfig @@ -4896,7 +4896,7 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> 
FileRespons raise Http404("No logo configured") path = app_logo.path - content_type = magic.from_file(path, mime=True) or "application/octet-stream" + content_type = mime_detection.from_file(path) or "application/octet-stream" return FileResponse( app_logo.open("rb"), diff --git a/src/paperless/mime_detection.py b/src/paperless/mime_detection.py new file mode 100644 index 000000000..4014fe75f --- /dev/null +++ b/src/paperless/mime_detection.py @@ -0,0 +1,13 @@ +from pathlib import Path + +from magika import Magika + +_magika = Magika() + + +def from_file(path: str | Path) -> str: + return _magika.identify_path(path).output.mime_type + + +def from_buffer(data: bytes) -> str: + return _magika.identify_bytes(data).output.mime_type diff --git a/src/paperless/serialisers.py b/src/paperless/serialisers.py index 92676df4e..c50d05814 100644 --- a/src/paperless/serialisers.py +++ b/src/paperless/serialisers.py @@ -1,7 +1,6 @@ import logging from io import BytesIO -import magic from allauth.mfa.adapter import get_adapter as get_mfa_adapter from allauth.mfa.models import Authenticator from allauth.mfa.totp.internal.auth import TOTP @@ -18,6 +17,7 @@ from PIL import Image from rest_framework import serializers from rest_framework.authtoken.serializers import AuthTokenSerializer +from paperless import mime_detection from paperless.models import ApplicationConfiguration from paperless.network import validate_outbound_http_url from paperless.validators import reject_dangerous_svg @@ -263,7 +263,7 @@ class ApplicationConfigurationSerializer( jpg/png/gif/svg. 
""" if file: - mime_type = magic.from_buffer(file.read(2048), mime=True) + mime_type = mime_detection.from_buffer(file.read(2048)) if mime_type == "image/svg+xml": reject_dangerous_svg(file) diff --git a/src/paperless/tests/parsers/test_convert_image_to_pdfa.py b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py index 615900a25..3aa76f877 100644 --- a/src/paperless/tests/parsers/test_convert_image_to_pdfa.py +++ b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py @@ -15,11 +15,11 @@ from pathlib import Path from typing import TYPE_CHECKING import img2pdf -import magic import pikepdf import pytest from documents.parsers import ParseError +from paperless import mime_detection if TYPE_CHECKING: from pytest_mock import MockerFixture @@ -43,7 +43,7 @@ class TestConvertImageToPdfa: result = tesseract_parser._convert_image_to_pdfa(simple_png_file) assert result.exists() - assert magic.from_file(str(result), mime=True) == "application/pdf" + assert mime_detection.from_file(result) == "application/pdf" def test_output_path_is_archive_pdf_in_tempdir( self, @@ -92,7 +92,7 @@ class TestConvertImageToPdfa: result = tesseract_parser._convert_image_to_pdfa(simple_png_file) assert result.exists() - assert magic.from_file(str(result), mime=True) == "application/pdf" + assert mime_detection.from_file(result) == "application/pdf" def test_image_dpi_setting_applies_fixed_dpi_layout( self, @@ -116,7 +116,7 @@ class TestConvertImageToPdfa: result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file) spy.assert_called_once_with((150, 150)) - assert magic.from_file(str(result), mime=True) == "application/pdf" + assert mime_detection.from_file(result) == "application/pdf" def test_no_image_dpi_setting_skips_fixed_dpi_layout( self, diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index d551cc8cd..cb8b9a56c 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -10,7 +10,6 @@ from fnmatch import fnmatch from pathlib import 
Path from typing import TYPE_CHECKING -import magic import pathvalidate from celery import chord from celery import shared_task @@ -40,6 +39,7 @@ from documents.models import Correspondent from documents.models import PaperlessTask from documents.parsers import is_mime_type_supported from documents.tasks import consume_file +from paperless import mime_detection from paperless.network import is_public_ip from paperless.network import resolve_hostname_ips from paperless_mail.models import MailAccount @@ -848,7 +848,7 @@ class MailAccountHandler(LoggingMixin): # don't trust the content type of the attachment. Could be # generic application/octet-stream. - mime_type = magic.from_buffer(att.payload, mime=True) + mime_type = mime_detection.from_buffer(att.payload) if is_mime_type_supported(mime_type): self.log.info( @@ -954,14 +954,11 @@ class MailAccountHandler(LoggingMixin): ) with Path(temp_filename).open("wb") as f: # Move "From"-header to beginning of file - # TODO: This ugly workaround is needed because the parser is - # chosen only by the mime_type detected via magic - # (see documents/consumer.py "mime_type = magic.from_file") - # Unfortunately magic sometimes fails to detect the mime - # type of .eml files correctly as message/rfc822 and instead - # detects text/plain. - # This also effects direct file consumption of .eml files - # which are not treated with this workaround. + # TODO: This workaround may no longer be needed with Magika, + # which has better text-format detection than libmagic. + # Previously libmagic would misidentify .eml files as text/plain + # instead of message/rfc822. Verify and remove if Magika handles + # it correctly. 
from_element = None for i, header in enumerate(message.obj._headers): if header[0] == "From": diff --git a/src/paperless_mail/tests/test_mail.py b/src/paperless_mail/tests/test_mail.py index 48e066646..991ae2877 100644 --- a/src/paperless_mail/tests/test_mail.py +++ b/src/paperless_mail/tests/test_mail.py @@ -192,14 +192,11 @@ class BogusMailBox(AbstractContextManager): raise Exception -def fake_magic_from_buffer(buffer, *, mime=False): - if mime: - if "PDF" in str(buffer): - return "application/pdf" - else: - return "unknown/type" +def fake_magic_from_buffer(buffer): + if "PDF" in str(buffer): + return "application/pdf" else: - return "Some verbose file description" + return "unknown/type" class MessageBuilder: @@ -408,7 +405,7 @@ def assert_eventually_equals( raise AssertionError(f"Expected {expected_value}, but got {actual}") -@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer) +@mock.patch("paperless_mail.mail.mime_detection.from_buffer", fake_magic_from_buffer) class TestMail( DirectoriesMixin, FileSystemAssertsMixin,