From 2fbf97590348c790927281bcb24bccaf8c022a33 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 23 Apr 2026 09:09:33 -0700
Subject: [PATCH] Experiments with using magika instead of magic for mime
 detection

---
 pyproject.toml                                |  2 +-
 src/documents/consumer.py                     |  6 +--
 src/documents/data_models.py                  |  5 ++-
 src/documents/serialisers.py                  |  5 ++-
 src/documents/tests/test_bulk_edit.py         | 25 ++++++++++---
 src/documents/tests/test_consumer.py          | 37 +++++++++----------
 src/documents/views.py                        |  4 +-
 src/paperless/mime_detection.py               | 13 +++++++
 src/paperless/serialisers.py                  |  4 +-
 .../parsers/test_convert_image_to_pdfa.py     |  8 ++--
 src/paperless_mail/mail.py                    | 17 ++++-----
 src/paperless_mail/tests/test_mail.py         | 13 +++----
 12 files changed, 80 insertions(+), 59 deletions(-)
 create mode 100644 src/paperless/mime_detection.py

diff --git a/pyproject.toml b/pyproject.toml
index 19dfe3fdc..cfadade60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,6 +57,7 @@ dependencies = [
   "llama-index-llms-ollama>=0.9.1",
   "llama-index-llms-openai>=0.6.13",
   "llama-index-vector-stores-faiss>=0.5.2",
+  "magika>=1.0.2",
   "nltk~=3.9.1",
   "ocrmypdf~=17.4.0",
   "openai>=1.76",
@@ -66,7 +67,6 @@ dependencies = [
   "python-dotenv~=1.2.1",
   "python-gnupg~=0.5.4",
   "python-ipware~=3.0.0",
-  "python-magic~=0.4.27",
   "rapidfuzz~=3.14.0",
   "redis[hiredis]~=5.2.1",
   "regex>=2025.9.18",
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 390ce3e66..9fde76299 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -8,7 +8,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Final
 
-import magic
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.db import transaction
@@ -52,6 +51,7 @@ from documents.utils import compute_checksum
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless import mime_detection
 from paperless.config import OcrConfig
 from paperless.models import ArchiveFileGenerationChoices
 from paperless.parsers import ParserContext
@@ -424,7 +424,7 @@ class ConsumerPlugin(
 
             # Determine the parser class.
 
-            mime_type = magic.from_file(self.working_copy, mime=True)
+            mime_type = mime_detection.from_file(self.working_copy)
 
             self.log.debug(f"Detected mime type: {mime_type}")
 
@@ -446,7 +446,7 @@ class ConsumerPlugin(
                         ],
                         logger=self.log,
                     )
-                    mime_type = magic.from_file(self.working_copy, mime=True)
+                    mime_type = mime_detection.from_file(self.working_copy)
                     self.log.debug(f"Detected mime type after qpdf: {mime_type}")
                     # Save the original file for later
                     self.unmodified_original = (
diff --git a/src/documents/data_models.py b/src/documents/data_models.py
index 6d9e3a187..0cfa5688a 100644
--- a/src/documents/data_models.py
+++ b/src/documents/data_models.py
@@ -4,10 +4,11 @@ from enum import IntEnum
 from pathlib import Path
 from typing import TypedDict
 
-import magic
 from guardian.shortcuts import get_groups_with_perms
 from guardian.shortcuts import get_users_with_perms
 
+from paperless import mime_detection
+
 
 @dataclasses.dataclass
 class DocumentMetadataOverrides:
@@ -184,7 +185,7 @@ class ConsumableDocument:
 
         # Get the file type once at init
         # Note this function isn't called when the object is unpickled
-        self.mime_type = magic.from_file(self.original_file, mime=True)
+        self.mime_type = mime_detection.from_file(self.original_file)
 
 
 class ConsumeFileDuplicateResult(TypedDict):
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index e3037eeae..77a94edd8 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -11,7 +11,6 @@ from typing import Any
 from typing import Literal
 from typing import TypedDict
 
-import magic
 from django.conf import settings
 from django.contrib.auth.models import Group
 from django.contrib.auth.models import User
@@ -49,6 +48,8 @@ from rest_framework.exceptions import PermissionDenied
 from rest_framework.fields import SerializerMethodField
 from rest_framework.filters import OrderingFilter
 
+from paperless import mime_detection
+
 if settings.AUDIT_LOG_ENABLED:
     from auditlog.context import set_actor
 
@@ -2159,7 +2160,7 @@ class PostDocumentSerializer(serializers.Serializer):
 
     def validate_document(self, document):
         document_data = document.file.read()
-        mime_type = magic.from_buffer(document_data, mime=True)
+        mime_type = mime_detection.from_buffer(document_data)
 
         if not is_mime_type_supported(mime_type):
             if (
diff --git a/src/documents/tests/test_bulk_edit.py b/src/documents/tests/test_bulk_edit.py
index 0c44157a5..9537300c1 100644
--- a/src/documents/tests/test_bulk_edit.py
+++ b/src/documents/tests/test_bulk_edit.py
@@ -1097,7 +1097,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
             self.assertIsNotNone(task_kwargs["overrides"])
             self.assertEqual(result, "OK")
 
-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
     @mock.patch("documents.tasks.consume_file.apply_async")
     @mock.patch("pikepdf.open")
     def test_rotate_explicit_selection_uses_root_source_when_root_selected(
@@ -1127,7 +1130,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
 
     @mock.patch("documents.tasks.consume_file.apply_async")
     @mock.patch("pikepdf.Pdf.save")
-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
     def test_delete_pages(self, mock_magic, mock_pdf_save, mock_consume_delay):
         """
         GIVEN:
@@ -1151,7 +1157,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         self.assertIsNotNone(task_kwargs["overrides"])
         self.assertEqual(result, "OK")
 
-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
     @mock.patch("documents.tasks.consume_file.apply_async")
     @mock.patch("pikepdf.open")
     def test_delete_pages_explicit_selection_uses_root_source_when_root_selected(
@@ -1328,7 +1337,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         )
         self.assertIsNotNone(task_kwargs["overrides"])
 
-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
     @mock.patch("documents.tasks.consume_file.apply_async")
     @mock.patch("pikepdf.new")
     @mock.patch("pikepdf.open")
@@ -1482,7 +1494,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
         self.assertEqual(task_kwargs["input_doc"].root_document_id, doc.id)
         self.assertIsNotNone(task_kwargs["overrides"])
 
-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
     @mock.patch("documents.tasks.consume_file.apply_async")
     @mock.patch("pikepdf.open")
     def test_remove_password_explicit_selection_uses_root_source_when_root_selected(
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index 0ea714fc1..32075a6a3 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -140,28 +140,25 @@ class FaultyGenericExceptionParser(_BaseNewStyleParser):
         raise Exception("Generic exception.")
 
 
-def fake_magic_from_file(file, *, mime=False):  # NOSONAR
-    if mime:
-        filepath = Path(file)
-        if filepath.name.startswith("invalid_pdf"):
-            return "application/octet-stream"
-        if filepath.name.startswith("valid_pdf"):
-            return "application/pdf"
-        if filepath.suffix == ".pdf":
-            return "application/pdf"
-        elif filepath.suffix == ".png":
-            return "image/png"
-        elif filepath.suffix == ".webp":
-            return "image/webp"
-        elif filepath.suffix == ".eml":
-            return "message/rfc822"
-        else:
-            return "unknown"
+def fake_magic_from_file(file):  # NOSONAR
+    filepath = Path(file)
+    if filepath.name.startswith("invalid_pdf"):  # name prefix beats suffix
+        return "application/octet-stream"
+    if filepath.name.startswith("valid_pdf"):
+        return "application/pdf"
+    if filepath.suffix == ".pdf":  # otherwise decide by extension alone
+        return "application/pdf"
+    elif filepath.suffix == ".png":
+        return "image/png"
+    elif filepath.suffix == ".webp":
+        return "image/webp"
+    elif filepath.suffix == ".eml":
+        return "message/rfc822"
     else:
-        return "A verbose string that describes the contents of the file"
+        return "unknown"  # fallback for every other extension
 
 
-@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
+@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
 class TestConsumer(
     DirectoriesMixin,
     FileSystemAssertsMixin,
@@ -1146,7 +1143,7 @@ class TestConsumer(
             )
 
 
-@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
+@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
 class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
     def setUp(self) -> None:
         super().setUp()
diff --git a/src/documents/views.py b/src/documents/views.py
index 217550634..483fc22a2 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -20,7 +20,6 @@ from urllib.parse import quote
 from urllib.parse import urlparse
 
 import httpx
-import magic
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import Group
@@ -226,6 +225,7 @@ from documents.versioning import get_latest_version_for_root
 from documents.versioning import get_request_version_param
 from documents.versioning import get_root_document
 from documents.versioning import resolve_requested_version_for_root
+from paperless import mime_detection
 from paperless import version
 from paperless.celery import app as celery_app
 from paperless.config import AIConfig
@@ -4896,7 +4896,7 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> FileRespons
         raise Http404("No logo configured")
 
     path = app_logo.path
-    content_type = magic.from_file(path, mime=True) or "application/octet-stream"
+    content_type = mime_detection.from_file(path) or "application/octet-stream"
 
     return FileResponse(
         app_logo.open("rb"),
diff --git a/src/paperless/mime_detection.py b/src/paperless/mime_detection.py
new file mode 100644
index 000000000..4014fe75f
--- /dev/null
+++ b/src/paperless/mime_detection.py
@@ -0,0 +1,44 @@
+"""
+MIME type detection helpers backed by Magika.
+
+Call sites use these wrappers instead of talking to the detection library
+directly, so there is a single seam to patch in tests.
+"""
+
+from functools import cache
+from pathlib import Path
+
+from magika import Magika
+
+
+@cache
+def _get_magika() -> Magika:
+    """
+    Return the shared Magika instance, creating it on first use.
+
+    Construction is deferred so that importing this module does not build a
+    detector in processes which never inspect a file.
+    """
+    return Magika()
+
+
+def from_file(path: str | Path) -> str:
+    """
+    Return the MIME type Magika detects for the file at ``path``.
+
+    NOTE(review): presumably Magika reports unreadable or missing paths via
+    the result status rather than raising an OSError - confirm how
+    ``.output`` behaves for such results before relying on it.
+    """
+    return _get_magika().identify_path(path).output.mime_type
+
+
+def from_buffer(data: bytes) -> str:
+    """
+    Return the MIME type Magika detects for the in-memory ``data``.
+
+    NOTE(review): callers that pass only a prefix of a file give the
+    detector less to work with than the whole content - confirm truncated
+    input is classified as expected.
+    """
+    return _get_magika().identify_bytes(data).output.mime_type
diff --git a/src/paperless/serialisers.py b/src/paperless/serialisers.py
index 92676df4e..c50d05814 100644
--- a/src/paperless/serialisers.py
+++ b/src/paperless/serialisers.py
@@ -1,7 +1,6 @@
 import logging
 from io import BytesIO
 
-import magic
 from allauth.mfa.adapter import get_adapter as get_mfa_adapter
 from allauth.mfa.models import Authenticator
 from allauth.mfa.totp.internal.auth import TOTP
@@ -18,6 +17,7 @@ from PIL import Image
 from rest_framework import serializers
 from rest_framework.authtoken.serializers import AuthTokenSerializer
 
+from paperless import mime_detection
 from paperless.models import ApplicationConfiguration
 from paperless.network import validate_outbound_http_url
 from paperless.validators import reject_dangerous_svg
@@ -263,7 +263,7 @@ class ApplicationConfigurationSerializer(
         jpg/png/gif/svg.
         """
         if file:
-            mime_type = magic.from_buffer(file.read(2048), mime=True)
+            mime_type = mime_detection.from_buffer(file.read(2048))
 
             if mime_type == "image/svg+xml":
                 reject_dangerous_svg(file)
diff --git a/src/paperless/tests/parsers/test_convert_image_to_pdfa.py b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py
index 615900a25..3aa76f877 100644
--- a/src/paperless/tests/parsers/test_convert_image_to_pdfa.py
+++ b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py
@@ -15,11 +15,11 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 
 import img2pdf
-import magic
 import pikepdf
 import pytest
 
 from documents.parsers import ParseError
+from paperless import mime_detection
 
 if TYPE_CHECKING:
     from pytest_mock import MockerFixture
@@ -43,7 +43,7 @@ class TestConvertImageToPdfa:
         result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
 
         assert result.exists()
-        assert magic.from_file(str(result), mime=True) == "application/pdf"
+        assert mime_detection.from_file(result) == "application/pdf"
 
     def test_output_path_is_archive_pdf_in_tempdir(
         self,
@@ -92,7 +92,7 @@ class TestConvertImageToPdfa:
         result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
 
         assert result.exists()
-        assert magic.from_file(str(result), mime=True) == "application/pdf"
+        assert mime_detection.from_file(result) == "application/pdf"
 
     def test_image_dpi_setting_applies_fixed_dpi_layout(
         self,
@@ -116,7 +116,7 @@ class TestConvertImageToPdfa:
         result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file)
 
         spy.assert_called_once_with((150, 150))
-        assert magic.from_file(str(result), mime=True) == "application/pdf"
+        assert mime_detection.from_file(result) == "application/pdf"
 
     def test_no_image_dpi_setting_skips_fixed_dpi_layout(
         self,
diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py
index d551cc8cd..cb8b9a56c 100644
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -10,7 +10,6 @@ from fnmatch import fnmatch
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-import magic
 import pathvalidate
 from celery import chord
 from celery import shared_task
@@ -40,6 +39,7 @@ from documents.models import Correspondent
 from documents.models import PaperlessTask
 from documents.parsers import is_mime_type_supported
 from documents.tasks import consume_file
+from paperless import mime_detection
 from paperless.network import is_public_ip
 from paperless.network import resolve_hostname_ips
 from paperless_mail.models import MailAccount
@@ -848,7 +848,7 @@ class MailAccountHandler(LoggingMixin):
 
             # don't trust the content type of the attachment. Could be
             # generic application/octet-stream.
-            mime_type = magic.from_buffer(att.payload, mime=True)
+            mime_type = mime_detection.from_buffer(att.payload)
 
             if is_mime_type_supported(mime_type):
                 self.log.info(
@@ -954,14 +954,11 @@ class MailAccountHandler(LoggingMixin):
         )
         with Path(temp_filename).open("wb") as f:
             # Move "From"-header to beginning of file
-            # TODO: This ugly workaround is needed because the parser is
-            #   chosen only by the mime_type detected via magic
-            #   (see documents/consumer.py "mime_type = magic.from_file")
-            #   Unfortunately magic sometimes fails to detect the mime
-            #   type of .eml files correctly as message/rfc822 and instead
-            #   detects text/plain.
-            #   This also effects direct file consumption of .eml files
-            #   which are not treated with this workaround.
+            # TODO: This workaround may no longer be needed with Magika,
+            #   which has better text-format detection than libmagic.
+            #   Previously libmagic would misidentify .eml files as text/plain
+            #   instead of message/rfc822. Verify and remove if Magika handles
+            #   it correctly.
             from_element = None
             for i, header in enumerate(message.obj._headers):
                 if header[0] == "From":
diff --git a/src/paperless_mail/tests/test_mail.py b/src/paperless_mail/tests/test_mail.py
index 48e066646..991ae2877 100644
--- a/src/paperless_mail/tests/test_mail.py
+++ b/src/paperless_mail/tests/test_mail.py
@@ -192,14 +192,11 @@ class BogusMailBox(AbstractContextManager):
             raise Exception
 
 
-def fake_magic_from_buffer(buffer, *, mime=False):
-    if mime:
-        if "PDF" in str(buffer):
-            return "application/pdf"
-        else:
-            return "unknown/type"
+def fake_magic_from_buffer(buffer):  # stand-in for mime_detection.from_buffer
+    if "PDF" in str(buffer):  # substring test on the str() form of the payload
+        return "application/pdf"
     else:
-        return "Some verbose file description"
+        return "unknown/type"  # anything without "PDF" in it
 
 
 class MessageBuilder:
@@ -408,7 +405,7 @@ def assert_eventually_equals(
     raise AssertionError(f"Expected {expected_value}, but got {actual}")
 
 
-@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer)
+@mock.patch("paperless_mail.mail.mime_detection.from_buffer", fake_magic_from_buffer)
 class TestMail(
     DirectoriesMixin,
     FileSystemAssertsMixin,