Experiments with using magika instead of magic for mime detection

2026-06-22 13:24:18 +00:00 · 2026-04-23 09:09:33 -07:00
parent 1a3b56496a
commit 2fbf975903
12 changed files with 80 additions and 59 deletions
@@ -57,6 +57,7 @@ dependencies = [
  "llama-index-llms-ollama>=0.9.1",
  "llama-index-llms-openai>=0.6.13",
  "llama-index-vector-stores-faiss>=0.5.2",
+  "magika>=1.0.2",
  "nltk~=3.9.1",
  "ocrmypdf~=17.4.0",
  "openai>=1.76",
@@ -66,7 +67,6 @@ dependencies = [
  "python-dotenv~=1.2.1",
  "python-gnupg~=0.5.4",
  "python-ipware~=3.0.0",
-  "python-magic~=0.4.27",
  "rapidfuzz~=3.14.0",
  "redis[hiredis]~=5.2.1",
  "regex>=2025.9.18",
@@ -8,7 +8,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Final

-import magic
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.db import transaction
@@ -52,6 +51,7 @@ from documents.utils import compute_checksum
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless import mime_detection
 from paperless.config import OcrConfig
 from paperless.models import ArchiveFileGenerationChoices
 from paperless.parsers import ParserContext
@@ -424,7 +424,7 @@ class ConsumerPlugin(

            # Determine the parser class.

-            mime_type = magic.from_file(self.working_copy, mime=True)
+            mime_type = mime_detection.from_file(self.working_copy)

            self.log.debug(f"Detected mime type: {mime_type}")

@@ -446,7 +446,7 @@ class ConsumerPlugin(
                        ],
                        logger=self.log,
                    )
-                    mime_type = magic.from_file(self.working_copy, mime=True)
+                    mime_type = mime_detection.from_file(self.working_copy)
                    self.log.debug(f"Detected mime type after qpdf: {mime_type}")
                    # Save the original file for later
                    self.unmodified_original = (
@@ -4,10 +4,11 @@ from enum import IntEnum
 from pathlib import Path
 from typing import TypedDict

-import magic
 from guardian.shortcuts import get_groups_with_perms
 from guardian.shortcuts import get_users_with_perms

+from paperless import mime_detection
+

@dataclasses.dataclass
 class DocumentMetadataOverrides:
@@ -184,7 +185,7 @@ class ConsumableDocument:

        # Get the file type once at init
        # Note this function isn't called when the object is unpickled
-        self.mime_type = magic.from_file(self.original_file, mime=True)
+        self.mime_type = mime_detection.from_file(self.original_file)


 class ConsumeFileDuplicateResult(TypedDict):
@@ -11,7 +11,6 @@ from typing import Any
 from typing import Literal
 from typing import TypedDict

-import magic
 from django.conf import settings
 from django.contrib.auth.models import Group
 from django.contrib.auth.models import User
@@ -49,6 +48,8 @@ from rest_framework.exceptions import PermissionDenied
 from rest_framework.fields import SerializerMethodField
 from rest_framework.filters import OrderingFilter

+from paperless import mime_detection
+
 if settings.AUDIT_LOG_ENABLED:
    from auditlog.context import set_actor

@@ -2159,7 +2160,7 @@ class PostDocumentSerializer(serializers.Serializer):

    def validate_document(self, document):
        document_data = document.file.read()
-        mime_type = magic.from_buffer(document_data, mime=True)
+        mime_type = mime_detection.from_buffer(document_data)

        if not is_mime_type_supported(mime_type):
            if (
@@ -1097,7 +1097,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
            self.assertIsNotNone(task_kwargs["overrides"])
            self.assertEqual(result, "OK")

-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
    @mock.patch("documents.tasks.consume_file.apply_async")
    @mock.patch("pikepdf.open")
    def test_rotate_explicit_selection_uses_root_source_when_root_selected(
@@ -1127,7 +1130,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):

    @mock.patch("documents.tasks.consume_file.apply_async")
    @mock.patch("pikepdf.Pdf.save")
-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
    def test_delete_pages(self, mock_magic, mock_pdf_save, mock_consume_delay):
        """
        GIVEN:
@@ -1151,7 +1157,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        self.assertIsNotNone(task_kwargs["overrides"])
        self.assertEqual(result, "OK")

-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
    @mock.patch("documents.tasks.consume_file.apply_async")
    @mock.patch("pikepdf.open")
    def test_delete_pages_explicit_selection_uses_root_source_when_root_selected(
@@ -1328,7 +1337,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        )
        self.assertIsNotNone(task_kwargs["overrides"])

-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
    @mock.patch("documents.tasks.consume_file.apply_async")
    @mock.patch("pikepdf.new")
    @mock.patch("pikepdf.open")
@@ -1482,7 +1494,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
        self.assertEqual(task_kwargs["input_doc"].root_document_id, doc.id)
        self.assertIsNotNone(task_kwargs["overrides"])

-    @mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
+    @mock.patch(
+        "documents.data_models.mime_detection.from_file",
+        return_value="application/pdf",
+    )
    @mock.patch("documents.tasks.consume_file.apply_async")
    @mock.patch("pikepdf.open")
    def test_remove_password_explicit_selection_uses_root_source_when_root_selected(
@@ -140,28 +140,25 @@ class FaultyGenericExceptionParser(_BaseNewStyleParser):
        raise Exception("Generic exception.")


-def fake_magic_from_file(file, *, mime=False):  # NOSONAR
-    if mime:
-        filepath = Path(file)
-        if filepath.name.startswith("invalid_pdf"):
-            return "application/octet-stream"
-        if filepath.name.startswith("valid_pdf"):
-            return "application/pdf"
-        if filepath.suffix == ".pdf":
-            return "application/pdf"
-        elif filepath.suffix == ".png":
-            return "image/png"
-        elif filepath.suffix == ".webp":
-            return "image/webp"
-        elif filepath.suffix == ".eml":
-            return "message/rfc822"
-        else:
-            return "unknown"
+def fake_magic_from_file(file):  # NOSONAR
+    filepath = Path(file)
+    if filepath.name.startswith("invalid_pdf"):
+        return "application/octet-stream"
+    if filepath.name.startswith("valid_pdf"):
+        return "application/pdf"
+    if filepath.suffix == ".pdf":
+        return "application/pdf"
+    elif filepath.suffix == ".png":
+        return "image/png"
+    elif filepath.suffix == ".webp":
+        return "image/webp"
+    elif filepath.suffix == ".eml":
+        return "message/rfc822"
    else:
-        return "A verbose string that describes the contents of the file"
+        return "unknown"


-@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
+@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
 class TestConsumer(
    DirectoriesMixin,
    FileSystemAssertsMixin,
@@ -1146,7 +1143,7 @@ class TestConsumer(
            )


-@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
+@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
 class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
    def setUp(self) -> None:
        super().setUp()
@@ -20,7 +20,6 @@ from urllib.parse import quote
 from urllib.parse import urlparse

 import httpx
-import magic
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import Group
@@ -226,6 +225,7 @@ from documents.versioning import get_latest_version_for_root
 from documents.versioning import get_request_version_param
 from documents.versioning import get_root_document
 from documents.versioning import resolve_requested_version_for_root
+from paperless import mime_detection
 from paperless import version
 from paperless.celery import app as celery_app
 from paperless.config import AIConfig
@@ -4896,7 +4896,7 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> FileRespons
        raise Http404("No logo configured")

    path = app_logo.path
-    content_type = magic.from_file(path, mime=True) or "application/octet-stream"
+    content_type = mime_detection.from_file(path) or "application/octet-stream"

    return FileResponse(
        app_logo.open("rb"),
@@ -0,0 +1,13 @@
+from pathlib import Path
+
+from magika import Magika
+
+_magika = Magika()
+
+
+def from_file(path: str | Path) -> str:
+    return _magika.identify_path(path).output.mime_type
+
+
+def from_buffer(data: bytes) -> str:
+    return _magika.identify_bytes(data).output.mime_type
@@ -1,7 +1,6 @@
 import logging
 from io import BytesIO

-import magic
 from allauth.mfa.adapter import get_adapter as get_mfa_adapter
 from allauth.mfa.models import Authenticator
 from allauth.mfa.totp.internal.auth import TOTP
@@ -18,6 +17,7 @@ from PIL import Image
 from rest_framework import serializers
 from rest_framework.authtoken.serializers import AuthTokenSerializer

+from paperless import mime_detection
 from paperless.models import ApplicationConfiguration
 from paperless.network import validate_outbound_http_url
 from paperless.validators import reject_dangerous_svg
@@ -263,7 +263,7 @@ class ApplicationConfigurationSerializer(
        jpg/png/gif/svg.
        """
        if file:
-            mime_type = magic.from_buffer(file.read(2048), mime=True)
+            mime_type = mime_detection.from_buffer(file.read(2048))

            if mime_type == "image/svg+xml":
                reject_dangerous_svg(file)
@@ -15,11 +15,11 @@ from pathlib import Path
 from typing import TYPE_CHECKING

 import img2pdf
-import magic
 import pikepdf
 import pytest

 from documents.parsers import ParseError
+from paperless import mime_detection

 if TYPE_CHECKING:
    from pytest_mock import MockerFixture
@@ -43,7 +43,7 @@ class TestConvertImageToPdfa:
        result = tesseract_parser._convert_image_to_pdfa(simple_png_file)

        assert result.exists()
-        assert magic.from_file(str(result), mime=True) == "application/pdf"
+        assert mime_detection.from_file(result) == "application/pdf"

    def test_output_path_is_archive_pdf_in_tempdir(
        self,
@@ -92,7 +92,7 @@ class TestConvertImageToPdfa:
        result = tesseract_parser._convert_image_to_pdfa(simple_png_file)

        assert result.exists()
-        assert magic.from_file(str(result), mime=True) == "application/pdf"
+        assert mime_detection.from_file(result) == "application/pdf"

    def test_image_dpi_setting_applies_fixed_dpi_layout(
        self,
@@ -116,7 +116,7 @@ class TestConvertImageToPdfa:
        result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file)

        spy.assert_called_once_with((150, 150))
-        assert magic.from_file(str(result), mime=True) == "application/pdf"
+        assert mime_detection.from_file(result) == "application/pdf"

    def test_no_image_dpi_setting_skips_fixed_dpi_layout(
        self,
@@ -10,7 +10,6 @@ from fnmatch import fnmatch
 from pathlib import Path
 from typing import TYPE_CHECKING

-import magic
 import pathvalidate
 from celery import chord
 from celery import shared_task
@@ -40,6 +39,7 @@ from documents.models import Correspondent
 from documents.models import PaperlessTask
 from documents.parsers import is_mime_type_supported
 from documents.tasks import consume_file
+from paperless import mime_detection
 from paperless.network import is_public_ip
 from paperless.network import resolve_hostname_ips
 from paperless_mail.models import MailAccount
@@ -848,7 +848,7 @@ class MailAccountHandler(LoggingMixin):

            # don't trust the content type of the attachment. Could be
            # generic application/octet-stream.
-            mime_type = magic.from_buffer(att.payload, mime=True)
+            mime_type = mime_detection.from_buffer(att.payload)

            if is_mime_type_supported(mime_type):
                self.log.info(
@@ -954,14 +954,11 @@ class MailAccountHandler(LoggingMixin):
        )
        with Path(temp_filename).open("wb") as f:
            # Move "From"-header to beginning of file
-            # TODO: This ugly workaround is needed because the parser is
-            #   chosen only by the mime_type detected via magic
-            #   (see documents/consumer.py "mime_type = magic.from_file")
-            #   Unfortunately magic sometimes fails to detect the mime
-            #   type of .eml files correctly as message/rfc822 and instead
-            #   detects text/plain.
-            #   This also effects direct file consumption of .eml files
-            #   which are not treated with this workaround.
+            # TODO: This workaround may no longer be needed with Magika,
+            #   which has better text-format detection than libmagic.
+            #   Previously libmagic would misidentify .eml files as text/plain
+            #   instead of message/rfc822. Verify and remove if Magika handles
+            #   it correctly.
            from_element = None
            for i, header in enumerate(message.obj._headers):
                if header[0] == "From":
@@ -192,14 +192,11 @@ class BogusMailBox(AbstractContextManager):
            raise Exception


-def fake_magic_from_buffer(buffer, *, mime=False):
-    if mime:
-        if "PDF" in str(buffer):
-            return "application/pdf"
-        else:
-            return "unknown/type"
+def fake_magic_from_buffer(buffer):
+    if "PDF" in str(buffer):
+        return "application/pdf"
    else:
-        return "Some verbose file description"
+        return "unknown/type"


 class MessageBuilder:
@@ -408,7 +405,7 @@ def assert_eventually_equals(
    raise AssertionError(f"Expected {expected_value}, but got {actual}")


-@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer)
+@mock.patch("paperless_mail.mail.mime_detection.from_buffer", fake_magic_from_buffer)
 class TestMail(
    DirectoriesMixin,
    FileSystemAssertsMixin,