Experiments with using magika instead of magic for mime detection

This commit is contained in:
Trenton H
2026-04-23 09:09:33 -07:00
parent 1a3b56496a
commit 2fbf975903
12 changed files with 80 additions and 59 deletions
+1 -1
View File
@@ -57,6 +57,7 @@ dependencies = [
"llama-index-llms-ollama>=0.9.1",
"llama-index-llms-openai>=0.6.13",
"llama-index-vector-stores-faiss>=0.5.2",
"magika>=1.0.2",
"nltk~=3.9.1",
"ocrmypdf~=17.4.0",
"openai>=1.76",
@@ -66,7 +67,6 @@ dependencies = [
"python-dotenv~=1.2.1",
"python-gnupg~=0.5.4",
"python-ipware~=3.0.0",
"python-magic~=0.4.27",
"rapidfuzz~=3.14.0",
"redis[hiredis]~=5.2.1",
"regex>=2025.9.18",
+3 -3
View File
@@ -8,7 +8,6 @@ from pathlib import Path
from typing import TYPE_CHECKING
from typing import Final
import magic
from django.conf import settings
from django.contrib.auth.models import User
from django.db import transaction
@@ -52,6 +51,7 @@ from documents.utils import compute_checksum
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless import mime_detection
from paperless.config import OcrConfig
from paperless.models import ArchiveFileGenerationChoices
from paperless.parsers import ParserContext
@@ -424,7 +424,7 @@ class ConsumerPlugin(
# Determine the parser class.
mime_type = magic.from_file(self.working_copy, mime=True)
mime_type = mime_detection.from_file(self.working_copy)
self.log.debug(f"Detected mime type: {mime_type}")
@@ -446,7 +446,7 @@ class ConsumerPlugin(
],
logger=self.log,
)
mime_type = magic.from_file(self.working_copy, mime=True)
mime_type = mime_detection.from_file(self.working_copy)
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
# Save the original file for later
self.unmodified_original = (
+3 -2
View File
@@ -4,10 +4,11 @@ from enum import IntEnum
from pathlib import Path
from typing import TypedDict
import magic
from guardian.shortcuts import get_groups_with_perms
from guardian.shortcuts import get_users_with_perms
from paperless import mime_detection
@dataclasses.dataclass
class DocumentMetadataOverrides:
@@ -184,7 +185,7 @@ class ConsumableDocument:
# Get the file type once at init
# Note this function isn't called when the object is unpickled
self.mime_type = magic.from_file(self.original_file, mime=True)
self.mime_type = mime_detection.from_file(self.original_file)
class ConsumeFileDuplicateResult(TypedDict):
+3 -2
View File
@@ -11,7 +11,6 @@ from typing import Any
from typing import Literal
from typing import TypedDict
import magic
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import User
@@ -49,6 +48,8 @@ from rest_framework.exceptions import PermissionDenied
from rest_framework.fields import SerializerMethodField
from rest_framework.filters import OrderingFilter
from paperless import mime_detection
if settings.AUDIT_LOG_ENABLED:
from auditlog.context import set_actor
@@ -2159,7 +2160,7 @@ class PostDocumentSerializer(serializers.Serializer):
def validate_document(self, document):
document_data = document.file.read()
mime_type = magic.from_buffer(document_data, mime=True)
mime_type = mime_detection.from_buffer(document_data)
if not is_mime_type_supported(mime_type):
if (
+20 -5
View File
@@ -1097,7 +1097,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertIsNotNone(task_kwargs["overrides"])
self.assertEqual(result, "OK")
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
@mock.patch(
"documents.data_models.mime_detection.from_file",
return_value="application/pdf",
)
@mock.patch("documents.tasks.consume_file.apply_async")
@mock.patch("pikepdf.open")
def test_rotate_explicit_selection_uses_root_source_when_root_selected(
@@ -1127,7 +1130,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
@mock.patch("documents.tasks.consume_file.apply_async")
@mock.patch("pikepdf.Pdf.save")
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
@mock.patch(
"documents.data_models.mime_detection.from_file",
return_value="application/pdf",
)
def test_delete_pages(self, mock_magic, mock_pdf_save, mock_consume_delay):
"""
GIVEN:
@@ -1151,7 +1157,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertIsNotNone(task_kwargs["overrides"])
self.assertEqual(result, "OK")
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
@mock.patch(
"documents.data_models.mime_detection.from_file",
return_value="application/pdf",
)
@mock.patch("documents.tasks.consume_file.apply_async")
@mock.patch("pikepdf.open")
def test_delete_pages_explicit_selection_uses_root_source_when_root_selected(
@@ -1328,7 +1337,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
)
self.assertIsNotNone(task_kwargs["overrides"])
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
@mock.patch(
"documents.data_models.mime_detection.from_file",
return_value="application/pdf",
)
@mock.patch("documents.tasks.consume_file.apply_async")
@mock.patch("pikepdf.new")
@mock.patch("pikepdf.open")
@@ -1482,7 +1494,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertEqual(task_kwargs["input_doc"].root_document_id, doc.id)
self.assertIsNotNone(task_kwargs["overrides"])
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
@mock.patch(
"documents.data_models.mime_detection.from_file",
return_value="application/pdf",
)
@mock.patch("documents.tasks.consume_file.apply_async")
@mock.patch("pikepdf.open")
def test_remove_password_explicit_selection_uses_root_source_when_root_selected(
+17 -20
View File
@@ -140,28 +140,25 @@ class FaultyGenericExceptionParser(_BaseNewStyleParser):
raise Exception("Generic exception.")
def fake_magic_from_file(file, *, mime=False): # NOSONAR
if mime:
filepath = Path(file)
if filepath.name.startswith("invalid_pdf"):
return "application/octet-stream"
if filepath.name.startswith("valid_pdf"):
return "application/pdf"
if filepath.suffix == ".pdf":
return "application/pdf"
elif filepath.suffix == ".png":
return "image/png"
elif filepath.suffix == ".webp":
return "image/webp"
elif filepath.suffix == ".eml":
return "message/rfc822"
else:
return "unknown"
def fake_magic_from_file(file): # NOSONAR
filepath = Path(file)
if filepath.name.startswith("invalid_pdf"):
return "application/octet-stream"
if filepath.name.startswith("valid_pdf"):
return "application/pdf"
if filepath.suffix == ".pdf":
return "application/pdf"
elif filepath.suffix == ".png":
return "image/png"
elif filepath.suffix == ".webp":
return "image/webp"
elif filepath.suffix == ".eml":
return "message/rfc822"
else:
return "A verbose string that describes the contents of the file"
return "unknown"
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
class TestConsumer(
DirectoriesMixin,
FileSystemAssertsMixin,
@@ -1146,7 +1143,7 @@ class TestConsumer(
)
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
def setUp(self) -> None:
super().setUp()
+2 -2
View File
@@ -20,7 +20,6 @@ from urllib.parse import quote
from urllib.parse import urlparse
import httpx
import magic
import pathvalidate
from django.conf import settings
from django.contrib.auth.models import Group
@@ -226,6 +225,7 @@ from documents.versioning import get_latest_version_for_root
from documents.versioning import get_request_version_param
from documents.versioning import get_root_document
from documents.versioning import resolve_requested_version_for_root
from paperless import mime_detection
from paperless import version
from paperless.celery import app as celery_app
from paperless.config import AIConfig
@@ -4896,7 +4896,7 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> FileRespons
raise Http404("No logo configured")
path = app_logo.path
content_type = magic.from_file(path, mime=True) or "application/octet-stream"
content_type = mime_detection.from_file(path) or "application/octet-stream"
return FileResponse(
app_logo.open("rb"),
+13
View File
@@ -0,0 +1,13 @@
from pathlib import Path
from magika import Magika
_magika = Magika()
def from_file(path: str | Path) -> str:
    """Return the MIME type that Magika reports for the file at *path*.

    The lookup is delegated to the module-level ``_magika`` instance via
    ``identify_path``; the ``mime_type`` of the result's ``output`` is
    returned unchanged.
    """
    detection = _magika.identify_path(path)
    return detection.output.mime_type
def from_buffer(data: bytes) -> str:
    """Return the MIME type that Magika reports for the in-memory *data*.

    The lookup is delegated to the module-level ``_magika`` instance via
    ``identify_bytes``; the ``mime_type`` of the result's ``output`` is
    returned unchanged.
    """
    detection = _magika.identify_bytes(data)
    return detection.output.mime_type
+2 -2
View File
@@ -1,7 +1,6 @@
import logging
from io import BytesIO
import magic
from allauth.mfa.adapter import get_adapter as get_mfa_adapter
from allauth.mfa.models import Authenticator
from allauth.mfa.totp.internal.auth import TOTP
@@ -18,6 +17,7 @@ from PIL import Image
from rest_framework import serializers
from rest_framework.authtoken.serializers import AuthTokenSerializer
from paperless import mime_detection
from paperless.models import ApplicationConfiguration
from paperless.network import validate_outbound_http_url
from paperless.validators import reject_dangerous_svg
@@ -263,7 +263,7 @@ class ApplicationConfigurationSerializer(
jpg/png/gif/svg.
"""
if file:
mime_type = magic.from_buffer(file.read(2048), mime=True)
mime_type = mime_detection.from_buffer(file.read(2048))
if mime_type == "image/svg+xml":
reject_dangerous_svg(file)
@@ -15,11 +15,11 @@ from pathlib import Path
from typing import TYPE_CHECKING
import img2pdf
import magic
import pikepdf
import pytest
from documents.parsers import ParseError
from paperless import mime_detection
if TYPE_CHECKING:
from pytest_mock import MockerFixture
@@ -43,7 +43,7 @@ class TestConvertImageToPdfa:
result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
assert result.exists()
assert magic.from_file(str(result), mime=True) == "application/pdf"
assert mime_detection.from_file(result) == "application/pdf"
def test_output_path_is_archive_pdf_in_tempdir(
self,
@@ -92,7 +92,7 @@ class TestConvertImageToPdfa:
result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
assert result.exists()
assert magic.from_file(str(result), mime=True) == "application/pdf"
assert mime_detection.from_file(result) == "application/pdf"
def test_image_dpi_setting_applies_fixed_dpi_layout(
self,
@@ -116,7 +116,7 @@ class TestConvertImageToPdfa:
result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file)
spy.assert_called_once_with((150, 150))
assert magic.from_file(str(result), mime=True) == "application/pdf"
assert mime_detection.from_file(result) == "application/pdf"
def test_no_image_dpi_setting_skips_fixed_dpi_layout(
self,
+7 -10
View File
@@ -10,7 +10,6 @@ from fnmatch import fnmatch
from pathlib import Path
from typing import TYPE_CHECKING
import magic
import pathvalidate
from celery import chord
from celery import shared_task
@@ -40,6 +39,7 @@ from documents.models import Correspondent
from documents.models import PaperlessTask
from documents.parsers import is_mime_type_supported
from documents.tasks import consume_file
from paperless import mime_detection
from paperless.network import is_public_ip
from paperless.network import resolve_hostname_ips
from paperless_mail.models import MailAccount
@@ -848,7 +848,7 @@ class MailAccountHandler(LoggingMixin):
# don't trust the content type of the attachment. Could be
# generic application/octet-stream.
mime_type = magic.from_buffer(att.payload, mime=True)
mime_type = mime_detection.from_buffer(att.payload)
if is_mime_type_supported(mime_type):
self.log.info(
@@ -954,14 +954,11 @@ class MailAccountHandler(LoggingMixin):
)
with Path(temp_filename).open("wb") as f:
# Move "From"-header to beginning of file
# TODO: This ugly workaround is needed because the parser is
# chosen only by the mime_type detected via magic
# (see documents/consumer.py "mime_type = magic.from_file")
# Unfortunately magic sometimes fails to detect the mime
# type of .eml files correctly as message/rfc822 and instead
# detects text/plain.
# This also effects direct file consumption of .eml files
# which are not treated with this workaround.
# TODO: This workaround may no longer be needed with Magika,
# which has better text-format detection than libmagic.
# Previously libmagic would misidentify .eml files as text/plain
# instead of message/rfc822. Verify and remove if Magika handles
# it correctly.
from_element = None
for i, header in enumerate(message.obj._headers):
if header[0] == "From":
+5 -8
View File
@@ -192,14 +192,11 @@ class BogusMailBox(AbstractContextManager):
raise Exception
def fake_magic_from_buffer(buffer, *, mime=False):
if mime:
if "PDF" in str(buffer):
return "application/pdf"
else:
return "unknown/type"
def fake_magic_from_buffer(buffer):
if "PDF" in str(buffer):
return "application/pdf"
else:
return "Some verbose file description"
return "unknown/type"
class MessageBuilder:
@@ -408,7 +405,7 @@ def assert_eventually_equals(
raise AssertionError(f"Expected {expected_value}, but got {actual}")
@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer)
@mock.patch("paperless_mail.mail.mime_detection.from_buffer", fake_magic_from_buffer)
class TestMail(
DirectoriesMixin,
FileSystemAssertsMixin,