mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-04 21:55:25 +00:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2fbf975903 |
+1
-1
@@ -57,6 +57,7 @@ dependencies = [
|
||||
"llama-index-llms-ollama>=0.9.1",
|
||||
"llama-index-llms-openai>=0.6.13",
|
||||
"llama-index-vector-stores-faiss>=0.5.2",
|
||||
"magika>=1.0.2",
|
||||
"nltk~=3.9.1",
|
||||
"ocrmypdf~=17.4.0",
|
||||
"openai>=1.76",
|
||||
@@ -66,7 +67,6 @@ dependencies = [
|
||||
"python-dotenv~=1.2.1",
|
||||
"python-gnupg~=0.5.4",
|
||||
"python-ipware~=3.0.0",
|
||||
"python-magic~=0.4.27",
|
||||
"rapidfuzz~=3.14.0",
|
||||
"redis[hiredis]~=5.2.1",
|
||||
"regex>=2025.9.18",
|
||||
|
||||
@@ -8,7 +8,6 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.db import transaction
|
||||
@@ -52,6 +51,7 @@ from documents.utils import compute_checksum
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless import mime_detection
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileGenerationChoices
|
||||
from paperless.parsers import ParserContext
|
||||
@@ -424,7 +424,7 @@ class ConsumerPlugin(
|
||||
|
||||
# Determine the parser class.
|
||||
|
||||
mime_type = magic.from_file(self.working_copy, mime=True)
|
||||
mime_type = mime_detection.from_file(self.working_copy)
|
||||
|
||||
self.log.debug(f"Detected mime type: {mime_type}")
|
||||
|
||||
@@ -446,7 +446,7 @@ class ConsumerPlugin(
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
mime_type = magic.from_file(self.working_copy, mime=True)
|
||||
mime_type = mime_detection.from_file(self.working_copy)
|
||||
self.log.debug(f"Detected mime type after qpdf: {mime_type}")
|
||||
# Save the original file for later
|
||||
self.unmodified_original = (
|
||||
|
||||
@@ -4,10 +4,11 @@ from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from typing import TypedDict
|
||||
|
||||
import magic
|
||||
from guardian.shortcuts import get_groups_with_perms
|
||||
from guardian.shortcuts import get_users_with_perms
|
||||
|
||||
from paperless import mime_detection
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class DocumentMetadataOverrides:
|
||||
@@ -184,7 +185,7 @@ class ConsumableDocument:
|
||||
|
||||
# Get the file type once at init
|
||||
# Note this function isn't called when the object is unpickled
|
||||
self.mime_type = magic.from_file(self.original_file, mime=True)
|
||||
self.mime_type = mime_detection.from_file(self.original_file)
|
||||
|
||||
|
||||
class ConsumeFileDuplicateResult(TypedDict):
|
||||
|
||||
@@ -11,7 +11,6 @@ from typing import Any
|
||||
from typing import Literal
|
||||
from typing import TypedDict
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import Group
|
||||
from django.contrib.auth.models import User
|
||||
@@ -49,6 +48,8 @@ from rest_framework.exceptions import PermissionDenied
|
||||
from rest_framework.fields import SerializerMethodField
|
||||
from rest_framework.filters import OrderingFilter
|
||||
|
||||
from paperless import mime_detection
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.context import set_actor
|
||||
|
||||
@@ -2159,7 +2160,7 @@ class PostDocumentSerializer(serializers.Serializer):
|
||||
|
||||
def validate_document(self, document):
|
||||
document_data = document.file.read()
|
||||
mime_type = magic.from_buffer(document_data, mime=True)
|
||||
mime_type = mime_detection.from_buffer(document_data)
|
||||
|
||||
if not is_mime_type_supported(mime_type):
|
||||
if (
|
||||
|
||||
@@ -1097,7 +1097,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
self.assertIsNotNone(task_kwargs["overrides"])
|
||||
self.assertEqual(result, "OK")
|
||||
|
||||
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
|
||||
@mock.patch(
|
||||
"documents.data_models.mime_detection.from_file",
|
||||
return_value="application/pdf",
|
||||
)
|
||||
@mock.patch("documents.tasks.consume_file.apply_async")
|
||||
@mock.patch("pikepdf.open")
|
||||
def test_rotate_explicit_selection_uses_root_source_when_root_selected(
|
||||
@@ -1127,7 +1130,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
|
||||
@mock.patch("documents.tasks.consume_file.apply_async")
|
||||
@mock.patch("pikepdf.Pdf.save")
|
||||
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
|
||||
@mock.patch(
|
||||
"documents.data_models.mime_detection.from_file",
|
||||
return_value="application/pdf",
|
||||
)
|
||||
def test_delete_pages(self, mock_magic, mock_pdf_save, mock_consume_delay):
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -1151,7 +1157,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
self.assertIsNotNone(task_kwargs["overrides"])
|
||||
self.assertEqual(result, "OK")
|
||||
|
||||
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
|
||||
@mock.patch(
|
||||
"documents.data_models.mime_detection.from_file",
|
||||
return_value="application/pdf",
|
||||
)
|
||||
@mock.patch("documents.tasks.consume_file.apply_async")
|
||||
@mock.patch("pikepdf.open")
|
||||
def test_delete_pages_explicit_selection_uses_root_source_when_root_selected(
|
||||
@@ -1328,7 +1337,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
)
|
||||
self.assertIsNotNone(task_kwargs["overrides"])
|
||||
|
||||
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
|
||||
@mock.patch(
|
||||
"documents.data_models.mime_detection.from_file",
|
||||
return_value="application/pdf",
|
||||
)
|
||||
@mock.patch("documents.tasks.consume_file.apply_async")
|
||||
@mock.patch("pikepdf.new")
|
||||
@mock.patch("pikepdf.open")
|
||||
@@ -1482,7 +1494,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
self.assertEqual(task_kwargs["input_doc"].root_document_id, doc.id)
|
||||
self.assertIsNotNone(task_kwargs["overrides"])
|
||||
|
||||
@mock.patch("documents.data_models.magic.from_file", return_value="application/pdf")
|
||||
@mock.patch(
|
||||
"documents.data_models.mime_detection.from_file",
|
||||
return_value="application/pdf",
|
||||
)
|
||||
@mock.patch("documents.tasks.consume_file.apply_async")
|
||||
@mock.patch("pikepdf.open")
|
||||
def test_remove_password_explicit_selection_uses_root_source_when_root_selected(
|
||||
|
||||
@@ -140,28 +140,25 @@ class FaultyGenericExceptionParser(_BaseNewStyleParser):
|
||||
raise Exception("Generic exception.")
|
||||
|
||||
|
||||
def fake_magic_from_file(file, *, mime=False): # NOSONAR
|
||||
if mime:
|
||||
filepath = Path(file)
|
||||
if filepath.name.startswith("invalid_pdf"):
|
||||
return "application/octet-stream"
|
||||
if filepath.name.startswith("valid_pdf"):
|
||||
return "application/pdf"
|
||||
if filepath.suffix == ".pdf":
|
||||
return "application/pdf"
|
||||
elif filepath.suffix == ".png":
|
||||
return "image/png"
|
||||
elif filepath.suffix == ".webp":
|
||||
return "image/webp"
|
||||
elif filepath.suffix == ".eml":
|
||||
return "message/rfc822"
|
||||
else:
|
||||
return "unknown"
|
||||
def fake_magic_from_file(file):  # NOSONAR
    """
    Test stand-in for ``mime_detection.from_file``.

    Derives a MIME type from the file *name* only, so tests do not depend on
    real content sniffing.

    Resolution order:
      1. names starting with ``invalid_pdf`` -> ``application/octet-stream``
      2. names starting with ``valid_pdf``   -> ``application/pdf``
      3. otherwise the suffix is looked up (.pdf, .png, .webp, .eml)
      4. anything else                       -> ``"unknown"``
    """
    filepath = Path(file)

    # Name-prefix overrides are checked before the suffix so a fixture such
    # as "invalid_pdf.pdf" is reported as non-PDF despite its extension.
    if filepath.name.startswith("invalid_pdf"):
        return "application/octet-stream"
    if filepath.name.startswith("valid_pdf"):
        return "application/pdf"

    mime_by_suffix = {
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".webp": "image/webp",
        ".eml": "message/rfc822",
    }
    # The suffix comparison is case-sensitive, exactly as in the if/elif chain
    # this table replaces.
    return mime_by_suffix.get(filepath.suffix, "unknown")
|
||||
|
||||
|
||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||
@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
|
||||
class TestConsumer(
|
||||
DirectoriesMixin,
|
||||
FileSystemAssertsMixin,
|
||||
@@ -1146,7 +1143,7 @@ class TestConsumer(
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||
@mock.patch("documents.consumer.mime_detection.from_file", fake_magic_from_file)
|
||||
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
|
||||
@@ -20,7 +20,6 @@ from urllib.parse import quote
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
import magic
|
||||
import pathvalidate
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import Group
|
||||
@@ -226,6 +225,7 @@ from documents.versioning import get_latest_version_for_root
|
||||
from documents.versioning import get_request_version_param
|
||||
from documents.versioning import get_root_document
|
||||
from documents.versioning import resolve_requested_version_for_root
|
||||
from paperless import mime_detection
|
||||
from paperless import version
|
||||
from paperless.celery import app as celery_app
|
||||
from paperless.config import AIConfig
|
||||
@@ -4896,7 +4896,7 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> FileRespons
|
||||
raise Http404("No logo configured")
|
||||
|
||||
path = app_logo.path
|
||||
content_type = magic.from_file(path, mime=True) or "application/octet-stream"
|
||||
content_type = mime_detection.from_file(path) or "application/octet-stream"
|
||||
|
||||
return FileResponse(
|
||||
app_logo.open("rb"),
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
from pathlib import Path
|
||||
|
||||
from magika import Magika
|
||||
|
||||
_magika = Magika()
|
||||
|
||||
|
||||
def from_file(path: str | Path) -> str:
    """
    Return the MIME type Magika detects for the file at ``path``.

    Delegates to the module-level ``_magika`` instance and returns the
    ``mime_type`` of its ``output`` result.
    """
    return _magika.identify_path(path).output.mime_type
|
||||
|
||||
|
||||
def from_buffer(data: bytes) -> str:
    """
    Return the MIME type Magika detects for the in-memory bytes ``data``.

    Delegates to the module-level ``_magika`` instance and returns the
    ``mime_type`` of its ``output`` result.
    """
    return _magika.identify_bytes(data).output.mime_type
|
||||
@@ -1,7 +1,6 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
|
||||
import magic
|
||||
from allauth.mfa.adapter import get_adapter as get_mfa_adapter
|
||||
from allauth.mfa.models import Authenticator
|
||||
from allauth.mfa.totp.internal.auth import TOTP
|
||||
@@ -18,6 +17,7 @@ from PIL import Image
|
||||
from rest_framework import serializers
|
||||
from rest_framework.authtoken.serializers import AuthTokenSerializer
|
||||
|
||||
from paperless import mime_detection
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.network import validate_outbound_http_url
|
||||
from paperless.validators import reject_dangerous_svg
|
||||
@@ -263,7 +263,7 @@ class ApplicationConfigurationSerializer(
|
||||
jpg/png/gif/svg.
|
||||
"""
|
||||
if file:
|
||||
mime_type = magic.from_buffer(file.read(2048), mime=True)
|
||||
mime_type = mime_detection.from_buffer(file.read(2048))
|
||||
|
||||
if mime_type == "image/svg+xml":
|
||||
reject_dangerous_svg(file)
|
||||
|
||||
@@ -15,11 +15,11 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import img2pdf
|
||||
import magic
|
||||
import pikepdf
|
||||
import pytest
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from paperless import mime_detection
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytest_mock import MockerFixture
|
||||
@@ -43,7 +43,7 @@ class TestConvertImageToPdfa:
|
||||
result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
|
||||
|
||||
assert result.exists()
|
||||
assert magic.from_file(str(result), mime=True) == "application/pdf"
|
||||
assert mime_detection.from_file(result) == "application/pdf"
|
||||
|
||||
def test_output_path_is_archive_pdf_in_tempdir(
|
||||
self,
|
||||
@@ -92,7 +92,7 @@ class TestConvertImageToPdfa:
|
||||
result = tesseract_parser._convert_image_to_pdfa(simple_png_file)
|
||||
|
||||
assert result.exists()
|
||||
assert magic.from_file(str(result), mime=True) == "application/pdf"
|
||||
assert mime_detection.from_file(result) == "application/pdf"
|
||||
|
||||
def test_image_dpi_setting_applies_fixed_dpi_layout(
|
||||
self,
|
||||
@@ -116,7 +116,7 @@ class TestConvertImageToPdfa:
|
||||
result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file)
|
||||
|
||||
spy.assert_called_once_with((150, 150))
|
||||
assert magic.from_file(str(result), mime=True) == "application/pdf"
|
||||
assert mime_detection.from_file(result) == "application/pdf"
|
||||
|
||||
def test_no_image_dpi_setting_skips_fixed_dpi_layout(
|
||||
self,
|
||||
|
||||
@@ -10,7 +10,6 @@ from fnmatch import fnmatch
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import magic
|
||||
import pathvalidate
|
||||
from celery import chord
|
||||
from celery import shared_task
|
||||
@@ -40,6 +39,7 @@ from documents.models import Correspondent
|
||||
from documents.models import PaperlessTask
|
||||
from documents.parsers import is_mime_type_supported
|
||||
from documents.tasks import consume_file
|
||||
from paperless import mime_detection
|
||||
from paperless.network import is_public_ip
|
||||
from paperless.network import resolve_hostname_ips
|
||||
from paperless_mail.models import MailAccount
|
||||
@@ -848,7 +848,7 @@ class MailAccountHandler(LoggingMixin):
|
||||
|
||||
# don't trust the content type of the attachment. Could be
|
||||
# generic application/octet-stream.
|
||||
mime_type = magic.from_buffer(att.payload, mime=True)
|
||||
mime_type = mime_detection.from_buffer(att.payload)
|
||||
|
||||
if is_mime_type_supported(mime_type):
|
||||
self.log.info(
|
||||
@@ -954,14 +954,11 @@ class MailAccountHandler(LoggingMixin):
|
||||
)
|
||||
with Path(temp_filename).open("wb") as f:
|
||||
# Move "From"-header to beginning of file
|
||||
# TODO: This ugly workaround is needed because the parser is
|
||||
# chosen only by the mime_type detected via magic
|
||||
# (see documents/consumer.py "mime_type = magic.from_file")
|
||||
# Unfortunately magic sometimes fails to detect the mime
|
||||
# type of .eml files correctly as message/rfc822 and instead
|
||||
# detects text/plain.
|
||||
# This also affects direct file consumption of .eml files
|
||||
# which are not treated with this workaround.
|
||||
# TODO: This workaround may no longer be needed with Magika,
|
||||
# which has better text-format detection than libmagic.
|
||||
# Previously libmagic would misidentify .eml files as text/plain
|
||||
# instead of message/rfc822. Verify and remove if Magika handles
|
||||
# it correctly.
|
||||
from_element = None
|
||||
for i, header in enumerate(message.obj._headers):
|
||||
if header[0] == "From":
|
||||
|
||||
@@ -192,14 +192,11 @@ class BogusMailBox(AbstractContextManager):
|
||||
raise Exception
|
||||
|
||||
|
||||
def fake_magic_from_buffer(buffer, *, mime=False):
|
||||
if mime:
|
||||
if "PDF" in str(buffer):
|
||||
return "application/pdf"
|
||||
else:
|
||||
return "unknown/type"
|
||||
def fake_magic_from_buffer(buffer):
    """
    Test stand-in for ``mime_detection.from_buffer``.

    Reports ``application/pdf`` when the text ``"PDF"`` occurs in
    ``str(buffer)`` and ``"unknown/type"`` otherwise.
    """
    # str() is applied to whatever is passed in; for a bytes payload the
    # repr (e.g. "b'%PDF-1.4'") still contains the marker.
    if "PDF" in str(buffer):
        return "application/pdf"
    return "unknown/type"
|
||||
|
||||
|
||||
class MessageBuilder:
|
||||
@@ -408,7 +405,7 @@ def assert_eventually_equals(
|
||||
raise AssertionError(f"Expected {expected_value}, but got {actual}")
|
||||
|
||||
|
||||
@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer)
|
||||
@mock.patch("paperless_mail.mail.mime_detection.from_buffer", fake_magic_from_buffer)
|
||||
class TestMail(
|
||||
DirectoriesMixin,
|
||||
FileSystemAssertsMixin,
|
||||
|
||||
Reference in New Issue
Block a user