mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-03 18:54:27 +00:00
Fixes potential sources of ReDoS (regular expression denial of service)
This commit is contained in:
@@ -7,6 +7,7 @@ from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import regex as regex_mod
|
||||
from django.conf import settings
|
||||
from pdf2image import convert_from_path
|
||||
from pikepdf import Page
|
||||
@@ -22,6 +23,8 @@ from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import StopConsumeTaskError
|
||||
from documents.plugins.helpers import ProgressManager
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.regex import safe_regex_match
|
||||
from documents.regex import safe_regex_sub
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
@@ -68,8 +71,8 @@ class Barcode:
|
||||
Note: This does NOT exclude ASN or separator barcodes - they can also be used
|
||||
as tags if they match a tag mapping pattern (e.g., {"ASN12.*": "JOHN"}).
|
||||
"""
|
||||
for regex in self.settings.barcode_tag_mapping:
|
||||
if re.match(regex, self.value, flags=re.IGNORECASE):
|
||||
for pattern in self.settings.barcode_tag_mapping:
|
||||
if safe_regex_match(pattern, self.value, flags=regex_mod.IGNORECASE):
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -392,11 +395,16 @@ class BarcodePlugin(ConsumeTaskPlugin):
|
||||
for raw in tag_texts.split(","):
|
||||
try:
|
||||
tag_str: str | None = None
|
||||
for regex in self.settings.barcode_tag_mapping:
|
||||
if re.match(regex, raw, flags=re.IGNORECASE):
|
||||
sub = self.settings.barcode_tag_mapping[regex]
|
||||
for pattern in self.settings.barcode_tag_mapping:
|
||||
if safe_regex_match(pattern, raw, flags=regex_mod.IGNORECASE):
|
||||
sub = self.settings.barcode_tag_mapping[pattern]
|
||||
tag_str = (
|
||||
re.sub(regex, sub, raw, flags=re.IGNORECASE)
|
||||
safe_regex_sub(
|
||||
pattern,
|
||||
sub,
|
||||
raw,
|
||||
flags=regex_mod.IGNORECASE,
|
||||
)
|
||||
if sub
|
||||
else raw
|
||||
)
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
import datetime
|
||||
import re
|
||||
from collections.abc import Iterator
|
||||
from re import Match
|
||||
|
||||
import regex
|
||||
from regex import Match
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
from documents.regex import safe_regex_finditer
|
||||
|
||||
|
||||
class RegexDateParserPlugin(DateParserPluginBase):
|
||||
@@ -14,7 +16,7 @@ class RegexDateParserPlugin(DateParserPluginBase):
|
||||
passed to its constructor.
|
||||
"""
|
||||
|
||||
DATE_REGEX = re.compile(
|
||||
DATE_REGEX = regex.compile(
|
||||
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
|
||||
@@ -22,7 +24,7 @@ class RegexDateParserPlugin(DateParserPluginBase):
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
|
||||
re.IGNORECASE,
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
|
||||
def _process_match(
|
||||
@@ -45,7 +47,7 @@ class RegexDateParserPlugin(DateParserPluginBase):
|
||||
"""
|
||||
Finds all regex matches in content and yields valid dates.
|
||||
"""
|
||||
for m in re.finditer(self.DATE_REGEX, content):
|
||||
for m in safe_regex_finditer(self.DATE_REGEX, content):
|
||||
date = self._process_match(m, date_order)
|
||||
if date is not None:
|
||||
yield date
|
||||
|
||||
@@ -48,3 +48,73 @@ def safe_regex_search(pattern: str, text: str, *, flags: int = 0):
|
||||
textwrap.shorten(pattern, width=80, placeholder="…"),
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def safe_regex_match(pattern: str, text: str, *, flags: int = 0):
    """
    Match ``pattern`` against the start of ``text`` under a timeout.

    The pattern is passed through ``validate_regex_pattern`` and compiled
    with ``flags`` first. If either step raises ``regex.error`` or
    ``ValueError``, the error is logged and None is returned. If the match
    itself raises ``TimeoutError``, a warning is logged and None is
    returned. Otherwise the result of ``Pattern.match`` is returned, which
    is a match object or None.
    """
    try:
        validate_regex_pattern(pattern)
        matcher = regex.compile(pattern, flags=flags)
    except (regex.error, ValueError) as err:
        # Only a shortened form of the pattern is logged.
        shortened = textwrap.shorten(pattern, width=80, placeholder="…")
        logger.error(
            "Error while processing regular expression %s: %s",
            shortened,
            err,
        )
        return None

    try:
        result = matcher.match(text, timeout=REGEX_TIMEOUT_SECONDS)
    except TimeoutError:
        shortened = textwrap.shorten(pattern, width=80, placeholder="…")
        logger.warning(
            "Regular expression matching timed out for pattern %s",
            shortened,
        )
        return None
    return result
|
||||
|
||||
|
||||
def safe_regex_sub(pattern: str, repl: str, text: str, *, flags: int = 0) -> str | None:
    """
    Replace matches of ``pattern`` in ``text`` with ``repl`` under a timeout.

    The pattern is passed through ``validate_regex_pattern`` and compiled
    with ``flags`` first. If either step raises ``regex.error`` or
    ``ValueError``, the error is logged and None is returned. If the
    substitution raises ``TimeoutError``, a warning is logged and None is
    returned. Otherwise the substituted string is returned.
    """
    try:
        validate_regex_pattern(pattern)
        substituter = regex.compile(pattern, flags=flags)
    except (regex.error, ValueError) as err:
        # Only a shortened form of the pattern is logged.
        shortened = textwrap.shorten(pattern, width=80, placeholder="…")
        logger.error(
            "Error while processing regular expression %s: %s",
            shortened,
            err,
        )
        return None

    try:
        replaced = substituter.sub(repl, text, timeout=REGEX_TIMEOUT_SECONDS)
    except TimeoutError:
        shortened = textwrap.shorten(pattern, width=80, placeholder="…")
        logger.warning(
            "Regular expression substitution timed out for pattern %s",
            shortened,
        )
        return None
    return replaced
|
||||
|
||||
|
||||
def safe_regex_finditer(compiled_pattern: regex.Pattern, text: str):
    """
    Yield the matches of an already compiled pattern in ``text``.

    ``REGEX_TIMEOUT_SECONDS`` is passed to ``finditer`` as its timeout. If
    a ``TimeoutError`` is raised while iterating, a warning is logged and
    the generator ends; matches yielded before that point have already
    been delivered to the caller.
    """
    try:
        for found in compiled_pattern.finditer(text, timeout=REGEX_TIMEOUT_SECONDS):
            yield found
    except TimeoutError:
        # Only a shortened form of the pattern is logged.
        shortened = textwrap.shorten(
            compiled_pattern.pattern,
            width=80,
            placeholder="…",
        )
        logger.warning(
            "Regular expression finditer timed out for pattern %s",
            shortened,
        )
|
||||
|
||||
Reference in New Issue
Block a user