From d6542a691e75e6dab7abd5959c9719f6f49c9501 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:04:14 -0700 Subject: [PATCH] Fixes potential sources for ReDoS --- src/documents/barcodes.py | 20 ++++-- .../plugins/date_parsing/regex_parser.py | 12 ++-- src/documents/regex.py | 70 +++++++++++++++++++ 3 files changed, 91 insertions(+), 11 deletions(-) diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 31ef052c4..38a28081a 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING +import regex as regex_mod from django.conf import settings from pdf2image import convert_from_path from pikepdf import Page @@ -22,6 +23,8 @@ from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import StopConsumeTaskError from documents.plugins.helpers import ProgressManager from documents.plugins.helpers import ProgressStatusOptions +from documents.regex import safe_regex_match +from documents.regex import safe_regex_sub from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats from documents.utils import maybe_override_pixel_limit @@ -68,8 +71,8 @@ class Barcode: Note: This does NOT exclude ASN or separator barcodes - they can also be used as tags if they match a tag mapping pattern (e.g., {"ASN12.*": "JOHN"}).
""" - for regex in self.settings.barcode_tag_mapping: - if re.match(regex, self.value, flags=re.IGNORECASE): + for pattern in self.settings.barcode_tag_mapping: + if safe_regex_match(pattern, self.value, flags=regex_mod.IGNORECASE): return True return False @@ -392,11 +395,16 @@ class BarcodePlugin(ConsumeTaskPlugin): for raw in tag_texts.split(","): try: tag_str: str | None = None - for regex in self.settings.barcode_tag_mapping: - if re.match(regex, raw, flags=re.IGNORECASE): - sub = self.settings.barcode_tag_mapping[regex] + for pattern in self.settings.barcode_tag_mapping: + if safe_regex_match(pattern, raw, flags=regex_mod.IGNORECASE): + sub = self.settings.barcode_tag_mapping[pattern] tag_str = ( - re.sub(regex, sub, raw, flags=re.IGNORECASE) + safe_regex_sub( + pattern, + sub, + raw, + flags=regex_mod.IGNORECASE, + ) if sub else raw ) diff --git a/src/documents/plugins/date_parsing/regex_parser.py b/src/documents/plugins/date_parsing/regex_parser.py index 2df8f9295..07a9e24f0 100644 --- a/src/documents/plugins/date_parsing/regex_parser.py +++ b/src/documents/plugins/date_parsing/regex_parser.py @@ -1,9 +1,11 @@ import datetime -import re from collections.abc import Iterator -from re import Match + +import regex +from regex import Match from documents.plugins.date_parsing.base import DateParserPluginBase +from documents.regex import safe_regex_finditer class RegexDateParserPlugin(DateParserPluginBase): @@ -14,7 +16,7 @@ class RegexDateParserPlugin(DateParserPluginBase): passed to its constructor. """ - DATE_REGEX = re.compile( + DATE_REGEX = regex.compile( r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|" r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|" r"(\b|(?!=([_-])))(\d{1,2}[\. 
]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|" @@ -22,7 +24,7 @@ class RegexDateParserPlugin(DateParserPluginBase): r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|" r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|" r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))", - re.IGNORECASE, + regex.IGNORECASE, ) def _process_match( @@ -45,7 +47,7 @@ class RegexDateParserPlugin(DateParserPluginBase): """ Finds all regex matches in content and yields valid dates. """ - for m in re.finditer(self.DATE_REGEX, content): + for m in safe_regex_finditer(self.DATE_REGEX, content): date = self._process_match(m, date_order) if date is not None: yield date diff --git a/src/documents/regex.py b/src/documents/regex.py index 35acc5af0..849d417d8 100644 --- a/src/documents/regex.py +++ b/src/documents/regex.py @@ -48,3 +48,73 @@ def safe_regex_search(pattern: str, text: str, *, flags: int = 0): textwrap.shorten(pattern, width=80, placeholder="…"), ) return None + + +def safe_regex_match(pattern: str, text: str, *, flags: int = 0): + """ + Run a regex match with a timeout. Returns a match object or None. + Validation errors and timeouts are logged and treated as no match. + """ + + try: + validate_regex_pattern(pattern) + compiled = regex.compile(pattern, flags=flags) + except (regex.error, ValueError) as exc: + logger.error( + "Error while processing regular expression %s: %s", + textwrap.shorten(pattern, width=80, placeholder="…"), + exc, + ) + return None + + try: + return compiled.match(text, timeout=REGEX_TIMEOUT_SECONDS) + except TimeoutError: + logger.warning( + "Regular expression matching timed out for pattern %s", + textwrap.shorten(pattern, width=80, placeholder="…"), + ) + return None + + +def safe_regex_sub(pattern: str, repl: str, text: str, *, flags: int = 0) -> str | None: + """ + Run a regex substitution with a timeout. 
Returns the substituted string, + or None on error/timeout. + """ + + try: + validate_regex_pattern(pattern) + compiled = regex.compile(pattern, flags=flags) + except (regex.error, ValueError) as exc: + logger.error( + "Error while processing regular expression %s: %s", + textwrap.shorten(pattern, width=80, placeholder="…"), + exc, + ) + return None + + try: + return compiled.sub(repl, text, timeout=REGEX_TIMEOUT_SECONDS) + except TimeoutError: + logger.warning( + "Regular expression substitution timed out for pattern %s", + textwrap.shorten(pattern, width=80, placeholder="…"), + ) + return None + + +def safe_regex_finditer(compiled_pattern: regex.Pattern, text: str): + """ + Run regex finditer with a timeout. Yields match objects. + Stops iteration on timeout. + """ + + try: + yield from compiled_pattern.finditer(text, timeout=REGEX_TIMEOUT_SECONDS) + except TimeoutError: + logger.warning( + "Regular expression finditer timed out for pattern %s", + textwrap.shorten(compiled_pattern.pattern, width=80, placeholder="…"), + ) + return