Files
paperless-ngx/src/documents/plugins/date_parsing/base.py
2026-03-04 15:03:33 -08:00

121 lines
3.3 KiB
Python

import datetime
import logging
from abc import ABC
from abc import abstractmethod
from collections.abc import Iterator
from dataclasses import dataclass
from types import TracebackType
from typing import Self
import dateparser
logger = logging.getLogger(__name__)
@dataclass(frozen=True, slots=True)
class DateParserConfig:
"""
Configuration for a DateParser instance.
This object is created by the factory and passed to the
parser's constructor, decoupling the parser from settings.
"""
languages: list[str]
timezone_str: str
ignore_dates: set[datetime.date]
# A "now" timestamp for filtering future dates.
# Passed in by the factory.
reference_time: datetime.datetime
# Settings for the default RegexDateParser
# Other plugins should use or consider these, but it is not required
filename_date_order: str | None
content_date_order: str
class DateParserPluginBase(ABC):
"""
Abstract base class for date parsing strategies.
Instances are configured via a DateParserConfig object.
"""
def __init__(self, config: DateParserConfig):
"""
Initializes the parser with its configuration.
"""
self.config = config
def __enter__(self) -> Self:
"""
Enter the runtime context related to this object.
Subclasses can override this to acquire resources (connections, handles).
"""
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
"""
Exit the runtime context related to this object.
Subclasses can override this to release resources.
"""
# Default implementation does nothing.
# Returning None implies exceptions are propagated.
def _parse_string(
self,
date_string: str,
date_order: str,
) -> datetime.datetime | None:
"""
Helper method to parse a single date string using dateparser.
Uses configuration from `self.config`.
"""
try:
return dateparser.parse(
date_string,
settings={
"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True,
"TIMEZONE": self.config.timezone_str,
},
locales=self.config.languages,
)
except Exception as e:
logger.error(f"Error while parsing date string '{date_string}': {e}")
return None
def _filter_date(
self,
date: datetime.datetime | None,
) -> datetime.datetime | None:
"""
Helper method to validate a parsed datetime object.
Uses configuration from `self.config`.
"""
if (
date is not None
and date.year > 1900
and date <= self.config.reference_time
and date.date() not in self.config.ignore_dates
):
return date
return None
@abstractmethod
def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
"""
Parses a document's filename and content, yielding valid datetime objects.
"""