Compare commits

...

15 Commits

Author SHA1 Message Date
Sean Whalen
989bfd8f07 Code cleanup 2024-11-02 11:40:37 -04:00
Sean Whalen
908cc2918c Merge branch 'ramspoluri-master' 2024-11-02 11:39:34 -04:00
Sean Whalen
bd5774d71d Merge branch 'master' of https://github.com/ramspoluri/parsedmarc into ramspoluri-master 2024-11-02 11:38:41 -04:00
Sean Whalen
8e9112bad3 Merge branch 'master' of https://github.com/ramspoluri/parsedmarc 2024-11-02 10:48:15 -04:00
Sean Whalen
40e041a8af Merge branch 'master' of https://github.com/ramspoluri/parsedmarc 2024-11-02 10:48:10 -04:00
Sean Whalen
7ba433cddb Fix code style 2024-11-02 10:39:05 -04:00
Sean Whalen
6d467c93f9 Update __init__.py
Add reference to https://www.rfc-editor.org/rfc/rfc3501#page-52
2024-11-02 10:35:22 -04:00
Sean Whalen
be38e83761 Code cleanup 2024-11-02 10:28:11 -04:00
Sean Whalen
ef4e1ac8dc Code cleanup 2024-11-02 10:26:30 -04:00
Sean Whalen
39e4c22ecc Fix syntax 2024-11-02 10:23:23 -04:00
Sean Whalen
88ff3a2c23 Update syntax to support Python < 3.10 2024-11-02 10:04:01 -04:00
Sean Whalen
d8aee569f7 Update __init__.py 2024-11-02 09:50:55 -04:00
Sean Whalen
debc28cc6e 8.15.4
- Fix crash if aggregate report timespan is > 24 hours
2024-10-24 19:53:44 -04:00
Sean Whalen
52ccf0536c 8.15.3
- Ignore aggregate reports with a timespan of > 24 hours (Fixes #282)
2024-10-24 19:43:28 -04:00
ramspoluri
f618f69c6c Added 'since' option to search for messages since a certain time
- Added `since` option under `mailbox` section to search for messages since a certain time instead of going through the complete mailbox during testing scenarios. Acceptable values -`5m|3h|2d|1w`, units - {"m":"minutes", "h":"hours", "d":"days", "w":"weeks"}). Defaults to `1d` if an incorrect value is provided.
    - Not to mark messages as read if test option is selected (works only for MSGraphConnection)
2024-05-24 20:43:36 +05:30
9 changed files with 138 additions and 26 deletions

View File

@@ -101,6 +101,7 @@
"sourcetype",
"STARTTLS",
"tasklist",
"timespan",
"tlsa",
"tlsrpt",
"toctree",

View File

@@ -1,6 +1,16 @@
Changelog
=========
8.15.4
------
- Fix crash if aggregate report timespan is > 24 hours
8.15.3
------
- Ignore aggregate reports with a timespan of > 24 hours (Fixes #282)
8.15.2
------

View File

@@ -9,6 +9,7 @@ fi
. venv/bin/activate
pip install .[build]
ruff format .
ruff check .
cd docs
make clean
make html

View File

@@ -166,6 +166,9 @@ The full set of configuration options are:
- `check_timeout` - int: Number of seconds to wait for an IMAP
IDLE response or the number of seconds until the next
mail check (Default: `30`)
- `since` - str: Search for messages since a certain time. (Examples: `5m|3h|2d|1w`)
Acceptable units - {"m":"minutes", "h":"hours", "d":"days", "w":"weeks"}.
Defaults to `1d` if an incorrect value is provided.
- `imap`
- `host` - str: The IMAP server hostname or IP address
- `port` - int: The IMAP server port (Default: `993`)

View File

@@ -17,7 +17,7 @@ import zlib
from base64 import b64decode
from collections import OrderedDict
from csv import DictWriter
from datetime import datetime
from datetime import datetime, timedelta
from io import BytesIO, StringIO
from typing import Callable
@@ -28,13 +28,18 @@ from lxml import etree
from mailsuite.smtp import send_email
from parsedmarc.log import logger
from parsedmarc.mail import MailboxConnection
from parsedmarc.mail import (
MailboxConnection,
IMAPConnection,
MSGraphConnection,
GmailConnection,
)
from parsedmarc.utils import get_base_domain, get_ip_address_info
from parsedmarc.utils import is_outlook_msg, convert_outlook_msg
from parsedmarc.utils import parse_email
from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime
__version__ = "8.15.2"
__version__ = "8.15.4"
logger.debug("parsedmarc v{0}".format(__version__))
@@ -519,7 +524,7 @@ def parse_aggregate_report_xml(
date_range = report["report_metadata"]["date_range"]
if int(date_range["end"]) - int(date_range["begin"]) > 2 * 86400:
_error = "Time span > 24 hours - RFC 7489 section 7.2"
errors.append(_error)
raise InvalidAggregateReport(_error)
date_range["begin"] = timestamp_to_human(date_range["begin"])
date_range["end"] = timestamp_to_human(date_range["end"])
new_report_metadata["begin_date"] = date_range["begin"]
@@ -1499,6 +1504,7 @@ def get_dmarc_reports_from_mailbox(
strip_attachment_payloads=False,
results=None,
batch_size=10,
since=None,
create_folders=True,
):
"""
@@ -1522,6 +1528,8 @@ def get_dmarc_reports_from_mailbox(
results (dict): Results from the previous run
batch_size (int): Number of messages to read and process before saving
(use 0 for no limit)
since: Search for messages since certain time
(units - {"m":"minutes", "h":"hours", "d":"days", "w":"weeks"})
create_folders (bool): Whether to create the destination folders
(not used in watch)
@@ -1534,6 +1542,9 @@ def get_dmarc_reports_from_mailbox(
if connection is None:
raise ValueError("Must supply a connection")
# current_time useful to fetch_messages later in the program
current_time = None
aggregate_reports = []
forensic_reports = []
smtp_tls_reports = []
@@ -1557,11 +1568,50 @@ def get_dmarc_reports_from_mailbox(
connection.create_folder(smtp_tls_reports_folder)
connection.create_folder(invalid_reports_folder)
messages = connection.fetch_messages(reports_folder, batch_size=batch_size)
if since:
_since = 1440 # default one day
if re.match(r"\d+[mhd]$", since):
s = re.split(r"(\d+)", since)
if s[2] == "m":
_since = int(s[1])
elif s[2] == "h":
_since = int(s[1]) * 60
elif s[2] == "d":
_since = int(s[1]) * 60 * 24
elif s[2] == "w":
_since = int(s[1]) * 60 * 24 * 7
else:
logger.warning(
"Incorrect format for 'since' option. \
Provided value:{0}, Expected values:(5m|3h|2d|1w). \
Ignoring option, fetching messages for last 24hrs"
"SMTP does not support a time or timezone in since."
"See https://www.rfc-editor.org/rfc/rfc3501#page-52".format(since)
)
if isinstance(connection, IMAPConnection):
logger.debug(
"Only days and weeks values in 'since' option are \
considered for IMAP conections. Examples: 2d or 1w"
)
since = (datetime.utcnow() - timedelta(minutes=_since)).date()
current_time = datetime.utcnow().date()
elif isinstance(connection, MSGraphConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)).isoformat() + "Z"
current_time = datetime.utcnow().isoformat() + "Z"
elif isinstance(connection, GmailConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)).strftime("%s")
current_time = datetime.utcnow().strftime("%s")
else:
pass
messages = connection.fetch_messages(
reports_folder, batch_size=batch_size, since=since
)
total_messages = len(messages)
logger.debug("Found {0} messages in {1}".format(len(messages), reports_folder))
if batch_size:
if batch_size and not since:
message_limit = min(total_messages, batch_size)
else:
message_limit = total_messages
@@ -1575,7 +1625,13 @@ def get_dmarc_reports_from_mailbox(
i + 1, message_limit, msg_uid
)
)
msg_content = connection.fetch_message(msg_uid)
if isinstance(mailbox, MSGraphConnection):
if test:
msg_content = connection.fetch_message(msg_uid, mark_read=False)
else:
msg_content = connection.fetch_message(msg_uid, mark_read=True)
else:
msg_content = connection.fetch_message(msg_uid)
try:
sa = strip_attachment_payloads
parsed_email = parse_report_email(
@@ -1706,7 +1762,12 @@ def get_dmarc_reports_from_mailbox(
]
)
total_messages = len(connection.fetch_messages(reports_folder))
if current_time:
total_messages = len(
connection.fetch_messages(reports_folder, since=current_time)
)
else:
total_messages = len(connection.fetch_messages(reports_folder))
if not test and not batch_size and total_messages > 0:
# Process emails that came in during the last run
@@ -1725,6 +1786,7 @@ def get_dmarc_reports_from_mailbox(
reverse_dns_map_path=reverse_dns_map_path,
reverse_dns_map_url=reverse_dns_map_url,
offline=offline,
since=current_time,
)
return results

View File

@@ -510,6 +510,7 @@ def _main():
mailbox_test=False,
mailbox_batch_size=10,
mailbox_check_timeout=30,
mailbox_since=None,
imap_host=None,
imap_skip_certificate_verification=False,
imap_ssl=True,
@@ -714,6 +715,8 @@ def _main():
opts.mailbox_batch_size = mailbox_config.getint("batch_size")
if "check_timeout" in mailbox_config:
opts.mailbox_check_timeout = mailbox_config.getint("check_timeout")
if "since" in mailbox_config:
opts.mailbox_since = mailbox_config["since"]
if "imap" in config.sections():
imap_config = config["imap"]
@@ -1540,6 +1543,7 @@ def _main():
nameservers=opts.nameservers,
test=opts.mailbox_test,
strip_attachment_payloads=opts.strip_attachment_payloads,
since=opts.mailbox_since,
)
aggregate_reports += reports["aggregate_reports"]

View File

@@ -69,18 +69,32 @@ class GmailConnection(MailboxConnection):
else:
raise e
def _fetch_all_message_ids(self, reports_label_id, page_token=None):
results = (
self.service.users()
.messages()
.list(
userId="me",
includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
def _fetch_all_message_ids(self, reports_label_id, page_token=None, since=None):
if since:
results = (
self.service.users()
.messages()
.list(
userId="me",
includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
q=f"after:{since}",
)
.execute()
)
else:
results = (
self.service.users()
.messages()
.list(
userId="me",
includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
)
.execute()
)
.execute()
)
messages = results.get("messages", [])
for message in messages:
yield message["id"]
@@ -92,7 +106,13 @@ class GmailConnection(MailboxConnection):
def fetch_messages(self, reports_folder: str, **kwargs) -> List[str]:
reports_label_id = self._find_label_id_for_label(reports_folder)
return [id for id in self._fetch_all_message_ids(reports_label_id)]
since = kwargs.get("since")
if since:
return [
id for id in self._fetch_all_message_ids(reports_label_id, since=since)
]
else:
return [id for id in self._fetch_all_message_ids(reports_label_id)]
def fetch_message(self, message_id):
msg = (

View File

@@ -147,15 +147,20 @@ class MSGraphConnection(MailboxConnection):
"""Returns a list of message UIDs in the specified folder"""
folder_id = self._find_folder_id_from_folder_path(folder_name)
url = f"/users/{self.mailbox_name}/mailFolders/" f"{folder_id}/messages"
since = kwargs.get("since")
if not since:
since = None
batch_size = kwargs.get("batch_size")
if not batch_size:
batch_size = 0
emails = self._get_all_messages(url, batch_size)
emails = self._get_all_messages(url, batch_size, since)
return [email["id"] for email in emails]
def _get_all_messages(self, url, batch_size):
def _get_all_messages(self, url, batch_size, since):
messages: list
params = {"$select": "id"}
if since:
params["$filter"] = f"receivedDateTime ge {since}"
if batch_size and batch_size > 0:
params["$top"] = batch_size
else:
@@ -166,7 +171,7 @@ class MSGraphConnection(MailboxConnection):
messages = result.json()["value"]
# Loop if next page is present and not obtained message limit.
while "@odata.nextLink" in result.json() and (
batch_size == 0 or batch_size - len(messages) > 0
since is not None or (batch_size == 0 or batch_size - len(messages) > 0)
):
result = self._client.get(result.json()["@odata.nextLink"])
if result.status_code != 200:
@@ -183,14 +188,16 @@ class MSGraphConnection(MailboxConnection):
f"Failed to mark message read" f"{resp.status_code}: {resp.json()}"
)
def fetch_message(self, message_id: str):
def fetch_message(self, message_id: str, **kwargs):
url = f"/users/{self.mailbox_name}/messages/{message_id}/$value"
result = self._client.get(url)
if result.status_code != 200:
raise RuntimeWarning(
f"Failed to fetch message" f"{result.status_code}: {result.json()}"
)
self.mark_message_read(message_id)
mark_read = kwargs.get("mark_read")
if mark_read:
self.mark_message_read(message_id)
return result.text
def delete_message(self, message_id: str):

View File

@@ -39,7 +39,11 @@ class IMAPConnection(MailboxConnection):
def fetch_messages(self, reports_folder: str, **kwargs):
self._client.select_folder(reports_folder)
return self._client.search()
since = kwargs.get("since")
if since:
return self._client.search(["SINCE", since])
else:
return self._client.search()
def fetch_message(self, message_id):
return self._client.fetch_message(message_id, parse=False)