Merge branch 'master' of https://github.com/ramspoluri/parsedmarc into ramspoluri-master

This commit is contained in:
Sean Whalen
2024-11-02 11:38:41 -04:00
6 changed files with 154 additions and 59 deletions

View File

@@ -166,6 +166,9 @@ The full set of configuration options are:
- `check_timeout` - int: Number of seconds to wait for an IMAP - `check_timeout` - int: Number of seconds to wait for an IMAP
IDLE response or the number of seconds until the next IDLE response or the number of seconds until the next
mail check (Default: `30`) mail check (Default: `30`)
- `since` - str: Search for messages since a certain time (Examples: `5m|3h|2d|1w`).
Acceptable units - {"m":"minutes", "h":"hours", "d":"days", "w":"weeks"}.
Defaults to `1d` if an incorrect value is provided.
- `imap` - `imap`
- `host` - str: The IMAP server hostname or IP address - `host` - str: The IMAP server hostname or IP address
- `port` - int: The IMAP server port (Default: `993`) - `port` - int: The IMAP server port (Default: `993`)

View File

@@ -17,7 +17,7 @@ import zlib
from base64 import b64decode from base64 import b64decode
from collections import OrderedDict from collections import OrderedDict
from csv import DictWriter from csv import DictWriter
from datetime import datetime from datetime import datetime, timedelta
from io import BytesIO, StringIO from io import BytesIO, StringIO
from typing import Callable from typing import Callable
@@ -28,7 +28,8 @@ from lxml import etree
from mailsuite.smtp import send_email from mailsuite.smtp import send_email
from parsedmarc.log import logger from parsedmarc.log import logger
from parsedmarc.mail import MailboxConnection from parsedmarc.mail import MailboxConnection, IMAPConnection, \
MSGraphConnection, GmailConnection
from parsedmarc.utils import get_base_domain, get_ip_address_info from parsedmarc.utils import get_base_domain, get_ip_address_info
from parsedmarc.utils import is_outlook_msg, convert_outlook_msg from parsedmarc.utils import is_outlook_msg, convert_outlook_msg
from parsedmarc.utils import parse_email from parsedmarc.utils import parse_email
@@ -1483,24 +1484,23 @@ def get_dmarc_reports_from_mbox(
) )
def get_dmarc_reports_from_mailbox( def get_dmarc_reports_from_mailbox(connection: MailboxConnection,
connection: MailboxConnection, reports_folder="INBOX",
reports_folder="INBOX", archive_folder="Archive",
archive_folder="Archive", delete=False,
delete=False, test=False,
test=False, ip_db_path=None,
ip_db_path=None, always_use_local_files=False,
always_use_local_files=False, reverse_dns_map_path=None,
reverse_dns_map_path=None, reverse_dns_map_url=None,
reverse_dns_map_url=None, offline=False,
offline=False, nameservers=None,
nameservers=None, dns_timeout=6.0,
dns_timeout=6.0, strip_attachment_payloads=False,
strip_attachment_payloads=False, results=None,
results=None, batch_size=10,
batch_size=10, since=None,
create_folders=True, create_folders=True):
):
""" """
Fetches and parses DMARC reports from a mailbox Fetches and parses DMARC reports from a mailbox
@@ -1522,6 +1522,8 @@ def get_dmarc_reports_from_mailbox(
results (dict): Results from the previous run results (dict): Results from the previous run
batch_size (int): Number of messages to read and process before saving batch_size (int): Number of messages to read and process before saving
(use 0 for no limit) (use 0 for no limit)
since: Search for messages since certain time
(units - {"m":"minutes", "h":"hours", "d":"days", "w":"weeks"})
create_folders (bool): Whether to create the destination folders create_folders (bool): Whether to create the destination folders
(not used in watch) (not used in watch)
@@ -1534,6 +1536,9 @@ def get_dmarc_reports_from_mailbox(
if connection is None: if connection is None:
raise ValueError("Must supply a connection") raise ValueError("Must supply a connection")
# current_time useful to fetch_messages later in the program
current_time = None
aggregate_reports = [] aggregate_reports = []
forensic_reports = [] forensic_reports = []
smtp_tls_reports = [] smtp_tls_reports = []
@@ -1557,11 +1562,48 @@ def get_dmarc_reports_from_mailbox(
connection.create_folder(smtp_tls_reports_folder) connection.create_folder(smtp_tls_reports_folder)
connection.create_folder(invalid_reports_folder) connection.create_folder(invalid_reports_folder)
messages = connection.fetch_messages(reports_folder, batch_size=batch_size) if since:
_since = 1440 # default one day
if re.match(r'\d+[mhd]$', since):
s = re.split(r'(\d+)', since)
if s[2] == 'm':
_since = int(s[1])
elif s[2] == 'h':
_since = int(s[1])*60
elif s[2] == 'd':
_since = int(s[1])*60*24
elif s[2] == 'w':
_since = int(s[1])*60*24*7
else:
logger.warning("Incorrect format for \'since\' option. \
Provided value:{0}, Expected values:(5m|3h|2d|1w). \
Ignoring option, fetching messages for last 24hrs" \
"SMTP does not support a time or timezone in since." \
"See https://www.rfc-editor.org/rfc/rfc3501#page-52"
.format(since))
if isinstance(connection, IMAPConnection):
logger.debug("Only days and weeks values in \'since\' option are \
considered for IMAP conections. Examples: 2d or 1w")
since = (datetime.utcnow() - timedelta(minutes=_since)).date()
current_time = datetime.utcnow().date()
elif isinstance(connection, MSGraphConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)) \
.isoformat() + 'Z'
current_time = datetime.utcnow().isoformat() + 'Z'
elif isinstance(connection, GmailConnection):
since = (datetime.utcnow() - timedelta(minutes=_since)) \
.strftime('%s')
current_time = datetime.utcnow().strftime('%s')
else:
pass
messages = connection.fetch_messages(reports_folder, batch_size=batch_size,
since=since)
total_messages = len(messages) total_messages = len(messages)
logger.debug("Found {0} messages in {1}".format(len(messages), reports_folder)) logger.debug("Found {0} messages in {1}".format(len(messages), reports_folder))
if batch_size: if batch_size and not since:
message_limit = min(total_messages, batch_size) message_limit = min(total_messages, batch_size)
else: else:
message_limit = total_messages message_limit = total_messages
@@ -1570,12 +1612,18 @@ def get_dmarc_reports_from_mailbox(
for i in range(message_limit): for i in range(message_limit):
msg_uid = messages[i] msg_uid = messages[i]
logger.debug( logger.debug("Processing message {0} of {1}: UID {2}".format(
"Processing message {0} of {1}: UID {2}".format( i+1, message_limit, msg_uid
i + 1, message_limit, msg_uid ))
) if isinstance(mailbox, MSGraphConnection):
) if test:
msg_content = connection.fetch_message(msg_uid) msg_content = connection.fetch_message(msg_uid,
mark_read=False)
else:
msg_content = connection.fetch_message(msg_uid,
mark_read=True)
else:
msg_content = connection.fetch_message(msg_uid)
try: try:
sa = strip_attachment_payloads sa = strip_attachment_payloads
parsed_email = parse_report_email( parsed_email = parse_report_email(
@@ -1706,7 +1754,11 @@ def get_dmarc_reports_from_mailbox(
] ]
) )
total_messages = len(connection.fetch_messages(reports_folder)) if current_time:
total_messages = len(connection.fetch_messages(reports_folder,
since=current_time))
else:
total_messages = len(connection.fetch_messages(reports_folder))
if not test and not batch_size and total_messages > 0: if not test and not batch_size and total_messages > 0:
# Process emails that came in during the last run # Process emails that came in during the last run
@@ -1725,6 +1777,7 @@ def get_dmarc_reports_from_mailbox(
reverse_dns_map_path=reverse_dns_map_path, reverse_dns_map_path=reverse_dns_map_path,
reverse_dns_map_url=reverse_dns_map_url, reverse_dns_map_url=reverse_dns_map_url,
offline=offline, offline=offline,
since=current_time,
) )
return results return results

View File

@@ -510,6 +510,7 @@ def _main():
mailbox_test=False, mailbox_test=False,
mailbox_batch_size=10, mailbox_batch_size=10,
mailbox_check_timeout=30, mailbox_check_timeout=30,
mailbox_since=None,
imap_host=None, imap_host=None,
imap_skip_certificate_verification=False, imap_skip_certificate_verification=False,
imap_ssl=True, imap_ssl=True,
@@ -713,7 +714,10 @@ def _main():
if "batch_size" in mailbox_config: if "batch_size" in mailbox_config:
opts.mailbox_batch_size = mailbox_config.getint("batch_size") opts.mailbox_batch_size = mailbox_config.getint("batch_size")
if "check_timeout" in mailbox_config: if "check_timeout" in mailbox_config:
opts.mailbox_check_timeout = mailbox_config.getint("check_timeout") opts.mailbox_check_timeout = mailbox_config.getint(
"check_timeout")
if "since" in mailbox_config:
opts.mailbox_since = mailbox_config["since"]
if "imap" in config.sections(): if "imap" in config.sections():
imap_config = config["imap"] imap_config = config["imap"]
@@ -1540,6 +1544,7 @@ def _main():
nameservers=opts.nameservers, nameservers=opts.nameservers,
test=opts.mailbox_test, test=opts.mailbox_test,
strip_attachment_payloads=opts.strip_attachment_payloads, strip_attachment_payloads=opts.strip_attachment_payloads,
since=opts.mailbox_since,
) )
aggregate_reports += reports["aggregate_reports"] aggregate_reports += reports["aggregate_reports"]

View File

@@ -69,18 +69,33 @@ class GmailConnection(MailboxConnection):
else: else:
raise e raise e
def _fetch_all_message_ids(self, reports_label_id, page_token=None): def _fetch_all_message_ids(self, reports_label_id, page_token=None,
results = ( since=None):
self.service.users() if since:
.messages() results = (
.list( self.service.users()
userId="me", .messages()
includeSpamTrash=self.include_spam_trash, .list(
labelIds=[reports_label_id], userId="me",
pageToken=page_token, includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
q=f'after:{since}',
)
.execute()
)
else:
results = (
self.service.users()
.messages()
.list(
userId="me",
includeSpamTrash=self.include_spam_trash,
labelIds=[reports_label_id],
pageToken=page_token,
)
.execute()
) )
.execute()
)
messages = results.get("messages", []) messages = results.get("messages", [])
for message in messages: for message in messages:
yield message["id"] yield message["id"]
@@ -92,7 +107,12 @@ class GmailConnection(MailboxConnection):
def fetch_messages(self, reports_folder: str, **kwargs) -> List[str]: def fetch_messages(self, reports_folder: str, **kwargs) -> List[str]:
reports_label_id = self._find_label_id_for_label(reports_folder) reports_label_id = self._find_label_id_for_label(reports_folder)
return [id for id in self._fetch_all_message_ids(reports_label_id)] since = kwargs.get('since')
if since:
return [id for id in self._fetch_all_message_ids(reports_label_id,
since=since)]
else:
return [id for id in self._fetch_all_message_ids(reports_label_id)]
def fetch_message(self, message_id): def fetch_message(self, message_id):
msg = ( msg = (

View File

@@ -146,16 +146,24 @@ class MSGraphConnection(MailboxConnection):
def fetch_messages(self, folder_name: str, **kwargs) -> List[str]: def fetch_messages(self, folder_name: str, **kwargs) -> List[str]:
"""Returns a list of message UIDs in the specified folder""" """Returns a list of message UIDs in the specified folder"""
folder_id = self._find_folder_id_from_folder_path(folder_name) folder_id = self._find_folder_id_from_folder_path(folder_name)
url = f"/users/{self.mailbox_name}/mailFolders/" f"{folder_id}/messages" url = f'/users/{self.mailbox_name}/mailFolders/' \
batch_size = kwargs.get("batch_size") f'{folder_id}/messages'
since = kwargs.get('since')
if not since:
since = None
batch_size = kwargs.get('batch_size')
if not batch_size: if not batch_size:
batch_size = 0 batch_size = 0
emails = self._get_all_messages(url, batch_size) emails = self._get_all_messages(url, batch_size, since)
return [email["id"] for email in emails] return [email['id'] for email in emails]
def _get_all_messages(self, url, batch_size): def _get_all_messages(self, url, batch_size, since):
messages: list messages: list
params = {"$select": "id"} params = {
'$select': 'id'
}
if since:
params['$filter'] = f'receivedDateTime ge {since}'
if batch_size and batch_size > 0: if batch_size and batch_size > 0:
params["$top"] = batch_size params["$top"] = batch_size
else: else:
@@ -165,10 +173,11 @@ class MSGraphConnection(MailboxConnection):
raise RuntimeError(f"Failed to fetch messages {result.text}") raise RuntimeError(f"Failed to fetch messages {result.text}")
messages = result.json()["value"] messages = result.json()["value"]
# Loop if next page is present and not obtained message limit. # Loop if next page is present and not obtained message limit.
while "@odata.nextLink" in result.json() and ( while '@odata.nextLink' in result.json() and (
batch_size == 0 or batch_size - len(messages) > 0 since is not None or (
): batch_size == 0 or
result = self._client.get(result.json()["@odata.nextLink"]) batch_size - len(messages) > 0)):
result = self._client.get(result.json()['@odata.nextLink'])
if result.status_code != 200: if result.status_code != 200:
raise RuntimeError(f"Failed to fetch messages {result.text}") raise RuntimeError(f"Failed to fetch messages {result.text}")
messages.extend(result.json()["value"]) messages.extend(result.json()["value"])
@@ -183,14 +192,15 @@ class MSGraphConnection(MailboxConnection):
f"Failed to mark message read" f"{resp.status_code}: {resp.json()}" f"Failed to mark message read" f"{resp.status_code}: {resp.json()}"
) )
def fetch_message(self, message_id: str): def fetch_message(self, message_id: str, **kwargs):
url = f"/users/{self.mailbox_name}/messages/{message_id}/$value" url = f'/users/{self.mailbox_name}/messages/{message_id}/$value'
result = self._client.get(url) result = self._client.get(url)
if result.status_code != 200: if result.status_code != 200:
raise RuntimeWarning( raise RuntimeWarning(f"Failed to fetch message"
f"Failed to fetch message" f"{result.status_code}: {result.json()}" f"{result.status_code}: {result.json()}")
) mark_read = kwargs.get('mark_read')
self.mark_message_read(message_id) if mark_read:
self.mark_message_read(message_id)
return result.text return result.text
def delete_message(self, message_id: str): def delete_message(self, message_id: str):

View File

@@ -39,7 +39,11 @@ class IMAPConnection(MailboxConnection):
def fetch_messages(self, reports_folder: str, **kwargs): def fetch_messages(self, reports_folder: str, **kwargs):
self._client.select_folder(reports_folder) self._client.select_folder(reports_folder)
return self._client.search() since = kwargs.get('since')
if since:
return self._client.search([u'SINCE', since])
else:
return self._client.search()
def fetch_message(self, message_id): def fetch_message(self, message_id):
return self._client.fetch_message(message_id, parse=False) return self._client.fetch_message(message_id, parse=False)