From c991feb8607688ee02b9bf7b20d23e91b7d431de Mon Sep 17 00:00:00 2001 From: Sean Whalen Date: Mon, 23 Sep 2019 00:12:51 -0400 Subject: [PATCH] Auto detect mbox files and add IMAP timeouts --- parsedmarc/__init__.py | 6 +++++ parsedmarc/cli.py | 60 +++++++++++++++++++++++++++--------------- parsedmarc/utils.py | 17 +++++++++--- requirements.txt | 2 +- setup.py | 2 +- 5 files changed, 60 insertions(+), 27 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index b9018e0..862a449 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -976,6 +976,8 @@ def get_dmarc_reports_from_inbox(connection=None, port=None, ssl=True, verify=True, + timeout=30, + max_retries=4, reports_folder="INBOX", archive_folder="Archive", delete=False, @@ -996,6 +998,8 @@ def get_dmarc_reports_from_inbox(connection=None, port: The mail server port ssl (bool): Use SSL/TLS verify (bool): Verify SSL/TLS certificate + timeout (float): IMAP timeout in seconds + max_retries (int): The maximum number of retries after a timeout reports_folder: The IMAP folder where reports can be found archive_folder: The folder to move processed mail to delete (bool): Delete messages after processing them @@ -1034,6 +1038,8 @@ def get_dmarc_reports_from_inbox(connection=None, else: server = IMAPClient(host, user, password, port=port, ssl=ssl, verify=verify, + timeout=timeout, + max_retries=max_retries, initial_folder=reports_folder) server.create_folder(archive_folder) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 523766e..be9b6c5 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -20,6 +20,7 @@ from parsedmarc import get_dmarc_reports_from_inbox, watch_inbox, \ parse_report_file, get_dmarc_reports_from_mbox, elastic, kafkaclient, \ splunk, save_output, email_results, ParserError, __version__, \ InvalidDMARCReport +from parsedmarc.utils import is_mbox logger = logging.getLogger("parsedmarc") @@ -152,8 +153,7 @@ def _main(): "(--silent implied)") 
arg_parser.add_argument("file_path", nargs="*", help="one or more paths to aggregate or forensic " - "report files or emails; prepend " - "mailboxes with 'mbox:' ") + "report files, emails, or mbox files") strip_attachment_help = "remove attachment payloads from forensic " \ "report output" arg_parser.add_argument("--strip-attachment-payloads", @@ -199,6 +199,8 @@ def _main(): imap_skip_certificate_verification=False, imap_ssl=True, imap_port=993, + imap_timeout=30, + imap_max_retries=4, imap_user=None, imap_password=None, imap_reports_folder="INBOX", @@ -211,6 +213,7 @@ def _main(): hec_index=None, hec_skip_certificate_verification=False, elasticsearch_hosts=None, + elasticsearch_timeout=60, elasticsearch_number_of_shards=1, elasticsearch_number_of_replicas=1, elasticsearch_index_suffix=None, @@ -284,7 +287,11 @@ def _main(): "imap config section") exit(-1) if "port" in imap_config: - opts.imap_port = imap_config["port"] + opts.imap_port = imap_config.getint("port") + if "timeout" in imap_config: + opts.imap_timeout = imap_config.getfloat("timeout") + if "max_retries" in imap_config: + opts.imap_max_retries = imap_config.getint("max_retries") if "ssl" in imap_config: opts.imap_ssl = imap_config.getboolean("ssl") if "skip_certificate_verification" in imap_config: @@ -323,6 +330,9 @@ def _main(): logger.critical("hosts setting missing from the " "elasticsearch config section") exit(-1) + if "timeout" in elasticsearch_config: + timeout = elasticsearch_config.getfloat("timeout") + opts.elasticsearch_timeout = timeout if "number_of_shards" in elasticsearch_config: number_of_shards = elasticsearch_config.getint( "number_of_shards") @@ -478,7 +488,8 @@ def _main(): es_forensic_index, suffix) elastic.set_hosts(opts.elasticsearch_hosts, opts.elasticsearch_ssl, - opts.elasticsearch_ssl_cert_path) + opts.elasticsearch_ssl_cert_path, + timeout=opts.elasticsearch_timeout) elastic.migrate_indexes(aggregate_indexes=[es_aggregate_index], forensic_indexes=[es_forensic_index]) except 
elastic.ElasticsearchError as error: @@ -503,15 +514,19 @@ def _main(): file_paths = [] mbox_paths = [] + for file_path in args.file_path: - if not file_path.startswith("mbox:"): - file_paths += glob(file_path) - else: - mbox_paths += glob(file_path[5:]) + file_paths += glob(file_path) + for file_path in file_paths: + if is_mbox(file_path): + mbox_paths.append(file_path) file_paths = list(set(file_paths)) mbox_paths = list(set(mbox_paths)) + for mbox_path in mbox_paths: + file_paths.remove(mbox_path) + counter = Value('i', 0) pool = Pool(opts.n_procs, initializer=init, initargs=(counter,)) results = pool.starmap_async(cli_parse, @@ -566,19 +581,22 @@ def _main(): verify = False if opts.imap_ssl is False: ssl = False - reports = get_dmarc_reports_from_inbox(host=opts.imap_host, - port=opts.imap_port, - ssl=ssl, - verify=verify, - user=opts.imap_user, - password=opts.imap_password, - reports_folder=rf, - archive_folder=af, - delete=opts.imap_delete, - offline=opts.offline, - nameservers=ns, - test=opts.imap_test, - strip_attachment_payloads=sa + reports = get_dmarc_reports_from_inbox( + host=opts.imap_host, + port=opts.imap_port, + ssl=ssl, + verify=verify, + timeout=opts.imap_timeout, + max_retries=opts.imap_max_retries, + user=opts.imap_user, + password=opts.imap_password, + reports_folder=rf, + archive_folder=af, + delete=opts.imap_delete, + offline=opts.offline, + nameservers=ns, + test=opts.imap_test, + strip_attachment_payloads=sa ) aggregate_reports += reports["aggregate_reports"] diff --git a/parsedmarc/utils.py b/parsedmarc/utils.py index ec25d9f..d2149c1 100644 --- a/parsedmarc/utils.py +++ b/parsedmarc/utils.py @@ -16,6 +16,8 @@ import hashlib import base64 import platform import atexit +import io +import mailbox import dateparser import dns.reversename @@ -422,18 +424,25 @@ def get_filename_safe_string(string): return string -def is_mbox(content): +def is_mbox(path): """ Checks if the given content is a MBOX mailbox file Args: - content: Content to check 
+ path: Path to the file to check Returns: bool: A flag the indicates if a file is a MBOX mailbox file """ - return type(content) == bytes and content.startswith( - b"\xD0\x0D\xBB\xAD") + _is_mbox = False + try: + mbox = mailbox.mbox(path) + if len(mbox.keys()) > 0: + _is_mbox = True + except Exception as e: + logger.debug("Error checking for MBOX file: {0}".format(e.__str__())) + + return _is_mbox def is_outlook_msg(content): diff --git a/requirements.txt b/requirements.txt index 631057d..0680ca2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ dateparser>=0.7.1 elasticsearch>=6.3.1,<7.0.0 elasticsearch-dsl>=6.3.1,<7.0.0 kafka-python>=1.4.4 -mailsuite>=1.1.0 +mailsuite>=1.3.0 nose>=1.3.7 flake8>=3.7.8 doc8>=0.8.0 diff --git a/setup.py b/setup.py index ab2f217..4e455de 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ setup( 'requests>=2.2.16.0', 'imapclient>=2.1.0', 'mail-parser>=3.9.2', 'dateparser>=0.7.1', - 'mailsuite>=1.2.1', + 'mailsuite>=1.3.1', 'elasticsearch>=6.3.1,<7.0.0', 'elasticsearch-dsl>=6.3.1,<7.0.0', 'kafka-python>=1.4.4',