From 83e229aeb1e563088de76c5d573bfab8a0a0ae37 Mon Sep 17 00:00:00 2001 From: atanas argirov Date: Mon, 28 Dec 2020 15:57:32 +0000 Subject: [PATCH 01/31] * added output_{json,csv}_{aggregate,forensic}_file command line args * refactored save_output() to support output_* --- parsedmarc/__init__.py | 14 +++++++++----- parsedmarc/cli.py | 14 +++++++++++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 4924d07..49de20b 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1235,7 +1235,11 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, logger.warning("IMAP connection timeout. Reconnecting...") -def save_output(results, output_directory="output"): +def save_output(results, output_directory="output", \ + output_json_aggregate_file="aggregate.json", \ + output_json_forensic_file="forensic.json", \ + output_csv_aggregate_file="aggregate.csv", \ + output_csv_forensic_file="forensic.csv"): """ Save report data in the given directory @@ -1253,22 +1257,22 @@ def save_output(results, output_directory="output"): else: os.makedirs(output_directory) - with open("{0}".format(os.path.join(output_directory, "aggregate.json")), + with open("{0}".format(os.path.join(output_directory, output_json_aggregate_file)), "w", newline="\n", encoding="utf-8") as agg_json: agg_json.write(json.dumps(aggregate_reports, ensure_ascii=False, indent=2)) - with open("{0}".format(os.path.join(output_directory, "aggregate.csv")), + with open("{0}".format(os.path.join(output_directory, output_csv_aggregate_file)), "w", newline="\n", encoding="utf-8") as agg_csv: csv = parsed_aggregate_reports_to_csv(aggregate_reports) agg_csv.write(csv) - with open("{0}".format(os.path.join(output_directory, "forensic.json")), + with open("{0}".format(os.path.join(output_directory, output_json_forensic_file)), "w", newline="\n", encoding="utf-8") as for_json: for_json.write(json.dumps(forensic_reports, 
ensure_ascii=False, indent=2)) - with open("{0}".format(os.path.join(output_directory, "forensic.csv")), + with open("{0}".format(os.path.join(output_directory, output_csv_forensic_file)), "w", newline="\n", encoding="utf-8") as for_csv: csv = parsed_forensic_reports_to_csv(forensic_reports) for_csv.write(csv) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 9dad3fb..262a82b 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -82,7 +82,7 @@ def _main(): if opts.save_aggregate: for report in reports_["aggregate_reports"]: try: - if opts.elasticsearch_hosts: + opts.elasticsearch_hosts: shards = opts.elasticsearch_number_of_shards replicas = opts.elasticsearch_number_of_replicas elastic.save_aggregate_report_to_elasticsearch( @@ -160,6 +160,14 @@ def _main(): help=strip_attachment_help, action="store_true") arg_parser.add_argument("-o", "--output", help="write output files to the given directory") + arg_parser.add_argument("--output-json-aggregate-file", + help="output aggregate JSON file") + arg_parser.add_argument("--output-json-forensic-file", + help="output forensic JSON file") + arg_parser.add_argument("--output-csv-aggregate-file", + help="output aggregate CSV file") + arg_parser.add_argument("--output-csv-forensic-file", + help="output forensic CSV file") arg_parser.add_argument("-n", "--nameservers", nargs="+", help="nameservers to query") arg_parser.add_argument("-t", "--dns_timeout", @@ -188,6 +196,10 @@ def _main(): offline=args.offline, strip_attachment_payloads=args.strip_attachment_payloads, output=args.output, + output_json_aggregate_file=args.output_json_aggregate_file, + output_json_forensic_file=args.output_json_forensic_file, + output_csv_aggregate_file=args.output_csv_aggregate_file, + output_csv_forensic_file=args.output_csv_forensic_file, nameservers=args.nameservers, silent=args.silent, dns_timeout=args.dns_timeout, From 478452de203fbdcb4644b989c066124f6c98eb6a Mon Sep 17 00:00:00 2001 From: Mauro Faccenda Date: Wed, 20 Jan 2021 
15:53:19 +0100 Subject: [PATCH 02/31] pass offline parameter to wait_inbox() --- parsedmarc/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index e9119e0..68d9e89 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -670,6 +670,7 @@ def _main(): test=opts.imap_test, nameservers=opts.nameservers, dns_timeout=opts.dns_timeout, + offline=opts.offline, strip_attachment_payloads=sa) except FileExistsError as error: logger.error("{0}".format(error.__str__())) From be8395dbe341e5cca09fd7ff3630b6240f846ac3 Mon Sep 17 00:00:00 2001 From: Ola Thoresen Date: Wed, 20 Jan 2021 19:56:15 +0100 Subject: [PATCH 03/31] Detecting other IMAP-errors. Adding short sleep to avoid hammering the IMAP-server on error --- parsedmarc/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index ca4f1e7..5fe1f29 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -8,6 +8,7 @@ import shutil import xml.parsers.expat as expat import json from datetime import datetime +from time import sleep from collections import OrderedDict from io import BytesIO, StringIO from gzip import GzipFile @@ -1234,6 +1235,10 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, idle_timeout=idle_timeout) except (timeout, IMAPClientError): logger.warning("IMAP connection timeout. Reconnecting...") + sleep(5) + except Exception as e: + logger.warning("IMAP connection error. {0}. 
Reconnecting...".format(e)) + sleep(5) def save_output(results, output_directory="output"): From 0e2636225e7ffdb2ca7705075da665cab047a0c3 Mon Sep 17 00:00:00 2001 From: Ola Thoresen Date: Thu, 21 Jan 2021 08:24:44 +0100 Subject: [PATCH 04/31] Modifying some log-levels to INFO --- parsedmarc/__init__.py | 4 ++-- parsedmarc/cli.py | 2 ++ parsedmarc/elastic.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index ca4f1e7..cde57c9 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -943,7 +943,7 @@ def get_dmarc_reports_from_mbox(input_, nameservers=None, dns_timeout=2.0, input_)) for i in range(len(message_keys)): message_key = message_keys[i] - logger.debug("Processing message {0} of {1}".format( + logger.info("Processing message {0} of {1}".format( i+1, total_messages )) msg_content = mbox.get_string(message_key) @@ -1071,7 +1071,7 @@ def get_dmarc_reports_from_inbox(connection=None, reports_folder)) for i in range(len(messages)): msg_uid = messages[i] - logger.debug("Processing message {0} of {1}: UID {2}".format( + logger.info("Processing message {0} of {1}: UID {2}".format( i+1, total_messages, msg_uid )) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index e9119e0..752eacd 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -490,6 +490,8 @@ def _main(): logger.error("You must supply input files, or an IMAP configuration") exit(1) + logger.info("Starting dmarcparse") + if opts.save_aggregate or opts.save_forensic: try: if opts.elasticsearch_hosts: diff --git a/parsedmarc/elastic.py b/parsedmarc/elastic.py index b448f28..3f2d8a2 100644 --- a/parsedmarc/elastic.py +++ b/parsedmarc/elastic.py @@ -295,7 +295,7 @@ def save_aggregate_report_to_elasticsearch(aggregate_report, Raises: AlreadySaved """ - logger.debug("Saving aggregate report to Elasticsearch") + logger.info("Saving aggregate report to Elasticsearch") aggregate_report = aggregate_report.copy() metadata = 
aggregate_report["report_metadata"] org_name = metadata["org_name"] @@ -423,7 +423,7 @@ def save_forensic_report_to_elasticsearch(forensic_report, AlreadySaved """ - logger.debug("Saving forensic report to Elasticsearch") + logger.info("Saving forensic report to Elasticsearch") forensic_report = forensic_report.copy() sample_date = None if forensic_report["parsed_sample"]["date"] is not None: From 76614bdc94acc796b0307372e640d3836f0a77c2 Mon Sep 17 00:00:00 2001 From: Ola Thoresen Date: Thu, 21 Jan 2021 08:34:56 +0100 Subject: [PATCH 05/31] Fixing flake-error --- parsedmarc/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 5fe1f29..809dd63 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1237,7 +1237,8 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, logger.warning("IMAP connection timeout. Reconnecting...") sleep(5) except Exception as e: - logger.warning("IMAP connection error. {0}. Reconnecting...".format(e)) + logger.warning("IMAP connection error. {0}. 
" + "Reconnecting...".format(e)) sleep(5) From a00cee8ba4c493f37194e948a5de55655307fb86 Mon Sep 17 00:00:00 2001 From: Ola Thoresen Date: Fri, 22 Jan 2021 10:38:04 +0100 Subject: [PATCH 06/31] Adding a log line to see the sender of a report when it is parsed --- parsedmarc/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index cde57c9..5ac6965 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -776,6 +776,7 @@ def parse_report_email(input_, offline=False, nameservers=None, subject = None feedback_report = None sample = None + logger.info("Parsing mail from {0}".format(msg_headers["From"])) if "Subject" in msg_headers: subject = msg_headers["Subject"] for part in msg.walk(): From c853c470879ad2ba2cc56595197cb9fe9f2927d9 Mon Sep 17 00:00:00 2001 From: Ola Thoresen Date: Fri, 22 Jan 2021 15:06:35 +0100 Subject: [PATCH 07/31] Ensuring mail from is set --- parsedmarc/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 5ac6965..73c0130 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -776,7 +776,8 @@ def parse_report_email(input_, offline=False, nameservers=None, subject = None feedback_report = None sample = None - logger.info("Parsing mail from {0}".format(msg_headers["From"])) + if "From" in msg_headers: + logger.info("Parsing mail from {0}".format(msg_headers["From"])) if "Subject" in msg_headers: subject = msg_headers["Subject"] for part in msg.walk(): From bc684c891340abf39b91b5a2a758f6ace623d2f9 Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 13:37:09 +1300 Subject: [PATCH 08/31] Add option to process messages in batches --- README.rst | 1 + docs/index.rst | 1 + parsedmarc/__init__.py | 25 ++++++++++++++++++------- parsedmarc/cli.py | 14 +++++++++++--- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index b028445..ce3f2ea 100644 
--- a/README.rst +++ b/README.rst @@ -155,6 +155,7 @@ The full set of configuration options are: - ``watch`` - bool: Use the IMAP ``IDLE`` command to process messages as they arrive - ``delete`` - bool: Delete messages after processing them, instead of archiving them - ``test`` - bool: Do not move or delete messages + - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. ``127.0.0.1:9200`` or ``https://user:secret@localhost``) diff --git a/docs/index.rst b/docs/index.rst index 449f048..7897a30 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -161,6 +161,7 @@ The full set of configuration options are: - ``watch`` - bool: Use the IMAP ``IDLE`` command to process messages as they arrive - ``delete`` - bool: Delete messages after processing them, instead of archiving them - ``test`` - bool: Do not move or delete messages + - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. 
``127.0.0.1:9200`` or ``https://user:secret@localhost``) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index ca4f1e7..8380df7 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1004,7 +1004,8 @@ def get_dmarc_reports_from_inbox(connection=None, nameservers=None, dns_timeout=6.0, strip_attachment_payloads=False, - results=None): + results=None, + batch_size=None): """ Fetches and parses DMARC reports from an inbox @@ -1028,6 +1029,7 @@ def get_dmarc_reports_from_inbox(connection=None, strip_attachment_payloads (bool): Remove attachment payloads from forensic report results results (dict): Results from the previous run + batch_size (int): Number of messages to read and process before saving Returns: OrderedDict: Lists of ``aggregate_reports`` and ``forensic_reports`` @@ -1069,11 +1071,18 @@ def get_dmarc_reports_from_inbox(connection=None, total_messages = len(messages) logger.debug("Found {0} messages in {1}".format(len(messages), reports_folder)) - for i in range(len(messages)): + + if batch_size: + message_limit = batch_size + else: + message_limit = total_messages + + logger.debug("Processing {0} messages".format(message_limit)) + + for i in range(message_limit): msg_uid = messages[i] logger.debug("Processing message {0} of {1}: UID {2}".format( - i+1, total_messages, msg_uid - + i+1, message_limit, msg_uid )) msg_content = server.fetch_message(msg_uid, parse=False) sa = strip_attachment_payloads @@ -1165,7 +1174,7 @@ def get_dmarc_reports_from_inbox(connection=None, total_messages = len(server.search()) - if not test and total_messages > 0: + if not test and not batch_size and total_messages > 0: # Process emails that came in during the last run results = get_dmarc_reports_from_inbox( connection=server, @@ -1187,7 +1196,7 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, verify=True, reports_folder="INBOX", archive_folder="Archive", delete=False, test=False, idle_timeout=30, offline=False, 
nameservers=None, - dns_timeout=6.0, strip_attachment_payloads=False): + dns_timeout=6.0, strip_attachment_payloads=False, batch_size=None): """ Use an IDLE IMAP connection to parse incoming emails, and pass the results to a callback function @@ -1210,6 +1219,7 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, dns_timeout (float): Set the DNS query timeout strip_attachment_payloads (bool): Replace attachment payloads in forensic report samples with None + batch_size (int): Number of messages to read and process before saving """ sa = strip_attachment_payloads @@ -1222,7 +1232,8 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, offline=offline, nameservers=nameservers, dns_timeout=dns_timeout, - strip_attachment_payloads=sa) + strip_attachment_payloads=sa, + batch_size=batch_size) callback(res) while True: diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index e9119e0..df1a870 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -210,6 +210,7 @@ def _main(): imap_watch=False, imap_delete=False, imap_test=False, + imap_batch_size=None, hec=None, hec_token=None, hec_index=None, @@ -327,6 +328,10 @@ def _main(): opts.imap_delete = imap_config.getboolean("delete") if "test" in imap_config: opts.imap_test = imap_config.getboolean("test") + if "batch_size" in imap_config: + opts.imap_batch_size = imap_config.getint("batch_size") + else: + opts.imap_batch_size = None if "elasticsearch" in config: elasticsearch_config = config["elasticsearch"] if "hosts" in elasticsearch_config: @@ -613,8 +618,9 @@ def _main(): offline=opts.offline, nameservers=ns, test=opts.imap_test, - strip_attachment_payloads=sa - ) + strip_attachment_payloads=sa, + batch_size=opts.imap_batch_size + ) aggregate_reports += reports["aggregate_reports"] forensic_reports += reports["forensic_reports"] @@ -670,7 +676,9 @@ def _main(): test=opts.imap_test, nameservers=opts.nameservers, dns_timeout=opts.dns_timeout, - strip_attachment_payloads=sa) + 
strip_attachment_payloads=sa, + batch_size=opts.imap_batch_size + ) except FileExistsError as error: logger.error("{0}".format(error.__str__())) exit(1) From bafa4861b153199db60145ae7020d18ddaa7a4bc Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:27:22 +1300 Subject: [PATCH 09/31] Update docs --- README.rst | 11 +++++++++-- docs/example.ini | 4 ++++ docs/index.rst | 12 +++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index b028445..2d297f9 100644 --- a/README.rst +++ b/README.rst @@ -128,11 +128,15 @@ For example token = HECTokenGoesHere index = email + [s3] + bucket = my-bucket + path = /parsedmarc + The full set of configuration options are: - ``general`` - - ``save_aggregate`` - bool: Save aggregate report data to the Elasticsearch and/or Splunk - - ``save_forensic`` - bool: Save forensic report data to the Elasticsearch and/or Splunk + - ``save_aggregate`` - bool: Save aggregate report data to Elasticsearch, Splunk and/or S3 + - ``save_forensic`` - bool: Save forensic report data to Elasticsearch, Splunk and/or S3 - ``strip_attachment_payloads`` - bool: Remove attachment payloads from results - ``output`` - str: Directory to place JSON and CSV files in - ``offline`` - bool: Do not use online queries for geolocation or DNS @@ -191,6 +195,9 @@ The full set of configuration options are: - ``subject`` - str: The Subject header to use in the email (Default: parsedmarc report) - ``attachment`` - str: The ZIP attachment filenames - ``message`` - str: The email message (Default: Please see the attached parsedmarc report.) +- ``s3`` + - ``bucket`` - str: The S3 bucket name + - ``path`` - str: The path to upload reports to (Default: /) ..
warning:: diff --git a/docs/example.ini b/docs/example.ini index a27a670..a9a2985 100644 --- a/docs/example.ini +++ b/docs/example.ini @@ -18,3 +18,7 @@ ssl = False url = https://splunkhec.example.com token = HECTokenGoesHere index = email + +[s3] +bucket = my-bucket +path = /parsedmarc \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 449f048..7004cc4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -132,11 +132,15 @@ For example token = HECTokenGoesHere index = email + [s3] + bucket = my-bucket + path = /parsedmarc + The full set of configuration options are: - ``general`` - - ``save_aggregate`` - bool: Save aggregate report data to the Elasticsearch and/or Splunk - - ``save_forensic`` - bool: Save forensic report data to the Elasticsearch and/or Splunk + - ``save_aggregate`` - bool: Save aggregate report data to Elasticsearch, Splunk and/or S3 + - ``save_forensic`` - bool: Save forensic report data to Elasticsearch, Splunk and/or S3 - ``strip_attachment_payloads`` - bool: Remove attachment payloads from results - ``output`` - str: Directory to place JSON and CSV files in - ``offline`` - bool: Do not use online queries for geolocation or DNS @@ -200,7 +204,9 @@ The full set of configuration options are: - ``subject`` - str: The Subject header to use in the email (Default: parsedmarc report) - ``attachment`` - str: The ZIP attachment filenames - ``message`` - str: The email message (Default: Please see the attached parsedmarc report.) - +- ``s3`` + - ``bucket`` - str: The S3 bucket name + - ``path`` - str: The path to upload reports to (Default: /) ..
warning:: From 755ee3ded70adf4c61debf09aaddc150ab757bee Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:28:46 +1300 Subject: [PATCH 10/31] Add new settings for s3 --- parsedmarc/cli.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index e9119e0..a403391 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -241,6 +241,8 @@ def _main(): smtp_to=[], smtp_subject="parsedmarc report", smtp_message="Please see the attached DMARC results.", + s3_bucket=None, + s3_path=None, log_file=args.log_file, n_procs=1, chunk_size=1 @@ -469,6 +471,22 @@ def _main(): opts.smtp_attachment = smtp_config["attachment"] if "message" in smtp_config: opts.smtp_message = smtp_config["message"] + if "s3" in config.sections(): + s3_config = config["s3"] + if "bucket" in s3_config: + opts.s3_bucket = s3_config["bucket"] + else: + logger.critical("bucket setting missing from the " + "s3 config section") + exit(-1) + if "path" in s3_config: + opts.s3_path = s3_config["path"] + if opts.s3_path.startswith("/"): + opts.s3_path = opts.s3_path[1:] + if opts.s3_path.endswith("/"): + opts.s3_path = opts.s3_path[:-1] + else: + opts.s3_path = "" logging.basicConfig(level=logging.WARNING) logger.setLevel(logging.WARNING) From 291d389f69d279b249cab2b01c6bd7e7227b79e9 Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:29:27 +1300 Subject: [PATCH 11/31] Add boto3 --- requirements.txt | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4200a90..9ef3fab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ sphinx_rtd_theme>=0.4.3 wheel>=0.33.6 codecov>=2.0.15 lxml>=4.4.0 +boto3>=1.16.63 \ No newline at end of file diff --git a/setup.py b/setup.py index 6faaf91..6929893 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,8 @@ setup( 'elasticsearch-dsl>=7.2.0,<8.0.0', 'kafka-python>=1.4.4', 'tqdm>=4.31.1', - 
'lxml>=4.4.0' + 'lxml>=4.4.0', + 'boto3>=1.16.63' ], entry_points={ From a4acd5f2320ae6f518446c2a6223e7e664092301 Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:30:02 +1300 Subject: [PATCH 12/31] Add S3Client --- parsedmarc/s3.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 parsedmarc/s3.py diff --git a/parsedmarc/s3.py b/parsedmarc/s3.py new file mode 100644 index 0000000..8f64aa0 --- /dev/null +++ b/parsedmarc/s3.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +import logging +import json +import boto3 + +from parsedmarc.utils import human_timestamp_to_datetime + +logger = logging.getLogger("parsedmarc") + + +class S3Client(object): + """A client for a Amazon S3""" + + def __init__(self, bucket_name, bucket_path): + """ + Initializes the S3Client + Args: + bucket_name (str): The S3 Bucket + bucket_path (str): The path to save reports + """ + self.bucket_name = bucket_name + self.bucket_path = bucket_path + self.metadata_keys = [ + "org_name", + "org_email", + "report_id", + "begin_date", + "end_date", + ] + + self.s3 = boto3.resource('s3') + self.bucket = self.s3.Bucket(self.bucket_name) + + + def save_aggregate_report_to_s3(self, report): + self.save_report_to_s3(report, 'aggregate') + + + def save_forensic_report_to_s3(self, report): + self.save_report_to_s3(report, 'forensic') + + + def save_report_to_s3(self, report, report_type): + report_date = human_timestamp_to_datetime(report["report_metadata"]["begin_date"]) + report_id = report["report_metadata"]["report_id"] + object_path = "{0}/{1}/year={2}/month={3:02d}/day={4:02d}/{5}.json".format( + self.bucket_path, + report_type, + report_date.year, + report_date.month, + report_date.day, + report_id + ) + logger.debug("Saving {0} report to s3://{1}/{2}".format(report_type, self.bucket_name, object_path)) + object_metadata = { + k: v + for k, v in report["report_metadata"].items() + if k in self.metadata_keys + } + 
self.bucket.put_object( + Body=json.dumps(report), + Key=object_path, + Metadata=object_metadata + ) From 5f6b94583938ff3aedbf1d9e7c526e838e8c7c9c Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:30:54 +1300 Subject: [PATCH 13/31] Save reports to s3 --- parsedmarc/cli.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index a403391..1087745 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -19,7 +19,7 @@ from tqdm import tqdm from parsedmarc import get_dmarc_reports_from_inbox, watch_inbox, \ parse_report_file, get_dmarc_reports_from_mbox, elastic, kafkaclient, \ splunk, save_output, email_results, ParserError, __version__, \ - InvalidDMARCReport + InvalidDMARCReport, s3 from parsedmarc.utils import is_mbox logger = logging.getLogger("parsedmarc") @@ -79,6 +79,14 @@ def _main(): ) except Exception as error_: logger.error("Kafka Error: {0}".format(error_.__str__())) + if opts.s3_bucket: + try: + s3_client = s3.S3Client( + bucket_name=opts.s3_bucket, + bucket_path=opts.s3_path, + ) + except Exception as error_: + logger.error("S3 Error: {0}".format(error_.__str__())) if opts.save_aggregate: for report in reports_["aggregate_reports"]: try: @@ -104,6 +112,11 @@ def _main(): except Exception as error_: logger.error("Kafka Error: {0}".format( error_.__str__())) + try: + if opts.s3_bucket: + s3_client.save_aggregate_report_to_s3(report) + except Exception as error_: + logger.error("S3 Error: {0}".format(error_.__str__())) if opts.hec: try: aggregate_reports_ = reports_["aggregate_reports"] @@ -138,6 +151,11 @@ def _main(): except Exception as error_: logger.error("Kafka Error: {0}".format( error_.__str__())) + try: + if opts.s3_bucket: + s3_client.save_forensic_report_to_s3(report) + except Exception as error_: + logger.error("S3 Error: {0}".format(error_.__str__())) if opts.hec: try: forensic_reports_ = reports_["forensic_reports"] From 
eba722cddceafd50cdcee3e7c2ed6b3aff01f387 Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:38:52 +1300 Subject: [PATCH 14/31] Fix path example --- README.rst | 2 +- docs/example.ini | 2 +- docs/index.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 2d297f9..7c85ce1 100644 --- a/README.rst +++ b/README.rst @@ -130,7 +130,7 @@ For example [s3] bucket = my-bucket - path = /parsedmarc + path = parsedmarc The full set of configuration options are: diff --git a/docs/example.ini b/docs/example.ini index a9a2985..efa56b3 100644 --- a/docs/example.ini +++ b/docs/example.ini @@ -21,4 +21,4 @@ index = email [s3] bucket = my-bucket -path = /parsedmarc \ No newline at end of file +path = parsedmarc \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 7004cc4..c63ecef 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -134,7 +134,7 @@ For example [s3] bucket = my-bucket - path = /parsedmarc + path = parsedmarc The full set of configuration options are: From 9522c9b6e4400a3758a74d143f1b6d7a69d0a12b Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:51:32 +1300 Subject: [PATCH 15/31] Ensure message_limit is not greater than total_messages --- parsedmarc/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 8380df7..78bd356 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1073,7 +1073,7 @@ def get_dmarc_reports_from_inbox(connection=None, reports_folder)) if batch_size: - message_limit = batch_size + message_limit = min(total_messages, batch_size) else: message_limit = total_messages From de05be90df8ef7fc1233be6803c5991da7c9345c Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 14:53:43 +1300 Subject: [PATCH 16/31] Fix flake8 error --- parsedmarc/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parsedmarc/__init__.py 
b/parsedmarc/__init__.py index 78bd356..2f192a5 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1196,7 +1196,8 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, verify=True, reports_folder="INBOX", archive_folder="Archive", delete=False, test=False, idle_timeout=30, offline=False, nameservers=None, - dns_timeout=6.0, strip_attachment_payloads=False, batch_size=None): + dns_timeout=6.0, strip_attachment_payloads=False, + batch_size=None): """ Use an IDLE IMAP connection to parse incoming emails, and pass the results to a callback function From 85e7fd4ce6d60163e14341ed70f3da81bbe979f6 Mon Sep 17 00:00:00 2001 From: Tom Henderson Date: Fri, 5 Feb 2021 15:58:57 +1300 Subject: [PATCH 17/31] Fix flake8 errors --- parsedmarc/s3.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/parsedmarc/s3.py b/parsedmarc/s3.py index 8f64aa0..41910ed 100644 --- a/parsedmarc/s3.py +++ b/parsedmarc/s3.py @@ -28,23 +28,23 @@ class S3Client(object): "begin_date", "end_date", ] - + self.s3 = boto3.resource('s3') self.bucket = self.s3.Bucket(self.bucket_name) - def save_aggregate_report_to_s3(self, report): self.save_report_to_s3(report, 'aggregate') - def save_forensic_report_to_s3(self, report): self.save_report_to_s3(report, 'forensic') - def save_report_to_s3(self, report, report_type): - report_date = human_timestamp_to_datetime(report["report_metadata"]["begin_date"]) + report_date = human_timestamp_to_datetime( + report["report_metadata"]["begin_date"] + ) report_id = report["report_metadata"]["report_id"] - object_path = "{0}/{1}/year={2}/month={3:02d}/day={4:02d}/{5}.json".format( + path_template = "{0}/{1}/year={2}/month={3:02d}/day={4:02d}/{5}.json" + object_path = path_template.format( self.bucket_path, report_type, report_date.year, @@ -52,7 +52,10 @@ class S3Client(object): report_date.day, report_id ) - logger.debug("Saving {0} report to s3://{1}/{2}".format(report_type, self.bucket_name, 
object_path)) + logger.debug("Saving {0} report to s3://{1}/{2}".format( + report_type, + self.bucket_name, + object_path)) object_metadata = { k: v for k, v in report["report_metadata"].items() From 394dddd2df6c826b4702620b2c637fd2d895b59f Mon Sep 17 00:00:00 2001 From: supaeasy <59504964+supaeasy@users.noreply.github.com> Date: Fri, 5 Feb 2021 15:16:51 +0100 Subject: [PATCH 18/31] Update README.rst I struggled too long with this to not let others know. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index b028445..3428a0c 100644 --- a/README.rst +++ b/README.rst @@ -145,7 +145,7 @@ The full set of configuration options are: - ``chunk_size`` - int: Number of files to give to each process when running in parallel. Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` - ``host`` - str: The IMAP server hostname or IP address - - ``port`` - int: The IMAP server port (Default: 993) + - ``port`` - int: The IMAP server port (Default: 993) If your Hoster publishes another port, still try 993. Otherwise Error:"wrong SSL version" - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) - ``user`` - str: The IMAP user From 36c592cc5a6cf52995763c62c6087402538946f9 Mon Sep 17 00:00:00 2001 From: atanas argirov Date: Thu, 11 Feb 2021 18:22:29 +0000 Subject: [PATCH 19/31] * added defaults for arg parser --- parsedmarc/__init__.py | 1 - parsedmarc/cli.py | 17 +++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 49de20b..e27aee8 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1234,7 +1234,6 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, except (timeout, IMAPClientError): logger.warning("IMAP connection timeout. 
Reconnecting...") - def save_output(results, output_directory="output", \ output_json_aggregate_file="aggregate.json", \ output_json_forensic_file="forensic.json", \ diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 262a82b..b0d4d72 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -82,7 +82,7 @@ def _main(): if opts.save_aggregate: for report in reports_["aggregate_reports"]: try: - opts.elasticsearch_hosts: + if opts.elasticsearch_hosts: shards = opts.elasticsearch_number_of_shards replicas = opts.elasticsearch_number_of_replicas elastic.save_aggregate_report_to_elasticsearch( @@ -161,13 +161,13 @@ def _main(): arg_parser.add_argument("-o", "--output", help="write output files to the given directory") arg_parser.add_argument("--output-json-aggregate-file", - help="output aggregate JSON file") + help="output aggregate JSON file", default="aggregate.json") arg_parser.add_argument("--output-json-forensic-file", - help="output forensic JSON file") + help="output forensic JSON file", default="forensic.json") arg_parser.add_argument("--output-csv-aggregate-file", - help="output aggregate CSV file") + help="output aggregate CSV file", default="aggregate.csv") arg_parser.add_argument("--output-csv-forensic-file", - help="output forensic CSV file") + help="output forensic CSV file", default="forensic.csv") arg_parser.add_argument("-n", "--nameservers", nargs="+", help="nameservers to query") arg_parser.add_argument("-t", "--dns_timeout", @@ -191,6 +191,7 @@ def _main(): forensic_reports = [] args = arg_parser.parse_args() + opts = Namespace(file_path=args.file_path, config_file=args.config_file, offline=args.offline, @@ -631,7 +632,11 @@ def _main(): ("forensic_reports", forensic_reports)]) if opts.output: - save_output(results, output_directory=opts.output) + save_output(results, output_directory=opts.output, \ + output_json_aggregate_file=opts.output_json_aggregate_file, \ + output_json_forensic_file=opts.output_json_forensic_file, \ + 
output_csv_aggregate_file=opts.output_csv_aggregate_file, \ + output_csv_forensic_file=opts.output_csv_forensic_file) process_reports(results) From e51f2b0127bad34903fee4c88d44b5f4660b0913 Mon Sep 17 00:00:00 2001 From: atanas argirov Date: Fri, 12 Feb 2021 10:50:25 +0000 Subject: [PATCH 20/31] * general cleanup to meet linter rules --- parsedmarc/__init__.py | 27 ++++++++++++++++++--------- parsedmarc/cli.py | 38 +++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index e27aee8..7f0c7d0 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1234,11 +1234,12 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, except (timeout, IMAPClientError): logger.warning("IMAP connection timeout. Reconnecting...") -def save_output(results, output_directory="output", \ - output_json_aggregate_file="aggregate.json", \ - output_json_forensic_file="forensic.json", \ - output_csv_aggregate_file="aggregate.csv", \ - output_csv_forensic_file="forensic.csv"): + +def save_output(results, output_directory="output", + output_json_aggregate="aggregate.json", + output_json_forensic="forensic.json", + output_csv_aggregate="aggregate.csv", + output_csv_forensic="forensic.csv"): """ Save report data in the given directory @@ -1256,22 +1257,30 @@ def save_output(results, output_directory="output", \ else: os.makedirs(output_directory) - with open("{0}".format(os.path.join(output_directory, output_json_aggregate_file)), + with open("{0}" + .format(os.path.join(output_directory, + output_json_aggregate)), "w", newline="\n", encoding="utf-8") as agg_json: agg_json.write(json.dumps(aggregate_reports, ensure_ascii=False, indent=2)) - with open("{0}".format(os.path.join(output_directory, output_csv_aggregate_file)), + with open("{0}" + .format(os.path.join(output_directory, + output_csv_aggregate)), "w", newline="\n", encoding="utf-8") as agg_csv: csv = 
parsed_aggregate_reports_to_csv(aggregate_reports) agg_csv.write(csv) - with open("{0}".format(os.path.join(output_directory, output_json_forensic_file)), + with open("{0}" + .format(os.path.join(output_directory, + output_json_forensic)), "w", newline="\n", encoding="utf-8") as for_json: for_json.write(json.dumps(forensic_reports, ensure_ascii=False, indent=2)) - with open("{0}".format(os.path.join(output_directory, output_csv_forensic_file)), + with open("{0}" + .format(os.path.join(output_directory, + output_csv_forensic)), "w", newline="\n", encoding="utf-8") as for_csv: csv = parsed_forensic_reports_to_csv(forensic_reports) for_csv.write(csv) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index b0d4d72..777dfc5 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -160,14 +160,18 @@ def _main(): help=strip_attachment_help, action="store_true") arg_parser.add_argument("-o", "--output", help="write output files to the given directory") - arg_parser.add_argument("--output-json-aggregate-file", - help="output aggregate JSON file", default="aggregate.json") - arg_parser.add_argument("--output-json-forensic-file", - help="output forensic JSON file", default="forensic.json") - arg_parser.add_argument("--output-csv-aggregate-file", - help="output aggregate CSV file", default="aggregate.csv") - arg_parser.add_argument("--output-csv-forensic-file", - help="output forensic CSV file", default="forensic.csv") + arg_parser.add_argument("--output-json-aggregate", + help="output aggregate JSON file", + default="aggregate.json") + arg_parser.add_argument("--output-json-forensic", + help="output forensic JSON file", + default="forensic.json") + arg_parser.add_argument("--output-csv-aggregate", + help="output aggregate CSV file", + default="aggregate.csv") + arg_parser.add_argument("--output-csv-forensic", + help="output forensic CSV file", + default="forensic.csv") arg_parser.add_argument("-n", "--nameservers", nargs="+", help="nameservers to query") 
arg_parser.add_argument("-t", "--dns_timeout", @@ -197,10 +201,10 @@ def _main(): offline=args.offline, strip_attachment_payloads=args.strip_attachment_payloads, output=args.output, - output_json_aggregate_file=args.output_json_aggregate_file, - output_json_forensic_file=args.output_json_forensic_file, - output_csv_aggregate_file=args.output_csv_aggregate_file, - output_csv_forensic_file=args.output_csv_forensic_file, + output_json_aggregate=args.output_json_aggregate, + output_json_forensic=args.output_json_forensic, + output_csv_aggregate=args.output_csv_aggregate, + output_csv_forensic=args.output_csv_forensic, nameservers=args.nameservers, silent=args.silent, dns_timeout=args.dns_timeout, @@ -632,11 +636,11 @@ def _main(): ("forensic_reports", forensic_reports)]) if opts.output: - save_output(results, output_directory=opts.output, \ - output_json_aggregate_file=opts.output_json_aggregate_file, \ - output_json_forensic_file=opts.output_json_forensic_file, \ - output_csv_aggregate_file=opts.output_csv_aggregate_file, \ - output_csv_forensic_file=opts.output_csv_forensic_file) + save_output(results, output_directory=opts.output, + output_json_aggregate=opts.output_json_aggregate, + output_json_forensic=opts.output_json_forensic, + output_csv_aggregate=opts.output_csv_aggregate, + output_csv_forensic=opts.output_csv_forensic) process_reports(results) From 775a6f21819d7aaf389a75bd5f21d4e61fba5644 Mon Sep 17 00:00:00 2001 From: Silvian I Date: Mon, 31 May 2021 15:40:57 +0200 Subject: [PATCH 21/31] Fix server connection timeout while processing large dmarc files --- parsedmarc/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 562a0a8..9873316 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -304,8 +304,12 @@ def parse_aggregate_report_xml(xml, offline=False, nameservers=None, new_report["policy_published"] = new_policy_published if type(report["record"]) ==
list: - for record in report["record"]: - report_record = _parse_report_record(record, + for i in range(len(report["record"])): + if i % 20 == 0 and i > 0: + logger.debug("Sending noop cmd") + server.noop() + logger.debug("Processed {0}/{1}".format(i, len(report["record"]))) + report_record = _parse_report_record(report["record"][i], offline=offline, nameservers=nameservers, dns_timeout=timeout, @@ -1039,6 +1043,7 @@ def get_dmarc_reports_from_inbox(connection=None, raise ValueError("Must supply a connection, or a username and " "password") + global server aggregate_reports = [] forensic_reports = [] aggregate_report_msg_uids = [] From 3615ad3799f058e8012b39f708aa500f51aa2cf4 Mon Sep 17 00:00:00 2001 From: Silvian I Date: Mon, 31 May 2021 15:40:57 +0200 Subject: [PATCH 22/31] Fix server connection timeout while processing large dmarc files --- parsedmarc/__init__.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 562a0a8..9de72c1 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -203,7 +203,7 @@ def _parse_report_record(record, offline=False, nameservers=None, def parse_aggregate_report_xml(xml, offline=False, nameservers=None, - timeout=2.0, parallel=False): + timeout=2.0, parallel=False, server=None): """Parses a DMARC XML report string and returns a consistent OrderedDict Args: @@ -213,6 +213,7 @@ def parse_aggregate_report_xml(xml, offline=False, nameservers=None, (Cloudflare's public DNS resolvers by default) timeout (float): Sets the DNS timeout in seconds parallel (bool): Parallel processing + server (IMAPClient): Connection object Returns: OrderedDict: The parsed aggregate DMARC report @@ -304,8 +305,13 @@ def parse_aggregate_report_xml(xml, offline=False, nameservers=None, new_report["policy_published"] = new_policy_published if type(report["record"]) == list: - for record in report["record"]: - report_record =
_parse_report_record(record, + for i in range(len(report["record"])): + if server is not None and i > 0 and i % 20 == 0: + logger.debug("Sending noop cmd") + server.noop() + logger.debug("Processed {0}/{1}".format( + i, len(report["record"]))) + report_record = _parse_report_record(report["record"][i], offline=offline, nameservers=nameservers, dns_timeout=timeout, @@ -385,7 +391,8 @@ def extract_xml(input_): def parse_aggregate_report_file(_input, offline=False, nameservers=None, dns_timeout=2.0, - parallel=False): + parallel=False, + server=None): """Parses a file at the given path, a file-like object. or bytes as a aggregate DMARC report @@ -396,6 +403,7 @@ def parse_aggregate_report_file(_input, offline=False, nameservers=None, (Cloudflare's public DNS resolvers by default) dns_timeout (float): Sets the DNS timeout in seconds parallel (bool): Parallel processing + server (IMAPClient): Connection object Returns: OrderedDict: The parsed DMARC aggregate report @@ -406,7 +414,8 @@ def parse_aggregate_report_file(_input, offline=False, nameservers=None, offline=offline, nameservers=nameservers, timeout=dns_timeout, - parallel=parallel) + parallel=parallel, + server=server) def parsed_aggregate_reports_to_csv_rows(reports): @@ -738,7 +747,7 @@ def parsed_forensic_reports_to_csv(reports): def parse_report_email(input_, offline=False, nameservers=None, dns_timeout=2.0, strip_attachment_payloads=False, - parallel=False): + parallel=False, server=None): """ Parses a DMARC report from an email @@ -750,6 +759,7 @@ def parse_report_email(input_, offline=False, nameservers=None, strip_attachment_payloads (bool): Remove attachment payloads from forensic report results parallel (bool): Parallel processing + server (IMAPClient): Connection object Returns: OrderedDict: @@ -813,7 +823,8 @@ def parse_report_email(input_, offline=False, nameservers=None, offline=offline, nameservers=ns, dns_timeout=dns_timeout, - parallel=parallel) + parallel=parallel, + server=server) result = 
OrderedDict([("report_type", "aggregate"), ("report", aggregate_report)]) return result @@ -863,7 +874,7 @@ def parse_report_email(input_, offline=False, nameservers=None, def parse_report_file(input_, nameservers=None, dns_timeout=2.0, strip_attachment_payloads=False, - offline=False, parallel=False): + offline=False, parallel=False, server=None): """Parses a DMARC aggregate or forensic file at the given path, a file-like object. or bytes @@ -876,6 +887,7 @@ def parse_report_file(input_, nameservers=None, dns_timeout=2.0, forensic report results offline (bool): Do not make online queries for geolocation or DNS parallel (bool): Parallel processing + server (IMAPClient): Connection object Returns: OrderedDict: The parsed DMARC report @@ -895,7 +907,8 @@ def parse_report_file(input_, nameservers=None, dns_timeout=2.0, offline=offline, nameservers=nameservers, dns_timeout=dns_timeout, - parallel=parallel) + parallel=parallel, + server=server) results = OrderedDict([("report_type", "aggregate"), ("report", report)]) except InvalidAggregateReport: @@ -906,7 +919,8 @@ def parse_report_file(input_, nameservers=None, dns_timeout=2.0, nameservers=nameservers, dns_timeout=dns_timeout, strip_attachment_payloads=sa, - parallel=parallel) + parallel=parallel, + server=server) except InvalidDMARCReport: raise InvalidDMARCReport("Not a valid aggregate or forensic " "report") @@ -1083,7 +1097,8 @@ def get_dmarc_reports_from_inbox(connection=None, nameservers=nameservers, dns_timeout=dns_timeout, offline=offline, - strip_attachment_payloads=sa) + strip_attachment_payloads=sa, + server=server) if parsed_email["report_type"] == "aggregate": aggregate_reports.append(parsed_email["report"]) aggregate_report_msg_uids.append(msg_uid) From 837ba7ef4d0e87fa67ce09b765145003eaed40d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matth=C3=A4us=20Wander?= Date: Sun, 6 Jun 2021 16:35:07 +0200 Subject: [PATCH 23/31] Added splunk installation guide --- splunk/README.rst | 73 
+++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/splunk/README.rst b/splunk/README.rst index 44a8ccf..42198f5 100644 --- a/splunk/README.rst +++ b/splunk/README.rst @@ -1,9 +1,53 @@ -================= -Splunk dashboards -================= +=================== +Splunk Installation +=================== -Setup guide ------------ +Install Splunk for use with Docker +---------------------------------- + +Download latest Splunk image:: + + docker pull splunk/splunk:latest + +Run Splunk with Docker +---------------------- + +Listen on all network interfaces:: + + docker run -d -p 8000:8000 -p 8088:8088 -e "SPLUNK_START_ARGS=--accept-license" -e "SPLUNK_PASSWORD=password1234" -e "SPLUNK_HEC_TOKEN=hec-token-1234" --name splunk splunk/splunk:latest + +Listen on localhost for use with reverse proxy with base URL `/splunk`:: + + docker run -d -p 127.0.0.1:8000:8000 -p 127.0.0.1:8088:8088 -e "SPLUNK_START_ARGS=--accept-license" -e "SPLUNK_PASSWORD=password1234" -e "SPLUNK_HEC_TOKEN=hec-token-1234" -e "SPLUNK_ROOT_ENDPOINT=/splunk" --name splunk splunk/splunk:latest + +Set up reverse proxy, e.g. Apache2:: + + ProxyPass /splunk http://127.0.0.1:8000/splunk + ProxyPassReverse /splunk http://127.0.0.1:8000/splunk + +Splunk Configuration +-------------------- + +Access web UI at http://127.0.0.1:8000 and log in with `admin:password1234`. + +Create App and Index +~~~~~~~~~~~~~~~~~~~~ + +- Settings > Data > Indexes: New Index + + - Index name: "email" + +- HEC token `hec-token-1234` should be already set up. + + - Check under Settings > Data > Data inputs: HTTP Event Collector + +- Apps > Manage Apps: Create app + + - Name: "parsedmarc" + - Folder name: "parsedmarc" + +Create Dashboards +~~~~~~~~~~~~~~~~~ 1. Navigate to the app you want to add the dashboards to, or create a new app called DMARC 2. Click Dashboards @@ -22,3 +66,22 @@ Setup guide 15. 
Paste the content of ''dmarc_forensic_dashboard.xml`` into the source editor 16. If the index storing the DMARC data is not named email, replace index="email" accordingly 17. Click Save + +============== +Example Config +============== + +parsedmarc.ini:: + + [splunk_hec] + url = https://127.0.0.1:8088/ + token = hec-token-1234 + index = email + skip_certificate_verification = True + +Note that `skip_certificate_verification = True` disables security checks. + +Run parsedmarc:: + + python3 -m parsedmarc.cli -c parsedmarc.ini + From ca36db5f24626f239ac7f1b239f58f68b66a343b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matth=C3=A4us=20Wander?= Date: Sun, 6 Jun 2021 16:44:40 +0200 Subject: [PATCH 24/31] Minor formatting --- splunk/README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/splunk/README.rst b/splunk/README.rst index 42198f5..acf7bce 100644 --- a/splunk/README.rst +++ b/splunk/README.rst @@ -16,7 +16,7 @@ Listen on all network interfaces:: docker run -d -p 8000:8000 -p 8088:8088 -e "SPLUNK_START_ARGS=--accept-license" -e "SPLUNK_PASSWORD=password1234" -e "SPLUNK_HEC_TOKEN=hec-token-1234" --name splunk splunk/splunk:latest -Listen on localhost for use with reverse proxy with base URL `/splunk`:: +Listen on localhost for use with reverse proxy with base URL ``/splunk``:: docker run -d -p 127.0.0.1:8000:8000 -p 127.0.0.1:8088:8088 -e "SPLUNK_START_ARGS=--accept-license" -e "SPLUNK_PASSWORD=password1234" -e "SPLUNK_HEC_TOKEN=hec-token-1234" -e "SPLUNK_ROOT_ENDPOINT=/splunk" --name splunk splunk/splunk:latest @@ -28,7 +28,7 @@ Set up reverse proxy, e.g. Apache2:: Splunk Configuration -------------------- -Access web UI at http://127.0.0.1:8000 and log in with `admin:password1234`. +Access web UI at http://127.0.0.1:8000 and log in with ``admin:password1234``. Create App and Index ~~~~~~~~~~~~~~~~~~~~ @@ -37,7 +37,7 @@ Create App and Index - Index name: "email" -- HEC token `hec-token-1234` should be already set up. 
+- HEC token ``hec-token-1234`` should be already set up. - Check under Settings > Data > Data inputs: HTTP Event Collector @@ -79,7 +79,7 @@ parsedmarc.ini:: index = email skip_certificate_verification = True -Note that `skip_certificate_verification = True` disables security checks. +Note that ``skip_certificate_verification = True`` disables security checks. Run parsedmarc:: From 89816bbc6efa19176ed85acf3066cc7a5290260f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 20 Jun 2021 03:58:46 +1000 Subject: [PATCH 25/31] fix what was broken in merge train --- parsedmarc/cli.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 26c65b9..7bbecf6 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -736,9 +736,8 @@ def _main(): nameservers=opts.nameservers, dns_timeout=opts.dns_timeout, strip_attachment_payloads=sa, - batch_size=opts.imap_batch_size - offline=opts.offline, - strip_attachment_payloads=sa) + batch_size=opts.imap_batch_size, + offline=opts.offline) except FileExistsError as error: logger.error("{0}".format(error.__str__())) exit(1) From 3d0f7c8c83ee4098e1bcaa95d82f995d7c9c9bd6 Mon Sep 17 00:00:00 2001 From: Sean Whalen <44679+seanthegeek@users.noreply.github.com> Date: Sun, 20 Jun 2021 13:10:12 -0400 Subject: [PATCH 26/31] 7.0.0 Closes issues #221 #219 #155 #103 --- CHANGELOG.md | 13 +++++++ README.rst | 39 +++++++++++++------ docs/index.rst | 88 ++++++++++++++++++++++++------------------ parsedmarc/__init__.py | 24 +++++++----- parsedmarc/cli.py | 40 +++++++++++-------- parsedmarc/utils.py | 4 +- 6 files changed, 131 insertions(+), 77 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4e8506..cfd49c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,19 @@ Changelog ========= +7.0.0 +----- + +- Fix issue #221: Crash when handling invalid reports without root node (PR #248) +- Use UTC datetime objects for Elasticsearch output (PR #245) +- Fix issues #219, #155, and #103: 
IMAP connections break on large emails (PR #241) +- Add support for saving reports to S3 buckets (PR #223) +- Pass `offline` parameter to `wait_inbox()` (PR #216) +- Add more details to logging (PR #220) +- Add options customizing the names of output files (Modifications based on PR #225) +- Wait for 5 seconds before attempting to reconnect to an IMAP server (PR #217) +- Add option to process messages in batches (PR #222) + 6.12.0 ------ diff --git a/README.rst b/README.rst index d0d41ab..517c50b 100644 --- a/README.rst +++ b/README.rst @@ -58,17 +58,20 @@ CLI help :: - usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] - [-o OUTPUT] [-n NAMESERVERS [NAMESERVERS ...]] - [-t DNS_TIMEOUT] [--offline] [-s] [--debug] - [--log-file LOG_FILE] [-v] - [file_path [file_path ...]] + usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] [-o OUTPUT] + [--aggregate-json-filename AGGREGATE_JSON_FILENAME] + [--forensic-json-filename FORENSIC_JSON_FILENAME] + [--aggregate-csv-filename AGGREGATE_CSV_FILENAME] + [--forensic-csv-filename FORENSIC_CSV_FILENAME] + [-n NAMESERVERS [NAMESERVERS ...]] [-t DNS_TIMEOUT] [--offline] + [-s] [--verbose] [--debug] [--log-file LOG_FILE] [-v] + [file_path ...] 
Parses DMARC reports positional arguments: file_path one or more paths to aggregate or forensic report - files or emails + files, emails, or mbox files' optional arguments: -h, --help show this help message and exit @@ -78,18 +81,27 @@ CLI help remove attachment payloads from forensic report output -o OUTPUT, --output OUTPUT write output files to the given directory + --aggregate-json-filename AGGREGATE_JSON_FILENAME + filename for the aggregate JSON output file + --forensic-json-filename FORENSIC_JSON_FILENAME + filename for the forensic JSON output file + --aggregate-csv-filename AGGREGATE_CSV_FILENAME + filename for the aggregate CSV output file + --forensic-csv-filename FORENSIC_CSV_FILENAME + filename for the forensic CSV output file -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...] - nameservers to query (default is Cloudflare's - nameservers) + nameservers to query -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT number of seconds to wait for an answer from DNS (default: 2.0) --offline do not make online queries for geolocation or DNS -s, --silent only print errors and warnings + --verbose more verbose output --debug print debugging information --log-file LOG_FILE output logging to a file -v, --version show program's version number and exit + .. note:: In ``parsedmarc`` 6.0.0, most CLI options were moved to a configuration file, described below. 
@@ -139,6 +151,8 @@ The full set of configuration options are: - ``save_forensic`` - bool: Save forensic report data to Elasticsearch, Splunk and/or S3 - ``strip_attachment_payloads`` - bool: Remove attachment payloads from results - ``output`` - str: Directory to place JSON and CSV files in + - ``aggregate_json_filename`` - str: filename for the aggregate JSON output file + - ``forensic_json_filename`` - str: filename for the forensic JSON output file - ``offline`` - bool: Do not use online queries for geolocation or DNS - ``nameservers`` - str: A comma separated list of DNS resolvers (Default: `Cloudflare's public resolvers`_) - ``dns_timeout`` - float: DNS timeout period @@ -146,10 +160,14 @@ The full set of configuration options are: - ``silent`` - bool: Only print errors (Default: True) - ``log_file`` - str: Write log messages to a file at this path - ``n_procs`` - int: Number of process to run in parallel when parsing in CLI mode (Default: 1) - - ``chunk_size`` - int: Number of files to give to each process when running in parallel. Setting this to a number larger than one can improve performance when processing thousands of files + - ``chunk_size`` - int: Number of files to give to each process when running in parallel. + .. note:: + Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` - ``host`` - str: The IMAP server hostname or IP address - - ``port`` - int: The IMAP server port (Default: 993) If your Hoster publishes another port, still try 993. Otherwise Error:"wrong SSL version" + - ``port`` - int: The IMAP server port (Default: 993). + .. 
note:: + If your host recommends another port, still try 993 - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) - ``user`` - str: The IMAP user @@ -162,7 +180,6 @@ The full set of configuration options are: - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. ``127.0.0.1:9200`` or ``https://user:secret@localhost``) - .. note:: Special characters in the username or password must be `URL encoded`_. - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) diff --git a/docs/index.rst b/docs/index.rst index 568c11d..6e36e89 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -62,36 +62,48 @@ CLI help :: - usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] - [-o OUTPUT] [-n NAMESERVERS [NAMESERVERS ...]] - [-t DNS_TIMEOUT] [--offline] [-s] [--debug] - [--log-file LOG_FILE] [-v] - [file_path [file_path ...]] + usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] [-o OUTPUT] + [--aggregate-json-filename AGGREGATE_JSON_FILENAME] + [--forensic-json-filename FORENSIC_JSON_FILENAME] + [--aggregate-csv-filename AGGREGATE_CSV_FILENAME] + [--forensic-csv-filename FORENSIC_CSV_FILENAME] + [-n NAMESERVERS [NAMESERVERS ...]] [-t DNS_TIMEOUT] [--offline] + [-s] [--verbose] [--debug] [--log-file LOG_FILE] [-v] + [file_path ...] 
- Parses DMARC reports + Parses DMARC reports - positional arguments: - file_path one or more paths to aggregate or forensic report - files or emails + positional arguments: + file_path one or more paths to aggregate or forensic report + files, emails, or mbox files' - optional arguments: - -h, --help show this help message and exit - -c CONFIG_FILE, --config-file CONFIG_FILE - a path to a configuration file (--silent implied) - --strip-attachment-payloads - remove attachment payloads from forensic report output - -o OUTPUT, --output OUTPUT - write output files to the given directory - -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...] - nameservers to query - -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT - number of seconds to wait for an answer from DNS - (default: 2.0) - --offline do not make online queries for geolocation or DNS - -s, --silent only print errors and warnings - --debug print debugging information - --log-file LOG_FILE output logging to a file - -v, --version show program's version number and exit + optional arguments: + -h, --help show this help message and exit + -c CONFIG_FILE, --config-file CONFIG_FILE + a path to a configuration file (--silent implied) + --strip-attachment-payloads + remove attachment payloads from forensic report output + -o OUTPUT, --output OUTPUT + write output files to the given directory + --aggregate-json-filename AGGREGATE_JSON_FILENAME + filename for the aggregate JSON output file + --forensic-json-filename FORENSIC_JSON_FILENAME + filename for the forensic JSON output file + --aggregate-csv-filename AGGREGATE_CSV_FILENAME + filename for the aggregate CSV output file + --forensic-csv-filename FORENSIC_CSV_FILENAME + filename for the forensic CSV output file + -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...] 
+ nameservers to query + -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT + number of seconds to wait for an answer from DNS + (default: 2.0) + --offline do not make online queries for geolocation or DNS + -s, --silent only print errors and warnings + --verbose more verbose output + --debug print debugging information + --log-file LOG_FILE output logging to a file + -v, --version show program's version number and exit .. note:: @@ -139,10 +151,12 @@ For example The full set of configuration options are: - ``general`` - - ``save_aggregate`` - bool: Save aggregate report data to the Elasticsearch, Splunk and/or S3 - - ``save_forensic`` - bool: Save forensic report data to the Elasticsearch, Splunk and/or S3 + - ``save_aggregate`` - bool: Save aggregate report data to Elasticsearch, Splunk and/or S3 + - ``save_forensic`` - bool: Save forensic report data to Elasticsearch, Splunk and/or S3 - ``strip_attachment_payloads`` - bool: Remove attachment payloads from results - ``output`` - str: Directory to place JSON and CSV files in + - ``aggregate_json_filename`` - str: filename for the aggregate JSON output file + - ``forensic_json_filename`` - str: filename for the forensic JSON output file - ``offline`` - bool: Do not use online queries for geolocation or DNS - ``nameservers`` - str: A comma separated list of DNS resolvers (Default: `Cloudflare's public resolvers`_) - ``dns_timeout`` - float: DNS timeout period @@ -150,16 +164,18 @@ The full set of configuration options are: - ``silent`` - bool: Only print errors (Default: True) - ``log_file`` - str: Write log messages to a file at this path - ``n_procs`` - int: Number of process to run in parallel when parsing in CLI mode (Default: 1) - - ``chunk_size`` - int: Number of files to give to each process when running in parallel. Setting this to a number larger than one can improve performance when processing thousands of files + - ``chunk_size`` - int: Number of files to give to each process when running in parallel. + .. 
note:: + Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` - ``host`` - str: The IMAP server hostname or IP address - - ``port`` - int: The IMAP server port (Default: 993) + - ``port`` - int: The IMAP server port (Default: 993). + .. note:: + If your host recommends another port, still try 993 - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) - - ``timeout`` - float: Timeout in seconds to wait for an IMAP operation to complete (Default: 30) - - ``max_retries`` - int: The maximum number of retries after a timeout - ``user`` - str: The IMAP user - - ``password`` - str: The IMAP password (escape ``%`` with a second ``%``) + - ``password`` - str: The IMAP password - ``reports_folder`` - str: The IMAP folder where the incoming reports can be found (Default: INBOX) - ``archive_folder`` - str: The IMAP folder to sort processed emails into (Default: Archive) - ``watch`` - bool: Use the IMAP ``IDLE`` command to process messages as they arrive @@ -168,14 +184,10 @@ The full set of configuration options are: - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. ``127.0.0.1:9200`` or ``https://user:secret@localhost``) - .. note:: Special characters in the username or password must be `URL encoded`_. 
- ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - - ``user`` - str: Basic auth username - - ``password`` - str: Basic auth password - ``cert_path`` - str: Path to a trusted certificates - - ``timeout`` - float: Timeout in seconds (Default: 60) - ``index_suffix`` - str: A suffix to apply to the index names - ``monthly_indexes`` - bool: Use monthly indexes instead of daily indexes - ``number_of_shards`` - int: The number of shards to use when creating the index (Default: 1) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index a07209e..f601b7a 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -36,7 +36,7 @@ from parsedmarc.utils import is_outlook_msg, convert_outlook_msg from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime from parsedmarc.utils import parse_email -__version__ = "6.12.0" +__version__ = "7.0.0" logging.basicConfig( format='%(levelname)8s:%(filename)s:%(lineno)d:' @@ -1274,16 +1274,20 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, def save_output(results, output_directory="output", - output_json_aggregate="aggregate.json", - output_json_forensic="forensic.json", - output_csv_aggregate="aggregate.csv", - output_csv_forensic="forensic.csv"): + aggregate_json_filename="aggregate.json", + forensic_json_filename="forensic.json", + aggregate_csv_filename="aggregate.csv", + forensic_csv_filename="forensic.csv"): """ Save report data in the given directory Args: results (OrderedDict): Parsing results - output_directory: The patch to the directory to save in + output_directory (str): The patch to the directory to save in + aggregate_json_filename (str): Output filename for the aggregate JSON report + forensic_json_filename (str): Output filename for the forensic JSON report + aggregate_csv_filename (str): Output filename for the aggregate CSV report + forensic_csv_filename (str): Output filename for the forensic CSV report """ aggregate_reports = 
results["aggregate_reports"] @@ -1297,28 +1301,28 @@ def save_output(results, output_directory="output", with open("{0}" .format(os.path.join(output_directory, - output_json_aggregate)), + aggregate_json_filename)), "w", newline="\n", encoding="utf-8") as agg_json: agg_json.write(json.dumps(aggregate_reports, ensure_ascii=False, indent=2)) with open("{0}" .format(os.path.join(output_directory, - output_csv_aggregate)), + aggregate_csv_filename)), "w", newline="\n", encoding="utf-8") as agg_csv: csv = parsed_aggregate_reports_to_csv(aggregate_reports) agg_csv.write(csv) with open("{0}" .format(os.path.join(output_directory, - output_json_forensic)), + forensic_json_filename)), "w", newline="\n", encoding="utf-8") as for_json: for_json.write(json.dumps(forensic_reports, ensure_ascii=False, indent=2)) with open("{0}" .format(os.path.join(output_directory, - output_csv_forensic)), + forensic_csv_filename)), "w", newline="\n", encoding="utf-8") as for_csv: csv = parsed_forensic_reports_to_csv(forensic_reports) for_csv.write(csv) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 7bbecf6..ae739fe 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -178,17 +178,17 @@ def _main(): help=strip_attachment_help, action="store_true") arg_parser.add_argument("-o", "--output", help="write output files to the given directory") - arg_parser.add_argument("--output-json-aggregate", - help="output aggregate JSON file", + arg_parser.add_argument("--aggregate-json-filename", + help="filename for the aggregate JSON output file", default="aggregate.json") - arg_parser.add_argument("--output-json-forensic", - help="output forensic JSON file", + arg_parser.add_argument("--forensic-json-filename", + help="filename for the forensic JSON output file", default="forensic.json") - arg_parser.add_argument("--output-csv-aggregate", - help="output aggregate CSV file", + arg_parser.add_argument("--aggregate-csv-filename", + help="filename for the aggregate CSV output file", 
default="aggregate.csv") - arg_parser.add_argument("--output-csv-forensic", - help="output forensic CSV file", + arg_parser.add_argument("--forensic-csv-filename", + help="filename for the forensic CSV output file", default="forensic.csv") arg_parser.add_argument("-n", "--nameservers", nargs="+", help="nameservers to query") @@ -221,10 +221,10 @@ def _main(): offline=args.offline, strip_attachment_payloads=args.strip_attachment_payloads, output=args.output, - output_json_aggregate=args.output_json_aggregate, - output_json_forensic=args.output_json_forensic, - output_csv_aggregate=args.output_csv_aggregate, - output_csv_forensic=args.output_csv_forensic, + aggregate_csv_filename=args.aggregate_csv_filename, + aggreate_json_filename=args.aggregate_json_filename, + forensic_csv_filename=args.forensic_csv_filename, + forensic_json_filename=args.forensic_json_filename, nameservers=args.nameservers, silent=args.silent, dns_timeout=args.dns_timeout, @@ -302,6 +302,14 @@ def _main(): "strip_attachment_payloads"] if "output" in general_config: opts.output = general_config["output"] + if "aggregate_json_filename" in general_config: + opts.aggregate_json_filename = general_config["aggregate_json_filename"] + if "forensic_json_filename" in general_config: + opts.forensic_json_filename = general_config["forensic_json_filename"] + if "aggregate_csv_filename" in general_config: + opts.aggregate_csv_filename = general_config["aggregate_csv_filename"] + if "forensic_csv_filename" in general_config: + opts.forensic_csv_filename = general_config["forensic_csv_filename"] if "nameservers" in general_config: opts.nameservers = _str_to_list(general_config["nameservers"]) if "dns_timeout" in general_config: @@ -689,10 +697,10 @@ def _main(): if opts.output: save_output(results, output_directory=opts.output, - output_json_aggregate=opts.output_json_aggregate, - output_json_forensic=opts.output_json_forensic, - output_csv_aggregate=opts.output_csv_aggregate, - 
output_csv_forensic=opts.output_csv_forensic) + aggregate_json_filename=opts.aggregate_json_filename, + forensic_json_filename=opts.forensic_json_filename, + aggregate_csv_filename=opts.aggregate_csv_filename, + forensic_csv_filename=opts.forensic_csv_filename) process_reports(results) diff --git a/parsedmarc/utils.py b/parsedmarc/utils.py index dbf525f..6b5f980 100644 --- a/parsedmarc/utils.py +++ b/parsedmarc/utils.py @@ -157,7 +157,7 @@ def query_dns(domain, record_type, cache=None, nameservers=None, timeout=2.0): if record_type == "TXT": resource_records = list(map( lambda r: r.strings, - resolver.query(domain, record_type, lifetime=timeout))) + resolver.resolve(domain, record_type, lifetime=timeout))) _resource_record = [ resource_record[0][:0].join(resource_record) for resource_record in resource_records if resource_record] @@ -165,7 +165,7 @@ def query_dns(domain, record_type, cache=None, nameservers=None, timeout=2.0): else: records = list(map( lambda r: r.to_text().replace('"', '').rstrip("."), - resolver.query(domain, record_type, lifetime=timeout))) + resolver.resolve(domain, record_type, lifetime=timeout))) if cache: cache[cache_key] = records From e841a49ca71c6e977b2c8755b69b1992c86a9109 Mon Sep 17 00:00:00 2001 From: Sean Whalen <44679+seanthegeek@users.noreply.github.com> Date: Sun, 20 Jun 2021 14:24:49 -0400 Subject: [PATCH 27/31] Fix documentation formatting --- README.rst | 9 ++++++--- docs/index.rst | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 517c50b..6e3f18c 100644 --- a/README.rst +++ b/README.rst @@ -161,12 +161,14 @@ The full set of configuration options are: - ``log_file`` - str: Write log messages to a file at this path - ``n_procs`` - int: Number of process to run in parallel when parsing in CLI mode (Default: 1) - ``chunk_size`` - int: Number of files to give to each process when running in parallel. - .. note:: + + .. 
note:: Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` - ``host`` - str: The IMAP server hostname or IP address - ``port`` - int: The IMAP server port (Default: 993). - .. note:: + + .. note:: If your host recommends another port, still try 993 - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) @@ -180,7 +182,8 @@ The full set of configuration options are: - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. ``127.0.0.1:9200`` or ``https://user:secret@localhost``) - .. note:: + + .. note:: Special characters in the username or password must be `URL encoded`_. - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``cert_path`` - str: Path to a trusted certificates diff --git a/docs/index.rst b/docs/index.rst index 6e36e89..e84c3b5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -165,12 +165,14 @@ The full set of configuration options are: - ``log_file`` - str: Write log messages to a file at this path - ``n_procs`` - int: Number of process to run in parallel when parsing in CLI mode (Default: 1) - ``chunk_size`` - int: Number of files to give to each process when running in parallel. - .. note:: + + .. note:: Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` - ``host`` - str: The IMAP server hostname or IP address - ``port`` - int: The IMAP server port (Default: 993). - .. note:: + + .. 
note:: If your host recommends another port, still try 993 - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) @@ -184,7 +186,8 @@ The full set of configuration options are: - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. ``127.0.0.1:9200`` or ``https://user:secret@localhost``) - .. note:: + + .. note:: Special characters in the username or password must be `URL encoded`_. - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``cert_path`` - str: Path to a trusted certificates From 6d5f8a9ec3b0ce40087106649f8560537e874836 Mon Sep 17 00:00:00 2001 From: Sean Whalen <44679+seanthegeek@users.noreply.github.com> Date: Sun, 20 Jun 2021 15:45:15 -0400 Subject: [PATCH 28/31] Fix documentation formatting --- README.rst | 3 +++ docs/index.rst | 3 +++ 2 files changed, 6 insertions(+) diff --git a/README.rst b/README.rst index 6e3f18c..3d5684e 100644 --- a/README.rst +++ b/README.rst @@ -165,11 +165,13 @@ The full set of configuration options are: .. note:: Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` + - ``host`` - str: The IMAP server hostname or IP address - ``port`` - int: The IMAP server port (Default: 993). .. note:: If your host recommends another port, still try 993 + - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) - ``user`` - str: The IMAP user @@ -185,6 +187,7 @@ The full set of configuration options are: .. note:: Special characters in the username or password must be `URL encoded`_. 
+ - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``cert_path`` - str: Path to a trusted certificates - ``index_suffix`` - str: A suffix to apply to the index names diff --git a/docs/index.rst b/docs/index.rst index e84c3b5..fa9cd62 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -169,11 +169,13 @@ The full set of configuration options are: .. note:: Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` + - ``host`` - str: The IMAP server hostname or IP address - ``port`` - int: The IMAP server port (Default: 993). .. note:: If your host recommends another port, still try 993 + - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) - ``user`` - str: The IMAP user @@ -189,6 +191,7 @@ The full set of configuration options are: .. note:: Special characters in the username or password must be `URL encoded`_. 
+ - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``cert_path`` - str: Path to a trusted certificates - ``index_suffix`` - str: A suffix to apply to the index names From 4c5a266f19abe261c45eb5eed1031d3449ecf12d Mon Sep 17 00:00:00 2001 From: Sean Whalen <44679+seanthegeek@users.noreply.github.com> Date: Sun, 20 Jun 2021 19:07:18 -0400 Subject: [PATCH 29/31] PEP 8 fixes --- parsedmarc/__init__.py | 8 ++++---- parsedmarc/cli.py | 12 ++++++++---- parsedmarc/elastic.py | 6 ++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index f601b7a..d798225 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -1284,10 +1284,10 @@ def save_output(results, output_directory="output", Args: results (OrderedDict): Parsing results output_directory (str): The patch to the directory to save in - aggregate_json_filename (str): Output filename for the aggregate JSON report - forensic_json_filename (str): Output filename for the forensic JSON report - aggregate_csv_filename (str): Output filename for the aggregate CSV report - forensic_csv_filename (str): Output filename for the forensic CSV report + aggregate_json_filename (str): Filename for the aggregate JSON file + forensic_json_filename (str): Filename for the forensic JSON file + aggregate_csv_filename (str): Filename for the aggregate CSV file + forensic_csv_filename (str): Filename for the forensic CSV file """ aggregate_reports = results["aggregate_reports"] diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index ae739fe..374e6af 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -303,13 +303,17 @@ def _main(): if "output" in general_config: opts.output = general_config["output"] if "aggregate_json_filename" in general_config: - opts.aggregate_json_filename = general_config["aggregate_json_filename"] + opts.aggregate_json_filename = general_config[ + "aggregate_json_filename"] if "forensic_json_filename" in 
general_config: - opts.forensic_json_filename = general_config["forensic_json_filename"] + opts.forensic_json_filename = general_config[ + "forensic_json_filename"] if "aggregate_csv_filename" in general_config: - opts.aggregate_csv_filename = general_config["aggregate_csv_filename"] + opts.aggregate_csv_filename = general_config[ + "aggregate_csv_filename"] if "forensic_csv_filename" in general_config: - opts.forensic_csv_filename = general_config["forensic_csv_filename"] + opts.forensic_csv_filename = general_config[ + "forensic_csv_filename"] if "nameservers" in general_config: opts.nameservers = _str_to_list(general_config["nameservers"]) if "dns_timeout" in general_config: diff --git a/parsedmarc/elastic.py b/parsedmarc/elastic.py index 992d02c..7085983 100644 --- a/parsedmarc/elastic.py +++ b/parsedmarc/elastic.py @@ -301,8 +301,10 @@ def save_aggregate_report_to_elasticsearch(aggregate_report, org_name = metadata["org_name"] report_id = metadata["report_id"] domain = aggregate_report["policy_published"]["domain"] - begin_date = human_timestamp_to_datetime(metadata["begin_date"], to_utc=True) - end_date = human_timestamp_to_datetime(metadata["end_date"], to_utc=True) + begin_date = human_timestamp_to_datetime(metadata["begin_date"], + to_utc=True) + end_date = human_timestamp_to_datetime(metadata["end_date"], + to_utc=True) begin_date_human = begin_date.strftime("%Y-%m-%d %H:%M:%SZ") end_date_human = end_date.strftime("%Y-%m-%d %H:%M:%SZ") if monthly_indexes: From a10e6592fe4e433a9770f37257569bc7855f8f03 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Wed, 23 Jun 2021 12:06:29 +0100 Subject: [PATCH 30/31] fix startup bug Pretty silly typo means `parsedmarc` completely fails unless `parsedmarc.ini` has `general.aggregate_json_filename` explicitly set --- parsedmarc/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 374e6af..be2026a 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py 
@@ -222,7 +222,7 @@ def _main(): strip_attachment_payloads=args.strip_attachment_payloads, output=args.output, aggregate_csv_filename=args.aggregate_csv_filename, - aggreate_json_filename=args.aggregate_json_filename, + aggregate_json_filename=args.aggregate_json_filename, forensic_csv_filename=args.forensic_csv_filename, forensic_json_filename=args.forensic_json_filename, nameservers=args.nameservers, From 6d689ca8f552d9639fca2773ee540ae95e244203 Mon Sep 17 00:00:00 2001 From: Sean Whalen <44679+seanthegeek@users.noreply.github.com> Date: Wed, 23 Jun 2021 15:03:12 -0400 Subject: [PATCH 31/31] 7.0.1 --- CHANGELOG.md | 5 +++++ parsedmarc/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cfd49c2..8cd66b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Changelog ========= +7.0.1 +----- + +- Fix startup error (PR #254) + 7.0.0 ----- diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index d798225..2750819 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -36,7 +36,7 @@ from parsedmarc.utils import is_outlook_msg, convert_outlook_msg from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime from parsedmarc.utils import parse_email -__version__ = "7.0.0" +__version__ = "7.0.1" logging.basicConfig( format='%(levelname)8s:%(filename)s:%(lineno)d:'