From 3d0f7c8c83ee4098e1bcaa95d82f995d7c9c9bd6 Mon Sep 17 00:00:00 2001 From: Sean Whalen <44679+seanthegeek@users.noreply.github.com> Date: Sun, 20 Jun 2021 13:10:12 -0400 Subject: [PATCH] 7.0.0 Closes issues #221 #219 #155 #103 --- CHANGELOG.md | 13 +++++++ README.rst | 39 +++++++++++++------ docs/index.rst | 88 ++++++++++++++++++++++++------------------ parsedmarc/__init__.py | 24 +++++++----- parsedmarc/cli.py | 40 +++++++++++-------- parsedmarc/utils.py | 4 +- 6 files changed, 131 insertions(+), 77 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4e8506..cfd49c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,19 @@ Changelog ========= +7.0.0 +----- + +- Fix issue #221: Crash when handling invalid reports without root node (PR #248) +- Use UTC datetime objects for Elasticsearch output (PR #245) +- Fix issues #219, #155, and #103: IMAP connections break on large emails (PR #241) +- Add support for saving reports to S3 buckets (PR #223) +- Pass `offline` parameter to `wait_inbox()` (PR #216) +- Add more details to logging (PR #220) +- Add options customizing the names of output files (Modifications based on PR #225) +- Wait for 5 seconds before attempting to reconnect to an IMAP server (PR #217) +- Add option to process messages in batches (PR #222) + 6.12.0 ------ diff --git a/README.rst b/README.rst index d0d41ab..517c50b 100644 --- a/README.rst +++ b/README.rst @@ -58,17 +58,20 @@ CLI help :: - usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] - [-o OUTPUT] [-n NAMESERVERS [NAMESERVERS ...]] - [-t DNS_TIMEOUT] [--offline] [-s] [--debug] - [--log-file LOG_FILE] [-v] - [file_path [file_path ...]] + usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] [-o OUTPUT] + [--aggregate-json-filename AGGREGATE_JSON_FILENAME] + [--forensic-json-filename FORENSIC_JSON_FILENAME] + [--aggregate-csv-filename AGGREGATE_CSV_FILENAME] + [--forensic-csv-filename FORENSIC_CSV_FILENAME] + [-n NAMESERVERS [NAMESERVERS ...]] [-t 
DNS_TIMEOUT] [--offline] + [-s] [--verbose] [--debug] [--log-file LOG_FILE] [-v] + [file_path ...] Parses DMARC reports positional arguments: file_path one or more paths to aggregate or forensic report - files or emails + files, emails, or mbox files optional arguments: -h, --help show this help message and exit @@ -78,18 +81,27 @@ CLI help remove attachment payloads from forensic report output -o OUTPUT, --output OUTPUT write output files to the given directory + --aggregate-json-filename AGGREGATE_JSON_FILENAME + filename for the aggregate JSON output file + --forensic-json-filename FORENSIC_JSON_FILENAME + filename for the forensic JSON output file + --aggregate-csv-filename AGGREGATE_CSV_FILENAME + filename for the aggregate CSV output file + --forensic-csv-filename FORENSIC_CSV_FILENAME + filename for the forensic CSV output file -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...] - nameservers to query (default is Cloudflare's - nameservers) + nameservers to query -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT number of seconds to wait for an answer from DNS (default: 2.0) --offline do not make online queries for geolocation or DNS -s, --silent only print errors and warnings + --verbose more verbose output --debug print debugging information --log-file LOG_FILE output logging to a file -v, --version show program's version number and exit + .. note:: In ``parsedmarc`` 6.0.0, most CLI options were moved to a configuration file, described below. 
@@ -139,6 +151,8 @@ The full set of configuration options are: - ``save_forensic`` - bool: Save forensic report data to Elasticsearch, Splunk and/or S3 - ``strip_attachment_payloads`` - bool: Remove attachment payloads from results - ``output`` - str: Directory to place JSON and CSV files in + - ``aggregate_json_filename`` - str: filename for the aggregate JSON output file + - ``forensic_json_filename`` - str: filename for the forensic JSON output file - ``offline`` - bool: Do not use online queries for geolocation or DNS - ``nameservers`` - str: A comma separated list of DNS resolvers (Default: `Cloudflare's public resolvers`_) - ``dns_timeout`` - float: DNS timeout period @@ -146,10 +160,14 @@ The full set of configuration options are: - ``silent`` - bool: Only print errors (Default: True) - ``log_file`` - str: Write log messages to a file at this path - ``n_procs`` - int: Number of process to run in parallel when parsing in CLI mode (Default: 1) - - ``chunk_size`` - int: Number of files to give to each process when running in parallel. Setting this to a number larger than one can improve performance when processing thousands of files + - ``chunk_size`` - int: Number of files to give to each process when running in parallel. + .. note:: + Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` - ``host`` - str: The IMAP server hostname or IP address - - ``port`` - int: The IMAP server port (Default: 993) If your Hoster publishes another port, still try 993. Otherwise Error:"wrong SSL version" + - ``port`` - int: The IMAP server port (Default: 993). + .. 
note:: + If your host recommends another port, still try 993 - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) - ``user`` - str: The IMAP user @@ -162,7 +180,6 @@ The full set of configuration options are: - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. ``127.0.0.1:9200`` or ``https://user:secret@localhost``) - .. note:: Special characters in the username or password must be `URL encoded`_. - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) diff --git a/docs/index.rst b/docs/index.rst index 568c11d..6e36e89 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -62,36 +62,48 @@ CLI help :: - usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] - [-o OUTPUT] [-n NAMESERVERS [NAMESERVERS ...]] - [-t DNS_TIMEOUT] [--offline] [-s] [--debug] - [--log-file LOG_FILE] [-v] - [file_path [file_path ...]] + usage: parsedmarc [-h] [-c CONFIG_FILE] [--strip-attachment-payloads] [-o OUTPUT] + [--aggregate-json-filename AGGREGATE_JSON_FILENAME] + [--forensic-json-filename FORENSIC_JSON_FILENAME] + [--aggregate-csv-filename AGGREGATE_CSV_FILENAME] + [--forensic-csv-filename FORENSIC_CSV_FILENAME] + [-n NAMESERVERS [NAMESERVERS ...]] [-t DNS_TIMEOUT] [--offline] + [-s] [--verbose] [--debug] [--log-file LOG_FILE] [-v] + [file_path ...] 
- Parses DMARC reports + Parses DMARC reports - positional arguments: - file_path one or more paths to aggregate or forensic report - files or emails + positional arguments: + file_path one or more paths to aggregate or forensic report + files, emails, or mbox files - optional arguments: - -h, --help show this help message and exit - -c CONFIG_FILE, --config-file CONFIG_FILE - a path to a configuration file (--silent implied) - --strip-attachment-payloads - remove attachment payloads from forensic report output - -o OUTPUT, --output OUTPUT - write output files to the given directory - -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...] - nameservers to query - -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT - number of seconds to wait for an answer from DNS - (default: 2.0) - --offline do not make online queries for geolocation or DNS - -s, --silent only print errors and warnings - --debug print debugging information - --log-file LOG_FILE output logging to a file - -v, --version show program's version number and exit + optional arguments: + -h, --help show this help message and exit + -c CONFIG_FILE, --config-file CONFIG_FILE + a path to a configuration file (--silent implied) + --strip-attachment-payloads + remove attachment payloads from forensic report output + -o OUTPUT, --output OUTPUT + write output files to the given directory + --aggregate-json-filename AGGREGATE_JSON_FILENAME + filename for the aggregate JSON output file + --forensic-json-filename FORENSIC_JSON_FILENAME + filename for the forensic JSON output file + --aggregate-csv-filename AGGREGATE_CSV_FILENAME + filename for the aggregate CSV output file + --forensic-csv-filename FORENSIC_CSV_FILENAME + filename for the forensic CSV output file + -n NAMESERVERS [NAMESERVERS ...], --nameservers NAMESERVERS [NAMESERVERS ...] 
+ nameservers to query + -t DNS_TIMEOUT, --dns_timeout DNS_TIMEOUT + number of seconds to wait for an answer from DNS + (default: 2.0) + --offline do not make online queries for geolocation or DNS + -s, --silent only print errors and warnings + --verbose more verbose output + --debug print debugging information + --log-file LOG_FILE output logging to a file + -v, --version show program's version number and exit .. note:: @@ -139,10 +151,12 @@ For example The full set of configuration options are: - ``general`` - - ``save_aggregate`` - bool: Save aggregate report data to the Elasticsearch, Splunk and/or S3 - - ``save_forensic`` - bool: Save forensic report data to the Elasticsearch, Splunk and/or S3 + - ``save_aggregate`` - bool: Save aggregate report data to Elasticsearch, Splunk and/or S3 + - ``save_forensic`` - bool: Save forensic report data to Elasticsearch, Splunk and/or S3 - ``strip_attachment_payloads`` - bool: Remove attachment payloads from results - ``output`` - str: Directory to place JSON and CSV files in + - ``aggregate_json_filename`` - str: filename for the aggregate JSON output file + - ``forensic_json_filename`` - str: filename for the forensic JSON output file - ``offline`` - bool: Do not use online queries for geolocation or DNS - ``nameservers`` - str: A comma separated list of DNS resolvers (Default: `Cloudflare's public resolvers`_) - ``dns_timeout`` - float: DNS timeout period @@ -150,16 +164,18 @@ The full set of configuration options are: - ``silent`` - bool: Only print errors (Default: True) - ``log_file`` - str: Write log messages to a file at this path - ``n_procs`` - int: Number of process to run in parallel when parsing in CLI mode (Default: 1) - - ``chunk_size`` - int: Number of files to give to each process when running in parallel. Setting this to a number larger than one can improve performance when processing thousands of files + - ``chunk_size`` - int: Number of files to give to each process when running in parallel. + .. 
note:: + Setting this to a number larger than one can improve performance when processing thousands of files - ``imap`` - ``host`` - str: The IMAP server hostname or IP address - - ``port`` - int: The IMAP server port (Default: 993) + - ``port`` - int: The IMAP server port (Default: 993). + .. note:: + If your host recommends another port, still try 993 - ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - ``skip_certificate_verification`` - bool: Skip certificate verification (not recommended) - - ``timeout`` - float: Timeout in seconds to wait for an IMAP operation to complete (Default: 30) - - ``max_retries`` - int: The maximum number of retries after a timeout - ``user`` - str: The IMAP user - - ``password`` - str: The IMAP password (escape ``%`` with a second ``%``) + - ``password`` - str: The IMAP password - ``reports_folder`` - str: The IMAP folder where the incoming reports can be found (Default: INBOX) - ``archive_folder`` - str: The IMAP folder to sort processed emails into (Default: Archive) - ``watch`` - bool: Use the IMAP ``IDLE`` command to process messages as they arrive @@ -168,14 +184,10 @@ The full set of configuration options are: - ``batch_size`` - int: Number of messages to read and process before saving. Defaults to all messages if not set. - ``elasticsearch`` - ``hosts`` - str: A comma separated list of hostnames and ports or URLs (e.g. ``127.0.0.1:9200`` or ``https://user:secret@localhost``) - .. note:: Special characters in the username or password must be `URL encoded`_. 
- ``ssl`` - bool: Use an encrypted SSL/TLS connection (Default: True) - - ``user`` - str: Basic auth username - - ``password`` - str: Basic auth password - ``cert_path`` - str: Path to a trusted certificates - - ``timeout`` - float: Timeout in seconds (Default: 60) - ``index_suffix`` - str: A suffix to apply to the index names - ``monthly_indexes`` - bool: Use monthly indexes instead of daily indexes - ``number_of_shards`` - int: The number of shards to use when creating the index (Default: 1) diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index a07209e..f601b7a 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -36,7 +36,7 @@ from parsedmarc.utils import is_outlook_msg, convert_outlook_msg from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime from parsedmarc.utils import parse_email -__version__ = "6.12.0" +__version__ = "7.0.0" logging.basicConfig( format='%(levelname)8s:%(filename)s:%(lineno)d:' @@ -1274,16 +1274,20 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, def save_output(results, output_directory="output", - output_json_aggregate="aggregate.json", - output_json_forensic="forensic.json", - output_csv_aggregate="aggregate.csv", - output_csv_forensic="forensic.csv"): + aggregate_json_filename="aggregate.json", + forensic_json_filename="forensic.json", + aggregate_csv_filename="aggregate.csv", + forensic_csv_filename="forensic.csv"): """ Save report data in the given directory Args: results (OrderedDict): Parsing results - output_directory: The patch to the directory to save in + output_directory (str): The path to the directory to save in + aggregate_json_filename (str): Output filename for the aggregate JSON report + forensic_json_filename (str): Output filename for the forensic JSON report + aggregate_csv_filename (str): Output filename for the aggregate CSV report + forensic_csv_filename (str): Output filename for the forensic CSV report """ aggregate_reports = 
results["aggregate_reports"] @@ -1297,28 +1301,28 @@ def save_output(results, output_directory="output", with open("{0}" .format(os.path.join(output_directory, - output_json_aggregate)), + aggregate_json_filename)), "w", newline="\n", encoding="utf-8") as agg_json: agg_json.write(json.dumps(aggregate_reports, ensure_ascii=False, indent=2)) with open("{0}" .format(os.path.join(output_directory, - output_csv_aggregate)), + aggregate_csv_filename)), "w", newline="\n", encoding="utf-8") as agg_csv: csv = parsed_aggregate_reports_to_csv(aggregate_reports) agg_csv.write(csv) with open("{0}" .format(os.path.join(output_directory, - output_json_forensic)), + forensic_json_filename)), "w", newline="\n", encoding="utf-8") as for_json: for_json.write(json.dumps(forensic_reports, ensure_ascii=False, indent=2)) with open("{0}" .format(os.path.join(output_directory, - output_csv_forensic)), + forensic_csv_filename)), "w", newline="\n", encoding="utf-8") as for_csv: csv = parsed_forensic_reports_to_csv(forensic_reports) for_csv.write(csv) diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 7bbecf6..ae739fe 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -178,17 +178,17 @@ def _main(): help=strip_attachment_help, action="store_true") arg_parser.add_argument("-o", "--output", help="write output files to the given directory") - arg_parser.add_argument("--output-json-aggregate", - help="output aggregate JSON file", + arg_parser.add_argument("--aggregate-json-filename", + help="filename for the aggregate JSON output file", default="aggregate.json") - arg_parser.add_argument("--output-json-forensic", - help="output forensic JSON file", + arg_parser.add_argument("--forensic-json-filename", + help="filename for the forensic JSON output file", default="forensic.json") - arg_parser.add_argument("--output-csv-aggregate", - help="output aggregate CSV file", + arg_parser.add_argument("--aggregate-csv-filename", + help="filename for the aggregate CSV output file", 
default="aggregate.csv") - arg_parser.add_argument("--output-csv-forensic", - help="output forensic CSV file", + arg_parser.add_argument("--forensic-csv-filename", + help="filename for the forensic CSV output file", default="forensic.csv") arg_parser.add_argument("-n", "--nameservers", nargs="+", help="nameservers to query") @@ -221,10 +221,10 @@ def _main(): offline=args.offline, strip_attachment_payloads=args.strip_attachment_payloads, output=args.output, - output_json_aggregate=args.output_json_aggregate, - output_json_forensic=args.output_json_forensic, - output_csv_aggregate=args.output_csv_aggregate, - output_csv_forensic=args.output_csv_forensic, + aggregate_csv_filename=args.aggregate_csv_filename, + aggregate_json_filename=args.aggregate_json_filename, + forensic_csv_filename=args.forensic_csv_filename, + forensic_json_filename=args.forensic_json_filename, nameservers=args.nameservers, silent=args.silent, dns_timeout=args.dns_timeout, @@ -302,6 +302,14 @@ def _main(): "strip_attachment_payloads"] if "output" in general_config: opts.output = general_config["output"] + if "aggregate_json_filename" in general_config: + opts.aggregate_json_filename = general_config["aggregate_json_filename"] + if "forensic_json_filename" in general_config: + opts.forensic_json_filename = general_config["forensic_json_filename"] + if "aggregate_csv_filename" in general_config: + opts.aggregate_csv_filename = general_config["aggregate_csv_filename"] + if "forensic_csv_filename" in general_config: + opts.forensic_csv_filename = general_config["forensic_csv_filename"] if "nameservers" in general_config: opts.nameservers = _str_to_list(general_config["nameservers"]) if "dns_timeout" in general_config: @@ -689,10 +697,10 @@ def _main(): if opts.output: save_output(results, output_directory=opts.output, - output_json_aggregate=opts.output_json_aggregate, - output_json_forensic=opts.output_json_forensic, - output_csv_aggregate=opts.output_csv_aggregate, - 
output_csv_forensic=opts.output_csv_forensic) + aggregate_json_filename=opts.aggregate_json_filename, + forensic_json_filename=opts.forensic_json_filename, + aggregate_csv_filename=opts.aggregate_csv_filename, + forensic_csv_filename=opts.forensic_csv_filename) process_reports(results) diff --git a/parsedmarc/utils.py b/parsedmarc/utils.py index dbf525f..6b5f980 100644 --- a/parsedmarc/utils.py +++ b/parsedmarc/utils.py @@ -157,7 +157,7 @@ def query_dns(domain, record_type, cache=None, nameservers=None, timeout=2.0): if record_type == "TXT": resource_records = list(map( lambda r: r.strings, - resolver.query(domain, record_type, lifetime=timeout))) + resolver.resolve(domain, record_type, lifetime=timeout))) _resource_record = [ resource_record[0][:0].join(resource_record) for resource_record in resource_records if resource_record] @@ -165,7 +165,7 @@ def query_dns(domain, record_type, cache=None, nameservers=None, timeout=2.0): else: records = list(map( lambda r: r.to_text().replace('"', '').rstrip("."), - resolver.query(domain, record_type, lifetime=timeout))) + resolver.resolve(domain, record_type, lifetime=timeout))) if cache: cache[cache_key] = records