diff --git a/.gitignore b/.gitignore index b5eaed7..89f65eb 100644 --- a/.gitignore +++ b/.gitignore @@ -106,7 +106,6 @@ ENV/ .idea/ # I/O files - output/ *.zip *.gz @@ -121,3 +120,6 @@ output/ # Data files *.dat *.mmdb + +# Temp files +tmp/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 46056aa..1a716a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,23 @@ +4.2.0 +------ + +- Save each aggregate report record as a separate Splunk event +- Fix IMAP delete action +- Suppress Splunk SSL validation warnings +- Change default logging level to `WARNING` + + +4.1.9 +----- + +- Workaround for forensic/ruf reports that are missing `Arrival-Date` and/or +`Reported-Domain` + +4.1.8 +----- + +- Be more forgiving of weird XML + 4.1.7 ----- diff --git a/docs/index.rst b/docs/index.rst index 22b8fee..72ae0f6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -390,7 +390,7 @@ To set up visual dashboards of DMARC data, install Elasticsearch and Kibana. wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add - echo "deb https://artifacts.elastic.co/packages/6.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elastic-6.x.list sudo apt-get update - sudo apt-get install -y elasticsearch kibana + sudo apt-get install -y default-jre-headless elasticsearch kibana sudo systemctl daemon-reload sudo systemctl enable elasticsearch.service sudo systemctl enable kibana.service diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 1190662..7e65715 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -44,13 +44,14 @@ import imapclient.exceptions import dateparser import mailparser -__version__ = "4.1.7" +__version__ = "4.2.0" logger = logging.getLogger(__name__) -logger.setLevel(logging.ERROR) +logger.setLevel(logging.WARNING) feedback_report_regex = re.compile(r"^([\w\-]+): (.+)$", re.MULTILINE) -xml_schema_regex = re.compile(r"\s*", re.MULTILINE) +xml_header_regex = re.compile(r"^<\?xml .*$", 
re.MULTILINE) +xml_schema_regex = re.compile(r"<\/?xs:schema.>", re.MULTILINE) MAGIC_ZIP = b"\x50\x4B\x03\x04" MAGIC_GZIP = b"\x1F\x8B" @@ -392,23 +393,28 @@ def _parse_report_record(record, nameservers=None, timeout=2.0): new_record["auth_results"] = OrderedDict([("dkim", []), ("spf", [])]) if record["auth_results"] is not None: auth_results = record["auth_results"].copy() + if "spf" not in auth_results: + auth_results["spf"] = [] + if "dkim" not in auth_results: + auth_results["dkim"] = [] else: auth_results = new_record["auth_results"].copy() - if "dkim" in auth_results: - if type(auth_results["dkim"]) != list: - auth_results["dkim"] = [auth_results["dkim"]] - for result in auth_results["dkim"]: - if "domain" in result and result["domain"] is not None: - new_result = OrderedDict([("domain", result["domain"])]) - if "selector" in result and result["selector"] is not None: - new_result["selector"] = result["selector"] - else: - new_result["selector"] = "none" - if "result" in result and result["result"] is not None: - new_result["result"] = result["result"] - else: - new_result["result"] = "none" - new_record["auth_results"]["dkim"].append(new_result) + + if type(auth_results["dkim"]) != list: + auth_results["dkim"] = [auth_results["dkim"]] + for result in auth_results["dkim"]: + if "domain" in result and result["domain"] is not None: + new_result = OrderedDict([("domain", result["domain"])]) + if "selector" in result and result["selector"] is not None: + new_result["selector"] = result["selector"] + else: + new_result["selector"] = "none" + if "result" in result and result["result"] is not None: + new_result["result"] = result["result"] + else: + new_result["result"] = "none" + new_record["auth_results"]["dkim"].append(new_result) + if type(auth_results["spf"]) != list: auth_results["spf"] = [auth_results["spf"]] for result in auth_results["spf"]: @@ -424,16 +430,19 @@ def _parse_report_record(record, nameservers=None, timeout=2.0): 
new_record["auth_results"]["spf"].append(new_result) if "envelope_from" not in new_record["identifiers"]: - envelope_from = new_record["auth_results"]["spf"][-1]["domain"] + envelope_from = None + if len(auth_results["spf"]) > 0: + envelope_from = new_record["auth_results"]["spf"][-1]["domain"] if envelope_from is not None: envelope_from = str(envelope_from).lower() new_record["identifiers"]["envelope_from"] = envelope_from elif new_record["identifiers"]["envelope_from"] is None: - envelope_from = new_record["auth_results"]["spf"][-1]["domain"] - if envelope_from is not None: - envelope_from = str(envelope_from).lower() - new_record["identifiers"]["envelope_from"] = envelope_from + if len(auth_results["spf"]) > 0: + envelope_from = new_record["auth_results"]["spf"][-1]["domain"] + if envelope_from is not None: + envelope_from = str(envelope_from).lower() + new_record["identifiers"]["envelope_from"] = envelope_from envelope_to = None if "envelope_to" in new_record["identifiers"]: @@ -457,9 +466,20 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): Returns: OrderedDict: The parsed aggregate DMARC report """ + errors = [] + try: + xmltodict.parse(xml)["feedback"] + except Exception as e: + errors.append(e.__str__()) + + try: + # Replace XML header (sometimes they are invalid) + xml = xml_header_regex.sub("", xml) + # Remove invalid schema tags - xml = xml_schema_regex.sub("", xml) + xml = xml_schema_regex.sub('', xml) + report = xmltodict.parse(xml)["feedback"] report_metadata = report["report_metadata"] schema = "draft" @@ -467,7 +487,13 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): schema = report["version"] new_report = OrderedDict([("xml_schema", schema)]) new_report_metadata = OrderedDict() - org_name = _get_base_domain(report_metadata["org_name"]) + if report_metadata["org_name"] is None: + if report_metadata["email"] is not None: + report_metadata["org_name"] = report_metadata[ + "email"].split("@")[-1] + org_name = 
report_metadata["org_name"] + if org_name is not None: + org_name = _get_base_domain(org_name) new_report_metadata["org_name"] = org_name new_report_metadata["org_email"] = report_metadata["email"] extra = None @@ -484,7 +510,6 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): date_range["end"] = _timestamp_to_human(date_range["end"]) new_report_metadata["begin_date"] = date_range["begin"] new_report_metadata["end_date"] = date_range["end"] - errors = [] if "error" in report["report_metadata"]: if type(report["report_metadata"]["error"]) != list: errors = [report["report_metadata"]["error"]] @@ -526,13 +551,16 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): if type(report["record"]) == list: for record in report["record"]: - records.append(_parse_report_record(record, - nameservers=nameservers, - timeout=timeout)) + report_record = _parse_report_record(record, + nameservers=nameservers, + timeout=timeout) + records.append(report_record) else: - records.append(_parse_report_record(report["record"], - nameservers=nameservers)) + report_record = _parse_report_record(report["record"], + nameservers=nameservers, + timeout=timeout) + records.append(report_record) new_report["records"] = records @@ -726,7 +754,7 @@ def parsed_aggregate_reports_to_csv(reports): def parse_forensic_report(feedback_report, sample, sample_headers_only, - nameservers=None, timeout=2.0): + msg_date, nameservers=None, timeout=2.0): """ Converts a DMARC forensic report and sample to a ``OrderedDict`` @@ -734,12 +762,13 @@ def parse_forensic_report(feedback_report, sample, sample_headers_only, feedback_report (str): A message's feedback report as a string sample (str): The RFC 822 headers or RFC 822 message sample sample_headers_only (bool): Set true if the sample is only headers + msg_date (str): The message's date header nameservers (list): A list of one or more nameservers to use (Cloudflare's public DNS resolvers by default) timeout (float): Sets the 
DNS timeout in seconds Returns: - OrderedDict: An parsed report and sample + OrderedDict: A parsed report and sample """ def convert_address(original_address): @@ -777,14 +806,14 @@ def parse_forensic_report(feedback_report, sample, sample_headers_only, for report_value in report_values: key = report_value[0].lower().replace("-", "_") parsed_report[key] = report_value[1] - if key == "arrival_date": - arrival_utc = dateparser.parse(parsed_report["arrival_date"], - settings={"TO_TIMEZONE": "UTC"}) - arrival_utc = arrival_utc.strftime("%Y-%m-%d %H:%M:%S") - parsed_report["arrival_date_utc"] = arrival_utc - if "arrival_date_utc" not in parsed_report: - raise InvalidForensicReport("Missing Arrival-Date") + if "arrival_date" not in parsed_report: + parsed_report["arrival_date"] = msg_date + + arrival_utc = dateparser.parse(parsed_report["arrival_date"], + settings={"TO_TIMEZONE": "UTC"}) + arrival_utc = arrival_utc.strftime("%Y-%m-%d %H:%M:%S") + parsed_report["arrival_date_utc"] = arrival_utc ip_address = parsed_report["source_ip"] parsed_report["source"] = _get_ip_address_info(ip_address, @@ -827,7 +856,11 @@ def parse_forensic_report(feedback_report, sample, sample_headers_only, if "date_utc" in received: received["date_utc"] = received["date_utc"].replace("T", " ") - parsed_sample["from"] = convert_address(parsed_sample["from"][0]) + msg_from = convert_address(parsed_sample["from"][0]) + parsed_sample["from"] = msg_from + if "reported_domain" not in parsed_report: + domain = msg_from["address"].split("@")[-1].lower() + parsed_report["reported_domain"] = domain if "reply_to" in parsed_sample: parsed_sample["reply_to"] = list(map(lambda x: convert_address(x), @@ -1012,6 +1045,7 @@ def parse_report_email(input_, nameservers=None, timeout=2.0): sample = None if "subject" in msg: subject = decode_header(msg["subject"]) + date = decode_header(msg["date"]) for part in msg.walk(): content_type = part.get_content_type() payload = part.get_payload() @@ -1039,6 +1073,7 @@ def 
parse_report_email(input_, nameservers=None, timeout=2.0): forensic_report = parse_forensic_report(feedback_report, sample, sample_headers_only, + date, nameservers=nameservers, timeout=timeout) @@ -1214,10 +1249,8 @@ def get_dmarc_reports_from_inbox(host=None, if type(msg_uids) == str: msg_uids = [msg_uids] - for chunk in chunks(msg_uids, 100): - server.add_flags(chunk, [imapclient.DELETED]) - - server.expunge() + server.delete_messages(msg_uids, silent=True) + server.expunge(msg_uids) def move_messages(msg_uids, folder): if type(msg_uids) == str: diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index fcf3d62..7be74b5 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -83,7 +83,7 @@ def _main(): arg_parser.add_argument("--imap-port", default=None, help="IMAP port") arg_parser.add_argument("--imap-no-ssl", action="store_true", default=False, - help="Do not use SSL when connecting to IMAP") + help="Do not use SSL/TLS when connecting to IMAP") arg_parser.add_argument("-r", "--reports-folder", default="INBOX", help="The IMAP folder containing the reports\n" "Default: INBOX") @@ -157,7 +157,7 @@ def _main(): help="Do not move or delete IMAP messages", action="store_true", default=False) arg_parser.add_argument("-s", "--silent", action="store_true", - help="Only print errors") + help="Only print errors and warnings") arg_parser.add_argument("--debug", action="store_true", help="Print debugging information") arg_parser.add_argument("-v", "--version", action="version", @@ -168,8 +168,9 @@ def _main(): args = arg_parser.parse_args() - logging.basicConfig(level=logging.ERROR) - logger.setLevel(logging.ERROR) + logging.basicConfig(level=logging.WARNING) + logger.setLevel(logging.WARNING) + if args.debug: logging.basicConfig(level=logging.DEBUG) logger.setLevel(logging.DEBUG) diff --git a/parsedmarc/splunk.py b/parsedmarc/splunk.py index 96189b9..714f2d4 100644 --- a/parsedmarc/splunk.py +++ b/parsedmarc/splunk.py @@ -1,11 +1,14 @@ from urllib.parse import urlparse 
import socket import json +import urllib3 import requests from parsedmarc import __version__, human_timestamp_to_timestamp +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + class SplunkError(RuntimeError): """Raised when a Splunk API error occurs""" @@ -93,11 +96,12 @@ class HECClient(object): new_report["spf_results"] = record["auth_results"][ "spf"] - data["sourcetype"] = "dmarc:aggregate" - timestamp = human_timestamp_to_timestamp(new_report["begin_date"]) - data["time"] = timestamp - data["event"] = new_report.copy() - json_str += "{0}\n".format(json.dumps(data)) + data["sourcetype"] = "dmarc:aggregate" + timestamp = human_timestamp_to_timestamp( + new_report["begin_date"]) + data["time"] = timestamp + data["event"] = new_report.copy() + json_str += "{0}\n".format(json.dumps(data)) try: response = self.session.post(self.url, data=json_str).json() except Exception as e: @@ -135,3 +139,6 @@ class HECClient(object): raise SplunkError(e.__str__()) if response["code"] != 0: raise SplunkError(response["text"]) + + + diff --git a/requirements.txt b/requirements.txt index b10e457..aff41e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ dnspython +urllib3 requests publicsuffix xmltodict diff --git a/samples/!example.com!1538204542!1538463818.xml.sample b/samples/!example.com!1538204542!1538463818.xml.sample new file mode 100644 index 0000000..b68f81a --- /dev/null +++ b/samples/!example.com!1538204542!1538463818.xml.sample @@ -0,0 +1,40 @@ + + + + + administrator@accurateplastics.com + example.com:1538463741 + + 1538413632 + 1538413632 + + + + example.com + r + r +

none

+ reject + 100 +
+ + + 12.20.127.122 + 1 + + none + fail + fail + + + + example.com + + + + + none + + + +
diff --git a/samples/addisonfoods.com!example.com!1536105600!1536191999.xml.sample b/samples/addisonfoods.com!example.com!1536105600!1536191999.xml.sample new file mode 100644 index 0000000..8b7b0fe --- /dev/null +++ b/samples/addisonfoods.com!example.com!1536105600!1536191999.xml.sample @@ -0,0 +1,44 @@ + + + 1.0 + + addisonfoods.com + postmaster@addisonfoods.com + 3ceb5548498640beaeb47327e202b0b9 + + 1536105600 + 1536191999 + + + + example.com + r + r +

none

+ none + 100 + 0 +
+ + + 109.203.100.17 + 1 + + none + fail + fail + + + + example.com + example.com + + + + toptierhighticket.club + default + pass + + + +
diff --git a/samples/estadocuenta1.infonacot.gob.mx!example.com!1536853302!1536939702!2940.xml.sample b/samples/estadocuenta1.infonacot.gob.mx!example.com!1536853302!1536939702!2940.xml.sample new file mode 100644 index 0000000..795e9d9 --- /dev/null +++ b/samples/estadocuenta1.infonacot.gob.mx!example.com!1536853302!1536939702!2940.xml.sample @@ -0,0 +1,34 @@ + + + + 2940 + XYZ Corporation + admin@estadocuenta1.infonacot.gob.mx + http://estadocuenta1.infonacot.gob.mx + + 1536853302 + 1536939702 + + + + example.com +

none

+
+ + + 148.243.137.254 + 1 + + none + fail + fail + + + + estadocuenta1.infonacot.gob.mx + example.com + + + + +
diff --git a/samples/ikea.com!xxxxxx.de!1538690400!1538776800.xml.sample b/samples/ikea.com!example.de!1538690400!1538776800.xml.sample similarity index 87% rename from samples/ikea.com!xxxxxx.de!1538690400!1538776800.xml.sample rename to samples/ikea.com!example.de!1538690400!1538776800.xml.sample index ac3f8b0..1f0c3d1 100644 --- a/samples/ikea.com!xxxxxx.de!1538690400!1538776800.xml.sample +++ b/samples/ikea.com!example.de!1538690400!1538776800.xml.sample @@ -10,7 +10,7 @@ - xxxxxx.de + example.de r r

none

@@ -29,12 +29,12 @@ - xxxxxx.de - xxxxxx.de + example.de + example.de - xxxxxx.de + example.de pass diff --git a/samples/usssa.com!example.com!1538784000!1538870399.xml.sample b/samples/usssa.com!example.com!1538784000!1538870399.xml.sample new file mode 100644 index 0000000..3593b4c --- /dev/null +++ b/samples/usssa.com!example.com!1538784000!1538870399.xml.sample @@ -0,0 +1,56 @@ + + + 1.0 + + usssa.com + postmaster@usssa.com + 8953b4d4a4ee4218b6ac0e2cb2667ee1 + + 1538784000 + 1538870399 + + + + example.com + r + r +

none

+ none + 100 + 0 +
+ + + 12.20.127.40 + 1 + + none + fail + fail + + + + + example.com + + + + + + + 199.230.200.36 + 1 + + none + fail + fail + + + + + example.com + + + + +
diff --git a/setup.py b/setup.py index c2da17e..b94a56c 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ from setuptools import setup from codecs import open from os import path -__version__ = "4.1.7" +__version__ = "4.2.0" description = "A Python package and CLI for parsing aggregate and " \ "forensic DMARC reports" @@ -92,8 +92,8 @@ setup( # requirements files see: # https://packaging.python.org/en/latest/requirements.html install_requires=['dnspython', 'publicsuffix', 'xmltodict', 'geoip2', - 'dnspython', 'imapclient', 'mail-parser', 'dateparser', - 'elasticsearch>=6.3.0,<7.0.0', + 'urllib3', 'requests', 'imapclient', 'mail-parser', + 'dateparser', 'elasticsearch>=6.3.0,<7.0.0', 'elasticsearch-dsl>=6.2.1,<7.0.0' ],