From 16a4be2205f1801afff34696d3e4683bc8cdee02 Mon Sep 17 00:00:00 2001 From: Sean Whalen Date: Sun, 7 Oct 2018 12:50:02 -0400 Subject: [PATCH] 4.1.8 - Be more forgiving of weird XML --- .gitignore | 4 +- CHANGELOG.md | 5 ++ parsedmarc/__init__.py | 53 +++++++++++++----- ...ample.com!1538204542!1538463818.xml.sample | 40 +++++++++++++ ....com!1536853302!1536939702!2940.xml.sample | 34 +++++++++++ ...ample.de!1538690400!1538776800.xml.sample} | 8 +-- ...ample.com!1538784000!1538870399.xml.sample | 56 +++++++++++++++++++ setup.py | 2 +- 8 files changed, 181 insertions(+), 21 deletions(-) create mode 100644 samples/!example.com!1538204542!1538463818.xml.sample create mode 100644 samples/estadocuenta1.infonacot.gob.mx!example.com!1536853302!1536939702!2940.xml.sample rename samples/{ikea.com!xxxxxx.de!1538690400!1538776800.xml.sample => ikea.com!example.de!1538690400!1538776800.xml.sample} (87%) create mode 100644 samples/usssa.com!example.com!1538784000!1538870399.xml.sample diff --git a/.gitignore b/.gitignore index b5eaed7..89f65eb 100644 --- a/.gitignore +++ b/.gitignore @@ -106,7 +106,6 @@ ENV/ .idea/ # I/O files - output/ *.zip *.gz @@ -121,3 +120,6 @@ output/ # Data files *.dat *.mmdb + +# Temp files +tmp/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 46056aa..353132b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +4.1.8 +----- + +- Be more forgiving of weird XML + 4.1.7 ----- diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 1190662..0ca2bee 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -44,13 +44,14 @@ import imapclient.exceptions import dateparser import mailparser -__version__ = "4.1.7" +__version__ = "4.1.8" logger = logging.getLogger(__name__) logger.setLevel(logging.ERROR) feedback_report_regex = re.compile(r"^([\w\-]+): (.+)$", re.MULTILINE) -xml_schema_regex = re.compile(r"\s*", re.MULTILINE) +xml_header_regex = re.compile(r"^<\?xml .*$", re.MULTILINE) +xml_schema_regex = re.compile(r"<\/?xs:schema.>", re.MULTILINE) MAGIC_ZIP = b"\x50\x4B\x03\x04" MAGIC_GZIP = b"\x1F\x8B" @@ -424,16 +425,19 @@ def _parse_report_record(record, nameservers=None, timeout=2.0): new_record["auth_results"]["spf"].append(new_result) if "envelope_from" not in new_record["identifiers"]: - envelope_from = new_record["auth_results"]["spf"][-1]["domain"] + envelope_from = None + if len(auth_results["spf"]) > 0: + envelope_from = new_record["auth_results"]["spf"][-1]["domain"] if envelope_from is not None: envelope_from = str(envelope_from).lower() new_record["identifiers"]["envelope_from"] = envelope_from elif new_record["identifiers"]["envelope_from"] is None: - envelope_from = new_record["auth_results"]["spf"][-1]["domain"] - if envelope_from is not None: - envelope_from = str(envelope_from).lower() - new_record["identifiers"]["envelope_from"] = envelope_from + if len(auth_results["spf"]) > 0: + envelope_from = new_record["auth_results"]["spf"][-1]["domain"] + if envelope_from is not None: + envelope_from = str(envelope_from).lower() + new_record["identifiers"]["envelope_from"] = envelope_from envelope_to = None if "envelope_to" in new_record["identifiers"]: @@ -457,9 +461,20 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): Returns: OrderedDict: The parsed aggregate DMARC report """ + errors = [] + try: + xmltodict.parse(xml)["feedback"] + except Exception as e: + errors.append(e.__str__()) + + try: + # Replace XML header (sometimes they are invalid) + xml = xml_header_regex.sub("", xml) + # Remove invalid schema tags - xml = xml_schema_regex.sub("", xml) + xml = xml_schema_regex.sub('', xml) + report = xmltodict.parse(xml)["feedback"] report_metadata = report["report_metadata"] schema = "draft" @@ -467,7 +482,13 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): schema = report["version"] new_report = OrderedDict([("xml_schema", schema)]) new_report_metadata = OrderedDict() - org_name = _get_base_domain(report_metadata["org_name"]) + if report_metadata["org_name"] is None: + if report_metadata["email"] is not None: + report_metadata["org_name"] = report_metadata[ + "email"].split("@")[-1] + org_name = report_metadata["org_name"] + if org_name is not None: + org_name = _get_base_domain(org_name) new_report_metadata["org_name"] = org_name new_report_metadata["org_email"] = report_metadata["email"] extra = None @@ -484,7 +505,6 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): date_range["end"] = _timestamp_to_human(date_range["end"]) new_report_metadata["begin_date"] = date_range["begin"] new_report_metadata["end_date"] = date_range["end"] - errors = [] if "error" in report["report_metadata"]: if type(report["report_metadata"]["error"]) != list: errors = [report["report_metadata"]["error"]] @@ -526,13 +546,16 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0): if type(report["record"]) == list: for record in report["record"]: - records.append(_parse_report_record(record, - nameservers=nameservers, - timeout=timeout)) + report_record = _parse_report_record(record, + nameservers=nameservers, + timeout=timeout) + records.append(report_record) else: - records.append(_parse_report_record(report["record"], - nameservers=nameservers)) + report_record = _parse_report_record(report["record"], + nameservers=nameservers, + timeout=timeout) + records.append(report_record) new_report["records"] = records diff --git a/samples/!example.com!1538204542!1538463818.xml.sample b/samples/!example.com!1538204542!1538463818.xml.sample new file mode 100644 index 0000000..b68f81a --- /dev/null +++ b/samples/!example.com!1538204542!1538463818.xml.sample @@ -0,0 +1,40 @@ + + + + + administrator@accurateplastics.com + example.com:1538463741 + + 1538413632 + 1538413632 + + + + example.com + r + r +

none

+ reject + 100 +
+ + + 12.20.127.122 + 1 + + none + fail + fail + + + + example.com + + + + + none + + + +
diff --git a/samples/estadocuenta1.infonacot.gob.mx!example.com!1536853302!1536939702!2940.xml.sample b/samples/estadocuenta1.infonacot.gob.mx!example.com!1536853302!1536939702!2940.xml.sample new file mode 100644 index 0000000..795e9d9 --- /dev/null +++ b/samples/estadocuenta1.infonacot.gob.mx!example.com!1536853302!1536939702!2940.xml.sample @@ -0,0 +1,34 @@ + + + + 2940 + XYZ Corporation + admin@estadocuenta1.infonacot.gob.mx + http://estadocuenta1.infonacot.gob.mx + + 1536853302 + 1536939702 + + + + example.com +

none

+
+ + + 148.243.137.254 + 1 + + none + fail + fail + + + + estadocuenta1.infonacot.gob.mx + example.com + + + + +
diff --git a/samples/ikea.com!xxxxxx.de!1538690400!1538776800.xml.sample b/samples/ikea.com!example.de!1538690400!1538776800.xml.sample similarity index 87% rename from samples/ikea.com!xxxxxx.de!1538690400!1538776800.xml.sample rename to samples/ikea.com!example.de!1538690400!1538776800.xml.sample index ac3f8b0..1f0c3d1 100644 --- a/samples/ikea.com!xxxxxx.de!1538690400!1538776800.xml.sample +++ b/samples/ikea.com!example.de!1538690400!1538776800.xml.sample @@ -10,7 +10,7 @@ - xxxxxx.de + example.de r r

none

@@ -29,12 +29,12 @@ - xxxxxx.de - xxxxxx.de + example.de + example.de - xxxxxx.de + example.de pass diff --git a/samples/usssa.com!example.com!1538784000!1538870399.xml.sample b/samples/usssa.com!example.com!1538784000!1538870399.xml.sample new file mode 100644 index 0000000..3593b4c --- /dev/null +++ b/samples/usssa.com!example.com!1538784000!1538870399.xml.sample @@ -0,0 +1,56 @@ + + + 1.0 + + usssa.com + postmaster@usssa.com + 8953b4d4a4ee4218b6ac0e2cb2667ee1 + + 1538784000 + 1538870399 + + + + example.com + r + r +

none

+ none + 100 + 0 +
+ + + 12.20.127.40 + 1 + + none + fail + fail + + + + + example.com + + + + + + + 199.230.200.36 + 1 + + none + fail + fail + + + + + example.com + + + + +
diff --git a/setup.py b/setup.py index c2da17e..47fba1c 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ from setuptools import setup from codecs import open from os import path -__version__ = "4.1.7" +__version__ = "4.1.8" description = "A Python package and CLI for parsing aggregate and " \ "forensic DMARC reports"