4.1.8 - Be more forgiving of weird XML

This commit is contained in:
Sean Whalen
2018-10-07 12:50:02 -04:00
parent ead03b9779
commit 16a4be2205
8 changed files with 181 additions and 21 deletions

4
.gitignore vendored
View File

@@ -106,7 +106,6 @@ ENV/
.idea/
# I/O files
output/
*.zip
*.gz
@@ -121,3 +120,6 @@ output/
# Data files
*.dat
*.mmdb
# Temp files
tmp/

View File

@@ -1,3 +1,8 @@
4.1.8
-----
- Be more forgiving of weird XML
4.1.7
-----

View File

@@ -44,13 +44,14 @@ import imapclient.exceptions
import dateparser
import mailparser
__version__ = "4.1.7"
__version__ = "4.1.8"
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
feedback_report_regex = re.compile(r"^([\w\-]+): (.+)$", re.MULTILINE)
xml_schema_regex = re.compile(r"\s*<xs:schema.*>", re.MULTILINE)
xml_header_regex = re.compile(r"^<\?xml .*$", re.MULTILINE)
xml_schema_regex = re.compile(r"<\/?xs:schema.>", re.MULTILINE)
MAGIC_ZIP = b"\x50\x4B\x03\x04"
MAGIC_GZIP = b"\x1F\x8B"
@@ -424,16 +425,19 @@ def _parse_report_record(record, nameservers=None, timeout=2.0):
new_record["auth_results"]["spf"].append(new_result)
if "envelope_from" not in new_record["identifiers"]:
envelope_from = new_record["auth_results"]["spf"][-1]["domain"]
envelope_from = None
if len(auth_results["spf"]) > 0:
envelope_from = new_record["auth_results"]["spf"][-1]["domain"]
if envelope_from is not None:
envelope_from = str(envelope_from).lower()
new_record["identifiers"]["envelope_from"] = envelope_from
elif new_record["identifiers"]["envelope_from"] is None:
envelope_from = new_record["auth_results"]["spf"][-1]["domain"]
if envelope_from is not None:
envelope_from = str(envelope_from).lower()
new_record["identifiers"]["envelope_from"] = envelope_from
if len(auth_results["spf"]) > 0:
envelope_from = new_record["auth_results"]["spf"][-1]["domain"]
if envelope_from is not None:
envelope_from = str(envelope_from).lower()
new_record["identifiers"]["envelope_from"] = envelope_from
envelope_to = None
if "envelope_to" in new_record["identifiers"]:
@@ -457,9 +461,20 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0):
Returns:
OrderedDict: The parsed aggregate DMARC report
"""
errors = []
try:
xmltodict.parse(xml)["feedback"]
except Exception as e:
errors.append(e.__str__())
try:
# Replace the XML header (it is sometimes invalid)
xml = xml_header_regex.sub("", xml)
# Remove invalid schema tags
xml = xml_schema_regex.sub("", xml)
xml = xml_schema_regex.sub('<?xml version="1.0"?>', xml)
report = xmltodict.parse(xml)["feedback"]
report_metadata = report["report_metadata"]
schema = "draft"
@@ -467,7 +482,13 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0):
schema = report["version"]
new_report = OrderedDict([("xml_schema", schema)])
new_report_metadata = OrderedDict()
org_name = _get_base_domain(report_metadata["org_name"])
if report_metadata["org_name"] is None:
if report_metadata["email"] is not None:
report_metadata["org_name"] = report_metadata[
"email"].split("@")[-1]
org_name = report_metadata["org_name"]
if org_name is not None:
org_name = _get_base_domain(org_name)
new_report_metadata["org_name"] = org_name
new_report_metadata["org_email"] = report_metadata["email"]
extra = None
@@ -484,7 +505,6 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0):
date_range["end"] = _timestamp_to_human(date_range["end"])
new_report_metadata["begin_date"] = date_range["begin"]
new_report_metadata["end_date"] = date_range["end"]
errors = []
if "error" in report["report_metadata"]:
if type(report["report_metadata"]["error"]) != list:
errors = [report["report_metadata"]["error"]]
@@ -526,13 +546,16 @@ def parse_aggregate_report_xml(xml, nameservers=None, timeout=2.0):
if type(report["record"]) == list:
for record in report["record"]:
records.append(_parse_report_record(record,
nameservers=nameservers,
timeout=timeout))
report_record = _parse_report_record(record,
nameservers=nameservers,
timeout=timeout)
records.append(report_record)
else:
records.append(_parse_report_record(report["record"],
nameservers=nameservers))
report_record = _parse_report_record(report["record"],
nameservers=nameservers,
timeout=timeout)
records.append(report_record)
new_report["records"] = records

View File

@@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8" ?>
<feedback>
<report_metadata>
<org_name></org_name>
<email>administrator@accurateplastics.com</email>
<report_id>example.com:1538463741</report_id>
<date_range>
<begin>1538413632</begin>
<end>1538413632</end>
</date_range>
</report_metadata>
<policy_published>
<domain>example.com</domain>
<adkim>r</adkim>
<aspf>r</aspf>
<p>none</p>
<sp>reject</sp>
<pct>100</pct>
</policy_published>
<record>
<row>
<source_ip>12.20.127.122</source_ip>
<count>1</count>
<policy_evaluated>
<disposition>none</disposition>
<dkim>fail</dkim>
<spf>fail</spf>
</policy_evaluated>
</row>
<identifiers>
<header_from>example.com</header_from>
</identifiers>
<auth_results>
<spf>
<domain></domain>
<result>none</result>
</spf>
</auth_results>
</record>
</feedback>

View File

@@ -0,0 +1,34 @@
<?xml version="1.0"?>
<feedback>
<report_metadata>
<report_id>2940</report_id>
<org_name>XYZ Corporation</org_name>
<email>admin@estadocuenta1.infonacot.gob.mx</email>
<extra_contact_info>http://estadocuenta1.infonacot.gob.mx</extra_contact_info>
<date_range>
<begin>1536853302</begin>
<end>1536939702</end>
</date_range>
</report_metadata>
<policy_published>
<domain>example.com</domain>
<p>none</p>
</policy_published>
<record>
<row>
<source_ip>148.243.137.254</source_ip>
<count>1</count>
<policy_evaluated>
<disposition>none</disposition>
<dkim>fail</dkim>
<spf>fail</spf>
</policy_evaluated>
</row>
<identifiers>
<envelope_to>estadocuenta1.infonacot.gob.mx</envelope_to>
<header_from>example.com</header_from>
</identifiers>
<auth_results>
</auth_results>
</record>
</feedback>

View File

@@ -10,7 +10,7 @@
</date_range>
</report_metadata>
<policy_published>
<domain>xxxxxx.de</domain>
<domain>example.de</domain>
<adkim>r</adkim>
<aspf>r</aspf>
<p>none</p>
@@ -29,12 +29,12 @@
</policy_evaluated>
</row>
<identifiers>
<header_from>xxxxxx.de</header_from>
<envelope_from>xxxxxx.de</envelope_from>
<header_from>example.de</header_from>
<envelope_from>example.de</envelope_from>
</identifiers>
<auth_results>
<dkim>
<domain>xxxxxx.de</domain>
<domain>example.de</domain>
<result>pass</result>
</dkim>
<spf>

View File

@@ -0,0 +1,56 @@
<?xml version="1.0"?>
<feedback>
<version>1.0</version>
<report_metadata>
<org_name>usssa.com</org_name>
<email>postmaster@usssa.com</email>
<report_id>8953b4d4a4ee4218b6ac0e2cb2667ee1</report_id>
<date_range>
<begin>1538784000</begin>
<end>1538870399</end>
</date_range>
</report_metadata>
<policy_published>
<domain>example.com</domain>
<adkim>r</adkim>
<aspf>r</aspf>
<p>none</p>
<sp>none</sp>
<pct>100</pct>
<fo>0</fo>
</policy_published>
<record>
<row>
<source_ip>12.20.127.40</source_ip>
<count>1</count>
<policy_evaluated>
<disposition>none</disposition>
<dkim>fail</dkim>
<spf>fail</spf>
</policy_evaluated>
</row>
<identifiers>
<envelope_from></envelope_from>
<header_from>example.com</header_from>
</identifiers>
<auth_results>
</auth_results>
</record>
<record>
<row>
<source_ip>199.230.200.36</source_ip>
<count>1</count>
<policy_evaluated>
<disposition>none</disposition>
<dkim>fail</dkim>
<spf>fail</spf>
</policy_evaluated>
</row>
<identifiers>
<envelope_from></envelope_from>
<header_from>example.com</header_from>
</identifiers>
<auth_results>
</auth_results>
</record>
</feedback>

View File

@@ -14,7 +14,7 @@ from setuptools import setup
from codecs import open
from os import path
__version__ = "4.1.7"
__version__ = "4.1.8"
description = "A Python package and CLI for parsing aggregate and " \
"forensic DMARC reports"