diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 5379711..43e75b9 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -24,6 +24,7 @@ import mailbox import mailparser from expiringdict import ExpiringDict import xmltodict +from lxml import etree from mailsuite.imap import IMAPClient from mailsuite.smtp import send_email @@ -355,11 +356,11 @@ def extract_xml(input_): file_object.seek(0) if header.startswith(MAGIC_ZIP): _zip = zipfile.ZipFile(file_object) - xml = _zip.open(_zip.namelist()[0]).read().decode() + xml = _zip.open(_zip.namelist()[0]).read().decode(errors='ignore') elif header.startswith(MAGIC_GZIP): - xml = GzipFile(fileobj=file_object).read().decode() + xml = GzipFile(fileobj=file_object).read().decode(errors='ignore') elif header.startswith(MAGIC_XML): - xml = file_object.read().decode() + xml = file_object.read().decode(errors='ignore') else: file_object.close() raise InvalidAggregateReport("Not a valid zip, gzip, or xml file") @@ -419,7 +420,7 @@ def parsed_aggregate_reports_to_csv_rows(reports): return str(obj).lower() if type(reports) == OrderedDict: - reports = [reports] + reports = [reports['report']] rows = [] diff --git a/requirements.txt b/requirements.txt index 0680ca2..478c64d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,3 +29,4 @@ sphinx>=1.0.5 sphinx_rtd_theme>=0.4.3 wheel>=0.33.6 codecov>=2.0.15 +lxml>=4.4.1 diff --git a/samples/aggregate/invalid_utf_8.xml b/samples/aggregate/invalid_utf_8.xml new file mode 100644 index 0000000..e2346f3 --- /dev/null +++ b/samples/aggregate/invalid_utf_8.xml @@ -0,0 +1,40 @@ + + + + + administrator@accurateplastics.com + example.com:1538463741 + + 1538413632 + 1538413632 + + + + example.com + r + r +

none

+ reject + 100 +
+ + + 12.20.127.122 + 1 + + none + fail + fail + + + + bad_byte‘ + + + + + none + + + +
diff --git a/samples/aggregate/invalid_xml.xml b/samples/aggregate/invalid_xml.xml new file mode 100644 index 0000000..f9df750 --- /dev/null +++ b/samples/aggregate/invalid_xml.xml @@ -0,0 +1,40 @@ + + + + veeam.com + + sonexushealth.com:1530233361 + + 1530133200 + 1530219600 + + + + example.com + r + r +

none

+ none + 100 +
+ + + 199.230.200.36 + 1 + + none + fail + fail + + + + example.com + + + + + none + + + +
diff --git a/setup.py b/setup.py index 8b731a6..17e1570 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ setup( 'elasticsearch>=6.3.1,<7.0.0', 'elasticsearch-dsl>=6.3.1,<7.0.0', 'kafka-python>=1.4.4', - 'tqdm>=4.31.1' + 'tqdm>=4.31.1', 'lxml>=4.4.1' ], entry_points={