diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py
index 5379711..43e75b9 100644
--- a/parsedmarc/__init__.py
+++ b/parsedmarc/__init__.py
@@ -24,6 +24,7 @@ import mailbox
import mailparser
from expiringdict import ExpiringDict
import xmltodict
+from lxml import etree
from mailsuite.imap import IMAPClient
from mailsuite.smtp import send_email
@@ -355,11 +356,11 @@ def extract_xml(input_):
file_object.seek(0)
if header.startswith(MAGIC_ZIP):
_zip = zipfile.ZipFile(file_object)
- xml = _zip.open(_zip.namelist()[0]).read().decode()
+ xml = _zip.open(_zip.namelist()[0]).read().decode(errors='ignore')
elif header.startswith(MAGIC_GZIP):
- xml = GzipFile(fileobj=file_object).read().decode()
+ xml = GzipFile(fileobj=file_object).read().decode(errors='ignore')
elif header.startswith(MAGIC_XML):
- xml = file_object.read().decode()
+ xml = file_object.read().decode(errors='ignore')
else:
file_object.close()
raise InvalidAggregateReport("Not a valid zip, gzip, or xml file")
@@ -419,7 +420,7 @@ def parsed_aggregate_reports_to_csv_rows(reports):
return str(obj).lower()
if type(reports) == OrderedDict:
- reports = [reports]
+ reports = [reports['report']]
rows = []
diff --git a/requirements.txt b/requirements.txt
index 0680ca2..478c64d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,3 +29,4 @@ sphinx>=1.0.5
sphinx_rtd_theme>=0.4.3
wheel>=0.33.6
codecov>=2.0.15
+lxml>=4.4.1
diff --git a/samples/aggregate/invalid_utf_8.xml b/samples/aggregate/invalid_utf_8.xml
new file mode 100644
index 0000000..e2346f3
--- /dev/null
+++ b/samples/aggregate/invalid_utf_8.xml
@@ -0,0 +1,40 @@
+
+
+
+
+ administrator@accurateplastics.com
+ example.com:1538463741
+
+ 1538413632
+ 1538413632
+
+
+
+ example.com
+ r
+ r
+ none
+ reject
+ 100
+
+
+
+ 12.20.127.122
+ 1
+
+ none
+ fail
+ fail
+
+
+
+ bad_byte‘
+
+
+
+
+ none
+
+
+
+
diff --git a/samples/aggregate/invalid_xml.xml b/samples/aggregate/invalid_xml.xml
new file mode 100644
index 0000000..f9df750
--- /dev/null
+++ b/samples/aggregate/invalid_xml.xml
@@ -0,0 +1,40 @@
+
+
+
+ veeam.com
+
+ sonexushealth.com:1530233361
+
+ 1530133200
+ 1530219600
+
+
+
+ example.com
+ r
+ r
+ none
+ none
+ 100
+
+
+
+ 199.230.200.36
+ 1
+
+ none
+ fail
+ fail
+
+
+
+ example.com
+
+
+
+
+ none
+
+
+
+
diff --git a/setup.py b/setup.py
index 8b731a6..17e1570 100644
--- a/setup.py
+++ b/setup.py
@@ -104,7 +104,7 @@ setup(
'elasticsearch>=6.3.1,<7.0.0',
'elasticsearch-dsl>=6.3.1,<7.0.0',
'kafka-python>=1.4.4',
- 'tqdm>=4.31.1'
+ 'tqdm>=4.31.1', 'lxml>=4.4.1'
],
entry_points={