diff --git a/.gitignore b/.gitignore index 9b845e1..fd61433 100644 --- a/.gitignore +++ b/.gitignore @@ -118,7 +118,8 @@ output/ # Data files *.dat -*.mmdb +GeoIP* +GeoLite* # Temp files tmp/ @@ -127,5 +128,3 @@ tmp/ prod*.ini stage*.ini dev*.ini - -*.mmdb diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cd66b3..8cb5da6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,18 @@ Changelog ========= +7.1.0 +----- + +- A static copy of the DBIP database is now included for use when a copy of the MaxMind GeoLite2 Country database is not installed (Closes #275) +- Add `ip_db_path` as a parameter and `general` setting for a custom IP geolocation database location (Closes #184) +- Search default Homebrew path when searching for a copy of the MaxMind GeoLite2 Country database (Closes #272) +- Fix log messages written to root logger (PR #276) +- Fix `--offline` option in CLI not being passed as a boolean (PR #265) +- Set Elasticsearch shard replication to `0` (PR #274) +- Add support for syslog output (PR #263 closes #227) +- Do not print tqdm progress bar when running in a non-interactive TTY (PR #264) + 7.0.1 ----- diff --git a/README.rst b/README.rst index 6184f7a..dc21171 100644 --- a/README.rst +++ b/README.rst @@ -104,7 +104,17 @@ CLI help .. note:: - In ``parsedmarc`` 6.0.0, most CLI options were moved to a configuration file, described below. + Starting in ``parsedmarc`` 7.1.0, a static copy of the `IP to Country Lite database`_ from DBIP is + distributed with ``parsedmarc``, under the terms of the `Creative Commons Attribution 4.0 International License`_, as + a fallback if the `MaxMind GeoLite2 Country database`_ is not installed. However, ``parsedmarc`` cannot install updated + versions of these databases as they are released, so MaxMind's databases and `geoipupdate`_ tool are still the + preferable solution. + + The location of the database file can be overridden by using the ``ip_db_path`` setting. + +.. 
note:: + + Starting in ``parsedmarc`` 6.0.0, most CLI options were moved to a configuration file, described below. Configuration file ================== @@ -157,6 +167,7 @@ The full set of configuration options are: - ``output`` - str: Directory to place JSON and CSV files in - ``aggregate_json_filename`` - str: filename for the aggregate JSON output file - ``forensic_json_filename`` - str: filename for the forensic JSON output file + - ``ip_db_path`` - str: An optional custom path to a MMDB file from MaxMind or DBIP - ``offline`` - bool: Do not use online queries for geolocation or DNS - ``nameservers`` - str: A comma separated list of DNS resolvers (Default: `Cloudflare's public resolvers`_) - ``dns_timeout`` - float: DNS timeout period @@ -481,6 +492,14 @@ https://github.com/domainaware/parsedmarc/issues .. _Demystifying DMARC: https://seanthegeek.net/459/demystifying-dmarc/ +.. _IP to Country Lite database: https://db-ip.com/db/download/ip-to-country-lite + +.. _Creative Commons Attribution 4.0 International License: https://creativecommons.org/licenses/by/4.0/ + +.. _MaxMind GeoLite2 Country database: https://dev.maxmind.com/geoip/geolite2-free-geolocation-data + +.. _geoipupdate: https://github.com/maxmind/geoipupdate + .. _Cloudflare's public resolvers: https://1.1.1.1/ .. 
_URL encoded: https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_reserved_characters diff --git a/docs/index.rst b/docs/index.rst index fa9cd62..87c864a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -148,6 +148,10 @@ For example bucket = my-bucket path = parsedmarc + [syslog] + server = localhost + port = 514 + The full set of configuration options are: - ``general`` @@ -157,6 +161,7 @@ The full set of configuration options are: - ``output`` - str: Directory to place JSON and CSV files in - ``aggregate_json_filename`` - str: filename for the aggregate JSON output file - ``forensic_json_filename`` - str: filename for the forensic JSON output file + - ``ip_db_path`` - str: An optional custom path to a MMDB file from MaxMind or DBIP - ``offline`` - bool: Do not use online queries for geolocation or DNS - ``nameservers`` - str: A comma separated list of DNS resolvers (Default: `Cloudflare's public resolvers`_) - ``dns_timeout`` - float: DNS timeout period @@ -496,6 +501,16 @@ Installation geoipupdate setup ----------------- +.. note:: + + Starting in ``parsedmarc`` 7.1.0, a static copy of the `IP to Country Lite database`_ from DBIP is + distributed with ``parsedmarc``, under the terms of the `Creative Commons Attribution 4.0 International License`_, as + a fallback if the `MaxMind GeoLite2 Country database`_ is not installed. However, ``parsedmarc`` cannot install updated + versions of these databases as they are released, so MaxMind's databases and `geoipupdate`_ tool are still the + preferable solution. + + The location of the database file can be overridden by using the ``ip_db_path`` setting. + On Debian 10 (Buster) or later, run: .. code-block:: bash @@ -1601,6 +1616,14 @@ Indices and tables .. _Demystifying DMARC: https://seanthegeek.net/459/demystifying-dmarc/ +.. _IP to Country Lite database: https://db-ip.com/db/download/ip-to-country-lite + +.. 
_Creative Commons Attribution 4.0 International License: https://creativecommons.org/licenses/by/4.0/ + +.. _MaxMind GeoLite2 Country database: https://dev.maxmind.com/geoip/geolite2-free-geolocation-data + +.. _geoipupdate: https://github.com/maxmind/geoipupdate + .. _Cloudflare's public resolvers: https://1.1.1.1/ .. _URL encoded: https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_reserved_characters diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 712b822..1c504ec 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -36,7 +36,7 @@ from parsedmarc.utils import is_outlook_msg, convert_outlook_msg from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime from parsedmarc.utils import parse_email -__version__ = "7.0.1" +__version__ = "7.1.0" logging.basicConfig( format='%(levelname)8s:%(filename)s:%(lineno)d:' @@ -73,14 +73,15 @@ class InvalidForensicReport(InvalidDMARCReport): """Raised when an invalid DMARC forensic report is encountered""" -def _parse_report_record(record, offline=False, nameservers=None, - dns_timeout=2.0, parallel=False): +def _parse_report_record(record, ip_db_path=None, offline=False, + nameservers=None, dns_timeout=2.0, parallel=False): """ Converts a record from a DMARC aggregate report into a more consistent format Args: record (OrderedDict): The record to convert + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not query online for geolocation or DNS nameservers (list): A list of one or more nameservers to use (Cloudflare's public DNS resolvers by default) @@ -93,6 +94,7 @@ def _parse_report_record(record, offline=False, nameservers=None, new_record = OrderedDict() new_record_source = get_ip_address_info(record["row"]["source_ip"], cache=IP_ADDRESS_CACHE, + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, timeout=dns_timeout, @@ -203,12 +205,13 @@ def _parse_report_record(record, offline=False, nameservers=None, return new_record 
-def parse_aggregate_report_xml(xml, offline=False, nameservers=None, +def parse_aggregate_report_xml(xml, ip_db_path=None, offline=False, nameservers=None, timeout=2.0, parallel=False, server=None): """Parses a DMARC XML report string and returns a consistent OrderedDict Args: xml (str): A string of DMARC aggregate report XML + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not query online for geolocation or DNS nameservers (list): A list of one or more nameservers to use (Cloudflare's public DNS resolvers by default) @@ -314,6 +317,7 @@ def parse_aggregate_report_xml(xml, offline=False, nameservers=None, logger.debug("Processed {0}/{1}".format( i, len(report["record"]))) report_record = _parse_report_record(report["record"][i], + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, dns_timeout=timeout, @@ -322,6 +326,7 @@ def parse_aggregate_report_xml(xml, offline=False, nameservers=None, else: report_record = _parse_report_record(report["record"], + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, dns_timeout=timeout, @@ -391,7 +396,7 @@ def extract_xml(input_): return xml -def parse_aggregate_report_file(_input, offline=False, nameservers=None, +def parse_aggregate_report_file(_input, offline=False, ip_db_path=None, nameservers=None, dns_timeout=2.0, parallel=False, server=None): @@ -401,6 +406,7 @@ def parse_aggregate_report_file(_input, offline=False, nameservers=None, Args: _input: A path to a file, a file like object, or bytes offline (bool): Do not query online for geolocation or DNS + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP nameservers (list): A list of one or more nameservers to use (Cloudflare's public DNS resolvers by default) dns_timeout (float): Sets the DNS timeout in seconds @@ -413,6 +419,7 @@ def parse_aggregate_report_file(_input, offline=False, nameservers=None, xml = extract_xml(_input) return parse_aggregate_report_xml(xml, + ip_db_path=ip_db_path, 
offline=offline, nameservers=nameservers, timeout=dns_timeout, @@ -560,7 +567,7 @@ def parsed_aggregate_reports_to_csv(reports): def parse_forensic_report(feedback_report, sample, msg_date, - offline=False, nameservers=None, dns_timeout=2.0, + offline=False, ip_db_path=None, nameservers=None, dns_timeout=2.0, strip_attachment_payloads=False, parallel=False): """ @@ -568,6 +575,7 @@ def parse_forensic_report(feedback_report, sample, msg_date, Args: feedback_report (str): A message's feedback report as a string + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not query online for geolocation or DNS sample (str): The RFC 822 headers or RFC 822 message sample msg_date (str): The message's date header @@ -619,6 +627,7 @@ def parse_forensic_report(feedback_report, sample, msg_date, ip_address = re.split(r'\s', parsed_report["source_ip"]).pop(0) parsed_report_source = get_ip_address_info(ip_address, + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, timeout=dns_timeout, @@ -747,7 +756,7 @@ def parsed_forensic_reports_to_csv(reports): return csv_file.getvalue() -def parse_report_email(input_, offline=False, nameservers=None, +def parse_report_email(input_, offline=False, ip_db_path=None, nameservers=None, dns_timeout=2.0, strip_attachment_payloads=False, parallel=False, server=None): """ @@ -755,6 +764,7 @@ def parse_report_email(input_, offline=False, nameservers=None, Args: input_: An emailed DMARC report in RFC 822 format, as bytes or a string + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not query online for geolocation on DNS nameservers (list): A list of one or more nameservers to use dns_timeout (float): Sets the DNS timeout in seconds @@ -824,6 +834,7 @@ def parse_report_email(input_, offline=False, nameservers=None, ns = nameservers aggregate_report = parse_aggregate_report_file( payload, + ip_db_path=ip_db_path, offline=offline, nameservers=ns, dns_timeout=dns_timeout, @@ -877,7 
+888,7 @@ def parse_report_email(input_, offline=False, nameservers=None, def parse_report_file(input_, nameservers=None, dns_timeout=2.0, - strip_attachment_payloads=False, + strip_attachment_payloads=False, ip_db_path=None, offline=False, parallel=False, server=None): """Parses a DMARC aggregate or forensic file at the given path, a file-like object. or bytes @@ -889,6 +900,7 @@ def parse_report_file(input_, nameservers=None, dns_timeout=2.0, dns_timeout (float): Sets the DNS timeout in seconds strip_attachment_payloads (bool): Remove attachment payloads from forensic report results + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not make online queries for geolocation or DNS parallel (bool): Parallel processing server (IMAPClient): Connection object @@ -908,6 +920,7 @@ def parse_report_file(input_, nameservers=None, dns_timeout=2.0, file_object.close() try: report = parse_aggregate_report_file(content, + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, dns_timeout=dns_timeout, @@ -919,6 +932,7 @@ def parse_report_file(input_, nameservers=None, dns_timeout=2.0, try: sa = strip_attachment_payloads results = parse_report_email(content, + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, dns_timeout=dns_timeout, @@ -933,7 +947,9 @@ def parse_report_file(input_, nameservers=None, dns_timeout=2.0, def get_dmarc_reports_from_mbox(input_, nameservers=None, dns_timeout=2.0, strip_attachment_payloads=False, - offline=False, parallel=False): + ip_db_path=None, + offline=False, + parallel=False): """Parses a mailbox in mbox format containing e-mails with attached DMARC reports @@ -944,6 +960,7 @@ def get_dmarc_reports_from_mbox(input_, nameservers=None, dns_timeout=2.0, dns_timeout (float): Sets the DNS timeout in seconds strip_attachment_payloads (bool): Remove attachment payloads from forensic report results + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not make online queries 
for geolocation or DNS parallel (bool): Parallel processing @@ -968,6 +985,7 @@ def get_dmarc_reports_from_mbox(input_, nameservers=None, dns_timeout=2.0, try: sa = strip_attachment_payloads parsed_email = parse_report_email(msg_content, + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, dns_timeout=dns_timeout, @@ -1018,6 +1036,7 @@ def get_dmarc_reports_from_inbox(connection=None, archive_folder="Archive", delete=False, test=False, + ip_db_path=None, offline=False, nameservers=None, dns_timeout=6.0, @@ -1041,6 +1060,7 @@ def get_dmarc_reports_from_inbox(connection=None, archive_folder: The folder to move processed mail to delete (bool): Delete messages after processing them test (bool): Do not move or delete messages after processing them + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not query onfline for geolocation or DNS nameservers (list): A list of DNS nameservers to query dns_timeout (float): Set the DNS query timeout @@ -1109,6 +1129,7 @@ def get_dmarc_reports_from_inbox(connection=None, parsed_email = parse_report_email(msg_content, nameservers=nameservers, dns_timeout=dns_timeout, + ip_db_path=ip_db_path, offline=offline, strip_attachment_payloads=sa, server=server) @@ -1206,6 +1227,7 @@ def get_dmarc_reports_from_inbox(connection=None, dns_timeout=dns_timeout, strip_attachment_payloads=strip_attachment_payloads, results=results, + ip_db_path=ip_db_path, offline=offline ) @@ -1215,7 +1237,8 @@ def get_dmarc_reports_from_inbox(connection=None, def watch_inbox(host, username, password, callback, port=None, ssl=True, verify=True, reports_folder="INBOX", archive_folder="Archive", delete=False, test=False, - idle_timeout=30, offline=False, nameservers=None, + idle_timeout=30, ip_db_path=None, + offline=False, nameservers=None, dns_timeout=6.0, strip_attachment_payloads=False, batch_size=None): """ @@ -1234,6 +1257,7 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, delete (bool): 
Delete messages after processing them test (bool): Do not move or delete messages after processing them idle_timeout (int): Number of seconds to wait for a IMAP IDLE response + ip_db_path (str): Path to a MMDB file from MaxMind or DBIP offline (bool): Do not query online for geolocation or DNS nameservers (list): A list of one or more nameservers to use (Cloudflare's public DNS resolvers by default) @@ -1250,6 +1274,7 @@ def watch_inbox(host, username, password, callback, port=None, ssl=True, archive_folder=archive_folder, delete=delete, test=test, + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, dns_timeout=dns_timeout, @@ -1283,7 +1308,7 @@ def save_output(results, output_directory="output", Args: results (OrderedDict): Parsing results - output_directory (str): The patch to the directory to save in + output_directory (str): The path to the directory to save in aggregate_json_filename (str): Filename for the aggregate JSON file forensic_json_filename (str): Filename for the forensic JSON file aggregate_csv_filename (str): Filename for the aggregate CSV file diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index 54962d3..7865d9f 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -32,11 +32,12 @@ def _str_to_list(s): return list(map(lambda i: i.lstrip(), _list)) -def cli_parse(file_path, sa, nameservers, dns_timeout, offline, - parallel=False): +def cli_parse(file_path, sa, nameservers, dns_timeout, + ip_db_path, offline, parallel=False): """Separated this function for multiprocessing""" try: file_results = parse_report_file(file_path, + ip_db_path=ip_db_path, offline=offline, nameservers=nameservers, dns_timeout=dns_timeout, @@ -355,6 +356,8 @@ def _main(): opts.n_procs = general_config.getint("n_procs") if "chunk_size" in general_config: opts.chunk_size = general_config.getint("chunk_size") + if "ip_db_path" in general_config: + opts.ip_db_path = general_config["ip_db_path"] if "imap" in config.sections(): imap_config = config["imap"] if 
"host" in imap_config: @@ -656,6 +659,7 @@ def _main(): repeat(opts.strip_attachment_payloads), repeat(opts.nameservers), repeat(opts.dns_timeout), + repeat(opts.ip_db_path), repeat(opts.offline), repeat(opts.n_procs >= 1)), opts.chunk_size) @@ -683,10 +687,11 @@ def _main(): forensic_reports.append(result[0]["report"]) for mbox_path in mbox_paths: - reports = get_dmarc_reports_from_mbox(mbox_path, opts.nameservers, - opts.dns_timeout, - opts.strip_attachment_payloads, - opts.offline, False) + reports = get_dmarc_reports_from_mbox(mbox_path, nameservers=opts.nameservers, + dns_timeout=opts.dns_timeout, + strip_attachment_payloads=opts.strip_attachment_payloads, + ip_db_path=opts.ip_db_path, + offline=opts.offline, parallel=False) aggregate_reports += reports["aggregate_reports"] forensic_reports += reports["forensic_reports"] @@ -718,6 +723,7 @@ def _main(): password=opts.imap_password, reports_folder=rf, archive_folder=af, + ip_db_path=opts.ip_db_path, delete=opts.imap_delete, offline=opts.offline, nameservers=ns, @@ -786,6 +792,7 @@ def _main(): dns_timeout=opts.dns_timeout, strip_attachment_payloads=sa, batch_size=opts.imap_batch_size, + ip_db_path=opts.ip_db_path, offline=opts.offline) except FileExistsError as error: logger.error("{0}".format(error.__str__())) diff --git a/parsedmarc/resources/__init__.py b/parsedmarc/resources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parsedmarc/resources/dbip-country-lite.mmdb b/parsedmarc/resources/dbip-country-lite.mmdb new file mode 100644 index 0000000..5dcb5ed Binary files /dev/null and b/parsedmarc/resources/dbip-country-lite.mmdb differ diff --git a/parsedmarc/utils.py b/parsedmarc/utils.py index 9446726..5d4c4bf 100644 --- a/parsedmarc/utils.py +++ b/parsedmarc/utils.py @@ -16,6 +16,11 @@ import platform import atexit import mailbox import re +try: + import importlib.resources as pkg_resources +except ImportError: + # Try backported to PY<37 `importlib_resources` + import 
importlib_resources as pkg_resources import dateparser import dns.reversename @@ -26,6 +31,8 @@ import geoip2.errors import requests import publicsuffix2 +import parsedmarc.resources + USER_AGENT = "Mozilla/5.0 (({0} {1})) parsedmarc".format( platform.system(), platform.release(), @@ -249,7 +256,7 @@ def human_timestamp_to_datetime(human_timestamp, to_utc=False): def human_timestamp_to_timestamp(human_timestamp): """ - Converts a human-readable timestamp into a into a UNIX timestamp + Converts a human-readable timestamp into a UNIX timestamp Args: human_timestamp (str): A timestamp in `YYYY-MM-DD HH:MM:SS`` format @@ -261,49 +268,46 @@ def human_timestamp_to_timestamp(human_timestamp): return human_timestamp_to_datetime(human_timestamp).timestamp() -def get_ip_address_country(ip_address): +def get_ip_address_country(ip_address, db_path=None): """ - Uses the MaxMind Geolite2 Country database to return the ISO code for the - country associated with the given IPv4 or IPv6 address + Returns the ISO code for the country associated + with the given IPv4 or IPv6 address Args: ip_address (str): The IP address to query for + db_path (str): Path to a MMDB file from MaxMind or DBIP Returns: str: And ISO country code associated with the given IP address """ - system_paths = [ + db_paths = [ "GeoLite2-Country.mmdb", "/usr/local/share/GeoIP/GeoLite2-Country.mmdb", "/usr/share/GeoIP/GeoLite2-Country.mmdb", "/var/lib/GeoIP/GeoLite2-Country.mmdb", "/var/local/lib/GeoIP/GeoLite2-Country.mmdb", + "/usr/local/var/GeoIP/GeoLite2-Country.mmdb", "%SystemDrive%\\ProgramData\\MaxMind\\GeoIPUpdate\\GeoIP\\" "GeoLite2-Country.mmdb", - "C:\\GeoIP\\GeoLite2-Country.mmdb" + "C:\\GeoIP\\GeoLite2-Country.mmdb", + "dbip-country-lite.mmdb", + "dbip-country.mmdb", ] - db_path = None - - for system_path in system_paths: - if os.path.exists(system_path): - db_path = system_path - break + if db_path is None: + for system_path in db_paths: + if os.path.exists(system_path): + db_path = system_path + 
break if db_path is None: - db_path = os.path.join(tempdir, "GeoLite2-Country.mmdb") - if not os.path.exists(db_path): - logger.warning("GeoLite2-Country.mmdb is missing. " - "Please follow the instructions at " - "https://dev.maxmind.com/geoip/geoipupdate/ " - "to get the latest version.") - return None - else: - db_age = datetime.now() - datetime.fromtimestamp( - os.stat(db_path).st_mtime) - if db_age > timedelta(days=7): - logger.warning("GeoLite2-Country.mmdb is more than a week old") - db_path = db_path + with pkg_resources.path(parsedmarc.resources, "dbip-country-lite.mmdb") as path: + db_path = path + + db_age = datetime.now() - datetime.fromtimestamp( + os.stat(db_path).st_mtime) + if db_age > timedelta(days=30): + logger.warning("IP database is more than a month old") db_reader = geoip2.database.Reader(db_path) @@ -317,13 +321,14 @@ def get_ip_address_country(ip_address): return country -def get_ip_address_info(ip_address, cache=None, offline=False, +def get_ip_address_info(ip_address, ip_db_path=None, cache=None, offline=False, nameservers=None, timeout=2.0, parallel=False): """ Returns reverse DNS and country information for the given IP address Args: ip_address (str): The IP address to check + ip_db_path (str): path to a MMDB file from MaxMind or DBIP cache (ExpiringDict): Cache storage offline (bool): Do not make online queries for geolocation or DNS nameservers (list): A list of one or more nameservers to use @@ -348,7 +353,7 @@ def get_ip_address_info(ip_address, cache=None, offline=False, reverse_dns = get_reverse_dns(ip_address, nameservers=nameservers, timeout=timeout) - country = get_ip_address_country(ip_address) + country = get_ip_address_country(ip_address, db_path=ip_db_path) info["country"] = country info["reverse_dns"] = reverse_dns info["base_domain"] = None