diff --git a/parsedmarc/__init__.py b/parsedmarc/__init__.py index 0c82f73..2bb8dd2 100644 --- a/parsedmarc/__init__.py +++ b/parsedmarc/__init__.py @@ -690,6 +690,8 @@ def parsed_aggregate_reports_to_csv_rows(reports): row["source_country"] = record["source"]["country"] row["source_reverse_dns"] = record["source"]["reverse_dns"] row["source_base_domain"] = record["source"]["base_domain"] + row["source_name"] = record["source"]["name"] + row["source_type"] = record["source"]["type"] row["count"] = record["count"] row["spf_aligned"] = record["alignment"]["spf"] row["dkim_aligned"] = record["alignment"]["dkim"] @@ -758,8 +760,8 @@ def parsed_aggregate_reports_to_csv(reports): "org_extra_contact_info", "report_id", "begin_date", "end_date", "errors", "domain", "adkim", "aspf", "p", "sp", "pct", "fo", "source_ip_address", "source_country", "source_reverse_dns", - "source_base_domain", "count", "spf_aligned", - "dkim_aligned", "dmarc_aligned", "disposition", + "source_base_domain", "source_name", "source_type" "count", + "spf_aligned", "dkim_aligned", "dmarc_aligned", "disposition", "policy_override_reasons", "policy_override_comments", "envelope_from", "header_from", "envelope_to", "dkim_domains", "dkim_selectors", "dkim_results", @@ -918,6 +920,8 @@ def parsed_forensic_reports_to_csv_rows(reports): row["source_ip_address"] = report["source"]["ip_address"] row["source_reverse_dns"] = report["source"]["reverse_dns"] row["source_base_domain"] = report["source"]["base_domain"] + row["source_name"] = report["source"]["name"] + row["source_type"] = report["source"]["type"] row["source_country"] = report["source"]["country"] del row["source"] row["subject"] = report["parsed_sample"]["subject"] @@ -947,7 +951,8 @@ def parsed_forensic_reports_to_csv(reports): "original_mail_from", "original_rcpt_to", "arrival_date", "arrival_date_utc", "subject", "message_id", "authentication_results", "dkim_domain", "source_ip_address", - "source_country", "source_reverse_dns", 
"source_base_domain", + "source_country", "source_reverse_dns", + "source_base_domain", "source_name", "source_type", "delivery_result", "auth_failure", "reported_domain", "authentication_mechanisms", "sample_headers_only"] diff --git a/parsedmarc/elastic.py b/parsedmarc/elastic.py index 07c3f9e..f4eb2a9 100644 --- a/parsedmarc/elastic.py +++ b/parsedmarc/elastic.py @@ -61,6 +61,8 @@ class _AggregateReportDoc(Document): source_country = Text() source_reverse_dns = Text() source_Base_domain = Text() + source_type = Text() + source_name = Text() message_count = Integer disposition = Text() dkim_aligned = Boolean() diff --git a/parsedmarc/opensearch.py b/parsedmarc/opensearch.py index ae44437..54488e0 100644 --- a/parsedmarc/opensearch.py +++ b/parsedmarc/opensearch.py @@ -60,6 +60,8 @@ class _AggregateReportDoc(Document): source_country = Text() source_reverse_dns = Text() source_Base_domain = Text() + source_type = Text() + source_name = Text() message_count = Integer disposition = Text() dkim_aligned = Boolean() diff --git a/parsedmarc/resources/maps/README.md b/parsedmarc/resources/maps/README.md new file mode 100644 index 0000000..e220507 --- /dev/null +++ b/parsedmarc/resources/maps/README.md @@ -0,0 +1,48 @@ +# About + +These files are meant to make it easier to identify who or what a sending source is. Please consider contributing +additional mappings in a GitHub Pull Request. + +## base_reverse_dns_map.csv + +A CSV file with three fields: `base_reverse_dns`, `service_name`, and `service_type`. +Most of the time the base reverse DNS of sending service is closely related to the name of the +service, but not always. Sometimes services will use multiple reverse DNS domains for the same service. For example, +Intuit Mailchimp uses the base domains `mcdlv.net`, `mcsv.net`, +and `rsgsv.net`. Having all of these mapped makes it easier to answer questions like: "How many emails is +Intuit Mailchimp sending as my domains?" 
+ +The `service_type` is based on the primary service provided by that entity. For example, most ISPs provide email +hosting to their customers, but the primary purpose of the service is to provide internet access. Likewise, nearly all +email `Marketing` services are `SaaS` platforms, but it is more useful to identify them as marketing platforms. For +individual entities that use their own reverse DNS domain names but do not provide a `SaaS` platform, setting the +`service_type` to the industry is most useful, with the notable exception of `Email Security` services. The +current `service_type` values in use are: + +Email Provider +Email Security +Entertainment +Finance +Food +Government +Government Media +Healthcare +Industrial +ISP +Logistics +Marketing +MSP +Nonprofit +Print +Real Estate +Retail +SaaS +Social Media +Technology +Travel +University +Web Host + +The file currently contains over 400 mappings from a wide variety of email sending services, including large email +providers, SaaS platforms, small web hosts, and healthcare companies. Ideally this mapping will continuously grow to +include many other services and industries. 
def get_service_from_reverse_dns_base_domain(reverse_dns_base_domain):
    """
    Returns the service name and type of a given base domain name from
    reverse DNS.

    The lookup table is read from the ``base_reverse_dns_map.csv`` resource
    in ``parsedmarc.resources.maps``. Matching is case-insensitive and
    ignores leading/trailing whitespace on both the supplied domain and the
    ``base_reverse_dns`` column.

    Args:
        reverse_dns_base_domain (str): The base domain of the reverse DNS
            lookup

    Returns:
        dict: A dictionary containing ``name`` and ``type``.
        If the service is unknown, the name will be the supplied
        reverse_dns_base_domain (lowercased and stripped) and the type
        will be None
    """
    reverse_dns_base_domain = reverse_dns_base_domain.lower().strip()
    with pkg_resources.path(parsedmarc.resources.maps,
                            "base_reverse_dns_map.csv") as path:
        # The csv module expects files opened with newline=""; an explicit
        # encoding keeps parsing independent of the platform's locale.
        # NOTE(review): assumes the bundled CSV is UTF-8 -- confirm.
        with open(path, encoding="utf-8", newline="") as csv_file:
            # NOTE(review): assumes the header row names the columns
            # base_reverse_dns, name and type -- confirm against the file.
            service_map = {
                row["base_reverse_dns"].lower().strip(): dict(
                    name=row["name"], type=row["type"])
                for row in csv.DictReader(csv_file)
            }
    try:
        return service_map[reverse_dns_base_domain]
    except KeyError:
        # Unknown service: return the same keys as a successful lookup so
        # callers can always read both "name" and "type".
        return dict(name=reverse_dns_base_domain, type=None)