From 1d8af3ccff4d0f28aff15978be5cfa4f06e003f4 Mon Sep 17 00:00:00 2001 From: Sean Whalen Date: Thu, 24 Apr 2025 13:48:51 -0400 Subject: [PATCH] Add find_unknown_base_reverse_dns.py --- parsedmarc/resources/maps/README.md | 42 +++++++++-- .../maps/find_unknown_base_reverse_dns.py | 69 +++++++++++++++++++ 2 files changed, 106 insertions(+), 5 deletions(-) create mode 100755 parsedmarc/resources/maps/find_unknown_base_reverse_dns.py diff --git a/parsedmarc/resources/maps/README.md b/parsedmarc/resources/maps/README.md index f1fbc83..faaaceb 100644 --- a/parsedmarc/resources/maps/README.md +++ b/parsedmarc/resources/maps/README.md @@ -19,33 +19,65 @@ The `service_type` is based on the following rule precedence: 3. All telecommunications providers that offer internet access are identified as `ISP`, even if they also offer other services, such as web hosting or email hosting. 4. All web hosting providers are identified as `Web Hosting`, even if the service also offers email hosting. 5. All email account providers are identified as `Email Provider`, no matter how or where they are hosted -6. All legitimate platforms offering their Software as a Service SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics. +6. All legitimate platforms offering their Software as a Service (SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics. 7. 
All other senders that use their own domain as a Reverse DNS base domain should be identified based on their industry +- Agriculture +- Automotive +- Beauty +- Construction +- Consulting +- Defense +- Education - Email Provider - Email Security -- Education - Entertainment +- Event Planning - Finance - Food - Government - Government Media - Healthcare +- IaaS - Industrial - ISP - Logistics +- Manufacturing - Marketing - MSP +- MSSP +- News - Nonprofit +- PaaS +- Photography - Print +- Publishing - Real Estate - Retail - SaaS +- Science +- Search Engine - Social Media +- Sports +- Staffing - Technology - Travel - Web Host -The file currently contains over 600 mappings from a wide variety of email sending services, including large email -providers, SaaS platforms, small web hosts, and healthcare companies. Ideally this mapping will continuously grow to -include many other services and industries. +The file currently contains over 1,400 mappings from a wide variety of email sending sources. + +## known_unknown_base_reverse_dns.txt + +A list of reverse DNS base domains that could not be identified as belonging to a particular organization, service, or industry. + +## base_reverse_dns.csv + +A CSV with the fields `base_reverse_dns` and optionally `count`. This CSV can be generated by exporting the base DNS data from the Kibana or Splunk dashboards provided by parsedmarc. This file is not tracked by Git. + +## unknown_base_reverse_dns.csv + +A CSV file with the fields `base_reverse_dns` and `count`. This file is not tracked by Git. + +## find_unknown_base_reverse_dns.py + +This is a Python script that reads the domains in `base_reverse_dns.csv` and writes the domains that are not in `base_reverse_dns_map.csv` or `known_unknown_base_reverse_dns.txt` to `unknown_base_reverse_dns.csv`. This is useful for identifying potential additional domains to contribute to `base_reverse_dns_map.csv` and `known_unknown_base_reverse_dns.txt`. 
diff --git a/parsedmarc/resources/maps/find_unknown_base_reverse_dns.py b/parsedmarc/resources/maps/find_unknown_base_reverse_dns.py new file mode 100755 index 0000000..a5568ca --- /dev/null +++ b/parsedmarc/resources/maps/find_unknown_base_reverse_dns.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +import logging +import os +import csv + + +def _main(): + input_csv_file_path = "base_reverse_dns.csv" + base_reverse_dns_map_file_path = "base_reverse_dns_map.csv" + known_unknown_list_file_path = "known_unknown_base_reverse_dns.txt" + output_csv_file_path = "unknown_base_reverse_dns.csv" + + csv_headers = ["base_reverse_dns", "count"] + + output_rows = [] + + logging.basicConfig() + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + + for p in [ + input_csv_file_path, + base_reverse_dns_map_file_path, + known_unknown_list_file_path, + ]: + if not os.path.exists(p): + logger.error(f"{p} does not exist") + exit(1) + logger.info(f"Loading {known_unknown_list_file_path}") + known_unknown_domains = [] + with open(known_unknown_list_file_path) as f: + for line in f.readlines(): + domain = line.lower().strip() + if domain in known_unknown_domains: + logger.warning(f"{domain} is in {known_unknown_list_file_path} multiple times") + else: + known_unknown_domains.append(domain) + logger.info(f"Loading {base_reverse_dns_map_file_path}") + known_domains = [] + with open(base_reverse_dns_map_file_path) as f: + for row in csv.DictReader(f): + domain = row["base_reverse_dns"].lower().strip() + if domain in known_domains: + logger.warning( + f"{domain} is in {base_reverse_dns_map_file_path} multiple times" + ) + else: + known_domains.append(domain) + if domain in known_unknown_domains and known_domains: + pass + logger.warning(f"{domain} is in {known_unknown_list_file_path} and {base_reverse_dns_map_file_path}") + + logger.info(f"Checking domains against {base_reverse_dns_map_file_path}") + with open(input_csv_file_path) as f: + for row in csv.DictReader(f): + domain 
= row["base_reverse_dns"].lower().strip() + if domain not in known_domains or known_unknown_domains: + logger.info(f"New unknown domain found: {domain}") + output_rows.append(row) + logger.info(f"Writing {output_csv_file_path}") + with open(output_csv_file_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=csv_headers) + writer.writeheader() + writer.writerows(output_rows) + + +if __name__ == "__main__": + _main()