Compare commits

..

2 Commits

Author SHA1 Message Date
Sean Whalen
66506056ac Upgrade elasticsearch version to >=8.0.0 2025-03-22 14:15:05 -04:00
Sean Whalen
eb912ce68d Upgrade elasticsearch version from 7.4.0 to 8.17.1 2025-03-22 14:07:52 -04:00
15 changed files with 92 additions and 664 deletions

View File

@@ -11,26 +11,13 @@ on:
jobs:
build:
runs-on: ubuntu-latest
services:
elasticsearch:
image: elasticsearch:8.18.2
env:
discovery.type: single-node
cluster.name: parsedmarc-cluster
discovery.seed_hosts: elasticsearch
bootstrap.memory_lock: true
xpack.security.enabled: false
xpack.license.self_generated.type: basic
ports:
- 9200:9200
- 9300:9300
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
@@ -42,6 +29,13 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y libemail-outlook-message-perl
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg
sudo apt-get install apt-transport-https
echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list
sudo apt-get update && sudo apt-get install elasticsearch
sudo sed -i 's/xpack.security.enabled: true/xpack.security.enabled: false/' /etc/elasticsearch/elasticsearch.yml
sudo systemctl restart elasticsearch
sudo systemctl --no-pager status elasticsearch
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip

3
.gitignore vendored
View File

@@ -139,6 +139,3 @@ samples/private
parsedmarc.ini
scratch.py
parsedmarc/resources/maps/base_reverse_dns.csv
parsedmarc/resources/maps/unknown_base_reverse_dns.csv

View File

@@ -1,19 +1,6 @@
Changelog
=========
8.18.2
------
- Merged PR #603
- Fixes issue #595 - CI test fails for Elasticsearch
- Moved Elasticsearch to a separate Docker service container for CI testing
- Dropped Python 3.8 from CI testing
- Fixes lookup and saving of DMARC forensic reports in Elasticsearch and OpenSearch
- Updated fallback `base_reverse_dns_map.csv`, which now includes over 1,400 lines
- Updated included `dbip-country-lite.mmdb` to the June 2025 release
- Automatically fall back to the internal `base_reverse_dns_map.csv` if the received file is not valid (Fixes #602)
- Print the received data to the debug log
8.18.1
------

View File

@@ -18,7 +18,6 @@ if [ -d "./../parsedmarc-docs" ]; then
cp -rf build/html/* ../../parsedmarc-docs/
fi
cd ..
sort -o "parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt" "parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt"
./sortmaps.py
python3 tests.py
rm -rf dist/ build/

View File

@@ -39,7 +39,7 @@ from parsedmarc.utils import is_outlook_msg, convert_outlook_msg
from parsedmarc.utils import parse_email
from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime
__version__ = "8.18.2"
__version__ = "8.18.1"
logger.debug("parsedmarc v{0}".format(__version__))

View File

@@ -552,8 +552,8 @@ def save_forensic_report_to_elasticsearch(
for original_header in original_headers:
headers[original_header.lower()] = original_headers[original_header]
arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
arrival_date_human = forensic_report["arrival_date_utc"]
arrival_date = human_timestamp_to_datetime(arrival_date_human)
if index_suffix is not None:
search_index = "dmarc_forensic_{0}*".format(index_suffix)
@@ -562,35 +562,20 @@ def save_forensic_report_to_elasticsearch(
if index_prefix is not None:
search_index = "{0}{1}".format(index_prefix, search_index)
search = Search(index=search_index)
q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
arrival_query = {"match": {"arrival_date": arrival_date}}
q = Q(arrival_query)
from_ = None
to_ = None
subject = None
if "from" in headers:
# We convert the FROM header from a string list to a flat string.
headers["from"] = headers["from"][0]
if headers["from"][0] == "":
headers["from"] = headers["from"][1]
else:
headers["from"] = " <".join(headers["from"]) + ">"
from_ = dict()
from_["sample.headers.from"] = headers["from"]
from_query = Q(dict(match_phrase=from_))
q = q & from_query
from_ = headers["from"]
from_query = {"match_phrase": {"sample.headers.from": from_}}
q = q & Q(from_query)
if "to" in headers:
# We convert the TO header from a string list to a flat string.
headers["to"] = headers["to"][0]
if headers["to"][0] == "":
headers["to"] = headers["to"][1]
else:
headers["to"] = " <".join(headers["to"]) + ">"
to_ = dict()
to_["sample.headers.to"] = headers["to"]
to_query = Q(dict(match_phrase=to_))
q = q & to_query
to_ = headers["to"]
to_query = {"match_phrase": {"sample.headers.to": to_}}
q = q & Q(to_query)
if "subject" in headers:
subject = headers["subject"]
subject_query = {"match_phrase": {"sample.headers.subject": subject}}
@@ -604,9 +589,7 @@ def save_forensic_report_to_elasticsearch(
"A forensic sample to {0} from {1} "
"with a subject of {2} and arrival date of {3} "
"already exists in "
"Elasticsearch".format(
to_, from_, subject, forensic_report["arrival_date_utc"]
)
"Elasticsearch".format(to_, from_, subject, arrival_date_human)
)
parsed_sample = forensic_report["parsed_sample"]
@@ -642,7 +625,7 @@ def save_forensic_report_to_elasticsearch(
user_agent=forensic_report["user_agent"],
version=forensic_report["version"],
original_mail_from=forensic_report["original_mail_from"],
arrival_date=arrival_date_epoch_milliseconds,
arrival_date=arrival_date,
domain=forensic_report["reported_domain"],
original_envelope_id=forensic_report["original_envelope_id"],
authentication_results=forensic_report["authentication_results"],

View File

@@ -552,8 +552,8 @@ def save_forensic_report_to_opensearch(
for original_header in original_headers:
headers[original_header.lower()] = original_headers[original_header]
arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
arrival_date_human = forensic_report["arrival_date_utc"]
arrival_date = human_timestamp_to_datetime(arrival_date_human)
if index_suffix is not None:
search_index = "dmarc_forensic_{0}*".format(index_suffix)
@@ -562,35 +562,20 @@ def save_forensic_report_to_opensearch(
if index_prefix is not None:
search_index = "{0}{1}".format(index_prefix, search_index)
search = Search(index=search_index)
q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
arrival_query = {"match": {"arrival_date": arrival_date}}
q = Q(arrival_query)
from_ = None
to_ = None
subject = None
if "from" in headers:
# We convert the FROM header from a string list to a flat string.
headers["from"] = headers["from"][0]
if headers["from"][0] == "":
headers["from"] = headers["from"][1]
else:
headers["from"] = " <".join(headers["from"]) + ">"
from_ = dict()
from_["sample.headers.from"] = headers["from"]
from_query = Q(dict(match_phrase=from_))
q = q & from_query
from_ = headers["from"]
from_query = {"match_phrase": {"sample.headers.from": from_}}
q = q & Q(from_query)
if "to" in headers:
# We convert the TO header from a string list to a flat string.
headers["to"] = headers["to"][0]
if headers["to"][0] == "":
headers["to"] = headers["to"][1]
else:
headers["to"] = " <".join(headers["to"]) + ">"
to_ = dict()
to_["sample.headers.to"] = headers["to"]
to_query = Q(dict(match_phrase=to_))
q = q & to_query
to_ = headers["to"]
to_query = {"match_phrase": {"sample.headers.to": to_}}
q = q & Q(to_query)
if "subject" in headers:
subject = headers["subject"]
subject_query = {"match_phrase": {"sample.headers.subject": subject}}
@@ -604,9 +589,7 @@ def save_forensic_report_to_opensearch(
"A forensic sample to {0} from {1} "
"with a subject of {2} and arrival date of {3} "
"already exists in "
"OpenSearch".format(
to_, from_, subject, forensic_report["arrival_date_utc"]
)
"OpenSearch".format(to_, from_, subject, arrival_date_human)
)
parsed_sample = forensic_report["parsed_sample"]
@@ -642,7 +625,7 @@ def save_forensic_report_to_opensearch(
user_agent=forensic_report["user_agent"],
version=forensic_report["version"],
original_mail_from=forensic_report["original_mail_from"],
arrival_date=arrival_date_epoch_milliseconds,
arrival_date=arrival_date,
domain=forensic_report["reported_domain"],
original_envelope_id=forensic_report["original_envelope_id"],
authentication_results=forensic_report["authentication_results"],

View File

@@ -1,7 +1,7 @@
# About
`dbip-country-lite.mmdb` is provided by [dbip][dbip] under a
[Creative Commons Attribution 4.0 International License][cc].
[ Creative Commons Attribution 4.0 International License][cc].
[dbip]: https://db-ip.com/db/download/ip-to-country-lite
[dbip]: https://db-ip.com/db/lite.php
[cc]: http://creativecommons.org/licenses/by/4.0/

View File

@@ -19,65 +19,33 @@ The `service_type` is based on the following rule precedence:
3. All telecommunications providers that offer internet access are identified as `ISP`, even if they also offer other services, such as web hosting or email hosting.
4. All web hosting providers are identified as `Web Hosting`, even if the service also offers email hosting.
5. All email account providers are identified as `Email Provider`, no matter how or where they are hosted
6. All legitimate platforms offering their Software as a Service (SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
6. All legitimate platforms offering their Software as a Service SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
7. All other senders that use their own domain as a Reverse DNS base domain should be identified based on their industry
- Agriculture
- Automotive
- Beauty
- Construction
- Consulting
- Defense
- Education
- Email Provider
- Email Security
- Education
- Entertainment
- Event Planning
- Finance
- Food
- Government
- Government Media
- Healthcare
- IaaS
- Industrial
- ISP
- Logistics
- Manufacturing
- Marketing
- MSP
- MSSP
- News
- Nonprofit
- PaaS
- Photography
- Print
- Publishing
- Real Estate
- Retail
- SaaS
- Science
- Search Engine
- Social Media
- Sports
- Staffing
- Technology
- Travel
- Web Host
The file currently contains over 1,400 mappings from a wide variety of email sending sources.
## known_unknown_base_reverse_dns.txt
A list of reverse DNS base domains that could not be identified as belonging to a particular organization, service, or industry.
## base_reverse_dns.csv
A CSV with the fields `source_name` and optionally `message_countcount`. This CSV can be generated byy exporting the base DNS data from the Kibana on Splunk dashboards provided by parsedmarc. This file is not tracked by Git.
## unknown_base_reverse_dns.csv
A CSV file with the fields `source_name` and `message_count`. This file is not tracked by Git.
## find_unknown_base_reverse_dns.py
This is a python script that reads the domains in `base_reverse_dns.csv` and writes the domains that are not in `base_reverse_dns_map.csv` or `known_unknown_base_reverse_dns.txt` to `unknown_base_reverse_dns.csv`. This is useful for identifying potential additional domains to contribute to `base_reverse_dns_map.csv` and `known_unknown_base_reverse_dns.txt`.
The file currently contains over 600 mappings from a wide variety of email sending services, including large email
providers, SaaS platforms, small web hosts, and healthcare companies. Ideally this mapping will continuously grow to
include many other services and industries.

File diff suppressed because it is too large Load Diff

View File

@@ -1,73 +0,0 @@
#!/usr/bin/env python
import logging
import os
import csv
def _main():
input_csv_file_path = "base_reverse_dns.csv"
base_reverse_dns_map_file_path = "base_reverse_dns_map.csv"
known_unknown_list_file_path = "known_unknown_base_reverse_dns.txt"
output_csv_file_path = "unknown_base_reverse_dns.csv"
csv_headers = ["source_name", "message_count"]
output_rows = []
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
for p in [
input_csv_file_path,
base_reverse_dns_map_file_path,
known_unknown_list_file_path,
]:
if not os.path.exists(p):
logger.error(f"{p} does not exist")
exit(1)
logger.info(f"Loading {known_unknown_list_file_path}")
known_unknown_domains = []
with open(known_unknown_list_file_path) as f:
for line in f.readlines():
domain = line.lower().strip()
if domain in known_unknown_domains:
logger.warning(
f"{domain} is in {known_unknown_list_file_path} multiple times"
)
else:
known_unknown_domains.append(domain)
logger.info(f"Loading {base_reverse_dns_map_file_path}")
known_domains = []
with open(base_reverse_dns_map_file_path) as f:
for row in csv.DictReader(f):
domain = row["base_reverse_dns"].lower().strip()
if domain in known_domains:
logger.warning(
f"{domain} is in {base_reverse_dns_map_file_path} multiple times"
)
else:
known_domains.append(domain)
if domain in known_unknown_domains and known_domains:
pass
logger.warning(
f"{domain} is in {known_unknown_list_file_path} and {base_reverse_dns_map_file_path}"
)
logger.info(f"Checking domains against {base_reverse_dns_map_file_path}")
with open(input_csv_file_path) as f:
for row in csv.DictReader(f):
domain = row["source_name"].lower().strip()
if domain not in known_domains and domain not in known_unknown_domains:
logger.info(f"New unknown domain found: {domain}")
output_rows.append(row)
logger.info(f"Writing {output_csv_file_path}")
with open(output_csv_file_path, "w") as f:
writer = csv.DictWriter(f, fieldnames=csv_headers)
writer.writeheader()
writer.writerows(output_rows)
if __name__ == "__main__":
_main()

View File

@@ -1,125 +0,0 @@
200.in-addr.arpa
adlucrumnewsletter.com
admin.corpivensa.gob.ve
aerospacevitro.us.com
albagroup-eg.com
anteldata.net.uy
antonaoll.com
aosau.net
arandomserver.com
asmecam.it
b8sales.com
bestinvestingtime.com
biocorp.com
bisno1.co.jp
bluhosting.com
bodiax.pp.ua
bost-law.com
brnonet.cz
brushinglegal.de
christus.mx
cloud-edm.com
cloudlogin.co
cnode.io
commerceinsurance.com
coolblaze.com
cps.com.ar
detrot.xyz
digi.net.my
dkginternet.com
doorsrv.com
dreamtechmedia.com
ds.network
emailperegrine.com
epsilon-group.com
eyecandyhosting.xyz
fetscorp.shop
formicidaehunt.net
fosterheap.com
gendns.com
ginous.eu.com
gist-th.com
gophermedia.com
gqlists.us.com
gratzl.de
hgnbroken.us.com
hosting1337.com
hostingmichigan.com
hostname.localhost
hostnetwork.com
hostwhitelabel.com
idcfcloud.net
immenzaces.com
ivol.co
jalanet.co.id
kahlaa.com
kbronet.com.tw
kdnursing.org
kitchenaildbd.com
legenditds.com
lighthouse-media.com
lohkal.com
lonestarmm.net
magnetmail.net
manhattanbulletpoint.com
masterclassjournal.com
moderntradingnews.com
moonjaws.com
motion4ever.net
mschosting.com
mspnet.pro
mts-nn.ru
mxthunder.net
myrewards.net
mysagestore.com
ncport.ru
nebdig.com
neovet-base.ru
nic.name
nidix.net
ogicom.net
omegabrasil.inf.br
onnet21.com
ovaltinalization.co
overta.ru
passionatesmiles.com
planethoster.net
pmnhost.net
popiup.com
prima.com.ar
prima.net.ar
proudserver.com
qontenciplc.autos
raxa.host
sahacker-2020.com
samsales.site
satirogluet.com
securednshost.com
securen.net
securerelay.in
securev.net
servershost.biz
smallvillages.com
solusoftware.com
spiritualtechnologies.io
sprout.org
stableserver.net
stockexchangejournal.com
suksangroup.com
system.eu.com
t-jon.com
tenkids.net
thaicloudsolutions.com
thaimonster.com
tullostrucking.com
unite.services
urawasl.com
us.servername.us
vendimetry.com
vibrantwellnesscorp.com
wallstreetsgossip.com
weblinkinternational.com
xsfati.us.com
xspmail.jp
zerowebhosting.net
znlc.jp

View File

@@ -19,11 +19,10 @@ import csv
import io
try:
from importlib.resources import files
import importlib.resources as pkg_resources
except ImportError:
# Try backported to PY<3 `importlib_resources`
from importlib.resources import files
# Try backported to PY<37 `importlib_resources`
import importlib_resources as pkg_resources
from dateutil.parser import parse as parse_date
import dns.reversename
@@ -281,13 +280,14 @@ def get_ip_address_country(ip_address, db_path=None):
break
if db_path is None:
db_path = str(
files(parsedmarc.resources.dbip).joinpath("dbip-country-lite.mmdb")
)
with pkg_resources.path(
parsedmarc.resources.dbip, "dbip-country-lite.mmdb"
) as path:
db_path = path
db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
if db_age > timedelta(days=30):
logger.warning("IP database is more than a month old")
db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
if db_age > timedelta(days=30):
logger.warning("IP database is more than a month old")
db_reader = geoip2.database.Reader(db_path)
@@ -352,20 +352,15 @@ def get_service_from_reverse_dns_base_domain(
load_csv(csv_file)
except requests.exceptions.RequestException as e:
logger.warning(f"Failed to fetch reverse DNS map: {e}")
except Exception:
logger.warning("Not a valid CSV file")
csv_file.seek(0)
logger.debug(csv_file.read())
if len(reverse_dns_map) == 0:
logger.info("Loading included reverse DNS map...")
path = str(
files(parsedmarc.resources.maps).joinpath("base_reverse_dns_map.csv")
)
if local_file_path is not None:
path = local_file_path
with open(path) as csv_file:
load_csv(csv_file)
with pkg_resources.path(
parsedmarc.resources.maps, "base_reverse_dns_map.csv"
) as path:
if local_file_path is not None:
path = local_file_path
with open(path) as csv_file:
load_csv(csv_file)
try:
service = reverse_dns_map[base_domain]
except KeyError:

View File

@@ -34,8 +34,8 @@ dependencies = [
"boto3>=1.16.63",
"dateparser>=1.1.1",
"dnspython>=2.0.0",
"elasticsearch-dsl==7.4.0",
"elasticsearch<7.14.0",
"elasticsearch-dsl==8.17.1",
"elasticsearch<=8.0.0",
"expiringdict>=1.1.4",
"geoip2>=3.0.0",
"google-api-core>=2.4.0",