Upgrade elasticsearch version to >=8.0.0

Upgrade elasticsearch version from 7.4.0 to 8.17.1
2026-04-13 09:08:55 +00:00 · 2025-03-22 14:15:05 -04:00 · 2025-03-22 14:07:52 -04:00
15 changed files with 92 additions and 664 deletions
--- a/.github/workflows/python-tests.yml
+++ b/.github/workflows/python-tests.yml
@@ -11,26 +11,13 @@ on:

 jobs:
  build:
-    runs-on: ubuntu-latest

-    services:
-      elasticsearch:
-        image: elasticsearch:8.18.2
-        env:
-          discovery.type: single-node
-          cluster.name: parsedmarc-cluster
-          discovery.seed_hosts: elasticsearch
-          bootstrap.memory_lock: true
-          xpack.security.enabled: false
-          xpack.license.self_generated.type: basic
-        ports:
-          - 9200:9200
-          - 9300:9300
+    runs-on: ubuntu-latest

    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]

    steps:
    - uses: actions/checkout@v4
@@ -42,6 +29,13 @@ jobs:
      run: |
        sudo apt-get update
        sudo apt-get install -y libemail-outlook-message-perl
+        wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg
+        sudo apt-get install apt-transport-https
+        echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list
+        sudo apt-get update && sudo apt-get install elasticsearch
+        sudo sed -i 's/xpack.security.enabled: true/xpack.security.enabled: false/' /etc/elasticsearch/elasticsearch.yml
+        sudo systemctl restart elasticsearch
+        sudo systemctl --no-pager status elasticsearch
    - name: Install Python dependencies
      run: |
        python -m pip install --upgrade pip
--- a/.gitignore
+++ b/.gitignore
@@ -139,6 +139,3 @@ samples/private

 parsedmarc.ini
 scratch.py
-
-parsedmarc/resources/maps/base_reverse_dns.csv
-parsedmarc/resources/maps/unknown_base_reverse_dns.csv
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,19 +1,6 @@
 Changelog
 =========

-8.18.2
------
-
- Merged PR #603
-  - Fixes issue #595 - CI test fails for Elasticsearch
-    - Moved Elasticsearch to a separate Docker service container for CI testing
-    - Dropped Python 3.8 from CI testing
-  - Fixes lookup and saving of DMARC forensic reports in Elasticsearch and OpenSearch
- Updated fallback `base_reverse_dns_map.csv`, which now includes over 1,400 lines
- Updated included `dbip-country-lite.mmdb` to the June 2025 release
- Automatically fall back to the internal `base_reverse_dns_map.csv` if the received file is not valid (Fixes #602)
-  - Print the received data to the debug log
-
 8.18.1
 ------

--- a/build.sh
+++ b/build.sh
@@ -18,7 +18,6 @@ if [  -d "./../parsedmarc-docs" ]; then
  cp -rf build/html/* ../../parsedmarc-docs/
 fi
 cd ..
-sort -o "parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt" "parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt"
 ./sortmaps.py
 python3 tests.py
 rm -rf dist/ build/
--- a/parsedmarc/__init__.py
+++ b/parsedmarc/__init__.py
@@ -39,7 +39,7 @@ from parsedmarc.utils import is_outlook_msg, convert_outlook_msg
 from parsedmarc.utils import parse_email
 from parsedmarc.utils import timestamp_to_human, human_timestamp_to_datetime

-__version__ = "8.18.2"
+__version__ = "8.18.1"

 logger.debug("parsedmarc v{0}".format(__version__))

--- a/parsedmarc/elastic.py
+++ b/parsedmarc/elastic.py
@@ -552,8 +552,8 @@ def save_forensic_report_to_elasticsearch(
    for original_header in original_headers:
        headers[original_header.lower()] = original_headers[original_header]

-    arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
-    arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
+    arrival_date_human = forensic_report["arrival_date_utc"]
+    arrival_date = human_timestamp_to_datetime(arrival_date_human)

    if index_suffix is not None:
        search_index = "dmarc_forensic_{0}*".format(index_suffix)
@@ -562,35 +562,20 @@ def save_forensic_report_to_elasticsearch(
    if index_prefix is not None:
        search_index = "{0}{1}".format(index_prefix, search_index)
    search = Search(index=search_index)
-    q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
+    arrival_query = {"match": {"arrival_date": arrival_date}}
+    q = Q(arrival_query)

    from_ = None
    to_ = None
    subject = None
    if "from" in headers:
-        # We convert the FROM header from a string list to a flat string.
-        headers["from"] = headers["from"][0]
-        if headers["from"][0] == "":
-            headers["from"] = headers["from"][1]
-        else:
-            headers["from"] = " <".join(headers["from"]) + ">"
-
-        from_ = dict()
-        from_["sample.headers.from"] = headers["from"]
-        from_query = Q(dict(match_phrase=from_))
-        q = q & from_query
+        from_ = headers["from"]
+        from_query = {"match_phrase": {"sample.headers.from": from_}}
+        q = q & Q(from_query)
    if "to" in headers:
-        # We convert the TO header from a string list to a flat string.
-        headers["to"] = headers["to"][0]
-        if headers["to"][0] == "":
-            headers["to"] = headers["to"][1]
-        else:
-            headers["to"] = " <".join(headers["to"]) + ">"
-
-        to_ = dict()
-        to_["sample.headers.to"] = headers["to"]
-        to_query = Q(dict(match_phrase=to_))
-        q = q & to_query
+        to_ = headers["to"]
+        to_query = {"match_phrase": {"sample.headers.to": to_}}
+        q = q & Q(to_query)
    if "subject" in headers:
        subject = headers["subject"]
        subject_query = {"match_phrase": {"sample.headers.subject": subject}}
@@ -604,9 +589,7 @@ def save_forensic_report_to_elasticsearch(
            "A forensic sample to {0} from {1} "
            "with a subject of {2} and arrival date of {3} "
            "already exists in "
-            "Elasticsearch".format(
-                to_, from_, subject, forensic_report["arrival_date_utc"]
-            )
+            "Elasticsearch".format(to_, from_, subject, arrival_date_human)
        )

    parsed_sample = forensic_report["parsed_sample"]
@@ -642,7 +625,7 @@ def save_forensic_report_to_elasticsearch(
            user_agent=forensic_report["user_agent"],
            version=forensic_report["version"],
            original_mail_from=forensic_report["original_mail_from"],
-            arrival_date=arrival_date_epoch_milliseconds,
+            arrival_date=arrival_date,
            domain=forensic_report["reported_domain"],
            original_envelope_id=forensic_report["original_envelope_id"],
            authentication_results=forensic_report["authentication_results"],
--- a/parsedmarc/opensearch.py
+++ b/parsedmarc/opensearch.py
@@ -552,8 +552,8 @@ def save_forensic_report_to_opensearch(
    for original_header in original_headers:
        headers[original_header.lower()] = original_headers[original_header]

-    arrival_date = human_timestamp_to_datetime(forensic_report["arrival_date_utc"])
-    arrival_date_epoch_milliseconds = int(arrival_date.timestamp() * 1000)
+    arrival_date_human = forensic_report["arrival_date_utc"]
+    arrival_date = human_timestamp_to_datetime(arrival_date_human)

    if index_suffix is not None:
        search_index = "dmarc_forensic_{0}*".format(index_suffix)
@@ -562,35 +562,20 @@ def save_forensic_report_to_opensearch(
    if index_prefix is not None:
        search_index = "{0}{1}".format(index_prefix, search_index)
    search = Search(index=search_index)
-    q = Q(dict(match=dict(arrival_date=arrival_date_epoch_milliseconds)))
+    arrival_query = {"match": {"arrival_date": arrival_date}}
+    q = Q(arrival_query)

    from_ = None
    to_ = None
    subject = None
    if "from" in headers:
-        # We convert the FROM header from a string list to a flat string.
-        headers["from"] = headers["from"][0]
-        if headers["from"][0] == "":
-            headers["from"] = headers["from"][1]
-        else:
-            headers["from"] = " <".join(headers["from"]) + ">"
-
-        from_ = dict()
-        from_["sample.headers.from"] = headers["from"]
-        from_query = Q(dict(match_phrase=from_))
-        q = q & from_query
+        from_ = headers["from"]
+        from_query = {"match_phrase": {"sample.headers.from": from_}}
+        q = q & Q(from_query)
    if "to" in headers:
-        # We convert the TO header from a string list to a flat string.
-        headers["to"] = headers["to"][0]
-        if headers["to"][0] == "":
-            headers["to"] = headers["to"][1]
-        else:
-            headers["to"] = " <".join(headers["to"]) + ">"
-
-        to_ = dict()
-        to_["sample.headers.to"] = headers["to"]
-        to_query = Q(dict(match_phrase=to_))
-        q = q & to_query
+        to_ = headers["to"]
+        to_query = {"match_phrase": {"sample.headers.to": to_}}
+        q = q & Q(to_query)
    if "subject" in headers:
        subject = headers["subject"]
        subject_query = {"match_phrase": {"sample.headers.subject": subject}}
@@ -604,9 +589,7 @@ def save_forensic_report_to_opensearch(
            "A forensic sample to {0} from {1} "
            "with a subject of {2} and arrival date of {3} "
            "already exists in "
-            "OpenSearch".format(
-                to_, from_, subject, forensic_report["arrival_date_utc"]
-            )
+            "OpenSearch".format(to_, from_, subject, arrival_date_human)
        )

    parsed_sample = forensic_report["parsed_sample"]
@@ -642,7 +625,7 @@ def save_forensic_report_to_opensearch(
            user_agent=forensic_report["user_agent"],
            version=forensic_report["version"],
            original_mail_from=forensic_report["original_mail_from"],
-            arrival_date=arrival_date_epoch_milliseconds,
+            arrival_date=arrival_date,
            domain=forensic_report["reported_domain"],
            original_envelope_id=forensic_report["original_envelope_id"],
            authentication_results=forensic_report["authentication_results"],
--- a/parsedmarc/resources/dbip/README.md
+++ b/parsedmarc/resources/dbip/README.md
@@ -1,7 +1,7 @@
 # About

 `dbip-country-lite.mmdb` is provided by [dbip][dbip] under a
-[Creative Commons Attribution 4.0 International License][cc].
+[Creative Commons Attribution 4.0 International License][cc].

-[dbip]: https://db-ip.com/db/download/ip-to-country-lite
+[dbip]: https://db-ip.com/db/lite.php
 [cc]: http://creativecommons.org/licenses/by/4.0/
--- a/parsedmarc/resources/dbip/dbip-country-lite.mmdb
+++ b/parsedmarc/resources/dbip/dbip-country-lite.mmdb
--- a/parsedmarc/resources/maps/README.md
+++ b/parsedmarc/resources/maps/README.md
@@ -19,65 +19,33 @@ The `service_type` is based on the following rule precedence:
 3. All telecommunications providers that offer internet access are identified as `ISP`, even if they also offer other services, such as web hosting or email hosting.
 4. All web hosting providers are identified as `Web Hosting`, even if the service also offers email hosting.
 5. All email account providers are identified as `Email Provider`, no matter how or where they are hosted
-6. All legitimate platforms offering their Software as a Service (SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
+6. All legitimate platforms offering their Software as a Service (SaaS) are identified as `SaaS`, regardless of industry. This helps simplify metrics.
 7. All other senders that use their own domain as a Reverse DNS base domain should be identified based on their industry

- Agriculture
- Automotive
- Beauty
- Construction
- Consulting
- Defense
- Education
 - Email Provider
 - Email Security
+- Education
 - Entertainment
- Event Planning
 - Finance
 - Food
 - Government
 - Government Media
 - Healthcare
- IaaS
 - Industrial
 - ISP
 - Logistics
- Manufacturing
 - Marketing
 - MSP
- MSSP
- News
 - Nonprofit
- PaaS
- Photography
 - Print
- Publishing
 - Real Estate
 - Retail
 - SaaS
- Science
- Search Engine
 - Social Media
- Sports
- Staffing
 - Technology
 - Travel
 - Web Host

-The file currently contains over 1,400 mappings from a wide variety of email sending sources.
-
-## known_unknown_base_reverse_dns.txt
-
-A list of reverse DNS base domains that could not be identified as belonging to a particular organization, service, or industry.
-
-## base_reverse_dns.csv
-
-A CSV with the fields `source_name` and optionally `message_countcount`. This CSV can be generated byy exporting the base DNS data from the Kibana on Splunk dashboards provided by parsedmarc. This file is not tracked by Git.
-
-## unknown_base_reverse_dns.csv
-
-A CSV file with the fields `source_name` and `message_count`. This file is not tracked by Git.
-
-## find_unknown_base_reverse_dns.py
-
-This is a python script that reads the domains in `base_reverse_dns.csv` and writes the domains that are not in `base_reverse_dns_map.csv` or `known_unknown_base_reverse_dns.txt` to `unknown_base_reverse_dns.csv`. This is useful for identifying potential additional domains to contribute to `base_reverse_dns_map.csv` and `known_unknown_base_reverse_dns.txt`.
+The file currently contains over 600 mappings from a wide variety of email sending services, including large email
+providers, SaaS platforms, small web hosts, and healthcare companies. Ideally this mapping will continuously grow to
+include many other services and industries.
--- a/parsedmarc/resources/maps/base_reverse_dns_map.csv
+++ b/parsedmarc/resources/maps/base_reverse_dns_map.csv
--- a/parsedmarc/resources/maps/find_unknown_base_reverse_dns.py
+++ b/parsedmarc/resources/maps/find_unknown_base_reverse_dns.py
@@ -1,73 +0,0 @@
-#!/usr/bin/env python
-
-import logging
-import os
-import csv
-
-
-def _main():
-    input_csv_file_path = "base_reverse_dns.csv"
-    base_reverse_dns_map_file_path = "base_reverse_dns_map.csv"
-    known_unknown_list_file_path = "known_unknown_base_reverse_dns.txt"
-    output_csv_file_path = "unknown_base_reverse_dns.csv"
-
-    csv_headers = ["source_name", "message_count"]
-
-    output_rows = []
-
-    logging.basicConfig()
-    logger = logging.getLogger(__name__)
-    logger.setLevel(logging.INFO)
-
-    for p in [
-        input_csv_file_path,
-        base_reverse_dns_map_file_path,
-        known_unknown_list_file_path,
-    ]:
-        if not os.path.exists(p):
-            logger.error(f"{p} does not exist")
-            exit(1)
-    logger.info(f"Loading {known_unknown_list_file_path}")
-    known_unknown_domains = []
-    with open(known_unknown_list_file_path) as f:
-        for line in f.readlines():
-            domain = line.lower().strip()
-            if domain in known_unknown_domains:
-                logger.warning(
-                    f"{domain} is in {known_unknown_list_file_path} multiple times"
-                )
-            else:
-                known_unknown_domains.append(domain)
-    logger.info(f"Loading {base_reverse_dns_map_file_path}")
-    known_domains = []
-    with open(base_reverse_dns_map_file_path) as f:
-        for row in csv.DictReader(f):
-            domain = row["base_reverse_dns"].lower().strip()
-            if domain in known_domains:
-                logger.warning(
-                    f"{domain} is in {base_reverse_dns_map_file_path} multiple times"
-                )
-            else:
-                known_domains.append(domain)
-            if domain in known_unknown_domains and known_domains:
-                pass
-                logger.warning(
-                    f"{domain} is in {known_unknown_list_file_path} and {base_reverse_dns_map_file_path}"
-                )
-
-    logger.info(f"Checking domains against {base_reverse_dns_map_file_path}")
-    with open(input_csv_file_path) as f:
-        for row in csv.DictReader(f):
-            domain = row["source_name"].lower().strip()
-            if domain not in known_domains and domain not in known_unknown_domains:
-                logger.info(f"New unknown domain found: {domain}")
-                output_rows.append(row)
-    logger.info(f"Writing {output_csv_file_path}")
-    with open(output_csv_file_path, "w") as f:
-        writer = csv.DictWriter(f, fieldnames=csv_headers)
-        writer.writeheader()
-        writer.writerows(output_rows)
-
-
-if __name__ == "__main__":
-    _main()
--- a/parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt
+++ b/parsedmarc/resources/maps/known_unknown_base_reverse_dns.txt
@@ -1,125 +0,0 @@
-200.in-addr.arpa
-adlucrumnewsletter.com
-admin.corpivensa.gob.ve
-aerospacevitro.us.com
-albagroup-eg.com
-anteldata.net.uy
-antonaoll.com
-aosau.net
-arandomserver.com
-asmecam.it
-b8sales.com
-bestinvestingtime.com
-biocorp.com
-bisno1.co.jp
-bluhosting.com
-bodiax.pp.ua
-bost-law.com
-brnonet.cz
-brushinglegal.de
-christus.mx
-cloud-edm.com
-cloudlogin.co
-cnode.io
-commerceinsurance.com
-coolblaze.com
-cps.com.ar
-detrot.xyz
-digi.net.my
-dkginternet.com
-doorsrv.com
-dreamtechmedia.com
-ds.network
-emailperegrine.com
-epsilon-group.com
-eyecandyhosting.xyz
-fetscorp.shop
-formicidaehunt.net
-fosterheap.com
-gendns.com
-ginous.eu.com
-gist-th.com
-gophermedia.com
-gqlists.us.com
-gratzl.de
-hgnbroken.us.com
-hosting1337.com
-hostingmichigan.com
-hostname.localhost
-hostnetwork.com
-hostwhitelabel.com
-idcfcloud.net
-immenzaces.com
-ivol.co
-jalanet.co.id
-kahlaa.com
-kbronet.com.tw
-kdnursing.org
-kitchenaildbd.com
-legenditds.com
-lighthouse-media.com
-lohkal.com
-lonestarmm.net
-magnetmail.net
-manhattanbulletpoint.com
-masterclassjournal.com
-moderntradingnews.com
-moonjaws.com
-motion4ever.net
-mschosting.com
-mspnet.pro
-mts-nn.ru
-mxthunder.net
-myrewards.net
-mysagestore.com
-ncport.ru
-nebdig.com
-neovet-base.ru
-nic.name
-nidix.net
-ogicom.net
-omegabrasil.inf.br
-onnet21.com
-ovaltinalization.co
-overta.ru
-passionatesmiles.com
-planethoster.net
-pmnhost.net
-popiup.com
-prima.com.ar
-prima.net.ar
-proudserver.com
-qontenciplc.autos
-raxa.host
-sahacker-2020.com
-samsales.site
-satirogluet.com
-securednshost.com
-securen.net
-securerelay.in
-securev.net
-servershost.biz
-smallvillages.com
-solusoftware.com
-spiritualtechnologies.io
-sprout.org
-stableserver.net
-stockexchangejournal.com
-suksangroup.com
-system.eu.com
-t-jon.com
-tenkids.net
-thaicloudsolutions.com
-thaimonster.com
-tullostrucking.com
-unite.services
-urawasl.com
-us.servername.us
-vendimetry.com
-vibrantwellnesscorp.com
-wallstreetsgossip.com
-weblinkinternational.com
-xsfati.us.com
-xspmail.jp
-zerowebhosting.net
-znlc.jp
--- a/parsedmarc/utils.py
+++ b/parsedmarc/utils.py
@@ -19,11 +19,10 @@ import csv
 import io

 try:
-    from importlib.resources import files
+    import importlib.resources as pkg_resources
 except ImportError:
-    # Try backported to PY<3 `importlib_resources`
-    from importlib.resources import files
-
+    # Fall back to the `importlib_resources` backport on Python < 3.7
+    import importlib_resources as pkg_resources

 from dateutil.parser import parse as parse_date
 import dns.reversename
@@ -281,13 +280,14 @@ def get_ip_address_country(ip_address, db_path=None):
                break

    if db_path is None:
-        db_path = str(
-            files(parsedmarc.resources.dbip).joinpath("dbip-country-lite.mmdb")
-        )
+        with pkg_resources.path(
+            parsedmarc.resources.dbip, "dbip-country-lite.mmdb"
+        ) as path:
+            db_path = path

-    db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
-    if db_age > timedelta(days=30):
-        logger.warning("IP database is more than a month old")
+        db_age = datetime.now() - datetime.fromtimestamp(os.stat(db_path).st_mtime)
+        if db_age > timedelta(days=30):
+            logger.warning("IP database is more than a month old")

    db_reader = geoip2.database.Reader(db_path)

@@ -352,20 +352,15 @@ def get_service_from_reverse_dns_base_domain(
            load_csv(csv_file)
        except requests.exceptions.RequestException as e:
            logger.warning(f"Failed to fetch reverse DNS map: {e}")
-        except Exception:
-            logger.warning("Not a valid CSV file")
-            csv_file.seek(0)
-            logger.debug(csv_file.read())
-
    if len(reverse_dns_map) == 0:
        logger.info("Loading included reverse DNS map...")
-        path = str(
-            files(parsedmarc.resources.maps).joinpath("base_reverse_dns_map.csv")
-        )
-        if local_file_path is not None:
-            path = local_file_path
-        with open(path) as csv_file:
-            load_csv(csv_file)
+        with pkg_resources.path(
+            parsedmarc.resources.maps, "base_reverse_dns_map.csv"
+        ) as path:
+            if local_file_path is not None:
+                path = local_file_path
+            with open(path) as csv_file:
+                load_csv(csv_file)
    try:
        service = reverse_dns_map[base_domain]
    except KeyError:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,8 +34,8 @@ dependencies = [
    "boto3>=1.16.63",
    "dateparser>=1.1.1",
    "dnspython>=2.0.0",
-    "elasticsearch-dsl==7.4.0",
-    "elasticsearch<7.14.0",
+    "elasticsearch-dsl==8.17.1",
+    "elasticsearch<=8.0.0",
    "expiringdict>=1.1.4",
    "geoip2>=3.0.0",
    "google-api-core>=2.4.0",
Author	SHA1	Message	Date
Sean Whalen	66506056ac	Upgrade elasticsearch version to >=8.0.0	2025-03-22 14:15:05 -04:00
Sean Whalen	eb912ce68d	Upgrade elasticsearch version from 7.4.0 to 8.17.1	2025-03-22 14:07:52 -04:00