# parsedmarc/tests/test_maps.py

"""Tests for the map-maintenance scripts under parsedmarc/resources/maps/.

These scripts are maintainer-only batch tooling — they do not ship in the
wheel — but they still need regression coverage because they enforce the
privacy and integrity rules for the reverse-DNS map data files."""

import unittest


class TestMapScriptsIPDetection(unittest.TestCase):
    """Full-IP detection and PSL folding in the map-maintenance scripts.

    Each test imports the script it exercises inside the test body, so a
    script that fails to import only fails the tests that depend on it.
    Multi-case tests run every case under ``subTest`` so that one failing
    hostname is reported with its input and does not hide the other cases.
    """

    def test_collect_domain_info_detects_full_ips(self):
        """``collect_domain_info._has_full_ip`` flags only four valid octets."""
        import parsedmarc.resources.maps.collect_domain_info as cdi

        cases = [
            # Dotted and dashed four-octet patterns with valid octets: detected.
            ("74-208-244-234.cprapid.com", True),
            ("host.192.168.1.1.example.com", True),
            ("a-10-20-30-40-brand.com", True),
            # Three octets is NOT a full IP — OVH's reverse-DNS pattern stays safe.
            ("ip-147-135-108.us", False),
            # Out-of-range octet fails the 0-255 sanity check.
            ("999-1-2-3-foo.com", False),
            # Pure domain, no IP.
            ("example.com", False),
        ]
        for hostname, expected in cases:
            with self.subTest(hostname=hostname):
                # bool() keeps the truthiness semantics of assertTrue/assertFalse.
                self.assertIs(bool(cdi._has_full_ip(hostname)), expected)

    def test_find_unknown_detects_full_ips(self):
        """``find_unknown_base_reverse_dns._has_full_ip`` flags full IPs only."""
        import parsedmarc.resources.maps.find_unknown_base_reverse_dns as fu

        cases = [
            ("170-254-144-204-nobreinternet.com.br", True),
            ("ip-147-135-108.us", False),
            ("cprapid.com", False),
        ]
        for hostname, expected in cases:
            with self.subTest(hostname=hostname):
                self.assertIs(bool(fu._has_full_ip(hostname)), expected)

    def test_apply_psl_override_dot_prefix(self):
        """A dot-prefixed override folds any subdomain depth onto the brand."""
        import parsedmarc.resources.maps.collect_domain_info as cdi

        ov = [".cprapid.com", ".linode.com"]
        cases = [
            ("foo.cprapid.com", "cprapid.com"),
            ("a.b.linode.com", "linode.com"),
        ]
        for hostname, expected in cases:
            with self.subTest(hostname=hostname):
                self.assertEqual(cdi._apply_psl_override(hostname, ov), expected)

    def test_apply_psl_override_dash_prefix(self):
        """A dash-prefixed override folds a dashed-IP hostname onto the brand."""
        import parsedmarc.resources.maps.collect_domain_info as cdi

        ov = ["-nobre.com.br"]
        self.assertEqual(
            cdi._apply_psl_override("1-2-3-4-nobre.com.br", ov), "nobre.com.br"
        )

    def test_apply_psl_override_no_match(self):
        """A hostname matching no override is returned unchanged."""
        import parsedmarc.resources.maps.collect_domain_info as cdi

        ov = [".cprapid.com"]
        self.assertEqual(cdi._apply_psl_override("example.com", ov), "example.com")

class TestDetectPSLOverrides(unittest.TestCase):
    """Exercise `detect_psl_overrides.py`: brand-tail extraction, cluster
    detection, override application, and the full-IP check."""

    def setUp(self):
        # Import inside the fixture and expose the module as ``self.dpo``.
        import parsedmarc.resources.maps.detect_psl_overrides as detect_module

        self.dpo = detect_module

    def test_extract_brand_tail_dot_separator(self):
        """The tail keeps its leading dot when the IP is dot-separated from it."""
        tail = self.dpo.extract_brand_tail("74-208-244-234.cprapid.com")
        self.assertEqual(tail, ".cprapid.com")

    def test_extract_brand_tail_dash_separator(self):
        """The tail keeps its leading dash when the IP is dash-separated from it."""
        tail = self.dpo.extract_brand_tail("170-254-144-204-nobre.com.br")
        self.assertEqual(tail, "-nobre.com.br")

    def test_extract_brand_tail_no_separator(self):
        """With no separator after the IP, the tail is the bare brand domain."""
        tail = self.dpo.extract_brand_tail("host134-254-143-190tigobusiness.com.ni")
        self.assertEqual(tail, "tigobusiness.com.ni")

    def test_extract_brand_tail_no_ip_returns_none(self):
        """A hostname without an embedded IP yields no tail."""
        tail = self.dpo.extract_brand_tail("plain.example.com")
        self.assertIsNone(tail)

    def test_extract_brand_tail_rejects_short_tail(self):
        """Tails below MIN_TAIL_LEN are dropped so nothing folds to `.com`."""
        # The four-octet IP is followed only by `.br` — two characters after
        # the dot, which is too short to count as a brand tail.
        tail = self.dpo.extract_brand_tail("1-2-3-4.br")
        self.assertIsNone(tail)

    def test_detect_clusters_meets_threshold(self):
        """A tail seen `threshold` times is reported; a rarer tail is not."""
        cprapid_hosts = [
            "1-2-3-4.cprapid.com",
            "5-6-7-8.cprapid.com",
            "9-10-11-12.cprapid.com",
        ]
        # A single occurrence: below the threshold of three.
        lone_host = ["1-2-3-4-other.com.br"]

        found = self.dpo.detect_clusters(
            cprapid_hosts + lone_host, threshold=3, known_overrides=set()
        )

        self.assertIn(".cprapid.com", found)
        self.assertEqual(len(found[".cprapid.com"]), 3)
        self.assertNotIn("-other.com.br", found)

    def test_detect_clusters_honours_threshold(self):
        """Two occurrences against a threshold of three produce no cluster."""
        hostnames = ["1-2-3-4.cprapid.com", "5-6-7-8.cprapid.com"]
        found = self.dpo.detect_clusters(
            hostnames, threshold=3, known_overrides=set()
        )
        self.assertEqual(found, {})

    def test_detect_clusters_skips_known_overrides(self):
        """Tails already in psl_overrides.txt must not be proposed again."""
        hostnames = [
            "1-2-3-4.cprapid.com",
            "5-6-7-8.cprapid.com",
            "9-10-11-12.cprapid.com",
        ]
        already_known = {".cprapid.com"}
        found = self.dpo.detect_clusters(
            hostnames, threshold=3, known_overrides=already_known
        )
        self.assertNotIn(".cprapid.com", found)

    def test_apply_override_matches_first(self):
        """apply_override folds a hostname onto its matching override with the
        leading separator removed, and returns unmatched hostnames unchanged.

        NOTE(review): the test name refers to first-match ordering, but these
        overrides do not overlap, so ordering itself is not exercised here.
        """
        overrides = [".cprapid.com", "-nobre.com.br"]
        expectations = (
            ("1-2-3-4.cprapid.com", "cprapid.com"),
            ("1-2-3-4-nobre.com.br", "nobre.com.br"),
            ("unrelated.com", "unrelated.com"),
        )
        for hostname, folded in expectations:
            self.assertEqual(self.dpo.apply_override(hostname, overrides), folded)

    def test_has_full_ip_shared_with_other_scripts(self):
        """The detect script's IP check must agree with the other map scripts."""
        has_full_ip = self.dpo.has_full_ip
        self.assertTrue(has_full_ip("74-208-244-234.cprapid.com"))
        for hostname in ("ip-147-135-108.us", "example.com"):
            self.assertFalse(has_full_ip(hostname))


# Allow running this file directly as a script; verbosity=2 makes unittest
# print each test name and its outcome.
if __name__ == "__main__":
    unittest.main(verbosity=2)