Fix duplicate detection for normalized aggregate reports in Elasticsearch/OpenSearch (#666)

Change date_begin/date_end queries from exact match to range queries (gte/lte) so that previously saved normalized time buckets are correctly detected as duplicates within the original report's date range.

Co-authored-by: seanthegeek <44679+seanthegeek@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: seanthegeek <44679+seanthegeek@users.noreply.github.com>
2026-06-28 04:54:19 +00:00 · 2026-03-06 13:21:54 -05:00
parent e98fdfa96b
commit ae5d20ecf5
2 changed files with 4 additions and 4 deletions
@@ -413,8 +413,8 @@ def save_aggregate_report_to_elasticsearch(
    org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))  # type: ignore
    report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))  # pyright: ignore[reportArgumentType]
    domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))  # pyright: ignore[reportArgumentType]
-    begin_date_query = Q(dict(match=dict(date_begin=begin_date)))  # pyright: ignore[reportArgumentType]
-    end_date_query = Q(dict(match=dict(date_end=end_date)))  # pyright: ignore[reportArgumentType]
+    begin_date_query = Q(dict(range=dict(date_begin=dict(gte=begin_date))))  # pyright: ignore[reportArgumentType]
+    end_date_query = Q(dict(range=dict(date_end=dict(lte=end_date))))  # pyright: ignore[reportArgumentType]

    if index_suffix is not None:
        search_index = "dmarc_aggregate_{0}*".format(index_suffix)
@@ -413,8 +413,8 @@ def save_aggregate_report_to_opensearch(
    org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
    report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
    domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
-    begin_date_query = Q(dict(match=dict(date_begin=begin_date)))
-    end_date_query = Q(dict(match=dict(date_end=end_date)))
+    begin_date_query = Q(dict(range=dict(date_begin=dict(gte=begin_date))))
+    end_date_query = Q(dict(range=dict(date_end=dict(lte=end_date))))

    if index_suffix is not None:
        search_index = "dmarc_aggregate_{0}*".format(index_suffix)