diff --git a/digital_land/expectations/operations/dataset.py b/digital_land/expectations/operations/dataset.py index 484b1f49..0fdb0117 100644 --- a/digital_land/expectations/operations/dataset.py +++ b/digital_land/expectations/operations/dataset.py @@ -320,10 +320,9 @@ def duplicate_geometry_check(conn, spatial_field: str): CASE WHEN pct_overlap_a > {MATCH_THRESHOLD} AND pct_overlap_b > {MATCH_THRESHOLD} THEN 'Complete match (two-way)' WHEN pct_overlap_a > {MATCH_THRESHOLD} OR pct_overlap_b > {MATCH_THRESHOLD} THEN 'Single match (one-way)' - ELSE 'undefined' END as intersection_type, + ELSE 'Any match' END as intersection_type, row_number() OVER (PARTITION BY entity_join_key ORDER BY pct_comb_overlap) as key_count FROM calc - WHERE pct_overlap_a > 0.9 OR pct_overlap_b > 0.9 -- should this use MATCH_THRESHOLD? ORDER BY entity_join_key ) @@ -374,7 +373,18 @@ def duplicate_geometry_check(conn, spatial_field: str): for row in rows if row["intersection_type"] == "Single match (one-way)" ] - message = f"There are {len(complete_matches)} complete matches and {len(single_matches)} single matches in the dataset" + + any_matches = [ + { + "entity_a": row["entity_a"], + "organisation_entity_a": row["organisation_entity_a"], + "entity_b": row["entity_b"], + "organisation_entity_b": row["organisation_entity_b"], + } + for row in rows + if row["intersection_type"] == "Any match" + ] + message = f"There are {len(complete_matches)} complete matches, {len(single_matches)} single matches and {len(any_matches)} any matches in the dataset" else: complete_matches = [ { @@ -386,6 +396,7 @@ def duplicate_geometry_check(conn, spatial_field: str): for row in rows ] single_matches = [] + any_matches = [] message = ( f"There are {len(complete_matches)} complete matches in the dataset" ) @@ -394,10 +405,12 @@ def duplicate_geometry_check(conn, spatial_field: str): message = "There are no duplicate geometries/points in the dataset" complete_matches = [] single_matches = [] + any_matches = [] details = { "actual": len(rows), "expected": 0, "complete_matches": complete_matches, "single_matches": single_matches, + "any_matches": any_matches, } return result, message, details diff --git a/tests/integration/expectations/operations/test_dataset.py b/tests/integration/expectations/operations/test_dataset.py index 533358f9..75156cdc 100644 --- a/tests/integration/expectations/operations/test_dataset.py +++ b/tests/integration/expectations/operations/test_dataset.py @@ -281,8 +281,11 @@ def test_duplicate_geometry_check(dataset_path): conn.close() assert not result - assert message == "There are 1 complete matches and 2 single matches in the dataset" - assert details["actual"] == 3 + assert ( + message + == "There are 1 complete matches, 2 single matches and 3 any matches in the dataset" + ) + assert details["actual"] == 6 assert details["expected"] == 0 assert details["complete_matches"][0]["entity_a"] == 1 @@ -295,7 +298,7 @@ def test_duplicate_geometry_check(dataset_path): assert details["single_matches"][1]["organisation_entity_a"] == 101 assert details["single_matches"][1]["organisation_entity_b"] == 102 - # entity 4 shouldn't have any duplicates + # entity 4 has partial overlap with entities 1, 2 and 3 - flagged as any_match only assert not any( row["entity_a"] == 4 or row["entity_b"] == 4 for row in details["complete_matches"] @@ -304,6 +307,9 @@ def test_duplicate_geometry_check(dataset_path): row["entity_a"] == 4 or row["entity_b"] == 4 for row in details["single_matches"] ) + assert any( + row["entity_a"] == 4 or row["entity_b"] == 4 for row in details["any_matches"] + ) def test_duplicate_geometry_check_point(dataset_path): @@ -362,7 +368,7 @@ def test_duplicate_geometry_check_no_dupes(dataset_path): }, { "entity": 4, - "geometry": "POLYGON((1 1, 1 3, 3 3, 3 1, 1 1))", + "geometry": "POLYGON((3 3, 3 5, 5 5, 5 3, 3 3))", # no overlap with entity 1 "organisation_entity": 103, }, ] @@ -381,5 +387,6 @@ def test_duplicate_geometry_check_no_dupes(dataset_path): assert message == "There are no duplicate geometries/points in the dataset" assert not details["complete_matches"] assert not details["single_matches"] + assert not details["any_matches"] assert details["actual"] == 0 assert details["expected"] == 0