Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions digital_land/expectations/operations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,9 @@ def duplicate_geometry_check(conn, spatial_field: str):
CASE
WHEN pct_overlap_a > {MATCH_THRESHOLD} AND pct_overlap_b > {MATCH_THRESHOLD} THEN 'Complete match (two-way)'
WHEN pct_overlap_a > {MATCH_THRESHOLD} OR pct_overlap_b > {MATCH_THRESHOLD} THEN 'Single match (one-way)'
- ELSE 'undefined' END as intersection_type,
+ ELSE 'Any match' END as intersection_type,
row_number() OVER (PARTITION BY entity_join_key ORDER BY pct_comb_overlap) as key_count
FROM calc
- WHERE pct_overlap_a > 0.9 OR pct_overlap_b > 0.9 -- should this use MATCH_THRESHOLD?
ORDER BY entity_join_key
)

Expand Down Expand Up @@ -374,7 +373,18 @@ def duplicate_geometry_check(conn, spatial_field: str):
for row in rows
if row["intersection_type"] == "Single match (one-way)"
]
- message = f"There are {len(complete_matches)} complete matches and {len(single_matches)} single matches in the dataset"
+
+ any_matches = [
+ {
+ "entity_a": row["entity_a"],
+ "organisation_entity_a": row["organisation_entity_a"],
+ "entity_b": row["entity_b"],
+ "organisation_entity_b": row["organisation_entity_b"],
+ }
+ for row in rows
+ if row["intersection_type"] == "Any match"
+ ]
+ message = f"There are {len(complete_matches)} complete matches, {len(single_matches)} single matches and {len(any_matches)} any matches in the dataset"
else:
complete_matches = [
{
Expand All @@ -386,6 +396,7 @@ def duplicate_geometry_check(conn, spatial_field: str):
for row in rows
]
single_matches = []
+ any_matches = []
message = (
f"There are {len(complete_matches)} complete matches in the dataset"
)
Expand All @@ -394,10 +405,12 @@ def duplicate_geometry_check(conn, spatial_field: str):
message = "There are no duplicate geometries/points in the dataset"
complete_matches = []
single_matches = []
+ any_matches = []
details = {
"actual": len(rows),
"expected": 0,
"complete_matches": complete_matches,
"single_matches": single_matches,
+ "any_matches": any_matches,
}
return result, message, details
15 changes: 11 additions & 4 deletions tests/integration/expectations/operations/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,11 @@ def test_duplicate_geometry_check(dataset_path):
conn.close()

assert not result
- assert message == "There are 1 complete matches and 2 single matches in the dataset"
- assert details["actual"] == 3
+ assert (
+ message
+ == "There are 1 complete matches, 2 single matches and 3 any matches in the dataset"
+ )
+ assert details["actual"] == 6
assert details["expected"] == 0

assert details["complete_matches"][0]["entity_a"] == 1
Expand All @@ -295,7 +298,7 @@ def test_duplicate_geometry_check(dataset_path):
assert details["single_matches"][1]["organisation_entity_a"] == 101
assert details["single_matches"][1]["organisation_entity_b"] == 102

- # entity 4 shouldn't have any duplicates
+ # entity 4 has partial overlap with entities 1, 2 and 3 - flagged as any_match only
assert not any(
row["entity_a"] == 4 or row["entity_b"] == 4
for row in details["complete_matches"]
Expand All @@ -304,6 +307,9 @@ def test_duplicate_geometry_check(dataset_path):
row["entity_a"] == 4 or row["entity_b"] == 4
for row in details["single_matches"]
)
+ assert any(
+ row["entity_a"] == 4 or row["entity_b"] == 4 for row in details["any_matches"]
+ )


def test_duplicate_geometry_check_point(dataset_path):
Expand Down Expand Up @@ -362,7 +368,7 @@ def test_duplicate_geometry_check_no_dupes(dataset_path):
},
{
"entity": 4,
- "geometry": "POLYGON((1 1, 1 3, 3 3, 3 1, 1 1))",
+ "geometry": "POLYGON((3 3, 3 5, 5 5, 5 3, 3 3))", # no overlap with entity 1
"organisation_entity": 103,
},
]
Expand All @@ -381,5 +387,6 @@ def test_duplicate_geometry_check_no_dupes(dataset_path):
assert message == "There are no duplicate geometries/points in the dataset"
assert not details["complete_matches"]
assert not details["single_matches"]
+ assert not details["any_matches"]
assert details["actual"] == 0
assert details["expected"] == 0
Loading