Merge pull request #1417 from datajoint/fix/1049-skip-duplicates-unique-postgres

dimitri-yatsenko · web-flow · commit cda7e1a307ed · 2026-03-31T11:13:58.000-05:00
docs+test: skip_duplicates enforces secondary unique constraints on PostgreSQL (fixes #1049)
diff --git a/src/datajoint/table.py b/src/datajoint/table.py
@@ -745,7 +745,13 @@ def insert(
         replace : bool, optional
             If True, replaces the existing tuple.
         skip_duplicates : bool, optional
-            If True, silently skip duplicate inserts.
+            If True, silently skip rows with duplicate primary key values.
+            On **PostgreSQL**, secondary unique constraint violations still
+            raise an error even when ``skip_duplicates=True``, because the
+            generated ``ON CONFLICT (pk) DO NOTHING`` clause targets only
+            the primary key. On **MySQL**, ``ON DUPLICATE KEY UPDATE``
+            catches all unique-key conflicts, so secondary unique violations
+            are also silently skipped.
         ignore_extra_fields : bool, optional
             If False (default), fields that are not in the heading raise error.
         allow_direct_insert : bool, optional
diff --git a/tests/integration/test_skip_duplicates.py b/tests/integration/test_skip_duplicates.py
@@ -0,0 +1,205 @@
+"""
+Tests for skip_duplicates behavior with secondary unique constraints.
+
+Verifies that skip_duplicates=True on PostgreSQL skips primary key
+duplicates while still raising on secondary unique constraint violations.
+Resolves #1049.
+"""
+
+import time
+
+import pytest
+
+import datajoint as dj
+from datajoint.errors import DuplicateError
+
+
+@pytest.fixture(scope="function")
+def schema_by_backend(connection_by_backend, db_creds_by_backend):
+    """Create a fresh schema per test, parameterized across backends."""
+    backend = db_creds_by_backend["backend"]
+    test_id = str(int(time.time() * 1000))[-8:]
+    schema_name = f"djtest_skipdup_{backend}_{test_id}"[:64]
+
+    if connection_by_backend.is_connected:
+        try:
+            connection_by_backend.query(
+                f"DROP DATABASE IF EXISTS {connection_by_backend.adapter.quote_identifier(schema_name)}"
+            )
+        except Exception:
+            pass
+
+    schema = dj.Schema(schema_name, connection=connection_by_backend)
+    yield schema
+
+    if connection_by_backend.is_connected:
+        try:
+            connection_by_backend.query(
+                f"DROP DATABASE IF EXISTS {connection_by_backend.adapter.quote_identifier(schema_name)}"
+            )
+        except Exception:
+            pass
+
+
+def test_skip_duplicates_pk_match(schema_by_backend):
+    """skip_duplicates=True silently skips rows whose PK already exists."""
+
+    @schema_by_backend
+    class Item(dj.Manual):
+        definition = """
+        item_id : int
+        ---
+        name : varchar(100)
+        email : varchar(100)
+        unique index (email)
+        """
+
+    Item.insert1(dict(item_id=1, name="Alice", email="alice@example.com"))
+
+    # Same PK, different values — should be silently skipped
+    Item.insert1(
+        dict(item_id=1, name="Bob", email="bob@example.com"),
+        skip_duplicates=True,
+    )
+
+    # Original row unchanged
+    row = (Item & "item_id=1").fetch1()
+    assert row["name"] == "Alice"
+    assert row["email"] == "alice@example.com"
+
+
+def test_skip_duplicates_unique_violation_raises_on_postgres(schema_by_backend, db_creds_by_backend):
+    """On PostgreSQL, skip_duplicates=True still raises on secondary unique violations.
+
+    Regression test for #1049: a row with a *new* PK but a *conflicting*
+    secondary unique index value must raise DuplicateError on PostgreSQL.
+    """
+    if db_creds_by_backend["backend"] != "postgresql":
+        pytest.skip("PostgreSQL-specific: ON CONFLICT (pk) DO NOTHING preserves unique constraints")
+
+    @schema_by_backend
+    class Item(dj.Manual):
+        definition = """
+        item_id : int
+        ---
+        name : varchar(100)
+        email : varchar(100)
+        unique index (email)
+        """
+
+    Item.insert1(dict(item_id=1, name="Alice", email="alice@example.com"))
+
+    # New PK (2) but email conflicts with existing row (1)
+    with pytest.raises(DuplicateError):
+        Item.insert1(
+            dict(item_id=2, name="Bob", email="alice@example.com"),
+            skip_duplicates=True,
+        )
+
+
+def test_skip_duplicates_unique_on_mysql(schema_by_backend, db_creds_by_backend):
+    """On MySQL, skip_duplicates=True silently skips secondary unique conflicts.
+
+    Documents the known MySQL asymmetry: ON DUPLICATE KEY UPDATE catches
+    all unique key conflicts, not just primary key.
+    """
+    if db_creds_by_backend["backend"] != "mysql":
+        pytest.skip("MySQL-specific: ON DUPLICATE KEY UPDATE catches all unique keys")
+
+    @schema_by_backend
+    class Item(dj.Manual):
+        definition = """
+        item_id : int
+        ---
+        name : varchar(100)
+        email : varchar(100)
+        unique index (email)
+        """
+
+    Item.insert1(dict(item_id=1, name="Alice", email="alice@example.com"))
+
+    # New PK (2) but email conflicts — MySQL silently skips
+    Item.insert1(
+        dict(item_id=2, name="Bob", email="alice@example.com"),
+        skip_duplicates=True,
+    )
+
+    # Only the original row exists
+    assert len(Item()) == 1
+    assert (Item & "item_id=1").fetch1()["name"] == "Alice"
+
+
+def test_skip_duplicates_no_unique_index(schema_by_backend):
+    """skip_duplicates=True works normally on tables without secondary unique indexes."""
+
+    @schema_by_backend
+    class Simple(dj.Manual):
+        definition = """
+        item_id : int
+        ---
+        name : varchar(100)
+        """
+
+    Simple.insert1(dict(item_id=1, name="Alice"))
+
+    # Same PK, different name — silently skipped
+    Simple.insert1(dict(item_id=1, name="Bob"), skip_duplicates=True)
+    assert (Simple & "item_id=1").fetch1()["name"] == "Alice"
+
+    # New PK — inserted
+    Simple.insert1(dict(item_id=2, name="Bob"), skip_duplicates=True)
+    assert len(Simple()) == 2
+
+
+def test_skip_duplicates_composite_unique(schema_by_backend, db_creds_by_backend):
+    """skip_duplicates=True with a composite secondary unique index."""
+    if db_creds_by_backend["backend"] != "postgresql":
+        pytest.skip("PostgreSQL-specific unique constraint enforcement")
+
+    @schema_by_backend
+    class Record(dj.Manual):
+        definition = """
+        record_id : int
+        ---
+        first_name : varchar(100)
+        last_name : varchar(100)
+        data : varchar(255)
+        unique index (first_name, last_name)
+        """
+
+    Record.insert1(dict(record_id=1, first_name="Alice", last_name="Smith", data="v1"))
+
+    # New PK but composite unique (first_name, last_name) conflicts
+    with pytest.raises(DuplicateError):
+        Record.insert1(
+            dict(record_id=2, first_name="Alice", last_name="Smith", data="v2"),
+            skip_duplicates=True,
+        )
+
+
+def test_skip_duplicates_batch_mixed(schema_by_backend, db_creds_by_backend):
+    """Batch insert with skip_duplicates=True: PK duplicates skipped, unique conflicts raise."""
+    if db_creds_by_backend["backend"] != "postgresql":
+        pytest.skip("PostgreSQL-specific unique constraint enforcement")
+
+    @schema_by_backend
+    class Item(dj.Manual):
+        definition = """
+        item_id : int
+        ---
+        email : varchar(100)
+        unique index (email)
+        """
+
+    Item.insert1(dict(item_id=1, email="alice@example.com"))
+
+    # Batch: row 2 is new (OK), row 1 is PK dup (skip), row 3 conflicts on email
+    with pytest.raises(DuplicateError):
+        Item.insert(
+            [
+                dict(item_id=2, email="bob@example.com"),
+                dict(item_id=1, email="duplicate-pk@example.com"),  # PK dup — skipped
+                dict(item_id=3, email="alice@example.com"),  # unique conflict — error
+            ],
+            skip_duplicates=True,
+        )