Skip to content

Commit ee344d9

Browse files
feat: discover and load downstream schemas for cascade and drop
Previously, cascade delete and drop only traversed tables in explicitly activated schemas. If a dependent table lived in an unactivated schema (common in multi-schema pipelines), it was invisible to the dependency graph, causing FK errors at delete time. New Dependencies.load_all_downstream() method iteratively discovers schemas that reference the loaded schemas via FK relationships, expanding the dependency graph until all downstream schemas are included. Uses information_schema (MySQL) and pg_constraint (PostgreSQL) to find cross-schema FK references. Diagram.cascade() and Table.drop() now call load_all_downstream() before building the dependency graph. Includes integration test: two schemas where the downstream schema has an FK to the upstream schema, verifying that cascade delete discovers and deletes from both. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 00bdc82 commit ee344d9

File tree

7 files changed

+147
-4
lines changed

7 files changed

+147
-4
lines changed

src/datajoint/adapters/base.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,26 @@ def load_foreign_keys_sql(self, schemas_list: str, like_pattern: str) -> str:
830830
"""
831831
...
832832

833+
def find_downstream_schemas_sql(self, schemas_list: str) -> str:
    """
    Generate query to find schemas with FK references to the given schemas.

    Used to discover unloaded schemas that depend on loaded ones.

    This is an optional override: it deliberately raises rather than using
    ``@abstractmethod`` so existing adapter subclasses remain importable.

    Parameters
    ----------
    schemas_list : str
        Comma-separated, quoted schema names for an IN clause.

    Returns
    -------
    str
        SQL query returning rows with a single column ``schema_name``
        containing distinct schema names that reference the given schemas.

    Raises
    ------
    NotImplementedError
        Always, in this base class; concrete adapters must override.
    """
    # NOTE: the original had an unreachable ``...`` after the raise; removed.
    raise NotImplementedError
852+
833853
@abstractmethod
834854
def get_constraint_info_sql(self, constraint_name: str, schema_name: str, table_name: str) -> str:
835855
"""

src/datajoint/adapters/mysql.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,15 @@ def load_foreign_keys_sql(self, schemas_list: str, like_pattern: str) -> str:
687687
f"OR referenced_table_schema is not NULL AND table_schema in ({schemas_list}))"
688688
)
689689

690+
def find_downstream_schemas_sql(self, schemas_list: str) -> str:
    """
    Find schemas with FK references to the given schemas (MySQL).

    Parameters
    ----------
    schemas_list : str
        Comma-separated, quoted schema names for an IN clause.

    Returns
    -------
    str
        Query yielding one ``schema_name`` column: distinct schemas that
        hold FK references into ``schemas_list`` but are not themselves
        in ``schemas_list``.
    """
    # key_column_usage has one row per FK column; DISTINCT collapses
    # multi-column keys to a single schema name. Only the IN clauses
    # interpolate, so only those segments are f-strings (ruff F541).
    return (
        "SELECT DISTINCT table_schema as schema_name "
        "FROM information_schema.key_column_usage "
        f"WHERE referenced_table_schema IN ({schemas_list}) "
        f"AND table_schema NOT IN ({schemas_list})"
    )
698+
690699
def get_constraint_info_sql(self, constraint_name: str, schema_name: str, table_name: str) -> str:
691700
"""Query to get FK constraint details from information_schema."""
692701
return (

src/datajoint/adapters/postgres.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,20 @@ def load_foreign_keys_sql(self, schemas_list: str, like_pattern: str) -> str:
847847
f"ORDER BY c.conname, cols.ord"
848848
)
849849

850+
def find_downstream_schemas_sql(self, schemas_list: str) -> str:
    """
    Find schemas with FK references to the given schemas (PostgreSQL).

    Parameters
    ----------
    schemas_list : str
        Comma-separated, quoted schema names for an IN clause.

    Returns
    -------
    str
        Query yielding one ``schema_name`` column: distinct schemas that
        hold FK constraints (``contype = 'f'``) pointing into
        ``schemas_list`` but are not themselves in ``schemas_list``.
    """
    # ns1/cl1 = referencing (downstream) side, ns2/cl2 = referenced
    # (upstream) side. Only the IN clauses interpolate, so only those
    # segments are f-strings (ruff F541).
    return (
        "SELECT DISTINCT ns1.nspname as schema_name "
        "FROM pg_constraint c "
        "JOIN pg_class cl1 ON c.conrelid = cl1.oid "
        "JOIN pg_namespace ns1 ON cl1.relnamespace = ns1.oid "
        "JOIN pg_class cl2 ON c.confrelid = cl2.oid "
        "JOIN pg_namespace ns2 ON cl2.relnamespace = ns2.oid "
        "WHERE c.contype = 'f' "
        f"AND ns2.nspname IN ({schemas_list}) "
        f"AND ns1.nspname NOT IN ({schemas_list})"
    )
863+
850864
def get_constraint_info_sql(self, constraint_name: str, schema_name: str, table_name: str) -> str:
851865
"""
852866
Query to get FK constraint details from information_schema.

src/datajoint/dependencies.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,9 @@ def clear(self) -> None:
140140
self._node_alias_count = itertools.count() # reset alias IDs for consistency
141141
super().clear()
142142

143-
def load(self, force: bool = True) -> None:
143+
def load(self, force: bool = True, schema_names: set[str] | None = None) -> None:
144144
"""
145-
Load dependencies for all loaded schemas.
145+
Load dependencies for the given schemas.
146146
147147
Called before operations requiring dependencies: delete, drop,
148148
populate, progress.
@@ -151,6 +151,8 @@ def load(self, force: bool = True) -> None:
151151
----------
152152
force : bool, optional
153153
If True (default), reload even if already loaded.
154+
schema_names : set[str], optional
155+
Schema names to load. If None, uses all activated schemas.
154156
"""
155157
# reload from scratch to prevent duplication of renamed edges
156158
if self._loaded and not force:
@@ -162,7 +164,11 @@ def load(self, force: bool = True) -> None:
162164
adapter = self._conn.adapter
163165

164166
# Build schema list for IN clause
165-
schemas_list = ", ".join(adapter.quote_string(s) for s in self._conn.schemas)
167+
names = schema_names if schema_names is not None else set(self._conn.schemas)
168+
if not names:
169+
self._loaded = True
170+
return
171+
schemas_list = ", ".join(adapter.quote_string(s) for s in names)
166172

167173
# Load primary keys and foreign keys via adapter methods
168174
# Note: Both PyMySQL and psycopg use %s placeholders, so escape % as %%
@@ -220,6 +226,33 @@ def load(self, force: bool = True) -> None:
220226
raise DataJointError("DataJoint can only work with acyclic dependencies")
221227
self._loaded = True
222228

229+
def load_all_downstream(self) -> None:
    """
    Load dependencies including all downstream schemas reachable via FK chains.

    Repeatedly asks the backend which schemas hold FK references into the
    set discovered so far, growing that set until it stops changing. This
    lets cascade delete and drop see dependent tables that live in schemas
    the user never explicitly activated.
    """
    adapter = self._conn.adapter
    discovered = set(self._conn.schemas)
    if not discovered:
        # No activated schemas: fall back to a plain load, which
        # handles the empty case itself.
        self.load()
        return

    # Bounded fixpoint iteration; the cap guards against a pathological
    # discovery loop (each pass must add at least one schema to continue).
    for _ in range(50):
        quoted = ", ".join(adapter.quote_string(s) for s in discovered)
        rows = self._conn.query(adapter.find_downstream_schemas_sql(quoted))
        found = {row[0] for row in rows}
        if found <= discovered:
            break  # fixpoint reached: nothing new referenced us
        discovered |= found

    self.load(force=True, schema_names=discovered)
255+
223256
def topo_sort(self) -> list[str]:
224257
"""
225258
Return table names in topological order.

src/datajoint/diagram.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ def cascade(cls, table_expr, part_integrity="enforce"):
347347
>>> dj.Diagram.cascade(Session & 'subject_id=1')
348348
"""
349349
conn = table_expr.connection
350-
conn.dependencies.load()
350+
conn.dependencies.load_all_downstream()
351351
node = table_expr.full_table_name
352352

353353
result = cls.__new__(cls)

src/datajoint/table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,6 +1170,7 @@ def drop(self, prompt: bool | None = None, part_integrity: str = "enforce"):
11701170
import networkx as nx
11711171
from .diagram import Diagram
11721172

1173+
self.connection.dependencies.load_all_downstream()
11731174
diagram = Diagram(self)
11741175
# Expand to include all descendants (cross-schema)
11751176
descendants = set(nx.descendants(diagram, self.full_table_name)) | {self.full_table_name}

tests/integration/test_cascade_delete.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,3 +226,69 @@ class Child(dj.Manual):
226226
# Data must still be intact
227227
assert len(Parent()) == 2
228228
assert len(Child()) == 3
229+
230+
231+
def test_cascade_discovers_downstream_schema(connection_by_backend, db_creds_by_backend):
    """Cascade delete discovers and includes tables in unloaded downstream schemas."""
    import time

    backend = db_creds_by_backend["backend"]
    # Millisecond timestamp suffix keeps schema names unique across runs.
    test_id = str(int(time.time() * 1000))[-8:]

    # [:64] keeps names within the common identifier length limit.
    upstream_name = f"djtest_upstream_{backend}_{test_id}"[:64]
    downstream_name = f"djtest_downstream_{backend}_{test_id}"[:64]

    qi = connection_by_backend.adapter.quote_identifier

    # Clean up any previous runs (best-effort: schemas may not exist;
    # downstream first so its FK does not block dropping upstream)
    for name in (downstream_name, upstream_name):
        try:
            connection_by_backend.query(f"DROP DATABASE IF EXISTS {qi(name)}")
        except Exception:
            pass

    # Create upstream schema and table
    upstream = dj.Schema(upstream_name, connection=connection_by_backend)

    @upstream
    class Parent(dj.Manual):
        definition = """
        parent_id : int
        ---
        name : varchar(100)
        """

    # Create downstream schema with FK to upstream — separate schema object
    downstream = dj.Schema(downstream_name, connection=connection_by_backend)

    @downstream
    class Child(dj.Manual):
        definition = """
        -> Parent
        child_id : int
        ---
        data : varchar(100)
        """

    # Insert data: one parent row with two dependent child rows
    Parent.insert1(dict(parent_id=1, name="Alice"))
    Child.insert1(dict(parent_id=1, child_id=1, data="row1"))
    Child.insert1(dict(parent_id=1, child_id=2, data="row2"))

    # Verify cascade preview discovers the downstream schema
    # (Child lives in a schema that was never re-activated on this graph)
    counts = dj.Diagram.cascade(Parent & "parent_id=1").counts()
    assert Parent.full_table_name in counts
    assert Child.full_table_name in counts
    assert counts[Child.full_table_name] == 2

    # Verify actual delete cascades across schemas
    (Parent & "parent_id=1").delete()
    assert len(Parent()) == 0
    assert len(Child()) == 0

    # Clean up (best-effort, same ordering as above)
    for name in (downstream_name, upstream_name):
        try:
            connection_by_backend.query(f"DROP DATABASE IF EXISTS {qi(name)}")
        except Exception:
            pass

0 commit comments

Comments (0)