From 400142a15b7039614dc9bab460ea93a2f0def92d Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sun, 8 Feb 2026 17:20:19 -0500
Subject: [PATCH] Shorten test suite runtime with parallel execution and
 reduced iterations

Add pytest-xdist for parallel test execution across all CI jobs, cap
bootstrap min_n at 49 in pure Python mode with wider convergence
tolerances, share TROP fixtures via class-scoped fixture to eliminate
7 redundant fits, reduce simulation counts and methodology test data
sizes. Full suite verified: 1035 passed, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/rust-test.yml    |  10 +-
 CLAUDE.md                          |   2 +-
 pyproject.toml                     |   1 +
 tests/conftest.py                  |   4 +-
 tests/test_ci_params.py            |  24 +++-
 tests/test_methodology_callaway.py |  11 +-
 tests/test_power.py                |  10 +-
 tests/test_staggered.py            |  14 ++-
 tests/test_trop.py                 | 181 +++++++++--------------------
 9 files changed, 100 insertions(+), 157 deletions(-)

diff --git a/.github/workflows/rust-test.yml b/.github/workflows/rust-test.yml
index 2ed2dfb..4bd2861 100644
--- a/.github/workflows/rust-test.yml
+++ b/.github/workflows/rust-test.yml
@@ -64,7 +64,7 @@ jobs:
         uses: dtolnay/rust-toolchain@stable
 
       - name: Install test dependencies
-        run: pip install pytest numpy pandas scipy
+        run: pip install pytest pytest-xdist numpy pandas scipy
 
       - name: Build and install with maturin
         run: |
@@ -121,14 +121,14 @@ jobs:
       - name: Run tests with Rust backend (Unix)
         if: runner.os != 'Windows'
         working-directory: /tmp
-        run: DIFF_DIFF_BACKEND=rust pytest tests/ -x -q
+        run: DIFF_DIFF_BACKEND=rust pytest tests/ -q -n auto --dist worksteal
 
       - name: Run tests with Rust backend (Windows)
         if: runner.os == 'Windows'
         working-directory: ${{ runner.temp }}
         run: |
           $env:DIFF_DIFF_BACKEND="rust"
-          pytest tests/ -x -q
+          pytest tests/ -q -n auto --dist worksteal
         shell: pwsh
 
   # Test pure Python fallback (without Rust extension)
@@ -144,7 +144,7 @@ jobs:
           python-version: '3.11'
 
       - name: Install dependencies
-        run: pip install numpy pandas scipy pytest
+        run: pip install numpy pandas scipy pytest pytest-xdist
 
       - name: Verify pure Python mode
         run: |
@@ -152,4 +152,4 @@ jobs:
           PYTHONPATH=. python -c "from diff_diff import HAS_RUST_BACKEND; print(f'HAS_RUST_BACKEND: {HAS_RUST_BACKEND}'); assert not HAS_RUST_BACKEND"
 
       - name: Run tests in pure Python mode
-        run: PYTHONPATH=. DIFF_DIFF_BACKEND=python pytest tests/ -x -q --ignore=tests/test_rust_backend.py
+        run: PYTHONPATH=. DIFF_DIFF_BACKEND=python pytest tests/ -q --ignore=tests/test_rust_backend.py -n auto --dist worksteal
diff --git a/CLAUDE.md b/CLAUDE.md
index c296ba0..c841b08 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -378,7 +378,7 @@ Tests mirror the source modules:
 - `tests/test_pretrends.py` - Tests for pre-trends power analysis
 - `tests/test_datasets.py` - Tests for dataset loading functions
 
-Session-scoped `ci_params` fixture in `conftest.py` scales bootstrap iterations and TROP grid sizes in pure Python mode — use `ci_params.bootstrap(n)` and `ci_params.grid(values)` in new tests with `n_bootstrap >= 20`. For SE convergence tests (analytical vs bootstrap comparison), use `ci_params.bootstrap(n, min_n=199)` to ensure sufficient iterations.
+Session-scoped `ci_params` fixture in `conftest.py` scales bootstrap iterations and TROP grid sizes in pure Python mode — use `ci_params.bootstrap(n)` and `ci_params.grid(values)` in new tests with `n_bootstrap >= 20`. For SE convergence tests (analytical vs bootstrap comparison), use `ci_params.bootstrap(n, min_n=199)` with a conditional tolerance: `threshold = 0.40 if n_boot < 100 else 0.15`. The `min_n` parameter is capped at 49 in pure Python mode to keep CI fast, so convergence tests use wider tolerances when running with fewer bootstrap iterations.
 
 ### Test Writing Guidelines
 
diff --git a/pyproject.toml b/pyproject.toml
index 25ab08f..5b03c60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
     "pytest>=7.0",
+    "pytest-xdist>=3.0",
     "pytest-cov>=4.0",
     "black>=23.0",
     "ruff>=0.1.0",
diff --git a/tests/conftest.py b/tests/conftest.py
index a1541e1..8a1a3c8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -110,10 +110,12 @@ def bootstrap(n: int, *, min_n: int = 11) -> int:
 
         Use a larger min_n for tests comparing analytical vs bootstrap SEs,
         which need more iterations for stable convergence.
+        In pure Python mode, min_n is capped at 49 to keep CI fast.
         """
         if not _PURE_PYTHON_MODE or n <= 10:
             return n
-        return min(n, max(min_n, int(math.sqrt(n) * 1.6)))
+        effective_min = min(min_n, 49)
+        return min(n, max(effective_min, int(math.sqrt(n) * 1.6)))
 
     @staticmethod
     def grid(values: list) -> list:
diff --git a/tests/test_ci_params.py b/tests/test_ci_params.py
index f1e0e1c..a5d3ead 100644
--- a/tests/test_ci_params.py
+++ b/tests/test_ci_params.py
@@ -7,20 +7,21 @@
 
 
 class TestCIParamsBootstrap:
-    def test_min_n_in_pure_python_mode(self, monkeypatch):
-        """min_n raises the floor in pure Python mode."""
+    def test_min_n_capped_at_49_in_pure_python_mode(self, monkeypatch):
+        """min_n is capped at 49 in pure Python mode."""
         monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True)
-        assert CIParams.bootstrap(499, min_n=199) == 199
+        assert CIParams.bootstrap(499, min_n=199) == 49
 
     def test_min_n_passthrough_in_rust_mode(self, monkeypatch):
         """min_n has no effect when Rust backend is available."""
         monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", False)
         assert CIParams.bootstrap(499, min_n=199) == 499
 
-    def test_min_n_capped_at_original_request(self, monkeypatch):
-        """min_n never exceeds the original n."""
+    def test_min_n_cap_then_n_cap(self, monkeypatch):
+        """min_n cap (49) applies, then result is min(n, effective_floor)."""
         monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True)
-        assert CIParams.bootstrap(100, min_n=199) == 100
+        # effective_min = min(199, 49) = 49; max(49, 16) = 49; min(100, 49) = 49
+        assert CIParams.bootstrap(100, min_n=199) == 49
 
     def test_n_lte_10_ignores_min_n(self, monkeypatch):
         """n <= 10 always returns n regardless of min_n or mode."""
@@ -31,3 +32,14 @@ def test_default_min_n_preserves_existing_behavior(self, monkeypatch):
         """Default min_n=11 matches pre-change behavior."""
         monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True)
         assert CIParams.bootstrap(499) == max(11, int(math.sqrt(499) * 1.6))  # 35
+
+    def test_min_n_cap_with_high_min_n(self, monkeypatch):
+        """min_n=249 is also capped at 49 in pure Python mode."""
+        monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True)
+        assert CIParams.bootstrap(499, min_n=249) == 49
+
+    def test_n_still_caps_result(self, monkeypatch):
+        """Original n still caps the result when min_n is below cap."""
+        monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True)
+        # effective_min = min(40, 49) = 40; max(40, 8) = 40; min(30, 40) = 30
+        assert CIParams.bootstrap(30, min_n=40) == 30
diff --git a/tests/test_methodology_callaway.py b/tests/test_methodology_callaway.py
index e3cbb0c..05fdac2 100644
--- a/tests/test_methodology_callaway.py
+++ b/tests/test_methodology_callaway.py
@@ -803,11 +803,12 @@ class TestSEFormulas:
     @pytest.mark.slow
     def test_analytical_se_close_to_bootstrap_se(self, ci_params):
         """
-        Analytical and bootstrap SEs should be within 20%.
+        Analytical and bootstrap SEs should be within 25%.
 
         Analytical SEs use influence function aggregation.
         Bootstrap SEs use multiplier bootstrap.
-        They should converge for large samples.
+        They should converge for large samples. Wider tolerance (40%)
+        when min_n cap reduces bootstrap iterations in pure Python mode.
 
         This test is marked slow because it uses 499 bootstrap iterations
         for thorough validation of SE convergence.
@@ -833,10 +834,12 @@ def test_analytical_se_close_to_bootstrap_se(self, ci_params):
             time='period', first_treat='first_treat'
         )
 
-        # Check overall ATT SE
+        # Check overall ATT SE (wider tolerance when min_n cap reduces
+        # bootstrap iterations in pure Python mode)
         if results_boot.overall_se > 0:
             rel_diff = abs(results_anal.overall_se - results_boot.overall_se) / results_boot.overall_se
-            assert rel_diff < 0.25, \
+            threshold = 0.40 if n_boot < 100 else 0.25
+            assert rel_diff < threshold, \
                 f"Analytical SE ({results_anal.overall_se}) differs from bootstrap SE " \
                 f"({results_boot.overall_se}) by {rel_diff*100:.1f}%"
 
diff --git a/tests/test_power.py b/tests/test_power.py
index 54bd08b..d012cda 100644
--- a/tests/test_power.py
+++ b/tests/test_power.py
@@ -399,7 +399,7 @@ def test_simulation_with_large_effect(self):
             n_periods=4,
             treatment_effect=10.0,  # Very large effect
             sigma=1.0,  # Low noise
-            n_simulations=50,
+            n_simulations=30,
             seed=42,
             progress=False,
         )
@@ -416,7 +416,7 @@ def test_simulation_with_zero_effect(self):
             n_periods=4,
             treatment_effect=0.0,  # No effect
             sigma=1.0,
-            n_simulations=50,
+            n_simulations=30,
             seed=42,
             progress=False,
         )
@@ -459,7 +459,7 @@ def test_simulation_coverage(self):
             n_periods=4,
             treatment_effect=5.0,
             sigma=2.0,
-            n_simulations=100,
+            n_simulations=50,
             seed=42,
             progress=False,
         )
@@ -476,7 +476,7 @@ def test_simulation_bias(self):
             n_periods=4,
             treatment_effect=5.0,
             sigma=1.0,
-            n_simulations=100,
+            n_simulations=50,
             seed=42,
             progress=False,
         )
@@ -524,7 +524,7 @@ def test_simulation_confidence_interval(self):
         did = DifferenceInDifferences()
         results = simulate_power(
             estimator=did,
-            n_simulations=100,
+            n_simulations=50,
             seed=42,
             progress=False,
         )
diff --git a/tests/test_staggered.py b/tests/test_staggered.py
index ddc6089..71e01f5 100644
--- a/tests/test_staggered.py
+++ b/tests/test_staggered.py
@@ -1579,12 +1579,13 @@ def test_analytical_se_vs_bootstrap_se(self, ci_params):
         # Point estimates should match exactly
         assert abs(results_analytical.overall_att - results_bootstrap.overall_att) < 1e-10
 
-        # SEs should be similar (within 15%)
-        # Note: Some difference expected due to bootstrap variance vs asymptotic variance
+        # SEs should be similar (within 15% with enough bootstrap iterations,
+        # wider tolerance when min_n cap reduces iterations in pure Python mode)
         rel_diff = abs(
             results_analytical.overall_se - results_bootstrap.overall_se
         ) / results_bootstrap.overall_se
-        assert rel_diff < 0.15, (
+        threshold = 0.40 if n_boot < 100 else 0.15
+        assert rel_diff < threshold, (
             f"Analytical SE ({results_analytical.overall_se:.4f}) differs from "
             f"bootstrap SE ({results_bootstrap.overall_se:.4f}) by {rel_diff:.1%}"
         )
@@ -1726,7 +1727,9 @@ def test_event_study_analytical_se(self, ci_params):
         assert results_analytical.event_study_effects is not None
         assert results_bootstrap.event_study_effects is not None
 
-        # Check each event time SE is similar
+        # Check each event time SE is similar (wider tolerance when
+        # min_n cap reduces bootstrap iterations in pure Python mode)
+        threshold = 0.40 if n_boot < 100 else 0.20
         for e in results_analytical.event_study_effects:
             if e in results_bootstrap.event_study_effects:
                 se_analytical = results_analytical.event_study_effects[e]['se']
@@ -1734,8 +1737,7 @@ def test_event_study_analytical_se(self, ci_params):
 
                 if se_bootstrap > 0:
                     rel_diff = abs(se_analytical - se_bootstrap) / se_bootstrap
-                    # Allow 20% difference for event study (more variance)
-                    assert rel_diff < 0.20, (
+                    assert rel_diff < threshold, (
                         f"Event study SE at e={e}: analytical={se_analytical:.4f}, "
                         f"bootstrap={se_bootstrap:.4f}, diff={rel_diff:.1%}"
                     )
diff --git a/tests/test_trop.py b/tests/test_trop.py
index 87fd0c0..1ee95da 100644
--- a/tests/test_trop.py
+++ b/tests/test_trop.py
@@ -312,48 +312,51 @@ def test_no_control_units(self):
 class TestTROPResults:
     """Tests for TROPResults dataclass."""
 
-    def test_summary(self, simple_panel_data):
-        """Test that summary produces string output."""
+    @pytest.fixture(scope="class")
+    def fitted_results(self):
+        """Shared TROP fit for read-only result tests (class-scoped to avoid redundant fits)."""
+        # Inline data generation (same as simple_panel_data fixture)
+        rng = np.random.default_rng(123)
+        n_units, n_treated, n_pre, n_post, true_att = 20, 5, 5, 3, 3.0
+        data = []
+        for i in range(n_units):
+            is_treated = i < n_treated
+            for t in range(n_pre + n_post):
+                post = t >= n_pre
+                y = 10.0 + i * 0.1 + t * 0.5
+                if is_treated and post:
+                    y += true_att
+                y += rng.normal(0, 0.5)
+                data.append({
+                    "unit": i, "period": t, "outcome": y,
+                    "treated": 1 if (is_treated and post) else 0,
+                })
+        panel = pd.DataFrame(data)
+
         trop_est = TROP(
             lambda_time_grid=[0.0, 1.0],
             lambda_unit_grid=[0.0, 1.0],
             lambda_nn_grid=[0.0, 0.1],
             n_bootstrap=10,
-            seed=42
+            seed=42,
         )
-        results = trop_est.fit(
-            simple_panel_data,
-            outcome="outcome",
-            treatment="treated",
-            unit="unit",
-            time="period",
+        return trop_est.fit(
+            panel, outcome="outcome", treatment="treated",
+            unit="unit", time="period",
         )
 
-        summary = results.summary()
+    def test_summary(self, fitted_results):
+        """Test that summary produces string output."""
+        summary = fitted_results.summary()
         assert isinstance(summary, str)
         assert "ATT" in summary
         assert "TROP" in summary
         assert "LOOCV" in summary
         assert "Lambda" in summary
 
-    def test_to_dict(self, simple_panel_data):
+    def test_to_dict(self, fitted_results):
         """Test conversion to dictionary."""
-        trop_est = TROP(
-            lambda_time_grid=[0.0, 1.0],
-            lambda_unit_grid=[0.0, 1.0],
-            lambda_nn_grid=[0.0, 0.1],
-            n_bootstrap=10,
-            seed=42
-        )
-        results = trop_est.fit(
-            simple_panel_data,
-            outcome="outcome",
-            treatment="treated",
-            unit="unit",
-            time="period",
-        )
-
-        d = results.to_dict()
+        d = fitted_results.to_dict()
         assert "att" in d
         assert "se" in d
         assert "lambda_time" in d
@@ -361,98 +364,38 @@ def test_to_dict(self, simple_panel_data):
         assert "lambda_nn" in d
         assert "effective_rank" in d
 
-    def test_to_dataframe(self, simple_panel_data):
+    def test_to_dataframe(self, fitted_results):
         """Test conversion to DataFrame."""
-        trop_est = TROP(
-            lambda_time_grid=[0.0, 1.0],
-            lambda_unit_grid=[0.0, 1.0],
-            lambda_nn_grid=[0.0, 0.1],
-            n_bootstrap=10,
-            seed=42
-        )
-        results = trop_est.fit(
-            simple_panel_data,
-            outcome="outcome",
-            treatment="treated",
-            unit="unit",
-            time="period",
-        )
-
-        df = results.to_dataframe()
+        df = fitted_results.to_dataframe()
         assert isinstance(df, pd.DataFrame)
         assert len(df) == 1
         assert "att" in df.columns
 
-    def test_get_treatment_effects_df(self, simple_panel_data):
+    def test_get_treatment_effects_df(self, fitted_results):
         """Test getting treatment effects DataFrame."""
-        trop_est = TROP(
-            lambda_time_grid=[0.0, 1.0],
-            lambda_unit_grid=[0.0, 1.0],
-            lambda_nn_grid=[0.0, 0.1],
-            n_bootstrap=10,
-            seed=42
-        )
-        results = trop_est.fit(
-            simple_panel_data,
-            outcome="outcome",
-            treatment="treated",
-            unit="unit",
-            time="period",
-        )
-
-        effects_df = results.get_treatment_effects_df()
+        effects_df = fitted_results.get_treatment_effects_df()
         assert isinstance(effects_df, pd.DataFrame)
         assert "unit" in effects_df.columns
         assert "time" in effects_df.columns
         assert "effect" in effects_df.columns
-        assert len(effects_df) == results.n_treated_obs
+        assert len(effects_df) == fitted_results.n_treated_obs
 
-    def test_get_unit_effects_df(self, simple_panel_data):
+    def test_get_unit_effects_df(self, fitted_results):
         """Test getting unit effects DataFrame."""
-        trop_est = TROP(
-            lambda_time_grid=[0.0, 1.0],
-            lambda_unit_grid=[0.0, 1.0],
-            lambda_nn_grid=[0.0, 0.1],
-            n_bootstrap=10,
-            seed=42
-        )
-        results = trop_est.fit(
-            simple_panel_data,
-            outcome="outcome",
-            treatment="treated",
-            unit="unit",
-            time="period",
-        )
-
-        effects_df = results.get_unit_effects_df()
+        effects_df = fitted_results.get_unit_effects_df()
         assert isinstance(effects_df, pd.DataFrame)
         assert "unit" in effects_df.columns
         assert "effect" in effects_df.columns
 
-    def test_get_time_effects_df(self, simple_panel_data):
+    def test_get_time_effects_df(self, fitted_results):
         """Test getting time effects DataFrame."""
-        trop_est = TROP(
-            lambda_time_grid=[0.0, 1.0],
-            lambda_unit_grid=[0.0, 1.0],
-            lambda_nn_grid=[0.0, 0.1],
-            n_bootstrap=10,
-            seed=42
-        )
-        results = trop_est.fit(
-            simple_panel_data,
-            outcome="outcome",
-            treatment="treated",
-            unit="unit",
-            time="period",
-        )
-
-        effects_df = results.get_time_effects_df()
+        effects_df = fitted_results.get_time_effects_df()
         assert isinstance(effects_df, pd.DataFrame)
         assert "time" in effects_df.columns
         assert "effect" in effects_df.columns
 
-    def test_is_significant(self, simple_panel_data, ci_params):
-        """Test significance property."""
+    def test_significance_properties(self, simple_panel_data, ci_params):
+        """Test is_significant and significance_stars properties."""
         n_boot = ci_params.bootstrap(30)
         trop_est = TROP(
             lambda_time_grid=[0.0, 1.0],
@@ -471,27 +414,7 @@ def test_is_significant(self, simple_panel_data, ci_params):
         )
 
         assert isinstance(results.is_significant, bool)
-
-    def test_significance_stars(self, simple_panel_data, ci_params):
-        """Test significance stars."""
-        n_boot = ci_params.bootstrap(30)
-        trop_est = TROP(
-            lambda_time_grid=[0.0, 1.0],
-            lambda_unit_grid=[0.0, 1.0],
-            lambda_nn_grid=[0.0, 0.1],
-            n_bootstrap=n_boot,
-            seed=42
-        )
-        results = trop_est.fit(
-            simple_panel_data,
-            outcome="outcome",
-            treatment="treated",
-            unit="unit",
-            time="period",
-        )
-
-        stars = results.significance_stars
-        assert stars in ["", ".", "*", "**", "***"]
+        assert results.significance_stars in ["", ".", "*", "**", "***"]
 
     def test_nan_propagation_when_se_zero(self):
         """Test that inference fields are NaN when SE is zero/undefined.
@@ -807,12 +730,12 @@ def test_factor_model_reduces_bias(self, ci_params):
         Following paper's simulation: when true DGP has interactive fixed effects,
         the factor model component should help recover the treatment effect.
         """
-        # Generate data with known factor structure
+        # Generate data with known factor structure (reduced size for CI speed)
         data = generate_factor_dgp(
-            n_units=40,
-            n_pre=10,
-            n_post=5,
-            n_treated=8,
+            n_units=25,
+            n_pre=7,
+            n_post=3,
+            n_treated=5,
             n_factors=2,
             treatment_effect=2.0,
             factor_strength=1.5,  # Strong factors
@@ -856,12 +779,12 @@ def test_paper_dgp_recovery(self, ci_params):
 
         This is a methodological validation test.
         """
-        # Generate data similar to paper's simulation
+        # Generate data similar to paper's simulation (reduced size for CI speed)
         rng = np.random.default_rng(2024)
-        n_units = 50
-        n_treated = 10
-        n_pre = 10
-        n_post = 5
+        n_units = 30
+        n_treated = 6
+        n_pre = 7
+        n_post = 3
         n_factors = 2
         true_tau = 0.0  # Null treatment effect