From 400142a15b7039614dc9bab460ea93a2f0def92d Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 8 Feb 2026 17:20:19 -0500 Subject: [PATCH] Shorten test suite runtime with parallel execution and reduced iterations Add pytest-xdist for parallel test execution across all CI jobs, cap bootstrap min_n at 49 in pure Python mode with wider convergence tolerances, share TROP fixtures via class-scoped fixture to eliminate 7 redundant fits, reduce simulation counts and methodology test data sizes. Full suite verified: 1035 passed, 0 failures. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/rust-test.yml | 10 +- CLAUDE.md | 2 +- pyproject.toml | 1 + tests/conftest.py | 4 +- tests/test_ci_params.py | 24 +++- tests/test_methodology_callaway.py | 11 +- tests/test_power.py | 10 +- tests/test_staggered.py | 14 ++- tests/test_trop.py | 181 +++++++++-------------------- 9 files changed, 100 insertions(+), 157 deletions(-) diff --git a/.github/workflows/rust-test.yml b/.github/workflows/rust-test.yml index 2ed2dfb..4bd2861 100644 --- a/.github/workflows/rust-test.yml +++ b/.github/workflows/rust-test.yml @@ -64,7 +64,7 @@ jobs: uses: dtolnay/rust-toolchain@stable - name: Install test dependencies - run: pip install pytest numpy pandas scipy + run: pip install pytest pytest-xdist numpy pandas scipy - name: Build and install with maturin run: | @@ -121,14 +121,14 @@ jobs: - name: Run tests with Rust backend (Unix) if: runner.os != 'Windows' working-directory: /tmp - run: DIFF_DIFF_BACKEND=rust pytest tests/ -x -q + run: DIFF_DIFF_BACKEND=rust pytest tests/ -q -n auto --dist worksteal - name: Run tests with Rust backend (Windows) if: runner.os == 'Windows' working-directory: ${{ runner.temp }} run: | $env:DIFF_DIFF_BACKEND="rust" - pytest tests/ -x -q + pytest tests/ -q -n auto --dist worksteal shell: pwsh # Test pure Python fallback (without Rust extension) @@ -144,7 +144,7 @@ jobs: python-version: '3.11' - name: Install dependencies - run: pip install numpy pandas scipy pytest + run: pip install numpy pandas scipy pytest pytest-xdist - name: Verify pure Python mode run: | @@ -152,4 +152,4 @@ jobs: PYTHONPATH=. python -c "from diff_diff import HAS_RUST_BACKEND; print(f'HAS_RUST_BACKEND: {HAS_RUST_BACKEND}'); assert not HAS_RUST_BACKEND" - name: Run tests in pure Python mode - run: PYTHONPATH=. DIFF_DIFF_BACKEND=python pytest tests/ -x -q --ignore=tests/test_rust_backend.py + run: PYTHONPATH=. DIFF_DIFF_BACKEND=python pytest tests/ -q --ignore=tests/test_rust_backend.py -n auto --dist worksteal diff --git a/CLAUDE.md b/CLAUDE.md index c296ba0..c841b08 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -378,7 +378,7 @@ Tests mirror the source modules: - `tests/test_pretrends.py` - Tests for pre-trends power analysis - `tests/test_datasets.py` - Tests for dataset loading functions -Session-scoped `ci_params` fixture in `conftest.py` scales bootstrap iterations and TROP grid sizes in pure Python mode — use `ci_params.bootstrap(n)` and `ci_params.grid(values)` in new tests with `n_bootstrap >= 20`. For SE convergence tests (analytical vs bootstrap comparison), use `ci_params.bootstrap(n, min_n=199)` to ensure sufficient iterations. +Session-scoped `ci_params` fixture in `conftest.py` scales bootstrap iterations and TROP grid sizes in pure Python mode — use `ci_params.bootstrap(n)` and `ci_params.grid(values)` in new tests with `n_bootstrap >= 20`. For SE convergence tests (analytical vs bootstrap comparison), use `ci_params.bootstrap(n, min_n=199)` with a conditional tolerance: `threshold = 0.40 if n_boot < 100 else 0.15`. The `min_n` parameter is capped at 49 in pure Python mode to keep CI fast, so convergence tests use wider tolerances when running with fewer bootstrap iterations. ### Test Writing Guidelines diff --git a/pyproject.toml b/pyproject.toml index 25ab08f..5b03c60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "pytest>=7.0", + "pytest-xdist>=3.0", "pytest-cov>=4.0", "black>=23.0", "ruff>=0.1.0", diff --git a/tests/conftest.py b/tests/conftest.py index a1541e1..8a1a3c8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -110,10 +110,12 @@ def bootstrap(n: int, *, min_n: int = 11) -> int: Use a larger min_n for tests comparing analytical vs bootstrap SEs, which need more iterations for stable convergence. + In pure Python mode, min_n is capped at 49 to keep CI fast. """ if not _PURE_PYTHON_MODE or n <= 10: return n - return min(n, max(min_n, int(math.sqrt(n) * 1.6))) + effective_min = min(min_n, 49) + return min(n, max(effective_min, int(math.sqrt(n) * 1.6))) @staticmethod def grid(values: list) -> list: diff --git a/tests/test_ci_params.py b/tests/test_ci_params.py index f1e0e1c..a5d3ead 100644 --- a/tests/test_ci_params.py +++ b/tests/test_ci_params.py @@ -7,20 +7,21 @@ class TestCIParamsBootstrap: - def test_min_n_in_pure_python_mode(self, monkeypatch): - """min_n raises the floor in pure Python mode.""" + def test_min_n_capped_at_49_in_pure_python_mode(self, monkeypatch): + """min_n is capped at 49 in pure Python mode.""" monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True) - assert CIParams.bootstrap(499, min_n=199) == 199 + assert CIParams.bootstrap(499, min_n=199) == 49 def test_min_n_passthrough_in_rust_mode(self, monkeypatch): """min_n has no effect when Rust backend is available.""" monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", False) assert CIParams.bootstrap(499, min_n=199) == 499 - def test_min_n_capped_at_original_request(self, monkeypatch): - """min_n never exceeds the original n.""" + def test_min_n_cap_then_n_cap(self, monkeypatch): + """min_n cap (49) applies, then result is min(n, effective_floor).""" monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True) - assert CIParams.bootstrap(100, min_n=199) == 100 + # effective_min = min(199, 49) = 49; max(49, 16) = 49; min(100, 49) = 49 + assert CIParams.bootstrap(100, min_n=199) == 49 def test_n_lte_10_ignores_min_n(self, monkeypatch): """n <= 10 always returns n regardless of min_n or mode.""" @@ -31,3 +32,14 @@ def test_default_min_n_preserves_existing_behavior(self, monkeypatch): """Default min_n=11 matches pre-change behavior.""" monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True) assert CIParams.bootstrap(499) == max(11, int(math.sqrt(499) * 1.6)) # 35 + + def test_min_n_cap_with_high_min_n(self, monkeypatch): + """min_n=249 is also capped at 49 in pure Python mode.""" + monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True) + assert CIParams.bootstrap(499, min_n=249) == 49 + + def test_n_still_caps_result(self, monkeypatch): + """Original n still caps the result when min_n is below cap.""" + monkeypatch.setattr(conftest_module, "_PURE_PYTHON_MODE", True) + # effective_min = min(40, 49) = 40; max(40, 8) = 40; min(30, 40) = 30 + assert CIParams.bootstrap(30, min_n=40) == 30 diff --git a/tests/test_methodology_callaway.py b/tests/test_methodology_callaway.py index e3cbb0c..05fdac2 100644 --- a/tests/test_methodology_callaway.py +++ b/tests/test_methodology_callaway.py @@ -803,11 +803,12 @@ class TestSEFormulas: @pytest.mark.slow def test_analytical_se_close_to_bootstrap_se(self, ci_params): """ - Analytical and bootstrap SEs should be within 20%. + Analytical and bootstrap SEs should be within 25%. Analytical SEs use influence function aggregation. Bootstrap SEs use multiplier bootstrap. - They should converge for large samples. + They should converge for large samples. Wider tolerance (40%) + when min_n cap reduces bootstrap iterations in pure Python mode. This test is marked slow because it uses 499 bootstrap iterations for thorough validation of SE convergence. @@ -833,10 +834,12 @@ def test_analytical_se_close_to_bootstrap_se(self, ci_params): time='period', first_treat='first_treat' ) - # Check overall ATT SE + # Check overall ATT SE (wider tolerance when min_n cap reduces + # bootstrap iterations in pure Python mode) if results_boot.overall_se > 0: rel_diff = abs(results_anal.overall_se - results_boot.overall_se) / results_boot.overall_se - assert rel_diff < 0.25, \ + threshold = 0.40 if n_boot < 100 else 0.25 + assert rel_diff < threshold, \ f"Analytical SE ({results_anal.overall_se}) differs from bootstrap SE " \ f"({results_boot.overall_se}) by {rel_diff*100:.1f}%" diff --git a/tests/test_power.py b/tests/test_power.py index 54bd08b..d012cda 100644 --- a/tests/test_power.py +++ b/tests/test_power.py @@ -399,7 +399,7 @@ def test_simulation_with_large_effect(self): n_periods=4, treatment_effect=10.0, # Very large effect sigma=1.0, # Low noise - n_simulations=50, + n_simulations=30, seed=42, progress=False, ) @@ -416,7 +416,7 @@ def test_simulation_with_zero_effect(self): n_periods=4, treatment_effect=0.0, # No effect sigma=1.0, - n_simulations=50, + n_simulations=30, seed=42, progress=False, ) @@ -459,7 +459,7 @@ def test_simulation_coverage(self): n_periods=4, treatment_effect=5.0, sigma=2.0, - n_simulations=100, + n_simulations=50, seed=42, progress=False, ) @@ -476,7 +476,7 @@ def test_simulation_bias(self): n_periods=4, treatment_effect=5.0, sigma=1.0, - n_simulations=100, + n_simulations=50, seed=42, progress=False, ) @@ -524,7 +524,7 @@ def test_simulation_confidence_interval(self): did = DifferenceInDifferences() results = simulate_power( estimator=did, - n_simulations=100, + n_simulations=50, seed=42, progress=False, ) diff --git a/tests/test_staggered.py b/tests/test_staggered.py index ddc6089..71e01f5 100644 --- a/tests/test_staggered.py +++ b/tests/test_staggered.py @@ -1579,12 +1579,13 @@ def test_analytical_se_vs_bootstrap_se(self, ci_params): # Point estimates should match exactly assert abs(results_analytical.overall_att - results_bootstrap.overall_att) < 1e-10 - # SEs should be similar (within 15%) - # Note: Some difference expected due to bootstrap variance vs asymptotic variance + # SEs should be similar (within 15% with enough bootstrap iterations, + # wider tolerance when min_n cap reduces iterations in pure Python mode) rel_diff = abs( results_analytical.overall_se - results_bootstrap.overall_se ) / results_bootstrap.overall_se - assert rel_diff < 0.15, ( + threshold = 0.40 if n_boot < 100 else 0.15 + assert rel_diff < threshold, ( f"Analytical SE ({results_analytical.overall_se:.4f}) differs from " f"bootstrap SE ({results_bootstrap.overall_se:.4f}) by {rel_diff:.1%}" ) @@ -1726,7 +1727,9 @@ def test_event_study_analytical_se(self, ci_params): assert results_analytical.event_study_effects is not None assert results_bootstrap.event_study_effects is not None - # Check each event time SE is similar + # Check each event time SE is similar (wider tolerance when + # min_n cap reduces bootstrap iterations in pure Python mode) + threshold = 0.40 if n_boot < 100 else 0.20 for e in results_analytical.event_study_effects: if e in results_bootstrap.event_study_effects: se_analytical = results_analytical.event_study_effects[e]['se'] @@ -1734,8 +1737,7 @@ def test_event_study_analytical_se(self, ci_params): if se_bootstrap > 0: rel_diff = abs(se_analytical - se_bootstrap) / se_bootstrap - # Allow 20% difference for event study (more variance) - assert rel_diff < 0.20, ( + assert rel_diff < threshold, ( f"Event study SE at e={e}: analytical={se_analytical:.4f}, " f"bootstrap={se_bootstrap:.4f}, diff={rel_diff:.1%}" ) diff --git a/tests/test_trop.py b/tests/test_trop.py index 87fd0c0..1ee95da 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -312,48 +312,51 @@ def test_no_control_units(self): class TestTROPResults: """Tests for TROPResults dataclass.""" - def test_summary(self, simple_panel_data): - """Test that summary produces string output.""" + @pytest.fixture(scope="class") + def fitted_results(self): + """Shared TROP fit for read-only result tests (class-scoped to avoid redundant fits).""" + # Inline data generation (same as simple_panel_data fixture) + rng = np.random.default_rng(123) + n_units, n_treated, n_pre, n_post, true_att = 20, 5, 5, 3, 3.0 + data = [] + for i in range(n_units): + is_treated = i < n_treated + for t in range(n_pre + n_post): + post = t >= n_pre + y = 10.0 + i * 0.1 + t * 0.5 + if is_treated and post: + y += true_att + y += rng.normal(0, 0.5) + data.append({ + "unit": i, "period": t, "outcome": y, + "treated": 1 if (is_treated and post) else 0, + }) + panel = pd.DataFrame(data) + trop_est = TROP( lambda_time_grid=[0.0, 1.0], lambda_unit_grid=[0.0, 1.0], lambda_nn_grid=[0.0, 0.1], n_bootstrap=10, - seed=42 + seed=42, ) - results = trop_est.fit( - simple_panel_data, - outcome="outcome", - treatment="treated", - unit="unit", - time="period", + return trop_est.fit( + panel, outcome="outcome", treatment="treated", + unit="unit", time="period", ) - summary = results.summary() + def test_summary(self, fitted_results): + """Test that summary produces string output.""" + summary = fitted_results.summary() assert isinstance(summary, str) assert "ATT" in summary assert "TROP" in summary assert "LOOCV" in summary assert "Lambda" in summary - def test_to_dict(self, simple_panel_data): + def test_to_dict(self, fitted_results): """Test conversion to dictionary.""" - trop_est = TROP( - lambda_time_grid=[0.0, 1.0], - lambda_unit_grid=[0.0, 1.0], - lambda_nn_grid=[0.0, 0.1], - n_bootstrap=10, - seed=42 - ) - results = trop_est.fit( - simple_panel_data, - outcome="outcome", - treatment="treated", - unit="unit", - time="period", - ) - - d = results.to_dict() + d = fitted_results.to_dict() assert "att" in d assert "se" in d assert "lambda_time" in d @@ -361,98 +364,38 @@ def test_to_dict(self, simple_panel_data): assert "lambda_nn" in d assert "effective_rank" in d - def test_to_dataframe(self, simple_panel_data): + def test_to_dataframe(self, fitted_results): """Test conversion to DataFrame.""" - trop_est = TROP( - lambda_time_grid=[0.0, 1.0], - lambda_unit_grid=[0.0, 1.0], - lambda_nn_grid=[0.0, 0.1], - n_bootstrap=10, - seed=42 - ) - results = trop_est.fit( - simple_panel_data, - outcome="outcome", - treatment="treated", - unit="unit", - time="period", - ) - - df = results.to_dataframe() + df = fitted_results.to_dataframe() assert isinstance(df, pd.DataFrame) assert len(df) == 1 assert "att" in df.columns - def test_get_treatment_effects_df(self, simple_panel_data): + def test_get_treatment_effects_df(self, fitted_results): """Test getting treatment effects DataFrame.""" - trop_est = TROP( - lambda_time_grid=[0.0, 1.0], - lambda_unit_grid=[0.0, 1.0], - lambda_nn_grid=[0.0, 0.1], - n_bootstrap=10, - seed=42 - ) - results = trop_est.fit( - simple_panel_data, - outcome="outcome", - treatment="treated", - unit="unit", - time="period", - ) - - effects_df = results.get_treatment_effects_df() + effects_df = fitted_results.get_treatment_effects_df() assert isinstance(effects_df, pd.DataFrame) assert "unit" in effects_df.columns assert "time" in effects_df.columns assert "effect" in effects_df.columns - assert len(effects_df) == results.n_treated_obs + assert len(effects_df) == fitted_results.n_treated_obs - def test_get_unit_effects_df(self, simple_panel_data): + def test_get_unit_effects_df(self, fitted_results): """Test getting unit effects DataFrame.""" - trop_est = TROP( - lambda_time_grid=[0.0, 1.0], - lambda_unit_grid=[0.0, 1.0], - lambda_nn_grid=[0.0, 0.1], - n_bootstrap=10, - seed=42 - ) - results = trop_est.fit( - simple_panel_data, - outcome="outcome", - treatment="treated", - unit="unit", - time="period", - ) - - effects_df = results.get_unit_effects_df() + effects_df = fitted_results.get_unit_effects_df() assert isinstance(effects_df, pd.DataFrame) assert "unit" in effects_df.columns assert "effect" in effects_df.columns - def test_get_time_effects_df(self, simple_panel_data): + def test_get_time_effects_df(self, fitted_results): """Test getting time effects DataFrame.""" - trop_est = TROP( - lambda_time_grid=[0.0, 1.0], - lambda_unit_grid=[0.0, 1.0], - lambda_nn_grid=[0.0, 0.1], - n_bootstrap=10, - seed=42 - ) - results = trop_est.fit( - simple_panel_data, - outcome="outcome", - treatment="treated", - unit="unit", - time="period", - ) - - effects_df = results.get_time_effects_df() + effects_df = fitted_results.get_time_effects_df() assert isinstance(effects_df, pd.DataFrame) assert "time" in effects_df.columns assert "effect" in effects_df.columns - def test_is_significant(self, simple_panel_data, ci_params): - """Test significance property.""" + def test_significance_properties(self, simple_panel_data, ci_params): + """Test is_significant and significance_stars properties.""" n_boot = ci_params.bootstrap(30) trop_est = TROP( lambda_time_grid=[0.0, 1.0], @@ -471,27 +414,7 @@ def test_is_significant(self, simple_panel_data, ci_params): ) assert isinstance(results.is_significant, bool) - - def test_significance_stars(self, simple_panel_data, ci_params): - """Test significance stars.""" - n_boot = ci_params.bootstrap(30) - trop_est = TROP( - lambda_time_grid=[0.0, 1.0], - lambda_unit_grid=[0.0, 1.0], - lambda_nn_grid=[0.0, 0.1], - n_bootstrap=n_boot, - seed=42 - ) - results = trop_est.fit( - simple_panel_data, - outcome="outcome", - treatment="treated", - unit="unit", - time="period", - ) - - stars = results.significance_stars - assert stars in ["", ".", "*", "**", "***"] + assert results.significance_stars in ["", ".", "*", "**", "***"] def test_nan_propagation_when_se_zero(self): """Test that inference fields are NaN when SE is zero/undefined. @@ -807,12 +730,12 @@ def test_factor_model_reduces_bias(self, ci_params): Following paper's simulation: when true DGP has interactive fixed effects, the factor model component should help recover the treatment effect. """ - # Generate data with known factor structure + # Generate data with known factor structure (reduced size for CI speed) data = generate_factor_dgp( - n_units=40, - n_pre=10, - n_post=5, - n_treated=8, + n_units=25, + n_pre=7, + n_post=3, + n_treated=5, n_factors=2, treatment_effect=2.0, factor_strength=1.5, # Strong factors @@ -856,12 +779,12 @@ def test_paper_dgp_recovery(self, ci_params): This is a methodological validation test. """ - # Generate data similar to paper's simulation + # Generate data similar to paper's simulation (reduced size for CI speed) rng = np.random.default_rng(2024) - n_units = 50 - n_treated = 10 - n_pre = 10 - n_post = 5 + n_units = 30 + n_treated = 6 + n_pre = 7 + n_post = 3 n_factors = 2 true_tau = 0.0 # Null treatment effect