Skip to content

Commit 9017fbc

Browse files
authored
fix(node): Set backfill max job duration to a safe amount of time (#90)
* fix(node): Set backfill max job duration to a safe amount of time Our timeout crash is caused by a footgun on Reth's backfill threshold configuration API. Here's how it roughly looks: - reth allows the mdbx tx timeout sentinel thread to have a configurable max timeout, hidden behind a flag (--db.read-transaction-timeout). We currently, AFAIK, do not configure this. The default value is 5 minutes. Any transactions that live beyond that are killed. - Reth, for the exex backfill, sets the max_duration for its backfill processes to 30 seconds. This does not exceed the default max timeout for the mdbx sentinel thread. - The `ExecutionStageThresholds` default for max duration for execution jobs is 10 minutes. This, by far, exceeds the mdbx timeout sentinel thread max duration. On the execution stage for the reth node this is safe as it calls `disable_long_read_transaction_safety()`, which bypasses the timeout. This is not done on backfill, so this default is unsafe. This is therefore a massive footgun for any exex that configures backfill thresholds. - We were only tweaking the `max_blocks` config value, and using the unsafe default for max duration, when setting the backfill thresholds. This leads to us avoiding an OOM crash, but then running into an mdbx crash due to the sentinel thread forcibly killing the transaction. - Before `set_backfill_thresholds` was added, what caused the OOM was the massive amount of blocks being processed. `max_duration` had a sane default (30s). Once it was introduced, this new bug was added due to the `::default()` usage. - The fix, is therefore, to limit the max duration to a reasonably low time. 30s should be fine. - Note: 30s is a reasonably margin due to a few reasons: - We avoid the hard 5m timeout. - The intended default for exex backfill was already 30s. - There's a 60s timeout warning already, this avoids it entirely. We've seen this error spuriously over the entirety of the signet node's lifecycle. * chore: expose backfill max duration config on env var * chore: version bump
1 parent 3b978ed commit 9017fbc

3 files changed

Lines changed: 36 additions & 17 deletions

File tree

Cargo.toml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ members = ["crates/*"]
33
resolver = "2"
44

55
[workspace.package]
6-
version = "0.16.0-rc.8"
6+
version = "0.16.0-rc.9"
77
edition = "2024"
88
rust-version = "1.88"
99
authors = ["init4"]
@@ -34,15 +34,15 @@ debug = false
3434
incremental = false
3535

3636
[workspace.dependencies]
37-
signet-blobber = { version = "0.16.0-rc.8", path = "crates/blobber" }
38-
signet-block-processor = { version = "0.16.0-rc.8", path = "crates/block-processor" }
39-
signet-db = { version = "0.16.0-rc.8", path = "crates/db" }
40-
signet-genesis = { version = "0.16.0-rc.8", path = "crates/genesis" }
41-
signet-node = { version = "0.16.0-rc.8", path = "crates/node" }
42-
signet-node-config = { version = "0.16.0-rc.8", path = "crates/node-config" }
43-
signet-node-tests = { version = "0.16.0-rc.8", path = "crates/node-tests" }
44-
signet-node-types = { version = "0.16.0-rc.8", path = "crates/node-types" }
45-
signet-rpc = { version = "0.16.0-rc.8", path = "crates/rpc" }
37+
signet-blobber = { version = "0.16.0-rc.9", path = "crates/blobber" }
38+
signet-block-processor = { version = "0.16.0-rc.9", path = "crates/block-processor" }
39+
signet-db = { version = "0.16.0-rc.9", path = "crates/db" }
40+
signet-genesis = { version = "0.16.0-rc.9", path = "crates/genesis" }
41+
signet-node = { version = "0.16.0-rc.9", path = "crates/node" }
42+
signet-node-config = { version = "0.16.0-rc.9", path = "crates/node-config" }
43+
signet-node-tests = { version = "0.16.0-rc.9", path = "crates/node-tests" }
44+
signet-node-types = { version = "0.16.0-rc.9", path = "crates/node-types" }
45+
signet-rpc = { version = "0.16.0-rc.9", path = "crates/rpc" }
4646

4747
init4-bin-base = { version = "0.18.0-rc.8", features = ["alloy"] }
4848

crates/node-config/src/core.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use std::{
1212
fmt::Display,
1313
path::PathBuf,
1414
sync::{Arc, OnceLock},
15+
time::Duration,
1516
};
1617
use tracing::warn;
1718
use trevm::revm::primitives::hardfork::SpecId;
@@ -72,6 +73,15 @@ pub struct SignetNodeConfig {
7273
optional
7374
)]
7475
backfill_max_blocks: Option<u64>,
76+
77+
/// Maximum duration of a backfill batch.
78+
/// This prevents MDBX from killing the read transaction, leading to a crash if reth's default mdbx read transaction timeout is exceeded.
79+
#[from_env(
80+
var = "BACKFILL_MAX_DURATION",
81+
desc = "Maximum duration of a backfill batch, in seconds",
82+
optional
83+
)]
84+
backfill_max_duration: Option<u64>,
7585
}
7686

7787
impl Display for SignetNodeConfig {
@@ -105,6 +115,7 @@ impl SignetNodeConfig {
105115
genesis,
106116
slot_calculator,
107117
backfill_max_blocks: None, // Uses default of 10,000 via accessor
118+
backfill_max_duration: None, // Uses default of 30 seconds via accessor
108119
}
109120
}
110121

@@ -252,6 +263,12 @@ impl SignetNodeConfig {
252263
// Default to 10,000 if not explicitly configured
253264
Some(self.backfill_max_blocks.unwrap_or(10_000))
254265
}
266+
267+
/// Get the maximum duration of a backfill batch.
268+
/// Returns `Some(30)` seconds by default if not configured.
269+
pub fn backfill_max_duration(&self) -> Option<Duration> {
270+
Some(Duration::from_secs(self.backfill_max_duration.unwrap_or(30)))
271+
}
255272
}
256273

257274
#[cfg(test)]
@@ -272,6 +289,7 @@ mod defaults {
272289
genesis: GenesisSpec::Known(KnownChains::Test),
273290
slot_calculator: SlotCalculator::new(0, 0, 12),
274291
backfill_max_blocks: None, // Uses default of 10,000 via accessor
292+
backfill_max_duration: None, // Uses default of 30 seconds via accessor
275293
}
276294
}
277295
}

crates/node/src/node.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -299,13 +299,14 @@ where
299299
/// This should be called after `set_with_head` to configure how many
300300
/// blocks can be processed per backfill batch.
301301
fn set_backfill_thresholds(&mut self) {
302-
if let Some(max_blocks) = self.config.backfill_max_blocks() {
303-
self.host.notifications.set_backfill_thresholds(ExecutionStageThresholds {
304-
max_blocks: Some(max_blocks),
305-
..Default::default()
306-
});
307-
debug!(max_blocks, "configured backfill thresholds");
308-
}
302+
let max_blocks = self.config.backfill_max_blocks();
303+
let max_duration = self.config.backfill_max_duration();
304+
self.host.notifications.set_backfill_thresholds(ExecutionStageThresholds {
305+
max_blocks,
306+
max_duration,
307+
..Default::default()
308+
});
309+
debug!(?max_blocks, ?max_duration, "configured backfill thresholds");
309310
}
310311

311312
/// Runs on any notification received from the ExEx context.

0 commit comments

Comments
 (0)