From af5496ed118d33256bdd1044a2c11c8cf97a4010 Mon Sep 17 00:00:00 2001 From: danda Date: Tue, 31 Dec 2024 09:58:40 -0800 Subject: [PATCH] feat: monitor for stalled chain and send alert closes #7. adds a blockchain watchdog that checks every hour if the tip height has advanced or not. Sends alert email if height is less or equal to height at the last check and enters a warning mode. In warning mode it waits until the height is greater than previous, and then sends a recovery alert and switches to normal mode. --- Cargo.lock | 24 +++- Cargo.toml | 1 + src/main.rs | 3 +- src/model/config.rs | 4 + src/neptune_rpc.rs | 109 +++++++++++++++++++ templates/email/neptune_blockchain_alert.txt | 64 +++++++++++ 6 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 templates/email/neptune_blockchain_alert.txt diff --git a/Cargo.lock b/Cargo.lock index 74835ea..8408497 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -960,6 +960,27 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "derive_more" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", + "unicode-xid", +] + [[package]] name = "diff" version = "0.1.13" @@ -2018,6 +2039,7 @@ dependencies = [ "boilerplate", "chrono", "clap", + "derive_more", "html-escaper", "lettre", "neptune-cash", @@ -3268,7 +3290,7 @@ dependencies = [ "arbitrary", "const_format", "hex", - "itertools 0.12.1", + "itertools 0.13.0", "ndarray", "num", "num-traits", diff --git a/Cargo.toml b/Cargo.toml index 2eabf70..e9f5638 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ chrono = "0.4.34" # only should be used 
inside main.rs, for the binary. anyhow = "1.0.86" arc-swap = "1.7.1" +derive_more = { version = "1.0.0", features = ["display"] } [patch.crates-io] # 694f27daf78aade0ed0dc07e3babaab036cd5572 is tip of branch: master as of 2024-04-30 diff --git a/src/main.rs b/src/main.rs index 447164a..a5ce3f5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -36,7 +36,8 @@ async fn main() -> Result<(), anyhow::Error> { // this will log warnings if smtp not configured or mis-configured. alert_email::check_alert_params(); - tokio::task::spawn(neptune_rpc::watchdog(app_state)); + tokio::task::spawn(neptune_rpc::watchdog(app_state.clone())); + tokio::task::spawn(neptune_rpc::blockchain_watchdog(app_state)); info!("Running on http://localhost:{port}"); diff --git a/src/model/config.rs b/src/model/config.rs index 2e01e62..87a0ac3 100644 --- a/src/model/config.rs +++ b/src/model/config.rs @@ -38,6 +38,10 @@ pub struct Config { #[clap(long, default_value = "10", value_name = "seconds")] pub neptune_rpc_watchdog_secs: u64, + /// Sets interval in seconds to check that block-height has increased + #[clap(long, default_value = "3600", value_name = "seconds")] + pub neptune_blockchain_watchdog_secs: u64, + /// admin email for receiving alert emails #[arg(long, value_name = "email")] pub admin_email: Option, diff --git a/src/neptune_rpc.rs b/src/neptune_rpc.rs index 0c8a06d..8724324 100644 --- a/src/neptune_rpc.rs +++ b/src/neptune_rpc.rs @@ -6,6 +6,7 @@ use chrono::DateTime; use chrono::TimeDelta; use chrono::Utc; use clap::Parser; +use neptune_cash::models::blockchain::block::block_height::BlockHeight; use neptune_cash::rpc_server::RPCClient; use std::net::Ipv4Addr; use std::net::SocketAddr; @@ -111,3 +112,111 @@ pub struct NeptuneRpcAlertEmail { now: DateTime, duration: TimeDelta, } + +#[derive(Clone, Copy, derive_more::Display)] +pub enum BlockchainState { + Normal, + Warn, +} + +#[derive(boilerplate::Boilerplate)] +#[boilerplate(filename = "email/neptune_blockchain_alert.txt")] +pub struct 
NeptuneBlockchainAlertEmail {
+    config: Config,
+    last_height: BlockHeight,
+    height: BlockHeight,
+    last_blockchain_state: BlockchainState,
+    blockchain_state: BlockchainState,
+    app_started: DateTime<Utc>,
+    app_duration: TimeDelta,
+    since: DateTime<Utc>,
+    now: DateTime<Utc>,
+    duration: TimeDelta,
+}
+
+/// a tokio task that periodically pings neptune-core rpc server to ensure
+/// the blockchain keeps growing and has not stalled or shortened somehow.
+///
+/// If the rpc call fails, the check is skipped and retried after the usual sleep.
+///
+/// States:
+///   normal: the present tip is higher than at the last check.
+///   warn:   the present tip is same or lower than at the last check.
+///
+/// Whenever the state changes a log message is printed and an email
+/// alert is sent to admin, if admin_email config field is set. In this way,
+/// the site admin gets notified if a problem occurs, and upon recovery.
+pub async fn blockchain_watchdog(app_state: AppState) {
+    let mut last_height: BlockHeight = Default::default();
+    let mut last_blockchain_state = BlockchainState::Normal;
+    let app_started = chrono::offset::Utc::now();
+    let mut since = chrono::offset::Utc::now();
+    let watchdog_secs = app_state.load().config.neptune_blockchain_watchdog_secs;
+
+    debug!("neptune-core blockchain watchdog started");
+
+    loop {
+        let result = app_state
+            .load()
+            .rpc_client
+            .block_height(context::current())
+            .await;
+
+        if let Ok(height) = result {
+            // pick the alert subject; None means the state did not change.
+            let subject = match last_blockchain_state {
+                BlockchainState::Normal if height < last_height => {
+                    Some("alert! ** WARNING ** blockchain height is shrinking")
+                }
+                BlockchainState::Normal if height == last_height => {
+                    Some("alert! ** WARNING ** blockchain height is stalled")
+                }
+                BlockchainState::Warn if height > last_height => {
+                    Some("alert! ** Recovery ** blockchain height is growing again")
+                }
+                _ => None, // no state change
+            };
+
+            if let Some(subject) = subject {
+                let blockchain_state = match last_blockchain_state {
+                    BlockchainState::Normal => BlockchainState::Warn,
+                    BlockchainState::Warn => BlockchainState::Normal,
+                };
+
+                let config = Config::parse();
+                let now = chrono::offset::Utc::now();
+                let duration = now.signed_duration_since(since);
+                let app_duration = now.signed_duration_since(app_started);
+                let body = NeptuneBlockchainAlertEmail {
+                    config,
+                    last_height,
+                    height,
+                    last_blockchain_state,
+                    blockchain_state,
+                    now,
+                    app_started,
+                    app_duration,
+                    since,
+                    duration,
+                }
+                .to_string();
+
+                let msg = format!("alert: neptune-core blockchain status change: previous: {last_blockchain_state}, now: {blockchain_state}. prev_height: {last_height}, now_height: {height}");
+                match blockchain_state {
+                    BlockchainState::Normal => info!("{msg}"),
+                    BlockchainState::Warn => warn!("{msg}"),
+                };
+
+                let _ = alert_email::send(&app_state, subject, body).await;
+
+                // the new state begins now; `since` only moves on a state change.
+                last_blockchain_state = blockchain_state;
+                since = now;
+            }
+            last_height = height;
+        }
+
+        // sleep on every iteration, rpc failures included, so errors cannot busy-loop.
+        tokio::time::sleep(tokio::time::Duration::from_secs(watchdog_secs)).await;
+    }
+}
diff --git a/templates/email/neptune_blockchain_alert.txt b/templates/email/neptune_blockchain_alert.txt
new file mode 100644
index 0000000..93fceef
--- /dev/null
+++ b/templates/email/neptune_blockchain_alert.txt
@@ -0,0 +1,64 @@
+%% if matches!(self.blockchain_state, BlockchainState::Normal) {
+**** ALERT: Neptune Blockchain Height Recovery ****
+%% } else {
+**** ALERT: Neptune Blockchain Height Possible Outage ****
+%% }
+
+site: {{self.config.site_name}} at {{self.config.site_domain}}:{{self.config.listen_port}}
+
+-- Details --
+
+Event: Neptune Blockchain Height Monitor Status Change.
+ +Event Time: {{self.now.to_rfc3339()}} + +Event Description: + +%% if matches!(self.blockchain_state, BlockchainState::Normal) { +The present block height is greater than the height at last check. Service is restored. +%% } else if self.last_height == self.height { +The present block height is equal to the height at last check. +This may indicate a problem with neptune-core. +%% } else { +The present block height is less than the height at last check. +This may indicate a problem with neptune-core. +%% } + +New Status: + blockchain monitor: {{self.blockchain_state}} + last_height: {{self.last_height}} + height: {{self.height}} + Now: {{self.now.to_rfc3339()}} + +Previous Status: + blockchain monitor: {{self.last_blockchain_state}} + Since: {{self.since.to_rfc3339()}} + Duration: {{self.duration}} + +Block Explorer Uptime: + Started: {{self.app_started.to_rfc3339()}} + Duration: {{self.app_duration}} seconds + +Neptune-core RPC: + Host: {{self.config.site_domain}} (localhost) + Port: {{self.config.neptune_rpc_port}} + +Recommended action: + +%% if matches!(self.blockchain_state, BlockchainState::Normal) { + Check neptune-core logs to ensure it is operating correctly. + + No further corrective action should be necessary. +%% } else { + If only one hour has passed since the last block: + + 1. It is possible/likely that a block simply has not been found yet. + 2. Check neptune-core logs to ensure it is operating correctly. + 3. Check other nodes to ensure they are at the same block height. + + if two or more hours have passed since the last block: + 1. Check neptune-core logs to ensure it is operating correctly. + 2. Check other nodes to ensure they are at the same block height. + 3. Consider restarting neptune-core + 4. Consider filing an issue, or alerting neptune-core developers. +%% }