diff --git a/Cargo.lock b/Cargo.lock
index 74835ea..8408497 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -960,6 +960,27 @@ dependencies = [
  "syn 2.0.89",
 ]
 
+[[package]]
+name = "derive_more"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
+dependencies = [
+ "derive_more-impl",
+]
+
+[[package]]
+name = "derive_more-impl"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.89",
+ "unicode-xid",
+]
+
 [[package]]
 name = "diff"
 version = "0.1.13"
@@ -2018,6 +2039,7 @@ dependencies = [
  "boilerplate",
  "chrono",
  "clap",
+ "derive_more",
  "html-escaper",
  "lettre",
  "neptune-cash",
@@ -3268,7 +3290,7 @@ dependencies = [
  "arbitrary",
  "const_format",
  "hex",
- "itertools 0.12.1",
+ "itertools 0.13.0",
  "ndarray",
  "num",
  "num-traits",
diff --git a/Cargo.toml b/Cargo.toml
index 2eabf70..e9f5638 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,6 +34,7 @@ chrono = "0.4.34"
 # only should be used inside main.rs, for the binary.
 anyhow = "1.0.86"
 arc-swap = "1.7.1"
+derive_more = { version = "1.0.0", features = ["display"] }
 
 [patch.crates-io]
 # 694f27daf78aade0ed0dc07e3babaab036cd5572 is tip of branch: master as of 2024-04-30
diff --git a/src/main.rs b/src/main.rs
index 447164a..a5ce3f5 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -36,7 +36,8 @@ async fn main() -> Result<(), anyhow::Error> {
     // this will log warnings if smtp not configured or mis-configured.
     alert_email::check_alert_params();
 
-    tokio::task::spawn(neptune_rpc::watchdog(app_state));
+    tokio::task::spawn(neptune_rpc::watchdog(app_state.clone()));
+    tokio::task::spawn(neptune_rpc::blockchain_watchdog(app_state));
 
     info!("Running on http://localhost:{port}");
 
diff --git a/src/model/config.rs b/src/model/config.rs
index 2e01e62..87a0ac3 100644
--- a/src/model/config.rs
+++ b/src/model/config.rs
@@ -38,6 +38,10 @@ pub struct Config {
     #[clap(long, default_value = "10", value_name = "seconds")]
     pub neptune_rpc_watchdog_secs: u64,
 
+    /// Sets interval in seconds to check that block-height has increased
+    #[clap(long, default_value = "3600", value_name = "seconds")]
+    pub neptune_blockchain_watchdog_secs: u64,
+
     /// admin email for receiving alert emails
     #[arg(long, value_name = "email")]
     pub admin_email: Option<String>,
diff --git a/src/neptune_rpc.rs b/src/neptune_rpc.rs
index 0c8a06d..8724324 100644
--- a/src/neptune_rpc.rs
+++ b/src/neptune_rpc.rs
@@ -6,6 +6,7 @@ use chrono::DateTime;
 use chrono::TimeDelta;
 use chrono::Utc;
 use clap::Parser;
+use neptune_cash::models::blockchain::block::block_height::BlockHeight;
 use neptune_cash::rpc_server::RPCClient;
 use std::net::Ipv4Addr;
 use std::net::SocketAddr;
@@ -111,3 +112,142 @@ pub struct NeptuneRpcAlertEmail {
     now: DateTime<Utc>,
     duration: TimeDelta,
 }
+
+/// Health of the blockchain as judged by [`blockchain_watchdog`].
+#[derive(Clone, Copy, Debug, PartialEq, Eq, derive_more::Display)]
+pub enum BlockchainState {
+    /// the tip was higher than at the previous check.
+    Normal,
+    /// the tip was the same as, or lower than, at the previous check.
+    Warn,
+}
+
+/// Template context for the email sent when the [`BlockchainState`] changes.
+///
+/// Rendered from `templates/email/neptune_blockchain_alert.txt`.
+#[derive(boilerplate::Boilerplate)]
+#[boilerplate(filename = "email/neptune_blockchain_alert.txt")]
+pub struct NeptuneBlockchainAlertEmail {
+    /// explorer configuration, parsed from the command line.
+    config: Config,
+    /// block height observed at the previous successful check.
+    last_height: BlockHeight,
+    /// block height observed at the present check.
+    height: BlockHeight,
+    /// state before this change.
+    last_blockchain_state: BlockchainState,
+    /// state after this change.
+    blockchain_state: BlockchainState,
+    /// when the watchdog task started.
+    app_started: DateTime<Utc>,
+    /// time elapsed between `app_started` and `now`.
+    app_duration: TimeDelta,
+    /// when `last_blockchain_state` was entered.
+    since: DateTime<Utc>,
+    /// when this state change was detected.
+    now: DateTime<Utc>,
+    /// time elapsed between `since` and `now`.
+    duration: TimeDelta,
+}
+
+/// A tokio task that periodically queries the neptune-core rpc server to ensure
+/// the blockchain keeps growing and has not stalled or shortened somehow.
+///
+/// One `block_height` rpc request is made for each timer iteration. If the
+/// request fails, the failure is logged, the state is left unchanged, and the
+/// task waits for the next timer iteration.
+///
+/// States:
+///   normal: the present tip is higher than at the last check.
+///   warn: the present tip is same or lower than at the last check.
+///
+/// Whenever the state changes a log message is printed and an email
+/// alert is sent to admin, if admin_email config field is set. In this way,
+/// the site admin gets notified if a problem occurs, and upon recovery.
+///
+/// This task loops forever and never returns.
+pub async fn blockchain_watchdog(app_state: AppState) {
+    let mut last_height: BlockHeight = Default::default();
+    let mut last_blockchain_state = BlockchainState::Normal;
+    let app_started = chrono::offset::Utc::now();
+    // time at which `last_blockchain_state` was entered.
+    let mut since = app_started;
+    let watchdog_secs = app_state.load().config.neptune_blockchain_watchdog_secs;
+    let interval = tokio::time::Duration::from_secs(watchdog_secs);
+
+    debug!("neptune-core blockchain watchdog started");
+
+    loop {
+        let result = app_state
+            .load()
+            .rpc_client
+            .block_height(context::current())
+            .await;
+
+        match result {
+            Ok(height) => {
+                // a state change occurs only when the height comparison
+                // contradicts the present state.
+                let transition = match last_blockchain_state {
+                    BlockchainState::Normal if height < last_height => Some((
+                        BlockchainState::Warn,
+                        "alert! ** WARNING ** blockchain height is shrinking",
+                    )),
+                    BlockchainState::Normal if height == last_height => Some((
+                        BlockchainState::Warn,
+                        "alert! ** WARNING ** blockchain height is stalled",
+                    )),
+                    BlockchainState::Warn if height > last_height => Some((
+                        BlockchainState::Normal,
+                        "alert! ** Recovery ** blockchain height is growing again",
+                    )),
+                    _ => None, // no state change
+                };
+
+                // send admin alert if there is a state change.
+                if let Some((blockchain_state, subject)) = transition {
+                    let config = Config::parse();
+                    let now = chrono::offset::Utc::now();
+                    let duration = now.signed_duration_since(since);
+                    let app_duration = now.signed_duration_since(app_started);
+                    let body = NeptuneBlockchainAlertEmail {
+                        config,
+                        last_height,
+                        height,
+                        last_blockchain_state,
+                        blockchain_state,
+                        now,
+                        app_started,
+                        app_duration,
+                        since,
+                        duration,
+                    }
+                    .to_string();
+
+                    let msg = format!("alert: neptune-core blockchain status change: previous: {last_blockchain_state}, now: {blockchain_state}. prev_height: {last_height}, now_height: {height}");
+                    match blockchain_state {
+                        BlockchainState::Normal => info!("{msg}"),
+                        BlockchainState::Warn => warn!("{msg}"),
+                    };
+
+                    let _ = alert_email::send(&app_state, subject, body).await;
+
+                    // the new state begins now; `since` must only move on a
+                    // state change so that the next alert reports how long
+                    // this state lasted.
+                    last_blockchain_state = blockchain_state;
+                    since = now;
+                }
+
+                last_height = height;
+            }
+            Err(e) => {
+                warn!("neptune-core blockchain watchdog: block_height rpc failed: {e:?}");
+            }
+        }
+
+        // sleep on every iteration, including after an rpc failure, so that an
+        // unreachable neptune-core is not polled in a tight loop.
+        tokio::time::sleep(interval).await;
+    }
+}
diff --git a/templates/email/neptune_blockchain_alert.txt b/templates/email/neptune_blockchain_alert.txt
new file mode 100644
index 0000000..93fceef
--- /dev/null
+++ b/templates/email/neptune_blockchain_alert.txt
@@ -0,0 +1,64 @@
+%% if matches!(self.blockchain_state, BlockchainState::Normal) {
+**** ALERT: Neptune Blockchain Height Recovery ****
+%% } else {
+**** ALERT: Neptune Blockchain Height Possible Outage ****
+%% }
+
+site: {{self.config.site_name}} at {{self.config.site_domain}}:{{self.config.listen_port}}
+
+-- Details --
+
+Event: Neptune Blockchain Height Monitor Status Change.
+
+Event Time: {{self.now.to_rfc3339()}}
+
+Event Description:
+
+%% if matches!(self.blockchain_state, BlockchainState::Normal) {
+The present block height is greater than the height at last check. Service is restored.
+%% } else if self.last_height == self.height {
+The present block height is equal to the height at last check.
+This may indicate a problem with neptune-core.
+%% } else {
+The present block height is less than the height at last check.
+This may indicate a problem with neptune-core.
+%% }
+
+New Status:
+    blockchain monitor: {{self.blockchain_state}}
+    last_height: {{self.last_height}}
+    height: {{self.height}}
+    Now: {{self.now.to_rfc3339()}}
+
+Previous Status:
+    blockchain monitor: {{self.last_blockchain_state}}
+    Since: {{self.since.to_rfc3339()}}
+    Duration: {{self.duration}}
+
+Block Explorer Uptime:
+    Started: {{self.app_started.to_rfc3339()}}
+    Duration: {{self.app_duration}}
+
+Neptune-core RPC:
+    Host: {{self.config.site_domain}} (localhost)
+    Port: {{self.config.neptune_rpc_port}}
+
+Recommended action:
+
+%% if matches!(self.blockchain_state, BlockchainState::Normal) {
+    Check neptune-core logs to ensure it is operating correctly.
+
+    No further corrective action should be necessary.
+%% } else {
+    If only one hour has passed since the last block:
+
+    1. It is possible/likely that a block simply has not been found yet.
+    2. Check neptune-core logs to ensure it is operating correctly.
+    3. Check other nodes to ensure they are at the same block height.
+
+    If two or more hours have passed since the last block:
+    1. Check neptune-core logs to ensure it is operating correctly.
+    2. Check other nodes to ensure they are at the same block height.
+    3. Consider restarting neptune-core.
+    4. Consider filing an issue, or alerting neptune-core developers.
+%% }