feat: monitor for stalled chain and send alert

closes #7.

Adds a blockchain watchdog that checks every hour (by default) whether the
tip height has advanced.  If the height is less than or equal to the height
at the last check, it sends an alert email and enters warning mode.  In
warning mode it waits until the height is greater than at the previous
check, then sends a recovery alert and switches back to normal mode.
This commit is contained in:
danda 2024-12-31 09:58:40 -08:00
parent dd7e80298d
commit af5496ed11
6 changed files with 203 additions and 2 deletions

24
Cargo.lock generated
View File

@ -960,6 +960,27 @@ dependencies = [
"syn 2.0.89", "syn 2.0.89",
] ]
[[package]]
name = "derive_more"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05"
dependencies = [
"derive_more-impl",
]
[[package]]
name = "derive_more-impl"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.89",
"unicode-xid",
]
[[package]] [[package]]
name = "diff" name = "diff"
version = "0.1.13" version = "0.1.13"
@ -2018,6 +2039,7 @@ dependencies = [
"boilerplate", "boilerplate",
"chrono", "chrono",
"clap", "clap",
"derive_more",
"html-escaper", "html-escaper",
"lettre", "lettre",
"neptune-cash", "neptune-cash",
@ -3268,7 +3290,7 @@ dependencies = [
"arbitrary", "arbitrary",
"const_format", "const_format",
"hex", "hex",
"itertools 0.12.1", "itertools 0.13.0",
"ndarray", "ndarray",
"num", "num",
"num-traits", "num-traits",

View File

@ -34,6 +34,7 @@ chrono = "0.4.34"
# only should be used inside main.rs, for the binary. # only should be used inside main.rs, for the binary.
anyhow = "1.0.86" anyhow = "1.0.86"
arc-swap = "1.7.1" arc-swap = "1.7.1"
derive_more = { version = "1.0.0", features = ["display"] }
[patch.crates-io] [patch.crates-io]
# 694f27daf78aade0ed0dc07e3babaab036cd5572 is tip of branch: master as of 2024-04-30 # 694f27daf78aade0ed0dc07e3babaab036cd5572 is tip of branch: master as of 2024-04-30

View File

@ -36,7 +36,8 @@ async fn main() -> Result<(), anyhow::Error> {
// this will log warnings if smtp not configured or mis-configured. // this will log warnings if smtp not configured or mis-configured.
alert_email::check_alert_params(); alert_email::check_alert_params();
tokio::task::spawn(neptune_rpc::watchdog(app_state)); tokio::task::spawn(neptune_rpc::watchdog(app_state.clone()));
tokio::task::spawn(neptune_rpc::blockchain_watchdog(app_state));
info!("Running on http://localhost:{port}"); info!("Running on http://localhost:{port}");

View File

@ -38,6 +38,10 @@ pub struct Config {
#[clap(long, default_value = "10", value_name = "seconds")] #[clap(long, default_value = "10", value_name = "seconds")]
pub neptune_rpc_watchdog_secs: u64, pub neptune_rpc_watchdog_secs: u64,
/// Sets interval in seconds to check that block-height has increased
#[clap(long, default_value = "3600", value_name = "seconds")]
pub neptune_blockchain_watchdog_secs: u64,
/// admin email for receiving alert emails /// admin email for receiving alert emails
#[arg(long, value_name = "email")] #[arg(long, value_name = "email")]
pub admin_email: Option<String>, pub admin_email: Option<String>,

View File

@ -6,6 +6,7 @@ use chrono::DateTime;
use chrono::TimeDelta; use chrono::TimeDelta;
use chrono::Utc; use chrono::Utc;
use clap::Parser; use clap::Parser;
use neptune_cash::models::blockchain::block::block_height::BlockHeight;
use neptune_cash::rpc_server::RPCClient; use neptune_cash::rpc_server::RPCClient;
use std::net::Ipv4Addr; use std::net::Ipv4Addr;
use std::net::SocketAddr; use std::net::SocketAddr;
@ -111,3 +112,111 @@ pub struct NeptuneRpcAlertEmail {
now: DateTime<Utc>, now: DateTime<Utc>,
duration: TimeDelta, duration: TimeDelta,
} }
/// Health state tracked by the blockchain height watchdog.
///
/// `Display` is derived so the state can be interpolated directly into log
/// messages and into the alert email template.
#[derive(Clone, Copy, derive_more::Display)]
pub enum BlockchainState {
/// The tip height was higher than at the previous check.
Normal,
/// The tip height was the same as, or lower than, at the previous check.
Warn,
}
/// Body of the alert email sent when the blockchain watchdog changes state.
///
/// Rendered to text via the `boilerplate` template
/// `email/neptune_blockchain_alert.txt`, which reads the fields below by name.
#[derive(boilerplate::Boilerplate)]
#[boilerplate(filename = "email/neptune_blockchain_alert.txt")]
pub struct NeptuneBlockchainAlertEmail {
// application configuration; the template reads the site name, domain and ports from it.
config: Config,
// tip height recorded at the previous successful check.
last_height: BlockHeight,
// tip height observed at the check that triggered this alert.
height: BlockHeight,
// watchdog state before this alert.
last_blockchain_state: BlockchainState,
// watchdog state after this alert.
blockchain_state: BlockchainState,
// time at which the watchdog task started.
app_started: DateTime<Utc>,
// elapsed time between `app_started` and `now`.
app_duration: TimeDelta,
// time of the previous successful height check.
since: DateTime<Utc>,
// time at which this alert was generated.
now: DateTime<Utc>,
// elapsed time between `since` and `now`.
duration: TimeDelta,
}
/// A tokio task that periodically asks the neptune-core rpc server for the
/// tip height, to ensure the blockchain keeps growing and has not stalled or
/// shortened somehow.
///
/// One check is made per timer iteration.  The interval is the
/// `neptune_blockchain_watchdog_secs` config value, read once when the task
/// starts.  If the rpc call fails, that check is skipped (the recorded height
/// and state are left untouched) and the task still waits a full interval
/// before trying again.
///
/// States:
///   normal: the present tip is higher than at the last check.
///   warn:   the present tip is same or lower than at the last check.
///
/// Whenever the state changes a log message is printed and an email
/// alert is sent to admin, if admin_email config field is set.  In this way,
/// the site admin gets notified if a problem occurs, and upon recovery.
///
/// This function never returns; it is intended to be spawned as a task.
pub async fn blockchain_watchdog(app_state: AppState) {
    // NOTE(review): the first comparison is made against `BlockHeight`'s
    // default value; presumably that is the genesis height, so a node still
    // at that height would be reported as stalled on the first check -- confirm.
    let mut last_height: BlockHeight = Default::default();
    let mut last_blockchain_state = BlockchainState::Normal;
    let app_started = chrono::offset::Utc::now();
    let mut since = chrono::offset::Utc::now();
    let watchdog_secs = app_state.load().config.neptune_blockchain_watchdog_secs;

    debug!("neptune-core blockchain watchdog started");

    loop {
        let result = app_state
            .load()
            .rpc_client
            .block_height(context::current())
            .await;

        if let Ok(height) = result {
            // A state change is the pair (new state, email subject).
            // `None` means the state is unchanged and no alert is due.
            // The catch-all arm is required because the other arms are guarded.
            let transition = match last_blockchain_state {
                BlockchainState::Normal if height < last_height => Some((
                    BlockchainState::Warn,
                    "alert!  ** WARNING ** blockchain height is shrinking",
                )),
                BlockchainState::Normal if height == last_height => Some((
                    BlockchainState::Warn,
                    "alert!  ** WARNING ** blockchain height is stalled",
                )),
                BlockchainState::Warn if height > last_height => Some((
                    BlockchainState::Normal,
                    "alert!  ** Recovery ** blockchain height is growing again",
                )),
                _ => None,
            };

            // send admin alert if there is a state change.
            if let Some((blockchain_state, subject)) = transition {
                let config = Config::parse();
                let now = chrono::offset::Utc::now();
                let duration = now.signed_duration_since(since);
                let app_duration = now.signed_duration_since(app_started);

                let body = NeptuneBlockchainAlertEmail {
                    config,
                    last_height,
                    height,
                    last_blockchain_state,
                    blockchain_state,
                    now,
                    app_started,
                    app_duration,
                    since,
                    duration,
                }
                .to_string();

                let msg = format!("alert: neptune-core blockchain status change: previous: {last_blockchain_state}, now: {blockchain_state}. prev_height: {last_height}, now_height: {height}");
                match blockchain_state {
                    BlockchainState::Normal => info!("{msg}"),
                    BlockchainState::Warn => warn!("{msg}"),
                };

                // Best effort: a failure to send the email must not stop the watchdog.
                let _ = alert_email::send(&app_state, subject, body).await;

                last_blockchain_state = blockchain_state;
            }

            // update state.  `since` is reset after every successful check, so
            // the `duration` reported in an alert is the time elapsed since the
            // previous successful check.
            last_height = height;
            since = chrono::offset::Utc::now();
        } else {
            debug!("blockchain watchdog: block_height rpc call failed; skipping this check");
        }

        // Sleep on every iteration, including after a failed rpc call.
        // Otherwise an unreachable neptune-core would make this loop spin,
        // re-issuing rpc calls with no delay.
        tokio::time::sleep(tokio::time::Duration::from_secs(watchdog_secs)).await;
    }
}

View File

@ -0,0 +1,64 @@
%% if matches!(self.blockchain_state, BlockchainState::Normal) {
**** ALERT: Neptune Blockchain Height Recovery ****
%% } else {
**** ALERT: Neptune Blockchain Height Possible Outage ****
%% }
site: {{self.config.site_name}} at {{self.config.site_domain}}:{{self.config.listen_port}}
-- Details --
Event: Neptune Blockchain Height Monitor Status Change.
Event Time: {{self.now.to_rfc3339()}}
Event Description:
%% if matches!(self.blockchain_state, BlockchainState::Normal) {
The present block height is greater than the height at last check. Service is restored.
%% } else if self.last_height == self.height {
The present block height is equal to the height at last check.
This may indicate a problem with neptune-core.
%% } else {
The present block height is less than the height at last check.
This may indicate a problem with neptune-core.
%% }
New Status:
blockchain monitor: {{self.blockchain_state}}
last_height: {{self.last_height}}
height: {{self.height}}
Now: {{self.now.to_rfc3339()}}
Previous Status:
blockchain monitor: {{self.last_blockchain_state}}
Since: {{self.since.to_rfc3339()}}
Duration: {{self.duration}}
Block Explorer Uptime:
Started: {{self.app_started.to_rfc3339()}}
Duration: {{self.app_duration.num_seconds()}} seconds
Neptune-core RPC:
Host: {{self.config.site_domain}} (localhost)
Port: {{self.config.neptune_rpc_port}}
Recommended action:
%% if matches!(self.blockchain_state, BlockchainState::Normal) {
Check neptune-core logs to ensure it is operating correctly.
No further corrective action should be necessary.
%% } else {
If only one hour has passed since the last block:
1. It is possible/likely that a block simply has not been found yet.
2. Check neptune-core logs to ensure it is operating correctly.
3. Check other nodes to ensure they are at the same block height.
If two or more hours have passed since the last block:
1. Check neptune-core logs to ensure it is operating correctly.
2. Check other nodes to ensure they are at the same block height.
3. Consider restarting neptune-core
4. Consider filing an issue, or alerting neptune-core developers.
%% }