use env_logger;
use log::LevelFilter;
use log::{error, info, warn};
use rand::seq::IteratorRandom;
use regex::Regex;
use reqwest::Client;
use std::collections::HashSet;
use std::sync::{Arc, Mutex};
use std::time::Duration;
use tokio::{signal, time};
use url::Url;

/// Scrape the explorer website when running locally.
///
/// This program maintains a dictionary of URLs, initially populated with
/// 'http://localhost:3000'. On each iteration it fetches a random URL from the
/// dictionary, logs a success message if the request succeeds, extracts new
/// URLs from the response body and adds them to the dictionary, logs warnings
/// or errors for request failures and timeouts, sleeps briefly, and continues
/// until Ctrl-C is pressed.
///
/// Run with:
/// `> cargo run --bin scraper`
#[tokio::main]
async fn main() {
    // Initialize logger
    env_logger::builder().filter_level(LevelFilter::Info).init();

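    // HTTP client with a 300 ms request timeout, so hung or slow responses
    // surface as timeout warnings in the fetch loop below.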
    let client = Client::builder()
        .timeout(Duration::from_millis(300))
        .build()
        .expect("Failed to build HTTP client");

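    // Shared URL "dictionary", seeded with the local explorer root. It lives in
    // an Arc<Mutex<..>> so the fetch loop can own its own handle to the set.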
    let root_url = "http://localhost:3000".to_string();
    let urls = Arc::new(Mutex::new(HashSet::from([root_url.clone()])));

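    // Capture group 1 of this regex is the href value of each <a> tag in a page.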
    let href_regex = Regex::new(r#"<a\s+(?:[^>]*?\s+)?href=['\"](.*?)['\"]"#).unwrap();

    info!("Starting fetch loop. Press Ctrl-C to stop.");

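    // The loop future takes its own handle to the shared URL set.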
    let urls_clone = Arc::clone(&urls);
    let fetch_loop = async move {
        loop {
            // Pick a random URL; the lock guard is dropped at the end of this
            // block, before any await point.
            let url_opt = {
                let urls_guard = urls_clone.lock().unwrap();
                urls_guard.iter().choose(&mut rand::rng()).cloned()
            };

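            // Fetch the chosen URL and handle the three outcomes: success,
            // non-success status, and request error (timeout or otherwise).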
            if let Some(url) = url_opt {
                match client.get(&url).send().await {
                    Ok(resp) => {
                        if resp.status().is_success() {
                            match resp.text().await {
                                Ok(text) => {
                                    info!("Success fetching {}", url);
                                    let mut urls_guard = urls_clone.lock().unwrap();
                                    for cap in href_regex.captures_iter(&text) {
                                        let href = &cap[1];
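                                        // Hrefs are resolved by prefixing the root
                                        // URL, which assumes root-relative links
                                        // like "/path"; anything that does not
                                        // parse as a URL is skipped.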
                                        if let Ok(parsed_url) =
                                            Url::parse(&[root_url.as_str(), href].concat())
                                        {
                                            let normalized = parsed_url.as_str();
                                            if urls_guard.insert(normalized.to_owned()) {
                                                info!(
                                                    "Added new URL to dictionary: {}",
                                                    normalized
                                                );
                                            }
                                        }
                                    }
                                }
                                Err(e) => {
                                    warn!("Failed to read response body from {}: {}", url, e);
                                }
                            }
                        } else {
                            warn!("Non-success status {} from {}", resp.status(), url);
                        }
                    }
                    Err(err) => {
                        if err.is_timeout() {
                            warn!("Timeout fetching {}", url);
                        } else {
                            error!("Error fetching {}: {}", url, err);
                        }
                    }
                }
            } else {
                warn!("URL dictionary is empty, no URL to fetch");
            }

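            // Throttle: wait half a second between requests.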
            time::sleep(Duration::from_millis(500)).await;
        }
    };

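    // Run the fetch loop until Ctrl-C: whichever branch finishes first wins the
    // select!, so Ctrl-C drops (and thereby stops) the loop future.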
    tokio::select! {
        _ = fetch_loop => {}, // This runs indefinitely unless stopped
        _ = signal::ctrl_c() => {
            info!("Ctrl-C received, stopping...");
        }
    }
}