diff --git a/CHANGELOG.md b/CHANGELOG.md index 13d68cd..2d234aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Highlights + +- Added a hardened outbound profile for cluster and data-plane deployments +- Ambient proxy environment variables are now ignored by default +- Added hostname, port, and redirect restrictions for tighter egress policy + +### What's Changed + +- `fix(security): harden outbound fetch policy and add deployment guidance` + ## [0.1.3] - 2026-03-12 ### Highlights diff --git a/README.md b/README.md index 558a697..3d8ac2e 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,9 @@ fetchkit fetch https://example.com -o json # Custom user agent fetchkit fetch https://example.com --user-agent "MyBot/1.0" +# Hardened outbound policy for cluster/data-plane use +fetchkit fetch https://example.com --hardened + # Show full documentation fetchkit --llmtxt ``` @@ -85,6 +88,9 @@ Run as a Model Context Protocol server: ```bash fetchkit mcp + +# Hardened profile for cluster/data-plane use +fetchkit mcp --hardened ``` Exposes `fetchkit` tool over JSON-RPC 2.0 stdio transport. Returns markdown with frontmatter (same format as CLI). Compatible with Claude Desktop and other MCP clients. @@ -129,6 +135,17 @@ let request = FetchRequest::new("https://example.com"); let response = tool.execute(request).await.unwrap(); ``` +### Hardened Tool Profile + +```rust +use fetchkit::Tool; + +let tool = Tool::builder() + .hardened() + .allow_prefix("https://docs.example.com") + .build(); +``` + ## Python Bindings ```bash @@ -197,9 +214,10 @@ let tool = Tool::builder() DNS pinning prevents DNS rebinding attacks. IPv6-mapped IPv4 addresses are canonicalized before validation. Redirects are followed manually in the default fetcher so each hop is revalidated against scheme and DNS policy. 
Allow/block prefixes are matched against parsed URLs rather than raw strings, which prevents lookalike host overmatches such as `allowed.example.com.evil.test`. -Proxy environment variables are ignored by default; opt in with `ToolBuilder::respect_proxy_env(true)` only when you intentionally want `HTTP_PROXY`/`HTTPS_PROXY` routing. +Proxy environment variables are ignored by default. Use the hardened profile for cluster-facing deployments and opt in with `ToolBuilder::respect_proxy_env(true)` only when it is part of an intentional egress design. See [`specs/threat-model.md`](specs/threat-model.md) for the full threat model. +See [`docs/hardening.md`](docs/hardening.md) for deployment guidance. ## Configuration diff --git a/crates/fetchkit-cli/src/main.rs b/crates/fetchkit-cli/src/main.rs index 41767b0..aaadf62 100644 --- a/crates/fetchkit-cli/src/main.rs +++ b/crates/fetchkit-cli/src/main.rs @@ -43,7 +43,15 @@ struct Cli { #[derive(Subcommand, Debug)] enum Commands { /// Run as MCP (Model Context Protocol) server over stdio - Mcp, + Mcp { + /// Apply the hardened outbound policy profile + #[arg(long)] + hardened: bool, + + /// Allow HTTP_PROXY/HTTPS_PROXY/NO_PROXY from the environment + #[arg(long)] + allow_env_proxy: bool, + }, /// Fetch URL and output as markdown with metadata frontmatter Fetch { /// URL to fetch @@ -56,6 +64,14 @@ enum Commands { /// Custom User-Agent #[arg(long)] user_agent: Option, + + /// Apply the hardened outbound policy profile + #[arg(long)] + hardened: bool, + + /// Allow HTTP_PROXY/HTTPS_PROXY/NO_PROXY from the environment + #[arg(long)] + allow_env_proxy: bool, }, } @@ -70,15 +86,20 @@ async fn main() { } match cli.command { - Some(Commands::Mcp) => { - mcp::run_server().await; + Some(Commands::Mcp { + hardened, + allow_env_proxy, + }) => { + mcp::run_server(build_tool(None, hardened, allow_env_proxy)).await; } Some(Commands::Fetch { url, output, user_agent, + hardened, + allow_env_proxy, }) => { - run_fetch(&url, output, 
user_agent).await; + run_fetch(&url, output, user_agent, hardened, allow_env_proxy).await; } None => { eprintln!("Usage: fetchkit fetch "); @@ -89,18 +110,34 @@ async fn main() { } } -async fn run_fetch(url: &str, output: OutputFormat, user_agent: Option) { - // Build request with markdown conversion - let request = FetchRequest::new(url).as_markdown(); - - // Build tool +fn build_tool(user_agent: Option, hardened: bool, allow_env_proxy: bool) -> Tool { let mut builder = Tool::builder().enable_markdown(true); + if hardened { + builder = builder.hardened(); + } + + if allow_env_proxy { + builder = builder.use_env_proxy(true); + } + if let Some(ua) = user_agent { builder = builder.user_agent(ua); } - let tool = builder.build(); + builder.build() +} + +async fn run_fetch( + url: &str, + output: OutputFormat, + user_agent: Option, + hardened: bool, + allow_env_proxy: bool, +) { + // Build request with markdown conversion + let request = FetchRequest::new(url).as_markdown(); + let tool = build_tool(user_agent, hardened, allow_env_proxy); // Execute request match tool.execute(request).await { diff --git a/crates/fetchkit-cli/src/mcp.rs b/crates/fetchkit-cli/src/mcp.rs index 0700af2..79f6441 100644 --- a/crates/fetchkit-cli/src/mcp.rs +++ b/crates/fetchkit-cli/src/mcp.rs @@ -67,10 +67,8 @@ struct McpServer { } impl McpServer { - fn new() -> Self { - Self { - tool: Tool::default(), - } + fn new(tool: Tool) -> Self { + Self { tool } } async fn handle_request(&self, request: JsonRpcRequest) -> JsonRpcResponse { @@ -222,8 +220,8 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String { } /// Run the MCP server over stdio -pub async fn run_server() { - let server = McpServer::new(); +pub async fn run_server(tool: Tool) { + let server = McpServer::new(tool); let stdin = io::stdin(); let mut stdout = io::stdout(); diff --git a/crates/fetchkit-cli/tests/cli_integration.rs b/crates/fetchkit-cli/tests/cli_integration.rs index cb35319..469b464 100644 --- 
a/crates/fetchkit-cli/tests/cli_integration.rs +++ b/crates/fetchkit-cli/tests/cli_integration.rs @@ -152,6 +152,32 @@ fn test_help_flag() { assert!(stdout.contains("fetch") || stdout.contains("mcp")); } +#[test] +fn test_fetch_help_lists_hardening_flags() { + let output = Command::new(fetchkit_bin()) + .args(["fetch", "--help"]) + .output() + .expect("failed to run fetchkit"); + + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(output.status.success()); + assert!(stdout.contains("--hardened")); + assert!(stdout.contains("--allow-env-proxy")); +} + +#[test] +fn test_mcp_help_lists_hardening_flags() { + let output = Command::new(fetchkit_bin()) + .args(["mcp", "--help"]) + .output() + .expect("failed to run fetchkit"); + + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(output.status.success()); + assert!(stdout.contains("--hardened")); + assert!(stdout.contains("--allow-env-proxy")); +} + // ============================================================================ // --version flag // ============================================================================ diff --git a/crates/fetchkit-python/src/lib.rs b/crates/fetchkit-python/src/lib.rs index a14ebac..8ceceac 100644 --- a/crates/fetchkit-python/src/lib.rs +++ b/crates/fetchkit-python/src/lib.rs @@ -177,7 +177,21 @@ pub struct PyFetchKitTool { impl PyFetchKitTool { /// Create a new tool with default options #[new] - #[pyo3(signature = (enable_markdown=true, enable_text=true, user_agent=None, allow_prefixes=None, block_prefixes=None, max_body_size=None, respect_proxy_env=false))] + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = ( + enable_markdown=true, + enable_text=true, + user_agent=None, + allow_prefixes=None, + block_prefixes=None, + max_body_size=None, + block_private_ips=true, + respect_proxy_env=false, + allowed_ports=None, + blocked_hosts=None, + same_host_redirects_only=false, + hardened=false + ))] fn new( enable_markdown: bool, enable_text: bool, @@ 
-185,13 +199,26 @@ impl PyFetchKitTool { allow_prefixes: Option>, block_prefixes: Option>, max_body_size: Option, + block_private_ips: bool, respect_proxy_env: bool, + allowed_ports: Option>, + blocked_hosts: Option>, + same_host_redirects_only: bool, + hardened: bool, ) -> PyResult { let mut builder = ToolBuilder::new() .enable_markdown(enable_markdown) .enable_text(enable_text) .respect_proxy_env(respect_proxy_env); + if hardened { + builder = builder.hardened(); + } + + builder = builder + .block_private_ips(block_private_ips) + .same_host_redirects_only(same_host_redirects_only); + if let Some(ua) = user_agent { builder = builder.user_agent(ua); } @@ -212,6 +239,22 @@ impl PyFetchKitTool { builder = builder.max_body_size(max_bytes); } + if let Some(ports) = allowed_ports { + for port in ports { + builder = builder.allow_port(port); + } + } + + if let Some(hosts) = blocked_hosts { + for host in hosts { + builder = if host.starts_with('.') { + builder.block_host_suffix(host) + } else { + builder.block_host(host) + }; + } + } + let runtime = tokio::runtime::Runtime::new() .map_err(|e| PyValueError::new_err(format!("Failed to create runtime: {}", e)))?; @@ -280,7 +323,9 @@ fn fetch( as_markdown: Option, as_text: Option, ) -> PyResult { - let tool = PyFetchKitTool::new(true, true, None, None, None, None, false)?; + let tool = PyFetchKitTool::new( + true, true, None, None, None, None, true, false, None, None, false, false, + )?; tool.fetch(url, method, as_markdown, as_text) } diff --git a/crates/fetchkit/src/client.rs b/crates/fetchkit/src/client.rs index 4cb4d4f..b1bf9af 100644 --- a/crates/fetchkit/src/client.rs +++ b/crates/fetchkit/src/client.rs @@ -7,6 +7,7 @@ use crate::dns::DnsPolicy; use crate::error::FetchError; use crate::fetchers::FetcherRegistry; use crate::types::{FetchRequest, FetchResponse}; +use url::Url; /// Fetch options that can be configured via tool builder #[derive(Debug, Clone, Default)] @@ -30,6 +31,80 @@ pub struct FetchOptions { pub 
enable_save_to_file: bool, /// Whether to respect HTTP_PROXY/HTTPS_PROXY/NO_PROXY from the environment pub respect_proxy_env: bool, + /// Restrict outbound requests to these ports. Empty means any port. + pub allowed_ports: Vec, + /// Block exact hosts and suffix rules. Leading '.' means suffix match. + pub blocked_hosts: Vec, + /// Restrict redirects to the original host only. + pub same_host_redirects_only: bool, +} + +impl FetchOptions { + pub(crate) fn validate_url(&self, url: &Url) -> Result<(), FetchError> { + self.validate_host(url)?; + self.validate_port(url)?; + Ok(()) + } + + pub(crate) fn validate_redirect_target( + &self, + current_url: &Url, + next_url: &Url, + ) -> Result<(), FetchError> { + self.validate_url(next_url)?; + + if self.same_host_redirects_only + && normalized_host(current_url) != normalized_host(next_url) + { + return Err(FetchError::BlockedUrl); + } + + Ok(()) + } + + fn validate_host(&self, url: &Url) -> Result<(), FetchError> { + let Some(host) = normalized_host(url) else { + return Ok(()); + }; + + if self + .blocked_hosts + .iter() + .any(|rule| host_matches_rule(&host, rule)) + { + return Err(FetchError::BlockedUrl); + } + + Ok(()) + } + + fn validate_port(&self, url: &Url) -> Result<(), FetchError> { + if self.allowed_ports.is_empty() { + return Ok(()); + } + + let port = url.port_or_known_default().ok_or(FetchError::BlockedUrl)?; + if self.allowed_ports.contains(&port) { + Ok(()) + } else { + Err(FetchError::BlockedUrl) + } + } +} + +fn normalized_host(url: &Url) -> Option { + url.host_str() + .map(|host| host.trim_end_matches('.').to_ascii_lowercase()) +} + +fn host_matches_rule(host: &str, rule: &str) -> bool { + let normalized_rule = rule.trim_end_matches('.').to_ascii_lowercase(); + + if let Some(suffix) = normalized_rule.strip_prefix('.') { + host == suffix || host.ends_with(&format!(".{suffix}")) + } else { + host == normalized_rule + } } /// Fetch a URL and return the response @@ -107,5 +182,45 @@ mod tests { 
assert!(options.max_body_size.is_none()); assert!(!options.enable_save_to_file); assert!(!options.respect_proxy_env); + assert!(options.allowed_ports.is_empty()); + assert!(options.blocked_hosts.is_empty()); + assert!(!options.same_host_redirects_only); + } + + #[test] + fn test_validate_url_blocks_configured_host_and_port() { + let options = FetchOptions { + allowed_ports: vec![443], + blocked_hosts: vec!["localhost".to_string(), ".internal".to_string()], + ..Default::default() + }; + + assert!(matches!( + options.validate_url(&Url::parse("https://api.internal").unwrap()), + Err(FetchError::BlockedUrl) + )); + assert!(matches!( + options.validate_url(&Url::parse("https://example.com:8443").unwrap()), + Err(FetchError::BlockedUrl) + )); + assert!(options + .validate_url(&Url::parse("https://example.com").unwrap()) + .is_ok()); + } + + #[test] + fn test_validate_redirect_target_blocks_cross_host_when_enabled() { + let options = FetchOptions { + same_host_redirects_only: true, + ..Default::default() + }; + + let current = Url::parse("https://example.com/start").unwrap(); + let next = Url::parse("https://www.example.com/end").unwrap(); + + assert!(matches!( + options.validate_redirect_target(&current, &next), + Err(FetchError::BlockedUrl) + )); } } diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index 31fccf3..b3324d5 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -337,7 +337,7 @@ async fn send_request_following_redirects( .await .map_err(FetchError::from_reqwest)?; - let Some(next_url) = redirect_target(&current_url, &response)? else { + let Some(next_url) = redirect_target(&current_url, &response, options)? 
else { return Ok(response); }; @@ -394,6 +394,7 @@ fn build_client_for_url( fn redirect_target( base_url: &Url, response: &reqwest::Response, + options: &FetchOptions, ) -> Result, FetchError> { if !response.status().is_redirection() { return Ok(None); @@ -419,6 +420,8 @@ fn redirect_target( return Err(FetchError::InvalidUrlScheme); } + options.validate_redirect_target(base_url, &next_url)?; + Ok(Some(next_url)) } @@ -658,7 +661,7 @@ mod tests { let base_url = Url::parse(&format!("{}/start", origin.uri())).unwrap(); let response = client.get(base_url.clone()).send().await.unwrap(); - let redirect = redirect_target(&base_url, &response).unwrap(); + let redirect = redirect_target(&base_url, &response, &FetchOptions::default()).unwrap(); assert_eq!( redirect.unwrap(), Url::parse(&format!("{}/final", origin.uri())).unwrap() @@ -683,7 +686,7 @@ mod tests { let base_url = Url::parse(&format!("{}/start", origin.uri())).unwrap(); let response = client.get(base_url.clone()).send().await.unwrap(); - let redirect = redirect_target(&base_url, &response); + let redirect = redirect_target(&base_url, &response, &FetchOptions::default()); assert!(matches!(redirect, Err(FetchError::InvalidUrlScheme))); } } diff --git a/crates/fetchkit/src/fetchers/mod.rs b/crates/fetchkit/src/fetchers/mod.rs index 76ca6e7..9d56f69 100644 --- a/crates/fetchkit/src/fetchers/mod.rs +++ b/crates/fetchkit/src/fetchers/mod.rs @@ -141,6 +141,8 @@ impl FetcherRegistry { let parsed_url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; + options.validate_url(&parsed_url)?; + // THREAT[TM-INPUT-002]: Normalize URL before prefix matching to prevent // encoding-based bypasses (case, trailing dots, default ports) // THREAT[TM-INPUT-007]: URL-aware prefix matching prevents subdomain tricks diff --git a/crates/fetchkit/src/tool.rs b/crates/fetchkit/src/tool.rs index 2ef21f4..548d5ef 100644 --- a/crates/fetchkit/src/tool.rs +++ b/crates/fetchkit/src/tool.rs @@ -93,6 +93,12 @@ pub struct 
ToolBuilder { enable_save_to_file: bool, /// Whether to honor proxy environment variables respect_proxy_env: bool, + /// Restrict outbound requests to these ports. Empty means any port. + allowed_ports: Vec, + /// Block exact hosts and suffix rules. Leading '.' means suffix match. + blocked_hosts: Vec, + /// Restrict redirects to the original host only. + same_host_redirects_only: bool, } impl ToolBuilder { @@ -152,6 +158,40 @@ impl ToolBuilder { self } + /// Allow outbound requests to a specific port. + /// + /// If no ports are configured, any URL port is allowed. + pub fn allow_port(mut self, port: u16) -> Self { + if !self.allowed_ports.contains(&port) { + self.allowed_ports.push(port); + } + self + } + + /// Block an exact hostname before DNS resolution. + pub fn block_host(mut self, host: impl Into) -> Self { + self.blocked_hosts.push(host.into()); + self + } + + /// Block a hostname suffix before DNS resolution. + /// + /// Suffixes should usually start with `.` such as `.cluster.local`. + pub fn block_host_suffix(mut self, suffix: impl Into) -> Self { + let mut suffix = suffix.into(); + if !suffix.starts_with('.') { + suffix.insert(0, '.'); + } + self.blocked_hosts.push(suffix); + self + } + + /// Restrict redirects to the original host only. + pub fn same_host_redirects_only(mut self, enable: bool) -> Self { + self.same_host_redirects_only = enable; + self + } + /// Control private/reserved IP range blocking (SSRF prevention) /// /// Enabled by default. When enabled, FetchKit resolves hostnames to IP @@ -178,6 +218,32 @@ impl ToolBuilder { self } + /// Alias for [`respect_proxy_env`](Self::respect_proxy_env). + pub fn use_env_proxy(mut self, enable: bool) -> Self { + self.respect_proxy_env = enable; + self + } + + /// Apply a production-oriented hardening profile. 
+ /// + /// This preset keeps private IP blocking enabled, ignores ambient proxy + /// environment variables, restricts outbound traffic to ports 80 and 443, + /// blocks common internal DNS suffixes, and only follows same-host redirects. + pub fn hardened(mut self) -> Self { + self = self + .block_private_ips(true) + .use_env_proxy(false) + .allow_port(80) + .allow_port(443) + .block_host("localhost") + .block_host_suffix(".local") + .block_host_suffix(".internal") + .block_host_suffix(".svc") + .block_host_suffix(".cluster.local") + .same_host_redirects_only(true); + self + } + /// Build the tool pub fn build(self) -> Tool { Tool { @@ -190,6 +256,9 @@ impl ToolBuilder { max_body_size: self.max_body_size, enable_save_to_file: self.enable_save_to_file, respect_proxy_env: self.respect_proxy_env, + allowed_ports: self.allowed_ports, + blocked_hosts: self.blocked_hosts, + same_host_redirects_only: self.same_host_redirects_only, } } } @@ -235,6 +304,9 @@ pub struct Tool { max_body_size: Option, enable_save_to_file: bool, respect_proxy_env: bool, + allowed_ports: Vec, + blocked_hosts: Vec, + same_host_redirects_only: bool, } impl Default for Tool { @@ -343,6 +415,9 @@ impl Tool { max_body_size: self.max_body_size, enable_save_to_file: self.enable_save_to_file, respect_proxy_env: self.respect_proxy_env, + allowed_ports: self.allowed_ports.clone(), + blocked_hosts: self.blocked_hosts.clone(), + same_host_redirects_only: self.same_host_redirects_only, } } @@ -406,6 +481,9 @@ mod tests { assert_eq!(tool.max_body_size, Some(1024)); assert!(!tool.enable_save_to_file); assert!(tool.respect_proxy_env); + assert!(tool.allowed_ports.is_empty()); + assert!(tool.blocked_hosts.is_empty()); + assert!(!tool.same_host_redirects_only); } #[test] @@ -420,6 +498,21 @@ mod tests { assert!(tool.max_body_size.is_none()); assert!(!tool.enable_save_to_file); assert!(!tool.respect_proxy_env); + assert!(tool.allowed_ports.is_empty()); + assert!(tool.blocked_hosts.is_empty()); + 
assert!(!tool.same_host_redirects_only); + } + + #[test] + fn test_tool_builder_hardened_profile() { + let tool = Tool::builder().hardened().build(); + + assert!(tool.dns_policy.block_private); + assert!(!tool.respect_proxy_env); + assert_eq!(tool.allowed_ports, vec![80, 443]); + assert!(tool.blocked_hosts.contains(&"localhost".to_string())); + assert!(tool.blocked_hosts.contains(&".cluster.local".to_string())); + assert!(tool.same_host_redirects_only); } #[test] diff --git a/crates/fetchkit/tests/ssrf_security.rs b/crates/fetchkit/tests/ssrf_security.rs index 7003e90..a695566 100644 --- a/crates/fetchkit/tests/ssrf_security.rs +++ b/crates/fetchkit/tests/ssrf_security.rs @@ -8,9 +8,78 @@ //! Tests that need loopback (wiremock) must explicitly opt out. use fetchkit::{FetchError, FetchRequest, Tool}; +use std::env; +use std::sync::{Mutex, OnceLock}; +use std::time::Duration; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::TcpListener; +use tokio::sync::oneshot; +use tokio::time::timeout; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, ResponseTemplate}; +fn proxy_env_lock() -> &'static Mutex<()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())) +} + +struct ProxyEnvGuard { + http_proxy: Option, + https_proxy: Option, + no_proxy: Option, +} + +impl ProxyEnvGuard { + fn set(proxy_url: &str) -> Self { + let guard = Self { + http_proxy: env::var("HTTP_PROXY").ok(), + https_proxy: env::var("HTTPS_PROXY").ok(), + no_proxy: env::var("NO_PROXY").ok(), + }; + + env::set_var("HTTP_PROXY", proxy_url); + env::set_var("HTTPS_PROXY", proxy_url); + env::remove_var("NO_PROXY"); + + guard + } +} + +impl Drop for ProxyEnvGuard { + fn drop(&mut self) { + restore_env_var("HTTP_PROXY", self.http_proxy.as_deref()); + restore_env_var("HTTPS_PROXY", self.https_proxy.as_deref()); + restore_env_var("NO_PROXY", self.no_proxy.as_deref()); + } +} + +fn restore_env_var(key: &str, value: Option<&str>) { + if let 
Some(value) = value { + env::set_var(key, value); + } else { + env::remove_var(key); + } +} + +async fn spawn_test_proxy() -> (String, oneshot::Receiver<()>) { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let (tx, rx) = oneshot::channel(); + + tokio::spawn(async move { + if let Ok(Ok((mut stream, _))) = timeout(Duration::from_secs(2), listener.accept()).await { + let mut buf = [0_u8; 1024]; + let _ = stream.read(&mut buf).await; + let _ = stream + .write_all(b"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\n\r\n") + .await; + let _ = tx.send(()); + } + }); + + (format!("http://{}", addr), rx) +} + // ============================================================================ // TM-SSRF-001: Private IP access via URL (blocked by default) // ============================================================================ @@ -374,6 +443,112 @@ async fn test_ssrf_010_redirect_scheme_validation() { assert!(matches!(result, Err(FetchError::InvalidUrlScheme))); } +#[tokio::test] +async fn test_ssrf_010_same_host_redirect_policy_blocks_cross_host_redirect() { + let mock_server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/redirect")) + .respond_with( + ResponseTemplate::new(302).insert_header("Location", "https://other.example/final"), + ) + .mount(&mock_server) + .await; + + let tool = Tool::builder() + .block_private_ips(false) + .same_host_redirects_only(true) + .build(); + let req = FetchRequest::new(format!("{}/redirect", mock_server.uri())); + let result = tool.execute(req).await; + + assert!(matches!(result, Err(FetchError::BlockedUrl))); +} + +// ============================================================================ +// TM-NET-004: Ambient proxy environment variables +// ============================================================================ + +#[tokio::test] +#[allow(clippy::await_holding_lock)] +async fn test_net_004_env_proxy_ignored_by_default() { + let 
_lock = proxy_env_lock().lock().unwrap(); + let (proxy_url, proxy_hit) = spawn_test_proxy().await; + let _env = ProxyEnvGuard::set(&proxy_url); + + let mock_server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("direct") + .insert_header("content-type", "text/plain"), + ) + .mount(&mock_server) + .await; + + let tool = Tool::builder().block_private_ips(false).build(); + let req = FetchRequest::new(format!("{}/", mock_server.uri())); + let response = tool.execute(req).await.unwrap(); + + assert_eq!(response.status_code, 200); + assert_eq!(response.content.as_deref(), Some("direct")); + assert!(timeout(Duration::from_millis(300), proxy_hit) + .await + .is_err()); +} + +#[tokio::test] +#[allow(clippy::await_holding_lock)] +async fn test_net_004_env_proxy_can_be_opted_in() { + let _lock = proxy_env_lock().lock().unwrap(); + let (proxy_url, proxy_hit) = spawn_test_proxy().await; + let _env = ProxyEnvGuard::set(&proxy_url); + + let mock_server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("direct") + .insert_header("content-type", "text/plain"), + ) + .mount(&mock_server) + .await; + + let tool = Tool::builder() + .block_private_ips(false) + .use_env_proxy(true) + .build(); + let req = FetchRequest::new(format!("{}/", mock_server.uri())); + let response = tool.execute(req).await.unwrap(); + + assert_eq!(response.status_code, 502); + assert!(timeout(Duration::from_secs(1), proxy_hit).await.is_ok()); +} + +// ============================================================================ +// Hardened profile: host and port restrictions +// ============================================================================ + +#[tokio::test] +async fn test_hardened_profile_blocks_internal_hostname_suffixes() { + let tool = Tool::builder().hardened().build(); + let req = 
FetchRequest::new("https://api.default.svc/status"); + let result = tool.execute(req).await; + + assert!(matches!(result, Err(FetchError::BlockedUrl))); +} + +#[tokio::test] +async fn test_hardened_profile_blocks_non_standard_ports() { + let tool = Tool::builder().hardened().build(); + let req = FetchRequest::new("https://example.com:8443/"); + let result = tool.execute(req).await; + + assert!(matches!(result, Err(FetchError::BlockedUrl))); +} + // ============================================================================ // TM-DOS-001: Max body size limit // ============================================================================ diff --git a/docs/hardening.md b/docs/hardening.md new file mode 100644 index 0000000..7c88c89 --- /dev/null +++ b/docs/hardening.md @@ -0,0 +1,97 @@ +# fetchkit Hardening Guide + +This guide is for operators running `fetchkit` in shared clusters, AI agent data planes, or other environments where untrusted users can influence fetched URLs. + +## What fetchkit can enforce + +`fetchkit` can harden its own outbound fetch path: + +- blocks private and reserved IP ranges by default +- revalidates every redirect hop +- pins DNS to the validated IP to reduce DNS rebinding risk +- ignores `HTTP_PROXY`, `HTTPS_PROXY`, and `NO_PROXY` by default +- can block internal hostnames before DNS resolution +- can restrict outbound traffic to an allowed port set +- can restrict redirects to the original host only + +Use the hardened profile for cluster-facing deployments: + +```rust +use fetchkit::{FetchRequest, Tool}; + +let tool = Tool::builder() + .hardened() + .allow_prefix("https://docs.example.com") + .build(); + +let response = tool + .execute(FetchRequest::new("https://docs.example.com").as_markdown()) + .await?; +``` + +The hardened profile does all of this: + +- keeps private IP blocking enabled +- ignores ambient proxy environment variables +- allows only ports `80` and `443` +- blocks `localhost`, `.local`, `.internal`, `.svc`, and 
`.cluster.local` +- only follows same-host redirects + +CLI and MCP equivalents: + +```bash +fetchkit fetch https://example.com --hardened +fetchkit mcp --hardened +``` + +## What fetchkit cannot guarantee + +`fetchkit` cannot guarantee that the pod, container, or VM itself has no internal network reachability. If the runtime can open arbitrary sockets to internal services, an infrastructure mistake can still expose that path outside of `fetchkit`. + +Treat library checks as defense in depth, not the only boundary. + +## Recommended deployment pattern + +For cluster deployments, use both application policy and network policy: + +1. Run `fetchkit` in a dedicated namespace or workload class. +2. Deny direct egress from `fetchkit` except to DNS and a dedicated egress proxy. +3. Make the egress proxy the only component allowed to reach the public Internet. +4. Block RFC1918 ranges, cluster pod/service CIDRs, link-local ranges, loopback, and metadata endpoints at the proxy or network layer. +5. Keep `fetchkit` hardening enabled inside the application. + +This gives you two independent checks: + +- `fetchkit` rejects obviously unsafe targets before dialing. +- the network path still cannot reach internal addresses if application policy is bypassed or misconfigured. + +## Proxy guidance + +By default, `fetchkit` ignores `HTTP_PROXY`, `HTTPS_PROXY`, and `NO_PROXY`. This is intentional. In cluster environments, inherited proxy variables can silently route requests around your expected enforcement path. + +Only opt in to proxy environment variables if that proxy is part of your intended design: + +```rust +let tool = Tool::builder() + .hardened() + .respect_proxy_env(true) + .build(); +``` + +```bash +fetchkit fetch https://example.com --hardened --allow-env-proxy +``` + +If you need a proxy, prefer a dedicated egress proxy with explicit policy over ambient proxy settings that every process inherits. 
+ +## Recommended app policy + +For Internet-facing fetching from untrusted input: + +- keep `block_private_ips(true)` +- use `.hardened()` +- add `allow_prefix(...)` if you know the domains ahead of time +- keep `same_host_redirects_only(true)` unless cross-host redirects are required +- only opt in to `respect_proxy_env(true)` when the proxy is deliberate and hardened + +If you must fetch nonstandard ports or internal-looking public domains, start from `.hardened()` and then add the narrowest exception you need. diff --git a/specs/initial.md b/specs/initial.md index b223fdb..526af1d 100644 --- a/specs/initial.md +++ b/specs/initial.md @@ -50,11 +50,15 @@ Provide a builder to configure tool options, including: - Support User-Agent override (e.g., `user_agent`). - Support `block_private_ips(bool)` for SSRF prevention (default: `true`). - Support `max_body_size(usize)` for bounded response bodies. +- Support `respect_proxy_env(bool)` to opt in to `HTTP_PROXY` / `HTTPS_PROXY` / `NO_PROXY` + (default: `false`). +- Support port allow-listing via repeated `allow_port(u16)` calls. +- Support hostname blocking before DNS via exact host rules and suffix rules. +- Support `same_host_redirects_only(bool)` for stricter redirect handling. +- Support `hardened()` preset for production-facing data plane deployments. - Support `enable_save_to_file(bool)` for file download (default: `false`). When enabled, adds `save_to_file` to input schema and `saved_path`/`bytes_written` to output. Requires a `FileSaver` implementation at execution time. -- Support `respect_proxy_env(bool)` to opt into `HTTP_PROXY`/`HTTPS_PROXY` - inheritance (default: `false`). 
#### Types @@ -111,7 +116,12 @@ Provide a builder to configure tool options, including: - `` (positional, required) - `--output ` / `-o` (optional, default `md`) - `--user-agent ` (optional, overrides default User-Agent) + - `--hardened` (optional, applies the hardened outbound policy profile) + - `--allow-env-proxy` (optional, opt in to `HTTP_PROXY` / `HTTPS_PROXY` / `NO_PROXY`) - `--help` (standard help) +- MCP subcommand options: + - `--hardened` (optional, applies the hardened outbound policy profile) + - `--allow-env-proxy` (optional, opt in to `HTTP_PROXY` / `HTTPS_PROXY` / `NO_PROXY`) - Global options: - `--llmtxt` (full help with examples and tool details) - `--help` (standard help) @@ -149,6 +159,8 @@ Provide a builder to configure tool options, including: - Matching is URL-aware: scheme and host are normalized, trailing dots are ignored, path matches respect segment boundaries, and an explicit prefix port must match. If the prefix omits a port, any port on the same scheme+host matches. +- Exact host and hostname suffix block rules (if configured) are applied before DNS resolution. +- If one or more allowed ports are configured, the URL port must match one of them. ### SSRF Prevention (DNS Policy) @@ -165,8 +177,8 @@ By default, FetchKit blocks connections to private/reserved IP ranges: - User-Agent: configurable via tool builder or CLI/MCP/Python options (default `Everruns FetchKit/1.0`). -- Proxy env vars are ignored by default. Callers must opt in via - `ToolBuilder::respect_proxy_env(true)` if they need environment-configured proxies. +- Ambient proxy environment variables are ignored by default. + - Opt in via `ToolBuilder::respect_proxy_env(true)` or CLI `--allow-env-proxy`. - Accept header: - Markdown: `text/html, text/markdown, text/plain, */*;q=0.8` - Text: `text/html, text/plain, */*;q=0.8` @@ -175,7 +187,9 @@ By default, FetchKit blocks connections to private/reserved IP ranges: - Redirects: - Follow at most 10 hops. 
   - Each hop is resolved and validated independently against the DNS policy.
+  - Each hop is also validated against configured host and port restrictions.
   - Redirects to non-HTTP(S) schemes are rejected.
+  - Optional hardened mode restricts redirects to the original host only.

 ### Timeouts

diff --git a/specs/threat-model.md b/specs/threat-model.md
index 586b9ef..c539dfb 100644
--- a/specs/threat-model.md
+++ b/specs/threat-model.md
@@ -181,6 +181,6 @@ redirect target, not the original host.
 | TM-NET-002 | TLS certificate validation bypass | Low | Uses reqwest defaults (system certificate store via rustls-platform-verifier) | MITIGATED |
 | TM-NET-003 | Connection reuse leaking context | Low | New reqwest client per request; no connection pooling across requests | MITIGATED |
-| TM-NET-004 | Proxy environment variables (HTTP_PROXY) | Medium | Clients ignore ambient proxy env by default; callers can opt in explicitly | MITIGATED |
+| TM-NET-004 | Proxy environment variables (HTTP_PROXY) | Medium | Ambient proxy env is ignored by default; opt-in required via builder/CLI | MITIGATED |
 | TM-NET-005 | Man-in-the-middle on HTTP (non-TLS) | Medium | HTTP scheme is allowed; content can be intercepted/modified on the wire | **ACCEPTED** |

 ### Mitigation Details

@@ -199,10 +200,11 @@ connection pool state from leaking between requests. This is a
 defense-in-depth measure.

 **TM-NET-004 — Proxy environment variables (MITIGATED):**
-FetchKit calls `reqwest::ClientBuilder::no_proxy()` by default, so shared runtimes do
-not silently inherit `HTTP_PROXY`, `HTTPS_PROXY`, or `NO_PROXY` from the process
-environment. Callers that need proxy routing must opt in explicitly via
-`ToolBuilder::respect_proxy_env(true)`.
+FetchKit disables ambient `HTTP_PROXY`, `HTTPS_PROXY`, and `NO_PROXY` handling by
+default via `reqwest::ClientBuilder::no_proxy()`. Callers must opt in explicitly
+via `ToolBuilder::respect_proxy_env(true)` or the CLI `--allow-env-proxy` flag.
+This prevents inherited container proxy settings from silently bypassing the +expected outbound path. ## 3. Input Validation (TM-INPUT) @@ -369,6 +371,7 @@ None — all previously open threats have been mitigated. | Proxy config | TM-NET-004 | Opt in with `respect_proxy_env(true)` only when an explicit proxy is required | | Content filtering | TM-LEAK-003 | Filter sensitive data from responses | | URL allow-listing | TM-INPUT-002, TM-INPUT-007 | Use allow_prefixes for positive security model (now URL-aware) | +| Network isolation | TM-SSRF, TM-NET | Route FetchKit through dedicated egress controls; library checks are defense in depth | ## Security Controls Matrix @@ -376,11 +379,15 @@ None — all previously open threats have been mitigated. |---------|----------|---------------| | Scheme validation | TM-INPUT | `starts_with("http://")` check; also enforced at each redirect hop | | URL prefix allow/block | TM-INPUT | URL-aware prefix matching via parsed URL components | +| Hostname block rules | TM-INPUT | Exact host and suffix checks before DNS resolution | +| Port allow-listing | TM-INPUT | Optional port restrictions validated before connect and on redirects | | Private IP blocking | TM-SSRF | `DnsPolicy::block_private_ips()` with resolve-then-check | | DNS pinning | TM-SSRF | `reqwest::ClientBuilder::resolve()` per redirect hop | | IPv6-mapped-IPv4 canonicalization | TM-SSRF | `IpAddr::to_canonical()` before range check | | IPv4-compatible/6to4 extraction | TM-SSRF | Extract embedded IPv4 from `::` and `2002::` prefixes, validate | | Manual redirect following | TM-SSRF | `Policy::none()` with IP validation at each hop | +| Ambient proxy suppression | TM-NET | `reqwest::ClientBuilder::no_proxy()` unless caller opts in | +| Same-host redirect hardening | TM-NET | Optional `same_host_redirects_only(true)` for hardened deployments | | First-byte timeout | TM-DOS | 1-second connect+response timeout | | Body timeout | TM-DOS | 30-second streaming body timeout | | Body 
size limit | TM-DOS | Configurable `max_body_size` (default 10 MB) |