diff --git a/README.md b/README.md index b2933c3..558a697 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ AI-friendly web content fetching tool designed for LLM consumption. Rust library - **HTML-to-Text** - Plain text extraction with clean formatting - **Binary detection** - Returns metadata only for images, PDFs, etc. - **Timeout handling** - 1s first-byte, 30s body with partial content on timeout +- **Safety limits** - 10 MB default decompressed body cap with truncation - **URL filtering** - URL-aware allow/block lists for controlled access - **SSRF protection** - Resolve-then-check blocks private IPs by default - **MCP server** - Model Context Protocol support for AI tool integration @@ -196,17 +197,19 @@ let tool = Tool::builder() DNS pinning prevents DNS rebinding attacks. IPv6-mapped IPv4 addresses are canonicalized before validation. Redirects are followed manually in the default fetcher so each hop is revalidated against scheme and DNS policy. Allow/block prefixes are matched against parsed URLs rather than raw strings, which prevents lookalike host overmatches such as `allowed.example.com.evil.test`. +Proxy environment variables are ignored by default; opt in with `ToolBuilder::respect_proxy_env(true)` only when you intentionally want `HTTP_PROXY`/`HTTPS_PROXY` routing. See [`specs/threat-model.md`](specs/threat-model.md) for the full threat model. ## Configuration -### Timeouts +### Timeouts And Limits - **First-byte**: 1 second (connect + initial response) - **Body**: 30 seconds total +- **Body size**: 10 MB decompressed content by default -Partial content is returned on body timeout with `truncated: true`. +Partial content is returned on body timeout or body-size limit with `truncated: true`. ### Binary Content diff --git a/crates/fetchkit-python/src/lib.rs b/crates/fetchkit-python/src/lib.rs index 8d0e5e6..a14ebac 100644 --- a/crates/fetchkit-python/src/lib.rs +++ b/crates/fetchkit-python/src/lib.rs @@ -177,17 +177,20 @@ pub struct PyFetchKitTool { impl PyFetchKitTool { /// Create a new tool with default options #[new] - #[pyo3(signature = (enable_markdown=true, enable_text=true, user_agent=None, allow_prefixes=None, block_prefixes=None))] + #[pyo3(signature = (enable_markdown=true, enable_text=true, user_agent=None, allow_prefixes=None, block_prefixes=None, max_body_size=None, respect_proxy_env=false))] fn new( enable_markdown: bool, enable_text: bool, user_agent: Option, allow_prefixes: Option>, block_prefixes: Option>, + max_body_size: Option, + respect_proxy_env: bool, ) -> PyResult { let mut builder = ToolBuilder::new() .enable_markdown(enable_markdown) - .enable_text(enable_text); + .enable_text(enable_text) + .respect_proxy_env(respect_proxy_env); if let Some(ua) = user_agent { builder = builder.user_agent(ua); @@ -205,6 +208,10 @@ impl PyFetchKitTool { } } + if let Some(max_bytes) = max_body_size { + builder = builder.max_body_size(max_bytes); + } + let runtime = tokio::runtime::Runtime::new() .map_err(|e| PyValueError::new_err(format!("Failed to create runtime: {}", e)))?; @@ -273,7 +280,7 @@ fn fetch( as_markdown: Option, as_text: Option, ) -> PyResult { - let tool = PyFetchKitTool::new(true, true, None, None, None)?; + let tool = PyFetchKitTool::new(true, true, None, None, None, None, false)?; tool.fetch(url, method, as_markdown, as_text) } diff --git a/crates/fetchkit/src/client.rs b/crates/fetchkit/src/client.rs index 1989933..4cb4d4f 100644 --- a/crates/fetchkit/src/client.rs +++ b/crates/fetchkit/src/client.rs @@ -28,6 +28,8 @@ pub struct FetchOptions { pub max_body_size: Option, /// Enable save_to_file parameter in requests pub enable_save_to_file: bool, + /// Whether to respect HTTP_PROXY/HTTPS_PROXY/NO_PROXY from the environment + pub respect_proxy_env: bool, } /// Fetch a URL and return the response @@ -102,5 +104,8 @@ mod tests { assert!(!options.enable_text); // Safe by default: private IPs blocked assert!(options.dns_policy.block_private); + assert!(options.max_body_size.is_none()); + assert!(!options.enable_save_to_file); + assert!(!options.respect_proxy_env); } } diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index b0ebff6..31fccf3 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -1,3 +1,6 @@ +// Decisions: +// - Ignore ambient proxy env by default. Shared agent runtimes should not inherit network routing. +// - Cap decompressed textual bodies. LLM-oriented fetches do not need unbounded response growth. //! Default HTTP fetcher //! //! Handles general HTTP/HTTPS URLs with HTML conversion support. @@ -367,6 +370,11 @@ fn build_client_for_url( .timeout(FIRST_BYTE_TIMEOUT) .redirect(reqwest::redirect::Policy::none()); + if !options.respect_proxy_env { + // THREAT[TM-NET-004]: Ignore ambient proxy env by default in shared runtimes. + client_builder = client_builder.no_proxy(); + } + if options.dns_policy.block_private { if let Some(host) = url.host_str() { let port = url.port_or_known_default().unwrap_or(80); diff --git a/crates/fetchkit/src/fetchers/github_repo.rs b/crates/fetchkit/src/fetchers/github_repo.rs index f1276a8..e177f4c 100644 --- a/crates/fetchkit/src/fetchers/github_repo.rs +++ b/crates/fetchkit/src/fetchers/github_repo.rs @@ -162,6 +162,11 @@ impl Fetcher for GitHubRepoFetcher { .timeout(API_TIMEOUT) .redirect(reqwest::redirect::Policy::none()); + if !options.respect_proxy_env { + // THREAT[TM-NET-004]: Ignore ambient proxy env by default in shared runtimes. + client_builder = client_builder.no_proxy(); + } + if options.dns_policy.block_private { let validated_addr = options .dns_policy diff --git a/crates/fetchkit/src/tool.rs b/crates/fetchkit/src/tool.rs index a99bfb8..2ef21f4 100644 --- a/crates/fetchkit/src/tool.rs +++ b/crates/fetchkit/src/tool.rs @@ -91,6 +91,8 @@ pub struct ToolBuilder { max_body_size: Option, /// Enable save_to_file parameter (opt-in) enable_save_to_file: bool, + /// Whether to honor proxy environment variables + respect_proxy_env: bool, } impl ToolBuilder { @@ -167,6 +169,15 @@ impl ToolBuilder { self } + /// Control whether reqwest inherits proxy configuration from the process environment. + /// + /// Disabled by default so shared runtimes do not accidentally route requests + /// through ambient HTTP(S)_PROXY configuration. + pub fn respect_proxy_env(mut self, respect: bool) -> Self { + self.respect_proxy_env = respect; + self + } + /// Build the tool pub fn build(self) -> Tool { Tool { @@ -178,6 +189,7 @@ impl ToolBuilder { dns_policy: self.dns_policy, max_body_size: self.max_body_size, enable_save_to_file: self.enable_save_to_file, + respect_proxy_env: self.respect_proxy_env, } } } @@ -222,6 +234,7 @@ pub struct Tool { dns_policy: DnsPolicy, max_body_size: Option, enable_save_to_file: bool, + respect_proxy_env: bool, } impl Default for Tool { @@ -329,6 +342,7 @@ impl Tool { dns_policy: self.dns_policy.clone(), max_body_size: self.max_body_size, enable_save_to_file: self.enable_save_to_file, + respect_proxy_env: self.respect_proxy_env, } } @@ -378,6 +392,8 @@ mod tests { .user_agent("TestAgent/1.0") .allow_prefix("https://allowed.com") .block_prefix("https://blocked.com") + .max_body_size(1024) + .respect_proxy_env(true) .build(); assert!(!tool.enable_markdown); @@ -387,6 +403,9 @@ mod tests { assert_eq!(tool.block_prefixes, vec!["https://blocked.com"]); // Safe by default: private IPs blocked assert!(tool.dns_policy.block_private); + assert_eq!(tool.max_body_size, Some(1024)); + assert!(!tool.enable_save_to_file); + assert!(tool.respect_proxy_env); } #[test] @@ -395,6 +414,14 @@ mod tests { assert!(!tool.dns_policy.block_private); } + #[test] + fn test_tool_builder_security_defaults() { + let tool = Tool::builder().build(); + assert!(tool.max_body_size.is_none()); + assert!(!tool.enable_save_to_file); + assert!(!tool.respect_proxy_env); + } + #[test] fn test_tool_description() { let tool = Tool::default(); diff --git a/crates/fetchkit/tests/integration.rs b/crates/fetchkit/tests/integration.rs index 0996665..57ca38e 100644 --- a/crates/fetchkit/tests/integration.rs +++ b/crates/fetchkit/tests/integration.rs @@ -286,6 +286,44 @@ async fn test_size_for_text_content() { assert_eq!(resp.size, Some(body.len() as u64)); } +#[tokio::test] +async fn test_text_body_truncated_at_safety_limit() { + let mock_server = MockServer::start().await; + + let body = "A".repeat(1024); + + Mock::given(method("GET")) + .and(path("/large")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string(body) + .insert_header("content-type", "text/plain"), + ) + .mount(&mock_server) + .await; + + let req = FetchRequest::new(format!("{}/large", mock_server.uri())); + let resp = fetch_with_options( + req, + FetchOptions { + enable_markdown: true, + enable_text: true, + dns_policy: DnsPolicy::allow_all(), + max_body_size: Some(128), + ..Default::default() + }, + ) + .await + .unwrap(); + + assert_eq!(resp.size, Some(128)); + assert_eq!(resp.truncated, Some(true)); + + let content = resp.content.unwrap(); + assert!(content.starts_with(&"A".repeat(128))); + assert!(content.contains("[..content truncated...]")); +} + #[tokio::test] async fn test_url_prefix_allow_list() { let mock_server = MockServer::start().await; diff --git a/crates/fetchkit/tests/proxy_env.rs b/crates/fetchkit/tests/proxy_env.rs new file mode 100644 index 0000000..bbdb344 --- /dev/null +++ b/crates/fetchkit/tests/proxy_env.rs @@ -0,0 +1,61 @@ +use fetchkit::{FetchError, FetchRequest, Tool}; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +struct EnvGuard { + key: &'static str, + original: Option, +} + +impl EnvGuard { + fn set(key: &'static str, value: &str) -> Self { + let original = std::env::var(key).ok(); + std::env::set_var(key, value); + Self { key, original } + } +} + +impl Drop for EnvGuard { + fn drop(&mut self) { + match &self.original { + Some(value) => std::env::set_var(self.key, value), + None => std::env::remove_var(self.key), + } + } +} + +#[tokio::test] +async fn test_proxy_env_is_ignored_by_default_but_can_be_enabled() { + let mock_server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("proxied?") + .insert_header("content-type", "text/plain"), + ) + .mount(&mock_server) + .await; + + let _http_proxy = EnvGuard::set("HTTP_PROXY", "http://127.0.0.1:9"); + let _https_proxy = EnvGuard::set("HTTPS_PROXY", "http://127.0.0.1:9"); + let _no_proxy = EnvGuard::set("NO_PROXY", ""); + + let request = FetchRequest::new(format!("{}/", mock_server.uri())); + + let direct_tool = Tool::builder().block_private_ips(false).build(); + let direct_response = direct_tool.execute(request.clone()).await.unwrap(); + assert_eq!(direct_response.status_code, 200); + assert_eq!(direct_response.content.as_deref(), Some("proxied?")); + + let proxied_tool = Tool::builder() + .block_private_ips(false) + .respect_proxy_env(true) + .build(); + let proxied_result = proxied_tool.execute(request).await; + assert!(matches!( + proxied_result, + Err(FetchError::FirstByteTimeout) | Err(FetchError::ConnectError(_)) + )); +} diff --git a/docs/security.md b/docs/security.md new file mode 100644 index 0000000..742f99e --- /dev/null +++ b/docs/security.md @@ -0,0 +1,34 @@ +# Security Notes + +FetchKit is intended to run in agent, server, and cluster environments where URL input may be +user-controlled. + +## Safe Defaults + +- Private and reserved IP ranges are blocked by default via resolve-then-check DNS validation. +- Redirects are followed manually so every hop is revalidated. +- Textual response bodies are capped at 10 MB after decompression by default. Larger responses are truncated + and marked with `truncated: true`. +- `HTTP_PROXY`, `HTTPS_PROXY`, and `NO_PROXY` are ignored by default. + +## Multi-Tenant Deployment + +For shared VMs, containers, or clusters: + +- Keep private-IP blocking enabled. +- Keep proxy inheritance disabled unless outbound traffic must traverse a trusted proxy. +- Use allow-lists where possible instead of relying only on block-lists. +- Apply caller-side rate limits and concurrency limits around FetchKit. + +If you need different limits, configure them through `ToolBuilder`: + +```rust +use fetchkit::ToolBuilder; + +let tool = ToolBuilder::new() + .max_body_size(1024 * 1024) + .respect_proxy_env(false) + .build(); +``` + +See [`specs/threat-model.md`](../specs/threat-model.md) for the full threat inventory. diff --git a/specs/fetchers.md b/specs/fetchers.md index 3b483ff..305179a 100644 --- a/specs/fetchers.md +++ b/specs/fetchers.md @@ -38,6 +38,7 @@ Central dispatcher that: - Binary content detection (returns metadata only) - Timeout handling with partial content support - Binary-aware file saving via `fetch_to_file()` override (accepts binary content when saving) + - Decompressed body size cap with partial content truncation - Returns: Standard `FetchResponse` with format `"markdown"`, `"text"`, or `"raw"` #### GitHubRepoFetcher @@ -71,6 +72,10 @@ Fetchers receive `FetchOptions` for: - `enable_text` - Enable text conversion - `enable_save_to_file` - Enable file saving support - `dns_policy` - DNS resolution policy for SSRF prevention (default: block private IPs) +- `max_body_size` - Maximum response body size after decompression + (default: 10 MB) +- `respect_proxy_env` - Whether to honor `HTTP_PROXY`/`HTTPS_PROXY`/`NO_PROXY` + from the process environment (default: disabled) ### Extensibility @@ -94,6 +99,8 @@ Both built-in fetchers integrate resolve-then-check DNS validation: - Validate IP against blocked ranges (private, loopback, link-local, etc.) - Pin validated IP via `reqwest::ClientBuilder::resolve()` to prevent DNS rebinding - Enabled by default via `DnsPolicy::default()` (blocks private IPs) +- Ignore ambient proxy env by default so shared runtimes do not silently route + traffic through operator-provided proxies unless explicitly enabled - See `specs/threat-model.md` for threat IDs: TM-SSRF-001 through TM-SSRF-010 ## Module Structure diff --git a/specs/initial.md b/specs/initial.md index defbabb..b223fdb 100644 --- a/specs/initial.md +++ b/specs/initial.md @@ -49,9 +49,12 @@ Provide a builder to configure tool options, including: - Support enabling/disabling request options (feature gating). - Support User-Agent override (e.g., `user_agent`). - Support `block_private_ips(bool)` for SSRF prevention (default: `true`). +- Support `max_body_size(usize)` for bounded response bodies. - Support `enable_save_to_file(bool)` for file download (default: `false`). When enabled, adds `save_to_file` to input schema and `saved_path`/`bytes_written` to output. Requires a `FileSaver` implementation at execution time. +- Support `respect_proxy_env(bool)` to opt into `HTTP_PROXY`/`HTTPS_PROXY` + inheritance (default: `false`). #### Types @@ -162,6 +165,8 @@ By default, FetchKit blocks connections to private/reserved IP ranges: - User-Agent: configurable via tool builder or CLI/MCP/Python options (default `Everruns FetchKit/1.0`). +- Proxy env vars are ignored by default. Callers must opt in via + `ToolBuilder::respect_proxy_env(true)` if they need environment-configured proxies. - Accept header: - Markdown: `text/html, text/markdown, text/plain, */*;q=0.8` - Text: `text/html, text/plain, */*;q=0.8` @@ -176,10 +181,15 @@ By default, FetchKit blocks connections to private/reserved IP ranges: - First-byte timeout (connect + first response byte): 1s. - Body timeout: 30s total. +- Maximum response body size: 10 MB by default; configurable via `max_body_size`. - On body timeout: - Return partial body - Set `truncated: true` - - Append `\n\n[..more content timed out...]` to content + - Append truncation marker to content +- On body size limit: + - Return partial body + - Set `truncated: true` + - Append truncation marker to content ### Response Rules @@ -222,6 +232,7 @@ When `save_to_file` is set on the request: - For binary: `size` from `Content-Length` if present. - For text/HTML: `size` equals bytes read from body stream (before conversion). +- If body truncation happens, `size` equals the captured byte count after limits are applied. #### Filename diff --git a/specs/threat-model.md b/specs/threat-model.md index 470a975..586b9ef 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -8,6 +8,19 @@ influence which URLs are fetched. This document identifies threats that arise wh runs inside a container or cluster with access to internal network resources, and tracks mitigations implemented in the library. +## Verification Status + +Last verified: 2026-03-13 + +Verified in this review: +- `cargo test --workspace -- --nocapture` +- `cargo clippy --workspace --all-targets -- -D warnings` +- `RUSTDOCFLAGS="-D warnings" cargo doc --workspace --no-deps` +- `cargo run -p fetchkit-cli -- fetch https://example.com --output json` +- `cargo run -p fetchkit-cli -- fetch http://127.0.0.1 --output json` +- `HTTP_PROXY=http://127.0.0.1:9 HTTPS_PROXY=http://127.0.0.1:9 cargo run -p fetchkit-cli -- fetch https://example.com --output json` +- JSON-RPC smoke test against `cargo run -p fetchkit-cli -- mcp` + ## Threat ID Scheme **Format:** `TM--` @@ -167,7 +180,7 @@ redirect target, not the original host. | TM-NET-001 | HTTP downgrade (HTTPS URL redirects to HTTP) | Medium | Scheme validated per hop (non-HTTP blocked); HTTPS→HTTP downgrade allowed | **ACCEPTED** | | TM-NET-002 | TLS certificate validation bypass | Low | Uses reqwest defaults (system certificate store via rustls-platform-verifier) | MITIGATED | | TM-NET-003 | Connection reuse leaking context | Low | New reqwest client per request; no connection pooling across requests | MITIGATED | -| TM-NET-004 | Proxy environment variables (HTTP_PROXY) | Medium | Reqwest respects system proxy env vars; attacker could set these in container | **CALLER RISK** | +| TM-NET-004 | Proxy environment variables (HTTP_PROXY) | Medium | Clients ignore ambient proxy env by default; callers can opt in explicitly | MITIGATED | | TM-NET-005 | Man-in-the-middle on HTTP (non-TLS) | Medium | HTTP scheme is allowed; content can be intercepted/modified on the wire | **ACCEPTED** | ### Mitigation Details @@ -185,10 +198,11 @@ The `DefaultFetcher` creates a new `reqwest::Client` per request, which prevents connection pool state from leaking between requests. This is a defense-in-depth measure. -**TM-NET-004 — Proxy environment variables (CALLER RISK):** -Reqwest automatically reads `HTTP_PROXY`, `HTTPS_PROXY`, and `NO_PROXY` environment -variables. In a container environment, these should be controlled by the operator. -This is the caller's responsibility to configure or clear. +**TM-NET-004 — Proxy environment variables (MITIGATED):** +FetchKit calls `reqwest::ClientBuilder::no_proxy()` by default, so shared runtimes do +not silently inherit `HTTP_PROXY`, `HTTPS_PROXY`, or `NO_PROXY` from the process +environment. Callers that need proxy routing must opt in explicitly via +`ToolBuilder::respect_proxy_env(true)`. ## 3. Input Validation (TM-INPUT) @@ -352,7 +366,7 @@ None — all previously open threats have been mitigated. | Responsibility | Related Threats | Description | |---------------|----------------|-------------| | Rate limiting | TM-DOS-004 | Caller must implement request rate limits | -| Proxy config | TM-NET-004 | Clear or set HTTP_PROXY env vars appropriately | +| Proxy config | TM-NET-004 | Opt in with `respect_proxy_env(true)` only when an explicit proxy is required | | Content filtering | TM-LEAK-003 | Filter sensitive data from responses | | URL allow-listing | TM-INPUT-002, TM-INPUT-007 | Use allow_prefixes for positive security model (now URL-aware) | @@ -373,6 +387,7 @@ None — all previously open threats have been mitigated. | Script tag stripping | TM-CONV | Skip `script`/`style`/`noscript`/`iframe`/`svg` | | Binary detection | TM-CONV | Content-Type prefix matching | | New client per request | TM-NET | No connection pool state leakage | +| Proxy env isolation | TM-NET | `reqwest::ClientBuilder::no_proxy()` by default | | Path traversal prevention | TM-INPUT | Lexical path normalization in `LocalFileSaver` | | Save feature gating | TM-INPUT | `enable_save_to_file` disabled by default; schema gated |