From f841ef6e532f7d5e8dd173c96df883893ba3cacc Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 23:59:56 -0500 Subject: [PATCH 1/7] feat(classification): add symbol demangling for Rust - Implemented `SymbolDemangler` to detect and demangle mangled Rust symbols. - Added support for Rust legacy and v0 mangling formats. - Updated `FoundString` to store original mangled text and tags for demangled symbols. - Enhanced `Tag` enum with `DemangledSymbol` variant. - Added comprehensive tests for demangling functionality. Signed-off-by: UncleSp1d3r --- Cargo.toml | 21 +- src/classification/mod.rs | 18 +- src/classification/patterns/data.rs | 296 ++++ src/classification/patterns/ip.rs | 277 +++ src/classification/patterns/mod.rs | 34 + src/classification/patterns/network.rs | 169 ++ src/classification/patterns/paths.rs | 477 +++++ src/classification/semantic.rs | 1542 +++-------------- src/classification/symbols.rs | 361 ++++ src/types.rs | 2 + ...integration__classification_snapshots.snap | 2 +- 11 files changed, 1836 insertions(+), 1363 deletions(-) create mode 100644 src/classification/patterns/data.rs create mode 100644 src/classification/patterns/ip.rs create mode 100644 src/classification/patterns/mod.rs create mode 100644 src/classification/patterns/network.rs create mode 100644 src/classification/patterns/paths.rs create mode 100644 src/classification/symbols.rs diff --git a/Cargo.toml b/Cargo.toml index b3d0aa7..88c8bda 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,19 +20,20 @@ name = "stringy" path = "src/main.rs" [dependencies] -clap = { version = "4.5.54", features = [ "derive" ] } -entropy = "0.4.2" -goblin = "0.10.4" -lazy_static = "1.5" -pelite = "0.10.0" -regex = "1.12.2" -serde = { version = "1.0.228", features = [ "derive" ] } -serde_json = "1.0.148" -thiserror = "2.0.17" +clap = { version = "4.5.54", features = [ "derive" ] } +entropy = "0.4.2" +goblin = "0.10.4" +once_cell = "1.21.3" +pelite = "0.10.0" +regex = "1.12.2" +rustc-demangle = 
"0.1.27" +serde = { version = "1.0.228", features = [ "derive" ] } +serde_json = "1.0.149" +thiserror = "2.0.17" [dev-dependencies] criterion = "0.8.1" -insta = "1.46.0" +insta = "1.46.1" tempfile = "3.24.0" # The profile that 'dist' will build with diff --git a/src/classification/mod.rs b/src/classification/mod.rs index d3dd9ad..320a3a6 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -12,14 +12,12 @@ //! - **Domain Detection**: Identifies domain names with TLD validation //! - **File Path Detection**: Identifies POSIX, Windows, and UNC paths //! - **Registry Path Detection**: Identifies Windows registry paths -//! -//! ## Future Capabilities -//! -//! - GUIDs/UUIDs -//! - Email addresses -//! - Base64 data -//! - Format strings -//! - User agents +//! - **GUID Detection**: Identifies GUIDs/UUIDs in standard format +//! - **Email Detection**: Identifies email addresses +//! - **Base64 Detection**: Identifies Base64-encoded data (broad tag) +//! - **Format String Detection**: Identifies printf-style format strings +//! - **User Agent Detection**: Identifies HTTP user agent strings +//! - **Symbol Demangling**: Demangles Rust symbols to human-readable form //! //! ## Usage //! @@ -49,5 +47,9 @@ //! assert!(tags.contains(&Tag::FilePath)); //! ``` +mod patterns; pub mod semantic; +pub mod symbols; + pub use semantic::SemanticClassifier; +pub use symbols::SymbolDemangler; diff --git a/src/classification/patterns/data.rs b/src/classification/patterns/data.rs new file mode 100644 index 0000000..3f3dd57 --- /dev/null +++ b/src/classification/patterns/data.rs @@ -0,0 +1,296 @@ +//! Data format classification patterns +//! +//! This module provides GUID, email, Base64, format string, and user agent detection. 
+ +use crate::types::Tag; +use once_cell::sync::Lazy; +use regex::Regex; + +/// Regular expression for matching GUIDs/UUIDs +/// +/// Pattern matches standard GUID format: {XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX} +/// Also matches without braces and in lowercase. +pub(crate) static GUID_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"(?i)^\{?[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\}?$").unwrap() +}); + +/// Regular expression for matching email addresses +/// +/// Pattern matches basic email format: user@domain.tld +pub(crate) static EMAIL_REGEX: Lazy = + Lazy::new(|| Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap()); + +/// Regular expression for matching printf-style format strings +/// +/// Pattern detects format specifiers like %s, %d, %x, %f, etc. +pub(crate) static FORMAT_STRING_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"%[-+0 #]*(\d+|\*)?(\.(\d+|\*))?(hh?|ll?|[Lzjt])?[diouxXeEfFgGaAcspn%]").unwrap() +}); + +/// Regular expression for matching common user agent patterns +/// +/// Pattern matches common browser/bot user agent strings. +pub(crate) static USER_AGENT_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"(?i)^Mozilla/\d|^curl/|^Wget/|^python-requests|^libwww-perl|^Java/|^Apache-HttpClient|^okhttp/|^PostmanRuntime/") + .unwrap() +}); + +/// Classifies a GUID/UUID +/// +/// # Arguments +/// * `text` - The text to check for GUID format +/// +/// # Returns +/// Returns `Some(Tag::Guid)` if valid, `None` otherwise. +pub fn classify_guid(text: &str) -> Option { + if GUID_REGEX.is_match(text) { + Some(Tag::Guid) + } else { + None + } +} + +/// Classifies an email address +/// +/// # Arguments +/// * `text` - The text to check for email format +/// +/// # Returns +/// Returns `Some(Tag::Email)` if valid, `None` otherwise. 
+pub fn classify_email(text: &str) -> Option { + if EMAIL_REGEX.is_match(text) { + Some(Tag::Email) + } else { + None + } +} + +/// Classifies Base64-encoded data +/// +/// This is a broad/ambiguous tag with potential false positives. +/// Returns `Some(Tag::Base64)` if the text appears to be Base64 encoded. +/// +/// Detection criteria: +/// - Minimum length of 16 characters +/// - Only valid Base64 characters (A-Z, a-z, 0-9, +, /, =) +/// - Proper padding (if present) +/// - Length is a multiple of 4 or has valid padding +/// - For unpadded strings: must have both uppercase and lowercase letters +pub fn classify_base64(text: &str) -> Option { + // Minimum length to reduce false positives + if text.len() < 16 { + return None; + } + + // Check if it's valid Base64 characters only + let is_base64_chars = text + .chars() + .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '/' || c == '='); + + if !is_base64_chars { + return None; + } + + // Count padding characters + let padding_count = text.chars().rev().take_while(|&c| c == '=').count(); + + // Padding should be at most 2 characters + if padding_count > 2 { + return None; + } + + // Strip padding for length check + let content_len = text.len() - padding_count; + + // Valid Base64 content length should be such that total is multiple of 4 + if !(content_len + padding_count).is_multiple_of(4) { + return None; + } + + // Check for character diversity typical of Base64 + let has_upper = text.chars().any(|c| c.is_ascii_uppercase()); + let has_lower = text.chars().any(|c| c.is_ascii_lowercase()); + let has_digit = text.chars().any(|c| c.is_ascii_digit()); + + // For strings with padding, the padding itself is strong evidence + // For strings without padding, require both upper and lowercase + // to avoid false positives on random alphanumeric strings + if padding_count == 0 { + // Require both upper and lower case for unpadded strings + if !has_upper || !has_lower { + return None; + } + } else { + // For padded 
strings, still require some diversity + let has_diversity = has_digit || (has_upper && has_lower); + if !has_diversity { + return None; + } + } + + Some(Tag::Base64) +} + +/// Classifies a printf-style format string +/// +/// # Arguments +/// * `text` - The text to check for format string patterns +/// +/// # Returns +/// Returns `Some(Tag::FormatString)` if valid, `None` otherwise. +pub fn classify_format_string(text: &str) -> Option { + // Find all format specifier matches + let matches: Vec<_> = FORMAT_STRING_REGEX.find_iter(text).collect(); + + if matches.is_empty() { + return None; + } + + // Check if any match is a real format specifier (not just %%) + // %% is just an escaped percent sign, not a real format specifier + let has_real_specifier = matches.iter().any(|m| m.as_str() != "%%"); + + if !has_real_specifier { + return None; + } + + // Exclude strings that are just a single format specifier + // (those are likely false positives) + if text.len() > 2 { + return Some(Tag::FormatString); + } + + None +} + +/// Classifies a user agent string +/// +/// # Arguments +/// * `text` - The text to check for user agent patterns +/// +/// # Returns +/// Returns `Some(Tag::UserAgent)` if valid, `None` otherwise. 
+pub fn classify_user_agent(text: &str) -> Option { + if USER_AGENT_REGEX.is_match(text) { + Some(Tag::UserAgent) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_guid_with_braces() { + assert!(classify_guid("{12345678-1234-1234-1234-123456789ABC}").is_some()); + assert!(classify_guid("{AAAAAAAA-BBBB-CCCC-DDDD-EEEEEEEEEEEE}").is_some()); + } + + #[test] + fn test_guid_without_braces() { + assert!(classify_guid("12345678-1234-1234-1234-123456789ABC").is_some()); + assert!(classify_guid("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee").is_some()); + } + + #[test] + fn test_guid_case_insensitive() { + // Mixed case but all valid hex digits + assert!(classify_guid("AbCdEf01-1234-5678-90aB-cDeF12345678").is_some()); + } + + #[test] + fn test_guid_invalid() { + assert!(classify_guid("not-a-guid").is_none()); + assert!(classify_guid("12345678-1234-1234-1234").is_none()); // Too short + assert!(classify_guid("12345678-1234-1234-1234-123456789ABCDEF").is_none()); // Too long + assert!(classify_guid("GGGGGGGG-1234-1234-1234-123456789ABC").is_none()); // Invalid hex + } + + #[test] + fn test_email_valid() { + assert!(classify_email("user@example.com").is_some()); + assert!(classify_email("test.user@domain.org").is_some()); + assert!(classify_email("admin+tag@company.co.uk").is_some()); + } + + #[test] + fn test_email_invalid() { + assert!(classify_email("not an email").is_none()); + assert!(classify_email("@nodomain.com").is_none()); + assert!(classify_email("noat.com").is_none()); + assert!(classify_email("user@").is_none()); + } + + #[test] + fn test_base64_valid() { + // Valid Base64 with mixed case (typical encoded data) + assert!(classify_base64("SGVsbG8gV29ybGQh").is_some()); + assert!(classify_base64("VGhpcyBpcyBhIHRlc3Q=").is_some()); + // Longer Base64 strings + assert!(classify_base64("QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=").is_some()); + } + + #[test] + fn test_base64_too_short() { + assert!(classify_base64("SGVsbG8=").is_none()); // 
Only 8 chars + assert!(classify_base64("YWJj").is_none()); // Only 4 chars + } + + #[test] + fn test_base64_invalid_chars() { + assert!(classify_base64("SGVsbG8gV29ybGQh!@#$").is_none()); + assert!(classify_base64("This is not base64!!").is_none()); + } + + #[test] + fn test_format_string_basic() { + assert!(classify_format_string("Hello %s!").is_some()); + assert!(classify_format_string("Value: %d").is_some()); + assert!(classify_format_string("Hex: %x").is_some()); + } + + #[test] + fn test_format_string_complex() { + assert!(classify_format_string("Name: %s, Age: %d, Score: %.2f").is_some()); + assert!(classify_format_string("%08x %08x %08x").is_some()); + assert!(classify_format_string("%-20s %10d").is_some()); + } + + #[test] + fn test_format_string_not_format() { + assert!(classify_format_string("No format here").is_none()); + assert!(classify_format_string("100%").is_none()); // Bare percent, no specifier + assert!(classify_format_string("100%% done").is_none()); // Escaped percent only + } + + #[test] + fn test_user_agent_mozilla() { + assert!( + classify_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") + .is_some() + ); + } + + #[test] + fn test_user_agent_curl() { + assert!(classify_user_agent("curl/7.68.0").is_some()); + } + + #[test] + fn test_user_agent_wget() { + assert!(classify_user_agent("Wget/1.20.3 (linux-gnu)").is_some()); + } + + #[test] + fn test_user_agent_python() { + assert!(classify_user_agent("python-requests/2.25.1").is_some()); + } + + #[test] + fn test_user_agent_not_user_agent() { + assert!(classify_user_agent("Not a user agent").is_none()); + assert!(classify_user_agent("Chrome").is_none()); // Too generic + } +} diff --git a/src/classification/patterns/ip.rs b/src/classification/patterns/ip.rs new file mode 100644 index 0000000..9da0de3 --- /dev/null +++ b/src/classification/patterns/ip.rs @@ -0,0 +1,277 @@ +//! IP address classification patterns +//! +//! 
This module provides IPv4 and IPv6 address detection functionality. + +use crate::types::Tag; +use once_cell::sync::Lazy; +use regex::Regex; +use std::net::{Ipv4Addr, Ipv6Addr}; +use std::str::FromStr; + +/// Regular expression for matching IPv4 addresses +/// +/// Pattern matches IPv4 addresses with proper octet validation (0-255). +/// Matches the entire string (used after port stripping). +pub(crate) static IPV4_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap() +}); + +/// Regular expression for matching IPv6 addresses +/// +/// Pattern matches IPv6 addresses including full, compressed, and mixed notation. +/// This is a permissive pattern that checks for basic IPv6 structure. +/// Actual validation is performed by std::net::Ipv6Addr::from_str. +pub(crate) static IPV6_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"(?i)^(?:[0-9a-f]{1,4}:){1,7}[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,7}:$|^(?:[0-9a-f]{1,4}:){1,6}:[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,5}(?::[0-9a-f]{1,4}){1,2}$|^(?:[0-9a-f]{1,4}:){1,4}(?::[0-9a-f]{1,4}){1,3}$|^(?:[0-9a-f]{1,4}:){1,3}(?::[0-9a-f]{1,4}){1,4}$|^(?:[0-9a-f]{1,4}:){1,2}(?::[0-9a-f]{1,4}){1,5}$|^[0-9a-f]{1,4}:(?::[0-9a-f]{1,4}){1,6}$|^:(?::[0-9a-f]{1,4}){1,7}$|^::$|^::ffff:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap() +}); + +/// Regular expression for detecting and stripping port suffixes +/// +/// Matches :port where port is in the valid range 0-65535. +pub(crate) static PORT_SUFFIX_REGEX: Lazy = Lazy::new(|| { + Regex::new( + r":(?:[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])$", + ) + .unwrap() +}); + +/// Regular expression for handling bracketed IPv6 addresses +/// +/// Matches [IPv6] format used in URLs like [::1]:8080. 
+pub(crate) static IPV6_BRACKETS_REGEX: Lazy = + Lazy::new(|| Regex::new(r"^\[(.+)\]").unwrap()); + +/// Strips the port suffix from an IP address string if present +/// +/// # Arguments +/// * `text` - The text that may contain a port suffix +/// +/// # Returns +/// The text with the port suffix removed, or the original text if no port found. +pub fn strip_port(text: &str) -> &str { + if let Some(mat) = PORT_SUFFIX_REGEX.find(text) { + &text[..mat.start()] + } else { + text + } +} + +/// Strips brackets from an IPv6 address if present +/// +/// # Arguments +/// * `text` - The text that may contain bracketed IPv6 +/// +/// # Returns +/// The IPv6 address without brackets, or the original text if no brackets found. +pub fn strip_ipv6_brackets(text: &str) -> &str { + if let Some(caps) = IPV6_BRACKETS_REGEX.captures(text) + && let Some(inner) = caps.get(1) + { + return inner.as_str(); + } + text +} + +/// Checks if the given text is a valid IPv4 address +/// +/// This method first strips any port suffix, then validates the remaining +/// text as an IPv4 address using both regex and standard library validation. +/// +/// # Arguments +/// * `text` - The text to check for IPv4 format +/// +/// # Returns +/// Returns `true` if the text is a valid IPv4 address. 
+pub fn is_ipv4_address(text: &str) -> bool { + // Strip port suffix if present + let text_without_port = strip_port(text); + + // Two-stage validation: regex pre-filter first + if !IPV4_REGEX.is_match(text_without_port) { + return false; + } + + // Check for leading zeros in octets (e.g., 192.168.01.1 should be rejected) + for octet_str in text_without_port.split('.') { + // If an octet has more than 1 digit and starts with '0', it's invalid + if octet_str.len() > 1 && octet_str.starts_with('0') { + return false; + } + } + + // Validate using std::net::Ipv4Addr for correctness + // This is the authoritative check - regex is just a pre-filter + Ipv4Addr::from_str(text_without_port).is_ok() +} + +/// Checks if the given text is a valid IPv6 address +/// +/// This method handles bracketed IPv6 addresses (e.g., [::1]:8080), +/// strips any port suffix, and validates using both regex and standard library. +/// +/// # Arguments +/// * `text` - The text to check for IPv6 format +/// +/// # Returns +/// Returns `true` if the text is a valid IPv6 address. 
+pub fn is_ipv6_address(text: &str) -> bool { + // Handle bracketed IPv6 addresses like [::1] or [::1]:8080 + let mut ip_text = text; + + // Check for bracketed format + if text.starts_with('[') { + // Strip port from the full text first (e.g., [::1]:8080 -> [::1]) + let without_port = strip_port(text); + // Now extract the IPv6 from brackets + ip_text = strip_ipv6_brackets(without_port); + } + + // Basic structure check - must contain colon and only valid hex/colon characters + if !ip_text.contains(':') { + return false; + } + + // Allow only valid IPv6 characters + let valid_chars = ip_text + .chars() + .all(|c| c.is_ascii_hexdigit() || c == ':' || c == '.'); + + if !valid_chars { + return false; + } + + // Validate using std::net::Ipv6Addr for correctness + Ipv6Addr::from_str(ip_text).is_ok() +} + +/// Classifies IP addresses (both IPv4 and IPv6) in the given text +/// +/// # Arguments +/// * `text` - The text to classify +/// +/// # Returns +/// A vector of tags (IPv4 and/or IPv6) that apply to the text. 
+pub fn classify_ip_addresses(text: &str) -> Vec { + let mut tags = Vec::new(); + + if is_ipv4_address(text) { + tags.push(Tag::IPv4); + } + + if is_ipv6_address(text) { + tags.push(Tag::IPv6); + } + + tags +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_ipv4_valid_addresses() { + assert!(is_ipv4_address("192.168.1.1")); + assert!(is_ipv4_address("10.0.0.1")); + assert!(is_ipv4_address("172.16.0.1")); + assert!(is_ipv4_address("255.255.255.255")); + assert!(is_ipv4_address("8.8.8.8")); + assert!(is_ipv4_address("1.1.1.1")); + assert!(is_ipv4_address("0.0.0.0")); + } + + #[test] + fn test_ipv4_invalid_addresses() { + assert!(!is_ipv4_address("256.1.1.1")); + assert!(!is_ipv4_address("1.2.3.4.5")); + assert!(!is_ipv4_address("1.2.3")); + assert!(!is_ipv4_address("not an ip")); + assert!(!is_ipv4_address("")); + } + + #[test] + fn test_ipv4_with_port() { + assert!(is_ipv4_address("192.168.1.1:8080")); + assert!(is_ipv4_address("10.0.0.1:443")); + assert!(is_ipv4_address("172.16.0.1:65535")); + } + + #[test] + fn test_ipv4_leading_zeros() { + // Leading zeros should be rejected + assert!(!is_ipv4_address("01.02.03.04")); + assert!(!is_ipv4_address("192.168.01.1")); + } + + #[test] + fn test_ipv6_full_notation() { + assert!(is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334")); + assert!(is_ipv6_address("fe80:0000:0000:0000:0000:0000:0000:0001")); + } + + #[test] + fn test_ipv6_compressed() { + assert!(is_ipv6_address("2001:db8::1")); + assert!(is_ipv6_address("::1")); + assert!(is_ipv6_address("fe80::1")); + assert!(is_ipv6_address("::")); + } + + #[test] + fn test_ipv6_mixed_notation() { + assert!(is_ipv6_address("::ffff:192.0.2.1")); + assert!(is_ipv6_address("::ffff:127.0.0.1")); + } + + #[test] + fn test_ipv6_invalid() { + assert!(!is_ipv6_address("not an ipv6")); + assert!(!is_ipv6_address("")); + assert!(!is_ipv6_address("gggg::1")); // Invalid hex + } + + #[test] + fn test_ipv6_with_brackets() { + assert!(is_ipv6_address("[::1]")); + 
assert!(is_ipv6_address("[2001:db8::1]")); + assert!(is_ipv6_address("[fe80::1]")); + } + + #[test] + fn test_ipv6_with_port() { + assert!(is_ipv6_address("[::1]:8080")); + assert!(is_ipv6_address("[2001:db8::1]:443")); + } + + #[test] + fn test_classify_ipv4() { + let tags = classify_ip_addresses("192.168.1.1"); + assert!(tags.contains(&Tag::IPv4)); + assert!(!tags.contains(&Tag::IPv6)); + } + + #[test] + fn test_classify_ipv6() { + let tags = classify_ip_addresses("2001:db8::1"); + assert!(!tags.contains(&Tag::IPv4)); + assert!(tags.contains(&Tag::IPv6)); + } + + #[test] + fn test_classify_no_ip() { + let tags = classify_ip_addresses("not an ip address"); + assert!(tags.is_empty()); + } + + #[test] + fn test_classify_ipv4_with_port() { + let tags = classify_ip_addresses("192.168.1.1:8080"); + assert!(tags.contains(&Tag::IPv4)); + } + + #[test] + fn test_classify_ipv6_with_brackets_and_port() { + let tags = classify_ip_addresses("[::1]:8080"); + assert!(tags.contains(&Tag::IPv6)); + } +} diff --git a/src/classification/patterns/mod.rs b/src/classification/patterns/mod.rs new file mode 100644 index 0000000..2852fdf --- /dev/null +++ b/src/classification/patterns/mod.rs @@ -0,0 +1,34 @@ +//! Pattern classification modules +//! +//! This module contains submodules for different types of pattern classification: +//! - `ip`: IPv4 and IPv6 address detection +//! - `network`: URL and domain detection +//! - `paths`: File and registry path detection +//! 
- `data`: GUID, email, Base64, format string, and user agent detection + +pub mod data; +pub mod ip; +pub mod network; +pub mod paths; + +// Re-export classification functions +pub use data::{ + classify_base64, classify_email, classify_format_string, classify_guid, classify_user_agent, +}; +pub use ip::{ + classify_ip_addresses, is_ipv4_address, is_ipv6_address, strip_ipv6_brackets, strip_port, +}; +pub use network::{classify_domain, classify_url, has_valid_tld}; +pub use paths::{ + classify_posix_path, classify_registry_path, classify_unc_path, classify_windows_path, + is_suspicious_posix_path, is_suspicious_registry_path, is_suspicious_windows_path, + is_valid_posix_path, is_valid_registry_path, is_valid_windows_path, +}; + +// Re-export regex patterns needed by SemanticClassifier for cache testing +pub(crate) use ip::{IPV4_REGEX, IPV6_REGEX}; +pub(crate) use network::{DOMAIN_REGEX, URL_REGEX}; +pub(crate) use paths::{ + POSIX_PATH_REGEX, REGISTRY_ABBREV_REGEX, REGISTRY_PATH_REGEX, UNC_PATH_REGEX, + WINDOWS_PATH_REGEX, +}; diff --git a/src/classification/patterns/network.rs b/src/classification/patterns/network.rs new file mode 100644 index 0000000..c45b8ca --- /dev/null +++ b/src/classification/patterns/network.rs @@ -0,0 +1,169 @@ +//! Network indicator classification patterns +//! +//! This module provides URL and domain name detection functionality. + +use crate::types::Tag; +use once_cell::sync::Lazy; +use regex::Regex; + +/// Regular expression for matching HTTP/HTTPS URLs +/// +/// Pattern matches URLs starting with http:// or https:// and excludes +/// problematic characters that could cause false positives. +pub(crate) static URL_REGEX: Lazy = + Lazy::new(|| Regex::new(r#"https?://[^\s<>"{}|\\\^\[\]\`]+"#).unwrap()); + +/// Regular expression for matching domain names +/// +/// Pattern matches domain names with proper DNS format compliance (RFC 1035). 
+/// It ensures domains start and end with alphanumeric characters, allows hyphens +/// in the middle, and requires at least a 2-character TLD. +pub(crate) static DOMAIN_REGEX: Lazy = Lazy::new(|| { + Regex::new(r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b").unwrap() +}); + +/// List of common TLDs for validation +const COMMON_TLDS: &[&str] = &[ + "com", "org", "net", "edu", "gov", "mil", "int", "io", "co", "uk", "de", "fr", "jp", "cn", + "ru", "br", "in", "au", "ca", "es", "it", "nl", "pl", "se", "ch", "at", "be", "dk", "fi", "no", + "pt", "cz", "hu", "ro", "bg", "hr", "sk", "si", "ee", "lt", "lv", "ie", "gr", "cy", "mt", "lu", + "info", "biz", "name", "pro", "aero", "coop", "museum", "travel", "jobs", "mobi", "tel", + "asia", "cat", "xxx", "app", "dev", "page", "blog", "shop", "store", "online", "site", + "website", "tech", "cloud", "ai", "ml", "tv", "me", "cc", "ws", "bz", "nu", "tk", "ga", "cf", + "gq", "exe", "dll", "sys", "bin", "dat", "log", "tmp", "bak", +]; + +/// Checks if the domain has a valid TLD +/// +/// # Arguments +/// * `domain` - The domain name to validate +/// +/// # Returns +/// Returns `true` if the domain has a known TLD. +pub fn has_valid_tld(domain: &str) -> bool { + if let Some(dot_pos) = domain.rfind('.') { + let tld = &domain[dot_pos + 1..]; + let tld_lower = tld.to_lowercase(); + COMMON_TLDS.contains(&tld_lower.as_str()) + } else { + false + } +} + +/// Detects HTTP/HTTPS URLs in the given text +/// +/// This method identifies URLs that start with `http://` or `https://` +/// and contain valid URL characters. +/// +/// # Arguments +/// * `text` - The text to search for URLs +/// +/// # Returns +/// Returns `Some(Tag::Url)` if a URL is found, `None` otherwise. 
+pub fn classify_url(text: &str) -> Option { + if URL_REGEX.is_match(text) { + Some(Tag::Url) + } else { + None + } +} + +/// Detects domain names that are not URLs +/// +/// This method identifies domain names that match the domain pattern but +/// are not already identified as URLs. It first checks if the text is NOT +/// a URL to prevent double-tagging, then validates against the domain +/// pattern and TLD list. +/// +/// # Arguments +/// * `text` - The text to search for domain names +/// +/// # Returns +/// Returns `Some(Tag::Domain)` if a valid domain is found (and it's not +/// a URL), `None` otherwise. +pub fn classify_domain(text: &str) -> Option { + // First check if it's NOT a URL to prevent double-tagging + if URL_REGEX.is_match(text) { + return None; + } + + // Check if it matches the domain pattern + if DOMAIN_REGEX.is_match(text) { + // Validate TLD to reduce false positives + if has_valid_tld(text) { + return Some(Tag::Domain); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_url_detection() { + assert!(classify_url("https://example.com").is_some()); + assert!(classify_url("http://test.org/path").is_some()); + assert!(classify_url("https://sub.domain.com:8080/api").is_some()); + assert!(classify_url("not a url").is_none()); + assert!(classify_url("ftp://example.com").is_none()); + } + + #[test] + fn test_domain_detection() { + assert!(classify_domain("example.com").is_some()); + assert!(classify_domain("sub.example.org").is_some()); + assert!(classify_domain("test.co.uk").is_some()); + assert!(classify_domain("https://example.com").is_none()); // URLs excluded + assert!(classify_domain("notadomain").is_none()); + assert!(classify_domain("invalid.xyz123").is_none()); // Invalid TLD + } + + #[test] + fn test_url_classification() { + assert_eq!(classify_url("https://example.com"), Some(Tag::Url)); + assert_eq!(classify_url("http://test.org"), Some(Tag::Url)); + } + + #[test] + fn test_domain_classification() { + 
assert_eq!(classify_domain("example.com"), Some(Tag::Domain)); + assert_eq!(classify_domain("test.org"), Some(Tag::Domain)); + } + + #[test] + fn test_url_not_double_tagged() { + // URLs should not be tagged as domains + assert!(classify_url("https://example.com").is_some()); + assert!(classify_domain("https://example.com").is_none()); + } + + #[test] + fn test_tld_validation() { + assert!(has_valid_tld("example.com")); + assert!(has_valid_tld("test.org")); + assert!(has_valid_tld("website.io")); + assert!(has_valid_tld("app.dev")); + assert!(!has_valid_tld("example.invalidtld")); + assert!(!has_valid_tld("nodot")); + } + + #[test] + fn test_edge_cases() { + // Empty strings + assert!(classify_url("").is_none()); + assert!(classify_domain("").is_none()); + + // Single characters + assert!(classify_url("a").is_none()); + assert!(classify_domain("a").is_none()); + + // Just TLD + assert!(classify_domain(".com").is_none()); + + // IP-like domains (should be handled by IP classifier) + assert!(classify_domain("192.168.1.1").is_none()); + } +} diff --git a/src/classification/patterns/paths.rs b/src/classification/patterns/paths.rs new file mode 100644 index 0000000..6f75359 --- /dev/null +++ b/src/classification/patterns/paths.rs @@ -0,0 +1,477 @@ +//! File and registry path classification patterns +//! +//! This module provides POSIX, Windows, UNC, and registry path detection. 
+ +use crate::types::Tag; +use once_cell::sync::Lazy; +use regex::Regex; +use std::collections::HashSet; + +/// Regular expression for matching POSIX file paths +pub(crate) static POSIX_PATH_REGEX: Lazy = + Lazy::new(|| Regex::new(r"^/[^\x00\n\r]*").unwrap()); + +/// Regular expression for matching Windows file paths +pub(crate) static WINDOWS_PATH_REGEX: Lazy = + Lazy::new(|| Regex::new(r"^[A-Za-z]:\\[^\x00\n\r]*").unwrap()); + +/// Regular expression for matching UNC network paths +pub(crate) static UNC_PATH_REGEX: Lazy = + Lazy::new(|| Regex::new(r"^\\\\[a-zA-Z0-9.-]+\\[^\x00\n\r]*").unwrap()); + +/// Regular expression for matching full Windows registry paths +pub(crate) static REGISTRY_PATH_REGEX: Lazy = + Lazy::new(|| Regex::new(r"(?i)^HKEY_[A-Z_]+\\[^\x00\n\r]*").unwrap()); + +/// Regular expression for matching abbreviated registry paths +pub(crate) static REGISTRY_ABBREV_REGEX: Lazy = + Lazy::new(|| Regex::new(r"(?i)^HK(LM|CU|CR|U|CC)\\[^\x00\n\r]*").unwrap()); + +/// Common suspicious POSIX path prefixes for persistence detection +static SUSPICIOUS_POSIX_PATHS: Lazy> = Lazy::new(|| { + let mut set = HashSet::new(); + set.insert("/etc/cron.d/"); + set.insert("/etc/init.d/"); + set.insert("/usr/local/bin/"); + set.insert("/tmp/"); + set.insert("/var/tmp/"); + set.insert("/etc/rc.d/"); + set.insert("/etc/crontab"); + set.insert("/etc/systemd/system/"); + set.insert("~/.config/autostart/"); + set.insert("/Library/LaunchDaemons/"); + set.insert("/Library/LaunchAgents/"); + set +}); + +/// Common suspicious Windows path prefixes for persistence detection +static SUSPICIOUS_WINDOWS_PATHS: Lazy> = Lazy::new(|| { + let mut set = HashSet::new(); + set.insert("C:\\Windows\\System32\\"); + set.insert("C:\\Windows\\Temp\\"); + set.insert("\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); + set.insert("C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); + set.insert("C:\\Windows\\SysWOW64\\"); + set +}); + +/// Known valid 
POSIX path prefixes +static KNOWN_POSIX_PREFIXES: Lazy> = Lazy::new(|| { + let mut set = HashSet::new(); + set.insert("/usr/"); + set.insert("/etc/"); + set.insert("/var/"); + set.insert("/home/"); + set.insert("/opt/"); + set.insert("/bin/"); + set.insert("/sbin/"); + set.insert("/lib/"); + set.insert("/dev/"); + set.insert("/proc/"); + set.insert("/sys/"); + set.insert("/tmp/"); + set +}); + +/// Known valid Windows path prefixes +static KNOWN_WINDOWS_PREFIXES: Lazy> = Lazy::new(|| { + let mut set = HashSet::new(); + set.insert("C:\\Windows\\"); + set.insert("C:\\Program Files\\"); + set.insert("C:\\Program Files (x86)\\"); + set.insert("C:\\Users\\"); + set.insert("C:\\ProgramData\\"); + set +}); + +/// Valid Windows registry root keys +static VALID_REGISTRY_ROOTS: Lazy> = Lazy::new(|| { + let mut set = HashSet::new(); + set.insert("HKEY_LOCAL_MACHINE"); + set.insert("HKEY_CURRENT_USER"); + set.insert("HKEY_CLASSES_ROOT"); + set.insert("HKEY_USERS"); + set.insert("HKEY_CURRENT_CONFIG"); + set +}); + +/// Suspicious Windows registry paths for persistence detection +static SUSPICIOUS_REGISTRY_PATHS: Lazy> = Lazy::new(|| { + let mut set = HashSet::new(); + set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run"); + set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce"); + set.insert("\\System\\CurrentControlSet\\Services"); + set.insert("\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon"); + set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders"); + set +}); + +/// Checks if a path contains ASCII case-insensitive substring +fn contains_ascii_case_insensitive(haystack: &str, needle: &str) -> bool { + let haystack_lower = haystack.to_ascii_lowercase(); + let needle_lower = needle.to_ascii_lowercase(); + haystack_lower.contains(&needle_lower) +} + +/// Checks if text contains printf-style format placeholders +fn contains_printf_placeholder(text: &str) -> bool { + // Look for common printf patterns that 
might appear in paths + let patterns = [ + "%s", "%d", "%x", "%u", "%i", "%f", "%c", "%p", "%n", "%ld", "%lu", + ]; + for pattern in patterns { + if text.contains(pattern) { + return true; + } + } + false +} + +/// Checks if text contains control characters +fn contains_control_chars(text: &str) -> bool { + text.chars().any(|c| c.is_control() && c != '\t') +} + +/// Validates a POSIX path +pub fn is_valid_posix_path(text: &str) -> bool { + // Must start with / and have at least one more character + if !text.starts_with('/') || text.len() < 2 { + return false; + } + + // Check for null bytes or control characters + if contains_control_chars(text) { + return false; + } + + // Check for known prefixes to boost confidence + for prefix in KNOWN_POSIX_PREFIXES.iter() { + if text.starts_with(prefix) { + return true; + } + } + + // Additional validation for paths that don't start with known prefixes + // Must have at least one directory separator beyond the root + if text.len() > 1 && text[1..].contains('/') { + return true; + } + + // Single directory under root (e.g., "/bin") - needs to be at least 3 chars + text.len() >= 3 +} + +/// Validates a Windows path +pub fn is_valid_windows_path(text: &str) -> bool { + // Must match the basic pattern + if !WINDOWS_PATH_REGEX.is_match(text) { + return false; + } + + // Check for null bytes or control characters + if contains_control_chars(text) { + return false; + } + + // Validate drive letter is A-Z + let first_char = text.chars().next().unwrap_or(' '); + if !first_char.is_ascii_alphabetic() { + return false; + } + + // Check for known prefixes to boost confidence + for prefix in KNOWN_WINDOWS_PREFIXES.iter() { + if contains_ascii_case_insensitive(text, prefix) { + return true; + } + } + + // Path should have at least some content after the drive letter + text.len() >= 4 +} + +/// Validates a registry path +pub fn is_valid_registry_path(text: &str) -> bool { + let upper_text = text.to_uppercase(); + + // Check for full registry 
root + if upper_text.starts_with("HKEY_") { + // Extract root key + if let Some(slash_pos) = text.find('\\') { + let root = &upper_text[..slash_pos]; + if VALID_REGISTRY_ROOTS.contains(root) { + return true; + } + } + } + + // Check for abbreviated forms (case-insensitive) + if REGISTRY_ABBREV_REGEX.is_match(text) { + return true; + } + + // Also accept paths that use forward slashes (some tools do this) + if upper_text.starts_with("HKEY_") + && text.contains('/') + && let Some(slash_pos) = text.find('/') + { + let root = &upper_text[..slash_pos]; + if VALID_REGISTRY_ROOTS.contains(root) { + return true; + } + } + + false +} + +/// Classifies a POSIX path +/// +/// # Arguments +/// * `text` - The text to check for POSIX path format +/// +/// # Returns +/// Returns `Some(Tag::FilePath)` if valid, `None` otherwise. +pub fn classify_posix_path(text: &str) -> Option { + if POSIX_PATH_REGEX.is_match(text) && is_valid_posix_path(text) { + Some(Tag::FilePath) + } else { + None + } +} + +/// Classifies a Windows path +/// +/// # Arguments +/// * `text` - The text to check for Windows path format +/// +/// # Returns +/// Returns `Some(Tag::FilePath)` if valid, `None` otherwise. +pub fn classify_windows_path(text: &str) -> Option { + // Skip if it looks like a printf format string + if contains_printf_placeholder(text) { + return None; + } + + if WINDOWS_PATH_REGEX.is_match(text) && is_valid_windows_path(text) { + Some(Tag::FilePath) + } else { + None + } +} + +/// Classifies a UNC network path +/// +/// # Arguments +/// * `text` - The text to check for UNC path format +/// +/// # Returns +/// Returns `Some(Tag::FilePath)` if valid, `None` otherwise. 
+pub fn classify_unc_path(text: &str) -> Option { + if UNC_PATH_REGEX.is_match(text) { + // Basic validation - must have server and share + let parts: Vec<&str> = text.split('\\').collect(); + // parts[0] and parts[1] are empty (before \\), parts[2] is server, parts[3] is share + if parts.len() >= 4 && !parts[2].is_empty() && !parts[3].is_empty() { + return Some(Tag::FilePath); + } + } + None +} + +/// Classifies a Windows registry path +/// +/// # Arguments +/// * `text` - The text to check for registry path format +/// +/// # Returns +/// Returns `Some(Tag::RegistryPath)` if valid, `None` otherwise. +pub fn classify_registry_path(text: &str) -> Option { + // is_valid_registry_path handles both backslash and forward-slash styles + if is_valid_registry_path(text) { + Some(Tag::RegistryPath) + } else { + None + } +} + +/// Checks if a POSIX path is suspicious (persistence-related) +pub fn is_suspicious_posix_path(text: &str) -> bool { + SUSPICIOUS_POSIX_PATHS.iter().any(|p| text.starts_with(p)) +} + +/// Checks if a Windows path is suspicious (persistence-related) +pub fn is_suspicious_windows_path(text: &str) -> bool { + SUSPICIOUS_WINDOWS_PATHS + .iter() + .any(|p| contains_ascii_case_insensitive(text, p)) +} + +/// Checks if a registry path is suspicious (persistence-related) +pub fn is_suspicious_registry_path(text: &str) -> bool { + SUSPICIOUS_REGISTRY_PATHS + .iter() + .any(|p| contains_ascii_case_insensitive(text, p)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_posix_absolute_path() { + assert!(classify_posix_path("/usr/bin/bash").is_some()); + assert!(classify_posix_path("/etc/passwd").is_some()); + assert!(classify_posix_path("/home/user/.bashrc").is_some()); + } + + #[test] + fn test_posix_home_directory() { + assert!(classify_posix_path("/home/user/documents/file.txt").is_some()); + assert!(classify_posix_path("/Users/admin/Desktop").is_some()); + } + + #[test] + fn test_posix_with_spaces() { + 
assert!(classify_posix_path("/home/user/My Documents/file.txt").is_some()); + } + + #[test] + fn test_posix_system_directories() { + assert!(classify_posix_path("/var/log/syslog").is_some()); + assert!(classify_posix_path("/opt/application/bin").is_some()); + } + + #[test] + fn test_posix_suspicious_paths() { + assert!(is_suspicious_posix_path("/etc/cron.d/malicious")); + assert!(is_suspicious_posix_path("/tmp/evil.sh")); + assert!(!is_suspicious_posix_path("/home/user/normal.txt")); + } + + #[test] + fn test_posix_too_short() { + assert!(classify_posix_path("/").is_none()); + assert!(classify_posix_path("/a").is_none()); + } + + #[test] + fn test_posix_invalid() { + assert!(classify_posix_path("not/a/path").is_none()); + assert!(classify_posix_path("C:\\Windows").is_none()); + } + + #[test] + fn test_posix_with_null_bytes() { + assert!(classify_posix_path("/path/with\x00null").is_none()); + } + + #[test] + fn test_windows_absolute_path() { + assert!(classify_windows_path("C:\\Windows\\System32").is_some()); + assert!(classify_windows_path("D:\\Projects\\code").is_some()); + } + + #[test] + fn test_windows_program_files() { + assert!(classify_windows_path("C:\\Program Files\\App\\app.exe").is_some()); + assert!(classify_windows_path("C:\\Program Files (x86)\\App").is_some()); + } + + #[test] + fn test_windows_with_spaces() { + assert!(classify_windows_path("C:\\Users\\John Doe\\Documents").is_some()); + } + + #[test] + fn test_windows_different_drives() { + assert!(classify_windows_path("D:\\Data\\file.txt").is_some()); + assert!(classify_windows_path("E:\\Backup\\archive.zip").is_some()); + } + + #[test] + fn test_windows_suspicious_paths() { + assert!(is_suspicious_windows_path("C:\\Windows\\System32\\cmd.exe")); + assert!(is_suspicious_windows_path("C:\\Windows\\Temp\\malware.exe")); + assert!(!is_suspicious_windows_path("D:\\Projects\\code.rs")); + } + + #[test] + fn test_windows_case_insensitive() { + 
assert!(classify_windows_path("c:\\windows\\system32").is_some()); + assert!(classify_windows_path("C:\\WINDOWS\\SYSTEM32").is_some()); + } + + #[test] + fn test_windows_invalid() { + assert!(classify_windows_path("/unix/path").is_none()); + assert!(classify_windows_path("not a path").is_none()); + } + + #[test] + fn test_windows_invalid_drive() { + assert!(classify_windows_path("1:\\Invalid\\Path").is_none()); + } + + #[test] + fn test_unc_path() { + assert!(classify_unc_path("\\\\server\\share\\file.txt").is_some()); + assert!(classify_unc_path("\\\\192.168.1.1\\c$\\Windows").is_some()); + } + + #[test] + fn test_unc_with_domain() { + assert!(classify_unc_path("\\\\domain.local\\share\\path").is_some()); + } + + #[test] + fn test_unc_invalid() { + assert!(classify_unc_path("\\\\server").is_none()); // No share + assert!(classify_unc_path("\\server\\share").is_none()); // Single backslash + } + + #[test] + fn test_registry_run_key() { + assert!( + classify_registry_path( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" + ) + .is_some() + ); + } + + #[test] + fn test_registry_current_user() { + assert!( + classify_registry_path("HKEY_CURRENT_USER\\Software\\Microsoft\\Windows").is_some() + ); + } + + #[test] + fn test_registry_abbreviated_hklm() { + assert!( + classify_registry_path("HKLM\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion").is_some() + ); + } + + #[test] + fn test_registry_abbreviated_hkcu() { + assert!(classify_registry_path("HKCU\\Software\\Classes").is_some()); + } + + #[test] + fn test_registry_persistence_run() { + assert!(is_suspicious_registry_path( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" + )); + } + + #[test] + fn test_registry_invalid_root() { + assert!(classify_registry_path("HKEY_INVALID\\Path").is_none()); + } + + #[test] + fn test_registry_forward_slash() { + assert!(classify_registry_path("HKEY_LOCAL_MACHINE/SOFTWARE/Microsoft/Windows").is_some()); + } +} diff --git 
a/src/classification/semantic.rs b/src/classification/semantic.rs index b83bdfd..c6df7a7 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -10,6 +10,11 @@ //! - IPv4 and IPv6 addresses //! - POSIX and Windows file paths (including UNC paths) //! - Windows registry paths +//! - GUIDs/UUIDs +//! - Email addresses +//! - Base64-encoded data +//! - Printf-style format strings +//! - User agent strings //! //! # Usage //! @@ -39,184 +44,35 @@ //! assert_eq!(tags.len(), 1); //! assert!(matches!(tags[0], stringy::types::Tag::Url)); //! ``` -//! -//! # Patterns -//! -//! - **URLs**: Matches HTTP and HTTPS URLs using a pattern that excludes -//! problematic characters that could cause false positives. -//! -//! - **Domains**: Matches domain names using RFC 1035 compliant patterns -//! with additional TLD validation against a hardcoded list of common TLDs. +use super::patterns; use crate::types::{FoundString, Tag}; -use lazy_static::lazy_static; +use patterns::{ + DOMAIN_REGEX, IPV4_REGEX, IPV6_REGEX, POSIX_PATH_REGEX, REGISTRY_ABBREV_REGEX, + REGISTRY_PATH_REGEX, UNC_PATH_REGEX, URL_REGEX, WINDOWS_PATH_REGEX, +}; use regex::Regex; -use std::net::{Ipv4Addr, Ipv6Addr}; -use std::str::FromStr; - -lazy_static! { - /// Regular expression for matching HTTP/HTTPS URLs - /// - /// Pattern matches URLs starting with http:// or https:// and excludes - /// problematic characters that could cause false positives. - static ref URL_REGEX: Regex = Regex::new(r#"https?://[^\s<>"{}|\\\^\[\]\`]+"#).unwrap(); - - /// Regular expression for matching domain names - /// - /// Pattern matches domain names with proper DNS format compliance (RFC 1035). - /// It ensures domains start and end with alphanumeric characters, allows hyphens - /// in the middle, and requires at least a 2-character TLD. 
- static ref DOMAIN_REGEX: Regex = Regex::new(r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b").unwrap(); - - /// Regular expression for matching IPv4 addresses - /// - /// Pattern matches IPv4 addresses with proper octet validation (0-255). - /// Matches the entire string (used after port stripping). - static ref IPV4_REGEX: Regex = Regex::new(r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); - - /// Regular expression for matching IPv6 addresses - /// - /// Pattern matches IPv6 addresses including: - /// - Full notation: 2001:0db8:85a3:0000:0000:8a2e:0370:7334 - /// - Compressed notation: 2001:db8::1, ::1, fe80::1 - /// - Mixed notation: ::ffff:192.0.2.1, 64:ff9b::192.0.2.1 - /// This is a permissive pattern that checks for basic IPv6 structure (colons and hex digits). - /// Actual validation is performed by std::net::Ipv6Addr::from_str. - static ref IPV6_REGEX: Regex = Regex::new(r"(?i)^(?:[0-9a-f]{1,4}:){1,7}[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,7}:$|^(?:[0-9a-f]{1,4}:){1,6}:[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,5}(?::[0-9a-f]{1,4}){1,2}$|^(?:[0-9a-f]{1,4}:){1,4}(?::[0-9a-f]{1,4}){1,3}$|^(?:[0-9a-f]{1,4}:){1,3}(?::[0-9a-f]{1,4}){1,4}$|^(?:[0-9a-f]{1,4}:){1,2}(?::[0-9a-f]{1,4}){1,5}$|^[0-9a-f]{1,4}:(?::[0-9a-f]{1,4}){1,6}$|^:(?::[0-9a-f]{1,4}){1,7}$|^::$|^::ffff:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); - - /// Regular expression for detecting and stripping port suffixes - /// - /// Matches :port where port is in the valid range 0-65535. - /// Pattern: :[0-9]{1,4} matches 0-9999, |[1-5][0-9]{4} matches 10000-59999, - /// |6[0-4][0-9]{3} matches 60000-64999, |65[0-4][0-9]{2} matches 65000-65499, - /// |655[0-2][0-9] matches 65500-65529, |6553[0-5] matches 65530-65535. 
- static ref PORT_SUFFIX_REGEX: Regex = Regex::new(r":(?:[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])$").unwrap(); - - /// Regular expression for handling bracketed IPv6 addresses - /// - /// Matches [IPv6] format used in URLs like [::1]:8080. - static ref IPV6_BRACKETS_REGEX: Regex = Regex::new(r"^\[(.+)\]").unwrap(); - - /// Regular expression for matching POSIX file paths - /// - /// Pattern matches absolute POSIX paths starting with / followed by any characters - /// except null bytes, newlines, or carriage returns. - static ref POSIX_PATH_REGEX: Regex = Regex::new(r"^/[^\x00\n\r]*").unwrap(); - - /// Regular expression for matching Windows file paths - /// - /// Pattern matches Windows absolute paths starting with drive letter (C:\) - /// followed by any characters except null bytes, newlines, or carriage returns. - static ref WINDOWS_PATH_REGEX: Regex = Regex::new(r"^[A-Za-z]:\\[^\x00\n\r]*").unwrap(); - - /// Regular expression for matching UNC network paths - /// - /// Pattern matches UNC paths starting with \\ followed by server name and share. - static ref UNC_PATH_REGEX: Regex = Regex::new(r"^\\\\[a-zA-Z0-9.-]+\\[^\x00\n\r]*").unwrap(); - /// Regular expression for matching full Windows registry paths - /// - /// Pattern matches registry paths starting with HKEY_ root keys (case-insensitive). - static ref REGISTRY_PATH_REGEX: Regex = Regex::new(r"(?i)^HKEY_[A-Z_]+\\[^\x00\n\r]*").unwrap(); - - /// Regular expression for matching abbreviated registry paths - /// - /// Pattern matches abbreviated registry forms like HKLM, HKCU, etc. (case-insensitive). - static ref REGISTRY_ABBREV_REGEX: Regex = Regex::new(r"(?i)^HK(LM|CU|CR|U|CC)\\[^\x00\n\r]*").unwrap(); -} - -lazy_static! 
{ - /// Common suspicious POSIX path prefixes for persistence detection - static ref SUSPICIOUS_POSIX_PATHS: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("/etc/cron.d/"); - set.insert("/etc/init.d/"); - set.insert("/usr/local/bin/"); - set.insert("/tmp/"); - set.insert("/var/tmp/"); - set.insert("/etc/rc.d/"); - set.insert("/etc/crontab"); - set.insert("/etc/systemd/system/"); - set.insert("~/.config/autostart/"); - set.insert("/Library/LaunchDaemons/"); - set.insert("/Library/LaunchAgents/"); - set - }; - - /// Common suspicious Windows path prefixes for persistence detection - static ref SUSPICIOUS_WINDOWS_PATHS: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("C:\\Windows\\System32\\"); - set.insert("C:\\Windows\\Temp\\"); - set.insert("\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); - set.insert("C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); - set.insert("C:\\Windows\\SysWOW64\\"); - set - }; - - /// Known valid POSIX path prefixes - static ref KNOWN_POSIX_PREFIXES: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("/usr/"); - set.insert("/etc/"); - set.insert("/var/"); - set.insert("/home/"); - set.insert("/opt/"); - set.insert("/bin/"); - set.insert("/sbin/"); - set.insert("/lib/"); - set.insert("/dev/"); - set.insert("/proc/"); - set.insert("/sys/"); - set.insert("/tmp/"); - set - }; - - /// Known valid Windows path prefixes - static ref KNOWN_WINDOWS_PREFIXES: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("C:\\Windows\\"); - set.insert("C:\\Program Files\\"); - set.insert("C:\\Program Files (x86)\\"); - set.insert("C:\\Users\\"); - set.insert("C:\\ProgramData\\"); - set - }; - - /// Valid Windows registry root keys - static ref VALID_REGISTRY_ROOTS: 
std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("HKEY_LOCAL_MACHINE"); - set.insert("HKEY_CURRENT_USER"); - set.insert("HKEY_CLASSES_ROOT"); - set.insert("HKEY_USERS"); - set.insert("HKEY_CURRENT_CONFIG"); - set - }; - - /// Suspicious Windows registry paths for persistence detection - static ref SUSPICIOUS_REGISTRY_PATHS: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run"); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce"); - set.insert("\\System\\CurrentControlSet\\Services"); - set.insert("\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon"); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders"); - set - }; -} +// Re-export pattern functions for backward compatibility +pub use patterns::{ + classify_base64, classify_domain, classify_email, classify_format_string, classify_guid, + classify_ip_addresses, classify_posix_path, classify_registry_path, classify_unc_path, + classify_url, classify_user_agent, classify_windows_path, has_valid_tld, is_ipv4_address, + is_ipv6_address, is_suspicious_posix_path, is_suspicious_registry_path, + is_suspicious_windows_path, is_valid_posix_path, is_valid_registry_path, is_valid_windows_path, + strip_ipv6_brackets, strip_port, +}; /// Semantic classifier for identifying network indicators in extracted strings /// -/// The `SemanticClassifier` provides methods to detect URLs and domain names +/// The `SemanticClassifier` provides methods to detect URLs, domain names, +/// IP addresses, file paths, registry paths, GUIDs, emails, and other patterns /// within text content. It uses compiled regular expressions for efficient -/// pattern matching and includes TLD validation to reduce false positives. 
-/// -/// URLs are prioritized over domains to prevent double-tagging - if a string -/// matches both patterns, it will only be tagged as a URL. +/// pattern matching and includes validation to reduce false positives. #[derive(Debug, Default)] pub struct SemanticClassifier; +/// Internal struct for regex cache address verification (used in testing) #[doc(hidden)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct RegexCacheAddresses { @@ -233,11 +89,14 @@ pub struct RegexCacheAddresses { impl SemanticClassifier { /// Create a new instance of the semantic classifier + #[must_use] pub fn new() -> Self { Self } + /// Returns memory addresses of cached regex patterns (for testing) #[doc(hidden)] + #[must_use] pub fn regex_cache_addresses(&self) -> RegexCacheAddresses { RegexCacheAddresses { url: &*URL_REGEX as *const Regex as usize, @@ -255,8 +114,7 @@ impl SemanticClassifier { /// Detects HTTP/HTTPS URLs in the given text /// /// This method identifies URLs that start with `http://` or `https://` - /// and contain valid URL characters. The pattern excludes problematic - /// characters to avoid false positives. + /// and contain valid URL characters. /// /// # Arguments /// @@ -265,31 +123,15 @@ impl SemanticClassifier { /// # Returns /// /// Returns `Some(Tag::Url)` if a URL is found, `None` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::Tag; - /// - /// let classifier = SemanticClassifier::new(); - /// assert_eq!(classifier.classify_url("https://example.com"), Some(Tag::Url)); - /// assert_eq!(classifier.classify_url("example.com"), None); - /// ``` + #[must_use] pub fn classify_url(&self, text: &str) -> Option { - if URL_REGEX.is_match(text) { - Some(Tag::Url) - } else { - None - } + classify_url(text) } /// Detects domain names that are not URLs /// /// This method identifies domain names that match the domain pattern but - /// are not already identified as URLs. 
It first checks if the text is NOT - /// a URL to prevent double-tagging, then validates against the domain - /// pattern and TLD list. + /// are not already identified as URLs. /// /// # Arguments /// @@ -297,34 +139,10 @@ impl SemanticClassifier { /// /// # Returns /// - /// Returns `Some(Tag::Domain)` if a valid domain is found (and it's not - /// a URL), `None` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::Tag; - /// - /// let classifier = SemanticClassifier::new(); - /// assert_eq!(classifier.classify_domain("example.com"), Some(Tag::Domain)); - /// assert_eq!(classifier.classify_domain("https://example.com"), None); - /// ``` + /// Returns `Some(Tag::Domain)` if a valid domain is found, `None` otherwise. + #[must_use] pub fn classify_domain(&self, text: &str) -> Option { - // First check if it's NOT a URL to prevent double-tagging - if URL_REGEX.is_match(text) { - return None; - } - - // Check if it matches the domain pattern - if DOMAIN_REGEX.is_match(text) { - // Validate TLD to reduce false positives - if self.has_valid_tld(text) { - return Some(Tag::Domain); - } - } - - None + classify_domain(text) } /// Main entry point for semantic classification @@ -332,7 +150,7 @@ impl SemanticClassifier { /// This method analyzes a `FoundString` and returns a vector of semantic /// tags that apply to the string. URLs are checked first, then domains /// (which automatically excludes URLs to prevent double-tagging), then - /// IP addresses (IPv4 and IPv6). + /// IP addresses (IPv4 and IPv6), file paths, and other patterns. /// /// # Arguments /// @@ -340,672 +158,190 @@ impl SemanticClassifier { /// /// # Returns /// - /// Returns a vector of `Tag` values that apply to the string. The vector - /// may be empty if no patterns match. 
- /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::{FoundString, Encoding, StringSource, Tag}; - /// - /// let classifier = SemanticClassifier::new(); - /// let found_string = FoundString { - /// text: "https://example.com".to_string(), - /// original_text: None, - /// encoding: Encoding::Ascii, - /// offset: 0, - /// rva: None, - /// section: None, - /// length: 19, - /// tags: Vec::new(), - /// score: 0, - /// section_weight: None, - /// semantic_boost: None, - /// noise_penalty: None, - /// source: StringSource::SectionData, - /// confidence: 1.0, - /// }; - /// - /// let tags = classifier.classify(&found_string); - /// assert_eq!(tags.len(), 1); - /// assert!(matches!(tags[0], Tag::Url)); - /// ``` + /// Returns a vector of `Tag` values that apply to the string. + #[must_use] pub fn classify(&self, string: &FoundString) -> Vec { let mut tags = Vec::new(); // Check for URLs first - if let Some(tag) = self.classify_url(&string.text) { + if let Some(tag) = classify_url(&string.text) { tags.push(tag); } // Check for domains (this will automatically exclude URLs) - if let Some(tag) = self.classify_domain(&string.text) { + if let Some(tag) = classify_domain(&string.text) { tags.push(tag); } // Check for IP addresses (IPv4 and IPv6) - let ip_tags = self.classify_ip_addresses(&string.text); + let ip_tags = classify_ip_addresses(&string.text); tags.extend(ip_tags); // Check for file paths (POSIX, Windows, UNC) - only add FilePath tag once - if self.classify_posix_path(&string.text).is_some() - || self.classify_windows_path(&string.text).is_some() - || self.classify_unc_path(&string.text).is_some() + if classify_posix_path(&string.text).is_some() + || classify_windows_path(&string.text).is_some() + || classify_unc_path(&string.text).is_some() { tags.push(Tag::FilePath); } // Check for registry paths - if let Some(tag) = self.classify_registry_path(&string.text) { + if let Some(tag) = 
classify_registry_path(&string.text) { tags.push(tag); } - tags - } + // Check for GUIDs + if let Some(tag) = classify_guid(&string.text) { + tags.push(tag); + } - /// Validates the top-level domain (TLD) against a hardcoded list - /// - /// This method extracts the TLD from a domain string and validates it - /// against a comprehensive list of common TLDs. This helps reduce false - /// positives by ensuring domains have valid TLDs. - /// - /// # Arguments - /// - /// * `domain` - The domain string to validate - /// - /// # Returns - /// - /// Returns `true` if the TLD is valid and at least 2 characters long, - /// `false` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// - /// let classifier = SemanticClassifier::new(); - /// assert!(classifier.has_valid_tld("example.com")); - /// assert!(!classifier.has_valid_tld("example.x")); - /// ``` - fn has_valid_tld(&self, domain: &str) -> bool { - // Extract TLD (last segment after final dot) - let tld = domain.split('.').next_back().unwrap_or(""); - - // TLD must be at least 2 characters - if tld.len() < 2 { - return false; + // Check for email addresses + if let Some(tag) = classify_email(&string.text) { + tags.push(tag); } - // Normalize TLD to lowercase for case-insensitive validation - let tld_lower = tld.to_ascii_lowercase(); - - // Validate against hardcoded list of common TLDs - let valid_tlds = [ - // Generic TLDs - "com", - "net", - "org", - "io", - "co", - // Country code TLDs - "gov", - "edu", - "mil", - "int", - "uk", - "de", - "fr", - "jp", - "cn", - "au", - "ca", - "ru", - "br", - "in", - "nl", - "eu", - // New gTLDs - "info", - "biz", - "dev", - "app", - "cloud", - "tech", - "online", - "site", - "xyz", - "top", - "win", - "bid", - // Additional common TLDs - "me", - "tv", - "cc", - "ws", - "name", - "pro", - "mobi", - "asia", - "tel", - "travel", - "jobs", - "museum", - "aero", - "coop", - "cat", - "xxx", - "post", - "arpa", - "test", - 
"example", - "localhost", - ]; - - valid_tlds.contains(&tld_lower.as_str()) + // Check for format strings + if let Some(tag) = classify_format_string(&string.text) { + tags.push(tag); + } + + // Check for user agent strings + if let Some(tag) = classify_user_agent(&string.text) { + tags.push(tag); + } + + // Check for Base64 (broad tag - checked last as it has more false positives) + if let Some(tag) = classify_base64(&string.text) { + tags.push(tag); + } + + tags + } + + /// Validates a TLD against the known list + #[must_use] + pub fn has_valid_tld(&self, domain: &str) -> bool { + has_valid_tld(domain) } /// Strips port suffix from an IP address string - /// - /// Removes `:port` suffix if present (e.g., `192.168.1.1:8080` → `192.168.1.1`). - /// - /// # Arguments - /// - /// * `text` - The text that may contain a port suffix - /// - /// # Returns - /// - /// Returns a string slice without the port suffix. - fn strip_port<'a>(&self, text: &'a str) -> &'a str { - PORT_SUFFIX_REGEX - .find(text) - .map_or(text, |m| &text[..m.start()]) + #[must_use] + pub fn strip_port<'a>(&self, text: &'a str) -> &'a str { + strip_port(text) } - /// Strips bracketed notation from IPv6 addresses - /// - /// Removes `[` and `]` from bracketed IPv6 addresses (e.g., `[::1]` → `::1`). - /// - /// # Arguments - /// - /// * `text` - The text that may contain bracketed IPv6 notation - /// - /// # Returns - /// - /// Returns a string slice without brackets, or the original text if no brackets found. - fn strip_ipv6_brackets<'a>(&self, text: &'a str) -> &'a str { - IPV6_BRACKETS_REGEX - .captures(text) - .and_then(|caps| caps.get(1)) - .map_or(text, |m| m.as_str()) + /// Strips brackets from IPv6 address + #[must_use] + pub fn strip_ipv6_brackets<'a>(&self, text: &'a str) -> &'a str { + strip_ipv6_brackets(text) } - /// Detects IPv4 addresses in the given text - /// - /// This method uses a two-stage validation approach: - /// 1. Regex pre-filter for performance - /// 2. 
`std::net::Ipv4Addr` validation for correctness - /// - /// It also handles port suffixes (e.g., "192.168.1.1:8080"). - /// - /// # Note on Version Numbers - /// - /// This method accepts ALL valid IPv4 addresses in dotted-quad notation, - /// even if they could also be interpreted as version numbers (e.g., "1.2.3.4"). - /// It is the responsibility of the caller to disambiguate between IP addresses - /// and version numbers based on context when necessary. - /// - /// # Arguments - /// - /// * `text` - The text to search for IPv4 addresses - /// - /// # Returns - /// - /// Returns `true` if a valid IPv4 address is found, `false` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// - /// let classifier = SemanticClassifier::new(); - /// assert!(classifier.is_ipv4_address("192.168.1.1")); - /// assert!(classifier.is_ipv4_address("192.168.1.1:8080")); - /// assert!(classifier.is_ipv4_address("1.2.3.4")); // Valid IP (could also be a version number) - /// assert!(!classifier.is_ipv4_address("256.1.1.1")); // Invalid octet - /// ``` + /// Checks if text is a valid IPv4 address + #[must_use] pub fn is_ipv4_address(&self, text: &str) -> bool { - // Strip port suffix if present - let text_without_port = self.strip_port(text); - - // Two-stage validation: regex pre-filter first - if !IPV4_REGEX.is_match(text_without_port) { - return false; - } - - // Check for leading zeros in octets (e.g., 192.168.01.1 should be rejected) - for octet_str in text_without_port.split('.') { - // If an octet has more than 1 digit and starts with '0', it's invalid - if octet_str.len() > 1 && octet_str.starts_with('0') { - return false; - } - } - - // Validate using std::net::Ipv4Addr for correctness - // This is the authoritative check - regex is just a pre-filter - Ipv4Addr::from_str(text_without_port).is_ok() + is_ipv4_address(text) } - /// Detects IPv6 addresses in the given text - /// - /// This method uses a two-stage validation 
approach: - /// 1. Basic structure check (contains colons, looks like IPv6) - /// 2. `std::net::Ipv6Addr` validation for correctness - /// - /// It handles bracketed notation (e.g., `[::1]`) and port suffixes. - /// - /// # Arguments - /// - /// * `text` - The text to search for IPv6 addresses - /// - /// # Returns - /// - /// Returns `true` if a valid IPv6 address is found, `false` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// - /// let classifier = SemanticClassifier::new(); - /// assert!(classifier.is_ipv6_address("2001:db8::1")); - /// assert!(classifier.is_ipv6_address("::1")); - /// assert!(classifier.is_ipv6_address("[::1]:8080")); - /// assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex - /// ``` + /// Checks if text is a valid IPv6 address + #[must_use] pub fn is_ipv6_address(&self, text: &str) -> bool { - // Handle bracketed IPv6 addresses like [::1] or [::1]:8080 - // Strategy: strip port first (if present), then strip brackets - - // If it looks like it has a port (contains ]:), strip port first - let after_port = if text.contains("]:") { - self.strip_port(text) - } else { - text - }; - - // Now strip brackets if present - let processed = self.strip_ipv6_brackets(after_port); - - // Two-stage validation: regex pre-filter first - // Basic structure check: must contain colons (IPv6 addresses always have colons) - if !processed.contains(':') { - return false; - } - - // For mixed notation (contains both colons and dots), skip regex check - // as the regex doesn't handle all mixed notation patterns - let is_mixed_notation = processed.contains('.'); - - if !is_mixed_notation { - // Use regex as pre-filter for non-mixed notation - if !IPV6_REGEX.is_match(processed) { - return false; - } - } + is_ipv6_address(text) + } - // Validate using std::net::Ipv6Addr for canonical validation - // This handles all IPv6 formats: full, compressed, mixed notation - 
Ipv6Addr::from_str(processed).is_ok() + /// Classifies IP addresses in text + #[must_use] + pub fn classify_ip_addresses(&self, text: &str) -> Vec { + classify_ip_addresses(text) } - /// Detects POSIX file paths in the given text - /// - /// Returns `Some(Tag::FilePath)` if a POSIX path is detected and valid. + /// Classifies POSIX paths + #[must_use] pub fn classify_posix_path(&self, text: &str) -> Option { - if !POSIX_PATH_REGEX.is_match(text) { - return None; - } - - if !self.is_valid_posix_path(text) { - return None; - } - - Some(Tag::FilePath) + classify_posix_path(text) } - /// Detects Windows file paths in the given text - /// - /// Returns `Some(Tag::FilePath)` if a Windows path is detected and valid. + /// Classifies Windows paths + #[must_use] pub fn classify_windows_path(&self, text: &str) -> Option { - if !WINDOWS_PATH_REGEX.is_match(text) { - return None; - } - - if !self.is_valid_windows_path(text) { - return None; - } - - Some(Tag::FilePath) + classify_windows_path(text) } - /// Detects UNC network paths in the given text - /// - /// Returns `Some(Tag::FilePath)` if a UNC path is detected and valid. 
- /// Performs robust validation including: - /// - Maximum overall length (4096) and component length (255) - /// - Control character rejection - /// - Forward slash and printf placeholder rejection - /// - Reserved name and dots-only component rejection - /// - Empty segment detection + /// Classifies UNC paths + #[must_use] pub fn classify_unc_path(&self, text: &str) -> Option { - if !UNC_PATH_REGEX.is_match(text) { - return None; - } - - // Maximum overall length check - if text.len() > 4096 { - return None; - } - - // Reject control characters - if self.contains_control_chars(text) { - return None; - } - - // Reject forward slashes anywhere in the path - if text.contains('/') { - return None; - } - - let trimmed = text.trim_start_matches('\\').trim_end_matches('\\'); - let parts: Vec<&str> = trimmed.split('\\').collect(); - - // Must have at least server and share - if parts.len() < 2 { - return None; - } - - let server = parts[0]; - let share = parts[1]; - - if server.is_empty() || share.is_empty() { - return None; - } - - // Validate all segments (no empty segments from double backslashes) - for segment in &parts { - // Reject empty segments (from consecutive backslashes like \\\\server\\\\share) - if segment.is_empty() { - return None; - } - - // Enforce max component length (255 bytes) - if segment.len() > 255 { - return None; - } - - // Reject components consisting solely of dots (but allow dots in domain names) - // Only reject if the segment is exactly "." or ".." - if *segment == "." || *segment == ".." 
{ - return None; - } - } - - // Reject printf-style placeholders in server or share - if self.contains_printf_placeholder(server) || self.contains_printf_placeholder(share) { - return None; - } - - // Reject reserved Windows device names in server or share - let reserved_names = [ - "CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", - "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", - ]; - let server_upper = server.to_ascii_uppercase(); - let share_upper = share.to_ascii_uppercase(); - for reserved in &reserved_names { - if server_upper == *reserved || share_upper == *reserved { - return None; - } - } - - Some(Tag::FilePath) + classify_unc_path(text) } - /// Detects Windows registry paths in the given text - /// - /// Returns `Some(Tag::RegistryPath)` if a registry path is detected and valid. + /// Classifies registry paths + #[must_use] pub fn classify_registry_path(&self, text: &str) -> Option { - if !REGISTRY_PATH_REGEX.is_match(text) && !REGISTRY_ABBREV_REGEX.is_match(text) { - return None; - } - - if !self.is_valid_registry_path(text) { - return None; - } - - Some(Tag::RegistryPath) + classify_registry_path(text) } - /// Checks if the POSIX path matches known suspicious locations + /// Checks if POSIX path is suspicious + #[must_use] pub fn is_suspicious_posix_path(&self, text: &str) -> bool { - SUSPICIOUS_POSIX_PATHS - .iter() - .any(|prefix| text.starts_with(prefix)) + is_suspicious_posix_path(text) } - /// Checks if the Windows path matches known suspicious locations (case-insensitive) + /// Checks if Windows path is suspicious + #[must_use] pub fn is_suspicious_windows_path(&self, text: &str) -> bool { - let lowered_text = text.to_ascii_lowercase(); - SUSPICIOUS_WINDOWS_PATHS.iter().any(|prefix| { - let lowered_prefix = prefix.to_ascii_lowercase(); - if prefix.starts_with('\\') { - lowered_text.contains(&lowered_prefix) - } else { - lowered_text.starts_with(&lowered_prefix) - } - }) + 
is_suspicious_windows_path(text) } - /// Checks if the registry path matches known persistence locations + /// Checks if registry path is suspicious + #[must_use] pub fn is_suspicious_registry_path(&self, text: &str) -> bool { - SUSPICIOUS_REGISTRY_PATHS - .iter() - .any(|path| self.contains_ascii_case_insensitive(text, path)) - } - - /// Case-insensitive ASCII substring search without allocations - fn contains_ascii_case_insensitive(&self, haystack: &str, needle: &str) -> bool { - if needle.is_empty() { - return true; - } - - let haystack_bytes = haystack.as_bytes(); - let needle_bytes = needle.as_bytes(); - - if needle_bytes.len() > haystack_bytes.len() { - return false; - } - - haystack_bytes - .windows(needle_bytes.len()) - .any(|window| window.eq_ignore_ascii_case(needle_bytes)) + is_suspicious_registry_path(text) } - /// Detects printf-style placeholders to reduce false positives - fn contains_printf_placeholder(&self, text: &str) -> bool { - let mut chars = text.chars().peekable(); - - while let Some(ch) = chars.next() { - if ch == '%' - && let Some(next) = chars.peek() - && matches!(next, 's' | 'd' | 'x' | 'o' | 'u' | 'f') - { - return true; - } - } - - false - } - - /// Checks if text contains ASCII control characters (C0 controls: 0x00-0x1F and DEL: 0x7F) - fn contains_control_chars(&self, text: &str) -> bool { - text.bytes().any(|b| b <= 0x1F || b == 0x7F) - } - - /// Validates POSIX path structure + /// Validates POSIX path + #[must_use] pub fn is_valid_posix_path(&self, text: &str) -> bool { - if text.len() > 4096 { - return false; - } - - if text.contains('\0') || text.contains('\n') || text.contains('\r') { - return false; - } - - if text.contains("//") { - return false; - } - - if text.contains('\\') { - return false; - } - - if self.contains_printf_placeholder(text) { - return false; - } - - let has_known_prefix = KNOWN_POSIX_PREFIXES - .iter() - .any(|prefix| text.starts_with(prefix)); - let is_suspicious = self.is_suspicious_posix_path(text); - - 
if !has_known_prefix && !is_suspicious && text.len() > 2048 { - return false; - } - - true + is_valid_posix_path(text) } - /// Validates Windows path structure + /// Validates Windows path + #[must_use] pub fn is_valid_windows_path(&self, text: &str) -> bool { - // Reject control characters early to prevent regex/prefix matching from being fooled - if self.contains_control_chars(text) { - return false; - } - - if text.len() > 4096 { - return false; - } - - if text.contains('/') { - return false; - } - - if text.contains("\\\\") { - return false; - } - - if self.contains_printf_placeholder(text) { - return false; - } - - let has_known_prefix = KNOWN_WINDOWS_PREFIXES - .iter() - .any(|prefix| text.starts_with(prefix)); - let is_suspicious = self.is_suspicious_windows_path(text); - - if !has_known_prefix && !is_suspicious && text.len() > 2048 { - return false; - } - - true + is_valid_windows_path(text) } - /// Validates Windows registry path structure + /// Validates registry path + #[must_use] pub fn is_valid_registry_path(&self, text: &str) -> bool { - // Reject control characters early to prevent regex/prefix matching from being fooled - if self.contains_control_chars(text) { - return false; - } - - // Maximum length check (4096 bytes) - if text.len() > 4096 { - return false; - } - - if text.contains('/') { - return false; - } - - if text.contains("\\\\") { - return false; - } - - let root = text.split('\\').next().unwrap_or(""); - let root_upper = root.to_ascii_uppercase(); - - if root_upper.starts_with("HKEY_") { - return VALID_REGISTRY_ROOTS - .iter() - .any(|valid| *valid == root_upper); - } - - if root_upper.starts_with("HK") { - return matches!( - root_upper.as_str(), - "HKLM" | "HKCU" | "HKCR" | "HKU" | "HKCC" - ); - } + is_valid_registry_path(text) + } - false + /// Classifies GUIDs + #[must_use] + pub fn classify_guid(&self, text: &str) -> Option { + classify_guid(text) } - /// Classifies IP addresses (IPv4 and IPv6) in the given text - /// - /// This 
method checks for both IPv4 and IPv6 addresses and returns - /// appropriate tags. A string may match both patterns (unlikely but possible). - /// - /// # Arguments - /// - /// * `text` - The text to search for IP addresses - /// - /// # Returns - /// - /// Returns a vector of `Tag` values (`Tag::IPv4` and/or `Tag::IPv6`). - /// The vector may be empty if no IP addresses are found. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::Tag; - /// - /// let classifier = SemanticClassifier::new(); - /// let tags = classifier.classify_ip_addresses("192.168.1.1"); - /// assert_eq!(tags, vec![Tag::IPv4]); - /// - /// let tags = classifier.classify_ip_addresses("::1"); - /// assert_eq!(tags, vec![Tag::IPv6]); - /// - /// let tags = classifier.classify_ip_addresses("not an ip"); - /// assert!(tags.is_empty()); - /// ``` - pub fn classify_ip_addresses(&self, text: &str) -> Vec { - let mut tags = Vec::new(); + /// Classifies email addresses + #[must_use] + pub fn classify_email(&self, text: &str) -> Option { + classify_email(text) + } - // Check for IPv4 - if self.is_ipv4_address(text) { - tags.push(Tag::IPv4); - } + /// Classifies Base64-encoded data + #[must_use] + pub fn classify_base64(&self, text: &str) -> Option { + classify_base64(text) + } - // Check for IPv6 - if self.is_ipv6_address(text) { - tags.push(Tag::IPv6); - } + /// Classifies format strings + #[must_use] + pub fn classify_format_string(&self, text: &str) -> Option { + classify_format_string(text) + } - tags + /// Classifies user agent strings + #[must_use] + pub fn classify_user_agent(&self, text: &str) -> Option { + classify_user_agent(text) } } @@ -1014,7 +350,6 @@ mod tests { use super::*; use crate::types::{Encoding, StringSource}; - /// Helper function to create a test FoundString fn create_test_string(text: &str) -> FoundString { FoundString { text: text.to_string(), @@ -1035,606 +370,125 @@ mod tests { } #[test] - fn 
test_url_detection() { - let classifier = SemanticClassifier::new(); - - // Valid URLs - assert_eq!( - classifier.classify_url("https://example.com"), - Some(Tag::Url) - ); - assert_eq!( - classifier.classify_url("http://api.malware.com/v1/data"), - Some(Tag::Url) - ); - assert_eq!( - classifier.classify_url("https://192.168.1.1:8080/path"), - Some(Tag::Url) - ); - - // Invalid cases (not URLs) - assert_eq!(classifier.classify_url("example.com"), None); - assert_eq!(classifier.classify_url("not a url"), None); - } - - #[test] - fn test_domain_detection() { + fn test_classify_mixed_strings() { let classifier = SemanticClassifier::new(); - // Valid domains - assert_eq!(classifier.classify_domain("example.com"), Some(Tag::Domain)); - assert_eq!( - classifier.classify_domain("api.service.io"), - Some(Tag::Domain) - ); - assert_eq!( - classifier.classify_domain("malware-c2.net"), - Some(Tag::Domain) - ); - - // Valid domains with mixed-case TLDs - assert_eq!(classifier.classify_domain("example.COM"), Some(Tag::Domain)); - assert_eq!( - classifier.classify_domain("api.service.IO"), - Some(Tag::Domain) - ); - assert_eq!( - classifier.classify_domain("malware-c2.NET"), - Some(Tag::Domain) - ); - assert_eq!(classifier.classify_domain("Example.OrG"), Some(Tag::Domain)); - - // URLs should not match as domains - assert_eq!(classifier.classify_domain("https://example.com"), None); - - // Invalid domains - assert_eq!(classifier.classify_domain("invalid"), None); - assert_eq!(classifier.classify_domain("too.short.x"), None); - } - - #[test] - fn test_url_classification() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("https://example.com/api"); + // URL + let url_string = create_test_string("https://example.com/api"); + let tags = classifier.classify(&url_string); + assert!(tags.contains(&Tag::Url)); - let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Url)); - } + // Domain + let 
domain_string = create_test_string("api.example.com"); + let tags = classifier.classify(&domain_string); + assert!(tags.contains(&Tag::Domain)); - #[test] - fn test_domain_classification() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("example.com"); + // IPv4 + let ipv4_string = create_test_string("192.168.1.1"); + let tags = classifier.classify(&ipv4_string); + assert!(tags.contains(&Tag::IPv4)); - let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Domain)); + // Windows path + let path_string = create_test_string("C:\\Windows\\System32\\cmd.exe"); + let tags = classifier.classify(&path_string); + assert!(tags.contains(&Tag::FilePath)); } #[test] - fn test_url_not_double_tagged() { + fn test_classify_posix_path_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("https://example.com"); + let found_string = create_test_string("/usr/local/bin/app"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Url)); - // Ensure it's NOT also tagged as Domain - assert!(!tags.iter().any(|t| matches!(t, Tag::Domain))); - } - - #[test] - fn test_tld_validation() { - let classifier = SemanticClassifier::new(); - - // Valid TLDs - assert!(classifier.has_valid_tld("example.com")); - assert!(classifier.has_valid_tld("test.net")); - assert!(classifier.has_valid_tld("site.org")); - assert!(classifier.has_valid_tld("api.io")); - - // Valid TLDs with mixed case (should be normalized) - assert!(classifier.has_valid_tld("example.COM")); - assert!(classifier.has_valid_tld("test.NET")); - assert!(classifier.has_valid_tld("site.ORG")); - assert!(classifier.has_valid_tld("api.IO")); - assert!(classifier.has_valid_tld("Example.CoM")); - - // Invalid TLDs - assert!(!classifier.has_valid_tld("example.x")); - assert!(!classifier.has_valid_tld("test.invalid")); - 
assert!(!classifier.has_valid_tld("site.toolong123")); + assert!(tags.contains(&Tag::FilePath)); } #[test] - fn test_edge_cases() { + fn test_classify_windows_path_in_found_string() { let classifier = SemanticClassifier::new(); + let found_string = create_test_string("C:\\Program Files\\Application\\app.exe"); - // Empty string - let empty = create_test_string(""); - let tags = classifier.classify(&empty); - assert_eq!(tags.len(), 0); - - // Very long domain (within RFC 1035 limits) - let long_domain = "a".repeat(60) + ".com"; - let found_string = create_test_string(&long_domain); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Domain)); - - // String with no valid domain pattern - let no_domain = create_test_string("just some text without domains"); - let tags = classifier.classify(&no_domain); - assert_eq!(tags.len(), 0); - - // Malformed URL - let malformed = create_test_string("http://"); - let tags = classifier.classify(&malformed); - assert_eq!(tags.len(), 0); - } - - #[test] - fn test_ipv4_valid_addresses() { - let classifier = SemanticClassifier::new(); - - // Valid IPv4 addresses - assert!(classifier.is_ipv4_address("192.168.1.1")); - assert!(classifier.is_ipv4_address("10.0.0.1")); - assert!(classifier.is_ipv4_address("8.8.8.8")); - assert!(classifier.is_ipv4_address("1.1.1.1")); - assert!(classifier.is_ipv4_address("127.0.0.1")); - assert!(classifier.is_ipv4_address("0.0.0.0")); - assert!(classifier.is_ipv4_address("255.255.255.255")); - } - - #[test] - fn test_ipv4_invalid_addresses() { - let classifier = SemanticClassifier::new(); - - // Invalid IPv4 addresses - assert!(!classifier.is_ipv4_address("256.1.1.1")); // Octet > 255 - assert!(!classifier.is_ipv4_address("192.168.1")); // Missing octet - assert!(!classifier.is_ipv4_address("192.168.1.1.1")); // Too many octets - assert!(!classifier.is_ipv4_address("999.999.999.999")); // All octets > 255 - 
assert!(!classifier.is_ipv4_address("192.168.01.1")); // Leading zero (invalid format) - } - - #[test] - fn test_ipv4_with_port() { - let classifier = SemanticClassifier::new(); - - // IPv4 addresses with ports should be detected - assert!(classifier.is_ipv4_address("192.168.1.1:8080")); - assert!(classifier.is_ipv4_address("10.0.0.1:443")); - assert!(classifier.is_ipv4_address("127.0.0.1:3000")); - } - - #[test] - fn test_ipv4_version_numbers() { - let classifier = SemanticClassifier::new(); - - // Valid IPv4 addresses that could also be version numbers are accepted - // It's the caller's responsibility to disambiguate based on context - assert!(classifier.is_ipv4_address("1.2.3.4")); - assert!(classifier.is_ipv4_address("2.0.1.0")); - assert!(classifier.is_ipv4_address("10.5.2.1")); - assert!(classifier.is_ipv4_address("10.5.2.20")); - } - - #[test] - fn test_ipv4_edge_cases() { - let classifier = SemanticClassifier::new(); - - // Boundary values - assert!(classifier.is_ipv4_address("0.0.0.0")); - assert!(classifier.is_ipv4_address("255.255.255.255")); - assert!(classifier.is_ipv4_address("192.0.0.1")); - assert!(classifier.is_ipv4_address("0.255.0.255")); - - // Private network addresses - assert!(classifier.is_ipv4_address("192.168.0.1")); - assert!(classifier.is_ipv4_address("10.0.0.1")); - assert!(classifier.is_ipv4_address("172.16.0.1")); - } - - #[test] - fn test_ipv6_full_notation() { - let classifier = SemanticClassifier::new(); - - // Full IPv6 notation - assert!(classifier.is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334")); - assert!(classifier.is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334")); - } - - #[test] - fn test_ipv6_compressed() { - let classifier = SemanticClassifier::new(); - - // Compressed IPv6 notation - assert!(classifier.is_ipv6_address("2001:db8::1")); - assert!(classifier.is_ipv6_address("::1")); - assert!(classifier.is_ipv6_address("fe80::1")); - assert!(classifier.is_ipv6_address("::")); - } - - #[test] - fn 
test_ipv6_mixed_notation() { - let classifier = SemanticClassifier::new(); - - // Mixed IPv4/IPv6 notation - assert!(classifier.is_ipv6_address("::ffff:192.0.2.1")); - assert!(classifier.is_ipv6_address("64:ff9b::192.0.2.1")); - } - - #[test] - fn test_ipv6_invalid() { - let classifier = SemanticClassifier::new(); - - // Invalid IPv6 addresses - assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex - assert!(!classifier.is_ipv6_address("2001:db8::1::2")); // Double :: - assert!(!classifier.is_ipv6_address("2001:db8:1")); // Too short - } - - #[test] - fn test_ipv6_with_brackets() { - let classifier = SemanticClassifier::new(); - - // IPv6 addresses with brackets - assert!(classifier.is_ipv6_address("[2001:db8::1]")); - assert!(classifier.is_ipv6_address("[::1]")); - } - - #[test] - fn test_ipv6_with_port() { - let classifier = SemanticClassifier::new(); - - // IPv6 addresses with brackets and ports - assert!(classifier.is_ipv6_address("[2001:db8::1]:8080")); - assert!(classifier.is_ipv6_address("[::1]:8080")); + assert!(tags.contains(&Tag::FilePath)); } #[test] - fn test_classify_ipv4() { + fn test_classify_registry_path_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("192.168.1.1"); + let found_string = + create_test_string("HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv4)); + assert!(tags.contains(&Tag::RegistryPath)); } #[test] - fn test_classify_ipv6() { + fn test_no_false_positives_on_random_data() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("::1"); + let found_string = create_test_string("x9qz1p0t8v7w6r5y4u3i2o1p"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv6)); - } - - #[test] - fn test_classify_no_ip() { - let classifier = SemanticClassifier::new(); - 
let found_string = create_test_string("not an ip address"); - - let tags = classifier.classify_ip_addresses(&found_string.text); assert!(tags.is_empty()); } #[test] - fn test_classify_ipv4_with_port() { + fn test_guid_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("192.168.1.1:8080"); + let found_string = create_test_string("{12345678-1234-1234-1234-123456789ABC}"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv4)); + assert!(tags.contains(&Tag::Guid)); } #[test] - fn test_classify_ipv6_with_brackets_and_port() { + fn test_email_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("[::1]:8080"); + let found_string = create_test_string("user@example.com"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv6)); - } - - #[test] - fn test_posix_absolute_path() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_posix_path("/usr/bin/bash"), - Some(Tag::FilePath) - ); - assert_eq!( - classifier.classify_posix_path("/etc/passwd"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_posix_home_directory() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_posix_path("/home/user/.bashrc"), - Some(Tag::FilePath) - ); - assert_eq!( - classifier.classify_posix_path("/home/user/.config/app"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_posix_with_spaces() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_posix_path("/Users/John Doe/Documents/file.txt"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_posix_system_directories() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("/usr/"), Some(Tag::FilePath)); - assert_eq!(classifier.classify_posix_path("/etc/"), Some(Tag::FilePath)); - 
assert_eq!(classifier.classify_posix_path("/var/"), Some(Tag::FilePath)); - } - - #[test] - fn test_posix_suspicious_paths() { - let classifier = SemanticClassifier::new(); - - assert!(classifier.is_suspicious_posix_path("/tmp/malware")); - assert!(classifier.is_suspicious_posix_path("/etc/cron.d/backdoor")); - } - - #[test] - fn test_posix_too_short() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("/a"), Some(Tag::FilePath)); - } - - #[test] - fn test_posix_invalid() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("usr/bin/bash"), None); - } - - #[test] - fn test_posix_with_null_bytes() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("/tmp/evil\0bin"), None); - } - - #[test] - fn test_windows_absolute_path() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("C:\\Windows\\System32\\cmd.exe"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_program_files() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("C:\\Program Files (x86)\\App"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_with_spaces() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("D:\\My Documents\\file.txt"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_different_drives() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("D:\\"), - Some(Tag::FilePath) - ); - assert_eq!( - classifier.classify_windows_path("E:\\Data\\"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_suspicious_paths() { - let classifier = SemanticClassifier::new(); - - assert!(classifier.is_suspicious_windows_path("C:\\Windows\\Temp\\evil.exe")); - } - - #[test] - fn test_windows_case_insensitive() { - let classifier = SemanticClassifier::new(); - 
- assert_eq!( - classifier.classify_windows_path("c:\\windows\\"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_invalid() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_windows_path("C:/forward/slash"), None); - } - - #[test] - fn test_windows_invalid_drive() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_windows_path("1:\\path"), None); - } - - #[test] - fn test_unc_path() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_unc_path("\\\\server\\share\\file.txt"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_unc_with_domain() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_unc_path("\\\\server.domain.com\\share\\"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_unc_invalid() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_unc_path("\\\\\\\\"), None); - assert_eq!(classifier.classify_unc_path("\\\\server"), None); - } - - #[test] - fn test_registry_run_key() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path( - "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" - ), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn test_registry_current_user() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKEY_CURRENT_USER\\Software\\App\\Settings"), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn test_registry_abbreviated_hklm() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKLM\\System\\CurrentControlSet"), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn test_registry_abbreviated_hkcu() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKCU\\Software\\Microsoft"), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn 
test_registry_persistence_run() { - let classifier = SemanticClassifier::new(); - - assert!(classifier.is_suspicious_registry_path( - "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" - )); - } - - #[test] - fn test_registry_invalid_root() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKEY_INVALID\\Path"), - None - ); + assert!(tags.contains(&Tag::Email)); } #[test] - fn test_registry_forward_slash() { + fn test_base64_in_found_string() { let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_registry_path("HKLM/Software"), None); - } - - #[test] - fn test_classify_mixed_strings() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("https://example.com"); + let found_string = create_test_string("SGVsbG8gV29ybGQh"); let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::Url)); + assert!(tags.contains(&Tag::Base64)); } #[test] - fn test_classify_posix_path_in_found_string() { + fn test_format_string_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("/usr/bin/bash"); + let found_string = create_test_string("Error: %s at line %d"); let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::FilePath)); + assert!(tags.contains(&Tag::FormatString)); } #[test] - fn test_classify_windows_path_in_found_string() { + fn test_user_agent_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("C:\\Windows\\System32\\cmd.exe"); + let found_string = + create_test_string("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::FilePath)); + assert!(tags.contains(&Tag::UserAgent)); } #[test] - fn test_classify_registry_path_in_found_string() { + fn test_multiple_tags_format_and_base64_not_both() { let classifier = 
SemanticClassifier::new(); - let found_string = create_test_string( - "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run", - ); - - let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::RegistryPath)); - } - #[test] - fn test_no_false_positives_on_random_data() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("x9qz1p0t8v7w6r5y4u3i2o1p"); + // Format string should get FormatString tag + let format = create_test_string("Hello %s, your score is %d"); + let tags = classifier.classify(&format); + assert!(tags.contains(&Tag::FormatString)); - let tags = classifier.classify(&found_string); - assert!(tags.is_empty()); + // Pure Base64 should get Base64 tag + let base64 = create_test_string("VGhpcyBpcyBhIHRlc3Q="); + let tags = classifier.classify(&base64); + assert!(tags.contains(&Tag::Base64)); } } diff --git a/src/classification/symbols.rs b/src/classification/symbols.rs new file mode 100644 index 0000000..69361bb --- /dev/null +++ b/src/classification/symbols.rs @@ -0,0 +1,361 @@ +//! Symbol demangling for Rust and C++ symbols +//! +//! This module provides functionality to detect and demangle mangled symbols +//! from compiled Rust binaries. When a mangled symbol is detected, the original +//! mangled form is preserved in `FoundString.original_text` while the demangled +//! human-readable form replaces `FoundString.text`. +//! +//! # Supported Symbol Formats +//! +//! - **Rust legacy mangling**: Symbols starting with `_ZN` (uses Itanium ABI-like encoding) +//! - **Rust v0 mangling**: Symbols starting with `_R` (new Rust-specific encoding) +//! +//! # Usage +//! +//! ```rust +//! use stringy::classification::SymbolDemangler; +//! use stringy::types::{FoundString, Encoding, StringSource, Tag}; +//! +//! let demangler = SymbolDemangler::new(); +//! let mut found_string = FoundString { +//! text: "_ZN4core3fmt5Write9write_str17h1234567890abcdefE".to_string(), +//! original_text: None, +//! 
encoding: Encoding::Ascii, +//! offset: 0, +//! rva: None, +//! section: None, +//! length: 47, +//! tags: Vec::new(), +//! score: 0, +//! section_weight: None, +//! semantic_boost: None, +//! noise_penalty: None, +//! source: StringSource::ImportName, +//! confidence: 1.0, +//! }; +//! +//! demangler.demangle(&mut found_string); +//! // found_string.text now contains the demangled symbol +//! // found_string.original_text contains the original mangled form +//! // found_string.tags contains Tag::DemangledSymbol +//! ``` + +use crate::types::{FoundString, Tag}; + +/// Symbol demangler for Rust symbols +/// +/// Uses the `rustc-demangle` crate to convert mangled Rust symbols into +/// human-readable form while preserving the original mangled text. +#[derive(Debug, Default, Clone)] +pub struct SymbolDemangler; + +impl SymbolDemangler { + /// Create a new instance of the symbol demangler + #[must_use] + pub fn new() -> Self { + Self + } + + /// Check if a symbol appears to be a mangled Rust symbol + /// + /// Returns `true` if the symbol starts with known Rust mangling prefixes: + /// - `_ZN` - Rust legacy mangling (Itanium ABI-like) + /// - `_R` - Rust v0 mangling scheme + /// + /// # Arguments + /// + /// * `symbol` - The symbol string to check + /// + /// # Returns + /// + /// Returns `true` if the symbol appears to be mangled, `false` otherwise. 
+ /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SymbolDemangler; + /// + /// let demangler = SymbolDemangler::new(); + /// assert!(demangler.is_mangled("_ZN4core3fmt5Write9write_str17h1234567890abcdefE")); + /// assert!(demangler.is_mangled("_RNvNtCs123_4core3fmt5write")); + /// assert!(!demangler.is_mangled("printf")); + /// ``` + #[must_use] + pub fn is_mangled(&self, symbol: &str) -> bool { + // Rust legacy mangling (Itanium ABI-like) + if symbol.starts_with("_ZN") { + return true; + } + + // Rust v0 mangling scheme + if symbol.starts_with("_R") { + return true; + } + + false + } + + /// Attempt to demangle a symbol in a `FoundString` + /// + /// If the string appears to be a mangled Rust symbol and can be successfully + /// demangled: + /// - The original mangled form is stored in `original_text` + /// - The demangled form replaces `text` + /// - `Tag::DemangledSymbol` is added to the tags + /// + /// If demangling fails or the symbol is not mangled, the `FoundString` is + /// left unchanged. 
+ /// + /// # Arguments + /// + /// * `string` - The `FoundString` to process (modified in-place) + /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SymbolDemangler; + /// use stringy::types::{FoundString, Encoding, StringSource, Tag}; + /// + /// let demangler = SymbolDemangler::new(); + /// let mut found_string = FoundString { + /// text: "_ZN4core3fmt5Write9write_str17h1234567890abcdefE".to_string(), + /// original_text: None, + /// encoding: Encoding::Ascii, + /// offset: 0, + /// rva: None, + /// section: None, + /// length: 47, + /// tags: Vec::new(), + /// score: 0, + /// section_weight: None, + /// semantic_boost: None, + /// noise_penalty: None, + /// source: StringSource::ImportName, + /// confidence: 1.0, + /// }; + /// + /// demangler.demangle(&mut found_string); + /// assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + /// assert!(found_string.original_text.is_some()); + /// ``` + pub fn demangle(&self, string: &mut FoundString) { + // Only attempt demangling if it looks like a mangled symbol + if !self.is_mangled(&string.text) { + return; + } + + // Attempt to demangle using rustc-demangle + let demangled = rustc_demangle::demangle(&string.text); + let demangled_str = demangled.to_string(); + + // Check if demangling actually produced a different result + // If the demangled form equals the original, demangling failed + if demangled_str == string.text { + return; + } + + // Store original mangled form and replace with demangled + string.original_text = Some(string.text.clone()); + string.text = demangled_str; + + // Add the DemangledSymbol tag if not already present + if !string.tags.contains(&Tag::DemangledSymbol) { + string.tags.push(Tag::DemangledSymbol); + } + } + + /// Try to demangle a symbol string directly + /// + /// This is a convenience method for demangling without a `FoundString`. 
+ /// + /// # Arguments + /// + /// * `symbol` - The mangled symbol string + /// + /// # Returns + /// + /// Returns `Some(demangled)` if demangling succeeded and produced a different + /// result, `None` otherwise. + /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SymbolDemangler; + /// + /// let demangler = SymbolDemangler::new(); + /// let result = demangler.try_demangle("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + /// assert!(result.is_some()); + /// + /// let result = demangler.try_demangle("printf"); + /// assert!(result.is_none()); + /// ``` + #[must_use] + pub fn try_demangle(&self, symbol: &str) -> Option<String> { + if !self.is_mangled(symbol) { + return None; + } + + let demangled = rustc_demangle::demangle(symbol); + let demangled_str = demangled.to_string(); + + // Check if demangling actually worked + if demangled_str == symbol { + return None; + } + + Some(demangled_str) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{Encoding, StringSource}; + + fn create_test_string(text: &str) -> FoundString { + FoundString { + text: text.to_string(), + original_text: None, + encoding: Encoding::Ascii, + offset: 0, + rva: None, + section: None, + length: text.len() as u32, + tags: Vec::new(), + score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, + source: StringSource::ImportName, + confidence: 1.0, + } + } + + #[test] + fn test_is_mangled_rust_legacy() { + let demangler = SymbolDemangler::new(); + + // Legacy Rust mangling (_ZN prefix) + assert!(demangler.is_mangled("_ZN4core3fmt5Write9write_str17h1234567890abcdefE")); + assert!(demangler.is_mangled("_ZN3std2io5stdio6_print17h1234567890abcdefE")); + } + + #[test] + fn test_is_mangled_rust_v0() { + let demangler = SymbolDemangler::new(); + + // Rust v0 mangling (_R prefix) + assert!(demangler.is_mangled("_RNvNtCs123_4core3fmt5write")); + assert!(demangler.is_mangled("_RNvCs123_5hello4main")); + } + + #[test] + fn 
test_is_mangled_not_mangled() { + let demangler = SymbolDemangler::new(); + + // Regular symbols should not be detected as mangled + assert!(!demangler.is_mangled("printf")); + assert!(!demangler.is_mangled("malloc")); + assert!(!demangler.is_mangled("main")); + assert!(!demangler.is_mangled("CreateFileW")); + assert!(!demangler.is_mangled("")); + } + + #[test] + fn test_demangle_rust_symbol() { + let demangler = SymbolDemangler::new(); + let mut found_string = + create_test_string("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + + demangler.demangle(&mut found_string); + + // Should have been demangled + assert!(found_string.original_text.is_some()); + assert_eq!( + found_string.original_text.as_ref().unwrap(), + "_ZN4core3fmt5Write9write_str17h1234567890abcdefE" + ); + assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + // Demangled text should be different from original + assert_ne!( + found_string.text, + "_ZN4core3fmt5Write9write_str17h1234567890abcdefE" + ); + } + + #[test] + fn test_demangle_non_mangled() { + let demangler = SymbolDemangler::new(); + let mut found_string = create_test_string("printf"); + + demangler.demangle(&mut found_string); + + // Should not have been modified + assert_eq!(found_string.text, "printf"); + assert!(found_string.original_text.is_none()); + assert!(!found_string.tags.contains(&Tag::DemangledSymbol)); + } + + #[test] + fn test_try_demangle_success() { + let demangler = SymbolDemangler::new(); + let result = demangler.try_demangle("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + + assert!(result.is_some()); + let demangled = result.unwrap(); + assert!(!demangled.is_empty()); + assert_ne!( + demangled, + "_ZN4core3fmt5Write9write_str17h1234567890abcdefE" + ); + } + + #[test] + fn test_try_demangle_failure() { + let demangler = SymbolDemangler::new(); + + assert!(demangler.try_demangle("printf").is_none()); + assert!(demangler.try_demangle("").is_none()); + assert!(demangler.try_demangle("main").is_none()); 
+ } + + #[test] + fn test_demangle_preserves_existing_tags() { + let demangler = SymbolDemangler::new(); + let mut found_string = + create_test_string("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + found_string.tags.push(Tag::Import); + + demangler.demangle(&mut found_string); + + // Should have both the original tag and the new demangled tag + assert!(found_string.tags.contains(&Tag::Import)); + assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + } + + #[test] + fn test_demangle_idempotent() { + let demangler = SymbolDemangler::new(); + let mut found_string = + create_test_string("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + + demangler.demangle(&mut found_string); + let first_text = found_string.text.clone(); + let first_original = found_string.original_text.clone(); + + // Calling demangle again should not change anything + demangler.demangle(&mut found_string); + + assert_eq!(found_string.text, first_text); + assert_eq!(found_string.original_text, first_original); + // Should only have one DemangledSymbol tag + assert_eq!( + found_string + .tags + .iter() + .filter(|t| matches!(t, Tag::DemangledSymbol)) + .count(), + 1 + ); + } +} diff --git a/src/types.rs b/src/types.rs index 5347f33..b8cdfb0 100644 --- a/src/types.rs +++ b/src/types.rs @@ -32,6 +32,8 @@ pub enum Tag { FormatString, #[serde(rename = "user-agent-ish")] UserAgent, + #[serde(rename = "demangled")] + DemangledSymbol, Import, Export, Version, diff --git a/tests/snapshots/classification_integration__classification_snapshots.snap b/tests/snapshots/classification_integration__classification_snapshots.snap index 21b9b32..9c190dd 100644 --- a/tests/snapshots/classification_integration__classification_snapshots.snap +++ b/tests/snapshots/classification_integration__classification_snapshots.snap @@ -1,6 +1,5 @@ --- source: tests/classification_integration.rs -assertion_line: 150 expression: snapshot --- [ @@ -25,6 +24,7 @@ expression: snapshot ( "C:\\Windows\\System32\\cmd.exe", 
[ + "Domain", "FilePath", ], ), From 261b4d31bef1c59824fc16b2addfda15b02d7066 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 00:08:55 -0500 Subject: [PATCH 2/7] Remove file extensions from valid TLDs list File extensions such as exe, dll, sys, bin, dat, log, tmp, and bak are no longer treated as valid TLDs in domain classification. Added tests to ensure these extensions are not classified as domains, and updated documentation to reflect Rust-only symbol demangling. --- src/classification/patterns/network.rs | 15 ++++++++++++++- src/classification/symbols.rs | 2 +- ...ion_integration__classification_snapshots.snap | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/classification/patterns/network.rs b/src/classification/patterns/network.rs index c45b8ca..d754d05 100644 --- a/src/classification/patterns/network.rs +++ b/src/classification/patterns/network.rs @@ -30,7 +30,7 @@ const COMMON_TLDS: &[&str] = &[ "info", "biz", "name", "pro", "aero", "coop", "museum", "travel", "jobs", "mobi", "tel", "asia", "cat", "xxx", "app", "dev", "page", "blog", "shop", "store", "online", "site", "website", "tech", "cloud", "ai", "ml", "tv", "me", "cc", "ws", "bz", "nu", "tk", "ga", "cf", - "gq", "exe", "dll", "sys", "bin", "dat", "log", "tmp", "bak", + "gq", ]; /// Checks if the domain has a valid TLD @@ -166,4 +166,17 @@ mod tests { // IP-like domains (should be handled by IP classifier) assert!(classify_domain("192.168.1.1").is_none()); } + + #[test] + fn test_file_extensions_not_domains() { + // File extensions should NOT be treated as valid TLDs + assert!(classify_domain("cmd.exe").is_none()); + assert!(classify_domain("kernel32.dll").is_none()); + assert!(classify_domain("ntoskrnl.sys").is_none()); + assert!(classify_domain("program.bin").is_none()); + assert!(classify_domain("data.dat").is_none()); + assert!(classify_domain("debug.log").is_none()); + assert!(classify_domain("temp.tmp").is_none()); + 
assert!(classify_domain("backup.bak").is_none()); + } } diff --git a/src/classification/symbols.rs b/src/classification/symbols.rs index 69361bb..a1337fd 100644 --- a/src/classification/symbols.rs +++ b/src/classification/symbols.rs @@ -1,4 +1,4 @@ -//! Symbol demangling for Rust and C++ symbols +//! Symbol demangling for Rust symbols //! //! This module provides functionality to detect and demangle mangled symbols //! from compiled Rust binaries. When a mangled symbol is detected, the original diff --git a/tests/snapshots/classification_integration__classification_snapshots.snap b/tests/snapshots/classification_integration__classification_snapshots.snap index 9c190dd..f110d38 100644 --- a/tests/snapshots/classification_integration__classification_snapshots.snap +++ b/tests/snapshots/classification_integration__classification_snapshots.snap @@ -24,7 +24,6 @@ expression: snapshot ( "C:\\Windows\\System32\\cmd.exe", [ - "Domain", "FilePath", ], ), From 503199f0ece6cfbc31e1a9f3829dce4d13a6da22 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 00:18:34 -0500 Subject: [PATCH 3/7] Add C++ symbol demangling support Integrates the cpp_demangle crate and extends SymbolDemangler to handle C++ Itanium ABI mangled symbols (e.g., _Z, _ZN). Updates documentation, detection logic, and adds tests for C++ symbol demangling alongside Rust symbol support. 
--- Cargo.toml | 1 + src/classification/symbols.rs | 215 ++++++++++++++++++++++++++++------ 2 files changed, 182 insertions(+), 34 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 88c8bda..ecb5d42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ path = "src/main.rs" [dependencies] clap = { version = "4.5.54", features = [ "derive" ] } +cpp_demangle = "0.5.1" entropy = "0.4.2" goblin = "0.10.4" once_cell = "1.21.3" diff --git a/src/classification/symbols.rs b/src/classification/symbols.rs index a1337fd..27b7cd2 100644 --- a/src/classification/symbols.rs +++ b/src/classification/symbols.rs @@ -1,14 +1,15 @@ -//! Symbol demangling for Rust symbols +//! Symbol demangling for Rust and C++ symbols //! //! This module provides functionality to detect and demangle mangled symbols -//! from compiled Rust binaries. When a mangled symbol is detected, the original -//! mangled form is preserved in `FoundString.original_text` while the demangled -//! human-readable form replaces `FoundString.text`. +//! from compiled Rust and C++ binaries. When a mangled symbol is detected, the +//! original mangled form is preserved in `FoundString.original_text` while the +//! demangled human-readable form replaces `FoundString.text`. //! //! # Supported Symbol Formats //! //! - **Rust legacy mangling**: Symbols starting with `_ZN` (uses Itanium ABI-like encoding) //! - **Rust v0 mangling**: Symbols starting with `_R` (new Rust-specific encoding) +//! - **C++ Itanium ABI**: Symbols starting with `_Z` (used by GCC, Clang, and others) //! //! # Usage //! @@ -41,11 +42,13 @@ //! ``` use crate::types::{FoundString, Tag}; +use cpp_demangle::Symbol as CppSymbol; -/// Symbol demangler for Rust symbols +/// Symbol demangler for Rust and C++ symbols /// -/// Uses the `rustc-demangle` crate to convert mangled Rust symbols into -/// human-readable form while preserving the original mangled text. 
+/// Uses the `rustc-demangle` crate for Rust symbols and the `cpp_demangle` +/// crate for C++ symbols. Converts mangled symbols into human-readable form +/// while preserving the original mangled text. #[derive(Debug, Default, Clone)] pub struct SymbolDemangler; @@ -56,11 +59,12 @@ impl SymbolDemangler { Self } - /// Check if a symbol appears to be a mangled Rust symbol + /// Check if a symbol appears to be a mangled Rust or C++ symbol /// - /// Returns `true` if the symbol starts with known Rust mangling prefixes: - /// - `_ZN` - Rust legacy mangling (Itanium ABI-like) + /// Returns `true` if the symbol starts with known mangling prefixes: + /// - `_ZN` - Rust legacy mangling or C++ nested names (Itanium ABI) /// - `_R` - Rust v0 mangling scheme + /// - `_Z` - C++ Itanium ABI mangling (used by GCC, Clang) /// /// # Arguments /// @@ -76,19 +80,24 @@ impl SymbolDemangler { /// use stringy::classification::SymbolDemangler; /// /// let demangler = SymbolDemangler::new(); + /// // Rust symbols /// assert!(demangler.is_mangled("_ZN4core3fmt5Write9write_str17h1234567890abcdefE")); /// assert!(demangler.is_mangled("_RNvNtCs123_4core3fmt5write")); + /// // C++ symbols + /// assert!(demangler.is_mangled("_ZN3foo3barEv")); + /// assert!(demangler.is_mangled("_Z3foov")); /// assert!(!demangler.is_mangled("printf")); /// ``` #[must_use] pub fn is_mangled(&self, symbol: &str) -> bool { - // Rust legacy mangling (Itanium ABI-like) - if symbol.starts_with("_ZN") { + // Rust v0 mangling scheme (Rust-specific, check first) + if symbol.starts_with("_R") { return true; } - // Rust v0 mangling scheme - if symbol.starts_with("_R") { + // Itanium ABI mangling (used by both Rust legacy and C++) + // This includes _ZN (nested names), _ZL (local), _ZTV (vtable), etc. 
+ if symbol.starts_with("_Z") { return true; } @@ -97,12 +106,15 @@ /// Attempt to demangle a symbol in a `FoundString` /// - /// If the string appears to be a mangled Rust symbol and can be successfully - /// demangled: + /// If the string appears to be a mangled Rust or C++ symbol and can be + /// successfully demangled: /// - The original mangled form is stored in `original_text` /// - The demangled form replaces `text` /// - `Tag::DemangledSymbol` is added to the tags /// + /// The demangler tries Rust demangling first (for `_R` and `_ZN` prefixes), + /// then falls back to C++ demangling for `_Z` prefixes. + /// /// If demangling fails or the symbol is not mangled, the `FoundString` is /// left unchanged. /// @@ -144,15 +156,11 @@ return; } - // Attempt to demangle using rustc-demangle - let demangled = rustc_demangle::demangle(&string.text); - let demangled_str = demangled.to_string(); - - // Check if demangling actually produced a different result - // If the demangled form equals the original, demangling failed - if demangled_str == string.text { - return; - } + // Try to demangle + let demangled_str = match self.try_demangle_internal(&string.text) { + Some(s) => s, + None => return, + }; // Store original mangled form and replace with demangled string.original_text = Some(string.text.clone()); @@ -164,9 +172,59 @@ } } + /// Internal demangling logic that tries Rust then C++ + fn try_demangle_internal(&self, symbol: &str) -> Option<String> { + // For Rust v0 symbols (_R prefix), only try Rust demangling + if symbol.starts_with("_R") { + return self.try_rust_demangle(symbol); + } + + // For _Z prefixed symbols, try Rust first (for legacy Rust symbols), + // then fall back to C++ if Rust demangling doesn't work + if symbol.starts_with("_Z") { + // Try Rust first (handles _ZN Rust legacy symbols) + if let Some(demangled) = self.try_rust_demangle(symbol) { + return Some(demangled); + } + + // Fall 
back to C++ demangling + return self.try_cpp_demangle(symbol); + } + + None + } + + /// Try to demangle as a Rust symbol + fn try_rust_demangle(&self, symbol: &str) -> Option<String> { + let demangled = rustc_demangle::demangle(symbol); + let demangled_str = demangled.to_string(); + + // Check if demangling actually produced a different result + if demangled_str != symbol { + Some(demangled_str) + } else { + None + } + } + + /// Try to demangle as a C++ symbol + fn try_cpp_demangle(&self, symbol: &str) -> Option<String> { + // Parse the symbol using cpp_demangle + let parsed = CppSymbol::new(symbol).ok()?; + let demangled_str = parsed.demangle().ok()?; + + // Check if demangling actually produced a different result + if demangled_str != symbol { + Some(demangled_str) + } else { + None + } + } + /// Try to demangle a symbol string directly /// /// This is a convenience method for demangling without a `FoundString`. + /// Supports both Rust and C++ mangled symbols. /// /// # Arguments /// @@ -183,9 +241,16 @@ /// use stringy::classification::SymbolDemangler; /// /// let demangler = SymbolDemangler::new(); + /// + /// // Rust symbol /// let result = demangler.try_demangle("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); /// assert!(result.is_some()); /// + /// // C++ symbol + /// let result = demangler.try_demangle("_ZN3foo3barEv"); + /// assert!(result.is_some()); + /// + /// // Not mangled /// let result = demangler.try_demangle("printf"); /// assert!(result.is_none()); /// ``` @@ -195,15 +260,7 @@ return None; } - let demangled = rustc_demangle::demangle(symbol); - let demangled_str = demangled.to_string(); - - // Check if demangling actually worked - if demangled_str == symbol { - return None; - } - - Some(demangled_str) + self.try_demangle_internal(symbol) } } @@ -358,4 +415,94 @@ 1 ); } + + // C++ demangling tests + + #[test] + fn test_is_mangled_cpp_symbols() { + let demangler = SymbolDemangler::new(); + + // C++ 
Itanium ABI mangled symbols + assert!(demangler.is_mangled("_ZN3foo3barEv")); // foo::bar() + assert!(demangler.is_mangled("_Z3foov")); // foo() + assert!(demangler.is_mangled("_ZN9__gnu_cxx13new_allocatorIcE10deallocateEPcm")); + assert!(demangler.is_mangled("_ZNSt6vectorIiSaIiEE9push_backERKi")); + assert!(demangler.is_mangled("_ZTV5MyClass")); // vtable for MyClass + assert!(demangler.is_mangled("_ZTI5MyClass")); // typeinfo for MyClass + } + + #[test] + fn test_demangle_cpp_symbol() { + let demangler = SymbolDemangler::new(); + let mut found_string = create_test_string("_ZN3foo3barEv"); + + demangler.demangle(&mut found_string); + + // Should have been demangled + assert!(found_string.original_text.is_some()); + assert_eq!( + found_string.original_text.as_ref().unwrap(), + "_ZN3foo3barEv" + ); + assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + // Demangled text should contain "foo" and "bar" + assert!(found_string.text.contains("foo")); + assert!(found_string.text.contains("bar")); + } + + #[test] + fn test_try_demangle_cpp_success() { + let demangler = SymbolDemangler::new(); + + // Simple C++ function + let result = demangler.try_demangle("_Z3foov"); + assert!(result.is_some()); + let demangled = result.unwrap(); + assert!(demangled.contains("foo")); + + // Namespaced C++ function + let result = demangler.try_demangle("_ZN3foo3barEv"); + assert!(result.is_some()); + let demangled = result.unwrap(); + assert!(demangled.contains("foo")); + assert!(demangled.contains("bar")); + } + + #[test] + fn test_demangle_cpp_with_parameters() { + let demangler = SymbolDemangler::new(); + + // C++ function with int parameter: void foo(int) + let result = demangler.try_demangle("_Z3fooi"); + assert!(result.is_some()); + let demangled = result.unwrap(); + assert!(demangled.contains("foo")); + assert!(demangled.contains("int")); + } + + #[test] + fn test_demangle_cpp_template() { + let demangler = SymbolDemangler::new(); + + // C++ template: std::vector + let 
result = demangler.try_demangle("_ZNSt6vectorIiSaIiEEC1Ev"); + assert!(result.is_some()); + let demangled = result.unwrap(); + assert!(demangled.contains("vector")); + } + + #[test] + fn test_cpp_symbol_in_found_string() { + let demangler = SymbolDemangler::new(); + let mut found_string = create_test_string("_Z3fooi"); + found_string.tags.push(Tag::Export); + + demangler.demangle(&mut found_string); + + // Should have been demangled and preserved existing tags + assert!(found_string.original_text.is_some()); + assert!(found_string.tags.contains(&Tag::Export)); + assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + assert!(found_string.text.contains("foo")); + } } From cbe64c72c91c7651811888cc69a49c3591a60c6a Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 00:35:58 -0500 Subject: [PATCH 4/7] feat(classification): improve path detection logic and regex patterns Signed-off-by: UncleSp1d3r --- src/classification/patterns/data.rs | 11 +- src/classification/patterns/network.rs | 23 ++-- src/classification/patterns/paths.rs | 159 +++++++++++++++---------- 3 files changed, 112 insertions(+), 81 deletions(-) diff --git a/src/classification/patterns/data.rs b/src/classification/patterns/data.rs index 3f3dd57..44495e6 100644 --- a/src/classification/patterns/data.rs +++ b/src/classification/patterns/data.rs @@ -102,8 +102,9 @@ pub fn classify_base64(text: &str) -> Option { // Strip padding for length check let content_len = text.len() - padding_count; - // Valid Base64 content length should be such that total is multiple of 4 - if !(content_len + padding_count).is_multiple_of(4) { + // Valid Base64 content length should avoid mod 4 remainder of 1 + let remainder = (content_len + padding_count) % 4; + if remainder == 1 { return None; } @@ -156,11 +157,11 @@ pub fn classify_format_string(text: &str) -> Option { // Exclude strings that are just a single format specifier // (those are likely false positives) - if text.len() > 2 { - return 
Some(Tag::FormatString); + if text.len() <= 2 { + return None; + } - None + Some(Tag::FormatString) } /// Classifies a user agent string diff --git a/src/classification/patterns/network.rs b/src/classification/patterns/network.rs index d754d05..a510280 100644 --- a/src/classification/patterns/network.rs +++ b/src/classification/patterns/network.rs @@ -5,6 +5,7 @@ use crate::types::Tag; use once_cell::sync::Lazy; use regex::Regex; +use std::collections::HashSet; /// Regular expression for matching HTTP/HTTPS URLs /// @@ -23,15 +24,17 @@ pub(crate) static DOMAIN_REGEX: Lazy<Regex> = Lazy::new(|| { }); /// List of common TLDs for validation -const COMMON_TLDS: &[&str] = &[ - "com", "org", "net", "edu", "gov", "mil", "int", "io", "co", "uk", "de", "fr", "jp", "cn", - "ru", "br", "in", "au", "ca", "es", "it", "nl", "pl", "se", "ch", "at", "be", "dk", "fi", "no", - "pt", "cz", "hu", "ro", "bg", "hr", "sk", "si", "ee", "lt", "lv", "ie", "gr", "cy", "mt", "lu", - "info", "biz", "name", "pro", "aero", "coop", "museum", "travel", "jobs", "mobi", "tel", - "asia", "cat", "xxx", "app", "dev", "page", "blog", "shop", "store", "online", "site", - "website", "tech", "cloud", "ai", "ml", "tv", "me", "cc", "ws", "bz", "nu", "tk", "ga", "cf", - "gq", -]; +static COMMON_TLDS: Lazy<HashSet<&'static str>> = Lazy::new(|| { + HashSet::from([ + "com", "org", "net", "edu", "gov", "mil", "int", "io", "co", "uk", "de", "fr", "jp", "cn", + "ru", "br", "in", "au", "ca", "es", "it", "nl", "pl", "se", "ch", "at", "be", "dk", "fi", + "no", "pt", "cz", "hu", "ro", "bg", "hr", "sk", "si", "ee", "lt", "lv", "ie", "gr", "cy", + "mt", "lu", "info", "biz", "name", "pro", "aero", "coop", "museum", "travel", "jobs", + "mobi", "tel", "asia", "cat", "xxx", "app", "dev", "page", "blog", "shop", "store", + "online", "site", "website", "tech", "cloud", "ai", "ml", "tv", "me", "cc", "ws", "bz", + "nu", "tk", "ga", "cf", "gq", + ]) +}); /// Checks if the domain has a valid TLD /// @@ -44,7 +47,7 @@ 
if let Some(dot_pos) = domain.rfind('.') { let tld = &domain[dot_pos + 1..]; let tld_lower = tld.to_lowercase(); - COMMON_TLDS.contains(&tld_lower.as_str()) + COMMON_TLDS.contains(tld_lower.as_str()) } else { false } diff --git a/src/classification/patterns/paths.rs b/src/classification/patterns/paths.rs index 6f75359..88611e5 100644 --- a/src/classification/patterns/paths.rs +++ b/src/classification/patterns/paths.rs @@ -29,102 +29,124 @@ pub(crate) static REGISTRY_ABBREV_REGEX: Lazy<Regex> = /// Common suspicious POSIX path prefixes for persistence detection static SUSPICIOUS_POSIX_PATHS: Lazy<HashSet<&'static str>> = Lazy::new(|| { - let mut set = HashSet::new(); - set.insert("/etc/cron.d/"); - set.insert("/etc/init.d/"); - set.insert("/usr/local/bin/"); - set.insert("/tmp/"); - set.insert("/var/tmp/"); - set.insert("/etc/rc.d/"); - set.insert("/etc/crontab"); - set.insert("/etc/systemd/system/"); - set.insert("~/.config/autostart/"); - set.insert("/Library/LaunchDaemons/"); - set.insert("/Library/LaunchAgents/"); - set + HashSet::from([ + "/etc/cron.d/", + "/etc/init.d/", + "/usr/local/bin/", + "/tmp/", + "/var/tmp/", + "/etc/rc.d/", + "/etc/crontab", + "/etc/systemd/system/", + "/Library/LaunchDaemons/", + "/Library/LaunchAgents/", + ]) }); /// Common suspicious Windows path prefixes for persistence detection static SUSPICIOUS_WINDOWS_PATHS: Lazy<HashSet<&'static str>> = Lazy::new(|| { - let mut set = HashSet::new(); - set.insert("C:\\Windows\\System32\\"); - set.insert("C:\\Windows\\Temp\\"); - set.insert("\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); - set.insert("C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); - set.insert("C:\\Windows\\SysWOW64\\"); - set + HashSet::from([ + "C:\\Windows\\System32\\", + "C:\\Windows\\Temp\\", + "\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\", + "C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\", + "C:\\Windows\\SysWOW64\\", + ]) }); /// Known valid POSIX path prefixes static 
KNOWN_POSIX_PREFIXES: Lazy<HashSet<&'static str>> = Lazy::new(|| { - let mut set = HashSet::new(); - set.insert("/usr/"); - set.insert("/etc/"); - set.insert("/var/"); - set.insert("/home/"); - set.insert("/opt/"); - set.insert("/bin/"); - set.insert("/sbin/"); - set.insert("/lib/"); - set.insert("/dev/"); - set.insert("/proc/"); - set.insert("/sys/"); - set.insert("/tmp/"); - set + HashSet::from([ + "/usr/", "/etc/", "/var/", "/home/", "/opt/", "/bin/", "/sbin/", "/lib/", "/dev/", + "/proc/", "/sys/", "/tmp/", + ]) }); /// Known valid Windows path prefixes static KNOWN_WINDOWS_PREFIXES: Lazy<HashSet<&'static str>> = Lazy::new(|| { - let mut set = HashSet::new(); - set.insert("C:\\Windows\\"); - set.insert("C:\\Program Files\\"); - set.insert("C:\\Program Files (x86)\\"); - set.insert("C:\\Users\\"); - set.insert("C:\\ProgramData\\"); - set + HashSet::from([ + "C:\\Windows\\", + "C:\\Program Files\\", + "C:\\Program Files (x86)\\", + "C:\\Users\\", + "C:\\ProgramData\\", + ]) }); /// Valid Windows registry root keys static VALID_REGISTRY_ROOTS: Lazy<HashSet<&'static str>> = Lazy::new(|| { - let mut set = HashSet::new(); - set.insert("HKEY_LOCAL_MACHINE"); - set.insert("HKEY_CURRENT_USER"); - set.insert("HKEY_CLASSES_ROOT"); - set.insert("HKEY_USERS"); - set.insert("HKEY_CURRENT_CONFIG"); - set + HashSet::from([ + "HKEY_LOCAL_MACHINE", + "HKEY_CURRENT_USER", + "HKEY_CLASSES_ROOT", + "HKEY_USERS", + "HKEY_CURRENT_CONFIG", + ]) }); /// Suspicious Windows registry paths for persistence detection static SUSPICIOUS_REGISTRY_PATHS: Lazy<HashSet<&'static str>> = Lazy::new(|| { - let mut set = HashSet::new(); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run"); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce"); - set.insert("\\System\\CurrentControlSet\\Services"); - set.insert("\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon"); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders"); - set + HashSet::from([ + "\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run", + 
"\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce", + "\\System\\CurrentControlSet\\Services", + "\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon", + "\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders", + ]) }); /// Checks if a path contains ASCII case-insensitive substring fn contains_ascii_case_insensitive(haystack: &str, needle: &str) -> bool { - let haystack_lower = haystack.to_ascii_lowercase(); - let needle_lower = needle.to_ascii_lowercase(); - haystack_lower.contains(&needle_lower) + if needle.is_empty() { + return true; + } + + let haystack_bytes = haystack.as_bytes(); + let needle_bytes = needle.as_bytes(); + + if needle_bytes.len() > haystack_bytes.len() { + return false; + } + + for start in 0..=haystack_bytes.len() - needle_bytes.len() { + let mut matched = true; + for i in 0..needle_bytes.len() { + let hay = haystack_bytes[start + i].to_ascii_lowercase(); + let nee = needle_bytes[i].to_ascii_lowercase(); + if hay != nee { + matched = false; + break; + } + } + if matched { + return true; + } + } + + false } +fn starts_with_ascii_case_insensitive(text: &str, prefix: &str) -> bool { + if prefix.len() > text.len() { + return false; + } + + text.as_bytes() + .iter() + .take(prefix.len()) + .zip(prefix.as_bytes()) + .all(|(left, right)| left.eq_ignore_ascii_case(right)) +} + +const AUTOSTART_POSIX_SUBPATH: &str = "/.config/autostart/"; + /// Checks if text contains printf-style format placeholders fn contains_printf_placeholder(text: &str) -> bool { // Look for common printf patterns that might appear in paths let patterns = [ "%s", "%d", "%x", "%u", "%i", "%f", "%c", "%p", "%n", "%ld", "%lu", ]; - for pattern in patterns { - if text.contains(pattern) { - return true; - } - } - false + patterns.iter().any(|pattern| text.contains(pattern)) } /// Checks if text contains control characters @@ -181,7 +203,7 @@ pub fn is_valid_windows_path(text: &str) -> bool { // Check for known prefixes to boost confidence for prefix in 
KNOWN_WINDOWS_PREFIXES.iter() { - if contains_ascii_case_insensitive(text, prefix) { + if starts_with_ascii_case_insensitive(text, prefix) { return true; } } @@ -296,6 +318,11 @@ pub fn classify_registry_path(text: &str) -> Option { /// Checks if a POSIX path is suspicious (persistence-related) pub fn is_suspicious_posix_path(text: &str) -> bool { + if (text.starts_with("/home/") || text.starts_with("/Users/")) + && text.contains(AUTOSTART_POSIX_SUBPATH) + { + return true; + } SUSPICIOUS_POSIX_PATHS.iter().any(|p| text.starts_with(p)) } @@ -303,7 +330,7 @@ pub fn is_suspicious_posix_path(text: &str) -> bool { pub fn is_suspicious_windows_path(text: &str) -> bool { SUSPICIOUS_WINDOWS_PATHS .iter() - .any(|p| contains_ascii_case_insensitive(text, p)) + .any(|p| starts_with_ascii_case_insensitive(text, p)) } /// Checks if a registry path is suspicious (persistence-related) From c6c75683e94cdde7b982fd13849e4ceae2f77ca6 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 00:49:27 -0500 Subject: [PATCH 5/7] feat(extraction): add original text field and apply demangling Signed-off-by: UncleSp1d3r --- docs/src/classification.md | 44 ++++++++++++++++++++------------------ src/extraction/dedup.rs | 6 +++++- src/extraction/mod.rs | 27 +++++++++++++++++++++++ 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/docs/src/classification.md b/docs/src/classification.md index 2a5947a..5185003 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -76,51 +76,54 @@ Raw String -> Pattern Matching -> Tag Assignment #### GUIDs/UUIDs -- **Pattern**: `\{[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}` -- **Examples**: `{12345678-1234-1234-1234-123456789abc}` -- **Validation**: Format compliance, version checking +- **Pattern**: `\{?[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}?` +- **Examples**: `{12345678-1234-1234-1234-123456789abc}`, `12345678-1234-1234-1234-123456789abc` +- 
**Validation**: Format compliance - **Security relevance**: Medium - component identification #### Email Addresses - **Pattern**: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}` - **Examples**: `admin@malware.com`, `support@legitimate.org` -- **Validation**: RFC compliance, domain validation +- **Validation**: Basic format validation - **Security relevance**: Medium - contact information ### Code Artifacts #### Format Strings -- **Pattern**: `%[sdxo]|%\d+[sdxo]|\{\d+\}` -- **Examples**: `Error: %s at line %d`, `User {0} logged in` -- **Context**: Proximity to other format strings +- **Pattern**: `%[-+0 #]*(\d+|\*)?(\.(\d+|\*))?(hh?|ll?|[Lzjt])?[diouxXeEfFgGaAcspn%]` +- **Examples**: `Error: %s at line %d`, `Name: %s, Age: %d, Score: %.2f` +- **Context**: Presence of real format specifiers (%% alone is ignored) - **Security relevance**: Low-Medium - debugging information #### Base64 Data -- **Pattern**: `[A-Za-z0-9+/]{20,}={0,2}` +- **Pattern**: Character set validation with padding rules - **Examples**: `SGVsbG8gV29ybGQ=` -- **Validation**: Length divisibility, padding correctness +- **Validation**: Length >= 16, Base64 character set, valid padding, reject length mod 4 of 1 - **Security relevance**: Variable - encoded payloads -### User Agents +#### User Agents -- **Pattern**: `Mozilla/[0-9.]+|Chrome/[0-9.]+|Safari/[0-9.]+` -- **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)` +- **Pattern**: Prefix match for common agents (Mozilla, curl, Wget, python-requests, libwww-perl, Java, Apache-HttpClient, okhttp, PostmanRuntime) +- **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)`, `curl/7.68.0` - **Security relevance**: Medium - network fingerprinting -### Pattern Matching Engine +## Tag Specificity -The semantic classifier uses cached regex patterns via `lazy_static!` and applies validation checks to reduce false positives. +Tags are treated as either specific or broad. 
Specific tags indicate high confidence matches (for example URL, domain, IP, file path, GUID, email, format string, and user agent). Base64 is a broad tag and should be treated as ambiguous due to higher false positive risk. + +## Pattern Matching Engine + +The semantic classifier uses cached regex patterns via `once_cell::sync::Lazy` and applies validation checks to reduce false positives. ```rust -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use regex::Regex; -lazy_static! { - static ref URL_REGEX: Regex = Regex::new(r#"https?://[^\s<>"{}|\\^\[\]\`]+"#).unwrap(); -} +static URL_REGEX: Lazy = + Lazy::new(|| Regex::new(r#"https?://[^\s<>"{}|\\^\[\]\`]+"#).unwrap()); impl SemanticClassifier { pub fn classify(&self, string: &FoundString) -> Vec { @@ -198,11 +201,10 @@ if tags.contains(&Tag::FilePath) { The current implementation returns tags without explicit confidence scores. Confidence is implicit in the validation and matching logic. A future update may introduce explicit confidence values per tag. 
-## Planned Enhancements (implementation pending) +## Planned Enhancements - Context-aware classification -- Symbol classification -- Additional semantic patterns (GUIDs, email addresses, base64, format strings) - documented above, implementation pending +- Language-specific refinements ### Language-Specific Patterns diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs index d7ad2b7..8adda0a 100644 --- a/src/extraction/dedup.rs +++ b/src/extraction/dedup.rs @@ -39,6 +39,9 @@ pub struct StringOccurrence { pub rva: Option, /// Section name where string was found pub section: Option, + /// Original text before demangling (if applicable) + #[serde(skip_serializing_if = "Option::is_none", default)] + pub original_text: Option, /// Extraction source type pub source: StringSource, /// Tags from this specific occurrence @@ -254,6 +257,7 @@ pub fn found_string_to_occurrence(fs: FoundString) -> StringOccurrence { offset: fs.offset, rva: fs.rva, section: fs.section, + original_text: fs.original_text, source: fs.source, original_tags: fs.tags, original_score: fs.score, @@ -282,7 +286,7 @@ impl CanonicalString { FoundString { text: self.text.clone(), - original_text: None, + original_text: first_occurrence.original_text.clone(), encoding: self.encoding, offset: first_occurrence.offset, rva: first_occurrence.rva, diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index 19b5038..fb6ce97 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -123,6 +123,7 @@ //! let load_command_strings = extract_load_command_strings(&macho_data); //! 
``` +use crate::classification::{SemanticClassifier, SymbolDemangler}; use crate::types::{ ContainerInfo, Encoding, FoundString, Result, SectionInfo, SectionType, StringSource, }; @@ -521,6 +522,19 @@ impl StringExtractor for BasicExtractor { } } + // Apply demangling and semantic classification before deduplication + let classifier = SemanticClassifier::new(); + let demangler = SymbolDemangler::new(); + for string in &mut all_strings { + demangler.demangle(string); + let tags = classifier.classify(string); + for tag in tags { + if !string.tags.contains(&tag) { + string.tags.push(tag); + } + } + } + // Apply deduplication if enabled if config.enable_deduplication { let canonical_strings = deduplicate( @@ -625,6 +639,19 @@ impl StringExtractor for BasicExtractor { } } + // Apply demangling and semantic classification before deduplication + let classifier = SemanticClassifier::new(); + let demangler = SymbolDemangler::new(); + for string in &mut all_strings { + demangler.demangle(string); + let tags = classifier.classify(string); + for tag in tags { + if !string.tags.contains(&tag) { + string.tags.push(tag); + } + } + } + // Apply deduplication if enabled, otherwise convert each string to a canonical form if config.enable_deduplication { Ok(deduplicate( From d727dba2e8428c02a5c3adeccfaea136185d9b5d Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 00:59:14 -0500 Subject: [PATCH 6/7] Refactor and consolidate classification tests and enrichment Test modules for classification patterns have been refactored to combine similar test cases, reducing redundancy and improving maintainability. Semantic enrichment logic in extraction has been moved to a dedicated helper function. Unused deduplication parameter has been removed for clarity. 
--- docs/src/classification.md | 93 ----------------- src/classification/patterns/data.rs | 93 ++--------------- src/classification/patterns/ip.rs | 90 ++-------------- src/classification/patterns/network.rs | 61 +---------- src/classification/patterns/paths.rs | 136 ++----------------------- src/extraction/dedup.rs | 22 +--- src/extraction/mod.rs | 38 +++---- 7 files changed, 49 insertions(+), 484 deletions(-) diff --git a/docs/src/classification.md b/docs/src/classification.md index 5185003..170c216 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -155,22 +155,6 @@ impl SemanticClassifier { } ``` -## Implementation Details - -The classifier relies on `lazy_static!` to compile regex patterns once and reuse them across classification calls. Helper methods validate strings before assigning tags. - -### Method Signatures - -Key method signatures: - -```text -pub fn classify(&self, string: &FoundString) -> Vec; -pub fn classify_posix_path(&self, text: &str) -> Option; -pub fn classify_windows_path(&self, text: &str) -> Option; -pub fn classify_unc_path(&self, text: &str) -> Option; -pub fn classify_registry_path(&self, text: &str) -> Option; -``` - ## Using the Classification System ```text @@ -205,80 +189,3 @@ The current implementation returns tags without explicit confidence scores. 
Conf - Context-aware classification - Language-specific refinements - -### Language-Specific Patterns - -Different programming languages have distinct string patterns: - -```rust -pub enum LanguageHint { - Rust, - Go, - DotNet, - Native, -} - -impl SemanticClassifier { - fn classify_with_language_hint(&self, text: &str, hint: LanguageHint) -> Vec { - match hint { - LanguageHint::Rust => self.classify_rust_patterns(text), - LanguageHint::Go => self.classify_go_patterns(text), - LanguageHint::DotNet => self.classify_dotnet_patterns(text), - LanguageHint::Native => self.classify_native_patterns(text), - } - } -} -``` - -### False Positive Reduction - -Several techniques reduce false positives: - -1. **Length thresholds**: Very short matches are filtered out -2. **Context validation**: Surrounding data must make sense -3. **Entropy checking**: High-entropy strings are likely binary data -4. **Whitelist/blacklist**: Known good/bad patterns - -```text -fn is_likely_false_positive(&self, text: &str, tag: &Tag) -> bool { - match tag { - Tag::Domain => { - // Too short or invalid TLD - text.len() < 4 || !self.has_valid_tld(text) - } - Tag::Base64 => { - // Too short or invalid padding - text.len() < 8 || !self.valid_base64_padding(text) - } - _ => false, - } -} -``` - -## Performance Considerations - -### Regex Compilation Caching - -```rust -lazy_static! { - static ref COMPILED_PATTERNS: SemanticClassifier = SemanticClassifier::new(); -} -``` - -### Parallel Classification - -```rust -use rayon::prelude::*; - -fn classify_batch(strings: &[RawString]) -> Vec { - strings.par_iter().map(|s| classify_single(s)).collect() -} -``` - -### Memory Efficiency - -- Reuse regex objects across classifications -- Use string interning for common patterns -- Lazy evaluation for expensive validations - -This comprehensive classification system enables Stringy to automatically identify and categorize the most relevant strings in binary files, significantly improving analysis efficiency. 
diff --git a/src/classification/patterns/data.rs b/src/classification/patterns/data.rs index 44495e6..e49020c 100644 --- a/src/classification/patterns/data.rs +++ b/src/classification/patterns/data.rs @@ -184,114 +184,37 @@ mod tests { use super::*; #[test] - fn test_guid_with_braces() { + fn test_guid_valid_and_invalid() { assert!(classify_guid("{12345678-1234-1234-1234-123456789ABC}").is_some()); - assert!(classify_guid("{AAAAAAAA-BBBB-CCCC-DDDD-EEEEEEEEEEEE}").is_some()); - } - - #[test] - fn test_guid_without_braces() { assert!(classify_guid("12345678-1234-1234-1234-123456789ABC").is_some()); - assert!(classify_guid("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee").is_some()); - } - - #[test] - fn test_guid_case_insensitive() { - // Mixed case but all valid hex digits - assert!(classify_guid("AbCdEf01-1234-5678-90aB-cDeF12345678").is_some()); - } - - #[test] - fn test_guid_invalid() { assert!(classify_guid("not-a-guid").is_none()); - assert!(classify_guid("12345678-1234-1234-1234").is_none()); // Too short - assert!(classify_guid("12345678-1234-1234-1234-123456789ABCDEF").is_none()); // Too long - assert!(classify_guid("GGGGGGGG-1234-1234-1234-123456789ABC").is_none()); // Invalid hex } #[test] - fn test_email_valid() { + fn test_email_valid_and_invalid() { assert!(classify_email("user@example.com").is_some()); - assert!(classify_email("test.user@domain.org").is_some()); - assert!(classify_email("admin+tag@company.co.uk").is_some()); - } - - #[test] - fn test_email_invalid() { assert!(classify_email("not an email").is_none()); - assert!(classify_email("@nodomain.com").is_none()); - assert!(classify_email("noat.com").is_none()); - assert!(classify_email("user@").is_none()); } #[test] - fn test_base64_valid() { - // Valid Base64 with mixed case (typical encoded data) + fn test_base64_valid_and_invalid() { assert!(classify_base64("SGVsbG8gV29ybGQh").is_some()); - assert!(classify_base64("VGhpcyBpcyBhIHRlc3Q=").is_some()); - // Longer Base64 strings - 
assert!(classify_base64("QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=").is_some()); - } - - #[test] - fn test_base64_too_short() { - assert!(classify_base64("SGVsbG8=").is_none()); // Only 8 chars - assert!(classify_base64("YWJj").is_none()); // Only 4 chars - } - - #[test] - fn test_base64_invalid_chars() { - assert!(classify_base64("SGVsbG8gV29ybGQh!@#$").is_none()); assert!(classify_base64("This is not base64!!").is_none()); + assert!(classify_base64("YWJj").is_none()); } #[test] - fn test_format_string_basic() { - assert!(classify_format_string("Hello %s!").is_some()); - assert!(classify_format_string("Value: %d").is_some()); - assert!(classify_format_string("Hex: %x").is_some()); + fn test_format_string_valid_and_invalid() { + assert!(classify_format_string("Error: %s at line %d").is_some()); + assert!(classify_format_string("100%% done").is_none()); } #[test] - fn test_format_string_complex() { - assert!(classify_format_string("Name: %s, Age: %d, Score: %.2f").is_some()); - assert!(classify_format_string("%08x %08x %08x").is_some()); - assert!(classify_format_string("%-20s %10d").is_some()); - } - - #[test] - fn test_format_string_not_format() { - assert!(classify_format_string("No format here").is_none()); - assert!(classify_format_string("100%").is_none()); // Bare percent, no specifier - assert!(classify_format_string("100%% done").is_none()); // Escaped percent only - } - - #[test] - fn test_user_agent_mozilla() { + fn test_user_agent_valid_and_invalid() { assert!( classify_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") .is_some() ); - } - - #[test] - fn test_user_agent_curl() { - assert!(classify_user_agent("curl/7.68.0").is_some()); - } - - #[test] - fn test_user_agent_wget() { - assert!(classify_user_agent("Wget/1.20.3 (linux-gnu)").is_some()); - } - - #[test] - fn test_user_agent_python() { - assert!(classify_user_agent("python-requests/2.25.1").is_some()); - } - - #[test] - fn test_user_agent_not_user_agent() { 
assert!(classify_user_agent("Not a user agent").is_none()); - assert!(classify_user_agent("Chrome").is_none()); // Too generic } } diff --git a/src/classification/patterns/ip.rs b/src/classification/patterns/ip.rs index 9da0de3..0b275b4 100644 --- a/src/classification/patterns/ip.rs +++ b/src/classification/patterns/ip.rs @@ -170,91 +170,27 @@ mod tests { use super::*; #[test] - fn test_ipv4_valid_addresses() { + fn test_ipv4_valid_and_invalid() { assert!(is_ipv4_address("192.168.1.1")); - assert!(is_ipv4_address("10.0.0.1")); - assert!(is_ipv4_address("172.16.0.1")); - assert!(is_ipv4_address("255.255.255.255")); - assert!(is_ipv4_address("8.8.8.8")); - assert!(is_ipv4_address("1.1.1.1")); - assert!(is_ipv4_address("0.0.0.0")); - } - - #[test] - fn test_ipv4_invalid_addresses() { - assert!(!is_ipv4_address("256.1.1.1")); - assert!(!is_ipv4_address("1.2.3.4.5")); - assert!(!is_ipv4_address("1.2.3")); - assert!(!is_ipv4_address("not an ip")); - assert!(!is_ipv4_address("")); - } - - #[test] - fn test_ipv4_with_port() { assert!(is_ipv4_address("192.168.1.1:8080")); - assert!(is_ipv4_address("10.0.0.1:443")); - assert!(is_ipv4_address("172.16.0.1:65535")); - } - - #[test] - fn test_ipv4_leading_zeros() { - // Leading zeros should be rejected + assert!(!is_ipv4_address("256.1.1.1")); assert!(!is_ipv4_address("01.02.03.04")); - assert!(!is_ipv4_address("192.168.01.1")); - } - - #[test] - fn test_ipv6_full_notation() { - assert!(is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334")); - assert!(is_ipv6_address("fe80:0000:0000:0000:0000:0000:0000:0001")); } #[test] - fn test_ipv6_compressed() { + fn test_ipv6_valid_and_invalid() { assert!(is_ipv6_address("2001:db8::1")); - assert!(is_ipv6_address("::1")); - assert!(is_ipv6_address("fe80::1")); - assert!(is_ipv6_address("::")); - } - - #[test] - fn test_ipv6_mixed_notation() { - assert!(is_ipv6_address("::ffff:192.0.2.1")); - assert!(is_ipv6_address("::ffff:127.0.0.1")); - } - - #[test] - fn test_ipv6_invalid() { - 
assert!(!is_ipv6_address("not an ipv6")); - assert!(!is_ipv6_address("")); - assert!(!is_ipv6_address("gggg::1")); // Invalid hex - } - - #[test] - fn test_ipv6_with_brackets() { - assert!(is_ipv6_address("[::1]")); - assert!(is_ipv6_address("[2001:db8::1]")); - assert!(is_ipv6_address("[fe80::1]")); - } - - #[test] - fn test_ipv6_with_port() { assert!(is_ipv6_address("[::1]:8080")); - assert!(is_ipv6_address("[2001:db8::1]:443")); + assert!(!is_ipv6_address("not an ipv6")); } #[test] - fn test_classify_ipv4() { + fn test_classify_ipv4_and_ipv6() { let tags = classify_ip_addresses("192.168.1.1"); - assert!(tags.contains(&Tag::IPv4)); - assert!(!tags.contains(&Tag::IPv6)); - } + assert_eq!(tags, vec![Tag::IPv4]); - #[test] - fn test_classify_ipv6() { let tags = classify_ip_addresses("2001:db8::1"); - assert!(!tags.contains(&Tag::IPv4)); - assert!(tags.contains(&Tag::IPv6)); + assert_eq!(tags, vec![Tag::IPv6]); } #[test] @@ -262,16 +198,4 @@ mod tests { let tags = classify_ip_addresses("not an ip address"); assert!(tags.is_empty()); } - - #[test] - fn test_classify_ipv4_with_port() { - let tags = classify_ip_addresses("192.168.1.1:8080"); - assert!(tags.contains(&Tag::IPv4)); - } - - #[test] - fn test_classify_ipv6_with_brackets_and_port() { - let tags = classify_ip_addresses("[::1]:8080"); - assert!(tags.contains(&Tag::IPv6)); - } } diff --git a/src/classification/patterns/network.rs b/src/classification/patterns/network.rs index a510280..fa72234 100644 --- a/src/classification/patterns/network.rs +++ b/src/classification/patterns/network.rs @@ -106,39 +106,20 @@ mod tests { use super::*; #[test] - fn test_url_detection() { + fn test_url_valid_and_invalid() { assert!(classify_url("https://example.com").is_some()); - assert!(classify_url("http://test.org/path").is_some()); - assert!(classify_url("https://sub.domain.com:8080/api").is_some()); assert!(classify_url("not a url").is_none()); - assert!(classify_url("ftp://example.com").is_none()); } #[test] - fn 
test_domain_detection() { + fn test_domain_valid_and_invalid() { assert!(classify_domain("example.com").is_some()); - assert!(classify_domain("sub.example.org").is_some()); - assert!(classify_domain("test.co.uk").is_some()); - assert!(classify_domain("https://example.com").is_none()); // URLs excluded - assert!(classify_domain("notadomain").is_none()); - assert!(classify_domain("invalid.xyz123").is_none()); // Invalid TLD - } - - #[test] - fn test_url_classification() { - assert_eq!(classify_url("https://example.com"), Some(Tag::Url)); - assert_eq!(classify_url("http://test.org"), Some(Tag::Url)); - } - - #[test] - fn test_domain_classification() { - assert_eq!(classify_domain("example.com"), Some(Tag::Domain)); - assert_eq!(classify_domain("test.org"), Some(Tag::Domain)); + assert!(classify_domain("https://example.com").is_none()); + assert!(classify_domain("invalid.xyz123").is_none()); } #[test] fn test_url_not_double_tagged() { - // URLs should not be tagged as domains assert!(classify_url("https://example.com").is_some()); assert!(classify_domain("https://example.com").is_none()); } @@ -146,40 +127,6 @@ mod tests { #[test] fn test_tld_validation() { assert!(has_valid_tld("example.com")); - assert!(has_valid_tld("test.org")); - assert!(has_valid_tld("website.io")); - assert!(has_valid_tld("app.dev")); - assert!(!has_valid_tld("example.invalidtld")); assert!(!has_valid_tld("nodot")); } - - #[test] - fn test_edge_cases() { - // Empty strings - assert!(classify_url("").is_none()); - assert!(classify_domain("").is_none()); - - // Single characters - assert!(classify_url("a").is_none()); - assert!(classify_domain("a").is_none()); - - // Just TLD - assert!(classify_domain(".com").is_none()); - - // IP-like domains (should be handled by IP classifier) - assert!(classify_domain("192.168.1.1").is_none()); - } - - #[test] - fn test_file_extensions_not_domains() { - // File extensions should NOT be treated as valid TLDs - assert!(classify_domain("cmd.exe").is_none()); - 
assert!(classify_domain("kernel32.dll").is_none()); - assert!(classify_domain("ntoskrnl.sys").is_none()); - assert!(classify_domain("program.bin").is_none()); - assert!(classify_domain("data.dat").is_none()); - assert!(classify_domain("debug.log").is_none()); - assert!(classify_domain("temp.tmp").is_none()); - assert!(classify_domain("backup.bak").is_none()); - } } diff --git a/src/classification/patterns/paths.rs b/src/classification/patterns/paths.rs index 88611e5..de37e27 100644 --- a/src/classification/patterns/paths.rs +++ b/src/classification/patterns/paths.rs @@ -345,160 +345,42 @@ mod tests { use super::*; #[test] - fn test_posix_absolute_path() { + fn test_posix_path_valid_and_invalid() { assert!(classify_posix_path("/usr/bin/bash").is_some()); - assert!(classify_posix_path("/etc/passwd").is_some()); - assert!(classify_posix_path("/home/user/.bashrc").is_some()); - } - - #[test] - fn test_posix_home_directory() { - assert!(classify_posix_path("/home/user/documents/file.txt").is_some()); - assert!(classify_posix_path("/Users/admin/Desktop").is_some()); - } - - #[test] - fn test_posix_with_spaces() { - assert!(classify_posix_path("/home/user/My Documents/file.txt").is_some()); - } - - #[test] - fn test_posix_system_directories() { - assert!(classify_posix_path("/var/log/syslog").is_some()); - assert!(classify_posix_path("/opt/application/bin").is_some()); - } - - #[test] - fn test_posix_suspicious_paths() { - assert!(is_suspicious_posix_path("/etc/cron.d/malicious")); - assert!(is_suspicious_posix_path("/tmp/evil.sh")); - assert!(!is_suspicious_posix_path("/home/user/normal.txt")); - } - - #[test] - fn test_posix_too_short() { assert!(classify_posix_path("/").is_none()); - assert!(classify_posix_path("/a").is_none()); - } - - #[test] - fn test_posix_invalid() { assert!(classify_posix_path("not/a/path").is_none()); - assert!(classify_posix_path("C:\\Windows").is_none()); } #[test] - fn test_posix_with_null_bytes() { - 
assert!(classify_posix_path("/path/with\x00null").is_none()); - } - - #[test] - fn test_windows_absolute_path() { + fn test_windows_path_valid_and_invalid() { assert!(classify_windows_path("C:\\Windows\\System32").is_some()); - assert!(classify_windows_path("D:\\Projects\\code").is_some()); - } - - #[test] - fn test_windows_program_files() { - assert!(classify_windows_path("C:\\Program Files\\App\\app.exe").is_some()); - assert!(classify_windows_path("C:\\Program Files (x86)\\App").is_some()); - } - - #[test] - fn test_windows_with_spaces() { - assert!(classify_windows_path("C:\\Users\\John Doe\\Documents").is_some()); - } - - #[test] - fn test_windows_different_drives() { - assert!(classify_windows_path("D:\\Data\\file.txt").is_some()); - assert!(classify_windows_path("E:\\Backup\\archive.zip").is_some()); - } - - #[test] - fn test_windows_suspicious_paths() { - assert!(is_suspicious_windows_path("C:\\Windows\\System32\\cmd.exe")); - assert!(is_suspicious_windows_path("C:\\Windows\\Temp\\malware.exe")); - assert!(!is_suspicious_windows_path("D:\\Projects\\code.rs")); - } - - #[test] - fn test_windows_case_insensitive() { - assert!(classify_windows_path("c:\\windows\\system32").is_some()); - assert!(classify_windows_path("C:\\WINDOWS\\SYSTEM32").is_some()); - } - - #[test] - fn test_windows_invalid() { assert!(classify_windows_path("/unix/path").is_none()); - assert!(classify_windows_path("not a path").is_none()); - } - - #[test] - fn test_windows_invalid_drive() { assert!(classify_windows_path("1:\\Invalid\\Path").is_none()); } #[test] - fn test_unc_path() { + fn test_unc_path_valid_and_invalid() { assert!(classify_unc_path("\\\\server\\share\\file.txt").is_some()); - assert!(classify_unc_path("\\\\192.168.1.1\\c$\\Windows").is_some()); - } - - #[test] - fn test_unc_with_domain() { - assert!(classify_unc_path("\\\\domain.local\\share\\path").is_some()); + assert!(classify_unc_path("\\\\server").is_none()); } #[test] - fn test_unc_invalid() { - 
assert!(classify_unc_path("\\\\server").is_none()); // No share - assert!(classify_unc_path("\\server\\share").is_none()); // Single backslash - } - - #[test] - fn test_registry_run_key() { + fn test_registry_path_valid_and_invalid() { assert!( classify_registry_path( "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" ) .is_some() ); + assert!(classify_registry_path("HKEY_INVALID\\Path").is_none()); } #[test] - fn test_registry_current_user() { - assert!( - classify_registry_path("HKEY_CURRENT_USER\\Software\\Microsoft\\Windows").is_some() - ); - } - - #[test] - fn test_registry_abbreviated_hklm() { - assert!( - classify_registry_path("HKLM\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion").is_some() - ); - } - - #[test] - fn test_registry_abbreviated_hkcu() { - assert!(classify_registry_path("HKCU\\Software\\Classes").is_some()); - } - - #[test] - fn test_registry_persistence_run() { + fn test_suspicious_paths() { + assert!(is_suspicious_posix_path("/etc/cron.d/malicious")); + assert!(is_suspicious_windows_path("C:\\Windows\\System32\\cmd.exe")); assert!(is_suspicious_registry_path( "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" )); } - - #[test] - fn test_registry_invalid_root() { - assert!(classify_registry_path("HKEY_INVALID\\Path").is_none()); - } - - #[test] - fn test_registry_forward_slash() { - assert!(classify_registry_path("HKEY_LOCAL_MACHINE/SOFTWARE/Microsoft/Windows").is_some()); - } } diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs index 8adda0a..b25bae0 100644 --- a/src/extraction/dedup.rs +++ b/src/extraction/dedup.rs @@ -83,7 +83,7 @@ pub struct StringOccurrence { pub fn deduplicate( strings: Vec, dedup_threshold: Option, - preserve_all_occurrences: bool, + _preserve_all_occurrences: bool, ) -> Vec { if strings.is_empty() { return Vec::new(); @@ -112,22 +112,10 @@ pub fn deduplicate( // All strings in group have same encoding, use first one let encoding = found_strings[0].encoding; - let 
occurrences: Vec = if preserve_all_occurrences { - // Store full occurrence metadata - found_strings - .into_iter() - .map(found_string_to_occurrence) - .collect() - } else { - // Store only the first occurrence as representative, but we still need - // the count for scoring, so we'll keep all but mark them as "count only" - // For now, we'll still store all occurrences but this could be optimized - // to store just a count field in the future - found_strings - .into_iter() - .map(found_string_to_occurrence) - .collect() - }; + let occurrences: Vec = found_strings + .into_iter() + .map(found_string_to_occurrence) + .collect(); let merged_tags = merge_tags(&occurrences); diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index fb6ce97..6b3a85f 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -148,6 +148,20 @@ pub use utf16::{ extract_utf16_strings, }; +fn apply_semantic_enrichment(strings: &mut [FoundString]) { + let classifier = SemanticClassifier::new(); + let demangler = SymbolDemangler::new(); + for string in strings { + demangler.demangle(string); + let tags = classifier.classify(string); + for tag in tags { + if !string.tags.contains(&tag) { + string.tags.push(tag); + } + } + } +} + /// Configuration for string extraction /// /// Controls various aspects of the extraction process including minimum/maximum @@ -523,17 +537,7 @@ impl StringExtractor for BasicExtractor { } // Apply demangling and semantic classification before deduplication - let classifier = SemanticClassifier::new(); - let demangler = SymbolDemangler::new(); - for string in &mut all_strings { - demangler.demangle(string); - let tags = classifier.classify(string); - for tag in tags { - if !string.tags.contains(&tag) { - string.tags.push(tag); - } - } - } + apply_semantic_enrichment(&mut all_strings); // Apply deduplication if enabled if config.enable_deduplication { @@ -640,17 +644,7 @@ impl StringExtractor for BasicExtractor { } // Apply demangling and semantic 
classification before deduplication - let classifier = SemanticClassifier::new(); - let demangler = SymbolDemangler::new(); - for string in &mut all_strings { - demangler.demangle(string); - let tags = classifier.classify(string); - for tag in tags { - if !string.tags.contains(&tag) { - string.tags.push(tag); - } - } - } + apply_semantic_enrichment(&mut all_strings); // Apply deduplication if enabled, otherwise convert each string to a canonical form if config.enable_deduplication { From 176c536ef0a1a25072dab61e4222ce2f00c7a6d5 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 18 Jan 2026 01:09:58 -0500 Subject: [PATCH 7/7] fix(patterns): enhance regex patterns and validation logic - Updated email regex to simplify matching for binary and string extraction. - Refined IPv6 regex to allow only valid characters and improved validation checks. - Added validation for trailing dots in TLD checks. - Enhanced suspicious path checks with clearer matching strategies for Windows and registry paths. Signed-off-by: UncleSp1d3r --- src/classification/patterns/data.rs | 7 +++++++ src/classification/patterns/ip.rs | 24 +++++++++--------------- src/classification/patterns/network.rs | 5 +++++ src/classification/patterns/paths.rs | 11 +++++++++++ 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/classification/patterns/data.rs b/src/classification/patterns/data.rs index e49020c..f650d39 100644 --- a/src/classification/patterns/data.rs +++ b/src/classification/patterns/data.rs @@ -17,6 +17,13 @@ pub(crate) static GUID_REGEX: Lazy = Lazy::new(|| { /// Regular expression for matching email addresses /// /// Pattern matches basic email format: user@domain.tld +/// +/// This intentionally simplified pattern is tuned for binary and string +/// extraction. 
It will match short forms like "a@b.cc" and common unquoted +/// local-parts, but it does not support quoted local-parts, some valid edge +/// cases (for example, certain plus or escape forms and full RFC 5322 +/// syntax), or internationalized domain names. The tradeoff is fewer false +/// positives at the cost of not being fully RFC-compliant. pub(crate) static EMAIL_REGEX: Lazy = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap()); diff --git a/src/classification/patterns/ip.rs b/src/classification/patterns/ip.rs index 0b275b4..bb64164 100644 --- a/src/classification/patterns/ip.rs +++ b/src/classification/patterns/ip.rs @@ -18,12 +18,10 @@ pub(crate) static IPV4_REGEX: Lazy = Lazy::new(|| { /// Regular expression for matching IPv6 addresses /// -/// Pattern matches IPv6 addresses including full, compressed, and mixed notation. -/// This is a permissive pattern that checks for basic IPv6 structure. -/// Actual validation is performed by std::net::Ipv6Addr::from_str. -pub(crate) static IPV6_REGEX: Lazy = Lazy::new(|| { - Regex::new(r"(?i)^(?:[0-9a-f]{1,4}:){1,7}[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,7}:$|^(?:[0-9a-f]{1,4}:){1,6}:[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,5}(?::[0-9a-f]{1,4}){1,2}$|^(?:[0-9a-f]{1,4}:){1,4}(?::[0-9a-f]{1,4}){1,3}$|^(?:[0-9a-f]{1,4}:){1,3}(?::[0-9a-f]{1,4}){1,4}$|^(?:[0-9a-f]{1,4}:){1,2}(?::[0-9a-f]{1,4}){1,5}$|^[0-9a-f]{1,4}:(?::[0-9a-f]{1,4}){1,6}$|^:(?::[0-9a-f]{1,4}){1,7}$|^::$|^::ffff:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap() -}); +/// This is a permissive pre-filter that only allows hex digits, colons, +/// and dots (for IPv4-mapped suffixes). Canonical validation is still +/// performed by std::net::Ipv6Addr::from_str. 
+pub(crate) static IPV6_REGEX: Lazy = Lazy::new(|| Regex::new(r"(?i)^[0-9a-f:.]+$").unwrap()); /// Regular expression for detecting and stripping port suffixes /// @@ -39,7 +37,7 @@ pub(crate) static PORT_SUFFIX_REGEX: Lazy = Lazy::new(|| { /// /// Matches [IPv6] format used in URLs like [::1]:8080. pub(crate) static IPV6_BRACKETS_REGEX: Lazy = - Lazy::new(|| Regex::new(r"^\[(.+)\]").unwrap()); + Lazy::new(|| Regex::new(r"^\[([^\]]+)\]$").unwrap()); /// Strips the port suffix from an IP address string if present /// @@ -126,17 +124,13 @@ pub fn is_ipv6_address(text: &str) -> bool { ip_text = strip_ipv6_brackets(without_port); } - // Basic structure check - must contain colon and only valid hex/colon characters - if !ip_text.contains(':') { + // Permissive pre-filter to reject obvious non-IPv6 strings early + if !IPV6_REGEX.is_match(ip_text) { return false; } - // Allow only valid IPv6 characters - let valid_chars = ip_text - .chars() - .all(|c| c.is_ascii_hexdigit() || c == ':' || c == '.'); - - if !valid_chars { + // Basic structure check - must contain colon and only valid hex/colon characters + if !ip_text.contains(':') { return false; } diff --git a/src/classification/patterns/network.rs b/src/classification/patterns/network.rs index fa72234..1ae6cb5 100644 --- a/src/classification/patterns/network.rs +++ b/src/classification/patterns/network.rs @@ -38,12 +38,17 @@ static COMMON_TLDS: Lazy> = Lazy::new(|| { /// Checks if the domain has a valid TLD /// +/// Trailing dots are treated as invalid to avoid accepting empty TLDs. +/// /// # Arguments /// * `domain` - The domain name to validate /// /// # Returns /// Returns `true` if the domain has a known TLD. 
pub fn has_valid_tld(domain: &str) -> bool { + if domain.ends_with('.') { + return false; + } if let Some(dot_pos) = domain.rfind('.') { let tld = &domain[dot_pos + 1..]; let tld_lower = tld.to_lowercase(); diff --git a/src/classification/patterns/paths.rs b/src/classification/patterns/paths.rs index de37e27..ca9cd4b 100644 --- a/src/classification/patterns/paths.rs +++ b/src/classification/patterns/paths.rs @@ -327,6 +327,9 @@ pub fn is_suspicious_posix_path(text: &str) -> bool { } /// Checks if a Windows path is suspicious (persistence-related) +/// +/// Uses prefix matching since suspicious Windows paths are anchored to +/// well-known base directories like C:\Windows\. pub fn is_suspicious_windows_path(text: &str) -> bool { SUSPICIOUS_WINDOWS_PATHS .iter() @@ -334,6 +337,9 @@ pub fn is_suspicious_windows_path(text: &str) -> bool { } /// Checks if a registry path is suspicious (persistence-related) +/// +/// Uses substring matching since relevant registry keys can appear anywhere +/// within a longer path string. pub fn is_suspicious_registry_path(text: &str) -> bool { SUSPICIOUS_REGISTRY_PATHS .iter() @@ -364,6 +370,11 @@ mod tests { assert!(classify_unc_path("\\\\server").is_none()); } + #[test] + fn test_classify_unc_path_missing_share() { + assert!(classify_unc_path("\\\\server\\").is_none()); + } + #[test] fn test_registry_path_valid_and_invalid() { assert!(