diff --git a/Cargo.toml b/Cargo.toml index b3d0aa7..ecb5d42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,19 +20,21 @@ name = "stringy" path = "src/main.rs" [dependencies] -clap = { version = "4.5.54", features = [ "derive" ] } -entropy = "0.4.2" -goblin = "0.10.4" -lazy_static = "1.5" -pelite = "0.10.0" -regex = "1.12.2" -serde = { version = "1.0.228", features = [ "derive" ] } -serde_json = "1.0.148" -thiserror = "2.0.17" +clap = { version = "4.5.54", features = [ "derive" ] } +cpp_demangle = "0.5.1" +entropy = "0.4.2" +goblin = "0.10.4" +once_cell = "1.21.3" +pelite = "0.10.0" +regex = "1.12.2" +rustc-demangle = "0.1.27" +serde = { version = "1.0.228", features = [ "derive" ] } +serde_json = "1.0.149" +thiserror = "2.0.17" [dev-dependencies] criterion = "0.8.1" -insta = "1.46.0" +insta = "1.46.1" tempfile = "3.24.0" # The profile that 'dist' will build with diff --git a/docs/src/classification.md b/docs/src/classification.md index 2a5947a..170c216 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -76,51 +76,54 @@ Raw String -> Pattern Matching -> Tag Assignment #### GUIDs/UUIDs -- **Pattern**: `\{[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}` -- **Examples**: `{12345678-1234-1234-1234-123456789abc}` -- **Validation**: Format compliance, version checking +- **Pattern**: `\{?[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}?` +- **Examples**: `{12345678-1234-1234-1234-123456789abc}`, `12345678-1234-1234-1234-123456789abc` +- **Validation**: Format compliance - **Security relevance**: Medium - component identification #### Email Addresses - **Pattern**: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}` - **Examples**: `admin@malware.com`, `support@legitimate.org` -- **Validation**: RFC compliance, domain validation +- **Validation**: Basic format validation - **Security relevance**: Medium - contact information ### Code Artifacts #### Format Strings -- **Pattern**: 
`%[sdxo]|%\d+[sdxo]|\{\d+\}` -- **Examples**: `Error: %s at line %d`, `User {0} logged in` -- **Context**: Proximity to other format strings +- **Pattern**: `%[-+0 #]*(\d+|\*)?(\.(\d+|\*))?(hh?|ll?|[Lzjt])?[diouxXeEfFgGaAcspn%]` +- **Examples**: `Error: %s at line %d`, `Name: %s, Age: %d, Score: %.2f` +- **Context**: Presence of real format specifiers (%% alone is ignored) - **Security relevance**: Low-Medium - debugging information #### Base64 Data -- **Pattern**: `[A-Za-z0-9+/]{20,}={0,2}` +- **Pattern**: Character set validation with padding rules - **Examples**: `SGVsbG8gV29ybGQ=` -- **Validation**: Length divisibility, padding correctness +- **Validation**: Length >= 16, Base64 character set, valid padding, reject length mod 4 of 1 - **Security relevance**: Variable - encoded payloads -### User Agents +#### User Agents -- **Pattern**: `Mozilla/[0-9.]+|Chrome/[0-9.]+|Safari/[0-9.]+` -- **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)` +- **Pattern**: Prefix match for common agents (Mozilla, curl, Wget, python-requests, libwww-perl, Java, Apache-HttpClient, okhttp, PostmanRuntime) +- **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)`, `curl/7.68.0` - **Security relevance**: Medium - network fingerprinting -### Pattern Matching Engine +## Tag Specificity -The semantic classifier uses cached regex patterns via `lazy_static!` and applies validation checks to reduce false positives. +Tags are treated as either specific or broad. Specific tags indicate high confidence matches (for example URL, domain, IP, file path, GUID, email, format string, and user agent). Base64 is a broad tag and should be treated as ambiguous due to higher false positive risk. + +## Pattern Matching Engine + +The semantic classifier uses cached regex patterns via `once_cell::sync::Lazy` and applies validation checks to reduce false positives. ```rust -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use regex::Regex; -lazy_static! 
{ - static ref URL_REGEX: Regex = Regex::new(r#"https?://[^\s<>"{}|\\^\[\]\`]+"#).unwrap(); -} +static URL_REGEX: Lazy = + Lazy::new(|| Regex::new(r#"https?://[^\s<>"{}|\\^\[\]\`]+"#).unwrap()); impl SemanticClassifier { pub fn classify(&self, string: &FoundString) -> Vec { @@ -152,22 +155,6 @@ impl SemanticClassifier { } ``` -## Implementation Details - -The classifier relies on `lazy_static!` to compile regex patterns once and reuse them across classification calls. Helper methods validate strings before assigning tags. - -### Method Signatures - -Key method signatures: - -```text -pub fn classify(&self, string: &FoundString) -> Vec; -pub fn classify_posix_path(&self, text: &str) -> Option; -pub fn classify_windows_path(&self, text: &str) -> Option; -pub fn classify_unc_path(&self, text: &str) -> Option; -pub fn classify_registry_path(&self, text: &str) -> Option; -``` - ## Using the Classification System ```text @@ -198,85 +185,7 @@ if tags.contains(&Tag::FilePath) { The current implementation returns tags without explicit confidence scores. Confidence is implicit in the validation and matching logic. A future update may introduce explicit confidence values per tag. 
-## Planned Enhancements (implementation pending) +## Planned Enhancements - Context-aware classification -- Symbol classification -- Additional semantic patterns (GUIDs, email addresses, base64, format strings) - documented above, implementation pending - -### Language-Specific Patterns - -Different programming languages have distinct string patterns: - -```rust -pub enum LanguageHint { - Rust, - Go, - DotNet, - Native, -} - -impl SemanticClassifier { - fn classify_with_language_hint(&self, text: &str, hint: LanguageHint) -> Vec { - match hint { - LanguageHint::Rust => self.classify_rust_patterns(text), - LanguageHint::Go => self.classify_go_patterns(text), - LanguageHint::DotNet => self.classify_dotnet_patterns(text), - LanguageHint::Native => self.classify_native_patterns(text), - } - } -} -``` - -### False Positive Reduction - -Several techniques reduce false positives: - -1. **Length thresholds**: Very short matches are filtered out -2. **Context validation**: Surrounding data must make sense -3. **Entropy checking**: High-entropy strings are likely binary data -4. **Whitelist/blacklist**: Known good/bad patterns - -```text -fn is_likely_false_positive(&self, text: &str, tag: &Tag) -> bool { - match tag { - Tag::Domain => { - // Too short or invalid TLD - text.len() < 4 || !self.has_valid_tld(text) - } - Tag::Base64 => { - // Too short or invalid padding - text.len() < 8 || !self.valid_base64_padding(text) - } - _ => false, - } -} -``` - -## Performance Considerations - -### Regex Compilation Caching - -```rust -lazy_static! 
{ - static ref COMPILED_PATTERNS: SemanticClassifier = SemanticClassifier::new(); -} -``` - -### Parallel Classification - -```rust -use rayon::prelude::*; - -fn classify_batch(strings: &[RawString]) -> Vec { - strings.par_iter().map(|s| classify_single(s)).collect() -} -``` - -### Memory Efficiency - -- Reuse regex objects across classifications -- Use string interning for common patterns -- Lazy evaluation for expensive validations - -This comprehensive classification system enables Stringy to automatically identify and categorize the most relevant strings in binary files, significantly improving analysis efficiency. +- Language-specific refinements diff --git a/src/classification/mod.rs b/src/classification/mod.rs index d3dd9ad..320a3a6 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -12,14 +12,12 @@ //! - **Domain Detection**: Identifies domain names with TLD validation //! - **File Path Detection**: Identifies POSIX, Windows, and UNC paths //! - **Registry Path Detection**: Identifies Windows registry paths -//! -//! ## Future Capabilities -//! -//! - GUIDs/UUIDs -//! - Email addresses -//! - Base64 data -//! - Format strings -//! - User agents +//! - **GUID Detection**: Identifies GUIDs/UUIDs in standard format +//! - **Email Detection**: Identifies email addresses +//! - **Base64 Detection**: Identifies Base64-encoded data (broad tag) +//! - **Format String Detection**: Identifies printf-style format strings +//! - **User Agent Detection**: Identifies HTTP user agent strings +//! - **Symbol Demangling**: Demangles Rust symbols to human-readable form //! //! ## Usage //! @@ -49,5 +47,9 @@ //! assert!(tags.contains(&Tag::FilePath)); //! 
```
+mod patterns;
 pub mod semantic;
+pub mod symbols;
+
 pub use semantic::SemanticClassifier;
+pub use symbols::SymbolDemangler;
diff --git a/src/classification/patterns/data.rs b/src/classification/patterns/data.rs
new file mode 100644
index 0000000..f650d39
--- /dev/null
+++ b/src/classification/patterns/data.rs
@@ -0,0 +1,227 @@
+//! Data format classification patterns
+//!
+//! This module provides GUID, email, Base64, format string, and user agent detection.
+
+use crate::types::Tag;
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+/// Regular expression for matching GUIDs/UUIDs
+///
+/// Pattern matches standard GUID format: {XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX}
+/// Also matches without braces and in lowercase.
+pub(crate) static GUID_REGEX: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^\{?[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\}?$").unwrap()
+});
+
+/// Regular expression for matching email addresses
+///
+/// Pattern matches basic email format: user@domain.tld
+///
+/// This intentionally simplified pattern is tuned for binary and string
+/// extraction. It will match short forms like "a@b.cc" and common unquoted
+/// local-parts, but it does not support quoted local-parts, some valid edge
+/// cases (for example, certain plus or escape forms and full RFC 5322
+/// syntax), or internationalized domain names. The tradeoff is fewer false
+/// positives at the cost of not being fully RFC-compliant.
+pub(crate) static EMAIL_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap());
+
+/// Regular expression for matching printf-style format strings
+///
+/// Pattern detects format specifiers like %s, %d, %x, %f, etc.
+pub(crate) static FORMAT_STRING_REGEX: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"%[-+0 #]*(\d+|\*)?(\.(\d+|\*))?(hh?|ll?|[Lzjt])?[diouxXeEfFgGaAcspn%]").unwrap()
+});
+
+/// Regular expression for matching common user agent patterns
+///
+/// Pattern matches common browser/bot user agent strings.
+pub(crate) static USER_AGENT_REGEX: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^Mozilla/\d|^curl/|^Wget/|^python-requests|^libwww-perl|^Java/|^Apache-HttpClient|^okhttp/|^PostmanRuntime/")
+        .unwrap()
+});
+
+/// Classifies a GUID/UUID
+///
+/// # Arguments
+/// * `text` - The text to check for GUID format
+///
+/// # Returns
+/// Returns `Some(Tag::Guid)` if valid, `None` otherwise.
+pub fn classify_guid(text: &str) -> Option<Tag> {
+    if GUID_REGEX.is_match(text) {
+        Some(Tag::Guid)
+    } else {
+        None
+    }
+}
+
+/// Classifies an email address
+///
+/// # Arguments
+/// * `text` - The text to check for email format
+///
+/// # Returns
+/// Returns `Some(Tag::Email)` if valid, `None` otherwise.
+pub fn classify_email(text: &str) -> Option<Tag> {
+    if EMAIL_REGEX.is_match(text) {
+        Some(Tag::Email)
+    } else {
+        None
+    }
+}
+
+/// Classifies Base64-encoded data
+///
+/// This is a broad/ambiguous tag with potential false positives.
+/// Returns `Some(Tag::Base64)` if the text appears to be Base64 encoded.
+///
+/// Detection criteria:
+/// - Minimum length of 16 characters
+/// - Only valid Base64 characters (A-Z, a-z, 0-9, +, /, =)
+/// - Proper padding (if present)
+/// - Length is a multiple of 4 or has valid padding
+/// - For unpadded strings: must have both uppercase and lowercase letters
+pub fn classify_base64(text: &str) -> Option<Tag> {
+    // Minimum length to reduce false positives
+    if text.len() < 16 {
+        return None;
+    }
+
+    // Check if it's valid Base64 characters only
+    let is_base64_chars = text
+        .chars()
+        .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '/' || c == '=');
+
+    if !is_base64_chars {
+        return None;
+    }
+
+    // Count padding characters
+    let padding_count = text.chars().rev().take_while(|&c| c == '=').count();
+
+    // Padding should be at most 2 characters
+    if padding_count > 2 {
+        return None;
+    }
+
+    // Strip padding for length check
+    let content_len = text.len() - padding_count;
+
+    // Valid Base64 content length should avoid mod 4 remainder of 1
+    let remainder = (content_len + padding_count) % 4;
+    if remainder == 1 {
+        return None;
+    }
+
+    // Check for character diversity typical of Base64
+    let has_upper = text.chars().any(|c| c.is_ascii_uppercase());
+    let has_lower = text.chars().any(|c| c.is_ascii_lowercase());
+    let has_digit = text.chars().any(|c| c.is_ascii_digit());
+
+    // For strings with padding, the padding itself is strong evidence
+    // For strings without padding, require both upper and lowercase
+    // to avoid false positives on random alphanumeric strings
+    if padding_count == 0 {
+        // Require both upper and lower case for unpadded strings
+        if !has_upper || !has_lower {
+            return None;
+        }
+    } else {
+        // For padded strings, still require some diversity
+        let has_diversity = has_digit || (has_upper && has_lower);
+        if !has_diversity {
+            return None;
+        }
+    }
+
+    Some(Tag::Base64)
+}
+
+/// Classifies a printf-style format string
+///
+/// # Arguments
+/// * `text` - The text to check for format string patterns
+///
+/// # Returns
+/// Returns `Some(Tag::FormatString)` if valid, `None` otherwise.
+pub fn classify_format_string(text: &str) -> Option<Tag> {
+    // Find all format specifier matches
+    let matches: Vec<_> = FORMAT_STRING_REGEX.find_iter(text).collect();
+
+    if matches.is_empty() {
+        return None;
+    }
+
+    // Check if any match is a real format specifier (not just %%)
+    // %% is just an escaped percent sign, not a real format specifier
+    let has_real_specifier = matches.iter().any(|m| m.as_str() != "%%");
+
+    if !has_real_specifier {
+        return None;
+    }
+
+    // Exclude strings that are just a single format specifier
+    // (those are likely false positives)
+    if text.len() <= 2 {
+        return None;
+    }
+
+    Some(Tag::FormatString)
+}
+
+/// Classifies a user agent string
+///
+/// # Arguments
+/// * `text` - The text to check for user agent patterns
+///
+/// # Returns
+/// Returns `Some(Tag::UserAgent)` if valid, `None` otherwise.
+pub fn classify_user_agent(text: &str) -> Option<Tag> {
+    if USER_AGENT_REGEX.is_match(text) {
+        Some(Tag::UserAgent)
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_guid_valid_and_invalid() {
+        assert!(classify_guid("{12345678-1234-1234-1234-123456789ABC}").is_some());
+        assert!(classify_guid("12345678-1234-1234-1234-123456789ABC").is_some());
+        assert!(classify_guid("not-a-guid").is_none());
+    }
+
+    #[test]
+    fn test_email_valid_and_invalid() {
+        assert!(classify_email("user@example.com").is_some());
+        assert!(classify_email("not an email").is_none());
+    }
+
+    #[test]
+    fn test_base64_valid_and_invalid() {
+        assert!(classify_base64("SGVsbG8gV29ybGQh").is_some());
+        assert!(classify_base64("This is not base64!!").is_none());
+        assert!(classify_base64("YWJj").is_none());
+    }
+
+    #[test]
+    fn test_format_string_valid_and_invalid() {
+        assert!(classify_format_string("Error: %s at line %d").is_some());
+        assert!(classify_format_string("100%% done").is_none());
+    }
+
+    #[test]
+    fn test_user_agent_valid_and_invalid() {
+        assert!(
+            classify_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
+                .is_some()
+        );
+        assert!(classify_user_agent("Not a user agent").is_none());
+    }
+}
diff --git a/src/classification/patterns/ip.rs b/src/classification/patterns/ip.rs
new file mode 100644
index 0000000..bb64164
--- /dev/null
+++ b/src/classification/patterns/ip.rs
@@ -0,0 +1,195 @@
+//! IP address classification patterns
+//!
+//! This module provides IPv4 and IPv6 address detection functionality.
+
+use crate::types::Tag;
+use once_cell::sync::Lazy;
+use regex::Regex;
+use std::net::{Ipv4Addr, Ipv6Addr};
+use std::str::FromStr;
+
+/// Regular expression for matching IPv4 addresses
+///
+/// Pattern matches IPv4 addresses with proper octet validation (0-255).
+/// Matches the entire string (used after port stripping).
+pub(crate) static IPV4_REGEX: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap()
+});
+
+/// Regular expression for matching IPv6 addresses
+///
+/// This is a permissive pre-filter that only allows hex digits, colons,
+/// and dots (for IPv4-mapped suffixes). Canonical validation is still
+/// performed by std::net::Ipv6Addr::from_str.
+pub(crate) static IPV6_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^[0-9a-f:.]+$").unwrap());
+
+/// Regular expression for detecting and stripping port suffixes
+///
+/// Matches :port where port is in the valid range 0-65535.
+pub(crate) static PORT_SUFFIX_REGEX: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(
+        r":(?:[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])$",
+    )
+    .unwrap()
+});
+
+/// Regular expression for handling bracketed IPv6 addresses
+///
+/// Matches [IPv6] format used in URLs like [::1]:8080.
+pub(crate) static IPV6_BRACKETS_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^\[([^\]]+)\]$").unwrap());
+
+/// Strips the port suffix from an IP address string if present
+///
+/// # Arguments
+/// * `text` - The text that may contain a port suffix
+///
+/// # Returns
+/// The text with the port suffix removed, or the original text if no port found.
+pub fn strip_port(text: &str) -> &str {
+    if let Some(mat) = PORT_SUFFIX_REGEX.find(text) {
+        &text[..mat.start()]
+    } else {
+        text
+    }
+}
+
+/// Strips brackets from an IPv6 address if present
+///
+/// # Arguments
+/// * `text` - The text that may contain bracketed IPv6
+///
+/// # Returns
+/// The IPv6 address without brackets, or the original text if no brackets found.
+pub fn strip_ipv6_brackets(text: &str) -> &str {
+    if let Some(caps) = IPV6_BRACKETS_REGEX.captures(text)
+        && let Some(inner) = caps.get(1)
+    {
+        return inner.as_str();
+    }
+    text
+}
+
+/// Checks if the given text is a valid IPv4 address
+///
+/// This method first strips any port suffix, then validates the remaining
+/// text as an IPv4 address using both regex and standard library validation.
+///
+/// # Arguments
+/// * `text` - The text to check for IPv4 format
+///
+/// # Returns
+/// Returns `true` if the text is a valid IPv4 address.
+pub fn is_ipv4_address(text: &str) -> bool { + // Strip port suffix if present + let text_without_port = strip_port(text); + + // Two-stage validation: regex pre-filter first + if !IPV4_REGEX.is_match(text_without_port) { + return false; + } + + // Check for leading zeros in octets (e.g., 192.168.01.1 should be rejected) + for octet_str in text_without_port.split('.') { + // If an octet has more than 1 digit and starts with '0', it's invalid + if octet_str.len() > 1 && octet_str.starts_with('0') { + return false; + } + } + + // Validate using std::net::Ipv4Addr for correctness + // This is the authoritative check - regex is just a pre-filter + Ipv4Addr::from_str(text_without_port).is_ok() +} + +/// Checks if the given text is a valid IPv6 address +/// +/// This method handles bracketed IPv6 addresses (e.g., [::1]:8080), +/// strips any port suffix, and validates using both regex and standard library. +/// +/// # Arguments +/// * `text` - The text to check for IPv6 format +/// +/// # Returns +/// Returns `true` if the text is a valid IPv6 address. 
+pub fn is_ipv6_address(text: &str) -> bool {
+    // Handle bracketed IPv6 addresses like [::1] or [::1]:8080
+    let mut ip_text = text;
+
+    // Check for bracketed format
+    if text.starts_with('[') {
+        // Strip port from the full text first (e.g., [::1]:8080 -> [::1])
+        let without_port = strip_port(text);
+        // Now extract the IPv6 from brackets
+        ip_text = strip_ipv6_brackets(without_port);
+    }
+
+    // Permissive pre-filter to reject obvious non-IPv6 strings early
+    if !IPV6_REGEX.is_match(ip_text) {
+        return false;
+    }
+
+    // Basic structure check - must contain colon and only valid hex/colon characters
+    if !ip_text.contains(':') {
+        return false;
+    }
+
+    // Validate using std::net::Ipv6Addr for correctness
+    Ipv6Addr::from_str(ip_text).is_ok()
+}
+
+/// Classifies IP addresses (both IPv4 and IPv6) in the given text
+///
+/// # Arguments
+/// * `text` - The text to classify
+///
+/// # Returns
+/// A vector of tags (IPv4 and/or IPv6) that apply to the text.
+pub fn classify_ip_addresses(text: &str) -> Vec<Tag> {
+    let mut tags = Vec::new();
+
+    if is_ipv4_address(text) {
+        tags.push(Tag::IPv4);
+    }
+
+    if is_ipv6_address(text) {
+        tags.push(Tag::IPv6);
+    }
+
+    tags
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ipv4_valid_and_invalid() {
+        assert!(is_ipv4_address("192.168.1.1"));
+        assert!(is_ipv4_address("192.168.1.1:8080"));
+        assert!(!is_ipv4_address("256.1.1.1"));
+        assert!(!is_ipv4_address("01.02.03.04"));
+    }
+
+    #[test]
+    fn test_ipv6_valid_and_invalid() {
+        assert!(is_ipv6_address("2001:db8::1"));
+        assert!(is_ipv6_address("[::1]:8080"));
+        assert!(!is_ipv6_address("not an ipv6"));
+    }
+
+    #[test]
+    fn test_classify_ipv4_and_ipv6() {
+        let tags = classify_ip_addresses("192.168.1.1");
+        assert_eq!(tags, vec![Tag::IPv4]);
+
+        let tags = classify_ip_addresses("2001:db8::1");
+        assert_eq!(tags, vec![Tag::IPv6]);
+    }
+
+    #[test]
+    fn test_classify_no_ip() {
+        let tags = classify_ip_addresses("not an ip address");
+        assert!(tags.is_empty());
+    }
+} diff --git a/src/classification/patterns/mod.rs b/src/classification/patterns/mod.rs new file mode 100644 index 0000000..2852fdf --- /dev/null +++ b/src/classification/patterns/mod.rs @@ -0,0 +1,34 @@ +//! Pattern classification modules +//! +//! This module contains submodules for different types of pattern classification: +//! - `ip`: IPv4 and IPv6 address detection +//! - `network`: URL and domain detection +//! - `paths`: File and registry path detection +//! - `data`: GUID, email, Base64, format string, and user agent detection + +pub mod data; +pub mod ip; +pub mod network; +pub mod paths; + +// Re-export classification functions +pub use data::{ + classify_base64, classify_email, classify_format_string, classify_guid, classify_user_agent, +}; +pub use ip::{ + classify_ip_addresses, is_ipv4_address, is_ipv6_address, strip_ipv6_brackets, strip_port, +}; +pub use network::{classify_domain, classify_url, has_valid_tld}; +pub use paths::{ + classify_posix_path, classify_registry_path, classify_unc_path, classify_windows_path, + is_suspicious_posix_path, is_suspicious_registry_path, is_suspicious_windows_path, + is_valid_posix_path, is_valid_registry_path, is_valid_windows_path, +}; + +// Re-export regex patterns needed by SemanticClassifier for cache testing +pub(crate) use ip::{IPV4_REGEX, IPV6_REGEX}; +pub(crate) use network::{DOMAIN_REGEX, URL_REGEX}; +pub(crate) use paths::{ + POSIX_PATH_REGEX, REGISTRY_ABBREV_REGEX, REGISTRY_PATH_REGEX, UNC_PATH_REGEX, + WINDOWS_PATH_REGEX, +}; diff --git a/src/classification/patterns/network.rs b/src/classification/patterns/network.rs new file mode 100644 index 0000000..1ae6cb5 --- /dev/null +++ b/src/classification/patterns/network.rs @@ -0,0 +1,137 @@ +//! Network indicator classification patterns +//! +//! This module provides URL and domain name detection functionality. 
+
+use crate::types::Tag;
+use once_cell::sync::Lazy;
+use regex::Regex;
+use std::collections::HashSet;
+
+/// Regular expression for matching HTTP/HTTPS URLs
+///
+/// Pattern matches URLs starting with http:// or https:// and excludes
+/// problematic characters that could cause false positives.
+pub(crate) static URL_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"https?://[^\s<>"{}|\\\^\[\]\`]+"#).unwrap());
+
+/// Regular expression for matching domain names
+///
+/// Pattern matches domain names with proper DNS format compliance (RFC 1035).
+/// It ensures domains start and end with alphanumeric characters, allows hyphens
+/// in the middle, and requires at least a 2-character TLD.
+pub(crate) static DOMAIN_REGEX: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b").unwrap()
+});
+
+/// List of common TLDs for validation
+static COMMON_TLDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    HashSet::from([
+        "com", "org", "net", "edu", "gov", "mil", "int", "io", "co", "uk", "de", "fr", "jp", "cn",
+        "ru", "br", "in", "au", "ca", "es", "it", "nl", "pl", "se", "ch", "at", "be", "dk", "fi",
+        "no", "pt", "cz", "hu", "ro", "bg", "hr", "sk", "si", "ee", "lt", "lv", "ie", "gr", "cy",
+        "mt", "lu", "info", "biz", "name", "pro", "aero", "coop", "museum", "travel", "jobs",
+        "mobi", "tel", "asia", "cat", "xxx", "app", "dev", "page", "blog", "shop", "store",
+        "online", "site", "website", "tech", "cloud", "ai", "ml", "tv", "me", "cc", "ws", "bz",
+        "nu", "tk", "ga", "cf", "gq",
+    ])
+});
+
+/// Checks if the domain has a valid TLD
+///
+/// Trailing dots are treated as invalid to avoid accepting empty TLDs.
+///
+/// # Arguments
+/// * `domain` - The domain name to validate
+///
+/// # Returns
+/// Returns `true` if the domain has a known TLD.
+pub fn has_valid_tld(domain: &str) -> bool {
+    if domain.ends_with('.') {
+        return false;
+    }
+    if let Some(dot_pos) = domain.rfind('.') {
+        let tld = &domain[dot_pos + 1..];
+        let tld_lower = tld.to_lowercase();
+        COMMON_TLDS.contains(tld_lower.as_str())
+    } else {
+        false
+    }
+}
+
+/// Detects HTTP/HTTPS URLs in the given text
+///
+/// This method identifies URLs that start with `http://` or `https://`
+/// and contain valid URL characters.
+///
+/// # Arguments
+/// * `text` - The text to search for URLs
+///
+/// # Returns
+/// Returns `Some(Tag::Url)` if a URL is found, `None` otherwise.
+pub fn classify_url(text: &str) -> Option<Tag> {
+    if URL_REGEX.is_match(text) {
+        Some(Tag::Url)
+    } else {
+        None
+    }
+}
+
+/// Detects domain names that are not URLs
+///
+/// This method identifies domain names that match the domain pattern but
+/// are not already identified as URLs. It first checks if the text is NOT
+/// a URL to prevent double-tagging, then validates against the domain
+/// pattern and TLD list.
+///
+/// # Arguments
+/// * `text` - The text to search for domain names
+///
+/// # Returns
+/// Returns `Some(Tag::Domain)` if a valid domain is found (and it's not
+/// a URL), `None` otherwise.
+pub fn classify_domain(text: &str) -> Option<Tag> {
+    // First check if it's NOT a URL to prevent double-tagging
+    if URL_REGEX.is_match(text) {
+        return None;
+    }
+
+    // Check if it matches the domain pattern
+    if DOMAIN_REGEX.is_match(text) {
+        // Validate TLD to reduce false positives
+        if has_valid_tld(text) {
+            return Some(Tag::Domain);
+        }
+    }
+
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_url_valid_and_invalid() {
+        assert!(classify_url("https://example.com").is_some());
+        assert!(classify_url("not a url").is_none());
+    }
+
+    #[test]
+    fn test_domain_valid_and_invalid() {
+        assert!(classify_domain("example.com").is_some());
+        assert!(classify_domain("https://example.com").is_none());
+        assert!(classify_domain("invalid.xyz123").is_none());
+    }
+
+    #[test]
+    fn test_url_not_double_tagged() {
+        assert!(classify_url("https://example.com").is_some());
+        assert!(classify_domain("https://example.com").is_none());
+    }
+
+    #[test]
+    fn test_tld_validation() {
+        assert!(has_valid_tld("example.com"));
+        assert!(!has_valid_tld("nodot"));
+    }
+}
diff --git a/src/classification/patterns/paths.rs b/src/classification/patterns/paths.rs
new file mode 100644
index 0000000..ca9cd4b
--- /dev/null
+++ b/src/classification/patterns/paths.rs
@@ -0,0 +1,397 @@
+//! File and registry path classification patterns
+//!
+//! This module provides POSIX, Windows, UNC, and registry path detection.
+
+use crate::types::Tag;
+use once_cell::sync::Lazy;
+use regex::Regex;
+use std::collections::HashSet;
+
+/// Regular expression for matching POSIX file paths
+pub(crate) static POSIX_PATH_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^/[^\x00\n\r]*").unwrap());
+
+/// Regular expression for matching Windows file paths
+pub(crate) static WINDOWS_PATH_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^[A-Za-z]:\\[^\x00\n\r]*").unwrap());
+
+/// Regular expression for matching UNC network paths
+pub(crate) static UNC_PATH_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^\\\\[a-zA-Z0-9.-]+\\[^\x00\n\r]*").unwrap());
+
+/// Regular expression for matching full Windows registry paths
+pub(crate) static REGISTRY_PATH_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)^HKEY_[A-Z_]+\\[^\x00\n\r]*").unwrap());
+
+/// Regular expression for matching abbreviated registry paths
+pub(crate) static REGISTRY_ABBREV_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"(?i)^HK(LM|CU|CR|U|CC)\\[^\x00\n\r]*").unwrap());
+
+/// Common suspicious POSIX path prefixes for persistence detection
+static SUSPICIOUS_POSIX_PATHS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    HashSet::from([
+        "/etc/cron.d/",
+        "/etc/init.d/",
+        "/usr/local/bin/",
+        "/tmp/",
+        "/var/tmp/",
+        "/etc/rc.d/",
+        "/etc/crontab",
+        "/etc/systemd/system/",
+        "/Library/LaunchDaemons/",
+        "/Library/LaunchAgents/",
+    ])
+});
+
+/// Common suspicious Windows path prefixes for persistence detection
+static SUSPICIOUS_WINDOWS_PATHS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    HashSet::from([
+        "C:\\Windows\\System32\\",
+        "C:\\Windows\\Temp\\",
+        "\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\",
+        "C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\",
+        "C:\\Windows\\SysWOW64\\",
+    ])
+});
+
+/// Known valid POSIX path prefixes
+static KNOWN_POSIX_PREFIXES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    HashSet::from([
+        "/usr/", "/etc/", "/var/", "/home/", "/opt/", "/bin/", "/sbin/", "/lib/", "/dev/",
+        "/proc/", "/sys/", "/tmp/",
+    ])
+});
+
+/// Known valid Windows path
prefixes
+static KNOWN_WINDOWS_PREFIXES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    HashSet::from([
+        "C:\\Windows\\",
+        "C:\\Program Files\\",
+        "C:\\Program Files (x86)\\",
+        "C:\\Users\\",
+        "C:\\ProgramData\\",
+    ])
+});
+
+/// Valid Windows registry root keys
+static VALID_REGISTRY_ROOTS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    HashSet::from([
+        "HKEY_LOCAL_MACHINE",
+        "HKEY_CURRENT_USER",
+        "HKEY_CLASSES_ROOT",
+        "HKEY_USERS",
+        "HKEY_CURRENT_CONFIG",
+    ])
+});
+
+/// Suspicious Windows registry paths for persistence detection
+static SUSPICIOUS_REGISTRY_PATHS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+    HashSet::from([
+        "\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run",
+        "\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce",
+        "\\System\\CurrentControlSet\\Services",
+        "\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon",
+        "\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders",
+    ])
+});
+
+/// Checks if a path contains ASCII case-insensitive substring
+fn contains_ascii_case_insensitive(haystack: &str, needle: &str) -> bool {
+    if needle.is_empty() {
+        return true;
+    }
+
+    let haystack_bytes = haystack.as_bytes();
+    let needle_bytes = needle.as_bytes();
+
+    if needle_bytes.len() > haystack_bytes.len() {
+        return false;
+    }
+
+    for start in 0..=haystack_bytes.len() - needle_bytes.len() {
+        let mut matched = true;
+        for i in 0..needle_bytes.len() {
+            let hay = haystack_bytes[start + i].to_ascii_lowercase();
+            let nee = needle_bytes[i].to_ascii_lowercase();
+            if hay != nee {
+                matched = false;
+                break;
+            }
+        }
+        if matched {
+            return true;
+        }
+    }
+
+    false
+}
+
+fn starts_with_ascii_case_insensitive(text: &str, prefix: &str) -> bool {
+    if prefix.len() > text.len() {
+        return false;
+    }
+
+    text.as_bytes()
+        .iter()
+        .take(prefix.len())
+        .zip(prefix.as_bytes())
+        .all(|(left, right)| left.eq_ignore_ascii_case(right))
+}
+
+const AUTOSTART_POSIX_SUBPATH: &str = "/.config/autostart/";
+
+/// Checks if text contains printf-style format placeholders
+fn
contains_printf_placeholder(text: &str) -> bool { + // Look for common printf patterns that might appear in paths + let patterns = [ + "%s", "%d", "%x", "%u", "%i", "%f", "%c", "%p", "%n", "%ld", "%lu", + ]; + patterns.iter().any(|pattern| text.contains(pattern)) +} + +/// Checks if text contains control characters +fn contains_control_chars(text: &str) -> bool { + text.chars().any(|c| c.is_control() && c != '\t') +} + +/// Validates a POSIX path +pub fn is_valid_posix_path(text: &str) -> bool { + // Must start with / and have at least one more character + if !text.starts_with('/') || text.len() < 2 { + return false; + } + + // Check for null bytes or control characters + if contains_control_chars(text) { + return false; + } + + // Check for known prefixes to boost confidence + for prefix in KNOWN_POSIX_PREFIXES.iter() { + if text.starts_with(prefix) { + return true; + } + } + + // Additional validation for paths that don't start with known prefixes + // Must have at least one directory separator beyond the root + if text.len() > 1 && text[1..].contains('/') { + return true; + } + + // Single directory under root (e.g., "/bin") - needs to be at least 3 chars + text.len() >= 3 +} + +/// Validates a Windows path +pub fn is_valid_windows_path(text: &str) -> bool { + // Must match the basic pattern + if !WINDOWS_PATH_REGEX.is_match(text) { + return false; + } + + // Check for null bytes or control characters + if contains_control_chars(text) { + return false; + } + + // Validate drive letter is A-Z + let first_char = text.chars().next().unwrap_or(' '); + if !first_char.is_ascii_alphabetic() { + return false; + } + + // Check for known prefixes to boost confidence + for prefix in KNOWN_WINDOWS_PREFIXES.iter() { + if starts_with_ascii_case_insensitive(text, prefix) { + return true; + } + } + + // Path should have at least some content after the drive letter + text.len() >= 4 +} + +/// Validates a registry path +pub fn is_valid_registry_path(text: &str) -> bool { + let 
upper_text = text.to_uppercase(); + + // Check for full registry root + if upper_text.starts_with("HKEY_") { + // Extract root key + if let Some(slash_pos) = text.find('\\') { + let root = &upper_text[..slash_pos]; + if VALID_REGISTRY_ROOTS.contains(root) { + return true; + } + } + } + + // Check for abbreviated forms (case-insensitive) + if REGISTRY_ABBREV_REGEX.is_match(text) { + return true; + } + + // Also accept paths that use forward slashes (some tools do this) + if upper_text.starts_with("HKEY_") + && text.contains('/') + && let Some(slash_pos) = text.find('/') + { + let root = &upper_text[..slash_pos]; + if VALID_REGISTRY_ROOTS.contains(root) { + return true; + } + } + + false +} + +/// Classifies a POSIX path +/// +/// # Arguments +/// * `text` - The text to check for POSIX path format +/// +/// # Returns +/// Returns `Some(Tag::FilePath)` if valid, `None` otherwise. +pub fn classify_posix_path(text: &str) -> Option { + if POSIX_PATH_REGEX.is_match(text) && is_valid_posix_path(text) { + Some(Tag::FilePath) + } else { + None + } +} + +/// Classifies a Windows path +/// +/// # Arguments +/// * `text` - The text to check for Windows path format +/// +/// # Returns +/// Returns `Some(Tag::FilePath)` if valid, `None` otherwise. +pub fn classify_windows_path(text: &str) -> Option { + // Skip if it looks like a printf format string + if contains_printf_placeholder(text) { + return None; + } + + if WINDOWS_PATH_REGEX.is_match(text) && is_valid_windows_path(text) { + Some(Tag::FilePath) + } else { + None + } +} + +/// Classifies a UNC network path +/// +/// # Arguments +/// * `text` - The text to check for UNC path format +/// +/// # Returns +/// Returns `Some(Tag::FilePath)` if valid, `None` otherwise. 
+pub fn classify_unc_path(text: &str) -> Option { + if UNC_PATH_REGEX.is_match(text) { + // Basic validation - must have server and share + let parts: Vec<&str> = text.split('\\').collect(); + // parts[0] and parts[1] are empty (before \\), parts[2] is server, parts[3] is share + if parts.len() >= 4 && !parts[2].is_empty() && !parts[3].is_empty() { + return Some(Tag::FilePath); + } + } + None +} + +/// Classifies a Windows registry path +/// +/// # Arguments +/// * `text` - The text to check for registry path format +/// +/// # Returns +/// Returns `Some(Tag::RegistryPath)` if valid, `None` otherwise. +pub fn classify_registry_path(text: &str) -> Option { + // is_valid_registry_path handles both backslash and forward-slash styles + if is_valid_registry_path(text) { + Some(Tag::RegistryPath) + } else { + None + } +} + +/// Checks if a POSIX path is suspicious (persistence-related) +pub fn is_suspicious_posix_path(text: &str) -> bool { + if (text.starts_with("/home/") || text.starts_with("/Users/")) + && text.contains(AUTOSTART_POSIX_SUBPATH) + { + return true; + } + SUSPICIOUS_POSIX_PATHS.iter().any(|p| text.starts_with(p)) +} + +/// Checks if a Windows path is suspicious (persistence-related) +/// +/// Uses prefix matching since suspicious Windows paths are anchored to +/// well-known base directories like C:\Windows\. +pub fn is_suspicious_windows_path(text: &str) -> bool { + SUSPICIOUS_WINDOWS_PATHS + .iter() + .any(|p| starts_with_ascii_case_insensitive(text, p)) +} + +/// Checks if a registry path is suspicious (persistence-related) +/// +/// Uses substring matching since relevant registry keys can appear anywhere +/// within a longer path string. 
+pub fn is_suspicious_registry_path(text: &str) -> bool { + SUSPICIOUS_REGISTRY_PATHS + .iter() + .any(|p| contains_ascii_case_insensitive(text, p)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_posix_path_valid_and_invalid() { + assert!(classify_posix_path("/usr/bin/bash").is_some()); + assert!(classify_posix_path("/").is_none()); + assert!(classify_posix_path("not/a/path").is_none()); + } + + #[test] + fn test_windows_path_valid_and_invalid() { + assert!(classify_windows_path("C:\\Windows\\System32").is_some()); + assert!(classify_windows_path("/unix/path").is_none()); + assert!(classify_windows_path("1:\\Invalid\\Path").is_none()); + } + + #[test] + fn test_unc_path_valid_and_invalid() { + assert!(classify_unc_path("\\\\server\\share\\file.txt").is_some()); + assert!(classify_unc_path("\\\\server").is_none()); + } + + #[test] + fn test_classify_unc_path_missing_share() { + assert!(classify_unc_path("\\\\server\\").is_none()); + } + + #[test] + fn test_registry_path_valid_and_invalid() { + assert!( + classify_registry_path( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" + ) + .is_some() + ); + assert!(classify_registry_path("HKEY_INVALID\\Path").is_none()); + } + + #[test] + fn test_suspicious_paths() { + assert!(is_suspicious_posix_path("/etc/cron.d/malicious")); + assert!(is_suspicious_windows_path("C:\\Windows\\System32\\cmd.exe")); + assert!(is_suspicious_registry_path( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" + )); + } +} diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index b83bdfd..c6df7a7 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -10,6 +10,11 @@ //! - IPv4 and IPv6 addresses //! - POSIX and Windows file paths (including UNC paths) //! - Windows registry paths +//! - GUIDs/UUIDs +//! - Email addresses +//! - Base64-encoded data +//! - Printf-style format strings +//! - User agent strings //! //! 
# Usage //! @@ -39,184 +44,35 @@ //! assert_eq!(tags.len(), 1); //! assert!(matches!(tags[0], stringy::types::Tag::Url)); //! ``` -//! -//! # Patterns -//! -//! - **URLs**: Matches HTTP and HTTPS URLs using a pattern that excludes -//! problematic characters that could cause false positives. -//! -//! - **Domains**: Matches domain names using RFC 1035 compliant patterns -//! with additional TLD validation against a hardcoded list of common TLDs. +use super::patterns; use crate::types::{FoundString, Tag}; -use lazy_static::lazy_static; +use patterns::{ + DOMAIN_REGEX, IPV4_REGEX, IPV6_REGEX, POSIX_PATH_REGEX, REGISTRY_ABBREV_REGEX, + REGISTRY_PATH_REGEX, UNC_PATH_REGEX, URL_REGEX, WINDOWS_PATH_REGEX, +}; use regex::Regex; -use std::net::{Ipv4Addr, Ipv6Addr}; -use std::str::FromStr; - -lazy_static! { - /// Regular expression for matching HTTP/HTTPS URLs - /// - /// Pattern matches URLs starting with http:// or https:// and excludes - /// problematic characters that could cause false positives. - static ref URL_REGEX: Regex = Regex::new(r#"https?://[^\s<>"{}|\\\^\[\]\`]+"#).unwrap(); - - /// Regular expression for matching domain names - /// - /// Pattern matches domain names with proper DNS format compliance (RFC 1035). - /// It ensures domains start and end with alphanumeric characters, allows hyphens - /// in the middle, and requires at least a 2-character TLD. - static ref DOMAIN_REGEX: Regex = Regex::new(r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b").unwrap(); - - /// Regular expression for matching IPv4 addresses - /// - /// Pattern matches IPv4 addresses with proper octet validation (0-255). - /// Matches the entire string (used after port stripping). 
- static ref IPV4_REGEX: Regex = Regex::new(r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); - - /// Regular expression for matching IPv6 addresses - /// - /// Pattern matches IPv6 addresses including: - /// - Full notation: 2001:0db8:85a3:0000:0000:8a2e:0370:7334 - /// - Compressed notation: 2001:db8::1, ::1, fe80::1 - /// - Mixed notation: ::ffff:192.0.2.1, 64:ff9b::192.0.2.1 - /// This is a permissive pattern that checks for basic IPv6 structure (colons and hex digits). - /// Actual validation is performed by std::net::Ipv6Addr::from_str. - static ref IPV6_REGEX: Regex = Regex::new(r"(?i)^(?:[0-9a-f]{1,4}:){1,7}[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,7}:$|^(?:[0-9a-f]{1,4}:){1,6}:[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,5}(?::[0-9a-f]{1,4}){1,2}$|^(?:[0-9a-f]{1,4}:){1,4}(?::[0-9a-f]{1,4}){1,3}$|^(?:[0-9a-f]{1,4}:){1,3}(?::[0-9a-f]{1,4}){1,4}$|^(?:[0-9a-f]{1,4}:){1,2}(?::[0-9a-f]{1,4}){1,5}$|^[0-9a-f]{1,4}:(?::[0-9a-f]{1,4}){1,6}$|^:(?::[0-9a-f]{1,4}){1,7}$|^::$|^::ffff:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); - - /// Regular expression for detecting and stripping port suffixes - /// - /// Matches :port where port is in the valid range 0-65535. - /// Pattern: :[0-9]{1,4} matches 0-9999, |[1-5][0-9]{4} matches 10000-59999, - /// |6[0-4][0-9]{3} matches 60000-64999, |65[0-4][0-9]{2} matches 65000-65499, - /// |655[0-2][0-9] matches 65500-65529, |6553[0-5] matches 65530-65535. - static ref PORT_SUFFIX_REGEX: Regex = Regex::new(r":(?:[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])$").unwrap(); - - /// Regular expression for handling bracketed IPv6 addresses - /// - /// Matches [IPv6] format used in URLs like [::1]:8080. 
- static ref IPV6_BRACKETS_REGEX: Regex = Regex::new(r"^\[(.+)\]").unwrap(); - - /// Regular expression for matching POSIX file paths - /// - /// Pattern matches absolute POSIX paths starting with / followed by any characters - /// except null bytes, newlines, or carriage returns. - static ref POSIX_PATH_REGEX: Regex = Regex::new(r"^/[^\x00\n\r]*").unwrap(); - - /// Regular expression for matching Windows file paths - /// - /// Pattern matches Windows absolute paths starting with drive letter (C:\) - /// followed by any characters except null bytes, newlines, or carriage returns. - static ref WINDOWS_PATH_REGEX: Regex = Regex::new(r"^[A-Za-z]:\\[^\x00\n\r]*").unwrap(); - - /// Regular expression for matching UNC network paths - /// - /// Pattern matches UNC paths starting with \\ followed by server name and share. - static ref UNC_PATH_REGEX: Regex = Regex::new(r"^\\\\[a-zA-Z0-9.-]+\\[^\x00\n\r]*").unwrap(); - /// Regular expression for matching full Windows registry paths - /// - /// Pattern matches registry paths starting with HKEY_ root keys (case-insensitive). - static ref REGISTRY_PATH_REGEX: Regex = Regex::new(r"(?i)^HKEY_[A-Z_]+\\[^\x00\n\r]*").unwrap(); - - /// Regular expression for matching abbreviated registry paths - /// - /// Pattern matches abbreviated registry forms like HKLM, HKCU, etc. (case-insensitive). - static ref REGISTRY_ABBREV_REGEX: Regex = Regex::new(r"(?i)^HK(LM|CU|CR|U|CC)\\[^\x00\n\r]*").unwrap(); -} - -lazy_static! 
{ - /// Common suspicious POSIX path prefixes for persistence detection - static ref SUSPICIOUS_POSIX_PATHS: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("/etc/cron.d/"); - set.insert("/etc/init.d/"); - set.insert("/usr/local/bin/"); - set.insert("/tmp/"); - set.insert("/var/tmp/"); - set.insert("/etc/rc.d/"); - set.insert("/etc/crontab"); - set.insert("/etc/systemd/system/"); - set.insert("~/.config/autostart/"); - set.insert("/Library/LaunchDaemons/"); - set.insert("/Library/LaunchAgents/"); - set - }; - - /// Common suspicious Windows path prefixes for persistence detection - static ref SUSPICIOUS_WINDOWS_PATHS: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("C:\\Windows\\System32\\"); - set.insert("C:\\Windows\\Temp\\"); - set.insert("\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); - set.insert("C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); - set.insert("C:\\Windows\\SysWOW64\\"); - set - }; - - /// Known valid POSIX path prefixes - static ref KNOWN_POSIX_PREFIXES: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("/usr/"); - set.insert("/etc/"); - set.insert("/var/"); - set.insert("/home/"); - set.insert("/opt/"); - set.insert("/bin/"); - set.insert("/sbin/"); - set.insert("/lib/"); - set.insert("/dev/"); - set.insert("/proc/"); - set.insert("/sys/"); - set.insert("/tmp/"); - set - }; - - /// Known valid Windows path prefixes - static ref KNOWN_WINDOWS_PREFIXES: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("C:\\Windows\\"); - set.insert("C:\\Program Files\\"); - set.insert("C:\\Program Files (x86)\\"); - set.insert("C:\\Users\\"); - set.insert("C:\\ProgramData\\"); - set - }; - - /// Valid Windows registry root keys - static ref VALID_REGISTRY_ROOTS: 
std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("HKEY_LOCAL_MACHINE"); - set.insert("HKEY_CURRENT_USER"); - set.insert("HKEY_CLASSES_ROOT"); - set.insert("HKEY_USERS"); - set.insert("HKEY_CURRENT_CONFIG"); - set - }; - - /// Suspicious Windows registry paths for persistence detection - static ref SUSPICIOUS_REGISTRY_PATHS: std::collections::HashSet<&'static str> = { - let mut set = std::collections::HashSet::new(); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run"); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce"); - set.insert("\\System\\CurrentControlSet\\Services"); - set.insert("\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon"); - set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders"); - set - }; -} +// Re-export pattern functions for backward compatibility +pub use patterns::{ + classify_base64, classify_domain, classify_email, classify_format_string, classify_guid, + classify_ip_addresses, classify_posix_path, classify_registry_path, classify_unc_path, + classify_url, classify_user_agent, classify_windows_path, has_valid_tld, is_ipv4_address, + is_ipv6_address, is_suspicious_posix_path, is_suspicious_registry_path, + is_suspicious_windows_path, is_valid_posix_path, is_valid_registry_path, is_valid_windows_path, + strip_ipv6_brackets, strip_port, +}; /// Semantic classifier for identifying network indicators in extracted strings /// -/// The `SemanticClassifier` provides methods to detect URLs and domain names +/// The `SemanticClassifier` provides methods to detect URLs, domain names, +/// IP addresses, file paths, registry paths, GUIDs, emails, and other patterns /// within text content. It uses compiled regular expressions for efficient -/// pattern matching and includes TLD validation to reduce false positives. 
-/// -/// URLs are prioritized over domains to prevent double-tagging - if a string -/// matches both patterns, it will only be tagged as a URL. +/// pattern matching and includes validation to reduce false positives. #[derive(Debug, Default)] pub struct SemanticClassifier; +/// Internal struct for regex cache address verification (used in testing) #[doc(hidden)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct RegexCacheAddresses { @@ -233,11 +89,14 @@ pub struct RegexCacheAddresses { impl SemanticClassifier { /// Create a new instance of the semantic classifier + #[must_use] pub fn new() -> Self { Self } + /// Returns memory addresses of cached regex patterns (for testing) #[doc(hidden)] + #[must_use] pub fn regex_cache_addresses(&self) -> RegexCacheAddresses { RegexCacheAddresses { url: &*URL_REGEX as *const Regex as usize, @@ -255,8 +114,7 @@ impl SemanticClassifier { /// Detects HTTP/HTTPS URLs in the given text /// /// This method identifies URLs that start with `http://` or `https://` - /// and contain valid URL characters. The pattern excludes problematic - /// characters to avoid false positives. + /// and contain valid URL characters. /// /// # Arguments /// @@ -265,31 +123,15 @@ impl SemanticClassifier { /// # Returns /// /// Returns `Some(Tag::Url)` if a URL is found, `None` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::Tag; - /// - /// let classifier = SemanticClassifier::new(); - /// assert_eq!(classifier.classify_url("https://example.com"), Some(Tag::Url)); - /// assert_eq!(classifier.classify_url("example.com"), None); - /// ``` + #[must_use] pub fn classify_url(&self, text: &str) -> Option { - if URL_REGEX.is_match(text) { - Some(Tag::Url) - } else { - None - } + classify_url(text) } /// Detects domain names that are not URLs /// /// This method identifies domain names that match the domain pattern but - /// are not already identified as URLs. 
It first checks if the text is NOT - /// a URL to prevent double-tagging, then validates against the domain - /// pattern and TLD list. + /// are not already identified as URLs. /// /// # Arguments /// @@ -297,34 +139,10 @@ impl SemanticClassifier { /// /// # Returns /// - /// Returns `Some(Tag::Domain)` if a valid domain is found (and it's not - /// a URL), `None` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::Tag; - /// - /// let classifier = SemanticClassifier::new(); - /// assert_eq!(classifier.classify_domain("example.com"), Some(Tag::Domain)); - /// assert_eq!(classifier.classify_domain("https://example.com"), None); - /// ``` + /// Returns `Some(Tag::Domain)` if a valid domain is found, `None` otherwise. + #[must_use] pub fn classify_domain(&self, text: &str) -> Option { - // First check if it's NOT a URL to prevent double-tagging - if URL_REGEX.is_match(text) { - return None; - } - - // Check if it matches the domain pattern - if DOMAIN_REGEX.is_match(text) { - // Validate TLD to reduce false positives - if self.has_valid_tld(text) { - return Some(Tag::Domain); - } - } - - None + classify_domain(text) } /// Main entry point for semantic classification @@ -332,7 +150,7 @@ impl SemanticClassifier { /// This method analyzes a `FoundString` and returns a vector of semantic /// tags that apply to the string. URLs are checked first, then domains /// (which automatically excludes URLs to prevent double-tagging), then - /// IP addresses (IPv4 and IPv6). + /// IP addresses (IPv4 and IPv6), file paths, and other patterns. /// /// # Arguments /// @@ -340,672 +158,190 @@ impl SemanticClassifier { /// /// # Returns /// - /// Returns a vector of `Tag` values that apply to the string. The vector - /// may be empty if no patterns match. 
- /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::{FoundString, Encoding, StringSource, Tag}; - /// - /// let classifier = SemanticClassifier::new(); - /// let found_string = FoundString { - /// text: "https://example.com".to_string(), - /// original_text: None, - /// encoding: Encoding::Ascii, - /// offset: 0, - /// rva: None, - /// section: None, - /// length: 19, - /// tags: Vec::new(), - /// score: 0, - /// section_weight: None, - /// semantic_boost: None, - /// noise_penalty: None, - /// source: StringSource::SectionData, - /// confidence: 1.0, - /// }; - /// - /// let tags = classifier.classify(&found_string); - /// assert_eq!(tags.len(), 1); - /// assert!(matches!(tags[0], Tag::Url)); - /// ``` + /// Returns a vector of `Tag` values that apply to the string. + #[must_use] pub fn classify(&self, string: &FoundString) -> Vec { let mut tags = Vec::new(); // Check for URLs first - if let Some(tag) = self.classify_url(&string.text) { + if let Some(tag) = classify_url(&string.text) { tags.push(tag); } // Check for domains (this will automatically exclude URLs) - if let Some(tag) = self.classify_domain(&string.text) { + if let Some(tag) = classify_domain(&string.text) { tags.push(tag); } // Check for IP addresses (IPv4 and IPv6) - let ip_tags = self.classify_ip_addresses(&string.text); + let ip_tags = classify_ip_addresses(&string.text); tags.extend(ip_tags); // Check for file paths (POSIX, Windows, UNC) - only add FilePath tag once - if self.classify_posix_path(&string.text).is_some() - || self.classify_windows_path(&string.text).is_some() - || self.classify_unc_path(&string.text).is_some() + if classify_posix_path(&string.text).is_some() + || classify_windows_path(&string.text).is_some() + || classify_unc_path(&string.text).is_some() { tags.push(Tag::FilePath); } // Check for registry paths - if let Some(tag) = self.classify_registry_path(&string.text) { + if let Some(tag) = 
classify_registry_path(&string.text) { tags.push(tag); } - tags - } + // Check for GUIDs + if let Some(tag) = classify_guid(&string.text) { + tags.push(tag); + } - /// Validates the top-level domain (TLD) against a hardcoded list - /// - /// This method extracts the TLD from a domain string and validates it - /// against a comprehensive list of common TLDs. This helps reduce false - /// positives by ensuring domains have valid TLDs. - /// - /// # Arguments - /// - /// * `domain` - The domain string to validate - /// - /// # Returns - /// - /// Returns `true` if the TLD is valid and at least 2 characters long, - /// `false` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// - /// let classifier = SemanticClassifier::new(); - /// assert!(classifier.has_valid_tld("example.com")); - /// assert!(!classifier.has_valid_tld("example.x")); - /// ``` - fn has_valid_tld(&self, domain: &str) -> bool { - // Extract TLD (last segment after final dot) - let tld = domain.split('.').next_back().unwrap_or(""); - - // TLD must be at least 2 characters - if tld.len() < 2 { - return false; + // Check for email addresses + if let Some(tag) = classify_email(&string.text) { + tags.push(tag); } - // Normalize TLD to lowercase for case-insensitive validation - let tld_lower = tld.to_ascii_lowercase(); - - // Validate against hardcoded list of common TLDs - let valid_tlds = [ - // Generic TLDs - "com", - "net", - "org", - "io", - "co", - // Country code TLDs - "gov", - "edu", - "mil", - "int", - "uk", - "de", - "fr", - "jp", - "cn", - "au", - "ca", - "ru", - "br", - "in", - "nl", - "eu", - // New gTLDs - "info", - "biz", - "dev", - "app", - "cloud", - "tech", - "online", - "site", - "xyz", - "top", - "win", - "bid", - // Additional common TLDs - "me", - "tv", - "cc", - "ws", - "name", - "pro", - "mobi", - "asia", - "tel", - "travel", - "jobs", - "museum", - "aero", - "coop", - "cat", - "xxx", - "post", - "arpa", - "test", - 
"example", - "localhost", - ]; - - valid_tlds.contains(&tld_lower.as_str()) + // Check for format strings + if let Some(tag) = classify_format_string(&string.text) { + tags.push(tag); + } + + // Check for user agent strings + if let Some(tag) = classify_user_agent(&string.text) { + tags.push(tag); + } + + // Check for Base64 (broad tag - checked last as it has more false positives) + if let Some(tag) = classify_base64(&string.text) { + tags.push(tag); + } + + tags + } + + /// Validates a TLD against the known list + #[must_use] + pub fn has_valid_tld(&self, domain: &str) -> bool { + has_valid_tld(domain) } /// Strips port suffix from an IP address string - /// - /// Removes `:port` suffix if present (e.g., `192.168.1.1:8080` → `192.168.1.1`). - /// - /// # Arguments - /// - /// * `text` - The text that may contain a port suffix - /// - /// # Returns - /// - /// Returns a string slice without the port suffix. - fn strip_port<'a>(&self, text: &'a str) -> &'a str { - PORT_SUFFIX_REGEX - .find(text) - .map_or(text, |m| &text[..m.start()]) + #[must_use] + pub fn strip_port<'a>(&self, text: &'a str) -> &'a str { + strip_port(text) } - /// Strips bracketed notation from IPv6 addresses - /// - /// Removes `[` and `]` from bracketed IPv6 addresses (e.g., `[::1]` → `::1`). - /// - /// # Arguments - /// - /// * `text` - The text that may contain bracketed IPv6 notation - /// - /// # Returns - /// - /// Returns a string slice without brackets, or the original text if no brackets found. - fn strip_ipv6_brackets<'a>(&self, text: &'a str) -> &'a str { - IPV6_BRACKETS_REGEX - .captures(text) - .and_then(|caps| caps.get(1)) - .map_or(text, |m| m.as_str()) + /// Strips brackets from IPv6 address + #[must_use] + pub fn strip_ipv6_brackets<'a>(&self, text: &'a str) -> &'a str { + strip_ipv6_brackets(text) } - /// Detects IPv4 addresses in the given text - /// - /// This method uses a two-stage validation approach: - /// 1. Regex pre-filter for performance - /// 2. 
`std::net::Ipv4Addr` validation for correctness - /// - /// It also handles port suffixes (e.g., "192.168.1.1:8080"). - /// - /// # Note on Version Numbers - /// - /// This method accepts ALL valid IPv4 addresses in dotted-quad notation, - /// even if they could also be interpreted as version numbers (e.g., "1.2.3.4"). - /// It is the responsibility of the caller to disambiguate between IP addresses - /// and version numbers based on context when necessary. - /// - /// # Arguments - /// - /// * `text` - The text to search for IPv4 addresses - /// - /// # Returns - /// - /// Returns `true` if a valid IPv4 address is found, `false` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// - /// let classifier = SemanticClassifier::new(); - /// assert!(classifier.is_ipv4_address("192.168.1.1")); - /// assert!(classifier.is_ipv4_address("192.168.1.1:8080")); - /// assert!(classifier.is_ipv4_address("1.2.3.4")); // Valid IP (could also be a version number) - /// assert!(!classifier.is_ipv4_address("256.1.1.1")); // Invalid octet - /// ``` + /// Checks if text is a valid IPv4 address + #[must_use] pub fn is_ipv4_address(&self, text: &str) -> bool { - // Strip port suffix if present - let text_without_port = self.strip_port(text); - - // Two-stage validation: regex pre-filter first - if !IPV4_REGEX.is_match(text_without_port) { - return false; - } - - // Check for leading zeros in octets (e.g., 192.168.01.1 should be rejected) - for octet_str in text_without_port.split('.') { - // If an octet has more than 1 digit and starts with '0', it's invalid - if octet_str.len() > 1 && octet_str.starts_with('0') { - return false; - } - } - - // Validate using std::net::Ipv4Addr for correctness - // This is the authoritative check - regex is just a pre-filter - Ipv4Addr::from_str(text_without_port).is_ok() + is_ipv4_address(text) } - /// Detects IPv6 addresses in the given text - /// - /// This method uses a two-stage validation 
approach: - /// 1. Basic structure check (contains colons, looks like IPv6) - /// 2. `std::net::Ipv6Addr` validation for correctness - /// - /// It handles bracketed notation (e.g., `[::1]`) and port suffixes. - /// - /// # Arguments - /// - /// * `text` - The text to search for IPv6 addresses - /// - /// # Returns - /// - /// Returns `true` if a valid IPv6 address is found, `false` otherwise. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// - /// let classifier = SemanticClassifier::new(); - /// assert!(classifier.is_ipv6_address("2001:db8::1")); - /// assert!(classifier.is_ipv6_address("::1")); - /// assert!(classifier.is_ipv6_address("[::1]:8080")); - /// assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex - /// ``` + /// Checks if text is a valid IPv6 address + #[must_use] pub fn is_ipv6_address(&self, text: &str) -> bool { - // Handle bracketed IPv6 addresses like [::1] or [::1]:8080 - // Strategy: strip port first (if present), then strip brackets - - // If it looks like it has a port (contains ]:), strip port first - let after_port = if text.contains("]:") { - self.strip_port(text) - } else { - text - }; - - // Now strip brackets if present - let processed = self.strip_ipv6_brackets(after_port); - - // Two-stage validation: regex pre-filter first - // Basic structure check: must contain colons (IPv6 addresses always have colons) - if !processed.contains(':') { - return false; - } - - // For mixed notation (contains both colons and dots), skip regex check - // as the regex doesn't handle all mixed notation patterns - let is_mixed_notation = processed.contains('.'); - - if !is_mixed_notation { - // Use regex as pre-filter for non-mixed notation - if !IPV6_REGEX.is_match(processed) { - return false; - } - } + is_ipv6_address(text) + } - // Validate using std::net::Ipv6Addr for canonical validation - // This handles all IPv6 formats: full, compressed, mixed notation - 
Ipv6Addr::from_str(processed).is_ok() + /// Classifies IP addresses in text + #[must_use] + pub fn classify_ip_addresses(&self, text: &str) -> Vec { + classify_ip_addresses(text) } - /// Detects POSIX file paths in the given text - /// - /// Returns `Some(Tag::FilePath)` if a POSIX path is detected and valid. + /// Classifies POSIX paths + #[must_use] pub fn classify_posix_path(&self, text: &str) -> Option { - if !POSIX_PATH_REGEX.is_match(text) { - return None; - } - - if !self.is_valid_posix_path(text) { - return None; - } - - Some(Tag::FilePath) + classify_posix_path(text) } - /// Detects Windows file paths in the given text - /// - /// Returns `Some(Tag::FilePath)` if a Windows path is detected and valid. + /// Classifies Windows paths + #[must_use] pub fn classify_windows_path(&self, text: &str) -> Option { - if !WINDOWS_PATH_REGEX.is_match(text) { - return None; - } - - if !self.is_valid_windows_path(text) { - return None; - } - - Some(Tag::FilePath) + classify_windows_path(text) } - /// Detects UNC network paths in the given text - /// - /// Returns `Some(Tag::FilePath)` if a UNC path is detected and valid. 
- /// Performs robust validation including: - /// - Maximum overall length (4096) and component length (255) - /// - Control character rejection - /// - Forward slash and printf placeholder rejection - /// - Reserved name and dots-only component rejection - /// - Empty segment detection + /// Classifies UNC paths + #[must_use] pub fn classify_unc_path(&self, text: &str) -> Option { - if !UNC_PATH_REGEX.is_match(text) { - return None; - } - - // Maximum overall length check - if text.len() > 4096 { - return None; - } - - // Reject control characters - if self.contains_control_chars(text) { - return None; - } - - // Reject forward slashes anywhere in the path - if text.contains('/') { - return None; - } - - let trimmed = text.trim_start_matches('\\').trim_end_matches('\\'); - let parts: Vec<&str> = trimmed.split('\\').collect(); - - // Must have at least server and share - if parts.len() < 2 { - return None; - } - - let server = parts[0]; - let share = parts[1]; - - if server.is_empty() || share.is_empty() { - return None; - } - - // Validate all segments (no empty segments from double backslashes) - for segment in &parts { - // Reject empty segments (from consecutive backslashes like \\\\server\\\\share) - if segment.is_empty() { - return None; - } - - // Enforce max component length (255 bytes) - if segment.len() > 255 { - return None; - } - - // Reject components consisting solely of dots (but allow dots in domain names) - // Only reject if the segment is exactly "." or ".." - if *segment == "." || *segment == ".." 
{ - return None; - } - } - - // Reject printf-style placeholders in server or share - if self.contains_printf_placeholder(server) || self.contains_printf_placeholder(share) { - return None; - } - - // Reject reserved Windows device names in server or share - let reserved_names = [ - "CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", - "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", - ]; - let server_upper = server.to_ascii_uppercase(); - let share_upper = share.to_ascii_uppercase(); - for reserved in &reserved_names { - if server_upper == *reserved || share_upper == *reserved { - return None; - } - } - - Some(Tag::FilePath) + classify_unc_path(text) } - /// Detects Windows registry paths in the given text - /// - /// Returns `Some(Tag::RegistryPath)` if a registry path is detected and valid. + /// Classifies registry paths + #[must_use] pub fn classify_registry_path(&self, text: &str) -> Option { - if !REGISTRY_PATH_REGEX.is_match(text) && !REGISTRY_ABBREV_REGEX.is_match(text) { - return None; - } - - if !self.is_valid_registry_path(text) { - return None; - } - - Some(Tag::RegistryPath) + classify_registry_path(text) } - /// Checks if the POSIX path matches known suspicious locations + /// Checks if POSIX path is suspicious + #[must_use] pub fn is_suspicious_posix_path(&self, text: &str) -> bool { - SUSPICIOUS_POSIX_PATHS - .iter() - .any(|prefix| text.starts_with(prefix)) + is_suspicious_posix_path(text) } - /// Checks if the Windows path matches known suspicious locations (case-insensitive) + /// Checks if Windows path is suspicious + #[must_use] pub fn is_suspicious_windows_path(&self, text: &str) -> bool { - let lowered_text = text.to_ascii_lowercase(); - SUSPICIOUS_WINDOWS_PATHS.iter().any(|prefix| { - let lowered_prefix = prefix.to_ascii_lowercase(); - if prefix.starts_with('\\') { - lowered_text.contains(&lowered_prefix) - } else { - lowered_text.starts_with(&lowered_prefix) - } - }) + 
is_suspicious_windows_path(text) } - /// Checks if the registry path matches known persistence locations + /// Checks if registry path is suspicious + #[must_use] pub fn is_suspicious_registry_path(&self, text: &str) -> bool { - SUSPICIOUS_REGISTRY_PATHS - .iter() - .any(|path| self.contains_ascii_case_insensitive(text, path)) - } - - /// Case-insensitive ASCII substring search without allocations - fn contains_ascii_case_insensitive(&self, haystack: &str, needle: &str) -> bool { - if needle.is_empty() { - return true; - } - - let haystack_bytes = haystack.as_bytes(); - let needle_bytes = needle.as_bytes(); - - if needle_bytes.len() > haystack_bytes.len() { - return false; - } - - haystack_bytes - .windows(needle_bytes.len()) - .any(|window| window.eq_ignore_ascii_case(needle_bytes)) + is_suspicious_registry_path(text) } - /// Detects printf-style placeholders to reduce false positives - fn contains_printf_placeholder(&self, text: &str) -> bool { - let mut chars = text.chars().peekable(); - - while let Some(ch) = chars.next() { - if ch == '%' - && let Some(next) = chars.peek() - && matches!(next, 's' | 'd' | 'x' | 'o' | 'u' | 'f') - { - return true; - } - } - - false - } - - /// Checks if text contains ASCII control characters (C0 controls: 0x00-0x1F and DEL: 0x7F) - fn contains_control_chars(&self, text: &str) -> bool { - text.bytes().any(|b| b <= 0x1F || b == 0x7F) - } - - /// Validates POSIX path structure + /// Validates POSIX path + #[must_use] pub fn is_valid_posix_path(&self, text: &str) -> bool { - if text.len() > 4096 { - return false; - } - - if text.contains('\0') || text.contains('\n') || text.contains('\r') { - return false; - } - - if text.contains("//") { - return false; - } - - if text.contains('\\') { - return false; - } - - if self.contains_printf_placeholder(text) { - return false; - } - - let has_known_prefix = KNOWN_POSIX_PREFIXES - .iter() - .any(|prefix| text.starts_with(prefix)); - let is_suspicious = self.is_suspicious_posix_path(text); - - 
if !has_known_prefix && !is_suspicious && text.len() > 2048 { - return false; - } - - true + is_valid_posix_path(text) } - /// Validates Windows path structure + /// Validates Windows path + #[must_use] pub fn is_valid_windows_path(&self, text: &str) -> bool { - // Reject control characters early to prevent regex/prefix matching from being fooled - if self.contains_control_chars(text) { - return false; - } - - if text.len() > 4096 { - return false; - } - - if text.contains('/') { - return false; - } - - if text.contains("\\\\") { - return false; - } - - if self.contains_printf_placeholder(text) { - return false; - } - - let has_known_prefix = KNOWN_WINDOWS_PREFIXES - .iter() - .any(|prefix| text.starts_with(prefix)); - let is_suspicious = self.is_suspicious_windows_path(text); - - if !has_known_prefix && !is_suspicious && text.len() > 2048 { - return false; - } - - true + is_valid_windows_path(text) } - /// Validates Windows registry path structure + /// Validates registry path + #[must_use] pub fn is_valid_registry_path(&self, text: &str) -> bool { - // Reject control characters early to prevent regex/prefix matching from being fooled - if self.contains_control_chars(text) { - return false; - } - - // Maximum length check (4096 bytes) - if text.len() > 4096 { - return false; - } - - if text.contains('/') { - return false; - } - - if text.contains("\\\\") { - return false; - } - - let root = text.split('\\').next().unwrap_or(""); - let root_upper = root.to_ascii_uppercase(); - - if root_upper.starts_with("HKEY_") { - return VALID_REGISTRY_ROOTS - .iter() - .any(|valid| *valid == root_upper); - } - - if root_upper.starts_with("HK") { - return matches!( - root_upper.as_str(), - "HKLM" | "HKCU" | "HKCR" | "HKU" | "HKCC" - ); - } + is_valid_registry_path(text) + } - false + /// Classifies GUIDs + #[must_use] + pub fn classify_guid(&self, text: &str) -> Option { + classify_guid(text) } - /// Classifies IP addresses (IPv4 and IPv6) in the given text - /// - /// This 
method checks for both IPv4 and IPv6 addresses and returns - /// appropriate tags. A string may match both patterns (unlikely but possible). - /// - /// # Arguments - /// - /// * `text` - The text to search for IP addresses - /// - /// # Returns - /// - /// Returns a vector of `Tag` values (`Tag::IPv4` and/or `Tag::IPv6`). - /// The vector may be empty if no IP addresses are found. - /// - /// # Examples - /// - /// ``` - /// use stringy::classification::SemanticClassifier; - /// use stringy::types::Tag; - /// - /// let classifier = SemanticClassifier::new(); - /// let tags = classifier.classify_ip_addresses("192.168.1.1"); - /// assert_eq!(tags, vec![Tag::IPv4]); - /// - /// let tags = classifier.classify_ip_addresses("::1"); - /// assert_eq!(tags, vec![Tag::IPv6]); - /// - /// let tags = classifier.classify_ip_addresses("not an ip"); - /// assert!(tags.is_empty()); - /// ``` - pub fn classify_ip_addresses(&self, text: &str) -> Vec { - let mut tags = Vec::new(); + /// Classifies email addresses + #[must_use] + pub fn classify_email(&self, text: &str) -> Option { + classify_email(text) + } - // Check for IPv4 - if self.is_ipv4_address(text) { - tags.push(Tag::IPv4); - } + /// Classifies Base64-encoded data + #[must_use] + pub fn classify_base64(&self, text: &str) -> Option { + classify_base64(text) + } - // Check for IPv6 - if self.is_ipv6_address(text) { - tags.push(Tag::IPv6); - } + /// Classifies format strings + #[must_use] + pub fn classify_format_string(&self, text: &str) -> Option { + classify_format_string(text) + } - tags + /// Classifies user agent strings + #[must_use] + pub fn classify_user_agent(&self, text: &str) -> Option { + classify_user_agent(text) } } @@ -1014,7 +350,6 @@ mod tests { use super::*; use crate::types::{Encoding, StringSource}; - /// Helper function to create a test FoundString fn create_test_string(text: &str) -> FoundString { FoundString { text: text.to_string(), @@ -1035,606 +370,125 @@ mod tests { } #[test] - fn 
test_url_detection() { - let classifier = SemanticClassifier::new(); - - // Valid URLs - assert_eq!( - classifier.classify_url("https://example.com"), - Some(Tag::Url) - ); - assert_eq!( - classifier.classify_url("http://api.malware.com/v1/data"), - Some(Tag::Url) - ); - assert_eq!( - classifier.classify_url("https://192.168.1.1:8080/path"), - Some(Tag::Url) - ); - - // Invalid cases (not URLs) - assert_eq!(classifier.classify_url("example.com"), None); - assert_eq!(classifier.classify_url("not a url"), None); - } - - #[test] - fn test_domain_detection() { + fn test_classify_mixed_strings() { let classifier = SemanticClassifier::new(); - // Valid domains - assert_eq!(classifier.classify_domain("example.com"), Some(Tag::Domain)); - assert_eq!( - classifier.classify_domain("api.service.io"), - Some(Tag::Domain) - ); - assert_eq!( - classifier.classify_domain("malware-c2.net"), - Some(Tag::Domain) - ); - - // Valid domains with mixed-case TLDs - assert_eq!(classifier.classify_domain("example.COM"), Some(Tag::Domain)); - assert_eq!( - classifier.classify_domain("api.service.IO"), - Some(Tag::Domain) - ); - assert_eq!( - classifier.classify_domain("malware-c2.NET"), - Some(Tag::Domain) - ); - assert_eq!(classifier.classify_domain("Example.OrG"), Some(Tag::Domain)); - - // URLs should not match as domains - assert_eq!(classifier.classify_domain("https://example.com"), None); - - // Invalid domains - assert_eq!(classifier.classify_domain("invalid"), None); - assert_eq!(classifier.classify_domain("too.short.x"), None); - } - - #[test] - fn test_url_classification() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("https://example.com/api"); + // URL + let url_string = create_test_string("https://example.com/api"); + let tags = classifier.classify(&url_string); + assert!(tags.contains(&Tag::Url)); - let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Url)); - } + // Domain + let 
domain_string = create_test_string("api.example.com"); + let tags = classifier.classify(&domain_string); + assert!(tags.contains(&Tag::Domain)); - #[test] - fn test_domain_classification() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("example.com"); + // IPv4 + let ipv4_string = create_test_string("192.168.1.1"); + let tags = classifier.classify(&ipv4_string); + assert!(tags.contains(&Tag::IPv4)); - let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Domain)); + // Windows path + let path_string = create_test_string("C:\\Windows\\System32\\cmd.exe"); + let tags = classifier.classify(&path_string); + assert!(tags.contains(&Tag::FilePath)); } #[test] - fn test_url_not_double_tagged() { + fn test_classify_posix_path_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("https://example.com"); + let found_string = create_test_string("/usr/local/bin/app"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Url)); - // Ensure it's NOT also tagged as Domain - assert!(!tags.iter().any(|t| matches!(t, Tag::Domain))); - } - - #[test] - fn test_tld_validation() { - let classifier = SemanticClassifier::new(); - - // Valid TLDs - assert!(classifier.has_valid_tld("example.com")); - assert!(classifier.has_valid_tld("test.net")); - assert!(classifier.has_valid_tld("site.org")); - assert!(classifier.has_valid_tld("api.io")); - - // Valid TLDs with mixed case (should be normalized) - assert!(classifier.has_valid_tld("example.COM")); - assert!(classifier.has_valid_tld("test.NET")); - assert!(classifier.has_valid_tld("site.ORG")); - assert!(classifier.has_valid_tld("api.IO")); - assert!(classifier.has_valid_tld("Example.CoM")); - - // Invalid TLDs - assert!(!classifier.has_valid_tld("example.x")); - assert!(!classifier.has_valid_tld("test.invalid")); - 
assert!(!classifier.has_valid_tld("site.toolong123")); + assert!(tags.contains(&Tag::FilePath)); } #[test] - fn test_edge_cases() { + fn test_classify_windows_path_in_found_string() { let classifier = SemanticClassifier::new(); + let found_string = create_test_string("C:\\Program Files\\Application\\app.exe"); - // Empty string - let empty = create_test_string(""); - let tags = classifier.classify(&empty); - assert_eq!(tags.len(), 0); - - // Very long domain (within RFC 1035 limits) - let long_domain = "a".repeat(60) + ".com"; - let found_string = create_test_string(&long_domain); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::Domain)); - - // String with no valid domain pattern - let no_domain = create_test_string("just some text without domains"); - let tags = classifier.classify(&no_domain); - assert_eq!(tags.len(), 0); - - // Malformed URL - let malformed = create_test_string("http://"); - let tags = classifier.classify(&malformed); - assert_eq!(tags.len(), 0); - } - - #[test] - fn test_ipv4_valid_addresses() { - let classifier = SemanticClassifier::new(); - - // Valid IPv4 addresses - assert!(classifier.is_ipv4_address("192.168.1.1")); - assert!(classifier.is_ipv4_address("10.0.0.1")); - assert!(classifier.is_ipv4_address("8.8.8.8")); - assert!(classifier.is_ipv4_address("1.1.1.1")); - assert!(classifier.is_ipv4_address("127.0.0.1")); - assert!(classifier.is_ipv4_address("0.0.0.0")); - assert!(classifier.is_ipv4_address("255.255.255.255")); - } - - #[test] - fn test_ipv4_invalid_addresses() { - let classifier = SemanticClassifier::new(); - - // Invalid IPv4 addresses - assert!(!classifier.is_ipv4_address("256.1.1.1")); // Octet > 255 - assert!(!classifier.is_ipv4_address("192.168.1")); // Missing octet - assert!(!classifier.is_ipv4_address("192.168.1.1.1")); // Too many octets - assert!(!classifier.is_ipv4_address("999.999.999.999")); // All octets > 255 - 
assert!(!classifier.is_ipv4_address("192.168.01.1")); // Leading zero (invalid format) - } - - #[test] - fn test_ipv4_with_port() { - let classifier = SemanticClassifier::new(); - - // IPv4 addresses with ports should be detected - assert!(classifier.is_ipv4_address("192.168.1.1:8080")); - assert!(classifier.is_ipv4_address("10.0.0.1:443")); - assert!(classifier.is_ipv4_address("127.0.0.1:3000")); - } - - #[test] - fn test_ipv4_version_numbers() { - let classifier = SemanticClassifier::new(); - - // Valid IPv4 addresses that could also be version numbers are accepted - // It's the caller's responsibility to disambiguate based on context - assert!(classifier.is_ipv4_address("1.2.3.4")); - assert!(classifier.is_ipv4_address("2.0.1.0")); - assert!(classifier.is_ipv4_address("10.5.2.1")); - assert!(classifier.is_ipv4_address("10.5.2.20")); - } - - #[test] - fn test_ipv4_edge_cases() { - let classifier = SemanticClassifier::new(); - - // Boundary values - assert!(classifier.is_ipv4_address("0.0.0.0")); - assert!(classifier.is_ipv4_address("255.255.255.255")); - assert!(classifier.is_ipv4_address("192.0.0.1")); - assert!(classifier.is_ipv4_address("0.255.0.255")); - - // Private network addresses - assert!(classifier.is_ipv4_address("192.168.0.1")); - assert!(classifier.is_ipv4_address("10.0.0.1")); - assert!(classifier.is_ipv4_address("172.16.0.1")); - } - - #[test] - fn test_ipv6_full_notation() { - let classifier = SemanticClassifier::new(); - - // Full IPv6 notation - assert!(classifier.is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334")); - assert!(classifier.is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334")); - } - - #[test] - fn test_ipv6_compressed() { - let classifier = SemanticClassifier::new(); - - // Compressed IPv6 notation - assert!(classifier.is_ipv6_address("2001:db8::1")); - assert!(classifier.is_ipv6_address("::1")); - assert!(classifier.is_ipv6_address("fe80::1")); - assert!(classifier.is_ipv6_address("::")); - } - - #[test] - fn 
test_ipv6_mixed_notation() { - let classifier = SemanticClassifier::new(); - - // Mixed IPv4/IPv6 notation - assert!(classifier.is_ipv6_address("::ffff:192.0.2.1")); - assert!(classifier.is_ipv6_address("64:ff9b::192.0.2.1")); - } - - #[test] - fn test_ipv6_invalid() { - let classifier = SemanticClassifier::new(); - - // Invalid IPv6 addresses - assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex - assert!(!classifier.is_ipv6_address("2001:db8::1::2")); // Double :: - assert!(!classifier.is_ipv6_address("2001:db8:1")); // Too short - } - - #[test] - fn test_ipv6_with_brackets() { - let classifier = SemanticClassifier::new(); - - // IPv6 addresses with brackets - assert!(classifier.is_ipv6_address("[2001:db8::1]")); - assert!(classifier.is_ipv6_address("[::1]")); - } - - #[test] - fn test_ipv6_with_port() { - let classifier = SemanticClassifier::new(); - - // IPv6 addresses with brackets and ports - assert!(classifier.is_ipv6_address("[2001:db8::1]:8080")); - assert!(classifier.is_ipv6_address("[::1]:8080")); + assert!(tags.contains(&Tag::FilePath)); } #[test] - fn test_classify_ipv4() { + fn test_classify_registry_path_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("192.168.1.1"); + let found_string = + create_test_string("HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv4)); + assert!(tags.contains(&Tag::RegistryPath)); } #[test] - fn test_classify_ipv6() { + fn test_no_false_positives_on_random_data() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("::1"); + let found_string = create_test_string("x9qz1p0t8v7w6r5y4u3i2o1p"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv6)); - } - - #[test] - fn test_classify_no_ip() { - let classifier = SemanticClassifier::new(); - 
let found_string = create_test_string("not an ip address"); - - let tags = classifier.classify_ip_addresses(&found_string.text); assert!(tags.is_empty()); } #[test] - fn test_classify_ipv4_with_port() { + fn test_guid_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("192.168.1.1:8080"); + let found_string = create_test_string("{12345678-1234-1234-1234-123456789ABC}"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv4)); + assert!(tags.contains(&Tag::Guid)); } #[test] - fn test_classify_ipv6_with_brackets_and_port() { + fn test_email_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("[::1]:8080"); + let found_string = create_test_string("user@example.com"); let tags = classifier.classify(&found_string); - assert_eq!(tags.len(), 1); - assert!(matches!(tags[0], Tag::IPv6)); - } - - #[test] - fn test_posix_absolute_path() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_posix_path("/usr/bin/bash"), - Some(Tag::FilePath) - ); - assert_eq!( - classifier.classify_posix_path("/etc/passwd"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_posix_home_directory() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_posix_path("/home/user/.bashrc"), - Some(Tag::FilePath) - ); - assert_eq!( - classifier.classify_posix_path("/home/user/.config/app"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_posix_with_spaces() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_posix_path("/Users/John Doe/Documents/file.txt"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_posix_system_directories() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("/usr/"), Some(Tag::FilePath)); - assert_eq!(classifier.classify_posix_path("/etc/"), Some(Tag::FilePath)); - 
assert_eq!(classifier.classify_posix_path("/var/"), Some(Tag::FilePath)); - } - - #[test] - fn test_posix_suspicious_paths() { - let classifier = SemanticClassifier::new(); - - assert!(classifier.is_suspicious_posix_path("/tmp/malware")); - assert!(classifier.is_suspicious_posix_path("/etc/cron.d/backdoor")); - } - - #[test] - fn test_posix_too_short() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("/a"), Some(Tag::FilePath)); - } - - #[test] - fn test_posix_invalid() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("usr/bin/bash"), None); - } - - #[test] - fn test_posix_with_null_bytes() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_posix_path("/tmp/evil\0bin"), None); - } - - #[test] - fn test_windows_absolute_path() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("C:\\Windows\\System32\\cmd.exe"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_program_files() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("C:\\Program Files (x86)\\App"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_with_spaces() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("D:\\My Documents\\file.txt"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_different_drives() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_windows_path("D:\\"), - Some(Tag::FilePath) - ); - assert_eq!( - classifier.classify_windows_path("E:\\Data\\"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_suspicious_paths() { - let classifier = SemanticClassifier::new(); - - assert!(classifier.is_suspicious_windows_path("C:\\Windows\\Temp\\evil.exe")); - } - - #[test] - fn test_windows_case_insensitive() { - let classifier = SemanticClassifier::new(); - 
- assert_eq!( - classifier.classify_windows_path("c:\\windows\\"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_windows_invalid() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_windows_path("C:/forward/slash"), None); - } - - #[test] - fn test_windows_invalid_drive() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_windows_path("1:\\path"), None); - } - - #[test] - fn test_unc_path() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_unc_path("\\\\server\\share\\file.txt"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_unc_with_domain() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_unc_path("\\\\server.domain.com\\share\\"), - Some(Tag::FilePath) - ); - } - - #[test] - fn test_unc_invalid() { - let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_unc_path("\\\\\\\\"), None); - assert_eq!(classifier.classify_unc_path("\\\\server"), None); - } - - #[test] - fn test_registry_run_key() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path( - "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" - ), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn test_registry_current_user() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKEY_CURRENT_USER\\Software\\App\\Settings"), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn test_registry_abbreviated_hklm() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKLM\\System\\CurrentControlSet"), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn test_registry_abbreviated_hkcu() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKCU\\Software\\Microsoft"), - Some(Tag::RegistryPath) - ); - } - - #[test] - fn 
test_registry_persistence_run() { - let classifier = SemanticClassifier::new(); - - assert!(classifier.is_suspicious_registry_path( - "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" - )); - } - - #[test] - fn test_registry_invalid_root() { - let classifier = SemanticClassifier::new(); - - assert_eq!( - classifier.classify_registry_path("HKEY_INVALID\\Path"), - None - ); + assert!(tags.contains(&Tag::Email)); } #[test] - fn test_registry_forward_slash() { + fn test_base64_in_found_string() { let classifier = SemanticClassifier::new(); - - assert_eq!(classifier.classify_registry_path("HKLM/Software"), None); - } - - #[test] - fn test_classify_mixed_strings() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("https://example.com"); + let found_string = create_test_string("SGVsbG8gV29ybGQh"); let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::Url)); + assert!(tags.contains(&Tag::Base64)); } #[test] - fn test_classify_posix_path_in_found_string() { + fn test_format_string_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("/usr/bin/bash"); + let found_string = create_test_string("Error: %s at line %d"); let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::FilePath)); + assert!(tags.contains(&Tag::FormatString)); } #[test] - fn test_classify_windows_path_in_found_string() { + fn test_user_agent_in_found_string() { let classifier = SemanticClassifier::new(); - let found_string = create_test_string("C:\\Windows\\System32\\cmd.exe"); + let found_string = + create_test_string("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::FilePath)); + assert!(tags.contains(&Tag::UserAgent)); } #[test] - fn test_classify_registry_path_in_found_string() { + fn test_multiple_tags_format_and_base64_not_both() { let classifier = 
SemanticClassifier::new(); - let found_string = create_test_string( - "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run", - ); - - let tags = classifier.classify(&found_string); - assert!(tags.contains(&Tag::RegistryPath)); - } - #[test] - fn test_no_false_positives_on_random_data() { - let classifier = SemanticClassifier::new(); - let found_string = create_test_string("x9qz1p0t8v7w6r5y4u3i2o1p"); + // Format string should get FormatString tag + let format = create_test_string("Hello %s, your score is %d"); + let tags = classifier.classify(&format); + assert!(tags.contains(&Tag::FormatString)); - let tags = classifier.classify(&found_string); - assert!(tags.is_empty()); + // Pure Base64 should get Base64 tag + let base64 = create_test_string("VGhpcyBpcyBhIHRlc3Q="); + let tags = classifier.classify(&base64); + assert!(tags.contains(&Tag::Base64)); } } diff --git a/src/classification/symbols.rs b/src/classification/symbols.rs new file mode 100644 index 0000000..27b7cd2 --- /dev/null +++ b/src/classification/symbols.rs @@ -0,0 +1,508 @@ +//! Symbol demangling for Rust and C++ symbols +//! +//! This module provides functionality to detect and demangle mangled symbols +//! from compiled Rust and C++ binaries. When a mangled symbol is detected, the +//! original mangled form is preserved in `FoundString.original_text` while the +//! demangled human-readable form replaces `FoundString.text`. +//! +//! # Supported Symbol Formats +//! +//! - **Rust legacy mangling**: Symbols starting with `_ZN` (uses Itanium ABI-like encoding) +//! - **Rust v0 mangling**: Symbols starting with `_R` (new Rust-specific encoding) +//! - **C++ Itanium ABI**: Symbols starting with `_Z` (used by GCC, Clang, and others) +//! +//! # Usage +//! +//! ```rust +//! use stringy::classification::SymbolDemangler; +//! use stringy::types::{FoundString, Encoding, StringSource, Tag}; +//! +//! let demangler = SymbolDemangler::new(); +//! let mut found_string = FoundString { +//! 
text: "_ZN4core3fmt5Write9write_str17h1234567890abcdefE".to_string(), +//! original_text: None, +//! encoding: Encoding::Ascii, +//! offset: 0, +//! rva: None, +//! section: None, +//! length: 47, +//! tags: Vec::new(), +//! score: 0, +//! section_weight: None, +//! semantic_boost: None, +//! noise_penalty: None, +//! source: StringSource::ImportName, +//! confidence: 1.0, +//! }; +//! +//! demangler.demangle(&mut found_string); +//! // found_string.text now contains the demangled symbol +//! // found_string.original_text contains the original mangled form +//! // found_string.tags contains Tag::DemangledSymbol +//! ``` + +use crate::types::{FoundString, Tag}; +use cpp_demangle::Symbol as CppSymbol; + +/// Symbol demangler for Rust and C++ symbols +/// +/// Uses the `rustc-demangle` crate for Rust symbols and the `cpp_demangle` +/// crate for C++ symbols. Converts mangled symbols into human-readable form +/// while preserving the original mangled text. +#[derive(Debug, Default, Clone)] +pub struct SymbolDemangler; + +impl SymbolDemangler { + /// Create a new instance of the symbol demangler + #[must_use] + pub fn new() -> Self { + Self + } + + /// Check if a symbol appears to be a mangled Rust or C++ symbol + /// + /// Returns `true` if the symbol starts with known mangling prefixes: + /// - `_ZN` - Rust legacy mangling or C++ nested names (Itanium ABI) + /// - `_R` - Rust v0 mangling scheme + /// - `_Z` - C++ Itanium ABI mangling (used by GCC, Clang) + /// + /// # Arguments + /// + /// * `symbol` - The symbol string to check + /// + /// # Returns + /// + /// Returns `true` if the symbol appears to be mangled, `false` otherwise. 
+ /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SymbolDemangler; + /// + /// let demangler = SymbolDemangler::new(); + /// // Rust symbols + /// assert!(demangler.is_mangled("_ZN4core3fmt5Write9write_str17h1234567890abcdefE")); + /// assert!(demangler.is_mangled("_RNvNtCs123_4core3fmt5write")); + /// // C++ symbols + /// assert!(demangler.is_mangled("_ZN3foo3barEv")); + /// assert!(demangler.is_mangled("_Z3foov")); + /// assert!(!demangler.is_mangled("printf")); + /// ``` + #[must_use] + pub fn is_mangled(&self, symbol: &str) -> bool { + // Rust v0 mangling scheme (Rust-specific, check first) + if symbol.starts_with("_R") { + return true; + } + + // Itanium ABI mangling (used by both Rust legacy and C++) + // This includes _ZN (nested names), _ZL (local), _ZTV (vtable), etc. + if symbol.starts_with("_Z") { + return true; + } + + false + } + + /// Attempt to demangle a symbol in a `FoundString` + /// + /// If the string appears to be a mangled Rust or C++ symbol and can be + /// successfully demangled: + /// - The original mangled form is stored in `original_text` + /// - The demangled form replaces `text` + /// - `Tag::DemangledSymbol` is added to the tags + /// + /// The demangler tries Rust demangling first (for `_R` and `_ZN` prefixes), + /// then falls back to C++ demangling for `_Z` prefixes. + /// + /// If demangling fails or the symbol is not mangled, the `FoundString` is + /// left unchanged. 
+ /// + /// # Arguments + /// + /// * `string` - The `FoundString` to process (modified in-place) + /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SymbolDemangler; + /// use stringy::types::{FoundString, Encoding, StringSource, Tag}; + /// + /// let demangler = SymbolDemangler::new(); + /// let mut found_string = FoundString { + /// text: "_ZN4core3fmt5Write9write_str17h1234567890abcdefE".to_string(), + /// original_text: None, + /// encoding: Encoding::Ascii, + /// offset: 0, + /// rva: None, + /// section: None, + /// length: 47, + /// tags: Vec::new(), + /// score: 0, + /// section_weight: None, + /// semantic_boost: None, + /// noise_penalty: None, + /// source: StringSource::ImportName, + /// confidence: 1.0, + /// }; + /// + /// demangler.demangle(&mut found_string); + /// assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + /// assert!(found_string.original_text.is_some()); + /// ``` + pub fn demangle(&self, string: &mut FoundString) { + // Only attempt demangling if it looks like a mangled symbol + if !self.is_mangled(&string.text) { + return; + } + + // Try to demangle + let demangled_str = match self.try_demangle_internal(&string.text) { + Some(s) => s, + None => return, + }; + + // Store original mangled form and replace with demangled + string.original_text = Some(string.text.clone()); + string.text = demangled_str; + + // Add the DemangledSymbol tag if not already present + if !string.tags.contains(&Tag::DemangledSymbol) { + string.tags.push(Tag::DemangledSymbol); + } + } + + /// Internal demangling logic that tries Rust then C++ + fn try_demangle_internal(&self, symbol: &str) -> Option { + // For Rust v0 symbols (_R prefix), only try Rust demangling + if symbol.starts_with("_R") { + return self.try_rust_demangle(symbol); + } + + // For _Z prefixed symbols, try Rust first (for legacy Rust symbols), + // then fall back to C++ if Rust demangling doesn't work + if symbol.starts_with("_Z") { + // Try Rust first (handles 
_ZN Rust legacy symbols) + if let Some(demangled) = self.try_rust_demangle(symbol) { + return Some(demangled); + } + + // Fall back to C++ demangling + return self.try_cpp_demangle(symbol); + } + + None + } + + /// Try to demangle as a Rust symbol + fn try_rust_demangle(&self, symbol: &str) -> Option { + let demangled = rustc_demangle::demangle(symbol); + let demangled_str = demangled.to_string(); + + // Check if demangling actually produced a different result + if demangled_str != symbol { + Some(demangled_str) + } else { + None + } + } + + /// Try to demangle as a C++ symbol + fn try_cpp_demangle(&self, symbol: &str) -> Option { + // Parse the symbol using cpp_demangle + let parsed = CppSymbol::new(symbol).ok()?; + let demangled_str = parsed.demangle().ok()?; + + // Check if demangling actually produced a different result + if demangled_str != symbol { + Some(demangled_str) + } else { + None + } + } + + /// Try to demangle a symbol string directly + /// + /// This is a convenience method for demangling without a `FoundString`. + /// Supports both Rust and C++ mangled symbols. + /// + /// # Arguments + /// + /// * `symbol` - The mangled symbol string + /// + /// # Returns + /// + /// Returns `Some(demangled)` if demangling succeeded and produced a different + /// result, `None` otherwise. 
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use stringy::classification::SymbolDemangler;
+    ///
+    /// let demangler = SymbolDemangler::new();
+    ///
+    /// // Rust symbol
+    /// let result = demangler.try_demangle("_ZN4core3fmt5Write9write_str17h1234567890abcdefE");
+    /// assert!(result.is_some());
+    ///
+    /// // C++ symbol
+    /// let result = demangler.try_demangle("_ZN3foo3barEv");
+    /// assert!(result.is_some());
+    ///
+    /// // Not mangled
+    /// let result = demangler.try_demangle("printf");
+    /// assert!(result.is_none());
+    /// ```
+    #[must_use]
+    pub fn try_demangle(&self, symbol: &str) -> Option<String> {
+        if !self.is_mangled(symbol) {
+            return None;
+        }
+
+        self.try_demangle_internal(symbol)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{Encoding, StringSource};
+
+    fn create_test_string(text: &str) -> FoundString {
+        FoundString {
+            text: text.to_string(),
+            original_text: None,
+            encoding: Encoding::Ascii,
+            offset: 0,
+            rva: None,
+            section: None,
+            length: text.len() as u32,
+            tags: Vec::new(),
+            score: 0,
+            section_weight: None,
+            semantic_boost: None,
+            noise_penalty: None,
+            source: StringSource::ImportName,
+            confidence: 1.0,
+        }
+    }
+
+    #[test]
+    fn test_is_mangled_rust_legacy() {
+        let demangler = SymbolDemangler::new();
+
+        // Legacy Rust mangling (_ZN prefix)
+        assert!(demangler.is_mangled("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"));
+        assert!(demangler.is_mangled("_ZN3std2io5stdio6_print17h1234567890abcdefE"));
+    }
+
+    #[test]
+    fn test_is_mangled_rust_v0() {
+        let demangler = SymbolDemangler::new();
+
+        // Rust v0 mangling (_R prefix)
+        assert!(demangler.is_mangled("_RNvNtCs123_4core3fmt5write"));
+        assert!(demangler.is_mangled("_RNvCs123_5hello4main"));
+    }
+
+    #[test]
+    fn test_is_mangled_not_mangled() {
+        let demangler = SymbolDemangler::new();
+
+        // Regular symbols should not be detected as mangled
+        assert!(!demangler.is_mangled("printf"));
+        assert!(!demangler.is_mangled("malloc"));
+        
assert!(!demangler.is_mangled("main")); + assert!(!demangler.is_mangled("CreateFileW")); + assert!(!demangler.is_mangled("")); + } + + #[test] + fn test_demangle_rust_symbol() { + let demangler = SymbolDemangler::new(); + let mut found_string = + create_test_string("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + + demangler.demangle(&mut found_string); + + // Should have been demangled + assert!(found_string.original_text.is_some()); + assert_eq!( + found_string.original_text.as_ref().unwrap(), + "_ZN4core3fmt5Write9write_str17h1234567890abcdefE" + ); + assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + // Demangled text should be different from original + assert_ne!( + found_string.text, + "_ZN4core3fmt5Write9write_str17h1234567890abcdefE" + ); + } + + #[test] + fn test_demangle_non_mangled() { + let demangler = SymbolDemangler::new(); + let mut found_string = create_test_string("printf"); + + demangler.demangle(&mut found_string); + + // Should not have been modified + assert_eq!(found_string.text, "printf"); + assert!(found_string.original_text.is_none()); + assert!(!found_string.tags.contains(&Tag::DemangledSymbol)); + } + + #[test] + fn test_try_demangle_success() { + let demangler = SymbolDemangler::new(); + let result = demangler.try_demangle("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + + assert!(result.is_some()); + let demangled = result.unwrap(); + assert!(!demangled.is_empty()); + assert_ne!( + demangled, + "_ZN4core3fmt5Write9write_str17h1234567890abcdefE" + ); + } + + #[test] + fn test_try_demangle_failure() { + let demangler = SymbolDemangler::new(); + + assert!(demangler.try_demangle("printf").is_none()); + assert!(demangler.try_demangle("").is_none()); + assert!(demangler.try_demangle("main").is_none()); + } + + #[test] + fn test_demangle_preserves_existing_tags() { + let demangler = SymbolDemangler::new(); + let mut found_string = + create_test_string("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + 
found_string.tags.push(Tag::Import); + + demangler.demangle(&mut found_string); + + // Should have both the original tag and the new demangled tag + assert!(found_string.tags.contains(&Tag::Import)); + assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + } + + #[test] + fn test_demangle_idempotent() { + let demangler = SymbolDemangler::new(); + let mut found_string = + create_test_string("_ZN4core3fmt5Write9write_str17h1234567890abcdefE"); + + demangler.demangle(&mut found_string); + let first_text = found_string.text.clone(); + let first_original = found_string.original_text.clone(); + + // Calling demangle again should not change anything + demangler.demangle(&mut found_string); + + assert_eq!(found_string.text, first_text); + assert_eq!(found_string.original_text, first_original); + // Should only have one DemangledSymbol tag + assert_eq!( + found_string + .tags + .iter() + .filter(|t| matches!(t, Tag::DemangledSymbol)) + .count(), + 1 + ); + } + + // C++ demangling tests + + #[test] + fn test_is_mangled_cpp_symbols() { + let demangler = SymbolDemangler::new(); + + // C++ Itanium ABI mangled symbols + assert!(demangler.is_mangled("_ZN3foo3barEv")); // foo::bar() + assert!(demangler.is_mangled("_Z3foov")); // foo() + assert!(demangler.is_mangled("_ZN9__gnu_cxx13new_allocatorIcE10deallocateEPcm")); + assert!(demangler.is_mangled("_ZNSt6vectorIiSaIiEE9push_backERKi")); + assert!(demangler.is_mangled("_ZTV5MyClass")); // vtable for MyClass + assert!(demangler.is_mangled("_ZTI5MyClass")); // typeinfo for MyClass + } + + #[test] + fn test_demangle_cpp_symbol() { + let demangler = SymbolDemangler::new(); + let mut found_string = create_test_string("_ZN3foo3barEv"); + + demangler.demangle(&mut found_string); + + // Should have been demangled + assert!(found_string.original_text.is_some()); + assert_eq!( + found_string.original_text.as_ref().unwrap(), + "_ZN3foo3barEv" + ); + assert!(found_string.tags.contains(&Tag::DemangledSymbol)); + // Demangled text should 
contain "foo" and "bar"
+        assert!(found_string.text.contains("foo"));
+        assert!(found_string.text.contains("bar"));
+    }
+
+    #[test]
+    fn test_try_demangle_cpp_success() {
+        let demangler = SymbolDemangler::new();
+
+        // Simple C++ function
+        let result = demangler.try_demangle("_Z3foov");
+        assert!(result.is_some());
+        let demangled = result.unwrap();
+        assert!(demangled.contains("foo"));
+
+        // Namespaced C++ function
+        let result = demangler.try_demangle("_ZN3foo3barEv");
+        assert!(result.is_some());
+        let demangled = result.unwrap();
+        assert!(demangled.contains("foo"));
+        assert!(demangled.contains("bar"));
+    }
+
+    #[test]
+    fn test_demangle_cpp_with_parameters() {
+        let demangler = SymbolDemangler::new();
+
+        // C++ function with int parameter: void foo(int)
+        let result = demangler.try_demangle("_Z3fooi");
+        assert!(result.is_some());
+        let demangled = result.unwrap();
+        assert!(demangled.contains("foo"));
+        assert!(demangled.contains("int"));
+    }
+
+    #[test]
+    fn test_demangle_cpp_template() {
+        let demangler = SymbolDemangler::new();
+
+        // C++ template: std::vector<int>
+        let result = demangler.try_demangle("_ZNSt6vectorIiSaIiEEC1Ev");
+        assert!(result.is_some());
+        let demangled = result.unwrap();
+        assert!(demangled.contains("vector"));
+    }
+
+    #[test]
+    fn test_cpp_symbol_in_found_string() {
+        let demangler = SymbolDemangler::new();
+        let mut found_string = create_test_string("_Z3fooi");
+        found_string.tags.push(Tag::Export);
+
+        demangler.demangle(&mut found_string);
+
+        // Should have been demangled and preserved existing tags
+        assert!(found_string.original_text.is_some());
+        assert!(found_string.tags.contains(&Tag::Export));
+        assert!(found_string.tags.contains(&Tag::DemangledSymbol));
+        assert!(found_string.text.contains("foo"));
+    }
+}
diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs
index d7ad2b7..b25bae0 100644
--- a/src/extraction/dedup.rs
+++ b/src/extraction/dedup.rs
@@ -39,6 +39,9 @@ pub struct StringOccurrence {
     pub rva: Option<u64>,
    /// Section name where string was found
     pub section: Option<String>,
+    /// Original text before demangling (if applicable)
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub original_text: Option<String>,
     /// Extraction source type
     pub source: StringSource,
     /// Tags from this specific occurrence
@@ -80,7 +83,7 @@ pub struct StringOccurrence {
 pub fn deduplicate(
     strings: Vec<FoundString>,
     dedup_threshold: Option<usize>,
-    preserve_all_occurrences: bool,
+    _preserve_all_occurrences: bool,
 ) -> Vec<CanonicalString> {
     if strings.is_empty() {
         return Vec::new();
     }
@@ -109,22 +112,10 @@ pub fn deduplicate(
         // All strings in group have same encoding, use first one
         let encoding = found_strings[0].encoding;
-        let occurrences: Vec<StringOccurrence> = if preserve_all_occurrences {
-            // Store full occurrence metadata
-            found_strings
-                .into_iter()
-                .map(found_string_to_occurrence)
-                .collect()
-        } else {
-            // Store only the first occurrence as representative, but we still need
-            // the count for scoring, so we'll keep all but mark them as "count only"
-            // For now, we'll still store all occurrences but this could be optimized
-            // to store just a count field in the future
-            found_strings
-                .into_iter()
-                .map(found_string_to_occurrence)
-                .collect()
-        };
+        let occurrences: Vec<StringOccurrence> = found_strings
+            .into_iter()
+            .map(found_string_to_occurrence)
+            .collect();
 
         let merged_tags = merge_tags(&occurrences);
@@ -254,6 +245,7 @@ pub fn found_string_to_occurrence(fs: FoundString) -> StringOccurrence {
         offset: fs.offset,
         rva: fs.rva,
         section: fs.section,
+        original_text: fs.original_text,
         source: fs.source,
         original_tags: fs.tags,
         original_score: fs.score,
@@ -282,7 +274,7 @@ impl CanonicalString {
         FoundString {
             text: self.text.clone(),
-            original_text: None,
+            original_text: first_occurrence.original_text.clone(),
             encoding: self.encoding,
             offset: first_occurrence.offset,
             rva: first_occurrence.rva,
diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs
index 19b5038..6b3a85f 100644
--- a/src/extraction/mod.rs
+++ b/src/extraction/mod.rs
@@ 
-123,6 +123,7 @@ //! let load_command_strings = extract_load_command_strings(&macho_data); //! ``` +use crate::classification::{SemanticClassifier, SymbolDemangler}; use crate::types::{ ContainerInfo, Encoding, FoundString, Result, SectionInfo, SectionType, StringSource, }; @@ -147,6 +148,20 @@ pub use utf16::{ extract_utf16_strings, }; +fn apply_semantic_enrichment(strings: &mut [FoundString]) { + let classifier = SemanticClassifier::new(); + let demangler = SymbolDemangler::new(); + for string in strings { + demangler.demangle(string); + let tags = classifier.classify(string); + for tag in tags { + if !string.tags.contains(&tag) { + string.tags.push(tag); + } + } + } +} + /// Configuration for string extraction /// /// Controls various aspects of the extraction process including minimum/maximum @@ -521,6 +536,9 @@ impl StringExtractor for BasicExtractor { } } + // Apply demangling and semantic classification before deduplication + apply_semantic_enrichment(&mut all_strings); + // Apply deduplication if enabled if config.enable_deduplication { let canonical_strings = deduplicate( @@ -625,6 +643,9 @@ impl StringExtractor for BasicExtractor { } } + // Apply demangling and semantic classification before deduplication + apply_semantic_enrichment(&mut all_strings); + // Apply deduplication if enabled, otherwise convert each string to a canonical form if config.enable_deduplication { Ok(deduplicate( diff --git a/src/types.rs b/src/types.rs index 5347f33..b8cdfb0 100644 --- a/src/types.rs +++ b/src/types.rs @@ -32,6 +32,8 @@ pub enum Tag { FormatString, #[serde(rename = "user-agent-ish")] UserAgent, + #[serde(rename = "demangled")] + DemangledSymbol, Import, Export, Version, diff --git a/tests/snapshots/classification_integration__classification_snapshots.snap b/tests/snapshots/classification_integration__classification_snapshots.snap index 21b9b32..f110d38 100644 --- a/tests/snapshots/classification_integration__classification_snapshots.snap +++ 
b/tests/snapshots/classification_integration__classification_snapshots.snap @@ -1,6 +1,5 @@ --- source: tests/classification_integration.rs -assertion_line: 150 expression: snapshot --- [