22 changes: 12 additions & 10 deletions Cargo.toml
@@ -20,19 +20,21 @@ name = "stringy"
path = "src/main.rs"

[dependencies]
-clap = { version = "4.5.54", features = [ "derive" ] }
-entropy = "0.4.2"
-goblin = "0.10.4"
-lazy_static = "1.5"
-pelite = "0.10.0"
-regex = "1.12.2"
-serde = { version = "1.0.228", features = [ "derive" ] }
-serde_json = "1.0.148"
-thiserror = "2.0.17"
+clap = { version = "4.5.54", features = [ "derive" ] }
+cpp_demangle = "0.5.1"
+entropy = "0.4.2"
+goblin = "0.10.4"
+once_cell = "1.21.3"
+pelite = "0.10.0"
+regex = "1.12.2"
+rustc-demangle = "0.1.27"
+serde = { version = "1.0.228", features = [ "derive" ] }
+serde_json = "1.0.149"
+thiserror = "2.0.17"

[dev-dependencies]
criterion = "0.8.1"
-insta = "1.46.0"
+insta = "1.46.1"
tempfile = "3.24.0"

# The profile that 'dist' will build with
137 changes: 23 additions & 114 deletions docs/src/classification.md
@@ -76,51 +76,54 @@ Raw String -> Pattern Matching -> Tag Assignment

#### GUIDs/UUIDs

-- **Pattern**: `\{[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}`
-- **Examples**: `{12345678-1234-1234-1234-123456789abc}`
-- **Validation**: Format compliance, version checking
+- **Pattern**: `\{?[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}?`
+- **Examples**: `{12345678-1234-1234-1234-123456789abc}`, `12345678-1234-1234-1234-123456789abc`
+- **Validation**: Format compliance
- **Security relevance**: Medium - component identification
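
The 8-4-4-4-12 layout above is simple enough to check without a regex at all. The sketch below (hypothetical helper name, not Stringy's actual code) validates the hex groups and the optional matched brace pair directly:

```rust
/// Hypothetical GUID check mirroring the documented pattern: five
/// hyphen-separated hex groups of 8, 4, 4, 4, and 12 characters,
/// with an optional matched pair of surrounding braces.
fn looks_like_guid(text: &str) -> bool {
    // Braces must come as a matched pair or not at all.
    let inner = match (text.starts_with('{'), text.ends_with('}')) {
        (true, true) => &text[1..text.len() - 1],
        (false, false) => text,
        _ => return false,
    };
    let expected = [8usize, 4, 4, 4, 12];
    let groups: Vec<&str> = inner.split('-').collect();
    groups.len() == expected.len()
        && groups
            .iter()
            .zip(expected.iter())
            .all(|(g, &n)| g.len() == n && g.chars().all(|c| c.is_ascii_hexdigit()))
}

fn main() {
    assert!(looks_like_guid("{12345678-1234-1234-1234-123456789abc}"));
    assert!(looks_like_guid("12345678-1234-1234-1234-123456789abc"));
    assert!(!looks_like_guid("{12345678-1234-1234-1234-123456789abc")); // unmatched brace
    assert!(!looks_like_guid("12345678-1234-1234-1234-123456789abg")); // 'g' is not hex
}
```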

#### Email Addresses

- **Pattern**: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`
- **Examples**: `admin@malware.com`, `support@legitimate.org`
-- **Validation**: RFC compliance, domain validation
+- **Validation**: Basic format validation
- **Security relevance**: Medium - contact information
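
A minimal structural check in the spirit of the pattern above could look like the following (the function name and exact rules are illustrative assumptions):

```rust
/// Hypothetical structural email check: exactly one '@', a non-empty
/// local part, and a dotted domain ending in an alphabetic TLD of at
/// least two characters. Far looser than full RFC validation.
fn looks_like_email(text: &str) -> bool {
    let (local, domain) = match text.split_once('@') {
        Some((l, d)) if !l.is_empty() && !d.contains('@') => (l, d),
        _ => return false,
    };
    if local.chars().any(|c| c.is_whitespace()) {
        return false;
    }
    match domain.rsplit_once('.') {
        Some((host, tld)) => {
            !host.is_empty()
                && tld.len() >= 2
                && tld.chars().all(|c| c.is_ascii_alphabetic())
        }
        None => false, // no dotted domain at all
    }
}

fn main() {
    assert!(looks_like_email("admin@malware.com"));
    assert!(looks_like_email("support@legitimate.org"));
    assert!(!looks_like_email("no-at-sign.example"));
    assert!(!looks_like_email("user@localhost")); // no dotted domain
}
```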

### Code Artifacts

#### Format Strings

-- **Pattern**: `%[sdxo]|%\d+[sdxo]|\{\d+\}`
-- **Examples**: `Error: %s at line %d`, `User {0} logged in`
-- **Context**: Proximity to other format strings
+- **Pattern**: `%[-+0 #]*(\d+|\*)?(\.(\d+|\*))?(hh?|ll?|[Lzjt])?[diouxXeEfFgGaAcspn%]`
+- **Examples**: `Error: %s at line %d`, `Name: %s, Age: %d, Score: %.2f`
+- **Context**: Presence of real format specifiers (%% alone is ignored)
- **Security relevance**: Low-Medium - debugging information
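
The specifier grammar above can also be walked by hand. This standalone scanner (a sketch, not the project's implementation) skips flags, width, precision, and length modifiers, and treats `%%` as a literal percent:

```rust
/// Returns true if `text` contains at least one printf-style
/// conversion. Mirrors the documented regex as a byte scanner:
/// "%%" is an escaped percent and does not count.
fn has_format_specifier(text: &str) -> bool {
    let bytes = text.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] != b'%' {
            i += 1;
            continue;
        }
        i += 1;
        if i < bytes.len() && bytes[i] == b'%' {
            i += 1; // literal "%%", not a conversion
            continue;
        }
        // Skip flags [-+0 #], width/precision digits or '*', '.', and
        // length modifiers (h, l, L, z, j, t).
        while i < bytes.len() && b"-+0 #.*123456789hlLzjt".contains(&bytes[i]) {
            i += 1;
        }
        if i < bytes.len() && b"diouxXeEfFgGaAcspn".contains(&bytes[i]) {
            return true;
        }
    }
    false
}

fn main() {
    assert!(has_format_specifier("Error: %s at line %d"));
    assert!(has_format_specifier("Score: %.2f"));
    assert!(!has_format_specifier("100%% complete"));
    assert!(!has_format_specifier("no specifiers here"));
}
```

Like the documented regex, this accepts a space flag, so `"% o"` counts as a conversion; that matches printf's actual grammar.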

#### Base64 Data

-- **Pattern**: `[A-Za-z0-9+/]{20,}={0,2}`
+- **Pattern**: Character set validation with padding rules
- **Examples**: `SGVsbG8gV29ybGQ=`
-- **Validation**: Length divisibility, padding correctness
+- **Validation**: Length >= 16, Base64 character set, valid padding, reject length mod 4 of 1
- **Security relevance**: Variable - encoded payloads
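
The validation rules above translate directly into code. This hypothetical helper applies the length, padding, and character-set checks in order:

```rust
/// Base64 heuristic implementing the documented rules: at least 16
/// characters, length not congruent to 1 mod 4, only the standard
/// alphabet, and at most two trailing '=' of padding. Sketch only.
fn looks_like_base64(text: &str) -> bool {
    if text.len() < 16 || text.len() % 4 == 1 {
        return false;
    }
    let stripped = text.trim_end_matches('=');
    if text.len() - stripped.len() > 2 {
        return false; // more than two '=' of padding
    }
    stripped
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || c == '+' || c == '/')
}

fn main() {
    assert!(looks_like_base64("SGVsbG8gV29ybGQ="));
    assert!(!looks_like_base64("short=="));             // under 16 chars
    assert!(!looks_like_base64("SGVsbG8gV29ybGQ====")); // over-padded
    assert!(!looks_like_base64("not base64 text!"));    // invalid characters
}
```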

-### User Agents
+#### User Agents

-- **Pattern**: `Mozilla/[0-9.]+|Chrome/[0-9.]+|Safari/[0-9.]+`
-- **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)`
+- **Pattern**: Prefix match for common agents (Mozilla, curl, Wget, python-requests, libwww-perl, Java, Apache-HttpClient, okhttp, PostmanRuntime)
+- **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)`, `curl/7.68.0`
- **Security relevance**: Medium - network fingerprinting
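
A prefix match like the one described could be sketched as follows (the exact prefix list, including the trailing `/`, is an assumption for illustration):

```rust
/// Prefix match against well-known agent products. Real agent
/// strings put a version after the slash, e.g. "curl/7.68.0".
fn looks_like_user_agent(text: &str) -> bool {
    const PREFIXES: [&str; 9] = [
        "Mozilla/", "curl/", "Wget/", "python-requests/", "libwww-perl/",
        "Java/", "Apache-HttpClient/", "okhttp/", "PostmanRuntime/",
    ];
    PREFIXES.iter().any(|p| text.starts_with(p))
}

fn main() {
    assert!(looks_like_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)"));
    assert!(looks_like_user_agent("curl/7.68.0"));
    assert!(!looks_like_user_agent("GET /index.html HTTP/1.1"));
}
```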

-### Pattern Matching Engine
+## Tag Specificity

-The semantic classifier uses cached regex patterns via `lazy_static!` and applies validation checks to reduce false positives.
+Tags are treated as either specific or broad. Specific tags indicate high-confidence matches (for example URL, domain, IP, file path, GUID, email, format string, and user agent). Base64 is a broad tag, which should be treated as ambiguous due to higher false-positive risk.

+## Pattern Matching Engine

+The semantic classifier uses cached regex patterns via `once_cell::sync::Lazy` and applies validation checks to reduce false positives.
Comment on lines +113 to +119
Contributor

⚠️ Potential issue | 🟡 Minor

Minor grammar and clarity improvements.

Line 115: "high confidence" should be hyphenated as "high-confidence" when used as a compound adjective before a noun. Also, "and should be treated as" could be tightened to "which should be treated as" for flow.

Suggested fix
-Tags are treated as either specific or broad. Specific tags indicate high confidence matches (for example URL, domain, IP, file path, GUID, email, format string, and user agent). Base64 is a broad tag and should be treated as ambiguous due to higher false positive risk.
+Tags are treated as either specific or broad. Specific tags indicate high-confidence matches (for example URL, domain, IP, file path, GUID, email, format string, and user agent). Base64 is a broad tag, which should be treated as ambiguous due to higher false-positive risk.


```rust
-use lazy_static::lazy_static;
+use once_cell::sync::Lazy;
use regex::Regex;

-lazy_static! {
-    static ref URL_REGEX: Regex = Regex::new(r#"https?://[^\s<>"{}|\\^\[\]\`]+"#).unwrap();
-}
+static URL_REGEX: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"https?://[^\s<>"{}|\\^\[\]\`]+"#).unwrap());

impl SemanticClassifier {
    pub fn classify(&self, string: &FoundString) -> Vec<Tag> {
@@ -152,22 +155,6 @@ impl SemanticClassifier {
}
```

-## Implementation Details
-
-The classifier relies on `lazy_static!` to compile regex patterns once and reuse them across classification calls. Helper methods validate strings before assigning tags.
-
-### Method Signatures
-
-Key method signatures:
-
-```text
-pub fn classify(&self, string: &FoundString) -> Vec<Tag>;
-pub fn classify_posix_path(&self, text: &str) -> Option<Tag>;
-pub fn classify_windows_path(&self, text: &str) -> Option<Tag>;
-pub fn classify_unc_path(&self, text: &str) -> Option<Tag>;
-pub fn classify_registry_path(&self, text: &str) -> Option<Tag>;
-```

## Using the Classification System

```text
@@ -198,85 +185,7 @@ if tags.contains(&Tag::FilePath) {
```

The current implementation returns tags without explicit confidence scores. Confidence is implicit in the validation and matching logic. A future update may introduce explicit confidence values per tag.

-## Planned Enhancements (implementation pending)
+## Planned Enhancements

- Context-aware classification
- Symbol classification
-- Additional semantic patterns (GUIDs, email addresses, base64, format strings) - documented above, implementation pending

-### Language-Specific Patterns
-
-Different programming languages have distinct string patterns:
-
-```rust
-pub enum LanguageHint {
-    Rust,
-    Go,
-    DotNet,
-    Native,
-}
-
-impl SemanticClassifier {
-    fn classify_with_language_hint(&self, text: &str, hint: LanguageHint) -> Vec<Tag> {
-        match hint {
-            LanguageHint::Rust => self.classify_rust_patterns(text),
-            LanguageHint::Go => self.classify_go_patterns(text),
-            LanguageHint::DotNet => self.classify_dotnet_patterns(text),
-            LanguageHint::Native => self.classify_native_patterns(text),
-        }
-    }
-}
-```
-
-### False Positive Reduction
-
-Several techniques reduce false positives:
-
-1. **Length thresholds**: Very short matches are filtered out
-2. **Context validation**: Surrounding data must make sense
-3. **Entropy checking**: High-entropy strings are likely binary data
-4. **Whitelist/blacklist**: Known good/bad patterns
-
-```text
-fn is_likely_false_positive(&self, text: &str, tag: &Tag) -> bool {
-    match tag {
-        Tag::Domain => {
-            // Too short or invalid TLD
-            text.len() < 4 || !self.has_valid_tld(text)
-        }
-        Tag::Base64 => {
-            // Too short or invalid padding
-            text.len() < 8 || !self.valid_base64_padding(text)
-        }
-        _ => false,
-    }
-}
-```
-
-## Performance Considerations
-
-### Regex Compilation Caching
-
-```rust
-lazy_static! {
-    static ref COMPILED_PATTERNS: SemanticClassifier = SemanticClassifier::new();
-}
-```
-
-### Parallel Classification
-
-```rust
-use rayon::prelude::*;
-
-fn classify_batch(strings: &[RawString]) -> Vec<ClassifiedString> {
-    strings.par_iter().map(|s| classify_single(s)).collect()
-}
-```
-
-### Memory Efficiency
-
-- Reuse regex objects across classifications
-- Use string interning for common patterns
-- Lazy evaluation for expensive validations
-
-This comprehensive classification system enables Stringy to automatically identify and categorize the most relevant strings in binary files, significantly improving analysis efficiency.
+- Language-specific refinements
18 changes: 10 additions & 8 deletions src/classification/mod.rs
@@ -12,14 +12,12 @@
//! - **Domain Detection**: Identifies domain names with TLD validation
//! - **File Path Detection**: Identifies POSIX, Windows, and UNC paths
//! - **Registry Path Detection**: Identifies Windows registry paths
-//!
-//! ## Future Capabilities
-//!
-//! - GUIDs/UUIDs
-//! - Email addresses
-//! - Base64 data
-//! - Format strings
-//! - User agents
+//! - **GUID Detection**: Identifies GUIDs/UUIDs in standard format
+//! - **Email Detection**: Identifies email addresses
+//! - **Base64 Detection**: Identifies Base64-encoded data (broad tag)
+//! - **Format String Detection**: Identifies printf-style format strings
+//! - **User Agent Detection**: Identifies HTTP user agent strings
+//! - **Symbol Demangling**: Demangles Rust symbols to human-readable form
//!
//! ## Usage
//!
@@ -49,5 +47,9 @@
//! assert!(tags.contains(&Tag::FilePath));
//! ```

+mod patterns;
pub mod semantic;
+pub mod symbols;

pub use semantic::SemanticClassifier;
+pub use symbols::SymbolDemangler;
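
In the actual dependency set, Rust symbol demangling is delegated to the rustc-demangle crate. Purely as a dependency-free illustration of what the legacy mangling format looks like, the toy decoder below walks the `_ZN<len><ident>…E` form and drops the trailing hash segment (everything here is a sketch, not `SymbolDemangler`'s implementation):

```rust
/// Toy decoder for legacy Rust symbol mangling: "_ZN" followed by a
/// run of length-prefixed path segments, terminated by "E". The
/// compiler appends a hash segment like "h0123456789abcdef", which
/// is skipped here.
fn demangle_legacy(sym: &str) -> Option<String> {
    let mut rest = sym.strip_prefix("_ZN")?.strip_suffix('E')?;
    let mut parts: Vec<&str> = Vec::new();
    while !rest.is_empty() {
        // Count the leading decimal digits giving the segment length.
        let digits = rest.len() - rest.trim_start_matches(|c: char| c.is_ascii_digit()).len();
        let len: usize = rest[..digits].parse().ok()?;
        let tail = &rest[digits..];
        if tail.len() < len {
            return None;
        }
        let (seg, remainder) = tail.split_at(len);
        rest = remainder;
        let is_hash = seg.len() == 17
            && seg.starts_with('h')
            && seg[1..].chars().all(|c| c.is_ascii_hexdigit());
        if !is_hash {
            parts.push(seg);
        }
    }
    Some(parts.join("::"))
}

fn main() {
    let demangled = demangle_legacy("_ZN4core3fmt5write17h0123456789abcdefE");
    assert_eq!(demangled.as_deref(), Some("core::fmt::write"));
    assert_eq!(demangle_legacy("not_mangled"), None);
}
```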