From af90065c7de9ffd100824aadbf184174b7095177 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 21:55:35 -0500 Subject: [PATCH 01/10] feat(prompts): add continuous integration and simplicity review prompts Signed-off-by: UncleSp1d3r --- .github/prompts/cicheck.prompt.md | 16 ++++++++++++++++ .../simplicity-review.prompt.md | 9 ++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 .github/prompts/cicheck.prompt.md rename .github/{prompt => prompts}/simplicity-review.prompt.md (76%) diff --git a/.github/prompts/cicheck.prompt.md b/.github/prompts/cicheck.prompt.md new file mode 100644 index 0000000..f26bb82 --- /dev/null +++ b/.github/prompts/cicheck.prompt.md @@ -0,0 +1,16 @@ +--- +agent: agent +name: Continuous Integration Check +description: This prompt is used to run and fix issues identified by the continuous integration check command. +model: OpenAI GPT-5.2-Codex (copilot) +--- + +Run `just ci-check` and analyze any failures or warnings. If there are any issues, fix them and run the command again. Continue this process until `just ci-check` passes completely without any failures or warnings. Focus on: + +1. Linting errors +2. Test failures +3. Formatting issues +4. Security issues +5. ERB template issues + +After each fix, re-run `just ci-check` to verify the changes resolved the issues. Only stop when all checks pass successfully. Provide a summary of the changes made to fix the issues once `just ci-check` passes. 
diff --git a/.github/prompt/simplicity-review.prompt.md b/.github/prompts/simplicity-review.prompt.md similarity index 76% rename from .github/prompt/simplicity-review.prompt.md rename to .github/prompts/simplicity-review.prompt.md index 7464254..faf99dd 100644 --- a/.github/prompt/simplicity-review.prompt.md +++ b/.github/prompts/simplicity-review.prompt.md @@ -1,6 +1,13 @@ +--- +agent: agent +name: Simplicity Review +description: This prompt is used to review and simplify code changes by applying principles of simplicity, idiomatic coding, and test proportionality. +model: OpenAI GPT-5.2-Codex +--- + CODE SIMPLIFICATION REVIEW -Start by examining the uncommitted changes in the current codebase. +Start by examining the uncommitted changes (or the changes in the current branch if there are no uncommitted changes) in the current codebase. ANALYSIS STEPS: From b896a6cefc791ee852a41d44b8d309e6c8e40585 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:06:15 -0500 Subject: [PATCH 02/10] chore(docs): update code blocks to use text formatting Signed-off-by: UncleSp1d3r --- docs/src/api.md | 48 +++++++++++++++++------------------ docs/src/architecture.md | 12 ++++----- docs/src/binary-formats.md | 12 ++++----- docs/src/classification.md | 6 ++--- docs/src/output-formats.md | 2 +- docs/src/performance.md | 10 ++++---- docs/src/string-extraction.md | 42 +++++++++++++++--------------- 7 files changed, 66 insertions(+), 66 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 3c23a02..ac01a30 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -8,7 +8,7 @@ This page provides an overview of Stringy's public API. For complete API documen The primary data structure representing an extracted string with metadata. -```rust +```text #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FoundString { /// The extracted string text @@ -36,7 +36,7 @@ pub struct FoundString { Supported string encodings. 
-```rust +```text #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum Encoding { Ascii, @@ -50,7 +50,7 @@ pub enum Encoding { Semantic classification tags. -```rust +```text #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum Tag { Url, @@ -78,7 +78,7 @@ pub enum Tag { Extract strings from binary data. -```rust +```text pub fn extract_strings( data: &[u8], config: &ExtractionConfig @@ -96,7 +96,7 @@ pub fn extract_strings( **Example:** -```rust +```text use stringy::{extract_strings, ExtractionConfig}; let data = std::fs::read("binary.exe")?; @@ -112,7 +112,7 @@ for string in strings { Detect the binary format of the given data. -```rust +```text pub fn detect_format(data: &[u8]) -> BinaryFormat ``` @@ -126,7 +126,7 @@ pub fn detect_format(data: &[u8]) -> BinaryFormat **Example:** -```rust +```text use stringy::detect_format; let data = std::fs::read("binary")?; @@ -140,7 +140,7 @@ println!("Detected format: {:?}", format); Configuration options for string extraction. -```rust +```text pub struct ExtractionConfig { /// Minimum length for ASCII strings pub min_ascii_len: usize, @@ -180,7 +180,7 @@ impl Default for ExtractionConfig { Configuration for semantic classification. -```rust +```text pub struct ClassificationConfig { /// Enable URL detection pub detect_urls: bool, @@ -209,7 +209,7 @@ pub struct ClassificationConfig { Trait for implementing binary format parsers. -```rust +```text pub trait ContainerParser { /// Detect if this parser can handle the given data fn detect(data: &[u8]) -> bool @@ -225,7 +225,7 @@ pub trait ContainerParser { Information about a parsed binary container. -```rust +```text pub struct ContainerInfo { /// The binary format detected pub format: BinaryFormat, @@ -242,7 +242,7 @@ pub struct ContainerInfo { Information about a section within the binary. 
-```rust +```text pub struct SectionInfo { /// Section name pub name: String, @@ -267,7 +267,7 @@ pub struct SectionInfo { Trait for implementing output formatters. -```rust +```text pub trait OutputFormatter { /// Format the strings for output fn format(&self, strings: &[FoundString], config: &OutputConfig) -> Result; @@ -276,7 +276,7 @@ pub trait OutputFormatter { ### Built-in Formatters -```rust +```text // Human-readable table format pub struct HumanFormatter; @@ -289,7 +289,7 @@ pub struct YaraFormatter; **Example:** -```rust +```text use stringy::output::{JsonFormatter, OutputFormatter, OutputConfig}; let formatter = JsonFormatter::new(); @@ -304,7 +304,7 @@ println!("{}", output); Comprehensive error type for the library. -```rust +```text #[derive(Debug, thiserror::Error)] pub enum StringyError { #[error("Unsupported file format")] @@ -331,7 +331,7 @@ pub enum StringyError { Convenient result type alias. -```rust +```text pub type Result = std::result::Result; ``` @@ -341,7 +341,7 @@ pub type Result = std::result::Result; Implement custom semantic classifiers: -```rust +```text use stringy::classification::{ClassificationResult, Classifier}; pub struct CustomClassifier { @@ -360,7 +360,7 @@ impl Classifier for CustomClassifier { For large files, use memory mapping: -```rust +```text use memmap2::Mmap; use std::fs::File; @@ -373,7 +373,7 @@ let strings = extract_strings(&mmap[..], &config)?; Process multiple files in parallel: -```rust +```text use rayon::prelude::*; let files = vec!["file1.exe", "file2.dll", "file3.so"]; @@ -390,7 +390,7 @@ let results: Vec<_> = files Optional features can be enabled in `Cargo.toml`: -```toml +```text [dependencies] stringy = { version = "0.1", features = ["pe-resources", "dwarf-debug"] } ``` @@ -406,7 +406,7 @@ Available features: ### Basic String Extraction -```rust +```text use stringy::{ExtractionConfig, extract_strings}; fn main() -> stringy::Result<()> { @@ -425,7 +425,7 @@ fn main() -> stringy::Result<()> { ### 
Filtered Extraction -```rust +```text use stringy::{Encoding, ExtractionConfig, Tag, extract_strings}; fn extract_network_indicators(data: &[u8]) -> stringy::Result> { @@ -454,7 +454,7 @@ fn extract_network_indicators(data: &[u8]) -> stringy::Result> { ### Custom Output Format -```rust +```text use serde_json::json; use stringy::output::{OutputConfig, OutputFormatter}; diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 3792144..00ef674 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -31,7 +31,7 @@ Handles binary format detection and parsing using the `goblin` crate with compre The parsers implement intelligent section prioritization: -```rust +```text // Example: ELF section weights ".rodata" | ".rodata.str1.*" => 10.0 // Highest priority ".comment" | ".note.*" => 9.0 // Build info, very likely strings @@ -109,7 +109,7 @@ Formats results for different use cases with consistent data structures. ### 1. Binary Analysis Phase ✅ **Implemented** -```rust +```text // Format detection using goblin let format = detect_format(&data); // Returns BinaryFormat enum let parser = create_parser(format)?; // Creates appropriate parser @@ -128,7 +128,7 @@ let container_info = parser.parse(&data)?; ### 2. String Extraction Phase 🚧 **Framework Ready** -```rust +```text // Extract strings from prioritized sections (by weight) let mut all_strings = Vec::new(); for section in container_info.sections.iter().filter(|s| s.weight > 5.0) { @@ -151,7 +151,7 @@ let unique_strings = deduplicate(all_strings); ### 3. Classification Phase 🚧 **Types Ready** -```rust +```text // Apply semantic classification with context awareness for string in &mut unique_strings { let context = StringContext { @@ -167,7 +167,7 @@ for string in &mut unique_strings { ### 4. 
Output Phase 🚧 **Interfaces Defined** -```rust +```text // Sort by relevance score (descending) unique_strings.sort_by_key(|s| std::cmp::Reverse(s.score)); @@ -207,7 +207,7 @@ pub trait ContainerParser { Each parser implements intelligent section classification: -```rust +```text // ELF Example fn classify_section(section: &SectionHeader, name: &str) -> SectionType { if section.sh_flags & SHF_EXECINSTR != 0 { diff --git a/docs/src/binary-formats.md b/docs/src/binary-formats.md index df7d342..8456f62 100644 --- a/docs/src/binary-formats.md +++ b/docs/src/binary-formats.md @@ -56,7 +56,7 @@ The ELF parser now provides comprehensive symbol extraction with: ### Implementation Details -```rust +```text impl ElfParser { fn classify_section(section: &SectionHeader, name: &str) -> SectionType { // Check executable flag first @@ -231,7 +231,7 @@ PE resources are particularly rich sources of strings. The PE parser now provide #### Usage Example -```rust +```text use stringy::extraction::extract_resource_strings; use stringy::types::Tag; @@ -251,7 +251,7 @@ let ui_strings: Vec<_> = strings.iter() ### Implementation Details -```rust +```text impl PeParser { fn classify_section(section: &SectionTable) -> SectionType { let name = String::from_utf8_lossy(§ion.name); @@ -349,7 +349,7 @@ Mach-O load commands contain valuable strings: ### Implementation Details -```rust +```text impl MachoParser { fn classify_section(segment_name: &str, section_name: &str) -> SectionType { match (segment_name, section_name) { @@ -382,7 +382,7 @@ impl MachoParser { Different formats require different weighting strategies: -```rust +```text fn calculate_section_weight(format: BinaryFormat, section_type: SectionType) -> i32 { match (format, section_type) { (BinaryFormat::Elf, SectionType::StringData) => 10, // .rodata @@ -397,7 +397,7 @@ fn calculate_section_weight(format: BinaryFormat, section_type: SectionType) -> Stringy uses `goblin` for robust format detection: -```rust +```text pub fn 
detect_format(data: &[u8]) -> BinaryFormat { match Object::parse(data) { Ok(Object::Elf(_)) => BinaryFormat::Elf, diff --git a/docs/src/classification.md b/docs/src/classification.md index fea0ba3..2a5947a 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -160,7 +160,7 @@ The classifier relies on `lazy_static!` to compile regex patterns once and reuse Key method signatures: -```rust +```text pub fn classify(&self, string: &FoundString) -> Vec; pub fn classify_posix_path(&self, text: &str) -> Option; pub fn classify_windows_path(&self, text: &str) -> Option; @@ -170,7 +170,7 @@ pub fn classify_registry_path(&self, text: &str) -> Option; ## Using the Classification System -```rust +```text use stringy::classification::SemanticClassifier; use stringy::types::{Encoding, FoundString, StringSource, Tag}; @@ -237,7 +237,7 @@ Several techniques reduce false positives: 3. **Entropy checking**: High-entropy strings are likely binary data 4. **Whitelist/blacklist**: Known good/bad patterns -```rust +```text fn is_likely_false_positive(&self, text: &str, tag: &Tag) -> bool { match tag { Tag::Domain => { diff --git a/docs/src/output-formats.md b/docs/src/output-formats.md index bffbfac..001fbca 100644 --- a/docs/src/output-formats.md +++ b/docs/src/output-formats.md @@ -38,7 +38,7 @@ Machine-readable format with one JSON object per line, ideal for automation and ### Example Output -```json +```text {"text":"https://api.example.com/v1/users","encoding":"utf-8","offset":4096,"rva":4096,"section":".rdata","length":31,"tags":["url"],"score":95,"source":"SectionData"} {"text":"{12345678-1234-1234-1234-123456789abc}","encoding":"utf-8","offset":8192,"rva":8192,"section":".rdata","length":38,"tags":["guid"],"score":87,"source":"SectionData"} ``` diff --git a/docs/src/performance.md b/docs/src/performance.md index 33770dc..6a42066 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -27,7 +27,7 @@ Stringy is designed for efficient analysis of 
binary files, from small executabl Stringy uses memory mapping for efficient file access: -```rust +```text // Automatic memory mapping for large files if file_size > MEMORY_MAP_THRESHOLD { let mmap = unsafe { Mmap::map(&file)? }; @@ -93,7 +93,7 @@ Core extraction pipeline is optimized for single-threaded performance: Future versions will support parallel processing: -```rust +```text // Planned parallel section processing sections.par_iter() .flat_map(|section| extract_from_section(section, data)) @@ -110,7 +110,7 @@ sections.par_iter() #### Regex Caching -```rust +```text lazy_static! { static ref URL_REGEX: Regex = Regex::new(r"https?://[^\s]+").unwrap(); static ref DOMAIN_REGEX: Regex = Regex::new(r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap(); @@ -119,7 +119,7 @@ lazy_static! { #### Efficient String Scanning -```rust +```text // Optimized ASCII scanning with SIMD potential fn scan_ascii_optimized(data: &[u8]) -> Vec { let mut matches = Vec::new(); @@ -147,7 +147,7 @@ fn scan_ascii_optimized(data: &[u8]) -> Vec { Stringy uses sequential access patterns optimized for modern storage: -```rust +```text // Sequential section processing for section in container.sections { let section_data = &data[section.offset..section.offset + section.size]; diff --git a/docs/src/string-extraction.md b/docs/src/string-extraction.md index f3ca3a9..f57a95d 100644 --- a/docs/src/string-extraction.md +++ b/docs/src/string-extraction.md @@ -27,7 +27,7 @@ UTF-16LE extraction is now implemented and available for Windows PE binary strin #### Basic Extraction -```rust +```text use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; let data = b"Hello\0World\0Test123"; @@ -41,7 +41,7 @@ for string in strings { #### Configuration -```rust +```text use stringy::extraction::ascii::AsciiExtractionConfig; // Default configuration (min_length: 4, no max_length) @@ -61,7 +61,7 @@ UTF-8 extraction builds on ASCII extraction and handles multi-byte characters. 
S #### Implementation Details -```rust +```text fn extract_ascii_strings(data: &[u8], min_len: usize) -> Vec { let mut strings = Vec::new(); let mut current_string = Vec::new(); @@ -158,7 +158,7 @@ Boosts or reduces confidence based on section context: ### Configuration -```rust +```text use stringy::extraction::config::{NoiseFilterConfig, FilterWeights}; // Default configuration @@ -183,7 +183,7 @@ config.filter_weights = FilterWeights { ### Using Noise Filters -```rust +```text use stringy::extraction::config::NoiseFilterConfig; use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; use stringy::types::SectionType; @@ -255,7 +255,7 @@ UTF-16 extraction is implemented in `src/extraction/utf16.rs` following the patt **Usage Example**: -```rust +```text use stringy::extraction::utf16::{extract_utf16_strings, Utf16ExtractionConfig, ByteOrder}; // Extract UTF-16LE strings from Windows PE binary @@ -277,7 +277,7 @@ let strings = extract_utf16_strings(data, &config); **Configuration**: -```rust +```text use stringy::extraction::utf16::{Utf16ExtractionConfig, ByteOrder}; // Default configuration (min_length: 3, byte_order: Auto, confidence_threshold: 0.5) @@ -412,7 +412,7 @@ Different sections have different string extraction strategies. 
- **STRINGTABLE**: Localized UI strings - **RT_MANIFEST**: XML manifest data -```rust +```text fn extract_pe_resources(pe: &PE, data: &[u8]) -> Vec { let mut strings = Vec::new(); @@ -445,7 +445,7 @@ Strings are canonicalized while preserving important metadata: When duplicates are found: -```rust +```text struct DeduplicatedString { canonical_text: String, occurrences: Vec, @@ -463,7 +463,7 @@ struct StringOccurrence { ### Deduplication Algorithm -```rust +```text fn deduplicate_strings(strings: Vec) -> Vec { let mut map: HashMap = HashMap::new(); @@ -483,7 +483,7 @@ fn deduplicate_strings(strings: Vec) -> Vec { ### Extraction Configuration -```rust +```text use stringy::extraction::{ByteOrder, Encoding, ExtractionConfig}; pub struct ExtractionConfig { @@ -500,7 +500,7 @@ pub struct ExtractionConfig { **UTF-16 Configuration Examples**: -```rust +```text use stringy::extraction::{ExtractionConfig, Encoding, ByteOrder}; // Extract UTF-16LE strings from Windows PE binary @@ -519,7 +519,7 @@ config.utf16_byte_order = ByteOrder::Auto; ### Noise Filter Configuration -```rust +```text use stringy::extraction::config::NoiseFilterConfig; pub struct NoiseFilterConfig { @@ -535,7 +535,7 @@ pub struct NoiseFilterConfig { ### Filter Weights -```rust +```text use stringy::extraction::config::FilterWeights; pub struct FilterWeights { @@ -552,7 +552,7 @@ All weights must sum to 1.0. The configuration validates this automatically. 
### Encoding Selection -```rust +```text pub enum EncodingFilter { All, Specific(Vec), @@ -563,7 +563,7 @@ pub enum EncodingFilter { ### Section Filtering -```rust +```text pub struct SectionFilter { pub include_sections: Option>, pub exclude_sections: Option>, @@ -578,7 +578,7 @@ pub struct SectionFilter { Large files use memory mapping for efficient access: -```rust +```text use memmap2::Mmap; fn extract_from_large_file(path: &Path) -> Result> { @@ -593,7 +593,7 @@ fn extract_from_large_file(path: &Path) -> Result> { Section extraction can be parallelized: -```rust +```text use rayon::prelude::*; fn extract_parallel(sections: &[SectionInfo], data: &[u8]) -> Vec { @@ -608,7 +608,7 @@ fn extract_parallel(sections: &[SectionInfo], data: &[u8]) -> Vec { Pattern matching uses cached regex compilation: -```rust +```text lazy_static! { static ref URL_REGEX: Regex = Regex::new(r"https?://[^\s]+").unwrap(); static ref GUID_REGEX: Regex = Regex::new(r"\{[0-9a-fA-F-]{36}\}").unwrap(); @@ -649,7 +649,7 @@ Noise filtering is designed for minimal overhead: #### Basic Extraction with Filtering -```rust +```text use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; use stringy::extraction::config::NoiseFilterConfig; use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; @@ -670,7 +670,7 @@ let filtered: Vec<_> = strings #### Custom Filter Configuration -```rust +```text use stringy::extraction::config::{NoiseFilterConfig, FilterWeights}; let mut config = NoiseFilterConfig::default(); From f8dc568907e329416658d6a82649938ed2e97c63 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:06:32 -0500 Subject: [PATCH 03/10] chore(docs): update code blocks to use text formatting Signed-off-by: UncleSp1d3r --- .github/copilot-instructions.md | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 861cb51..71905fb 100644 --- 
a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -52,7 +52,7 @@ Convert external errors with `From` implementations. Provide offsets, section na Container parsers assign weights (1.0-10.0) to sections based on string likelihood: -```rust +```text // ELF example from container/elf.rs ".rodata" | ".rodata.str1.*" => 10.0 // Highest priority ".comment" | ".note.*" => 9.0 // Build info @@ -83,10 +83,24 @@ Use `#[non_exhaustive]` for public API structs like `ContainerInfo` and provide ```rust #[non_exhaustive] -pub struct ContainerInfo { /* fields */ } +pub struct ContainerInfo {/* fields */} impl ContainerInfo { - pub fn new(format: BinaryFormat, sections: Vec, ...) -> Self { ... } + pub fn new( + format: BinaryFormat, + sections: Vec, + imports: Vec, + exports: Vec, + resources: Option>, + ) -> Self { + Self { + format, + sections, + imports, + exports, + resources, + } + } } ``` @@ -186,7 +200,7 @@ The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform com **Adding a new section weight** (in `container/elf.rs`, `pe.rs`, or `macho.rs`): -```rust +```text let weight = match section_name { ".mydata" => 8.0, // New section type _ => existing_match_arms @@ -195,7 +209,7 @@ let weight = match section_name { **Extracting strings from a section**: -```rust +```text use stringy::extraction::{extract_ascii_strings, AsciiExtractionConfig}; let config = AsciiExtractionConfig { min_length: 4, max_length: 1024 }; let strings = extract_ascii_strings(§ion_data, &config); From d60eed24017afb4694374b6b329a4447d6829a79 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:07:00 -0500 Subject: [PATCH 04/10] chore(docs): update output format code blocks to text Signed-off-by: UncleSp1d3r --- ...e_Flows__Stringy_v1.0_User_Interactions.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md 
b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md index 5da1ef7..9fd4a03 100644 --- a/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md +++ b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md @@ -82,7 +82,7 @@ sequenceDiagram **Output Format (TTY):** -``` +```text String Tags Score Section https://malicious-c2.example.com/api url 95 .rdata C:\Windows\System32\kernel32.dll filepath 88 .rdata @@ -106,14 +106,19 @@ core::fmt::Display::fmt export 85 .text **Steps:** 1. User invokes Stringy with filtering flags + 2. System performs standard analysis pipeline (Parsing... Extracting... Classifying... Ranking...) + 3. System applies filters with AND logic: + - String must have tag "url" OR "ipv4" - String length must be >= 10 characters - String encoding must be UTF-16 4. System outputs filtered results in table format + 5. If no strings match filters, system displays to stderr: "Analyzed 1,234 strings, 0 matched filters" + 6. If strings match, system outputs table with matching strings only **Filter Combination Rules:** @@ -174,7 +179,7 @@ If multiple output format flags are specified (e.g., --json and --yara), the sys **JSON Format:** -```json +```text {"text":"https://example.com","encoding":"Ascii","offset":4096,"rva":8192,"section":".rdata","length":19,"tags":["url","domain"],"score":95,"source":"SectionData","confidence":1.0} {"text":"C:\\Windows\\System32","encoding":"Utf16Le","offset":8192,"rva":12288,"section":".data","length":38,"tags":["filepath"],"score":88,"source":"SectionData","confidence":0.95} ``` @@ -198,21 +203,27 @@ If multiple output format flags are specified (e.g., --json and --yara), the sys **Steps:** 1. User invokes Stringy with --yara flag + 2. System performs standard analysis pipeline + 3. Progress indicators go to stderr + 4. System generates complete YARA rule template to stdout + 5. 
Rule includes: + - Rule name (derived from binary filename) - Metadata section (file hash, analysis date, tool version) - Strings section with properly escaped strings - Condition section (basic template) 6. Strings are escaped according to YARA syntax rules + 7. Very long strings (>200 chars) are truncated with comment **YARA Output Format:** -``` +```text rule binary_strings { meta: description = "Strings extracted from binary.exe" @@ -268,7 +279,7 @@ rule binary_strings { **Output Format (Non-TTY):** -``` +```text https://malicious-c2.example.com/api C:\Windows\System32\kernel32.dll core::fmt::Display::fmt From 0efc334f89025ac044de5fa3d856eea24c037484 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:07:12 -0500 Subject: [PATCH 05/10] chore(docs): update code blocks to use text formatting Signed-off-by: UncleSp1d3r --- codebase_analysis.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/codebase_analysis.md b/codebase_analysis.md index 7af0617..04fd62e 100644 --- a/codebase_analysis.md +++ b/codebase_analysis.md @@ -101,7 +101,7 @@ Library entry point with module declarations and public re-exports. -```rust +```text #![forbid(unsafe_code)] #![deny(warnings)] @@ -122,7 +122,7 @@ pub use types::{BinaryFormat, ContainerInfo, Encoding, FoundString /* ... */}; CLI placeholder using `clap` derive macros. -```rust +```text #[derive(Parser)] #[command(name = "stringy")] struct Cli { @@ -159,7 +159,7 @@ Core data structures with comprehensive type definitions: Defines the `ContainerParser` trait and format detection. -```rust +```text pub trait ContainerParser { fn detect(data: &[u8]) -> bool where @@ -213,7 +213,7 @@ Mach-O parser for macOS/iOS binaries: Main extraction framework with `StringExtractor` trait and `BasicExtractor`. 
-```rust +```text pub trait StringExtractor { fn extract(&self, data: &[u8], info: &ContainerInfo) -> Vec; } @@ -271,7 +271,7 @@ Semantic classifier with pattern matching: **N/A** - Stringy is a command-line tool, not a web service. The public API is exposed as a Rust library: -```rust +```text // Library usage use stringy::{detect_format, create_parser, BasicExtractor, SemanticClassifier}; From 3fff6aa3701d092e2efe8daf8120264ce8755e86 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:07:26 -0500 Subject: [PATCH 06/10] feat(types): add original_text field to FoundString struct - Introduced `original_text` field to preserve pre-demangled text. - Updated related extraction and classification modules to initialize this field. Signed-off-by: UncleSp1d3r --- Cargo.toml | 6 ++--- src/classification/mod.rs | 4 +++ src/classification/semantic.rs | 12 +++++++++ src/extraction/ascii.rs | 12 +++++++++ src/extraction/dedup.rs | 16 ++++++++++++ src/extraction/macho_load_commands.rs | 8 ++++++ src/extraction/mod.rs | 20 +++++++++++++++ src/extraction/pe_resources.rs | 12 +++++++++ src/extraction/utf16.rs | 4 +++ src/types.rs | 36 +++++++++++++++++++++++++++ tests/classification_integration.rs | 4 +++ tests/test_deduplication.rs | 8 ++++++ 12 files changed, 139 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fcd24a9..b3d0aa7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,13 +3,13 @@ name = "stringy" version = "0.1.0" edition = "2024" rust-version = "1.91" -authors = [ "UncleSp1d3r " ] +authors = ["UncleSp1d3r "] description = "A smarter alternative to the strings command that leverages format-specific knowledge" license = "Apache-2.0" repository = "https://github.com/EvilBit-Labs/Stringy" homepage = "http://evilbitlabs.io/Stringy/" -keywords = [ "binary", "strings", "analysis", "reverse-engineering", "malware" ] -categories = [ "command-line-utilities", "development-tools" ] +keywords = ["binary", "strings", "analysis", 
"reverse-engineering", "malware"] +categories = ["command-line-utilities", "development-tools"] [lib] name = "stringy" diff --git a/src/classification/mod.rs b/src/classification/mod.rs index a63b138..d3dd9ad 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -30,6 +30,7 @@ //! let classifier = SemanticClassifier::new(); //! let found_string = FoundString { //! text: "C:\\Windows\\System32\\cmd.exe".to_string(), +//! original_text: None, //! encoding: Encoding::Ascii, //! offset: 0, //! rva: None, @@ -37,6 +38,9 @@ //! length: 27, //! tags: Vec::new(), //! score: 0, +//! section_weight: None, +//! semantic_boost: None, +//! noise_penalty: None, //! source: StringSource::SectionData, //! confidence: 1.0, //! }; diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index 2740f2f..b83bdfd 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -20,6 +20,7 @@ //! let classifier = SemanticClassifier::new(); //! let found_string = FoundString { //! text: "https://example.com/api".to_string(), +//! original_text: None, //! encoding: Encoding::Ascii, //! offset: 0, //! rva: None, @@ -27,6 +28,9 @@ //! length: 24, //! tags: Vec::new(), //! score: 0, +//! section_weight: None, +//! semantic_boost: None, +//! noise_penalty: None, //! source: StringSource::SectionData, //! confidence: 1.0, //! 
}; @@ -348,6 +352,7 @@ impl SemanticClassifier { /// let classifier = SemanticClassifier::new(); /// let found_string = FoundString { /// text: "https://example.com".to_string(), + /// original_text: None, /// encoding: Encoding::Ascii, /// offset: 0, /// rva: None, @@ -355,6 +360,9 @@ impl SemanticClassifier { /// length: 19, /// tags: Vec::new(), /// score: 0, + /// section_weight: None, + /// semantic_boost: None, + /// noise_penalty: None, /// source: StringSource::SectionData, /// confidence: 1.0, /// }; @@ -1010,6 +1018,7 @@ mod tests { fn create_test_string(text: &str) -> FoundString { FoundString { text: text.to_string(), + original_text: None, encoding: Encoding::Ascii, offset: 0, rva: None, @@ -1017,6 +1026,9 @@ mod tests { length: text.len() as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, } diff --git a/src/extraction/ascii.rs b/src/extraction/ascii.rs index 8fb4fb6..9f9d82f 100644 --- a/src/extraction/ascii.rs +++ b/src/extraction/ascii.rs @@ -230,6 +230,7 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); strings.push(FoundString { text, + original_text: None, encoding: Encoding::Ascii, offset: start as u64, rva: None, @@ -237,6 +238,9 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec length: len as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, }); @@ -260,6 +264,7 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); strings.push(FoundString { text, + original_text: None, encoding: Encoding::Ascii, offset: start as u64, rva: None, @@ -267,6 +272,9 @@ pub fn 
extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec length: len as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, }); @@ -276,6 +284,7 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); strings.push(FoundString { text, + original_text: None, encoding: Encoding::Ascii, offset: start as u64, rva: None, @@ -283,6 +292,9 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec length: len as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, }); diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs index b8da59a..d7ad2b7 100644 --- a/src/extraction/dedup.rs +++ b/src/extraction/dedup.rs @@ -282,6 +282,7 @@ impl CanonicalString { FoundString { text: self.text.clone(), + original_text: None, encoding: self.encoding, offset: first_occurrence.offset, rva: first_occurrence.rva, @@ -289,6 +290,9 @@ impl CanonicalString { length: first_occurrence.length, tags: self.merged_tags.clone(), score: self.combined_score, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: first_occurrence.source, confidence: max_confidence, } @@ -325,6 +329,7 @@ mod tests { FoundString { text: text.to_string(), + original_text: None, encoding, offset, rva: Some(offset + 0x1000), @@ -332,6 +337,9 @@ mod tests { length, tags, score, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source, confidence, } @@ -804,6 +812,7 @@ mod tests { let strings = vec![ FoundString { text: "Test".to_string(), + original_text: None, encoding: Encoding::Utf16Le, offset: 0x100, rva: Some(0x1000), @@ -811,11 +820,15 @@ mod tests { length: 8, // 4 characters * 2 bytes = 8 bytes tags: vec![], 
score: 10, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 0.8, }, FoundString { text: "Test".to_string(), + original_text: None, encoding: Encoding::Utf16Le, offset: 0x200, rva: Some(0x2000), @@ -823,6 +836,9 @@ mod tests { length: 8, tags: vec![], score: 15, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 0.9, }, diff --git a/src/extraction/macho_load_commands.rs b/src/extraction/macho_load_commands.rs index 35a3254..b3ed743 100644 --- a/src/extraction/macho_load_commands.rs +++ b/src/extraction/macho_load_commands.rs @@ -100,6 +100,7 @@ fn extract_dylib_strings(macho: &MachO) -> Vec { strings.push(FoundString { text: lib.to_string(), + original_text: None, encoding: Encoding::Utf8, source: StringSource::LoadCommand, tags, @@ -108,6 +109,9 @@ fn extract_dylib_strings(macho: &MachO) -> Vec { rva: None, length, score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, confidence: 1.0, }); } @@ -129,6 +133,7 @@ fn extract_rpath_strings(macho: &MachO) -> Vec { strings.push(FoundString { text: rpath.to_string(), + original_text: None, encoding: Encoding::Utf8, source: StringSource::LoadCommand, tags, @@ -137,6 +142,9 @@ fn extract_rpath_strings(macho: &MachO) -> Vec { rva: None, length, score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, confidence: 1.0, }); } diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index 1ced150..19b5038 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -483,6 +483,7 @@ impl StringExtractor for BasicExtractor { let length = import.name.len() as u32; all_strings.push(FoundString { text: import.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -490,6 +491,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + 
noise_penalty: None, source: StringSource::ImportName, confidence: 1.0, }); @@ -500,6 +504,7 @@ impl StringExtractor for BasicExtractor { let length = export.name.len() as u32; all_strings.push(FoundString { text: export.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -507,6 +512,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ExportName, confidence: 1.0, }); @@ -579,6 +587,7 @@ impl StringExtractor for BasicExtractor { let length = import.name.len() as u32; all_strings.push(FoundString { text: import.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -586,6 +595,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ImportName, confidence: 1.0, }); @@ -596,6 +608,7 @@ impl StringExtractor for BasicExtractor { let length = export.name.len() as u32; all_strings.push(FoundString { text: export.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -603,6 +616,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ExportName, confidence: 1.0, }); @@ -748,6 +764,7 @@ impl StringExtractor for BasicExtractor { let found_string = FoundString { text, + original_text: None, encoding, offset: absolute_offset, rva, @@ -755,6 +772,9 @@ impl StringExtractor for BasicExtractor { length: length as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence, }; diff --git a/src/extraction/pe_resources.rs b/src/extraction/pe_resources.rs index 695d834..7938667 100644 --- a/src/extraction/pe_resources.rs +++ 
b/src/extraction/pe_resources.rs @@ -426,6 +426,7 @@ pub fn extract_version_info_strings(data: &[u8]) -> Vec { let length = text.len() as u32; let found_string = FoundString { text, + original_text: None, encoding: Encoding::Utf16Le, offset: 0, // pelite doesn't provide offsets easily rva: None, @@ -433,6 +434,9 @@ pub fn extract_version_info_strings(data: &[u8]) -> Vec { length, tags: vec![Tag::Version, Tag::Resource], score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ResourceString, confidence: 1.0, }; @@ -582,6 +586,7 @@ pub fn extract_string_table_strings(data: &[u8]) -> Vec { let found_string = FoundString { text, + original_text: None, encoding: Encoding::Utf16Le, offset: 0, // File offset not easily available from pelite DataEntry rva, @@ -589,6 +594,9 @@ pub fn extract_string_table_strings(data: &[u8]) -> Vec { length: text_len, tags: vec![Tag::Resource], score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ResourceString, confidence: 1.0, }; @@ -770,6 +778,7 @@ pub fn extract_manifest_strings(data: &[u8]) -> Vec { let length = manifest_text.len() as u32; let found_string = FoundString { text: manifest_text, + original_text: None, encoding, offset: 0, // File offset not easily available from pelite DataEntry rva, @@ -777,6 +786,9 @@ pub fn extract_manifest_strings(data: &[u8]) -> Vec { length, tags: vec![Tag::Manifest, Tag::Resource], score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ResourceString, confidence: 1.0, }; diff --git a/src/extraction/utf16.rs b/src/extraction/utf16.rs index 711ffbb..2eb3f1f 100644 --- a/src/extraction/utf16.rs +++ b/src/extraction/utf16.rs @@ -822,6 +822,7 @@ fn extract_utf16_strings_with_byte_order( if utf16_confidence >= config.confidence_threshold { found_strings.push(FoundString { text, + original_text: None, encoding: match byte_order { ByteOrder::LE => 
Encoding::Utf16Le, ByteOrder::BE => Encoding::Utf16Be, @@ -833,6 +834,9 @@ fn extract_utf16_strings_with_byte_order( length: bytes_for_decoding.len() as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: utf16_confidence, }); diff --git a/src/types.rs b/src/types.rs index bccd80c..0e2274b 100644 --- a/src/types.rs +++ b/src/types.rs @@ -223,10 +223,22 @@ pub struct ResourceStringEntry { } /// A string found in the binary with metadata +/// +/// The `original_text` field preserves the pre-demangled text when demangling +/// is applied. Debug-only fields provide transparency into how the final score +/// was produced and are only populated when debug mode is enabled. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FoundString { /// The extracted string text pub text: String, + /// Original text before demangling (if applicable) + /// + /// When a string is identified as a mangled symbol (e.g., C++ or Rust mangled names), + /// this field preserves the original mangled form before demangling is applied. + /// The `text` field will contain the demangled version. This is `None` for strings + /// that are not mangled symbols or when demangling is not performed. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub original_text: Option<String>, /// The encoding used for this string pub encoding: Encoding, /// File offset where the string was found @@ -241,6 +253,30 @@ pub struct FoundString { pub tags: Vec<Tag>, /// Relevance score for ranking pub score: i32, + /// Section weight contribution to the final score (debug only) + /// + /// When debug mode is enabled, this field contains the weight assigned based on + /// the section where the string was found. Higher weights indicate sections more + /// likely to contain meaningful strings (e.g., .rodata vs .text). This is `None` + /// unless explicitly populated by the ranking system in debug mode. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub section_weight: Option<i32>, + /// Semantic classification boost to the final score (debug only) + /// + /// When debug mode is enabled, this field contains the score boost applied based on + /// semantic tags (URLs, file paths, GUIDs, etc.). Strings with valuable semantic + /// meaning receive positive boosts. This is `None` unless explicitly populated by + /// the ranking system in debug mode. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub semantic_boost: Option<i32>, + /// Noise penalty applied to the final score (debug only) + /// + /// When debug mode is enabled, this field contains the penalty applied for noise + /// characteristics (low confidence, repetitive patterns, etc.). Higher penalties + /// indicate strings more likely to be noise. This is `None` unless explicitly + /// populated by the ranking system in debug mode. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub noise_penalty: Option<i32>, /// Source of the string (section data, import, etc.) 
pub source: StringSource, /// Confidence score from noise filtering (0.0-1.0) diff --git a/tests/classification_integration.rs b/tests/classification_integration.rs index 710f2ea..62433bb 100644 --- a/tests/classification_integration.rs +++ b/tests/classification_integration.rs @@ -6,6 +6,7 @@ use stringy::types::{Encoding, FoundString, StringSource, Tag}; fn make_found_string(text: &str) -> FoundString { FoundString { text: text.to_string(), + original_text: None, encoding: Encoding::Ascii, offset: 0, rva: None, @@ -13,6 +14,9 @@ fn make_found_string(text: &str) -> FoundString { length: text.len() as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, } diff --git a/tests/test_deduplication.rs b/tests/test_deduplication.rs index 8a1fb5c..f2093dd 100644 --- a/tests/test_deduplication.rs +++ b/tests/test_deduplication.rs @@ -278,6 +278,7 @@ fn test_deduplication_score_bonuses() { let strings = vec![ FoundString { text: "TestString".to_string(), + original_text: None, encoding: Encoding::Utf8, offset: 0x100, rva: Some(0x1000), @@ -285,11 +286,15 @@ fn test_deduplication_score_bonuses() { length: 10, tags: vec![], score: 10, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 0.8, }, FoundString { text: "TestString".to_string(), + original_text: None, encoding: Encoding::Utf8, offset: 0x200, rva: Some(0x2000), @@ -297,6 +302,9 @@ fn test_deduplication_score_bonuses() { length: 10, tags: vec![], score: 15, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ImportName, confidence: 0.9, }, From a7b4d3bc07a222e714400d94fe77f226a20fe46b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:37:11 -0500 Subject: [PATCH 07/10] Add tests for FoundString optional fields and serde Added unit tests to verify serialization and deserialization of the 
FoundString struct, focusing on the handling of optional fields (original_text, section_weight, semantic_boost, noise_penalty). Updated the related project plan ticket to mark acceptance criteria as complete. Also added .serena project configuration files. --- .serena/.gitignore | 1 + .serena/project.yml | 89 +++++++++++++++ ..._model_for_demangling_and_debug_support.md | 12 +- src/types.rs | 107 ++++++++++++++++++ 4 files changed, 203 insertions(+), 6 deletions(-) create mode 100644 .serena/.gitignore create mode 100644 .serena/project.yml diff --git a/.serena/.gitignore b/.serena/.gitignore new file mode 100644 index 0000000..14d86ad --- /dev/null +++ b/.serena/.gitignore @@ -0,0 +1 @@ +/cache diff --git a/.serena/project.yml b/.serena/project.yml new file mode 100644 index 0000000..f3f3374 --- /dev/null +++ b/.serena/project.yml @@ -0,0 +1,89 @@ +# list of languages for which language servers are started; choose from: +# al bash clojure cpp csharp +# csharp_omnisharp dart elixir elm erlang +# fortran fsharp go groovy haskell +# java julia kotlin lua markdown +# matlab nix pascal perl php +# powershell python python_jedi r rego +# ruby ruby_solargraph rust scala swift +# terraform toml typescript typescript_vts vue +# yaml zig +# (This list may be outdated. For the current list, see values of Language enum here: +# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py +# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.) +# Note: +# - For C, use cpp +# - For JavaScript, use typescript +# - For Free Pascal/Lazarus, use pascal +# Special requirements: +# - csharp: Requires the presence of a .sln file in the project folder. +# - pascal: Requires Free Pascal Compiler (fpc) and optionally Lazarus. +# When using multiple languages, the first language server that supports a given file will be used for that file. 
+# The first language is the default language and the respective language server will be used as a fallback. +# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored. +languages: +- rust + +# the encoding used by text files in the project +# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings +encoding: "utf-8" + +# whether to use project's .gitignore files to ignore files +ignore_all_files_in_gitignore: true + +# list of additional paths to ignore in all projects +# same syntax as gitignore, so you can use * and ** +ignored_paths: [] + +# whether the project is in read-only mode +# If set to true, all editing tools will be disabled and attempts to use them will result in an error +# Added on 2025-04-18 +read_only: false + +# list of tool names to exclude. We recommend not excluding any tools, see the readme for more details. +# Below is the complete list of tools for convenience. +# To make sure you have the latest list of tools, and to view their descriptions, +# execute `uv run scripts/print_tool_overview.py`. +# +# * `activate_project`: Activates a project by name. +# * `check_onboarding_performed`: Checks whether project onboarding was already performed. +# * `create_text_file`: Creates/overwrites a file in the project directory. +# * `delete_lines`: Deletes a range of lines within a file. +# * `delete_memory`: Deletes a memory from Serena's project-specific memory store. +# * `execute_shell_command`: Executes a shell command. +# * `find_referencing_code_snippets`: Finds code snippets in which the symbol at the given location is referenced. +# * `find_referencing_symbols`: Finds symbols that reference the symbol at the given location (optionally filtered by type). +# * `find_symbol`: Performs a global (or local) search for symbols with/containing a given name/substring (optionally filtered by type). 
+# * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes. +# * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file. +# * `initial_instructions`: Gets the initial instructions for the current project. +# Should only be used in settings where the system prompt cannot be set, +# e.g. in clients you have no control over, like Claude Desktop. +# * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol. +# * `insert_at_line`: Inserts content at a given line in a file. +# * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol. +# * `list_dir`: Lists files and directories in the given directory (optionally with recursion). +# * `list_memories`: Lists memories in Serena's project-specific memory store. +# * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building). +# * `prepare_for_new_conversation`: Provides instructions for preparing for a new conversation (in order to continue with the necessary context). +# * `read_file`: Reads a file within the project directory. +# * `read_memory`: Reads the memory with the given name from Serena's project-specific memory store. +# * `remove_project`: Removes a project from the Serena configuration. +# * `replace_lines`: Replaces a range of lines within a file with new content. +# * `replace_symbol_body`: Replaces the full definition of a symbol. +# * `restart_language_server`: Restarts the language server, may be necessary when edits not through Serena happen. +# * `search_for_pattern`: Performs a search for a pattern in the project. +# * `summarize_changes`: Provides instructions for summarizing the changes made to the codebase. 
+# * `switch_modes`: Activates modes by providing a list of their names +# * `think_about_collected_information`: Thinking tool for pondering the completeness of collected information. +# * `think_about_task_adherence`: Thinking tool for determining whether the agent is still on track with the current task. +# * `think_about_whether_you_are_done`: Thinking tool for determining whether the task is truly completed. +# * `write_memory`: Writes a named memory (for future reference) to Serena's project-specific memory store. +excluded_tools: [] + +# initial prompt for the project. It will always be given to the LLM upon activating the project +# (contrary to the memories, which are loaded on demand). +initial_prompt: "" + +project_name: "Stringy" +included_optional_tools: [] diff --git a/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md index 3a2da39..4d21225 100644 --- a/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md +++ b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md @@ -22,12 +22,12 @@ Update the `FoundString` struct in file:src/types.rs to support symbol demanglin ## Acceptance Criteria -- [ ] FoundString struct includes original_text field -- [ ] FoundString struct includes optional breakdown fields (section_weight, semantic_boost, noise_penalty) -- [ ] All fields properly serialize/deserialize with serde -- [ ] Existing tests updated and passing -- [ ] Documentation updated with field descriptions and usage examples -- [ ] No breaking changes to existing code that creates FoundString instances +- [x] FoundString struct includes original_text field +- [x] FoundString struct includes optional breakdown fields (section_weight, semantic_boost, noise_penalty) +- [x] All fields properly serialize/deserialize with serde +- [x] Existing tests updated and passing +- [x] 
Documentation updated with field descriptions and usage examples +- [x] No breaking changes to existing code that creates FoundString instances ## References diff --git a/src/types.rs b/src/types.rs index 0e2274b..dfb85f4 100644 --- a/src/types.rs +++ b/src/types.rs @@ -343,3 +343,110 @@ impl From for StringyError { StringyError::ParseError(format!("Resource lookup error: {}", err)) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Creates a test FoundString with all optional fields set to None + fn create_test_found_string() -> FoundString { + FoundString { + text: "test_string".to_string(), + original_text: None, + encoding: Encoding::Ascii, + offset: 0x1000, + rva: Some(0x2000), + section: Some(".rodata".to_string()), + length: 11, + tags: vec![Tag::Url], + score: 100, + section_weight: None, + semantic_boost: None, + noise_penalty: None, + source: StringSource::SectionData, + confidence: 0.85, + } + } + + #[test] + fn test_found_string_serde_optional_fields_none() { + // Test that optional fields are skipped when None + let found_string = create_test_found_string(); + let json = serde_json::to_string(&found_string).expect("Serialization failed"); + + // Verify optional fields are not present in JSON + assert!(!json.contains("original_text")); + assert!(!json.contains("section_weight")); + assert!(!json.contains("semantic_boost")); + assert!(!json.contains("noise_penalty")); + + // Verify required fields are present + assert!(json.contains("text")); + assert!(json.contains("encoding")); + assert!(json.contains("offset")); + } + + #[test] + fn test_found_string_serde_optional_fields_some() { + // Test that optional fields are included when Some + let mut found_string = create_test_found_string(); + found_string.original_text = Some("_ZN4test6mangled".to_string()); + found_string.section_weight = Some(50); + found_string.semantic_boost = Some(25); + found_string.noise_penalty = Some(-10); + + let json = 
serde_json::to_string(&found_string).expect("Serialization failed"); + + // Verify optional fields are present in JSON + assert!(json.contains("original_text")); + assert!(json.contains("_ZN4test6mangled")); + assert!(json.contains("section_weight")); + assert!(json.contains("semantic_boost")); + assert!(json.contains("noise_penalty")); + } + + #[test] + fn test_found_string_serde_roundtrip() { + // Test serialization/deserialization roundtrip with all fields + let mut found_string = create_test_found_string(); + found_string.original_text = Some("mangled_name".to_string()); + found_string.section_weight = Some(75); + found_string.semantic_boost = Some(30); + found_string.noise_penalty = Some(-5); + + let json = serde_json::to_string(&found_string).expect("Serialization failed"); + let deserialized: FoundString = + serde_json::from_str(&json).expect("Deserialization failed"); + + assert_eq!(found_string.text, deserialized.text); + assert_eq!(found_string.original_text, deserialized.original_text); + assert_eq!(found_string.section_weight, deserialized.section_weight); + assert_eq!(found_string.semantic_boost, deserialized.semantic_boost); + assert_eq!(found_string.noise_penalty, deserialized.noise_penalty); + } + + #[test] + fn test_found_string_deserialize_missing_optional_fields() { + // Test that missing optional fields default to None during deserialization + let json = r#"{ + "text": "test", + "encoding": "Ascii", + "offset": 0, + "rva": null, + "section": null, + "length": 4, + "tags": [], + "score": 0, + "source": "SectionData", + "confidence": 1.0 + }"#; + + let deserialized: FoundString = serde_json::from_str(json).expect("Deserialization failed"); + + assert_eq!(deserialized.text, "test"); + assert_eq!(deserialized.original_text, None); + assert_eq!(deserialized.section_weight, None); + assert_eq!(deserialized.semantic_boost, None); + assert_eq!(deserialized.noise_penalty, None); + } +} From 4ce923d7732447446a115fc1abbac94b23f4ee25 Mon Sep 17 00:00:00 
2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:50:32 -0500 Subject: [PATCH 08/10] feat(types): add #[non_exhaustive] and builder pattern to FoundString Add #[non_exhaustive] attribute to FoundString struct to comply with AGENTS.md guidelines for public API structs. This prevents external code from using struct literal syntax, ensuring forward compatibility when new fields are added. Add constructor and builder methods: - new() constructor with required fields - with_rva(), with_section(), with_tags(), with_score() - with_confidence(), with_original_text() - with_section_weight(), with_semantic_boost(), with_noise_penalty() Update test files to use the new constructor/builder pattern. Co-Authored-By: Claude Opus 4.5 --- src/types.rs | 104 ++++++++++++++++++++++++++++ tests/classification_integration.rs | 23 ++---- tests/test_deduplication.rs | 54 ++++++--------- 3 files changed, 133 insertions(+), 48 deletions(-) diff --git a/src/types.rs b/src/types.rs index dfb85f4..5347f33 100644 --- a/src/types.rs +++ b/src/types.rs @@ -228,6 +228,7 @@ pub struct ResourceStringEntry { /// is applied. Debug-only fields provide transparency into how the final score /// was produced and are only populated when debug mode is enabled. #[derive(Debug, Clone, Serialize, Deserialize)] +#[non_exhaustive] pub struct FoundString { /// The extracted string text pub text: String, @@ -290,6 +291,109 @@ pub struct FoundString { } impl FoundString { + /// Creates a new FoundString with required fields and sensible defaults + /// + /// # Arguments + /// + /// * `text` - The extracted string text + /// * `encoding` - The encoding used for this string + /// * `offset` - File offset where the string was found + /// * `length` - Length of the string in bytes + /// * `source` - Source of the string (section data, import, etc.) 
+ /// + /// # Returns + /// + /// A new FoundString with optional fields set to None/empty and confidence + /// set to 1.0 + #[must_use] + pub fn new( + text: String, + encoding: Encoding, + offset: u64, + length: u32, + source: StringSource, + ) -> Self { + Self { + text, + original_text: None, + encoding, + offset, + rva: None, + section: None, + length, + tags: Vec::new(), + score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, + source, + confidence: 1.0, + } + } + + /// Sets the RVA (Relative Virtual Address) + #[must_use] + pub fn with_rva(mut self, rva: u64) -> Self { + self.rva = Some(rva); + self + } + + /// Sets the section name + #[must_use] + pub fn with_section(mut self, section: String) -> Self { + self.section = Some(section); + self + } + + /// Sets the tags + #[must_use] + pub fn with_tags(mut self, tags: Vec<Tag>) -> Self { + self.tags = tags; + self + } + + /// Sets the score + #[must_use] + pub fn with_score(mut self, score: i32) -> Self { + self.score = score; + self + } + + /// Sets the confidence + #[must_use] + pub fn with_confidence(mut self, confidence: f32) -> Self { + self.confidence = confidence; + self + } + + /// Sets the original text (for demangled symbols) + #[must_use] + pub fn with_original_text(mut self, original_text: String) -> Self { + self.original_text = Some(original_text); + self + } + + /// Sets the section weight (debug mode) + #[must_use] + pub fn with_section_weight(mut self, weight: i32) -> Self { + self.section_weight = Some(weight); + self + } + + /// Sets the semantic boost (debug mode) + #[must_use] + pub fn with_semantic_boost(mut self, boost: i32) -> Self { + self.semantic_boost = Some(boost); + self + } + + /// Sets the noise penalty (debug mode) + #[must_use] + pub fn with_noise_penalty(mut self, penalty: i32) -> Self { + self.noise_penalty = Some(penalty); + self + } + /// Returns true if confidence is high (>= 0.7) pub fn is_high_confidence(&self) -> bool { self.confidence >= 0.7 diff 
--git a/tests/classification_integration.rs b/tests/classification_integration.rs index 62433bb..4a1ddda 100644 --- a/tests/classification_integration.rs +++ b/tests/classification_integration.rs @@ -4,22 +4,13 @@ use stringy::classification::SemanticClassifier; use stringy::types::{Encoding, FoundString, StringSource, Tag}; fn make_found_string(text: &str) -> FoundString { - FoundString { - text: text.to_string(), - original_text: None, - encoding: Encoding::Ascii, - offset: 0, - rva: None, - section: None, - length: text.len() as u32, - tags: Vec::new(), - score: 0, - section_weight: None, - semantic_boost: None, - noise_penalty: None, - source: StringSource::SectionData, - confidence: 1.0, - } + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0, + text.len() as u32, + StringSource::SectionData, + ) } fn classify_tags(classifier: &SemanticClassifier, text: &str) -> Vec { diff --git a/tests/test_deduplication.rs b/tests/test_deduplication.rs index f2093dd..236e5cc 100644 --- a/tests/test_deduplication.rs +++ b/tests/test_deduplication.rs @@ -276,38 +276,28 @@ fn test_deduplication_score_bonuses() { // Create strings with different sources to test multi-source bonus let strings = vec![ - FoundString { - text: "TestString".to_string(), - original_text: None, - encoding: Encoding::Utf8, - offset: 0x100, - rva: Some(0x1000), - section: Some(".rodata".to_string()), - length: 10, - tags: vec![], - score: 10, - section_weight: None, - semantic_boost: None, - noise_penalty: None, - source: StringSource::SectionData, - confidence: 0.8, - }, - FoundString { - text: "TestString".to_string(), - original_text: None, - encoding: Encoding::Utf8, - offset: 0x200, - rva: Some(0x2000), - section: Some(".data".to_string()), - length: 10, - tags: vec![], - score: 15, - section_weight: None, - semantic_boost: None, - noise_penalty: None, - source: StringSource::ImportName, - confidence: 0.9, - }, + FoundString::new( + "TestString".to_string(), + Encoding::Utf8, + 0x100, + 
10, + StringSource::SectionData, + ) + .with_rva(0x1000) + .with_section(".rodata".to_string()) + .with_score(10) + .with_confidence(0.8), + FoundString::new( + "TestString".to_string(), + Encoding::Utf8, + 0x200, + 10, + StringSource::ImportName, + ) + .with_rva(0x2000) + .with_section(".data".to_string()) + .with_score(15) + .with_confidence(0.9), ]; let canonical = deduplicate(strings, None, true); From 6c985d0216a8492d659a2f2ad52c276da3d7b266 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 22:57:33 -0500 Subject: [PATCH 09/10] ci: add CI Gate job for branch protection rules Add a gate job that always runs and aggregates the status of all CI jobs. This enables branch protection rules to require "CI Gate" as a status check, which will pass if: - All jobs succeeded, OR - Conditional jobs were skipped (due to path filters) and other jobs passed The gate job fails only if any job actually failed or was cancelled. Co-Authored-By: Claude Opus 4.5 --- .github/workflows/ci.yml | 45 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3175728..11527d0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -176,3 +176,48 @@ jobs: with: token: ${{ secrets.QLTY_COVERAGE_TOKEN }} files: target/lcov.info + + # Gate job for branch protection - always runs and reports aggregate status + # Use this as the required status check instead of individual jobs + ci-gate: + name: CI Gate + runs-on: ubuntu-latest + needs: [quality, msrv, test, test-cross-platform, coverage] + if: always() + steps: + - name: Check job results + run: | + echo "Quality: ${{ needs.quality.result }}" + echo "MSRV: ${{ needs.msrv.result }}" + echo "Test: ${{ needs.test.result }}" + echo "Test Cross-Platform: ${{ needs.test-cross-platform.result }}" + echo "Coverage: ${{ needs.coverage.result }}" + + # Fail if any required job failed + # Jobs that were skipped (due to path filters) are 
OK + if [[ "${{ needs.quality.result }}" == "failure" || "${{ needs.quality.result }}" == "cancelled" ]]; then + echo "::error::Quality checks failed" + exit 1 + fi + + if [[ "${{ needs.msrv.result }}" == "failure" || "${{ needs.msrv.result }}" == "cancelled" ]]; then + echo "::error::MSRV check failed" + exit 1 + fi + + if [[ "${{ needs.test.result }}" == "failure" || "${{ needs.test.result }}" == "cancelled" ]]; then + echo "::error::Tests failed" + exit 1 + fi + + if [[ "${{ needs.test-cross-platform.result }}" == "failure" || "${{ needs.test-cross-platform.result }}" == "cancelled" ]]; then + echo "::error::Cross-platform tests failed" + exit 1 + fi + + if [[ "${{ needs.coverage.result }}" == "failure" || "${{ needs.coverage.result }}" == "cancelled" ]]; then + echo "::error::Coverage generation failed" + exit 1 + fi + + echo "All CI checks passed (or were skipped due to no relevant changes)" From ee01259de1edf8853488c2768fd0fff45463f963 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 23:07:08 -0500 Subject: [PATCH 10/10] chore(ci): disable ARM runners due to unavailability Signed-off-by: UncleSp1d3r --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 11527d0..d4b7675 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -114,8 +114,9 @@ jobs: # Primary Support - Linux - os: ubuntu-latest platform: "Linux" - - os: arm - platform: "Linux" + # Disabled due to lack of ARM runners on GitHub Actions, will re-enable when available + # - os: arm + # platform: "Linux" # Primary Support - macOS (using available runners) - os: macos-latest platform: "macOS"