diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 861cb51..71905fb 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -52,7 +52,7 @@ Convert external errors with `From` implementations. Provide offsets, section na Container parsers assign weights (1.0-10.0) to sections based on string likelihood: -```rust +```text // ELF example from container/elf.rs ".rodata" | ".rodata.str1.*" => 10.0 // Highest priority ".comment" | ".note.*" => 9.0 // Build info @@ -83,10 +83,24 @@ Use `#[non_exhaustive]` for public API structs like `ContainerInfo` and provide ```rust #[non_exhaustive] -pub struct ContainerInfo { /* fields */ } +pub struct ContainerInfo {/* fields */} impl ContainerInfo { - pub fn new(format: BinaryFormat, sections: Vec, ...) -> Self { ... } + pub fn new( + format: BinaryFormat, + sections: Vec, + imports: Vec, + exports: Vec, + resources: Option>, + ) -> Self { + Self { + format, + sections, + imports, + exports, + resources, + } + } } ``` @@ -186,7 +200,7 @@ The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform com **Adding a new section weight** (in `container/elf.rs`, `pe.rs`, or `macho.rs`): -```rust +```text let weight = match section_name { ".mydata" => 8.0, // New section type _ => existing_match_arms @@ -195,7 +209,7 @@ let weight = match section_name { **Extracting strings from a section**: -```rust +```text use stringy::extraction::{extract_ascii_strings, AsciiExtractionConfig}; let config = AsciiExtractionConfig { min_length: 4, max_length: 1024 }; let strings = extract_ascii_strings(§ion_data, &config); diff --git a/.github/prompts/cicheck.prompt.md b/.github/prompts/cicheck.prompt.md new file mode 100644 index 0000000..f26bb82 --- /dev/null +++ b/.github/prompts/cicheck.prompt.md @@ -0,0 +1,16 @@ +--- +agent: agent +name: Continuous Integration Check +description: This prompt is used to run and fix issues identified by the continuous integration check command. +model: OpenAI GPT-5.2-Codex (copilot) +--- + +Run `just ci-check` and analyze any failures or warnings. If there are any issues, fix them and run the command again. Continue this process until `just ci-check` passes completely without any failures or warnings. Focus on: + +1. Linting errors +2. Test failures +3. Formatting issues +4. Security issues +5. ERB template issues + +After each fix, re-run `just ci-check` to verify the changes resolved the issues. Only stop when all checks pass successfully. Provide a summary of the changes made to fix the issues once `just ci-check` passes. diff --git a/.github/prompt/simplicity-review.prompt.md b/.github/prompts/simplicity-review.prompt.md similarity index 76% rename from .github/prompt/simplicity-review.prompt.md rename to .github/prompts/simplicity-review.prompt.md index 7464254..faf99dd 100644 --- a/.github/prompt/simplicity-review.prompt.md +++ b/.github/prompts/simplicity-review.prompt.md @@ -1,6 +1,13 @@ +--- +agent: agent +name: Simplicity Review +description: This prompt is used to review and simplify code changes by applying principles of simplicity, idiomatic coding, and test proportionality. +model: OpenAI GPT-5.2-Codex +--- + CODE SIMPLIFICATION REVIEW -Start by examining the uncommitted changes in the current codebase. +Start by examining the uncommitted changes (or the changes in the current branch if there are no uncommitted changes) in the current codebase. ANALYSIS STEPS: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3175728..d4b7675 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -114,8 +114,9 @@ jobs: # Primary Support - Linux - os: ubuntu-latest platform: "Linux" - - os: arm - platform: "Linux" + # Disabled due to lack of ARM runners on GitHub Actions, will re-enable when available + # - os: arm + # platform: "Linux" # Primary Support - macOS (using available runners) - os: macos-latest platform: "macOS" @@ -176,3 +177,48 @@ jobs: with: token: ${{ secrets.QLTY_COVERAGE_TOKEN }} files: target/lcov.info + + # Gate job for branch protection - always runs and reports aggregate status + # Use this as the required status check instead of individual jobs + ci-gate: + name: CI Gate + runs-on: ubuntu-latest + needs: [quality, msrv, test, test-cross-platform, coverage] + if: always() + steps: + - name: Check job results + run: | + echo "Quality: ${{ needs.quality.result }}" + echo "MSRV: ${{ needs.msrv.result }}" + echo "Test: ${{ needs.test.result }}" + echo "Test Cross-Platform: ${{ needs.test-cross-platform.result }}" + echo "Coverage: ${{ needs.coverage.result }}" + + # Fail if any required job failed + # Jobs that were skipped (due to path filters) are OK + if [[ "${{ needs.quality.result }}" == "failure" || "${{ needs.quality.result }}" == "cancelled" ]]; then + echo "::error::Quality checks failed" + exit 1 + fi + + if [[ "${{ needs.msrv.result }}" == "failure" || "${{ needs.msrv.result }}" == "cancelled" ]]; then + echo "::error::MSRV check failed" + exit 1 + fi + + if [[ "${{ needs.test.result }}" == "failure" || "${{ needs.test.result }}" == "cancelled" ]]; then + echo "::error::Tests failed" + exit 1 + fi + + if [[ "${{ needs.test-cross-platform.result }}" == "failure" || "${{ needs.test-cross-platform.result }}" == "cancelled" ]]; then + echo "::error::Cross-platform tests failed" + exit 1 + fi + + if [[ "${{ needs.coverage.result }}" == "failure" || "${{ needs.coverage.result }}" == "cancelled" ]]; then + echo "::error::Coverage generation failed" + exit 1 + fi + + echo "All CI checks passed (or were skipped due to no relevant changes)" diff --git a/.serena/.gitignore b/.serena/.gitignore new file mode 100644 index 0000000..14d86ad --- /dev/null +++ b/.serena/.gitignore @@ -0,0 +1 @@ +/cache diff --git a/.serena/project.yml b/.serena/project.yml new file mode 100644 index 0000000..f3f3374 --- /dev/null +++ b/.serena/project.yml @@ -0,0 +1,89 @@ +# list of languages for which language servers are started; choose from: +# al bash clojure cpp csharp +# csharp_omnisharp dart elixir elm erlang +# fortran fsharp go groovy haskell +# java julia kotlin lua markdown +# matlab nix pascal perl php +# powershell python python_jedi r rego +# ruby ruby_solargraph rust scala swift +# terraform toml typescript typescript_vts vue +# yaml zig +# (This list may be outdated. For the current list, see values of Language enum here: +# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py +# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.) +# Note: +# - For C, use cpp +# - For JavaScript, use typescript +# - For Free Pascal/Lazarus, use pascal +# Special requirements: +# - csharp: Requires the presence of a .sln file in the project folder. +# - pascal: Requires Free Pascal Compiler (fpc) and optionally Lazarus. +# When using multiple languages, the first language server that supports a given file will be used for that file. +# The first language is the default language and the respective language server will be used as a fallback. +# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored. +languages: +- rust + +# the encoding used by text files in the project +# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings +encoding: "utf-8" + +# whether to use project's .gitignore files to ignore files +ignore_all_files_in_gitignore: true + +# list of additional paths to ignore in all projects +# same syntax as gitignore, so you can use * and ** +ignored_paths: [] + +# whether the project is in read-only mode +# If set to true, all editing tools will be disabled and attempts to use them will result in an error +# Added on 2025-04-18 +read_only: false + +# list of tool names to exclude. We recommend not excluding any tools, see the readme for more details. +# Below is the complete list of tools for convenience. +# To make sure you have the latest list of tools, and to view their descriptions, +# execute `uv run scripts/print_tool_overview.py`. +# +# * `activate_project`: Activates a project by name. +# * `check_onboarding_performed`: Checks whether project onboarding was already performed. +# * `create_text_file`: Creates/overwrites a file in the project directory. +# * `delete_lines`: Deletes a range of lines within a file. +# * `delete_memory`: Deletes a memory from Serena's project-specific memory store. +# * `execute_shell_command`: Executes a shell command. +# * `find_referencing_code_snippets`: Finds code snippets in which the symbol at the given location is referenced. +# * `find_referencing_symbols`: Finds symbols that reference the symbol at the given location (optionally filtered by type). +# * `find_symbol`: Performs a global (or local) search for symbols with/containing a given name/substring (optionally filtered by type). +# * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes. +# * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file. +# * `initial_instructions`: Gets the initial instructions for the current project. +# Should only be used in settings where the system prompt cannot be set, +# e.g. in clients you have no control over, like Claude Desktop. +# * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol. +# * `insert_at_line`: Inserts content at a given line in a file. +# * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol. +# * `list_dir`: Lists files and directories in the given directory (optionally with recursion). +# * `list_memories`: Lists memories in Serena's project-specific memory store. +# * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building). +# * `prepare_for_new_conversation`: Provides instructions for preparing for a new conversation (in order to continue with the necessary context). +# * `read_file`: Reads a file within the project directory. +# * `read_memory`: Reads the memory with the given name from Serena's project-specific memory store. +# * `remove_project`: Removes a project from the Serena configuration. +# * `replace_lines`: Replaces a range of lines within a file with new content. +# * `replace_symbol_body`: Replaces the full definition of a symbol. +# * `restart_language_server`: Restarts the language server, may be necessary when edits not through Serena happen. +# * `search_for_pattern`: Performs a search for a pattern in the project. +# * `summarize_changes`: Provides instructions for summarizing the changes made to the codebase. +# * `switch_modes`: Activates modes by providing a list of their names +# * `think_about_collected_information`: Thinking tool for pondering the completeness of collected information. +# * `think_about_task_adherence`: Thinking tool for determining whether the agent is still on track with the current task. +# * `think_about_whether_you_are_done`: Thinking tool for determining whether the task is truly completed. +# * `write_memory`: Writes a named memory (for future reference) to Serena's project-specific memory store. +excluded_tools: [] + +# initial prompt for the project. It will always be given to the LLM upon activating the project +# (contrary to the memories, which are loaded on demand). +initial_prompt: "" + +project_name: "Stringy" +included_optional_tools: [] diff --git a/Cargo.toml b/Cargo.toml index fcd24a9..b3d0aa7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,13 +3,13 @@ name = "stringy" version = "0.1.0" edition = "2024" rust-version = "1.91" -authors = [ "UncleSp1d3r " ] +authors = ["UncleSp1d3r "] description = "A smarter alternative to the strings command that leverages format-specific knowledge" license = "Apache-2.0" repository = "https://github.com/EvilBit-Labs/Stringy" homepage = "http://evilbitlabs.io/Stringy/" -keywords = [ "binary", "strings", "analysis", "reverse-engineering", "malware" ] -categories = [ "command-line-utilities", "development-tools" ] +keywords = ["binary", "strings", "analysis", "reverse-engineering", "malware"] +categories = ["command-line-utilities", "development-tools"] [lib] name = "stringy" diff --git a/codebase_analysis.md b/codebase_analysis.md index 7af0617..04fd62e 100644 --- a/codebase_analysis.md +++ b/codebase_analysis.md @@ -101,7 +101,7 @@ Library entry point with module declarations and public re-exports. -```rust +```text #![forbid(unsafe_code)] #![deny(warnings)] @@ -122,7 +122,7 @@ pub use types::{BinaryFormat, ContainerInfo, Encoding, FoundString /* ... */}; CLI placeholder using `clap` derive macros. -```rust +```text #[derive(Parser)] #[command(name = "stringy")] struct Cli { @@ -159,7 +159,7 @@ Core data structures with comprehensive type definitions: Defines the `ContainerParser` trait and format detection. -```rust +```text pub trait ContainerParser { fn detect(data: &[u8]) -> bool where @@ -213,7 +213,7 @@ Mach-O parser for macOS/iOS binaries: Main extraction framework with `StringExtractor` trait and `BasicExtractor`. -```rust +```text pub trait StringExtractor { fn extract(&self, data: &[u8], info: &ContainerInfo) -> Vec; } @@ -271,7 +271,7 @@ Semantic classifier with pattern matching: **N/A** - Stringy is a command-line tool, not a web service. The public API is exposed as a Rust library: -```rust +```text // Library usage use stringy::{detect_format, create_parser, BasicExtractor, SemanticClassifier}; diff --git a/docs/src/api.md b/docs/src/api.md index 3c23a02..ac01a30 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -8,7 +8,7 @@ This page provides an overview of Stringy's public API. For complete API documen The primary data structure representing an extracted string with metadata. -```rust +```text #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FoundString { /// The extracted string text @@ -36,7 +36,7 @@ pub struct FoundString { Supported string encodings. -```rust +```text #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum Encoding { Ascii, @@ -50,7 +50,7 @@ pub enum Encoding { Semantic classification tags. -```rust +```text #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum Tag { Url, @@ -78,7 +78,7 @@ pub enum Tag { Extract strings from binary data. -```rust +```text pub fn extract_strings( data: &[u8], config: &ExtractionConfig @@ -96,7 +96,7 @@ pub fn extract_strings( **Example:** -```rust +```text use stringy::{extract_strings, ExtractionConfig}; let data = std::fs::read("binary.exe")?; @@ -112,7 +112,7 @@ for string in strings { Detect the binary format of the given data. -```rust +```text pub fn detect_format(data: &[u8]) -> BinaryFormat ``` @@ -126,7 +126,7 @@ pub fn detect_format(data: &[u8]) -> BinaryFormat **Example:** -```rust +```text use stringy::detect_format; let data = std::fs::read("binary")?; @@ -140,7 +140,7 @@ println!("Detected format: {:?}", format); Configuration options for string extraction. -```rust +```text pub struct ExtractionConfig { /// Minimum length for ASCII strings pub min_ascii_len: usize, @@ -180,7 +180,7 @@ impl Default for ExtractionConfig { Configuration for semantic classification. -```rust +```text pub struct ClassificationConfig { /// Enable URL detection pub detect_urls: bool, @@ -209,7 +209,7 @@ pub struct ClassificationConfig { Trait for implementing binary format parsers. -```rust +```text pub trait ContainerParser { /// Detect if this parser can handle the given data fn detect(data: &[u8]) -> bool @@ -225,7 +225,7 @@ pub trait ContainerParser { Information about a parsed binary container. -```rust +```text pub struct ContainerInfo { /// The binary format detected pub format: BinaryFormat, @@ -242,7 +242,7 @@ pub struct ContainerInfo { Information about a section within the binary. -```rust +```text pub struct SectionInfo { /// Section name pub name: String, @@ -267,7 +267,7 @@ pub struct SectionInfo { Trait for implementing output formatters. -```rust +```text pub trait OutputFormatter { /// Format the strings for output fn format(&self, strings: &[FoundString], config: &OutputConfig) -> Result; @@ -276,7 +276,7 @@ pub trait OutputFormatter { ### Built-in Formatters -```rust +```text // Human-readable table format pub struct HumanFormatter; @@ -289,7 +289,7 @@ pub struct YaraFormatter; **Example:** -```rust +```text use stringy::output::{JsonFormatter, OutputFormatter, OutputConfig}; let formatter = JsonFormatter::new(); @@ -304,7 +304,7 @@ println!("{}", output); Comprehensive error type for the library. -```rust +```text #[derive(Debug, thiserror::Error)] pub enum StringyError { #[error("Unsupported file format")] @@ -331,7 +331,7 @@ pub enum StringyError { Convenient result type alias. -```rust +```text pub type Result = std::result::Result; ``` @@ -341,7 +341,7 @@ pub type Result = std::result::Result; Implement custom semantic classifiers: -```rust +```text use stringy::classification::{ClassificationResult, Classifier}; pub struct CustomClassifier { @@ -360,7 +360,7 @@ impl Classifier for CustomClassifier { For large files, use memory mapping: -```rust +```text use memmap2::Mmap; use std::fs::File; @@ -373,7 +373,7 @@ let strings = extract_strings(&mmap[..], &config)?; Process multiple files in parallel: -```rust +```text use rayon::prelude::*; let files = vec!["file1.exe", "file2.dll", "file3.so"]; @@ -390,7 +390,7 @@ let results: Vec<_> = files Optional features can be enabled in `Cargo.toml`: -```toml +```text [dependencies] stringy = { version = "0.1", features = ["pe-resources", "dwarf-debug"] } ``` @@ -406,7 +406,7 @@ Available features: ### Basic String Extraction -```rust +```text use stringy::{ExtractionConfig, extract_strings}; fn main() -> stringy::Result<()> { @@ -425,7 +425,7 @@ fn main() -> stringy::Result<()> { ### Filtered Extraction -```rust +```text use stringy::{Encoding, ExtractionConfig, Tag, extract_strings}; fn extract_network_indicators(data: &[u8]) -> stringy::Result> { @@ -454,7 +454,7 @@ fn extract_network_indicators(data: &[u8]) -> stringy::Result> { ### Custom Output Format -```rust +```text use serde_json::json; use stringy::output::{OutputConfig, OutputFormatter}; diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 3792144..00ef674 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -31,7 +31,7 @@ Handles binary format detection and parsing using the `goblin` crate with compre The parsers implement intelligent section prioritization: -```rust +```text // Example: ELF section weights ".rodata" | ".rodata.str1.*" => 10.0 // Highest priority ".comment" | ".note.*" => 9.0 // Build info, very likely strings @@ -109,7 +109,7 @@ Formats results for different use cases with consistent data structures. ### 1. Binary Analysis Phase ✅ **Implemented** -```rust +```text // Format detection using goblin let format = detect_format(&data); // Returns BinaryFormat enum let parser = create_parser(format)?; // Creates appropriate parser @@ -128,7 +128,7 @@ let container_info = parser.parse(&data)?; ### 2. String Extraction Phase 🚧 **Framework Ready** -```rust +```text // Extract strings from prioritized sections (by weight) let mut all_strings = Vec::new(); for section in container_info.sections.iter().filter(|s| s.weight > 5.0) { @@ -151,7 +151,7 @@ let unique_strings = deduplicate(all_strings); ### 3. Classification Phase 🚧 **Types Ready** -```rust +```text // Apply semantic classification with context awareness for string in &mut unique_strings { let context = StringContext { @@ -167,7 +167,7 @@ for string in &mut unique_strings { ### 4. Output Phase 🚧 **Interfaces Defined** -```rust +```text // Sort by relevance score (descending) unique_strings.sort_by_key(|s| std::cmp::Reverse(s.score)); @@ -207,7 +207,7 @@ pub trait ContainerParser { Each parser implements intelligent section classification: -```rust +```text // ELF Example fn classify_section(section: &SectionHeader, name: &str) -> SectionType { if section.sh_flags & SHF_EXECINSTR != 0 { diff --git a/docs/src/binary-formats.md b/docs/src/binary-formats.md index df7d342..8456f62 100644 --- a/docs/src/binary-formats.md +++ b/docs/src/binary-formats.md @@ -56,7 +56,7 @@ The ELF parser now provides comprehensive symbol extraction with: ### Implementation Details -```rust +```text impl ElfParser { fn classify_section(section: &SectionHeader, name: &str) -> SectionType { // Check executable flag first @@ -231,7 +231,7 @@ PE resources are particularly rich sources of strings. The PE parser now provide #### Usage Example -```rust +```text use stringy::extraction::extract_resource_strings; use stringy::types::Tag; @@ -251,7 +251,7 @@ let ui_strings: Vec<_> = strings.iter() ### Implementation Details -```rust +```text impl PeParser { fn classify_section(section: &SectionTable) -> SectionType { let name = String::from_utf8_lossy(§ion.name); @@ -349,7 +349,7 @@ Mach-O load commands contain valuable strings: ### Implementation Details -```rust +```text impl MachoParser { fn classify_section(segment_name: &str, section_name: &str) -> SectionType { match (segment_name, section_name) { @@ -382,7 +382,7 @@ impl MachoParser { Different formats require different weighting strategies: -```rust +```text fn calculate_section_weight(format: BinaryFormat, section_type: SectionType) -> i32 { match (format, section_type) { (BinaryFormat::Elf, SectionType::StringData) => 10, // .rodata @@ -397,7 +397,7 @@ fn calculate_section_weight(format: BinaryFormat, section_type: SectionType) -> Stringy uses `goblin` for robust format detection: -```rust +```text pub fn detect_format(data: &[u8]) -> BinaryFormat { match Object::parse(data) { Ok(Object::Elf(_)) => BinaryFormat::Elf, diff --git a/docs/src/classification.md b/docs/src/classification.md index fea0ba3..2a5947a 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -160,7 +160,7 @@ The classifier relies on `lazy_static!` to compile regex patterns once and reuse Key method signatures: -```rust +```text pub fn classify(&self, string: &FoundString) -> Vec; pub fn classify_posix_path(&self, text: &str) -> Option; pub fn classify_windows_path(&self, text: &str) -> Option; @@ -170,7 +170,7 @@ pub fn classify_registry_path(&self, text: &str) -> Option; ## Using the Classification System -```rust +```text use stringy::classification::SemanticClassifier; use stringy::types::{Encoding, FoundString, StringSource, Tag}; @@ -237,7 +237,7 @@ Several techniques reduce false positives: 3. **Entropy checking**: High-entropy strings are likely binary data 4. **Whitelist/blacklist**: Known good/bad patterns -```rust +```text fn is_likely_false_positive(&self, text: &str, tag: &Tag) -> bool { match tag { Tag::Domain => { diff --git a/docs/src/output-formats.md b/docs/src/output-formats.md index bffbfac..001fbca 100644 --- a/docs/src/output-formats.md +++ b/docs/src/output-formats.md @@ -38,7 +38,7 @@ Machine-readable format with one JSON object per line, ideal for automation and ### Example Output -```json +```text {"text":"https://api.example.com/v1/users","encoding":"utf-8","offset":4096,"rva":4096,"section":".rdata","length":31,"tags":["url"],"score":95,"source":"SectionData"} {"text":"{12345678-1234-1234-1234-123456789abc}","encoding":"utf-8","offset":8192,"rva":8192,"section":".rdata","length":38,"tags":["guid"],"score":87,"source":"SectionData"} ``` diff --git a/docs/src/performance.md b/docs/src/performance.md index 33770dc..6a42066 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -27,7 +27,7 @@ Stringy is designed for efficient analysis of binary files, from small executabl Stringy uses memory mapping for efficient file access: -```rust +```text // Automatic memory mapping for large files if file_size > MEMORY_MAP_THRESHOLD { let mmap = unsafe { Mmap::map(&file)? }; @@ -93,7 +93,7 @@ Core extraction pipeline is optimized for single-threaded performance: Future versions will support parallel processing: -```rust +```text // Planned parallel section processing sections.par_iter() .flat_map(|section| extract_from_section(section, data)) @@ -110,7 +110,7 @@ sections.par_iter() #### Regex Caching -```rust +```text lazy_static! { static ref URL_REGEX: Regex = Regex::new(r"https?://[^\s]+").unwrap(); static ref DOMAIN_REGEX: Regex = Regex::new(r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap(); @@ -119,7 +119,7 @@ lazy_static! { #### Efficient String Scanning -```rust +```text // Optimized ASCII scanning with SIMD potential fn scan_ascii_optimized(data: &[u8]) -> Vec { let mut matches = Vec::new(); @@ -147,7 +147,7 @@ fn scan_ascii_optimized(data: &[u8]) -> Vec { Stringy uses sequential access patterns optimized for modern storage: -```rust +```text // Sequential section processing for section in container.sections { let section_data = &data[section.offset..section.offset + section.size]; diff --git a/docs/src/string-extraction.md b/docs/src/string-extraction.md index f3ca3a9..f57a95d 100644 --- a/docs/src/string-extraction.md +++ b/docs/src/string-extraction.md @@ -27,7 +27,7 @@ UTF-16LE extraction is now implemented and available for Windows PE binary strin #### Basic Extraction -```rust +```text use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; let data = b"Hello\0World\0Test123"; @@ -41,7 +41,7 @@ for string in strings { #### Configuration -```rust +```text use stringy::extraction::ascii::AsciiExtractionConfig; // Default configuration (min_length: 4, no max_length) @@ -61,7 +61,7 @@ UTF-8 extraction builds on ASCII extraction and handles multi-byte characters. S #### Implementation Details -```rust +```text fn extract_ascii_strings(data: &[u8], min_len: usize) -> Vec { let mut strings = Vec::new(); let mut current_string = Vec::new(); @@ -158,7 +158,7 @@ Boosts or reduces confidence based on section context: ### Configuration -```rust +```text use stringy::extraction::config::{NoiseFilterConfig, FilterWeights}; // Default configuration @@ -183,7 +183,7 @@ config.filter_weights = FilterWeights { ### Using Noise Filters -```rust +```text use stringy::extraction::config::NoiseFilterConfig; use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; use stringy::types::SectionType; @@ -255,7 +255,7 @@ UTF-16 extraction is implemented in `src/extraction/utf16.rs` following the patt **Usage Example**: -```rust +```text use stringy::extraction::utf16::{extract_utf16_strings, Utf16ExtractionConfig, ByteOrder}; // Extract UTF-16LE strings from Windows PE binary @@ -277,7 +277,7 @@ let strings = extract_utf16_strings(data, &config); **Configuration**: -```rust +```text use stringy::extraction::utf16::{Utf16ExtractionConfig, ByteOrder}; // Default configuration (min_length: 3, byte_order: Auto, confidence_threshold: 0.5) @@ -412,7 +412,7 @@ Different sections have different string extraction strategies. - **STRINGTABLE**: Localized UI strings - **RT_MANIFEST**: XML manifest data -```rust +```text fn extract_pe_resources(pe: &PE, data: &[u8]) -> Vec { let mut strings = Vec::new(); @@ -445,7 +445,7 @@ Strings are canonicalized while preserving important metadata: When duplicates are found: -```rust +```text struct DeduplicatedString { canonical_text: String, occurrences: Vec, @@ -463,7 +463,7 @@ struct StringOccurrence { ### Deduplication Algorithm -```rust +```text fn deduplicate_strings(strings: Vec) -> Vec { let mut map: HashMap = HashMap::new(); @@ -483,7 +483,7 @@ fn deduplicate_strings(strings: Vec) -> Vec { ### Extraction Configuration -```rust +```text use stringy::extraction::{ByteOrder, Encoding, ExtractionConfig}; pub struct ExtractionConfig { @@ -500,7 +500,7 @@ pub struct ExtractionConfig { **UTF-16 Configuration Examples**: -```rust +```text use stringy::extraction::{ExtractionConfig, Encoding, ByteOrder}; // Extract UTF-16LE strings from Windows PE binary @@ -519,7 +519,7 @@ config.utf16_byte_order = ByteOrder::Auto; ### Noise Filter Configuration -```rust +```text use stringy::extraction::config::NoiseFilterConfig; pub struct NoiseFilterConfig { @@ -535,7 +535,7 @@ pub struct NoiseFilterConfig { ### Filter Weights -```rust +```text use stringy::extraction::config::FilterWeights; pub struct FilterWeights { @@ -552,7 +552,7 @@ All weights must sum to 1.0. The configuration validates this automatically. ### Encoding Selection -```rust +```text pub enum EncodingFilter { All, Specific(Vec), @@ -563,7 +563,7 @@ pub enum EncodingFilter { ### Section Filtering -```rust +```text pub struct SectionFilter { pub include_sections: Option>, pub exclude_sections: Option>, @@ -578,7 +578,7 @@ pub struct SectionFilter { Large files use memory mapping for efficient access: -```rust +```text use memmap2::Mmap; fn extract_from_large_file(path: &Path) -> Result> { @@ -593,7 +593,7 @@ fn extract_from_large_file(path: &Path) -> Result> { Section extraction can be parallelized: -```rust +```text use rayon::prelude::*; fn extract_parallel(sections: &[SectionInfo], data: &[u8]) -> Vec { @@ -608,7 +608,7 @@ fn extract_parallel(sections: &[SectionInfo], data: &[u8]) -> Vec { Pattern matching uses cached regex compilation: -```rust +```text lazy_static! { static ref URL_REGEX: Regex = Regex::new(r"https?://[^\s]+").unwrap(); static ref GUID_REGEX: Regex = Regex::new(r"\{[0-9a-fA-F-]{36}\}").unwrap(); @@ -649,7 +649,7 @@ Noise filtering is designed for minimal overhead: #### Basic Extraction with Filtering -```rust +```text use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; use stringy::extraction::config::NoiseFilterConfig; use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; @@ -670,7 +670,7 @@ let filtered: Vec<_> = strings #### Custom Filter Configuration -```rust +```text use stringy::extraction::config::{NoiseFilterConfig, FilterWeights}; let mut config = NoiseFilterConfig::default(); diff --git a/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md index 5da1ef7..9fd4a03 100644 --- a/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md +++ b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md @@ -82,7 +82,7 @@ sequenceDiagram **Output Format (TTY):** -``` +```text String Tags Score Section https://malicious-c2.example.com/api url 95 .rdata C:\Windows\System32\kernel32.dll filepath 88 .rdata @@ -106,14 +106,19 @@ core::fmt::Display::fmt export 85 .text **Steps:** 1. User invokes Stringy with filtering flags + 2. System performs standard analysis pipeline (Parsing... Extracting... Classifying... Ranking...) + 3. System applies filters with AND logic: + - String must have tag "url" OR "ipv4" - String length must be >= 10 characters - String encoding must be UTF-16 4. System outputs filtered results in table format + 5. If no strings match filters, system displays to stderr: "Analyzed 1,234 strings, 0 matched filters" + 6. If strings match, system outputs table with matching strings only **Filter Combination Rules:** @@ -174,7 +179,7 @@ If multiple output format flags are specified (e.g., --json and --yara), the sys **JSON Format:** -```json +```text {"text":"https://example.com","encoding":"Ascii","offset":4096,"rva":8192,"section":".rdata","length":19,"tags":["url","domain"],"score":95,"source":"SectionData","confidence":1.0} {"text":"C:\\Windows\\System32","encoding":"Utf16Le","offset":8192,"rva":12288,"section":".data","length":38,"tags":["filepath"],"score":88,"source":"SectionData","confidence":0.95} ``` @@ -198,21 +203,27 @@ If multiple output format flags are specified (e.g., --json and --yara), the sys **Steps:** 1. User invokes Stringy with --yara flag + 2. System performs standard analysis pipeline + 3. Progress indicators go to stderr + 4. System generates complete YARA rule template to stdout + 5. Rule includes: + - Rule name (derived from binary filename) - Metadata section (file hash, analysis date, tool version) - Strings section with properly escaped strings - Condition section (basic template) 6. Strings are escaped according to YARA syntax rules + 7. Very long strings (>200 chars) are truncated with comment **YARA Output Format:** -``` +```text rule binary_strings { meta: description = "Strings extracted from binary.exe" @@ -268,7 +279,7 @@ rule binary_strings { **Output Format (Non-TTY):** -``` +```text https://malicious-c2.example.com/api C:\Windows\System32\kernel32.dll core::fmt::Display::fmt diff --git a/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md index 3a2da39..4d21225 100644 --- a/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md +++ b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md @@ -22,12 +22,12 @@ Update the `FoundString` struct in file:src/types.rs to support symbol demanglin ## Acceptance Criteria -- [ ] FoundString struct includes original_text field -- [ ] FoundString struct includes optional breakdown fields (section_weight, semantic_boost, noise_penalty) -- [ ] All fields properly serialize/deserialize with serde -- [ ] Existing tests updated and passing -- [ ] Documentation updated with field descriptions and usage examples -- [ ] No breaking changes to existing code that creates FoundString instances +- [x] FoundString struct includes original_text field +- [x] FoundString struct includes optional breakdown fields (section_weight, semantic_boost, noise_penalty) +- [x] All fields properly serialize/deserialize with serde +- [x] Existing tests updated and passing +- [x] Documentation updated with field descriptions and usage examples +- [x] No breaking changes to existing code that creates FoundString instances ## References diff --git a/src/classification/mod.rs b/src/classification/mod.rs index a63b138..d3dd9ad 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -30,6 +30,7 @@ //! let classifier = SemanticClassifier::new(); //! let found_string = FoundString { //! text: "C:\\Windows\\System32\\cmd.exe".to_string(), +//! original_text: None, //! encoding: Encoding::Ascii, //! offset: 0, //! rva: None, @@ -37,6 +38,9 @@ //! length: 27, //! tags: Vec::new(), //! score: 0, +//! section_weight: None, +//! semantic_boost: None, +//! noise_penalty: None, //! source: StringSource::SectionData, //! confidence: 1.0, //! }; diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index 2740f2f..b83bdfd 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -20,6 +20,7 @@ //! let classifier = SemanticClassifier::new(); //! let found_string = FoundString { //! text: "https://example.com/api".to_string(), +//! original_text: None, //! encoding: Encoding::Ascii, //! offset: 0, //! rva: None, @@ -27,6 +28,9 @@ //! length: 24, //! tags: Vec::new(), //! score: 0, +//! section_weight: None, +//! semantic_boost: None, +//! noise_penalty: None, //! source: StringSource::SectionData, //! confidence: 1.0, //! }; @@ -348,6 +352,7 @@ impl SemanticClassifier { /// let classifier = SemanticClassifier::new(); /// let found_string = FoundString { /// text: "https://example.com".to_string(), + /// original_text: None, /// encoding: Encoding::Ascii, /// offset: 0, /// rva: None, @@ -355,6 +360,9 @@ impl SemanticClassifier { /// length: 19, /// tags: Vec::new(), /// score: 0, + /// section_weight: None, + /// semantic_boost: None, + /// noise_penalty: None, /// source: StringSource::SectionData, /// confidence: 1.0, /// }; @@ -1010,6 +1018,7 @@ mod tests { fn create_test_string(text: &str) -> FoundString { FoundString { text: text.to_string(), + original_text: None, encoding: Encoding::Ascii, offset: 0, rva: None, @@ -1017,6 +1026,9 @@ mod tests { length: text.len() as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, } diff --git a/src/extraction/ascii.rs b/src/extraction/ascii.rs index 8fb4fb6..9f9d82f 100644 --- a/src/extraction/ascii.rs +++ b/src/extraction/ascii.rs @@ -230,6 +230,7 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); strings.push(FoundString { text, + original_text: None, encoding: Encoding::Ascii, offset: start as u64, rva: None, @@ -237,6 +238,9 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec length: len as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, }); @@ -260,6 +264,7 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); strings.push(FoundString { text, + original_text: None, encoding: Encoding::Ascii, offset: start as u64, rva: None, @@ -267,6 +272,9 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec length: len as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, }); @@ -276,6 +284,7 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); strings.push(FoundString { text, + original_text: None, encoding: Encoding::Ascii, offset: start as u64, rva: None, @@ -283,6 +292,9 @@ pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec length: len as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 1.0, }); diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs index b8da59a..d7ad2b7 100644 --- a/src/extraction/dedup.rs +++ b/src/extraction/dedup.rs @@ -282,6 +282,7 @@ impl CanonicalString { FoundString { text: self.text.clone(), + original_text: None, encoding: self.encoding, offset: first_occurrence.offset, rva: first_occurrence.rva, @@ -289,6 +290,9 @@ impl CanonicalString { length: first_occurrence.length, tags: self.merged_tags.clone(), score: self.combined_score, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: first_occurrence.source, confidence: max_confidence, } @@ -325,6 +329,7 @@ mod tests { FoundString { text: text.to_string(), + original_text: None, encoding, offset, rva: Some(offset + 0x1000), @@ -332,6 +337,9 @@ mod tests { length, tags, score, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source, confidence, } @@ -804,6 +812,7 @@ mod tests { let strings = vec![ FoundString { text: "Test".to_string(), + original_text: None, encoding: Encoding::Utf16Le, offset: 0x100, rva: Some(0x1000), @@ -811,11 +820,15 @@ mod tests { length: 8, // 4 characters * 2 bytes = 8 bytes tags: vec![], score: 10, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 0.8, }, FoundString { text: "Test".to_string(), + original_text: None, encoding: Encoding::Utf16Le, offset: 0x200, rva: Some(0x2000), @@ -823,6 +836,9 @@ mod tests { length: 8, tags: vec![], score: 15, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: 0.9, }, diff --git a/src/extraction/macho_load_commands.rs b/src/extraction/macho_load_commands.rs index 35a3254..b3ed743 100644 --- a/src/extraction/macho_load_commands.rs +++ b/src/extraction/macho_load_commands.rs @@ -100,6 +100,7 @@ fn extract_dylib_strings(macho: &MachO) -> Vec { strings.push(FoundString { text: lib.to_string(), + original_text: None, encoding: Encoding::Utf8, source: StringSource::LoadCommand, tags, @@ -108,6 +109,9 @@ fn extract_dylib_strings(macho: &MachO) -> Vec { rva: None, length, score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, confidence: 1.0, }); } @@ -129,6 +133,7 @@ fn extract_rpath_strings(macho: &MachO) -> Vec { strings.push(FoundString { text: rpath.to_string(), + original_text: None, encoding: Encoding::Utf8, source: StringSource::LoadCommand, tags, @@ -137,6 +142,9 @@ fn extract_rpath_strings(macho: &MachO) -> Vec { rva: None, length, score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, confidence: 1.0, }); } diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index 1ced150..19b5038 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -483,6 +483,7 @@ impl StringExtractor for BasicExtractor { let length = import.name.len() as u32; all_strings.push(FoundString { text: import.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -490,6 +491,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ImportName, confidence: 1.0, }); @@ -500,6 +504,7 @@ impl StringExtractor for BasicExtractor { let length = export.name.len() as u32; all_strings.push(FoundString { text: export.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -507,6 +512,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ExportName, confidence: 1.0, }); @@ -579,6 +587,7 @@ impl StringExtractor for BasicExtractor { let length = import.name.len() as u32; all_strings.push(FoundString { text: import.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -586,6 +595,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ImportName, confidence: 1.0, }); @@ -596,6 +608,7 @@ impl StringExtractor for BasicExtractor { let length = export.name.len() as u32; all_strings.push(FoundString { text: export.name.clone(), + original_text: None, encoding: Encoding::Utf8, offset: 0, rva: None, @@ -603,6 +616,9 @@ impl StringExtractor for BasicExtractor { length, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ExportName, confidence: 1.0, }); @@ -748,6 +764,7 @@ impl StringExtractor for BasicExtractor { let found_string = FoundString { text, + original_text: None, encoding, offset: absolute_offset, rva, @@ -755,6 +772,9 @@ impl StringExtractor for BasicExtractor { length: length as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence, }; diff --git a/src/extraction/pe_resources.rs b/src/extraction/pe_resources.rs index 695d834..7938667 100644 --- a/src/extraction/pe_resources.rs +++ b/src/extraction/pe_resources.rs @@ -426,6 +426,7 @@ pub fn extract_version_info_strings(data: &[u8]) -> Vec { let length = text.len() as u32; let found_string = FoundString { text, + original_text: None, encoding: Encoding::Utf16Le, offset: 0, // pelite doesn't provide offsets easily rva: None, @@ -433,6 +434,9 @@ pub fn extract_version_info_strings(data: &[u8]) -> Vec { length, tags: vec![Tag::Version, Tag::Resource], score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ResourceString, confidence: 1.0, }; @@ -582,6 +586,7 @@ pub fn extract_string_table_strings(data: &[u8]) -> Vec { let found_string = FoundString { text, + original_text: None, encoding: Encoding::Utf16Le, offset: 0, // File offset not easily available from pelite DataEntry rva, @@ -589,6 +594,9 @@ pub fn extract_string_table_strings(data: &[u8]) -> Vec { length: text_len, tags: vec![Tag::Resource], score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ResourceString, confidence: 1.0, }; @@ -770,6 +778,7 @@ pub fn extract_manifest_strings(data: &[u8]) -> Vec { let length = manifest_text.len() as u32; let found_string = FoundString { text: manifest_text, + original_text: None, encoding, offset: 0, // File offset not easily available from pelite DataEntry rva, @@ -777,6 +786,9 @@ pub fn extract_manifest_strings(data: &[u8]) -> Vec { length, tags: vec![Tag::Manifest, Tag::Resource], score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::ResourceString, confidence: 1.0, }; diff --git a/src/extraction/utf16.rs b/src/extraction/utf16.rs index 711ffbb..2eb3f1f 100644 --- a/src/extraction/utf16.rs +++ b/src/extraction/utf16.rs @@ -822,6 +822,7 @@ fn extract_utf16_strings_with_byte_order( if utf16_confidence >= config.confidence_threshold { found_strings.push(FoundString { text, + original_text: None, encoding: match byte_order { ByteOrder::LE => Encoding::Utf16Le, ByteOrder::BE => Encoding::Utf16Be, @@ -833,6 +834,9 @@ fn extract_utf16_strings_with_byte_order( length: bytes_for_decoding.len() as u32, tags: Vec::new(), score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, source: StringSource::SectionData, confidence: utf16_confidence, }); diff --git a/src/types.rs b/src/types.rs index bccd80c..5347f33 100644 --- a/src/types.rs +++ b/src/types.rs @@ -223,10 +223,23 @@ pub struct ResourceStringEntry { } /// A string found in the binary with metadata +/// +/// The `original_text` field preserves the pre-demangled text when demangling +/// is applied. Debug-only fields provide transparency into how the final score +/// was produced and are only populated when debug mode is enabled. #[derive(Debug, Clone, Serialize, Deserialize)] +#[non_exhaustive] pub struct FoundString { /// The extracted string text pub text: String, + /// Original text before demangling (if applicable) + /// + /// When a string is identified as a mangled symbol (e.g., C++ or Rust mangled names), + /// this field preserves the original mangled form before demangling is applied. + /// The `text` field will contain the demangled version. This is `None` for strings + /// that are not mangled symbols or when demangling is not performed. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub original_text: Option, /// The encoding used for this string pub encoding: Encoding, /// File offset where the string was found @@ -241,6 +254,30 @@ pub struct FoundString { pub tags: Vec, /// Relevance score for ranking pub score: i32, + /// Section weight contribution to the final score (debug only) + /// + /// When debug mode is enabled, this field contains the weight assigned based on + /// the section where the string was found. Higher weights indicate sections more + /// likely to contain meaningful strings (e.g., .rodata vs .text). This is `None` + /// unless explicitly populated by the ranking system in debug mode. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub section_weight: Option, + /// Semantic classification boost to the final score (debug only) + /// + /// When debug mode is enabled, this field contains the score boost applied based on + /// semantic tags (URLs, file paths, GUIDs, etc.). Strings with valuable semantic + /// meaning receive positive boosts. This is `None` unless explicitly populated by + /// the ranking system in debug mode. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub semantic_boost: Option, + /// Noise penalty applied to the final score (debug only) + /// + /// When debug mode is enabled, this field contains the penalty applied for noise + /// characteristics (low confidence, repetitive patterns, etc.). Higher penalties + /// indicate strings more likely to be noise. This is `None` unless explicitly + /// populated by the ranking system in debug mode. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub noise_penalty: Option, /// Source of the string (section data, import, etc.) pub source: StringSource, /// Confidence score from noise filtering (0.0-1.0) @@ -254,6 +291,109 @@ pub struct FoundString { } impl FoundString { + /// Creates a new FoundString with required fields and sensible defaults + /// + /// # Arguments + /// + /// * `text` - The extracted string text + /// * `encoding` - The encoding used for this string + /// * `offset` - File offset where the string was found + /// * `length` - Length of the string in bytes + /// * `source` - Source of the string (section data, import, etc.) + /// + /// # Returns + /// + /// A new FoundString with optional fields set to None/empty and confidence + /// set to 1.0 + #[must_use] + pub fn new( + text: String, + encoding: Encoding, + offset: u64, + length: u32, + source: StringSource, + ) -> Self { + Self { + text, + original_text: None, + encoding, + offset, + rva: None, + section: None, + length, + tags: Vec::new(), + score: 0, + section_weight: None, + semantic_boost: None, + noise_penalty: None, + source, + confidence: 1.0, + } + } + + /// Sets the RVA (Relative Virtual Address) + #[must_use] + pub fn with_rva(mut self, rva: u64) -> Self { + self.rva = Some(rva); + self + } + + /// Sets the section name + #[must_use] + pub fn with_section(mut self, section: String) -> Self { + self.section = Some(section); + self + } + + /// Sets the tags + #[must_use] + pub fn with_tags(mut self, tags: Vec) -> Self { + self.tags = tags; + self + } + + /// Sets the score + #[must_use] + pub fn with_score(mut self, score: i32) -> Self { + self.score = score; + self + } + + /// Sets the confidence + #[must_use] + pub fn with_confidence(mut self, confidence: f32) -> Self { + self.confidence = confidence; + self + } + + /// Sets the original text (for demangled symbols) + #[must_use] + pub fn with_original_text(mut self, original_text: String) -> Self { + self.original_text = Some(original_text); + self + } + + /// Sets the section weight (debug mode) + #[must_use] + pub fn with_section_weight(mut self, weight: i32) -> Self { + self.section_weight = Some(weight); + self + } + + /// Sets the semantic boost (debug mode) + #[must_use] + pub fn with_semantic_boost(mut self, boost: i32) -> Self { + self.semantic_boost = Some(boost); + self + } + + /// Sets the noise penalty (debug mode) + #[must_use] + pub fn with_noise_penalty(mut self, penalty: i32) -> Self { + self.noise_penalty = Some(penalty); + self + } + /// Returns true if confidence is high (>= 0.7) pub fn is_high_confidence(&self) -> bool { self.confidence >= 0.7 @@ -307,3 +447,110 @@ impl From for StringyError { StringyError::ParseError(format!("Resource lookup error: {}", err)) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Creates a test FoundString with all optional fields set to None + fn create_test_found_string() -> FoundString { + FoundString { + text: "test_string".to_string(), + original_text: None, + encoding: Encoding::Ascii, + offset: 0x1000, + rva: Some(0x2000), + section: Some(".rodata".to_string()), + length: 11, + tags: vec![Tag::Url], + score: 100, + section_weight: None, + semantic_boost: None, + noise_penalty: None, + source: StringSource::SectionData, + confidence: 0.85, + } + } + + #[test] + fn test_found_string_serde_optional_fields_none() { + // Test that optional fields are skipped when None + let found_string = create_test_found_string(); + let json = serde_json::to_string(&found_string).expect("Serialization failed"); + + // Verify optional fields are not present in JSON + assert!(!json.contains("original_text")); + assert!(!json.contains("section_weight")); + assert!(!json.contains("semantic_boost")); + assert!(!json.contains("noise_penalty")); + + // Verify required fields are present + assert!(json.contains("text")); + assert!(json.contains("encoding")); + assert!(json.contains("offset")); + } + + #[test] + fn test_found_string_serde_optional_fields_some() { + // Test that optional fields are included when Some + let mut found_string = create_test_found_string(); + found_string.original_text = Some("_ZN4test6mangled".to_string()); + found_string.section_weight = Some(50); + found_string.semantic_boost = Some(25); + found_string.noise_penalty = Some(-10); + + let json = serde_json::to_string(&found_string).expect("Serialization failed"); + + // Verify optional fields are present in JSON + assert!(json.contains("original_text")); + assert!(json.contains("_ZN4test6mangled")); + assert!(json.contains("section_weight")); + assert!(json.contains("semantic_boost")); + assert!(json.contains("noise_penalty")); + } + + #[test] + fn test_found_string_serde_roundtrip() { + // Test serialization/deserialization roundtrip with all fields + let mut found_string = create_test_found_string(); + found_string.original_text = Some("mangled_name".to_string()); + found_string.section_weight = Some(75); + found_string.semantic_boost = Some(30); + found_string.noise_penalty = Some(-5); + + let json = serde_json::to_string(&found_string).expect("Serialization failed"); + let deserialized: FoundString = + serde_json::from_str(&json).expect("Deserialization failed"); + + assert_eq!(found_string.text, deserialized.text); + assert_eq!(found_string.original_text, deserialized.original_text); + assert_eq!(found_string.section_weight, deserialized.section_weight); + assert_eq!(found_string.semantic_boost, deserialized.semantic_boost); + assert_eq!(found_string.noise_penalty, deserialized.noise_penalty); + } + + #[test] + fn test_found_string_deserialize_missing_optional_fields() { + // Test that missing optional fields default to None during deserialization + let json = r#"{ + "text": "test", + "encoding": "Ascii", + "offset": 0, + "rva": null, + "section": null, + "length": 4, + "tags": [], + "score": 0, + "source": "SectionData", + "confidence": 1.0 + }"#; + + let deserialized: FoundString = serde_json::from_str(json).expect("Deserialization failed"); + + assert_eq!(deserialized.text, "test"); + assert_eq!(deserialized.original_text, None); + assert_eq!(deserialized.section_weight, None); + assert_eq!(deserialized.semantic_boost, None); + assert_eq!(deserialized.noise_penalty, None); + } +} diff --git a/tests/classification_integration.rs b/tests/classification_integration.rs index 710f2ea..4a1ddda 100644 --- a/tests/classification_integration.rs +++ b/tests/classification_integration.rs @@ -4,18 +4,13 @@ use stringy::classification::SemanticClassifier; use stringy::types::{Encoding, FoundString, StringSource, Tag}; fn make_found_string(text: &str) -> FoundString { - FoundString { - text: text.to_string(), - encoding: Encoding::Ascii, - offset: 0, - rva: None, - section: None, - length: text.len() as u32, - tags: Vec::new(), - score: 0, - source: StringSource::SectionData, - confidence: 1.0, - } + FoundString::new( + text.to_string(), + Encoding::Ascii, + 0, + text.len() as u32, + StringSource::SectionData, + ) } fn classify_tags(classifier: &SemanticClassifier, text: &str) -> Vec { diff --git a/tests/test_deduplication.rs b/tests/test_deduplication.rs index 8a1fb5c..236e5cc 100644 --- a/tests/test_deduplication.rs +++ b/tests/test_deduplication.rs @@ -276,30 +276,28 @@ fn test_deduplication_score_bonuses() { // Create strings with different sources to test multi-source bonus let strings = vec![ - FoundString { - text: "TestString".to_string(), - encoding: Encoding::Utf8, - offset: 0x100, - rva: Some(0x1000), - section: Some(".rodata".to_string()), - length: 10, - tags: vec![], - score: 10, - source: StringSource::SectionData, - confidence: 0.8, - }, - FoundString { - text: "TestString".to_string(), - encoding: Encoding::Utf8, - offset: 0x200, - rva: Some(0x2000), - section: Some(".data".to_string()), - length: 10, - tags: vec![], - score: 15, - source: StringSource::ImportName, - confidence: 0.9, - }, + FoundString::new( + "TestString".to_string(), + Encoding::Utf8, + 0x100, + 10, + StringSource::SectionData, + ) + .with_rva(0x1000) + .with_section(".rodata".to_string()) + .with_score(10) + .with_confidence(0.8), + FoundString::new( + "TestString".to_string(), + Encoding::Utf8, + 0x200, + 10, + StringSource::ImportName, + ) + .with_rva(0x2000) + .with_section(".data".to_string()) + .with_score(15) + .with_confidence(0.9), ]; let canonical = deduplicate(strings, None, true);