diff --git a/.github/workflows/build-and-test-Linux.yaml b/.github/workflows/build-and-test-Linux.yaml index 31fc894d5f4..b73e58a57f9 100644 --- a/.github/workflows/build-and-test-Linux.yaml +++ b/.github/workflows/build-and-test-Linux.yaml @@ -23,6 +23,7 @@ jobs: sudo apt-get install --no-install-recommends -y clang-19 clang++-19 flex bison cmake ninja-build maven jq libxml2-utils dpkg-dev ccache sudo apt-get install --no-install-recommends -y libssl-dev libelf-dev libudev-dev libpci-dev libiberty-dev autoconf sudo apt-get install --no-install-recommends -y gawk jq + sudo apt-get install --no-install-recommends -y libgoogle-perftools-dev - name: Prepare ccache uses: actions/cache@v5 diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index 1a4d70514e9..39445b76fc3 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -31,7 +31,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -y g++ gcc gdb binutils flex bison cmake maven jq libxml2-utils openjdk-11-jdk-headless lcov ccache z3 + sudo apt-get install --no-install-recommends -y g++ gcc gdb binutils flex bison cmake maven jq libxml2-utils openjdk-11-jdk-headless lcov ccache z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed diff --git a/.github/workflows/performance.yaml b/.github/workflows/performance.yaml index 93b9af238b0..198e3b2c318 100644 --- a/.github/workflows/performance.yaml +++ b/.github/workflows/performance.yaml @@ -45,7 +45,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc g++ maven flex bison ccache + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc g++ maven flex bison ccache libgoogle-perftools-dev - name: Prepare ccache 
uses: actions/cache/restore@v5 diff --git a/.github/workflows/profiling.yaml b/.github/workflows/profiling.yaml index 92fd4e42f3e..95bd25247cd 100644 --- a/.github/workflows/profiling.yaml +++ b/.github/workflows/profiling.yaml @@ -55,6 +55,7 @@ jobs: fi echo "$(dirname "$PERF_REAL")" >> "$GITHUB_PATH" "$PERF_REAL" --version + # perf requires perf_event_paranoid=-1 for call graph recording sudo sysctl kernel.perf_event_paranoid=-1 - name: Restore ccache diff --git a/.github/workflows/pull-request-checks.yaml b/.github/workflows/pull-request-checks.yaml index ba3e934a87d..80ff2f1228f 100644 --- a/.github/workflows/pull-request-checks.yaml +++ b/.github/workflows/pull-request-checks.yaml @@ -24,7 +24,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq gcc gdb g++ maven jq flex bison libxml2-utils ccache cmake z3 + sudo apt-get install --no-install-recommends -yq gcc gdb g++ maven jq flex bison libxml2-utils ccache cmake z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -222,7 +222,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -361,7 +361,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 + 
sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -468,7 +468,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -522,7 +522,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils ccache doxygen z3 g++-multilib + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils ccache doxygen z3 g++-multilib libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed diff --git a/CMakeLists.txt b/CMakeLists.txt index 0344a94d747..e81829ff636 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,53 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR endif() endif() +# Alternative memory allocator for improved performance. +# tcmalloc and jemalloc give 18-25% speedup on typical CBMC workloads +# due to better handling of many small allocations (irept tree nodes). +# Set to "auto" to detect, or "tcmalloc", "jemalloc", or "system". 
+set(allocator "auto" CACHE STRING "Memory allocator: auto, tcmalloc, jemalloc, or system") + +if(NOT allocator STREQUAL "system") + if(allocator STREQUAL "auto" OR allocator STREQUAL "tcmalloc") + find_library(TCMALLOC_LIB NAMES tcmalloc_minimal tcmalloc) + endif() + if(allocator STREQUAL "auto" OR allocator STREQUAL "jemalloc") + find_library(JEMALLOC_LIB NAMES jemalloc) + endif() + + if(allocator STREQUAL "tcmalloc") + if(NOT TCMALLOC_LIB) + message(FATAL_ERROR "tcmalloc requested but not found") + endif() + set(CBMC_ALLOCATOR_LIB ${TCMALLOC_LIB}) + message(STATUS "Using tcmalloc: ${TCMALLOC_LIB}") + elseif(allocator STREQUAL "jemalloc") + if(NOT JEMALLOC_LIB) + message(FATAL_ERROR "jemalloc requested but not found") + endif() + set(CBMC_ALLOCATOR_LIB ${JEMALLOC_LIB}) + message(STATUS "Using jemalloc: ${JEMALLOC_LIB}") + elseif(allocator STREQUAL "auto") + if(TCMALLOC_LIB) + set(CBMC_ALLOCATOR_LIB ${TCMALLOC_LIB}) + message(STATUS "Auto-detected tcmalloc: ${TCMALLOC_LIB}") + elseif(JEMALLOC_LIB) + set(CBMC_ALLOCATOR_LIB ${JEMALLOC_LIB}) + message(STATUS "Auto-detected jemalloc: ${JEMALLOC_LIB}") + else() + message(STATUS "No alternative allocator found, using system malloc") + endif() + endif() +endif() + +# Make the allocator available for executables via cprover_allocator target. 
+if(CBMC_ALLOCATOR_LIB) + add_library(cprover_allocator INTERFACE) + target_link_libraries(cprover_allocator INTERFACE ${CBMC_ALLOCATOR_LIB}) +else() + add_library(cprover_allocator INTERFACE) +endif() + function(cprover_default_properties) set(CBMC_CXX_STANDARD ${CMAKE_CXX_STANDARD}) set(CBMC_CXX_STANDARD_REQUIRED ${CMAKE_CXX_STANDARD_REQUIRED}) diff --git a/PROFILING_ANALYSIS.md b/PROFILING_ANALYSIS.md new file mode 100644 index 00000000000..3c1dc0d6f7b --- /dev/null +++ b/PROFILING_ANALYSIS.md @@ -0,0 +1,1678 @@ +# CBMC Performance Profiling Analysis + +Date: 2026-03-11 — 2026-03-12 +Branch: `experiment/hash-optimization` +Baseline: Release (`-O3 -DNDEBUG`), commit `b191cc1dac` (develop) +System: Linux x86_64 + +## Methodology + +- Tool: `scripts/profile_cbmc.py --auto-large --auto-csmith --runs 3 --timeout 90` +- 15 benchmarks × 3 runs = 45 profiling runs, 49,462 total samples +- Solver excluded (`--dimacs --outfile /dev/null`) — only pre-solver stages profiled +- Source locations resolved via `build-debug/bin/cbmc` (RelWithDebInfo) + +## Benchmarks + +| Benchmark | Symex (s) | Convert SSA (s) | Total (s) | Steps | ±Total | +|-----------|-----------|-----------------|-----------|-------|--------| +| linked_list | 0.215 | 0.336 | 2.3 | 4,433 | ±0.06s | +| array_ops | 3.124 | 0.174 | 5.1 | 16,448 | ±0.00s | +| structs | 0.031 | 0.000 | 1.1 | 312 | ±0.06s | +| dlinked_list | 0.385 | 0.541 | 3.3 | 8,242 | ±0.00s | +| string_ops | 0.101 | 0.033 | 2.1 | 1,950 | ±0.06s | +| func_ptrs | 0.011 | 0.000 | 1.0 | 271 | ±0.00s | +| bitvector | 0.016 | 0.000 | 1.1 | 608 | ±0.06s | +| matrix | 1.827 | 0.000 | 3.1 | 4,148 | ±0.00s | +| unions | 0.022 | 0.000 | 1.1 | 382 | ±0.07s | +| tree | 0.980 | 0.943 | 5.2 | 27,789 | ±0.00s | +| csmith_1234567890 | 11.890 | 0.227 | 17.2 | 139,379 | ±0.06s | +| csmith_42 | 8.462 | 0.206 | 13.2 | 121,844 | ±0.00s | +| csmith_1111111111 | 17.291 | 0.258 | 23.2 | 153,556 | ±0.06s | +| csmith_2718281828 | 11.544 | 0.238 | 16.3 | 148,596 | ±0.01s 
| +| csmith_314159265 | 3.183 | 0.069 | 6.2 | 56,187 | ±0.06s | + +Symex dominates wall time for: array_ops, matrix, and all CSmith benchmarks. + +## Sample Distribution + +- CBMC code: 28,550 (57.7%) +- Libraries (libc, libstdc++): 20,912 (42.3%) +- Total: 49,462 + +## Top Hotspots + +### 1. String interning: `string_containert::get` — 11.7% combined + +The `_Hashtable::_M_find_before_node` (10.0%) and `_Hashtable::find` (1.6%) +functions are called almost exclusively from `string_containert::get`. +Additionally, `hash_string` accounts for 1.7%. + +- **Root cause**: NOT the hash function or table configuration (see + investigation below), but the **volume of redundant interning calls**. + `update_identifier(ssa_exprt&)` is called 11.5M times on csmith_42 + with a 98.6% hit rate — almost all calls find an already-interned string. +- **Call chain**: `field_sensitivityt::get_fields` / `rename` / `set_indices` + → `update_identifier` → `build_identifier` (creates two `ostringstream`, + builds string char-by-char) → `irep_idt(oss.str())` → `string_containert::get` + → `hash_string` → `hash_table.find` +- **Source**: `src/util/ssa_expr.cpp` (`update_identifier`, `build_identifier`) + +#### Investigation: hash function and table tuning + +Tested three approaches on csmith_42 (11.5M get() calls, 156K unique strings): + +| Change | Result | Reason | +|--------|--------|--------| +| FNV-1a hash (replace h*31+c) | **3.6% slower** | Multiply costlier than shift-subtract; chain lengths already short | +| Pre-reserve 10K buckets | No effect | `unordered_map` handles growth fine | +| Max load factor 0.5 | **1.3-1.8% faster** | Shorter chains, but modest (doubles memory) | + +Hash distribution quality is similar for both hash functions (~37% empty +buckets at load factor 1.0). `dstringt::hash() = no` (sequential index) +is actually perfect for `std::unordered_map`. 
+ +#### Recommended optimization + +Avoid redundant `get()` calls rather than tuning the hash table: +- **Cache SSA identifiers**: skip `build_identifier` when l0/l1/l2 and + the original expression haven't changed +- **Avoid ostringstream**: use direct string concatenation with `id2string()` +- **Reduce `update_identifier` calls**: audit callers for unnecessary rebuilds + +Estimated impact: 6-9% overall speedup (50-80% reduction of the 11.7% cost). + +#### Implemented: replace ostringstream with string concatenation + +Replaced `std::ostringstream` with direct `std::string` concatenation +(using `reserve(64)` + `append`) in `build_ssa_identifier_rec` and +`initialize_ssa_identifier`. Commit `f8057bea69`. + +| Benchmark | Baseline | Optimized | Speedup | +|-----------|----------|-----------|---------| +| linked_list | 1.49s | 1.46s | 2.0% | +| array_ops | 3.13s | 2.78s | **11.2%** | +| csmith_42 | 8.89s | 8.58s | 3.5% | +| csmith_1111111111 | 16.91s | 15.92s | **5.9%** | + +#### Implemented: avoid full rebuild in set_level_0/1/2 + +When `set_level_0`, `set_level_1`, or `set_level_2` is called, the +identifier can be derived from the existing one by appending a suffix +(`!l0`, `@l1`, `#l2`) instead of rebuilding from the expression tree. +This is safe because each level is only set when previously empty +(callers guard against re-setting). Commits `4564ca2163`, `2c09b3ef20`. + +Instrumentation showed `update_identifier` is called 504,737 times on +csmith_42: 24% from `set_level_0`, 22% from `set_level_1`, 16% from +`set_level_2`, 37% from `set_expression`. 
+ +Combined with the ostringstream optimization: + +| Benchmark | Baseline | After all SSA opts | Speedup | +|-----------|----------|--------------------|---------| +| linked_list | 1.49s | 1.44s | 3.4% | +| array_ops | 3.13s | 2.69s | 14.1% | +| csmith_42 | 8.89s | 8.44s | 5.1% | +| csmith_1111111111 | 16.91s | 15.75s | 6.9% | + +#### Implemented: cache array identifier prefix in field_sensitivity + +In `field_sensitivityt::get_fields` for array types, pre-compute the +base identifier and level suffixes once before the loop, then construct +each element's identifier by concatenating `base[[i]]` + suffixes +directly. This avoids calling `set_expression` (which triggers a full +identifier rebuild) for each array index. Commit `3a95db52b3`. + +Additional speedup on array-heavy benchmark: array_ops 2.69s → 2.56s +(4.8%). Minimal impact on CSmith benchmarks. + +#### Investigated and ruled out: member expression caching + +Attempted the same prefix-caching approach for member expressions +(`base..component!l0@l1`). No measurable improvement — the member case +is not in a hot loop (called once per struct field access, not iterated). + +Further optimization potential remains: the remaining `set_expression` +calls (37% of `update_identifier` on CSmith) still do full rebuilds. +Reducing these would require restructuring how `dstringt` is constructed +to avoid intermediate `std::string` allocation — `dstringt` doesn't +support concatenation, so every new identifier must go through +`string_containert::get()`. + +### 2. Sharing tree destruction: `sharing_treet::remove_ref` — 9.9% + +Recursive reference-count decrement and deallocation of irept tree nodes. + +- **Root cause**: 63% of samples are self-recursive — deep irept trees + cause deep recursion in the destructor chain. 
+- **Key callers**: + - 5.7% from `renamedt` (SSA renaming creates/destroys expressions) + - 2.9% from `goto_symext::symex_function_call_symbol` + - 2.5% from `~multi_path_symex_only_checkert` (final cleanup) + - 2.4% from `simplify_exprt::simplify` +- **Source locations**: + - `src/util/message.h:303` (52.6% — likely inlining artifact) + - `src/util/typecheck.cpp:16` (10.5%) + - `src/util/std_expr.h:1334` (3.1% — in rename) +- **Source**: `src/util/sharing_node.h`, `src/util/irep.h` +- **Possible improvements**: + - Iterative (non-recursive) `remove_ref` using an explicit stack + - Deferred/batched destruction (arena-style) + - Reduce unnecessary temporary irept copies that trigger destruction + +### 3. Memory allocation: malloc/free/new — ~25% combined + +| Function | % | +|----------|---| +| malloc | 7.4% | +| _int_free | 6.6% | +| _int_malloc | 5.4% | +| cfree | 4.7% | +| malloc_consolidate | 4.0% | +| operator new | 2.5% | +| unlink_chunk | 1.7% | + +- **Root cause**: CBMC creates and destroys vast numbers of small objects + (irept nodes, expression trees, SSA steps). The sharing tree helps but + copy-on-write still triggers many allocations. +- **Possible improvements**: + - Custom allocator / object pool for irept nodes + - Reduce copies (more move semantics, fewer temporaries) + - This is largely a consequence of hotspots #2 and #7 — fixing those + would reduce allocation pressure + +### 4. irept::find — 5.4% + +Linear scan through `forward_list_as_mapt` to find a named sub-tree. 
+ +- **Key callers**: + - 22.6% from `simplify_exprt::simplify_node_preorder` (at `goto_program.h:274`) + - 19.7% from `constant_exprt::check` (at `irep.h:228`) + - 10.5% from `requires_renaming` + - 9.1% from `field_sensitivityt::get_fields` +- **Source**: `src/util/irep.h` +- **Possible improvements**: + - `constant_exprt::check` and `simplify_node_preorder` call `find` to + check expression types — could be short-circuited with cached type IDs + - `requires_renaming` does repeated type lookups — result could be cached + +### 5. Symbol table lookup: `next_unused_suffix` — 4.9% + +84.8% of `symbol_tablet::find` calls come from `symbol_table_baset::next_unused_suffix`. + +- **Root cause**: Generating unique symbol names by probing the symbol table + with incrementing suffixes. Each probe is a full hash table lookup. +- **Source location**: `src/util/expr.h:96` (in `next_unused_suffix`) +- **Source**: `src/util/symbol_table_base.h` +- **Possible improvements**: + - Maintain a per-prefix counter to generate the next suffix directly + without probing the table + - This is a clear algorithmic improvement — O(1) instead of O(n) probes + +### 6. irept::get — 4.8% + +Similar to `irept::find` but returns a value rather than a reference. + +- **Key callers**: + - 21.4% from `irept::get_bool` (at `std_expr.h:131`) + - 9.1% from `update_identifier` + - 6.5% from `renamedt` (rename) + - 5.5% from `build_ssa_identifier_rec` +- **Source**: `src/util/irep.h` +- **Possible improvements**: Same as #4 — reduce redundant lookups + +### 7. Copy-on-write detach: `sharing_treet::detach` — 4.6% + +Creates a private copy of a shared irept node before mutation. 
+ +- **Key callers**: + - 45.9% from `irept::add` (at `symex_dead.cpp:65`) + - 17.0% from `field_sensitivityt::apply` (at `std_types.h:149`) + - 10.8% from `irept::remove` (at `symex_dead.cpp:72`) + - 9.1% from `apply_to_objects_in_dereference` +- **Source**: `src/util/sharing_node.h` +- **Possible improvements**: + - `symex_dead.cpp:65,72` — the dead variable handler adds and removes + irept fields, triggering detach each time. Could batch mutations. + - `field_sensitivityt::apply` — could avoid unnecessary copies when + the result is immediately consumed + +### 8. SSA renaming map: `sharing_mapt::get_leaf_node` — 2.1% + +Hash-trie lookup in the SSA renaming map. + +- **Key callers**: 90.2% from `rename` (at `irep.h:228`) +- **Source**: `src/util/sharing_map.h` +- **Possible improvements**: + - The sharing_map is a hash-array-mapped trie. For the rename use case, + a flat hash map might be faster for small-to-medium maps. + +## Prioritized Optimization Plan + +Based on impact and feasibility: + +| Priority | Target | Impact | Effort | Approach | +|----------|--------|--------|--------|----------| +| **P1** | `next_unused_suffix` | 4.9% | Low | Per-prefix counter instead of probing | +| **P2** | SSA identifier rebuilding | 11.7% | Medium | Cache identifiers / avoid ostringstream | +| **P3** | `sharing_treet::remove_ref` | 9.9% | Medium | Iterative destruction with explicit stack | +| **P4** | `symex_dead.cpp` detach | 4.6% | Low | Batch add/remove to avoid double detach | +| **P5** | `constant_exprt::check` in simplifier | ~2% | Low | Short-circuit common cases | +| **P6** | `requires_renaming` caching | ~1% | Low | Cache result per type | + +P2 is the highest-impact target: the hashing investigation confirmed that +the 11.7% cost is from redundant `update_identifier` calls, not from hash +table configuration. P1 remains the best bang-for-buck at low effort. 
+ +## Post-Optimization Investigation (2026-03-11) + +After implementing the SSA identifier optimizations (P2), re-profiled to +assess remaining targets. + +### Updated profile (csmith_42, post-optimization) + +| Rank | Function | % | Notes | +|------|----------|---|-------| +| 1 | `string_containert` hash lookup | 11.1% | Volume of calls, not hash quality | +| 2 | `sharing_treet::remove_ref` | 7.2% | 63% self-recursive | +| 3 | `symbol_tablet::find` | 6.0% | `next_unused_suffix` now only 0.56% | +| 4 | malloc/free combined | ~18% | Consequence of irept allocation | +| 5 | `memcmp` | 3.5% | From `string_ptrt::operator==` | +| 6 | `irept::find` | 3.4% | Scattered callers | +| 7 | `irept::get` | 3.2% | Scattered callers | +| 8 | `sharing_treet::detach` | 2.7% | From `irept::add`/`remove` | +| 9 | `hash_string` | 2.1% | String interning hash function | +| 10 | `irept::operator==` | 2.1% | 57% self-recursive, 32% from `merge_irept` | + +### Investigation results for remaining targets + +**P1: `next_unused_suffix`** — Now only 0.56% (was 4.9% in original profile). +The original measurement was inflated by the `string_containert::get` cost +that dominated the `symbol_tablet::find` samples. After the SSA identifier +optimization reduced interning calls, `next_unused_suffix` is no longer a +significant bottleneck. `symbol_table_buildert` already provides a suffix +cache for the cases that need it. + +**P3: `sharing_treet::remove_ref`** — Tested the existing +`nonrecursive_destructor` (already implemented but disabled via `#if 0`). +Result: **15% SLOWER** (9.74s vs 8.44s on csmith_42). The explicit +`std::vector` stack with `reserve` and iteration is more expensive than +the recursive version, which benefits from the CPU call stack being in +L1 cache. The nonrecursive version is only useful for avoiding stack +overflow on extremely deep trees, not for performance. 
+ +#### Implemented: irept union optimization (issue #7960) + +Cherry-picked thomasspriggs' branch (`tas/irep_optimisation1_unions`) +which makes `irept` hold a union of a pointer or an id, reducing memory +usage for leaf nodes. Commits `81005555c6`, `92464ad78a` (originally +`ccd06c39f8`, `4a1b67f8bc`). Cherry-picked cleanly onto current develop. + +| Benchmark | Without union | With union | Speedup | +|-----------|-------------|------------|---------| +| linked_list | 1.44s | 1.37s | **4.9%** | +| array_ops | 2.56s | 2.49s | **2.7%** | +| csmith_42 | 8.49s | 8.12s | **4.4%** | +| csmith_1111111111 | 15.75s | 15.37s | **2.4%** | + +Consistent 2.4-4.9% improvement from reduced memory footprint and +fewer allocations for leaf nodes. + +**P4: `symex_dead.cpp` detach** — The source locations from addr2line +(`symex_dead.cpp:65,72`) were inlining artifacts. The actual `detach` +cost is spread across many callers (45.9% from `irept::add`, 17% from +`field_sensitivityt::apply`, 10.8% from `irept::remove`). No single +call site dominates enough for a targeted fix. + +**P5: `constant_exprt::check`** — Only 0.32% of time is in `check` +itself. The 19.7% of `irept::find` attributed to `check` is because +`check` calls `find(ID_value)`. The `check` functions are validation +that runs on every `to_constant_expr` cast even in Release builds +(INVARIANT is not compiled out by default). Disabling checks would +require `CPROVER_INVARIANT_DO_NOT_CHECK` which is not recommended. + +### Conclusion + +The remaining hotspots are dominated by fundamental irept operations +(`find`, `get`, `add`, `remove_ref`, `detach`, `operator==`) that are +called millions of times from many different sites. These are inherent +to the sharing tree data structure and cannot be optimized by targeting +individual call sites. Significant further improvement would require +either: + +1. 
**Reducing the number of irept operations** — e.g., by caching + intermediate results, avoiding unnecessary copies, or restructuring + algorithms to batch mutations. + +2. **Changing the irept data structure** — e.g., using a flat hash map + instead of `forward_list_as_mapt` for named sub-trees, or using + a different representation for frequently-accessed fields. + +3. **Reducing string interning volume** — the 11.1% in + `string_containert::get` is from the remaining `set_expression` + calls (37% of `update_identifier`) in `field_sensitivity.cpp`'s + array element loop. Caching per-index identifiers there could help. + +## Raw Data + +Full results in `profile-results/results.json` and per-benchmark flamegraphs +in `profile-results/<benchmark>/flamegraph.svg`. + +## Alternative Allocator Investigation (2026-03-11) + +Tested drop-in replacement allocators via `LD_PRELOAD` to address the +~18% of samples in malloc/free/new. + +### Results + +| Benchmark | glibc | tcmalloc | jemalloc | tcmalloc Δ | jemalloc Δ | +|-----------|-------|----------|----------|-----------|-----------| +| linked_list | 1.44s | 1.18s | 1.15s | **-18%** | **-20%** | +| array_ops | 2.56s | 1.91s | 2.09s | **-25%** | **-18%** | +| csmith_42 | 8.50s | 6.41s | 6.50s | **-25%** | **-24%** | +| csmith_1111111111 | 15.71s | 12.73s | 12.01s | **-19%** | **-24%** | + +Both allocators give **18-25% speedup** with zero code changes. + +### Why it helps + +CBMC's irept sharing tree creates and destroys millions of small objects +(tree_nodet, ~64-128 bytes each). glibc's malloc uses per-thread arenas +with bins, but the consolidation and free-list management overhead is +significant for this pattern. tcmalloc and jemalloc use thread-local +caches and size-class segregation that are much faster for small objects. + +### Usage + +```bash +# Via LD_PRELOAD (no rebuild needed) +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 cbmc ... +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 cbmc ... 
+ +# Install +apt-get install libtcmalloc-minimal4 # or libjemalloc2 +``` + +### Recommendation + +Add a CMake option to link against tcmalloc or jemalloc. This is the +single largest performance improvement found in this investigation — +larger than all the SSA identifier optimizations combined. + +#### Implemented: CMake allocator auto-detection + +Added `-Dallocator=auto|tcmalloc|jemalloc|system` CMake option. When +set to `auto` (the default), CMake searches for tcmalloc then jemalloc +and links the first one found. Commit `0da7f8ff4b`. + +Install: `apt-get install libgoogle-perftools-dev` (or `libjemalloc-dev`) + +#### tcmalloc vs jemalloc comparison + +Both give 18-25% speedup. Key differences: +- **tcmalloc**: More consistent (no warmup outliers), slightly faster on + array-heavy workloads. Known issue: memory footprint can grow over time + in long-running processes — not a concern for CBMC which runs and exits. +- **jemalloc**: Better memory efficiency for long-running servers. Has a + warmup cost visible in the first run. +- **mimalloc** (Microsoft): Only ~1% improvement — its advantages are for + multi-threaded workloads. +- **glibc tuning** (`MALLOC_ARENA_MAX`, `MALLOC_TRIM_THRESHOLD`): 1-3%. +- **macOS**: System allocator already uses magazine-based allocation + similar to tcmalloc; improvement would be smaller. +- **Windows**: Default heap allocator with LFH is reasonably fast. + +Recommendation: tcmalloc for CBMC (single-threaded, short-lived). + +#### Implemented: CI tcmalloc installation + +Added `libgoogle-perftools-dev` to all Linux CI workflows that build +CBMC with CMake. Commit `28824f9f07`. Workflows updated: +build-and-test-Linux, pull-request-checks, performance, coverage, +profiling. + +#### Perf event selection: cycles vs cpu-clock + +Tested both events at 997 Hz sampling. Results are nearly identical for +function-level profiling (same ranking, same percentages within noise). 
+`cycles` uses hardware PMU counters (more precise for instruction-level +analysis); `cpu-clock` is a software timer (always available, works in +CI containers/VMs). The profiling tool uses `cycles` as default with +automatic `cpu-clock` fallback. + +## Combined Optimization Summary + +**Note:** This section reflects early results (2026-03-11) before the +field_sensitivity cache was found to have a correctness bug and before +the pool allocator and get_fields hoisting were added. See +"Comprehensive 13-Benchmark Results (2026-03-12)" for final numbers. + +| Optimization | Speedup (csmith_42) | Cumulative | +|-------------|--------------------:|----------:| +| Baseline | — | 8.89s | +| SSA string concat | 3.5% | 8.58s | +| set_level_2 opt | 0.4% | 8.55s | +| set_level_0/1 opt | 1.3% | 8.44s | +| field_sensitivity cache | ~0% (array: 4.8%) | 8.49s | +| **tcmalloc** | **24.6%** | **6.41s** | +| **Total** | **27.9%** | **6.41s** | + +On the array-heavy benchmark, the combined effect is even larger: +3.13s → 1.91s (**39% faster**). + +## Final Combined Results (2026-03-11) + +**Note:** This section reflects early results before the pool allocator, +get_fields hoisting, and get_new_name cache were added. The +field_sensitivity cache listed below was later found to have a +correctness bug and was removed. See "Comprehensive 13-Benchmark Results +(2026-03-12)" for final numbers. 
+ +All optimizations combined on `experiment/hash-optimization` branch: +- SSA identifier string concat (replace ostringstream) +- SSA set_level_0/1/2 suffix appending (avoid full rebuild) +- field_sensitivity array loop caching +- irept union optimization (from issue #7960) +- tcmalloc linked via CMake + +### Performance vs baseline (develop, glibc malloc) + +| Benchmark | Baseline | Optimized | Speedup | +|-----------|----------|-----------|---------| +| linked_list | 1.49s | 1.08s | **27.5%** | +| array_ops | 3.08s | 1.89s | **38.6%** | +| csmith_42 | 8.83s | 6.26s | **29.1%** | +| csmith_1111111111 | 16.81s | 12.44s | **26.0%** | + +### New profile (csmith_42, all optimizations + tcmalloc) + +| Rank | Function | % | Change vs original | +|------|----------|---|-------------------| +| 1 | `string_containert` hash lookup | 13.2% | Was 10.0% — now larger share because other costs reduced | +| 2 | `sharing_treet::remove_ref` | 6.1% | Was 9.9% — irept union helped | +| 3 | tcmalloc new/delete | 7.8% | Was 18% glibc — 2.3x reduction | +| 4 | `irept::find` | 4.7% | Was 5.4% | +| 5 | `memcmp` | 4.5% | Was 3.5% — larger share | +| 6 | `irept::get` | 3.4% | Was 4.8% | +| 7 | `sharing_treet::detach` | 3.0% | Was 4.6% | +| 8 | `symbol_tablet::find` | 3.0% | Was 4.9% | +| 9 | `irept::operator==` | 2.1% | Unchanged | +| 10 | `hash_string` | 1.9% | Was 1.7% | + +### Remaining avenues to explore + +1. **String interning volume** (13.2%): Still the #1 hotspot. The remaining + `set_expression` calls in `field_sensitivity.cpp` (member expressions, + not just arrays) still trigger full identifier rebuilds. Also, + `build_ssa_identifier_rec` is called from the constructor for every new + `ssa_exprt`. Caching the base identifier per-symbol could help. + +2. **`memcmp` in string interning** (4.5%): The `string_ptrt::operator==` + does `memcmp` on every hash table probe. If we stored a pre-computed + hash alongside the string pointer, we could skip `memcmp` when hashes + differ. 
However, `std::unordered_map` already does this internally. + The cost is from hash collisions in the same bucket. + +3. **`irept::find` / `irept::get`** (8.1% combined): These do linear scans + through `forward_list_as_mapt`. Replacing this with a flat hash map or + small sorted array for named sub-trees could help, but is a major + structural change to irept. + +4. **`merge_irept::merged`** (1.8%): Expression merging during symex. + Could benefit from better hash caching. + +5. **`sharing_mapt::get_leaf_node`** (1.3%): The SSA renaming map uses a + hash-array-mapped trie. A flat hash map might be faster for the typical + map sizes in symex. + +### Assessment + +The low-hanging fruit has been picked. The remaining hotspots are either: +- **Fundamental data structure costs** (irept find/get, sharing_tree) that + require structural changes to improve +- **Algorithmic** (string interning volume) that require deeper changes to + how SSA identifiers are managed +- **Already well-optimized** by tcmalloc (allocation is now 7.8% vs 18%) + +The most promising remaining avenue is reducing string interning volume +further — specifically, avoiding `string_containert::get` calls for strings +that are already `dstringt` values. This would require changes to how +`ssa_exprt` stores and updates its identifier. + +## Remaining Optimization Plans + +### Plan A: Reduce string interning volume (13.2%) + +**Goal**: Avoid calling `string_containert::get()` for strings that are +already interned (i.e., already `dstringt` values). + +**Approach**: The hot path is `set_expression` in `field_sensitivity.cpp` +which calls `update_identifier` → `build_identifier` → constructs +`irep_idt(std::string)` → `string_containert::get()`. Instead of building +a `std::string` and interning it, build the identifier from existing +`irep_idt` parts using a concatenation that produces an `irep_idt` directly. 
+ +**Investigation outcome**: Attempted caching for member expressions +(appending `..component` before level suffixes). No measurable improvement +because the member case is not in a hot loop. The remaining `set_expression` +calls (37% of `update_identifier`) are from diverse call sites without a +single dominant pattern. Further reduction would require adding concatenation +support to `dstringt` itself (which currently only supports construction +from `std::string` via `string_containert::get()`). + +**Estimated impact**: 3-5% (reducing 13.2% by ~30-40%) +**Risk**: High — requires changes to core `dstringt`/`string_containert` +**Status**: Partially explored, diminishing returns + +### Plan B: Replace `forward_list_as_mapt` in irept (8.1%) + +**Goal**: Speed up `irept::find()` and `irept::get()` which do linear +scans through a linked list of named sub-trees. + +**Approach**: Replace `forward_list_as_mapt` with a +small flat sorted array or a small hash map. Most irept nodes have +0-5 named sub-trees, so a linear scan of a contiguous array would be +faster than a linked list due to cache locality. + +**Estimated impact**: 3-5% (reducing 8.1% by ~40-60%) +**Risk**: High — `forward_list_as_mapt` is used throughout irept + +### Plan C: Improve `merge_irept::merged` (1.8%) + +**Goal**: Speed up expression merging during symex. + +**Approach**: `merge_irept::merged` uses `irept::operator==` (which is +56% self-recursive) to check if an expression is already in the merge +set. Pre-computing and caching hash values for irept nodes would allow +skipping the deep equality check when hashes differ. + +**Estimated impact**: 1-2% +**Risk**: Low — isolated change in merge_irept + +### Plan D: Optimize `sharing_mapt::get_leaf_node` (1.3%) + +**Goal**: Speed up SSA renaming map lookups. + +**Approach**: The sharing_map is a hash-array-mapped trie optimized for +persistent/shared maps. 
For the L2 renaming use case where maps are +typically small and not heavily shared, a flat `std::unordered_map` +might be faster. Profile to confirm. + +**Estimated impact**: 0.5-1% +**Risk**: Medium — sharing_map is used for path merging + + +## Verification of Claims (2026-03-11) + +All claims were re-verified using a broader benchmark suite (6 benchmarks, +5 runs each, median reported). Builds were verified via `ldd` to confirm +tcmalloc presence/absence. tcmalloc was tested via `LD_PRELOAD` on builds +that don't have the allocator CMake option. + +### Methodology correction + +Earlier verification incorrectly showed tcmalloc at 0.1% because +`-Dallocator=system` was silently ignored by the baseline CMakeLists.txt +(which doesn't have the allocator option). Both "glibc" and "tcmalloc" +baseline builds were actually identical. Re-verified using `LD_PRELOAD`. + +### field_sensitivity array cache (3a95db52b3) — CORRECTNESS BUG + +This commit bypasses `set_expression` and manually constructs SSA +identifiers. It produces **different SSA formulas** than the standard +path (step count changes from 1950 to 1555 on string_ops). This causes +massive regressions: + +| Benchmark | Without | With | Change | +|------------|---------|---------|--------| +| string_ops | 0.245s | 4.013s | **16x slower** | +| matrix | 1.431s | 10.56s | **7x slower** | + +The commit has been moved to the top of the branch for separate debugging. +All other optimizations verified without this commit present. 
+
+### Verified results at HEAD~2 (all opts except field_sensitivity cache and free-list)
+
+| Benchmark | Baseline (glibc) | +SSA opts (glibc) | +tcmalloc (preload) | +Both (preload) | SSA Δ | tc Δ | Combined |
+|--------------|----------|-----------|-----------|----------|--------|--------|----------|
+| linked_list | 0.554s | 0.518s | 0.471s | 0.451s | +6.5% | +15.0% | +18.6% |
+| array_ops | 3.081s | 2.631s | 2.405s | 2.037s | +14.6% | +21.9% | +33.9% |
+| dlinked_list | 0.971s | 0.918s | 0.795s | 0.760s | +5.5% | +18.1% | +21.7% |
+| matrix | 1.661s | 1.431s | 1.293s | 1.103s | +13.8% | +22.2% | +33.6% |
+| tree | 2.512s | 2.382s | 1.978s | 1.895s | +5.2% | +21.3% | +24.6% |
+| string_ops | 0.259s | 0.245s | — | 0.263s | +5.4% | — | -1.5% |
+| **TOTAL (5)**| 8.779s | 7.880s | 6.942s | 6.246s |**+10.2%**|**+20.9%**|**+28.9%**|
+
+No regressions on any benchmark.
+
+### Free-list pool allocator — re-evaluated (glibc only, 5 runs, median)
+
+Earlier testing on a too-small benchmark showed only 1.4%. Re-tested
+with the full suite (without tcmalloc):
+
+| Benchmark | Without | With | Speedup |
+|--------------|---------|--------|---------|
+| linked_list | 0.520s | 0.512s | 1.5% |
+| array_ops | 2.644s | 2.503s | **5.3%**|
+| dlinked_list | 0.912s | 0.900s | 1.3% |
+| matrix | 1.435s | 1.356s | **5.5%**|
+| tree | 2.379s | 2.337s | 1.8% |
+| string_ops | 0.243s | 0.235s | 3.3% |
+| **TOTAL** | 8.133s | 7.843s | **3.6%**|
+
+The free-list gives a consistent 3.6% speedup with glibc, concentrated
+on symex-heavy workloads. With tcmalloc, the effect is 0.7% (tcmalloc
+already has thread-local size-class caches).
+
+**Assessment**: The 31% claim from the commit message is not reproduced
+(likely measured on a different/larger workload or with confounding
+factors). The actual benefit is 3.6% with glibc. 
This is meaningful for
+platforms without tcmalloc (macOS, Windows) but adds complexity:
+- Memory is never returned to the OS
+- Interacts poorly with sanitizers (ASan, valgrind)
+- `thread_local` overhead on single-threaded CBMC
+- Redundant when tcmalloc is available
+
+### tcmalloc — CONFIRMED at 15-22%
+
+Verified via `LD_PRELOAD` on clean baseline builds confirmed to have
+no tcmalloc (via `ldd`). The benefit is consistent across all non-trivial
+benchmarks and applies to both baseline and SSA-optimized builds.
+
+SAT solver benefit from tcmalloc is minimal (~1.3% for CaDiCaL, ~1.2%
+for MiniSat2) because solvers use fewer, larger allocations. Confirmed
+that solvers do use tcmalloc (verified via `LD_DEBUG=bindings`).
+
+### SSA identifier optimizations — CONFIRMED at 5-15%
+
+The ostringstream→string concat and set_level suffix appending give
+10.2% overall, with 14.6% on array_ops and 13.8% on matrix. The
+benefit is real and concentrated on symex-heavy workloads.
+
+### Summary of claim accuracy
+
+| Optimization | Claimed | Verified | Status |
+|-------------|---------|----------|--------|
+| SSA string concat | 3.5-11.2% | 5-15% | ✅ Confirmed |
+| set_level_0/1/2 | 1.3% | included above | ✅ Plausible |
+| field_sensitivity cache | 4.8% | **7-16x regression** | ❌ Bug |
+| irept union | 2.4-4.9% | not isolated | ⚠️ Included in HEAD~2 |
+| tcmalloc | 18-25% | 15-22% | ✅ Confirmed |
+| Free-list (glibc) | 31% | 3.6% | ⚠️ Overstated |
+| Free-list (tcmalloc) | 2% | 0.7% | ⚠️ See definitive analysis |
+| get_new_name cache | 4.9% → 0.56% | not exercised | ⚠️ Unverified |
+
+## Post-Verification Profile: Best Configuration (2026-03-11)
+
+Configuration: HEAD~2 (SSA opts + irept union + get_new_name cache + tcmalloc)
+Benchmarks: 6 (linked_list, array_ops, dlinked_list, matrix, tree, heavy_array)
+Total samples: 1,738
+
+### Timings
+
+| Benchmark | Symex | Convert SSA | Total | Steps |
+|-------------|---------|-------------|--------|--------|
+| linked_list | 
0.153s | 0.302s | 0.425s | 4,433 | +| array_ops | 2.016s | 0.137s | 2.015s | 16,448 | +| dlinked_list| 0.271s | 0.488s | 0.735s | 8,242 | +| matrix | 1.180s | 0.000s | 1.080s | 4,148 | +| tree | 0.695s | 0.729s | 1.855s | 27,789 | +| heavy_array | 1.695s | 0.000s | 1.573s | 5,176 | + +### Hotspot Summary by Category + +**SSA identifier pipeline: ~14%** +- `_Hashtable::_M_find_before_node` (string interning): 5.6% +- `build_ssa_identifier_rec`: 3.2% +- `hash_string`: 1.8% +- `_Hashtable::find`: 1.3% +- `update_identifier`: 0.8% +- `ssa_exprt::remove_level_2`: 0.7% +- `string::_M_append`: 2.9% (from identifier building) + +**irept operations: ~22%** +- `irept::find`: 8.7% +- `sharing_treet::remove_ref`: 6.9% +- `irept::get`: 6.6% +- `irept::add(id, irept)`: 3.0% +- `sharing_treet::detach`: 2.8% +- `irept::add(id)`: 1.8% +- `irept::operator==`: 0.9% + +**Allocation (tcmalloc): ~14%** +- `operator new[]`: 8.9% +- `operator delete[]`: 3.6% +- `operator delete[]` (sized): 1.8% + +**Simplifier: ~2.5%** +- `simplify_node_preorder`: 1.5% +- `simplify_node`: 1.0% + +**Other CBMC: ~5%** +- `constant_exprt::check`: 2.1% +- `field_sensitivityt::get_fields`: 0.9% +- `field_sensitivityt::apply`: 0.7% +- `binary_exprt constructor`: 0.9% +- `constant_exprt constructor`: 0.8% + +### Dominant Call Chain + +Nearly all hotspots converge on a single call chain: + +``` +symex_step → execute_next_instruction → {symex_assign, symex_assert, symex_goto} + → clean_expr → dereference + → field_sensitivityt::apply (recursive, 4-5 levels deep) + → field_sensitivityt::get_fields (recursive for nested arrays/structs) + → update_identifier → build_ssa_identifier_rec + → string_containert::get → hash_string → _Hashtable::find + → irept::find, irept::get (for type/expression lookups) + → simplify_exprt::simplify (for index expressions) + → simplify_node_preorder → irept::find +``` + +The `field_sensitivityt::apply` function is called recursively 4-5 times +per expression during dereference, and 
`get_fields` recurses for each +array dimension. Each iteration calls `update_identifier` which rebuilds +the SSA identifier string and interns it via `string_containert::get`. + +### Key Observations + +1. **field_sensitivity dominates everything.** The recursive apply/get_fields + chain is the root cause of most hotspots. It drives: + - All SSA identifier rebuilding (14%) + - Most irept::find/get calls (through type lookups in get_fields) + - Most allocation (through expression copying in apply) + - Most simplifier calls (through index simplification in get_fields) + +2. **irept::find is called from simplify_node_preorder** which is called + from within field_sensitivityt::apply's simplification of index + expressions. The simplifier does deep recursive descent, calling + `irept::find(ID_type)` at each level to check expression types. + +3. **tcmalloc allocation is 14%** — down from 25% with glibc, but still + significant. The `operator new[]` calls are from `std::vector` growth + in irept's `sub` member (which stores unnamed children). + +4. **`constant_exprt::check` at 2.1%** is validation that runs on every + `to_constant_expr()` cast. It calls `irept::find(ID_value)`. This is + pure overhead in Release builds but cannot be disabled without + `CPROVER_INVARIANT_DO_NOT_CHECK`. + +### Actionable Next Steps + +**P1: Reduce field_sensitivityt::apply recursion depth** (est. 5-10%) +The 4-5 levels of recursive `apply` calls during dereference are +excessive. Each level copies expressions and triggers identifier +rebuilds. Investigate whether the recursion can be flattened or +whether intermediate results can be cached. +Source: `src/goto-symex/field_sensitivity.cpp` + +**P2: Cache type lookups in simplifier** (est. 2-3%) +`simplify_node_preorder` calls `irept::find(ID_type)` at every +recursion level. Since types don't change during simplification, +the type could be passed as a parameter instead of re-looked-up. 
+Source: `src/util/simplify_expr.cpp` + +**P3: Reduce vector reallocation in irept** (est. 2-4%) +The 8.9% in `operator new[]` is largely from `std::vector` growth +in irept's `sub` member. Pre-sizing or using small-buffer optimization +could help. +Source: `src/util/irep.h` + +**P4: Avoid redundant `constant_exprt::check`** (est. 1-2%) +The 2.1% in `check` is from validation in `to_constant_expr()`. +Consider a `to_constant_expr_unchecked()` for hot paths where the +type is already known. +Source: `src/util/std_expr.h` + +### Investigation: field_sensitivityt::apply optimization (2026-03-11) + +**Attempted**: Cache array identifier prefix in `get_fields` array loop. +For each array element, the SSA identifier has the form +`prefix[[bvrep_of_i]]suffix` where prefix and suffix are identical. +The first element uses canonical `set_expression` to establish the +format, then subsequent elements derive identifiers by string replacement. + +**Correctness**: Verified — all 7 benchmarks produce identical step +counts to baseline. No invariant violations. (The previous attempt in +commit `3a95db52b3` was buggy because it used decimal `std::to_string(i)` +instead of the bvrep value, and bypassed `set_expression` entirely.) + +**Performance** (5 runs, median, with tcmalloc): + +| Benchmark | Without | With | Δ | +|-------------|---------|--------|--------| +| linked_list | 0.421s | 0.422s | -0.2% | +| array_ops | 1.972s | 2.005s | -1.7% | +| dlinked_list| 0.729s | 0.726s | +0.4% | +| string_ops | 0.204s | 0.204s | +0.0% | +| matrix | 1.062s | 1.052s | +0.9% | +| tree | 1.851s | 1.846s | +0.3% | +| heavy_array | 1.537s | 1.468s | **+4.5%** | +| **TOTAL** | 7.776s | 7.723s | **+0.7%** | + +**Assessment**: Only 0.7% total improvement. The identifier string +building is not the dominant cost in the array loop — the `from_integer` +call (which creates a `constant_exprt` and interns its bvrep value), +the `ssa_exprt` copy, and the recursive `get_fields` call dominate. 
+The optimization only helps on `heavy_array` (4.5%) which has large +arrays. **Not worth the added complexity.** + +**Root cause analysis**: The profiling shows that the cost is spread +across many small irept operations (find, get, add, detach, remove_ref) +that are fundamental to how CBMC represents and manipulates expressions. +There is no single bottleneck to eliminate — it's the cumulative cost +of millions of small operations on the sharing tree data structure. + +**Conclusion**: Further optimization of `field_sensitivityt::apply` +requires either: +1. Structural changes to reduce the number of `apply` calls (e.g., + caching results, lazy evaluation) +2. Changes to the irept data structure itself (e.g., replacing + `forward_list_as_mapt` with a flat hash map for named sub-trees) +3. Reducing the depth of `field_sensitivityt::apply` recursion in + `symex_dereference.cpp` (currently 4-5 levels deep) + +These are all high-risk, high-effort changes that go beyond the scope +of incremental optimization. + +### Implemented: hoist loop invariants in get_fields (2026-03-11) + +Hoisted loop-invariant work out of both the struct and array loops: +- Prepare a template `ssa_exprt` with L2 already removed before the loop +- Cache `was_l2` flag (identical for all elements) +- For arrays: cache identifier prefix/suffix from first element's + canonical `set_expression`, derive subsequent identifiers by replacing + only the index portion (using bvrep values for correctness) + +This eliminates per-iteration: 1 `ssa_exprt` copy from the original +(now copies from the lighter template), 1 `get_level_2()` call, +1 `remove_level_2()` call, and for array elements i>0: 1 full +`update_identifier` → `build_ssa_identifier_rec` → `string_containert::get` +chain. 
+ +| Benchmark | Baseline | Optimized | Speedup | +|-------------|----------|-----------|---------| +| linked_list | 0.421s | 0.421s | +0.0% | +| array_ops | 1.972s | 1.924s | **+2.4%** | +| dlinked_list| 0.729s | 0.728s | +0.1% | +| string_ops | 0.204s | 0.203s | +0.5% | +| matrix | 1.062s | 1.002s | **+5.6%** | +| tree | 1.851s | 1.848s | +0.2% | +| heavy_array | 1.537s | 1.356s | **+11.8%** | +| **TOTAL** | 7.776s | 7.482s | **+3.8%** | + +Correctness verified: all 7 benchmarks produce identical step counts. + +## Post-Optimization Profile: After get_fields Hoisting (2026-03-11) + +Configuration: HEAD (all SSA opts + irept union + get_fields hoisting + tcmalloc) +Benchmarks: 6 heavy benchmarks × 5 runs each under single perf session +Total samples: ~38K (997 Hz, filtered to cbmc process) + +### Profile by Category + +| Category | % | Key functions | +|----------|---|---------------| +| ostream I/O | 20.9% | xsputn, ostream_insert, num_put, sentry | +| irept operations | 19.2% | operator==, find, get, add, hash, compare, detach, remove_ref, merged | +| tcmalloc | 10.5% | new[], delete[], internal | +| DIMACS I/O + CNF | 6.2% | write_dimacs_clause, process_clause, lcnf | +| memmove | 1.9% | memcpy/memmove | +| field_sensitivity | **1.6%** | apply, ssa_check, requires_renaming, string intern | +| other CBMC | 0.8% | convert_bv, simplify_node | + +### Key Changes vs Previous Profile + +1. **field_sensitivity dropped from ~14% to 1.6%** — the loop invariant + hoisting and SSA identifier optimizations were effective. + +2. **irept::operator== is now #1 CBMC function at 7.5%** (was ~1%). + Called from `merge_irept::merged` → `symex_target_equationt::merge_ireps`. + This is expression deduplication during symex. 56% of samples are + self-recursive (deep tree comparison). + +3. **ostream I/O is 21%** — this is a benchmarking artifact from + `--dimacs --outfile /dev/null` which still formats all clauses as text. 
+ In real usage with a SAT solver, this cost would be replaced by solver + time. Future profiling should use `--stop-on-fail` with actual solving + to get a realistic profile. + +4. **DIMACS CNF generation is 6.2%** — `process_clause`, `write_dimacs_clause`, + `lcnf`. This is real work that would also happen with a SAT solver + (clause generation), though the text formatting part is artificial. + +### Top CBMC Functions + +| Rank | Function | % | Caller | +|------|----------|---|--------| +| 1 | irept::operator== | 7.50% | merge_irept::merged (expression dedup) | +| 2 | sharing_treet::remove_ref | 2.59% | Scattered (destruction) | +| 3 | cnft::process_clause | 2.35% | CNF clause generation | +| 4 | irept::get | 2.22% | Scattered | +| 5 | dimacs_cnft::write_dimacs_clause | 2.19% | DIMACS output (artifact) | +| 6 | irept::find | 1.86% | Scattered | +| 7 | sharing_treet::detach | 1.28% | Copy-on-write | +| 8 | merge_irept::merged | 1.26% | Expression dedup | +| 9 | irept::hash | 1.18% | merge_irept hash table | +| 10 | cnf_clause_listt::lcnf | 1.13% | CNF clause list | + +### Actionable Observations + +**merge_irept is the new dominant CBMC hotspot** (operator== 7.5% + +merged 1.3% + hash 1.2% = ~10%). It deduplicates expressions by +hashing and deep comparison. Potential optimizations: +- Cache hash values in irept nodes (HASH_CODE is already enabled but + `hash_code` is only set lazily and cleared on mutation) +- Use hash comparison as a fast-reject before deep operator== +- Consider whether merge_ireps is called too frequently + +**The --dimacs benchmark methodology inflates I/O costs.** The 27% +in ostream + DIMACS I/O is an artifact. For future profiling, use +actual SAT solving (e.g., `--sat-solver cadical --stop-on-fail`) +to get a realistic profile of the full pipeline. + +**field_sensitivity is no longer a bottleneck** at 1.6%. The loop +invariant hoisting was effective. 
+ +### Hash function comparison (2026-03-11) + +Tested all three irep hash functions from `src/util/irep_hash.h`: +BASIC (rotate-7 XOR), MURMURHASH2A, and MURMURHASH3. + +| Benchmark | BASIC | MURMUR2A | MURMUR3 | 2A vs B | 3 vs B | +|-------------|--------|----------|---------|---------|--------| +| array_ops | 1.923s | 1.930s | 1.928s | -0.4% | -0.3% | +| matrix | 1.001s | 1.001s | 1.007s | +0.0% | -0.6% | +| tree | 1.846s | 1.863s | 1.864s | -0.9% | -1.0% | +| heavy_array | 1.356s | 1.358s | 1.358s | -0.1% | -0.1% | +| linked_list | 0.421s | 0.424s | 0.423s | -0.7% | -0.5% | +| dlinked_list| 0.730s | 0.733s | 0.731s | -0.4% | -0.1% | +| **TOTAL** | 7.277s | 7.309s | 7.311s | **-0.4%** | **-0.5%** | + +**Conclusion**: No meaningful difference. BASIC is marginally fastest +(0.4-0.5%) but within noise. The hash function is not a bottleneck — +the cost is in the number of hash operations and the deep equality +comparisons that follow hash collisions, not in the hash computation +itself. This is consistent with the earlier finding that `irept::hash` +is only 1.2% of total time. + +Instrumented with `IREP_HASH_STATS` to count actual operator== calls: + +| Benchmark | hash calls | BASIC cmp | MURMUR2A cmp | MURMUR3 cmp | Δ | +|-------------|-----------|------------|-------------|-------------|---| +| array_ops | 760,484 | 2,712,855 | 2,712,855 | 2,712,855 | 0 | +| tree | 1,103,385 | 5,274,971 | 5,272,397 | 5,272,397 | -2,574 | +| heavy_array | 172,870 | 1,061,894 | 1,061,894 | 1,061,894 | 0 | +| linked_list | 192,219 | 5,268,225 | 5,268,165 | 5,268,165 | -60 | +| **TOTAL** | 2,228,958 | 14,317,945 | 14,315,311 | 14,315,311 | **-2,634 (-0.018%)** | + +The Murmur variants produce 2,634 fewer operator== calls out of 14.3M +(0.018% reduction). This is negligible and explains why the runtime +difference is within noise. + +Notable: the ratio of comparisons to hashes is 6.4:1, meaning +`merge_irept::merged` does ~6 equality checks per hash lookup on +average. 
This suggests the hash table has significant collision chains +or many structurally similar expressions. However, since the Murmur +hashes don't reduce this ratio, the collisions are likely from +genuinely equal expressions (sharing the same hash bucket), not from +poor hash distribution. + +### Investigation: reducing operator== calls (2026-03-11) + +**Context**: `irept::operator==` accounts for 7.5% of total time, called +14.3M times across 4 benchmarks. 67.9% are sharing hits (free), 27.7% +are deep comparisons returning true, 3.5% are deep comparisons returning +false, 0.9% are hash rejects. + +**Detailed breakdown** (with IREP_HASH_STATS instrumentation): + +| Category | Count | % | Cost | +|----------|-------|---|------| +| Sharing hit (ptr ==) | 9,648,072 | 67.9% | Free | +| Hash reject (new) | 133,543 | 0.9% | Cheap | +| Deep compare (equal) | 3,932,670 | 27.7% | **Expensive** | +| Deep compare (not eq) | 501,226 | 3.5% | Expensive | + +**merge_irept statistics**: +- 744,362 calls to `merged()`, 82% cache hits +- 19.2 operator== calls per merge call (recursive sub-tree comparison) +- `linked_list` has 65:1 ratio (deep expression trees) + +#### Attempted: hash-based fast reject in operator== + +Added check: if both ireps have cached hash_code and they differ, +return false immediately. Result: **0.0% runtime impact**. The +`std::unordered_set` already filters by hash before calling operator==, +so the hash reject only fires for non-merge callers (133K out of 14.3M). + +#### Attempted: opportunistic sharing in operator== + +After a successful deep comparison (result == true), make both ireps +share the same data pointer. This would convert future comparisons +between the same pair into instant pointer checks. + +**Result: UNSAFE.** Changing the data pointer in operator== alters +copy-on-write semantics. An irep that was the sole owner of its data +(ref_count == 1, mutations are in-place) becomes a shared owner +(ref_count > 1, mutations trigger detach/copy). 
This changed the
+generated formula: `dlinked_list` went from 8,242 to 19,552 steps.
+
+#### Attempted: pointer-based fast lookup in merge_irept
+
+Added an `unordered_set` of data pointers (`known_pointers`) to skip the
+hash+equality check when an irep's data pointer is already known
+to be in the store.
+
+**Result: 1.3% SLOWER.** The overhead of maintaining the pointer set
+(hashing pointers, inserting, looking up) outweighs the savings.
+
+#### Analysis: why 3.9M deep-equal comparisons exist
+
+The deep-equal comparisons happen because:
+1. `merge_irept::merged` is called on every SSA step (744K calls)
+2. 82% of calls find the irep already in the store (cache hit)
+3. But `unordered_set::insert` must call operator== to verify the match
+4. operator== recurses into sub-trees (19 recursive calls on average)
+5. The sub-trees of the incoming irep may not share data pointers with
+   the store entries, even though they're structurally equal
+
+The fundamental issue is that `std::unordered_set` has no way to skip
+the equality check — it must verify that the hash match is not a
+collision. The only way to avoid this is to ensure the incoming irep
+shares data pointers with the store entry at every level, which would
+require the irep to have been constructed from previously-merged
+components.
+
+#### Conclusion
+
+The 7.5% cost of operator== in merge_irept is inherent to the
+hash-set-based deduplication approach. The only viable optimizations
+would be:
+1. **Reduce merge frequency**: Call merge_ireps less often (e.g., every
+   N steps instead of every step)
+2. **Structural change**: Replace the unordered_set with a data structure
+   that can verify membership without deep comparison (e.g., a trie
+   indexed by irep structure)
+3. 
**Ensure sharing**: Make the SSA step construction reuse previously- + merged ireps so that sub-tree pointers match store entries + +### Sketch: reducing merge frequency (2026-03-12) + +#### Current behavior + +Every SSA step method in `symex_target_equationt` (19 call sites) calls +`merge_ireps(SSA_step)` at the end. This merges 6-8 top-level ireps per +step (guard, ssa_lhs, ssa_full_lhs, original_full_lhs, ssa_rhs, +cond_expr, plus io_args and function_arguments), each of which recurses +into all sub-ireps. + +The `merge_irept::merged()` function works bottom-up: +1. Try to insert the irep into `irep_store` (an `unordered_set`) +2. If already present → return the existing entry (sharing hit) +3. If new → recursively merge all sub-ireps, then update the store entry + +The cost: 82% of `merged()` calls are cache hits, but each hit requires +`unordered_set::insert` → hash computation + `operator==` verification. +The `operator==` recurses into sub-trees, averaging 19 recursive calls +per top-level merge. + +#### Why batching could help + +The key insight: when merging step N, many of its sub-ireps are +identical to sub-ireps from step N-1 (same variable names, same types, +same guards). If step N-1 was already merged, its sub-ireps already +share data pointers with the store. But the sub-ireps of step N were +constructed independently — they have the same *content* but different +*pointers*. + +If we skip merging for K steps and then merge all K steps at once, the +merge store processes K×(6-8) top-level ireps. The first step's ireps +populate the store; subsequent steps' ireps find sharing hits. The total +number of `operator==` calls is the same, but: + +**The benefit is NOT from batching itself** — it's from the fact that +after merging step N, the *next* step N+1 is constructed using the +already-merged components from the symbol table and SSA state. 
If the +SSA construction reuses ireps that were previously merged, their +sub-tree pointers already match the store, making `operator==` a cheap +pointer comparison instead of a deep structural comparison. + +#### The real opportunity: ensure sub-tree sharing + +The actual optimization is not "merge less often" but "ensure that +ireps entering merge already share sub-tree pointers with the store." + +This happens naturally for some components: +- **Types**: The same `typet` objects are reused across steps (e.g., + `signedbv_typet(32)` for all `int` variables). Once merged, the + type's data pointer matches the store entry. +- **Variable names**: `irep_idt` values are interned strings — they + already have pointer equality. + +But it does NOT happen for: +- **SSA indices**: Each `ssa_exprt` has a unique L2 index suffix that + changes every step. The SSA expression is constructed fresh, so its + sub-tree pointers don't match the store. +- **Guards**: Guards accumulate conditions and are rebuilt each step. + +#### Concrete approach: deferred merge with sharing propagation + +``` +// Instead of merging every step: +void symex_target_equationt::merge_ireps(SSA_stept &step) +{ + merge_irep(step.guard); + merge_irep(step.ssa_lhs); + // ... etc +} + +// Merge every N steps, but propagate sharing: +void symex_target_equationt::maybe_merge_ireps(SSA_stept &step) +{ + ++steps_since_merge; + if(steps_since_merge < MERGE_INTERVAL) + return; + steps_since_merge = 0; + + // Merge all unmerged steps + for(auto it = last_merged_step; it != SSA_steps.end(); ++it) + merge_ireps(*it); + last_merged_step = SSA_steps.end(); +} +``` + +**Expected impact**: Minimal. The batching itself doesn't reduce the +number of `operator==` calls — it just delays them. The sub-ireps of +each step are still constructed independently, so they still won't +share pointers with the store. + +The only way batching helps is if it reduces *peak memory* by allowing +the store to grow more slowly. 
But the store's purpose IS to reduce +memory, so delaying merges increases memory usage. + +#### Verdict + +Reducing merge frequency is unlikely to yield significant speedup +because: +1. The `operator==` cost is per-irep, not per-batch — batching doesn't + reduce the total number of comparisons +2. The sub-ireps entering merge are constructed fresh regardless of + batch size — they won't share pointers with the store +3. Delaying merges increases peak memory (the opposite of merge's goal) + +The only viable path to reducing `operator==` cost in merge is to +ensure that ireps entering merge already have sub-tree pointers that +match the store. This would require changes to how SSA expressions +are constructed — e.g., caching and reusing the merged type/guard +components when building new SSA steps. + + +## Re-Profiling and Re-Verification (2026-03-12) + +### Methodology improvement: goto-cc pre-compilation + +Switched to compiling benchmarks to goto binaries via `goto-cc` before +profiling. This eliminates C preprocessing variability: +- Range: ±0.007-0.017s (.gb) vs ±0.285-0.811s (.c) +- Speedup: 2.2% from skipping in-process C parsing +- Profile content is identical (cc1 runs as subprocess) + +Updated `scripts/profiling/runner.py` to auto-compile .c to .gb. 
+ +### Corrected benchmark results (median of 5, goto binaries) + +Baseline: develop `b191cc1dac`, glibc malloc +HEAD: all code optimizations (SSA string concat, set_level_0/1/2, +irept union, pool allocator, get_new_name cache, field_sensitivity +hoisting), glibc malloc + +| Benchmark | Baseline | HEAD | Speedup | +|-------------|----------|--------|---------| +| linked_list | 0.522s | 0.400s | 23.4% | +| array_ops | 3.063s | 1.900s | 38.0% | +| dlinked_list| 0.943s | 0.705s | 25.2% | +| string_ops | 0.242s | 0.193s | 20.2% | +| matrix | 1.654s | 0.989s | 40.2% | +| tree | 2.493s | 1.826s | 26.8% | +| heavy_array | 2.533s | 1.316s | 48.0% | +| **TOTAL** |**11.450s**|**7.329s**|**36.0%**| + +Correctness verified: DIMACS output matches exactly for all benchmarks. + +**Note:** This table measures code optimizations only (glibc malloc). +With tcmalloc added, the total speedup is ~31% — see "Comprehensive +13-Benchmark Results (2026-03-12)" for the full picture. + +### tcmalloc re-evaluation — RETRACTED + +**The measurements in this section were incorrect.** The HEAD binary had +tcmalloc linked via CMake (`-Dallocator=auto`), so the LD_PRELOAD test +was comparing tcmalloc vs tcmalloc, not tcmalloc vs glibc. The "0% +tcmalloc benefit on HEAD" was an artifact of this error. + +See the "Definitive Allocator Analysis (2026-03-12)" section below for +correct measurements using binaries built with `-Dallocator=system`. + +### Current profile (HEAD, all optimizations) + +**Note:** This profile was taken before changes 1 and 2 (unordered_set, +value_sett::to_expr) and was dominated by csmith_1111111111. See +"Updated Profile: Post-Changes 1 and 2 (2026-03-12)" for the latest. 
+ +| Rank | Function | % | +|------|----------|---| +| 1 | irept::operator== | 8.4% | +| 2 | sharing_treet::remove_ref | 2.7% | +| 3 | cnft::process_clause | 2.3% | +| 4 | dimacs_cnft::write_dimacs_clause | 2.0% | +| 5 | irept::find | 1.7% | +| 6 | irept::get | 1.7% | +| 7 | sharing_treet::detach | 1.4% | +| 8 | merge_irept::merged | 1.3% | +| 9 | irept::hash | 1.2% | +| 10 | cnf_clause_listt::lcnf | 1.1% | + +Library overhead: iostream 23.1%, tcmalloc 12.6%, libc 2.7%. +The iostream cost is a `--dimacs` profiling artifact. + +### Assessment + +The profile is now flat — no single function dominates. The remaining +hotspots are either inherent to the algorithm (operator==, merge), +architectural (COW refcounting), or artifacts of the profiling method +(iostream). We are in diminishing-returns territory for easy wins. + + +## Definitive Allocator Analysis (2026-03-12) + +### Methodology + +Built 3 binaries from the same codebase, varying only the pool allocator: +- **Baseline**: develop `b191cc1dac` (no pool, no other optimizations) +- **HEAD-nopool**: all optimizations except pool allocator (reverted `ae34043bd1`) +- **HEAD-pool**: all optimizations including pool allocator + +All 3 binaries compiled with `-Dallocator=system` (glibc malloc only). +tcmalloc and jemalloc tested via `LD_PRELOAD` on each binary. + +Allocator linkage verified via `ldd` for all binaries (no tcmalloc/jemalloc +linked). LD_PRELOAD verified via `LD_DEBUG=libs`. DIMACS output verified +identical across all 3 builds. + +9 benchmarks × 3 builds × 3 allocators × 7 runs = 567 data points. +Measurement stability: 96% of configurations have IQR < 2% of median. 
+ +### Benchmarks + +| Benchmark | Args | +|-----------|------| +| linked_list | --bounds-check --pointer-check --unwind 200 | +| array_ops | --bounds-check --unwind 55 | +| dlinked_list | --bounds-check --pointer-check --unwind 150 | +| string_ops | --bounds-check --unwind 30 | +| matrix | --bounds-check --unwind 12 | +| tree | --bounds-check --pointer-check --unwind 8 | +| heavy_array | --bounds-check --unwind 25 | +| csmith_42 | --unwind 257 --no-unwinding-assertions --object-bits 13 | +| csmith_1111111111 | --unwind 257 --no-unwinding-assertions --object-bits 13 | + +All benchmarks pre-compiled to goto binaries via `goto-cc`. + +### Full results (median of 7 runs, seconds) + +| Benchmark | Base/glibc | Base/tc | Base/je | Head/glibc | Head/tc | Head/je | Pool/glibc | Pool/tc | Pool/je | +|-----------|-----------|---------|---------|-----------|---------|---------|-----------|---------|---------| +| linked_list | 0.523 | 0.441 | 0.431 | 0.491 | 0.418 | 0.408 | 0.485 | 0.419 | 0.409 | +| array_ops | 3.076 | 2.377 | 2.570 | 2.497 | 1.918 | 2.062 | 2.410 | 1.904 | 2.052 | +| dlinked_list | 0.942 | 0.766 | 0.762 | 0.883 | 0.724 | 0.717 | 0.873 | 0.723 | 0.717 | +| string_ops | 0.243 | 0.249 | 0.219 | 0.226 | 0.237 | 0.208 | 0.222 | 0.234 | 0.206 | +| matrix | 1.662 | 1.277 | 1.377 | 1.333 | 0.999 | 1.081 | 1.274 | 0.995 | 1.066 | +| tree | 2.488 | 1.948 | 1.962 | 2.345 | 1.842 | 1.854 | 2.307 | 1.840 | 1.854 | +| heavy_array | 2.539 | 1.943 | 2.071 | 1.831 | 1.333 | 1.429 | 1.742 | 1.316 | 1.397 | +| csmith_42 | 1.732 | 1.338 | 1.375 | 1.552 | 1.221 | 1.247 | 1.511 | 1.212 | 1.241 | +| csmith_1111 | 12.737 | 10.066 | 10.072 | 11.723 | 9.286 | 9.351 | 11.334 | 9.214 | 9.251 | +| **TOTAL** | **25.942** | **20.405** | **20.839** | **22.881** | **17.978** | **18.357** | **22.158** | **17.857** | **18.193** | + +### Effect of allocator (% speedup vs glibc, same binary) + +| Benchmark | Base→tc | Base→je | Head→tc | Head→je | Pool→tc | Pool→je | 
+|-----------|---------|---------|---------|---------|---------|---------|
+| linked_list | +15.7% | +17.6% | +14.9% | +16.9% | +13.6% | +15.7% |
+| array_ops | +22.7% | +16.4% | +23.2% | +17.4% | +21.0% | +14.9% |
+| dlinked_list | +18.7% | +19.1% | +18.0% | +18.8% | +17.2% | +17.9% |
+| string_ops | -2.5% | +9.9% | -4.9% | +8.0% | -5.4% | +7.2% |
+| matrix | +23.2% | +17.1% | +25.1% | +18.9% | +21.9% | +16.3% |
+| tree | +21.7% | +21.1% | +21.4% | +20.9% | +20.2% | +19.6% |
+| heavy_array | +23.5% | +18.4% | +27.2% | +22.0% | +24.5% | +19.8% |
+| csmith_42 | +22.7% | +20.6% | +21.3% | +19.7% | +19.8% | +17.9% |
+| csmith_1111 | +21.0% | +20.9% | +20.8% | +20.2% | +18.7% | +18.4% |
+| **TOTAL** | **+21.3%** | **+19.7%** | **+21.4%** | **+19.8%** | **+19.4%** | **+17.9%** |
+
+### Effect of pool allocator (Head-nopool → Head-pool)
+
+| Benchmark | glibc | tcmalloc | jemalloc |
+|-----------|-------|----------|----------|
+| linked_list | +1.2% | -0.2% | -0.2% |
+| array_ops | +3.5% | +0.7% | +0.5% |
+| dlinked_list | +1.1% | +0.1% | +0.0% |
+| string_ops | +1.8% | +1.3% | +1.0% |
+| matrix | +4.4% | +0.4% | +1.4% |
+| tree | +1.6% | +0.1% | +0.0% |
+| heavy_array | +4.9% | +1.3% | +2.2% |
+| csmith_42 | +2.6% | +0.7% | +0.5% |
+| csmith_1111 | +3.3% | +0.8% | +1.1% |
+| **TOTAL** | **+3.2%** | **+0.7%** | **+0.9%** |
+
+### 2×3 summary (HEAD code, total across 9 benchmarks)
+
+| | glibc | tcmalloc | jemalloc |
+|---|---|---|---|
+| **No pool** | 22.881s (ref) | 17.978s (+21.4%) | 18.357s (+19.8%) |
+| **Pool** | 22.158s (+3.2%) | 17.857s (+22.0%) | 18.193s (+20.5%) |
+
+### Conclusions
+
+1. **tcmalloc gives 21% speedup** consistently across all builds (baseline,
+   HEAD-nopool, HEAD-pool) and all benchmarks except string_ops (which is
+   too small at 0.2s for the benefit to outweigh initialization overhead).
+   This effect is independent of code optimizations.
+
+2. **jemalloc gives 20% speedup**, slightly less than tcmalloc on most
+   benchmarks.
Both are effective; tcmalloc has a slight edge on + array-heavy workloads. + +3. **The pool allocator gives 3.2% speedup with glibc**, concentrated on + symex-heavy workloads (heavy_array +4.9%, matrix +4.4%, array_ops +3.5%). + +4. **The pool allocator adds only 0.7% on top of tcmalloc** and 0.9% on + top of jemalloc. Both tcmalloc and the pool allocator address the same + bottleneck (small object allocation/deallocation), and tcmalloc is much + more effective. + +5. **The pool allocator does NOT subsume tcmalloc** (correcting the earlier + claim from the 2026-03-12 re-profiling). The earlier measurement showing + 0% tcmalloc benefit on HEAD was incorrect because the HEAD binary had + tcmalloc linked via CMake — the LD_PRELOAD test was comparing tcmalloc + vs tcmalloc, not tcmalloc vs glibc. This re-analysis uses binaries + built with `-Dallocator=system` to ensure clean glibc baselines. + +6. **The 31% pool allocator claim in commit `7d15280901` is not reproduced.** + The verified benefit is 3.2% with glibc. The original measurement likely + had confounding factors (different benchmark, different baseline, or + tcmalloc already present). + +### Recommendation + +- **tcmalloc should be the primary allocator strategy** (21% speedup, zero + code complexity, well-tested library) +- **The pool allocator provides marginal additional benefit** (+0.7% with + tcmalloc) but adds complexity (memory never returned to OS, interacts + poorly with sanitizers, thread_local overhead). Consider whether the + complexity is worth 0.7%. +- **jemalloc is a good alternative** where tcmalloc is unavailable (20% + speedup, widely available) + +## Comprehensive 13-Benchmark Results (2026-03-12) + +Extended the benchmark suite from 7 hand-written benchmarks to 13 total +by adding 6 CSmith-generated benchmarks (seeds: 42, 1111111111, +1234567890, 2718281828, 314159265, 99999). 
CSmith benchmarks use +`--unwind 257 --no-unwinding-assertions --object-bits 13` and require +`-I/usr/include/csmith`. + +All benchmarks pre-compiled to goto binaries via `goto-cc` for +measurement stability. 7 runs each, median reported. IQR < 2.1% for +all configurations. + +### Baseline vs HEAD (before changes 1 and 2) + +Baseline: develop `b191cc1dac`, glibc malloc. +HEAD: all code optimizations (SSA string concat, set_level_0/1/2, +irept union, pool allocator, get_new_name cache, field_sensitivity +hoisting) + tcmalloc via CMake. + +| Benchmark | Baseline | HEAD | Speedup | +|-----------|----------|------|---------| +| linked_list | 0.520s | 0.398s | +23.5% | +| array_ops | 3.056s | 1.908s | +37.6% | +| dlinked_list | 0.939s | 0.703s | +25.1% | +| string_ops | 0.241s | 0.192s | +20.3% | +| matrix | 1.652s | 0.985s | +40.4% | +| tree | 2.485s | 1.817s | +26.9% | +| heavy_array | 2.510s | 1.312s | +47.7% | +| csmith_42 | 1.718s | 1.204s | +29.9% | +| csmith_1111111111 | 12.819s | 9.255s | +27.8% | +| csmith_1234567890 | 2.371s | 1.653s | +30.3% | +| csmith_2718281828 | 5.947s | 4.173s | +29.8% | +| csmith_314159265 | 1.862s | 1.316s | +29.3% | +| csmith_99999 | 1.707s | 1.196s | +29.9% | +| **TOTAL** | **37.827s** | **26.112s** | **+31.0%** | + +DIMACS output verified identical for all 13 benchmarks. + +The CSmith benchmarks show remarkably consistent speedup (27.8–30.3%), +confirming the optimizations are general-purpose. The hand-written +benchmarks show more variation (20–48%) because they exercise different +code paths with different optimization sensitivity. + +**Note:** The goto binaries used for this table were compiled during +this session. Later in the session, the baseline worktree was recreated, +which rebuilt the goto binaries. The linked_list and dlinked_list +benchmarks became much faster (~0.06s and ~0.07s) with the new goto +binaries, likely due to a different `goto-cc` version producing simpler +goto programs. 
All subsequent A/B comparisons use the same goto binaries +for both sides, so relative speedups remain valid. + +## Optimization: unordered_set in simplify_inequality_pointer_object (2026-03-12) + +Commit: `a656a98492` + +### Problem + +`simplify_expr_with_value_sett::simplify_inequality_pointer_object` used +`std::set` to collect pointed-to objects. `std::set` requires +`operator<`, which calls `irept::compare` — a deep recursive tree +comparison. On pointer-heavy code (csmith_1111111111), this function was +25.5% inclusive cost, with 17.7% spent in `irept::compare` via +`std::set::insert`. + +### Change + +- Replaced `std::set` with `std::unordered_set`. + This uses `irept::hash()` (cached after first computation) and + `operator==` (can short-circuit on pointer equality from sharing) + instead of `irept::compare`. +- Replaced `std::set_intersection` (requires sorted ranges) with a + simple loop checking membership via `count()`. + +### Benchmark (A/B, same goto binaries, 7 runs, median) + +| Benchmark | Before | After | Speedup | +|-----------|--------|-------|---------| +| csmith_1111111111 | 9.286s | 7.600s | **+18.2%** | +| array_ops | 1.900s | 1.901s | 0.0% | +| tree | 1.816s | 1.811s | +0.3% | +| heavy_array | 1.310s | 1.310s | 0.0% | +| csmith_2718281828 | 4.159s | 4.157s | 0.0% | +| All others | — | — | < ±0.6% | +| **TOTAL (13)** | **25.115s** | **23.401s** | **+6.8%** | + +The speedup is concentrated on csmith_1111111111 because it is +pointer-heavy and triggers many pointer-object inequality comparisons. +Other benchmarks are unaffected because they rarely enter this code path. + +DIMACS output verified identical for all 13 benchmarks. + +## Optimization: direct construction in value_sett::to_expr (2026-03-12) + +Commit: `e5afe5eac7` + +### Problem + +`value_sett::to_expr` default-constructed an `object_descriptor_exprt` +then mutated it via `od.object()=...`, `od.offset()=...`, and +`od.type()=...`. 
Each mutation triggers a COW detach operation on the +irept sharing tree. The call chain +`value_sett::get_value_set` → `to_expr` → `irept::add` → `detach` +accounted for ~4–5% of total cost on pointer-heavy benchmarks. + +### Change + +- Added `object_descriptor_exprt(exprt _object, exprt _offset)` + constructor that passes both operands directly to `binary_exprt`, + avoiding all post-construction mutations. +- Simplified `to_expr` to use direct construction. +- The `type()` field was previously set to `object.type()` but is never + read by any consumer (verified by grep). The existing single-argument + constructor already left it as `typet()`. + +### Benchmark (A/B vs change 1 only, same goto binaries, 7 runs, median) + +| Benchmark | Change 1 | Change 1+2 | Speedup | +|-----------|----------|------------|---------| +| csmith_1111111111 | 7.633s | 7.250s | **+5.0%** | +| tree | 1.810s | 1.823s | -0.7% | +| All others | — | — | < ±0.6% | +| **TOTAL (13)** | **23.434s** | **23.102s** | **+1.4%** | + +The speedup is again concentrated on csmith_1111111111 (pointer-heavy). +The -0.7% on tree is within noise (IQR ~1%). + +DIMACS output verified identical for all 13 benchmarks. + +## Updated Profile: Post-Changes 1 and 2 (2026-03-12) + +Configuration: HEAD with both changes, tcmalloc via CMake. +Benchmarks: 9 (all except csmith_1111111111, which was excluded to avoid +biasing the aggregated profile — it is 3–7× heavier than other benchmarks +and would dominate sample counts). +Method: `perf record` at 997 Hz with 3 iterations per benchmark. 
+
+### Self-cost by category
+
+| Category | Self % | Key functions |
+|----------|--------|---------------|
+| irept access (find+get+add) | 16.2% | Linear scan of `forward_list_as_mapt` per property access |
+| irept COW (remove_ref+detach) | 13.5% | Triggered by mutations during rename, simplify, field_sensitivity |
+| simplifier | 5.8% | 30% inclusive; `simplify_node_preorder` 2.0%, `simplify_node` 1.3%, `simplify_rec` 1.1% |
+| string interning | 3.9% | `hash_string` 0.7%, hashtable lookups 2.6%, `get_dstring_number` 0.3% |
+| BigInt/bvrep | 3.8% | `BigInt::compare` 0.5%, `BigInt::~BigInt` 0.3%, `get_bvrep_bit` 0.9%, `integer2string` 0.6% |
+| sharing_mapt | 2.8% | SSA rename state lookups via `get_leaf_node` |
+| irept compare (==, hash) | 2.8% | Down from 5.7% pre-change-1 |
+| field_sensitivity | 2.6% | Recursive `apply` calls |
+| validation checks | 2.2% | `constant_exprt::check` 1.4%, `ssa_exprt::check` 1.1% |
+| SSA identifiers | 2.0% | `update_identifier`, `build_ssa_identifier_rec` |
+| merge_irept | 1.7% | Expression deduplication |
+| rename | 1.0% | `rename` |
+| tcmalloc | 13.2% | Irreducible allocator overhead |
+| libc | 2.6% | memcpy, memmove |
+| libstdc++ | 1.4% | |
+| other CBMC | 9.5% | Long tail of small functions |
+
+### Top 15 CBMC functions (self cost)
+
+| Rank | Function | Self % |
+|------|----------|--------|
+| 1 | irept::find | 7.5% |
+| 2 | sharing_treet::remove_ref | 7.3% |
+| 3 | sharing_treet::detach | 5.2% |
+| 4 | irept::get | 4.7% |
+| 5 | irept::add(id) | 2.9% |
+| 6 | field_sensitivityt::apply | 2.3% |
+| 7 | string_containert hashtable find | 2.1% |
+| 8 | simplify_node_preorder | 2.0% |
+| 9 | irept::operator== | 1.8% |
+| 10 | merge_irept::merged | 1.7% |
+| 11 | sharing_mapt::get_leaf_node | 1.6% |
+| 12 | constant_exprt::check | 1.4% |
+| 13 | simplify_node | 1.3% |
+| 14 | ssa_exprt::check | 1.1% |
+| 15 | simplify_rec | 1.1% |
+
+### Per-benchmark variation
+
+| Benchmark | COW | Access | Cmp | Simplify 
| String | BigInt | +|-----------|-----|--------|-----|----------|--------|--------| +| array_ops | 12% | 16% | 3% | 3% | 5% | 4% | +| heavy_array | 13% | 15% | 1% | 4% | **10%** | **6%** | +| tree | 5% | 6% | 4% | 1% | 2% | 0% | +| csmith_2718281828 | 14% | 14% | 6% | 6% | 5% | 4% | +| csmith_42 | 13% | 15% | 4% | 6% | 4% | 4% | + +heavy_array stands out with 10% string interning and 6% BigInt — this +is from `field_sensitivityt::get_fields` creating `dstringt` values for +array element field names, and constant index expressions using BigInt. +The tree benchmark has low COW/access (5–6%) because it spends more time +in CNF/DIMACS output (it produces the largest formula: 966K vars, 3.3M +clauses). + +### Inclusive cost (top-level call chain) + +``` +execute_next_instruction 89% +├── symex_function_call_symbol 37% (parameter_assignments, clean_expr) +├── symex_assign 33% (assign_non_struct_symbol → rename) +│ └── simplify_rec 30% (simplify_node_preorder → simplify_node) +├── clean_expr → dereference 32% +│ └── field_sensitivityt::apply 21% (recursive, 4-5 levels) +└── symex_assert ~5% +``` + +### Dominant call chains for top functions + +**irept::find (7.5%)**: Called from simplify (via type checks), rename +(via field_sensitivity::apply), and parameter_assignments. Spread across +many call sites — no single dominant caller. + +**remove_ref (7.3%)**: Triggered by irept destruction when temporary +expressions go out of scope. Recursive (deep trees cause deep +remove_ref chains). Spread across symex_assign, symex_function_call, +and simplify. + +**detach (5.2%)**: Triggered by irept mutations (add, set, operand +assignment). Largest single caller is rename → field_sensitivity::apply +chain, which mutates expressions during SSA renaming. + +### Assessment + +The profile is now genuinely flat. No single CBMC function exceeds 7.5% +self cost. 
The top two categories — irept access (16.2%) and irept COW +(13.5%) — are fundamental to how CBMC's data structures work and are +spread across many call sites rather than concentrated in one hot path. + +### Remaining optimization opportunities + +1. **irept access (16.2%)** — `forward_list_as_mapt` does a linear scan + for every `find`/`get`/`add` call. Replacing it with a flat hash map + or small sorted array would help, but is a major architectural change + affecting the entire codebase. + +2. **irept COW (13.5%)** — inherent to the sharing tree design. The + remaining mutations are mostly in the rename/simplify core loop where + they are necessary. Could be reduced by batching mutations or using + move semantics more aggressively. + +3. **String interning (3.9%, up to 10% on heavy_array)** — caching + `dstringt` values for predictable field names in + `field_sensitivityt::get_fields` could help on array-heavy benchmarks. + +4. **BigInt (3.8%, up to 6% on heavy_array)** — constant expression + construction overhead. `BigInt` uses heap allocation; a small-buffer + optimization or replacing with fixed-width integers where possible + could help. + +5. **Validation checks (2.2%)** — `constant_exprt::check` and + `ssa_exprt::check` run on every `to_constant_expr()` / + `to_ssa_expr()` cast. These could be compiled out in Release builds + via `CPROVER_INVARIANT_DO_NOT_CHECK`, but that flag also disables + other useful invariant checks. + +6. **tcmalloc (13.2%)** — irreducible allocator overhead. Already using + the fastest general-purpose allocator. Further gains would require + reducing allocation volume (fewer temporary expressions, arena + allocation for symex state). 
diff --git a/scripts/profiling/runner.py b/scripts/profiling/runner.py index 573e85b5f06..1cffc611bbb 100644 --- a/scripts/profiling/runner.py +++ b/scripts/profiling/runner.py @@ -57,6 +57,26 @@ def compile_to_goto_binary(cbmc, source_file, output_dir): return None +def compile_to_goto_binary(cbmc, source_file, output_dir): + """Compile a C/C++ source file to a goto binary using goto-cc. + + Returns the path to the goto binary, or None if goto-cc is not available + or compilation fails (in which case CBMC will parse the source directly). + """ + goto_cc = Path(cbmc).parent / "goto-cc" + if not goto_cc.is_file(): + return None + + gb_path = output_dir / (Path(source_file).stem + ".gb") + try: + subprocess.run( + [str(goto_cc), "-o", str(gb_path), source_file], + capture_output=True, check=True, timeout=60) + return gb_path + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + return None + + def run_benchmark(cbmc, bench, output_dir, timeout, memory_mb, skip_solver): """Run perf on a single benchmark. 
Returns parsed results dict.""" name = bench["name"] diff --git a/src/cbmc/CMakeLists.txt b/src/cbmc/CMakeLists.txt index 4266fedae42..c8598568aa9 100644 --- a/src/cbmc/CMakeLists.txt +++ b/src/cbmc/CMakeLists.txt @@ -45,6 +45,7 @@ target_link_libraries(cbmc else() target_link_libraries(cbmc cbmc-lib) endif() +target_link_libraries(cbmc cprover_allocator) install(TARGETS cbmc DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-analyzer/CMakeLists.txt b/src/goto-analyzer/CMakeLists.txt index 20cc1f44d49..6854c7fd21f 100644 --- a/src/goto-analyzer/CMakeLists.txt +++ b/src/goto-analyzer/CMakeLists.txt @@ -25,6 +25,7 @@ target_link_libraries(goto-analyzer-lib # Executable add_executable(goto-analyzer goto_analyzer_main.cpp) target_link_libraries(goto-analyzer goto-analyzer-lib) +target_link_libraries(goto-analyzer cprover_allocator) install(TARGETS goto-analyzer DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-cc/CMakeLists.txt b/src/goto-cc/CMakeLists.txt index cd259bbb960..f21920539c3 100644 --- a/src/goto-cc/CMakeLists.txt +++ b/src/goto-cc/CMakeLists.txt @@ -24,6 +24,7 @@ target_link_libraries(goto-cc-lib add_executable(goto-cc goto_cc_main.cpp) target_link_libraries(goto-cc goto-cc-lib) +target_link_libraries(goto-cc cprover_allocator) install(TARGETS goto-cc DESTINATION ${CMAKE_INSTALL_BINDIR}) if(WIN32) diff --git a/src/goto-diff/CMakeLists.txt b/src/goto-diff/CMakeLists.txt index 9c4607be813..61b38e143ce 100644 --- a/src/goto-diff/CMakeLists.txt +++ b/src/goto-diff/CMakeLists.txt @@ -26,6 +26,7 @@ target_link_libraries(goto-diff-lib # Executable add_executable(goto-diff goto_diff_main.cpp) target_link_libraries(goto-diff goto-diff-lib) +target_link_libraries(goto-diff cprover_allocator) install(TARGETS goto-diff DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-harness/CMakeLists.txt b/src/goto-harness/CMakeLists.txt index 25bd5f582f5..5bbbf7cd2a0 100644 --- a/src/goto-harness/CMakeLists.txt +++ 
b/src/goto-harness/CMakeLists.txt @@ -9,6 +9,7 @@ target_link_libraries(goto-harness json json-symtab-language ) +target_link_libraries(goto-harness cprover_allocator) install(TARGETS goto-harness DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-instrument/CMakeLists.txt b/src/goto-instrument/CMakeLists.txt index a1edca91f9a..afdaba684e6 100644 --- a/src/goto-instrument/CMakeLists.txt +++ b/src/goto-instrument/CMakeLists.txt @@ -29,6 +29,7 @@ add_if_library(goto-instrument-lib glpk) # Executable add_executable(goto-instrument goto_instrument_main.cpp) target_link_libraries(goto-instrument goto-instrument-lib) +target_link_libraries(goto-instrument cprover_allocator) install(TARGETS goto-instrument DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-symex/field_sensitivity.cpp b/src/goto-symex/field_sensitivity.cpp index 0a5a56b6213..167b7ab6ecd 100644 --- a/src/goto-symex/field_sensitivity.cpp +++ b/src/goto-symex/field_sensitivity.cpp @@ -285,11 +285,15 @@ exprt field_sensitivityt::get_fields( const exprt &compound_op = ssa_expr.get_original_expr(); + // Hoist loop-invariant work: prepare template with L2 already removed. + ssa_exprt tmp_template = ssa_expr; + const bool was_l2 = !tmp_template.get_level_2().empty(); + if(was_l2) + tmp_template.remove_level_2(); + for(const auto &comp : components) { - ssa_exprt tmp = ssa_expr; - bool was_l2 = !tmp.get_level_2().empty(); - tmp.remove_level_2(); + ssa_exprt tmp = tmp_template; tmp.set_expression( member_exprt{compound_op, comp.get_name(), comp.type()}); exprt field = get_fields(ns, state, tmp, disjoined_fields_only); @@ -328,13 +332,78 @@ exprt field_sensitivityt::get_fields( const exprt &array = ssa_expr.get_original_expr(); + // Hoist loop-invariant work: prepare a template ssa_exprt once, + // then mutate only the index-dependent parts in each iteration. + // This avoids repeated ssa_exprt copies, remove_level_2 calls, + // and redundant irept operations. 
+ ssa_exprt tmp_template = ssa_expr; + const bool was_l2 = !tmp_template.get_level_2().empty(); + if(was_l2) + tmp_template.remove_level_2(); + + // Cache the identifier prefix/suffix from the first element. + std::string id_prefix, id_suffix, l1_prefix, l1_suffix; + bool have_cached_prefix = false; + + // Pre-compute the index type (same for all elements). + const typet &idx_type = type.index_type(); + for(std::size_t i = 0; i < array_size; ++i) { - const index_exprt index(array, from_integer(i, type.index_type())); - ssa_exprt tmp = ssa_expr; - bool was_l2 = !tmp.get_level_2().empty(); - tmp.remove_level_2(); - tmp.set_expression(index); + // Reuse a single constant_exprt, just changing the value. + const constant_exprt idx_const = from_integer(i, idx_type); + const index_exprt index(array, idx_const); + + // Copy the template (which already has L2 removed). + ssa_exprt tmp = tmp_template; + + if(!have_cached_prefix) + { + // First element: use canonical set_expression to establish format. + tmp.set_expression(index); + + const std::string &idx_val = id2string(idx_const.get_value()); + const std::string marker = "[[" + idx_val + "]]"; + + const std::string &full_id = id2string(tmp.get_identifier()); + const std::string &full_l1 = id2string(tmp.get_l1_object_identifier()); + auto pos = full_id.rfind(marker); + if(pos != std::string::npos) + { + id_prefix = full_id.substr(0, pos); + id_suffix = full_id.substr(pos + marker.size()); + auto l1_pos = full_l1.rfind(marker); + if(l1_pos != std::string::npos) + { + l1_prefix = full_l1.substr(0, l1_pos); + l1_suffix = full_l1.substr(l1_pos + marker.size()); + } + have_cached_prefix = true; + } + } + else + { + // Subsequent elements: set expression and derive identifier directly. 
+ tmp.type() = as_const(index).type(); + static_cast(tmp).add(ID_expression, index); + + const std::string &idx_val = id2string(idx_const.get_value()); + + std::string new_id = id_prefix; + new_id += "[["; + new_id += idx_val; + new_id += "]]"; + new_id += id_suffix; + tmp.set_identifier(new_id); + + std::string new_l1 = l1_prefix; + new_l1 += "[["; + new_l1 += idx_val; + new_l1 += "]]"; + new_l1 += l1_suffix; + tmp.set(ID_L1_object_identifier, new_l1); + } + exprt element = get_fields(ns, state, tmp, disjoined_fields_only); if(was_l2) { diff --git a/src/goto-symex/simplify_expr_with_value_set.cpp b/src/goto-symex/simplify_expr_with_value_set.cpp index 24a519f6364..a804780805f 100644 --- a/src/goto-symex/simplify_expr_with_value_set.cpp +++ b/src/goto-symex/simplify_expr_with_value_set.cpp @@ -19,6 +19,8 @@ Author: Michael Tautschnig #include "goto_symex_can_forward_propagate.h" +#include + /// Try to evaluate a simple pointer comparison. /// \param operation: ID_equal or ID_not_equal /// \param symbol_expr: The symbol expression in the condition @@ -164,7 +166,7 @@ simplify_expr_with_value_sett::simplify_inequality_pointer_object( auto collect_objects = [this](const exprt &pointer) { - std::set objects; + std::unordered_set objects; if(auto address_of = expr_try_dynamic_cast(pointer)) { objects.insert( @@ -209,14 +211,16 @@ simplify_expr_with_value_sett::simplify_inequality_pointer_object( : changed(static_cast(false_exprt{})); } - std::list intersection; - std::set_intersection( - lhs_objects.begin(), - lhs_objects.end(), - rhs_objects.begin(), - rhs_objects.end(), - std::back_inserter(intersection)); - if(!lhs_objects.empty() && !rhs_objects.empty() && intersection.empty()) + bool has_intersection = false; + for(const auto &obj : lhs_objects) + { + if(rhs_objects.count(obj) != 0) + { + has_intersection = true; + break; + } + } + if(!lhs_objects.empty() && !rhs_objects.empty() && !has_intersection) { // all pointed-to objects on the left-hand side are different 
from any of // the pointed-to objects on the right-hand side diff --git a/src/pointer-analysis/value_set.cpp b/src/pointer-analysis/value_set.cpp index 3641c00fd60..d40a83934cd 100644 --- a/src/pointer-analysis/value_set.cpp +++ b/src/pointer-analysis/value_set.cpp @@ -255,16 +255,10 @@ exprt value_sett::to_expr(const object_map_dt::value_type &it) const object.id()==ID_unknown) return object; - object_descriptor_exprt od; - - od.object()=object; - if(it.second) - od.offset() = *it.second; - - od.type()=od.object().type(); - - return std::move(od); + return object_descriptor_exprt{object, *it.second}; + else + return object_descriptor_exprt{object}; } bool value_sett::make_union(const value_sett::valuest &new_values) diff --git a/src/symtab2gb/CMakeLists.txt b/src/symtab2gb/CMakeLists.txt index 8a4b1f5ea24..8dc375d8031 100644 --- a/src/symtab2gb/CMakeLists.txt +++ b/src/symtab2gb/CMakeLists.txt @@ -11,6 +11,7 @@ target_link_libraries(symtab2gb goto-programs json-symtab-language) +target_link_libraries(symtab2gb cprover_allocator) install(TARGETS symtab2gb DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/util/irep.cpp b/src/util/irep.cpp index 36e103ea8ff..236ae5604a5 100644 --- a/src/util/irep.cpp +++ b/src/util/irep.cpp @@ -43,12 +43,14 @@ void irept::move_to_sub(irept &irep) const irep_idt &irept::get(const irep_idt &name) const { + static const irep_idt empty; + if(is_leaf_only()) + return empty; const named_subt &s = get_named_sub(); named_subt::const_iterator it=s.find(name); if(it==s.end()) { - static const irep_idt empty; return empty; } return it->second.id(); @@ -92,6 +94,8 @@ void irept::remove(const irep_idt &name) const irept &irept::find(const irep_idt &name) const { + if(is_leaf_only()) + return get_nil_irep(); const named_subt &s = get_named_sub(); auto it = s.find(name); @@ -102,12 +106,16 @@ const irept &irept::find(const irep_idt &name) const irept &irept::add(const irep_idt &name) { + if(is_leaf_only()) + promote_to_non_leaf(); 
named_subt &s = get_named_sub(); return s[name]; } irept &irept::add(const irep_idt &name, irept irep) { + if(is_leaf_only()) + promote_to_non_leaf(); named_subt &s = get_named_sub(); #if NAMED_SUB_IS_FORWARD_LIST @@ -415,7 +423,7 @@ std::size_t irept::number_of_non_comments(const named_subt &named_sub) std::size_t irept::hash() const { #if HASH_CODE - if(read().hash_code!=0) + if(!is_leaf_only() && read().hash_code!=0) return read().hash_code; #endif @@ -442,7 +450,8 @@ std::size_t irept::hash() const result = hash_finalize(result, sub.size() + number_of_named_ireps); #if HASH_CODE - read().hash_code=result; + if(!is_leaf_only()) + read().hash_code=result; #endif #ifdef IREP_HASH_STATS ++irep_hash_cnt; diff --git a/src/util/irep.h b/src/util/irep.h index 28f87062c48..7d75a3997fd 100644 --- a/src/util/irep.h +++ b/src/util/irep.h @@ -12,6 +12,7 @@ Author: Daniel Kroening, kroening@kroening.com #include #include +#include #include "invariant.h" #include "irep_ids.h" @@ -53,20 +54,6 @@ id2string(const irep_idt &d) class irept; const irept &get_nil_irep(); -/// Used in tree_nodet for activating or not reference counting. -/// tree_nodet uses inheritance from ref_count_ift instead of a field, so that -/// it gets deleted if empty ([[no_unique_address]] only appears in C++20). -template -struct ref_count_ift -{ -}; - -template <> -struct ref_count_ift -{ - unsigned ref_count = 1; -}; - /// A node with data in a tree, it contains: /// /// * \ref irept::dt::data : A \ref dstringt and thus an integer which is a @@ -85,8 +72,8 @@ struct ref_count_ift /// /// * \c hash_code : if HASH_CODE is activated, this is used to cache the /// result of the hash function. -template -class tree_nodet : public ref_count_ift +template +class tree_nodet final { public: // These are not stable. 
@@ -99,6 +86,8 @@ class tree_nodet : public ref_count_ift friend treet; + unsigned ref_count = 1; + /// This irep_idt is the only place to store data in an tree node irep_idt data; @@ -141,6 +130,30 @@ class tree_nodet : public ref_count_ift sub(std::move(_sub)) { } + + // Pool allocator for tree_nodet: maintains a free list of previously + // deallocated nodes to avoid malloc/free overhead. CBMC allocates and + // frees millions of tree_nodet objects (14M+ on typical workloads); + // reusing memory from a free list is significantly faster. + static void *operator new(std::size_t size) + { + if(free_list_head) + { + void *p = free_list_head; + free_list_head = *static_cast(p); + return p; + } + return ::operator new(size); + } + + static void operator delete(void *p, std::size_t /*size*/) + { + *static_cast(p) = free_list_head; + free_list_head = p; + } + +private: + static inline thread_local void *free_list_head = nullptr; }; /// Base class for tree-like data structures with sharing @@ -148,31 +161,69 @@ template class sharing_treet { public: - using dt = tree_nodet; + using dt = tree_nodet; using subt = typename dt::subt; using named_subt = typename dt::named_subt; /// Used to refer to this class from derived classes using tree_implementationt = sharing_treet; - explicit sharing_treet(irep_idt _id) : data(new dt(std::move(_id))) +protected: + struct flagged_idt + { + bool is_leaf : 1; + unsigned padding : 31; + irep_idt id; + + explicit flagged_idt(irep_idt id = {}) : + is_leaf{true}, + padding{0}, + id{std::move(id)} + { + static_assert(sizeof(flagged_idt) == sizeof(dt *), + "Sizes should match for optimisation to apply."); + } + }; + + union pointer_or_idt + { + flagged_idt flagged_id; + dt * pointer; + }; + + union const_pointer_or_idt + { + dt * const pointer; + flagged_idt flagged_id; + }; + + static dt *as_pointer(irep_idt id) + { + pointer_or_idt pointer_or_id{ flagged_idt{id} }; + return pointer_or_id.pointer; + } + +public: + explicit 
  /// Construct a tree that consists of an id only. No node is allocated:
  /// the id is stored directly in the (tagged) data field.
  explicit sharing_treet(irep_idt _id) : data(as_pointer(_id))
  {
  }

  /// Construct from an id plus sub-trees. When both sub-tree collections
  /// are empty, the cheap leaf-only (tagged-id) representation is used;
  /// otherwise a full node is heap-allocated.
  sharing_treet(irep_idt _id, named_subt _named_sub, subt _sub)
    : data(
        _named_sub.empty() && _sub.empty() ? as_pointer(_id)
        : new dt(std::move(_id), std::move(_named_sub), std::move(_sub)))
  {
  }

  // constructor for blank irep
  // empty_d is the shared tagged representation of the empty id, so the
  // blank tree also allocates nothing.
  sharing_treet() : data(empty_d)
  {
  }

  // copy constructor
  // Leaf-only trees are copied by value (the tagged id is the whole
  // state); real nodes are shared by bumping the reference count.
  sharing_treet(const sharing_treet &irep) : data(irep.data)
  {
    if(!is_leaf_only())
    {
      PRECONDITION(data->ref_count != 0);
      data->ref_count++;
    }
  }
+ bool is_leaf_only() const + { + const_pointer_or_idt pointer_or_id{ data }; + return pointer_or_id.flagged_id.is_leaf; + } + + void promote_to_non_leaf() + { + const_pointer_or_idt pointer_or_id{ data }; + data = new dt(pointer_or_id.flagged_id.id); + } static void remove_ref(dt *old_data); static void nonrecursive_destructor(dt *old_data); @@ -250,52 +325,37 @@ class sharing_treet #endif return *data; } -}; - -// Static field initialization -template -typename sharing_treet::dt - sharing_treet::empty_d; -/// Base class for tree-like data structures without sharing -template -class non_sharing_treet -{ -public: - using dt = tree_nodet; - using subt = typename dt::subt; - using named_subt = typename dt::named_subt; - - /// Used to refer to this class from derived classes - using tree_implementationt = non_sharing_treet; - - explicit non_sharing_treet(irep_idt _id) : data(std::move(_id)) + const irep_idt &id() const { + const auto pointer_or_id = + reinterpret_cast(&data); + return is_leaf_only() ? pointer_or_id->flagged_id.id : read().data; } - non_sharing_treet(irep_idt _id, named_subt _named_sub, subt _sub) - : data(std::move(_id), std::move(_named_sub), std::move(_sub)) + void id(const irep_idt &_data) { + if(is_leaf_only()) + data = as_pointer(_data); + else + write().data=_data; } - non_sharing_treet() = default; +}; - const dt &read() const - { - return data; - } +// Static field initialization +template +typename sharing_treet::dt * const + sharing_treet::empty_d = + sharing_treet::as_pointer(irep_idt{}); - dt &write() - { -#if HASH_CODE - data.hash_code = 0; -#endif - return data; - } +template + const typename sharing_treet::subt + sharing_treet::empty_sub = {}; -protected: - dt data; -}; +template + const typename sharing_treet::named_subt + sharing_treet::empty_named_sub = {}; /// There are a large number of kinds of tree structured or tree-like data in /// CPROVER. 
\ref irept provides a single, unified representation for all of @@ -385,14 +445,8 @@ class irept irept() = default; - const irep_idt &id() const - { return read().data; } - const std::string &id_string() const - { return id2string(read().data); } - - void id(const irep_idt &_data) - { write().data=_data; } + { return id2string(id()); } const irept &find(const irep_idt &name) const; irept &add(const irep_idt &name); @@ -445,10 +499,29 @@ class irept void make_nil() { *this=get_nil_irep(); } - subt &get_sub() { return write().sub; } // DANGEROUS - const subt &get_sub() const { return read().sub; } - named_subt &get_named_sub() { return write().named_sub; } // DANGEROUS - const named_subt &get_named_sub() const { return read().named_sub; } + subt &get_sub() { + if(is_leaf_only()) + promote_to_non_leaf(); + return write().sub; + } // DANGEROUS + + const subt &get_sub() const { + if(is_leaf_only()) + return empty_sub; + return read().sub; + } + + named_subt &get_named_sub() { + if(is_leaf_only()) + promote_to_non_leaf(); + return write().named_sub; + } // DANGEROUS + + const named_subt &get_named_sub() const { + if(is_leaf_only()) + return empty_named_sub; + return read().named_sub; + } std::size_t hash() const; std::size_t full_hash() const; @@ -509,19 +582,14 @@ struct diagnostics_helpert template void sharing_treet::detach() { + if(is_leaf_only()) + return; + #ifdef IREP_DEBUG std::cout << "DETACH1: " << data << '\n'; #endif - if(data == &empty_d) - { - data = new dt; - -#ifdef IREP_DEBUG - std::cout << "ALLOCATED " << data << '\n'; -#endif - } - else if(data->ref_count > 1) + if(data->ref_count > 1) { dt *old_data(data); data = new dt(*old_data); @@ -544,7 +612,8 @@ void sharing_treet::detach() template void sharing_treet::remove_ref(dt *old_data) { - if(old_data == &empty_d) + const const_pointer_or_idt pointer_or_id{ old_data }; + if(pointer_or_id.flagged_id.is_leaf) return; #if 0 diff --git a/src/util/pointer_expr.h b/src/util/pointer_expr.h index 
d3130b3110c..74323f6c194 100644 --- a/src/util/pointer_expr.h +++ b/src/util/pointer_expr.h @@ -198,6 +198,15 @@ class object_descriptor_exprt : public binary_exprt { } + object_descriptor_exprt(exprt _object, exprt _offset) + : binary_exprt( + std::move(_object), + ID_object_descriptor, + std::move(_offset), + typet()) + { + } + /// Given an expression \p expr, attempt to find the underlying object it /// represents by skipping over type casts and removing balanced /// dereference/address_of operations; that object will then be available diff --git a/src/util/rename.cpp b/src/util/rename.cpp index 2b622b96874..ad559cc818a 100644 --- a/src/util/rename.cpp +++ b/src/util/rename.cpp @@ -8,10 +8,15 @@ Author: Daniel Kroening, kroening@kroening.com #include "rename.h" -#include - #include "namespace.h" +#include +#include + +/// Produce a fresh name by appending delimiter + number to \p name. +/// Uses a per-prefix counter to avoid O(n) probing of the symbol table +/// on each call. Falls back to symbol table probing only on the first +/// call for each prefix. irep_idt get_new_name(const irep_idt &name, const namespacet &ns, char delimiter) { @@ -21,5 +26,21 @@ get_new_name(const irep_idt &name, const namespacet &ns, char delimiter) std::string prefix = id2string(name) + delimiter; - return prefix + std::to_string(ns.smallest_unused_suffix(prefix)); + // Cache the next suffix to try for each prefix. This turns the O(n) + // linear probe in smallest_unused_suffix into amortized O(1). + // thread_local for safety in case of future multi-threading. 
+ static thread_local std::unordered_map suffix_cache; + auto it = suffix_cache.find(prefix); + std::size_t suffix; + if(it != suffix_cache.end()) + suffix = it->second; + else + suffix = ns.smallest_unused_suffix(prefix); + + // Verify the suffix is actually unused + while(!ns.lookup(prefix + std::to_string(suffix), symbol)) + ++suffix; + + suffix_cache[prefix] = suffix + 1; + return prefix + std::to_string(suffix); } diff --git a/src/util/ssa_expr.cpp b/src/util/ssa_expr.cpp index e92c79967e6..4b02b8457bb 100644 --- a/src/util/ssa_expr.cpp +++ b/src/util/ssa_expr.cpp @@ -8,8 +8,6 @@ Author: Daniel Kroening, kroening@kroening.com #include "ssa_expr.h" -#include - #include "pointer_expr.h" /// If \p expr is: @@ -18,21 +16,28 @@ Author: Daniel Kroening, kroening@kroening.com /// - an index_exprt where the index is a constant, apply recursively on the /// array and add "[[index]]" /// \return the stream \p os -static std::ostream & -initialize_ssa_identifier(std::ostream &os, const exprt &expr) +static void initialize_ssa_identifier(std::string &id, const exprt &expr) { if(auto member = expr_try_dynamic_cast(expr)) { - return initialize_ssa_identifier(os, member->struct_op()) - << ".." 
<< member->get_component_name(); + initialize_ssa_identifier(id, member->struct_op()); + id += ".."; + id += id2string(member->get_component_name()); + return; } if(auto index = expr_try_dynamic_cast(expr)) { - const irep_idt &idx = to_constant_expr(index->index()).get_value(); - return initialize_ssa_identifier(os, index->array()) << "[[" << idx << "]]"; + initialize_ssa_identifier(id, index->array()); + id += "[["; + id += id2string(to_constant_expr(index->index()).get_value()); + id += "]]"; + return; } if(auto symbol = expr_try_dynamic_cast(expr)) - return os << symbol->get_identifier(); + { + id += id2string(symbol->get_identifier()); + return; + } UNREACHABLE; } @@ -42,9 +47,9 @@ ssa_exprt::ssa_exprt(const exprt &expr) : symbol_exprt(expr.type()) set(ID_C_SSA_symbol, true); add(ID_expression, expr); with_source_location(expr.source_location()); - std::ostringstream os; - initialize_ssa_identifier(os, expr); - const std::string id = os.str(); + std::string id; + id.reserve(64); + initialize_ssa_identifier(id, expr); set_identifier(id); set(ID_L1_object_identifier, id); } @@ -53,57 +58,71 @@ ssa_exprt::ssa_exprt(const exprt &expr) : symbol_exprt(expr.type()) /// "s!l0@l1". /// If \p expr is a member or index expression, recursively apply the procedure /// and add "..component_name" or "[[index]]" to \p os. +/// Build the base part of an SSA identifier (without level suffixes) +/// by appending to \p id and \p l1_object_id. static void build_ssa_identifier_rec( const exprt &expr, const irep_idt &l0, const irep_idt &l1, const irep_idt &l2, - std::ostream &os, - std::ostream &l1_object_os) + std::string &id, + std::string &l1_object_id) { - if(expr.id()==ID_member) + if(expr.id() == ID_member) { - const member_exprt &member=to_member_expr(expr); + const member_exprt &member = to_member_expr(expr); - build_ssa_identifier_rec(member.struct_op(), l0, l1, l2, os, l1_object_os); + build_ssa_identifier_rec(member.struct_op(), l0, l1, l2, id, l1_object_id); - os << ".." 
<< member.get_component_name(); - l1_object_os << ".." << member.get_component_name(); + const std::string &component = id2string(member.get_component_name()); + id += ".."; + id += component; + l1_object_id += ".."; + l1_object_id += component; } - else if(expr.id()==ID_index) + else if(expr.id() == ID_index) { - const index_exprt &index=to_index_expr(expr); - - build_ssa_identifier_rec(index.array(), l0, l1, l2, os, l1_object_os); - - const irep_idt &idx = to_constant_expr(index.index()).get_value(); - os << "[[" << idx << "]]"; - l1_object_os << "[[" << idx << "]]"; + const index_exprt &index = to_index_expr(expr); + + build_ssa_identifier_rec(index.array(), l0, l1, l2, id, l1_object_id); + + const std::string &idx = + id2string(to_constant_expr(index.index()).get_value()); + id += "[["; + id += idx; + id += "]]"; + l1_object_id += "[["; + l1_object_id += idx; + l1_object_id += "]]"; } - else if(expr.id()==ID_symbol) + else if(expr.id() == ID_symbol) { - auto symid=to_symbol_expr(expr).get_identifier(); - os << symid; - l1_object_os << symid; + const std::string &symid = id2string(to_symbol_expr(expr).get_identifier()); + id += symid; + l1_object_id += symid; if(!l0.empty()) { - // Distinguish different threads of execution - os << '!' << l0; - l1_object_os << '!' 
<< l0; + const std::string &l0s = id2string(l0); + id += '!'; + id += l0s; + l1_object_id += '!'; + l1_object_id += l0s; } if(!l1.empty()) { - // Distinguish different calls to the same function (~stack frame) - os << '@' << l1; - l1_object_os << '@' << l1; + const std::string &l1s = id2string(l1); + id += '@'; + id += l1s; + l1_object_id += '@'; + l1_object_id += l1s; } if(!l2.empty()) { - // Distinguish SSA steps for the same variable - os << '#' << l2; + id += '#'; + id += id2string(l2); } } else @@ -116,12 +135,16 @@ static std::pair build_identifier( const irep_idt &l1, const irep_idt &l2) { - std::ostringstream oss; - std::ostringstream l1_object_oss; + std::string id; + std::string l1_object_id; + // Typical SSA identifiers are 20-60 chars; pre-allocate to avoid + // repeated reallocation during string building. + id.reserve(64); + l1_object_id.reserve(64); - build_ssa_identifier_rec(expr, l0, l1, l2, oss, l1_object_oss); + build_ssa_identifier_rec(expr, l0, l1, l2, id, l1_object_id); - return std::make_pair(irep_idt(oss.str()), irep_idt(l1_object_oss.str())); + return std::make_pair(irep_idt(id), irep_idt(l1_object_id)); } static void update_identifier(ssa_exprt &ssa) @@ -181,19 +204,42 @@ const irep_idt ssa_exprt::get_l1_object_identifier() const void ssa_exprt::set_level_0(std::size_t i) { set(ID_L0, i); - ::update_identifier(*this); + // Optimized: L0 is only set when it was previously empty (the caller + // guards against re-setting). The current identifier is "base" and + // we need "base!l0". Append directly instead of rebuilding. + const std::string &cur_id = id2string(get_identifier()); + std::string suffix = "!" + std::to_string(i); + std::string new_id = cur_id + suffix; + set_identifier(new_id); + set(ID_L1_object_identifier, new_id); } void ssa_exprt::set_level_1(std::size_t i) { set(ID_L1, i); - ::update_identifier(*this); + // Optimized: L1 is only set when it was previously empty (the caller + // guards against re-setting). 
The current identifier is "base!l0" and + // we need "base!l0@l1". Append directly instead of rebuilding. + const std::string &cur_id = id2string(get_identifier()); + std::string suffix = "@" + std::to_string(i); + std::string new_id = cur_id + suffix; + set_identifier(new_id); + set(ID_L1_object_identifier, new_id); } void ssa_exprt::set_level_2(std::size_t i) { set(ID_L2, i); - ::update_identifier(*this); + // Optimized: the L1 object identifier doesn't change when only L2 changes, + // and the main identifier just needs the #N suffix updated. Derive from + // the cached L1 object identifier instead of rebuilding from scratch. + const std::string &l1_id = id2string(get(ID_L1_object_identifier)); + std::string new_id; + new_id.reserve(l1_id.size() + 8); + new_id = l1_id; + new_id += '#'; + new_id += std::to_string(i); + set_identifier(new_id); } void ssa_exprt::remove_level_2()