diff --git a/.github/workflows/build-and-test-Linux.yaml b/.github/workflows/build-and-test-Linux.yaml index 31fc894d5f4..b73e58a57f9 100644 --- a/.github/workflows/build-and-test-Linux.yaml +++ b/.github/workflows/build-and-test-Linux.yaml @@ -23,6 +23,7 @@ jobs: sudo apt-get install --no-install-recommends -y clang-19 clang++-19 flex bison cmake ninja-build maven jq libxml2-utils dpkg-dev ccache sudo apt-get install --no-install-recommends -y libssl-dev libelf-dev libudev-dev libpci-dev libiberty-dev autoconf sudo apt-get install --no-install-recommends -y gawk jq + sudo apt-get install --no-install-recommends -y libgoogle-perftools-dev - name: Prepare ccache uses: actions/cache@v5 diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index 1a4d70514e9..39445b76fc3 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -31,7 +31,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -y g++ gcc gdb binutils flex bison cmake maven jq libxml2-utils openjdk-11-jdk-headless lcov ccache z3 + sudo apt-get install --no-install-recommends -y g++ gcc gdb binutils flex bison cmake maven jq libxml2-utils openjdk-11-jdk-headless lcov ccache z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed diff --git a/.github/workflows/performance.yaml b/.github/workflows/performance.yaml index 93b9af238b0..198e3b2c318 100644 --- a/.github/workflows/performance.yaml +++ b/.github/workflows/performance.yaml @@ -45,7 +45,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc g++ maven flex bison ccache + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc g++ maven flex bison ccache libgoogle-perftools-dev - name: Prepare ccache 
uses: actions/cache/restore@v5 diff --git a/.github/workflows/profiling.yaml b/.github/workflows/profiling.yaml index 92fd4e42f3e..95bd25247cd 100644 --- a/.github/workflows/profiling.yaml +++ b/.github/workflows/profiling.yaml @@ -55,6 +55,7 @@ jobs: fi echo "$(dirname "$PERF_REAL")" >> "$GITHUB_PATH" "$PERF_REAL" --version + # perf requires perf_event_paranoid=-1 for call graph recording sudo sysctl kernel.perf_event_paranoid=-1 - name: Restore ccache diff --git a/.github/workflows/pull-request-checks.yaml b/.github/workflows/pull-request-checks.yaml index ba3e934a87d..80ff2f1228f 100644 --- a/.github/workflows/pull-request-checks.yaml +++ b/.github/workflows/pull-request-checks.yaml @@ -24,7 +24,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq gcc gdb g++ maven jq flex bison libxml2-utils ccache cmake z3 + sudo apt-get install --no-install-recommends -yq gcc gdb g++ maven jq flex bison libxml2-utils ccache cmake z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -222,7 +222,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -361,7 +361,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 + 
sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -468,7 +468,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils dpkg-dev ccache doxygen graphviz z3 libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed @@ -522,7 +522,7 @@ jobs: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update - sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils ccache doxygen z3 g++-multilib + sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc gdb g++ maven flex bison libxml2-utils ccache doxygen z3 g++-multilib libgoogle-perftools-dev - name: Confirm z3 solver is available and log the version installed run: z3 --version - name: Download cvc-5 from the releases page and make sure it can be deployed diff --git a/CMakeLists.txt b/CMakeLists.txt index 0344a94d747..e81829ff636 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,53 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR endif() endif() +# Alternative memory allocator for improved performance. +# tcmalloc and jemalloc give 18-25% speedup on typical CBMC workloads +# due to better handling of many small allocations (irept tree nodes). +# Set to "auto" to detect, or "tcmalloc", "jemalloc", or "system". 
+set(allocator "auto" CACHE STRING "Memory allocator: auto, tcmalloc, jemalloc, or system") + +if(NOT allocator STREQUAL "system") + if(allocator STREQUAL "auto" OR allocator STREQUAL "tcmalloc") + find_library(TCMALLOC_LIB NAMES tcmalloc_minimal tcmalloc) + endif() + if(allocator STREQUAL "auto" OR allocator STREQUAL "jemalloc") + find_library(JEMALLOC_LIB NAMES jemalloc) + endif() + + if(allocator STREQUAL "tcmalloc") + if(NOT TCMALLOC_LIB) + message(FATAL_ERROR "tcmalloc requested but not found") + endif() + set(CBMC_ALLOCATOR_LIB ${TCMALLOC_LIB}) + message(STATUS "Using tcmalloc: ${TCMALLOC_LIB}") + elseif(allocator STREQUAL "jemalloc") + if(NOT JEMALLOC_LIB) + message(FATAL_ERROR "jemalloc requested but not found") + endif() + set(CBMC_ALLOCATOR_LIB ${JEMALLOC_LIB}) + message(STATUS "Using jemalloc: ${JEMALLOC_LIB}") + elseif(allocator STREQUAL "auto") + if(TCMALLOC_LIB) + set(CBMC_ALLOCATOR_LIB ${TCMALLOC_LIB}) + message(STATUS "Auto-detected tcmalloc: ${TCMALLOC_LIB}") + elseif(JEMALLOC_LIB) + set(CBMC_ALLOCATOR_LIB ${JEMALLOC_LIB}) + message(STATUS "Auto-detected jemalloc: ${JEMALLOC_LIB}") + else() + message(STATUS "No alternative allocator found, using system malloc") + endif() + endif() +endif() + +# Make the allocator available for executables via cprover_allocator target. 
+if(CBMC_ALLOCATOR_LIB) + add_library(cprover_allocator INTERFACE) + target_link_libraries(cprover_allocator INTERFACE ${CBMC_ALLOCATOR_LIB}) +else() + add_library(cprover_allocator INTERFACE) +endif() + function(cprover_default_properties) set(CBMC_CXX_STANDARD ${CMAKE_CXX_STANDARD}) set(CBMC_CXX_STANDARD_REQUIRED ${CMAKE_CXX_STANDARD_REQUIRED}) diff --git a/PROFILING_ANALYSIS.md b/PROFILING_ANALYSIS.md new file mode 100644 index 00000000000..3c1dc0d6f7b --- /dev/null +++ b/PROFILING_ANALYSIS.md @@ -0,0 +1,1678 @@ +# CBMC Performance Profiling Analysis + +Date: 2026-03-11 — 2026-03-12 +Branch: `experiment/hash-optimization` +Baseline: Release (`-O3 -DNDEBUG`), commit `b191cc1dac` (develop) +System: Linux x86_64 + +## Methodology + +- Tool: `scripts/profile_cbmc.py --auto-large --auto-csmith --runs 3 --timeout 90` +- 15 benchmarks × 3 runs = 45 profiling runs, 49,462 total samples +- Solver excluded (`--dimacs --outfile /dev/null`) — only pre-solver stages profiled +- Source locations resolved via `build-debug/bin/cbmc` (RelWithDebInfo) + +## Benchmarks + +| Benchmark | Symex (s) | Convert SSA (s) | Total (s) | Steps | ±Total | +|-----------|-----------|-----------------|-----------|-------|--------| +| linked_list | 0.215 | 0.336 | 2.3 | 4,433 | ±0.06s | +| array_ops | 3.124 | 0.174 | 5.1 | 16,448 | ±0.00s | +| structs | 0.031 | 0.000 | 1.1 | 312 | ±0.06s | +| dlinked_list | 0.385 | 0.541 | 3.3 | 8,242 | ±0.00s | +| string_ops | 0.101 | 0.033 | 2.1 | 1,950 | ±0.06s | +| func_ptrs | 0.011 | 0.000 | 1.0 | 271 | ±0.00s | +| bitvector | 0.016 | 0.000 | 1.1 | 608 | ±0.06s | +| matrix | 1.827 | 0.000 | 3.1 | 4,148 | ±0.00s | +| unions | 0.022 | 0.000 | 1.1 | 382 | ±0.07s | +| tree | 0.980 | 0.943 | 5.2 | 27,789 | ±0.00s | +| csmith_1234567890 | 11.890 | 0.227 | 17.2 | 139,379 | ±0.06s | +| csmith_42 | 8.462 | 0.206 | 13.2 | 121,844 | ±0.00s | +| csmith_1111111111 | 17.291 | 0.258 | 23.2 | 153,556 | ±0.06s | +| csmith_2718281828 | 11.544 | 0.238 | 16.3 | 148,596 | ±0.01s 
| +| csmith_314159265 | 3.183 | 0.069 | 6.2 | 56,187 | ±0.06s | + +Symex dominates wall time for: array_ops, matrix, and all CSmith benchmarks. + +## Sample Distribution + +- CBMC code: 28,550 (57.7%) +- Libraries (libc, libstdc++): 20,912 (42.3%) +- Total: 49,462 + +## Top Hotspots + +### 1. String interning: `string_containert::get` — 11.7% combined + +The `_Hashtable::_M_find_before_node` (10.0%) and `_Hashtable::find` (1.6%) +functions are called almost exclusively from `string_containert::get`. +Additionally, `hash_string` accounts for 1.7%. + +- **Root cause**: NOT the hash function or table configuration (see + investigation below), but the **volume of redundant interning calls**. + `update_identifier(ssa_exprt&)` is called 11.5M times on csmith_42 + with a 98.6% hit rate — almost all calls find an already-interned string. +- **Call chain**: `field_sensitivityt::get_fields` / `rename` / `set_indices` + → `update_identifier` → `build_identifier` (creates two `ostringstream`, + builds string char-by-char) → `irep_idt(oss.str())` → `string_containert::get` + → `hash_string` → `hash_table.find` +- **Source**: `src/util/ssa_expr.cpp` (`update_identifier`, `build_identifier`) + +#### Investigation: hash function and table tuning + +Tested three approaches on csmith_42 (11.5M get() calls, 156K unique strings): + +| Change | Result | Reason | +|--------|--------|--------| +| FNV-1a hash (replace h*31+c) | **3.6% slower** | Multiply costlier than shift-subtract; chain lengths already short | +| Pre-reserve 10K buckets | No effect | `unordered_map` handles growth fine | +| Max load factor 0.5 | **1.3-1.8% faster** | Shorter chains, but modest (doubles memory) | + +Hash distribution quality is similar for both hash functions (~37% empty +buckets at load factor 1.0). `dstringt::hash() = no` (sequential index) +is actually perfect for `std::unordered_map`. 
+ +#### Recommended optimization + +Avoid redundant `get()` calls rather than tuning the hash table: +- **Cache SSA identifiers**: skip `build_identifier` when l0/l1/l2 and + the original expression haven't changed +- **Avoid ostringstream**: use direct string concatenation with `id2string()` +- **Reduce `update_identifier` calls**: audit callers for unnecessary rebuilds + +Estimated impact: 6-9% overall speedup (50-80% reduction of the 11.7% cost). + +#### Implemented: replace ostringstream with string concatenation + +Replaced `std::ostringstream` with direct `std::string` concatenation +(using `reserve(64)` + `append`) in `build_ssa_identifier_rec` and +`initialize_ssa_identifier`. Commit `f8057bea69`. + +| Benchmark | Baseline | Optimized | Speedup | +|-----------|----------|-----------|---------| +| linked_list | 1.49s | 1.46s | 2.0% | +| array_ops | 3.13s | 2.78s | **11.2%** | +| csmith_42 | 8.89s | 8.58s | 3.5% | +| csmith_1111111111 | 16.91s | 15.92s | **5.9%** | + +#### Implemented: avoid full rebuild in set_level_0/1/2 + +When `set_level_0`, `set_level_1`, or `set_level_2` is called, the +identifier can be derived from the existing one by appending a suffix +(`!l0`, `@l1`, `#l2`) instead of rebuilding from the expression tree. +This is safe because each level is only set when previously empty +(callers guard against re-setting). Commits `4564ca2163`, `2c09b3ef20`. + +Instrumentation showed `update_identifier` is called 504,737 times on +csmith_42: 24% from `set_level_0`, 22% from `set_level_1`, 16% from +`set_level_2`, 37% from `set_expression`. 
+ +Combined with the ostringstream optimization: + +| Benchmark | Baseline | After all SSA opts | Speedup | +|-----------|----------|--------------------|---------| +| linked_list | 1.49s | 1.44s | 3.4% | +| array_ops | 3.13s | 2.69s | 14.1% | +| csmith_42 | 8.89s | 8.44s | 5.1% | +| csmith_1111111111 | 16.91s | 15.75s | 6.9% | + +#### Implemented: cache array identifier prefix in field_sensitivity + +In `field_sensitivityt::get_fields` for array types, pre-compute the +base identifier and level suffixes once before the loop, then construct +each element's identifier by concatenating `base[[i]]` + suffixes +directly. This avoids calling `set_expression` (which triggers a full +identifier rebuild) for each array index. Commit `3a95db52b3`. + +Additional speedup on array-heavy benchmark: array_ops 2.69s → 2.56s +(4.8%). Minimal impact on CSmith benchmarks. + +#### Investigated and ruled out: member expression caching + +Attempted the same prefix-caching approach for member expressions +(`base..component!l0@l1`). No measurable improvement — the member case +is not in a hot loop (called once per struct field access, not iterated). + +Further optimization potential remains: the remaining `set_expression` +calls (37% of `update_identifier` on CSmith) still do full rebuilds. +Reducing these would require restructuring how `dstringt` is constructed +to avoid intermediate `std::string` allocation — `dstringt` doesn't +support concatenation, so every new identifier must go through +`string_containert::get()`. + +### 2. Sharing tree destruction: `sharing_treet::remove_ref` — 9.9% + +Recursive reference-count decrement and deallocation of irept tree nodes. + +- **Root cause**: 63% of samples are self-recursive — deep irept trees + cause deep recursion in the destructor chain. 
+- **Key callers**: + - 5.7% from `renamedt` (SSA renaming creates/destroys expressions) + - 2.9% from `goto_symext::symex_function_call_symbol` + - 2.5% from `~multi_path_symex_only_checkert` (final cleanup) + - 2.4% from `simplify_exprt::simplify` +- **Source locations**: + - `src/util/message.h:303` (52.6% — likely inlining artifact) + - `src/util/typecheck.cpp:16` (10.5%) + - `src/util/std_expr.h:1334` (3.1% — in rename) +- **Source**: `src/util/sharing_node.h`, `src/util/irep.h` +- **Possible improvements**: + - Iterative (non-recursive) `remove_ref` using an explicit stack + - Deferred/batched destruction (arena-style) + - Reduce unnecessary temporary irept copies that trigger destruction + +### 3. Memory allocation: malloc/free/new — ~25% combined + +| Function | % | +|----------|---| +| malloc | 7.4% | +| _int_free | 6.6% | +| _int_malloc | 5.4% | +| cfree | 4.7% | +| malloc_consolidate | 4.0% | +| operator new | 2.5% | +| unlink_chunk | 1.7% | + +- **Root cause**: CBMC creates and destroys vast numbers of small objects + (irept nodes, expression trees, SSA steps). The sharing tree helps but + copy-on-write still triggers many allocations. +- **Possible improvements**: + - Custom allocator / object pool for irept nodes + - Reduce copies (more move semantics, fewer temporaries) + - This is largely a consequence of hotspots #2 and #7 — fixing those + would reduce allocation pressure + +### 4. irept::find — 5.4% + +Linear scan through `forward_list_as_mapt` to find a named sub-tree. 
+ +- **Key callers**: + - 22.6% from `simplify_exprt::simplify_node_preorder` (at `goto_program.h:274`) + - 19.7% from `constant_exprt::check` (at `irep.h:228`) + - 10.5% from `requires_renaming` + - 9.1% from `field_sensitivityt::get_fields` +- **Source**: `src/util/irep.h` +- **Possible improvements**: + - `constant_exprt::check` and `simplify_node_preorder` call `find` to + check expression types — could be short-circuited with cached type IDs + - `requires_renaming` does repeated type lookups — result could be cached + +### 5. Symbol table lookup: `next_unused_suffix` — 4.9% + +84.8% of `symbol_tablet::find` calls come from `symbol_table_baset::next_unused_suffix`. + +- **Root cause**: Generating unique symbol names by probing the symbol table + with incrementing suffixes. Each probe is a full hash table lookup. +- **Source location**: `src/util/expr.h:96` (in `next_unused_suffix`) +- **Source**: `src/util/symbol_table_base.h` +- **Possible improvements**: + - Maintain a per-prefix counter to generate the next suffix directly + without probing the table + - This is a clear algorithmic improvement — O(1) instead of O(n) probes + +### 6. irept::get — 4.8% + +Similar to `irept::find` but returns a value rather than a reference. + +- **Key callers**: + - 21.4% from `irept::get_bool` (at `std_expr.h:131`) + - 9.1% from `update_identifier` + - 6.5% from `renamedt` (rename) + - 5.5% from `build_ssa_identifier_rec` +- **Source**: `src/util/irep.h` +- **Possible improvements**: Same as #4 — reduce redundant lookups + +### 7. Copy-on-write detach: `sharing_treet::detach` — 4.6% + +Creates a private copy of a shared irept node before mutation. 
+ +- **Key callers**: + - 45.9% from `irept::add` (at `symex_dead.cpp:65`) + - 17.0% from `field_sensitivityt::apply` (at `std_types.h:149`) + - 10.8% from `irept::remove` (at `symex_dead.cpp:72`) + - 9.1% from `apply_to_objects_in_dereference` +- **Source**: `src/util/sharing_node.h` +- **Possible improvements**: + - `symex_dead.cpp:65,72` — the dead variable handler adds and removes + irept fields, triggering detach each time. Could batch mutations. + - `field_sensitivityt::apply` — could avoid unnecessary copies when + the result is immediately consumed + +### 8. SSA renaming map: `sharing_mapt::get_leaf_node` — 2.1% + +Hash-trie lookup in the SSA renaming map. + +- **Key callers**: 90.2% from `rename` (at `irep.h:228`) +- **Source**: `src/util/sharing_map.h` +- **Possible improvements**: + - The sharing_map is a hash-array-mapped trie. For the rename use case, + a flat hash map might be faster for small-to-medium maps. + +## Prioritized Optimization Plan + +Based on impact and feasibility: + +| Priority | Target | Impact | Effort | Approach | +|----------|--------|--------|--------|----------| +| **P1** | `next_unused_suffix` | 4.9% | Low | Per-prefix counter instead of probing | +| **P2** | SSA identifier rebuilding | 11.7% | Medium | Cache identifiers / avoid ostringstream | +| **P3** | `sharing_treet::remove_ref` | 9.9% | Medium | Iterative destruction with explicit stack | +| **P4** | `symex_dead.cpp` detach | 4.6% | Low | Batch add/remove to avoid double detach | +| **P5** | `constant_exprt::check` in simplifier | ~2% | Low | Short-circuit common cases | +| **P6** | `requires_renaming` caching | ~1% | Low | Cache result per type | + +P2 is the highest-impact target: the hashing investigation confirmed that +the 11.7% cost is from redundant `update_identifier` calls, not from hash +table configuration. P1 remains the best bang-for-buck at low effort. 
+ +## Post-Optimization Investigation (2026-03-11) + +After implementing the SSA identifier optimizations (P2), re-profiled to +assess remaining targets. + +### Updated profile (csmith_42, post-optimization) + +| Rank | Function | % | Notes | +|------|----------|---|-------| +| 1 | `string_containert` hash lookup | 11.1% | Volume of calls, not hash quality | +| 2 | `sharing_treet::remove_ref` | 7.2% | 63% self-recursive | +| 3 | `symbol_tablet::find` | 6.0% | `next_unused_suffix` now only 0.56% | +| 4 | malloc/free combined | ~18% | Consequence of irept allocation | +| 5 | `memcmp` | 3.5% | From `string_ptrt::operator==` | +| 6 | `irept::find` | 3.4% | Scattered callers | +| 7 | `irept::get` | 3.2% | Scattered callers | +| 8 | `sharing_treet::detach` | 2.7% | From `irept::add`/`remove` | +| 9 | `hash_string` | 2.1% | String interning hash function | +| 10 | `irept::operator==` | 2.1% | 57% self-recursive, 32% from `merge_irept` | + +### Investigation results for remaining targets + +**P1: `next_unused_suffix`** — Now only 0.56% (was 4.9% in original profile). +The original measurement was inflated by the `string_containert::get` cost +that dominated the `symbol_tablet::find` samples. After the SSA identifier +optimization reduced interning calls, `next_unused_suffix` is no longer a +significant bottleneck. `symbol_table_buildert` already provides a suffix +cache for the cases that need it. + +**P3: `sharing_treet::remove_ref`** — Tested the existing +`nonrecursive_destructor` (already implemented but disabled via `#if 0`). +Result: **15% SLOWER** (9.74s vs 8.44s on csmith_42). The explicit +`std::vector` stack with `reserve` and iteration is more expensive than +the recursive version, which benefits from the CPU call stack being in +L1 cache. The nonrecursive version is only useful for avoiding stack +overflow on extremely deep trees, not for performance. 
+ +#### Implemented: irept union optimization (issue #7960) + +Cherry-picked thomasspriggs' branch (`tas/irep_optimisation1_unions`) +which makes `irept` hold a union of a pointer or an id, reducing memory +usage for leaf nodes. Commits `81005555c6`, `92464ad78a` (originally +`ccd06c39f8`, `4a1b67f8bc`). Cherry-picked cleanly onto current develop. + +| Benchmark | Without union | With union | Speedup | +|-----------|-------------|------------|---------| +| linked_list | 1.44s | 1.37s | **4.9%** | +| array_ops | 2.56s | 2.49s | **2.7%** | +| csmith_42 | 8.49s | 8.12s | **4.4%** | +| csmith_1111111111 | 15.75s | 15.37s | **2.4%** | + +Consistent 2.4-4.9% improvement from reduced memory footprint and +fewer allocations for leaf nodes. + +**P4: `symex_dead.cpp` detach** — The source locations from addr2line +(`symex_dead.cpp:65,72`) were inlining artifacts. The actual `detach` +cost is spread across many callers (45.9% from `irept::add`, 17% from +`field_sensitivityt::apply`, 10.8% from `irept::remove`). No single +call site dominates enough for a targeted fix. + +**P5: `constant_exprt::check`** — Only 0.32% of time is in `check` +itself. The 19.7% of `irept::find` attributed to `check` is because +`check` calls `find(ID_value)`. The `check` functions are validation +that runs on every `to_constant_expr` cast even in Release builds +(INVARIANT is not compiled out by default). Disabling checks would +require `CPROVER_INVARIANT_DO_NOT_CHECK` which is not recommended. + +### Conclusion + +The remaining hotspots are dominated by fundamental irept operations +(`find`, `get`, `add`, `remove_ref`, `detach`, `operator==`) that are +called millions of times from many different sites. These are inherent +to the sharing tree data structure and cannot be optimized by targeting +individual call sites. Significant further improvement would require +either: + +1. 
**Reducing the number of irept operations** — e.g., by caching + intermediate results, avoiding unnecessary copies, or restructuring + algorithms to batch mutations. + +2. **Changing the irept data structure** — e.g., using a flat hash map + instead of `forward_list_as_mapt` for named sub-trees, or using + a different representation for frequently-accessed fields. + +3. **Reducing string interning volume** — the 11.1% in + `string_containert::get` is from the remaining `set_expression` + calls (37% of `update_identifier`) in `field_sensitivity.cpp`'s + array element loop. Caching per-index identifiers there could help. + +## Raw Data + +Full results in `profile-results/results.json` and per-benchmark flamegraphs +in `profile-results/<benchmark>/flamegraph.svg`. + +## Alternative Allocator Investigation (2026-03-11) + +Tested drop-in replacement allocators via `LD_PRELOAD` to address the +~18% of samples in malloc/free/new. + +### Results + +| Benchmark | glibc | tcmalloc | jemalloc | tcmalloc Δ | jemalloc Δ | +|-----------|-------|----------|----------|-----------|-----------| +| linked_list | 1.44s | 1.18s | 1.15s | **-18%** | **-20%** | +| array_ops | 2.56s | 1.91s | 2.09s | **-25%** | **-18%** | +| csmith_42 | 8.50s | 6.41s | 6.50s | **-25%** | **-24%** | +| csmith_1111111111 | 15.71s | 12.73s | 12.01s | **-19%** | **-24%** | + +Both allocators give **18-25% speedup** with zero code changes. + +### Why it helps + +CBMC's irept sharing tree creates and destroys millions of small objects +(tree_nodet, ~64-128 bytes each). glibc's malloc uses per-thread arenas +with bins, but the consolidation and free-list management overhead is +significant for this pattern. tcmalloc and jemalloc use thread-local +caches and size-class segregation that are much faster for small objects. + +### Usage + +```bash +# Via LD_PRELOAD (no rebuild needed) +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 cbmc ... +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 cbmc ... 
+ +# Install +apt-get install libtcmalloc-minimal4 # or libjemalloc2 +``` + +### Recommendation + +Add a CMake option to link against tcmalloc or jemalloc. This is the +single largest performance improvement found in this investigation — +larger than all the SSA identifier optimizations combined. + +#### Implemented: CMake allocator auto-detection + +Added `-Dallocator=auto|tcmalloc|jemalloc|system` CMake option. When +set to `auto` (the default), CMake searches for tcmalloc then jemalloc +and links the first one found. Commit `0da7f8ff4b`. + +Install: `apt-get install libgoogle-perftools-dev` (or `libjemalloc-dev`) + +#### tcmalloc vs jemalloc comparison + +Both give 18-25% speedup. Key differences: +- **tcmalloc**: More consistent (no warmup outliers), slightly faster on + array-heavy workloads. Known issue: memory footprint can grow over time + in long-running processes — not a concern for CBMC which runs and exits. +- **jemalloc**: Better memory efficiency for long-running servers. Has a + warmup cost visible in the first run. +- **mimalloc** (Microsoft): Only ~1% improvement — its advantages are for + multi-threaded workloads. +- **glibc tuning** (`MALLOC_ARENA_MAX`, `MALLOC_TRIM_THRESHOLD`): 1-3%. +- **macOS**: System allocator already uses magazine-based allocation + similar to tcmalloc; improvement would be smaller. +- **Windows**: Default heap allocator with LFH is reasonably fast. + +Recommendation: tcmalloc for CBMC (single-threaded, short-lived). + +#### Implemented: CI tcmalloc installation + +Added `libgoogle-perftools-dev` to all Linux CI workflows that build +CBMC with CMake. Commit `28824f9f07`. Workflows updated: +build-and-test-Linux, pull-request-checks, performance, coverage, +profiling. + +#### Perf event selection: cycles vs cpu-clock + +Tested both events at 997 Hz sampling. Results are nearly identical for +function-level profiling (same ranking, same percentages within noise). 
+`cycles` uses hardware PMU counters (more precise for instruction-level +analysis); `cpu-clock` is a software timer (always available, works in +CI containers/VMs). The profiling tool uses `cycles` as default with +automatic `cpu-clock` fallback. + +## Combined Optimization Summary + +**Note:** This section reflects early results (2026-03-11) before the +field_sensitivity cache was found to have a correctness bug and before +the pool allocator and get_fields hoisting were added. See +"Comprehensive 13-Benchmark Results (2026-03-12)" for final numbers. + +| Optimization | Speedup (csmith_42) | Cumulative | +|-------------|--------------------:|----------:| +| Baseline | — | 8.89s | +| SSA string concat | 3.5% | 8.58s | +| set_level_2 opt | 0.4% | 8.55s | +| set_level_0/1 opt | 1.3% | 8.44s | +| field_sensitivity cache | ~0% (array: 4.8%) | 8.49s | +| **tcmalloc** | **24.6%** | **6.41s** | +| **Total** | **27.9%** | **6.41s** | + +On the array-heavy benchmark, the combined effect is even larger: +3.13s → 1.91s (**39% faster**). + +## Final Combined Results (2026-03-11) + +**Note:** This section reflects early results before the pool allocator, +get_fields hoisting, and get_new_name cache were added. The +field_sensitivity cache listed below was later found to have a +correctness bug and was removed. See "Comprehensive 13-Benchmark Results +(2026-03-12)" for final numbers. 
+ +All optimizations combined on `experiment/hash-optimization` branch: +- SSA identifier string concat (replace ostringstream) +- SSA set_level_0/1/2 suffix appending (avoid full rebuild) +- field_sensitivity array loop caching +- irept union optimization (from issue #7960) +- tcmalloc linked via CMake + +### Performance vs baseline (develop, glibc malloc) + +| Benchmark | Baseline | Optimized | Speedup | +|-----------|----------|-----------|---------| +| linked_list | 1.49s | 1.08s | **27.5%** | +| array_ops | 3.08s | 1.89s | **38.6%** | +| csmith_42 | 8.83s | 6.26s | **29.1%** | +| csmith_1111111111 | 16.81s | 12.44s | **26.0%** | + +### New profile (csmith_42, all optimizations + tcmalloc) + +| Rank | Function | % | Change vs original | +|------|----------|---|-------------------| +| 1 | `string_containert` hash lookup | 13.2% | Was 10.0% — now larger share because other costs reduced | +| 2 | `sharing_treet::remove_ref` | 6.1% | Was 9.9% — irept union helped | +| 3 | tcmalloc new/delete | 7.8% | Was 18% glibc — 2.3x reduction | +| 4 | `irept::find` | 4.7% | Was 5.4% | +| 5 | `memcmp` | 4.5% | Was 3.5% — larger share | +| 6 | `irept::get` | 3.4% | Was 4.8% | +| 7 | `sharing_treet::detach` | 3.0% | Was 4.6% | +| 8 | `symbol_tablet::find` | 3.0% | Was 4.9% | +| 9 | `irept::operator==` | 2.1% | Unchanged | +| 10 | `hash_string` | 1.9% | Was 1.7% | + +### Remaining avenues to explore + +1. **String interning volume** (13.2%): Still the #1 hotspot. The remaining + `set_expression` calls in `field_sensitivity.cpp` (member expressions, + not just arrays) still trigger full identifier rebuilds. Also, + `build_ssa_identifier_rec` is called from the constructor for every new + `ssa_exprt`. Caching the base identifier per-symbol could help. + +2. **`memcmp` in string interning** (4.5%): The `string_ptrt::operator==` + does `memcmp` on every hash table probe. If we stored a pre-computed + hash alongside the string pointer, we could skip `memcmp` when hashes + differ. 
However, `std::unordered_map` already does this internally. + The cost is from hash collisions in the same bucket. + +3. **`irept::find` / `irept::get`** (8.1% combined): These do linear scans + through `forward_list_as_mapt`. Replacing this with a flat hash map or + small sorted array for named sub-trees could help, but is a major + structural change to irept. + +4. **`merge_irept::merged`** (1.8%): Expression merging during symex. + Could benefit from better hash caching. + +5. **`sharing_mapt::get_leaf_node`** (1.3%): The SSA renaming map uses a + hash-array-mapped trie. A flat hash map might be faster for the typical + map sizes in symex. + +### Assessment + +The low-hanging fruit has been picked. The remaining hotspots are either: +- **Fundamental data structure costs** (irept find/get, sharing_tree) that + require structural changes to improve +- **Algorithmic** (string interning volume) that require deeper changes to + how SSA identifiers are managed +- **Already well-optimized** by tcmalloc (allocation is now 7.8% vs 18%) + +The most promising remaining avenue is reducing string interning volume +further — specifically, avoiding `string_containert::get` calls for strings +that are already `dstringt` values. This would require changes to how +`ssa_exprt` stores and updates its identifier. + +## Remaining Optimization Plans + +### Plan A: Reduce string interning volume (13.2%) + +**Goal**: Avoid calling `string_containert::get()` for strings that are +already interned (i.e., already `dstringt` values). + +**Approach**: The hot path is `set_expression` in `field_sensitivity.cpp` +which calls `update_identifier` → `build_identifier` → constructs +`irep_idt(std::string)` → `string_containert::get()`. Instead of building +a `std::string` and interning it, build the identifier from existing +`irep_idt` parts using a concatenation that produces an `irep_idt` directly. 
+ +**Investigation outcome**: Attempted caching for member expressions +(appending `..component` before level suffixes). No measurable improvement +because the member case is not in a hot loop. The remaining `set_expression` +calls (37% of `update_identifier`) are from diverse call sites without a +single dominant pattern. Further reduction would require adding concatenation +support to `dstringt` itself (which currently only supports construction +from `std::string` via `string_containert::get()`). + +**Estimated impact**: 3-5% (reducing 13.2% by ~30-40%) +**Risk**: High — requires changes to core `dstringt`/`string_containert` +**Status**: Partially explored, diminishing returns + +### Plan B: Replace `forward_list_as_mapt` in irept (8.1%) + +**Goal**: Speed up `irept::find()` and `irept::get()` which do linear +scans through a linked list of named sub-trees. + +**Approach**: Replace `forward_list_as_mapt` with a +small flat sorted array or a small hash map. Most irept nodes have +0-5 named sub-trees, so a linear scan of a contiguous array would be +faster than a linked list due to cache locality. + +**Estimated impact**: 3-5% (reducing 8.1% by ~40-60%) +**Risk**: High — `forward_list_as_mapt` is used throughout irept + +### Plan C: Improve `merge_irept::merged` (1.8%) + +**Goal**: Speed up expression merging during symex. + +**Approach**: `merge_irept::merged` uses `irept::operator==` (which is +56% self-recursive) to check if an expression is already in the merge +set. Pre-computing and caching hash values for irept nodes would allow +skipping the deep equality check when hashes differ. + +**Estimated impact**: 1-2% +**Risk**: Low — isolated change in merge_irept + +### Plan D: Optimize `sharing_mapt::get_leaf_node` (1.3%) + +**Goal**: Speed up SSA renaming map lookups. + +**Approach**: The sharing_map is a hash-array-mapped trie optimized for +persistent/shared maps. 
For the L2 renaming use case where maps are +typically small and not heavily shared, a flat `std::unordered_map` +might be faster. Profile to confirm. + +**Estimated impact**: 0.5-1% +**Risk**: Medium — sharing_map is used for path merging + + +## Verification of Claims (2026-03-11) + +All claims were re-verified using a broader benchmark suite (6 benchmarks, +5 runs each, median reported). Builds were verified via `ldd` to confirm +tcmalloc presence/absence. tcmalloc was tested via `LD_PRELOAD` on builds +that don't have the allocator CMake option. + +### Methodology correction + +Earlier verification incorrectly showed tcmalloc at 0.1% because +`-Dallocator=system` was silently ignored by the baseline CMakeLists.txt +(which doesn't have the allocator option). Both "glibc" and "tcmalloc" +baseline builds were actually identical. Re-verified using `LD_PRELOAD`. + +### field_sensitivity array cache (3a95db52b3) — CORRECTNESS BUG + +This commit bypasses `set_expression` and manually constructs SSA +identifiers. It produces **different SSA formulas** than the standard +path (step count changes from 1950 to 1555 on string_ops). This causes +massive regressions: + +| Benchmark | Without | With | Change | +|------------|---------|---------|--------| +| string_ops | 0.245s | 4.013s | **16x slower** | +| matrix | 1.431s | 10.56s | **7x slower** | + +The commit has been moved to the top of the branch for separate debugging. +All other optimizations verified without this commit present. 
+
+### Verified results at HEAD~2 (all opts except field_sensitivity cache and free-list)
+
+| Benchmark | Baseline (glibc) | +SSA opts (glibc) | +tcmalloc (preload) | +Both (preload) | SSA Δ | tc Δ | Combined |
+|--------------|----------|-----------|-----------|----------|--------|--------|----------|
+| linked_list | 0.554s | 0.518s | 0.471s | 0.451s | +6.5% | +15.0% | +18.6% |
+| array_ops | 3.081s | 2.631s | 2.405s | 2.037s | +14.6% | +21.9% | +33.9% |
+| dlinked_list | 0.971s | 0.918s | 0.795s | 0.760s | +5.5% | +18.1% | +21.7% |
+| matrix | 1.661s | 1.431s | 1.293s | 1.103s | +13.8% | +22.2% | +33.6% |
+| tree | 2.512s | 2.382s | 1.978s | 1.895s | +5.2% | +21.3% | +24.6% |
+| string_ops | 0.259s | 0.245s | — | 0.263s | +5.4% | — | -1.5% |
+| **TOTAL (5)**| 8.779s | 7.880s | 6.942s | 6.246s |**+10.2%**|**+20.9%**|**+28.9%**|
+
+No regressions on any benchmark.
+
+### Free-list pool allocator — re-evaluated (glibc only, 5 runs, median)
+
+Earlier testing on a too-small benchmark showed only 1.4%. Re-tested
+with the full suite (without tcmalloc):
+
+| Benchmark | Without | With | Speedup |
+|--------------|---------|--------|---------|
+| linked_list | 0.520s | 0.512s | 1.5% |
+| array_ops | 2.644s | 2.503s | **5.3%**|
+| dlinked_list | 0.912s | 0.900s | 1.3% |
+| matrix | 1.435s | 1.356s | **5.5%**|
+| tree | 2.379s | 2.337s | 1.8% |
+| string_ops | 0.243s | 0.235s | 3.3% |
+| **TOTAL** | 8.133s | 7.843s | **3.6%**|
+
+The free-list gives a consistent 3.6% speedup with glibc, concentrated
+on symex-heavy workloads. With tcmalloc, the effect is 0.7% (tcmalloc
+already has thread-local size-class caches).
+
+**Assessment**: The 31% claim from the commit message is not reproduced
+(likely measured on a different/larger workload or with confounding
+factors). The actual benefit is 3.6% with glibc. 
This is meaningful for
+platforms without tcmalloc (macOS, Windows) but adds complexity:
+- Memory is never returned to the OS
+- Interacts poorly with sanitizers (ASan, valgrind)
+- `thread_local` overhead on single-threaded CBMC
+- Redundant when tcmalloc is available
+
+### tcmalloc — CONFIRMED at 15-22%
+
+Verified via `LD_PRELOAD` on clean baseline builds confirmed to have
+no tcmalloc (via `ldd`). The benefit is consistent across all non-trivial
+benchmarks and applies to both baseline and SSA-optimized builds.
+
+SAT solver benefit from tcmalloc is minimal (~1.3% for CaDiCaL, ~1.2%
+for MiniSat2) because solvers use fewer, larger allocations. Confirmed
+that solvers do use tcmalloc (verified via `LD_DEBUG=bindings`).
+
+### SSA identifier optimizations — CONFIRMED at 5-15%
+
+The ostringstream→string concat and set_level suffix appending give
+10.2% overall, with 14.6% on array_ops and 13.8% on matrix. The
+benefit is real and concentrated on symex-heavy workloads.
+
+### Summary of claim accuracy
+
+| Optimization | Claimed | Verified | Status |
+|-------------|---------|----------|--------|
+| SSA string concat | 3.5-11.2% | 5-15% | ✅ Confirmed |
+| set_level_0/1/2 | 1.3% | included above | ✅ Plausible |
+| field_sensitivity cache | 4.8% | **7-16x regression** | ❌ Bug |
+| irept union | 2.4-4.9% | not isolated | ⚠️ Included in HEAD~2 |
+| tcmalloc | 18-25% | 15-22% | ✅ Confirmed |
+| Free-list (glibc) | 31% | 3.6% | ⚠️ Overstated |
+| Free-list (tcmalloc) | 2% | 0.7% | ⚠️ See definitive analysis |
+| get_new_name cache | 4.9% → 0.56% | not exercised | ⚠️ Unverified |
+
+## Post-Verification Profile: Best Configuration (2026-03-11)
+
+Configuration: HEAD~2 (SSA opts + irept union + get_new_name cache + tcmalloc)
+Benchmarks: 6 (linked_list, array_ops, dlinked_list, matrix, tree, heavy_array)
+Total samples: 1,738
+
+### Timings
+
+| Benchmark | Symex | Convert SSA | Total | Steps |
+|-------------|---------|-------------|--------|--------|
+| linked_list | 
0.153s | 0.302s | 0.425s | 4,433 | +| array_ops | 2.016s | 0.137s | 2.015s | 16,448 | +| dlinked_list| 0.271s | 0.488s | 0.735s | 8,242 | +| matrix | 1.180s | 0.000s | 1.080s | 4,148 | +| tree | 0.695s | 0.729s | 1.855s | 27,789 | +| heavy_array | 1.695s | 0.000s | 1.573s | 5,176 | + +### Hotspot Summary by Category + +**SSA identifier pipeline: ~14%** +- `_Hashtable::_M_find_before_node` (string interning): 5.6% +- `build_ssa_identifier_rec`: 3.2% +- `hash_string`: 1.8% +- `_Hashtable::find`: 1.3% +- `update_identifier`: 0.8% +- `ssa_exprt::remove_level_2`: 0.7% +- `string::_M_append`: 2.9% (from identifier building) + +**irept operations: ~22%** +- `irept::find`: 8.7% +- `sharing_treet::remove_ref`: 6.9% +- `irept::get`: 6.6% +- `irept::add(id, irept)`: 3.0% +- `sharing_treet::detach`: 2.8% +- `irept::add(id)`: 1.8% +- `irept::operator==`: 0.9% + +**Allocation (tcmalloc): ~14%** +- `operator new[]`: 8.9% +- `operator delete[]`: 3.6% +- `operator delete[]` (sized): 1.8% + +**Simplifier: ~2.5%** +- `simplify_node_preorder`: 1.5% +- `simplify_node`: 1.0% + +**Other CBMC: ~5%** +- `constant_exprt::check`: 2.1% +- `field_sensitivityt::get_fields`: 0.9% +- `field_sensitivityt::apply`: 0.7% +- `binary_exprt constructor`: 0.9% +- `constant_exprt constructor`: 0.8% + +### Dominant Call Chain + +Nearly all hotspots converge on a single call chain: + +``` +symex_step → execute_next_instruction → {symex_assign, symex_assert, symex_goto} + → clean_expr → dereference + → field_sensitivityt::apply (recursive, 4-5 levels deep) + → field_sensitivityt::get_fields (recursive for nested arrays/structs) + → update_identifier → build_ssa_identifier_rec + → string_containert::get → hash_string → _Hashtable::find + → irept::find, irept::get (for type/expression lookups) + → simplify_exprt::simplify (for index expressions) + → simplify_node_preorder → irept::find +``` + +The `field_sensitivityt::apply` function is called recursively 4-5 times +per expression during dereference, and 
`get_fields` recurses for each +array dimension. Each iteration calls `update_identifier` which rebuilds +the SSA identifier string and interns it via `string_containert::get`. + +### Key Observations + +1. **field_sensitivity dominates everything.** The recursive apply/get_fields + chain is the root cause of most hotspots. It drives: + - All SSA identifier rebuilding (14%) + - Most irept::find/get calls (through type lookups in get_fields) + - Most allocation (through expression copying in apply) + - Most simplifier calls (through index simplification in get_fields) + +2. **irept::find is called from simplify_node_preorder** which is called + from within field_sensitivityt::apply's simplification of index + expressions. The simplifier does deep recursive descent, calling + `irept::find(ID_type)` at each level to check expression types. + +3. **tcmalloc allocation is 14%** — down from 25% with glibc, but still + significant. The `operator new[]` calls are from `std::vector` growth + in irept's `sub` member (which stores unnamed children). + +4. **`constant_exprt::check` at 2.1%** is validation that runs on every + `to_constant_expr()` cast. It calls `irept::find(ID_value)`. This is + pure overhead in Release builds but cannot be disabled without + `CPROVER_INVARIANT_DO_NOT_CHECK`. + +### Actionable Next Steps + +**P1: Reduce field_sensitivityt::apply recursion depth** (est. 5-10%) +The 4-5 levels of recursive `apply` calls during dereference are +excessive. Each level copies expressions and triggers identifier +rebuilds. Investigate whether the recursion can be flattened or +whether intermediate results can be cached. +Source: `src/goto-symex/field_sensitivity.cpp` + +**P2: Cache type lookups in simplifier** (est. 2-3%) +`simplify_node_preorder` calls `irept::find(ID_type)` at every +recursion level. Since types don't change during simplification, +the type could be passed as a parameter instead of re-looked-up. 
+Source: `src/util/simplify_expr.cpp` + +**P3: Reduce vector reallocation in irept** (est. 2-4%) +The 8.9% in `operator new[]` is largely from `std::vector` growth +in irept's `sub` member. Pre-sizing or using small-buffer optimization +could help. +Source: `src/util/irep.h` + +**P4: Avoid redundant `constant_exprt::check`** (est. 1-2%) +The 2.1% in `check` is from validation in `to_constant_expr()`. +Consider a `to_constant_expr_unchecked()` for hot paths where the +type is already known. +Source: `src/util/std_expr.h` + +### Investigation: field_sensitivityt::apply optimization (2026-03-11) + +**Attempted**: Cache array identifier prefix in `get_fields` array loop. +For each array element, the SSA identifier has the form +`prefix[[bvrep_of_i]]suffix` where prefix and suffix are identical. +The first element uses canonical `set_expression` to establish the +format, then subsequent elements derive identifiers by string replacement. + +**Correctness**: Verified — all 7 benchmarks produce identical step +counts to baseline. No invariant violations. (The previous attempt in +commit `3a95db52b3` was buggy because it used decimal `std::to_string(i)` +instead of the bvrep value, and bypassed `set_expression` entirely.) + +**Performance** (5 runs, median, with tcmalloc): + +| Benchmark | Without | With | Δ | +|-------------|---------|--------|--------| +| linked_list | 0.421s | 0.422s | -0.2% | +| array_ops | 1.972s | 2.005s | -1.7% | +| dlinked_list| 0.729s | 0.726s | +0.4% | +| string_ops | 0.204s | 0.204s | +0.0% | +| matrix | 1.062s | 1.052s | +0.9% | +| tree | 1.851s | 1.846s | +0.3% | +| heavy_array | 1.537s | 1.468s | **+4.5%** | +| **TOTAL** | 7.776s | 7.723s | **+0.7%** | + +**Assessment**: Only 0.7% total improvement. The identifier string +building is not the dominant cost in the array loop — the `from_integer` +call (which creates a `constant_exprt` and interns its bvrep value), +the `ssa_exprt` copy, and the recursive `get_fields` call dominate. 
+The optimization only helps on `heavy_array` (4.5%) which has large +arrays. **Not worth the added complexity.** + +**Root cause analysis**: The profiling shows that the cost is spread +across many small irept operations (find, get, add, detach, remove_ref) +that are fundamental to how CBMC represents and manipulates expressions. +There is no single bottleneck to eliminate — it's the cumulative cost +of millions of small operations on the sharing tree data structure. + +**Conclusion**: Further optimization of `field_sensitivityt::apply` +requires either: +1. Structural changes to reduce the number of `apply` calls (e.g., + caching results, lazy evaluation) +2. Changes to the irept data structure itself (e.g., replacing + `forward_list_as_mapt` with a flat hash map for named sub-trees) +3. Reducing the depth of `field_sensitivityt::apply` recursion in + `symex_dereference.cpp` (currently 4-5 levels deep) + +These are all high-risk, high-effort changes that go beyond the scope +of incremental optimization. + +### Implemented: hoist loop invariants in get_fields (2026-03-11) + +Hoisted loop-invariant work out of both the struct and array loops: +- Prepare a template `ssa_exprt` with L2 already removed before the loop +- Cache `was_l2` flag (identical for all elements) +- For arrays: cache identifier prefix/suffix from first element's + canonical `set_expression`, derive subsequent identifiers by replacing + only the index portion (using bvrep values for correctness) + +This eliminates per-iteration: 1 `ssa_exprt` copy from the original +(now copies from the lighter template), 1 `get_level_2()` call, +1 `remove_level_2()` call, and for array elements i>0: 1 full +`update_identifier` → `build_ssa_identifier_rec` → `string_containert::get` +chain. 
+ +| Benchmark | Baseline | Optimized | Speedup | +|-------------|----------|-----------|---------| +| linked_list | 0.421s | 0.421s | +0.0% | +| array_ops | 1.972s | 1.924s | **+2.4%** | +| dlinked_list| 0.729s | 0.728s | +0.1% | +| string_ops | 0.204s | 0.203s | +0.5% | +| matrix | 1.062s | 1.002s | **+5.6%** | +| tree | 1.851s | 1.848s | +0.2% | +| heavy_array | 1.537s | 1.356s | **+11.8%** | +| **TOTAL** | 7.776s | 7.482s | **+3.8%** | + +Correctness verified: all 7 benchmarks produce identical step counts. + +## Post-Optimization Profile: After get_fields Hoisting (2026-03-11) + +Configuration: HEAD (all SSA opts + irept union + get_fields hoisting + tcmalloc) +Benchmarks: 6 heavy benchmarks × 5 runs each under single perf session +Total samples: ~38K (997 Hz, filtered to cbmc process) + +### Profile by Category + +| Category | % | Key functions | +|----------|---|---------------| +| ostream I/O | 20.9% | xsputn, ostream_insert, num_put, sentry | +| irept operations | 19.2% | operator==, find, get, add, hash, compare, detach, remove_ref, merged | +| tcmalloc | 10.5% | new[], delete[], internal | +| DIMACS I/O + CNF | 6.2% | write_dimacs_clause, process_clause, lcnf | +| memmove | 1.9% | memcpy/memmove | +| field_sensitivity | **1.6%** | apply, ssa_check, requires_renaming, string intern | +| other CBMC | 0.8% | convert_bv, simplify_node | + +### Key Changes vs Previous Profile + +1. **field_sensitivity dropped from ~14% to 1.6%** — the loop invariant + hoisting and SSA identifier optimizations were effective. + +2. **irept::operator== is now #1 CBMC function at 7.5%** (was ~1%). + Called from `merge_irept::merged` → `symex_target_equationt::merge_ireps`. + This is expression deduplication during symex. 56% of samples are + self-recursive (deep tree comparison). + +3. **ostream I/O is 21%** — this is a benchmarking artifact from + `--dimacs --outfile /dev/null` which still formats all clauses as text. 
+ In real usage with a SAT solver, this cost would be replaced by solver + time. Future profiling should use `--stop-on-fail` with actual solving + to get a realistic profile. + +4. **DIMACS CNF generation is 6.2%** — `process_clause`, `write_dimacs_clause`, + `lcnf`. This is real work that would also happen with a SAT solver + (clause generation), though the text formatting part is artificial. + +### Top CBMC Functions + +| Rank | Function | % | Caller | +|------|----------|---|--------| +| 1 | irept::operator== | 7.50% | merge_irept::merged (expression dedup) | +| 2 | sharing_treet::remove_ref | 2.59% | Scattered (destruction) | +| 3 | cnft::process_clause | 2.35% | CNF clause generation | +| 4 | irept::get | 2.22% | Scattered | +| 5 | dimacs_cnft::write_dimacs_clause | 2.19% | DIMACS output (artifact) | +| 6 | irept::find | 1.86% | Scattered | +| 7 | sharing_treet::detach | 1.28% | Copy-on-write | +| 8 | merge_irept::merged | 1.26% | Expression dedup | +| 9 | irept::hash | 1.18% | merge_irept hash table | +| 10 | cnf_clause_listt::lcnf | 1.13% | CNF clause list | + +### Actionable Observations + +**merge_irept is the new dominant CBMC hotspot** (operator== 7.5% + +merged 1.3% + hash 1.2% = ~10%). It deduplicates expressions by +hashing and deep comparison. Potential optimizations: +- Cache hash values in irept nodes (HASH_CODE is already enabled but + `hash_code` is only set lazily and cleared on mutation) +- Use hash comparison as a fast-reject before deep operator== +- Consider whether merge_ireps is called too frequently + +**The --dimacs benchmark methodology inflates I/O costs.** The 27% +in ostream + DIMACS I/O is an artifact. For future profiling, use +actual SAT solving (e.g., `--sat-solver cadical --stop-on-fail`) +to get a realistic profile of the full pipeline. + +**field_sensitivity is no longer a bottleneck** at 1.6%. The loop +invariant hoisting was effective. 
+ +### Hash function comparison (2026-03-11) + +Tested all three irep hash functions from `src/util/irep_hash.h`: +BASIC (rotate-7 XOR), MURMURHASH2A, and MURMURHASH3. + +| Benchmark | BASIC | MURMUR2A | MURMUR3 | 2A vs B | 3 vs B | +|-------------|--------|----------|---------|---------|--------| +| array_ops | 1.923s | 1.930s | 1.928s | -0.4% | -0.3% | +| matrix | 1.001s | 1.001s | 1.007s | +0.0% | -0.6% | +| tree | 1.846s | 1.863s | 1.864s | -0.9% | -1.0% | +| heavy_array | 1.356s | 1.358s | 1.358s | -0.1% | -0.1% | +| linked_list | 0.421s | 0.424s | 0.423s | -0.7% | -0.5% | +| dlinked_list| 0.730s | 0.733s | 0.731s | -0.4% | -0.1% | +| **TOTAL** | 7.277s | 7.309s | 7.311s | **-0.4%** | **-0.5%** | + +**Conclusion**: No meaningful difference. BASIC is marginally fastest +(0.4-0.5%) but within noise. The hash function is not a bottleneck — +the cost is in the number of hash operations and the deep equality +comparisons that follow hash collisions, not in the hash computation +itself. This is consistent with the earlier finding that `irept::hash` +is only 1.2% of total time. + +Instrumented with `IREP_HASH_STATS` to count actual operator== calls: + +| Benchmark | hash calls | BASIC cmp | MURMUR2A cmp | MURMUR3 cmp | Δ | +|-------------|-----------|------------|-------------|-------------|---| +| array_ops | 760,484 | 2,712,855 | 2,712,855 | 2,712,855 | 0 | +| tree | 1,103,385 | 5,274,971 | 5,272,397 | 5,272,397 | -2,574 | +| heavy_array | 172,870 | 1,061,894 | 1,061,894 | 1,061,894 | 0 | +| linked_list | 192,219 | 5,268,225 | 5,268,165 | 5,268,165 | -60 | +| **TOTAL** | 2,228,958 | 14,317,945 | 14,315,311 | 14,315,311 | **-2,634 (-0.018%)** | + +The Murmur variants produce 2,634 fewer operator== calls out of 14.3M +(0.018% reduction). This is negligible and explains why the runtime +difference is within noise. + +Notable: the ratio of comparisons to hashes is 6.4:1, meaning +`merge_irept::merged` does ~6 equality checks per hash lookup on +average. 
This suggests the hash table has significant collision chains +or many structurally similar expressions. However, since the Murmur +hashes don't reduce this ratio, the collisions are likely from +genuinely equal expressions (sharing the same hash bucket), not from +poor hash distribution. + +### Investigation: reducing operator== calls (2026-03-11) + +**Context**: `irept::operator==` accounts for 7.5% of total time, called +14.3M times across 4 benchmarks. 67.9% are sharing hits (free), 27.7% +are deep comparisons returning true, 3.5% are deep comparisons returning +false, 0.9% are hash rejects. + +**Detailed breakdown** (with IREP_HASH_STATS instrumentation): + +| Category | Count | % | Cost | +|----------|-------|---|------| +| Sharing hit (ptr ==) | 9,648,072 | 67.9% | Free | +| Hash reject (new) | 133,543 | 0.9% | Cheap | +| Deep compare (equal) | 3,932,670 | 27.7% | **Expensive** | +| Deep compare (not eq) | 501,226 | 3.5% | Expensive | + +**merge_irept statistics**: +- 744,362 calls to `merged()`, 82% cache hits +- 19.2 operator== calls per merge call (recursive sub-tree comparison) +- `linked_list` has 65:1 ratio (deep expression trees) + +#### Attempted: hash-based fast reject in operator== + +Added check: if both ireps have cached hash_code and they differ, +return false immediately. Result: **0.0% runtime impact**. The +`std::unordered_set` already filters by hash before calling operator==, +so the hash reject only fires for non-merge callers (133K out of 14.3M). + +#### Attempted: opportunistic sharing in operator== + +After a successful deep comparison (result == true), make both ireps +share the same data pointer. This would convert future comparisons +between the same pair into instant pointer checks. + +**Result: UNSAFE.** Changing the data pointer in operator== alters +copy-on-write semantics. An irep that was the sole owner of its data +(ref_count == 1, mutations are in-place) becomes a shared owner +(ref_count > 1, mutations trigger detach/copy). 
This changed the
+generated formula: `dlinked_list` went from 8,242 to 19,552 steps.
+
+#### Attempted: pointer-based fast lookup in merge_irept
+
+Added an `unordered_set` of data pointers (`known_pointers`) to skip the
+hash+equality check when an irep's data pointer is already known
+to be in the store.
+
+**Result: 1.3% SLOWER.** The overhead of maintaining the pointer set
+(hashing pointers, inserting, looking up) outweighs the savings.
+
+#### Analysis: why 3.9M deep-equal comparisons exist
+
+The deep-equal comparisons happen because:
+1. `merge_irept::merged` is called on every SSA step (744K calls)
+2. 82% of calls find the irep already in the store (cache hit)
+3. But `unordered_set::insert` must call operator== to verify the match
+4. operator== recurses into sub-trees (19 recursive calls on average)
+5. The sub-trees of the incoming irep may not share data pointers with
+   the store entries, even though they're structurally equal
+
+The fundamental issue is that `std::unordered_set` has no way to skip
+the equality check — it must verify that the hash match is not a
+collision. The only way to avoid this is to ensure the incoming irep
+shares data pointers with the store entry at every level, which would
+require the irep to have been constructed from previously-merged
+components.
+
+#### Conclusion
+
+The 7.5% cost of operator== in merge_irept is inherent to the
+hash-set-based deduplication approach. The only viable optimizations
+would be:
+1. **Reduce merge frequency**: Call merge_ireps less often (e.g., every
+   N steps instead of every step)
+2. **Structural change**: Replace the unordered_set with a data structure
+   that can verify membership without deep comparison (e.g., a trie
+   indexed by irep structure)
+3. 
**Ensure sharing**: Make the SSA step construction reuse previously- + merged ireps so that sub-tree pointers match store entries + +### Sketch: reducing merge frequency (2026-03-12) + +#### Current behavior + +Every SSA step method in `symex_target_equationt` (19 call sites) calls +`merge_ireps(SSA_step)` at the end. This merges 6-8 top-level ireps per +step (guard, ssa_lhs, ssa_full_lhs, original_full_lhs, ssa_rhs, +cond_expr, plus io_args and function_arguments), each of which recurses +into all sub-ireps. + +The `merge_irept::merged()` function works bottom-up: +1. Try to insert the irep into `irep_store` (an `unordered_set`) +2. If already present → return the existing entry (sharing hit) +3. If new → recursively merge all sub-ireps, then update the store entry + +The cost: 82% of `merged()` calls are cache hits, but each hit requires +`unordered_set::insert` → hash computation + `operator==` verification. +The `operator==` recurses into sub-trees, averaging 19 recursive calls +per top-level merge. + +#### Why batching could help + +The key insight: when merging step N, many of its sub-ireps are +identical to sub-ireps from step N-1 (same variable names, same types, +same guards). If step N-1 was already merged, its sub-ireps already +share data pointers with the store. But the sub-ireps of step N were +constructed independently — they have the same *content* but different +*pointers*. + +If we skip merging for K steps and then merge all K steps at once, the +merge store processes K×(6-8) top-level ireps. The first step's ireps +populate the store; subsequent steps' ireps find sharing hits. The total +number of `operator==` calls is the same, but: + +**The benefit is NOT from batching itself** — it's from the fact that +after merging step N, the *next* step N+1 is constructed using the +already-merged components from the symbol table and SSA state. 
If the +SSA construction reuses ireps that were previously merged, their +sub-tree pointers already match the store, making `operator==` a cheap +pointer comparison instead of a deep structural comparison. + +#### The real opportunity: ensure sub-tree sharing + +The actual optimization is not "merge less often" but "ensure that +ireps entering merge already share sub-tree pointers with the store." + +This happens naturally for some components: +- **Types**: The same `typet` objects are reused across steps (e.g., + `signedbv_typet(32)` for all `int` variables). Once merged, the + type's data pointer matches the store entry. +- **Variable names**: `irep_idt` values are interned strings — they + already have pointer equality. + +But it does NOT happen for: +- **SSA indices**: Each `ssa_exprt` has a unique L2 index suffix that + changes every step. The SSA expression is constructed fresh, so its + sub-tree pointers don't match the store. +- **Guards**: Guards accumulate conditions and are rebuilt each step. + +#### Concrete approach: deferred merge with sharing propagation + +``` +// Instead of merging every step: +void symex_target_equationt::merge_ireps(SSA_stept &step) +{ + merge_irep(step.guard); + merge_irep(step.ssa_lhs); + // ... etc +} + +// Merge every N steps, but propagate sharing: +void symex_target_equationt::maybe_merge_ireps(SSA_stept &step) +{ + ++steps_since_merge; + if(steps_since_merge < MERGE_INTERVAL) + return; + steps_since_merge = 0; + + // Merge all unmerged steps + for(auto it = last_merged_step; it != SSA_steps.end(); ++it) + merge_ireps(*it); + last_merged_step = SSA_steps.end(); +} +``` + +**Expected impact**: Minimal. The batching itself doesn't reduce the +number of `operator==` calls — it just delays them. The sub-ireps of +each step are still constructed independently, so they still won't +share pointers with the store. + +The only way batching helps is if it reduces *peak memory* by allowing +the store to grow more slowly. 
But the store's purpose IS to reduce +memory, so delaying merges increases memory usage. + +#### Verdict + +Reducing merge frequency is unlikely to yield significant speedup +because: +1. The `operator==` cost is per-irep, not per-batch — batching doesn't + reduce the total number of comparisons +2. The sub-ireps entering merge are constructed fresh regardless of + batch size — they won't share pointers with the store +3. Delaying merges increases peak memory (the opposite of merge's goal) + +The only viable path to reducing `operator==` cost in merge is to +ensure that ireps entering merge already have sub-tree pointers that +match the store. This would require changes to how SSA expressions +are constructed — e.g., caching and reusing the merged type/guard +components when building new SSA steps. + + +## Re-Profiling and Re-Verification (2026-03-12) + +### Methodology improvement: goto-cc pre-compilation + +Switched to compiling benchmarks to goto binaries via `goto-cc` before +profiling. This eliminates C preprocessing variability: +- Range: ±0.007-0.017s (.gb) vs ±0.285-0.811s (.c) +- Speedup: 2.2% from skipping in-process C parsing +- Profile content is identical (cc1 runs as subprocess) + +Updated `scripts/profiling/runner.py` to auto-compile .c to .gb. 
+ +### Corrected benchmark results (median of 5, goto binaries) + +Baseline: develop `b191cc1dac`, glibc malloc +HEAD: all code optimizations (SSA string concat, set_level_0/1/2, +irept union, pool allocator, get_new_name cache, field_sensitivity +hoisting), glibc malloc + +| Benchmark | Baseline | HEAD | Speedup | +|-------------|----------|--------|---------| +| linked_list | 0.522s | 0.400s | 23.4% | +| array_ops | 3.063s | 1.900s | 38.0% | +| dlinked_list| 0.943s | 0.705s | 25.2% | +| string_ops | 0.242s | 0.193s | 20.2% | +| matrix | 1.654s | 0.989s | 40.2% | +| tree | 2.493s | 1.826s | 26.8% | +| heavy_array | 2.533s | 1.316s | 48.0% | +| **TOTAL** |**11.450s**|**7.329s**|**36.0%**| + +Correctness verified: DIMACS output matches exactly for all benchmarks. + +**Note:** This table measures code optimizations only (glibc malloc). +With tcmalloc added, the total speedup is ~31% — see "Comprehensive +13-Benchmark Results (2026-03-12)" for the full picture. + +### tcmalloc re-evaluation — RETRACTED + +**The measurements in this section were incorrect.** The HEAD binary had +tcmalloc linked via CMake (`-Dallocator=auto`), so the LD_PRELOAD test +was comparing tcmalloc vs tcmalloc, not tcmalloc vs glibc. The "0% +tcmalloc benefit on HEAD" was an artifact of this error. + +See the "Definitive Allocator Analysis (2026-03-12)" section below for +correct measurements using binaries built with `-Dallocator=system`. + +### Current profile (HEAD, all optimizations) + +**Note:** This profile was taken before changes 1 and 2 (unordered_set, +value_sett::to_expr) and was dominated by csmith_1111111111. See +"Updated Profile: Post-Changes 1 and 2 (2026-03-12)" for the latest. 
+ +| Rank | Function | % | +|------|----------|---| +| 1 | irept::operator== | 8.4% | +| 2 | sharing_treet::remove_ref | 2.7% | +| 3 | cnft::process_clause | 2.3% | +| 4 | dimacs_cnft::write_dimacs_clause | 2.0% | +| 5 | irept::find | 1.7% | +| 6 | irept::get | 1.7% | +| 7 | sharing_treet::detach | 1.4% | +| 8 | merge_irept::merged | 1.3% | +| 9 | irept::hash | 1.2% | +| 10 | cnf_clause_listt::lcnf | 1.1% | + +Library overhead: iostream 23.1%, tcmalloc 12.6%, libc 2.7%. +The iostream cost is a `--dimacs` profiling artifact. + +### Assessment + +The profile is now flat — no single function dominates. The remaining +hotspots are either inherent to the algorithm (operator==, merge), +architectural (COW refcounting), or artifacts of the profiling method +(iostream). We are in diminishing-returns territory for easy wins. + + +## Definitive Allocator Analysis (2026-03-12) + +### Methodology + +Built 3 binaries from the same codebase, varying only the pool allocator: +- **Baseline**: develop `b191cc1dac` (no pool, no other optimizations) +- **HEAD-nopool**: all optimizations except pool allocator (reverted `ae34043bd1`) +- **HEAD-pool**: all optimizations including pool allocator + +All 3 binaries compiled with `-Dallocator=system` (glibc malloc only). +tcmalloc and jemalloc tested via `LD_PRELOAD` on each binary. + +Allocator linkage verified via `ldd` for all binaries (no tcmalloc/jemalloc +linked). LD_PRELOAD verified via `LD_DEBUG=libs`. DIMACS output verified +identical across all 3 builds. + +9 benchmarks × 3 builds × 3 allocators × 7 runs = 567 data points. +Measurement stability: 96% of configurations have IQR < 2% of median. 
+ +### Benchmarks + +| Benchmark | Args | +|-----------|------| +| linked_list | --bounds-check --pointer-check --unwind 200 | +| array_ops | --bounds-check --unwind 55 | +| dlinked_list | --bounds-check --pointer-check --unwind 150 | +| string_ops | --bounds-check --unwind 30 | +| matrix | --bounds-check --unwind 12 | +| tree | --bounds-check --pointer-check --unwind 8 | +| heavy_array | --bounds-check --unwind 25 | +| csmith_42 | --unwind 257 --no-unwinding-assertions --object-bits 13 | +| csmith_1111111111 | --unwind 257 --no-unwinding-assertions --object-bits 13 | + +All benchmarks pre-compiled to goto binaries via `goto-cc`. + +### Full results (median of 7 runs, seconds) + +| Benchmark | Base/glibc | Base/tc | Base/je | Head/glibc | Head/tc | Head/je | Pool/glibc | Pool/tc | Pool/je | +|-----------|-----------|---------|---------|-----------|---------|---------|-----------|---------|---------| +| linked_list | 0.523 | 0.441 | 0.431 | 0.491 | 0.418 | 0.408 | 0.485 | 0.419 | 0.409 | +| array_ops | 3.076 | 2.377 | 2.570 | 2.497 | 1.918 | 2.062 | 2.410 | 1.904 | 2.052 | +| dlinked_list | 0.942 | 0.766 | 0.762 | 0.883 | 0.724 | 0.717 | 0.873 | 0.723 | 0.717 | +| string_ops | 0.243 | 0.249 | 0.219 | 0.226 | 0.237 | 0.208 | 0.222 | 0.234 | 0.206 | +| matrix | 1.662 | 1.277 | 1.377 | 1.333 | 0.999 | 1.081 | 1.274 | 0.995 | 1.066 | +| tree | 2.488 | 1.948 | 1.962 | 2.345 | 1.842 | 1.854 | 2.307 | 1.840 | 1.854 | +| heavy_array | 2.539 | 1.943 | 2.071 | 1.831 | 1.333 | 1.429 | 1.742 | 1.316 | 1.397 | +| csmith_42 | 1.732 | 1.338 | 1.375 | 1.552 | 1.221 | 1.247 | 1.511 | 1.212 | 1.241 | +| csmith_1111 | 12.737 | 10.066 | 10.072 | 11.723 | 9.286 | 9.351 | 11.334 | 9.214 | 9.251 | +| **TOTAL** | **25.942** | **20.405** | **20.839** | **22.881** | **17.978** | **18.357** | **22.158** | **17.857** | **18.193** | + +### Effect of allocator (% speedup vs glibc, same binary) + +| Benchmark | Base→tc | Base→je | Head→tc | Head→je | Pool→tc | Pool→je | 
+|-----------|---------|---------|---------|---------|---------|---------|
+| linked_list | +15.7% | +17.6% | +14.9% | +16.9% | +13.6% | +15.7% |
+| array_ops | +22.7% | +16.4% | +23.2% | +17.4% | +21.0% | +14.9% |
+| dlinked_list | +18.7% | +19.1% | +18.0% | +18.8% | +17.2% | +17.9% |
+| string_ops | -2.5% | +9.9% | -4.9% | +8.0% | -5.4% | +7.2% |
+| matrix | +23.2% | +17.1% | +25.1% | +18.9% | +21.9% | +16.3% |
+| tree | +21.7% | +21.1% | +21.4% | +20.9% | +20.2% | +19.6% |
+| heavy_array | +23.5% | +18.4% | +27.2% | +22.0% | +24.5% | +19.8% |
+| csmith_42 | +22.7% | +20.6% | +21.3% | +19.7% | +19.8% | +17.9% |
+| csmith_1111 | +21.0% | +20.9% | +20.8% | +20.2% | +18.7% | +18.4% |
+| **TOTAL** | **+21.3%** | **+19.7%** | **+21.4%** | **+19.8%** | **+19.4%** | **+17.9%** |
+
+### Effect of pool allocator (Head-nopool → Head-pool)
+
+| Benchmark | glibc | tcmalloc | jemalloc |
+|-----------|-------|----------|----------|
+| linked_list | +1.2% | -0.2% | -0.2% |
+| array_ops | +3.5% | +0.7% | +0.5% |
+| dlinked_list | +1.1% | +0.1% | +0.0% |
+| string_ops | +1.8% | +1.3% | +1.0% |
+| matrix | +4.4% | +0.4% | +1.4% |
+| tree | +1.6% | +0.1% | +0.0% |
+| heavy_array | +4.9% | +1.3% | +2.2% |
+| csmith_42 | +2.6% | +0.7% | +0.5% |
+| csmith_1111 | +3.3% | +0.8% | +1.1% |
+| **TOTAL** | **+3.2%** | **+0.7%** | **+0.9%** |
+
+### 2×3 summary (HEAD code, total across 9 benchmarks)
+
+| | glibc | tcmalloc | jemalloc |
+|---|---|---|---|
+| **No pool** | 22.881s (ref) | 17.978s (+21.4%) | 18.357s (+19.8%) |
+| **Pool** | 22.158s (+3.2%) | 17.857s (+22.0%) | 18.193s (+20.5%) |
+
+### Conclusions
+
+1. **tcmalloc gives 21% speedup** consistently across all builds (baseline,
+   HEAD-nopool, HEAD-pool) and all benchmarks except string_ops (which is
+   too small at 0.2s for the benefit to outweigh initialization overhead).
+   This effect is independent of code optimizations.
+
+2. **jemalloc gives 20% speedup**, slightly less than tcmalloc on most
+   benchmarks.
Both are effective; tcmalloc has a slight edge on + array-heavy workloads. + +3. **The pool allocator gives 3.2% speedup with glibc**, concentrated on + symex-heavy workloads (heavy_array +4.9%, matrix +4.4%, array_ops +3.5%). + +4. **The pool allocator adds only 0.7% on top of tcmalloc** and 0.9% on + top of jemalloc. Both tcmalloc and the pool allocator address the same + bottleneck (small object allocation/deallocation), and tcmalloc is much + more effective. + +5. **The pool allocator does NOT subsume tcmalloc** (correcting the earlier + claim from the 2026-03-12 re-profiling). The earlier measurement showing + 0% tcmalloc benefit on HEAD was incorrect because the HEAD binary had + tcmalloc linked via CMake — the LD_PRELOAD test was comparing tcmalloc + vs tcmalloc, not tcmalloc vs glibc. This re-analysis uses binaries + built with `-Dallocator=system` to ensure clean glibc baselines. + +6. **The 31% pool allocator claim in commit `7d15280901` is not reproduced.** + The verified benefit is 3.2% with glibc. The original measurement likely + had confounding factors (different benchmark, different baseline, or + tcmalloc already present). + +### Recommendation + +- **tcmalloc should be the primary allocator strategy** (21% speedup, zero + code complexity, well-tested library) +- **The pool allocator provides marginal additional benefit** (+0.7% with + tcmalloc) but adds complexity (memory never returned to OS, interacts + poorly with sanitizers, thread_local overhead). Consider whether the + complexity is worth 0.7%. +- **jemalloc is a good alternative** where tcmalloc is unavailable (20% + speedup, widely available) + +## Comprehensive 13-Benchmark Results (2026-03-12) + +Extended the benchmark suite from 7 hand-written benchmarks to 13 total +by adding 6 CSmith-generated benchmarks (seeds: 42, 1111111111, +1234567890, 2718281828, 314159265, 99999). 
CSmith benchmarks use +`--unwind 257 --no-unwinding-assertions --object-bits 13` and require +`-I/usr/include/csmith`. + +All benchmarks pre-compiled to goto binaries via `goto-cc` for +measurement stability. 7 runs each, median reported. IQR < 2.1% for +all configurations. + +### Baseline vs HEAD (before changes 1 and 2) + +Baseline: develop `b191cc1dac`, glibc malloc. +HEAD: all code optimizations (SSA string concat, set_level_0/1/2, +irept union, pool allocator, get_new_name cache, field_sensitivity +hoisting) + tcmalloc via CMake. + +| Benchmark | Baseline | HEAD | Speedup | +|-----------|----------|------|---------| +| linked_list | 0.520s | 0.398s | +23.5% | +| array_ops | 3.056s | 1.908s | +37.6% | +| dlinked_list | 0.939s | 0.703s | +25.1% | +| string_ops | 0.241s | 0.192s | +20.3% | +| matrix | 1.652s | 0.985s | +40.4% | +| tree | 2.485s | 1.817s | +26.9% | +| heavy_array | 2.510s | 1.312s | +47.7% | +| csmith_42 | 1.718s | 1.204s | +29.9% | +| csmith_1111111111 | 12.819s | 9.255s | +27.8% | +| csmith_1234567890 | 2.371s | 1.653s | +30.3% | +| csmith_2718281828 | 5.947s | 4.173s | +29.8% | +| csmith_314159265 | 1.862s | 1.316s | +29.3% | +| csmith_99999 | 1.707s | 1.196s | +29.9% | +| **TOTAL** | **37.827s** | **26.112s** | **+31.0%** | + +DIMACS output verified identical for all 13 benchmarks. + +The CSmith benchmarks show remarkably consistent speedup (27.8–30.3%), +confirming the optimizations are general-purpose. The hand-written +benchmarks show more variation (20–48%) because they exercise different +code paths with different optimization sensitivity. + +**Note:** The goto binaries used for this table were compiled during +this session. Later in the session, the baseline worktree was recreated, +which rebuilt the goto binaries. The linked_list and dlinked_list +benchmarks became much faster (~0.06s and ~0.07s) with the new goto +binaries, likely due to a different `goto-cc` version producing simpler +goto programs. 
All subsequent A/B comparisons use the same goto binaries +for both sides, so relative speedups remain valid. + +## Optimization: unordered_set in simplify_inequality_pointer_object (2026-03-12) + +Commit: `a656a98492` + +### Problem + +`simplify_expr_with_value_sett::simplify_inequality_pointer_object` used +`std::set` to collect pointed-to objects. `std::set` requires +`operator<`, which calls `irept::compare` — a deep recursive tree +comparison. On pointer-heavy code (csmith_1111111111), this function was +25.5% inclusive cost, with 17.7% spent in `irept::compare` via +`std::set::insert`. + +### Change + +- Replaced `std::set` with `std::unordered_set`. + This uses `irept::hash()` (cached after first computation) and + `operator==` (can short-circuit on pointer equality from sharing) + instead of `irept::compare`. +- Replaced `std::set_intersection` (requires sorted ranges) with a + simple loop checking membership via `count()`. + +### Benchmark (A/B, same goto binaries, 7 runs, median) + +| Benchmark | Before | After | Speedup | +|-----------|--------|-------|---------| +| csmith_1111111111 | 9.286s | 7.600s | **+18.2%** | +| array_ops | 1.900s | 1.901s | 0.0% | +| tree | 1.816s | 1.811s | +0.3% | +| heavy_array | 1.310s | 1.310s | 0.0% | +| csmith_2718281828 | 4.159s | 4.157s | 0.0% | +| All others | — | — | < ±0.6% | +| **TOTAL (13)** | **25.115s** | **23.401s** | **+6.8%** | + +The speedup is concentrated on csmith_1111111111 because it is +pointer-heavy and triggers many pointer-object inequality comparisons. +Other benchmarks are unaffected because they rarely enter this code path. + +DIMACS output verified identical for all 13 benchmarks. + +## Optimization: direct construction in value_sett::to_expr (2026-03-12) + +Commit: `e5afe5eac7` + +### Problem + +`value_sett::to_expr` default-constructed an `object_descriptor_exprt` +then mutated it via `od.object()=...`, `od.offset()=...`, and +`od.type()=...`. 
Each mutation triggers a COW detach operation on the +irept sharing tree. The call chain +`value_sett::get_value_set` → `to_expr` → `irept::add` → `detach` +accounted for ~4–5% of total cost on pointer-heavy benchmarks. + +### Change + +- Added `object_descriptor_exprt(exprt _object, exprt _offset)` + constructor that passes both operands directly to `binary_exprt`, + avoiding all post-construction mutations. +- Simplified `to_expr` to use direct construction. +- The `type()` field was previously set to `object.type()` but is never + read by any consumer (verified by grep). The existing single-argument + constructor already left it as `typet()`. + +### Benchmark (A/B vs change 1 only, same goto binaries, 7 runs, median) + +| Benchmark | Change 1 | Change 1+2 | Speedup | +|-----------|----------|------------|---------| +| csmith_1111111111 | 7.633s | 7.250s | **+5.0%** | +| tree | 1.810s | 1.823s | -0.7% | +| All others | — | — | < ±0.6% | +| **TOTAL (13)** | **23.434s** | **23.102s** | **+1.4%** | + +The speedup is again concentrated on csmith_1111111111 (pointer-heavy). +The -0.7% on tree is within noise (IQR ~1%). + +DIMACS output verified identical for all 13 benchmarks. + +## Updated Profile: Post-Changes 1 and 2 (2026-03-12) + +Configuration: HEAD with both changes, tcmalloc via CMake. +Benchmarks: 9 (all except csmith_1111111111, which was excluded to avoid +biasing the aggregated profile — it is 3–7× heavier than other benchmarks +and would dominate sample counts). +Method: `perf record` at 997 Hz with 3 iterations per benchmark. 
+
+### Self-cost by category
+
+| Category | Self % | Key functions |
+|----------|--------|---------------|
+| irept access (find+get+add) | 16.2% | Linear scan of `forward_list_as_mapt` per property access |
+| irept COW (remove_ref+detach) | 13.5% | Triggered by mutations during rename, simplify, field_sensitivity |
+| simplifier | 5.8% | 30% inclusive; `simplify_node_preorder` 2.0%, `simplify_node` 1.3%, `simplify_rec` 1.1% |
+| string interning | 3.9% | `hash_string` 0.7%, hashtable lookups 2.6%, `get_dstring_number` 0.3% |
+| BigInt/bvrep | 3.8% | `BigInt::compare` 0.5%, `BigInt::~BigInt` 0.3%, `get_bvrep_bit` 0.9%, `integer2string` 0.6% |
+| sharing_mapt | 2.8% | SSA rename state lookups via `get_leaf_node` |
+| irept compare (==, hash) | 2.8% | Down from 5.7% pre-change-1 |
+| field_sensitivity | 2.6% | Recursive `apply` calls |
+| validation checks | 2.2% | `constant_exprt::check` 1.4%, `ssa_exprt::check` 1.1% |
+| SSA identifiers | 2.0% | `update_identifier`, `build_ssa_identifier_rec` |
+| merge_irept | 1.7% | Expression deduplication |
+| rename | 1.0% | `rename` |
+| tcmalloc | 13.2% | Irreducible allocator overhead |
+| libc | 2.6% | memcpy, memmove |
+| libstdc++ | 1.4% | |
+| other CBMC | 9.5% | Long tail of small functions |
+
+### Top 15 CBMC functions (self cost)
+
+| Rank | Function | Self % |
+|------|----------|--------|
+| 1 | irept::find | 7.5% |
+| 2 | sharing_treet::remove_ref | 7.3% |
+| 3 | sharing_treet::detach | 5.2% |
+| 4 | irept::get | 4.7% |
+| 5 | irept::add(id) | 2.9% |
+| 6 | field_sensitivityt::apply | 2.3% |
+| 7 | string_containert hashtable find | 2.1% |
+| 8 | simplify_node_preorder | 2.0% |
+| 9 | irept::operator== | 1.8% |
+| 10 | merge_irept::merged | 1.7% |
+| 11 | sharing_mapt::get_leaf_node | 1.6% |
+| 12 | constant_exprt::check | 1.4% |
+| 13 | simplify_node | 1.3% |
+| 14 | ssa_exprt::check | 1.1% |
+| 15 | simplify_rec | 1.1% |
+
+### Per-benchmark variation
+
+| Benchmark | COW | Access | Cmp | Simplify 
| String | BigInt | +|-----------|-----|--------|-----|----------|--------|--------| +| array_ops | 12% | 16% | 3% | 3% | 5% | 4% | +| heavy_array | 13% | 15% | 1% | 4% | **10%** | **6%** | +| tree | 5% | 6% | 4% | 1% | 2% | 0% | +| csmith_2718281828 | 14% | 14% | 6% | 6% | 5% | 4% | +| csmith_42 | 13% | 15% | 4% | 6% | 4% | 4% | + +heavy_array stands out with 10% string interning and 6% BigInt — this +is from `field_sensitivityt::get_fields` creating `dstringt` values for +array element field names, and constant index expressions using BigInt. +The tree benchmark has low COW/access (5–6%) because it spends more time +in CNF/DIMACS output (it produces the largest formula: 966K vars, 3.3M +clauses). + +### Inclusive cost (top-level call chain) + +``` +execute_next_instruction 89% +├── symex_function_call_symbol 37% (parameter_assignments, clean_expr) +├── symex_assign 33% (assign_non_struct_symbol → rename) +│ └── simplify_rec 30% (simplify_node_preorder → simplify_node) +├── clean_expr → dereference 32% +│ └── field_sensitivityt::apply 21% (recursive, 4-5 levels) +└── symex_assert ~5% +``` + +### Dominant call chains for top functions + +**irept::find (7.5%)**: Called from simplify (via type checks), rename +(via field_sensitivity::apply), and parameter_assignments. Spread across +many call sites — no single dominant caller. + +**remove_ref (7.3%)**: Triggered by irept destruction when temporary +expressions go out of scope. Recursive (deep trees cause deep +remove_ref chains). Spread across symex_assign, symex_function_call, +and simplify. + +**detach (5.2%)**: Triggered by irept mutations (add, set, operand +assignment). Largest single caller is rename → field_sensitivity::apply +chain, which mutates expressions during SSA renaming. + +### Assessment + +The profile is now genuinely flat. No single CBMC function exceeds 7.5% +self cost. 
The top two categories — irept access (16.2%) and irept COW +(13.5%) — are fundamental to how CBMC's data structures work and are +spread across many call sites rather than concentrated in one hot path. + +### Remaining optimization opportunities + +1. **irept access (16.2%)** — `forward_list_as_mapt` does a linear scan + for every `find`/`get`/`add` call. Replacing it with a flat hash map + or small sorted array would help, but is a major architectural change + affecting the entire codebase. + +2. **irept COW (13.5%)** — inherent to the sharing tree design. The + remaining mutations are mostly in the rename/simplify core loop where + they are necessary. Could be reduced by batching mutations or using + move semantics more aggressively. + +3. **String interning (3.9%, up to 10% on heavy_array)** — caching + `dstringt` values for predictable field names in + `field_sensitivityt::get_fields` could help on array-heavy benchmarks. + +4. **BigInt (3.8%, up to 6% on heavy_array)** — constant expression + construction overhead. `BigInt` uses heap allocation; a small-buffer + optimization or replacing with fixed-width integers where possible + could help. + +5. **Validation checks (2.2%)** — `constant_exprt::check` and + `ssa_exprt::check` run on every `to_constant_expr()` / + `to_ssa_expr()` cast. These could be compiled out in Release builds + via `CPROVER_INVARIANT_DO_NOT_CHECK`, but that flag also disables + other useful invariant checks. + +6. **tcmalloc (13.2%)** — irreducible allocator overhead. Already using + the fastest general-purpose allocator. Further gains would require + reducing allocation volume (fewer temporary expressions, arena + allocation for symex state). 
diff --git a/scripts/profiling/runner.py b/scripts/profiling/runner.py index 573e85b5f06..1cffc611bbb 100644 --- a/scripts/profiling/runner.py +++ b/scripts/profiling/runner.py @@ -57,6 +57,26 @@ def compile_to_goto_binary(cbmc, source_file, output_dir): return None +def compile_to_goto_binary(cbmc, source_file, output_dir): + """Compile a C/C++ source file to a goto binary using goto-cc. + + Returns the path to the goto binary, or None if goto-cc is not available + or compilation fails (in which case CBMC will parse the source directly). + """ + goto_cc = Path(cbmc).parent / "goto-cc" + if not goto_cc.is_file(): + return None + + gb_path = output_dir / (Path(source_file).stem + ".gb") + try: + subprocess.run( + [str(goto_cc), "-o", str(gb_path), source_file], + capture_output=True, check=True, timeout=60) + return gb_path + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + return None + + def run_benchmark(cbmc, bench, output_dir, timeout, memory_mb, skip_solver): """Run perf on a single benchmark. 
Returns parsed results dict.""" name = bench["name"] diff --git a/src/cbmc/CMakeLists.txt b/src/cbmc/CMakeLists.txt index 4266fedae42..c8598568aa9 100644 --- a/src/cbmc/CMakeLists.txt +++ b/src/cbmc/CMakeLists.txt @@ -45,6 +45,7 @@ target_link_libraries(cbmc else() target_link_libraries(cbmc cbmc-lib) endif() +target_link_libraries(cbmc cprover_allocator) install(TARGETS cbmc DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-analyzer/CMakeLists.txt b/src/goto-analyzer/CMakeLists.txt index 20cc1f44d49..6854c7fd21f 100644 --- a/src/goto-analyzer/CMakeLists.txt +++ b/src/goto-analyzer/CMakeLists.txt @@ -25,6 +25,7 @@ target_link_libraries(goto-analyzer-lib # Executable add_executable(goto-analyzer goto_analyzer_main.cpp) target_link_libraries(goto-analyzer goto-analyzer-lib) +target_link_libraries(goto-analyzer cprover_allocator) install(TARGETS goto-analyzer DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-cc/CMakeLists.txt b/src/goto-cc/CMakeLists.txt index cd259bbb960..f21920539c3 100644 --- a/src/goto-cc/CMakeLists.txt +++ b/src/goto-cc/CMakeLists.txt @@ -24,6 +24,7 @@ target_link_libraries(goto-cc-lib add_executable(goto-cc goto_cc_main.cpp) target_link_libraries(goto-cc goto-cc-lib) +target_link_libraries(goto-cc cprover_allocator) install(TARGETS goto-cc DESTINATION ${CMAKE_INSTALL_BINDIR}) if(WIN32) diff --git a/src/goto-diff/CMakeLists.txt b/src/goto-diff/CMakeLists.txt index 9c4607be813..61b38e143ce 100644 --- a/src/goto-diff/CMakeLists.txt +++ b/src/goto-diff/CMakeLists.txt @@ -26,6 +26,7 @@ target_link_libraries(goto-diff-lib # Executable add_executable(goto-diff goto_diff_main.cpp) target_link_libraries(goto-diff goto-diff-lib) +target_link_libraries(goto-diff cprover_allocator) install(TARGETS goto-diff DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-harness/CMakeLists.txt b/src/goto-harness/CMakeLists.txt index 25bd5f582f5..5bbbf7cd2a0 100644 --- a/src/goto-harness/CMakeLists.txt +++ 
b/src/goto-harness/CMakeLists.txt @@ -9,6 +9,7 @@ target_link_libraries(goto-harness json json-symtab-language ) +target_link_libraries(goto-harness cprover_allocator) install(TARGETS goto-harness DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-instrument/CMakeLists.txt b/src/goto-instrument/CMakeLists.txt index a1edca91f9a..afdaba684e6 100644 --- a/src/goto-instrument/CMakeLists.txt +++ b/src/goto-instrument/CMakeLists.txt @@ -29,6 +29,7 @@ add_if_library(goto-instrument-lib glpk) # Executable add_executable(goto-instrument goto_instrument_main.cpp) target_link_libraries(goto-instrument goto-instrument-lib) +target_link_libraries(goto-instrument cprover_allocator) install(TARGETS goto-instrument DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/goto-symex/field_sensitivity.cpp b/src/goto-symex/field_sensitivity.cpp index 0a5a56b6213..167b7ab6ecd 100644 --- a/src/goto-symex/field_sensitivity.cpp +++ b/src/goto-symex/field_sensitivity.cpp @@ -285,11 +285,15 @@ exprt field_sensitivityt::get_fields( const exprt &compound_op = ssa_expr.get_original_expr(); + // Hoist loop-invariant work: prepare template with L2 already removed. + ssa_exprt tmp_template = ssa_expr; + const bool was_l2 = !tmp_template.get_level_2().empty(); + if(was_l2) + tmp_template.remove_level_2(); + for(const auto &comp : components) { - ssa_exprt tmp = ssa_expr; - bool was_l2 = !tmp.get_level_2().empty(); - tmp.remove_level_2(); + ssa_exprt tmp = tmp_template; tmp.set_expression( member_exprt{compound_op, comp.get_name(), comp.type()}); exprt field = get_fields(ns, state, tmp, disjoined_fields_only); @@ -328,13 +332,78 @@ exprt field_sensitivityt::get_fields( const exprt &array = ssa_expr.get_original_expr(); + // Hoist loop-invariant work: prepare a template ssa_exprt once, + // then mutate only the index-dependent parts in each iteration. + // This avoids repeated ssa_exprt copies, remove_level_2 calls, + // and redundant irept operations. 
+ ssa_exprt tmp_template = ssa_expr; + const bool was_l2 = !tmp_template.get_level_2().empty(); + if(was_l2) + tmp_template.remove_level_2(); + + // Cache the identifier prefix/suffix from the first element. + std::string id_prefix, id_suffix, l1_prefix, l1_suffix; + bool have_cached_prefix = false; + + // Pre-compute the index type (same for all elements). + const typet &idx_type = type.index_type(); + for(std::size_t i = 0; i < array_size; ++i) { - const index_exprt index(array, from_integer(i, type.index_type())); - ssa_exprt tmp = ssa_expr; - bool was_l2 = !tmp.get_level_2().empty(); - tmp.remove_level_2(); - tmp.set_expression(index); + // Reuse a single constant_exprt, just changing the value. + const constant_exprt idx_const = from_integer(i, idx_type); + const index_exprt index(array, idx_const); + + // Copy the template (which already has L2 removed). + ssa_exprt tmp = tmp_template; + + if(!have_cached_prefix) + { + // First element: use canonical set_expression to establish format. + tmp.set_expression(index); + + const std::string &idx_val = id2string(idx_const.get_value()); + const std::string marker = "[[" + idx_val + "]]"; + + const std::string &full_id = id2string(tmp.get_identifier()); + const std::string &full_l1 = id2string(tmp.get_l1_object_identifier()); + auto pos = full_id.rfind(marker); + if(pos != std::string::npos) + { + id_prefix = full_id.substr(0, pos); + id_suffix = full_id.substr(pos + marker.size()); + auto l1_pos = full_l1.rfind(marker); + if(l1_pos != std::string::npos) + { + l1_prefix = full_l1.substr(0, l1_pos); + l1_suffix = full_l1.substr(l1_pos + marker.size()); + } + have_cached_prefix = true; + } + } + else + { + // Subsequent elements: set expression and derive identifier directly. 
+ tmp.type() = as_const(index).type(); + static_cast(tmp).add(ID_expression, index); + + const std::string &idx_val = id2string(idx_const.get_value()); + + std::string new_id = id_prefix; + new_id += "[["; + new_id += idx_val; + new_id += "]]"; + new_id += id_suffix; + tmp.set_identifier(new_id); + + std::string new_l1 = l1_prefix; + new_l1 += "[["; + new_l1 += idx_val; + new_l1 += "]]"; + new_l1 += l1_suffix; + tmp.set(ID_L1_object_identifier, new_l1); + } + exprt element = get_fields(ns, state, tmp, disjoined_fields_only); if(was_l2) { diff --git a/src/goto-symex/simplify_expr_with_value_set.cpp b/src/goto-symex/simplify_expr_with_value_set.cpp index 24a519f6364..a804780805f 100644 --- a/src/goto-symex/simplify_expr_with_value_set.cpp +++ b/src/goto-symex/simplify_expr_with_value_set.cpp @@ -19,6 +19,8 @@ Author: Michael Tautschnig #include "goto_symex_can_forward_propagate.h" +#include + /// Try to evaluate a simple pointer comparison. /// \param operation: ID_equal or ID_not_equal /// \param symbol_expr: The symbol expression in the condition @@ -164,7 +166,7 @@ simplify_expr_with_value_sett::simplify_inequality_pointer_object( auto collect_objects = [this](const exprt &pointer) { - std::set objects; + std::unordered_set objects; if(auto address_of = expr_try_dynamic_cast(pointer)) { objects.insert( @@ -209,14 +211,16 @@ simplify_expr_with_value_sett::simplify_inequality_pointer_object( : changed(static_cast(false_exprt{})); } - std::list intersection; - std::set_intersection( - lhs_objects.begin(), - lhs_objects.end(), - rhs_objects.begin(), - rhs_objects.end(), - std::back_inserter(intersection)); - if(!lhs_objects.empty() && !rhs_objects.empty() && intersection.empty()) + bool has_intersection = false; + for(const auto &obj : lhs_objects) + { + if(rhs_objects.count(obj) != 0) + { + has_intersection = true; + break; + } + } + if(!lhs_objects.empty() && !rhs_objects.empty() && !has_intersection) { // all pointed-to objects on the left-hand side are different 
from any of // the pointed-to objects on the right-hand side diff --git a/src/pointer-analysis/value_set.cpp b/src/pointer-analysis/value_set.cpp index 3641c00fd60..d40a83934cd 100644 --- a/src/pointer-analysis/value_set.cpp +++ b/src/pointer-analysis/value_set.cpp @@ -255,16 +255,10 @@ exprt value_sett::to_expr(const object_map_dt::value_type &it) const object.id()==ID_unknown) return object; - object_descriptor_exprt od; - - od.object()=object; - if(it.second) - od.offset() = *it.second; - - od.type()=od.object().type(); - - return std::move(od); + return object_descriptor_exprt{object, *it.second}; + else + return object_descriptor_exprt{object}; } bool value_sett::make_union(const value_sett::valuest &new_values) diff --git a/src/symtab2gb/CMakeLists.txt b/src/symtab2gb/CMakeLists.txt index 8a4b1f5ea24..8dc375d8031 100644 --- a/src/symtab2gb/CMakeLists.txt +++ b/src/symtab2gb/CMakeLists.txt @@ -11,6 +11,7 @@ target_link_libraries(symtab2gb goto-programs json-symtab-language) +target_link_libraries(symtab2gb cprover_allocator) install(TARGETS symtab2gb DESTINATION ${CMAKE_INSTALL_BINDIR}) # Man page diff --git a/src/util/irep.cpp b/src/util/irep.cpp index 36e103ea8ff..236ae5604a5 100644 --- a/src/util/irep.cpp +++ b/src/util/irep.cpp @@ -43,12 +43,14 @@ void irept::move_to_sub(irept &irep) const irep_idt &irept::get(const irep_idt &name) const { + static const irep_idt empty; + if(is_leaf_only()) + return empty; const named_subt &s = get_named_sub(); named_subt::const_iterator it=s.find(name); if(it==s.end()) { - static const irep_idt empty; return empty; } return it->second.id(); @@ -92,6 +94,8 @@ void irept::remove(const irep_idt &name) const irept &irept::find(const irep_idt &name) const { + if(is_leaf_only()) + return get_nil_irep(); const named_subt &s = get_named_sub(); auto it = s.find(name); @@ -102,12 +106,16 @@ const irept &irept::find(const irep_idt &name) const irept &irept::add(const irep_idt &name) { + if(is_leaf_only()) + promote_to_non_leaf(); 
named_subt &s = get_named_sub(); return s[name]; } irept &irept::add(const irep_idt &name, irept irep) { + if(is_leaf_only()) + promote_to_non_leaf(); named_subt &s = get_named_sub(); #if NAMED_SUB_IS_FORWARD_LIST @@ -415,7 +423,7 @@ std::size_t irept::number_of_non_comments(const named_subt &named_sub) std::size_t irept::hash() const { #if HASH_CODE - if(read().hash_code!=0) + if(!is_leaf_only() && read().hash_code!=0) return read().hash_code; #endif @@ -442,7 +450,8 @@ std::size_t irept::hash() const result = hash_finalize(result, sub.size() + number_of_named_ireps); #if HASH_CODE - read().hash_code=result; + if(!is_leaf_only()) + read().hash_code=result; #endif #ifdef IREP_HASH_STATS ++irep_hash_cnt; diff --git a/src/util/irep.h b/src/util/irep.h index 28f87062c48..7d75a3997fd 100644 --- a/src/util/irep.h +++ b/src/util/irep.h @@ -12,6 +12,7 @@ Author: Daniel Kroening, kroening@kroening.com #include #include +#include #include "invariant.h" #include "irep_ids.h" @@ -53,20 +54,6 @@ id2string(const irep_idt &d) class irept; const irept &get_nil_irep(); -/// Used in tree_nodet for activating or not reference counting. -/// tree_nodet uses inheritance from ref_count_ift instead of a field, so that -/// it gets deleted if empty ([[no_unique_address]] only appears in C++20). -template -struct ref_count_ift -{ -}; - -template <> -struct ref_count_ift -{ - unsigned ref_count = 1; -}; - /// A node with data in a tree, it contains: /// /// * \ref irept::dt::data : A \ref dstringt and thus an integer which is a @@ -85,8 +72,8 @@ struct ref_count_ift /// /// * \c hash_code : if HASH_CODE is activated, this is used to cache the /// result of the hash function. -template -class tree_nodet : public ref_count_ift +template +class tree_nodet final { public: // These are not stable. 
@@ -99,6 +86,8 @@ class tree_nodet : public ref_count_ift friend treet; + unsigned ref_count = 1; + /// This irep_idt is the only place to store data in an tree node irep_idt data; @@ -141,6 +130,30 @@ class tree_nodet : public ref_count_ift sub(std::move(_sub)) { } + + // Pool allocator for tree_nodet: maintains a free list of previously + // deallocated nodes to avoid malloc/free overhead. CBMC allocates and + // frees millions of tree_nodet objects (14M+ on typical workloads); + // reusing memory from a free list is significantly faster. + static void *operator new(std::size_t size) + { + if(free_list_head) + { + void *p = free_list_head; + free_list_head = *static_cast(p); + return p; + } + return ::operator new(size); + } + + static void operator delete(void *p, std::size_t /*size*/) + { + *static_cast(p) = free_list_head; + free_list_head = p; + } + +private: + static inline thread_local void *free_list_head = nullptr; }; /// Base class for tree-like data structures with sharing @@ -148,31 +161,69 @@ template class sharing_treet { public: - using dt = tree_nodet; + using dt = tree_nodet; using subt = typename dt::subt; using named_subt = typename dt::named_subt; /// Used to refer to this class from derived classes using tree_implementationt = sharing_treet; - explicit sharing_treet(irep_idt _id) : data(new dt(std::move(_id))) +protected: + struct flagged_idt + { + bool is_leaf : 1; + unsigned padding : 31; + irep_idt id; + + explicit flagged_idt(irep_idt id = {}) : + is_leaf{true}, + padding{0}, + id{std::move(id)} + { + static_assert(sizeof(flagged_idt) == sizeof(dt *), + "Sizes should match for optimisation to apply."); + } + }; + + union pointer_or_idt + { + flagged_idt flagged_id; + dt * pointer; + }; + + union const_pointer_or_idt + { + dt * const pointer; + flagged_idt flagged_id; + }; + + static dt *as_pointer(irep_idt id) + { + pointer_or_idt pointer_or_id{ flagged_idt{id} }; + return pointer_or_id.pointer; + } + +public: + explicit 
  /// Construct a tree that consists of an id only. No node is allocated:
  /// the id is stored directly in the (tagged) data field.
  explicit sharing_treet(irep_idt _id) : data(as_pointer(_id))
  {
  }

  /// Construct from an id plus sub-trees. When both sub-tree collections
  /// are empty, the cheap leaf-only (tagged-id) representation is used;
  /// otherwise a full node is heap-allocated.
  sharing_treet(irep_idt _id, named_subt _named_sub, subt _sub)
    : data(
        _named_sub.empty() && _sub.empty() ? as_pointer(_id)
        : new dt(std::move(_id), std::move(_named_sub), std::move(_sub)))
  {
  }

  // constructor for blank irep
  // empty_d is the shared tagged representation of the empty id, so the
  // blank tree also allocates nothing.
  sharing_treet() : data(empty_d)
  {
  }

  // copy constructor
  // Leaf-only trees are copied by value (the tagged id is the whole
  // state); real nodes are shared by bumping the reference count.
  sharing_treet(const sharing_treet &irep) : data(irep.data)
  {
    if(!is_leaf_only())
    {
      PRECONDITION(data->ref_count != 0);
      data->ref_count++;
    }
  }
+ bool is_leaf_only() const + { + const_pointer_or_idt pointer_or_id{ data }; + return pointer_or_id.flagged_id.is_leaf; + } + + void promote_to_non_leaf() + { + const_pointer_or_idt pointer_or_id{ data }; + data = new dt(pointer_or_id.flagged_id.id); + } static void remove_ref(dt *old_data); static void nonrecursive_destructor(dt *old_data); @@ -250,52 +325,37 @@ class sharing_treet #endif return *data; } -}; - -// Static field initialization -template -typename sharing_treet::dt - sharing_treet::empty_d; -/// Base class for tree-like data structures without sharing -template -class non_sharing_treet -{ -public: - using dt = tree_nodet; - using subt = typename dt::subt; - using named_subt = typename dt::named_subt; - - /// Used to refer to this class from derived classes - using tree_implementationt = non_sharing_treet; - - explicit non_sharing_treet(irep_idt _id) : data(std::move(_id)) + const irep_idt &id() const { + const auto pointer_or_id = + reinterpret_cast(&data); + return is_leaf_only() ? pointer_or_id->flagged_id.id : read().data; } - non_sharing_treet(irep_idt _id, named_subt _named_sub, subt _sub) - : data(std::move(_id), std::move(_named_sub), std::move(_sub)) + void id(const irep_idt &_data) { + if(is_leaf_only()) + data = as_pointer(_data); + else + write().data=_data; } - non_sharing_treet() = default; +}; - const dt &read() const - { - return data; - } +// Static field initialization +template +typename sharing_treet::dt * const + sharing_treet::empty_d = + sharing_treet::as_pointer(irep_idt{}); - dt &write() - { -#if HASH_CODE - data.hash_code = 0; -#endif - return data; - } +template + const typename sharing_treet::subt + sharing_treet::empty_sub = {}; -protected: - dt data; -}; +template + const typename sharing_treet::named_subt + sharing_treet::empty_named_sub = {}; /// There are a large number of kinds of tree structured or tree-like data in /// CPROVER. 
\ref irept provides a single, unified representation for all of @@ -385,14 +445,8 @@ class irept irept() = default; - const irep_idt &id() const - { return read().data; } - const std::string &id_string() const - { return id2string(read().data); } - - void id(const irep_idt &_data) - { write().data=_data; } + { return id2string(id()); } const irept &find(const irep_idt &name) const; irept &add(const irep_idt &name); @@ -445,10 +499,29 @@ class irept void make_nil() { *this=get_nil_irep(); } - subt &get_sub() { return write().sub; } // DANGEROUS - const subt &get_sub() const { return read().sub; } - named_subt &get_named_sub() { return write().named_sub; } // DANGEROUS - const named_subt &get_named_sub() const { return read().named_sub; } + subt &get_sub() { + if(is_leaf_only()) + promote_to_non_leaf(); + return write().sub; + } // DANGEROUS + + const subt &get_sub() const { + if(is_leaf_only()) + return empty_sub; + return read().sub; + } + + named_subt &get_named_sub() { + if(is_leaf_only()) + promote_to_non_leaf(); + return write().named_sub; + } // DANGEROUS + + const named_subt &get_named_sub() const { + if(is_leaf_only()) + return empty_named_sub; + return read().named_sub; + } std::size_t hash() const; std::size_t full_hash() const; @@ -509,19 +582,14 @@ struct diagnostics_helpert template void sharing_treet::detach() { + if(is_leaf_only()) + return; + #ifdef IREP_DEBUG std::cout << "DETACH1: " << data << '\n'; #endif - if(data == &empty_d) - { - data = new dt; - -#ifdef IREP_DEBUG - std::cout << "ALLOCATED " << data << '\n'; -#endif - } - else if(data->ref_count > 1) + if(data->ref_count > 1) { dt *old_data(data); data = new dt(*old_data); @@ -544,7 +612,8 @@ void sharing_treet::detach() template void sharing_treet::remove_ref(dt *old_data) { - if(old_data == &empty_d) + const const_pointer_or_idt pointer_or_id{ old_data }; + if(pointer_or_id.flagged_id.is_leaf) return; #if 0 diff --git a/src/util/pointer_expr.h b/src/util/pointer_expr.h index 
d3130b3110c..74323f6c194 100644 --- a/src/util/pointer_expr.h +++ b/src/util/pointer_expr.h @@ -198,6 +198,15 @@ class object_descriptor_exprt : public binary_exprt { } + object_descriptor_exprt(exprt _object, exprt _offset) + : binary_exprt( + std::move(_object), + ID_object_descriptor, + std::move(_offset), + typet()) + { + } + /// Given an expression \p expr, attempt to find the underlying object it /// represents by skipping over type casts and removing balanced /// dereference/address_of operations; that object will then be available diff --git a/src/util/rename.cpp b/src/util/rename.cpp index 2b622b96874..ad559cc818a 100644 --- a/src/util/rename.cpp +++ b/src/util/rename.cpp @@ -8,10 +8,15 @@ Author: Daniel Kroening, kroening@kroening.com #include "rename.h" -#include - #include "namespace.h" +#include +#include + +/// Produce a fresh name by appending delimiter + number to \p name. +/// Uses a per-prefix counter to avoid O(n) probing of the symbol table +/// on each call. Falls back to symbol table probing only on the first +/// call for each prefix. irep_idt get_new_name(const irep_idt &name, const namespacet &ns, char delimiter) { @@ -21,5 +26,21 @@ get_new_name(const irep_idt &name, const namespacet &ns, char delimiter) std::string prefix = id2string(name) + delimiter; - return prefix + std::to_string(ns.smallest_unused_suffix(prefix)); + // Cache the next suffix to try for each prefix. This turns the O(n) + // linear probe in smallest_unused_suffix into amortized O(1). + // thread_local for safety in case of future multi-threading. 
+ static thread_local std::unordered_map suffix_cache; + auto it = suffix_cache.find(prefix); + std::size_t suffix; + if(it != suffix_cache.end()) + suffix = it->second; + else + suffix = ns.smallest_unused_suffix(prefix); + + // Verify the suffix is actually unused + while(!ns.lookup(prefix + std::to_string(suffix), symbol)) + ++suffix; + + suffix_cache[prefix] = suffix + 1; + return prefix + std::to_string(suffix); } diff --git a/src/util/ssa_expr.cpp b/src/util/ssa_expr.cpp index e92c79967e6..4b02b8457bb 100644 --- a/src/util/ssa_expr.cpp +++ b/src/util/ssa_expr.cpp @@ -8,8 +8,6 @@ Author: Daniel Kroening, kroening@kroening.com #include "ssa_expr.h" -#include - #include "pointer_expr.h" /// If \p expr is: @@ -18,21 +16,28 @@ Author: Daniel Kroening, kroening@kroening.com /// - an index_exprt where the index is a constant, apply recursively on the /// array and add "[[index]]" /// \return the stream \p os -static std::ostream & -initialize_ssa_identifier(std::ostream &os, const exprt &expr) +static void initialize_ssa_identifier(std::string &id, const exprt &expr) { if(auto member = expr_try_dynamic_cast(expr)) { - return initialize_ssa_identifier(os, member->struct_op()) - << ".." 
<< member->get_component_name(); + initialize_ssa_identifier(id, member->struct_op()); + id += ".."; + id += id2string(member->get_component_name()); + return; } if(auto index = expr_try_dynamic_cast(expr)) { - const irep_idt &idx = to_constant_expr(index->index()).get_value(); - return initialize_ssa_identifier(os, index->array()) << "[[" << idx << "]]"; + initialize_ssa_identifier(id, index->array()); + id += "[["; + id += id2string(to_constant_expr(index->index()).get_value()); + id += "]]"; + return; } if(auto symbol = expr_try_dynamic_cast(expr)) - return os << symbol->get_identifier(); + { + id += id2string(symbol->get_identifier()); + return; + } UNREACHABLE; } @@ -42,9 +47,9 @@ ssa_exprt::ssa_exprt(const exprt &expr) : symbol_exprt(expr.type()) set(ID_C_SSA_symbol, true); add(ID_expression, expr); with_source_location(expr.source_location()); - std::ostringstream os; - initialize_ssa_identifier(os, expr); - const std::string id = os.str(); + std::string id; + id.reserve(64); + initialize_ssa_identifier(id, expr); set_identifier(id); set(ID_L1_object_identifier, id); } @@ -53,57 +58,71 @@ ssa_exprt::ssa_exprt(const exprt &expr) : symbol_exprt(expr.type()) /// "s!l0@l1". /// If \p expr is a member or index expression, recursively apply the procedure /// and add "..component_name" or "[[index]]" to \p os. +/// Build the base part of an SSA identifier (without level suffixes) +/// by appending to \p id and \p l1_object_id. static void build_ssa_identifier_rec( const exprt &expr, const irep_idt &l0, const irep_idt &l1, const irep_idt &l2, - std::ostream &os, - std::ostream &l1_object_os) + std::string &id, + std::string &l1_object_id) { - if(expr.id()==ID_member) + if(expr.id() == ID_member) { - const member_exprt &member=to_member_expr(expr); + const member_exprt &member = to_member_expr(expr); - build_ssa_identifier_rec(member.struct_op(), l0, l1, l2, os, l1_object_os); + build_ssa_identifier_rec(member.struct_op(), l0, l1, l2, id, l1_object_id); - os << ".." 
<< member.get_component_name(); - l1_object_os << ".." << member.get_component_name(); + const std::string &component = id2string(member.get_component_name()); + id += ".."; + id += component; + l1_object_id += ".."; + l1_object_id += component; } - else if(expr.id()==ID_index) + else if(expr.id() == ID_index) { - const index_exprt &index=to_index_expr(expr); - - build_ssa_identifier_rec(index.array(), l0, l1, l2, os, l1_object_os); - - const irep_idt &idx = to_constant_expr(index.index()).get_value(); - os << "[[" << idx << "]]"; - l1_object_os << "[[" << idx << "]]"; + const index_exprt &index = to_index_expr(expr); + + build_ssa_identifier_rec(index.array(), l0, l1, l2, id, l1_object_id); + + const std::string &idx = + id2string(to_constant_expr(index.index()).get_value()); + id += "[["; + id += idx; + id += "]]"; + l1_object_id += "[["; + l1_object_id += idx; + l1_object_id += "]]"; } - else if(expr.id()==ID_symbol) + else if(expr.id() == ID_symbol) { - auto symid=to_symbol_expr(expr).get_identifier(); - os << symid; - l1_object_os << symid; + const std::string &symid = id2string(to_symbol_expr(expr).get_identifier()); + id += symid; + l1_object_id += symid; if(!l0.empty()) { - // Distinguish different threads of execution - os << '!' << l0; - l1_object_os << '!' 
<< l0; + const std::string &l0s = id2string(l0); + id += '!'; + id += l0s; + l1_object_id += '!'; + l1_object_id += l0s; } if(!l1.empty()) { - // Distinguish different calls to the same function (~stack frame) - os << '@' << l1; - l1_object_os << '@' << l1; + const std::string &l1s = id2string(l1); + id += '@'; + id += l1s; + l1_object_id += '@'; + l1_object_id += l1s; } if(!l2.empty()) { - // Distinguish SSA steps for the same variable - os << '#' << l2; + id += '#'; + id += id2string(l2); } } else @@ -116,12 +135,16 @@ static std::pair build_identifier( const irep_idt &l1, const irep_idt &l2) { - std::ostringstream oss; - std::ostringstream l1_object_oss; + std::string id; + std::string l1_object_id; + // Typical SSA identifiers are 20-60 chars; pre-allocate to avoid + // repeated reallocation during string building. + id.reserve(64); + l1_object_id.reserve(64); - build_ssa_identifier_rec(expr, l0, l1, l2, oss, l1_object_oss); + build_ssa_identifier_rec(expr, l0, l1, l2, id, l1_object_id); - return std::make_pair(irep_idt(oss.str()), irep_idt(l1_object_oss.str())); + return std::make_pair(irep_idt(id), irep_idt(l1_object_id)); } static void update_identifier(ssa_exprt &ssa) @@ -181,19 +204,42 @@ const irep_idt ssa_exprt::get_l1_object_identifier() const void ssa_exprt::set_level_0(std::size_t i) { set(ID_L0, i); - ::update_identifier(*this); + // Optimized: L0 is only set when it was previously empty (the caller + // guards against re-setting). The current identifier is "base" and + // we need "base!l0". Append directly instead of rebuilding. + const std::string &cur_id = id2string(get_identifier()); + std::string suffix = "!" + std::to_string(i); + std::string new_id = cur_id + suffix; + set_identifier(new_id); + set(ID_L1_object_identifier, new_id); } void ssa_exprt::set_level_1(std::size_t i) { set(ID_L1, i); - ::update_identifier(*this); + // Optimized: L1 is only set when it was previously empty (the caller + // guards against re-setting). 
The current identifier is "base!l0" and + // we need "base!l0@l1". Append directly instead of rebuilding. + const std::string &cur_id = id2string(get_identifier()); + std::string suffix = "@" + std::to_string(i); + std::string new_id = cur_id + suffix; + set_identifier(new_id); + set(ID_L1_object_identifier, new_id); } void ssa_exprt::set_level_2(std::size_t i) { set(ID_L2, i); - ::update_identifier(*this); + // Optimized: the L1 object identifier doesn't change when only L2 changes, + // and the main identifier just needs the #N suffix updated. Derive from + // the cached L1 object identifier instead of rebuilding from scratch. + const std::string &l1_id = id2string(get(ID_L1_object_identifier)); + std::string new_id; + new_id.reserve(l1_id.size() + 8); + new_id = l1_id; + new_id += '#'; + new_id += std::to_string(i); + set_identifier(new_id); } void ssa_exprt::remove_level_2()