From 931b1cc725fb51045d4edee8a0b795d8b6b00ce9 Mon Sep 17 00:00:00 2001 From: sh Date: Wed, 18 Mar 2026 08:48:21 +0000 Subject: [PATCH 1/8] smp-server: memory usage analysis --- docs/plans/memory-analysis.md | 118 ++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 docs/plans/memory-analysis.md diff --git a/docs/plans/memory-analysis.md b/docs/plans/memory-analysis.md new file mode 100644 index 000000000..574a10197 --- /dev/null +++ b/docs/plans/memory-analysis.md @@ -0,0 +1,118 @@ +## Root Cause Analysis: SMP Server Memory Growth (23.5GB) + +### Log Summary + +- **Duration**: ~22 hours (Mar 16 12:12 → Mar 17 10:20) +- **92,277 proxy connection errors** out of 92,656 total log lines (99.6%) +- **292 unique failing destination servers**, top offender: `nowhere.moe` (12,875 errors) +- Only **145 successful proxy connections** + +--- + +### Root Cause #1 (PRIMARY): PostgreSQL Queue Cache Never Evicts + +**Files**: `src/Simplex/Messaging/Server/QueueStore/Postgres.hs` + +The Postgres queue store has `useCache = True` hard-coded (via `Journal.hs:429`). Every queue accessed or created gets inserted into `queues :: TMap RecipientId q` and **is never removed**, even after deletion. + +**Evidence** — `deleteStoreQueue` at Postgres.hs:448-465: +```haskell +deleteStoreQueue st sq = ... do + atomically $ writeTVar qr Nothing -- QueueRec set to Nothing... + when (useCache st) $ do + atomically $ TM.delete (senderId q) $ senders st -- ✅ cleaned + forM_ (notifier q) $ \NtfCreds {notifierId} -> do + atomically $ TM.delete notifierId $ notifiers st -- ✅ cleaned + atomically $ TM.delete notifierId $ notifierLocks st -- ✅ cleaned + -- ❌ NO TM.delete rId $ queues st — zombie entry stays forever! +``` + +Similarly, `links :: TMap LinkId RecipientId` is never cleaned on queue deletion. + +**Impact**: Every queue created and then deleted leaves a zombie in the `queues` map (~200 bytes minimum per entry). 
On a busy server like smp19 running for days/weeks: +- Millions of queues created/deleted → millions of zombie cache entries +- Active queues also stay cached forever once loaded from Postgres +- **This is the primary unbounded growth mechanism** + +The `loadedQueueCount` Prometheus metric would confirm this — it shows the size of this cache (Prometheus.hs:500). + +--- + +### Root Cause #2 (AMPLIFIER): GHC Heap Sizing with `-A16m -N` + +**RTS flags**: `+RTS -N -A16m -I0.01 -Iw15 -s -RTS` + +With 16 cores: +- **Nursery**: 16 × 16MB = **256MB baseline** +- GHC's default major GC threshold = **2× live data** — if live data is 10GB, GHC allows the heap to grow to **~20GB before triggering major GC** +- VIRT = 1031GB is normal for GHC (address space reservation, not actual memory) + +The `-I0.01` setting is aggressive for idle GC (10ms), but with 22K clients the server is rarely idle, so major GC is deferred. + +--- + +### Root Cause #3 (CONTRIBUTOR): No Cache Eviction by Design + +Looking at `getQueue_` in Postgres.hs:193-244: +- When `useCache = True`, queues are loaded from Postgres on first access +- They are cached in the `queues` TMap **forever** +- There is no LRU eviction, no TTL, no max cache size +- The comment at line 233 acknowledges interaction with `withAllMsgQueues` but no eviction mechanism exists + +--- + +### What is NOT the Main Cause + +1. **Proxy connection state (`smpClients`)**: Properly managed — one entry per destination server, cleaned after 30-second `persistErrorInterval`. Max 292 entries. Code at `Client/Agent.hs:196-242` and `Session.hs:24-39` is correct. + +2. **SubscribedClients**: Despite the misleading comment at `Env/STM.hs:376` saying "subscriptions are never removed," they ARE cleaned up on both client disconnect (`Server.hs:1112`) and queue deletion (`Server.hs:308` via `lookupDeleteSubscribedClient`). + +3. **Client structures**: 22K × ~3KB = ~66MB — negligible. + +4. 
**TBQueues**: Bounded (`TBQueue` with `tbqSize = 128`), properly sized. + +5. **Thread management**: `forkClient` uses weak references, `finally` blocks ensure cleanup. Well-managed. + +--- + +### Memory Budget Estimate + +| Component | Estimated Size | +|-----------|---------------| +| Postgres queue cache (growing) | **5-15+ GB** | +| TLS connection state (22K × ~30KB) | ~660 MB | +| GHC nursery (16 × 16MB) | 256 MB | +| Client structures | ~66 MB | +| Proxy agent state | ~50 KB | +| **GHC old gen headroom (2× live)** | **doubles effective usage** | + +--- + +### Recommended Fixes (Priority Order) + +**1. Remove zombie queue entries from cache on deletion** (immediate fix) + +In `deleteStoreQueue` in `Postgres.hs`, add `TM.delete rId $ queues st` and `forM_ (queueData q) $ \(lnkId, _) -> TM.delete lnkId $ links st`: + +```haskell +-- After the existing cleanup: +when (useCache st) $ do + atomically $ TM.delete rId $ queues st -- ADD THIS + atomically $ TM.delete (senderId q) $ senders st + forM_ (queueData q) $ \(lnkId, _) -> + atomically $ TM.delete lnkId $ links st -- ADD THIS + forM_ (notifier q) $ \NtfCreds {notifierId} -> do + atomically $ TM.delete notifierId $ notifiers st + atomically $ TM.delete notifierId $ notifierLocks st +``` + +**2. Add configuration option to disable cache** — allow `useCache = False` via INI config for Postgres deployments where the DB is fast enough. + +**3. Add cache eviction** — periodically evict queues not accessed recently (LRU or TTL-based). + +**4. Tune GHC RTS**: +- Add `-F1.5` (or lower) to reduce the major GC threshold multiplier from default 2.0 +- Consider `-M` to set a hard heap cap +- Consider `-N8` instead of `-N` to halve the nursery size and reduce per-capability overhead + +**5. Add observability** — log `loadedQueueCount` periodically to track cache growth. This metric exists in Prometheus but should also appear in regular stats logs. 
From 404dd10d4a14a15ab47aaffabcab263eb8641bac Mon Sep 17 00:00:00 2001 From: sh Date: Wed, 18 Mar 2026 11:23:35 +0000 Subject: [PATCH 2/8] further analysis --- docs/plans/memory-analysis.md | 247 ++++++++++++++++++++++++---------- 1 file changed, 177 insertions(+), 70 deletions(-) diff --git a/docs/plans/memory-analysis.md b/docs/plans/memory-analysis.md index 574a10197..9c45005fb 100644 --- a/docs/plans/memory-analysis.md +++ b/docs/plans/memory-analysis.md @@ -1,5 +1,12 @@ ## Root Cause Analysis: SMP Server Memory Growth (23.5GB) +### Environment + +- **Server**: smp19.simplex.im, ~21,927 connected clients +- **Storage**: PostgreSQL backend with `useCache = False` +- **RTS flags**: `+RTS -N -A16m -I0.01 -Iw15 -s -RTS` (16 cores) +- **Memory**: 23.5GB RES / 1031GB VIRT (75% of available RAM) + ### Log Summary - **Duration**: ~22 hours (Mar 16 12:12 → Mar 17 10:20) @@ -9,110 +16,210 @@ --- -### Root Cause #1 (PRIMARY): PostgreSQL Queue Cache Never Evicts +### Known Factor: GHC Heap Sizing -**Files**: `src/Simplex/Messaging/Server/QueueStore/Postgres.hs` +With 16 cores and `-A16m`: +- **Nursery**: 16 × 16MB = **256MB baseline** +- GHC default major GC threshold = **2× live data** — if live data is 10GB, heap grows to ~20GB before major GC +- The server is rarely idle with 22K clients, so major GC is deferred despite `-I0.01` +- This is an amplifier — whatever the actual live data size is, GHC roughly doubles it -The Postgres queue store has `useCache = True` hard-coded (via `Journal.hs:429`). Every queue accessed or created gets inserted into `queues :: TMap RecipientId q` and **is never removed**, even after deletion. +--- -**Evidence** — `deleteStoreQueue` at Postgres.hs:448-465: -```haskell -deleteStoreQueue st sq = ... do - atomically $ writeTVar qr Nothing -- QueueRec set to Nothing... 
- when (useCache st) $ do - atomically $ TM.delete (senderId q) $ senders st -- ✅ cleaned - forM_ (notifier q) $ \NtfCreds {notifierId} -> do - atomically $ TM.delete notifierId $ notifiers st -- ✅ cleaned - atomically $ TM.delete notifierId $ notifierLocks st -- ✅ cleaned - -- ❌ NO TM.delete rId $ queues st — zombie entry stays forever! -``` +### Candidate Structures That Could Grow Unboundedly -Similarly, `links :: TMap LinkId RecipientId` is never cleaned on queue deletion. +Analysis of the full codebase identified these structures that either grow without bound or have uncertain cleanup: -**Impact**: Every queue created and then deleted leaves a zombie in the `queues` map (~200 bytes minimum per entry). On a busy server like smp19 running for days/weeks: -- Millions of queues created/deleted → millions of zombie cache entries -- Active queues also stay cached forever once loaded from Postgres -- **This is the primary unbounded growth mechanism** +#### 1. `SubscribedClients` maps — `Env/STM.hs:378` -The `loadedQueueCount` Prometheus metric would confirm this — it shows the size of this cache (Prometheus.hs:500). +Both `subscribers.queueSubscribers` and `ntfSubscribers.queueSubscribers` (and their `serviceSubscribers`) use `SubscribedClients (TMap EntityId (TVar (Maybe (Client s))))`. ---- +Comment at line 376: *"The subscriptions that were made at any point are not removed"* -### Root Cause #2 (AMPLIFIER): GHC Heap Sizing with `-A16m -N` +`deleteSubcribedClient` IS called on disconnect (Server.hs:1112) and DOES call `TM.delete`. But it only deletes if the current stored client matches — if another client already re-subscribed, the old client's disconnect won't remove the entry. This is by design for mobile client continuity, but the net effect on map size over time is unclear without measurement. -**RTS flags**: `+RTS -N -A16m -I0.01 -Iw15 -s -RTS` +#### 2. 
ProxyAgent's subscription TMaps — `Client/Agent.hs:145-151` -With 16 cores: -- **Nursery**: 16 × 16MB = **256MB baseline** -- GHC's default major GC threshold = **2× live data** — if live data is 10GB, GHC allows the heap to grow to **~20GB before triggering major GC** -- VIRT = 1031GB is normal for GHC (address space reservation, not actual memory) +The `SMPClientAgent` has 4 TMaps that accumulate one top-level entry per unique destination server and **never remove** them: -The `-I0.01` setting is aggressive for idle GC (10ms), but with 22K clients the server is rarely idle, so major GC is deferred. +- `activeServiceSubs :: TMap SMPServer (TVar ...)` (line 145) +- `activeQueueSubs :: TMap SMPServer (TMap QueueId ...)` (line 146) +- `pendingServiceSubs :: TMap SMPServer (TVar ...)` (line 149) +- `pendingQueueSubs :: TMap SMPServer (TMap QueueId ...)` (line 150) ---- +Comment at line 262: *"these vars are never removed, they are only added"* -### Root Cause #3 (CONTRIBUTOR): No Cache Eviction by Design +These are only used for the proxy agent (SParty 'Sender), so they grow with each unique destination SMP server proxied to. With 292 unique servers in this log period, these are likely small — but long-running servers may accumulate thousands. -Looking at `getQueue_` in Postgres.hs:193-244: -- When `useCache = True`, queues are loaded from Postgres on first access -- They are cached in the `queues` TMap **forever** -- There is no LRU eviction, no TTL, no max cache size -- The comment at line 233 acknowledges interaction with `withAllMsgQueues` but no eviction mechanism exists +`closeSMPClientAgent` (line 369) does NOT clear these 4 maps. ---- +#### 3. `NtfStore` — `NtfStore.hs:26` -### What is NOT the Main Cause +`NtfStore (TMap NotifierId (TVar [MsgNtf]))` — one entry per NotifierId. -1. **Proxy connection state (`smpClients`)**: Properly managed — one entry per destination server, cleaned after 30-second `persistErrorInterval`. Max 292 entries. 
Code at `Client/Agent.hs:196-242` and `Session.hs:24-39` is correct. +`deleteExpiredNtfs` (line 47) filters expired notifications from lists but does **not remove entries with empty lists** from the TMap. Over time, NotifierIds that no longer receive notifications leave zombie `TVar []` entries. -2. **SubscribedClients**: Despite the misleading comment at `Env/STM.hs:376` saying "subscriptions are never removed," they ARE cleaned up on both client disconnect (`Server.hs:1112`) and queue deletion (`Server.hs:308` via `lookupDeleteSubscribedClient`). +`deleteNtfs` (line 44) does remove the full entry via `TM.lookupDelete` — but only called when a notifier is explicitly deleted. -3. **Client structures**: 22K × ~3KB = ~66MB — negligible. +#### 4. `serviceLocks` in PostgresQueueStore — `Postgres.hs:112,469` + +`serviceLocks :: TMap CertFingerprint Lock` — one Lock per unique certificate fingerprint. + +`getCreateService` (line 469) calls `withLockMap (serviceLocks st) fp` which calls `getMapLock` (Agent/Client.hs:1029-1032) — this **unconditionally inserts** a Lock into the TMap. There is **no cleanup code** for serviceLocks anywhere. This is NOT guarded by `useCache`. + +#### 5. `sentCommands` per proxy client connection — `Client.hs:580` + +Each `PClient` has `sentCommands :: TMap CorrId (Request err msg)`. Entries are added per command sent (line 1369) and only removed when a response arrives (line 698). If a connection drops before all responses arrive, entries remain until the `PClient` is GC'd. Since `PClient` is captured by the connection thread which terminates on error, the `PClient` should become GC-eligible — but GC timing depends on heap pressure. + +#### 6. `subQ :: TQueue (ClientSub, ClientId)` — `Env/STM.hs:363` + +Unbounded `TQueue` for subscription changes. If the subscriber thread (`serverThread`) can't process changes fast enough, this queue grows without backpressure. 
With 22K clients subscribing/unsubscribing, sustained bursts could cause this queue to bloat. -4. **TBQueues**: Bounded (`TBQueue` with `tbqSize = 128`), properly sized. +--- + +### Ruled Out -5. **Thread management**: `forkClient` uses weak references, `finally` blocks ensure cleanup. Well-managed. +1. **PostgreSQL queue cache**: `useCache = False` — `queues`, `senders`, `links`, `notifiers` TMaps are empty. +2. **`notifierLocks`**: Guarded by `useCache` (Postgres.hs:377,405) — not used with `useCache = False`. +3. **Client structures**: 22K × ~3KB = ~66MB — negligible. +4. **TBQueues**: Bounded (`tbqSize = 128`). +5. **Thread management**: `forkClient` uses weak refs + `finally` blocks. `endThreads` cleared on disconnect. +6. **Proxy `smpClients`/`smpSessions`**: Properly cleaned on disconnect/expiry. +7. **`smpSubWorkers`**: Properly cleaned on worker completion; also cleared in `closeSMPClientAgent`. +8. **`pendingEvents`**: Atomically swapped empty every `pendingENDInterval`. +9. **Stats IORef counters**: Fixed number, bounded. +10. **DB connection pool**: Bounded `TBQueue` with bracket-based return. --- -### Memory Budget Estimate +### Insufficient Data to Determine Root Cause -| Component | Estimated Size | -|-----------|---------------| -| Postgres queue cache (growing) | **5-15+ GB** | -| TLS connection state (22K × ~30KB) | ~660 MB | -| GHC nursery (16 × 16MB) | 256 MB | -| Client structures | ~66 MB | -| Proxy agent state | ~50 KB | -| **GHC old gen headroom (2× live)** | **doubles effective usage** | +Without measuring the actual sizes of these structures at runtime, we cannot determine which (if any) is the primary contributor. The following exact logging changes will identify the root cause. --- -### Recommended Fixes (Priority Order) +### EXACT LOGS TO ADD -**1. Remove zombie queue entries from cache on deletion** (immediate fix) +Add a new periodic logging thread in `src/Simplex/Messaging/Server.hs`. 
-In `deleteStoreQueue` in `Postgres.hs`, add `TM.delete rId $ queues st` and `forM_ (queueData q) $ \(lnkId, _) -> TM.delete lnkId $ links st`: +Insert at `Server.hs:197` (after `prometheusMetricsThread_`): ```haskell --- After the existing cleanup: -when (useCache st) $ do - atomically $ TM.delete rId $ queues st -- ADD THIS - atomically $ TM.delete (senderId q) $ senders st - forM_ (queueData q) $ \(lnkId, _) -> - atomically $ TM.delete lnkId $ links st -- ADD THIS - forM_ (notifier q) $ \NtfCreds {notifierId} -> do - atomically $ TM.delete notifierId $ notifiers st - atomically $ TM.delete notifierId $ notifierLocks st + <> memoryDiagThread_ cfg ``` -**2. Add configuration option to disable cache** — allow `useCache = False` via INI config for Postgres deployments where the DB is fast enough. +Then define: + +```haskell + memoryDiagThread_ :: ServerConfig s -> [M s ()] + memoryDiagThread_ ServerConfig {prometheusInterval = Just _} = + [memoryDiagThread] + memoryDiagThread_ _ = [] + + memoryDiagThread :: M s () + memoryDiagThread = do + labelMyThread "memoryDiag" + Env { ntfStore = NtfStore ntfMap + , server = srv@Server {subscribers, ntfSubscribers} + , proxyAgent = ProxyAgent {smpAgent = pa} + , msgStore_ = ms + } <- ask + let interval = 300_000_000 -- 5 minutes + liftIO $ forever $ do + threadDelay interval + -- GHC RTS stats + rts <- getRTSStats + let liveBytes = gcdetails_live_bytes $ gc rts + heapSize = gcdetails_mem_in_use_bytes $ gc rts + gcCount = gcs rts + -- Server structures + clientCount <- IM.size <$> getServerClients srv + -- SubscribedClients (queue and service subscribers for both SMP and NTF) + smpQSubs <- M.size <$> getSubscribedClients (queueSubscribers subscribers) + smpSSubs <- M.size <$> getSubscribedClients (serviceSubscribers subscribers) + ntfQSubs <- M.size <$> getSubscribedClients (queueSubscribers ntfSubscribers) + ntfSSubs <- M.size <$> getSubscribedClients (serviceSubscribers ntfSubscribers) + -- Pending events + smpPending <- 
IM.size <$> readTVarIO (pendingEvents subscribers) + ntfPending <- IM.size <$> readTVarIO (pendingEvents ntfSubscribers) + -- NtfStore + ntfStoreSize <- M.size <$> readTVarIO ntfMap + -- ProxyAgent maps + let SMPClientAgent {smpClients, smpSessions, activeServiceSubs, activeQueueSubs, pendingServiceSubs, pendingQueueSubs, smpSubWorkers} = pa + paClients <- M.size <$> readTVarIO smpClients + paSessions <- M.size <$> readTVarIO smpSessions + paActSvc <- M.size <$> readTVarIO activeServiceSubs + paActQ <- M.size <$> readTVarIO activeQueueSubs + paPndSvc <- M.size <$> readTVarIO pendingServiceSubs + paPndQ <- M.size <$> readTVarIO pendingQueueSubs + paWorkers <- M.size <$> readTVarIO smpSubWorkers + -- Loaded queue counts + lc <- loadedQueueCounts $ fromMsgStore ms + -- Log everything + logInfo $ + "MEMORY " + <> "rts_live=" <> tshow liveBytes + <> " rts_heap=" <> tshow heapSize + <> " rts_gc=" <> tshow gcCount + <> " clients=" <> tshow clientCount + <> " smpQSubs=" <> tshow smpQSubs + <> " smpSSubs=" <> tshow smpSSubs + <> " ntfQSubs=" <> tshow ntfQSubs + <> " ntfSSubs=" <> tshow ntfSSubs + <> " smpPending=" <> tshow smpPending + <> " ntfPending=" <> tshow ntfPending + <> " ntfStore=" <> tshow ntfStoreSize + <> " paClients=" <> tshow paClients + <> " paSessions=" <> tshow paSessions + <> " paActSvc=" <> tshow paActSvc + <> " paActQ=" <> tshow paActQ + <> " paPndSvc=" <> tshow paPndSvc + <> " paPndQ=" <> tshow paPndQ + <> " paWorkers=" <> tshow paWorkers + <> " loadedQ=" <> tshow (loadedQueueCount lc) + <> " loadedNtf=" <> tshow (loadedNotifierCount lc) + <> " ntfLocks=" <> tshow (notifierLockCount lc) +``` + +Note: `smpSubs.subsCount` (queueSubscribers size) and `smpSubs.subServicesCount` (serviceSubscribers size) are **already logged** in Prometheus (lines 475-496). The log above adds all other candidate structures plus GHC RTS memory stats. 
+ +This produces a single log line every 5 minutes: + +``` +[INFO] MEMORY rts_live=10737418240 rts_heap=23488102400 rts_gc=4521 clients=21927 smpQSubs=1847233 smpSSubs=42 ntfQSubs=982112 ntfSSubs=31 smpPending=0 ntfPending=0 ntfStore=512844 paClients=12 paSessions=12 paActSvc=0 paActQ=0 paPndSvc=0 paPndQ=0 paWorkers=3 loadedQ=0 loadedNtf=0 ntfLocks=0 +``` + +### What Each Metric Tells Us + +| Metric | What it reveals | If growing = suspect | +|--------|----------------|---------------------| +| `rts_live` | Actual live data after last major GC | Baseline — everything else should add up to this | +| `rts_heap` | Total heap (should be ~2× rts_live) | If >> 2× live, fragmentation issue | +| `clients` | Connected client count | Known: ~22K | +| `smpQSubs` | SubscribedClients map size (queue subs) | If >> clients × avg_subs, entries not cleaned | +| `smpSSubs` | SubscribedClients map size (service subs) | Should be small | +| `ntfQSubs` | NTF SubscribedClients map (queue subs) | Same concern as smpQSubs | +| `ntfSSubs` | NTF SubscribedClients map (service subs) | Should be small | +| `smpPending` / `ntfPending` | Pending END/DELD events per client | If large, subscriber thread lagging | +| `ntfStore` | NotifierId count in NtfStore | If growing monotonically, zombie entries | +| `paClients` | Proxy connections to other servers | Should be <= unique dest servers | +| `paSessions` | Active proxy sessions | Should match paClients | +| `paActSvc` / `paActQ` | Proxy active subscriptions | If growing, entries never removed | +| `paPndSvc` / `paPndQ` | Proxy pending subscriptions | If growing, resubscription stuck | +| `paWorkers` | Active reconnect workers | If growing, workers stuck in retry | +| `loadedQ` | Cached queues in store (0 with useCache=False) | Should be 0 | +| `ntfLocks` | Notifier locks in store | Should be 0 with useCache=False | + +### Interpretation Guide + +**If `smpQSubs` is in the millions**: SubscribedClients is the primary leak. 
Entries accumulate for every queue ever subscribed to. + +**If `ntfStore` grows monotonically**: Zombie notification entries (empty lists after expiration). Fix: `deleteExpiredNtfs` should remove entries with empty lists. + +**If `paActSvc` + `paActQ` grow**: Proxy agent subscription maps are the leak. Fix: add cleanup when no active/pending subs exist for a server. -**3. Add cache eviction** — periodically evict queues not accessed recently (LRU or TTL-based). +**If `rts_live` is much smaller than `rts_heap`**: GHC heap fragmentation. Fix: tune `-F` flag (GC trigger factor) or use `-c` (compacting GC). -**4. Tune GHC RTS**: -- Add `-F1.5` (or lower) to reduce the major GC threshold multiplier from default 2.0 -- Consider `-M` to set a hard heap cap -- Consider `-N8` instead of `-N` to halve the nursery size and reduce per-capability overhead +**If `rts_live` ~ 10-12GB**: The live data is genuinely large. Look at which metric is the largest contributor. -**5. Add observability** — log `loadedQueueCount` periodically to track cache growth. This metric exists in Prometheus but should also appear in regular stats logs. +**If nothing above is large but `rts_live` is large**: The leak is in a structure not measured here — likely TLS connection buffers, ByteString retention from Postgres queries, or GHC runtime overhead. Next step would be heap profiling with `-hT`. From 646476f5fa38347a4be7acf0d702cf4a44bc4c45 Mon Sep 17 00:00:00 2001 From: sh Date: Thu, 19 Mar 2026 07:11:56 +0000 Subject: [PATCH 3/8] smp-server: add periodic memory diagnostics logging Log sizes of all in-memory data structures every 5 minutes to help identify memory growth root cause on busy servers. 
--- src/Simplex/Messaging/Server.hs | 57 ++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/src/Simplex/Messaging/Server.hs b/src/Simplex/Messaging/Server.hs index e50416af6..93c1d0195 100644 --- a/src/Simplex/Messaging/Server.hs +++ b/src/Simplex/Messaging/Server.hs @@ -91,7 +91,7 @@ import qualified Data.X509 as X import qualified Data.X509.Validation as XV import GHC.Conc.Signal import GHC.IORef (atomicSwapIORef) -import GHC.Stats (getRTSStats) +import GHC.Stats (RTSStats (..), GCDetails (..), getRTSStats) import GHC.TypeLits (KnownNat) import Network.Socket (ServiceName, Socket, socketToHandle) import qualified Network.TLS as TLS @@ -198,6 +198,7 @@ smpServer started cfg@ServerConfig {transports, transportConfig = tCfg, startOpt <> serverStatsThread_ cfg <> prometheusMetricsThread_ cfg <> controlPortThread_ cfg + <> [memoryDiagThread] ) `finally` stopServer s where @@ -719,6 +720,60 @@ smpServer started cfg@ServerConfig {transports, transportConfig = tCfg, startOpt Nothing -> acc Just (_, ts) -> (cnt + 1, updateTimeBuckets ts ts' times) + memoryDiagThread :: M s () + memoryDiagThread = do + labelMyThread "memoryDiag" + Env + { ntfStore = NtfStore ntfMap, + server = srv@Server {subscribers, ntfSubscribers}, + proxyAgent = ProxyAgent {smpAgent = pa}, + msgStore_ = ms + } <- ask + let SMPClientAgent {smpClients, smpSessions, activeServiceSubs, activeQueueSubs, pendingServiceSubs, pendingQueueSubs, smpSubWorkers} = pa + liftIO $ forever $ do + threadDelay 300_000_000 -- 5 minutes + rts <- getRTSStats + let GCDetails {gcdetails_live_bytes, gcdetails_mem_in_use_bytes} = gc rts + clientCount <- IM.size <$> getServerClients srv + smpQSubs <- M.size <$> getSubscribedClients (queueSubscribers subscribers) + smpSSubs <- M.size <$> getSubscribedClients (serviceSubscribers subscribers) + ntfQSubs <- M.size <$> getSubscribedClients (queueSubscribers ntfSubscribers) + ntfSSubs <- M.size <$> getSubscribedClients (serviceSubscribers 
ntfSubscribers) + smpPending <- IM.size <$> readTVarIO (pendingEvents subscribers) + ntfPending <- IM.size <$> readTVarIO (pendingEvents ntfSubscribers) + ntfStoreSize <- M.size <$> readTVarIO ntfMap + paClients' <- M.size <$> readTVarIO smpClients + paSessions' <- M.size <$> readTVarIO smpSessions + paActSvc <- M.size <$> readTVarIO activeServiceSubs + paActQ <- M.size <$> readTVarIO activeQueueSubs + paPndSvc <- M.size <$> readTVarIO pendingServiceSubs + paPndQ <- M.size <$> readTVarIO pendingQueueSubs + paWorkers <- M.size <$> readTVarIO smpSubWorkers + lc <- loadedQueueCounts $ fromMsgStore ms + logInfo $ + "MEMORY" + <> " rts_live=" <> tshow gcdetails_live_bytes + <> " rts_heap=" <> tshow gcdetails_mem_in_use_bytes + <> " rts_gc=" <> tshow (gcs rts) + <> " clients=" <> tshow clientCount + <> " smpQSubs=" <> tshow smpQSubs + <> " smpSSubs=" <> tshow smpSSubs + <> " ntfQSubs=" <> tshow ntfQSubs + <> " ntfSSubs=" <> tshow ntfSSubs + <> " smpPending=" <> tshow smpPending + <> " ntfPending=" <> tshow ntfPending + <> " ntfStore=" <> tshow ntfStoreSize + <> " paClients=" <> tshow paClients' + <> " paSessions=" <> tshow paSessions' + <> " paActSvc=" <> tshow paActSvc + <> " paActQ=" <> tshow paActQ + <> " paPndSvc=" <> tshow paPndSvc + <> " paPndQ=" <> tshow paPndQ + <> " paWorkers=" <> tshow paWorkers + <> " loadedQ=" <> tshow (loadedQueueCount lc) + <> " loadedNtf=" <> tshow (loadedNotifierCount lc) + <> " ntfLocks=" <> tshow (notifierLockCount lc) + runClient :: Transport c => X.CertificateChain -> C.APrivateSignKey -> TProxy c 'TServer -> c 'TServer -> M s () runClient srvCert srvSignKey tp h = do ms <- asks msgStore From c6d6e30e48839b345427ce295bc4f7631aa3ebd7 Mon Sep 17 00:00:00 2001 From: sh Date: Thu, 19 Mar 2026 13:53:39 +0000 Subject: [PATCH 4/8] add memory-analysis-results based on the produced logs --- docs/plans/memory-analysis-results.md | 37 +++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 
docs/plans/memory-analysis-results.md diff --git a/docs/plans/memory-analysis-results.md b/docs/plans/memory-analysis-results.md new file mode 100644 index 000000000..98bffffb5 --- /dev/null +++ b/docs/plans/memory-analysis-results.md @@ -0,0 +1,37 @@ +## Memory Diagnostics Results (5.5 hours, 07:49 - 13:19, Mar 19) + +**rts_heap (total process heap) — grows monotonically, never shrinks:** +``` +07:49 10.1 GB +09:24 14.1 GB +11:24 18.7 GB +13:19 20.7 GB +``` +Growth: **+10.6 GB in 5.5 hours** (~1.9 GB/hour). GHC never returns memory to the OS. + +**rts_live (actual live data) — sawtooth pattern, minimums growing:** +``` +07:54 5.5 GB (post-GC valley) +08:54 6.2 GB +09:44 6.6 GB +11:24 6.6 GB +13:14 9.1 GB +``` +The post-GC floor is rising: **+3.6 GB over 5.5 hours**. This confirms a genuine leak. + +**But smpQSubs is NOT the cause** — it oscillates between 1.2M-1.4M, not growing monotonically. At ~130 bytes/entry, 1.4M entries = ~180MB. Can't explain 9GB. + +**clients** oscillates 14K-20K, also not monotonically growing. + +**Everything else is tiny**: ntfStore ~7K entries (<1MB), paClients ~350 (~50KB), all other metrics near 0. + +**The leak is in something we're not measuring.** ~6-9GB of live data is unaccounted for by all tracked structures. The most likely candidates are: + +1. **Per-client state we didn't measure** — the *contents* of TBQueues (buffered messages), per-client `subscriptions` TMap contents (Sub records with TVars) +2. **TLS connection buffers** — the `tls` library allocates internal state per connection +3. **Pinned ByteStrings** from PostgreSQL queries — these aren't collected by normal GC +4. 
**GHC heap fragmentation** — pinned objects cause block-level fragmentation + +The next step is either: +- **Add more metrics**: measure total TBQueue fill across all clients, total subscription count, and pinned byte count from RTS stats +- **Run with `-hT`**: heap profiling by type to see exactly what's consuming memory From 323be9c6a4b0fb906c0a76ce5e51285e79d38eed Mon Sep 17 00:00:00 2001 From: sh Date: Fri, 20 Mar 2026 06:57:29 +0000 Subject: [PATCH 5/8] feat(smp-server): add per-client and RTS memory metrics Add clientSubs, clientSndQ, clientMsgQ, clientThreads counts and GHC RTS large objects, compact, fragmentation metrics to identify memory growth not explained by server-level maps. --- src/Simplex/Messaging/Server.hs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/Simplex/Messaging/Server.hs b/src/Simplex/Messaging/Server.hs index 93c1d0195..f483e163b 100644 --- a/src/Simplex/Messaging/Server.hs +++ b/src/Simplex/Messaging/Server.hs @@ -733,7 +733,7 @@ smpServer started cfg@ServerConfig {transports, transportConfig = tCfg, startOpt liftIO $ forever $ do threadDelay 300_000_000 -- 5 minutes rts <- getRTSStats - let GCDetails {gcdetails_live_bytes, gcdetails_mem_in_use_bytes} = gc rts + let GCDetails {gcdetails_live_bytes, gcdetails_mem_in_use_bytes, gcdetails_large_objects_bytes, gcdetails_compact_bytes, gcdetails_block_fragmentation_bytes} = gc rts clientCount <- IM.size <$> getServerClients srv smpQSubs <- M.size <$> getSubscribedClients (queueSubscribers subscribers) smpSSubs <- M.size <$> getSubscribedClients (serviceSubscribers subscribers) @@ -750,12 +750,27 @@ smpServer started cfg@ServerConfig {transports, transportConfig = tCfg, startOpt paPndQ <- M.size <$> readTVarIO pendingQueueSubs paWorkers <- M.size <$> readTVarIO smpSubWorkers lc <- loadedQueueCounts $ fromMsgStore ms + -- per-client metrics: total subscriptions and queue fill + clients <- getServerClients srv + let clientsList = IM.elems clients + 
totalSubs <- sum <$> mapM (\Client {subscriptions} -> M.size <$> readTVarIO subscriptions) clientsList + totalSndQ <- sum <$> mapM (\Client {sndQ} -> fromIntegral <$> atomically (lengthTBQueue sndQ)) clientsList + totalMsgQ <- sum <$> mapM (\Client {msgQ} -> fromIntegral <$> atomically (lengthTBQueue msgQ)) clientsList + totalEndThreads <- sum <$> mapM (\Client {endThreads} -> IM.size <$> readTVarIO endThreads) clientsList logInfo $ "MEMORY" <> " rts_live=" <> tshow gcdetails_live_bytes <> " rts_heap=" <> tshow gcdetails_mem_in_use_bytes + <> " rts_max_live=" <> tshow (max_live_bytes rts) + <> " rts_large=" <> tshow gcdetails_large_objects_bytes + <> " rts_compact=" <> tshow gcdetails_compact_bytes + <> " rts_frag=" <> tshow gcdetails_block_fragmentation_bytes <> " rts_gc=" <> tshow (gcs rts) <> " clients=" <> tshow clientCount + <> " clientSubs=" <> tshow totalSubs + <> " clientSndQ=" <> tshow (totalSndQ :: Int) + <> " clientMsgQ=" <> tshow (totalMsgQ :: Int) + <> " clientThreads=" <> tshow totalEndThreads <> " smpQSubs=" <> tshow smpQSubs <> " smpSSubs=" <> tshow smpSSubs <> " ntfQSubs=" <> tshow ntfQSubs From 4c8ace4db6ac76ead3222e31f0b99ff112bdd475 Mon Sep 17 00:00:00 2001 From: sh Date: Fri, 20 Mar 2026 08:27:24 +0000 Subject: [PATCH 6/8] update findings --- docs/plans/memory-analysis-results.md | 117 ++++++++++++++++++++------ 1 file changed, 90 insertions(+), 27 deletions(-) diff --git a/docs/plans/memory-analysis-results.md b/docs/plans/memory-analysis-results.md index 98bffffb5..5cb9cf555 100644 --- a/docs/plans/memory-analysis-results.md +++ b/docs/plans/memory-analysis-results.md @@ -1,37 +1,100 @@ -## Memory Diagnostics Results (5.5 hours, 07:49 - 13:19, Mar 19) +## Memory Diagnostics Results -**rts_heap (total process heap) — grows monotonically, never shrinks:** -``` -07:49 10.1 GB -09:24 14.1 GB -11:24 18.7 GB -13:19 20.7 GB -``` -Growth: **+10.6 GB in 5.5 hours** (~1.9 GB/hour). GHC never returns memory to the OS. 
+### Data Collection + +Server: smp19.simplex.im, PostgreSQL backend, `useCache = False` +RTS flags: `+RTS -N -A16m -I0.01 -Iw15 -s -RTS` (16 cores) + +### Mar 20 Data (1 hour, 07:19-08:19) -**rts_live (actual live data) — sawtooth pattern, minimums growing:** ``` -07:54 5.5 GB (post-GC valley) -08:54 6.2 GB -09:44 6.6 GB -11:24 6.6 GB -13:14 9.1 GB +Time rts_live rts_heap rts_large rts_frag clients non-large +07:19 7.5 GB 8.2 GB 5.5 GB 0.03 GB 14,000 2.0 GB +07:24 6.4 GB 10.8 GB 5.2 GB 3.6 GB 14,806 1.2 GB +07:29 8.2 GB 10.8 GB 6.5 GB 1.8 GB 15,667 1.7 GB +07:34 10.0 GB 12.3 GB 7.9 GB 1.4 GB 15,845 2.1 GB +07:39 6.7 GB 13.0 GB 5.3 GB 5.6 GB 16,589 1.4 GB +07:44 8.5 GB 13.0 GB 6.7 GB 3.7 GB 16,283 1.8 GB +07:49 6.5 GB 13.0 GB 5.2 GB 5.8 GB 16,532 1.3 GB +07:54 6.0 GB 13.0 GB 4.8 GB 6.3 GB 16,636 1.2 GB +07:59 6.4 GB 13.0 GB 5.1 GB 5.9 GB 16,769 1.3 GB +08:04 8.3 GB 13.0 GB 6.5 GB 3.9 GB 17,352 1.8 GB +08:09 10.2 GB 13.0 GB 8.0 GB 1.9 GB 17,053 2.2 GB +08:14 5.6 GB 13.0 GB 4.5 GB 6.8 GB 17,147 1.1 GB +08:19 7.6 GB 13.0 GB 6.1 GB 4.6 GB 17,496 1.5 GB ``` -The post-GC floor is rising: **+3.6 GB over 5.5 hours**. This confirms a genuine leak. -**But smpQSubs is NOT the cause** — it oscillates between 1.2M-1.4M, not growing monotonically. At ~130 bytes/entry, 1.4M entries = ~180MB. Can't explain 9GB. +non-large = rts_live - rts_large (normal Haskell heap objects: Maps, TVars, closures) + +### Mar 19 Data (5.5 hours, 07:49-13:19) + +rts_heap grew from 10.1 GB to 20.7 GB over 5.5 hours. +Post-GC rts_live floor rose from 5.5 GB to 9.1 GB. + +### Findings + +**1. Large/pinned objects dominate live data (60-80%)** + +`rts_large` = 4.5-8.0 GB out of 5.6-10.2 GB live. These are allocations > ~3KB that go on GHC's large object heap. They oscillate (not growing monotonically), meaning they are being allocated and freed constantly — transient, not leaked. + +**2. Fragmentation is the heap growth mechanism** + +`rts_heap ≈ rts_live + rts_frag`. 
The heap grows because pinned/large objects fragment GHC's block allocator. Once GHC expands the heap, it never shrinks. Growth pattern: +- Large objects allocated → occupy blocks +- Large objects freed → blocks can't be reused if ANY other object shares the block +- New allocations need fresh blocks → heap expands +- Heap never returns memory to OS + +**3. Non-large heap data is stable (~1.0-2.2 GB)** + +Normal Haskell objects (Maps, TVars, closures, client structures) account for only 1-2 GB. This scales with client count at ~100-130 KB/client and does NOT grow over time. + +**4. All tracked data structures are NOT the cause** + +- `clientSndQ=0, clientMsgQ=0` — TBQueues empty, no message accumulation +- `smpQSubs` oscillates ~1.0-1.4M — entries are cleaned up, not leaking +- `ntfStore` < 2K entries — negligible +- All proxy agent maps near 0 +- `loadedQ=0` — useCache=False confirmed working + +**5. Source of large objects is unclear without heap profiling** + +The 4.5-8.0 GB of large objects could come from: +- PostgreSQL driver (`postgresql-simple`/`libpq`) — pinned ByteStrings for query results +- TLS library (`tls`) — pinned buffers per connection +- Network socket I/O — pinned ByteStrings for recv/send +- SMP protocol message blocks + +Cannot distinguish between these without `-hT` heap profiling (which is too expensive for this server). + +### Root Cause + +**GHC heap fragmentation from constant churn of large/pinned ByteString allocations.** + +Not a data structure leak. The live data itself is reasonable (5-10 GB for 15-17K clients). The problem is that GHC's copying GC cannot compact around pinned objects, so the heap grows with fragmentation and never shrinks. + +### Mitigation Options -**clients** oscillates 14K-20K, also not monotonically growing. +All are RTS flag changes — no rebuild needed, reversible by restart. -**Everything else is tiny**: ntfStore ~7K entries (<1MB), paClients ~350 (~50KB), all other metrics near 0. +**1. 
`-F1.2`** (reduce GC trigger factor from default 2.0) +- Triggers major GC when heap reaches 1.2x live data instead of 2x +- Reclaims fragmented blocks sooner +- Trade-off: more frequent GC, slightly higher CPU +- Risk: low — just makes GC run more often -**The leak is in something we're not measuring.** ~6-9GB of live data is unaccounted for by all tracked structures. The most likely candidates are: +**2. Reduce `-A16m` to `-A4m`** (smaller nursery) +- More frequent minor GC → short-lived pinned objects freed faster +- Trade-off: more GC cycles, but each is smaller +- Risk: low — may actually improve latency by reducing GC pause times -1. **Per-client state we didn't measure** — the *contents* of TBQueues (buffered messages), per-client `subscriptions` TMap contents (Sub records with TVars) -2. **TLS connection buffers** — the `tls` library allocates internal state per connection -3. **Pinned ByteStrings** from PostgreSQL queries — these aren't collected by normal GC -4. **GHC heap fragmentation** — pinned objects cause block-level fragmentation +**3. `+RTS -xn`** (nonmoving GC) +- Designed for pinned-heavy workloads — avoids copying entirely +- Available since GHC 8.10, improved in 9.x +- Trade-off: different GC characteristics, less battle-tested +- Risk: medium — different GC algorithm, should test first -The next step is either: -- **Add more metrics**: measure total TBQueue fill across all clients, total subscription count, and pinned byte count from RTS stats -- **Run with `-hT`**: heap profiling by type to see exactly what's consuming memory +**4. 
Limit concurrent connections** (application-level) +- Since large objects scale per-client, fewer clients = less fragmentation +- Trade-off: reduced capacity +- Risk: low but impacts users From 2a0af04ab845169c392420ded0fecd95bc195cf0 Mon Sep 17 00:00:00 2001 From: sh Date: Fri, 20 Mar 2026 14:48:11 +0000 Subject: [PATCH 7/8] feat(bench): add smp-server memory benchmark framework Layered benchmark that isolates per-component memory cost: - Phase 1: baseline (no clients) - Phase 2: TLS connections only - Phase 3: queue creation (NEW + KEY) - Phase 4: subscriptions (SUB) - Phase 5: message send - Phase 6: message receive + ACK - Phase 7: sustained load with time-series Includes Docker Compose (PostgreSQL 17), run.sh with --compare-rts mode for testing different GC configurations. --- bench/ClientSim.hs | 133 +++++++++++++++++++++ bench/Main.hs | 243 +++++++++++++++++++++++++++++++++++++++ bench/Report.hs | 113 ++++++++++++++++++ bench/docker-compose.yml | 20 ++++ bench/init.sql | 2 + bench/run.sh | 48 ++++++++ simplexmq.cabal | 30 +++++ 7 files changed, 589 insertions(+) create mode 100644 bench/ClientSim.hs create mode 100644 bench/Main.hs create mode 100644 bench/Report.hs create mode 100644 bench/docker-compose.yml create mode 100644 bench/init.sql create mode 100755 bench/run.sh diff --git a/bench/ClientSim.hs b/bench/ClientSim.hs new file mode 100644 index 000000000..319c8482f --- /dev/null +++ b/bench/ClientSim.hs @@ -0,0 +1,133 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DuplicateRecordFields #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE TypeApplications #-} +{-# OPTIONS_GHC -fno-warn-ambiguous-fields #-} + +module ClientSim + ( SimClient (..), + connectClient, + createQueue, + subscribeQueue, + sendMessage, + receiveAndAck, + connectN, + benchKeyHash, + ) +where + 
+import Control.Concurrent.Async (mapConcurrently) +import Control.Concurrent.STM +import Control.Monad (forM_) +import Control.Monad.Except (runExceptT) +import Data.ByteString.Char8 (ByteString) +import Data.List (unfoldr) +import qualified Data.List.NonEmpty as L +import Network.Socket (ServiceName) +import qualified Simplex.Messaging.Crypto as C +import Simplex.Messaging.Encoding +import Simplex.Messaging.Protocol +import Simplex.Messaging.Transport +import Simplex.Messaging.Transport.Client +import Simplex.Messaging.Version + +data SimClient = SimClient + { scHandle :: THandleSMP TLS 'TClient, + scRcvKey :: C.APrivateAuthKey, + scRcvId :: RecipientId, + scSndId :: SenderId, + scDhSecret :: C.DhSecret 'C.X25519 + } + +benchKeyHash :: C.KeyHash +benchKeyHash = "LcJUMfVhwD8yxjAiSaDzzGF3-kLG4Uh0Fl_ZIjrRwjI=" + +connectClient :: TransportHost -> ServiceName -> IO (THandleSMP TLS 'TClient) +connectClient host port = do + let tcConfig = defaultTransportClientConfig {clientALPN = Just alpnSupportedSMPHandshakes} + runTransportClient tcConfig Nothing host port (Just benchKeyHash) $ \h -> + runExceptT (smpClientHandshake h Nothing benchKeyHash supportedClientSMPRelayVRange False Nothing) >>= \case + Right th -> pure th + Left e -> error $ "SMP handshake failed: " <> show e + +connectN :: Int -> TransportHost -> ServiceName -> IO [THandleSMP TLS 'TClient] +connectN n host port = do + let batches = chunksOf 100 [1 .. 
n] + concat <$> mapM (\batch -> mapConcurrently (\_ -> connectClient host port) batch) batches + +createQueue :: THandleSMP TLS 'TClient -> IO SimClient +createQueue h = do + g <- C.newRandom + (rPub, rKey) <- atomically $ C.generateAuthKeyPair C.SEd448 g + (sPub, sKey) <- atomically $ C.generateAuthKeyPair C.SEd25519 g + (dhPub, dhPriv :: C.PrivateKeyX25519) <- atomically $ C.generateKeyPair g + -- NEW command + Resp "1" NoEntity (Ids rId sId srvDh) <- signSendRecv h rKey ("1", NoEntity, New rPub dhPub) + let dhShared = C.dh' srvDh dhPriv + -- KEY command (secure queue) + Resp "2" _ OK <- signSendRecv h rKey ("2", rId, KEY sPub) + pure SimClient {scHandle = h, scRcvKey = rKey, scRcvId = rId, scSndId = sId, scDhSecret = dhShared} + +subscribeQueue :: SimClient -> IO () +subscribeQueue SimClient {scHandle = h, scRcvKey = rKey, scRcvId = rId} = do + Resp "3" _ (SOK _) <- signSendRecv h rKey ("3", rId, SUB) + pure () + +sendMessage :: THandleSMP TLS 'TClient -> C.APrivateAuthKey -> SenderId -> ByteString -> IO () +sendMessage h sKey sId body = do + Resp "4" _ OK <- signSendRecv h sKey ("4", sId, SEND noMsgFlags body) + pure () + +receiveAndAck :: SimClient -> IO () +receiveAndAck SimClient {scHandle = h, scRcvKey = rKey, scRcvId = rId} = do + (_, _, Right (MSG RcvMessage {msgId = mId})) <- tGet1 h + Resp "5" _ OK <- signSendRecv h rKey ("5", rId, ACK mId) + pure () + +-- Helpers (same patterns as ServerTests.hs) + +pattern Resp :: CorrId -> EntityId -> BrokerMsg -> Transmission (Either ErrorType BrokerMsg) +pattern Resp corrId queueId command <- (corrId, queueId, Right command) + +pattern Ids :: RecipientId -> SenderId -> RcvPublicDhKey -> BrokerMsg +pattern Ids rId sId srvDh <- IDS (QIK rId sId srvDh _ _ Nothing Nothing) + +pattern New :: RcvPublicAuthKey -> RcvPublicDhKey -> Command 'Creator +pattern New rPub dhPub = NEW (NewQueueReq rPub dhPub Nothing SMSubscribe (Just (QRMessaging Nothing)) Nothing) + +signSendRecv :: (Transport c, PartyI p) => THandleSMP c 
'TClient -> C.APrivateAuthKey -> (ByteString, EntityId, Command p) -> IO (Transmission (Either ErrorType BrokerMsg)) +signSendRecv h pk t = do + signSend h pk t + (r L.:| _) <- tGetClient h + pure r + +signSend :: (Transport c, PartyI p) => THandleSMP c 'TClient -> C.APrivateAuthKey -> (ByteString, EntityId, Command p) -> IO () +signSend h@THandle {params} (C.APrivateAuthKey a pk) (corrId, qId, cmd) = do + let TransmissionForAuth {tForAuth, tToSend} = encodeTransmissionForAuth params (CorrId corrId, qId, cmd) + authorize t = (,Nothing) <$> case a of + C.SEd25519 -> Just . TASignature . C.ASignature C.SEd25519 $ C.sign' pk t + C.SEd448 -> Just . TASignature . C.ASignature C.SEd448 $ C.sign' pk t + C.SX25519 -> (\THAuthClient {peerServerPubKey = k} -> TAAuthenticator $ C.cbAuthenticate k pk (C.cbNonce corrId) t) <$> thAuth params + Right () <- tPut1 h (authorize tForAuth, tToSend) + pure () + +tPut1 :: Transport c => THandle v c 'TClient -> SentRawTransmission -> IO (Either TransportError ()) +tPut1 h t = do + rs <- tPut h (Right t L.:| []) + case rs of + (r : _) -> pure r + [] -> error "tPut1: empty result" + +tGet1 :: (ProtocolEncoding v err cmd, Transport c) => THandle v c 'TClient -> IO (Transmission (Either err cmd)) +tGet1 h = do + (r L.:| _) <- tGetClient h + pure r + +chunksOf :: Int -> [a] -> [[a]] +chunksOf n = unfoldr $ \xs -> if null xs then Nothing else Just (splitAt n xs) diff --git a/bench/Main.hs b/bench/Main.hs new file mode 100644 index 000000000..3d923c9eb --- /dev/null +++ b/bench/Main.hs @@ -0,0 +1,243 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DuplicateRecordFields #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE NumericUnderscores #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# OPTIONS_GHC -fno-warn-ambiguous-fields #-} + +module Main where + +import Control.Concurrent (threadDelay) +import 
Control.Concurrent.Async (async, cancel, forConcurrently_, mapConcurrently, mapConcurrently_) +import Control.Concurrent.STM +import Control.Monad (forever, forM_, void, when) +import Data.ByteString.Char8 (ByteString) +import qualified Data.ByteString.Char8 as B +import Data.IORef +import Data.List (unfoldr) +import Data.Time.Clock (getCurrentTime, utctDayTime) +import Network.Socket (ServiceName) +import System.Environment (getArgs) +import System.IO (hFlush, stdout) + +import ClientSim +import Report + +import Crypto.Random (ChaChaDRG) +import qualified Simplex.Messaging.Crypto as C +import Simplex.Messaging.Server (runSMPServerBlocking) +import Simplex.Messaging.Server.Env.STM as Env +import Simplex.Messaging.Server.Expiration (ExpirationConfig (..)) +import Simplex.Messaging.Server.MsgStore.Postgres (PostgresMsgStore) +import Simplex.Messaging.Server.QueueStore.Postgres.Config (PostgresStoreCfg (..)) +import Simplex.Messaging.Agent.Store.Postgres.Options (DBOpts (..)) +import Simplex.Messaging.Agent.Store.Shared (MigrationConfirmation (..)) +import Simplex.Messaging.Client.Agent (SMPClientAgentConfig (..), defaultSMPClientAgentConfig) +import Simplex.Messaging.Transport +import Simplex.Messaging.Transport.Server (ServerCredentials (..), mkTransportServerConfig) +import Simplex.Messaging.Version +import UnliftIO.Exception (bracket) + +import Control.Logger.Simple (logInfo, withGlobalLogging, LogConfig (..), setLogLevel, LogLevel (..)) + +data BenchConfig = BenchConfig + { numClients :: Int, + sustainedMinutes :: Int, + pgConnStr :: ByteString, + serverPort :: ServiceName, + timeSeriesFile :: FilePath + } + +defaultBenchConfig :: BenchConfig +defaultBenchConfig = + BenchConfig + { numClients = 5000, + sustainedMinutes = 5, + pgConnStr = "postgresql://smp@localhost:15432/smp_bench", + serverPort = "15001", + timeSeriesFile = "bench-timeseries.csv" + } + +parseArgs :: IO BenchConfig +parseArgs = do + args <- getArgs + pure $ go args defaultBenchConfig + where + go 
[] c = c + go ("--clients" : n : rest) c = go rest c {numClients = read n} + go ("--minutes" : n : rest) c = go rest c {sustainedMinutes = read n} + go ("--pg" : s : rest) c = go rest c {pgConnStr = B.pack s} + go ("--port" : p : rest) c = go rest c {serverPort = p} + go ("--timeseries" : f : rest) c = go rest c {timeSeriesFile = f} + go (x : _) _ = error $ "Unknown argument: " <> x + +main :: IO () +main = withGlobalLogging LogConfig {lc_file = Nothing, lc_stderr = True} $ do + setLogLevel LogInfo + bc@BenchConfig {numClients, sustainedMinutes, serverPort, timeSeriesFile, pgConnStr} <- parseArgs + putStrLn $ "SMP Server Memory Benchmark" + putStrLn $ " clients: " <> show numClients + putStrLn $ " sustain: " <> show sustainedMinutes <> " min" + putStrLn $ " pg: " <> B.unpack pgConnStr + putStrLn $ " port: " <> serverPort + putStrLn "" + + snapshotsRef <- newIORef [] + + let snap phase clients = do + s <- takeSnapshot phase clients + modifyIORef' snapshotsRef (s :) + putStrLn $ " [" <> show phase <> "] live=" <> show (snapLive s `div` (1024 * 1024)) <> "MB large=" <> show (snapLarge s `div` (1024 * 1024)) <> "MB" + hFlush stdout + + withBenchServer bc $ do + putStrLn "Phase 1: Baseline (no clients)" + snap "baseline" 0 + + putStrLn $ "Phase 2: Connecting " <> show numClients <> " TLS clients..." + handles <- connectN numClients "localhost" serverPort + putStrLn $ " Connected " <> show (length handles) <> " clients" + snap "tls_connect" (length handles) + + putStrLn "Phase 3: Creating queues (NEW + KEY)..." + simClients <- mapConcurrently createQueue handles + putStrLn $ " Created " <> show (length simClients) <> " queues" + snap "queue_create" (length simClients) + + putStrLn "Phase 4: Subscribing (SUB)..." 
+ mapConcurrently_ subscribeQueue simClients + snap "subscribe" (length simClients) + + -- Pair up clients: first half sends to second half + let halfN = length simClients `div` 2 + senders = take halfN simClients + receivers = drop halfN simClients + pairs = zip senders receivers + + putStrLn $ "Phase 5: Sending " <> show halfN <> " messages..." + g <- C.newRandom + forConcurrently_ pairs $ \(sender, receiver) -> do + (_, sKey) <- atomically $ C.generateAuthKeyPair C.SEd25519 g + sendMessage (scHandle sender) sKey (scSndId receiver) "benchmark test message payload 1234567890" + snap "msg_send" (length simClients) + + putStrLn "Phase 6: Receiving and ACKing messages..." + forConcurrently_ receivers receiveAndAck + snap "msg_recv" (length simClients) + + putStrLn $ "Phase 7: Sustained load (" <> show sustainedMinutes <> " min)..." + writeTimeSeriesHeader timeSeriesFile + -- Logger thread: snapshot every 10s + logger <- async $ forever $ do + threadDelay 10_000_000 + s <- takeSnapshot "sustained" (length simClients) + appendTimeSeries timeSeriesFile s + -- Worker threads: continuous send/receive + let loopDurationUs = sustainedMinutes * 60 * 1_000_000 + workersDone <- newTVarIO False + workers <- async $ do + deadline <- (+ loopDurationUs) <$> getMonotonicTimeUs + sustainedLoop g pairs deadline + atomically $ writeTVar workersDone True + -- Wait for workers + void $ atomically $ readTVar workersDone >>= \done -> when (not done) retry + cancel logger + cancel workers + snap "sustained_end" (length simClients) + + snapshots <- reverse <$> readIORef snapshotsRef + printSummary snapshots + putStrLn $ "\nTime-series written to: " <> timeSeriesFile + +sustainedLoop :: TVar ChaChaDRG -> [(SimClient, SimClient)] -> Int -> IO () +sustainedLoop g pairs deadline = go + where + go = do + now <- getMonotonicTimeUs + when (now < deadline) $ do + forConcurrently_ pairs $ \(sender, receiver) -> do + (_, sKey) <- atomically $ C.generateAuthKeyPair C.SEd25519 g + sendMessage (scHandle 
sender) sKey (scSndId receiver) "sustained load message payload" + forConcurrently_ (map snd pairs) receiveAndAck + go + +getMonotonicTimeUs :: IO Int +getMonotonicTimeUs = do + t <- getCurrentTime + pure $ round (utctDayTime t * 1_000_000) + +withBenchServer :: BenchConfig -> IO a -> IO a +withBenchServer BenchConfig {pgConnStr, serverPort} action = do + started <- newEmptyTMVarIO + let srvCfg = benchServerConfig pgConnStr serverPort + bracket + (async $ runSMPServerBlocking started srvCfg Nothing) + cancel + (\_ -> waitForServer started >> action) + where + waitForServer started = do + r <- atomically $ takeTMVar started + if r + then putStrLn $ "Server started on port " <> serverPort + else error "Server failed to start" + +benchServerConfig :: ByteString -> ServiceName -> ServerConfig PostgresMsgStore +benchServerConfig pgConn port = + let storeCfg = PostgresStoreCfg + { dbOpts = DBOpts {connstr = pgConn, schema = "smp_server", poolSize = 10, createSchema = True}, + dbStoreLogPath = Nothing, + confirmMigrations = MCYesUp, + deletedTTL = 86400 + } + in ServerConfig + { transports = [(port, transport @TLS, False)], + smpHandshakeTimeout = 120_000_000, + tbqSize = 128, + msgQueueQuota = 128, + maxJournalMsgCount = 256, + maxJournalStateLines = 16, + queueIdBytes = 24, + msgIdBytes = 24, + serverStoreCfg = SSCDatabase storeCfg, + storeNtfsFile = Nothing, + allowNewQueues = True, + newQueueBasicAuth = Nothing, + controlPortUserAuth = Nothing, + controlPortAdminAuth = Nothing, + dailyBlockQueueQuota = 20, + messageExpiration = Just defaultMessageExpiration, + expireMessagesOnStart = False, + expireMessagesOnSend = False, + idleQueueInterval = 14400, + notificationExpiration = defaultNtfExpiration, + inactiveClientExpiration = Nothing, + logStatsInterval = Nothing, + logStatsStartTime = 0, + serverStatsLogFile = "bench/tmp/stats.log", + serverStatsBackupFile = Nothing, + prometheusInterval = Nothing, + prometheusMetricsFile = "bench/tmp/metrics.txt", + 
pendingENDInterval = 500_000, + ntfDeliveryInterval = 200_000, + smpCredentials = + ServerCredentials + { caCertificateFile = Just "tests/fixtures/ca.crt", + privateKeyFile = "tests/fixtures/server.key", + certificateFile = "tests/fixtures/server.crt" + }, + httpCredentials = Nothing, + smpServerVRange = supportedServerSMPRelayVRange, + Env.transportConfig = mkTransportServerConfig True (Just alpnSupportedSMPHandshakes) True, + controlPort = Nothing, + smpAgentCfg = defaultSMPClientAgentConfig {persistErrorInterval = 1}, + allowSMPProxy = False, + serverClientConcurrency = 16, + information = Nothing, + startOptions = StartOptions {maintenance = False, compactLog = False, logLevel = LogInfo, skipWarnings = True, confirmMigrations = MCYesUp} + } + diff --git a/bench/Report.hs b/bench/Report.hs new file mode 100644 index 000000000..de59fcd64 --- /dev/null +++ b/bench/Report.hs @@ -0,0 +1,113 @@ +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE NumericUnderscores #-} +{-# LANGUAGE OverloadedStrings #-} + +module Report + ( Snapshot (..), + takeSnapshot, + printSummary, + writeTimeSeriesHeader, + appendTimeSeries, + ) +where + +import Control.Concurrent (threadDelay) +import Data.List (foldl') +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Data.Time.Clock (UTCTime, getCurrentTime) +import Data.Time.Format.ISO8601 (iso8601Show) +import Data.Word (Word32, Word64) +import GHC.Stats (RTSStats (..), GCDetails (..), getRTSStats) +import System.IO (Handle, IOMode (..), hFlush, hSetBuffering, BufferMode (..), withFile) +import System.Mem (performMajorGC) + +data Snapshot = Snapshot + { snapTime :: UTCTime, + snapPhase :: Text, + snapLive :: Word64, + snapHeap :: Word64, + snapLarge :: Word64, + snapFrag :: Word64, + snapGCs :: Word32, + snapClients :: Int + } + +takeSnapshot :: Text -> Int -> IO Snapshot +takeSnapshot phase clients = do + performMajorGC + threadDelay 1_000_000 + rts <- getRTSStats + ts <- getCurrentTime + let 
GCDetails {gcdetails_live_bytes, gcdetails_mem_in_use_bytes, gcdetails_large_objects_bytes, gcdetails_block_fragmentation_bytes} = gc rts + pure + Snapshot + { snapTime = ts, + snapPhase = phase, + snapLive = gcdetails_live_bytes, + snapHeap = gcdetails_mem_in_use_bytes, + snapLarge = gcdetails_large_objects_bytes, + snapFrag = gcdetails_block_fragmentation_bytes, + snapGCs = gcs rts, + snapClients = clients + } + +printSummary :: [Snapshot] -> IO () +printSummary [] = putStrLn "No snapshots collected." +printSummary snaps = do + putStrLn "" + putStrLn hdr + putStrLn $ replicate (length hdr) '-' + mapM_ printRow (zip (Snapshot {snapLive = 0, snapHeap = 0, snapLarge = 0, snapFrag = 0, snapGCs = 0, snapClients = 0, snapPhase = "", snapTime = snapTime (head snaps)} : snaps) snaps) + where + hdr = padR 20 "Phase" <> padL 12 "live_MB" <> padL 12 "large_MB" <> padL 12 "frag_MB" <> padL 12 "heap_MB" <> padL 10 "clients" <> padL 14 "d_live_MB" <> padL 14 "d_large_MB" <> padL 14 "KB/client" + printRow (prev, cur) = + putStrLn $ + padR 20 (T.unpack $ snapPhase cur) + <> padL 12 (showMB $ snapLive cur) + <> padL 12 (showMB $ snapLarge cur) + <> padL 12 (showMB $ snapFrag cur) + <> padL 12 (showMB $ snapHeap cur) + <> padL 10 (show $ snapClients cur) + <> padL 14 (showDeltaMB (snapLive cur) (snapLive prev)) + <> padL 14 (showDeltaMB (snapLarge cur) (snapLarge prev)) + <> padL 14 (perClient cur) + showMB w = show (w `div` (1024 * 1024)) + showDeltaMB a b + | a >= b = "+" <> show ((a - b) `div` (1024 * 1024)) + | otherwise = "-" <> show ((b - a) `div` (1024 * 1024)) + perClient Snapshot {snapClients, snapLive} + | snapClients > 0 = show (snapLive `div` fromIntegral snapClients `div` 1024) + | otherwise = "-" + padR n s = s <> replicate (max 0 (n - length s)) ' ' + padL n s = replicate (max 0 (n - length s)) ' ' <> s + +csvHeader :: Text +csvHeader = "timestamp,phase,rts_live,rts_heap,rts_large,rts_frag,rts_gc,clients" + +snapshotCsv :: Snapshot -> Text +snapshotCsv Snapshot 
{snapTime, snapPhase, snapLive, snapHeap, snapLarge, snapFrag, snapGCs, snapClients} = + T.intercalate + "," + [ T.pack $ iso8601Show snapTime, + snapPhase, + tshow snapLive, + tshow snapHeap, + tshow snapLarge, + tshow snapFrag, + tshow snapGCs, + tshow snapClients + ] + +writeTimeSeriesHeader :: FilePath -> IO () +writeTimeSeriesHeader path = T.writeFile path (csvHeader <> "\n") + +appendTimeSeries :: FilePath -> Snapshot -> IO () +appendTimeSeries path snap = + withFile path AppendMode $ \h -> do + hSetBuffering h LineBuffering + T.hPutStrLn h $ snapshotCsv snap + +tshow :: Show a => a -> Text +tshow = T.pack . show diff --git a/bench/docker-compose.yml b/bench/docker-compose.yml new file mode 100644 index 000000000..ce7f16ec4 --- /dev/null +++ b/bench/docker-compose.yml @@ -0,0 +1,20 @@ +services: + postgres: + image: postgres:17 + environment: + POSTGRES_USER: smp + POSTGRES_DB: smp_bench + POSTGRES_HOST_AUTH_METHOD: trust + ports: + - "15432:5432" + volumes: + - ./init.sql:/docker-entrypoint-initdb.d/init.sql + - pgdata:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U smp -d smp_bench"] + interval: 2s + timeout: 5s + retries: 10 + +volumes: + pgdata: diff --git a/bench/init.sql b/bench/init.sql new file mode 100644 index 000000000..a9c8fb89b --- /dev/null +++ b/bench/init.sql @@ -0,0 +1,2 @@ +CREATE EXTENSION IF NOT EXISTS pgcrypto; +CREATE SCHEMA IF NOT EXISTS smp_server; diff --git a/bench/run.sh b/bench/run.sh new file mode 100755 index 000000000..3f4f9e671 --- /dev/null +++ b/bench/run.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e +cd "$(dirname "$0")" + +mkdir -p tmp + +reset_db() { + docker compose down -v 2>/dev/null || true + docker compose up -d --wait + echo "PostgreSQL ready." 
+} + +if [ "$1" = "--compare-rts" ]; then + shift + for label_flags in \ + "default:-N -A16m -s" \ + "F1.2:-N -A16m -F1.2 -s" \ + "F1.5:-N -A16m -F1.5 -s" \ + "A4m:-N -A4m -s" \ + "A4m-F1.2:-N -A4m -F1.2 -s" \ + "compact:-N -A16m -c -s" \ + "nonmoving:-N -A16m -xn -s"; do + label="${label_flags%%:*}" + flags="${label_flags#*:}" + echo "" + echo "==========================================" + echo " RTS config: $label ($flags)" + echo "==========================================" + reset_db + cabal run smp-server-bench -- \ + --timeseries "bench-${label}.csv" \ + --clients "${BENCH_CLIENTS:-1000}" \ + --minutes "${BENCH_MINUTES:-2}" \ + "$@" \ + +RTS $flags -RTS + done + echo "" + echo "Done. CSV files: bench-*.csv" +else + reset_db + cabal run smp-server-bench -- \ + --clients "${BENCH_CLIENTS:-5000}" \ + --minutes "${BENCH_MINUTES:-5}" \ + "$@" \ + +RTS -N -A16m -s -RTS +fi + +docker compose down diff --git a/simplexmq.cabal b/simplexmq.cabal index a0b4c5b80..feb1cdddc 100644 --- a/simplexmq.cabal +++ b/simplexmq.cabal @@ -432,6 +432,36 @@ executable smp-server , text default-language: Haskell2010 +executable smp-server-bench + if flag(client_library) + buildable: False + if flag(server_postgres) + cpp-options: -DdbServerPostgres + main-is: Main.hs + other-modules: + ClientSim + Report + hs-source-dirs: + bench + default-extensions: + StrictData + ghc-options: -O2 -threaded -rtsopts + build-depends: + base + , async + , bytestring + , containers + , crypton + , mtl + , network + , simple-logger + , simplexmq + , stm + , text + , time + , unliftio + default-language: Haskell2010 + executable xftp if flag(client_library) buildable: False From 9a0a85a24c2047eb15a8fe88b92c74f80071d59f Mon Sep 17 00:00:00 2001 From: sh Date: Fri, 20 Mar 2026 14:51:16 +0000 Subject: [PATCH 8/8] feat(bench): self-contained Docker Compose setup Build benchmark binary inside container via multi-stage Dockerfile. All-in-one: docker compose run bench. 
---
 bench/Dockerfile         | 25 +++++++++++++++++++++++++
 bench/docker-compose.yml | 28 ++++++++++++++++++++++++++++
 bench/run.sh             | 40 ++++++++++++++++++++++++++--------------
 3 files changed, 79 insertions(+), 14 deletions(-)
 create mode 100644 bench/Dockerfile

diff --git a/bench/Dockerfile b/bench/Dockerfile
new file mode 100644
index 000000000..1bdcc1290
--- /dev/null
+++ b/bench/Dockerfile
@@ -0,0 +1,25 @@
+FROM haskell:9.6.3 AS build
+
+WORKDIR /src
+
+# Copy cabal file first for dependency caching
+COPY simplexmq.cabal cabal.project* ./
+RUN cabal update && cabal build --only-dependencies -f server_postgres smp-server-bench || true
+
+# Copy full source
+COPY . .
+RUN cabal build -f server_postgres smp-server-bench \
+    && cp $(cabal list-bin -f server_postgres smp-server-bench) /usr/local/bin/smp-server-bench
+
+FROM debian:bookworm-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgmp10 libpq5 libffi8 zlib1g ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=build /usr/local/bin/smp-server-bench /usr/local/bin/smp-server-bench
+COPY tests/fixtures /app/tests/fixtures
+
+WORKDIR /app
+
+ENTRYPOINT ["smp-server-bench"]
diff --git a/bench/docker-compose.yml b/bench/docker-compose.yml
index ce7f16ec4..92aed6b52 100644
--- a/bench/docker-compose.yml
+++ b/bench/docker-compose.yml
@@ -16,5 +16,33 @@
       timeout: 5s
       retries: 10
 
+  bench:
+    build:
+      context: ..
+ dockerfile: bench/Dockerfile + depends_on: + postgres: + condition: service_healthy + environment: + BENCH_PG: "postgresql://smp@postgres/smp_bench" + BENCH_CLIENTS: "${BENCH_CLIENTS:-5000}" + BENCH_MINUTES: "${BENCH_MINUTES:-5}" + command: + - "--pg" + - "postgresql://smp@postgres/smp_bench" + - "--clients" + - "${BENCH_CLIENTS:-5000}" + - "--minutes" + - "${BENCH_MINUTES:-5}" + - "--timeseries" + - "/results/timeseries.csv" + - "+RTS" + - "-N" + - "-A16m" + - "-T" + - "-RTS" + volumes: + - ./results:/results + volumes: pgdata: diff --git a/bench/run.sh b/bench/run.sh index 3f4f9e671..f62160b51 100755 --- a/bench/run.sh +++ b/bench/run.sh @@ -2,24 +2,25 @@ set -e cd "$(dirname "$0")" -mkdir -p tmp +mkdir -p results reset_db() { docker compose down -v 2>/dev/null || true - docker compose up -d --wait + docker compose up -d --wait postgres echo "PostgreSQL ready." } if [ "$1" = "--compare-rts" ]; then shift + docker compose build bench for label_flags in \ - "default:-N -A16m -s" \ - "F1.2:-N -A16m -F1.2 -s" \ - "F1.5:-N -A16m -F1.5 -s" \ - "A4m:-N -A4m -s" \ - "A4m-F1.2:-N -A4m -F1.2 -s" \ - "compact:-N -A16m -c -s" \ - "nonmoving:-N -A16m -xn -s"; do + "default:-N -A16m -T" \ + "F1.2:-N -A16m -F1.2 -T" \ + "F1.5:-N -A16m -F1.5 -T" \ + "A4m:-N -A4m -T" \ + "A4m-F1.2:-N -A4m -F1.2 -T" \ + "compact:-N -A16m -c -T" \ + "nonmoving:-N -A16m -xn -T"; do label="${label_flags%%:*}" flags="${label_flags#*:}" echo "" @@ -27,22 +28,33 @@ if [ "$1" = "--compare-rts" ]; then echo " RTS config: $label ($flags)" echo "==========================================" reset_db - cabal run smp-server-bench -- \ - --timeseries "bench-${label}.csv" \ + docker compose run --rm \ + -e BENCH_CLIENTS="${BENCH_CLIENTS:-1000}" \ + -e BENCH_MINUTES="${BENCH_MINUTES:-2}" \ + bench \ + --pg "postgresql://smp@postgres/smp_bench" \ --clients "${BENCH_CLIENTS:-1000}" \ --minutes "${BENCH_MINUTES:-2}" \ + --timeseries "/results/bench-${label}.csv" \ "$@" \ +RTS $flags -RTS done echo "" - echo "Done. 
CSV files: bench-*.csv" -else + echo "Done. Results in bench/results/" +elif [ "$1" = "--local" ]; then + # Run natively (not in container) — requires local Postgres + shift reset_db - cabal run smp-server-bench -- \ + cabal run smp-server-bench -f server_postgres -- \ + --pg "postgresql://smp@localhost:15432/smp_bench" \ --clients "${BENCH_CLIENTS:-5000}" \ --minutes "${BENCH_MINUTES:-5}" \ "$@" \ +RTS -N -A16m -s -RTS +else + # Run fully in containers + reset_db + docker compose run --rm bench "$@" fi docker compose down