diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 62f2a7e2..83c452fb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -41,3 +41,4 @@ jobs:
         ./build/tools/run_tests
         ./build/tools/benchmodel ./example_models/wavenet.nam
         ./build/tools/benchmodel ./example_models/lstm.nam
+        ./build/tools/render ./example_models/wavenet.nam ./example_audio/input.wav ./example_audio/output.wav
diff --git a/.gitignore b/.gitignore
index b7ee58e6..34ee36ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,5 @@
 
 docs/_build/
 *.DS_Store
+
+example_audio/output.wav
diff --git a/.gitmodules b/.gitmodules
index 11c19841..f49ce6e8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "Dependencies/eigen"]
 	path = Dependencies/eigen
 	url = https://gitlab.com/libeigen/eigen
+[submodule "Dependencies/AudioDSPTools"]
+	path = Dependencies/AudioDSPTools
+	url = https://github.com/sdatkinson/AudioDSPTools.git
diff --git a/Dependencies/AudioDSPTools b/Dependencies/AudioDSPTools
new file mode 160000
index 00000000..0827c6c2
--- /dev/null
+++ b/Dependencies/AudioDSPTools
@@ -0,0 +1 @@
+Subproject commit 0827c6c2fc0deced568536142ea86f189e0b98a1
diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index 9bbbc020..d440f0c1 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -1,4 +1,5 @@
 #include "conv1d.h"
+#include "profiling.h"
 #include <stdexcept>
 
 namespace nam
@@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize)
 
 void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // to avoid double-counting when Conv1D is called from within profiled blocks.
+
   // Write input to ring buffer
   _input_buffer.Write(input, num_frames);
 
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index 05dab09d..b644af31 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -8,6 +8,7 @@
 #include <unordered_set>
 
 #include "dsp.h"
+#include "profiling.h"
 #include "registry.h"
 
 #define tanh_impl_ std::tanh
@@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu
 
 void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // so that time is reported under meaningful categories (InputMixin, Layer1x1, Head1x1,
+  // Rechannel) rather than under one generic Conv1x1 category.
   assert(num_frames <= _output.cols());
 
   if (this->_is_depthwise)
diff --git a/NAM/film.h b/NAM/film.h
index f0f86fb4..eeb750a4 100644
--- a/NAM/film.h
+++ b/NAM/film.h
@@ -81,9 +81,13 @@ class FiLM
     assert(num_frames <= condition.cols());
     assert(num_frames <= _output.cols());
 
+    // Conv1x1 to compute scale/shift from condition
     _cond_to_scale_shift.process_(condition, num_frames);
     const auto& scale_shift = _cond_to_scale_shift.GetOutput();
 
+    // Note: FiLM time is included in the caller's profiling category (e.g., Conv1D, InputMixin)
+    // rather than tracked separately, to avoid double-counting.
+
     const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
     if (_do_shift)
     {
diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp
new file mode 100644
index 00000000..885872ee
--- /dev/null
+++ b/NAM/profiling.cpp
@@ -0,0 +1,111 @@
+#include "profiling.h"
+
+#ifdef NAM_PROFILING
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+// ARM Cortex-M7: Use DWT cycle counter for precise timing
+#include "stm32h7xx.h"
+#else
+// Non-ARM: Use std::chrono for timing (for testing on desktop)
+#include <chrono>
+#endif // ARM check
+
+namespace nam {
+namespace profiling {
+
+// Registry storage, shared by both platforms. Both objects are constant-initialized,
+// so register_type() can run from static initializers in other translation units.
+ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {};
+int g_num_entries = 0;
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+
+// CPU frequency in MHz (Daisy runs at 480 MHz)
+static constexpr uint32_t CPU_FREQ_MHZ = 480;
+
+// Returns a wrapping microsecond timestamp derived from the DWT cycle counter.
+//
+// CYCCNT is 32 bits wide, so at 480 MHz it wraps about every 8.9 s. Dividing the raw
+// counter by the clock rate would make the timestamp jump backwards at each wrap and
+// turn the interval being measured into a huge bogus value. The cycle count is
+// therefore extended to 64 bits first, so the result only wraps at 2^32 us and the
+// unsigned subtraction done by the NAM_PROFILE_* macros stays correct.
+//
+// Must be called at least once per CYCCNT wrap period; not reentrant (static state).
+// NOTE(review): assumes DWT->CYCCNT has been enabled by the platform -- confirm.
+uint32_t get_time_us() {
+  static uint32_t last_cycles = 0;
+  static uint64_t total_cycles = 0;
+  const uint32_t now_cycles = DWT->CYCCNT;
+  total_cycles += static_cast<uint32_t>(now_cycles - last_cycles); // modulo-2^32 delta
+  last_cycles = now_cycles;
+  return static_cast<uint32_t>(total_cycles / CPU_FREQ_MHZ);
+}
+
+#else
+
+// Returns microseconds elapsed since the first call, truncated to 32 bits (wraps after
+// about 71.6 minutes). steady_clock is used because it is monotonic, so intervals are
+// unaffected by wall-clock adjustments.
+uint32_t get_time_us() {
+  using namespace std::chrono;
+  static const auto start = steady_clock::now();
+  const auto elapsed = steady_clock::now() - start;
+  return static_cast<uint32_t>(duration_cast<microseconds>(elapsed).count());
+}
+
+#endif // ARM check
+
+// Registers a named profiling type and returns its index into g_entries.
+// `name` is stored by pointer and must stay valid for as long as results are printed.
+// When all MAX_PROFILING_TYPES slots are in use, a warning is written to stderr and
+// the last slot's index is returned: the caller still gets an index that is safe to
+// use with the NAM_PROFILE_* macros, and its time is added to that last slot.
+int register_type(const char* name) {
+  if (g_num_entries >= MAX_PROFILING_TYPES) {
+    fprintf(stderr, "nam::profiling: more than %d profiling types; \"%s\" shares the last slot\n",
+            MAX_PROFILING_TYPES, name ? name : "(null)");
+    return MAX_PROFILING_TYPES - 1;
+  }
+  const int idx = g_num_entries++;
+  g_entries[idx].name = name;
+  g_entries[idx].accumulated_us = 0;
+  return idx;
+}
+
+// Zeroes every registered counter; the registrations themselves are kept.
+void reset() {
+  for (int i = 0; i < g_num_entries; i++)
+    g_entries[i].accumulated_us = 0;
+}
+
+// Prints one row per type with non-zero time (milliseconds and share of the total),
+// followed by a total row.
+void print_results() {
+  // Sum in 64 bits: the per-type counters are 32-bit and their total may not fit.
+  uint64_t total = 0;
+  for (int i = 0; i < g_num_entries; i++)
+    total += g_entries[i].accumulated_us;
+
+  printf("\nProfiling breakdown:\n");
+  printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%");
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+
+  for (int i = 0; i < g_num_entries; i++) {
+    const uint64_t us = g_entries[i].accumulated_us;
+    if (us > 0) {
+      // us > 0 implies total > 0. The product needs 64 bits: in 32 bits us * 100
+      // overflows once a single type has accumulated more than about 43 s.
+      const unsigned long pct = static_cast<unsigned long>(us * 100 / total);
+      printf("%-12s %8.1f %5lu%%\n", g_entries[i].name, us / 1000.0, pct);
+    }
+  }
+
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+  printf("%-12s %8.1f %5s\n", "Total", total / 1000.0, "100%");
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // NAM_PROFILING
diff --git a/NAM/profiling.h b/NAM/profiling.h
new file mode 100644
index 00000000..4db570b9
--- /dev/null
+++ b/NAM/profiling.h
@@ -0,0 +1,85 @@
+#pragma once
+
+// Dynamic profiling registry for NAM building blocks: named types that accumulate elapsed time.
+// Enable with -DNAM_PROFILING; without it the macros below expand to no-ops.
+//
+// Usage:
+//   1. Register profiling types at file scope (static init), inside #ifdef NAM_PROFILING:
+//        static int PROF_FOO = nam::profiling::register_type("Foo");
+//   2. Call nam::profiling::reset() before benchmark
+//   3. In hot path:
+//        NAM_PROFILE_START();
+//        // ... code ...
+//        NAM_PROFILE_ADD(PROF_FOO);
+//   4. Call nam::profiling::print_results() to display breakdown
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+namespace nam {
+namespace profiling {
+
+constexpr int MAX_PROFILING_TYPES = 32; // number of slots in g_entries
+
+struct ProfilingEntry {
+  const char* name; // label printed by print_results(); stored as given, not copied
+  uint32_t accumulated_us; // microseconds summed since reset(); 32-bit, wraps after ~71.6 min
+};
+
+extern ProfilingEntry g_entries[MAX_PROFILING_TYPES]; // indexed by register_type()'s return value
+extern int g_num_entries; // how many slots have been handed out
+
+// Register a named profiling type. Returns index for fast accumulation.
+// Called at static-init time or during setup, NOT in the hot path.
+int register_type(const char* name);
+
+// Get current time in microseconds (platform-specific). The 32-bit value wraps; the macros only use differences.
+uint32_t get_time_us();
+
+// Reset all profiling counters (registered types are kept)
+void reset();
+
+// Print profiling results to stdout; types with zero accumulated time are not listed
+void print_results();
+
+// Helper macros for timing sections
+// Usage:
+//   NAM_PROFILE_START();        // declares the local timer `_prof_start`: use once per scope
+//   // ... code to profile ...
+//   NAM_PROFILE_ADD(PROF_FOO);  // Adds elapsed time to entry, resets timer
+
+#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us()
+#define NAM_PROFILE_ADD(idx) do { \
+  uint32_t _prof_now = nam::profiling::get_time_us(); \
+  nam::profiling::g_entries[idx].accumulated_us += (_prof_now - _prof_start); \
+  _prof_start = _prof_now; \
+} while(0)
+
+// Variant that doesn't reset the timer (for one-shot measurements)
+#define NAM_PROFILE_ADD_NORESTART(idx) \
+  nam::profiling::g_entries[idx].accumulated_us += (nam::profiling::get_time_us() - _prof_start)
+
+// Reset the timer without recording (for re-syncing mid-function)
+#define NAM_PROFILE_RESTART() _prof_start = nam::profiling::get_time_us()
+
+} // namespace profiling
+} // namespace nam
+
+#else // NAM_PROFILING not defined
+
+// No-op macros when profiling is disabled (the idx argument is dropped, so PROF_* need not exist)
+#define NAM_PROFILE_START() ((void)0)
+#define NAM_PROFILE_ADD(idx) ((void)0)
+#define NAM_PROFILE_ADD_NORESTART(idx) ((void)0)
+#define NAM_PROFILE_RESTART() ((void)0)
+
+namespace nam {
+namespace profiling {
+  inline void reset() {} // stub so callers compile without their own #ifdef
+  inline void print_results() {} // stub so callers compile without their own #ifdef
+} // namespace profiling
+} // namespace nam
+
+#endif // NAM_PROFILING
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
index 6eb74a3b..4a1b5217 100644
--- a/NAM/wavenet.cpp
+++ b/NAM/wavenet.cpp
@@ -6,9 +6,20 @@
 #include <Eigen/Dense>
 
 #include "get_dsp.h"
+#include "profiling.h"
 #include "registry.h"
 #include "wavenet.h"
 
+#ifdef NAM_PROFILING // register_type() is only declared when NAM_PROFILING is defined
+static int PROF_CONV1D      = nam::profiling::register_type("Conv1D"); // names are the row labels in print_results()
+static int PROF_INPUT_MIXIN = nam::profiling::register_type("InputMixin");
+static int PROF_LAYER1X1    = nam::profiling::register_type("Layer1x1");
+static int PROF_HEAD1X1     = nam::profiling::register_type("Head1x1");
+static int PROF_RECHANNEL   = nam::profiling::register_type("Rechannel");
+static int PROF_ACTIVATION  = nam::profiling::register_type("Activation");
+static int PROF_COPIES      = nam::profiling::register_type("Copies");
+#endif // NAM_PROFILING
+
 // Layer ======================================================================
 
 void nam::wavenet::_Layer::SetMaxBufferSize(const int maxBufferSize)
@@ -89,6 +100,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector<float>::iterator& weights)
 
 void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames)
 {
+  NAM_PROFILE_START();
+
   const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels
 
   // Step 1: input convolutions
@@ -107,6 +120,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& conv_output = this->_conv.GetOutput();
     this->_conv_post_film->Process_(conv_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(PROF_CONV1D);
 
   if (this->_input_mixin_pre_film)
   {
@@ -123,8 +137,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput();
     this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(PROF_INPUT_MIXIN);
+
   this->_z.leftCols(num_frames).noalias() =
     _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames);
+  NAM_PROFILE_ADD(PROF_COPIES);
+
   if (this->_activation_pre_film)
   {
     this->_activation_pre_film->Process_(this->_z, condition, num_frames);
@@ -139,6 +157,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
   if (this->_gating_mode == GatingMode::NONE)
   {
     this->_activation->apply(this->_z.leftCols(num_frames));
+    NAM_PROFILE_ADD(PROF_ACTIVATION);
     if (this->_activation_post_film)
     {
       this->_activation_post_film->Process_(this->_z, condition, num_frames);
@@ -146,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z, num_frames);
+      NAM_PROFILE_ADD(PROF_LAYER1X1);
     }
   }
   else if (this->_gating_mode == GatingMode::GATED)
@@ -155,6 +175,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_gating_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(PROF_ACTIVATION);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -165,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(PROF_LAYER1X1);
     }
   }
   else if (this->_gating_mode == GatingMode::BLENDED)
@@ -174,6 +196,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_blending_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(PROF_ACTIVATION);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -184,6 +207,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(PROF_LAYER1X1);
       if (this->_layer1x1_post_film)
       {
         Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput();
@@ -207,6 +231,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
       Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput();
       this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames);
     }
+    NAM_PROFILE_ADD(PROF_HEAD1X1);
     this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames);
   }
   else // No head 1x1
@@ -230,6 +255,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     // If layer1x1 is inactive, residual connection is just the input (identity)
     this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames);
   }
+  NAM_PROFILE_ADD(PROF_COPIES);
 }
 
 // LayerArray =================================================================
@@ -298,9 +324,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con
 void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition,
                                              const int num_frames)
 {
+  NAM_PROFILE_START();
+
   // Process rechannel and get output
   this->_rechannel.process_(layer_inputs, num_frames);
   Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput();
+  NAM_PROFILE_ADD(PROF_RECHANNEL);
 
   // Process layers
   for (size_t i = 0; i < this->_layers.size(); i++)
@@ -329,7 +358,9 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs
     this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames);
 
   // Process head rechannel
+  NAM_PROFILE_RESTART();
   _head_rechannel.process_(this->_head_inputs, num_frames);
+  NAM_PROFILE_ADD(PROF_RECHANNEL);
 }
 
 
diff --git a/example_audio/input.wav b/example_audio/input.wav
new file mode 100644
index 00000000..fd0302bd
Binary files /dev/null and b/example_audio/input.wav differ
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 8118e085..bbe93f3e 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -4,18 +4,49 @@ file(GLOB_RECURSE NAM_SOURCES ../NAM/*.cpp ../NAM/*.c ../NAM*.h)
 set(TOOLS benchmodel)
 
 add_custom_target(tools ALL
-	DEPENDS ${TOOLS})
+	DEPENDS ${TOOLS} render)
+
+set(AUDIO_DSP_TOOLS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../Dependencies/AudioDSPTools")
+set(AUDIO_DSP_TOOLS_WAV_SOURCES "${AUDIO_DSP_TOOLS_DIR}/dsp/wav.cpp")
 
 include_directories(tools ..)
 include_directories(tools ${NAM_DEPS_PATH}/eigen)
 include_directories(tools ${NAM_DEPS_PATH}/nlohmann)
+include_directories(tools ${AUDIO_DSP_TOOLS_DIR}/dsp)
 
 add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES})
 add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES})
+add_executable(render render.cpp ${NAM_SOURCES} ${AUDIO_DSP_TOOLS_WAV_SOURCES})
+target_compile_features(render PUBLIC cxx_std_20)
+# AudioDSPTools wav.cpp has sign-compare issues; don't fail build
+set_source_files_properties(${AUDIO_DSP_TOOLS_WAV_SOURCES} PROPERTIES COMPILE_FLAGS "-Wno-error")
+set_target_properties(render PROPERTIES
+	CXX_VISIBILITY_PRESET hidden
+	INTERPROCEDURAL_OPTIMIZATION TRUE
+	PREFIX ""
+)
+if (CMAKE_SYSTEM_NAME STREQUAL "Windows")
+	target_compile_definitions(render PRIVATE NOMINMAX WIN32_LEAN_AND_MEAN)
+endif()
+if (MSVC)
+	target_compile_options(render PRIVATE
+		"$<$<CONFIG:DEBUG>:/W4>"
+		"$<$<CONFIG:RELEASE>:/O2>"
+	)
+else()
+	target_compile_options(render PRIVATE
+		-Wall -Wextra -Wpedantic -Wstrict-aliasing -Wunreachable-code -Weffc++ -Wno-unused-parameter
+		"$<$<CONFIG:DEBUG>:-Og;-ggdb;-Werror>"
+		"$<$<CONFIG:RELEASE>:-Ofast>"
+	)
+endif()
+add_executable(memory_usage memory_usage.cpp)
 add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES})
 # Compile run_tests without optimizations to ensure allocation tracking works correctly
 # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run
 set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0")
+# Benchmodel should be built with NAM_PROFILING set
+target_compile_definitions(benchmodel PRIVATE NAM_PROFILING)
 # Ensure assertions are enabled for run_tests by removing NDEBUG if it was set
 # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG
 # We use a compile option to undefine it, which works on GCC, Clang, and MSVC
@@ -32,6 +63,7 @@ endif()
 source_group(NAM ${CMAKE_CURRENT_SOURCE_DIR} FILES ${NAM_SOURCES})
 
 target_compile_features(${TOOLS} PUBLIC cxx_std_20)
+target_compile_features(memory_usage PUBLIC cxx_std_20)
 
 set_target_properties(${TOOLS}
 	PROPERTIES
@@ -61,4 +93,4 @@ endif()
 # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
 # Don't let this break my build on debug:
 set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
-set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
\ No newline at end of file
+set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp
index 39c14b0e..42556f59 100644
--- a/tools/benchmodel.cpp
+++ b/tools/benchmodel.cpp
@@ -4,6 +4,7 @@
 
 #include "NAM/dsp.h"
 #include "NAM/get_dsp.h"
+#include "NAM/profiling.h"
 
 using std::chrono::duration;
 using std::chrono::duration_cast;
@@ -62,6 +63,9 @@ int main(int argc, char* argv[])
       outputPtrs[ch] = outputBuffers[ch].data();
     }
 
+    // Reset profiling counters before benchmark
+    nam::profiling::reset();
+
     std::cout << "Running benchmark\n";
     auto t1 = high_resolution_clock::now();
     for (size_t i = 0; i < numBuffers; i++)
@@ -80,6 +84,9 @@ int main(int argc, char* argv[])
 
     std::cout << ms_int.count() << "ms\n";
     std::cout << ms_double.count() << "ms\n";
+
+    // Print profiling breakdown if enabled
+    nam::profiling::print_results();
   }
   else
   {
diff --git a/tools/memory_usage.cpp b/tools/memory_usage.cpp
new file mode 100644
index 00000000..853ca8fe
--- /dev/null
+++ b/tools/memory_usage.cpp
@@ -0,0 +1,611 @@
+// memory_usage.cpp — Report total memory required to host a NAM model at runtime.
+//
+// Usage: memory_usage <model_path> [--buffer-size N]
+//
+// Parses the .nam JSON config and computes weight memory (learned parameters stored
+// in Eigen matrices/vectors) and buffer memory (intermediate computation/state that
+// depends on maxBufferSize) without instantiating the model.
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "json.hpp"
+
+using json = nlohmann::json;
+
+static constexpr int DEFAULT_BUFFER_SIZE = 2048;
+static constexpr long INPUT_BUFFER_SAFETY_FACTOR = 32;
+
+// ─── Result accumulator ─────────────────────────────────────────────────────
+
+struct MemoryResult
+{
+  size_t weight_bytes = 0; // bytes attributed to weights
+  size_t buffer_bytes = 0; // bytes attributed to buffers
+  // Both adders take a number of floats and record it as bytes.
+  void add_weights(size_t floats) { weight_bytes = weight_bytes + sizeof(float) * floats; }
+  void add_buffers(size_t floats) { buffer_bytes = buffer_bytes + sizeof(float) * floats; }
+  // Adds another result's totals into this one, field by field.
+  MemoryResult& operator+=(const MemoryResult& o)
+  {
+    buffer_bytes += o.buffer_bytes;
+    weight_bytes += o.weight_bytes;
+    return *this;
+  }
+};
+
+// ─── Conv1x1 ────────────────────────────────────────────────────────────────
+
+// Memory for a Conv1x1 mapping in_ch -> out_ch with an output buffer of M frames.
+// Weights are counted as one depthwise vector when groups == in_ch == out_ch and
+// as a full (out_ch x in_ch) matrix otherwise (grouped layers included).
+static MemoryResult conv1x1_memory(int in_ch, int out_ch, bool bias, int groups, int M)
+{
+  const bool is_depthwise = (in_ch == out_ch) && (groups == in_ch);
+  // _depthwise_weight(in_ch) versus _weight(out_ch, in_ch)
+  size_t weight_floats = is_depthwise ? (size_t)in_ch : (size_t)out_ch * in_ch;
+  if (bias)
+    weight_floats += out_ch; // _bias(out_ch)
+
+  MemoryResult total;
+  total.add_weights(weight_floats);
+  total.add_buffers((size_t)out_ch * M); // _output(out_ch, M)
+  return total;
+}
+
+// ─── Conv1D ─────────────────────────────────────────────────────────────────
+
+// Memory for a dilated Conv1D: per-tap weights (a depthwise vector or an out_ch x in_ch
+// matrix), an optional bias vector, the input ring buffer, and the output buffer.
+static MemoryResult conv1d_memory(int in_ch, int out_ch, int kernel_size, bool bias, int dilation, int groups, int M)
+{
+  // Floats per kernel tap: _depthwise_weight[k](in_ch) or _weight[k](out_ch, in_ch)
+  const bool is_depthwise = (in_ch == out_ch) && (groups == in_ch);
+  const size_t per_tap = is_depthwise ? (size_t)in_ch : (size_t)out_ch * in_ch;
+
+  MemoryResult total;
+  total.add_weights((size_t)kernel_size * per_tap);
+  if (bias)
+    total.add_weights(out_ch); // _bias(out_ch)
+
+  // _input_buffer._storage is (in_ch, 2 * max_lookback + M), where
+  // max_lookback = (kernel_size - 1) * dilation, or 0 for a non-positive kernel_size.
+  long max_lookback = 0;
+  if (kernel_size > 0)
+    max_lookback = (long)(kernel_size - 1) * dilation;
+  const long ring_cols = M + 2 * max_lookback;
+  total.add_buffers((size_t)in_ch * ring_cols);
+
+  total.add_buffers((size_t)out_ch * M); // _output(out_ch, M)
+  return total;
+}
+
+// ─── FiLM ───────────────────────────────────────────────────────────────────
+
+struct FiLMParams
+{
+  bool active = false; // inactive FiLM modules contribute no memory in film_memory()
+  bool shift = true; // true: scale and shift (2 * input_dim rows); false: scale only
+  int groups = 1; // groups of the FiLM's internal Conv1x1
+};
+
+static MemoryResult film_memory(int condition_dim, int input_dim, const FiLMParams& fp, int M)
+{
+  MemoryResult total; // stays empty for an inactive FiLM
+  if (fp.active)
+  {
+    // _cond_to_scale_shift: Conv1x1(condition_dim -> scale[+shift] rows, bias=true, fp.groups)
+    const int rows = input_dim * (fp.shift ? 2 : 1);
+    total = conv1x1_memory(condition_dim, rows, true, fp.groups, M);
+    total.add_buffers((size_t)input_dim * M); // _output(input_dim, M)
+  }
+  return total;
+}
+
+// ─── BatchNorm ──────────────────────────────────────────────────────────────
+
+static MemoryResult batchnorm_memory(int dim)
+{
+  // At runtime only scale(dim) and loc(dim) are kept; they are derived from
+  // running_mean, running_var, weight, bias and eps read from the weights array.
+  MemoryResult total;
+  total.add_weights((size_t)dim + (size_t)dim); // scale(dim) + loc(dim)
+  return total;
+}
+
+// ─── LSTM ───────────────────────────────────────────────────────────────────
+
+static MemoryResult lstm_memory(const json& config)
+{
+  // LSTM estimate from the model config; it takes no buffer size because nothing here scales with one.
+  MemoryResult r;
+  const int num_layers = config["num_layers"];
+  const int input_size = config["input_size"];
+  const int hidden_size = config["hidden_size"];
+  const int out_channels = config.value("out_channels", 1);
+
+  for (int i = 0; i < num_layers; i++)
+  {
+    int cell_input = (i == 0) ? input_size : hidden_size;
+    // _w(4*H, I+H)
+    r.add_weights((size_t)4 * hidden_size * (cell_input + hidden_size));
+    // _b(4*H)
+    r.add_weights(4 * (size_t)hidden_size);
+    // _xh(I+H) — stores initial hidden state in the hidden portion
+    r.add_weights((size_t)(cell_input + hidden_size));
+    // _c(H) — initial cell state
+    r.add_weights((size_t)hidden_size);
+
+    // Buffers: _ifgo(4*H)
+    r.add_buffers(4 * (size_t)hidden_size);
+    // Note: _xh and _c are also modified during inference but they are
+    // loaded from weights (initial state), so counted as weights above.
+  }
+
+  // _head_weight(out_channels, hidden_size)
+  r.add_weights((size_t)out_channels * hidden_size);
+  // _head_bias(out_channels)
+  r.add_weights(out_channels);
+
+  // Top-level buffers: _input(input_size), _output(out_channels)
+  r.add_buffers(input_size);
+  r.add_buffers(out_channels);
+
+  return r;
+}
+
+// ─── Linear ─────────────────────────────────────────────────────────────────
+
+static MemoryResult linear_memory(const json& config)
+{
+  // Linear-model estimate computed from the config alone (this function takes no buffer size).
+  MemoryResult r;
+  const int receptive_field = config["receptive_field"];
+  const bool bias = config["bias"];
+  const int in_channels = config.value("in_channels", 1);
+
+  // _weight(receptive_field)
+  r.add_weights(receptive_field);
+  // _bias (scalar float)
+  if (bias)
+    r.add_weights(1);
+
+  // Buffer base: _input_buffers = in_channels vectors of (32 * receptive_field)
+  r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field);
+  // _output_buffers: resized per-call, not pre-allocated to a fixed size
+  // (depends on num_frames, not maxBufferSize)
+
+  return r;
+}
+
+// ─── ConvNet ────────────────────────────────────────────────────────────────
+
+static MemoryResult convnet_memory(const json& config, int M)
+{
+  // Estimates ConvNet memory from its config for a maximum buffer size of M frames.
+  // Throws std::runtime_error when "dilations" is empty (there would be no blocks to size).
+  MemoryResult r;
+  int channels = config["channels"];
+  std::vector<int> dilations = config["dilations"];
+  bool batchnorm = config["batchnorm"];
+  int groups = config.value("groups", 1);
+  int in_channels = config.value("in_channels", 1);
+  int out_channels = config.value("out_channels", 1);
+
+  if (dilations.empty()) // the maximum of an empty list is undefined, so reject it up front
+    throw std::runtime_error("ConvNet config must contain at least one dilation");
+  int max_dilation = dilations.front();
+  for (int d : dilations)
+    max_dilation = (d > max_dilation) ? d : max_dilation;
+
+  // Buffer base class: _input_buffers = in_channels * (32 * max_dilation)
+  int receptive_field = max_dilation; // passed to Buffer as receptive_field
+  r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field);
+  // ConvNet blocks
+  for (size_t i = 0; i < dilations.size(); i++)
+  {
+    int block_in = (i == 0) ? in_channels : channels;
+    int block_out = channels;
+    // Conv1D with kernel_size=2, bias=!batchnorm
+    r += conv1d_memory(block_in, block_out, 2, !batchnorm, dilations[i], groups, M);
+    if (batchnorm) // optional batchnorm
+      r += batchnorm_memory(block_out);
+    // _output(out_channels, M) per block
+    r.add_buffers((size_t)block_out * M);
+  }
+
+  // _block_vals: 1 entry of (channels, buffer_size)
+  // buffer_size = input_buffers[0].size() = 32 * receptive_field
+  long buffer_size = INPUT_BUFFER_SAFETY_FACTOR * receptive_field;
+  r.add_buffers((size_t)channels * buffer_size);
+  // _head: weight(out_channels, channels) + bias(out_channels)
+  r.add_weights((size_t)out_channels * channels);
+  r.add_weights(out_channels);
+  return r; // _head_output is resized per-call, not a fixed pre-allocation
+}
+
+// ─── WaveNet helpers ────────────────────────────────────────────────────────
+
+static FiLMParams parse_film_params(const json& layer_config, const std::string& key)
+{
+  const auto entry = layer_config.find(key); // one lookup serves the presence test and the reads
+  if (entry == layer_config.end() || *entry == false)
+    return FiLMParams{}; // key absent or explicitly false: inactive defaults
+  FiLMParams parsed;
+  parsed.active = entry->value("active", true); // a missing field takes the fallback given here
+  parsed.shift = entry->value("shift", true);
+  parsed.groups = entry->value("groups", 1);
+  return parsed;
+}
+
+enum class GatingMode
+{
+  NONE, // no gating; also what any unrecognized mode string maps to
+  GATED, // chosen by the string "gated", or by the boolean "gated" key when true
+  BLENDED // chosen by the string "blended"
+};
+
+static std::vector<GatingMode> parse_gating_modes(const json& layer_config, size_t num_layers)
+{
+  // Returns the gating mode of each layer. "gating_mode" may be a per-layer array or one
+  // string applied to all layers; otherwise the boolean "gated" key decides, and with
+  // neither key every layer is NONE. Throws std::runtime_error if a "gating_mode" array
+  // has fewer entries than layers (the result is indexed per layer); extras are kept unused.
+  auto parse_str = [](const std::string& s) -> GatingMode {
+    if (s == "gated")
+      return GatingMode::GATED;
+    if (s == "blended")
+      return GatingMode::BLENDED;
+    return GatingMode::NONE;
+  };
+
+  std::vector<GatingMode> modes;
+  if (layer_config.find("gating_mode") != layer_config.end())
+  {
+    const json& gating = layer_config["gating_mode"];
+    if (!gating.is_array())
+      return std::vector<GatingMode>(num_layers, parse_str(gating.get<std::string>()));
+    for (const auto& gm : gating)
+      modes.push_back(parse_str(gm.get<std::string>()));
+    if (modes.size() < num_layers)
+      throw std::runtime_error("gating_mode array has fewer entries than there are layers");
+  }
+  else if (layer_config.find("gated") != layer_config.end())
+  {
+    bool gated = layer_config["gated"];
+    modes.resize(num_layers, gated ? GatingMode::GATED : GatingMode::NONE);
+  }
+  else
+  {
+    modes.resize(num_layers, GatingMode::NONE);
+  }
+  return modes;
+}
+
+// Memory for one WaveNet _Layer: its convolutions, optional 1x1 modules, FiLMs and M-frame buffers.
+static MemoryResult wavenet_layer_memory(int condition_size, int channels, int bottleneck, int kernel_size, int dilation,
+                                         GatingMode gating_mode, int groups_input, int groups_input_mixin,
+                                         bool layer1x1_active, int layer1x1_groups, bool head1x1_active,
+                                         int head1x1_out_channels, int head1x1_groups, const FiLMParams& conv_pre_film,
+                                         const FiLMParams& conv_post_film, const FiLMParams& input_mixin_pre_film,
+                                         const FiLMParams& input_mixin_post_film,
+                                         const FiLMParams& activation_pre_film,
+                                         const FiLMParams& activation_post_film,
+                                         const FiLMParams& layer1x1_post_film, const FiLMParams& head1x1_post_film,
+                                         int M)
+{
+  // Any gating mode other than NONE makes the conv stage emit twice the bottleneck width.
+  const int conv_out = (gating_mode == GatingMode::NONE) ? bottleneck : 2 * bottleneck;
+  // _output_head is as wide as head1x1's output when that module is active, else bottleneck.
+  const int head_out = head1x1_active ? head1x1_out_channels : bottleneck;
+  MemoryResult total;
+
+  // Modules every layer has:
+  //   _conv:        Conv1D(channels -> conv_out, kernel_size, bias=true, dilation, groups_input)
+  //   _input_mixin: Conv1x1(condition_size -> conv_out, bias=false, groups_input_mixin)
+  total += conv1d_memory(channels, conv_out, kernel_size, true, dilation, groups_input, M);
+  total += conv1x1_memory(condition_size, conv_out, false, groups_input_mixin, M);
+
+  // Optional _layer1x1: Conv1x1(bottleneck -> channels, bias=true, layer1x1_groups) and its post-FiLM
+  if (layer1x1_active)
+  {
+    total += conv1x1_memory(bottleneck, channels, true, layer1x1_groups, M);
+    total += film_memory(condition_size, channels, layer1x1_post_film, M);
+  }
+
+  // Optional _head1x1: Conv1x1(bottleneck -> head1x1_out_channels, bias=true, head1x1_groups) and its post-FiLM
+  if (head1x1_active)
+  {
+    total += conv1x1_memory(bottleneck, head1x1_out_channels, true, head1x1_groups, M);
+    total += film_memory(condition_size, head1x1_out_channels, head1x1_post_film, M);
+  }
+
+  // Layer-owned buffers: _z(conv_out, M), _output_next_layer(channels, M), _output_head(head_out, M)
+  total.add_buffers((size_t)conv_out * M);
+  total.add_buffers((size_t)channels * M);
+  total.add_buffers((size_t)head_out * M);
+
+  // The six FiLM modules that do not hang off an optional 1x1
+  total += film_memory(condition_size, channels, conv_pre_film, M);
+  total += film_memory(condition_size, conv_out, conv_post_film, M);
+  total += film_memory(condition_size, condition_size, input_mixin_pre_film, M);
+  total += film_memory(condition_size, conv_out, input_mixin_post_film, M);
+  total += film_memory(condition_size, conv_out, activation_pre_film, M);
+  total += film_memory(condition_size, bottleneck, activation_post_film, M);
+  return total;
+}
+
+// WaveNet _LayerArray memory
+//
+// Estimates the footprint of one entry of the WaveNet "layers" list: the input
+// rechannel Conv1x1, one wavenet_layer_memory() term per dilation, the head rechannel
+// Conv1x1, and the array's own two output buffers.
+//
+//   layer_config  one element of the "layers" array
+//   M             buffer size in frames, forwarded to every sub-estimate
+//
+// Mandatory keys are fetched with at(), so a malformed config raises an exception the
+// caller can report. operator[] on a const json with an absent key is not a checked
+// access (assuming `json` is nlohmann::json -- TODO confirm against the alias in this file).
+static MemoryResult wavenet_layer_array_memory(const json& layer_config, int M)
+{
+  MemoryResult r;
+  const int input_size = layer_config.at("input_size").get<int>();
+  const int condition_size = layer_config.at("condition_size").get<int>();
+  const int head_size = layer_config.at("head_size").get<int>();
+  const int channels = layer_config.at("channels").get<int>();
+  // "bottleneck" is optional and defaults to the channel count.
+  const int bottleneck = layer_config.value("bottleneck", channels);
+  const int kernel_size = layer_config.at("kernel_size").get<int>();
+  const std::vector<int> dilations = layer_config.at("dilations").get<std::vector<int>>();
+  const size_t num_layers = dilations.size();
+  const bool head_bias = layer_config.at("head_bias").get<bool>();
+
+  const int groups_input = layer_config.value("groups_input", 1);
+  const int groups_input_mixin = layer_config.value("groups_input_mixin", 1);
+
+  // layer1x1 params: active with a single group unless the config says otherwise
+  bool layer1x1_active = true;
+  int layer1x1_groups = 1;
+  const auto layer1x1_it = layer_config.find("layer1x1");
+  if (layer1x1_it != layer_config.end())
+  {
+    layer1x1_active = layer1x1_it->at("active").get<bool>();
+    layer1x1_groups = layer1x1_it->at("groups").get<int>();
+  }
+
+  // head1x1 params: inactive unless the config says otherwise
+  bool head1x1_active = false;
+  int head1x1_out_channels = channels;
+  int head1x1_groups = 1;
+  const auto head1x1_it = layer_config.find("head1x1");
+  if (head1x1_it != layer_config.end())
+  {
+    head1x1_active = head1x1_it->at("active").get<bool>();
+    head1x1_out_channels = head1x1_it->at("out_channels").get<int>();
+    head1x1_groups = head1x1_it->at("groups").get<int>();
+  }
+
+  // Gating modes (one per layer)
+  std::vector<GatingMode> gating_modes = parse_gating_modes(layer_config, num_layers);
+
+  // FiLM params
+  FiLMParams conv_pre = parse_film_params(layer_config, "conv_pre_film");
+  FiLMParams conv_post = parse_film_params(layer_config, "conv_post_film");
+  FiLMParams input_mixin_pre = parse_film_params(layer_config, "input_mixin_pre_film");
+  FiLMParams input_mixin_post = parse_film_params(layer_config, "input_mixin_post_film");
+  FiLMParams activation_pre = parse_film_params(layer_config, "activation_pre_film");
+  FiLMParams activation_post = parse_film_params(layer_config, "activation_post_film");
+  FiLMParams layer1x1_post = parse_film_params(layer_config, "layer1x1_post_film");
+  FiLMParams head1x1_post = parse_film_params(layer_config, "head1x1_post_film");
+
+  // _rechannel: Conv1x1(input_size -> channels, bias=false)
+  r += conv1x1_memory(input_size, channels, false, 1, M);
+
+  // Per-layer: everything except the dilation and gating mode is shared by all layers
+  for (size_t i = 0; i < num_layers; i++)
+  {
+    r += wavenet_layer_memory(condition_size, channels, bottleneck, kernel_size, dilations[i], gating_modes[i],
+                              groups_input, groups_input_mixin, layer1x1_active, layer1x1_groups, head1x1_active,
+                              head1x1_out_channels, head1x1_groups, conv_pre, conv_post, input_mixin_pre,
+                              input_mixin_post, activation_pre, activation_post, layer1x1_post, head1x1_post, M);
+  }
+
+  // _head_rechannel: Conv1x1(head_output_size -> head_size, bias=head_bias)
+  const int head_output_size = head1x1_active ? head1x1_out_channels : bottleneck;
+  r += conv1x1_memory(head_output_size, head_size, head_bias, 1, M);
+
+  // Buffers: _layer_outputs(channels, M)
+  r.add_buffers((size_t)channels * M);
+  // _head_inputs(head_output_size, M)
+  r.add_buffers((size_t)head_output_size * M);
+
+  return r;
+}
+
+// Forward declaration for recursive condition_dsp: wavenet_memory() below calls
+// compute_memory() on a nested "condition_dsp" model, while compute_memory() itself
+// dispatches to wavenet_memory(), so one of the two has to be declared ahead of its definition.
+static MemoryResult compute_memory(const std::string& architecture, const json& config, int M);
+
+// WaveNet top-level memory
+//
+// Adds up: the optional nested "condition_dsp" model (estimated recursively through
+// compute_memory), the condition input/output buffers, every entry of "layers", and the
+// single _head_scale weight.
+//
+//   config  the WaveNet config object; "layers" is mandatory
+//   M       buffer size in frames
+//
+// Mandatory keys are read with at() so that a config without them throws instead of
+// going through an unchecked const operator[] (assuming `json` is nlohmann::json --
+// TODO confirm against the alias in this file).
+static MemoryResult wavenet_memory(const json& config, int M)
+{
+  MemoryResult r;
+  const json& layers = config.at("layers");
+  const int in_channels = config.value("in_channels", 1);
+
+  // condition_dim = in_channels (from _get_condition_dim())
+  const int condition_dim = in_channels;
+
+  // Recursive condition_dsp
+  bool has_condition_dsp = false;
+  int condition_output_channels = condition_dim;
+  const auto cdsp_it = config.find("condition_dsp");
+  if (cdsp_it != config.end())
+  {
+    has_condition_dsp = true;
+    const std::string cdsp_arch = cdsp_it->at("architecture").get<std::string>();
+    r += compute_memory(cdsp_arch, cdsp_it->at("config"), M);
+    // condition_output_channels comes from the condition_dsp's output
+    // For now, we use condition_size from first layer as a proxy
+    // (the actual model validates this match)
+    if (layers.size() > 0)
+      condition_output_channels = layers[0].at("condition_size").get<int>();
+  }
+
+  // _condition_input(condition_dim, M)
+  r.add_buffers((size_t)condition_dim * M);
+
+  // _condition_output(condition_output_channels, M); without a condition_dsp the
+  // channel count is still condition_dim, so one expression covers both cases.
+  r.add_buffers((size_t)condition_output_channels * M);
+
+  if (has_condition_dsp)
+  {
+    // _condition_dsp_input_buffers: condition_dim vectors of M doubles/floats
+    // These are std::vector<std::vector<NAM_SAMPLE>> where NAM_SAMPLE is double
+    r.add_buffers((size_t)condition_dim * M * (sizeof(double) / sizeof(float)));
+    // _condition_dsp_output_buffers: condition_output_channels vectors of M doubles
+    r.add_buffers((size_t)condition_output_channels * M * (sizeof(double) / sizeof(float)));
+    // Pointer arrays are negligible
+  }
+
+  // Layer arrays
+  for (const auto& layer_config : layers)
+    r += wavenet_layer_array_memory(layer_config, M);
+
+  // _head_scale (1 float) — it's a weight
+  r.add_weights(1);
+
+  return r;
+}
+
+// ─── Dispatch ───────────────────────────────────────────────────────────────
+
+// Routes a model description to the estimator for its architecture.
+//
+//   architecture  one of "WaveNet", "LSTM", "ConvNet", "Linear"
+//   config        that architecture's config object
+//   M             buffer size in frames; only handed to the WaveNet and ConvNet estimators
+//
+// Throws std::runtime_error("Unknown architecture: <name>") for any other name.
+static MemoryResult compute_memory(const std::string& architecture, const json& config, int M)
+{
+  struct Route
+  {
+    const char* name;
+    MemoryResult (*estimate)(const json&, int);
+  };
+
+  // The adapters give every estimator the same (config, frames) shape.
+  const Route routes[] = {
+    {"WaveNet", [](const json& cfg, int frames) -> MemoryResult { return wavenet_memory(cfg, frames); }},
+    {"LSTM", [](const json& cfg, int) -> MemoryResult { return lstm_memory(cfg); }},
+    {"ConvNet", [](const json& cfg, int frames) -> MemoryResult { return convnet_memory(cfg, frames); }},
+    {"Linear", [](const json& cfg, int) -> MemoryResult { return linear_memory(cfg); }},
+  };
+
+  for (const Route& route : routes)
+  {
+    if (architecture == route.name)
+      return route.estimate(config, M);
+  }
+
+  throw std::runtime_error("Unknown architecture: " + architecture);
+}
+
+// ─── Formatting helpers ─────────────────────────────────────────────────────
+
+// Renders a byte count for display: "<n> bytes" below 1 KiB, "<x.xx> KB" below
+// 1 MiB, and "<x.xx> MB" from there on (1 KB = 1024 bytes, 1 MB = 1024 KB).
+static std::string format_bytes(size_t bytes)
+{
+  constexpr size_t kKiB = 1024;
+  constexpr size_t kMiB = kKiB * kKiB;
+
+  char text[64];
+  if (bytes >= kMiB)
+  {
+    snprintf(text, sizeof(text), "%.2f MB", static_cast<double>(bytes) / static_cast<double>(kMiB));
+  }
+  else if (bytes >= kKiB)
+  {
+    snprintf(text, sizeof(text), "%.2f KB", static_cast<double>(bytes) / static_cast<double>(kKiB));
+  }
+  else
+  {
+    snprintf(text, sizeof(text), "%zu bytes", bytes);
+  }
+  return std::string(text);
+}
+
+// Returns n in decimal with a ',' between each group of three digits,
+// counted from the right (e.g. 1234567 -> "1,234,567").
+static std::string format_with_commas(size_t n)
+{
+  const std::string digits = std::to_string(n);
+  const size_t count = digits.size();
+
+  std::string grouped;
+  grouped.reserve(count + count / 3);
+  for (size_t i = 0; i < count; ++i)
+  {
+    // A separator goes in front of every digit that starts a complete trailing
+    // group of three, except in front of the very first digit.
+    if (i != 0 && (count - i) % 3 == 0)
+      grouped += ',';
+    grouped += digits[i];
+  }
+  return grouped;
+}
+
+// ─── Main ───────────────────────────────────────────────────────────────────
+
+// Entry point: prints the estimated weight and buffer memory of a model file.
+//
+// Usage: memory_usage <model_path> [--buffer-size N]
+//   model_path        JSON model file with "architecture" and "config" entries
+//   --buffer-size N   positive frame count used for the buffer estimate
+//                     (defaults to DEFAULT_BUFFER_SIZE)
+//
+// Returns 0 on success and 1 on any usage, I/O, parse or estimation error.
+int main(int argc, char* argv[])
+{
+  if (argc < 2)
+  {
+    fprintf(stderr, "Usage: memory_usage <model_path> [--buffer-size N]\n");
+    return 1;
+  }
+
+  const char* model_path = argv[1];
+  int buffer_size = DEFAULT_BUFFER_SIZE;
+
+  for (int i = 2; i < argc; i++)
+  {
+    if (strcmp(argv[i], "--buffer-size") != 0)
+    {
+      fprintf(stderr, "Unknown option: %s\n", argv[i]);
+      return 1;
+    }
+    if (i + 1 >= argc)
+    {
+      // Report the real problem instead of calling a known option "unknown".
+      fprintf(stderr, "Error: --buffer-size requires a value\n");
+      return 1;
+    }
+
+    // strtol reports where parsing stopped, so inputs such as "64abc" are rejected
+    // rather than silently read as 64 (which is what atoi would do).
+    const char* text = argv[++i];
+    char* end = nullptr;
+    const long parsed = strtol(text, &end, 10);
+    if (end == text || *end != '\0')
+    {
+      fprintf(stderr, "Error: invalid buffer size: %s\n", text);
+      return 1;
+    }
+    if (parsed <= 0)
+    {
+      fprintf(stderr, "Error: buffer size must be positive\n");
+      return 1;
+    }
+    // Round-trip through int to catch values that do not fit.
+    if (static_cast<long>(static_cast<int>(parsed)) != parsed)
+    {
+      fprintf(stderr, "Error: buffer size out of range: %s\n", text);
+      return 1;
+    }
+    buffer_size = static_cast<int>(parsed);
+  }
+
+  // Read and parse JSON
+  std::ifstream file(model_path);
+  if (!file.is_open())
+  {
+    fprintf(stderr, "Error: cannot open %s\n", model_path);
+    return 1;
+  }
+
+  json j;
+  try
+  {
+    file >> j;
+  }
+  catch (const std::exception& e)
+  {
+    fprintf(stderr, "Error parsing JSON: %s\n", e.what());
+    return 1;
+  }
+
+  // Field extraction lives inside the try block as well: a missing or mistyped
+  // "architecture"/"config" throws, and that must end in an error message rather than
+  // an uncaught exception.
+  try
+  {
+    const std::string architecture = j.at("architecture").get<std::string>();
+    const json& config = j.at("config");
+
+    // Cross-check: count weights in JSON
+    size_t json_weight_count = 0;
+    const auto weights_it = j.find("weights");
+    if (weights_it != j.end())
+      json_weight_count = weights_it->size();
+
+    // A "sample_rate" that is absent or not a number (e.g. null) is treated as unknown.
+    double sample_rate = -1.0;
+    const auto sample_rate_it = j.find("sample_rate");
+    if (sample_rate_it != j.end() && sample_rate_it->is_number())
+      sample_rate = sample_rate_it->get<double>();
+
+    MemoryResult result = compute_memory(architecture, config, buffer_size);
+    size_t total = result.weight_bytes + result.buffer_bytes;
+
+    printf("Model: %s\n", model_path);
+    printf("Architecture: %s\n", architecture.c_str());
+    if (sample_rate > 0)
+      printf("Sample rate: %.0f Hz\n", sample_rate);
+    printf("\n");
+    printf("Weights:  %s bytes (%s)\n", format_with_commas(result.weight_bytes).c_str(),
+           format_bytes(result.weight_bytes).c_str());
+    printf("Buffers:  %s bytes (%s)  [buffer size: %d]\n", format_with_commas(result.buffer_bytes).c_str(),
+           format_bytes(result.buffer_bytes).c_str(), buffer_size);
+    printf("Total:    %s bytes (%s)\n", format_with_commas(total).c_str(), format_bytes(total).c_str());
+
+    if (json_weight_count > 0)
+    {
+      printf("\nJSON weights: %zu values (%s bytes)\n", json_weight_count,
+             format_with_commas(json_weight_count * sizeof(float)).c_str());
+    }
+  }
+  catch (const std::exception& e)
+  {
+    fprintf(stderr, "Error computing memory: %s\n", e.what());
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/tools/render.cpp b/tools/render.cpp
new file mode 100644
index 00000000..77836b41
--- /dev/null
+++ b/tools/render.cpp
@@ -0,0 +1,159 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+#include "NAM/dsp.h"
+#include "NAM/get_dsp.h"
+#include "wav.h"
+
+namespace
+{
+// Write mono 32-bit float WAV file (IEEE float format 3).
+//
+//   fileName    output path (opened for binary writing)
+//   samples     numSamples contiguous floats
+//   numSamples  number of samples; must fit the 32-bit RIFF size fields
+//   sampleRate  stored in the header rounded to the nearest integer Hz
+//
+// Returns true when every byte was written. Returns false, after printing a message to
+// stderr, when the arguments cannot be represented in the header or the file cannot be
+// opened; returns false without a message when a write fails.
+//
+// Header fields and samples are written in host byte order, so the result is a
+// well-formed (little-endian) WAV only on little-endian hosts.
+bool SaveWavFloat32(const char* fileName, const float* samples, size_t numSamples, double sampleRate)
+{
+  // Bytes counted by the RIFF size field besides the sample data:
+  // "WAVE" (4) + fmt chunk (8 + 16) + data chunk header (8).
+  constexpr uint32_t kRiffOverhead = 36;
+
+  // Validate before opening the file so a rejected call does not touch the output.
+  // RIFF sizes are 32-bit: a larger payload would wrap and yield a corrupt header.
+  if (numSamples > (UINT32_MAX - kRiffOverhead) / sizeof(float))
+  {
+    std::cerr << "Error: " << numSamples << " samples do not fit in a WAV file\n";
+    return false;
+  }
+  // Negative, NaN or huge rates cannot be converted to the unsigned 32-bit header fields
+  // (the byte rate is the sample rate times sizeof(float)).
+  if (!(sampleRate >= 0.0) || sampleRate > static_cast<double>(UINT32_MAX / sizeof(float)))
+  {
+    std::cerr << "Error: Invalid sample rate " << sampleRate << "\n";
+    return false;
+  }
+
+  std::ofstream out(fileName, std::ios::binary);
+  if (!out.is_open())
+  {
+    std::cerr << "Error: Failed to open output file " << fileName << "\n";
+    return false;
+  }
+
+  // Writes the raw bytes of one fixed-width header field.
+  const auto put = [&out](const auto& field) { out.write(reinterpret_cast<const char*>(&field), sizeof(field)); };
+
+  const uint32_t dataSize = static_cast<uint32_t>(numSamples * sizeof(float));
+  const uint32_t chunkSize = kRiffOverhead + dataSize;
+
+  // RIFF header
+  out.write("RIFF", 4);
+  put(chunkSize);
+  out.write("WAVE", 4);
+
+  // fmt chunk (16 bytes for PCM/IEEE)
+  const uint32_t fmtSize = 16;
+  const uint16_t audioFormat = 3; // IEEE float
+  const uint16_t numChannels = 1;
+  const uint32_t sr = static_cast<uint32_t>(std::llround(sampleRate));
+  const uint32_t byteRate = static_cast<uint32_t>(sr * sizeof(float));
+  const uint16_t blockAlign = sizeof(float);
+  const uint16_t bitsPerSample = 32;
+  out.write("fmt ", 4);
+  put(fmtSize);
+  put(audioFormat);
+  put(numChannels);
+  put(sr);
+  put(byteRate);
+  put(blockAlign);
+  put(bitsPerSample);
+
+  // data chunk
+  out.write("data", 4);
+  put(dataSize);
+  out.write(reinterpret_cast<const char*>(samples), dataSize);
+
+  return out.good();
+}
+
+}  // namespace
+
+// Offline renderer: loads a model, runs a mono WAV through it in 64-frame blocks and
+// writes output channel 0 as a mono 32-bit float WAV.
+//
+// Usage: render <model.nam> <input.wav> [output.wav]   (output defaults to "output.wav")
+//
+// Returns 0 on success and 1 on any usage, load, validation, processing or write error.
+int main(int argc, char* argv[])
+{
+  if (argc < 3 || argc > 4)
+  {
+    std::cerr << "Usage: render <model.nam> <input.wav> [output.wav]\n";
+    return 1;
+  }
+
+  const char* modelPath = argv[1];
+  const char* inputPath = argv[2];
+  const char* outputPath = (argc >= 4) ? argv[3] : "output.wav";
+
+  // Everything below runs under one handler so that an exception from model loading or
+  // processing ends in an error message and exit code 1 instead of std::terminate.
+  try
+  {
+    std::cerr << "Loading model [" << modelPath << "]\n";
+    auto model = nam::get_dsp(std::filesystem::path(modelPath));
+    if (!model)
+    {
+      std::cerr << "Failed to load model\n";
+      return 1;
+    }
+    std::cerr << "Model loaded successfully\n";
+
+    std::vector<float> inputAudio;
+    double inputSampleRate = 0.0;
+    auto loadResult = dsp::wav::Load(inputPath, inputAudio, inputSampleRate);
+    if (loadResult != dsp::wav::LoadReturnCode::SUCCESS)
+    {
+      std::cerr << "Failed to load input WAV: " << dsp::wav::GetMsgForLoadReturnCode(loadResult) << "\n";
+      return 1;
+    }
+
+    // A non-positive expected rate is treated as "model does not specify one".
+    const double expectedRate = model->GetExpectedSampleRate();
+    if (expectedRate > 0 && std::abs(inputSampleRate - expectedRate) > 0.5)
+    {
+      std::cerr << "Error: Input WAV sample rate (" << inputSampleRate
+                << " Hz) does not match model expected rate (" << expectedRate << " Hz)\n";
+      return 1;
+    }
+
+    const double sampleRate = expectedRate > 0 ? expectedRate : inputSampleRate;
+    const int bufferSize = 64;
+    model->Reset(sampleRate, bufferSize);
+
+    const int inChannels = model->NumInputChannels();
+    const int outChannels = model->NumOutputChannels();
+
+    if (inChannels != 1)
+    {
+      std::cerr << "Error: render tool currently supports mono input only (model has " << inChannels
+                << " input channels)\n";
+      return 1;
+    }
+    // Output channel 0 is read unconditionally below, so it has to exist.
+    if (outChannels < 1)
+    {
+      std::cerr << "Error: model reports " << outChannels << " output channels\n";
+      return 1;
+    }
+    if (outChannels > 1)
+    {
+      std::cerr << "Warning: model has " << outChannels << " output channels; only channel 0 is written\n";
+    }
+
+    std::vector<std::vector<NAM_SAMPLE>> inputBuffers(inChannels);
+    std::vector<std::vector<NAM_SAMPLE>> outputBuffers(outChannels);
+    std::vector<NAM_SAMPLE*> inputPtrs(inChannels);
+    std::vector<NAM_SAMPLE*> outputPtrs(outChannels);
+
+    for (int ch = 0; ch < inChannels; ch++)
+    {
+      inputBuffers[ch].resize(bufferSize, 0.0);
+      inputPtrs[ch] = inputBuffers[ch].data();
+    }
+    for (int ch = 0; ch < outChannels; ch++)
+    {
+      outputBuffers[ch].resize(bufferSize, 0.0);
+      outputPtrs[ch] = outputBuffers[ch].data();
+    }
+
+    // Only channel 0 is kept, so the output holds exactly one sample per input sample.
+    const size_t totalSamples = inputAudio.size();
+    std::vector<float> outputAudio;
+    outputAudio.reserve(totalSamples);
+
+    size_t readPos = 0;
+    while (readPos < totalSamples)
+    {
+      const size_t toRead = std::min(static_cast<size_t>(bufferSize), totalSamples - readPos);
+
+      for (size_t i = 0; i < toRead; i++)
+        inputBuffers[0][i] = static_cast<NAM_SAMPLE>(inputAudio[readPos + i]);
+      // Zero the unused tail of a short final block.
+      for (size_t i = toRead; i < static_cast<size_t>(bufferSize); i++)
+        inputBuffers[0][i] = 0;
+
+      model->process(inputPtrs.data(), outputPtrs.data(), static_cast<int>(toRead));
+
+      for (size_t i = 0; i < toRead; i++)
+        outputAudio.push_back(static_cast<float>(outputBuffers[0][i]));
+
+      readPos += toRead;
+    }
+
+    if (!SaveWavFloat32(outputPath, outputAudio.data(), outputAudio.size(), sampleRate))
+    {
+      return 1;
+    }
+
+    std::cerr << "Wrote " << outputAudio.size() << " samples to " << outputPath << "\n";
+    return 0;
+  }
+  catch (const std::exception& e)
+  {
+    std::cerr << "Error: " << e.what() << "\n";
+    return 1;
+  }
+}