sdatkinson · sdatkinson · Feb 14, 2026 · Feb 14, 2026 · Feb 19, 2026 · Feb 21, 2026
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -41,3 +41,4 @@ jobs:
         ./build/tools/run_tests
         ./build/tools/benchmodel ./example_models/wavenet.nam
         ./build/tools/benchmodel ./example_models/lstm.nam
+        ./build/tools/render ./example_models/wavenet.nam ./example_audio/input.wav ./example_audio/output.wav
diff --git a/.gitignore b/.gitignore
@@ -35,3 +35,5 @@
 
 docs/_build/
 *.DS_Store
+
+example_audio/output.wav
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "Dependencies/eigen"]
 	path = Dependencies/eigen
 	url = https://gitlab.com/libeigen/eigen
+[submodule "Dependencies/AudioDSPTools"]
+	path = Dependencies/AudioDSPTools
+	url = https://github.com/sdatkinson/AudioDSPTools.git
diff --git a/Dependencies/AudioDSPTools b/Dependencies/AudioDSPTools
diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
@@ -1,4 +1,5 @@
 #include "conv1d.h"
+#include "profiling.h"
 #include <stdexcept>
 
 namespace nam
@@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize)
 
 void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // to avoid double-counting when Conv1D is called from within profiled blocks.
+
   // Write input to ring buffer
   _input_buffer.Write(input, num_frames);
 

diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
@@ -8,6 +8,7 @@
 #include <unordered_set>
 
 #include "dsp.h"
+#include "profiling.h"
 #include "registry.h"
 
 #define tanh_impl_ std::tanh
@@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu
 
 void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel)
+  // rather than generic conv1x1.
   assert(num_frames <= _output.cols());
 
   if (this->_is_depthwise)

diff --git a/NAM/film.h b/NAM/film.h
@@ -81,9 +81,13 @@ class FiLM
     assert(num_frames <= condition.cols());
     assert(num_frames <= _output.cols());
 
+    // Conv1x1 to compute scale/shift from condition
     _cond_to_scale_shift.process_(condition, num_frames);
     const auto& scale_shift = _cond_to_scale_shift.GetOutput();
 
+    // Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin)
+    // rather than tracked separately, to avoid double-counting.
+
     const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
     if (_do_shift)
     {

diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp
@@ -0,0 +1,88 @@
+#include "profiling.h"
+
+#ifdef NAM_PROFILING
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+// ARM Cortex-M7: Use DWT cycle counter for precise timing
+#include "stm32h7xx.h"
+
+namespace nam {
+namespace profiling {
+
+ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {};
+int g_num_entries = 0;
+
+// CPU frequency in MHz (Daisy runs at 480 MHz)
+static constexpr uint32_t CPU_FREQ_MHZ = 480;
+
+// Returns a free-running timestamp in microseconds derived from the DWT cycle counter.
+//
+// DWT->CYCCNT is a 32-bit cycle counter. Returning CYCCNT / CPU_FREQ_MHZ directly would
+// make the result jump from ~(2^32 / CPU_FREQ_MHZ) back to 0 whenever the counter wraps,
+// and the unsigned "now - start" subtraction done by the profiling macros would then
+// yield a huge bogus interval. Instead, the cycle delta since the previous call is
+// accumulated into a microsecond counter that only wraps at 2^32 us, so uint32_t
+// differences between two timestamps stay correct across counter wraps.
+//
+// Requirements / limitations:
+//   - Must be called at least once per CYCCNT wrap period (2^32 cycles), otherwise
+//     whole wrap periods are lost from the running total.
+//   - Keeps static state and is therefore not reentrant.
+uint32_t get_time_us() {
+  static uint32_t last_cycles = 0;     // CYCCNT value seen by the previous call
+  static uint32_t elapsed_us = 0;      // running timestamp, wraps modulo 2^32 us
+  static uint32_t leftover_cycles = 0; // cycles not yet worth a full us; always < CPU_FREQ_MHZ
+
+  const uint32_t now_cycles = DWT->CYCCNT;
+  // Unsigned subtraction is modulo 2^32, so this is correct across a single counter wrap.
+  const uint32_t delta_cycles = now_cycles - last_cycles;
+  last_cycles = now_cycles;
+
+  // Split into whole microseconds plus a remainder to avoid 64-bit division.
+  elapsed_us += delta_cycles / CPU_FREQ_MHZ;
+  leftover_cycles += delta_cycles % CPU_FREQ_MHZ;
+  if (leftover_cycles >= CPU_FREQ_MHZ) {
+    leftover_cycles -= CPU_FREQ_MHZ;
+    ++elapsed_us;
+  }
+  return elapsed_us;
+}
+
+} // namespace profiling
+} // namespace nam
+
+#else
+// Non-ARM: Use std::chrono for timing (for testing on desktop)
+#include <chrono>
+
+namespace nam {
+namespace profiling {
+
+ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {};
+int g_num_entries = 0;
+
+// Returns the number of microseconds elapsed since the first call to this function,
+// truncated to 32 bits (so the value wraps after 2^32 us).
+//
+// steady_clock is used because high_resolution_clock is not guaranteed to be monotonic;
+// where it aliases the system clock, a wall-clock adjustment would corrupt the measured
+// intervals.
+uint32_t get_time_us() {
+  using clock = std::chrono::steady_clock;
+  // Epoch is captured lazily on the first call; only differences between timestamps matter.
+  static const clock::time_point start = clock::now();
+  const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start);
+  return static_cast<uint32_t>(elapsed.count());
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // ARM check
+
+namespace nam {
+namespace profiling {
+
+// Registers a named profiling category and returns its slot index in g_entries.
+//
+// name is stored by pointer (not copied), so it must outlive the registry; string
+// literals are the intended use. The new slot starts with zero accumulated time.
+//
+// If all MAX_PROFILING_TYPES slots are already taken, no new slot is created: a warning
+// is written to stderr and the index of the last slot is returned, so callers still get
+// an index that is safe to use with the NAM_PROFILE_* macros (the extra category's time
+// is folded into that last slot) instead of writing past the end of g_entries.
+int register_type(const char* name) {
+  if (g_num_entries >= MAX_PROFILING_TYPES) {
+    fprintf(stderr, "nam::profiling: more than %d profiling types registered; \"%s\" shares the last slot\n",
+            MAX_PROFILING_TYPES, name ? name : "(unnamed)");
+    return MAX_PROFILING_TYPES - 1;
+  }
+
+  const int idx = g_num_entries++;
+  g_entries[idx].name = name;
+  g_entries[idx].accumulated_us = 0;
+  return idx;
+}
+
+// Zeroes the accumulated time of every registered entry. Names and the number of
+// registered entries are left untouched.
+void reset() {
+  ProfilingEntry* const end = g_entries + g_num_entries;
+  for (ProfilingEntry* entry = g_entries; entry < end; ++entry) {
+    entry->accumulated_us = 0;
+  }
+}
+
+// Prints a table of the registered profiling categories to stdout: one row per category
+// with non-zero accumulated time (milliseconds and integer percentage of the total),
+// followed by a total row. Rows appear in registration order.
+void print_results() {
+  // Sum in 64 bits so that several long-running categories cannot wrap the total.
+  uint64_t total = 0;
+  for (int i = 0; i < g_num_entries; i++)
+    total += g_entries[i].accumulated_us;
+
+  printf("\nProfiling breakdown:\n");
+  printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%");
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+
+  for (int i = 0; i < g_num_entries; i++) {
+    const uint32_t us = g_entries[i].accumulated_us;
+    if (us > 0) {
+      // Widen before multiplying: us * 100 in 32 bits overflows once us exceeds
+      // ~42.9 million (about 43 seconds), which would print a wrong percentage.
+      const uint64_t pct = total > 0 ? (static_cast<uint64_t>(us) * 100 / total) : 0;
+      printf("%-12s %8.1f %5lu%%\n", g_entries[i].name, us / 1000.0f, static_cast<unsigned long>(pct));
+    }
+  }
+
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+  printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%");
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // NAM_PROFILING
diff --git a/NAM/profiling.h b/NAM/profiling.h
@@ -0,0 +1,85 @@
+#pragma once
+
+// Dynamic profiling registry for NAM building blocks
+// Enable with -DNAM_PROFILING
+//
+// Usage:
+//   1. Register profiling types at file scope (static init):
+//        static int PROF_FOO = nam::profiling::register_type("Foo");
+//   2. Call nam::profiling::reset() before benchmark
+//   3. In hot path:
+//        NAM_PROFILE_START();
+//        // ... code ...
+//        NAM_PROFILE_ADD(PROF_FOO);
+//   4. Call nam::profiling::print_results() to display breakdown
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+namespace nam {
+namespace profiling {
+
+constexpr int MAX_PROFILING_TYPES = 32;
+
+// One row of the profiling registry: a category label and its running time total.
+struct ProfilingEntry {
+  // Category label as passed to register_type(); stored by pointer, not copied.
+  const char* name;
+  // Time accumulated for this category, in microseconds (32-bit, so it can wrap).
+  uint32_t accumulated_us;
+};
+
+// Registry storage (defined in profiling.cpp). g_entries has room for
+// MAX_PROFILING_TYPES entries; g_num_entries counts the slots handed out by
+// register_type() so far. Keep the number of registered types within that limit.
+extern ProfilingEntry g_entries[MAX_PROFILING_TYPES];
+extern int g_num_entries;
+
+// Register a named profiling type. Returns index for fast accumulation.
+// Called at static-init time or during setup, NOT in the hot path.
+// The name pointer is stored as-is, so pass a string that outlives the registry
+// (e.g. a string literal).
+int register_type(const char* name);
+
+// Get current time in microseconds (platform-specific)
+// The value is a 32-bit timestamp; only differences between two calls are meaningful.
+uint32_t get_time_us();
+
+// Reset all profiling counters
+// (accumulated times are zeroed; registered names and indices stay valid)
+void reset();
+
+// Print profiling results to stdout
+// Categories whose accumulated time is zero are omitted from the table.
+void print_results();
+
+// Helper macros for timing sections
+// Usage:
+//   NAM_PROFILE_START();
+//   // ... code to profile ...
+//   NAM_PROFILE_ADD(PROF_FOO);  // Adds elapsed time to entry, resets timer
+//
+// NAM_PROFILE_START() declares a local variable named _prof_start, so use it at most
+// once per scope; the other macros read (and some update) that variable and therefore
+// must appear where it is visible. idx is the value returned by register_type().
+
+// Begin timing: declares _prof_start and sets it to the current timestamp.
+#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us()
+// Add the time elapsed since _prof_start to entry idx, then restart the timer so that
+// consecutive sections can be attributed to different entries back-to-back.
+#define NAM_PROFILE_ADD(idx) do { \
+  uint32_t _prof_now = nam::profiling::get_time_us(); \
+  nam::profiling::g_entries[idx].accumulated_us += (_prof_now - _prof_start); \
+  _prof_start = _prof_now; \
+} while(0)
+
+// Variant that doesn't reset the timer (for one-shot measurements)
+#define NAM_PROFILE_ADD_NORESTART(idx) \
+  nam::profiling::g_entries[idx].accumulated_us += (nam::profiling::get_time_us() - _prof_start)
+
+// Reset the timer without recording (for re-syncing mid-function)
+#define NAM_PROFILE_RESTART() _prof_start = nam::profiling::get_time_us()
+
+} // namespace profiling
+} // namespace nam
+
+#else // NAM_PROFILING not defined
+
+// No-op macros when profiling is disabled
+// Each expands to a void expression, so call sites written as statements
+// (e.g. "NAM_PROFILE_START();") still compile and generate no timing code.
+#define NAM_PROFILE_START() ((void)0)
+#define NAM_PROFILE_ADD(idx) ((void)0)
+#define NAM_PROFILE_ADD_NORESTART(idx) ((void)0)
+#define NAM_PROFILE_RESTART() ((void)0)
+
+// Empty inline stand-ins so callers can invoke reset() / print_results()
+// without wrapping the calls in #ifdef NAM_PROFILING.
+namespace nam {
+namespace profiling {
+  inline void reset() {}
+  inline void print_results() {}
+} // namespace profiling
+} // namespace nam
+
+#endif // NAM_PROFILING
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
@@ -6,9 +6,20 @@
 #include <Eigen/Dense>
 
 #include "get_dsp.h"
+#include "profiling.h"
 #include "registry.h"
 #include "wavenet.h"
 
+#ifdef NAM_PROFILING
+// Profiling categories used by the WaveNet processing code in this file. Each index is
+// obtained once during static initialization; the registration order below is also the
+// row order printed by nam::profiling::print_results().
+static const int PROF_CONV1D = nam::profiling::register_type("Conv1D");
+static const int PROF_INPUT_MIXIN = nam::profiling::register_type("InputMixin");
+static const int PROF_LAYER1X1 = nam::profiling::register_type("Layer1x1");
+static const int PROF_HEAD1X1 = nam::profiling::register_type("Head1x1");
+static const int PROF_RECHANNEL = nam::profiling::register_type("Rechannel");
+static const int PROF_ACTIVATION = nam::profiling::register_type("Activation");
+static const int PROF_COPIES = nam::profiling::register_type("Copies");
+#endif
+
 // Layer ======================================================================
 
 void nam::wavenet::_Layer::SetMaxBufferSize(const int maxBufferSize)
@@ -89,6 +100,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector<float>::iterator& weights)
 
 void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames)
 {
+  NAM_PROFILE_START();
+
   const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels
 
   // Step 1: input convolutions
@@ -107,6 +120,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& conv_output = this->_conv.GetOutput();
     this->_conv_post_film->Process_(conv_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(PROF_CONV1D);
 
   if (this->_input_mixin_pre_film)
   {
@@ -123,8 +137,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput();
     this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(PROF_INPUT_MIXIN);
+
   this->_z.leftCols(num_frames).noalias() =
     _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames);
+  NAM_PROFILE_ADD(PROF_COPIES);
+
   if (this->_activation_pre_film)
   {
     this->_activation_pre_film->Process_(this->_z, condition, num_frames);
@@ -139,13 +157,15 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
   if (this->_gating_mode == GatingMode::NONE)
   {
     this->_activation->apply(this->_z.leftCols(num_frames));
+    NAM_PROFILE_ADD(PROF_ACTIVATION);
     if (this->_activation_post_film)
     {
       this->_activation_post_film->Process_(this->_z, condition, num_frames);
     }
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z, num_frames);
+      NAM_PROFILE_ADD(PROF_LAYER1X1);
     }
   }
   else if (this->_gating_mode == GatingMode::GATED)
@@ -155,6 +175,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_gating_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(PROF_ACTIVATION);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -165,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(PROF_LAYER1X1);
     }
   }
   else if (this->_gating_mode == GatingMode::BLENDED)
@@ -174,6 +196,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_blending_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(PROF_ACTIVATION);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -184,6 +207,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(PROF_LAYER1X1);
       if (this->_layer1x1_post_film)
       {
         Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput();
@@ -207,6 +231,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
       Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput();
       this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames);
     }
+    NAM_PROFILE_ADD(PROF_HEAD1X1);
     this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames);
   }
   else // No head 1x1
@@ -230,6 +255,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     // If layer1x1 is inactive, residual connection is just the input (identity)
     this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames);
   }
+  NAM_PROFILE_ADD(PROF_COPIES);
 }
 
 // LayerArray =================================================================
@@ -298,9 +324,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con
 void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition,
                                              const int num_frames)
 {
+  NAM_PROFILE_START();
+
   // Process rechannel and get output
   this->_rechannel.process_(layer_inputs, num_frames);
   Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput();
+  NAM_PROFILE_ADD(PROF_RECHANNEL);
 
   // Process layers
   for (size_t i = 0; i < this->_layers.size(); i++)
@@ -329,7 +358,9 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs
     this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames);
 
   // Process head rechannel
+  NAM_PROFILE_RESTART();
   _head_rechannel.process_(this->_head_inputs, num_frames);
+  NAM_PROFILE_ADD(PROF_RECHANNEL);
 }
 
 

diff --git a/example_audio/input.wav b/example_audio/input.wav
Original file line number	Diff line number	Diff line change
Expand Up		@@ -35,3 +35,5 @@

		docs/_build/
		*.DS_Store

		example_audio/output.wav