diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 62f2a7e2..83c452fb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,3 +41,4 @@ jobs: ./build/tools/run_tests ./build/tools/benchmodel ./example_models/wavenet.nam ./build/tools/benchmodel ./example_models/lstm.nam + ./build/tools/render ./example_models/wavenet.nam ./example_audio/input.wav ./example_audio/output.wav diff --git a/.gitignore b/.gitignore index b7ee58e6..34ee36ee 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,5 @@ docs/_build/ *.DS_Store + +example_audio/output.wav diff --git a/.gitmodules b/.gitmodules index 11c19841..f49ce6e8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "Dependencies/eigen"] path = Dependencies/eigen url = https://gitlab.com/libeigen/eigen +[submodule "Dependencies/AudioDSPTools"] + path = Dependencies/AudioDSPTools + url = https://github.com/sdatkinson/AudioDSPTools.git diff --git a/Dependencies/AudioDSPTools b/Dependencies/AudioDSPTools new file mode 160000 index 00000000..0827c6c2 --- /dev/null +++ b/Dependencies/AudioDSPTools @@ -0,0 +1 @@ +Subproject commit 0827c6c2fc0deced568536142ea86f189e0b98a1 diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index 9bbbc020..d440f0c1 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -1,4 +1,5 @@ #include "conv1d.h" +#include "profiling.h" #include namespace nam @@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize) void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to avoid double-counting when Conv1D is called from within profiled blocks. 
+ // Write input to ring buffer _input_buffer.Write(input, num_frames); diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 05dab09d..b644af31 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -8,6 +8,7 @@ #include #include "dsp.h" +#include "profiling.h" #include "registry.h" #define tanh_impl_ std::tanh @@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu void nam::Conv1x1::process_(const Eigen::Ref& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel) + // rather than generic conv1x1. assert(num_frames <= _output.cols()); if (this->_is_depthwise) diff --git a/NAM/film.h b/NAM/film.h index f0f86fb4..eeb750a4 100644 --- a/NAM/film.h +++ b/NAM/film.h @@ -81,9 +81,13 @@ class FiLM assert(num_frames <= condition.cols()); assert(num_frames <= _output.cols()); + // Conv1x1 to compute scale/shift from condition _cond_to_scale_shift.process_(condition, num_frames); const auto& scale_shift = _cond_to_scale_shift.GetOutput(); + // Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin) + // rather than tracked separately, to avoid double-counting. 
+ const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames); if (_do_shift) { 
diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp
new file mode 100644
index 00000000..885872ee
--- /dev/null
+++ b/NAM/profiling.cpp
@@ -0,0 +1,121 @@
+#include "profiling.h"
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+// ARM Cortex-M7: Use DWT cycle counter for precise timing
+#include "stm32h7xx.h"
+
+namespace nam {
+namespace profiling {
+
+ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {};
+int g_num_entries = 0;
+
+// CPU frequency in MHz (Daisy runs at 480 MHz)
+static constexpr uint32_t CPU_FREQ_MHZ = 480;
+
+// DWT->CYCCNT is a 32-bit counter, so at 480 MHz it wraps about every 8.9 s.
+// Returning CYCCNT / CPU_FREQ_MHZ directly would make the microsecond value
+// jump backwards at each wrap and corrupt the measurement that straddles it.
+// Instead, the cycles elapsed since the previous call are folded into a
+// microsecond clock that only wraps at 2^32, which unsigned subtraction handles.
+static uint32_t s_last_cycles = 0; // CYCCNT value seen by the previous call
+static uint32_t s_carry_cycles = 0; // cycles (< CPU_FREQ_MHZ) not yet worth a whole microsecond
+static uint32_t s_clock_us = 0; // accumulated microseconds, modulo 2^32
+
+// Returns a microsecond timestamp that increases monotonically modulo 2^32.
+// Must be called at least once per CYCCNT period so that no wrap is missed.
+// Uses unsynchronized static state: call it from one execution context only.
+uint32_t get_time_us() {
+  const uint32_t now_cycles = DWT->CYCCNT;
+  // Unsigned subtraction gives the true cycle delta even across a CYCCNT wrap.
+  const uint32_t pending_cycles = (now_cycles - s_last_cycles) + s_carry_cycles;
+  s_last_cycles = now_cycles;
+  s_clock_us += pending_cycles / CPU_FREQ_MHZ;
+  s_carry_cycles = pending_cycles % CPU_FREQ_MHZ;
+  return s_clock_us;
+}
+
+} // namespace profiling
+} // namespace nam
+
+#else
+// Non-ARM: Use std::chrono for timing (for testing on desktop)
+#include <chrono>
+
+namespace nam {
+namespace profiling {
+
+ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {};
+int g_num_entries = 0;
+
+// Returns microseconds since the first call, truncated to 32 bits.
+// steady_clock is used because it is guaranteed never to go backwards.
+uint32_t get_time_us() {
+  using namespace std::chrono;
+  static const auto start = steady_clock::now();
+  const auto now = steady_clock::now();
+  return static_cast<uint32_t>(duration_cast<microseconds>(now - start).count());
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // ARM check
+
+namespace nam {
+namespace profiling {
+
+// Registers a category and returns its index into g_entries.
+// `name` is stored by pointer and is not copied.
+// When the table is full, further categories share the last slot, which is
+// relabelled "(overflow)", instead of writing past the end of g_entries.
+int register_type(const char* name) {
+  if (g_num_entries >= MAX_PROFILING_TYPES) {
+    g_entries[MAX_PROFILING_TYPES - 1].name = "(overflow)";
+    return MAX_PROFILING_TYPES - 1;
+  }
+  const int idx = g_num_entries++;
+  g_entries[idx].name = name;
+  g_entries[idx].accumulated_us = 0;
+  return idx;
+}
+
+// Zeroes every registered counter; the registrations themselves are kept.
+void reset() {
+  for (int i = 0; i < g_num_entries; i++)
+    g_entries[i].accumulated_us = 0;
+}
+
+// Prints one row per category that accumulated time, followed by a total row.
+void print_results() {
+  // 64-bit: the sum over categories and `us * 100` below can both exceed 32 bits.
+  uint64_t total = 0;
+  for (int i = 0; i < g_num_entries; i++)
+    total += g_entries[i].accumulated_us;
+
+  printf("\nProfiling breakdown:\n");
+  printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%");
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+
+  for (int i = 0; i < g_num_entries; i++) {
+    const uint32_t us = g_entries[i].accumulated_us;
+    if (us > 0) {
+      // us > 0 implies total > 0, so the division cannot be by zero.
+      const unsigned long pct = static_cast<unsigned long>(static_cast<uint64_t>(us) * 100 / total);
+      printf("%-12s %8.1f %5lu%%\n", g_entries[i].name, us / 1000.0, pct);
+    }
+  }
+
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+  printf("%-12s %8.1f %5s\n", "Total", static_cast<double>(total) / 1000.0, "100%");
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // NAM_PROFILING
diff --git a/NAM/profiling.h b/NAM/profiling.h
new file mode 100644
index 00000000..4db570b9
--- /dev/null
+++ b/NAM/profiling.h
@@ -0,0 +1,94 @@
+#pragma once
+
+// Dynamic profiling registry for NAM building blocks
+// Enable with -DNAM_PROFILING
+//
+// Usage:
+//   1. Register profiling types at file scope (static init):
+//        static int PROF_FOO = nam::profiling::register_type("Foo");
+//   2. Call nam::profiling::reset() before benchmark
+//   3. In hot path:
+//        NAM_PROFILE_START();
+//        // ... code ...
+//        NAM_PROFILE_ADD(PROF_FOO);
+//   4. Call nam::profiling::print_results() to display breakdown
+//
+// Timestamps are 32-bit microsecond values that wrap modulo 2^32 (about 71
+// minutes). Elapsed times use unsigned subtraction, so a wrap between two
+// timestamps is harmless, but each accumulator itself wraps silently at 2^32 us.
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+namespace nam {
+namespace profiling {
+
+// Capacity of the registry. register_type() folds any further categories into
+// the last slot rather than writing out of bounds.
+constexpr int MAX_PROFILING_TYPES = 32;
+
+struct ProfilingEntry {
+  const char* name; // stored by pointer, not copied
+  uint32_t accumulated_us; // time attributed to this category since the last reset()
+};
+
+extern ProfilingEntry g_entries[MAX_PROFILING_TYPES];
+extern int g_num_entries;
+
+// Register a named profiling type. Returns index for fast accumulation.
+// Called at static-init time or during setup, NOT in the hot path.
+int register_type(const char* name);
+
+// Get current time in microseconds (platform-specific)
+uint32_t get_time_us();
+
+// Reset all profiling counters
+void reset();
+
+// Print profiling results to stdout
+void print_results();
+
+// Helper macros for timing sections
+// Usage:
+//   NAM_PROFILE_START();
+//   // ... code to profile ...
+//   NAM_PROFILE_ADD(PROF_FOO);  // Adds elapsed time to entry, resets timer
+//
+// NAM_PROFILE_START() declares a local named _prof_start, so use it once per
+// scope; the other macros must appear where that local is visible.
+
+#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us()
+#define NAM_PROFILE_ADD(idx) do { \
+  uint32_t _prof_now = nam::profiling::get_time_us(); \
+  nam::profiling::g_entries[idx].accumulated_us += (_prof_now - _prof_start); \
+  _prof_start = _prof_now; \
+} while(0)
+
+// Variant that doesn't reset the timer (for one-shot measurements)
+#define NAM_PROFILE_ADD_NORESTART(idx) \
+  nam::profiling::g_entries[idx].accumulated_us += (nam::profiling::get_time_us() - _prof_start)
+
+// Reset the timer without recording (for re-syncing mid-function)
+#define NAM_PROFILE_RESTART() _prof_start = nam::profiling::get_time_us()
+
+} // namespace profiling
+} // namespace nam
+
+#else // NAM_PROFILING not defined
+
+// No-op macros when profiling is disabled
+#define NAM_PROFILE_START() ((void)0)
+#define NAM_PROFILE_ADD(idx) ((void)0)
+#define NAM_PROFILE_ADD_NORESTART(idx) ((void)0)
+#define NAM_PROFILE_RESTART() ((void)0)
+
+namespace nam {
+namespace profiling {
+  inline void reset() {}
+  inline void print_results() {}
+} // namespace profiling
+} // namespace nam
+
+#endif // NAM_PROFILING
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 6eb74a3b..4a1b5217 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -6,9 +6,20 @@ #include #include "get_dsp.h" +#include "profiling.h" #include "registry.h" #include "wavenet.h" +#ifdef NAM_PROFILING +static int PROF_CONV1D = nam::profiling::register_type("Conv1D"); +static int PROF_INPUT_MIXIN = nam::profiling::register_type("InputMixin"); +static int PROF_LAYER1X1 = nam::profiling::register_type("Layer1x1"); +static int PROF_HEAD1X1 = nam::profiling::register_type("Head1x1"); +static int PROF_RECHANNEL = nam::profiling::register_type("Rechannel"); +static int PROF_ACTIVATION = nam::profiling::register_type("Activation"); +static int PROF_COPIES = nam::profiling::register_type("Copies"); +#endif + // Layer 
====================================================================== void nam::wavenet::_Layer::SetMaxBufferSize(const int maxBufferSize) @@ -89,6 +100,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector::iterator& weights) void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels // Step 1: input convolutions @@ -107,6 +120,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& conv_output = this->_conv.GetOutput(); this->_conv_post_film->Process_(conv_output, condition, num_frames); } + NAM_PROFILE_ADD(PROF_CONV1D); if (this->_input_mixin_pre_film) { @@ -123,8 +137,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput(); this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames); } + NAM_PROFILE_ADD(PROF_INPUT_MIXIN); + this->_z.leftCols(num_frames).noalias() = _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames); + NAM_PROFILE_ADD(PROF_COPIES); + if (this->_activation_pre_film) { this->_activation_pre_film->Process_(this->_z, condition, num_frames); @@ -139,6 +157,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_gating_mode == GatingMode::NONE) { this->_activation->apply(this->_z.leftCols(num_frames)); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { this->_activation_post_film->Process_(this->_z, condition, num_frames); @@ -146,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z, num_frames); + NAM_PROFILE_ADD(PROF_LAYER1X1); } } else if (this->_gating_mode == GatingMode::GATED) @@ -155,6 
+175,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_gating_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -165,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(PROF_LAYER1X1); } } else if (this->_gating_mode == GatingMode::BLENDED) @@ -174,6 +196,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_blending_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -184,6 +207,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(PROF_LAYER1X1); if (this->_layer1x1_post_film) { Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput(); @@ -207,6 +231,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput(); this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames); } + NAM_PROFILE_ADD(PROF_HEAD1X1); this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames); } else // No head 1x1 @@ -230,6 +255,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma // If layer1x1 is inactive, residual connection is just the input (identity) 
this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames); } + NAM_PROFILE_ADD(PROF_COPIES); } // LayerArray ================================================================= @@ -298,9 +324,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + // Process rechannel and get output this->_rechannel.process_(layer_inputs, num_frames); Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput(); + NAM_PROFILE_ADD(PROF_RECHANNEL); // Process layers for (size_t i = 0; i < this->_layers.size(); i++) @@ -329,7 +358,9 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames); // Process head rechannel + NAM_PROFILE_RESTART(); _head_rechannel.process_(this->_head_inputs, num_frames); + NAM_PROFILE_ADD(PROF_RECHANNEL); } diff --git a/example_audio/input.wav b/example_audio/input.wav new file mode 100644 index 00000000..fd0302bd Binary files /dev/null and b/example_audio/input.wav differ diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 8118e085..bbe93f3e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -4,18 +4,49 @@ file(GLOB_RECURSE NAM_SOURCES ../NAM/*.cpp ../NAM/*.c ../NAM*.h) set(TOOLS benchmodel) add_custom_target(tools ALL - DEPENDS ${TOOLS}) + DEPENDS ${TOOLS} render) + +set(AUDIO_DSP_TOOLS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../Dependencies/AudioDSPTools") +set(AUDIO_DSP_TOOLS_WAV_SOURCES "${AUDIO_DSP_TOOLS_DIR}/dsp/wav.cpp") include_directories(tools ..) 
include_directories(tools ${NAM_DEPS_PATH}/eigen) include_directories(tools ${NAM_DEPS_PATH}/nlohmann) +include_directories(tools ${AUDIO_DSP_TOOLS_DIR}/dsp) add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES}) add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES}) +add_executable(render render.cpp ${NAM_SOURCES} ${AUDIO_DSP_TOOLS_WAV_SOURCES}) +target_compile_features(render PUBLIC cxx_std_20) +# AudioDSPTools wav.cpp has sign-compare issues; don't fail build +set_source_files_properties(${AUDIO_DSP_TOOLS_WAV_SOURCES} PROPERTIES COMPILE_FLAGS "-Wno-error") +set_target_properties(render PROPERTIES + CXX_VISIBILITY_PRESET hidden + INTERPROCEDURAL_OPTIMIZATION TRUE + PREFIX "" +) +if (CMAKE_SYSTEM_NAME STREQUAL "Windows") + target_compile_definitions(render PRIVATE NOMINMAX WIN32_LEAN_AND_MEAN) +endif() +if (MSVC) + target_compile_options(render PRIVATE + "$<$:/W4>" + "$<$:/O2>" + ) +else() + target_compile_options(render PRIVATE + -Wall -Wextra -Wpedantic -Wstrict-aliasing -Wunreachable-code -Weffc++ -Wno-unused-parameter + "$<$:-Og;-ggdb;-Werror>" + "$<$:-Ofast>" + ) +endif() +add_executable(memory_usage memory_usage.cpp) add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES}) # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0") +# Benchmodel should be built with NAM_PROFILING set +target_compile_definitions(benchmodel PRIVATE NAM_PROFILING) # Ensure assertions are enabled for run_tests by removing NDEBUG if it was set # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG # We use a compile option to undefine it, which works on GCC, Clang, and MSVC @@ -32,6 +63,7 @@ endif() source_group(NAM ${CMAKE_CURRENT_SOURCE_DIR} FILES ${NAM_SOURCES}) target_compile_features(${TOOLS} PUBLIC cxx_std_20) +target_compile_features(memory_usage 
PUBLIC cxx_std_20) set_target_properties(${TOOLS} PROPERTIES @@ -61,4 +93,4 @@ endif() # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h # Don't let this break my build on debug: set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") -set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") \ No newline at end of file +set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp index 39c14b0e..42556f59 100644 --- a/tools/benchmodel.cpp +++ b/tools/benchmodel.cpp @@ -4,6 +4,7 @@ #include "NAM/dsp.h" #include "NAM/get_dsp.h" +#include "NAM/profiling.h" using std::chrono::duration; using std::chrono::duration_cast; @@ -62,6 +63,9 @@ int main(int argc, char* argv[]) outputPtrs[ch] = outputBuffers[ch].data(); } + // Reset profiling counters before benchmark + nam::profiling::reset(); + std::cout << "Running benchmark\n"; auto t1 = high_resolution_clock::now(); for (size_t i = 0; i < numBuffers; i++) @@ -80,6 +84,9 @@ int main(int argc, char* argv[]) std::cout << ms_int.count() << "ms\n"; std::cout << ms_double.count() << "ms\n"; + + // Print profiling breakdown if enabled + nam::profiling::print_results(); } else { 
diff --git a/tools/memory_usage.cpp b/tools/memory_usage.cpp
new file mode 100644
index 00000000..853ca8fe
--- /dev/null
+++ b/tools/memory_usage.cpp
@@ -0,0 +1,634 @@
+// memory_usage.cpp — Report total memory required to host a NAM model at runtime.
+//
+// Usage: memory_usage [--buffer-size N]
+//
+// Parses the .nam JSON config and computes weight memory (learned parameters stored
+// in Eigen matrices/vectors) and buffer memory (intermediate computation/state that
+// depends on maxBufferSize) without instantiating the model.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <stdexcept>
+
+#include "json.hpp"
+
+using json = nlohmann::json;
+
+static constexpr int DEFAULT_BUFFER_SIZE = 2048;
+static constexpr long INPUT_BUFFER_SAFETY_FACTOR = 32;
+
+// ─── Result accumulator ─────────────────────────────────────────────────────
+
+// Byte totals, kept separately for learned weights and for runtime buffers.
+struct MemoryResult
+{
+  size_t weight_bytes = 0;
+  size_t buffer_bytes = 0;
+
+  void add_weights(size_t floats) { weight_bytes += floats * sizeof(float); }
+  void add_buffers(size_t floats) { buffer_bytes += floats * sizeof(float); }
+
+  MemoryResult& operator+=(const MemoryResult& o)
+  {
+    weight_bytes += o.weight_bytes;
+    buffer_bytes += o.buffer_bytes;
+    return *this;
+  }
+};
+
+// ─── Conv1x1 ────────────────────────────────────────────────────────────────
+
+// Conv1x1 stores either a full (out_channels x in_channels) matrix (possibly
+// block-diagonal when grouped), or a depthwise weight vector when groups ==
+// in_channels == out_channels.
+static MemoryResult conv1x1_memory(int in_ch, int out_ch, bool bias, int groups, int M)
+{
+  MemoryResult r;
+  bool depthwise = (groups == in_ch && in_ch == out_ch);
+  if (depthwise)
+    r.add_weights(in_ch); // _depthwise_weight(in_ch)
+  else
+    r.add_weights((size_t)out_ch * in_ch); // _weight(out_ch, in_ch)
+  if (bias)
+    r.add_weights(out_ch); // _bias(out_ch)
+  r.add_buffers((size_t)out_ch * M); // _output(out_ch, M)
+  return r;
+}
+
+// ─── Conv1D ─────────────────────────────────────────────────────────────────
+
+// Conv1D stores kernel_size weight matrices (each out_ch x in_ch) or depthwise
+// vectors, plus a bias vector, a ring buffer, and an output buffer.
+static MemoryResult conv1d_memory(int in_ch, int out_ch, int kernel_size, bool bias, int dilation, int groups, int M)
+{
+  MemoryResult r;
+  bool depthwise = (groups == in_ch && in_ch == out_ch);
+  if (depthwise)
+    r.add_weights((size_t)kernel_size * in_ch); // _depthwise_weight[k](in_ch)
+  else
+    r.add_weights((size_t)kernel_size * out_ch * in_ch); // _weight[k](out_ch, in_ch)
+  if (bias)
+    r.add_weights(out_ch); // _bias(out_ch)
+
+  // Ring buffer: storage = (in_ch, 2 * max_lookback + M)
+  // max_lookback = (kernel_size - 1) * dilation
+  long max_lookback = (kernel_size > 0) ? (long)(kernel_size - 1) * dilation : 0;
+  long ring_storage = 2 * max_lookback + M;
+  r.add_buffers((size_t)in_ch * ring_storage); // _input_buffer._storage
+
+  // Output buffer: (out_ch, M)
+  r.add_buffers((size_t)out_ch * M); // _output
+
+  return r;
+}
+
+// ─── FiLM ───────────────────────────────────────────────────────────────────
+
+// FiLM settings as read from a layer config; the default-constructed value is inactive.
+struct FiLMParams
+{
+  bool active = false;
+  bool shift = true;
+  int groups = 1;
+};
+
+// Memory for one FiLM module; contributes nothing when the module is inactive.
+static MemoryResult film_memory(int condition_dim, int input_dim, const FiLMParams& fp, int M)
+{
+  if (!fp.active)
+    return {};
+  MemoryResult r;
+  int scale_shift_dim = fp.shift ? 2 * input_dim : input_dim;
+  // _cond_to_scale_shift is a Conv1x1(condition_dim -> scale_shift_dim, bias=true, groups)
+  r += conv1x1_memory(condition_dim, scale_shift_dim, true, fp.groups, M);
+  // _output(input_dim, M)
+  r.add_buffers((size_t)input_dim * M);
+  return r;
+}
+
+// ─── BatchNorm ──────────────────────────────────────────────────────────────
+
+// Memory for a BatchNorm over `dim` channels (weights only, no buffers).
+static MemoryResult batchnorm_memory(int dim)
+{
+  MemoryResult r;
+  // Stores scale(dim) + loc(dim) derived from running_mean, running_var, weight, bias, eps
+  // The source values are consumed from weights array; only scale + loc are stored at runtime.
+  r.add_weights(2 * (size_t)dim);
+  return r;
+}
+
+// ─── LSTM ───────────────────────────────────────────────────────────────────
+
+// Memory for an LSTM model described by `config`.
+// Required keys are read with at(), so a missing one throws instead of being undefined behavior.
+static MemoryResult lstm_memory(const json& config)
+{
+  MemoryResult r;
+  int num_layers = config.at("num_layers");
+  int input_size = config.at("input_size");
+  int hidden_size = config.at("hidden_size");
+  int out_channels = config.value("out_channels", 1);
+
+  for (int i = 0; i < num_layers; i++)
+  {
+    int cell_input = (i == 0) ? input_size : hidden_size;
+    // _w(4*H, I+H)
+    r.add_weights((size_t)4 * hidden_size * (cell_input + hidden_size));
+    // _b(4*H)
+    r.add_weights(4 * (size_t)hidden_size);
+    // _xh(I+H) — stores initial hidden state in the hidden portion
+    r.add_weights((size_t)(cell_input + hidden_size));
+    // _c(H) — initial cell state
+    r.add_weights((size_t)hidden_size);
+
+    // Buffers: _ifgo(4*H)
+    r.add_buffers(4 * (size_t)hidden_size);
+    // Note: _xh and _c are also modified during inference but they are
+    // loaded from weights (initial state), so counted as weights above.
+  }
+
+  // _head_weight(out_channels, hidden_size)
+  r.add_weights((size_t)out_channels * hidden_size);
+  // _head_bias(out_channels)
+  r.add_weights(out_channels);
+
+  // Top-level buffers: _input(input_size), _output(out_channels)
+  r.add_buffers(input_size);
+  r.add_buffers(out_channels);
+
+  return r;
+}
+
+// ─── Linear ─────────────────────────────────────────────────────────────────
+
+// Memory for a Linear model described by `config`.
+// Required keys are read with at(), so a missing one throws instead of being undefined behavior.
+static MemoryResult linear_memory(const json& config)
+{
+  MemoryResult r;
+  int receptive_field = config.at("receptive_field");
+  bool bias = config.at("bias");
+  int in_channels = config.value("in_channels", 1);
+
+  // _weight(receptive_field)
+  r.add_weights(receptive_field);
+  // _bias (scalar float)
+  if (bias)
+    r.add_weights(1);
+
+  // Buffer base: _input_buffers = in_channels vectors of (32 * receptive_field)
+  r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field);
+  // _output_buffers: resized per-call, not pre-allocated to a fixed size
+  // (depends on num_frames, not maxBufferSize)
+
+  return r;
+}
+
+// ─── ConvNet ────────────────────────────────────────────────────────────────
+
+// Memory for a ConvNet model described by `config`, with per-block buffers sized for M frames.
+// Throws if a required key is missing or if "dilations" is empty.
+static MemoryResult convnet_memory(const json& config, int M)
+{
+  MemoryResult r;
+  int channels = config.at("channels");
+  std::vector<int> dilations = config.at("dilations");
+  bool batchnorm = config.at("batchnorm");
+  int groups = config.value("groups", 1);
+  int in_channels = config.value("in_channels", 1);
+  int out_channels = config.value("out_channels", 1);
+
+  // max_element on an empty range returns end(); dereferencing it would be undefined behavior.
+  if (dilations.empty())
+    throw std::runtime_error("ConvNet config must list at least one dilation");
+  int max_dilation = *std::max_element(dilations.begin(), dilations.end());
+
+  // Buffer base class: _input_buffers = in_channels * (32 * max_dilation)
+  int receptive_field = max_dilation; // passed to Buffer as receptive_field
+  r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field);
+
+  // ConvNet blocks
+  for (size_t i = 0; i < dilations.size(); i++)
+  {
+    int block_in = (i == 0) ? in_channels : channels;
+    int block_out = channels;
+    // Conv1D with kernel_size=2, bias=!batchnorm
+    r += conv1d_memory(block_in, block_out, 2, !batchnorm, dilations[i], groups, M);
+    // Optional batchnorm
+    if (batchnorm)
+      r += batchnorm_memory(block_out);
+    // _output(out_channels, M) per block
+    r.add_buffers((size_t)block_out * M);
+  }
+
+  // _block_vals: 1 entry of (channels, buffer_size)
+  // buffer_size = input_buffers[0].size() = 32 * receptive_field
+  long buffer_size = INPUT_BUFFER_SAFETY_FACTOR * receptive_field;
+  r.add_buffers((size_t)channels * buffer_size);
+
+  // _head: weight(out_channels, channels) + bias(out_channels)
+  r.add_weights((size_t)out_channels * channels);
+  r.add_weights(out_channels);
+
+  // _head_output is resized per-call, not a fixed pre-allocation
+
+  return r;
+}
+
+// ─── WaveNet helpers ────────────────────────────────────────────────────────
+
+// Reads the FiLM settings stored under `key`; a missing key or a literal `false` means inactive.
+static FiLMParams parse_film_params(const json& layer_config, const std::string& key)
+{
+  FiLMParams fp;
+  const auto it = layer_config.find(key);
+  if (it == layer_config.end() || *it == false)
+    return fp; // inactive
+  const json& fc = *it;
+  fp.active = fc.value("active", true);
+  fp.shift = fc.value("shift", true);
+  fp.groups = fc.value("groups", 1);
+  return fp;
+}
+
+// Gating variants recognised in a WaveNet layer config.
+enum class GatingMode
+{
+  NONE,
+  GATED,
+  BLENDED
+};
+
+// Returns one gating mode per layer, read from "gating_mode" (a string applied to
+// every layer, or an array with one string per layer) or, failing that, the boolean
+// "gated". Unrecognized strings map to NONE. Throws if an array's length is not
+// num_layers, because the result is indexed by layer.
+static std::vector<GatingMode> parse_gating_modes(const json& layer_config, size_t num_layers)
+{
+  std::vector<GatingMode> modes;
+
+  auto parse_str = [](const std::string& s) -> GatingMode {
+    if (s == "gated")
+      return GatingMode::GATED;
+    if (s == "blended")
+      return GatingMode::BLENDED;
+    return GatingMode::NONE;
+  };
+
+  if (layer_config.find("gating_mode") != layer_config.end())
+  {
+    if (layer_config["gating_mode"].is_array())
+    {
+      for (const auto& gm : layer_config["gating_mode"])
+        modes.push_back(parse_str(gm.get<std::string>()));
+      if (modes.size() != num_layers)
+        throw std::runtime_error("gating_mode array length must match the number of layers");
+    }
+    else
+    {
+      GatingMode mode = parse_str(layer_config["gating_mode"].get<std::string>());
+      modes.resize(num_layers, mode);
+    }
+  }
+  else if (layer_config.find("gated") != layer_config.end())
+  {
+    bool gated = layer_config["gated"];
+    modes.resize(num_layers, gated ? GatingMode::GATED : GatingMode::NONE);
+  }
+  else
+  {
+    modes.resize(num_layers, GatingMode::NONE);
+  }
+  return modes;
+}
+
+// WaveNet _Layer memory
+// Adds up the layer's convolutions, its intermediate buffers and every active FiLM module.
+// The dilated conv and input mixin produce 2 * bottleneck channels unless the gating mode is NONE.
+static MemoryResult wavenet_layer_memory(int condition_size, int channels, int bottleneck, int kernel_size, int dilation,
+                                         GatingMode gating_mode, int groups_input, int groups_input_mixin,
+                                         bool layer1x1_active, int layer1x1_groups, bool head1x1_active,
+                                         int head1x1_out_channels, int head1x1_groups, const FiLMParams& conv_pre_film,
+                                         const FiLMParams& conv_post_film, const FiLMParams& input_mixin_pre_film,
+                                         const FiLMParams& input_mixin_post_film,
+                                         const FiLMParams& activation_pre_film,
+                                         const FiLMParams& activation_post_film,
+                                         const FiLMParams& layer1x1_post_film, const FiLMParams& head1x1_post_film,
+                                         int M)
+{
+  MemoryResult r;
+  bool gated = (gating_mode != GatingMode::NONE);
+  int conv_out = gated ? 2 * bottleneck : bottleneck;
+
+  // _conv: Conv1D(channels -> conv_out, kernel_size, bias=true, dilation, groups_input)
+  r += conv1d_memory(channels, conv_out, kernel_size, true, dilation, groups_input, M);
+
+  // _input_mixin: Conv1x1(condition_size -> conv_out, bias=false, groups_input_mixin)
+  r += conv1x1_memory(condition_size, conv_out, false, groups_input_mixin, M);
+
+  // _layer1x1 (optional): Conv1x1(bottleneck -> channels, bias=true, layer1x1_groups)
+  if (layer1x1_active)
+    r += conv1x1_memory(bottleneck, channels, true, layer1x1_groups, M);
+
+  // _head1x1 (optional): Conv1x1(bottleneck -> head1x1_out_channels, bias=true, head1x1_groups)
+  if (head1x1_active)
+    r += conv1x1_memory(bottleneck, head1x1_out_channels, true, head1x1_groups, M);
+
+  // Buffers: _z(conv_out, M)
+  r.add_buffers((size_t)conv_out * M);
+  // _output_next_layer(channels, M)
+  r.add_buffers((size_t)channels * M);
+  // _output_head: if head1x1 active -> (head1x1_out_channels, M), else (bottleneck, M)
+  int head_out = head1x1_active ? head1x1_out_channels : bottleneck;
+  r.add_buffers((size_t)head_out * M);
+
+  // FiLM modules (up to 8)
+  r += film_memory(condition_size, channels, conv_pre_film, M);
+  r += film_memory(condition_size, conv_out, conv_post_film, M);
+  r += film_memory(condition_size, condition_size, input_mixin_pre_film, M);
+  r += film_memory(condition_size, conv_out, input_mixin_post_film, M);
+  r += film_memory(condition_size, conv_out, activation_pre_film, M);
+  r += film_memory(condition_size, bottleneck, activation_post_film, M);
+  if (layer1x1_active)
+    r += film_memory(condition_size, channels, layer1x1_post_film, M);
+  if (head1x1_active)
+    r += film_memory(condition_size, head1x1_out_channels, head1x1_post_film, M);
+
+  return r;
+}
+
+// WaveNet _LayerArray memory +static MemoryResult wavenet_layer_array_memory(const json& layer_config, int M) +{ + MemoryResult r; + int input_size = layer_config["input_size"]; + int condition_size = layer_config["condition_size"]; + int head_size = layer_config["head_size"]; + int channels = layer_config["channels"]; + int bottleneck = layer_config.value("bottleneck", channels); + int kernel_size = layer_config["kernel_size"]; + std::vector dilations = layer_config["dilations"]; + size_t num_layers = dilations.size(); + bool head_bias = layer_config["head_bias"]; + + int groups_input = layer_config.value("groups_input", 1); + int groups_input_mixin = layer_config.value("groups_input_mixin", 1); + + // layer1x1 params + bool layer1x1_active = true; + int layer1x1_groups = 1; + if (layer_config.find("layer1x1") != layer_config.end()) + { + layer1x1_active = layer_config["layer1x1"]["active"]; + layer1x1_groups = layer_config["layer1x1"]["groups"]; + } + + // head1x1 params + bool head1x1_active = false; + int head1x1_out_channels = channels; + int head1x1_groups = 1; + if (layer_config.find("head1x1") != layer_config.end()) + { + head1x1_active = layer_config["head1x1"]["active"]; + head1x1_out_channels = 
layer_config["head1x1"]["out_channels"]; + head1x1_groups = layer_config["head1x1"]["groups"]; + } + + // Gating modes + std::vector gating_modes = parse_gating_modes(layer_config, num_layers); + + // FiLM params + FiLMParams conv_pre = parse_film_params(layer_config, "conv_pre_film"); + FiLMParams conv_post = parse_film_params(layer_config, "conv_post_film"); + FiLMParams input_mixin_pre = parse_film_params(layer_config, "input_mixin_pre_film"); + FiLMParams input_mixin_post = parse_film_params(layer_config, "input_mixin_post_film"); + FiLMParams activation_pre = parse_film_params(layer_config, "activation_pre_film"); + FiLMParams activation_post = parse_film_params(layer_config, "activation_post_film"); + FiLMParams layer1x1_post = parse_film_params(layer_config, "layer1x1_post_film"); + FiLMParams head1x1_post = parse_film_params(layer_config, "head1x1_post_film"); + + // _rechannel: Conv1x1(input_size -> channels, bias=false) + r += conv1x1_memory(input_size, channels, false, 1, M); + + // Per-layer + for (size_t i = 0; i < num_layers; i++) + { + r += wavenet_layer_memory(condition_size, channels, bottleneck, kernel_size, dilations[i], gating_modes[i], + groups_input, groups_input_mixin, layer1x1_active, layer1x1_groups, head1x1_active, + head1x1_out_channels, head1x1_groups, conv_pre, conv_post, input_mixin_pre, + input_mixin_post, activation_pre, activation_post, layer1x1_post, head1x1_post, M); + } + + // _head_rechannel: Conv1x1(head_output_size -> head_size, bias=head_bias) + int head_output_size = head1x1_active ? 
head1x1_out_channels : bottleneck; + r += conv1x1_memory(head_output_size, head_size, head_bias, 1, M); + + // Buffers: _layer_outputs(channels, M) + r.add_buffers((size_t)channels * M); + // _head_inputs(head_output_size, M) + r.add_buffers((size_t)head_output_size * M); + + return r; +} + +// Forward declaration for recursive condition_dsp +static MemoryResult compute_memory(const std::string& architecture, const json& config, int M); + +// WaveNet top-level memory +static MemoryResult wavenet_memory(const json& config, int M) +{ + MemoryResult r; + int in_channels = config.value("in_channels", 1); + + // condition_dim = in_channels (from _get_condition_dim()) + int condition_dim = in_channels; + + // Recursive condition_dsp + bool has_condition_dsp = false; + int condition_output_channels = condition_dim; + if (config.find("condition_dsp") != config.end()) + { + has_condition_dsp = true; + const json& cdsp = config["condition_dsp"]; + std::string cdsp_arch = cdsp["architecture"]; + json cdsp_config = cdsp["config"]; + r += compute_memory(cdsp_arch, cdsp_config, M); + // condition_output_channels comes from the condition_dsp's output + // For now, we use condition_size from first layer as a proxy + // (the actual model validates this match) + if (config.find("layers") != config.end() && config["layers"].size() > 0) + condition_output_channels = config["layers"][0]["condition_size"]; + } + + // _condition_input(condition_dim, M) + r.add_buffers((size_t)condition_dim * M); + + // _condition_output + if (!has_condition_dsp) + { + // _condition_output(condition_dim, M) + r.add_buffers((size_t)condition_dim * M); + } + else + { + // _condition_output(condition_output_channels, M) + r.add_buffers((size_t)condition_output_channels * M); + // _condition_dsp_input_buffers: condition_dim vectors of M doubles/floats + // These are std::vector> where NAM_SAMPLE is double + r.add_buffers((size_t)condition_dim * M * (sizeof(double) / sizeof(float))); + // 
_condition_dsp_output_buffers: condition_output_channels vectors of M doubles + r.add_buffers((size_t)condition_output_channels * M * (sizeof(double) / sizeof(float))); + // Pointer arrays are negligible + } + + // Layer arrays + for (const auto& layer_config : config["layers"]) + r += wavenet_layer_array_memory(layer_config, M); + + // _head_scale (1 float) — it's a weight + r.add_weights(1); + + return r; +} + +// ─── Dispatch ─────────────────────────────────────────────────────────────── + +static MemoryResult compute_memory(const std::string& architecture, const json& config, int M) +{ + if (architecture == "WaveNet") + return wavenet_memory(config, M); + if (architecture == "LSTM") + return lstm_memory(config); + if (architecture == "ConvNet") + return convnet_memory(config, M); + if (architecture == "Linear") + return linear_memory(config); + throw std::runtime_error("Unknown architecture: " + architecture); +} + +// ─── Formatting helpers ───────────────────────────────────────────────────── + +static std::string format_bytes(size_t bytes) +{ + char buf[64]; + if (bytes < 1024) + snprintf(buf, sizeof(buf), "%zu bytes", bytes); + else if (bytes < 1024 * 1024) + snprintf(buf, sizeof(buf), "%.2f KB", bytes / 1024.0); + else + snprintf(buf, sizeof(buf), "%.2f MB", bytes / (1024.0 * 1024.0)); + return buf; +} + +static std::string format_with_commas(size_t n) +{ + std::string s = std::to_string(n); + int insert_pos = (int)s.length() - 3; + while (insert_pos > 0) + { + s.insert(insert_pos, ","); + insert_pos -= 3; + } + return s; +} + +// ─── Main ─────────────────────────────────────────────────────────────────── + +int main(int argc, char* argv[]) +{ + if (argc < 2) + { + fprintf(stderr, "Usage: memory_usage [--buffer-size N]\n"); + return 1; + } + + const char* model_path = argv[1]; + int buffer_size = DEFAULT_BUFFER_SIZE; + + for (int i = 2; i < argc; i++) + { + if (strcmp(argv[i], "--buffer-size") == 0 && i + 1 < argc) + { + buffer_size = atoi(argv[++i]); + 
if (buffer_size <= 0) + { + fprintf(stderr, "Error: buffer size must be positive\n"); + return 1; + } + } + else + { + fprintf(stderr, "Unknown option: %s\n", argv[i]); + return 1; + } + } + + // Read and parse JSON + std::ifstream file(model_path); + if (!file.is_open()) + { + fprintf(stderr, "Error: cannot open %s\n", model_path); + return 1; + } + + json j; + try + { + file >> j; + } + catch (const std::exception& e) + { + fprintf(stderr, "Error parsing JSON: %s\n", e.what()); + return 1; + } + + std::string architecture = j["architecture"]; + json config = j["config"]; + + // Cross-check: count weights in JSON + size_t json_weight_count = 0; + if (j.find("weights") != j.end()) + json_weight_count = j["weights"].size(); + + double sample_rate = -1.0; + if (j.find("sample_rate") != j.end()) + sample_rate = j["sample_rate"]; + + try + { + MemoryResult result = compute_memory(architecture, config, buffer_size); + size_t total = result.weight_bytes + result.buffer_bytes; + + printf("Model: %s\n", model_path); + printf("Architecture: %s\n", architecture.c_str()); + if (sample_rate > 0) + printf("Sample rate: %.0f Hz\n", sample_rate); + printf("\n"); + printf("Weights: %s bytes (%s)\n", format_with_commas(result.weight_bytes).c_str(), + format_bytes(result.weight_bytes).c_str()); + printf("Buffers: %s bytes (%s) [buffer size: %d]\n", format_with_commas(result.buffer_bytes).c_str(), + format_bytes(result.buffer_bytes).c_str(), buffer_size); + printf("Total: %s bytes (%s)\n", format_with_commas(total).c_str(), format_bytes(total).c_str()); + + if (json_weight_count > 0) + { + printf("\nJSON weights: %zu values (%s bytes)\n", json_weight_count, + format_with_commas(json_weight_count * sizeof(float)).c_str()); + } + } + catch (const std::exception& e) + { + fprintf(stderr, "Error computing memory: %s\n", e.what()); + return 1; + } + + return 0; +} diff --git a/tools/render.cpp b/tools/render.cpp new file mode 100644 index 00000000..77836b41 --- /dev/null +++ 
b/tools/render.cpp @@ -0,0 +1,159 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NAM/dsp.h" +#include "NAM/get_dsp.h" +#include "wav.h" + +namespace +{ +// Write mono 32-bit float WAV file (IEEE float format 3). +bool SaveWavFloat32(const char* fileName, const float* samples, size_t numSamples, double sampleRate) +{ + std::ofstream out(fileName, std::ios::binary); + if (!out.is_open()) + { + std::cerr << "Error: Failed to open output file " << fileName << "\n"; + return false; + } + + const uint32_t dataSize = static_cast(numSamples * sizeof(float)); + const uint32_t chunkSize = 36 + dataSize; + + // RIFF header + out.write("RIFF", 4); + out.write(reinterpret_cast(&chunkSize), 4); + out.write("WAVE", 4); + + // fmt chunk (16 bytes for PCM/IEEE) + const uint32_t fmtSize = 16; + out.write("fmt ", 4); + out.write(reinterpret_cast(&fmtSize), 4); + const uint16_t audioFormat = 3; // IEEE float + out.write(reinterpret_cast(&audioFormat), 2); + const uint16_t numChannels = 1; + out.write(reinterpret_cast(&numChannels), 2); + const uint32_t sr = static_cast(sampleRate); + out.write(reinterpret_cast(&sr), 4); + const uint32_t byteRate = sr * sizeof(float); + out.write(reinterpret_cast(&byteRate), 4); + const uint16_t blockAlign = sizeof(float); + out.write(reinterpret_cast(&blockAlign), 2); + const uint16_t bitsPerSample = 32; + out.write(reinterpret_cast(&bitsPerSample), 2); + + // data chunk + out.write("data", 4); + out.write(reinterpret_cast(&dataSize), 4); + out.write(reinterpret_cast(samples), dataSize); + + return out.good(); +} + +} // namespace + +int main(int argc, char* argv[]) +{ + if (argc < 3 || argc > 4) + { + std::cerr << "Usage: render [output.wav]\n"; + return 1; + } + + const char* modelPath = argv[1]; + const char* inputPath = argv[2]; + const char* outputPath = (argc >= 4) ? 
argv[3] : "output.wav"; + + std::cerr << "Loading model [" << modelPath << "]\n"; + auto model = nam::get_dsp(std::filesystem::path(modelPath)); + if (!model) + { + std::cerr << "Failed to load model\n"; + return 1; + } + std::cerr << "Model loaded successfully\n"; + + std::vector inputAudio; + double inputSampleRate = 0.0; + auto loadResult = dsp::wav::Load(inputPath, inputAudio, inputSampleRate); + if (loadResult != dsp::wav::LoadReturnCode::SUCCESS) + { + std::cerr << "Failed to load input WAV: " << dsp::wav::GetMsgForLoadReturnCode(loadResult) << "\n"; + return 1; + } + + const double expectedRate = model->GetExpectedSampleRate(); + if (expectedRate > 0 && std::abs(inputSampleRate - expectedRate) > 0.5) + { + std::cerr << "Error: Input WAV sample rate (" << inputSampleRate + << " Hz) does not match model expected rate (" << expectedRate << " Hz)\n"; + return 1; + } + + const double sampleRate = expectedRate > 0 ? expectedRate : inputSampleRate; + const int bufferSize = 64; + model->Reset(sampleRate, bufferSize); + + const int inChannels = model->NumInputChannels(); + const int outChannels = model->NumOutputChannels(); + + if (inChannels != 1) + { + std::cerr << "Error: render tool currently supports mono input only (model has " << inChannels + << " input channels)\n"; + return 1; + } + + std::vector> inputBuffers(inChannels); + std::vector> outputBuffers(outChannels); + std::vector inputPtrs(inChannels); + std::vector outputPtrs(outChannels); + + for (int ch = 0; ch < inChannels; ch++) + { + inputBuffers[ch].resize(bufferSize, 0.0); + inputPtrs[ch] = inputBuffers[ch].data(); + } + for (int ch = 0; ch < outChannels; ch++) + { + outputBuffers[ch].resize(bufferSize, 0.0); + outputPtrs[ch] = outputBuffers[ch].data(); + } + + std::vector outputAudio; + outputAudio.reserve(static_cast(outChannels) * inputAudio.size()); + + size_t readPos = 0; + const size_t totalSamples = inputAudio.size(); + + while (readPos < totalSamples) + { + const size_t toRead = 
std::min(static_cast(bufferSize), totalSamples - readPos); + + for (size_t i = 0; i < toRead; i++) + inputBuffers[0][i] = static_cast(inputAudio[readPos + i]); + for (size_t i = toRead; i < static_cast(bufferSize); i++) + inputBuffers[0][i] = 0; + + model->process(inputPtrs.data(), outputPtrs.data(), static_cast(toRead)); + + for (size_t i = 0; i < toRead; i++) + outputAudio.push_back(static_cast(outputBuffers[0][i])); + + readPos += toRead; + } + + if (!SaveWavFloat32(outputPath, outputAudio.data(), outputAudio.size(), sampleRate)) + { + return 1; + } + + std::cerr << "Wrote " << outputAudio.size() << " samples to " << outputPath << "\n"; + return 0; +}