From b8ba5051dbb318e34a9a0b85beca5efa7528c388 Mon Sep 17 00:00:00 2001 From: ssjia Date: Thu, 19 Mar 2026 12:14:57 -0700 Subject: [PATCH 1/2] [ET-VK][conv1d] Implement height-packed pointwise conv1d operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a new conv1d pointwise (kernel_size=1) operator using height-packed layout where channels are the packed dimension (WHCN dim 1). This enables dot-product reduction over input channels: each vec4 load gives 4 consecutive channel values, yielding 4 MACs per dot() instruction. Uses tiled computation with the FP tile infrastructure from linear/matmul (FPInputTile, FPWeightTile, FPOutTile, fp_accumulate_with_fp_weight) and 4OC×4IC blocked weight packing via pack_fp_linear_weight.glsl for cache-friendly texture2d weight reads. Adaptive tile_m selection (4/2/1 rows) based on GPU occupancy. Thread mapping: X=OC4 tiles, Y=L tiles, Z=batch. Each thread computes TILE_M×TILE_N4×4 output elements. Inner loop loads input tiles and packed weight tiles, then calls fp_accumulate_with_fp_weight for tiled FMA. Supports both buffer and texture3d storage for input/output, texture2d or buffer for packed weights, fp32/fp16, and optional bias. Registered as et_vk.conv1d_pw.default (standalone custom op for testing/benchmarking). 
Performance on Adreno 750 (S24): - [1,256,1024]x[512,256,1] texture f16: 908 GFLOP/s - [1,512,2048]x[256,512,1] texture f16: 865 GFLOP/s - [1,128,4096]x[128,128,1] texture f16: 781 GFLOP/s - [1,256,1024]x[512,256,1] buffer f16: 491 GFLOP/s Differential Revision: [D97344092](https://our.internmc.facebook.com/intern/diff/D97344092/) [ghstack-poisoned] --- .../runtime/graph/ops/glsl/conv1d_pw.glsl | 194 +++++++++++++ .../runtime/graph/ops/glsl/conv1d_pw.yaml | 41 +++ .../runtime/graph/ops/impl/Conv1dPW.cpp | 263 ++++++++++++++++++ .../test/custom_ops/impl/TestConv1dPW.cpp | 24 ++ backends/vulkan/test/custom_ops/targets.bzl | 1 + .../vulkan/test/custom_ops/test_conv1d_pw.cpp | 248 +++++++++++++++++ 6 files changed, 771 insertions(+) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml create mode 100644 backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp create mode 100644 backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp create mode 100644 backends/vulkan/test/custom_ops/test_conv1d_pw.cpp diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl new file mode 100644 index 00000000000..463e69c823f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl @@ -0,0 +1,194 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define T ${texel_load_component_type(DTYPE, STORAGE)} + +$if STORAGE == "buffer": + #define OUTPUT_BUFFER + #define INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER +$if HAS_BIAS: + #define HAS_BIAS +$if STORAGE == "buffer" and HAS_BIAS: + #define BIAS_BUFFER + +#define TILE_M4 ${TILE_M4} +#define TILE_K4 ${TILE_K4} +#define TILE_N4 ${TILE_N4} + +#define TILE_M ${TILE_M} +#define TILE_K ${TILE_K4 * 4} +#define TILE_N ${TILE_N4 * 4} + +${define_required_extensions(STORAGE, DTYPE)} +$if WEIGHT_STORAGE != STORAGE: + ${define_required_extensions(WEIGHT_STORAGE, DTYPE)} + +layout(std430) buffer; + +#include "common.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, WEIGHT_STORAGE, is_scalar_array=False)} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE, is_scalar_array=False)} + +// in_sizes: {L, C_in, N, 1} in WHCN order +${layout_declare_ubo(B, "ivec4", "in_sizes")} +// out_sizes: {L, C_out, N, 1} in WHCN order +${layout_declare_ubo(B, "ivec4", "out_sizes")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + +$if HAS_BIAS: + layout(push_constant) uniform restrict Block { + int weight_B; + float alpha; + float beta; + }; +$else: + layout(push_constant) uniform restrict Block { + int weight_B; + }; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "linear_fp_input_tile.glslh" +#include "linear_fp_weight_tile.glslh" +#include "linear_fp_output_tile.glslh" +#include "linear_fp_packed_weight_tile_load.glslh" +#include "linear_fp_output_tile_fp_compute.glslh" + +// Conv1d pointwise is matrix multiplication with swapped texture coordinates. 
+// Linear: input ivec3(k4, m, b), output ivec3(n4, m, b) [width-packed] +// Conv1d: input ivec3(m, k4, b), output ivec3(m, n4, b) [height-packed] +// Buffer indexing is identical: (b * M + m) * K4 + k4 + +VEC4_T load_input_x4( + const int k4, + const int m, + const int b, + const int K4, + const int M) { +#ifdef INPUT_BUFFER + return t_in[(b * M + m) * K4 + k4]; +#else + return texelFetch(t_in, ivec3(m, k4, b), 0); +#endif +} + +void load_input_tile_with_checks( + out FPInputTile tile, + const int k4_start, + const int m_start, + const int b, + const int K4, + const int M) { + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { + if (k4_start + k4 < K4 && m_start + m < M) { + tile.data[m][k4] = + load_input_x4(k4_start + k4, m_start + m, b, K4, M); + } else { + tile.data[m][k4] = VEC4_T(0.0); + } + } + } +} + +void store_output_x4( + const VEC4_T texel, + const int n4, + const int m, + const int b, + const int N4, + const int M) { +#ifdef OUTPUT_BUFFER + t_out[(b * M + m) * N4 + n4] = texel; +#else + imageStore(t_out, ivec3(m, n4, b), texel); +#endif +} + +void store_output_tile_with_checks( + const FPOutTile out_tile, + const int n4_start, + const int m_start, + const int b, + const int N4, + const int M) { + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + if (m_start + m < M && n4_start + n4 < N4) { + store_output_x4( + out_tile.data[m][n4], n4_start + n4, m_start + m, b, N4, M); + } + } + } +} + +void main() { + // Thread mapping: X=OC4 (N4), Y=L/tile_m (M tiles), Z=batch + const int tile_idx_n = int(gl_GlobalInvocationID.x); + const int tile_idx_m = int(gl_GlobalInvocationID.y); + + const int n4_start = tile_idx_n * TILE_N4; + const int m_start = tile_idx_m * TILE_M; + + // in_sizes: {L, C_in, N, 1} in WHCN + const int K = in_sizes.y; // C_in + const int M = in_sizes.x; // L + const int K4 = div_up_4(K); + // out_sizes: {L, C_out, N, 1} in WHCN + const int N_out = 
out_sizes.y; // C_out + const int N4 = div_up_4(N_out); + + if (n4_start >= N4 || m_start >= M) { + return; + } + + FPOutTile out_tile; + initialize(out_tile); + + FPInputTile in_tile; + FPWeightTile w_tile; + + const int b = int(gl_GlobalInvocationID.z); + + for (int k4 = 0; k4 < K4; k4++) { + load_input_tile_with_checks(in_tile, k4, m_start, b, K4, M); + load_packed_weight_tile_with_checks(w_tile, n4_start, k4, 0, N4, K4); + fp_accumulate_with_fp_weight(out_tile, in_tile, w_tile); + } + +#ifdef HAS_BIAS + // Load bias (per output channel, width-packed) and apply + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + VEC4_T bias_val = VEC4_T(0.0); + if (n4_start + n4 < N4) { +#ifdef BIAS_BUFFER + bias_val = t_bias[n4_start + n4]; +#else + bias_val = texelFetch(t_bias, ivec3(n4_start + n4, 0, 0), 0); +#endif + } + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + out_tile.data[m][n4] = + VEC4_T(alpha) * out_tile.data[m][n4] + VEC4_T(beta) * bias_val; + } + } +#endif + + store_output_tile_with_checks(out_tile, n4_start, m_start, b, N4, M); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml new file mode 100644 index 00000000000..473b291934f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv1d_pw: + parameter_names_with_default_values: + DTYPE: float + STORAGE: texture3d + WEIGHT_STORAGE: texture2d + HAS_BIAS: false + TILE_M4: 1 + TILE_K4: 1 + TILE_N4: 1 + TILE_M: 4 + generate_variant_forall: + combination: + parameter_names: [STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [texture3d, texture2d] + - parameter_values: [texture3d, buffer] + - parameter_values: [buffer, texture2d] + - parameter_values: [buffer, buffer] + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: conv1d_pw + - NAME: conv1d_pw_tile_row_2 + TILE_M: 2 + - NAME: conv1d_pw_tile_row_1 + TILE_M: 1 + - NAME: conv1d_pw_bias + HAS_BIAS: true + - NAME: conv1d_pw_bias_tile_row_2 + HAS_BIAS: true + TILE_M: 2 + - NAME: conv1d_pw_bias_tile_row_1 + HAS_BIAS: true + TILE_M: 1 diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp new file mode 100644 index 00000000000..608dbeaabe1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +#include + +#include + +namespace vkcompute { + +// Minimum number of thread groups to target for good GPU occupancy. +static constexpr uint32_t kMinOccupancyThreads = 4096; + +// Returns the tile_m (1, 2, or 4) for the conv1d_pw shader. tile_m tiles the +// L (spatial) dimension. The largest tile that produces at least +// kMinOccupancyThreads thread groups is chosen. 
+static uint32_t +pick_conv1d_pw_tile_m(uint32_t C_out, uint32_t L, uint32_t N_batch) { + uint32_t n_groups = utils::div_up_4(C_out); + for (uint32_t tile_m : {4u, 2u, 1u}) { + uint32_t total = n_groups * utils::div_up(L, tile_m) * N_batch; + if (total >= kMinOccupancyThreads) { + return tile_m; + } + } + return 1u; +} + +// Prepack conv1d_pw weight [C_out, C_in, 1] into 4OC x 4IC blocked layout. +// This is equivalent to prepack_fp_linear_weight with N=C_out, K=C_in, +// is_transposed=true, but extracts dimensions from the conv weight shape. +static ValueRef prepack_conv1d_pw_weight( + ComputeGraph& graph, + const ValueRef weight_data) { + std::vector weight_sizes = graph.sizes_of(weight_data); + // weight is [C_out, C_in, 1] + int64_t N = weight_sizes.at(0); // C_out + int64_t K = weight_sizes.at(1); // C_in + + int64_t K4 = utils::div_up(K, int64_t(4)); + int64_t N4 = utils::div_up(N, int64_t(4)); + + // Packed tensor: K4 rows, N4*4 vec4 elements per row. + int64_t output_height = K4; + int64_t output_width = N4 * 4 * 4; + + utils::StorageType weight_storage = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width / 4 > max_extent || + static_cast(output_height) > max_extent) { + weight_storage = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + {output_height, output_width}, + graph.dtype_of(weight_data), + weight_storage, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(N4), + utils::safe_downcast(K4), + 1u}; + + struct PackParams { + int32_t N; + int32_t K; + int32_t B; + int32_t is_transposed; + }; + PackParams pack_params{ + utils::safe_downcast(N), utils::safe_downcast(K), 1, 1}; + + std::string kernel_name = "pack_fp_linear_weight"; + add_storage_type_suffix(kernel_name, weight_storage); + add_dtype_suffix(kernel_name, graph.dtype_of(weight_data)); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + 
VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + weight_data, + packed_weight, + {}, + {}, + {PushConstantDataInfo(&pack_params, sizeof(PackParams))})); + + return packed_weight; +} + +void resize_conv1d_pw_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef self = args.at(1).refs.at(0); + + const int64_t C_out = graph->get_int(extra_args.at(0)); + + const std::vector in_sizes = graph->sizes_of(self); + const int64_t N_batch = in_sizes.at(0); + const int64_t L = in_sizes.at(2); + + graph->virtual_resize(out, {N_batch, C_out, L}); +} + +struct Conv1dPWIntParams final { + int32_t weight_B; +}; + +struct Conv1dPWBiasParams final { + float alpha; + float beta; +}; + +vkapi::ShaderInfo pick_conv1d_pw_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef packed_weight = args.at(1).refs.at(1); + bool has_bias = graph->get_bool(resize_args.at(1)); + + // out is [N_batch, C_out, L]; in WHCN: {L, C_out, N_batch, 1} + uint32_t C_out = graph->size_at(-2, out); + uint32_t L = graph->size_at(-1, out); + uint32_t N_batch = + graph->dim_of(out) >= 3 ? graph->size_at(-3, out) : 1; + uint32_t tile_m = pick_conv1d_pw_tile_m(C_out, L, N_batch); + + std::string kernel_name; + if (has_bias) { + kernel_name = tile_m <= 1 ? "conv1d_pw_bias_tile_row_1" + : tile_m <= 2 ? "conv1d_pw_bias_tile_row_2" + : "conv1d_pw_bias"; + } else { + kernel_name = tile_m <= 1 ? "conv1d_pw_tile_row_1" + : tile_m <= 2 ? 
"conv1d_pw_tile_row_2" + : "conv1d_pw"; + } + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); + add_storage_type_suffix(kernel_name, graph->storage_type_of(packed_weight)); + add_dtype_suffix(kernel_name, graph->dtype_of(out)); + return VK_KERNEL_FROM_STR(kernel_name); +} + +utils::uvec3 pick_conv1d_pw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + + // out is [N_batch, C_out, L]; in WHCN: {L, C_out, N_batch, 1} + uint32_t C_out = graph->size_at(-2, out); + uint32_t L = graph->size_at(-1, out); + uint32_t N_batch = + graph->dim_of(out) >= 3 ? graph->size_at(-3, out) : 1; + uint32_t tile_m = pick_conv1d_pw_tile_m(C_out, L, N_batch); + + // X=OC4 (div_up_4(C_out)), Y=L/tile_m, Z=N_batch + return {utils::div_up_4(C_out), utils::div_up(L, tile_m), N_batch}; +} + +void add_conv1d_pw_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef weight_data, + const ValueRef bias, + const ValueRef out) { + VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim); + + ValueRef packed_weight = prepack_conv1d_pw_weight(graph, weight_data); + + bool has_bias = graph.val_is_not_none(bias); + ValueRef packed_bias = kDummyValueRef; + if (has_bias) { + packed_bias = prepack_standard( + graph, bias, graph.storage_type_of(out), utils::kWidthPacked); + } + + std::vector out_sizes = graph.sizes_of(out); + int64_t C_out = out_sizes.at(1); + ValueRef C_out_ref = graph.add_scalar(C_out); + ValueRef has_bias_ref = graph.add_scalar(has_bias); + + Conv1dPWIntParams int_params{1}; + Conv1dPWBiasParams bias_params{1.0f, 1.0f}; + + std::vector read_inputs = {in, packed_weight}; + if (has_bias) { + read_inputs.push_back(packed_bias); + } + + std::vector push_constants = { + 
PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)), + }; + if (has_bias) { + push_constants.push_back( + PushConstantDataInfo(&bias_params, sizeof(Conv1dPWBiasParams))); + } + + vkapi::ParamsBindList shader_params = { + graph.sizes_ubo(in), graph.sizes_ubo(out)}; + if (has_bias) { + shader_params.append(graph.sizes_ubo(packed_bias)); + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_conv1d_pw_shader, + pick_conv1d_pw_global_wg_size, + pick_hw_square_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {read_inputs, vkapi::kRead}}, + // Shader params buffers + shader_params, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize Args + {C_out_ref, has_bias_ref}, + // Resizing Logic + resize_conv1d_pw_node)); +} + +void conv1d_pw(ComputeGraph& graph, const std::vector& args) { + // args: in, weight, bias, stride, padding, dilation, groups, out + ValueRef in = args[0]; + ValueRef weight = args[1]; + ValueRef bias = args[2]; + ValueRef out = args[7]; + + const std::vector weight_sizes = graph.sizes_of(weight); + VK_CHECK_COND( + weight_sizes.at(2) == 1, "conv1d_pw only supports kernel_size=1"); + VK_CHECK_COND( + graph.get_int(args[6]) == 1, "conv1d_pw only supports groups=1"); + + add_conv1d_pw_node(graph, in, weight, bias, out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.conv1d_pw.default, conv1d_pw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp b/backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp new file mode 100644 index 00000000000..fd4a42eecd9 --- /dev/null +++ b/backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace vkcompute { + +void test_conv1d_pw(ComputeGraph& graph, const std::vector& args) { + // args: in, weight, bias, stride, padding, dilation, groups, out + VK_GET_OP_FN("et_vk.conv1d_pw.default")(graph, args); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(test_etvk.test_conv1d_pw.default, test_conv1d_pw); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 84432bce30b..1f0cecc1c0c 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -102,3 +102,4 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("test_mm") define_custom_op_test_binary("test_conv2d_pw") define_custom_op_test_binary("test_conv2d_dw") + define_custom_op_test_binary("test_conv1d_pw") diff --git a/backends/vulkan/test/custom_ops/test_conv1d_pw.cpp b/backends/vulkan/test/custom_ops/test_conv1d_pw.cpp new file mode 100644 index 00000000000..632224c478d --- /dev/null +++ b/backends/vulkan/test/custom_ops/test_conv1d_pw.cpp @@ -0,0 +1,248 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include + +#include +#include + +#include "utils.h" + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 256; + +struct Conv1dPWConfig { + int64_t N; + int64_t C_in; + int64_t C_out; + int64_t L; + bool has_bias; +}; + +static TestCase create_conv1d_pw_test_case( + const Conv1dPWConfig& config, + vkapi::ScalarType dtype, + utils::StorageType storage_type) { + TestCase test_case; + + bool is_perf = config.C_in > kRefDimSizeLimit || + config.C_out > kRefDimSizeLimit || config.L > kRefDimSizeLimit; + + std::string prefix = is_perf ? 
"PERF" : "ACCU"; + std::string storage_str = storage_type_abbrev(storage_type); + std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32"; + + std::string bias_str = config.has_bias ? "+bias" : ""; + + std::string name = prefix + " conv1d_pw" + bias_str + " [" + + std::to_string(config.N) + "," + std::to_string(config.C_in) + "," + + std::to_string(config.L) + "]x[" + std::to_string(config.C_out) + "," + + std::to_string(config.C_in) + ",1] " + storage_str + "(HP) " + dtype_str; + + test_case.set_name(name); + test_case.set_operator_name("test_etvk.test_conv1d_pw.default"); + + // Input: [N, C_in, L] height-packed + ValueSpec input( + {config.N, config.C_in, config.L}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::RANDOM); + test_case.add_input_spec(input); + + // Weight: [C_out, C_in, 1] height-packed, constant + ValueSpec weight( + {config.C_out, config.C_in, 1}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::RANDOM); + weight.set_constant(true); + test_case.add_input_spec(weight); + + // Bias: [C_out] or None + if (config.has_bias) { + ValueSpec bias( + {config.C_out}, + dtype, + storage_type, + utils::kWidthPacked, + DataGenType::RANDOM); + bias.set_constant(true); + test_case.add_input_spec(bias); + } else { + ValueSpec none_bias(static_cast(0)); + none_bias.set_none(true); + test_case.add_input_spec(none_bias); + } + + // stride = [1] + test_case.add_input_spec(ValueSpec(std::vector{1})); + // padding = [0] + test_case.add_input_spec(ValueSpec(std::vector{0})); + // dilation = [1] + test_case.add_input_spec(ValueSpec(std::vector{1})); + // groups = 1 + test_case.add_input_spec(ValueSpec(static_cast(1))); + + // Output: [N, C_out, L] height-packed + ValueSpec output( + {config.N, config.C_out, config.L}, + dtype, + storage_type, + utils::kHeightPacked, + DataGenType::ZEROS); + test_case.add_output_spec(output); + + if (dtype == vkapi::kHalf) { + test_case.set_abs_tolerance(1e-1f); + test_case.set_rel_tolerance(1e-2f); 
+ } else { + test_case.set_abs_tolerance(1e-3f); + test_case.set_rel_tolerance(1e-3f); + } + + test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"}); + + return test_case; +} + +static void conv1d_pw_reference_impl(TestCase& test_case) { + const auto& input_spec = test_case.inputs()[0]; + const auto& weight_spec = test_case.inputs()[1]; + const auto& bias_spec = test_case.inputs()[2]; + ValueSpec& output = test_case.outputs()[0]; + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Reference only supports float"); + } + + auto in_sizes = input_spec.get_tensor_sizes(); + auto w_sizes = weight_spec.get_tensor_sizes(); + + int64_t N = in_sizes[0]; + int64_t C_in = in_sizes[1]; + int64_t L = in_sizes[2]; + int64_t C_out = w_sizes[0]; + + const auto& in_data = input_spec.get_float_data(); + const auto& w_data = weight_spec.get_float_data(); + auto& ref_data = output.get_ref_float_data(); + ref_data.resize(N * C_out * L, 0.0f); + + // input is NCHW-contiguous: [N, C_in, L] + // weight is [C_out, C_in, 1] + for (int64_t n = 0; n < N; ++n) { + for (int64_t oc = 0; oc < C_out; ++oc) { + for (int64_t l = 0; l < L; ++l) { + float sum = 0.0f; + for (int64_t ic = 0; ic < C_in; ++ic) { + sum += in_data[n * C_in * L + ic * L + l] * w_data[oc * C_in + ic]; + } + ref_data[n * C_out * L + oc * L + l] = sum; + } + } + } + + if (!bias_spec.is_none()) { + const auto& bias_data = bias_spec.get_float_data(); + for (int64_t n = 0; n < N; ++n) { + for (int64_t oc = 0; oc < C_out; ++oc) { + for (int64_t l = 0; l < L; ++l) { + ref_data[n * C_out * L + oc * L + l] += bias_data[oc]; + } + } + } + } +} + +static std::vector generate_conv1d_pw_test_cases() { + std::vector test_cases; + + std::vector storage_types = { + utils::kTexture3D, utils::kBuffer}; + + // Accuracy shapes (float, small) + std::vector accu_configs = { + {1, 16, 32, 64, false}, + {1, 16, 32, 64, true}, + {1, 32, 16, 128, false}, + {1, 32, 16, 128, true}, + {1, 64, 64, 32, false}, + {1, 128, 
256, 16, true}, + {2, 16, 32, 64, false}, + {2, 16, 32, 64, true}, + // Non-aligned channel counts (not a multiple of 4) + {1, 5, 7, 64, false}, + {1, 5, 7, 64, true}, + {1, 13, 17, 48, false}, + {1, 13, 17, 48, true}, + {1, 7, 5, 32, false}, + {2, 5, 13, 64, true}, + }; + + for (const auto& cfg : accu_configs) { + for (auto st : storage_types) { + test_cases.push_back(create_conv1d_pw_test_case(cfg, vkapi::kFloat, st)); + } + } + + // Performance shapes (half + float) + std::vector perf_configs = { + {1, 256, 512, 1024, false}, + {1, 256, 512, 1024, true}, + {1, 512, 256, 2048, false}, + {1, 128, 128, 4096, true}, + }; + + for (const auto& cfg : perf_configs) { + for (auto st : storage_types) { + test_cases.push_back(create_conv1d_pw_test_case(cfg, vkapi::kFloat, st)); + test_cases.push_back(create_conv1d_pw_test_case(cfg, vkapi::kHalf, st)); + } + } + + return test_cases; +} + +static int64_t conv1d_pw_flop_calculator(const TestCase& test_case) { + auto in_sizes = test_case.inputs()[0].get_tensor_sizes(); + auto w_sizes = test_case.inputs()[1].get_tensor_sizes(); + + int64_t N = in_sizes[0]; + int64_t C_in = in_sizes[1]; + int64_t L = in_sizes[2]; + int64_t C_out = w_sizes[0]; + + return 2 * N * C_in * C_out * L; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Conv1d Pointwise (Height-Packed) Benchmark" << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = conv1d_pw_reference_impl; + + auto results = execute_test_cases( + generate_conv1d_pw_test_cases, + conv1d_pw_flop_calculator, + "Conv1dPW", + 3, + 10, + ref_fn); + + return 0; +} From 5cb8a14d967d0f380a104aeae15ef53e57ea2f99 Mon Sep 17 00:00:00 2001 From: ssjia Date: Thu, 19 Mar 2026 15:48:42 -0700 Subject: [PATCH 2/2] Update on "[ET-VK][conv1d] Implement height-packed pointwise conv1d operator" MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a new conv1d pointwise (kernel_size=1) operator using height-packed layout where channels are the packed dimension (WHCN dim 1). This enables dot-product reduction over input channels: each vec4 load gives 4 consecutive channel values, yielding 4 MACs per dot() instruction. Uses tiled computation with the FP tile infrastructure from linear/matmul (FPInputTile, FPWeightTile, FPOutTile, fp_accumulate_with_fp_weight) and 4OC×4IC blocked weight packing via pack_fp_linear_weight.glsl for cache-friendly texture2d weight reads. Adaptive tile_m selection (4/2/1 rows) based on GPU occupancy. Thread mapping: X=OC4 tiles, Y=L tiles, Z=batch. Each thread computes TILE_M×TILE_N4×4 output elements. Inner loop loads input tiles and packed weight tiles, then calls fp_accumulate_with_fp_weight for tiled FMA. Supports both buffer and texture3d storage for input/output, texture2d or buffer for packed weights, fp32/fp16, and optional bias. Registered as et_vk.conv1d_pw.default (standalone custom op for testing/benchmarking). 
Performance on Adreno 750 (S24): - [1,256,1024]x[512,256,1] texture f16: 908 GFLOP/s - [1,512,2048]x[256,512,1] texture f16: 865 GFLOP/s - [1,128,4096]x[128,128,1] texture f16: 781 GFLOP/s - [1,256,1024]x[512,256,1] buffer f16: 491 GFLOP/s Differential Revision: [D97344092](https://our.internmc.facebook.com/intern/diff/D97344092/) [ghstack-poisoned] --- .../runtime/graph/ops/glsl/conv1d_pw.glsl | 12 ++++++ .../runtime/graph/ops/impl/Conv1dPW.cpp | 38 ++++++++++++++----- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl index 463e69c823f..91ec2fbcab4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl @@ -56,10 +56,14 @@ $if HAS_BIAS: int weight_B; float alpha; float beta; + float output_min; + float output_max; }; $else: layout(push_constant) uniform restrict Block { int weight_B; + float output_min; + float output_max; }; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -190,5 +194,13 @@ void main() { } #endif + // Apply activation clamp + [[unroll]] for (int m = 0; m < TILE_M; ++m) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + out_tile.data[m][n4] = + clamp(out_tile.data[m][n4], VEC4_T(output_min), VEC4_T(output_max)); + } + } + store_output_tile_with_checks(out_tile, n4_start, m_start, b, N4, M); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp index 608dbeaabe1..6c9ba28384a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp @@ -16,6 +16,8 @@ #include +#include + namespace vkcompute { // Minimum number of thread groups to target for good GPU occupancy. 
@@ -117,11 +119,15 @@ void resize_conv1d_pw_node(
 
 struct Conv1dPWIntParams final {
   int32_t weight_B;
+  float output_min;
+  float output_max;
 };
 
 struct Conv1dPWBiasParams final {
   float alpha;
   float beta;
+  float output_min;
+  float output_max;
 };
 
 vkapi::ShaderInfo pick_conv1d_pw_shader(
@@ -181,7 +187,9 @@ void add_conv1d_pw_node(
     const ValueRef in,
     const ValueRef weight_data,
     const ValueRef bias,
-    const ValueRef out) {
+    const ValueRef out,
+    const float output_min = std::numeric_limits<float>::lowest(),
+    const float output_max = std::numeric_limits<float>::max()) {
   VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim);
   VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim);
 
@@ -199,20 +207,26 @@ void add_conv1d_pw_node(
   ValueRef C_out_ref = graph.add_scalar(C_out);
   ValueRef has_bias_ref = graph.add_scalar(has_bias);
 
-  Conv1dPWIntParams int_params{1};
-  Conv1dPWBiasParams bias_params{1.0f, 1.0f};
+  Conv1dPWIntParams int_params{1, output_min, output_max};
+  Conv1dPWBiasParams bias_params{1.0f, 1.0f, output_min, output_max};
 
   std::vector<ValueRef> read_inputs = {in, packed_weight};
   if (has_bias) {
     read_inputs.push_back(packed_bias);
   }
 
-  std::vector<PushConstantDataInfo> push_constants = {
-      PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)),
-  };
+  std::vector<PushConstantDataInfo> push_constants;
   if (has_bias) {
+    // The bias shader's push-constant block is
+    // {weight_B, alpha, beta, output_min, output_max}. Conv1dPWBiasParams
+    // starts at alpha, so weight_B must be pushed first as its own entry.
+    push_constants.push_back(PushConstantDataInfo(
+        &int_params.weight_B, sizeof(int_params.weight_B)));
     push_constants.push_back(
         PushConstantDataInfo(&bias_params, sizeof(Conv1dPWBiasParams)));
+  } else {
+    push_constants.push_back(
+        PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)));
   }
 
   vkapi::ParamsBindList shader_params = {
@@ -240,12 +254,19 @@ void add_conv1d_pw_node(
       resize_conv1d_pw_node));
 }
 
+// Args: in, weight, bias, stride, padding, dilation, groups,
+//       output_min, output_max, out
+// output_min and output_max may be kDummyValueRef (no clamp).
+// The legacy 8-argument form (no output_min/output_max; out at index 7) is
+// still accepted so existing callers keep working.
 void conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // args: in, weight, bias, stride, padding, dilation, groups, out
+  VK_CHECK_COND(
+      args.size() == 8 || args.size() == 10,
+      "conv1d_pw expects 8 or 10 arguments");
+  const bool has_clamp_args = args.size() == 10;
   ValueRef in = args[0];
   ValueRef weight = args[1];
   ValueRef bias = args[2];
-  ValueRef out = args[7];
+  ValueRef out = args.back();
 
   const std::vector<int64_t> weight_sizes = graph.sizes_of(weight);
   VK_CHECK_COND(
@@ -253,7 +275,16 @@ void conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   VK_CHECK_COND(
       graph.get_int(args[6]) == 1, "conv1d_pw only supports groups=1");
 
-  add_conv1d_pw_node(graph, in, weight, bias, out);
+  float output_min = std::numeric_limits<float>::lowest();
+  float output_max = std::numeric_limits<float>::max();
+  if (has_clamp_args && is_valid(args[7])) {
+    output_min = graph.extract_scalar<float>(args[7]);
+  }
+  if (has_clamp_args && is_valid(args[8])) {
+    output_max = graph.extract_scalar<float>(args[8]);
+  }
+
+  add_conv1d_pw_node(graph, in, weight, bias, out, output_min, output_max);
 }
 
 REGISTER_OPERATORS {