From b8ba5051dbb318e34a9a0b85beca5efa7528c388 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Thu, 19 Mar 2026 12:14:57 -0700
Subject: [PATCH 1/2] [ET-VK][conv1d] Implement height-packed pointwise conv1d
 operator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement a new conv1d pointwise (kernel_size=1) operator using height-packed
layout where channels are the packed dimension (WHCN dim 1). This enables
dot-product reduction over input channels: each vec4 load gives 4 consecutive
channel values, yielding 4 MACs per dot() instruction.

Uses tiled computation with the FP tile infrastructure from linear/matmul
(FPInputTile, FPWeightTile, FPOutTile, fp_accumulate_with_fp_weight) and
4OC×4IC blocked weight packing via pack_fp_linear_weight.glsl for
cache-friendly texture2d weight reads. Adaptive tile_m selection (4/2/1 rows)
based on GPU occupancy.

Thread mapping: X=OC4 tiles, Y=L tiles, Z=batch. Each thread computes
TILE_M×TILE_N4×4 output elements. Inner loop loads input tiles and packed
weight tiles, then calls fp_accumulate_with_fp_weight for tiled FMA.

Supports both buffer and texture3d storage for input/output, texture2d or
buffer for packed weights, fp32/fp16, and optional bias. Registered as
et_vk.conv1d_pw.default (standalone custom op for testing/benchmarking).

Performance on Adreno 750 (S24):
- [1,256,1024]x[512,256,1] texture f16: 908 GFLOP/s
- [1,512,2048]x[256,512,1] texture f16: 865 GFLOP/s
- [1,128,4096]x[128,128,1] texture f16: 781 GFLOP/s
- [1,256,1024]x[512,256,1] buffer f16: 491 GFLOP/s

Differential Revision: [D97344092](https://our.internmc.facebook.com/intern/diff/D97344092/)

[ghstack-poisoned]
---
 .../runtime/graph/ops/glsl/conv1d_pw.glsl     | 194 +++++++++++++
 .../runtime/graph/ops/glsl/conv1d_pw.yaml     |  41 +++
 .../runtime/graph/ops/impl/Conv1dPW.cpp       | 263 ++++++++++++++++++
 .../test/custom_ops/impl/TestConv1dPW.cpp     |  24 ++
 backends/vulkan/test/custom_ops/targets.bzl   |   1 +
 .../vulkan/test/custom_ops/test_conv1d_pw.cpp | 248 +++++++++++++++++
 6 files changed, 771 insertions(+)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp
 create mode 100644 backends/vulkan/test/custom_ops/test_conv1d_pw.cpp

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl
new file mode 100644
index 00000000000..463e69c823f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+#define T ${texel_load_component_type(DTYPE, STORAGE)}
+
+$if STORAGE == "buffer":
+  #define OUTPUT_BUFFER
+  #define INPUT_BUFFER
+$if WEIGHT_STORAGE == "buffer":
+  #define WEIGHT_BUFFER
+$if HAS_BIAS:
+  #define HAS_BIAS
+$if STORAGE == "buffer" and HAS_BIAS:
+  #define BIAS_BUFFER
+
+#define TILE_M4 ${TILE_M4}
+#define TILE_K4 ${TILE_K4}
+#define TILE_N4 ${TILE_N4}
+
+#define TILE_M ${TILE_M}
+#define TILE_K ${TILE_K4 * 4}
+#define TILE_N ${TILE_N4 * 4}
+
+${define_required_extensions(STORAGE, DTYPE)}
+$if WEIGHT_STORAGE != STORAGE:
+  ${define_required_extensions(WEIGHT_STORAGE, DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, WEIGHT_STORAGE, is_scalar_array=False)}
+$if HAS_BIAS:
+  ${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE, is_scalar_array=False)}
+
+// in_sizes: {L, C_in, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+// out_sizes: {L, C_out, N, 1} in WHCN order
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+$if HAS_BIAS:
+  ${layout_declare_ubo(B, "ivec4", "bias_sizes")}
+
+$if HAS_BIAS:
+  layout(push_constant) uniform restrict Block {
+    int weight_B;
+    float alpha;
+    float beta;
+  };
+$else:
+  layout(push_constant) uniform restrict Block {
+    int weight_B;
+  };
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "linear_fp_input_tile.glslh"
+#include "linear_fp_weight_tile.glslh"
+#include "linear_fp_output_tile.glslh"
+#include "linear_fp_packed_weight_tile_load.glslh"
+#include "linear_fp_output_tile_fp_compute.glslh"
+
+// Conv1d pointwise is matrix multiplication with swapped texture coordinates.
+// Linear: input ivec3(k4, m, b), output ivec3(n4, m, b)  [width-packed]
+// Conv1d: input ivec3(m, k4, b), output ivec3(m, n4, b)  [height-packed]
+// Buffer indexing is identical: (b * M + m) * K4 + k4
+
+VEC4_T load_input_x4(
+    const int k4,
+    const int m,
+    const int b,
+    const int K4,
+    const int M) {
+#ifdef INPUT_BUFFER
+  return t_in[(b * M + m) * K4 + k4];
+#else
+  return texelFetch(t_in, ivec3(m, k4, b), 0);
+#endif
+}
+
+void load_input_tile_with_checks(
+    out FPInputTile tile,
+    const int k4_start,
+    const int m_start,
+    const int b,
+    const int K4,
+    const int M) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) {
+      if (k4_start + k4 < K4 && m_start + m < M) {
+        tile.data[m][k4] =
+            load_input_x4(k4_start + k4, m_start + m, b, K4, M);
+      } else {
+        tile.data[m][k4] = VEC4_T(0.0);
+      }
+    }
+  }
+}
+
+void store_output_x4(
+    const VEC4_T texel,
+    const int n4,
+    const int m,
+    const int b,
+    const int N4,
+    const int M) {
+#ifdef OUTPUT_BUFFER
+  t_out[(b * M + m) * N4 + n4] = texel;
+#else
+  imageStore(t_out, ivec3(m, n4, b), texel);
+#endif
+}
+
+void store_output_tile_with_checks(
+    const FPOutTile out_tile,
+    const int n4_start,
+    const int m_start,
+    const int b,
+    const int N4,
+    const int M) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      if (m_start + m < M && n4_start + n4 < N4) {
+        store_output_x4(
+            out_tile.data[m][n4], n4_start + n4, m_start + m, b, N4, M);
+      }
+    }
+  }
+}
+
+void main() {
+  // Thread mapping: X=OC4 (N4), Y=L/tile_m (M tiles), Z=batch
+  const int tile_idx_n = int(gl_GlobalInvocationID.x);
+  const int tile_idx_m = int(gl_GlobalInvocationID.y);
+
+  const int n4_start = tile_idx_n * TILE_N4;
+  const int m_start = tile_idx_m * TILE_M;
+
+  // in_sizes: {L, C_in, N, 1} in WHCN
+  const int K = in_sizes.y;  // C_in
+  const int M = in_sizes.x;  // L
+  const int K4 = div_up_4(K);
+  // out_sizes: {L, C_out, N, 1} in WHCN
+  const int N_out = out_sizes.y; // C_out
+  const int N4 = div_up_4(N_out);
+
+  if (n4_start >= N4 || m_start >= M) {
+    return;
+  }
+
+  FPOutTile out_tile;
+  initialize(out_tile);
+
+  FPInputTile in_tile;
+  FPWeightTile w_tile;
+
+  const int b = int(gl_GlobalInvocationID.z);
+
+  for (int k4 = 0; k4 < K4; k4++) {
+    load_input_tile_with_checks(in_tile, k4, m_start, b, K4, M);
+    load_packed_weight_tile_with_checks(w_tile, n4_start, k4, 0, N4, K4);
+    fp_accumulate_with_fp_weight(out_tile, in_tile, w_tile);
+  }
+
+#ifdef HAS_BIAS
+  // Load bias (per output channel, width-packed) and apply
+  [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+    VEC4_T bias_val = VEC4_T(0.0);
+    if (n4_start + n4 < N4) {
+#ifdef BIAS_BUFFER
+      bias_val = t_bias[n4_start + n4];
+#else
+      bias_val = texelFetch(t_bias, ivec3(n4_start + n4, 0, 0), 0);
+#endif
+    }
+    [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+      out_tile.data[m][n4] =
+          VEC4_T(alpha) * out_tile.data[m][n4] + VEC4_T(beta) * bias_val;
+    }
+  }
+#endif
+
+  store_output_tile_with_checks(out_tile, n4_start, m_start, b, N4, M);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml
new file mode 100644
index 00000000000..473b291934f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.yaml
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv1d_pw:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+    WEIGHT_STORAGE: texture2d
+    HAS_BIAS: false
+    TILE_M4: 1
+    TILE_K4: 1
+    TILE_N4: 1
+    TILE_M: 4
+  generate_variant_forall:
+    combination:
+      parameter_names: [STORAGE, WEIGHT_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture2d]
+        - parameter_values: [texture3d, buffer]
+        - parameter_values: [buffer, texture2d]
+        - parameter_values: [buffer, buffer]
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: conv1d_pw
+    - NAME: conv1d_pw_tile_row_2
+      TILE_M: 2
+    - NAME: conv1d_pw_tile_row_1
+      TILE_M: 1
+    - NAME: conv1d_pw_bias
+      HAS_BIAS: true
+    - NAME: conv1d_pw_bias_tile_row_2
+      HAS_BIAS: true
+      TILE_M: 2
+    - NAME: conv1d_pw_bias_tile_row_1
+      HAS_BIAS: true
+      TILE_M: 1
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp
new file mode 100644
index 00000000000..608dbeaabe1
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Linear.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+// Minimum number of thread groups to target for good GPU occupancy.
+static constexpr uint32_t kMinOccupancyThreads = 4096;
+
+// Returns the tile_m (1, 2, or 4) for the conv1d_pw shader. tile_m tiles the
+// L (spatial) dimension. The largest tile whose dispatch still has at least
+// kMinOccupancyThreads invocations is chosen; otherwise tile_m falls back to 1.
+static uint32_t
+pick_conv1d_pw_tile_m(uint32_t C_out, uint32_t L, uint32_t N_batch) {
+  uint32_t n_groups = utils::div_up_4(C_out);
+  for (uint32_t tile_m : {4u, 2u, 1u}) {
+    uint32_t total = n_groups * utils::div_up(L, tile_m) * N_batch;
+    if (total >= kMinOccupancyThreads) {
+      return tile_m;
+    }
+  }
+  return 1u;
+}
+
+// Prepack conv1d_pw weight [C_out, C_in, 1] into 4OC x 4IC blocked layout.
+// This is equivalent to prepack_fp_linear_weight with N=C_out, K=C_in,
+// is_transposed=true, but extracts dimensions from the conv weight shape.
+static ValueRef prepack_conv1d_pw_weight(
+    ComputeGraph& graph,
+    const ValueRef weight_data) {
+  std::vector<int64_t> weight_sizes = graph.sizes_of(weight_data);
+  // weight is [C_out, C_in, 1]
+  int64_t N = weight_sizes.at(0); // C_out
+  int64_t K = weight_sizes.at(1); // C_in
+
+  int64_t K4 = utils::div_up(K, int64_t(4));
+  int64_t N4 = utils::div_up(N, int64_t(4));
+
+  // Packed tensor: K4 rows, N4*4 vec4 elements per row.
+  int64_t output_height = K4;
+  int64_t output_width = N4 * 4 * 4;
+
+  utils::StorageType weight_storage = utils::kTexture2D;
+  uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim();
+  if (output_width / 4 > max_extent ||
+      static_cast<uint32_t>(output_height) > max_extent) {
+    weight_storage = utils::kBuffer;
+  }
+
+  ValueRef packed_weight = graph.add_tensor(
+      {output_height, output_width},
+      graph.dtype_of(weight_data),
+      weight_storage,
+      utils::kWidthPacked);
+
+  utils::uvec3 global_wg_size = {
+      utils::safe_downcast<uint32_t>(N4),
+      utils::safe_downcast<uint32_t>(K4),
+      1u};
+
+  struct PackParams {
+    int32_t N;
+    int32_t K;
+    int32_t B;
+    int32_t is_transposed;
+  };
+  PackParams pack_params{
+      utils::safe_downcast<int32_t>(N), utils::safe_downcast<int32_t>(K), 1, 1};
+
+  std::string kernel_name = "pack_fp_linear_weight";
+  add_storage_type_suffix(kernel_name, weight_storage);
+  add_dtype_suffix(kernel_name, graph.dtype_of(weight_data));
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      weight_data,
+      packed_weight,
+      {},
+      {},
+      {PushConstantDataInfo(&pack_params, sizeof(PackParams))}));
+
+  return packed_weight;
+}
+
+void resize_conv1d_pw_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  // Resizes `out` to the input shape with the channel dim (second from the
+  // end) replaced by C_out (extra_args[0]). Indexing from the end instead of
+  // assuming rank 3 matches pick_conv1d_pw_shader, which tolerates rank < 3.
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef self = args.at(1).refs.at(0);
+  const int64_t C_out = graph->get_int(extra_args.at(0));
+
+  std::vector<int64_t> out_sizes = graph->sizes_of(self);
+  out_sizes.at(out_sizes.size() - 2) = C_out;
+  graph->virtual_resize(out, out_sizes);
+}
+
+struct Conv1dPWIntParams final {
+  int32_t weight_B;
+};
+
+struct Conv1dPWBiasParams final {
+  float alpha;
+  float beta;
+};
+
+vkapi::ShaderInfo pick_conv1d_pw_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef packed_weight = args.at(1).refs.at(1);
+  bool has_bias = graph->get_bool(resize_args.at(1));
+
+  // out is [N_batch, C_out, L]; in WHCN: {L, C_out, N_batch, 1}
+  uint32_t C_out = graph->size_at<uint32_t>(-2, out);
+  uint32_t L = graph->size_at<uint32_t>(-1, out);
+  uint32_t N_batch =
+      graph->dim_of(out) >= 3 ? graph->size_at<uint32_t>(-3, out) : 1;
+  uint32_t tile_m = pick_conv1d_pw_tile_m(C_out, L, N_batch);
+
+  std::string kernel_name;
+  if (has_bias) {
+    kernel_name = tile_m <= 1 ? "conv1d_pw_bias_tile_row_1"
+        : tile_m <= 2         ? "conv1d_pw_bias_tile_row_2"
+                              : "conv1d_pw_bias";
+  } else {
+    kernel_name = tile_m <= 1 ? "conv1d_pw_tile_row_1"
+        : tile_m <= 2         ? "conv1d_pw_tile_row_2"
+                              : "conv1d_pw";
+  }
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph->storage_type_of(out));
+  add_storage_type_suffix(kernel_name, graph->storage_type_of(packed_weight));
+  add_dtype_suffix(kernel_name, graph->dtype_of(out));
+  return VK_KERNEL_FROM_STR(kernel_name);
+}
+
+utils::uvec3 pick_conv1d_pw_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef out_ref = args.at(0).refs.at(0);
+
+  // Sizes are indexed from the innermost dim: -1 = L, -2 = C_out, -3 = batch.
+  const uint32_t len = graph->size_at<uint32_t>(-1, out_ref);
+  const uint32_t c_out = graph->size_at<uint32_t>(-2, out_ref);
+  uint32_t batch = 1; // an output without a batch dim runs as one batch
+  if (graph->dim_of(out_ref) >= 3) {
+    batch = graph->size_at<uint32_t>(-3, out_ref);
+  }
+  const uint32_t tile_m = pick_conv1d_pw_tile_m(c_out, len, batch);
+  return {utils::div_up_4(c_out), utils::div_up(len, tile_m), batch};
+}
+
+void add_conv1d_pw_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef out) {
+  VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim);
+  VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim);
+
+  ValueRef packed_weight = prepack_conv1d_pw_weight(graph, weight_data);
+
+  bool has_bias = graph.val_is_not_none(bias);
+  ValueRef packed_bias = kDummyValueRef;
+  if (has_bias) {
+    packed_bias = prepack_standard(
+        graph, bias, graph.storage_type_of(out), utils::kWidthPacked);
+  }
+
+  std::vector<int64_t> out_sizes = graph.sizes_of(out);
+  int64_t C_out = out_sizes.at(1);
+  ValueRef C_out_ref = graph.add_scalar(C_out);
+  ValueRef has_bias_ref = graph.add_scalar(has_bias);
+
+  Conv1dPWIntParams int_params{1};
+  Conv1dPWBiasParams bias_params{1.0f, 1.0f};
+
+  std::vector<ValueRef> read_inputs = {in, packed_weight};
+  if (has_bias) {
+    read_inputs.push_back(packed_bias);
+  }
+
+  std::vector<PushConstantDataInfo> push_constants = {
+      PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)),
+  };
+  if (has_bias) {
+    push_constants.push_back(
+        PushConstantDataInfo(&bias_params, sizeof(Conv1dPWBiasParams)));
+  }
+
+  vkapi::ParamsBindList shader_params = {
+      graph.sizes_ubo(in), graph.sizes_ubo(out)};
+  if (has_bias) {
+    shader_params.append(graph.sizes_ubo(packed_bias));
+  }
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      pick_conv1d_pw_shader,
+      pick_conv1d_pw_global_wg_size,
+      pick_hw_square_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {read_inputs, vkapi::kRead}},
+      // Shader params buffers
+      shader_params,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      {},
+      // Resize Args
+      {C_out_ref, has_bias_ref},
+      // Resizing Logic
+      resize_conv1d_pw_node));
+}
+
+void conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args: in, weight, bias, stride, padding, dilation, groups, out
+  ValueRef in = args[0];
+  ValueRef weight = args[1];
+  ValueRef bias = args[2];
+  ValueRef out = args[7];
+
+  const std::vector<int64_t> weight_sizes = graph.sizes_of(weight);
+  VK_CHECK_COND(
+      weight_sizes.at(2) == 1, "conv1d_pw only supports kernel_size=1");
+  VK_CHECK_COND(
+      graph.get_int(args[6]) == 1, "conv1d_pw only supports groups=1");
+
+  add_conv1d_pw_node(graph, in, weight, bias, out);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(et_vk.conv1d_pw.default, conv1d_pw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp b/backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp
new file mode 100644
index 00000000000..fd4a42eecd9
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestConv1dPW.cpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+
+namespace vkcompute {
+
+void test_conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args: in, weight, bias, stride, padding, dilation, groups, out
+  VK_GET_OP_FN("et_vk.conv1d_pw.default")(graph, args);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_conv1d_pw.default, test_conv1d_pw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index 84432bce30b..1f0cecc1c0c 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -102,3 +102,4 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_mm")
     define_custom_op_test_binary("test_conv2d_pw")
     define_custom_op_test_binary("test_conv2d_dw")
+    define_custom_op_test_binary("test_conv1d_pw")
diff --git a/backends/vulkan/test/custom_ops/test_conv1d_pw.cpp b/backends/vulkan/test/custom_ops/test_conv1d_pw.cpp
new file mode 100644
index 00000000000..632224c478d
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv1d_pw.cpp
@@ -0,0 +1,248 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 256;
+
+struct Conv1dPWConfig {
+  int64_t N;
+  int64_t C_in;
+  int64_t C_out;
+  int64_t L;
+  bool has_bias;
+};
+
+static TestCase create_conv1d_pw_test_case(
+    const Conv1dPWConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type) {
+  TestCase test_case;
+
+  bool is_perf = config.C_in > kRefDimSizeLimit ||
+      config.C_out > kRefDimSizeLimit || config.L > kRefDimSizeLimit;
+
+  std::string prefix = is_perf ? "PERF" : "ACCU";
+  std::string storage_str = storage_type_abbrev(storage_type);
+  std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32";
+
+  std::string bias_str = config.has_bias ? "+bias" : "";
+
+  std::string name = prefix + "  conv1d_pw" + bias_str + " [" +
+      std::to_string(config.N) + "," + std::to_string(config.C_in) + "," +
+      std::to_string(config.L) + "]x[" + std::to_string(config.C_out) + "," +
+      std::to_string(config.C_in) + ",1]  " + storage_str + "(HP) " + dtype_str;
+
+  test_case.set_name(name);
+  test_case.set_operator_name("test_etvk.test_conv1d_pw.default");
+
+  // Input: [N, C_in, L] height-packed
+  ValueSpec input(
+      {config.N, config.C_in, config.L},
+      dtype,
+      storage_type,
+      utils::kHeightPacked,
+      DataGenType::RANDOM);
+  test_case.add_input_spec(input);
+
+  // Weight: [C_out, C_in, 1] height-packed, constant
+  ValueSpec weight(
+      {config.C_out, config.C_in, 1},
+      dtype,
+      storage_type,
+      utils::kHeightPacked,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+  test_case.add_input_spec(weight);
+
+  // Bias: [C_out] or None
+  if (config.has_bias) {
+    ValueSpec bias(
+        {config.C_out},
+        dtype,
+        storage_type,
+        utils::kWidthPacked,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  } else {
+    ValueSpec none_bias(static_cast<int32_t>(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  }
+
+  // stride = [1]
+  test_case.add_input_spec(ValueSpec(std::vector<int32_t>{1}));
+  // padding = [0]
+  test_case.add_input_spec(ValueSpec(std::vector<int32_t>{0}));
+  // dilation = [1]
+  test_case.add_input_spec(ValueSpec(std::vector<int32_t>{1}));
+  // groups = 1
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(1)));
+
+  // Output: [N, C_out, L] height-packed
+  ValueSpec output(
+      {config.N, config.C_out, config.L},
+      dtype,
+      storage_type,
+      utils::kHeightPacked,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+
+  if (dtype == vkapi::kHalf) {
+    test_case.set_abs_tolerance(1e-1f);
+    test_case.set_rel_tolerance(1e-2f);
+  } else {
+    test_case.set_abs_tolerance(1e-3f);
+    test_case.set_rel_tolerance(1e-3f);
+  }
+
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+
+static void conv1d_pw_reference_impl(TestCase& test_case) {
+  const auto& input_spec = test_case.inputs()[0];
+  const auto& weight_spec = test_case.inputs()[1];
+  const auto& bias_spec = test_case.inputs()[2];
+  ValueSpec& output = test_case.outputs()[0];
+
+  if (input_spec.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float");
+  }
+
+  auto in_sizes = input_spec.get_tensor_sizes();
+  auto w_sizes = weight_spec.get_tensor_sizes();
+
+  int64_t N = in_sizes[0];
+  int64_t C_in = in_sizes[1];
+  int64_t L = in_sizes[2];
+  int64_t C_out = w_sizes[0];
+
+  const auto& in_data = input_spec.get_float_data();
+  const auto& w_data = weight_spec.get_float_data();
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C_out * L, 0.0f);
+
+  // input is NCHW-contiguous: [N, C_in, L]
+  // weight is [C_out, C_in, 1]
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t oc = 0; oc < C_out; ++oc) {
+      for (int64_t l = 0; l < L; ++l) {
+        float sum = 0.0f;
+        for (int64_t ic = 0; ic < C_in; ++ic) {
+          sum += in_data[n * C_in * L + ic * L + l] * w_data[oc * C_in + ic];
+        }
+        ref_data[n * C_out * L + oc * L + l] = sum;
+      }
+    }
+  }
+
+  if (!bias_spec.is_none()) {
+    const auto& bias_data = bias_spec.get_float_data();
+    for (int64_t n = 0; n < N; ++n) {
+      for (int64_t oc = 0; oc < C_out; ++oc) {
+        for (int64_t l = 0; l < L; ++l) {
+          ref_data[n * C_out * L + oc * L + l] += bias_data[oc];
+        }
+      }
+    }
+  }
+}
+
+static std::vector<TestCase> generate_conv1d_pw_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  std::vector<utils::StorageType> storage_types = {
+      utils::kTexture3D, utils::kBuffer};
+
+  // Accuracy shapes (float, small)
+  std::vector<Conv1dPWConfig> accu_configs = {
+      {1, 16, 32, 64, false},
+      {1, 16, 32, 64, true},
+      {1, 32, 16, 128, false},
+      {1, 32, 16, 128, true},
+      {1, 64, 64, 32, false},
+      {1, 128, 256, 16, true},
+      {2, 16, 32, 64, false},
+      {2, 16, 32, 64, true},
+      // Non-aligned channel counts (not a multiple of 4)
+      {1, 5, 7, 64, false},
+      {1, 5, 7, 64, true},
+      {1, 13, 17, 48, false},
+      {1, 13, 17, 48, true},
+      {1, 7, 5, 32, false},
+      {2, 5, 13, 64, true},
+  };
+
+  for (const auto& cfg : accu_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(create_conv1d_pw_test_case(cfg, vkapi::kFloat, st));
+    }
+  }
+
+  // Performance shapes (half + float)
+  std::vector<Conv1dPWConfig> perf_configs = {
+      {1, 256, 512, 1024, false},
+      {1, 256, 512, 1024, true},
+      {1, 512, 256, 2048, false},
+      {1, 128, 128, 4096, true},
+  };
+
+  for (const auto& cfg : perf_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(create_conv1d_pw_test_case(cfg, vkapi::kFloat, st));
+      test_cases.push_back(create_conv1d_pw_test_case(cfg, vkapi::kHalf, st));
+    }
+  }
+
+  return test_cases;
+}
+
+static int64_t conv1d_pw_flop_calculator(const TestCase& test_case) {
+  // Each output element costs C_in multiply-adds, i.e. 2 * C_in FLOPs.
+  const auto input_dims = test_case.inputs()[0].get_tensor_sizes();
+  const auto weight_dims = test_case.inputs()[1].get_tensor_sizes();
+
+  const int64_t batch = input_dims[0];
+  const int64_t in_channels = input_dims[1];
+  const int64_t length = input_dims[2];
+  const int64_t out_channels = weight_dims[0];
+  return 2 * batch * in_channels * out_channels * length;
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Conv1d Pointwise (Height-Packed) Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = conv1d_pw_reference_impl;
+
+  auto results = execute_test_cases(
+      generate_conv1d_pw_test_cases,
+      conv1d_pw_flop_calculator,
+      "Conv1dPW",
+      3,
+      10,
+      ref_fn);
+
+  return 0;
+}

From 5cb8a14d967d0f380a104aeae15ef53e57ea2f99 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Thu, 19 Mar 2026 15:48:42 -0700
Subject: [PATCH 2/2] Update on "[ET-VK][conv1d] Implement height-packed
 pointwise conv1d operator"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement a new conv1d pointwise (kernel_size=1) operator using height-packed
layout where channels are the packed dimension (WHCN dim 1). This enables
dot-product reduction over input channels: each vec4 load gives 4 consecutive
channel values, yielding 4 MACs per dot() instruction.

Uses tiled computation with the FP tile infrastructure from linear/matmul
(FPInputTile, FPWeightTile, FPOutTile, fp_accumulate_with_fp_weight) and
4OC×4IC blocked weight packing via pack_fp_linear_weight.glsl for
cache-friendly texture2d weight reads. Adaptive tile_m selection (4/2/1 rows)
based on GPU occupancy.

Thread mapping: X=OC4 tiles, Y=L tiles, Z=batch. Each thread computes
TILE_M×TILE_N4×4 output elements. Inner loop loads input tiles and packed
weight tiles, then calls fp_accumulate_with_fp_weight for tiled FMA.

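Supports both buffer and texture3d storage for input/output, texture2d or
buffer for packed weights, fp32/fp16, and optional bias. Registered as
et_vk.conv1d_pw.default (standalone custom op for testing/benchmarking).

Changes in this update: the shader now clamps every output texel to
[output_min, output_max], passed as two new push constants, and the operator
args become (in, weight, bias, stride, padding, dilation, groups, output_min,
output_max, out). When a bound is not provided, the full float range is used
for that side of the clamp.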

Performance on Adreno 750 (S24):
- [1,256,1024]x[512,256,1] texture f16: 908 GFLOP/s
- [1,512,2048]x[256,512,1] texture f16: 865 GFLOP/s
- [1,128,4096]x[128,128,1] texture f16: 781 GFLOP/s
- [1,256,1024]x[512,256,1] buffer f16: 491 GFLOP/s

Differential Revision: [D97344092](https://our.internmc.facebook.com/intern/diff/D97344092/)

[ghstack-poisoned]
---
 .../runtime/graph/ops/glsl/conv1d_pw.glsl     | 12 ++++++
 .../runtime/graph/ops/impl/Conv1dPW.cpp       | 38 ++++++++++++++-----
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl
index 463e69c823f..91ec2fbcab4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d_pw.glsl
@@ -56,10 +56,14 @@ $if HAS_BIAS:
     int weight_B;
     float alpha;
     float beta;
+    float output_min;
+    float output_max;
   };
 $else:
   layout(push_constant) uniform restrict Block {
     int weight_B;
+    float output_min;
+    float output_max;
   };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -190,5 +194,13 @@ void main() {
   }
 #endif
 
+  // Apply activation clamp
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      out_tile.data[m][n4] =
+          clamp(out_tile.data[m][n4], VEC4_T(output_min), VEC4_T(output_max));
+    }
+  }
+
   store_output_tile_with_checks(out_tile, n4_start, m_start, b, N4, M);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp
index 608dbeaabe1..6c9ba28384a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv1dPW.cpp
@@ -16,6 +16,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
+#include <limits>
+
 namespace vkcompute {
 
 // Minimum number of thread groups to target for good GPU occupancy.
@@ -117,11 +119,15 @@ void resize_conv1d_pw_node(
 
 struct Conv1dPWIntParams final {
   int32_t weight_B;
+  float output_min;
+  float output_max;
 };
 
 struct Conv1dPWBiasParams final {
   float alpha;
   float beta;
+  float output_min;
+  float output_max;
 };
 
 vkapi::ShaderInfo pick_conv1d_pw_shader(
@@ -181,7 +187,9 @@ void add_conv1d_pw_node(
     const ValueRef in,
     const ValueRef weight_data,
     const ValueRef bias,
-    const ValueRef out) {
+    const ValueRef out,
+    const float output_min = std::numeric_limits<float>::lowest(),
+    const float output_max = std::numeric_limits<float>::max()) {
   VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kHeightDim);
   VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kHeightDim);
 
@@ -199,20 +207,28 @@ void add_conv1d_pw_node(
   ValueRef C_out_ref = graph.add_scalar(C_out);
   ValueRef has_bias_ref = graph.add_scalar(has_bias);
 
-  Conv1dPWIntParams int_params{1};
-  Conv1dPWBiasParams bias_params{1.0f, 1.0f};
+  Conv1dPWIntParams int_params{1, output_min, output_max};
+  Conv1dPWBiasParams bias_params{1.0f, 1.0f, output_min, output_max};
 
   std::vector<ValueRef> read_inputs = {in, packed_weight};
   if (has_bias) {
     read_inputs.push_back(packed_bias);
   }
 
-  std::vector<PushConstantDataInfo> push_constants = {
-      PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)),
-  };
+  // The pushed bytes must mirror the shader's push-constant Block:
+  //   without bias: {weight_B, output_min, output_max}
+  //   with bias:    {weight_B, alpha, beta, output_min, output_max}
+  std::vector<PushConstantDataInfo> push_constants;
   if (has_bias) {
+    // weight_B leads the Block in both variants; push it ahead of the bias
+    // params so alpha/beta/output_min/output_max land on their own offsets.
+    push_constants.push_back(PushConstantDataInfo(
+        &int_params.weight_B, sizeof(int_params.weight_B)));
     push_constants.push_back(
         PushConstantDataInfo(&bias_params, sizeof(Conv1dPWBiasParams)));
+  } else {
+    push_constants.push_back(
+        PushConstantDataInfo(&int_params, sizeof(Conv1dPWIntParams)));
   }
 
   vkapi::ParamsBindList shader_params = {
@@ -240,12 +249,20 @@ void add_conv1d_pw_node(
       resize_conv1d_pw_node));
 }
 
+// Args: in, weight, bias, stride, padding, dilation, groups,
+//       [output_min, output_max,] out
+// The 8-arg form (no clamp bounds) is still accepted so existing callers keep
+// working. In the 10-arg form either bound may be kDummyValueRef or None;
+// a missing bound leaves that side of the clamp open.
 void conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // args: in, weight, bias, stride, padding, dilation, groups, out
+  VK_CHECK_COND(
+      args.size() == 8 || args.size() == 10,
+      "conv1d_pw expects 8 args, or 10 with output_min/output_max");
+  const bool has_clamp_args = args.size() == 10;
   ValueRef in = args[0];
   ValueRef weight = args[1];
   ValueRef bias = args[2];
-  ValueRef out = args[7];
+  ValueRef out = args.back();
 
   const std::vector<int64_t> weight_sizes = graph.sizes_of(weight);
   VK_CHECK_COND(
@@ -253,7 +270,17 @@ void conv1d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   VK_CHECK_COND(
       graph.get_int(args[6]) == 1, "conv1d_pw only supports groups=1");
 
-  add_conv1d_pw_node(graph, in, weight, bias, out);
+  // Bounds that are absent keep the full float range for that side.
+  float output_min = std::numeric_limits<float>::lowest();
+  float output_max = std::numeric_limits<float>::max();
+  if (has_clamp_args && is_valid(args[7]) && graph.val_is_not_none(args[7])) {
+    output_min = graph.extract_scalar<float>(args[7]);
+  }
+  if (has_clamp_args && is_valid(args[8]) && graph.val_is_not_none(args[8])) {
+    output_max = graph.extract_scalar<float>(args[8]);
+  }
+
+  add_conv1d_pw_node(graph, in, weight, bias, out, output_min, output_max);
 }
 
 REGISTER_OPERATORS {