From 6da3593815e34e9b15e61c2e58ba838ee52de239 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 15:25:54 +0000
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20pass=20`std::string`=20by?=
 =?UTF-8?q?=20const=20reference=20to=20avoid=20unnecessary=20copies=20in?=
 =?UTF-8?q?=20RDMA=20ops?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh      | 8 ++------
 .../sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp       | 6 ++----
 .../sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp   | 6 ++----
 .../sm90_gemm_gated_tma_warpspecialized_cooperative.hpp  | 2 +-
 .../sm90_gemm_gated_tma_warpspecialized_pingpong.hpp     | 4 ++--
 custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h | 4 +---
 .../w4a8_moe/w4a8_moe_cutlass_kernel_template.cu         | 5 ++---
 .../cutlass_kernels/w4a8_moe/weight_process_utils.h      | 6 ++----
 .../gpu_ops/cutlass_kernels/weight_process_utils.h       | 6 ++----
 custom_ops/gpu_ops/flash_mask_attn/softmax.hpp           | 2 +-
 custom_ops/gpu_ops/helper.cu                             | 3 +--
 .../gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu    | 3 +--
 .../gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp      | 2 +-
 custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu          | 2 +-
 custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h    | 2 +-
 custom_ops/gpu_ops/sparse_indexer/exception.h            | 8 ++++----
 custom_ops/gpu_ops/tune_cublaslt_gemm.cu                 | 3 +--
 .../kvcache_transfer/include/kvcache_rdma.h              | 9 ++++++---
 .../kvcache_transfer/src/kvcache_rdma.cpp                | 6 +++---
 19 files changed, 36 insertions(+), 51 deletions(-)
diff --git a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
index cb4c25bcf4e..6c1360dd36a 100644
--- a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
+++ b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
@@ -53,13 +53,9 @@ struct Signal {
   alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };
 
-struct __align__(16) RankData {
-  const void* __restrict__ ptrs[8];
-};
+struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
 
-struct __align__(16) RankSignals {
-  Signal* signals[8];
-};
+struct __align__(16) RankSignals { Signal* signals[8]; };
 
 // like std::array, but aligned
 template <typename T, int sz>
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp
index f335ec2d399..f34015c39e0 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp
@@ -622,16 +622,14 @@ struct CollectiveMmaGated<
       } else {
         return thread_mma.partition_B(sAux);
       }
-    }
-    ();
+    }();
     auto tCrAux = [&]() -> auto {
       if constexpr (SwapAB) {
         return thread_mma.make_fragment_A(tCsAux);
       } else {
         return thread_mma.make_fragment_B(tCsAux);
       }
-    }
-    ();
+    }();
 
     CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0));  // M
     CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0));  // N
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp
index c34ad242e25..8a17ea8105b 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp
@@ -617,16 +617,14 @@ struct CollectiveMmaGated<
       } else {
         return thread_mma.partition_B(sAux);
       }
-    }
-    ();
+    }();
     auto tCrAux = [&]() -> auto {
       if constexpr (SwapAB) {
         return thread_mma.make_fragment_A(tCsAux);
       } else {
         return thread_mma.make_fragment_B(tCsAux);
       }
-    }
-    ();
+    }();
 
     CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0));  // M
     CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0));  // N
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp
index efb90a5e7c6..46273113a31 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp
@@ -635,7 +635,7 @@ class GemmUniversalGated<
         collective_epilogue.load_tail(epi_load_pipeline,
                                       epi_load_pipe_producer_state);
       }  // Epilogue Producer Warp End
-    }    // Producer Warp Group End
+    }  // Producer Warp Group End
 
     else if (warp_group_role == WarpGroupRole::Consumer0 ||
              warp_group_role == WarpGroupRole::Consumer1) {
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp
index 9609adc32a7..4e4b3e4a662 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp
@@ -618,7 +618,7 @@ class GemmUniversalGated<
         collective_epilogue.load_tail(epi_load_pipeline,
                                       epi_load_pipe_producer_state);
       }  // Epilogue Producer Warp End
-    }    // Producer Warp Group End
+    }  // Producer Warp Group End
 
     else if (warp_group_role == WarpGroupRole::Consumer0 ||
              warp_group_role == WarpGroupRole::Consumer1) {
@@ -714,7 +714,7 @@ class GemmUniversalGated<
         scheduler.advance_to_next_work(NumMmaWarpGroups);
         work_tile_info = scheduler.get_current_work();
       }  // Scheduler work fetch loop
-    }    // Consumer Warp Groups End
+    }  // Consumer Warp Groups End
 #endif
   }
 };
diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h
index 0927d327385..e8028e31d1a 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h
@@ -32,9 +32,7 @@
 // workspace for cublas gemm : 32MB
 #define CUBLAS_WORKSPACE_SIZE 33554432
 
-typedef struct __align__(4) {
-  half x, y, z, w;
-}
+typedef struct __align__(4) { half x, y, z, w; }
 half4;
 
 /* **************************** type definition ***************************** */
diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu
index cf02b86af19..f1b3f4ed5ab 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu
@@ -876,9 +876,8 @@ void W4A8MoeGemmRunner<OutputType, IntAType, IntBType>::moe_gemm(
         gemm_config_from_file_and_param.stages = max_total_rows_config.stages;
       }
     }
-    VLOG(1) << "W4A8 moe gemm "
-            << "total_rows: " << total_rows << " n: " << gemm_n
-            << " k: " << gemm_k
+    VLOG(1) << "W4A8 moe gemm " << "total_rows: " << total_rows
+            << " n: " << gemm_n << " k: " << gemm_k
             << "Using gemm config from config file: config_total_rows: "
             << best_total_rows << " config_n: " << best_n
             << " config_k: " << best_k << "tile_config: "
diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h
index dd6536c762e..91239a823ed 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h
@@ -433,8 +433,7 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor,
 
   const size_t rows_per_tile = 64;
   std::cout << "running interleave_column_major_tensor" << std::endl;
-  std::cout << "num_rows:" << num_rows << ","
-            << "num_cols:" << num_cols << ","
+  std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
             << "BITS_PER_ELT:" << BITS_PER_ELT << ","
             << "elts_in_int32:" << elts_in_int32 << ","
             << "rows_per_tile:" << rows_per_tile << std::endl;
@@ -486,8 +485,7 @@ void interleave_column_major_tensor_int4(int8_t* interleaved_quantized_tensor,
 
   const size_t rows_per_tile = 64;
   std::cout << "running interleave_column_major_tensor" << std::endl;
-  std::cout << "num_rows:" << num_rows << ","
-            << "num_cols:" << num_cols << ","
+  std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
             << "BITS_PER_ELT:" << BITS_PER_ELT << ","
             << "elts_in_int32:" << elts_in_int32 << ","
             << "rows_per_tile:" << rows_per_tile << std::endl;
diff --git a/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h b/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h
index 01a9201a0e2..f3217bea688 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h
@@ -434,8 +434,7 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor,
 
   const size_t rows_per_tile = 64;
   std::cout << "running interleave_column_major_tensor" << std::endl;
-  std::cout << "num_rows:" << num_rows << ","
-            << "num_cols:" << num_cols << ","
+  std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
             << "BITS_PER_ELT:" << BITS_PER_ELT << ","
             << "elts_in_int32:" << elts_in_int32 << ","
             << "rows_per_tile:" << rows_per_tile << std::endl;
@@ -487,8 +486,7 @@ void interleave_column_major_tensor_int4(int8_t* interleaved_quantized_tensor,
 
   const size_t rows_per_tile = 64;
   std::cout << "running interleave_column_major_tensor" << std::endl;
-  std::cout << "num_rows:" << num_rows << ","
-            << "num_cols:" << num_cols << ","
+  std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
             << "BITS_PER_ELT:" << BITS_PER_ELT << ","
             << "elts_in_int32:" << elts_in_int32 << ","
             << "rows_per_tile:" << rows_per_tile << std::endl;
diff --git a/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp b/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp
index 39d94a3b6e9..f1293762035 100644
--- a/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp
+++ b/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp
@@ -189,7 +189,7 @@ struct Softmax {
   using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
   TensorT row_max, row_sum;
 
-  CUTLASS_DEVICE Softmax(){};
+  CUTLASS_DEVICE Softmax() {};
 
   template <bool Is_first, bool Check_inf = false, typename Tensor0>
   __forceinline__ __device__ TensorT max(Tensor0 &acc_s,
diff --git a/custom_ops/gpu_ops/helper.cu b/custom_ops/gpu_ops/helper.cu
index 45a3660d399..458b74640ab 100644
--- a/custom_ops/gpu_ops/helper.cu
+++ b/custom_ops/gpu_ops/helper.cu
@@ -133,8 +133,7 @@ void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) {
       std::cout << "\nCall Line: " << call_line << "\t";
       for (int i = 0; i < visible_device_.size(); i++) {
         unsigned int device_id = visible_device_.at(i);
-        std::cout << "GPU " << device_id << ": "
-                  << "  Used memory: "
+        std::cout << "GPU " << device_id << ": " << "  Used memory: "
                   << visible_device_mem_usage_.at(device_id) / (1024 * 1024)
                   << " MB\t";
       }
diff --git a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu
index f56982747d1..ecf6ecbe2de 100644
--- a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu
+++ b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu
@@ -245,8 +245,7 @@ void SentKeyValueByRemotePtr(const paddle::Tensor& local_key_tensor,
   getNvidiaGPUMemoryUsage(__LINE__);
 
 #ifdef DEBUG_IPC_SENT
-  std::cout << "#### 1111"
-            << " remote_key_ptr: " << remote_key_ptr
+  std::cout << "#### 1111" << " remote_key_ptr: " << remote_key_ptr
             << " remote_value_ptr: " << remote_value_ptr << std::endl;
 #endif
   getNvidiaGPUMemoryUsage(__LINE__);
diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp
index bb808c2a115..bd46089a570 100644
--- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp
+++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp
@@ -168,7 +168,7 @@ struct Softmax {
   using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
   TensorT row_max, row_sum;
 
-  CUTLASS_DEVICE Softmax(){};
+  CUTLASS_DEVICE Softmax() {};
 
   template <bool Is_first, bool Check_inf = false, typename Tensor0>
   __forceinline__ __device__ TensorT max(Tensor0 &acc_s,
diff --git a/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu b/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu
index 8b83d44df21..d31043b7963 100644
--- a/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu
+++ b/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu
@@ -58,7 +58,7 @@ __global__ void permute_cols_kernel(
     const int32_t* __restrict__ num_tokens_past_padded_ptr,
     int size_m,
     int size_k,
-    int top_k){};
+    int top_k) {};
 
 }  // namespace marlin
 
diff --git a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h
index 79c51b4c8a7..863c3efa571 100644
--- a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h
+++ b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h
@@ -30,7 +30,7 @@ struct Tensor {
   Tensor() : raw_tensor_() {}
   Tensor(const Tensor &) = default;
   Tensor(Tensor &&) = default;
-  Tensor operator=(const Tensor &x) &noexcept {
+  Tensor operator=(const Tensor &x) & noexcept {
     raw_tensor_ = x.raw_tensor_;
     return *this;
   }
diff --git a/custom_ops/gpu_ops/sparse_indexer/exception.h b/custom_ops/gpu_ops/sparse_indexer/exception.h
index 5b6dde348f4..a83782fd68e 100644
--- a/custom_ops/gpu_ops/sparse_indexer/exception.h
+++ b/custom_ops/gpu_ops/sparse_indexer/exception.h
@@ -82,8 +82,8 @@ class Error : public std::exception {
         int line,
         const std::string& message) {
     std::ostringstream oss;
-    oss << "Error in function '" << func << "' "
-        << "at " << file << ":" << line << ": " << message;
+    oss << "Error in function '" << func << "' " << "at " << file << ":" << line
+        << ": " << message;
     message_ = oss.str();
   }
 
@@ -102,8 +102,8 @@ class Warning {
           int line,
           const std::string& message) {
     std::ostringstream oss;
-    oss << "Warning in function '" << func << "' "
-        << "at " << file << ":" << line << ": " << message;
+    oss << "Warning in function '" << func << "' " << "at " << file << ":"
+        << line << ": " << message;
     message_ = oss.str();
   }
 
diff --git a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu
index b2c370de639..08b760091f3 100644
--- a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu
+++ b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu
@@ -180,8 +180,7 @@ static void TestMatmulRun(cublasLtHandle_t ltHandle,
     std::cerr << "Not enough workspace! Required "
               << static_cast<double>(heurResult.workspaceSize) / 1024.0 /
                      1024.0 / 1024.0
-              << " GiB"
-              << ", But remaining "
+              << " GiB" << ", But remaining "
               << static_cast<double>(remainingMemorySize) / 1024.0 / 1024.0 /
                      1024.0
               << " GiB" << std::endl;
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h
index e976a830280..888fb46b0c8 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h
@@ -68,27 +68,30 @@ class RDMACommunicator {
                                         uint32_t access_flags);
   bool deregister_memory_regions(struct RdmaContext* ctx);
 
+  // Performance optimization: pass data_type by reference to avoid copying
   bool post_block_send(struct RdmaContext* ctx,
                        int layer_idx,
                        const std::vector<int64_t>& local_block_ids,
-                       const std::string data_type,
+                       const std::string& data_type,
                        std::vector<uint64_t>& remote_addr,
                        uint32_t rkey,
                        const std::string& ip,
                        const std::string& port);
 
+  // Performance optimization: pass data_type by reference to avoid copying
   bool execute_rdma_writes(struct RdmaContext* ctx,
                            int layer_idx,
                            const std::vector<int64_t>& local_block_ids,
-                           const std::string data_type,
+                           const std::string& data_type,
                            std::vector<uint64_t>& remote_addr,
                            uint32_t rkey);
 
+  // Performance optimization: pass data_type by reference to avoid copying
   void prepare_write_requests(struct ibv_sge* sge_list,
                               struct ibv_send_wr* send_wr_list,
                               int layer_idx,
                               const std::vector<int64_t>& local_block_ids,
-                              const std::string data_type,
+                              const std::string& data_type,
                               std::vector<uint64_t>& remote_addr,
                               uint32_t rkey);
 
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp
index a86d014fc8d..1b2f91f1c20 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp
@@ -1256,7 +1256,7 @@ bool RDMACommunicator::post_block_send(
     struct RdmaContext* ctx,
     int layer_idx,
     const std::vector<int64_t>& local_block_ids,
-    const std::string data_type,
+    const std::string& data_type,
     std::vector<uint64_t>& remote_addr,
     uint32_t rkey,
     const std::string& ip,
@@ -1283,7 +1283,7 @@ bool RDMACommunicator::execute_rdma_writes(
     struct RdmaContext* ctx,
     int layer_idx,
     const std::vector<int64_t>& local_block_ids,
-    const std::string data_type,
+    const std::string& data_type,
     std::vector<uint64_t>& remote_addr,
     uint32_t rkey) {
   auto block_num = local_block_ids.size();
@@ -1331,7 +1331,7 @@ void RDMACommunicator::prepare_write_requests(
     struct ibv_send_wr* send_wr_list,
     int layer_idx,
     const std::vector<int64_t>& local_block_ids,
-    const std::string data_type,
+    const std::string& data_type,
     std::vector<uint64_t>& remote_addr,
     uint32_t rkey) {
   auto block_num = local_block_ids.size();