From 6da3593815e34e9b15e61c2e58ba838ee52de239 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 15:25:54 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20pass=20`std::string`=20by?= =?UTF-8?q?=20const=20reference=20to=20avoid=20unnecessary=20copies=20in?= =?UTF-8?q?=20RDMA=20ops?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh | 8 ++------ .../sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp | 6 ++---- .../sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp | 6 ++---- .../sm90_gemm_gated_tma_warpspecialized_cooperative.hpp | 2 +- .../sm90_gemm_gated_tma_warpspecialized_pingpong.hpp | 4 ++-- custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h | 4 +--- .../w4a8_moe/w4a8_moe_cutlass_kernel_template.cu | 5 ++--- .../cutlass_kernels/w4a8_moe/weight_process_utils.h | 6 ++---- .../gpu_ops/cutlass_kernels/weight_process_utils.h | 6 ++---- custom_ops/gpu_ops/flash_mask_attn/softmax.hpp | 2 +- custom_ops/gpu_ops/helper.cu | 3 +-- .../gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu | 3 +-- .../gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp | 2 +- custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu | 2 +- custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h | 2 +- custom_ops/gpu_ops/sparse_indexer/exception.h | 8 ++++---- custom_ops/gpu_ops/tune_cublaslt_gemm.cu | 3 +-- .../kvcache_transfer/include/kvcache_rdma.h | 9 ++++++--- .../kvcache_transfer/src/kvcache_rdma.cpp | 6 +++--- 19 files changed, 36 insertions(+), 51 deletions(-) diff --git a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh index cb4c25bcf4e..6c1360dd36a 100644 --- a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh +++ b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh @@ -53,13 +53,9 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct __align__(16) RankData { - const void* __restrict__ ptrs[8]; -}; +struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; -struct __align__(16) RankSignals { - Signal* signals[8]; -}; +struct __align__(16) RankSignals { Signal* signals[8]; }; // like std::array, but aligned template diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp index f335ec2d399..f34015c39e0 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp @@ -622,16 +622,14 @@ struct CollectiveMmaGated< } else { return thread_mma.partition_B(sAux); } - } - (); + }(); auto tCrAux = [&]() -> auto { if constexpr (SwapAB) { return thread_mma.make_fragment_A(tCsAux); } else { return thread_mma.make_fragment_B(tCsAux); } - } - (); + }(); CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0)); // M CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0)); // N diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp index c34ad242e25..8a17ea8105b 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp @@ -617,16 +617,14 @@ struct CollectiveMmaGated< } else { return thread_mma.partition_B(sAux); } - } - (); + }(); auto tCrAux = [&]() -> auto { if constexpr (SwapAB) { return thread_mma.make_fragment_A(tCsAux); } else { return thread_mma.make_fragment_B(tCsAux); } - } - (); + }(); CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0)); // M CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0)); // N diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp index efb90a5e7c6..46273113a31 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_cooperative.hpp @@ -635,7 +635,7 @@ class GemmUniversalGated< collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state); } // Epilogue Producer Warp End - } // Producer Warp Group End + } // Producer Warp Group End else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp index 9609adc32a7..4e4b3e4a662 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/sm90_gemm_gated_tma_warpspecialized_pingpong.hpp @@ -618,7 +618,7 @@ class GemmUniversalGated< collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state); } // Epilogue Producer Warp End - } // Producer Warp Group End + } // Producer Warp Group End else if (warp_group_role == WarpGroupRole::Consumer0 || warp_group_role == WarpGroupRole::Consumer1) { @@ -714,7 +714,7 @@ class GemmUniversalGated< scheduler.advance_to_next_work(NumMmaWarpGroups); work_tile_info = scheduler.get_current_work(); } // Scheduler work fetch loop - } // Consumer Warp Groups End + } // Consumer Warp Groups End #endif } }; diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h index 0927d327385..e8028e31d1a 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h @@ -32,9 +32,7 @@ // workspace for cublas gemm : 32MB #define CUBLAS_WORKSPACE_SIZE 33554432 -typedef struct __align__(4) { - half x, y, z, w; -} +typedef struct __align__(4) { half x, y, z, w; } half4; /* **************************** type definition ***************************** */ diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu index cf02b86af19..f1b3f4ed5ab 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/w4a8_moe_cutlass_kernel_template.cu @@ -876,9 +876,8 @@ void W4A8MoeGemmRunner::moe_gemm( gemm_config_from_file_and_param.stages = max_total_rows_config.stages; } } - VLOG(1) << "W4A8 moe gemm " - << "total_rows: " << total_rows << " n: " << gemm_n - << " k: " << gemm_k + VLOG(1) << "W4A8 moe gemm " << "total_rows: " << total_rows + << " n: " << gemm_n << " k: " << gemm_k << "Using gemm config from config file: config_total_rows: " << best_total_rows << " config_n: " << best_n << " config_k: " << best_k << "tile_config: " diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h index dd6536c762e..91239a823ed 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/weight_process_utils.h @@ -433,8 +433,7 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor, const size_t rows_per_tile = 64; std::cout << "running interleave_column_major_tensor" << std::endl; - std::cout << "num_rows:" << num_rows << "," - << "num_cols:" << num_cols << "," + std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << "," << "BITS_PER_ELT:" << BITS_PER_ELT << "," << "elts_in_int32:" << elts_in_int32 << "," << "rows_per_tile:" << rows_per_tile << std::endl; @@ -486,8 +485,7 @@ void interleave_column_major_tensor_int4(int8_t* interleaved_quantized_tensor, const size_t rows_per_tile = 64; std::cout << "running interleave_column_major_tensor" << std::endl; - std::cout << "num_rows:" << num_rows << "," - << "num_cols:" << num_cols << "," + std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << "," << "BITS_PER_ELT:" << BITS_PER_ELT << "," << "elts_in_int32:" << elts_in_int32 << "," << "rows_per_tile:" << rows_per_tile << std::endl; diff --git a/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h b/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h index 01a9201a0e2..f3217bea688 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h +++ b/custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h @@ -434,8 +434,7 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor, const size_t rows_per_tile = 64; std::cout << "running interleave_column_major_tensor" << std::endl; - std::cout << "num_rows:" << num_rows << "," - << "num_cols:" << num_cols << "," + std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << "," << "BITS_PER_ELT:" << BITS_PER_ELT << "," << "elts_in_int32:" << elts_in_int32 << "," << "rows_per_tile:" << rows_per_tile << std::endl; @@ -487,8 +486,7 @@ void interleave_column_major_tensor_int4(int8_t* interleaved_quantized_tensor, const size_t rows_per_tile = 64; std::cout << "running interleave_column_major_tensor" << std::endl; - std::cout << "num_rows:" << num_rows << "," - << "num_cols:" << num_cols << "," + std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << "," << "BITS_PER_ELT:" << BITS_PER_ELT << "," << "elts_in_int32:" << elts_in_int32 << "," << "rows_per_tile:" << rows_per_tile << std::endl; diff --git a/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp b/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp index 39d94a3b6e9..f1293762035 100644 --- a/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp +++ b/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp @@ -189,7 +189,7 @@ struct Softmax { using TensorT = decltype(make_tensor(Shape>{})); TensorT row_max, row_sum; - CUTLASS_DEVICE Softmax(){}; + CUTLASS_DEVICE Softmax() {}; template __forceinline__ __device__ TensorT max(Tensor0 &acc_s, diff --git a/custom_ops/gpu_ops/helper.cu b/custom_ops/gpu_ops/helper.cu index 45a3660d399..458b74640ab 100644 --- a/custom_ops/gpu_ops/helper.cu +++ b/custom_ops/gpu_ops/helper.cu @@ -133,8 +133,7 @@ void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) { std::cout << "\nCall Line: " << call_line << "\t"; for (int i = 0; i < visible_device_.size(); i++) { unsigned int device_id = visible_device_.at(i); - std::cout << "GPU " << device_id << ": " - << " Used memory: " + std::cout << "GPU " << device_id << ": " << " Used memory: " << visible_device_mem_usage_.at(device_id) / (1024 * 1024) << " MB\t"; } diff --git a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu index f56982747d1..ecf6ecbe2de 100644 --- a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu +++ b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu @@ -245,8 +245,7 @@ void SentKeyValueByRemotePtr(const paddle::Tensor& local_key_tensor, getNvidiaGPUMemoryUsage(__LINE__); #ifdef DEBUG_IPC_SENT - std::cout << "#### 1111" - << " remote_key_ptr: " << remote_key_ptr + std::cout << "#### 1111" << " remote_key_ptr: " << remote_key_ptr << " remote_value_ptr: " << remote_value_ptr << std::endl; #endif getNvidiaGPUMemoryUsage(__LINE__); diff --git a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp index bb808c2a115..bd46089a570 100644 --- a/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp +++ b/custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp @@ -168,7 +168,7 @@ struct Softmax { using TensorT = decltype(make_tensor(Shape>{})); TensorT row_max, row_sum; - CUTLASS_DEVICE Softmax(){}; + CUTLASS_DEVICE Softmax() {}; template __forceinline__ __device__ TensorT max(Tensor0 &acc_s, diff --git a/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu b/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu index 8b83d44df21..d31043b7963 100644 --- a/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu +++ b/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu @@ -58,7 +58,7 @@ __global__ void permute_cols_kernel( const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m, int size_k, - int top_k){}; + int top_k) {}; } // namespace marlin diff --git a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h index 79c51b4c8a7..863c3efa571 100644 --- a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h +++ b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h @@ -30,7 +30,7 @@ struct Tensor { Tensor() : raw_tensor_() {} Tensor(const Tensor &) = default; Tensor(Tensor &&) = default; - Tensor operator=(const Tensor &x) &noexcept { + Tensor operator=(const Tensor &x) & noexcept { raw_tensor_ = x.raw_tensor_; return *this; } diff --git a/custom_ops/gpu_ops/sparse_indexer/exception.h b/custom_ops/gpu_ops/sparse_indexer/exception.h index 5b6dde348f4..a83782fd68e 100644 --- a/custom_ops/gpu_ops/sparse_indexer/exception.h +++ b/custom_ops/gpu_ops/sparse_indexer/exception.h @@ -82,8 +82,8 @@ class Error : public std::exception { int line, const std::string& message) { std::ostringstream oss; - oss << "Error in function '" << func << "' " - << "at " << file << ":" << line << ": " << message; + oss << "Error in function '" << func << "' " << "at " << file << ":" << line + << ": " << message; message_ = oss.str(); } @@ -102,8 +102,8 @@ class Warning { int line, const std::string& message) { std::ostringstream oss; - oss << "Warning in function '" << func << "' " - << "at " << file << ":" << line << ": " << message; + oss << "Warning in function '" << func << "' " << "at " << file << ":" + << line << ": " << message; message_ = oss.str(); } diff --git a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu index b2c370de639..08b760091f3 100644 --- a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu +++ b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu @@ -180,8 +180,7 @@ static void TestMatmulRun(cublasLtHandle_t ltHandle, std::cerr << "Not enough workspace! Required " << static_cast(heurResult.workspaceSize) / 1024.0 / 1024.0 / 1024.0 - << " GiB" - << ", But remaining " + << " GiB" << ", But remaining " << static_cast(remainingMemorySize) / 1024.0 / 1024.0 / 1024.0 << " GiB" << std::endl; diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h index e976a830280..888fb46b0c8 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h @@ -68,27 +68,30 @@ class RDMACommunicator { uint32_t access_flags); bool deregister_memory_regions(struct RdmaContext* ctx); + // Performance optimization: pass data_type by reference to avoid copying bool post_block_send(struct RdmaContext* ctx, int layer_idx, const std::vector& local_block_ids, - const std::string data_type, + const std::string& data_type, std::vector& remote_addr, uint32_t rkey, const std::string& ip, const std::string& port); + // Performance optimization: pass data_type by reference to avoid copying bool execute_rdma_writes(struct RdmaContext* ctx, int layer_idx, const std::vector& local_block_ids, - const std::string data_type, + const std::string& data_type, std::vector& remote_addr, uint32_t rkey); + // Performance optimization: pass data_type by reference to avoid copying void prepare_write_requests(struct ibv_sge* sge_list, struct ibv_send_wr* send_wr_list, int layer_idx, const std::vector& local_block_ids, - const std::string data_type, + const std::string& data_type, std::vector& remote_addr, uint32_t rkey); diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp index a86d014fc8d..1b2f91f1c20 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp @@ -1256,7 +1256,7 @@ bool RDMACommunicator::post_block_send( struct RdmaContext* ctx, int layer_idx, const std::vector& local_block_ids, - const std::string data_type, + const std::string& data_type, std::vector& remote_addr, uint32_t rkey, const std::string& ip, @@ -1283,7 +1283,7 @@ bool RDMACommunicator::execute_rdma_writes( struct RdmaContext* ctx, int layer_idx, const std::vector& local_block_ids, - const std::string data_type, + const std::string& data_type, std::vector& remote_addr, uint32_t rkey) { auto block_num = local_block_ids.size(); @@ -1331,7 +1331,7 @@ void RDMACommunicator::prepare_write_requests( struct ibv_send_wr* send_wr_list, int layer_idx, const std::vector& local_block_ids, - const std::string data_type, + const std::string& data_type, std::vector& remote_addr, uint32_t rkey) { auto block_num = local_block_ids.size();