Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,9 @@ struct Signal {
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};

struct __align__(16) RankData {
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };

struct __align__(16) RankSignals {
Signal* signals[8];
};
struct __align__(16) RankSignals { Signal* signals[8]; };

// like std::array, but aligned
template <typename T, int sz>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -622,16 +622,14 @@ struct CollectiveMmaGated<
} else {
return thread_mma.partition_B(sAux);
}
}
();
}();
auto tCrAux = [&]() -> auto {
if constexpr (SwapAB) {
return thread_mma.make_fragment_A(tCsAux);
} else {
return thread_mma.make_fragment_B(tCsAux);
}
}
();
}();

CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0)); // M
CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0)); // N
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -617,16 +617,14 @@ struct CollectiveMmaGated<
} else {
return thread_mma.partition_B(sAux);
}
}
();
}();
auto tCrAux = [&]() -> auto {
if constexpr (SwapAB) {
return thread_mma.make_fragment_A(tCsAux);
} else {
return thread_mma.make_fragment_B(tCsAux);
}
}
();
}();

CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0)); // M
CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0)); // N
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ class GemmUniversalGated<
collective_epilogue.load_tail(epi_load_pipeline,
epi_load_pipe_producer_state);
} // Epilogue Producer Warp End
} // Producer Warp Group End
} // Producer Warp Group End

else if (warp_group_role == WarpGroupRole::Consumer0 ||
warp_group_role == WarpGroupRole::Consumer1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ class GemmUniversalGated<
collective_epilogue.load_tail(epi_load_pipeline,
epi_load_pipe_producer_state);
} // Epilogue Producer Warp End
} // Producer Warp Group End
} // Producer Warp Group End

else if (warp_group_role == WarpGroupRole::Consumer0 ||
warp_group_role == WarpGroupRole::Consumer1) {
Expand Down Expand Up @@ -714,7 +714,7 @@ class GemmUniversalGated<
scheduler.advance_to_next_work(NumMmaWarpGroups);
work_tile_info = scheduler.get_current_work();
} // Scheduler work fetch loop
} // Consumer Warp Groups End
} // Consumer Warp Groups End
#endif
}
};
Expand Down
4 changes: 1 addition & 3 deletions custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cuda_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@
// workspace for cublas gemm : 32MB
#define CUBLAS_WORKSPACE_SIZE 33554432

typedef struct __align__(4) {
half x, y, z, w;
}
typedef struct __align__(4) { half x, y, z, w; }
half4;

/* **************************** type definition ***************************** */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -876,9 +876,8 @@ void W4A8MoeGemmRunner<OutputType, IntAType, IntBType>::moe_gemm(
gemm_config_from_file_and_param.stages = max_total_rows_config.stages;
}
}
VLOG(1) << "W4A8 moe gemm "
<< "total_rows: " << total_rows << " n: " << gemm_n
<< " k: " << gemm_k
VLOG(1) << "W4A8 moe gemm " << "total_rows: " << total_rows
<< " n: " << gemm_n << " k: " << gemm_k
<< "Using gemm config from config file: config_total_rows: "
<< best_total_rows << " config_n: " << best_n
<< " config_k: " << best_k << "tile_config: "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -433,8 +433,7 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor,

const size_t rows_per_tile = 64;
std::cout << "running interleave_column_major_tensor" << std::endl;
std::cout << "num_rows:" << num_rows << ","
<< "num_cols:" << num_cols << ","
std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
<< "BITS_PER_ELT:" << BITS_PER_ELT << ","
<< "elts_in_int32:" << elts_in_int32 << ","
<< "rows_per_tile:" << rows_per_tile << std::endl;
Expand Down Expand Up @@ -486,8 +485,7 @@ void interleave_column_major_tensor_int4(int8_t* interleaved_quantized_tensor,

const size_t rows_per_tile = 64;
std::cout << "running interleave_column_major_tensor" << std::endl;
std::cout << "num_rows:" << num_rows << ","
<< "num_cols:" << num_cols << ","
std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
<< "BITS_PER_ELT:" << BITS_PER_ELT << ","
<< "elts_in_int32:" << elts_in_int32 << ","
<< "rows_per_tile:" << rows_per_tile << std::endl;
Expand Down
6 changes: 2 additions & 4 deletions custom_ops/gpu_ops/cutlass_kernels/weight_process_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,7 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor,

const size_t rows_per_tile = 64;
std::cout << "running interleave_column_major_tensor" << std::endl;
std::cout << "num_rows:" << num_rows << ","
<< "num_cols:" << num_cols << ","
std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
<< "BITS_PER_ELT:" << BITS_PER_ELT << ","
<< "elts_in_int32:" << elts_in_int32 << ","
<< "rows_per_tile:" << rows_per_tile << std::endl;
Expand Down Expand Up @@ -487,8 +486,7 @@ void interleave_column_major_tensor_int4(int8_t* interleaved_quantized_tensor,

const size_t rows_per_tile = 64;
std::cout << "running interleave_column_major_tensor" << std::endl;
std::cout << "num_rows:" << num_rows << ","
<< "num_cols:" << num_cols << ","
std::cout << "num_rows:" << num_rows << "," << "num_cols:" << num_cols << ","
<< "BITS_PER_ELT:" << BITS_PER_ELT << ","
<< "elts_in_int32:" << elts_in_int32 << ","
<< "rows_per_tile:" << rows_per_tile << std::endl;
Expand Down
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/flash_mask_attn/softmax.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ struct Softmax {
using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
TensorT row_max, row_sum;

CUTLASS_DEVICE Softmax(){};
CUTLASS_DEVICE Softmax() {};

template <bool Is_first, bool Check_inf = false, typename Tensor0>
__forceinline__ __device__ TensorT max(Tensor0 &acc_s,
Expand Down
3 changes: 1 addition & 2 deletions custom_ops/gpu_ops/helper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) {
std::cout << "\nCall Line: " << call_line << "\t";
for (int i = 0; i < visible_device_.size(); i++) {
unsigned int device_id = visible_device_.at(i);
std::cout << "GPU " << device_id << ": "
<< " Used memory: "
std::cout << "GPU " << device_id << ": " << " Used memory: "
<< visible_device_mem_usage_.at(device_id) / (1024 * 1024)
<< " MB\t";
}
Expand Down
3 changes: 1 addition & 2 deletions custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,7 @@ void SentKeyValueByRemotePtr(const paddle::Tensor& local_key_tensor,
getNvidiaGPUMemoryUsage(__LINE__);

#ifdef DEBUG_IPC_SENT
std::cout << "#### 1111"
<< " remote_key_ptr: " << remote_key_ptr
std::cout << "#### 1111" << " remote_key_ptr: " << remote_key_ptr
<< " remote_value_ptr: " << remote_value_ptr << std::endl;
#endif
getNvidiaGPUMemoryUsage(__LINE__);
Expand Down
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/moba_attn/moba_encoder_attn/softmax.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ struct Softmax {
using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
TensorT row_max, row_sum;

CUTLASS_DEVICE Softmax(){};
CUTLASS_DEVICE Softmax() {};

template <bool Is_first, bool Check_inf = false, typename Tensor0>
__forceinline__ __device__ TensorT max(Tensor0 &acc_s,
Expand Down
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ __global__ void permute_cols_kernel(
const int32_t* __restrict__ num_tokens_past_padded_ptr,
int size_m,
int size_k,
int top_k){};
int top_k) {};

} // namespace marlin

Expand Down
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ struct Tensor {
Tensor() : raw_tensor_() {}
Tensor(const Tensor &) = default;
Tensor(Tensor &&) = default;
Tensor operator=(const Tensor &x) &noexcept {
Tensor operator=(const Tensor &x) & noexcept {
raw_tensor_ = x.raw_tensor_;
return *this;
}
Expand Down
8 changes: 4 additions & 4 deletions custom_ops/gpu_ops/sparse_indexer/exception.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ class Error : public std::exception {
int line,
const std::string& message) {
std::ostringstream oss;
oss << "Error in function '" << func << "' "
<< "at " << file << ":" << line << ": " << message;
oss << "Error in function '" << func << "' " << "at " << file << ":" << line
<< ": " << message;
message_ = oss.str();
}

Expand All @@ -102,8 +102,8 @@ class Warning {
int line,
const std::string& message) {
std::ostringstream oss;
oss << "Warning in function '" << func << "' "
<< "at " << file << ":" << line << ": " << message;
oss << "Warning in function '" << func << "' " << "at " << file << ":"
<< line << ": " << message;
message_ = oss.str();
}

Expand Down
3 changes: 1 addition & 2 deletions custom_ops/gpu_ops/tune_cublaslt_gemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,7 @@ static void TestMatmulRun(cublasLtHandle_t ltHandle,
std::cerr << "Not enough workspace! Required "
<< static_cast<double>(heurResult.workspaceSize) / 1024.0 /
1024.0 / 1024.0
<< " GiB"
<< ", But remaining "
<< " GiB" << ", But remaining "
<< static_cast<double>(remainingMemorySize) / 1024.0 / 1024.0 /
1024.0
<< " GiB" << std::endl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,27 +68,30 @@ class RDMACommunicator {
uint32_t access_flags);
bool deregister_memory_regions(struct RdmaContext* ctx);

// Performance optimization: pass data_type by reference to avoid copying
bool post_block_send(struct RdmaContext* ctx,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 兼容性 PR 标题缺少官方要求的 Tag。

根据 FastDeploy PR 规范,标题必须包含官方 Tag 之一,如 [Optimization][KVCache][Feature] 等。

建议修改标题为:

  • [Optimization] Bolt: pass std::string by const reference to avoid unnecessary copies in RDMA ops

或者:

  • [KVCache] Bolt: pass std::string by const reference to avoid unnecessary copies in RDMA ops

int layer_idx,
const std::vector<int64_t>& local_block_ids,
const std::string data_type,
const std::string& data_type,
std::vector<uint64_t>& remote_addr,
uint32_t rkey,
const std::string& ip,
const std::string& port);

// Performance optimization: pass data_type by reference to avoid copying
bool execute_rdma_writes(struct RdmaContext* ctx,
int layer_idx,
const std::vector<int64_t>& local_block_ids,
const std::string data_type,
const std::string& data_type,
std::vector<uint64_t>& remote_addr,
uint32_t rkey);

// Performance optimization: pass data_type by reference to avoid copying
void prepare_write_requests(struct ibv_sge* sge_list,
struct ibv_send_wr* send_wr_list,
int layer_idx,
const std::vector<int64_t>& local_block_ids,
const std::string data_type,
const std::string& data_type,
std::vector<uint64_t>& remote_addr,
uint32_t rkey);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1256,7 +1256,7 @@ bool RDMACommunicator::post_block_send(
struct RdmaContext* ctx,
int layer_idx,
const std::vector<int64_t>& local_block_ids,
const std::string data_type,
const std::string& data_type,
std::vector<uint64_t>& remote_addr,
uint32_t rkey,
const std::string& ip,
Expand All @@ -1283,7 +1283,7 @@ bool RDMACommunicator::execute_rdma_writes(
struct RdmaContext* ctx,
int layer_idx,
const std::vector<int64_t>& local_block_ids,
const std::string data_type,
const std::string& data_type,
std::vector<uint64_t>& remote_addr,
uint32_t rkey) {
auto block_num = local_block_ids.size();
Expand Down Expand Up @@ -1331,7 +1331,7 @@ void RDMACommunicator::prepare_write_requests(
struct ibv_send_wr* send_wr_list,
int layer_idx,
const std::vector<int64_t>& local_block_ids,
const std::string data_type,
const std::string& data_type,
std::vector<uint64_t>& remote_addr,
uint32_t rkey) {
auto block_num = local_block_ids.size();
Expand Down
Loading