Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions include/infiniop/ops/moe_wna16_marlin_gemm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef __INFINIOP_MOE_WNA16_MARLIN_GEMM_API_H__
#define __INFINIOP_MOE_WNA16_MARLIN_GEMM_API_H__

#include "../operator_descriptor.h"
#include <cstdint>

typedef struct InfiniopDescriptor *infiniopMoeWna16MarlinGemmDescriptor_t;

/*
 * Creates a descriptor for the MoE WNA16 Marlin GEMM operator.
 *
 * handle       Library handle the descriptor is created on.
 * desc_ptr     Output: receives the newly created descriptor.
 * c_desc       Descriptor of the output tensor `c`.
 * a_desc       Descriptor of the activation tensor `a`.
 * b_q_weight_desc .. topk_weights_desc
 *              Descriptors of the remaining operands, in the same order as
 *              the data pointers passed to infiniopMoeWna16MarlinGemm().
 * size_m, size_n, size_k
 *              GEMM problem sizes.
 * top_k, moe_block_size
 *              MoE routing parameters forwarded to the implementation.
 *
 * NOTE(review): which descriptors may legally be NULL (bias, zero points,
 * g_idx/perm look optional) is decided by the backend implementation and is
 * not visible from this header -- confirm before relying on it.
 *
 * Returns an infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopCreateMoeWna16MarlinGemmDescriptor(infiniopHandle_t handle,
                                                                              infiniopMoeWna16MarlinGemmDescriptor_t *desc_ptr,
                                                                              infiniopTensorDescriptor_t c_desc,
                                                                              infiniopTensorDescriptor_t a_desc,
                                                                              infiniopTensorDescriptor_t b_q_weight_desc,
                                                                              infiniopTensorDescriptor_t b_bias_desc,
                                                                              infiniopTensorDescriptor_t b_scales_desc,
                                                                              infiniopTensorDescriptor_t global_scales_desc,
                                                                              infiniopTensorDescriptor_t b_zeros_desc,
                                                                              infiniopTensorDescriptor_t g_idx_desc,
                                                                              infiniopTensorDescriptor_t perm_desc,
                                                                              infiniopTensorDescriptor_t sorted_token_desc,
                                                                              infiniopTensorDescriptor_t expert_ids_desc,
                                                                              infiniopTensorDescriptor_t num_tokens_post_padded_desc,
                                                                              infiniopTensorDescriptor_t topk_weights_desc,
                                                                              int size_m, int size_n, int size_k,
                                                                              int top_k, int moe_block_size);

/*
 * Queries the workspace size required by a MoE WNA16 Marlin GEMM descriptor.
 *
 * desc   Descriptor to query.
 * size   Output: receives the workspace size (presumably in bytes -- TODO confirm
 *        against the implementation).
 *
 * Returns an infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopGetMoeWna16MarlinGemmWorkspaceSize(infiniopMoeWna16MarlinGemmDescriptor_t desc, size_t *size);

/*
 * Runs the MoE WNA16 Marlin GEMM described by `desc`.
 *
 * desc            Descriptor created for this operator.
 * workspace       Scratch buffer supplied by the caller.
 * workspace_size  Size of `workspace`.
 * c               Output buffer.
 * a, b_q_weight   Read-only input buffers (declared const).
 * b_bias .. topk_weights
 *                 Remaining operand buffers, in the same order as the tensor
 *                 descriptors passed at descriptor creation.
 *                 NOTE(review): these are declared non-const; whether the
 *                 kernel writes to any of them is not visible here.
 * mul_topk_weights, is_ep, is_k_full, use_atomic_add, use_fp32_reduce, is_zp_float
 *                 Behaviour flags forwarded to the implementation; their exact
 *                 semantics are defined by the backend -- TODO confirm.
 * b_q_type_id     Integer identifier of the quantized weight type
 *                 (presumably a scalar-type id; verify against the backend).
 * stream          Opaque execution stream handle.
 *
 * Returns an infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopMoeWna16MarlinGemm(infiniopMoeWna16MarlinGemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b_q_weight,
void *b_bias,
void *b_scales,
void *global_scales,
void *b_zeros,
void *g_idx,
void *perm,
void *sorted_token_ids,
void *expert_ids,
void *num_tokens_post_padded,
void *topk_weights,
bool mul_topk_weights,
bool is_ep,
int64_t b_q_type_id,
bool is_k_full,
bool use_atomic_add,
bool use_fp32_reduce,
bool is_zp_float,
void *stream);

/*
 * Destroys a MoE WNA16 Marlin GEMM descriptor.
 *
 * desc   Descriptor to destroy.
 *
 * Returns an infiniStatus_t status code.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyMoeWna16MarlinGemmDescriptor(infiniopMoeWna16MarlinGemmDescriptor_t desc);

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -1041,7 +1041,7 @@ infiniStatus_t gptq_marlin_gemm_kernel(void *c,
return INFINI_STATUS_SUCCESS;
}
#endif
int getCudaDeviceSMCount() {
static int getCudaDeviceSMCount() {
int dev;
cudaGetDevice(&dev);
cudaDeviceProp prop;
Expand Down
84 changes: 84 additions & 0 deletions src/infiniop/ops/moe_wna16_marlin_gemm/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#ifndef __MOE_WNA16_MARLIN_GEMM_INFO_H__
#define __MOE_WNA16_MARLIN_GEMM_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>

#include <cassert>

namespace op::moe_wna16_marlin_gemm {

/**
 * Validated shape and configuration metadata for the MoE WNA16 Marlin GEMM
 * operator.
 *
 * Instances are only produced by create(), which checks the tensor
 * descriptors against the requested problem sizes and records the extents
 * and optional-operand flags the backend needs.
 */
class MoeWna16MarlinGemmInfo {
    MoeWna16MarlinGemmInfo() = default;

public:
    // NOTE: create() uses aggregate initialization, so the declaration order
    // of the fields below must match the initializer list in create().
    infiniDtype_t dtype;
    int size_m, size_n, size_k, top_k, moe_block_size;
    size_t num_groups, sorted_token_ids_size_0, b_q_weight_size_1, b_q_weight_size_2, b_zeros_size_1, b_zeros_size_2, c_size_0;
    bool has_act_order, has_bias, has_zp;

    /**
     * Validates the descriptors and builds the info object.
     *
     * Required (non-null) descriptors: c_desc, a_desc, b_q_weight_desc,
     * b_scales_desc, sorted_token_desc. b_bias_desc and b_zeros_desc are
     * optional; activation reordering is enabled only when both g_idx_desc
     * and perm_desc are provided. The remaining descriptors are accepted but
     * not inspected here.
     *
     * @return The populated info on success, otherwise
     *         INFINI_STATUS_NULL_POINTER when a required descriptor is null, or
     *         INFINI_STATUS_BAD_TENSOR_SHAPE when a rank or extent check fails.
     */
    static utils::Result<MoeWna16MarlinGemmInfo> create(
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t a_desc,
        infiniopTensorDescriptor_t b_q_weight_desc,
        infiniopTensorDescriptor_t b_bias_desc,
        infiniopTensorDescriptor_t b_scales_desc,
        infiniopTensorDescriptor_t global_scales_desc,
        infiniopTensorDescriptor_t b_zeros_desc,
        infiniopTensorDescriptor_t g_idx_desc,
        infiniopTensorDescriptor_t perm_desc,
        infiniopTensorDescriptor_t sorted_token_desc,
        infiniopTensorDescriptor_t expert_ids_desc,
        infiniopTensorDescriptor_t num_tokens_post_padded_desc,
        infiniopTensorDescriptor_t topk_weights_desc, int size_m, int size_n, int size_k, int top_k, int moe_block_size) {
        // Every descriptor dereferenced unconditionally below must be present.
        CHECK_OR_RETURN(
            c_desc != nullptr && a_desc != nullptr && b_q_weight_desc != nullptr && b_scales_desc != nullptr && sorted_token_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);
        const infiniDtype_t dtype = a_desc->dtype();

        // Verify ranks before indexing so dim(i) never reads out of bounds.
        CHECK_OR_RETURN(a_desc->ndim() >= 2
                            && c_desc->ndim() >= 2
                            && b_q_weight_desc->ndim() >= 3
                            && sorted_token_desc->ndim() >= 1,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        CHECK_OR_RETURN(a_desc->dim(0) == static_cast<size_t>(size_m)
                            && a_desc->dim(1) == static_cast<size_t>(size_k)
                            && c_desc->dim(1) == static_cast<size_t>(size_n),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_OR_RETURN(b_scales_desc->ndim() == 3
                            && b_scales_desc->dim(2) == static_cast<size_t>(size_n),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        size_t num_groups = b_scales_desc->dim(1);
        bool has_act_order = false;
        bool has_bias = (b_bias_desc != nullptr);
        bool has_zp = (b_zeros_desc != nullptr);
        if (g_idx_desc != nullptr && perm_desc != nullptr) {
            // ndim() - 1 would underflow for a rank-0 descriptor.
            CHECK_OR_RETURN(g_idx_desc->ndim() >= 1 && perm_desc->ndim() >= 1,
                            INFINI_STATUS_BAD_TENSOR_SHAPE);
            CHECK_OR_RETURN(g_idx_desc->dim(g_idx_desc->ndim() - 1) == static_cast<size_t>(size_k)
                                && perm_desc->dim(perm_desc->ndim() - 1) == static_cast<size_t>(size_k),
                            INFINI_STATUS_BAD_TENSOR_SHAPE);
            has_act_order = true;
        }
        if (num_groups > 1) {
            CHECK_OR_RETURN(static_cast<size_t>(size_k) % num_groups == 0,
                            INFINI_STATUS_BAD_TENSOR_SHAPE);
        }
        if (b_bias_desc != nullptr) {
            CHECK_OR_RETURN(b_bias_desc->ndim() >= 2,
                            INFINI_STATUS_BAD_TENSOR_SHAPE);
            // The bias must span size_n and be contiguous along that axis.
            CHECK_OR_RETURN(b_bias_desc->dim(1) == static_cast<size_t>(size_n)
                                && b_bias_desc->strides()[1] == 1,
                            INFINI_STATUS_BAD_TENSOR_SHAPE);
        }

        size_t sorted_token_ids_size_0 = sorted_token_desc->dim(0);
        size_t b_q_weight_size_1 = b_q_weight_desc->dim(1);
        size_t b_q_weight_size_2 = b_q_weight_desc->dim(2);
        size_t b_zeros_size_1 = 0;
        size_t b_zeros_size_2 = 0;
        if (b_zeros_desc != nullptr) {
            CHECK_OR_RETURN(b_zeros_desc->ndim() >= 3,
                            INFINI_STATUS_BAD_TENSOR_SHAPE);
            b_zeros_size_1 = b_zeros_desc->dim(1);
            b_zeros_size_2 = b_zeros_desc->dim(2);
        }
        size_t c_size_0 = c_desc->dim(0);

        // The initializers must follow the field declaration order exactly.
        // c_size_0 sits between b_zeros_size_2 and has_act_order; omitting it
        // silently shifts every following flag by one position.
        return utils::Result<MoeWna16MarlinGemmInfo>(
            MoeWna16MarlinGemmInfo{
                dtype,
                size_m,
                size_n,
                size_k,
                top_k,
                moe_block_size,
                num_groups,
                sorted_token_ids_size_0,
                b_q_weight_size_1,
                b_q_weight_size_2,
                b_zeros_size_1,
                b_zeros_size_2,
                c_size_0,
                has_act_order,
                has_bias,
                has_zp});
    }
};

} // namespace op::moe_wna16_marlin_gemm

#endif // __MOE_WNA16_MARLIN_GEMM_INFO_H__
Loading
Loading