Skip to content

Commit

Permalink
add assign_pos\scatter operator and cpu kernel for moe.
Browse files Browse the repository at this point in the history
  • Loading branch information
wendy12022 committed Mar 20, 2024
1 parent de18a1b commit 5b33afd
Show file tree
Hide file tree
Showing 17 changed files with 565 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,5 @@ add_subdirectory(src/06frontend)
add_subdirectory(src/07onnx)
add_subdirectory(src/08communication)
add_subdirectory(src/08-01llm)
add_subdirectory(src/08-02moe)
add_subdirectory(src/09python_ffi)
24 changes: 24 additions & 0 deletions src/04kernel/include/kernel/attributes/moe_info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef KERNEL_MOE_INFO_H
#define KERNEL_MOE_INFO_H

#include "../tensor.h"

namespace refactor::kernel {

/// Parameters consumed by the AssignPos kernels.
struct AssignPosInfo {
    /// Stored as passed to the constructor.
    uint32_t top;
    /// Number of experts; gate values are validated against it by the CPU kernel.
    uint32_t expert_num;
    /// Element count of the gate tensor.
    uint32_t elementSize;

    /// Captures `top`, `expert_num` and the element count of `gate`.
    AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate);
};

/// Parameters consumed by the Reorder (scatter / gather) kernels.
struct ReorderInfo {
    /// Selects the copy direction used by the kernel.
    bool scatter;
    /// Divisor applied to a position to obtain its source block when scattering.
    uint32_t top;
    /// Number of blocks to copy (element count of the position tensor).
    uint32_t blockNum;
    /// Elements per block (leading stride of the data tensor).
    uint32_t blockSize;

    /// Derives the block geometry from `inputs[0]` (data) and `inputs[1]` (positions).
    ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs);
};

}// namespace refactor::kernel

#endif// KERNEL_MOE_INFO_H
29 changes: 29 additions & 0 deletions src/04kernel/include/kernel/collectors/moe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef KERNEL_MOE_H
#define KERNEL_MOE_H

#include "../collector.h"

namespace refactor::kernel {

/// Collects candidate kernels for the AssignPos operator on a given target.
struct AssignPosCollector final : public InfoCollector {
    uint32_t topk;
    uint32_t numExperts;

    constexpr AssignPosCollector(decltype(_target) target, uint32_t topk, uint32_t numExperts) noexcept
        : InfoCollector(target), topk(topk), numExperts(numExperts) {}

    /// Returns the kernels able to run this operator for the given tensors.
    auto filter(TensorRefs inputs, TensorRefs outputs) const -> std::vector<KernelBox> final;
};

/// Collects candidate kernels for the Reorder operator on a given target.
struct ReorderCollector final : public InfoCollector {
    bool scatter;
    uint32_t topk;

    constexpr ReorderCollector(decltype(_target) target, bool scatter, uint32_t topk) noexcept
        : InfoCollector(target), scatter(scatter), topk(topk) {}

    /// Returns the kernels able to run this operator for the given tensors.
    auto filter(TensorRefs inputs, TensorRefs outputs) const -> std::vector<KernelBox> final;
};

}// namespace refactor::kernel

#endif// KERNEL_MOE_H
13 changes: 13 additions & 0 deletions src/04kernel/src/attributes/moe_info.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "kernel/attributes/moe_info.h"
#include <numeric>

namespace refactor::kernel {

/// Stores `top` and `expert_num` as given and records how many elements `gate` holds.
AssignPosInfo::AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate)
    : top(top),
      expert_num(expert_num),
      elementSize(gate.elementsSize()) {}

/// `inputs[0]` is the data tensor and `inputs[1]` the position tensor:
/// one block is copied per position, and a block spans the leading stride of the data.
ReorderInfo::ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs)
    : scatter(scatter),
      top(top),
      blockNum(inputs[1].get().elementsSize()),
      blockSize(inputs[0].get().strides()[0]) {}


}
51 changes: 51 additions & 0 deletions src/04kernel/src/collectors/moe.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#include "kernel/collectors/moe.h"
#include "../kernels/moe/cpu_kernel.hh"
#include "kernel/attributes/moe_info.h"

namespace refactor::kernel {

/// Builds the list of AssignPos kernels usable on the collector's target.
/// Only `inputs[0]` (the gate tensor) is consulted; `outputs` is not used.
std::vector<KernelBox>
AssignPosCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
    AssignPosInfo const info(topk, numExperts, inputs[0]);
    std::vector<KernelBox> candidates;
    switch (_target) {
        case decltype(_target)::Cpu:
        // TODO: the Nvidia target temporarily uses the CPU implementation.
        case decltype(_target)::Nvidia: {
            auto kernel = AssignPosCpu::build(info);
            if (kernel) {
                candidates.emplace_back(std::move(kernel));
            }
            break;
        }
        default:
            UNREACHABLEX(void, "Unknown target");
    }
    return candidates;
}

/// Builds the list of Reorder kernels usable on the collector's target.
/// The block geometry is derived from `inputs`; `outputs` is not used.
std::vector<KernelBox>
ReorderCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
    ReorderInfo const info(scatter, topk, inputs);
    std::vector<KernelBox> candidates;
    switch (_target) {
        case decltype(_target)::Cpu:
        // TODO: the Nvidia target temporarily uses the CPU implementation.
        case decltype(_target)::Nvidia: {
            auto kernel = ReorderCpu::build(info);
            if (kernel) {
                candidates.emplace_back(std::move(kernel));
            }
            break;
        }
        default:
            UNREACHABLEX(void, "Unknown target");
    }
    return candidates;
}

}// namespace refactor::kernel
83 changes: 83 additions & 0 deletions src/04kernel/src/kernels/moe/cpu_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include "cpu_kernel.hh"
#include <execution>
#include <list>

namespace refactor::kernel {

AssignPosCpu::AssignPosCpu(AssignPosInfo info) noexcept
: Kernel(), info(std::move(info)) {}

auto AssignPosCpu::build(AssignPosInfo info) noexcept -> KernelBox {
return std::make_unique<AssignPosCpu>(std::move(info));
}
/// Identifies this kernel type by the address of a function-local static,
/// which is distinct from every other object in the program.
size_t AssignPosCpu::typeId() noexcept {
    static uint8_t anchor = 1;
    return reinterpret_cast<size_t>(&anchor);
}

/// Virtual accessor for the id returned by the static `typeId()`.
size_t AssignPosCpu::kernelTypeId() const noexcept {
    return AssignPosCpu::typeId();
}
/// Human-readable description of this kernel.
std::string_view AssignPosCpu::description() const noexcept {
    return "Performing AssignPos operation on generic cpu";
}

/// Lowers the kernel to a routine that groups gate entries by expert.
///
/// Routine buffers:
///  - `inputs[0]`  gate: `info.elementSize` expert indices.
///  - `outputs[0]` expert_cnt: `info.expert_num` per-expert counts.
///  - `outputs[1]` pos: `info.elementSize` gate indices, grouped by expert;
///    within one expert the indices appear in descending order because the
///    slots of each expert are filled from the back.
///
/// NOTE(review): all three buffers are accessed as one byte per element, so
/// counts and indices above 255 are truncated when stored. ReorderCpu::lower
/// reads its position input as uint32_t - confirm the intended element width
/// before chaining the two kernels.
auto AssignPosCpu::lower(Resources &) const noexcept -> RoutineWorkspace {
    using namespace runtime;
    return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
        auto gate = reinterpret_cast<uint8_t const *>(inputs[0]);
        auto expert_cnt = reinterpret_cast<uint8_t *>(outputs[0]);
        auto pos = reinterpret_cast<uint8_t *>(outputs[1]);

        // Bookkeeping is done in size_t: byte-wide counters would wrap as soon as
        // a count or the running total exceeds 255 and corrupt the slot indices.
        std::vector<size_t> cursor(info.expert_num, 0);
        for (size_t i = 0; i < info.elementSize; ++i) {
            // gate is unsigned, so only the upper bound needs checking.
            ASSERT(gate[i] < info.expert_num, "gate exceeds expert idx scope!");
            ++cursor[gate[i]];
        }

        // Publish the counts, then turn them into inclusive prefix sums:
        // cursor[e] becomes one past the last slot reserved for expert e.
        size_t running = 0;
        for (size_t e = 0; e < cursor.size(); ++e) {
            expert_cnt[e] = static_cast<uint8_t>(cursor[e]);
            running += cursor[e];
            cursor[e] = running;
        }

        // Hand out each expert's slots from the back of its range.
        for (size_t i = 0; i < info.elementSize; ++i) {
            pos[--cursor[gate[i]]] = static_cast<uint8_t>(i);
        }
    };
}


ReorderCpu::ReorderCpu(ReorderInfo info) noexcept
: Kernel(), info(std::move(info)) {}

auto ReorderCpu::build(ReorderInfo info) noexcept -> KernelBox {
return std::make_unique<ReorderCpu>(std::move(info));
}
/// Identifies this kernel type by the address of a function-local static,
/// which is distinct from every other object in the program.
size_t ReorderCpu::typeId() noexcept {
    static uint8_t anchor = 1;
    return reinterpret_cast<size_t>(&anchor);
}

/// Virtual accessor for the id returned by the static `typeId()`.
size_t ReorderCpu::kernelTypeId() const noexcept {
    return ReorderCpu::typeId();
}
/// Human-readable description of this kernel (the same text is used for both copy directions).
std::string_view ReorderCpu::description() const noexcept {
    return "Performing scatter operation on generic cpu";
}

/// Lowers the kernel to a routine that copies `info.blockNum` blocks of
/// `info.blockSize` floats each.
///
/// Routine buffers:
///  - `inputs[0]`  source data, read as float.
///  - `inputs[1]`  positions, read as uint32_t, one per block.
///  - `outputs[0]` destination data, written as float.
///
/// Direction:
///  - scatter:   destination block i <- source block pos[i] / info.top
///  - otherwise: destination block pos[i] <- source block i
auto ReorderCpu::lower(Resources &) const noexcept -> RoutineWorkspace {
    using namespace runtime;
    return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
        auto const *src = reinterpret_cast<float const *>(inputs[0]);
        auto const *pos = reinterpret_cast<uint32_t const *>(inputs[1]);
        auto *dst = reinterpret_cast<float *>(outputs[0]);

        // Offsets are computed in size_t: multiplying two 32-bit values would
        // wrap once a tensor holds more than 2^32 elements.
        size_t const blockSize = info.blockSize;
        for (size_t i = 0; i < info.blockNum; ++i) {
            size_t const srcBlock = info.scatter ? static_cast<size_t>(pos[i] / info.top) : i;
            size_t const dstBlock = info.scatter ? i : static_cast<size_t>(pos[i]);
            std::copy_n(src + srcBlock * blockSize, blockSize, dst + dstBlock * blockSize);
        }
    };
}
}// namespace refactor::kernel
35 changes: 35 additions & 0 deletions src/04kernel/src/kernels/moe/cpu_kernel.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#ifndef KERNEL_MOE_CPU_KERNEL_HH
#define KERNEL_MOE_CPU_KERNEL_HH

#include "kernel/attributes/moe_info.h"
#include "kernel/kernel.h"

namespace refactor::kernel {

struct AssignPosCpu final : public Kernel {
AssignPosInfo info;
explicit AssignPosCpu(AssignPosInfo info) noexcept;

static KernelBox build(AssignPosInfo info) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
RoutineWorkspace lower(Resources &) const noexcept final;
};

struct ReorderCpu final : public Kernel {
ReorderInfo info;
explicit ReorderCpu(ReorderInfo info) noexcept;

static KernelBox build(ReorderInfo info) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
RoutineWorkspace lower(Resources &) const noexcept final;
};

}// namespace refactor::kernel

#endif// KERNEL_MOE_CPU_KERNEL_HH
75 changes: 75 additions & 0 deletions src/04kernel/test/kernels/moe/test_cpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include "../../../src/kernels/moe/cpu_kernel.hh"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;

// Runs the AssignPos CPU routine on 8 tokens x top-2 gate indices over 4 experts
// and checks the per-expert counts and the grouped positions.
TEST(kernel, AssignPosCpu) {
    // describe the tensors and build the routine
    auto gate = Tensor::share(DataType::U32, Shape{8, 2});
    auto expert_cnt = Tensor::share(DataType::U32, Shape{4});
    auto pos = Tensor::share(DataType::U32, Shape{16});

    auto kernel = AssignPosCpu::build(AssignPosInfo(2, 4, *gate));
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine;

    // host buffers: the routine is fed one byte per element
    std::vector<uint8_t> gateData = {3, 2, 0, 1, 2, 1, 1, 3, 2, 0, 1, 3, 1, 0, 1, 2};
    std::vector<uint8_t> countData(expert_cnt->elementsSize());
    std::vector<uint8_t> posData(pos->elementsSize());

    // inference
    void const *inputs[]{gateData.data()};
    void *outputs[]{countData.data(), posData.data()};
    routine(res, nullptr, inputs, outputs);

    // check: positions first, then counts
    std::vector<uint32_t> const wantCount = {3, 6, 4, 3};
    std::vector<uint32_t> const wantPos = {13, 9, 2, 14, 12, 10, 6, 5, 3, 15, 8, 4, 1, 11, 7, 0};

    for (size_t k = 0; k < wantPos.size(); ++k) {
        EXPECT_EQ(wantPos[k], posData[k]);
    }
    for (size_t k = 0; k < wantCount.size(); ++k) {
        EXPECT_EQ(wantCount[k], countData[k]);
    }
}

// Runs the Reorder CPU routine in scatter mode and checks that every output
// row i equals input row pos[i] / top.
TEST(kernel, ReorderScatterCpu) {
    // build routine
    const int seq = 8, hid = 4, top = 2;
    // The routine reads the data as float, so describe the tensor as F32.
    auto input = Tensor::share(DataType::F32, Shape{seq, hid});
    auto pos = Tensor::share(DataType::U32, Shape{seq * top});
    std::vector<Arc<Tensor>> inputTensors{input, pos};
    TensorRefs inputs_;
    inputs_.reserve(inputTensors.size());
    std::transform(inputTensors.begin(), inputTensors.end(),
                   std::back_inserter(inputs_),
                   [](auto const &it) { return std::cref(*it); });

    auto kernel = ReorderCpu::build(ReorderInfo(true, top, inputs_));
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine;

    // put input data
    std::vector<float> ins0(input->elementsSize());
    std::iota(ins0.begin(), ins0.end(), 0);
    std::vector<uint32_t> ins1 = {13, 9, 2, 14, 12, 10, 6, 5, 3, 15, 8, 4, 1, 11, 7, 0};
    std::vector<float> out(input->elementsSize() * top);

    // inference
    void const *inputs[]{ins0.data(), ins1.data()};
    void *outputs[]{out.data()};
    routine(res, nullptr, inputs, outputs);

    // check every scattered row: there is one per position, i.e. seq * top rows
    auto const rowLen = static_cast<size_t>(hid);
    for (size_t i = 0; i < ins1.size(); ++i) {
        auto const row = static_cast<size_t>(ins1[i] / top);
        for (size_t j = 0; j < rowLen; ++j) {
            EXPECT_EQ(ins0[row * rowLen + j], out[i * rowLen + j]);
        }
    }
}
37 changes: 37 additions & 0 deletions src/05computation/include/computation/operators/moe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef COMPUTATION_MOE_H
#define COMPUTATION_MOE_H

#include "../operator.h"

namespace refactor::computation {

struct AssignPos final : public Operator {
uint32_t topk,numExperts;

constexpr explicit AssignPos(uint32_t topk, uint32_t numExperts) noexcept : Operator(),
topk(topk), numExperts(numExperts){}

static size_t typeId() noexcept;
size_t opTypeId() const noexcept final;
std::string_view name() const noexcept final;
kernel::CollectorBox candidateKernels(Target) const final;
std::string serialize() const noexcept final;
};

struct Reorder final : public Operator {
bool scatter;
uint32_t topk;

constexpr explicit Reorder(bool scatter, uint32_t topk) noexcept : Operator(),
scatter(scatter), topk(topk){}

static size_t typeId() noexcept;
size_t opTypeId() const noexcept final;
std::string_view name() const noexcept final;
kernel::CollectorBox candidateKernels(Target) const final;
std::string serialize() const noexcept final;
};

}// namespace refactor::computation

#endif// COMPUTATION_MOE_H
34 changes: 34 additions & 0 deletions src/05computation/src/operators/moe.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#include "computation/operators/moe.h"
#include "kernel/collectors/moe.h"

namespace refactor::computation {

/// Identifies this operator type by the address of a function-local static.
size_t AssignPos::typeId() noexcept {
    static uint8_t anchor = 1;
    return reinterpret_cast<size_t>(&anchor);
}
/// Virtual accessor for the id returned by the static `typeId()`.
size_t AssignPos::opTypeId() const noexcept {
    return AssignPos::typeId();
}
/// Name of this operator.
std::string_view AssignPos::name() const noexcept {
    return "moe::AssignPos";
}
auto AssignPos::candidateKernels(Target target) const -> kernel::CollectorBox {
using Collector_ = kernel::AssignPosCollector;
return std::make_unique<Collector_>(target, topk, numExperts);
}
/// Textual form of the operator.
/// NOTE(review): `topk` and `numExperts` are not part of the text - confirm this is intended.
std::string AssignPos::serialize() const noexcept {
    return "moe::AssignPos()";
}

/// Identifies this operator type by the address of a function-local static.
size_t Reorder::typeId() noexcept {
    static uint8_t anchor = 1;
    return reinterpret_cast<size_t>(&anchor);
}
/// Virtual accessor for the id returned by the static `typeId()`.
size_t Reorder::opTypeId() const noexcept {
    return Reorder::typeId();
}
/// Name of this operator.
std::string_view Reorder::name() const noexcept {
    return "moe::Reorder";
}
auto Reorder::candidateKernels(Target target) const -> kernel::CollectorBox {
using Collector_ = kernel::ReorderCollector;
return std::make_unique<Collector_>(target, scatter, topk);
}
/// Textual form of the operator.
/// NOTE(review): `scatter` and `topk` are not part of the text - confirm this is intended.
std::string Reorder::serialize() const noexcept {
    return "moe::Reorder()";
}

}// namespace refactor::computation
Loading

0 comments on commit 5b33afd

Please sign in to comment.