add assign_pos\scatter operator and cpu kernel for moe.

InfiniTensor · Mar 20, 2024 · 5b33afd · 5b33afd
1 parent de18a1b
commit 5b33afd
Show file tree

Hide file tree

Showing 17 changed files with 565 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -106,4 +106,5 @@ add_subdirectory(src/06frontend)
 add_subdirectory(src/07onnx)
 add_subdirectory(src/08communication)
 add_subdirectory(src/08-01llm)
+add_subdirectory(src/08-02moe)
 add_subdirectory(src/09python_ffi)
diff --git a/src/04kernel/include/kernel/attributes/moe_info.h b/src/04kernel/include/kernel/attributes/moe_info.h
@@ -0,0 +1,24 @@
+#ifndef KERNEL_MOE_INFO_H
+#define KERNEL_MOE_INFO_H
+
+#include "../tensor.h"
+
+namespace refactor::kernel {
+
+    struct AssignPosInfo { // Parameters consumed by the AssignPos kernel.
+        uint32_t top, expert_num; // `top` is stored as given; `expert_num` sizes the per-expert counters in the CPU kernel.
+        uint32_t elementSize; // Set from gate.elementsSize() by the constructor.
+
+        AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate);        // Defined in moe_info.cc.
+    };
+
+    struct ReorderInfo{ // Parameters consumed by the Reorder kernel.
+        bool scatter;  // Selects which of the two copy directions the CPU kernel performs.
+        uint32_t top; // Divisor applied to pos[i] in the scatter branch of the CPU kernel.
+        uint32_t blockNum, blockSize; // blockNum: elementsSize() of inputs[1]; blockSize: strides()[0] of inputs[0].
+        ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs); // Defined in moe_info.cc.
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_MOE_INFO_H
diff --git a/src/04kernel/include/kernel/collectors/moe.h b/src/04kernel/include/kernel/collectors/moe.h
@@ -0,0 +1,29 @@
+#ifndef KERNEL_MOE_H
+#define KERNEL_MOE_H
+
+#include "../collector.h"
+
+namespace refactor::kernel {
+
+    struct AssignPosCollector final : public InfoCollector { // Collects candidate kernels for the AssignPos operator.
+        uint32_t topk,numExperts; // Passed to AssignPosInfo as (top, expert_num) inside filter().
+        constexpr AssignPosCollector(decltype(_target) target, uint32_t topk, uint32_t numExperts) noexcept
+            : InfoCollector(target) ,topk(topk), numExperts(numExperts){}
+
+        std::vector<KernelBox>
+        filter(TensorRefs inputs, TensorRefs outputs) const final; // Returns the kernels built for `_target`.
+    };
+
+    struct ReorderCollector final : public InfoCollector { // Collects candidate kernels for the Reorder operator.
+        bool scatter; // Passed through to ReorderInfo::scatter.
+        uint32_t topk; // Passed through to ReorderInfo::top.
+        constexpr ReorderCollector(decltype(_target) target, bool scatter, uint32_t topk) noexcept
+            : InfoCollector(target) ,scatter(scatter), topk(topk){}
+
+        std::vector<KernelBox>
+        filter(TensorRefs inputs, TensorRefs outputs) const final; // Returns the kernels built for `_target`.
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_MOE_H
diff --git a/src/04kernel/src/attributes/moe_info.cc b/src/04kernel/src/attributes/moe_info.cc
@@ -0,0 +1,13 @@
+#include "kernel/attributes/moe_info.h"
+#include <numeric>
+
+namespace refactor::kernel {
+
+AssignPosInfo::AssignPosInfo(uint32_t top, uint32_t expert_num, Tensor const &gate):\
+    top(top), expert_num(expert_num),elementSize(gate.elementsSize()){}      // Copies top/expert_num and records the gate tensor's elementsSize().
+
+ReorderInfo::ReorderInfo(bool scatter, uint32_t top, TensorRefs inputs):\
+    scatter(scatter), top(top),blockNum(inputs[1].get().elementsSize()), blockSize(inputs[0].get().strides()[0]){}  // blockNum = elementsSize() of inputs[1]; blockSize = strides()[0] of inputs[0]. Requires at least two inputs.
+
+
+}
diff --git a/src/04kernel/src/collectors/moe.cc b/src/04kernel/src/collectors/moe.cc
@@ -0,0 +1,51 @@
+#include "kernel/collectors/moe.h"
+#include "../kernels/moe/cpu_kernel.hh"
+#include "kernel/attributes/moe_info.h"
+
+namespace refactor::kernel {
+
+    std::vector<KernelBox>
+    AssignPosCollector::filter(TensorRefs inputs, TensorRefs outputs) const { // Builds the AssignPos kernel candidates for `_target`; `outputs` is not read.
+        AssignPosInfo info(topk, numExperts, inputs[0]); // inputs[0] is passed as the gate tensor.
+        std::vector<KernelBox> ans;
+        switch (_target) {
+            case decltype(_target)::Cpu:
+                if (auto ptr = AssignPosCpu::build(info); ptr) { // Keep the kernel only when build() returned non-null.
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            // TODO: for now, use the CPU implementation
+            case decltype(_target)::Nvidia:
+                if (auto ptr = AssignPosCpu::build(info); ptr) { // Same CPU kernel as the Cpu case.
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            default:
+                UNREACHABLEX(void, "Unknown target");
+        }
+        return ans;
+    }
+
+    std::vector<KernelBox>
+    ReorderCollector::filter(TensorRefs inputs, TensorRefs outputs) const { // Builds the Reorder kernel candidates for `_target`; `outputs` is not read.
+        ReorderInfo info(scatter, topk, inputs); // ReorderInfo reads inputs[0] and inputs[1].
+        std::vector<KernelBox> ans;
+        switch (_target) {
+            case decltype(_target)::Cpu:
+                if (auto ptr = ReorderCpu::build(info); ptr) { // Keep the kernel only when build() returned non-null.
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            // TODO: for now, use the CPU implementation
+            case decltype(_target)::Nvidia:
+                if (auto ptr = ReorderCpu::build(info); ptr) { // Same CPU kernel as the Cpu case.
+                    ans.emplace_back(std::move(ptr));
+                }
+                break;
+            default:
+                UNREACHABLEX(void, "Unknown target");
+        }
+        return ans;
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/moe/cpu_kernel.cc b/src/04kernel/src/kernels/moe/cpu_kernel.cc
@@ -0,0 +1,83 @@
+#include "cpu_kernel.hh"
+#include <execution>
+#include <list>
+
+namespace refactor::kernel {
+
+    AssignPosCpu::AssignPosCpu(AssignPosInfo info) noexcept
+        : Kernel(), info(std::move(info)) {} // Stores the given AssignPosInfo in the kernel.
+
+    auto AssignPosCpu::build(AssignPosInfo info) noexcept -> KernelBox { // Factory: wraps `info` in a newly allocated AssignPosCpu.
+        return std::make_unique<AssignPosCpu>(std::move(info));
+    }
+    auto AssignPosCpu::typeId() noexcept -> size_t { // Identifier for this kernel type.
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID); // The address of the function-local static serves as the id.
+    }
+
+    auto AssignPosCpu::kernelTypeId() const noexcept -> size_t { // Virtual accessor returning the same value as the static typeId().
+        return typeId();
+    }
+    auto AssignPosCpu::description() const noexcept -> std::string_view { // Human-readable description of this kernel.
+        return "Performing AssignPos operation on generic cpu";
+    }
+
+    auto AssignPosCpu::lower(Resources &) const noexcept -> RoutineWorkspace { // Returns a routine that counts elements per expert and writes element indices grouped by expert.
+        using namespace runtime;
+        return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { // `workspace` is not used.
+            auto gate = reinterpret_cast<uint8_t const *>(inputs[0]); // inputs[0]: expert index per element, read as one byte each.
+
+            auto expert_cnt = reinterpret_cast<uint8_t*>(outputs[0]);// outputs[0]: per-expert counts. NOTE(review): buffers are accessed as uint8_t here, but ReorderCpu reads its pos input as uint32_t - confirm the intended element type.
+            auto pos = reinterpret_cast<uint8_t*>(outputs[1]); // outputs[1]: element indices grouped by expert; values above 255 do not fit in uint8_t.
+            std::memset(expert_cnt, 0, info.expert_num); // Zero expert_num one-byte counters.
+            for (size_t i = 0; i < info.elementSize; i ++){
+                ASSERT (gate[i] >= 0 && gate[i] < info.expert_num, "gate exceeds expert idx scope!");
+                expert_cnt[gate[i]] ++;
+            }
+            std::vector<uint8_t> expert_accumlate;
+            expert_accumlate.assign(info.expert_num, 0);
+            for (size_t i=0; i<expert_accumlate.size(); ++i){ // Inclusive prefix sum of the counts: end offset of each expert's segment.
+                expert_accumlate[i] = (i==0) ? expert_cnt[i] : (expert_accumlate[i-1] + expert_cnt[i]);
+            }
+
+            for (size_t i=0; i< info.elementSize; ++i){ // Fill each segment from its end backwards, so indices within one expert end up in descending order.
+                pos[--expert_accumlate[gate[i]]] = i;
+            }          
+        };
+    }
+
+
+    ReorderCpu::ReorderCpu(ReorderInfo info) noexcept
+        : Kernel(), info(std::move(info)) {} // Stores the given ReorderInfo in the kernel.
+
+    auto ReorderCpu::build(ReorderInfo info) noexcept -> KernelBox { // Factory: wraps `info` in a newly allocated ReorderCpu.
+        return std::make_unique<ReorderCpu>(std::move(info));
+    }
+    auto ReorderCpu::typeId() noexcept -> size_t { // Identifier for this kernel type.
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID); // The address of the function-local static serves as the id.
+    }
+
+    auto ReorderCpu::kernelTypeId() const noexcept -> size_t { // Virtual accessor returning the same value as the static typeId().
+        return typeId();
+    }
+    auto ReorderCpu::description() const noexcept -> std::string_view { // Human-readable description; the text names scatter although lower() handles both values of info.scatter.
+        return "Performing scatter operation on generic cpu";
+    }
+
+    auto ReorderCpu::lower(Resources &) const noexcept -> RoutineWorkspace { // Returns a routine that copies blocks of blockSize floats between input and output according to pos.
+        using namespace runtime;
+        return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { // `workspace` is not used.
+            auto input = reinterpret_cast<float const *>(inputs[0]); // inputs[0]: source data, read as float.
+            auto pos = reinterpret_cast<uint32_t const *>(inputs[1]); // inputs[1]: one uint32_t position per block.
+            auto dstVal = reinterpret_cast<float*>(outputs[0]);// outputs[0]: destination data, written as float. NOTE(review): element type is fixed to float here - confirm other dtypes are not expected.
+
+            for(size_t i = 0; i<info.blockNum; i++){ // No bounds checks on pos[i]; callers must supply in-range positions.
+                if (info.scatter)
+                    std::copy_n(input + (pos[i]/info.top) * info.blockSize, info.blockSize, dstVal + i*info.blockSize); // scatter: output block i <- input block pos[i] / top.
+                else 
+                    std::copy_n(input + i*info.blockSize, info.blockSize, dstVal + pos[i] * info.blockSize); // otherwise: output block pos[i] <- input block i.
+            }            
+        };
+    }
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/moe/cpu_kernel.hh b/src/04kernel/src/kernels/moe/cpu_kernel.hh
@@ -0,0 +1,35 @@
+#ifndef KERNEL_MOE_CPU_KERNEL_HH
+#define KERNEL_MOE_CPU_KERNEL_HH
+
+#include "kernel/attributes/moe_info.h"
+#include "kernel/kernel.h"
+
+namespace refactor::kernel {
+
+    struct AssignPosCpu final : public Kernel { // CPU kernel for the AssignPos operation.
+        AssignPosInfo info; // Parameters captured by the routine produced in lower().
+        explicit AssignPosCpu(AssignPosInfo info) noexcept;
+
+        static KernelBox build(AssignPosInfo info) noexcept; // Factory used by AssignPosCollector::filter.
+        static size_t typeId() noexcept; // Identifier for this kernel type.
+
+        size_t kernelTypeId() const noexcept final; // Returns typeId().
+        std::string_view description() const noexcept final;
+        RoutineWorkspace lower(Resources &) const noexcept final; // Produces the executable routine.
+    };
+
+    struct ReorderCpu final : public Kernel { // CPU kernel for the Reorder operation.
+        ReorderInfo info; // Parameters captured by the routine produced in lower().
+        explicit ReorderCpu(ReorderInfo info) noexcept;
+
+        static KernelBox build(ReorderInfo info) noexcept; // Factory used by ReorderCollector::filter.
+        static size_t typeId() noexcept; // Identifier for this kernel type.
+
+        size_t kernelTypeId() const noexcept final; // Returns typeId().
+        std::string_view description() const noexcept final;
+        RoutineWorkspace lower(Resources &) const noexcept final; // Produces the executable routine.
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_MOE_CPU_KERNEL_HH
diff --git a/src/04kernel/test/kernels/moe/test_cpu.cpp b/src/04kernel/test/kernels/moe/test_cpu.cpp
@@ -0,0 +1,75 @@
+#include "../../../src/kernels/moe/cpu_kernel.hh"
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace refactor;
+using namespace kernel;
+
+TEST(kernel, AssignPosCpu) { // Runs the AssignPos CPU routine on 16 gate values over 4 experts and checks the counts and positions.
+    // build routine    
+    //auto inputTensor = Tensor::share(DataType::F32, Shape{4, 1024});
+    auto gate = Tensor::share(DataType::U32, Shape{8, 2});
+    auto expert_cnt = Tensor::share(DataType::U32, Shape{4});
+    auto pos = Tensor::share(DataType::U32, Shape{16});
+
+    auto kernel = AssignPosCpu::build(AssignPosInfo(2,4, *gate)); // top = 2, expert_num = 4
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<uint8_t> ins = {3,2, 0,1, 2,1, 1,3, 2,0, 1,3, 1,0, 1,2}; // NOTE(review): buffers hold uint8_t although the tensors above are declared U32; this matches the byte-wise access in the CPU kernel.
+    std::vector<uint8_t>  out0(expert_cnt->elementsSize());
+    std::vector<uint8_t> out1(pos->elementsSize());
+
+    // inference
+    void const *inputs[]{ins.data()};
+    void *outputs[]{out0.data(), out1.data()};
+    routine(res, nullptr, inputs, outputs);    // no workspace is passed
+
+    // check
+    std::vector<uint32_t> expectExpertCnt = {3,6,4,3}; // occurrences of experts 0..3 in `ins`
+    std::vector<uint32_t> expectPos = {13,9,2, 14,12,10,6,5,3, 15,8,4,1, 11,7,0}; // indices grouped by expert, descending within each group
+    //std::for_each(out0.begin(), out0.end(),[](const float &val){std::cout<<val<<" ";});
+
+    for(size_t i=0;i< expectPos.size(); ++i){
+        EXPECT_EQ(expectPos[i], out1[i]);
+    }
+    for(size_t i=0;i< expectExpertCnt.size(); ++i){
+        EXPECT_EQ(expectExpertCnt[i], out0[i]);
+    }
+}
+
+TEST(kernel, ReorderScatterCpu) { // Runs the Reorder CPU routine in scatter mode and checks every output row.
+    // build routine    
+    const int seq = 8, hid = 4, top = 2;
+    auto input = Tensor::share(DataType::U32, Shape{seq, hid});
+    auto pos = Tensor::share(DataType::U32, Shape{seq * top});
+    std::vector<Arc<Tensor>> inputTensors{input, pos};
+    TensorRefs inputs_;
+    inputs_.reserve(inputTensors.size());
+    std::transform(inputTensors.begin(), inputTensors.end(),
+                   std::back_inserter(inputs_),
+                   [](auto const &it) { return std::cref(*it); });
+
+    auto kernel = ReorderCpu::build(ReorderInfo(true, top, inputs_)); // scatter = true
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<float>  ins0(input->elementsSize());
+    std::iota(ins0.begin(), ins0.end(), 0);
+    std::vector<uint32_t> ins1 = {13,9,2, 14,12,10,6,5,3, 15,8,4,1, 11,7,0};
+    std::vector<float>  out(input->elementsSize() * top); // seq * top rows of hid floats
+
+    // inference
+    void const *inputs[]{ins0.data(), ins1.data()};
+    void *outputs[]{out.data()};
+    routine(res, nullptr, inputs, outputs);    
+    std::for_each(out.begin(), out.end(),[](const float &val){std::cout<<val<<" ";});
+    // check: the kernel writes one row per position (seq * top rows), so verify all of them,
+    // not only the first seq; output row i must equal input row pos[i] / top
+    for(size_t i=0;i< ins1.size(); ++i){
+        size_t row = ins1[i]/top;
+        for(size_t j = 0; j<static_cast<size_t>(hid); j++)
+            EXPECT_EQ(ins0[row *hid + j], out[i*hid + j]);
+    }
+}
diff --git a/src/05computation/include/computation/operators/moe.h b/src/05computation/include/computation/operators/moe.h
@@ -0,0 +1,37 @@
+#ifndef COMPUTATION_MOE_H
+#define COMPUTATION_MOE_H
+
+#include "../operator.h"
+
+namespace refactor::computation {
+
+    struct AssignPos final : public Operator { // Computation-graph operator for MoE position assignment.
+        uint32_t topk,numExperts; // Forwarded to kernel::AssignPosCollector by candidateKernels().
+
+        constexpr explicit AssignPos(uint32_t topk, uint32_t numExperts) noexcept : Operator(), 
+            topk(topk), numExperts(numExperts){}
+
+        static size_t typeId() noexcept; // Identifier for this operator type.
+        size_t opTypeId() const noexcept final; // Returns typeId().
+        std::string_view name() const noexcept final; // Returns "moe::AssignPos".
+        kernel::CollectorBox candidateKernels(Target) const final; // Creates the kernel collector for the given target.
+        std::string serialize() const noexcept final;
+    };
+
+     struct Reorder final : public Operator { // Computation-graph operator for MoE block reordering.
+        bool scatter; // Forwarded to kernel::ReorderCollector by candidateKernels().
+        uint32_t topk; // Forwarded to kernel::ReorderCollector by candidateKernels().
+
+        constexpr explicit Reorder(bool scatter, uint32_t topk) noexcept : Operator(), 
+            scatter(scatter), topk(topk){}
+
+        static size_t typeId() noexcept; // Identifier for this operator type.
+        size_t opTypeId() const noexcept final; // Returns typeId().
+        std::string_view name() const noexcept final; // Returns "moe::Reorder".
+        kernel::CollectorBox candidateKernels(Target) const final; // Creates the kernel collector for the given target.
+        std::string serialize() const noexcept final;
+    };
+
+}// namespace refactor::computation
+
+#endif// COMPUTATION_MOE_H
diff --git a/src/05computation/src/operators/moe.cc b/src/05computation/src/operators/moe.cc
@@ -0,0 +1,34 @@
+#include "computation/operators/moe.h"
+#include "kernel/collectors/moe.h"
+
+namespace refactor::computation {
+
+    auto AssignPos::typeId() noexcept -> size_t { // Identifier for this operator type.
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID); // The address of the function-local static serves as the id.
+    }
+    auto AssignPos::opTypeId() const noexcept -> size_t { return typeId(); } // Virtual accessor for the static type id.
+    auto AssignPos::name() const noexcept -> std::string_view { return "moe::AssignPos"; } // Operator name string.
+    auto AssignPos::candidateKernels(Target target) const -> kernel::CollectorBox { // Creates the collector that builds AssignPos kernels for `target`.
+        using Collector_ = kernel::AssignPosCollector;
+        return std::make_unique<Collector_>(target, topk, numExperts); // Passes this operator's topk and numExperts along.
+    }
+    auto AssignPos::serialize() const noexcept -> std::string { // Returns a fixed textual form of the operator.
+        return "moe::AssignPos()"; // NOTE(review): topk and numExperts are not included in the string - confirm that is intended.
+    }
+
+    auto Reorder::typeId() noexcept -> size_t { // Identifier for this operator type.
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID); // The address of the function-local static serves as the id.
+    }
+    auto Reorder::opTypeId() const noexcept -> size_t { return typeId(); } // Virtual accessor for the static type id.
+    auto Reorder::name() const noexcept -> std::string_view { return "moe::Reorder"; } // Operator name string.
+    auto Reorder::candidateKernels(Target target) const -> kernel::CollectorBox { // Creates the collector that builds Reorder kernels for `target`.
+        using Collector_ = kernel::ReorderCollector;
+        return std::make_unique<Collector_>(target, scatter, topk); // Passes this operator's scatter flag and topk along.
+    }
+    auto Reorder::serialize() const noexcept -> std::string { // Returns a fixed textual form of the operator.
+        return "moe::Reorder()"; // NOTE(review): scatter and topk are not included in the string - confirm that is intended.
+    }
+
+}// namespace refactor::computation