环境准备
见https://gitee.com/zaj1414904389/ascend-tutorial.git
工程创建
CANN软件包中提供了工程创建工具msopgen,开发者可以输入算子原型定义文件生成Ascend C算子开发工程
[{"op": "AddCustom","input_desc": [{"name": "x","param_type": "required","format": ["ND"],"type": ["fp16"]},{"name": "y","param_type": "required","format": ["ND"],"type": ["fp16"]}],"output_desc": [{"name": "z","param_type": "required","format": ["ND"],"type": ["fp16"]}]}
]
使用msopgen工具生成AddCustom算子的开发工程。
执行以下命令
/usr/local/Ascend/ascend-toolkit/8.0.RC1.alpha002/python/site-packages/bin/msopgen gen -i /home/ma-user/add_custom.json -c ai_core-Ascend910A -lan cpp -out /home/ma-user/AddCustom
生成代码目录
(MindSpore) [root@edbdd54b26c74c17b9ddfb1308c88382-task0-0 AddCustom]# tree -L 2
.
AddCustom
├── build.sh // 编译入口脚本
├── cmake
│ ├── config.cmake
│ ├── util // 算子工程编译所需脚本及公共编译文件存放目录
├── CMakeLists.txt // 算子工程的CMakeLists.txt
├── CMakePresets.json // 编译配置项
├── framework // 算子插件实现文件目录,单算子模型文件的生成不依赖算子适配插件,无需关注
├── op_host // host侧实现文件
│ ├── add_custom_tiling.h // 算子tiling定义文件
│ ├── add_custom.cpp // 算子原型注册、shape推导、信息库、tiling实现等内容文件
│ ├── CMakeLists.txt
├── op_kernel // kernel侧实现文件
│ ├── CMakeLists.txt
│ ├── add_custom.cpp // 算子核函数实现文件
├── scripts // 自定义算子工程打包相关脚本所在目录
算子核函数实现
算子核函数实现代码的内部调用关系
AddCustom/op_kernel/add_custom.cpp完整代码
#include "kernel_operator.h"
using namespace AscendC;

// Double buffering: each queue holds two buffers so data movement and compute overlap.
constexpr int32_t BUFFER_NUM = 2;

class KernelAdd {
public:
    __aicore__ inline KernelAdd() {}

    // Initialization: set up per-core global-memory views and allocate queue buffers.
    // totalLength: total element count across all cores (from TilingData).
    // tileNum: number of tiles processed per core (before double buffering).
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
    {
        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
        // Elements handled by this core.
        this->blockLength = totalLength / GetBlockNum();
        this->tileNum = tileNum;
        ASSERT(tileNum != 0 && "tile num can not be zero!");
        // Each per-core tile is halved again for double buffering.
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
        // Offset each core's global-memory view by its block index.
        xGm.SetGlobalBuffer((__gm__ DTYPE_X*)x + this->blockLength * GetBlockIdx(), this->blockLength);
        yGm.SetGlobalBuffer((__gm__ DTYPE_Y*)y + this->blockLength * GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ DTYPE_Z*)z + this->blockLength * GetBlockIdx(), this->blockLength);
        // Reserve queue storage through the pipe memory manager.
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z));
    }

    // Core processing: drive the CopyIn -> Compute -> CopyOut three-stage pipeline
    // once per tile (tiles are doubled by BUFFER_NUM).
    __aicore__ inline void Process()
    {
        const int32_t loopCount = this->tileNum * BUFFER_NUM;
        for (int32_t idx = 0; idx < loopCount; idx++) {
            CopyIn(idx);
            Compute(idx);
            CopyOut(idx);
        }
    }

private:
    // Stage 1: copy one tile of each input from global memory into the VECIN queues.
    __aicore__ inline void CopyIn(int32_t progress)
    {
        // Allocate input tensors from the queues.
        LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
        LocalTensor<DTYPE_Y> yLocal = inQueueY.AllocTensor<DTYPE_Y>();
        // Copy GlobalTensor data into the LocalTensors.
        DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
        // Publish the LocalTensors to the VECIN queues.
        inQueueX.EnQue(xLocal);
        inQueueY.EnQue(yLocal);
    }

    // Stage 2: element-wise add of one tile; result is published to the VECOUT queue.
    __aicore__ inline void Compute(int32_t progress)
    {
        // Take the staged inputs out of the queues.
        LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
        LocalTensor<DTYPE_Y> yLocal = inQueueY.DeQue<DTYPE_Y>();
        // Allocate the output tensor from its queue.
        LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
        // Element-wise add over this tile.
        Add(zLocal, xLocal, yLocal, this->tileLength);
        // Publish the result to the VECOUT queue.
        outQueueZ.EnQue<DTYPE_Z>(zLocal);
        // Release the consumed input tensors.
        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);
    }

    // Stage 3: copy one result tile back to global memory and release it.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        // Take the finished tile out of the VECOUT queue.
        LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
        // Copy the LocalTensor back into the GlobalTensor.
        DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
        // Release the no-longer-needed LocalTensor.
        outQueueZ.FreeTensor(zLocal);
    }

private:
    TPipe pipe;                                                  // pipe memory manager
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;     // input queues (VECIN position)
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;             // output queue (VECOUT position)
    GlobalTensor<DTYPE_X> xGm;                                   // global-memory view of input x
    GlobalTensor<DTYPE_Y> yGm;                                   // global-memory view of input y
    GlobalTensor<DTYPE_Z> zGm;                                   // global-memory view of output z
    uint32_t blockLength;                                        // elements computed on this core
    uint32_t tileNum;                                            // tile count per core
    uint32_t tileLength;                                         // elements per tile (per buffer)
};

// Kernel entry point: deserialize host-side tiling, then run the Add pipeline.
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
{
    // Fetch the TilingData passed from the host side.
    GET_TILING_DATA(tilingData, tiling);
    KernelAdd op;
    op.Init(x, y, z, tilingData.totalLength, tilingData.tileNum);
    if (TILING_KEY_IS(1)) {
        // Run the core operator logic.
        op.Process();
    }
}
Host侧算子实现
核函数开发并验证完成后,下一步就是进行Host侧的实现,对应“AddCustom/op_host”目录下的add_custom_tiling.h文件与add_custom.cpp文件。
修改“add_custom_tiling.h”文件,在此文件中增加如下代码,进行Tiling参数的定义。
#ifndef ADD_CUSTOM_TILING_H
#define ADD_CUSTOM_TILING_H
#include "register/tilingdata_base.h"

namespace optiling {
// AddCustom uses two tiling fields: totalLength and tileNum.
BEGIN_TILING_DATA_DEF(TilingData)
    TILING_DATA_FIELD_DEF(uint32_t, totalLength);  // total number of elements to compute
    TILING_DATA_FIELD_DEF(uint32_t, tileNum);      // number of tiles per core
END_TILING_DATA_DEF;

// Bind this tiling-data struct to the AddCustom operator.
REGISTER_TILING_DATA_CLASS(AddCustom, TilingData)
}
#endif // ADD_CUSTOM_TILING_H
修改“add_custom.cpp”文件,进行Tiling的实现。
修改“TilingFunc”函数,实现Tiling上下文的获取,并通过上下文获取输入输出shape信息,并根据shape信息设置TilingData、序列化保存TilingData,并设置TilingKey。
namespace optiling {
const uint32_t BLOCK_DIM = 8;  // number of AI cores the work is split across
const uint32_t TILE_NUM = 8;   // tiles per core

// Tiling implementation: read the input shape from the context, fill in and
// serialize TilingData, set block dim, tiling key and workspace size.
static ge::graphStatus TilingFunc(gert::TilingContext* context)
{
    TilingData tiling;
    uint32_t totalLength = context->GetInputTensor(0)->GetShapeSize();
    context->SetBlockDim(BLOCK_DIM);
    tiling.set_totalLength(totalLength);
    tiling.set_tileNum(TILE_NUM);
    // Serialize the tiling data into the raw buffer consumed by the kernel.
    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
    context->SetTilingKey(1);
    // This operator needs no extra workspace memory.
    size_t* currentWorkspace = context->GetWorkspaceSizes(1);
    currentWorkspace[0] = 0;
    return ge::GRAPH_SUCCESS;
}
} // namespace optiling
修改“add_custom.cpp”文件中的算子原型注册,此函数为入口函数。
namespace ops {
class AddCustom : public OpDef {
public:explicit AddCustom(const char* name) : OpDef(name){ // Add算子的第一个输入this->Input("x").ParamType(REQUIRED) // 代表输入必选.DataType({ ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32 }) // 输入支持的数据类型.Format({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND }) // 输入支持的数据格式.UnknownShapeFormat({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND }); // 未知Shape情况下的Format的默认值// Add算子的第二个输入this->Input("y").ParamType(REQUIRED).DataType({ ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32 }).Format({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND }).UnknownShapeFormat({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND });this->Output("z").ParamType(REQUIRED).DataType({ ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32 }).Format({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND }).UnknownShapeFormat({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND });// 关联InferShape函数this->SetInferShape(ge::InferShape);// 关联Tiling函数this->AICore().SetTiling(optiling::TilingFunc);// 注册算子支持的AI处理器型号,请替换为实际支持的AI处理器型号this->AICore().AddConfig("ascend910");}
};
// 结束算子注册
OP_ADD(AddCustom);
} // namespace ops
算子工程编译部署
编译AddCustom工程,生成自定义算子安装包,并将其安装到算子库中。
修改CMakePresets.json中ASCEND_CANN_PACKAGE_PATH为CANN软件包安装路径。
{……"configurePresets": [{……"ASCEND_CANN_PACKAGE_PATH": {"type": "PATH","value": "/usr/local/Ascend/ascend-toolkit/latest" //请替换为CANN软件包安装后的实际路径},……}]
}
在算子工程AddCustom目录下执行如下命令,进行算子工程编译。
./build.sh
编译成功
start compile Ascend C operator AddCustom. kernel name is AddCustom_402e355eb717124771cfc7dbebfe946c
start compile Ascend C operator AddCustom. kernel name is AddCustom_ccd748392d99d04b8205210970fde2b9
start compile Ascend C operator AddCustom. kernel name is AddCustom_1e04ee05ab491cc5ae9c3d5c9ee8950b
compile Ascend C operator: AddCustom success!
compile Ascend C operator: AddCustom success!
compile Ascend C operator: AddCustom success!
[Ascend910A] Generating AddCustom_402e355eb717124771cfc7dbebfe946c Done
/usr/bin/gmake
[100%] Built target ascendc_bin_ascend910_add_custom_2
[Ascend910A] Generating AddCustom_ccd748392d99d04b8205210970fde2b9 Done
/usr/bin/gmake
[100%] Built target ascendc_bin_ascend910_add_custom_1
[Ascend910A] Generating AddCustom_1e04ee05ab491cc5ae9c3d5c9ee8950b Done
/usr/bin/gmake
[100%] Built target ascendc_bin_ascend910_add_custom_0
[100%] Built target ascendc_bin_ascend910_gen_ops_config
[100%] Built target binary
[ 7%] Built target modify_vendor
[ 15%] Built target ascendc_impl_gen
[ 38%] Built target cust_op_proto
[ 46%] Built target npu_supported_ops
[ 61%] Built target cust_tf_parsers
[ 76%] Built target cust_opapi
[ 84%] Built target ops_info_gen_ascend910
[100%] Built target cust_optiling
[100%] Built target gen_version_info
[100%] Built target optiling_compat
Run CPack packaging tool...
CPack: Create package using External
CPack: Install projects
CPack: - Run preinstall target for: opp
CPack: - Install project: opp []
CPack: Create package
定义算子安装包部署。
编译成功后,会在当前目录下创建build_out目录,并在build_out目录下生成自定义算子安装包custom_opp_<操作系统>_<CPU架构>.run,例如“custom_opp_ubuntu_x86_64.run”。
cd /home/ma-user/AddCustom/build_out
./custom_opp_euleros_aarch64.run
命令执行成功后,自定义算子包中的相关文件将部署至当前环境的OPP算子库的vendors/customize目录中。
(MindSpore) [root@edbdd54b26c74c17b9ddfb1308c88382-task0-0 AddCustom]# ll /home/ma-user/AddCustom/build_out/_CPack_Packages/Linux/External/custom_opp_euleros_aarch64.run/packages/vendors/customize/
total 20
drwxr-x--- 3 root root 4096 Jun 21 07:20 framework
drwxr-x--- 4 root root 4096 Jun 21 07:20 op_api
drwxr-x--- 3 root root 4096 Jun 21 07:20 op_impl
drwxr-x--- 4 root root 4096 Jun 21 07:20 op_proto
-rw-r--r-- 1 root root 42 Jun 21 07:20 version.info
算子ST测试
CANN开发套件包中提供了ST测试工具“msopst”,用于生成算子的ST测试用例并在硬件环境中执行。
创建算子ST测试用例定义文件“AddCustom_case.json”,例如存储到跟算子工程目录“AddCustom”同级别的“AddCustom_st”路径下。
“AddCustom_case.json”文件的样例如下,开发者可基于此文件定制修改。
/home/ma-user/AddCustom_st/AddCustom_case.json
[{"case_name": "Test_AddCustom_001", "op": "AddCustom", "input_desc": [ {"format": ["ND"],"type": ["float16"],"shape": [8,2048],"data_distribute": [ "uniform"],"value_range": [ [0.1,1.0]],"name": "x"},{"format": ["ND"],"type": ["float16"],"shape": [8,2048],"data_distribute": ["uniform"],"value_range": [[0.1,1.0]],"name": "y"}],"output_desc": [{"format": ["ND"],"type": ["float16"],"shape": [8,2048],"name": "z"}]}
]
配置ST测试用例执行时依赖的环境变量。
export DDK_PATH=/usr/local/Ascend/ascend-toolkit/latest
export NPU_HOST_LIB=/usr/local/Ascend/ascend-toolkit/latest/runtime/lib64/stub
进入msopst工具所在目录,执行如下命令生成并执行测试用例。
● 进入msopst工具所在目录。
cd /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/bin
● 生成测试用例文件并执行。
./msopst run -i /home/ma-user/AddCustom_st/AddCustom_case.json -soc Ascend910A -out /home/ma-user/AddCustom_st
输出结果
b'Result file append successfully.'
b'[ OK ] AddCustom.Test_AddCustom_001_case_001_ND_float16 ( 800.668 ms )'
b'[=========] Ran 1 tests. ( 800.73 ms total )'
b'[PASSED] 1 tests.'
b'[FAILED] 0 tests.'
2024-06-21 07:56:48 (99241) - [INFO] Testcase execute in Ascend910A, cost time: 1.817183 s.
2024-06-21 07:56:48 (99241) - [INFO] Finish to run /home/ma-user/AddCustom_st/20240621075616/AddCustom/run/out/main.
2024-06-21 07:56:48 (99241) - [INFO] Step:------>>>>>> Start to get result <<<<<<------
2024-06-21 07:56:48 (99241) - [INFO] Find result.txt in /home/ma-user/AddCustom_st/20240621075616/AddCustom/run/out/result_files/result.txt.
2024-06-21 07:56:48 (99241) - [INFO] Case 'Test_AddCustom_001_case_001_ND_float16' run successfully.
2024-06-21 07:56:48 (99241) - [INFO] Get result data in AiHost execute time: 0.000810 s
========================================================================
run command: ./msopst run -i /home/ma-user/AddCustom_st/AddCustom_case.json -soc Ascend910A -out /home/ma-user/AddCustom_st
------------------------------------------------------------------------
- test case count: 1
- success count: 1
- failed count: 0
------------------------------------------------------------------------
========================================================================2024-06-21 07:56:48 (99241) - [INFO] Process finished!
2024-06-21 07:56:48 (99241) - [INFO] The st report saved in: /home/ma-user/AddCustom_st/20240621075616/st_report.json.
附录
AddCustom/op_host/add_custom.cpp完整代码
#include "add_custom_tiling.h"
#include "register/op_def_registry.h"

namespace optiling {
const uint32_t BLOCK_DIM = 8;  // number of AI cores the work is split across
const uint32_t TILE_NUM = 8;   // tiles per core

// Tiling implementation: read the input shape from the context, fill in and
// serialize TilingData, set block dim, tiling key and workspace size.
static ge::graphStatus TilingFunc(gert::TilingContext* context)
{
    TilingData tiling;
    uint32_t totalLength = context->GetInputTensor(0)->GetShapeSize();
    context->SetBlockDim(BLOCK_DIM);
    tiling.set_totalLength(totalLength);
    tiling.set_tileNum(TILE_NUM);
    // Serialize the tiling data into the raw buffer consumed by the kernel.
    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
    context->SetTilingKey(1);
    // This operator needs no extra workspace memory.
    size_t* currentWorkspace = context->GetWorkspaceSizes(1);
    currentWorkspace[0] = 0;
    return ge::GRAPH_SUCCESS;
}
} // namespace optiling

namespace ge {
// Shape inference: the output z has the same shape as input x.
static ge::graphStatus InferShape(gert::InferShapeContext* context)
{
    const gert::Shape* x1_shape = context->GetInputShape(0);
    gert::Shape* y_shape = context->GetOutputShape(0);
    *y_shape = *x1_shape;
    return GRAPH_SUCCESS;
}
} // namespace ge

namespace ops {
// Operator prototype registration for AddCustom.
class AddCustom : public OpDef {
public:
    explicit AddCustom(const char* name) : OpDef(name)
    {
        // First input of the Add operator.
        this->Input("x")
            .ParamType(REQUIRED)                                                  // input is required
            .DataType({ ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32 })             // supported data types
            .Format({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND })              // supported data formats
            .UnknownShapeFormat({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND }); // default format for unknown shape
        // Second input of the Add operator.
        this->Input("y")
            .ParamType(REQUIRED)
            .DataType({ ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32 })
            .Format({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND })
            .UnknownShapeFormat({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND });
        // Output of the Add operator.
        this->Output("z")
            .ParamType(REQUIRED)
            .DataType({ ge::DT_FLOAT16, ge::DT_FLOAT, ge::DT_INT32 })
            .Format({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND })
            .UnknownShapeFormat({ ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND });
        // Attach the InferShape function.
        this->SetInferShape(ge::InferShape);
        // Attach the Tiling function.
        this->AICore().SetTiling(optiling::TilingFunc);
        // AI processor model supported by the operator; replace with the actual target.
        this->AICore().AddConfig("ascend910");
    }
};

OP_ADD(AddCustom);
} // namespace ops