一 网络聚合通信测试
以下测试用例为:
整集群测试,每节点进程数从2开始以2的幂次增加至满核心;
测试常见的8个聚合通信测试项目
二 测试前准备
- 待测节点已完成OS安装及基础配置
- 待测节点已配置完IP(若存在IB,则需要配置IB IP)
- 待测节点做完互信操作
- 所有节点具有共享存储
- 编译安装osu_benchmark测试工具至共享目录
三 测试
- 上传测试脚本至osu_benchmark测试工具目录,如:
/share/opt/osu/libexec/osu-micro-benchmarks/mpi/collective/
- 在该目录创建nodelist文件并填入待测节点IP
vim nodelist
10.186.121.102
10.186.121.103
10.186.121.104
10.186.121.105
10.186.121.106
10.186.121.107
10.186.121.108
10.186.121.109
10.186.121.110
........
- 在该目录创建processlist文件并填入测试进程数
#假设每节点总核数为64,从2开始,以2的幂次增加
vim processlist
2
4
8
16
32
64
- 为脚本赋予执行权限
chmod +x osu_batch_test.sh
- 执行脚本进行测试
[root@linux ~]# bash osu_batch_test.sh
===============================================
>>> Please choose a number to continue:
1 osu_allgather
2 osu_allreduce
3 osu_alltoall
4 osu_barrier
5 osu_bcast
6 osu_gather
7 osu_reduce
8 osu_scatter
9 Exit
>>>input number>>>
- 执行完成后,所有日志会保存在当前目录下的log文件夹中
四 脚本
#!/bin/bash
#
# osu_batch_test.sh - interactive batch driver for the OSU collective
# micro-benchmarks.
#
# Usage: run from the directory that holds the osu_* collective binaries.
#   nodelist     one host/IP per line; passed to mpirun as the hostfile
#   processlist  one processes-per-node value per line (first field is used)
# Lines starting with '#' and blank lines are ignored in both files.
#
# A menu selects one benchmark; it is then run once per processlist entry
# and the output is appended to log/<node count>nodes_<benchmark>.log.

# Fail on unset variables so a misspelled name cannot silently become "".
set -u

current_dir=$(pwd)
node_file="${current_dir}/nodelist"
proc_file="${current_dir}/processlist"
log_dir="${current_dir}/log"
# Value handed to every benchmark through its -m option.
size=65536

if [[ ! -f "$node_file" || ! -f "$proc_file" ]]; then
  echo -e "Error: Nodes file ${node_file} or Process file ${proc_file} is not exist." >&2
  exit 1
fi

if ! mkdir -p "$log_dir"; then
  echo "Error: cannot create log directory ${log_dir}." >&2
  exit 1
fi

# Collect the processes-per-node values straight into an array (no temporary
# files). Only the first whitespace-separated field of each line is kept.
proc_list=()
while read -r entry _ || [[ -n "$entry" ]]; do
  if [[ -z "$entry" || "$entry" == \#* ]]; then
    continue
  fi
  proc_list+=("$entry")
done < "$proc_file"
count=${#proc_list[@]}

# Number of usable node lines; only used to name the log files.
nodenum=$(grep -c -v -e '^#' -e '^[[:space:]]*$' "$node_file")

if (( count == 0 || nodenum == 0 )); then
  echo -e "Warning: Nodes file ${node_file} or process file ${proc_file} is empty, skip." >&2
  exit 1
fi

#######################################
# Run one OSU collective benchmark once per entry of proc_list.
# Globals:   proc_list, node_file, current_dir, log_dir, nodenum, size (read)
# Arguments: $1 - benchmark binary name, e.g. osu_allgather
# Outputs:   progress on stdout; benchmark stdout appended to the log file
#######################################
run_osu_test() {
  local bench=$1
  local log="${log_dir}/${nodenum}nodes_${bench}.log"
  local ppn

  echo -e "\n>>> Start to test ${bench} :"
  echo -e "--------------------------------------------------------------------------------------"
  for ppn in "${proc_list[@]}"; do
    echo -e "\n>>> Start to test ppn=${ppn} :"
    echo -e "\n>>>> ppn=${ppn}" >> "$log"
    # A failed run is reported but does not stop the remaining ppn values.
    if ! mpirun -ppn "$ppn" -hostfile "$node_file" \
        "${current_dir}/${bench}" -m "$size" >> "$log"; then
      echo "Warning: ${bench} with ppn=${ppn} exited with an error." >&2
    fi
    sleep 2
  done
  echo "the current test time is $(date +%Y-%m-%d-%H%M%S)" >> "$log"
}

# One entry point per benchmark, kept under their original names.
test_osu_allgather() { run_osu_test osu_allgather; }
test_osu_allreduce() { run_osu_test osu_allreduce; }
test_osu_alltoall()  { run_osu_test osu_alltoall; }
# NOTE(review): -m is passed to osu_barrier as well, exactly as before;
# confirm the installed OSU version accepts it for the barrier test.
test_osu_barrier()   { run_osu_test osu_barrier; }
test_osu_bcast()     { run_osu_test osu_bcast; }
test_osu_gather()    { run_osu_test osu_gather; }
test_osu_reduce()    { run_osu_test osu_reduce; }
test_osu_scatter()   { run_osu_test osu_scatter; }

# Main: print the menu and dispatch until the user chooses Exit.
echo -e "==============================================="
while :; do
  echo -e "\n>>> Please choose a number to continue:"
  echo -e "1 osu_allgather"
  echo -e "2 osu_allreduce"
  echo -e "3 osu_alltoall"
  echo -e "4 osu_barrier"
  echo -e "5 osu_bcast"
  echo -e "6 osu_gather"
  echo -e "7 osu_reduce"
  echo -e "8 osu_scatter"
  echo -e "9 Exit"

  # Stop on end-of-input instead of looping forever on an empty answer.
  if ! read -r -p ">>>input number>>> " nu; then
    echo -e "\n>>> exit"
    exit 0
  fi

  case "$nu" in
    1) test_osu_allgather ;;
    2) test_osu_allreduce ;;
    3) test_osu_alltoall ;;
    4) test_osu_barrier ;;
    5) test_osu_bcast ;;
    6) test_osu_gather ;;
    7) test_osu_reduce ;;
    8) test_osu_scatter ;;
    9)
      echo -e "\n>>> exit"
      exit 0
      ;;
    *) echo -e "\033[41;37m unsupported input. \033[0m" ;;
  esac
done
日常总结,一起学习进步