MPI 学习-环境搭建及测试DEMO
- 1.网络配置
- 2.搭建NFS服务器,用于共享数据,使每个节点看到相同的内容
- 3.基于ubuntu22.04 docker容器,安装依赖(在计算节点上执行)
- 4.设置免密登录(在每个计算节点的容器里执行)
- 5.测试一:bash命令
- A.运行
- B.输出
- 6.测试二:简单收发测试
- A.代码
- B.编译运行
- C.输出
- 7.测试三:MPI_Allreduce
- A.代码:
- B.编译运行
- C.输出
- 8.参考文档:
本文演示了如何在二台物理机上通过MPI实现多机通信,为了不破坏系统环境,分别在二台物理机上运行ubuntu22.04容器
1.网络配置
| IP | 功能 |
| --- | --- |
| 192.168.1.100 | NFS服务器 |
| 192.168.1.101 | 计算节点0 |
| 192.168.1.102 | 计算节点1 |
2.搭建NFS服务器,用于共享数据,使每个节点看到相同的内容
apt install nfs-kernel-server -y
echo "/mnt/disk/mpi *(rw,sync,no_root_squash,no_subtree_check)" > /etc/exports
service nfs-kernel-server restart
3.基于ubuntu22.04 docker容器,安装依赖(在计算节点上执行)
# 创建容器
mkdir mpi
cd mpi
docker stop mpi
docker rm mpi
docker run -ti --privileged --net=host -v $PWD:/home -w /home --name mpi ubuntu:22.04 /bin/bash
docker start mpi
docker exec -ti mpi /bin/bash
# 更新apt源
sed -i "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list
sed -i "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list
apt update
# 安装依赖
apt install gcc g++ vim git wget curl unzip make -y
apt install -y pkg-config
apt install -y python3.10
apt install -y python3.10-dev
apt install -y python3-pip
apt install -y libsystemd*
apt install -y libabsl-dev
apt install -y libopencv-dev
apt install -y psmisc
apt install -y openssh-server
apt install -y gdb
apt install -y pciutils
apt install -y nfs-common
apt install -y openmpi-bin openmpi-doc libopenmpi-dev
# 设置ssh端口和密码(为避免跟host sshd冲突,修改了容器里sshd端口)
sed -i 's/^.*PermitRootLogin.*$/PermitRootLogin yes/g' /etc/ssh/sshd_config
sed -i 's/^#*Port .*$/Port 2223/' /etc/ssh/sshd_config
export passwd=Hello123 && printf "${passwd}\n${passwd}\n" | passwd root
# 运行sshd服务
cat >/usr/bin/run.sh <<EOF
#!/bin/bash
mkdir -p /run/sshd
source ~/.bashrc
/usr/sbin/sshd -D
EOF
chmod 777 /usr/bin/run.sh
nohup /usr/bin/run.sh &
# 挂载nfs
mkdir ~/cloud
mount -t nfs 192.168.1.100:/mnt/disk/mpi ~/cloud
# 编辑计算节点列表
cd ~/cloud
cat > hostfile <<EOF
192.168.1.101
192.168.1.102
EOF
4.设置免密登录(在每个计算节点的容器里执行)
ssh-keygen -t rsa
ssh-copy-id -i ~/.ssh/id_rsa.pub -p 2223 root@192.168.1.101
ssh-copy-id -i ~/.ssh/id_rsa.pub -p 2223 root@192.168.1.102
5.测试一:bash命令
A.运行
cd ~/cloud
mpirun --allow-run-as-root -mca plm_rsh_args "-p 2223" -np 2 -hostfile hostfile -pernode \
bash -c 'echo "$OMPI_COMM_WORLD_RANK of $OMPI_COMM_WORLD_SIZE on $(hostname) $(pwd)"'
B.输出
0 of 2 on NODE1 /root/cloud
1 of 2 on NODE2 /root/cloud
6.测试二:简单收发测试
A.代码
#include <mpi.h>
#include <stdio.h>

/*
 * Point-to-point reduction demo: every non-root rank sends its value to
 * rank 0, which accumulates the sum and reports the elapsed wall time.
 *
 * BUG FIX vs. the original: the MPI_Send was inside the
 * `for (i = 1; i < size; ++i)` loop, so each non-root rank posted
 * (size - 1) sends while rank 0 received only ONE message per rank.
 * For size > 2 the extra sends were never matched (deadlock / stranded
 * messages). Each rank must send exactly once, tagged with its own rank
 * so it matches rank 0's MPI_Recv(source = i, tag = i).
 */
int main(int argc, char *argv[])
{
    int size, myid;
    double start, end;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int value = myid + 2;
    printf("current_rank:%d ranks:%d value:%d\n", myid, size, value);

    MPI_Barrier(MPI_COMM_WORLD);
    start = MPI_Wtime();

    if (myid == 0) {
        /* Rank 0 gathers one value from every other rank and accumulates. */
        for (int i = 1; i < size; ++i) {
            int temp;
            MPI_Recv(&temp, 1, MPI_INT, i, i, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
            value += temp;
        }
    } else {
        /* Every non-root rank sends its value to rank 0 exactly once. */
        MPI_Send(&value, 1, MPI_INT, 0, myid, MPI_COMM_WORLD);
    }

    MPI_Barrier(MPI_COMM_WORLD); /* wait for all ranks to finish */
    end = MPI_Wtime();

    if (myid == 0) {
        /* value now holds the sum of all ranks' values */
        printf("final result:%d time:%0.6f\n", value, end - start);
    }
    MPI_Finalize();
    return 0;
}
B.编译运行
cd ~/cloud
mpic++ mpi_demo.cc -o mpi_demo
mpirun --allow-run-as-root -mca plm_rsh_args "-p 2223" -np 2 -hostfile hostfile ./mpi_demo
C.输出
current_rank:0 ranks:2 value:2
current_rank:1 ranks:2 value:3
final result:5 time:0.000005
7.测试三.MPI_Allreduce
A.代码:
#include <mpi.h>
#include <stdio.h>

/*
 * MPI_Allreduce demo: every rank contributes a 32-element vector; the
 * element-wise sum across all ranks is placed in recv_data_array on
 * EVERY rank. Rank 0 prints the reduced vector and the elapsed time.
 *
 * FIXES vs. the original:
 *  - `count` is now const: a non-const `count` made the arrays VLAs,
 *    and a VLA may not take an initializer ("variable-sized object may
 *    not be initialized" under GCC/G++).
 *  - The elapsed time was measured (start/end) but never printed; it is
 *    now reported by rank 0.
 */
int main(int argc, char *argv[])
{
    int size, myid;
    double start, end;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int count = 32;          /* fixed vector length */
    int value = myid + 2;          /* per-rank base value */
    int data_array[count] = {0};
    for (int i = 0; i < count; i++) {
        data_array[i] = value + i;
    }
    printf("current_rank:%d ranks:%d value:%d\n", myid, size, value);

    MPI_Barrier(MPI_COMM_WORLD);
    int recv_data_array[count] = {0};
    start = MPI_Wtime();
    /* Element-wise sum of data_array over all ranks; every rank ends up
     * with the identical result in recv_data_array. */
    MPI_Allreduce(data_array, recv_data_array, count, MPI_INT, MPI_SUM,
                  MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    end = MPI_Wtime();

    if (myid == 0) {
        printf("rank0\n");
        for (int i = 1; i <= count; i++) {
            printf("%08d ", recv_data_array[i - 1]);
            if (i % 16 == 0) {
                printf("\n");
            }
        }
        printf("time:%0.6f\n", end - start);
    }
    MPI_Finalize();
    return 0;
}
B.编译运行
mpic++ mpi_all_reduce.cc -o mpi_all_reduce
mpirun --allow-run-as-root -mca plm_rsh_args "-p 2223" -np 2 -hostfile hostfile ./mpi_all_reduce
C.输出
current_rank:0 ranks:2 value:2
rank0
00000005 00000007 00000009 00000011 00000013 00000015 00000017 00000019
00000021 00000023 00000025 00000027 00000029 00000031 00000033 00000035
00000037 00000039 00000041 00000043 00000045 00000047 00000049 00000051
00000053 00000055 00000057 00000059 00000061 00000063 00000065 00000067
current_rank:1 ranks:2 value:3
8.参考文档:
- MPI API文档
- 双机搭建MPI
- 多机多卡运行nccl-tests和channel获取