1、安装gcc
#安装编译环境
yum -y install make gcc gcc-c++
2、下载显卡驱动
点击 直达连接
nvidia高级搜索下载历史版本驱动程序(下载历史版本驱动)
https://www.nvidia.cn/Download/Find.aspx?lang=cn
3、安装驱动
安装显卡驱动
./NVIDIA-Linux-x86_64-535.98.run -m=kernel-open
4、修改系统参数,更新内核,重启服务器
rm -f /etc/modprobe.d/blacklist-nvidia-nouveau.conf /etc/modprobe.d/nvidia-unsupported-gpu.conf
echo blacklist nouveau | tee /etc/modprobe.d/blacklist-nvidia-nouveau.conf && \echo options nouveau modeset=0 | tee -a /etc/modprobe.d/blacklist-nvidia-nouveau.conf && \echo options nvidia NVreg_OpenRmEnableUnsupportedGpus=1 | tee /etc/modprobe.d/nvidia-unsupported-gpu.conf && \dracut --force && \/sbin/reboot
5、检查驱动
执行nvidia-smi
Wed Aug 16 13:46:06 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98 Driver Version: 535.98 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3090 Off | 00000000:13:00.0 Off | N/A |
| 32% 21C P8 8W / 350W | 4MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------++---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
6、安装nvidia-container-runtime
#安装源
curl -s -L https://nvidia.github.io/libnvidia-container/centos8/libnvidia-container.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
#安装容器运行时
yum install -y nvidia-container-runtime
7、修改containerd配置文件
7.1、增加如下配置
[plugins."io.containerd.runtime.v1.linux"]no_shim = falseruntime = "nvidia-container-runtime"runtime_root = ""shim = "containerd-shim"shim_debug = false
7.2、修改container配置
修改前:runtime_type = "io.containerd.runc.v2"
修改后:runtime_type = "io.containerd.runtime.v1.linux"
7.3、完整配置文件
[root@ai-4 containerd]# pwd
/etc/containerd
[root@ai-4 containerd]# cat config.toml
version = 2
root = "/var/lib/containerd"
state = "/run/containerd"
oom_score = 0[grpc]address = "/run/containerd/containerd.sock"uid = 0gid = 0max_recv_message_size = 16777216max_send_message_size = 16777216[debug]address = "/run/containerd/containerd-debug.sock"uid = 0gid = 0level = "warn"[timeouts]"io.containerd.timeout.shim.cleanup" = "5s""io.containerd.timeout.shim.load" = "5s""io.containerd.timeout.shim.shutdown" = "3s""io.containerd.timeout.task.state" = "2s"[plugins][plugins."io.containerd.grpc.v1.cri"]sandbox_image = "sealos.hub:5000/pause:3.2"max_container_log_line_size = -1max_concurrent_downloads = 20disable_apparmor = true[plugins."io.containerd.grpc.v1.cri".containerd]snapshotter = "overlayfs"default_runtime_name = "runc"[plugins."io.containerd.grpc.v1.cri".containerd.runtimes][plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]runtime_type = "io.containerd.runtime.v1.linux"runtime_engine = ""runtime_root = ""[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]SystemdCgroup = true[plugins."io.containerd.grpc.v1.cri".registry]config_path = "/etc/containerd/certs.d"[plugins."io.containerd.grpc.v1.cri".registry.configs][plugins."io.containerd.grpc.v1.cri".registry.configs."sealos.hub:5000".auth]username = "admin"password = "***********"[plugins."io.containerd.runtime.v1.linux"]no_shim = falseruntime = "nvidia-container-runtime"runtime_root = ""shim = "containerd-shim"shim_debug = false
8、测试containerd下显卡是否正常加载显卡
[root@ai-4 containerd]# ctr run --rm --gpus 0 docker.io/nvidia/cuda:11.0.3-base-ubuntu20.04 nvidia-smi nvidia-smi
Wed Aug 16 05:57:19 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98 Driver Version: 535.98 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3090 Off | 00000000:13:00.0 Off | N/A |
| 32% 21C P8 8W / 350W | 4MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------++---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
9、K8S部署插件支持显卡(如果没有部署可通过如下命令部署,K8S Master上执行)
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.7.1/nvidia-device-plugin.yml
10、K8S检查对应节点是否有GPU资源
[root@k8s-master-17227100216 ~]# kubectl describe node node9 |grep gpugpu/type=nvidianvidia.com/gpu: 1nvidia.com/gpu: 1nvidia.com/gpu 0 0
11、部署GPU测试容器
apiVersion: v1
kind: Pod
metadata:name: cuda-vector-add
spec:restartPolicy: OnFailurecontainers:- name: cuda-vector-add#image: "k8s.gcr.io/cuda-vector-add:v0.1"image: "docker.io/nvidia/cuda:11.0.3-base-ubuntu20.04"command:- nvidia-smiresources:limits:nvidia.com/gpu: 1