ceph
ceph osd lspools
rbd ls -p testpool
# 查看 ceph 集群中有多少个 pool,并且每个 pool 容量及利用情况
rados df
ceph -s
ceph osd tree
ceph df
ceph versions
ceph osd pool ls
ceph osd crush rule dump
ceph auth print-key client.admin
ceph orch host ls
ceph crash ls
ceph osd pool stats
ceph df detail
ceph osd stat
ceph mon stat
查看image rbd
rbd ls -p kube
ceph osd df
ceph osd pool autoscale-status
ceph:
10.240.62.11/12/13
root:autelceph2 用户名:autel
密码:Autonomy@Autel13 Autel#3913
[root@ceph-admin ~]# ceph mgr services
{"dashboard": "https://10.250.53.152:8443/","prometheus": "http://10.250.53.152:9283/"
}
kubectl logs -f qinzhao-cache-resunet-demo-pipeline-wbkkh-2890309351 -n qinzhao -c lustre-importer-preload
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"
apps/jupyter/jupyter-web-app/upstream/base/configs/spawner_ui_config.yaml
kustomize build apps/jupyter/jupyter-web-app/upstream/overlays/istio | kubectl apply -f -
kustomize build apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow | kubectl apply -f -
kubectl get pods -n kubeflow -l kustomize.component=profiles
https://www.amazonaws.cn/ec2/instance-types/
kubectl taint node autel-poweredge-r750 nodetype=T4:NoExecute
kubectl taint node autelrobotics-gpu10 nodetype=RTX3090:NoExecute
autelrobotics-gpu10
kubectl taint node autel-poweredge-r750 nodetype:NoExecute-
kubectl taint node autelrobotics-gpu09 nodetype:NoSchedule-
kubectl taint node autelrobotics-gpu09 nodetype:NoExecute-
nodegroup=gpu:NoSchedule
kubectl taint nodes autelrobotics-gpu02 nodegroup=gpu:NoSchedule
kubectl label node autelrobotics-gpu02 gputype=A40
lsof -n -P -i:22
strace
kubectl get csinode
查看活跃进程个数
top -H -p 1
kubectl create secret tls ai-tls \
  --namespace ai-test \
  --key tls.key \
  --cert tls.pem
https://github.com/NVIDIA/nvidia-docker/issues/1678
nvidia-container-cli -k -d /dev/tty info
ls -l /dev/char
cat /etc/nvidia-container-runtime/config.toml
stat -fc %T /sys/fs/cgroup/
sar -n TCP,ETCP 1
fdisk -l
ldd
# 修改后,重新挂载生效
# mount -o remount /dev/shm
nstat
mpstat -P ALL 1
slabtop
pcstat
netstat -ant | awk '{print $6}' | sort | uniq -c | sort -n
dmesg -T
pmap -x 1649 | sort -k 3 -n -r
cat /proc/1649/smaps | grep 7f4250021000
dump memory memory.dump 0x7f2340539000 0x7f235d553000
strings memory.dump
pidstat -p pid -r 1 1000
sudo ./stackcount ip_output
dmesg -Tw
perf
NetHogs
iftop -i eth0 -P -N
./opensnoop -Tn snmp-pass
slabtop
nfsstat -c
du -ah --max-depth=2 /var/log |sort -rh |head -10
./fileslower
ulimit -a
解决显存释放问题:
fuser -v /dev/nvidia*
lsof -Pni
netstat -n | awk '/^tcp/ {++S[$NF]} END {for(a in S) print a, S[a]}'
NFS运维:
systemctl status rpcbind nfs-server
nfsiostat
dmesg | grep nfs
exportfs -v
mpstat -P ALL 1
ss -t -a |grep "IP"
nfsstat -c
iostat
iostat -d -x -k 1
netstat -an | grep "IP:2049"
dstat
ps aux | grep /app
https://learnku.com/articles/39851
https://zhuanlan.zhihu.com/p/614314627
fdisk -l
blkid
nfsiostat 1
sar -b 1
iostat -m -d /dev/md0 1
strace -p pid 查看进程当前调用栈,查死循环或者卡顿时极为有用
strace -eopen /usr/local/kk-mail/service/dovecot/sbin/dovecot 查看进程当前打开了哪些文件
cat /proc/715765/task/*/stack
/proc/12544/task/12873/stack
systemtap
cat /var/log/Xorg.0.log |grep -i "nvidia"