lammps编译(2Aug2023、intel2020、rtx4070ti)

说明:

[root@node101 ~]# cat /etc/redhat-release
CentOS Linux release 7.9.2009 (Core)
[root@node101 ~]# gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.5/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto --enable-plugin --enable-initfini-array --disable-libgcj --with-isl=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/isl-install --with-cloog=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/cloog-install --enable-gnu-indirect-function --with-tune=generic --with-arch_32=x86-64 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.5 20150623 (Red Hat 4.8.5-44) (GCC)
[root@node101 ~]# which mpirun
/opt/gpuApp/ompi/bin/mpirun
[root@node101 ~]# which icc
/opt/intel/compilers_and_libraries_2020.1.211/linux/bin/intel64/icc
[root@node101 ~]# which nvcc
/usr/local/cuda-12.3/bin/nvcc
[root@node101 ~]# lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                32
On-line CPU(s) list:   0-31
Thread(s) per core:    2
Core(s) per socket:    16
Socket(s):             1
NUMA node(s):          1
Vendor ID:             AuthenticAMD
CPU family:            23
Model:                 49
Model name:            AMD EPYC 7302 16-Core Processor
Stepping:              0
CPU MHz:               1500.000
CPU max MHz:           3000.0000
CPU min MHz:           1500.0000
BogoMIPS:              6000.34
Virtualization:        AMD-V
L1d cache:             32K
L1i cache:             32K
L2 cache:              512K
L3 cache:              16384K
NUMA node0 CPU(s):     0-31
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc art rep_good nopl nonstop_tsc extd_apicid aperfmperf eagerfpu pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_l2 cpb cat_l3 cdp_l3 hw_pstate sme retpoline_amd ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip overflow_recov succor smca
[root@node101 ~]# free -g
              total        used        free      shared  buff/cache   available
Mem:            251           5         227           0          18         244
Swap:           127           0         127
[root@node101 ~]#
 

lammps的GPU加速既支持单精度,也支持双精度。由于4070Ti的双精度性能很差,故本次采用单精度方式编译和运行。

显卡的SM值可以通过cuda自带的工具查询:

[root@node101 tools]# ls /usr/local/cuda/samples/1_Utilities/deviceQuery

deviceQuery deviceQuery.cpp deviceQuery.o Makefile NsightEclipse.xml readme.txt

[root@node101 tools]# cd /usr/local/cuda/samples/1_Utilities/deviceQuery

[root@node101 deviceQuery]# ./deviceQuery

1、环境文件

cat << EOF > ~/lammps-gpu-env.sh

#!/bin/bash

source /opt/intel/compilers_and_libraries_2020/linux/bin/compilervars.sh intel64

export PATH=/usr/local/cuda-12.3/bin:\$PATH

export LD_LIBRARY_PATH=/usr/local/cuda-12.3/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH

export C_INCLUDE_PATH=/usr/local/cuda-12.3/targets/x86_64-linux/include:\$C_INCLUDE_PATH

EOF

2、gdrcopy

cd gpu-lammps/

tar -zxvf gdrcopy-2.0.tar.gz

cd gdrcopy-2.0/

mkdir -p /opt/gpuApp/gdrcopy/include

mkdir -p /opt/gpuApp/gdrcopy/lib64

make PREFIX=/opt/gpuApp/gdrcopy lib lib_install

cat << EOF >> ~/lammps-gpu-env.sh

export PATH=/opt/gpuApp/gdrcopy/include:\$PATH

export CPATH=/opt/gpuApp/gdrcopy/include:\$CPATH

export LD_LIBRARY_PATH=/opt/gpuApp/gdrcopy/lib64:\$LD_LIBRARY_PATH

EOF

3、ucx

cd ~/gpu-lammps/

tar -zxvf ucx-1.7.0.tar.gz

cd ucx-1.7.0/

./configure --prefix=/opt/gpuApp/ucx --enable-optimizations --disable-logging --disable-debug --disable-assertions --disable-params-check --disable-doxygen-doc --with-cuda=/usr/local/cuda --with-gdrcopy=/opt/gpuApp/gdrcopy/ --with-verbs --with-rdmacm

……………………………….

configure: =========================================================

configure: UCX build configuration:

configure: Preprocessor flags:   -DCPU_FLAGS="|avx" -I${abs_top_srcdir}/src -I${abs_top_builddir} -I${abs_top_builddir}/src

configure:            C flags:   -O3 -g -Wall -Werror -mavx

configure:          C++ flags:   -O3 -g -Wall -Werror -mavx

configure:       Multi-thread:   Disabled

configure:          MPI tests:   Disabled

configure:      Devel headers:

configure:        UCT modules:   < cuda ib rdmacm cma >

configure:       CUDA modules:   < gdrcopy >

configure:       ROCM modules:   < >

configure:         IB modules:   < >

configure:        UCM modules:   < cuda >

configure:       Perf modules:   < cuda >

configure: =========================================================

…………..

cat << EOF >> ~/lammps-gpu-env.sh

export PATH=/opt/gpuApp/ucx/bin:\$PATH

export LD_LIBRARY_PATH=/opt/gpuApp/ucx/lib:\$LD_LIBRARY_PATH

EOF

4、openmpi

[root@node101 gpu-lammps]# cd ~/gpu-lammps/

[root@node101 gpu-lammps]# tar -xvf openmpi-4.1.6.tar

[root@node101 gpu-lammps]# cd openmpi-4.1.6/

[root@node101 openmpi-4.1.6]# ./configure --prefix=/opt/gpuApp/ompi --enable-mpirun-prefix-by-default --enable-cuda --enable-dlopen --enable-weak-symbols --enable-heterogeneous --enable-binaries --enable-script-wrapper-compilers --enable-orterun-prefix-by-default --enable-mca-no-build=btl-uct --with-cuda --with-pmix --with-verbs --with-ucx=/opt/gpuApp/ucx

…………

Open MPI configuration:

-----------------------

Version: 4.1.6

Build MPI C bindings: yes

Build MPI C++ bindings (deprecated): no

Build MPI Fortran bindings: mpif.h, use mpi

MPI Build Java bindings (experimental): no

Build Open SHMEM support: yes

Debug build: no

Platform file: (none)

Miscellaneous

-----------------------

CUDA support: yes

HWLOC support: internal

Libevent support: internal

Open UCC: no

PMIx support: Internal

Transports

-----------------------

Cisco usNIC: no

Cray uGNI (Gemini/Aries): no

Intel Omnipath (PSM2): no

Intel TrueScale (PSM): no

Mellanox MXM: no

Open UCX: yes

OpenFabrics OFI Libfabric: no

OpenFabrics Verbs: yes

Portals4: no

Shared memory/copy in+copy out: yes

Shared memory/Linux CMA: yes

Shared memory/Linux KNEM: no

Shared memory/XPMEM: no

TCP: yes

Resource Managers

-----------------------

Cray Alps: no

Grid Engine: no

LSF: no

Moab: no

Slurm: yes

ssh/rsh: yes

Torque: no

OMPIO File Systems

-----------------------

DDN Infinite Memory Engine: no

Generic Unix FS: yes

IBM Spectrum Scale/GPFS: no

Lustre: no

PVFS2/OrangeFS: no

[root@node101 openmpi-4.1.6]# make -j 32

[root@node101 openmpi-4.1.6]# make install

[root@node101 openmpi-4.1.6]# cat << EOF >> ~/lammps-gpu-env.sh

export PATH=/opt/gpuApp/ompi/bin:\$PATH

export LD_LIBRARY_PATH=/opt/gpuApp/ompi/lib:\$LD_LIBRARY_PATH

export INCLUDE=/opt/gpuApp/ompi/include:\$INCLUDE

EOF

[root@node101 openmpi-4.1.6]#

5、lammps-cpu

[root@node101 gpu-lammps]# tar -zxvf lammps-2Aug2023.tar.gz

[root@node101 gpu-lammps]# cd lammps-2Aug2023/src

[root@node101 src]# source ~/lammps-gpu-env.sh

[root@node101 src]# make yes-all

[root@node101 src]# make no-lib

[root@node101 src]# cp MAKE/OPTIONS/Makefile.intel_cpu_openmpi MAKE/Makefile.intel

[root@node101 src]# make -j 32 intel

[root@node101 src]# cp lmp_intel lmp_intel_cpu

6、lammps-gpu

[root@node101 gpu-lammps]# cd lammps-2Aug2023/lib/gpu/

[root@node101 gpu]# source ~/lammps-gpu-env.sh

[root@node101 gpu]# vi Makefile.linux                         ##修改CUDA_ARCH中的SM值和CUDA_PRECISION(见文末注释[强撞1])

[root@node101 gpu]# make -f Makefile.linux               ##编译GPU库

[root@node101 gpu]# ./nvc_get_devices

Found 1 platform(s).

CUDA Driver Version:                           12.30

Device 0: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

Device 1: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

Device 2: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

Device 3: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

[root@node101 gpu]# cd ../../src

[root@node101 src]# make package-status

[root@node101 src]# make yes-gpu

[root@node101 src]# make no-amoeba

[root@node101 src]# make clean-all

[root@node101 src]# make clean-machine

[root@node101 src]# make clean-intel

[root@node101 src]# make -j 32 intel

[root@node101 src]# cp lmp_intel lmp_intel_gpu

7、测试(以下命令假定已将 ~/lammps-gpu-env.sh 拷贝到 /opt/gpuApp/,并将 src 目录下编译得到的可执行文件拷贝到 /opt/gpuApp/lammps/)

7.1 cpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 12 /opt/gpuApp/lammps/lmp_intel_cpu -in in.NHO

7.2 4core_1gpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 4 /opt/gpuApp/lammps/lmp_intel_gpu -sf gpu -pk gpu 1 -in in.NHO

GPU状态:

7.3 16core_1gpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 16 /opt/gpuApp/lammps/lmp_intel_gpu -sf gpu -pk gpu 1 -in in.NHO

GPU状态:

7.4 16core_4gpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 16 /opt/gpuApp/lammps/lmp_intel_gpu -sf gpu -pk gpu 4 -in in.NHO

GPU状态:


[强撞1]4070Ti为Ada Lovelace架构(并非安培架构),计算能力为8.9(与上文nvc_get_devices输出的Compute capability一致),SM应设为89。其双精度性能差,故CUDA_PRECISION设为-D_SINGLE_SINGLE

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/218167.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

Vmd+lstm代码详解 完整代码数据可直接运行

项目视频讲解:Vmd+lstm时间序列预测分类回归预测代码详解 完整代码可直接运行_哔哩哔哩_bilibili 项目演示效果: 代码详解: # -*- coding: utf-8 -*- # 导入库pip install openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple import pandas as pd import numpy as np fr…

QVTK 可视化

#ifndef MAINWINDOW_H #define MAINWINDOW_H#include <QMainWindow>#include <vtkNew.h> // 智能指针 #include <QVTKOpenGLNativeWidget.h> #include <vtkCylinderSource.h> // 圆柱#include <vtkPolyDataMapper.h&g…

OpenHarmony应用开发——在标准OpenHarmony上运行应用-标准OpenHarmony工程设置

一、前言 前面我们创建了一个工程并使其在HarmonyOS系统上运行,本文我们来阐述一下如何在标准的OpenHarmony开发板或系统上运行。 二、详细步骤 1.下载并配置OpenHarmony SDK 首先,打开Settings. 将SDK选择为OpenHarmony,第一次选择路径应该…

java SSM教师工作量管理系统myeclipse开发mysql数据库springMVC模式java编程计算机网页设计

一、源码特点 java SSM 教师工作量管理系统是一套完善的web设计系统(系统采用SSM框架进行设计开发,spring+springMVC+mybatis),对理解JSP java编程开发语言有帮助,系统具有完整的源代码和数据库,系统主要…

2023年12月16日(星期六)骑行樱花谷

2023年12月16日 (星期六) 骑行樱花谷(赏冬樱花),早8:30到9:00, 郊野公园西门集合,9:30准时出发 【因迟到者,骑行速度快者,可自行追赶偶遇。】 偶遇地点:郊野公园西门集合 ,家住东,南…

vue实现自动打字效果(带光标效果)

代码介绍(其实就是通过字符串截取加定时拼接完成的,我相信有时间都能琢磨出来,来这里就是为了省事) 上vue页面代码: <template><div id="App"><h2>{{text}}<span ref="fou" class="fousdis">{{_}}</span></h2></div>…

【数学建模】《实战数学建模:例题与讲解》第十二讲-因子分析、判别分析(含Matlab代码)

【数学建模】《实战数学建模:例题与讲解》第十二讲-因子分析、判别分析(含Matlab代码) 基本概念时间判别费歇判别贝叶斯判别 习题10.31. 题目要求2.解题过程3.程序4.结果 习题10.6(1)1. 题目要求2.解题过程——对应分析…

任意文件读取漏洞

使用方法php://filter/read=convert.base64-encode/resource=xxx 任意文件读取漏洞 php://filter/read=convert.base64-encode/resource=flag 在url后边接上 以base64的编码形式 读取flag里面的内容 php://filter/read=convert.base64-encode/resource=flag 用kali来解码 创建一个文…

使用Python实现单链表

目录 一、引言 二、节点的定义 三、链表的创建 四、插入节点 五、删除节点 六、遍历链表 七、节点的查找 八、总结 一、引言 单链表是一种常用的数据结构,它由一系列节点组成,每个节点包含一个数据元素和指向下一个节点的指针。单链表可以用来存…

Qt 中文处理

windows下 Qt显示中文的几种方式: 1, 环境:Qt 5.15.2 vs2019 64位 win11系统 默认用Qt 创建的文件使用utf-8编码格式,此环境下 中文没有问题 ui->textEdit->append("中文测试"); 2, 某些 低于…

【MySQL备份】MySQL备份工具-MyDumper

目录 什么是MyDumper MyDumper优势有哪些 如何安装MyDumper 参数解释 1 mydumper参数解释 备份流程 一致性快照如何工作? 如何排除(或包含)数据库? 输出文件 Metadata文件 编辑 表数据 文件 表结构 文件 建库文件…

【Unity学习笔记】光照简介

本节主要是简单介绍一些常见的光照组件和渲染设置。 文章目录 灯光类型平行光Directional Light点光源Point Light聚光灯Spot Light面积光 Area Light 阴影设置全局光照明光照模式直接光照与间接光照Mixed Lighting 光照探针Light Probe Group光照探针组 反射探针 灯光类型 在…

工具应用:Robot Framework->对协议级接口进行测试

实验简介 本节实验主要为大家讲解如何利用Robot Framework结合常用的关键字完成对Agileone系统中的“需求提案”模块进行协议级接口的自动化测试脚本开发。 实验目的 (1) 掌握RF的Requests库的常用关键字及用法。 (2) 能够熟练…

一文速览字节最新分布式操作系统KubeWharf

一文速览字节最新分布式操作系统KubeWharf KubeWharf 是字节跳动基础架构团队在对 Kubernetes 进行了大规模应用和不断优化增强之后的技术结晶。 这是一套以 Kubernetes 为基础构建的分布式操作系统,由一组云原生组件构成,专注于提高系统的可扩展性、功…

怪兽吃糖果

欢迎来到程序小院 怪兽吃糖果 玩法:左右飞出的糖果,点击鼠标糖果即为怪兽吃掉,不同的糖果不同的分数奖励, 吃不掉的糖果会扣除一次生命,共三次生命值,点击炸弹游戏结束,快去吃糖果吧^^开始游戏…

基于ssm大学生创新创业平台项目管理子系统设计与实现论文

摘 要 互联网发展至今,无论是其理论还是技术都已经成熟,而且它广泛参与在社会中的方方面面。它让信息都可以通过网络传播,搭配信息管理工具可以很好地为人们提供服务。针对大学生创新创业项目信息管理混乱,出错率高,信…

Redis持久化AOF详解

基础面试题 什么是AOF AOF(Append-Only File)用于将Redis服务器收到的写操作追加到日志文件,通过该机制可以保证服务器重启后依然可以依靠日志文件恢复数据。 它的工作过程大抵分为以下几步: 收到客户端的写入命令(例如SET、DE…

安全算法(一):安全技术、加密的基础知识、哈希函数的简单介绍

安全算法(一):安全技术、加密的基础知识、哈希函数的简单介绍 通过互联网交换数据时,数据要经过各种各样的网络和设备才能传到对方那里。数据在传输过程中有可能会经过某些恶意用户的设备,从而导致内容被盗取。 因此…

外汇天眼:新手做外汇交易需要注意什么?

外汇投资是一个充满机会和挑战的市场,对于新手来说,了解一些必要的知识和技巧是非常重要的。 以下是一些新手投资外汇必须注意以下几点: 1.了解外汇市场的基本知识 在进入外汇市场之前,了解一些基本知识是必要的。 这包括外汇市…

亚马逊鲲鹏系统:防关联技术守护您的账户安全

亚马逊买家账号注册是一项相当简便的操作,但当涉及到批量注册时,我们就需要更加注意防关联的问题。对于那些对此领域不够熟悉的朋友们,可以使用亚马逊鲲鹏系统,这款系统能够为我们提供一站式的解决方案。该系统不仅支持买家账号的…