记录一个编译的LLVM 含clang 和 PTX 来支持 HIPIFY 的构建配置

llvm 18 debug 版本

build llvmorg-18.1rc4 debug

$ cd llvm-project

$ git checkout llvmorg-18.1.0-rc4

$ mkdir build_d

$ cd build_d

$ mkdir -p ../../local_d

cmake \
-DCMAKE_INSTALL_PREFIX=../../local_d \
-DLLVM_SOURCE_DIR=../llvm \
-DLLVM_ENABLE_PROJECTS="bolt;clang;clang-tools-extra;lld;mlir"  \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX"  \
-DLLVM_INCLUDE_TESTS=OFF \
-DCMAKE_BUILD_TYPE=Debug \
../llvm

其余部分拆出来了:

cross-project-tests;libclc;lldb;polly;flang

-DLLVM_ENABLE_RUNTIMES="libunwind;libcxxabi;pstl;libcxx;openmp"      \
libc;compiler-rt;

$ make -j34

$make install

llvm 18 release版本

cd llvm-project

mkdir build_r

cd build_r

mkdir -p ../../local_r

cmake \
-DCMAKE_INSTALL_PREFIX=../../local_r \
-DLLVM_SOURCE_DIR=../llvm \
-DLLVM_ENABLE_PROJECTS="bolt;clang;clang-tools-extra;lld;mlir"  \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX"  \
-DLLVM_INCLUDE_TESTS=OFF \
-DCMAKE_BUILD_TYPE=Release \
../llvm

$ make -j34

效果:

$make install

build HIPIFY debug

$ mkdir /home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify

cmake  \
-DCMAKE_INSTALL_PREFIX=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify  \
-DCMAKE_BUILD_TYPE=Debug  \
-DCMAKE_PREFIX_PATH=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d  \
..

还有一种更多配置的编译配置方法,其实用不到:

cmake-DHIPIFY_CLANG_TESTS=ON \-DCMAKE_BUILD_TYPE=Release \-DCMAKE_INSTALL_PREFIX=../dist \-DCMAKE_PREFIX_PATH=/usr/llvm/17.0.6/dist \-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.3.2 \-DCUDA_DNN_ROOT_DIR=/usr/local/cudnn-8.9.7 \-DCUDA_CUB_ROOT_DIR=/usr/local/cub-2.1.0 \-DLLVM_EXTERNAL_LIT=/usr/llvm/17.0.6/build/bin/llvm-lit \..

using hipify-clang
 

hipify-clang intro.cu --cuda-path="/usr/local/cuda-12.3" --print-stats-csv

$ /home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify/bin/hipify-clang vectorAdd.cu --cuda-path="/usr/local/cuda-12.3" --clang-resource-directory="/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/lib/clang/18"

写成Makefile:

EXE := vectorAdd_hipall: $(EXE)$(EXE): vectorAdd.cu.hiphipcc $< -o $@%.hip: %/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify/bin/hipify-clang $< --cuda-path=/usr/local/cuda-12.3 --clang-resource-directory=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/lib/clang/18.PHONY: clean
clean:${RM} $(EXE) *.hip

效果:

源cu代码:

#include <stdio.h>#include <cuda_runtime.h>__global__ void vectorAdd(const float *A, const float *B, float *C,int numElements) {int i = blockDim.x * blockIdx.x + threadIdx.x;if (i < numElements) {C[i] = A[i] + B[i] + 0.0f;}if(i==7)printf("Hello kernel threadID=%d\n", i);
}int main(void)
{cudaError_t err = cudaSuccess;int numElements = 50000;size_t size = numElements * sizeof(float);printf("[Vector addition of %d elements]\n", numElements);float *h_A = (float *)malloc(size);float *h_B = (float *)malloc(size);float *h_C = (float *)malloc(size);if (h_A == NULL || h_B == NULL || h_C == NULL) {fprintf(stderr, "Failed to allocate host vectors!\n");exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {h_A[i] = rand() / (float)RAND_MAX;h_B[i] = rand() / (float)RAND_MAX;}float *d_A = NULL;err = cudaMalloc((void **)&d_A, size);if (err != cudaSuccess) {fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}float *d_B = NULL;err = cudaMalloc((void **)&d_B, size);if (err != cudaSuccess) {fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}float *d_C = NULL;err = cudaMalloc((void **)&d_C, size);if (err != cudaSuccess) {fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy input data from the host memory to the CUDA device\n");err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);if (err != cudaSuccess) {fprintf(stderr,"Failed to copy vector A from host to device (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);if (err != cudaSuccess) {fprintf(stderr,"Failed to copy vector B from host to device (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}int threadsPerBlock = 256;int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);err = cudaGetLastError();if (err != cudaSuccess) {fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy output data from the CUDA device to the host memory\n");err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);if (err != cudaSuccess) {fprintf(stderr,"Failed to copy vector C from device to host (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {fprintf(stderr, "Result verification failed at element %d!\n", i);exit(EXIT_FAILURE);}}printf("Test PASSED\n");err = cudaFree(d_A);if (err != cudaSuccess) {fprintf(stderr, "Failed to free device vector A (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}err = cudaFree(d_B);if (err != cudaSuccess) {fprintf(stderr, "Failed to free device vector B (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}err = cudaFree(d_C);if (err != cudaSuccess) {fprintf(stderr, "Failed to free device vector C (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}free(h_A);free(h_B);free(h_C);printf("Done\n");return 0;
}

生成的vectorAdd.cu.hip代码:

#include <stdio.h>#include <hip/hip_runtime.h>__global__ void vectorAdd(const float *A, const float *B, float *C,int numElements) {int i = blockDim.x * blockIdx.x + threadIdx.x;if (i < numElements) {C[i] = A[i] + B[i] + 0.0f;}if(i==7)printf("Hello kernel threadID=%d\n", i);
}int main(void)
{hipError_t err = hipSuccess;int numElements = 50000;size_t size = numElements * sizeof(float);printf("[Vector addition of %d elements]\n", numElements);float *h_A = (float *)malloc(size);float *h_B = (float *)malloc(size);float *h_C = (float *)malloc(size);if (h_A == NULL || h_B == NULL || h_C == NULL) {fprintf(stderr, "Failed to allocate host vectors!\n");exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {h_A[i] = rand() / (float)RAND_MAX;h_B[i] = rand() / (float)RAND_MAX;}float *d_A = NULL;err = hipMalloc((void **)&d_A, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_B = NULL;err = hipMalloc((void **)&d_B, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_C = NULL;err = hipMalloc((void **)&d_C, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy input data from the host memory to the CUDA device\n");err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector A from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector B from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}int threadsPerBlock = 256;int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);err = hipGetLastError();if (err != hipSuccess) {fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy output data from the CUDA device to the host memory\n");err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector C from device to host (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {fprintf(stderr, "Result verification failed at element %d!\n", i);exit(EXIT_FAILURE);}}printf("Test PASSED\n");err = hipFree(d_A);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_B);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_C);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}free(h_A);free(h_B);free(h_C);printf("Done\n");return 0;
}

其他参考选项示例:

指示头文件文件夹

./hipify-clang square.cu --cuda-path=/usr/local/cuda-12.3 -I /usr/local/cuda-12.3/samples/common/inc

指示C++标准

./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-12.3 -- -std=c++17

多个 .cu 文件一起编译

./hipify-clang cpp17.cu ../../square.cu /home/user/cuda/intro.cu --cuda-path=/usr/local/cuda-12.3 -- -std=c++17

统计修改的信息

$ /home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify/bin/hipify-clang vectorAdd.cu --cuda-path=/usr/local/cuda-12.3 --clang-resource-directory=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/lib/clang/18 --print-stats

将 统计信息存入 .csv文件中

 --print-stats
改成 --print-stats-csv

遗留问题

llvmorg-18.1.rc release 配置有问题:

cmake \
-DCMAKE_INSTALL_PREFIX=../../local \
-DLLVM_SOURCE_DIR=../llvm \
-DLLVM_ENABLE_PROJECTS="bolt;clang;clang-tools-extra;cross-project-tests;libclc;lld;mlir;polly;flang"  \
-DLLVM_ENABLE_RUNTIMES="libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp"      \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX"  \
-DLLVM_INCLUDE_TESTS=OFF \
-DCMAKE_BUILD_TYPE=Release \
../llvm

lldb;

貌似拿掉 libc 就能行

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/739385.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

平台工程:构建企业数字化转型的基石

有人说&#xff0c;平台工程&#xff08;Platform Engineering&#xff09;&#xff0c;不过是新瓶装旧酒&#xff08;DevOps&#xff09;。 而Gartner 将平台工程列为 2024 顶级战略技术趋势之一。我国信通院也开始陆续制定与平台工程相关的技术标准。 随着数字化浪潮的席卷…

【数学】【网格】【状态压缩】782 变为棋盘

作者推荐 视频算法专题 本文涉及知识点 数学 网格 状态压缩 LeetCode:782 变为棋盘 一个 n x n 的二维网络 board 仅由 0 和 1 组成 。每次移动&#xff0c;你能任意交换两列或是两行的位置。 返回 将这个矩阵变为 “棋盘” 所需的最小移动次数 。如果不存在可行的变换&am…

day08_Mybatis

文章目录 前言一、快速入门1.1 入门程序分析1.2 入门程序实现1.2.1 准备工作1.2.1.1 创建springboot工程1.2.1.2 数据准备 1.2.2 配置Mybatis1.2.3 编写SQL语句1.2.4 单元测试1.3 解决SQL警告与提示 二、JDBC介绍2.1 介绍2.2 代码2.3 问题分析2.4 技术对比 三、数据库连接池3.1…

c语言经典测试题12

1.题1 float f[10]; // 假设这里有对f进行初始化的代码 for(int i 0; i < 10;) { if(f[i] 0) break; } 上述代码有那些缺陷&#xff08;&#xff09; A: for(int i 0; i < 10;)这一行写错了 B: f是float型数据直接做相等判断有风险 C: f[i]应该是f[i] D: 没有缺…

【YOLOv8模型网络结构图理解】

YOLOv8模型网络结构图理解 1 YOLOv8的yaml配置文件2 YOLOv8网络结构2.1 Conv2.2 C3与C2f2.3 SPPF2.4 Upsample2.5 Detect层 1 YOLOv8的yaml配置文件 YOLOv8的配置文件定义了模型的关键参数和结构&#xff0c;包括类别数、模型尺寸、骨干&#xff08;backbone&#xff09;和头部…

谷歌seo外链重要还是内容重要?

想做网站&#xff0c;内容跟外链缺一不可&#xff0c;如果真的要说哪个更重要&#xff0c;那内容依旧是网站的核心&#xff0c;而外链则是额外的加分项 内容永远是王道&#xff0c;不管谷歌seo的算法怎么变&#xff0c;只要你的内容没问题&#xff0c;那就肯定不会牵扯到你的网…

Oracle 配置多个缓冲池(Keep pool Recycle Pool)

默认情况下&#xff0c;Oracle只有一个缓冲池 - Buffer Cache&#xff0c;其可以满足基本数据缓存需求。但某些数据的访问模式可能与普通数据不同&#xff0c;对于访问非常频繁的数据和很少访问的数据&#xff08;两种极端&#xff09;&#xff0c;Oracle可以支持配置两个独立的…

“312血洗四周年”!比特币冲破7.2万创新高!手持华尔街资金与减半叙事,跃升为全球第8大资产!

在过去一周时间里&#xff0c;比特币三次突破了2021年11月的历史高点。周一&#xff0c;加密市场延续涨势&#xff0c;比特币涨至72000美元以上&#xff0c;盘中一度触及72800美元&#xff0c;以太坊攀升至4000美元以上。 随着比特币再次创下新纪录&#xff0c;其市值已突破1.4…

python基础及网络爬虫

网络爬虫(Web crawler)&#xff0c;有时候也叫网络蜘蛛(Web spider)&#xff0c;是指这样一类程序——它们可以自动连接到互联网站点&#xff0c;并读取网页中的内容或者存放在网络上的各种信息&#xff0c;并按照某种策略对目标信息进行采集&#xff08;如对某个网站的全部页面…

记录一下C++的学习之旅吧--C++基础

文章目录 前言using namespace std; 使用标准命名空间一、helloworld-输出表示1.1代码1.2 运行结果 二、变量2.1.1 普通变量代码2.1.2 运行结果2.2.1 常量和变量代码2.2.2 运行结果 三、sizeof---统计数据类型所占的内存大小3.1 代码3.2 运行结果 四、小数表示4.2 运行结果 五、…

基于React低代码平台开发:直击最新高效应用构建

&#x1f3e1;浩泽学编程&#xff1a;个人主页 &#x1f525; 推荐专栏&#xff1a;《深入浅出SpringBoot》《java对AI的调用开发》 《RabbitMQ》《Spring》《SpringMVC》《项目实战》 &#x1f6f8;学无止境&#xff0c;不骄不躁&#xff0c;知行合一 文章目录…

【JavaWeb】Tomacat部署Web项目

Hi i,m JinXiang ⭐ 前言 ⭐ 本篇文章主要介绍【JavaWeb】Tomacat部署Web项目的详细使用以及部分理论知识 &#x1f349;欢迎点赞 &#x1f44d; 收藏 ⭐留言评论 &#x1f4dd;私信必回哟&#x1f601; &#x1f349;博主收将持续更新学习记录获&#xff0c;友友们有任何问题…

《为什么学生不喜欢上学?》读书笔记

书简介 美国弗吉尼亚大学心理学教授威林厄姆的教育心理学著作。 作者在文末揭示了撰写此书的目的&#xff1a; 【 教育是将世代积累的智慧传递给孩子&#xff0c;我们强烈地相信它的重要性&#xff0c;因为我们知道&#xff0c;它为每个孩子以及其他所有人都带来了更好生活的希…

./ 相对路径与node程序的启动目录有关

node:internal/fs/sync:78 return binding.openSync( ^ Error: ENOENT: no such file or directory, open D:\前端的学习之路\项目\codeHub\keys\private_key.pem at Object.open (node:internal/fs/sync:78:18) at Object.openSync (node:fs:565:…

【网站项目】014乡镇自来水收费系统

&#x1f64a;作者简介&#xff1a;拥有多年开发工作经验&#xff0c;分享技术代码帮助学生学习&#xff0c;独立完成自己的项目或者毕业设计。 代码可以私聊博主获取。&#x1f339;赠送计算机毕业设计600个选题excel文件&#xff0c;帮助大学选题。赠送开题报告模板&#xff…

JavaScript---lazyload图片懒加载处理_IntersectionObserver

IntersectionObserver API兼容性&#xff1a; 部分代码展示&#xff1a; // 懒加载处理const imgsElem document.querySelectorAll(img);const topElem document.querySelector(#top);// IntersectionObserverconst Observer new IntersectionObserver((entries, observer) …

PWARL CTF and others

title: 一些复杂点的题目 date: 2024-03-09 16:05:24 tags: CTF 2024年3月9日 今日习题完成&#xff1a; 1.BUU [网鼎杯 2020 半决赛]AliceWebsite 2.[RoarCTF 2019]Online Proxy 3.[Polar CTF]到底给不给flag呢 4.网鼎杯 2020 总决赛]Game Exp [RoarCTF 2019]Online Proxy …

【软件工程导论】——软工学绪论及传统软件工程(学习笔记)

&#x1f4d6; 前言&#xff1a;随着软件产业的发展&#xff0c;计算机应用逐步渗透到社会生活的各个角落&#xff0c;使各行各业都发生了很大的变化。这同时也促使人们对软件的品种、数量、功能和质量等提出了越来越高的要求。然而&#xff0c;软件的规模越大、越复杂&#xf…

X64 页表结构

PML4&#xff08;Page Map Level 4&#xff09;是x86-64架构中用于管理虚拟内存地址翻译的四级页表结构之一。它是一种树形结构&#xff0c;由多个页目录表&#xff08;Page Directory Pointer Table&#xff0c;PDPT&#xff09;组成&#xff0c;每个PDPT有512个指向下一级页表…

Hololens 2应用开发系列(3)——MRTK基础知识及配置文件配置(中)

Hololens 2应用开发系列&#xff08;3&#xff09;——MRTK基础知识及配置文件配置&#xff08;中&#xff09; 一、前言二、输入系统2.1 MRTK输入系统介绍2.2 输入数据提供者&#xff08;Input Data Providers&#xff09;2.3 输入动作&#xff08;Input Actions&#xff09;2…