0, 下载并编译 rocBLAS 的调试版本
sudo apt install python3.10-venv
sudo apt install libmsgpack-dev
sudo pip install joblib
git clone --recursive https://github.com/ROCm/rocBLAS.git
$ cd rocBLAS/
$ ./install.sh -i -g
构建时间也不短
1,下载并编译 rocSolver 的调试版本
git clone --recursive git@github.com:ROCm/rocSOLVER.git
cd rocSOLVER/
~/ex_rocm/rocSOLVER$ ./install.sh -i -g --install_dir ../local/ --rocblas_dir /opt/rocm/lib
这个编译时间真的长,3个小时的样子,主要是99%后花了两个小时多,跟计算机性能关系不大。
2,编译app源代码
ex_rocsolver_dgeqrf.cpp
/////////////////////////////
// example.cpp source code //
/////////////////////////////

// Minimal rocSOLVER example: builds a small random column-major matrix on the
// host, copies it to the GPU, runs rocsolver_dgeqrf (QR factorization) on it,
// then copies the factored matrix and the Householder scalars back and prints
// them.

#include <algorithm> // for std::min
#include <stddef.h> // for size_t
#include <stdio.h>
#include <stdlib.h> // for rand, exit
#include <vector>
#include <hip/hip_runtime_api.h> // for hip functions
#include <rocsolver/rocsolver.h> // for all the rocsolver C interfaces and type declarations

// Aborts the program with a message if a HIP runtime call did not succeed.
static void check_hip(hipError_t status, const char* what)
{
    if(status != hipSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, hipGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Aborts the program with a message if a rocBLAS/rocSOLVER call did not succeed.
static void check_rocblas(rocblas_status status, const char* what)
{
    if(status != rocblas_status_success)
    {
        fprintf(stderr, "%s failed with rocblas_status %d\n", what, static_cast<int>(status));
        exit(EXIT_FAILURE);
    }
}

// Fills A[0..n) with pseudo-random values in [0, 2) with a 0.001 step.
// rand() is never seeded, so every run produces the same sequence.
void init_vector(double* A, int n)
{
    for(int i = 0; i < n; i++)
    {
        A[i] = (rand() % 2000) / 1000.0;
    }
}

// Prints the M-by-N matrix stored column-major in A with leading dimension
// lda, one matrix row per output line.
void print_matrix(double* A, int M, int N, int lda)
{
    for(int i = 0; i < M; i++)
    {
        for(int j = 0; j < N; j++)
        {
            printf("%7.4f, ", A[i + j * lda]);
        }
        printf("\n");
    }
}

int main()
{
    // here is where you would initialize M, N and lda with desired values
    const rocblas_int M = 7;
    const rocblas_int N = 7;
    const rocblas_int lda = M;
    const rocblas_int K = std::min(M, N); // number of Householder scalars

    rocblas_handle handle;
    check_rocblas(rocblas_create_handle(&handle), "rocblas_create_handle");

    const size_t size_A = static_cast<size_t>(lda) * N; // the size of the array for the matrix
    const size_t size_piv = static_cast<size_t>(K); // the size of array for the Householder scalars

    std::vector<double> hA(size_A); // creates array for matrix in CPU
    // creates array for householder scalars in CPU; std::vector value-initializes
    // its elements, so the array already holds zeros and needs no memset
    std::vector<double> hIpiv(size_piv);

    // here is where you would initialize matrix A (array hA) with input data
    // note: matrices must be stored in column major format,
    //       i.e. entry (i,j) should be accessed by hA[i + j*lda]
    init_vector(hA.data(), static_cast<int>(size_A));
    print_matrix(hA.data(), M, N, lda);

    double* dA = nullptr;
    double* dIpiv = nullptr;
    // allocates memory for matrix in GPU
    check_hip(hipMalloc(&dA, sizeof(double) * size_A), "hipMalloc(dA)");
    // allocates memory for scalars in GPU
    check_hip(hipMalloc(&dIpiv, sizeof(double) * size_piv), "hipMalloc(dIpiv)");

    // copy data to GPU
    check_hip(hipMemcpy(dA, hA.data(), sizeof(double) * size_A, hipMemcpyHostToDevice),
              "hipMemcpy(dA <- hA)");

    // compute the QR factorization on the GPU
    check_rocblas(rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv), "rocsolver_dgeqrf");

    // copy the results back to CPU
    check_hip(hipMemcpy(hA.data(), dA, sizeof(double) * size_A, hipMemcpyDeviceToHost),
              "hipMemcpy(hA <- dA)");
    check_hip(hipMemcpy(hIpiv.data(), dIpiv, sizeof(double) * size_piv, hipMemcpyDeviceToHost),
              "hipMemcpy(hIpiv <- dIpiv)");

    // the results are now in hA and hIpiv, so you can use them here
    // NOTE(review): per the rocSOLVER GEQRF documentation, only the entries on
    // and above the diagonal of the output are R; the entries below it hold
    // the Householder vectors. The full array is printed under the "R" label.
    printf("\nR =\n");
    print_matrix(hA.data(), M, N, lda);

    // Only K = min(M, N) scalars exist; printing N of them would read past the
    // end of hIpiv whenever N > M.
    printf("\ntau=\n");
    print_matrix(hIpiv.data(), 1, K, 1);

    check_hip(hipFree(dA), "hipFree(dA)"); // de-allocate GPU memory
    check_hip(hipFree(dIpiv), "hipFree(dIpiv)");
    check_rocblas(rocblas_destroy_handle(handle), "rocblas_destroy_handle"); // destroy handle

    return 0;
}
Makefile
# Builds the rocSOLVER dgeqrf example against the locally built debug
# rocSOLVER install tree. Recipe lines must start with a TAB character.
EXE := ex_rocsolver_dgeqrf

all: $(EXE)

# NOTE(review): the source includes <rocsolver/rocsolver.h> and
# <hip/hip_runtime_api.h>; confirm that this -I path (plus the compiler's
# default search path) actually resolves both on your machine.
INC := -I /home/hipper/ex_rocm/rocSOLVER/build/debug/rocsolver-install/include/rocsolver -D__HIP_PLATFORM_AMD__
LD_FLAGS := -L /home/hipper/ex_rocm/rocSOLVER/build/debug/rocsolver-install/lib -lamdhip64 -lrocblas -lrocsolver

# Compile with debug info so the example can be stepped through in gdb.
ex_rocsolver_dgeqrf.o: ex_rocsolver_dgeqrf.cpp
	g++ -g $< $(INC) -c -o $@

ex_rocsolver_dgeqrf: ex_rocsolver_dgeqrf.o
	g++ -g $< $(LD_FLAGS) -o $@

.PHONY: all clean
clean:
	${RM} *.o $(EXE)
3,运行调试
export LD_LIBRARY_PATH=/home/hipper/ex_rocm/rocSOLVER/build/debug/rocsolver-install/lib
137 ROCSOLVER_LAUNCH_KERNEL(set_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
137 ROCSOLVER_LAUNCH_KERNEL(set_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
145 if(j < n - 1)
(gdb)
147 rocsolver_larf_template(handle, rocblas_side_left, m - j, n - j - 1, A,
(gdb)
154 ROCSOLVER_LAUNCH_KERNEL(restore_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
154 ROCSOLVER_LAUNCH_KERNEL(restore_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
129 for(rocblas_int j = 0; j < dim; ++j)
(gdb)
132 rocsolver_larfg_template(handle, m - j, A, shiftA + idx2D(j, j, lda), A,
(gdb)
137 ROCSOLVER_LAUNCH_KERNEL(set_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
137 ROCSOLVER_LAUNCH_KERNEL(set_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
145 if(j < n - 1)
(gdb)
154 ROCSOLVER_LAUNCH_KERNEL(restore_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
154 ROCSOLVER_LAUNCH_KERNEL(restore_diag<T>, dim3(batch_count, 1, 1), dim3(1, 1, 1), 0, stream,
(gdb)
129 for(rocblas_int j = 0; j < dim; ++j)
(gdb)
163 }
(gdb)
rocsolver_geqrf_template<false, false, double, double*> (handle=0x55555565ecd0, m=<optimized out>, n=<optimized out>, A=0x7fff09000000, shiftA=0, lda=7, strideA=<optimized out>, ipiv=<optimized out>, strideP=<optimized out>, batch_count=<optimized out>, scalars=<optimized out>, work_workArr=<optimized out>, Abyx_norms_trfact=<optimized out>, diag_tmptr=<optimized out>, workArr=<optimized out>) at /home/hipper/ex_rocm/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp:174
174 }
(gdb)