学习内容:
利用 CUDA 的 cuBLAS 库,编写复数矩阵乘法的代码
学习产出:
编写代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <random>// Random seed: a fixed value reproduces the same random numbers on every run.
// To get different random numbers on each run, seed with the current time instead, e.g. std::time(nullptr).
// The distribution object below defines the range of the random numbers: -10 to 10.
unsigned seed = 100;
std::mt19937 generator(seed);
std::uniform_real_distribution<float> distribution(-10.0f, 10.0f);// Below: helper that generates a random complex number
/**
 * Draws one random single-precision complex number.
 *
 * Two consecutive samples are taken from the file-scope `distribution`
 * using the file-scope `generator`: the first becomes the real part, the
 * second the imaginary part.
 *
 * @return The complex value built from the two samples.
 */
cuComplex getRandomComplex() {
    // Keep the draws as two separate statements: the real part must consume
    // the first sample and the imaginary part the second.
    const float re = distribution(generator);
    const float im = distribution(generator);
    return make_cuComplex(re, im);
}
// Initialize a complex matrix
/**
 * Fills a buffer with random complex numbers.
 *
 * Elements are written in increasing index order, one getRandomComplex()
 * call per element.
 *
 * @param data Destination buffer; must hold at least `size` elements.
 * @param size Number of elements to write. Nothing is written when it is
 *             zero or negative.
 */
void initComplexMatrix(cuComplex *data, int size) {
    cuComplex *cursor = data;
    int remaining = size;
    while (remaining > 0) {
        *cursor = getRandomComplex();
        ++cursor;
        --remaining;
    }
}
// Print a complex matrix
/**
 * Prints a complex matrix to stdout, one matrix row per output line.
 *
 * The buffer is read in row-major order: element (i, j) is data[i * cols + j].
 * Each element is printed as "(real, imag) ".
 *
 * @param data Matrix elements, at least rows * cols of them.
 * @param rows Number of rows to print.
 * @param cols Number of columns to print.
 */
void printComplexMatrix(const cuComplex *data, int rows, int cols) {
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            const cuComplex &z = data[i * cols + j];
            printf("(%f, %f) ", z.x, z.y);
        }
        printf("\n");
    }
}

// Abort with a diagnostic if a CUDA runtime call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Abort with a diagnostic if a cuBLAS call does not return CUBLAS_STATUS_SUCCESS.
#define CUBLAS_CHECK(call)                                                    \
    do {                                                                      \
        cublasStatus_t st_ = (call);                                          \
        if (st_ != CUBLAS_STATUS_SUCCESS) {                                   \
            fprintf(stderr, "cuBLAS error at %s:%d: status %d\n", __FILE__,   \
                    __LINE__, (int)st_);                                      \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/**
 * Demo: multiplies two random complex matrices with cuBLAS and prints
 * the operands and the product.
 *
 * A is M x K, B is K x N and C is M x N. All three are stored row-major on the
 * host, which is the layout printComplexMatrix() prints.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if a host
 *         allocation, CUDA runtime call or cuBLAS call fails.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    // Matrix sizes.
    const int M = 2;  // rows of A
    const int N = 2;  // columns of B
    const int K = 2;  // columns of A / rows of B

    const size_t bytesA = (size_t)M * K * sizeof(cuComplex);
    const size_t bytesB = (size_t)K * N * sizeof(cuComplex);
    const size_t bytesC = (size_t)M * N * sizeof(cuComplex);

    // Allocate host memory.
    cuComplex *h_A = (cuComplex *)malloc(bytesA);
    cuComplex *h_B = (cuComplex *)malloc(bytesB);
    cuComplex *h_C = (cuComplex *)malloc(bytesC);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Randomly initialize A and B.
    initComplexMatrix(h_A, M * K);
    initComplexMatrix(h_B, K * N);

    // Print A.
    printf("h_A:\n");
    printComplexMatrix(h_A, M, K);

    // Print B.
    printf("h_B:\n");
    printComplexMatrix(h_B, K, N);

    // Allocate device memory.
    cuComplex *d_A = NULL;
    cuComplex *d_B = NULL;
    cuComplex *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, bytesA));
    CUDA_CHECK(cudaMalloc((void **)&d_B, bytesB));
    CUDA_CHECK(cudaMalloc((void **)&d_C, bytesC));

    // Copy the operands from host to device.
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytesB, cudaMemcpyHostToDevice));

    // Create the cuBLAS handle.
    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    // GEMM scalars: C = alpha * A * B + beta * C.
    const cuComplex alpha = make_cuComplex(1.0f, 0.0f);  // scaling factor
    const cuComplex beta = make_cuComplex(0.0f, 0.0f);   // accumulation factor

    // cuBLAS interprets every buffer as column-major, so a row-major X is
    // seen as X^T. To obtain the row-major product C = A * B we ask cuBLAS
    // for C^T = B^T * A^T: pass B first, then A, with dimensions (N, M, K)
    // and the row lengths as leading dimensions.
    // NOTE: passing (d_A, M, d_B, K, d_C, M) with CUBLAS_OP_N instead yields,
    // when printed row-major, the product h_B * h_A rather than h_A * h_B;
    // output produced with that call order will differ in h_C.
    CUBLAS_CHECK(cublasCgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                             N, M, K,
                             &alpha,
                             d_B, N,
                             d_A, K,
                             &beta,
                             d_C, N));

    // Copy the result back to the host. This blocking copy also waits for
    // the GEMM to finish.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytesC, cudaMemcpyDeviceToHost));

    // Print the result matrix C.
    printf("h_C:\n");
    printComplexMatrix(h_C, M, N);

    // Release the cuBLAS handle and all memory.
    CUBLAS_CHECK(cublasDestroy(handle));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
利用nvcc进行编译:
nvcc -arch=sm_61 demo_v2.cu -o demo_v2 -lcublas
执行:
.\demo_v2.exe
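得到如下结果(注意:以下输出是按 cublasCgemm(..., d_A, M, d_B, K, ..., d_C, M) 的参数顺序运行得到的。cuBLAS 按列主序解释内存,而 printComplexMatrix 按行主序打印,所以按行主序阅读时,这里的 h_C 等于 h_B × h_A,而不是 h_A × h_B):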
h_A:
(0.868099, 3.423113) (-4.432612, -1.759072)
(-1.509648, 0.527649) (6.895523, -7.027903)
h_B:
(-9.905622, -6.865778) (-7.568618, -6.270657)
(3.414982, -5.797845) (6.517056, -0.945202)
h_C:
(29.637926, -34.395329) (-64.428764, 57.810093)
(13.471494, 11.522404) (12.959600, -32.626499)