// Headers for the constant-memory example.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>

// Two integer coefficients placed in constant memory (64 KB, cached, and
// broadcast to a whole warp when all lanes read the same address).
// They are set from the host with cudaMemcpyToSymbol before the kernel runs.
__constant__ int constant_f;
__constant__ int constant_g;

// Number of elements processed by the example (one block of N threads).
#define N 5
// Kernel: computes d_out[i] = constant_f * d_in[i] + constant_g, reading the
// two coefficients from constant memory (every lane reads the same address,
// so the constant cache broadcasts the value to the whole warp).
// Expected launch layout: one 1-D block of exactly N threads (<<<1, N>>>),
// so threadIdx.x alone is a valid index into both arrays.
__global__ void gpu_constant_memory(float *d_in, float *d_out)
{
    // Thread index of the current kernel instance.
    int tid = threadIdx.x;
    d_out[tid] = constant_f * d_in[tid] + constant_g;
}
常量内存中的变量使用 __constant__ 关键字修饰
使用 cudaMemcpyToSymbol 函数把这些常量复制到内核执行所需要的常量内存中
常量内存应合理使用:只有当一个 warp 内的所有线程读取同一地址时,常量缓存才能发挥广播优势;若各线程访问不同地址,读取会被串行化,反而会增加程序执行时间
主函数调用如下:
// Host driver: initializes the input, uploads the two coefficients to
// constant memory, launches the kernel and prints the results.
int main(void)
{
    // Host-side input/output arrays.
    float h_in[N], h_out[N];
    // Device pointers.
    float *d_in, *d_out;
    int h_f = 2;
    int h_g = 20;

    // Allocate device memory.
    cudaMalloc((void **)&d_in, N * sizeof(float));
    cudaMalloc((void **)&d_out, N * sizeof(float));

    // Initialize the input array: h_in = {0, 1, ..., N-1}.
    for (int i = 0; i < N; i++) {
        h_in[i] = i;
    }

    // Copy the input from host to device.
    cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);

    // Copy the coefficients into constant memory; the kernel reads them as
    // constant_f / constant_g. The offset/direction arguments are optional
    // and default to 0 / cudaMemcpyHostToDevice (second call shows that).
    cudaMemcpyToSymbol(constant_f, &h_f, sizeof(int), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(constant_g, &h_g, sizeof(int));

    // Launch one block of N threads (matches the kernel's indexing scheme).
    gpu_constant_memory<<<1, N>>>(d_in, d_out);

    // Kernel launches return no status directly — check for launch errors.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
    }

    // Blocking copy back to the host (synchronizes with the kernel).
    cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Print the results.
    printf("Use of Constant memory on GPU \n");
    for (int i = 0; i < N; i++) {
        printf("The expression for input %f is %f\n", h_in[i], h_out[i]);
    }

    // Release device memory.
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
// Headers for the texture-memory example.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>

#define NUM_THREADS 10
// NOTE(review): if this example shares a translation unit with the
// constant-memory example above, N was previously defined as 5 —
// #undef avoids a conflicting-redefinition error.
#undef N
#define N 10

// Legacy 1-D texture reference, bound to a CUDA array at runtime.
// NOTE(review): texture *references* are deprecated since CUDA 11 and
// removed in CUDA 12 — new code should use cudaTextureObject_t instead.
texture<float, 1, cudaReadModeElementType> textureRef;
// Kernel: each thread fetches one element from the 1-D texture and writes it
// to d_out. Launched as a 1-D grid of NUM_THREADS-sized blocks covering n
// elements; the bounds check guards the tail block when n is not a multiple
// of the block size.
__global__ void gpu_texture_memory(int n, float *d_out)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float temp = tex1D(textureRef, float(idx));
        d_out[idx] = temp;
    }
}

// Host driver: stages data into a CUDA array, binds it to the texture
// reference, runs the kernel and prints the fetched values.
int main()
{
    // Ceil-divide N by NUM_THREADS so every element is covered.
    int num_blocks = N / NUM_THREADS + ((N % NUM_THREADS) ? 1 : 0);

    // Device output buffer.
    float *d_out;
    cudaMalloc((void **)&d_out, sizeof(float) * N);

    // Host buffer for the results.
    float *h_out = (float *)malloc(sizeof(float) * N);

    // Host input: 0, 1, ..., N-1.
    float h_in[N];
    for (int i = 0; i < N; i++) {
        h_in[i] = float(i);
    }

    // Allocate a CUDA array whose channel format matches the texture.
    cudaArray *cu_Array;
    cudaMallocArray(&cu_Array, &textureRef.channelDesc, N, 1);

    // Copy data into the CUDA array; (0, 0) is the destination offset
    // (top-left corner). NOTE(review): cudaMemcpyToArray is deprecated —
    // cudaMemcpy2DToArray is the modern replacement.
    cudaMemcpyToArray(cu_Array, 0, 0, h_in, sizeof(float) * N, cudaMemcpyHostToDevice);

    // Bind the texture reference to the CUDA array (deprecated API, see
    // the note on textureRef).
    cudaBindTextureToArray(textureRef, cu_Array);

    // Launch the kernel.
    gpu_texture_memory<<<num_blocks, NUM_THREADS>>>(N, d_out);

    // Blocking copy back to the host (synchronizes with the kernel).
    cudaMemcpy(h_out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Print the results.
    printf("Use of Texture memory on GPU: \n");
    for (int i = 0; i < N; i++) {
        printf("Texture element at %d is : %f\n", i, h_out[i]);
    }

    // Cleanup: host buffer, device buffer, CUDA array, texture binding.
    free(h_out);
    cudaFree(d_out);
    cudaFreeArray(cu_Array);
    cudaUnbindTexture(textureRef);
    return 0;
}
前言
在构建基于 Spring 的 Web 应用程序时,了解初始化流程是至关重要的。本文将详细介绍 Servlet 容器的初始化过程,并重点探讨 Spring 框架在其中的作用,特别是 ServletContainerInitializer、SpringServletContainerInitializer 和 WebAp…
Model Memory Utility - a Hugging Face Space by hf-accelerate
这个工具可以计算在 Hugging Face Hub上托管的大型模型训练和执行推理时所需的vRAM内存量。模型所需的最低推荐vRAM内存量表示为“最大层”的大小,模型的训练大约是其大小的4倍(针对Adam…
100323. 优质数对的总数 I
原题链接
100323. 优质数对的总数 I
思路分析
签到题
AC代码
class Solution:def numberOfPairs(self, nums1: List[int], nums2: List[int], k: int) -> int:n, m len(nums1), len(nums2)ret 0for i in range(n):for j in range(m):if nu…