Tesla T4 P2P测试

Tesla T4 P2P测试

  • 一.测试环境
  • 二.测试步骤
    • 1.获取设备信息
    • 2.查看PCIE拓扑结构
    • 3.选择9B、9E这二张
    • 4.查看逻辑设备ID
    • 5.设置环境变量(需要用逻辑设备ID,通过UUID跟smi看到的物理ID关联)
    • 6.不同地址的原子操作
    • 2.P2P与非P2P的性能差异
    • 3.GPU带宽测试

Tesla T4 P2P测试

  • 通过物理ID找到逻辑ID
  • NCU P2P相关的Metrics
  • PCIE、DRAM相关的Metrics

一.测试环境


二.测试步骤

1.获取设备信息

nvidia-smi -L
GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-c91a8013-0877-df61-53b9-016fabcd5f82)
GPU 1: NVIDIA A30 (UUID: GPU-f67790b5-bb58-614d-4190-3598a99f925e)
GPU 2: Tesla T4 (UUID: GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63)
GPU 3: Tesla T4 (UUID: GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e)
GPU 4: Tesla T4 (UUID: GPU-d59282d2-060d-270e-1c0e-f50e936ffede)
GPU 5: NVIDIA GeForce RTX 3080 Ti (UUID: GPU-9a131b18-28de-d6a1-01e9-76a133e21680)
GPU 6: NVIDIA A30 (UUID: GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02)
GPU 7: Tesla T4 (UUID: GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1)nvidia-smi  -q | grep "Bus Id"
Bus Id                            : 00000000:34:00.0
Bus Id                            : 00000000:35:00.0
Bus Id                            : 00000000:36:00.0 Tesla T4
Bus Id                            : 00000000:37:00.0 Tesla T4
Bus Id                            : 00000000:9B:00.0 Tesla T4 d59282d2
Bus Id                            : 00000000:9C:00.0
Bus Id                            : 00000000:9D:00.0
Bus Id                            : 00000000:9E:00.0 Tesla T4 1f45f1e1

2.查看PCIE拓扑结构

lstopo --ignore Core --ignore Misc --ignore PU

在这里插入图片描述

3.选择9B、9E这二张

4.查看逻辑设备ID

tee devinfo.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>#define CUDA_CHECK(call) \do { \cudaError_t error = call; \if (error != cudaSuccess) { \fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \exit(EXIT_FAILURE); \} \} while (0)int main() {int device_count=0;CUDA_CHECK(cudaGetDeviceCount(&device_count));for(int deviceid=0; deviceid<device_count;deviceid++){CUDA_CHECK(cudaSetDevice(deviceid));  cudaDeviceProp deviceProp;CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));printf("Index,%d UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x\n",deviceid,(unsigned char)deviceProp.uuid.bytes[0], (unsigned char)deviceProp.uuid.bytes[1],(unsigned char)deviceProp.uuid.bytes[2], (unsigned char)deviceProp.uuid.bytes[3],(unsigned char)deviceProp.uuid.bytes[4], (unsigned char)deviceProp.uuid.bytes[5],(unsigned char)deviceProp.uuid.bytes[6], (unsigned char)deviceProp.uuid.bytes[7],(unsigned char)deviceProp.uuid.bytes[8], (unsigned char)deviceProp.uuid.bytes[9],(unsigned char)deviceProp.uuid.bytes[10],(unsigned char)deviceProp.uuid.bytes[11],(unsigned char)deviceProp.uuid.bytes[12],(unsigned char)deviceProp.uuid.bytes[13],(unsigned char)deviceProp.uuid.bytes[14],(unsigned char)deviceProp.uuid.bytes[15]);}return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o devinfo devinfo.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
unset CUDA_VISIBLE_DEVICES && ./devinfo
  • 输出
Index,0 UUID:GPU-c91a8013-0877-df61-53b9-016fabcd5f82
Index,1 UUID:GPU-9a131b18-28de-d6a1-01e9-76a133e21680
Index,2 UUID:GPU-f67790b5-bb58-614d-4190-3598a99f925e
Index,3 UUID:GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02
Index,4 UUID:GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63
Index,5 UUID:GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e
Index,6 UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede  #选中
Index,7 UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1  #选中

5.设置环境变量(需要用逻辑设备ID,通过UUID跟smi看到的物理ID关联)

export CUDA_VISIBLE_DEVICES=6,7

6.不同地址的原子操作

tee p2p.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>#define CUDA_CHECK(call) \do { \cudaError_t error = call; \if (error != cudaSuccess) { \fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \exit(EXIT_FAILURE); \} \} while (0)template<int mode>
__global__ void dummyKernel(float *data) {int idx = threadIdx.x + blockIdx.x * blockDim.x;for(int i=0;i<102400;i++){atomicAdd(&data[idx], idx*i);}
}template <typename F>
void TIMEIT(F const &f,cudaStream_t &stream,cudaEvent_t &start_ev,cudaEvent_t&stop_ev)
{ CUDA_CHECK(cudaDeviceSynchronize());auto start = std::chrono::high_resolution_clock::now();cudaEventRecord(start_ev, stream); f(stream); cudaEventRecord(stop_ev, stream); CUDA_CHECK(cudaEventSynchronize(stop_ev)); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> diff = end - start; float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start_ev, stop_ev); printf("E2E:%8.4fms Kernel:%8.4fms\n",diff.count()*1000,milliseconds);
}int main() {int devID0 = 0, devID1 = 1;int device_count=0;CUDA_CHECK(cudaGetDeviceCount(&device_count));for(int deviceid=0; deviceid<2;deviceid++){CUDA_CHECK(cudaSetDevice(deviceid));  cudaDeviceProp deviceProp;CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));std::cout << "-----------------------------------" << std::endl;std::cout << "Device Index: " << deviceid << std::endl;std::cout << "Compute Capability:"<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;std::cout << "Device name: " << deviceProp.name << std::endl;std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;std::cout << "canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl;std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;printf("UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x\n",(unsigned char)deviceProp.uuid.bytes[0], (unsigned char)deviceProp.uuid.bytes[1],(unsigned char)deviceProp.uuid.bytes[2], (unsigned char)deviceProp.uuid.bytes[3],(unsigned char)deviceProp.uuid.bytes[4], (unsigned char)deviceProp.uuid.bytes[5],(unsigned char)deviceProp.uuid.bytes[6], (unsigned char)deviceProp.uuid.bytes[7],(unsigned char)deviceProp.uuid.bytes[8], (unsigned char)deviceProp.uuid.bytes[9],(unsigned char)deviceProp.uuid.bytes[10],(unsigned char)deviceProp.uuid.bytes[11],(unsigned char)deviceProp.uuid.bytes[12],(unsigned char)deviceProp.uuid.bytes[13],(unsigned char)deviceProp.uuid.bytes[14],(unsigned char)deviceProp.uuid.bytes[15]);}std::cout << "-----------------------------------" << std::endl;int p2p_value=0;CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value,cudaDevP2PAttrAccessSupported,devID0,devID1));std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;#define block_size (32*4)size_t dataSize = block_size * sizeof(float);float *data0_dev, *data1_dev;CUDA_CHECK(cudaSetDevice(devID0));CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));CUDA_CHECK(cudaSetDevice(devID1));CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));CUDA_CHECK(cudaMemset(data1_dev, 0, dataSize));// 启用P2Pint canAccessPeer=0;CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));if (canAccessPeer) {CUDA_CHECK(cudaSetDevice(devID1));cudaStream_t stream;cudaStreamCreate(&stream);cudaEvent_t start_ev, stop_ev;cudaEventCreate(&start_ev);cudaEventCreate(&stop_ev);CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0));//让devID1可以访问devID0的设备内存TIMEIT([&data0_dev](cudaStream_t &stream)-> void {dummyKernel<1><<<1, block_size,0,stream>>>(data0_dev);},stream,start_ev,stop_ev);TIMEIT([&data1_dev](cudaStream_t &stream)-> void {dummyKernel<3><<<1, block_size,0,stream>>>(data1_dev);},stream,start_ev,stop_ev);CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));}else{printf("%s %d canAccessPeer=0\n",__FILE__,__LINE__);}CUDA_CHECK(cudaFree(data0_dev));CUDA_CHECK(cudaFree(data1_dev));return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./p2p/usr/local/cuda/bin/ncu --metrics \
lts__t_requests_aperture_device.sum,\
lts__t_sectors_aperture_device.sum,\
lts__t_requests_aperture_peer.sum,\
lts__t_requests_srcnode_gpc_aperture_peer.sum,\
lts__t_requests_srcunit_l1_aperture_peer.sum,\
lts__t_requests_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_aperture_peer.sum,\
lts__t_sectors_srcnode_gpc_aperture_peer.sum,\
lts__t_sectors_srcunit_l1_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
dram__bytes_read.sum,\
dram__bytes_write.sum,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
  • 输出
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
E2E:132.1300ms Kernel:132.1245ms  #GPU1通过P2P对GPU0的设备内存进行原子操作
E2E:  3.5552ms Kernel:  3.5444ms  #GPU1对本地DRAM进行原子操作void dummyKernel<(int)1>(float *), 2024-Sep-25 17:06:47, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum                                                             Kbyte                          52.86
dram__bytes_read.sum.per_second                                           Kbyte/second                         405.11
dram__bytes_write.sum                                                            Kbyte                           1.25
dram__bytes_write.sum.per_second                                          Kbyte/second                           9.56
lts__t_requests_aperture_device.sum                                            request                          17775
lts__t_requests_aperture_peer.sum                                              request                         409600 #4个warp,每个102400 次合并访问 地址范围了4*32*4
lts__t_requests_srcnode_gpc_aperture_peer.sum                                  request                         409600
lts__t_requests_srcunit_l1_aperture_peer.sum                                   request                              0
lts__t_requests_srcunit_tex_aperture_peer.sum                                  request                         409600
lts__t_sectors_aperture_device.sum                                              sector                          93043
lts__t_sectors_aperture_peer.sum                                                sector                        1638400 
lts__t_sectors_srcnode_gpc_aperture_peer.sum                                    sector                        1638400 #合并访问一次请求4个sector 1638400*32=50MB
lts__t_sectors_srcunit_l1_aperture_peer.sum                                     sector                              0
lts__t_sectors_srcunit_tex_aperture_peer.sum                                    sector                        1638400
pcie__read_bytes.sum                                                             Mbyte                          59.10
pcie__read_bytes.sum.per_second                                           Mbyte/second                         452.93
pcie__write_bytes.sum                                                            Mbyte                          72.18 #实际PCIE读写加起来超过50MB
pcie__write_bytes.sum.per_second                                          Mbyte/second                         553.17
---------------------------------------------------------------------- --------------- ------------------------------void dummyKernel<(int)3>(float *), 2024-Sep-25 17:06:48, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum                                                             Kbyte                           6.21
dram__bytes_read.sum.per_second                                           Mbyte/second                           1.77
dram__bytes_write.sum                                                             byte                            224
dram__bytes_write.sum.per_second                                          Kbyte/second                          63.93
lts__t_requests_aperture_device.sum                                            request                         414530
lts__t_requests_aperture_peer.sum                                              request                              0
lts__t_requests_srcnode_gpc_aperture_peer.sum                                  request                              0
lts__t_requests_srcunit_l1_aperture_peer.sum                                   request                              0
lts__t_requests_srcunit_tex_aperture_peer.sum                                  request                              0
lts__t_sectors_aperture_device.sum                                              sector                        1643605
lts__t_sectors_aperture_peer.sum                                                sector                              0
lts__t_sectors_srcnode_gpc_aperture_peer.sum                                    sector                              0
lts__t_sectors_srcunit_l1_aperture_peer.sum                                     sector                              0
lts__t_sectors_srcunit_tex_aperture_peer.sum                                    sector                              0
pcie__read_bytes.sum                                                             Kbyte                           3.58
pcie__read_bytes.sum.per_second                                           Mbyte/second                           1.02
pcie__write_bytes.sum                                                            Kbyte                           3.07
pcie__write_bytes.sum.per_second                                          Kbyte/second                         876.74
---------------------------------------------------------------------- --------------- ------------------------------

2.P2P与非P2P的性能差异

tee p2p.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>#define CUDA_CHECK(call) \do { \cudaError_t error = call; \if (error != cudaSuccess) { \fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \exit(EXIT_FAILURE); \} \} while (0)template<int mode>
__global__ void dummyKernel(float *input_data,float *output_data) {int idx = threadIdx.x + blockIdx.x * blockDim.x;output_data[idx]=input_data[idx];
}template <typename F>
void TIMEIT(F const &f,cudaStream_t &stream,cudaEvent_t &start_ev,cudaEvent_t&stop_ev)
{ CUDA_CHECK(cudaDeviceSynchronize());auto start = std::chrono::high_resolution_clock::now();cudaEventRecord(start_ev, stream); f(stream); cudaEventRecord(stop_ev, stream); CUDA_CHECK(cudaEventSynchronize(stop_ev)); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> diff = end - start; float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start_ev, stop_ev); printf("E2E:%7.2fms Kernel:%7.2fms\n",diff.count()*1000,milliseconds);
}int main() {int devID0 = 0, devID1 = 1;int device_count=0;CUDA_CHECK(cudaGetDeviceCount(&device_count));for(int deviceid=0; deviceid<2;deviceid++){CUDA_CHECK(cudaSetDevice(deviceid));  cudaDeviceProp deviceProp;CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));std::cout << "-----------------------------------" << std::endl;std::cout << "Device Index: " << deviceid << std::endl;std::cout << "Compute Capability:"<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;std::cout << "Device name: " << deviceProp.name << std::endl;std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;}std::cout << "-----------------------------------" << std::endl;int p2p_value=0;CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value,cudaDevP2PAttrAccessSupported,devID0,devID1));std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;#define block_size 1024#define block_count 1000000Lsize_t dataSize = block_count*block_size * sizeof(float);float *data0_dev, *data1_dev,*data1_dev_ex;CUDA_CHECK(cudaSetDevice(devID0));CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));CUDA_CHECK(cudaSetDevice(devID1));CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));CUDA_CHECK(cudaMalloc(&data1_dev_ex, dataSize));char *host;CUDA_CHECK(cudaMallocHost(&host,dataSize));printf("Init Done(%.2f)GB..\n",dataSize/1024.0/1024.0/1024.0);// 启用P2Pint canAccessPeer=0;CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));if (canAccessPeer) {CUDA_CHECK(cudaSetDevice(devID1));cudaStream_t stream;cudaStreamCreate(&stream);cudaEvent_t start_ev, stop_ev;cudaEventCreate(&start_ev);cudaEventCreate(&stop_ev);CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0));//让devID1可以访问devID0的设备内存TIMEIT([&](cudaStream_t &stream)-> void {cudaMemcpyAsync(host,data1_dev,dataSize,cudaMemcpyHostToDevice,stream);},stream,start_ev,stop_ev);TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<1><<<block_count, block_size,0,stream>>>(data0_dev,data1_dev);},stream,start_ev,stop_ev);TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<2><<<block_count, block_size,0,stream>>>(data1_dev_ex,data1_dev);},stream,start_ev,stop_ev);CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));}else{printf("%s %d canAccessPeer=0\n",__FILE__,__LINE__);}CUDA_CHECK(cudaFreeHost(host));CUDA_CHECK(cudaFree(data0_dev));CUDA_CHECK(cudaFree(data1_dev));CUDA_CHECK(cudaFree(data1_dev_ex));return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./p2p/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
dram__bytes_read.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_write.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
  • 输出
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
Init Done(3.81)GB..
E2E: 325.70ms Kernel: 325.71ms  # GPU1 cudaMemcpyHostToDevice  11.697GB/s
E2E: 307.29ms Kernel: 307.31ms  # GPU1 通过P2P从GPU0的设备内存读取数据(比H2D快) 12.39GB/s
E2E:  37.90ms Kernel:  37.89ms  # GPU1 DRAM内 D2D的拷贝 2*100.55GB/svoid dummyKernel<1>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name                                         Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed            %         0.01
dram__bytes_read.sum.per_second                         Mbyte/s        37.35
dram__bytes_write.sum.pct_of_peak_sustained_elapsed           %         4.72
dram__bytes_write.sum.per_second                        Gbyte/s        15.10
pcie__read_bytes.sum.per_second                         Gbyte/s        15.01
pcie__write_bytes.sum.per_second                        Gbyte/s         3.34
--------------------------------------------------- ----------- ------------void dummyKernel<2>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name                                         Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed            %        37.34  #同时读写时,利用率加起来75%
dram__bytes_read.sum.per_second                         Gbyte/s       119.41
dram__bytes_write.sum.pct_of_peak_sustained_elapsed           %        37.81
dram__bytes_write.sum.per_second                        Gbyte/s       120.89  #加起来239GB/s,跟后面的带宽测试一致
pcie__read_bytes.sum.per_second                         Mbyte/s        22.03
pcie__write_bytes.sum.per_second                        Mbyte/s         7.42
--------------------------------------------------- ----------- ------------

3.GPU带宽测试

git clone https://www.github.com/nvidia/cuda-samples
cd cuda-samples/Samples/1_Utilities/deviceQuery
make clean && make
./deviceQuery
cd ../bandwidthTest/
make clean && make
./bandwidthTest --device=0
  • 输出
Running on...Device 0: Tesla T4Quick ModeHost to Device Bandwidth, 1 Device(s)PINNED Memory TransfersTransfer Size (Bytes)        Bandwidth(GB/s)32000000                     12.8Device to Host Bandwidth, 1 Device(s)PINNED Memory TransfersTransfer Size (Bytes)        Bandwidth(GB/s)32000000                     13.1Device to Device Bandwidth, 1 Device(s)PINNED Memory TransfersTransfer Size (Bytes)        Bandwidth(GB/s)32000000                     239.4
Result = PASS

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/pingmian/54969.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

2024PT展,现场精华

9月25-27日&#xff0c;2024年国际信息通信展&#xff08;简称PT展&#xff09;在北京国家会议中心召开。 小枣君去了现场&#xff0c;也拍了一些照片&#xff0c;特此分享给大家。 会场离“奥林匹克公园”地铁站很近&#xff1a; Logo设计得还是挺好看的&#xff1a; 熟悉的场…

一区黏菌算法+双向深度学习+注意力机制!SMA-BiTCN-BiGRU-Attention黏菌算法优化双向时间卷积双向门控循环单元融合注意力机制多变量回归预测

一区黏菌算法双向深度学习注意力机制&#xff01;SMA-BiTCN-BiGRU-Attention黏菌算法优化双向时间卷积双向门控循环单元融合注意力机制多变量回归预测 目录 一区黏菌算法双向深度学习注意力机制&#xff01;SMA-BiTCN-BiGRU-Attention黏菌算法优化双向时间卷积双向门控循环单元…

探索机器人快换盘技术的未来之路:智能化与协作的革新

在当今快速发展的科技时代&#xff0c;机器人已成为众多领域不可或缺的得力助手。其中&#xff0c;机器人快换盘技术作为提升机器人灵活性和应用广度的重要技术&#xff0c;正经历着前所未有的变革与创新。下面请随我们一起深入探讨这一技术的未来发展趋势。 一、人工智能&…

数据工程师岗位常见面试问题-2(附回答)

数据工程师已成为科技行业最重要的角色之一&#xff0c;是组织构建数据基础设施的骨干。随着企业越来越依赖数据驱动的决策&#xff0c;对成熟数据工程师的需求会不断上升。如果您正在准备数据工程师面试&#xff0c;那么应该掌握常见的数据工程师面试问题&#xff1a;包括工作…

Flink Lookup Join的工作原理、性能优化和应用场景

目录 1 Flink Lookup Join的工作原理 1.1 数据流处理与维表关联 1.2 键值对查询 1.3 数据时效性与准确性 2 Flink Lookup Join的实现方法 2.1 SQL语句编写 2.2 系统架构与数据流 3 Flink Lookup Join的性能优化 3.1 数据存储与索引 3.2 连接算法优化 3.3 资源配置与…

第19周JavaWeb编程实战-MyBatis实现OA系统 1-OA系统

办公OA系统项目开发 课程简介 本课程将通过慕课办公OA平台的开发&#xff0c;讲解实际项目开发中必须掌握的技能和设计技巧。课程分为三个主要阶段&#xff1a; 需求说明及环境准备&#xff1a; 基于RBAC的访问控制模块开发&#xff1a; 多级请假审批流程开发&#xff1a; …

matlab-对比两张图片的HSV分量的差值并形成直方图

%对比两张图片的HSV分量的差值并形成直方图&#xff0c;改个路径就能用&#xff0c;图片分辨率要一致 close all; clear all; clc; I1imread(E:\test\resources\image\1.jpg); I2imread(E:\test\resources\image\2.jpg); HSV1 rgb2ntsc(I1); HSV2 rgb2ntsc(I2); %HSV,HSV 代…

Android Webview和ScrollView冲突和WebView使用总结

1.因为Webview和ScrollView都用滑动事件&#xff0c;导致webview很难被滑动&#xff0c;即使被滑动了一点也非常不顺畅2.解决滑动冲突问题后发现&#xff0c;如果webview嵌套的html中含有轮播图等还是有问题。 使用自定义ScrollWebView解决这个问题 public class ScrollWebVi…

<<迷雾>> 第 4 章 电子计算机发明的前夜 示例电路

莫尔斯电报示意图 info::操作说明 鼠标单击开关切换开合状态 通电后, 线圈产生磁力从而将铁片开关(衔铁臂)吸引下来 primary::在线交互操作链接 https://cc.xiaogd.net/?startCircuitLinkhttps://book.xiaogd.net/cyjsjdmw-examples/assets/circuit/cyjsjdmw-ch04-01-morse-te…

Shopline对接需要注意的问题

Shopline对接是一项复杂而细致的工作&#xff0c;为了确保对接的顺利进行&#xff0c;并保证系统的稳定性和可靠性&#xff0c;需要注意以下几个方面。 1.API文档的详细阅读 功能理解&#xff1a; 仔细阅读Shopline提供的API文档&#xff0c;全面了解每个接口的功能、参数、返…

AI学习指南深度学习篇-Adadelta的Python实践

AI学习指南深度学习篇-Adadelta的Python实践 深度学习是人工智能领域的一个重要分支&#xff0c;近年来在各个领域都取得了显著的成就。在深度学习的模型训练中&#xff0c;优化算法起着至关重要的作用&#xff0c;其中Adadelta是一种常用的优化算法之一。本篇博客将使用Pytho…

初始docker以及docker的基本使用!!!

文章目录 虚拟化技术Docker/podman 命令通用命令查看docker 当前版本管理docker运行 镜像操作[image]列出本地所有镜像拉取镜像删除镜像把docker中的镜像打包成文件把镜像文件加载到docker中上传镜像 容器操作[container]创建容器docker run的参数选项列出所有容器启动容器停止…

JavaScript 中的闭包的形成及使用场景

JavaScript 中的闭包 闭包&#xff08;Closure&#xff09; 是 JavaScript 中一个非常重要且独特的概念&#xff0c;它指的是 函数能够记住并访问其词法作用域内的变量&#xff0c;即使这个函数在其词法作用域之外执行。 通俗地说&#xff0c;闭包是 一个函数可以“记住”它在…

安防区域保护:无线电干扰设备技术详解

在安防区域保护中&#xff0c;无线电干扰设备技术扮演着重要角色&#xff0c;它主要用于通过发射特定频率的无线电波来干扰无人机或其他无线电设备的通信链路、导航信号或控制信号&#xff0c;以达到削弱、阻断甚至控制这些设备运行的目的。以下是对无线电干扰设备技术的详细解…

EditText 设置imeOptions事件监听无效问题

在EditText获取焦点的时候&#xff0c;给软键盘右下角按键添加事件事件监听&#xff0c;发现出现setOnEditorActionListener事件监听无效的问题。 EditText设置 imeOptions表示为搜索事件&#xff0c;软键盘弹出时&#xff0c;右下角显示为搜索字样而不是换行符 android:sin…

【GAN】生成对抗网络Generative Adversarial Networks理解摘要

【Pytorch】生成对抗网络实战_pytorch生成对抗网络-CSDN博客 【损失函数】KL散度与交叉熵理解-CSDN博客 [1406.2661] Generative Adversarial Networks (arxiv.org) GAN本质是对抗或者说竞争&#xff0c;通过生成器和鉴别器的竞争获取有效地结果&#xff0c;换句话说&#xff0…

每日一练 2024.9.29(2)

目录 解题思路与代码实现 题目分析 一、解题策略 关键步骤&#xff1a; 二、代码实现 三、代码解析 四、复杂度分析 五、运行示例 示例1&#xff1a; 示例2&#xff1a; 六、总结 解题思路与代码实现 题目分析 这道题目要求我们找到字符串列表 strs 中的相似字符组…

C++——vector

1.简介 2.成员函数 2.1构造函数 void test_vector1() {//1.无参构造vector<int> v1;cout << v1.capacity() << endl;//2.传参构造vector<int> v2(10,1);//3.迭代器构造vector<int> v3(v2.begin(), v2.end());//也可以使用其它容器的迭代器区间来…

scrapy快速上手

安装 除了scrapy本身还要安装两个库 pip install scrapy pip install pywin32 pip install wheel 创建项目 在要创建项目的地方打开powershell scrapy startproject 项目名 我们得到这样的项目结构&#xff0c;功能如下 scrapy.cfg 项目的主配置信息 …

DMA传输原理详解

DMA-(Data Memory Access) 作用 直接存储器存取(DMA)用来提供在外设和存储器之间或者存储器和存储器之间的高速数据传 输。无须CPU干预&#xff0c;数据可以通过DMA快速地移动&#xff0c;这就节省了CPU的资源来做其他操作。两个DMA控制器有12个通道(DMA1有7个通道&#xff0c;…