Tesla T4 P2P Test

  • I. Test Environment
  • II. Test Procedure
    • 1. Get device information
    • 2. Inspect the PCIe topology
    • 3. Select the two cards at 9B and 9E
    • 4. Query the logical device IDs
    • 5. Set the environment variable (logical device IDs are required; correlate them with the physical IDs from nvidia-smi via the UUID)
    • 6. Atomic operations on different addresses
    • 7. P2P vs. non-P2P performance difference
    • 8. GPU bandwidth test

I. Test Environment

The test machine is a single 8-GPU server: one NVIDIA A100 80GB PCIe, two A30s, one GeForce RTX 3080 Ti, and four Tesla T4s (see the nvidia-smi -L output below). The P2P test uses the two T4s at PCIe bus IDs 9B and 9E.

II. Test Procedure

1. Get Device Information

nvidia-smi -L
GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-c91a8013-0877-df61-53b9-016fabcd5f82)
GPU 1: NVIDIA A30 (UUID: GPU-f67790b5-bb58-614d-4190-3598a99f925e)
GPU 2: Tesla T4 (UUID: GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63)
GPU 3: Tesla T4 (UUID: GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e)
GPU 4: Tesla T4 (UUID: GPU-d59282d2-060d-270e-1c0e-f50e936ffede)
GPU 5: NVIDIA GeForce RTX 3080 Ti (UUID: GPU-9a131b18-28de-d6a1-01e9-76a133e21680)
GPU 6: NVIDIA A30 (UUID: GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02)
GPU 7: Tesla T4 (UUID: GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1)

nvidia-smi -q | grep "Bus Id"
Bus Id                            : 00000000:34:00.0
Bus Id                            : 00000000:35:00.0
Bus Id                            : 00000000:36:00.0  # Tesla T4
Bus Id                            : 00000000:37:00.0  # Tesla T4
Bus Id                            : 00000000:9B:00.0  # Tesla T4, UUID prefix d59282d2
Bus Id                            : 00000000:9C:00.0
Bus Id                            : 00000000:9D:00.0
Bus Id                            : 00000000:9E:00.0  # Tesla T4, UUID prefix 1f45f1e1
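
The Tesla T4 / UUID annotations above were added by hand. nvidia-smi can print the index-to-UUID-to-bus-ID mapping directly, which makes the correlation needed in step 5 easier to read:

nvidia-smi --query-gpu=index,name,uuid,pci.bus_id --format=csv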

2. Inspect the PCIe Topology

lstopo --ignore Core --ignore Misc --ignore PU

(figure: lstopo output showing the PCIe topology of the machine)
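
If lstopo is not available, nvidia-smi has a built-in topology view that prints the connection type (PIX, PXB, PHB, NODE, SYS) between every GPU pair:

nvidia-smi topo -m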

3. Select the Two Cards at 9B and 9E

4. Query the Logical Device IDs

tee devinfo.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
do { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString(error)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

int main() {
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    // Print each device's logical (runtime) index together with its UUID,
    // so the logical IDs can be matched against the UUIDs from nvidia-smi -L.
    for (int deviceid = 0; deviceid < device_count; deviceid++) {
        CUDA_CHECK(cudaSetDevice(deviceid));
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        printf("Index,%d UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x\n",
               deviceid,
               (unsigned char)deviceProp.uuid.bytes[0],  (unsigned char)deviceProp.uuid.bytes[1],
               (unsigned char)deviceProp.uuid.bytes[2],  (unsigned char)deviceProp.uuid.bytes[3],
               (unsigned char)deviceProp.uuid.bytes[4],  (unsigned char)deviceProp.uuid.bytes[5],
               (unsigned char)deviceProp.uuid.bytes[6],  (unsigned char)deviceProp.uuid.bytes[7],
               (unsigned char)deviceProp.uuid.bytes[8],  (unsigned char)deviceProp.uuid.bytes[9],
               (unsigned char)deviceProp.uuid.bytes[10], (unsigned char)deviceProp.uuid.bytes[11],
               (unsigned char)deviceProp.uuid.bytes[12], (unsigned char)deviceProp.uuid.bytes[13],
               (unsigned char)deviceProp.uuid.bytes[14], (unsigned char)deviceProp.uuid.bytes[15]);
    }
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o devinfo devinfo.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
unset CUDA_VISIBLE_DEVICES && ./devinfo
  • Output
Index,0 UUID:GPU-c91a8013-0877-df61-53b9-016fabcd5f82
Index,1 UUID:GPU-9a131b18-28de-d6a1-01e9-76a133e21680
Index,2 UUID:GPU-f67790b5-bb58-614d-4190-3598a99f925e
Index,3 UUID:GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02
Index,4 UUID:GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63
Index,5 UUID:GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e
Index,6 UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede  # selected
Index,7 UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1  # selected

5. Set the Environment Variable (use the logical device IDs; correlate them with the physical IDs from nvidia-smi via the UUID)

export CUDA_VISIBLE_DEVICES=6,7
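
CUDA_VISIBLE_DEVICES also accepts GPU UUIDs (full strings, or unambiguous prefixes), which sidesteps the logical-ID lookup entirely; assuming prefix matching works on this driver, the equivalent selection would be:

export CUDA_VISIBLE_DEVICES=GPU-d59282d2,GPU-1f45f1e1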

6. Atomic Operations on Different Addresses

tee p2p.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
do { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString(error)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Each thread hammers one float with 102400 atomic adds; the <mode>
// parameter only forces distinct kernel instantiations in the profiler output.
template<int mode>
__global__ void dummyKernel(float *data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = 0; i < 102400; i++) {
        atomicAdd(&data[idx], idx * i);
    }
}

// Time a callback both end-to-end (host clock) and on-device (CUDA events).
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev)
{
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream);
    f(stream);
    cudaEventRecord(stop_ev, stream);
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev);
    printf("E2E:%8.4fms Kernel:%8.4fms\n", diff.count() * 1000, milliseconds);
}

int main() {
    int devID0 = 0, devID1 = 1;
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    for (int deviceid = 0; deviceid < 2; deviceid++) {
        CUDA_CHECK(cudaSetDevice(deviceid));
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        std::cout << "-----------------------------------" << std::endl;
        std::cout << "Device Index: " << deviceid << std::endl;
        std::cout << "Compute Capability:" << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Device name: " << deviceProp.name << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
        std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
        std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
        std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
        std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
        std::cout << "canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl;
        std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
        printf("UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x\n",
               (unsigned char)deviceProp.uuid.bytes[0],  (unsigned char)deviceProp.uuid.bytes[1],
               (unsigned char)deviceProp.uuid.bytes[2],  (unsigned char)deviceProp.uuid.bytes[3],
               (unsigned char)deviceProp.uuid.bytes[4],  (unsigned char)deviceProp.uuid.bytes[5],
               (unsigned char)deviceProp.uuid.bytes[6],  (unsigned char)deviceProp.uuid.bytes[7],
               (unsigned char)deviceProp.uuid.bytes[8],  (unsigned char)deviceProp.uuid.bytes[9],
               (unsigned char)deviceProp.uuid.bytes[10], (unsigned char)deviceProp.uuid.bytes[11],
               (unsigned char)deviceProp.uuid.bytes[12], (unsigned char)deviceProp.uuid.bytes[13],
               (unsigned char)deviceProp.uuid.bytes[14], (unsigned char)deviceProp.uuid.bytes[15]);
    }
    std::cout << "-----------------------------------" << std::endl;

    int p2p_value = 0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value, cudaDevP2PAttrAccessSupported, devID0, devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;

#define block_size (32*4)
    size_t dataSize = block_size * sizeof(float);
    float *data0_dev, *data1_dev;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMemset(data1_dev, 0, dataSize));

    // Enable P2P access
    int canAccessPeer = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaEvent_t start_ev, stop_ev;
        cudaEventCreate(&start_ev);
        cudaEventCreate(&stop_ev);
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0)); // let devID1 access devID0's device memory
        // Kernel runs on devID1 but its atomics target devID0's memory (peer aperture).
        TIMEIT([&data0_dev](cudaStream_t &stream) -> void {
            dummyKernel<1><<<1, block_size, 0, stream>>>(data0_dev);
        }, stream, start_ev, stop_ev);
        // Same kernel, but the atomics target devID1's own memory (device aperture).
        TIMEIT([&data1_dev](cudaStream_t &stream) -> void {
            dummyKernel<3><<<1, block_size, 0, stream>>>(data1_dev);
        }, stream, start_ev, stop_ev);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
    } else {
        printf("%s %d canAccessPeer=0\n", __FILE__, __LINE__);
    }
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./p2p

/usr/local/cuda/bin/ncu --metrics \
lts__t_requests_aperture_device.sum,\
lts__t_sectors_aperture_device.sum,\
lts__t_requests_aperture_peer.sum,\
lts__t_requests_srcnode_gpc_aperture_peer.sum,\
lts__t_requests_srcunit_l1_aperture_peer.sum,\
lts__t_requests_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_aperture_peer.sum,\
lts__t_sectors_srcnode_gpc_aperture_peer.sum,\
lts__t_sectors_srcunit_l1_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
dram__bytes_read.sum,\
dram__bytes_write.sum,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
  • Output
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
E2E:132.1300ms Kernel:132.1245ms  # GPU1 doing atomics on GPU0's device memory via P2P
E2E:  3.5552ms Kernel:  3.5444ms  # GPU1 doing atomics on its own local DRAM

void dummyKernel<(int)1>(float *), 2024-Sep-25 17:06:47, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum                                                             Kbyte                          52.86
dram__bytes_read.sum.per_second                                           Kbyte/second                         405.11
dram__bytes_write.sum                                                            Kbyte                           1.25
dram__bytes_write.sum.per_second                                          Kbyte/second                           9.56
lts__t_requests_aperture_device.sum                                            request                          17775
lts__t_requests_aperture_peer.sum                                              request                         409600 # 4 warps x 102400 coalesced accesses each; the address range is only 4*32*4 bytes
lts__t_requests_srcnode_gpc_aperture_peer.sum                                  request                         409600
lts__t_requests_srcunit_l1_aperture_peer.sum                                   request                              0
lts__t_requests_srcunit_tex_aperture_peer.sum                                  request                         409600
lts__t_sectors_aperture_device.sum                                              sector                          93043
lts__t_sectors_aperture_peer.sum                                                sector                        1638400 
lts__t_sectors_srcnode_gpc_aperture_peer.sum                                    sector                        1638400 # a coalesced request covers 4 sectors; 1638400*32 B = 50 MB
lts__t_sectors_srcunit_l1_aperture_peer.sum                                     sector                              0
lts__t_sectors_srcunit_tex_aperture_peer.sum                                    sector                        1638400
pcie__read_bytes.sum                                                             Mbyte                          59.10
pcie__read_bytes.sum.per_second                                           Mbyte/second                         452.93
pcie__write_bytes.sum                                                            Mbyte                          72.18 # actual PCIe reads + writes together exceed the 50 MB of payload
pcie__write_bytes.sum.per_second                                          Mbyte/second                         553.17
---------------------------------------------------------------------- --------------- ------------------------------

void dummyKernel<(int)3>(float *), 2024-Sep-25 17:06:48, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum                                                             Kbyte                           6.21
dram__bytes_read.sum.per_second                                           Mbyte/second                           1.77
dram__bytes_write.sum                                                             byte                            224
dram__bytes_write.sum.per_second                                          Kbyte/second                          63.93
lts__t_requests_aperture_device.sum                                            request                         414530
lts__t_requests_aperture_peer.sum                                              request                              0
lts__t_requests_srcnode_gpc_aperture_peer.sum                                  request                              0
lts__t_requests_srcunit_l1_aperture_peer.sum                                   request                              0
lts__t_requests_srcunit_tex_aperture_peer.sum                                  request                              0
lts__t_sectors_aperture_device.sum                                              sector                        1643605
lts__t_sectors_aperture_peer.sum                                                sector                              0
lts__t_sectors_srcnode_gpc_aperture_peer.sum                                    sector                              0
lts__t_sectors_srcunit_l1_aperture_peer.sum                                     sector                              0
lts__t_sectors_srcunit_tex_aperture_peer.sum                                    sector                              0
pcie__read_bytes.sum                                                             Kbyte                           3.58
pcie__read_bytes.sum.per_second                                           Mbyte/second                           1.02
pcie__write_bytes.sum                                                            Kbyte                           3.07
pcie__write_bytes.sum.per_second                                          Kbyte/second                         876.74
---------------------------------------------------------------------- --------------- ------------------------------
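
A sanity check on these counters: in NCU, the aperture_peer metrics count L2 (LTS) traffic that targets another GPU's memory through the peer aperture, while aperture_device counts traffic to local device memory. block_size is 32*4 = 128 threads (4 warps) and each thread loops 102400 times, so the peer run issues 4*102400 = 409600 coalesced atomic requests; at 4 sectors per request that is 1638400 sectors * 32 B = 50 MB of payload, and the 59 + 72 MB of PCIe reads and writes exceed that, plausibly because each atomic is a read-modify-write round trip plus packet overhead.

Whether peer atomics are natively supported by the link can also be queried up front. A minimal sketch using the standard runtime attribute (the file name p2p_attr.cu is arbitrary; device IDs 1 and 0 match the direction used in the test above):

tee p2p_attr.cu<<-'EOF'
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int native_atomics = 0;
    // Reports whether atomics performed by device 1 on device 0's memory
    // are handled natively by the link between the two devices.
    cudaDeviceGetP2PAttribute(&native_atomics,
                              cudaDevP2PAttrNativeAtomicSupported, 1, 0);
    printf("cudaDevP2PAttrNativeAtomicSupported: %d\n", native_atomics);
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p_attr p2p_attr.cu
./p2p_attr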

7. Performance Difference Between P2P and Non-P2P

tee p2p.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
do { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString(error)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Straight copy kernel: one float per thread; <mode> only distinguishes
// the instantiations in the profiler output.
template<int mode>
__global__ void dummyKernel(float *input_data, float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx] = input_data[idx];
}

// Time a callback both end-to-end (host clock) and on-device (CUDA events).
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev)
{
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream);
    f(stream);
    cudaEventRecord(stop_ev, stream);
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev);
    printf("E2E:%7.2fms Kernel:%7.2fms\n", diff.count() * 1000, milliseconds);
}

int main() {
    int devID0 = 0, devID1 = 1;
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    for (int deviceid = 0; deviceid < 2; deviceid++) {
        CUDA_CHECK(cudaSetDevice(deviceid));
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        std::cout << "-----------------------------------" << std::endl;
        std::cout << "Device Index: " << deviceid << std::endl;
        std::cout << "Compute Capability:" << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Device name: " << deviceProp.name << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
        std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
        std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
        std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
        std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
        std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
    }
    std::cout << "-----------------------------------" << std::endl;

    int p2p_value = 0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value, cudaDevP2PAttrAccessSupported, devID0, devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;

#define block_size 1024
#define block_count 1000000L
    size_t dataSize = block_count * block_size * sizeof(float);
    float *data0_dev, *data1_dev, *data1_dev_ex;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMalloc(&data1_dev_ex, dataSize));
    char *host;
    CUDA_CHECK(cudaMallocHost(&host, dataSize));
    printf("Init Done(%.2f)GB..\n", dataSize / 1024.0 / 1024.0 / 1024.0);

    // Enable P2P access
    int canAccessPeer = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaEvent_t start_ev, stop_ev;
        cudaEventCreate(&start_ev);
        cudaEventCreate(&stop_ev);
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0)); // let devID1 access devID0's device memory
        // Baseline: copy between devID1 and pinned host memory over PCIe.
        TIMEIT([&](cudaStream_t &stream) -> void {
            cudaMemcpyAsync(host, data1_dev, dataSize, cudaMemcpyHostToDevice, stream);
        }, stream, start_ev, stop_ev);
        // P2P: kernel on devID1 reads devID0's memory and writes its own.
        TIMEIT([&](cudaStream_t &stream) -> void {
            dummyKernel<1><<<block_count, block_size, 0, stream>>>(data0_dev, data1_dev);
        }, stream, start_ev, stop_ev);
        // Local: both buffers live in devID1's own DRAM.
        TIMEIT([&](cudaStream_t &stream) -> void {
            dummyKernel<2><<<block_count, block_size, 0, stream>>>(data1_dev_ex, data1_dev);
        }, stream, start_ev, stop_ev);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
    } else {
        printf("%s %d canAccessPeer=0\n", __FILE__, __LINE__);
    }
    CUDA_CHECK(cudaFreeHost(host));
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    CUDA_CHECK(cudaFree(data1_dev_ex));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./p2p

/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
dram__bytes_read.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_write.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
  • Output
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
Init Done(3.81)GB..
E2E: 325.70ms Kernel: 325.71ms  # GPU1 cudaMemcpyHostToDevice: 11.697 GB/s
E2E: 307.29ms Kernel: 307.31ms  # GPU1 reading GPU0's device memory via P2P (faster than H2D): 12.39 GB/s
E2E:  37.90ms Kernel:  37.89ms  # D2D copy within GPU1's own DRAM: 2*100.55 GB/s

void dummyKernel<1>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name                                         Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed            %         0.01
dram__bytes_read.sum.per_second                         Mbyte/s        37.35
dram__bytes_write.sum.pct_of_peak_sustained_elapsed           %         4.72
dram__bytes_write.sum.per_second                        Gbyte/s        15.10
pcie__read_bytes.sum.per_second                         Gbyte/s        15.01
pcie__write_bytes.sum.per_second                        Gbyte/s         3.34
--------------------------------------------------- ----------- ------------

void dummyKernel<2>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name                                         Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed            %        37.34  # reading and writing simultaneously, the utilizations add up to ~75%
dram__bytes_read.sum.per_second                         Gbyte/s       119.41
dram__bytes_write.sum.pct_of_peak_sustained_elapsed           %        37.81
dram__bytes_write.sum.per_second                        Gbyte/s       120.89  # ~239 GB/s combined, consistent with the bandwidth test below
pcie__read_bytes.sum.per_second                         Mbyte/s        22.03
pcie__write_bytes.sum.per_second                        Mbyte/s         7.42
--------------------------------------------------- ----------- ------------
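
As a cross-check on the comments above: the buffer is 1000000 * 1024 * 4 B ≈ 3.81 GiB (the "Init Done(3.81)GB" line). 325.70 ms for the host copy works out to ≈ 11.7 GiB/s and 307.29 ms for the P2P read to ≈ 12.4 GiB/s, both close to the practical throughput of a PCIe Gen3 x16 link, so on this topology a P2P read is only marginally faster than staging through pinned host memory. The local copy kernel reads and writes 3.81 GiB each in 37.90 ms (≈ 2 * 100.5 GiB/s), and NCU's DRAM counters for the same kernel show roughly 119 + 121 ≈ 240 GB/s of combined read+write traffic, consistent with the device-to-device figure from bandwidthTest in the next section.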

8. GPU Bandwidth Test

git clone https://www.github.com/nvidia/cuda-samples
cd cuda-samples/Samples/1_Utilities/deviceQuery
make clean && make
./deviceQuery
cd ../bandwidthTest/
make clean && make
./bandwidthTest --device=0
  • Output
Running on...

 Device 0: Tesla T4
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     12.8

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     13.1

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(GB/s)
   32000000                     239.4
Result = PASS
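
bandwidthTest only exercises a single device. For inter-GPU numbers specifically, the same repository also ships a p2pBandwidthLatencyTest sample that prints bandwidth and latency matrices with P2P disabled and enabled (the directory below assumes the current cuda-samples layout; older releases place it elsewhere):

cd ../../5_Domain_Specific/p2pBandwidthLatencyTest
make clean && make
./p2pBandwidthLatencyTest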
