Analyzing PyTorch and Triton Operators with the Roofline Model
- 1. References
- 2. Test environment
- 3. Install dependencies
- 4. Lock clocks
- 5. Get the theoretical peak FLOPS
- 6. Create the test script
- 7. Run the test program and generate the roofline plot
- 8. Generate a roofline with NVIDIA Nsight Compute
- 9. Result figures
- A. nn.Linear
- B. Triton implementation
This post demonstrates how to use the Roofline model to analyze a PyTorch operator and a Triton operator.
Open question: how is the "Peak Work" value in NVIDIA Nsight Compute derived? It does not match the theoretical peak compute.
1. References
- roofline-overview
- rtx-3060
- OpenAI Triton
2. Test environment
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07 Driver Version: 535.161.07 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 On | 00000000:03:00.0 Off | N/A |
| 0% 48C P5 29W / 170W | 18MiB / 12288MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
torch==2.3.1+cu121
3. Install dependencies
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
export PATH=$PATH:/usr/local/cuda/bin
export CUDA_HOME=/usr/local/cuda
pip install pycuda
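A quick smoke test (my addition, not part of the original setup) to confirm that pycuda can see the GPU:
python -c "import pycuda.driver as cuda; import pycuda.autoinit; print(cuda.Device(0).name(), cuda.Device(0).compute_capability())"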
4. Lock clocks
MAX_Graphics_CLOCK=`nvidia-smi -q -d SUPPORTED_CLOCKS | grep 'Graphics' | sed 's/[^0-9]//g' | sort -n | uniq | tail -n 1`
MAX_Memory_CLOCK=`nvidia-smi -q -d SUPPORTED_CLOCKS | grep 'Memory' | sed 's/[^0-9]//g' | sort -n | uniq | tail -n 1`
nvidia-smi -pm 1
nvidia-smi -lgc $MAX_Graphics_CLOCK,$MAX_Graphics_CLOCK
nvidia-smi -i 0 -ac $MAX_Memory_CLOCK,$MAX_Graphics_CLOCK
nvidia-smi -q -d CLOCK
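To verify from Python that the clocks are actually pinned before measuring, here is a minimal sketch assuming the pynvml package (pip install nvidia-ml-py) is available; it is not part of the original workflow:
tee check_clocks.py <<-'EOF'
# Report the current SM and memory clocks via NVML
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
sm_mhz = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
mem_mhz = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
print(f"SM clock: {sm_mhz} MHz, memory clock: {mem_mhz} MHz")
pynvml.nvmlShutdown()
EOF
python check_clocks.py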
5. Get the theoretical peak FLOPS
tee Theoretical_FLOPS.py <<-'EOF'
import pycuda.driver as cuda
import pycuda.autoinit

def get_gpu_compute_capability_and_clock_rate():
    device = cuda.Device(0)
    compute_capability = device.compute_capability()
    clock_rate = device.get_attribute(cuda.device_attribute.CLOCK_RATE)  # in kHz
    sm_count = device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
    cores_per_sm = get_cuda_cores_per_sm(compute_capability)
    return compute_capability, clock_rate, sm_count, cores_per_sm

def get_cuda_cores_per_sm(compute_capability):
    # FP32 CUDA cores per SM by compute capability
    major, minor = compute_capability
    if major == 2:
        return 32
    elif major == 3:
        return 192
    elif major == 5:
        return 128
    elif major == 6 and minor == 0:
        return 64
    elif major == 6 and minor in [1, 2]:
        return 128
    elif major == 7:
        return 64
    elif major == 8 and minor == 0:
        return 64
    elif major == 8 and minor in [6, 7, 9]:
        return 128
    else:
        raise ValueError("Unknown compute capability")

def calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm):
    clock_rate_hz = clock_rate * 1e3  # convert kHz to Hz
    flops = clock_rate_hz * sm_count * cores_per_sm * 2  # 2 FLOPs per clock per core (FMA)
    return flops

compute_capability, clock_rate, sm_count, cores_per_sm = get_gpu_compute_capability_and_clock_rate()
theoretical_flops = calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm)
print(f"GPU compute capability: {compute_capability}")
print(f"Clock rate (kHz): {clock_rate}")
print(f"Number of SMs: {sm_count}")
print(f"Cores per SM: {cores_per_sm}")
print(f"Theoretical FLOPS for float32: {theoretical_flops / 1e12} TFLOPS")
EOF
python Theoretical_FLOPS.py
Output:
GPU compute capability: (8, 6)
Clock rate (kHz): 1852000
Number of SMs: 28
Cores per SM: 128
Theoretical FLOPS for float32: 13.275136 TFLOPS
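The printed value is just the product of the queried attributes: 1,852,000 kHz × 28 SMs × 128 FP32 cores/SM × 2 FLOPs per core per cycle (one FMA) = 13.275136 TFLOPS. A one-line restatement of the script's formula:
python -c "assert 1852000 * 1e3 * 28 * 128 * 2 == 13.275136e12"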
6. Create the test script
tee roofline_model.py <<-'EOF'
import torch
import torch.nn as nn
import triton
import triton.language as tl
import math
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
import matplotlib.pyplot as plt
import numpy as np
import argparse

# A test model: a single bias-free Linear layer
class SimpleModel(nn.Module):
    def __init__(self, input_features, output_features):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_features, output_features, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        return x

@triton.jit
def sgemm_kernel(A, B, C,
                 M, N, K,
                 stride_am, stride_ak,
                 stride_bk, stride_bn,
                 stride_cm, stride_cn,
                 BLOCK_SIZE: tl.constexpr):
    """Kernel for computing C = A @ B"""
    # Program ids
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Base pointers for A, B and C
    offs_am = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_bn = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_ak = tl.arange(0, BLOCK_SIZE)
    a_ptrs = A + (stride_am * offs_am[:, None] + stride_ak * offs_ak[None, :])
    b_ptrs = B + (stride_bk * offs_ak[:, None] + stride_bn * offs_bn[None, :])
    # Initialize the accumulator
    acc = tl.zeros((BLOCK_SIZE, BLOCK_SIZE), dtype=tl.float32)
    # Loop over the K dimension
    for k in range(0, K, BLOCK_SIZE):
        a = tl.load(a_ptrs, mask=offs_am[:, None] < M)
        b = tl.load(b_ptrs, mask=offs_bn[None, :] < N)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE * stride_ak
        b_ptrs += BLOCK_SIZE * stride_bk
    # Write back the results
    c_ptrs = C + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
    tl.store(c_ptrs, acc, mask=(offs_am[:, None] < M) & (offs_bn[None, :] < N))

class TritonLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super(TritonLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.randn(out_features, in_features).float()).cuda()

    def forward(self, x):
        assert x.shape[1] == self.in_features
        out = torch.empty((x.shape[0], self.out_features), device=x.device, dtype=x.dtype).cuda()
        grid = lambda META: (math.ceil(x.shape[0] / META['BLOCK_SIZE']),
                             math.ceil(self.out_features / META['BLOCK_SIZE']))
        sgemm_kernel[grid](x, self.weight, out,
                           x.shape[0], self.out_features, self.in_features,
                           x.stride(0), x.stride(1),
                           self.weight.stride(0), self.weight.stride(1),
                           out.stride(0), out.stride(1),
                           BLOCK_SIZE=64)
        return out

def main(args):
    # Model and input data
    input_features = 8192
    output_features = 8192
    batch_size = 8192
    model = SimpleModel(input_features, output_features)
    input_data = torch.randn(batch_size, input_features)
    test_count = 10

    # Count FLOPs and activations (fvcore counts one multiply-add as one FLOP)
    flops = FlopCountAnalysis(model, input_data).total() * test_count
    activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
    print("activations:", activations)

    # Parameter count
    params = sum(p.numel() for p in model.parameters())

    # Estimate memory traffic as (activations + params) * 4 bytes,
    # assuming activations and params are float32
    activation_memory_access = activations * 4
    params_memory_access = params * 4
    memory_access = (activation_memory_access + params_memory_access) * test_count

    if args.triton_kernel:
        model = TritonLinear(in_features=input_features, out_features=output_features)
    else:
        model = model.cuda()
    input_data = input_data.float().cuda()

    # Warm-up
    for i in range(5):
        output = model(input_data)
    torch.cuda.synchronize()
    if args.warmup_only:
        return

    if False:
        # Alternative: time with CUDA events
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        for _ in range(test_count):
            output = model(input_data)
        end_event.record()
        torch.cuda.synchronize()
        total_cuda_time = start_event.elapsed_time(end_event) / 1000  # ms -> s
    else:
        # Measure kernel execution time with the PyTorch profiler
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
            for _ in range(test_count):
                output = model(input_data)
        key_averages = prof.key_averages()
        for ev in key_averages:
            print(ev)
        # Only sum kernels whose name contains "sgemm" (matches both the
        # cuBLAS FP32 GEMM kernels and the Triton sgemm_kernel)
        total_cuda_time = sum([event.self_cuda_time_total for event in key_averages
                               if event.key.find("sgemm") >= 0]) / 1e6  # us -> s

    # FLOPs in GFLOPs
    flops_measured_glops = flops / 1e9
    # Measured bandwidth
    memory_access_gb = memory_access / 1e9
    bandwidth_measured = memory_access_gb / total_cuda_time  # GB/s
    print("bandwidth_measured:", bandwidth_measured)

    # Peak compute (from section 5) and peak memory bandwidth of the GPU
    peak_performance = 13.275136 * 1e3  # GFLOPs
    memory_bandwidth = 360.0  # GB/s

    # Roofline curve: attainable FLOPs/s at each operational intensity,
    # capped at the peak compute rate
    Io = np.logspace(-2, 4, 100)  # GFLOPs/GB
    performance = np.minimum(peak_performance, Io * memory_bandwidth)

    # Plot the roofline model
    plt.figure(figsize=(10, 6))
    threshold = 0.75
    # Bandwidth-bound region
    x = Io[Io < (peak_performance / memory_bandwidth)]
    plt.fill_between(x, np.minimum(peak_performance, x * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x * memory_bandwidth),
                     color='lightblue', alpha=0.6, label='Bandwidth Bound')
    # Compute-bound region
    x2 = Io[Io >= (peak_performance / memory_bandwidth)]
    plt.fill_between(x2, np.minimum(peak_performance, x2 * memory_bandwidth) * threshold,
                     np.minimum(peak_performance, x2 * memory_bandwidth),
                     color='green', alpha=0.6, label='Compute Bound')
    # Low-performance region
    plt.fill_between(Io, 0, np.minimum(peak_performance, Io * memory_bandwidth) * threshold,
                     color='gray', alpha=0.6, label='poor performance')
    plt.axhline(y=peak_performance, color='b', linestyle='--',
                label=f'Peak compute: {peak_performance / 1e3:.2f} TFLOPs')
    plt.axvline(x=peak_performance / memory_bandwidth, color='g', linestyle='--',
                label=f'{peak_performance / memory_bandwidth:.2f} GFLOPs/GB')
    plt.loglog(Io, performance, label='Roofline')
    # Operational intensity is a static property of the algorithm
    arithmetic_intensity_measured = flops_measured_glops / memory_access_gb  # GFLOPs/GB
    point_y = arithmetic_intensity_measured * bandwidth_measured
    plt.scatter(arithmetic_intensity_measured, point_y, c='r',
                label=f'Measured Point {point_y / 1e3:.2f} TFLOPs/sec '
                      f'{point_y * 100 / peak_performance:.2f}%')
    plt.xlabel('Operational intensity [GFLOPs/GB]')
    plt.ylabel('Performance [GFLOPs/sec]')
    plt.title('Roofline Model')
    plt.legend()
    # Save the figure without displaying it
    plt.savefig('roofline_model.png')
    plt.close()
    print(f"FLOPs: {flops} FLOPs")
    print(f"Memory access: {memory_access} bytes")
    print(f"Execution time: {total_cuda_time:.4f} s")
    print(f"Percent of peak: {point_y * 100 / peak_performance:.2f}%")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Roofline analysis of PyTorch and Triton operators.')
    parser.add_argument("--warmup_only", action="store_true", help="only run the warm-up iterations")
    parser.add_argument("--triton_kernel", action="store_true", help="benchmark the Triton kernel instead of nn.Linear")
    main(parser.parse_args())
EOF
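Before profiling, it is worth sanity-checking the Triton kernel numerically. Two caveats, from reading the kernel: it indexes the weight buffer as a row-major (K, N) matrix, so in this square case it computes x @ W rather than nn.Linear's x @ W^T (the FLOP and byte counts are identical either way), and tl.dot with float32 inputs uses TF32 on Ampere, so the comparison needs a loose tolerance. A minimal check of my own (it relies on the __main__ guard in the script above so the import has no side effects):
tee check_triton.py <<-'EOF'
# Sanity-check TritonLinear against a cuBLAS reference on a small problem
import torch
from roofline_model import TritonLinear

m = TritonLinear(in_features=256, out_features=256)
x = torch.randn(128, 256, device='cuda')
ref = x @ m.weight  # the kernel reads the weight buffer as (K, N)
out = m(x)
# tl.dot rounds float32 inputs to TF32 on Ampere, hence the loose tolerance
print(torch.allclose(out, ref, atol=1e-1, rtol=1e-2))
EOF
python check_triton.py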
7. Run the test program and generate the roofline plot
python roofline_model.py
python roofline_model.py --triton_kernel
Output (nn.Linear):
FLOPs: 5497558138880 FLOPs
Memory access: 8053063680 bytes
Execution time: 1.3862 s
Percent of peak: 29.87%
Output (--triton_kernel):
FLOPs: 5497558138880 FLOPs
Memory access: 8053063680 bytes
Execution time: 1.0957 s
Percent of peak: 37.80%
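These figures can be reproduced by hand. fvcore counts one multiply-add as a single FLOP, so the reported count is M·N·K·test_count, and the traffic estimate is (input + weight + output) elements × 4 bytes × test_count. A quick check (variable names are mine):
tee check_counts.py <<-'EOF'
# Reproduce the FLOP/byte counts behind the printed output
M = N = K = 8192
test_count = 10
flops = M * N * K * test_count                          # 5497558138880 (1 FLOP per MAC)
bytes_moved = (M * K + K * N + M * N) * 4 * test_count  # 8053063680
print(flops, bytes_moved, flops / bytes_moved)          # intensity ~682.7 GFLOPs/GB
EOF
python check_counts.py
At roughly 683 GFLOPs/GB the measured point sits far to the right of the ridge point (13275.136 / 360 ≈ 36.9 GFLOPs/GB), so this GEMM is firmly compute bound. Note also that fvcore's 1-FLOP-per-MAC convention is half of the FMA=2 convention behind peak_performance, so the true utilization is roughly twice the printed percentage.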
8. Generate a roofline with NVIDIA Nsight Compute
/usr/local/cuda/bin/ncu -f --section SpeedOfLight_HierarchicalSingleRooflineChart \
    --section ComputeWorkloadAnalysis --section MemoryWorkloadAnalysis \
    --target-processes all --export roofline_report \
    python roofline_model.py --warmup_only

/usr/local/cuda/bin/ncu -f --section SpeedOfLight_HierarchicalSingleRooflineChart \
    --section ComputeWorkloadAnalysis --section MemoryWorkloadAnalysis \
    --target-processes all --export roofline_triton_kernel_report \
    python roofline_model.py --warmup_only --triton_kernel
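Both commands write an .ncu-rep file (roofline_report.ncu-rep and roofline_triton_kernel_report.ncu-rep), which can then be opened in the Nsight Compute GUI to inspect the hierarchical roofline chart, e.g.:
ncu-ui roofline_report.ncu-rep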
9. Result figures
A. nn.Linear
B. Triton implementation