pytorch通过change_current_allocator获取所有的子Module实际的内存占用情况

1.背景介绍
2.参考链接
3.自己的内存分配器
4.pytorch测试代码

1.背景介绍

目的:需要准确统计pytorch每一层计算所需的设备内存
问题:PyTorch默认的缓存分配器会对每次申请的大小做对齐(例如1x100的float32张量实际只需400字节,却被统计为512字节),因此直接使用torch.cuda.memory_allocated()得到的数值并不准确
方法:
- 设置CUBLAS_WORKSPACE_CONFIG,排除CUBLAS_WORKSPACE的影响
- 使用torch.cuda.memory.change_current_allocator设置自己的内存分配器
- 在自己的内存分配器里记录内存分配情况

2.参考链接

Using custom memory allocators for CUDA
跟踪一个Pytorch Module在训练过程中的内存分配情况
cuBLAS workspaces

3.自己的内存分配器

tee alloc.cc<<-'EOF'
#include <sys/types.h>
#include <cuda_runtime_api.h>
#include <iostream>
#include <assert.h>
#include <unordered_map>
#include <iostream>
#include <mutex>// 内存监视器类
class MemoryMonitor {
public:// 分配内存并记录void* allocate(size_t size) {void* ptr;cudaMalloc(&ptr,size);if (ptr) {std::lock_guard<std::mutex> lock(mtx);allocations[ptr] = size;totalAllocated += size;}return ptr;}// 释放内存并记录void deallocate(void* ptr) {if (ptr) {std::lock_guard<std::mutex> lock(mtx);auto it = allocations.find(ptr);if (it != allocations.end()) {totalAllocated -= it->second;allocations.erase(it);}cudaFree(ptr);}}// 获取当前的总分配大小size_t getTotalAllocated() const {std::lock_guard<std::mutex> lock(mtx);return totalAllocated;}private:std::unordered_map<void*, size_t> allocations; // 存储分配地址和大小的哈希表size_t totalAllocated = 0; // 当前总分配大小mutable std::mutex mtx; // 保护数据结构的互斥锁
};MemoryMonitor monitor;extern "C" {void* my_malloc(ssize_t size, int device, cudaStream_t stream) {return monitor.allocate(size);}void my_free(void* ptr, ssize_t size, int device, cudaStream_t stream) {monitor.deallocate(ptr);}unsigned long long getTotalAllocated(){return monitor.getTotalAllocated();}
}
EOF
g++ alloc.cc -o alloc.so -I/usr/local/cuda/include -shared -fPIC

4.pytorch测试代码


tee torch_mem_stat.py <<-'EOF'
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['CUBLAS_WORKSPACE_CONFIG']=":0:0"
import ctypes
import numpy as np
import torch
from torch.nn import Module, Linear
import torch.nn as nn
from torch.optim import Adam,SGD
from dataclasses import dataclass
from typing import Any
import time
import torchvision.models as models
import sys

# Command-line switch: 1 installs the custom allocator built from alloc.cc,
# any other integer keeps PyTorch's default allocator.
hook_allocator = int(sys.argv[1])
if hook_allocator == 1:
    os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = '1'
    # Separate ctypes handle, used only to call getTotalAllocated().
    lib = ctypes.CDLL('./alloc.so')
    lib.getTotalAllocated.restype = ctypes.c_ulonglong
    print("hook_allocator")
    new_alloc = torch.cuda.memory.CUDAPluggableAllocator('./alloc.so', 'my_malloc', 'my_free')
    torch.cuda.memory.change_current_allocator(new_alloc)


def get_memory_allocated():
    """Return the number of device bytes currently allocated.

    Reads the custom allocator's counter when it was installed
    (``hook_allocator == 1``); otherwise falls back to
    ``torch.cuda.memory_allocated()``. The test matches the setup condition
    above, so ``lib`` is only touched when it has actually been loaded.
    """
    if hook_allocator == 1:
        return lib.getTotalAllocated()
    return torch.cuda.memory_allocated()


# Object and class-name caches
object_cache = {}  # "<class name>_<object id>" -> {"idx": per-class sequence number}
class_name_count = {}  # class name -> number of distinct objects seen so far


def is_tensor(val):
    """Return True when *val* is a torch.Tensor or nn.Parameter."""
    return isinstance(val, (torch.Tensor, nn.Parameter))


def describe_tensor_data(tensor, desc=""):
    """Append a textual description of *tensor* to *desc* and return it.

    Tensors are rendered as ``[shape(a,b,...)_dtype(<dtype>)]``. Tuples and
    lists are walked recursively, each element prefixed with ``idx(<i>)``.
    Anything else is rendered as ``[dtype(<python type>)]``.
    """
    if is_tensor(tensor):
        desc += f"[shape({','.join(map(str,list(tensor.shape)))})_dtype({tensor.dtype})]"
    elif isinstance(tensor, (tuple, list)):
        for idx, t in enumerate(tensor):
            desc = describe_tensor_data(t, f"{desc}idx({idx})")
    else:
        desc += f"[dtype({type(tensor)})]"
    return desc


def get_unique_name(class_name, obj_id):
    """Return ``-<n>`` where *n* is the 1-based order in which this object was
    first seen among objects sharing *class_name*.

    Repeated calls with the same ``(class_name, obj_id)`` return the same
    suffix.
    """
    if class_name not in class_name_count:
        class_name_count[class_name] = 0
    uid = f"{class_name}_{obj_id}"
    if uid not in object_cache:
        class_name_count[class_name] += 1
        object_cache[uid] = {"idx": class_name_count[class_name]}
    return f'-{object_cache[uid]["idx"]}'


def initialize_module_attributes(name, module):
    """Attach the bookkeeping attributes used by the hooks to *module*.

    Attributes that already exist are left untouched, so calling this more
    than once for the same module is harmless.
    """
    if not hasattr(module, 'uuid'):
        module.uuid = name + get_unique_name(module.__class__.__name__, id(module))
    if not hasattr(module, 'backward_mem'):
        module.backward_mem = 0
    if not hasattr(module, 'forward_mem'):
        module.forward_mem = 0
    if not hasattr(module, 'fwd_mem_sz'):
        module.fwd_mem_sz = None
    if not hasattr(module, 'bwd_mem_sz'):
        module.bwd_mem_sz = None


def _is_logging_rank():
    """Return True on rank 0, or whenever torch.distributed is not initialized."""
    rank = 0
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    return rank == 0


def _append_record(record):
    """Append one already-formatted line to the memory report file."""
    with open("torch_module_mem_info.txt", "a+") as f:
        f.write(record)


def pre_backward_hook(module, grad_input):
    """Snapshot the allocated byte count just before *module*'s backward runs.

    The second argument is whatever PyTorch passes to a full backward
    pre-hook; it is not used here.
    """
    module.backward_mem = get_memory_allocated()


def post_backward_hook(module, grad_input, grad_output):
    """Record the change in allocated bytes across *module*'s backward pass.

    Writes ``bwd-<uuid>#<delta>#<total>#<grad_input desc>#<grad_output desc>``.
    """
    memory_allocated = get_memory_allocated()
    module.bwd_mem_sz = memory_allocated - module.backward_mem
    if _is_logging_rank():
        _append_record(f"bwd-{module.uuid}#{module.bwd_mem_sz}#{memory_allocated}#{describe_tensor_data(grad_input)}#{describe_tensor_data(grad_output)}\n")


def pre_forward_hook(module, input):
    """Snapshot the allocated byte count just before *module*'s forward runs."""
    module.forward_mem = get_memory_allocated()


def post_forward_hook(module, input, output):
    """Record the change in allocated bytes across *module*'s forward pass.

    Writes ``fwd-<uuid>#<delta>#<total>#<input desc>#<output desc>``.
    """
    memory_allocated = get_memory_allocated()
    module.fwd_mem_sz = memory_allocated - module.forward_mem
    if _is_logging_rank():
        _append_record(f"fwd-{module.uuid}#{module.fwd_mem_sz}#{memory_allocated}#{describe_tensor_data(input)}#{describe_tensor_data(output)}\n")


def register_forward_hooks(name, module):
    """Install the forward pre/post memory hooks on *module*."""
    initialize_module_attributes(name, module)
    module.register_forward_pre_hook(pre_forward_hook)
    module.register_forward_hook(post_forward_hook)


def register_backward_hooks(name, module):
    """Install the full-backward pre/post memory hooks on *module*."""
    initialize_module_attributes(name, module)
    module.register_full_backward_pre_hook(pre_backward_hook)
    module.register_full_backward_hook(post_backward_hook)


class HookModel(object):
    """Registers the memory hooks on a model and all of its submodules."""

    def __init__(self, model):
        """Collect every module under *model* and hook each one.

        Modules whose generated path ends with ``Sequential`` are skipped.
        """
        output_dict = {}
        self.get_submodule_recrusicve(model, "", output_dict)
        for name, module in output_dict.items():
            if name.endswith("Sequential"):
                continue
            register_forward_hooks(name, module)
            register_backward_hooks(name, module)

    def get_submodule_recrusicve(self, module, prefix, output_dict):
        """Fill *output_dict* with ``path -> module`` for *module* and its descendants.

        A module's path is ``<prefix>/<ClassName>``; each child is visited
        with the prefix ``<parent path>.<child attribute name>``.
        """
        prefix = prefix + "/" + type(module).__name__
        output_dict[prefix] = module
        for name, submodule in module.named_children():
            self.get_submodule_recrusicve(submodule, f"{prefix}.{name}", output_dict)


class FeedForward(Module):
    """Small test network: ``x + fc(norm(x))`` with a three-layer bias-free MLP."""

    def __init__(self, hidden_size, ffn_size):
        super().__init__()
        self.fc = nn.Sequential(
            Linear(in_features=hidden_size, out_features=ffn_size, bias=False),
            nn.ReLU(),
            Linear(in_features=ffn_size, out_features=ffn_size*2, bias=False),
            nn.Dropout(0.5),
            Linear(in_features=ffn_size*2, out_features=hidden_size, bias=False),
        )
        self.norm = nn.LayerNorm(normalized_shape=hidden_size, elementwise_affine=False)

    def forward(self, x):
        return x + self.fc(self.norm(x))


def main():
    """Run one training step of FeedForward on the GPU with the hooks installed."""
    model = FeedForward(100, 128)
    model = model.float().cuda()
    model.train()
    HookModel(model)  # constructed only for its side effect of registering hooks
    opt = Adam(model.parameters(), lr=0.001)
    inputs = torch.randn(1, 100).float().cuda()
    # Truncate the report so each run starts from an empty file.
    with open("torch_module_mem_info.txt", "w") as f:
        f.write("")
    for _ in range(1):
        output = model(inputs)
        loss = -torch.log(output.sum())
        opt.zero_grad()
        loss.backward()
        opt.step()


if __name__ == "__main__":
    main()
EOF
# Run once with the default allocator (0) ...
python torch_mem_stat.py 0
cat torch_module_mem_info.txt
# ... and once with the custom allocator (1).
python torch_mem_stat.py 1
cat torch_module_mem_info.txt

输出

#默认的分配器
fwd-/FeedForward.norm/LayerNorm-1#512#285696#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.0/Linear-1#512#286208#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.1/ReLU-1#512#286720#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.2/Linear-2#1024#287232#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.3/Dropout-1#1536#288768#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.4/Linear-3#512#288256#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward-1#3072#288256#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward-1#0#289792#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.4/Linear-3#102400#392192#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.3/Dropout-1#512#392192#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.2/Linear-2#131584#522752#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.1/ReLU-1#0#521728#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,128)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.0/Linear-1#0#521216#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,128)_dtype(torch.float32)]

#自定义分配器
fwd-/FeedForward.norm/LayerNorm-1#400#285472#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.0/Linear-1#512#285984#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.1/ReLU-1#512#286496#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.2/Linear-2#1024#287008#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.3/Dropout-1#1280#288288#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.4/Linear-3#400#287664#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward-1#2592#287664#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward-1#0#287676#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.4/Linear-3#102400#390076#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.3/Dropout-1#768#390840#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.2/Linear-2#131584#521400#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.1/ReLU-1#0#520376#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,128)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.0/Linear-1#0#519864#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,128)_dtype(torch.float32)]