def broadcast_data(keys, data, datatype):
    """Broadcast data from rank zero of each model parallel group to the
    members of the same model parallel group.

    Arguments:
        keys: list of keys in the data dictionary to be broadcasted.
        data: data dictionary of string keys and cpu tensor values.
        datatype: torch data type of all tensors in data associated
                  with keys.

    Returns:
        dict mapping each key to a GPU tensor view with its original shape.
    """
    # Build (key, size) and (key, number of elements) dictionaries along
    # with the total number of elements on all ranks.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
                                                                          data)

    # Pack on rank zero only; other ranks allocate an empty receive buffer.
    if get_tensor_model_parallel_rank() == 0:
        # Check that all keys have the same data type.
        _check_data_types(keys, data, datatype)
        # Flatten the data associated with the keys into one 1-D tensor.
        flatten_data = torch.cat(
            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
    else:
        flatten_data = torch.empty(total_numel,
                                   device=torch.cuda.current_device(),
                                   dtype=datatype)

    # Broadcast from the first rank of the tensor-model-parallel group.
    torch.distributed.broadcast(flatten_data,
                                get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Unpack: each output tensor is a view into the flat buffer.
    output = {}
    offset = 0
    for key in keys:
        size = key_size[key]
        numel = key_numel[key]
        output[key] = flatten_data.narrow(0, offset, numel).view(size)
        offset += numel

    return output
def get_tensor_model_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the tensor model parallel group.

    Ranks are laid out so that each tensor-parallel group occupies a
    contiguous range of global ranks; flooring to a multiple of the
    group's world size yields the group's first (source) rank.
    """
    global_rank = torch.distributed.get_rank()
    local_world_size = get_tensor_model_parallel_world_size()
    return (global_rank // local_world_size) * local_world_size
def pretrain(train_valid_test_dataset_provider,
             model_provider,
             model_type,
             forward_step_func,
             extra_args_provider=None,
             args_defaults={}):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule using the model_provider.
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model using the forward_step_func.
    """
    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(extra_args_provider=extra_args_provider,
                        args_defaults=args_defaults)

    # Adjust the startup time so it reflects the largest value.
    # This will be closer to what the scheduler will see (outside of
    # image ... launches).
    global _TRAIN_START_TIME
    start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME])
    torch.distributed.all_reduce(start_time_tensor,
                                 op=torch.distributed.ReduceOp.MIN)
    _TRAIN_START_TIME = start_time_tensor.item()

    args = get_args()
    timers = get_timers()

    # Model, optimizer, and learning rate (set up via model_provider).
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider,
                                                               model_type)

    # Data: build train/valid/test data iterators. With virtual pipeline
    # parallelism one set of iterators is built per model chunk.
    if args.virtual_pipeline_model_parallel_size is not None:
        all_data_iterators = [
            build_train_valid_test_data_iterators(
                train_valid_test_dataset_provider)
            for _ in range(len(model))
        ]
        train_data_iterator = [iters[0] for iters in all_data_iterators]
        valid_data_iterator = [iters[1] for iters in all_data_iterators]
        test_data_iterator = [iters[2] for iters in all_data_iterators]
    else:
        train_data_iterator, valid_data_iterator, test_data_iterator \
            = build_train_valid_test_data_iterators(
                train_valid_test_dataset_provider)

    iteration = 0
    if args.do_train and args.train_iters > 0:
        # Train the model.
        iteration = train(forward_step_func,
                          model, optimizer, lr_scheduler,
                          train_data_iterator, valid_data_iterator)

    if args.do_valid:
        prefix = 'the end of training for val data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   valid_data_iterator, model,
                                   iteration, False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler)

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   test_data_iterator, model,
                                   0, True)
对于我们分析来说,initialize_megatron 是重点,这里初始化了 megatron。
0x03 初始化
3.1 initialize_megatron
initialize_megatron 方法会设置全局变量,初始化分布式环境等等。
def initialize_megatron(extra_args_provider=None, args_defaults=None,
                        ignore_unknown_args=False, allow_no_cuda=False):
    """Set global variables, initialize distributed, and
    set autoresume and random seeds.

    `allow_no_cuda` should not be set unless using megatron for cpu only
    data processing. In general this arg should not be set unless you know
    what you are doing.

    Returns a function to finalize distributed env initialization
    (optionally, only when args.lazy_mpu_init == True).
    """
    # Use a None sentinel instead of a shared mutable {} default.
    if args_defaults is None:
        args_defaults = {}

    if not allow_no_cuda:
        # Make sure cuda is available.
        assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults,
                         ignore_unknown_args=ignore_unknown_args)

    # torch.distributed initialization, deferred into a closure so it can
    # optionally be driven by an external DDP manager (lazy_mpu_init).
    def finish_mpu_init():
        args = get_args()
        # Pytorch distributed.
        _initialize_distributed()
        # Random seeds for reproducibility.
        if args.rank == 0:
            print('> setting random seeds to {} ...'.format(args.seed))
        _set_random_seed(args.seed)

    # Set pytorch JIT layer fusion options.
    _set_jit_fusion_options()

    args = get_args()
    if args.lazy_mpu_init:
        args.use_cpu_initialization = True
        # Delayed initialization of DDP-related stuff.
        # We only set basic DDP globals,
        set_tensor_model_parallel_world_size(args.tensor_model_parallel_size)
        # and return a function for the external DDP manager
        # to call when it has DDP initialized.
        set_tensor_model_parallel_rank(args.rank)
        return finish_mpu_init
    else:
        # Megatron's MPU is the master. Complete initialization right away.
        finish_mpu_init()
        # Autoresume.
        _init_autoresume()
        # Compile dependencies.
        _compile_dependencies()
        # No continuation function.
        return None
创建完worker进程之后,程序需要知道哪些进程在训练同一个模型,torch.distributed.init_process_group 就实现了这个功能。torch.distributed.init_process_group 会生成一个进程组,同组内进程训练同一个模型,也能确定用什么方式进行通信。进程组会给组内每个进程一个序号,就是 global rank;如果是多机并行,每个机器创建的进程之间也有一个序号,就是 local rank。如果是单机多卡并行,local rank 和 global rank 是一致的。
def _initialize_distributed():
    """Initialize torch.distributed and mpu (model-parallel process groups)."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        # Already initialized by an external launcher: adopt its topology.
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)
        # Call the init process.
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size, rank=args.rank,
            timeout=timedelta(minutes=10))

    # Set the tensor model-parallel, pipeline model-parallel, and
    # data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            # Initialize model parallelism, i.e. build the process groups.
            mpu.initialize_model_parallel(
                args.tensor_model_parallel_size,
                args.pipeline_model_parallel_size,
                args.virtual_pipeline_model_parallel_size,
                args.pipeline_model_parallel_split_rank)
3.3 初始化进程组全局变量
因为调用了 mpu.initialize_model_parallel 来设置模型并行,数据并行等各种进程组,所以我们假定目前进程组都已经设置成功,所以每个 rank 对应的进程都有自己的全局变量。假定目前有16个GPU,属于两个node,rank 0 ~7 属于第一个节点,rank 8 ~ 15 属于第二个节点。下面的 gi 指的是第 i 个 GPU。
_TENSOR_MODEL_PARALLEL_GROUP :当前 rank 所属于的Intra-layer model parallel group,就是tensor 并行进程组。
# Process-group globals, populated by initialize_model_parallel().
# Each holds the torch.distributed group the CURRENT rank belongs to.

# Intra-layer model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Inter-layer model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# Model parallel group (both intra- and pipeline) that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Embedding group.
_EMBEDDING_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None
defget_language_model(num_tokentypes, add_pooler,encoder_attn_mask_type, init_method=None,scaled_init_method=None, add_encoder=True,add_decoder=False,decoder_attn_mask_type=AttnMaskType.causal,pre_process=True, post_process=True):"""Build language model and return along with the key to save."""args = get_args()
# Method of ParallelTransformer (class header not shown in this excerpt).
def __init__(self, init_method, output_layer_init_method,
             layer_type=LayerType.encoder,
             self_attn_mask_type=AttnMaskType.padding,
             pre_process=True, post_process=True):
    """Build the stack of transformer layers owned by this pipeline stage."""
    super(ParallelTransformer, self).__init__()
    args = get_args()

    self.bf16 = args.bf16
    self.fp32_residual_connection = args.fp32_residual_connection
    self.pre_process = pre_process
    self.post_process = post_process
    self.input_tensor = None

    # Store activation checkpointing flags.
    self.activations_checkpoint_method = args.activations_checkpoint_method
    self.activations_checkpoint_num_layers = \
        args.activations_checkpoint_num_layers
    self.distribute_checkpointed_activations = \
        args.distribute_checkpointed_activations

    # Number of layers hosted by this stage.
    self.num_layers = mpu.get_num_layers(
        args, args.model_type == ModelType.encoder_and_decoder)

    # Transformer layers.
    def build_layer(layer_number):
        # Construct a single transformer layer.
        return ParallelTransformerLayer(
            init_method,
            output_layer_init_method,
            layer_number,
            layer_type=layer_type,
            self_attn_mask_type=self_attn_mask_type)

    if args.virtual_pipeline_model_parallel_size is not None:
        # Number of layers in each model chunk is the number of layers in
        # the stage, divided by the number of model chunks in a stage.
        self.num_layers = (self.num_layers //
                           args.virtual_pipeline_model_parallel_size)
        # With 8 layers, 2 stages, and 4 model chunks, we want an assignment
        # of layers to stages like (each list is a model chunk):
        # Stage 0: [0] [2] [4] [6]
        # Stage 1: [1] [3] [5] [7]
        # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment
        # of layers to stages like (each list is a model chunk):
        # Stage 0: [0, 1] [4, 5]
        # Stage 1: [2, 3] [6, 7]
        offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
            args.num_layers // args.virtual_pipeline_model_parallel_size) + \
            (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
    else:
        # Each stage gets a contiguous set of layers.
        offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers

    # Instantiate num_layers transformer layers (1-based layer numbers).
    self.layers = torch.nn.ModuleList(
        [build_layer(i + 1 + offset) for i in range(self.num_layers)])

    if self.post_process:
        # Final layer norm before output.
        self.final_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon,
            no_persist_layer_norm=args.no_persist_layer_norm)
def get_pipeline_model_parallel_world_size():
    """Return world size for the pipeline model parallel group.

    A cached/overridden value in the module global takes precedence;
    otherwise query torch.distributed for the group's size.
    """
    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
        return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    return torch.distributed.get_world_size(
        group=get_pipeline_model_parallel_group())
from megatron.model import DistributedDataParallel as LocalDDP
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
if wrap_with_ddp:
    if args.DDP_impl == 'torch':
        # Wrap each model chunk with PyTorch's native DDP over the
        # data-parallel process group, pinned to the current device.
        i = torch.cuda.current_device()
        model = [torchDDP(model_module, device_ids=[i], output_device=i,
                          process_group=mpu.get_data_parallel_group())
                 for model_module in model]
class DistributedDataParallel(DistributedDataParallelBase):
    """DDP with contiguous buffers options to store and accumulate gradients.

    This class:
        - has the potential to reduce memory fragmentation.
        - provides the option to do the gradient accumulation
          in a type other than the params type (for example fp32).

    Arguments:
        module: input model.
        accumulate_allreduce_grads_in_fp32: if true do the gradient
            accumulation and the gradient all-reduce all in float32. If this
            option is true, we require `use_contiguous_buffers` to be true too.
        use_contiguous_buffers: if true, use a contiguous buffer to store the
            gradients.
    """

    def __init__(self, module,
                 accumulate_allreduce_grads_in_fp32,
                 use_contiguous_buffers):
        # NOTE(review): the excerpt omitted the __init__ signature line;
        # it is reconstructed from the argument list documented above.
        super(DistributedDataParallel, self).__init__(module)

        self.accumulate_allreduce_grads_in_fp32 \
            = accumulate_allreduce_grads_in_fp32
        self.use_contiguous_buffers = use_contiguous_buffers

        # If we are using fp32-accumulate-allreduce explicitly
        # this means we need main grads in a contiguous buffer.
        if self.accumulate_allreduce_grads_in_fp32:
            assert self.use_contiguous_buffers

        # ===================================
        # Rest of this part applies only to
        # the case we use contiguous buffers.
        # ===================================
        self._grad_buffers = None
        if self.use_contiguous_buffers:
            self._grad_buffers = {}

            # Simple function to define buffer type: accumulate in fp32 if
            # requested, otherwise in the parameter's own dtype.
            def _get_buffer_type(param):
                return torch.float if \
                    self.accumulate_allreduce_grads_in_fp32 else param.dtype

            # First calculate total number of elements per type.
            type_num_elements = {}
            for param in self.module.parameters():
                if param.requires_grad:
                    dtype = _get_buffer_type(param)
                    type_num_elements[dtype] = \
                        type_num_elements.get(dtype, 0) + param.data.nelement()

            # Allocate one contiguous buffer per dtype.
            for dtype, num_elements in type_num_elements.items():
                self._grad_buffers[dtype] = MemoryBuffer(num_elements, dtype)

            # Assume the back prop order is reverse the params order,
            # store the start index for the gradients: counting down from
            # the total gives each param its slice of the shared buffer.
            for param in self.module.parameters():
                if param.requires_grad:
                    dtype = _get_buffer_type(param)
                    type_num_elements[dtype] -= param.data.nelement()
                    param.main_grad = self._grad_buffers[dtype].get(
                        param.data.shape, type_num_elements[dtype])

            # Backward hook.
            # Accumulation functions for the gradients. We need
            # to store them so they don't go out of scope.
            self.grad_accs = []
            # Loop over all the parameters in the model.
            for param in self.module.parameters():
                if param.requires_grad:
                    # Expand so we get access to grad_fn.
                    param_tmp = param.expand_as(param)
                    # Get the gradient accumulator function.
                    grad_acc = param_tmp.grad_fn.next_functions[0][0]
                    grad_acc.register_hook(self._make_param_hook(param))
                    # Keep the accumulators alive (book-keeping only).
                    self.grad_accs.append(grad_acc)
5.2.3 内存
MemoryBuffer 是内存抽象。
class MemoryBuffer:
    """A pre-allocated contiguous 1-D chunk of device memory; individual
    tensors (e.g. per-parameter gradients) are handed out as views into it."""

    def __init__(self, numel, dtype):
        self.numel = numel
        self.dtype = dtype
        # Allocate zero-initialized backing storage on the current device.
        self.data = torch.zeros(self.numel,
                                dtype=self.dtype,
                                device=torch.cuda.current_device(),
                                requires_grad=False)

    def zero(self):
        """Reset the buffer to zero."""
        self.data.zero_()

    def get(self, shape, start_index):
        """Return a tensor with the input `shape` as a view into the
        1-D data starting at `start_index`.

        Raises AssertionError if the requested slice would run past the
        end of the buffer.
        """
        end_index = start_index + shape.numel()
        assert end_index <= self.numel, \
            'requested tensor is out of the buffer range.'
        buffer_tensor = self.data[start_index:end_index]
        buffer_tensor = buffer_tensor.view(shape)
        return buffer_tensor
5.2.4 支撑函数
下面是两个支撑函数,分别是用于拷贝梯度和将buffer清零。
def _make_param_hook(self, param):
    """Create the backprop hook that accumulates `param`'s gradient into
    its slot in the contiguous buffer (`param.main_grad`)."""
    # Hook used for back-prop.
    def param_hook(*unused):
        # Add the gradient to the contiguous buffer.
        # NOTE(review): `param.grad.data` is never None for a tensor grad;
        # kept as in the original — the guard is effectively always true.
        if param.grad.data is not None:
            param.main_grad.add_(param.grad.data)
            # Now we can deallocate the per-param grad memory.
            param.grad = None
    return param_hook
def zero_grad_buffer(self):
    """Set the grad buffer data to zero. Needs to be called at the
    beginning of each iteration."""
    assert self._grad_buffers is not None, 'buffers are not initialized.'
    # One buffer per dtype; zero them all in place.
    for _, buffer_ in self._grad_buffers.items():
        buffer_.zero()
def allreduce_gradients(self):
    """Reduce gradients across data parallel ranks."""
    # If we have contiguous buffers, simply reduce the data in the buffer.
    if self._grad_buffers is not None:
        for _, buffer_ in self._grad_buffers.items():
            # Pre-divide so the all-reduce SUM produces the mean.
            buffer_.data /= mpu.get_data_parallel_world_size()
            torch.distributed.all_reduce(
                buffer_.data, group=mpu.get_data_parallel_group())
    else:
        # Otherwise, bucketize by dtype and all-reduce per bucket.
        buckets = {}
        # Pack the buckets.
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = param.data.type()
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)
                param.main_grad = param.grad
        # NOTE(review): the source excerpt ends here; the per-bucket
        # coalesce + all-reduce step appears to have been truncated by
        # the scrape — confirm against the upstream implementation.
下载HEVC插件
点击 HEVC Video Extension 2.2.20.0 latest downloads,根据教程下载安装
安装 Random User-Agent
点击 Random User-Agent 安装
配置 Random User-Agent
![在这里插入图片描述](https://i-blog.csdnimg.cn/direct/dda0ea75096c42c0a79ef6f6f5521…