第三课：GPT

文章目录

第三课：GPT
- 1、学习总结：
- - GPT出现的原因
  - GPT的方法原理
  - 目前存在的问题
  - 无监督的预训练
  - - 优化目标
    - 模型结构
  - 监督微调
  - 课程ppt及代码地址
- 2、学习心得：
- 3、经验分享：
- 4、课程反馈：
- 5、使用MindSpore昇思的体验和反馈：
- 6、未来展望：

第三课：GPT

1、学习总结：

GPT出现的原因

未标注的文本数据远多于已标注的文本数据，并且对于不同的下游任务会存在不同的标注方式

GPT的方法原理

半监督学习

基于大量未标注的文本数据，训练预训练语言模型
使用已标注文本数据，对模型针对某一特定下游任务进行finetune，只更改output layer（线性层）

目前存在的问题

自然语言处理的下游任务非常多元，难以有统一的优化目标
难以将预训练模型的信息完全传递到finetune的下游任务中

无监督的预训练

优化目标

模型结构

由于训练objective的选择，gpt在模型选择上不应该看见当前token后的信息，故模型应设计为单向网络，即transformer中的decoder结构。

import os
import logging
import numpy as np
import mindspore
from mindspore import nn
from mindspore import ops
from mindspore import Tensor
from mindspore.common.initializer import initializer, Normal
from mindnlp.models.gpt.gpt_config import GPTConfig
from mindnlp._legacy.nn import Dropout
from mindnlp.abc import PreTrainedModel
from mindnlp.models.utils import Conv1D, prune_conv1d_layer, find_pruneable_heads_and_indices
from mindnlp.models.utils import SequenceSummary
from mindnlp.models.activations import ACT2FN
from mindnlp import GPTConfig

# Feed-Forward 实现
class MLP(nn.Cell):r"""GPT MLP"""def __init__(self, n_state, config):super().__init__()n_embd = config.n_embdself.c_fc = Conv1D(n_state, n_embd)self.c_proj = Conv1D(n_embd, n_state)self.act = ACT2FN[config.afn]self.dropout = Dropout(p=config.resid_pdrop)def construct(self, x):h = self.act(self.c_fc(x))h2 = self.c_proj(h)return self.dropout(h2)

# Multi-head attention 实现class Attention(nn.Cell):r"""GPT Attention"""def __init__(self, nx, n_positions, config, scale=False):super().__init__()n_state = nxif n_state % config.n_head != 0:raise ValueError(f"Attention n_state shape: {n_state} must be divisible by config.n_head {config.n_head}")self.bias = Tensor(np.tril(np.ones((n_positions, n_positions))), mindspore.float32).view(1, 1, n_positions, n_positions)self.n_head = config.n_headself.split_size = n_stateself.scale = scaleself.c_attn = Conv1D(n_state * 3, n_state)self.c_attn = Conv1D(n_state * 3, n_state)self.c_proj = Conv1D(n_state, n_state)self.attn_dropout = Dropout(p=config.attn_pdrop)self.resid_dropout = Dropout(p=config.resid_pdrop)self.pruned_heads = set()self.output_attentions = config.output_attentionsdef prune_heads(self, heads):"""Prunes heads of the model."""if len(heads) == 0:returnhead_size = self.split_size//self.n_headheads, index = find_pruneable_heads_and_indices(heads, self.n_head, head_size, self.pruned_heads)index_attn = ops.cat([index, index + self.split_size, index + (2 * self.split_size)])self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, axis=1)self.c_proj = prune_conv1d_layer(self.c_proj, index, axis=0)self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))self.n_head = self.n_head - len(heads)self.pruned_heads = self.pruned_heads.union(heads)def _attn(self, q, k, v, attention_mask=None, head_mask=None):w = ops.matmul(q, k)if self.scale:w = w / ops.sqrt(ops.scalar_to_tensor(v.shape[-1]))b = self.bias[:, :, : w.shape[-2], : w.shape[-1]]w = w * b + -1e9 * (1 - b)if attention_mask is not None:w = w + attention_maskw = ops.softmax(w)w = self.attn_dropout(w)if head_mask is not None:w = w * head_maskoutputs = (ops.matmul(w, v),)if self.output_attentions:outputs += (w,)return outputsdef merge_heads(self, x):"""merge heads"""x = x.transpose(0, 2, 1, 3)new_x_shape = x.shape[:-2] + (x.shape[-2] * x.shape[-1],)return x.view(new_x_shape)def split_heads(self, x, k=False):"""split heads"""new_x_shape = x.shape[:-1] + (self.n_head, x.shape[-1] // self.n_head)x = x.view(new_x_shape)if k:return x.transpose(0, 2, 3, 1)return x.transpose(0, 2, 1, 3)def construct(self, x, attention_mask=None, head_mask=None):x = self.c_attn(x)query, key, value = ops.split(x, self.split_size, axis=2)query = self.split_heads(query)key = self.split_heads(key, k=True)value = self.split_heads(value)attn_outputs = self._attn(query, key, value, attention_mask, head_mask)a = attn_outputs[0]a = self.merge_heads(a)a = self.c_proj(a)a = self.resid_dropout(a)outputs = (a,) + attn_outputs[1:]return outputs

class Block(nn.Cell):r"""GPT Block"""def __init__(self, n_positions, config, scale=False):super().__init__()nx = config.n_embdself.attn = Attention(nx, n_positions, config, scale)self.ln_1 = nn.LayerNorm((nx,), epsilon=config.layer_norm_epsilon)self.mlp = MLP(4 * nx, config)self.ln_2 = nn.LayerNorm((nx,), epsilon=config.layer_norm_epsilon)def construct(self, x, attention_mask=None, head_mask=None):output_attn = self.attn(x,attention_mask=attention_mask,head_mask=head_mask)a = output_attn[0]n = self.ln_1(x + a)m = self.mlp(n)h = self.ln_2(n + m)outputs = (h,) + output_attn[1:]return outputs

class GPTPreTrainedModel(PreTrainedModel):"""BertPretrainedModel"""convert_torch_to_mindspore = torch_to_mindsporepretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAPconfig_class = GPTConfigbase_model_prefix = 'transformer'def _init_weights(self, cell):"""Initialize the weights"""if isinstance(cell, nn.Dense):cell.weight.set_data(initializer(Normal(self.config.initializer_range),cell.weight.shape, cell.weight.dtype))if cell.has_bias:cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype))elif isinstance(cell, nn.Embedding):embedding_table = initializer(Normal(self.config.initializer_range),cell.embedding_table.shape,cell.embedding_table.dtype)if cell.padding_idx is not None:embedding_table[cell.padding_idx] = 0cell.embedding_table.set_data(embedding_table)elif isinstance(cell, nn.LayerNorm):cell.gamma.set_data(initializer('ones', cell.gamma.shape, cell.gamma.dtype))cell.beta.set_data(initializer('zeros', cell.beta.shape, cell.beta.dtype))class GPTModel(GPTPreTrainedModel):"""The bare GPT transformer model outputting raw hidden-states without any specific head on top"""def __init__(self, config):super().__init__(config)self.config = configself.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)self.drop = nn.Dropout(p=config.embd_pdrop)self.h = nn.CellList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)])self.position_ids = ops.arange(config.n_positions)self.n_layer = self.config.n_layerself.output_attentions = self.config.output_attentionsself.output_hidden_states = self.config.output_hidden_statesdef get_input_embeddings(self):"""return the input embeddings layer"""return self.tokens_embeddef set_input_embeddings(self, value):"""set the input embeddings layer"""self.tokens_embed = valuedef _prune_heads(self, heads_to_prune):"""Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}"""for layer, heads in heads_to_prune.items():self.h[layer].attn.prune_heads(heads)def construct(self,input_ids=None,attention_mask=None,token_type_ids=None,position_ids=None,head_mask=None,inputs_embeds=None,):if input_ids is not None and inputs_embeds is not None:raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")if input_ids is not None:input_shape = input_ids.shapeinput_ids = input_ids.view(-1, input_shape[-1])elif inputs_embeds is not None:input_shape = inputs_embeds.shape[:-1]else:raise ValueError("You have to specify either input_ids or inputs_embeds")if position_ids is None:position_ids = self.position_ids[None, : input_shape[-1]]if attention_mask is not None:attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)attention_mask = (1.0 - attention_mask) * Tensor(np.finfo(mindspore.dtype_to_nptype(self.dtype)).min,self.dtype)head_mask = self.get_head_mask(head_mask, self.n_layer)if inputs_embeds is None:inputs_embeds = self.tokens_embed(input_ids)position_embeds = self.positions_embed(position_ids)if token_type_ids is not None:token_type_ids = token_type_ids.view(-1, token_type_ids.shape[-1])token_type_embeds = self.tokens_embed(token_type_ids)else:token_type_embeds = 0hidden_states = inputs_embeds + position_embeds + token_type_embedshidden_states = self.drop(hidden_states)output_shape = input_shape + (hidden_states.shape[-1],)all_attentions = ()all_hidden_states = ()for i, block in enumerate(self.h):if self.output_hidden_states:all_hidden_states = all_hidden_states + (hidden_states,)outputs = block(hidden_states, attention_mask, head_mask[i])hidden_states = outputs[0]if self.output_attentions:all_attentions = all_attentions + (outputs[1],)hidden_states = hidden_states.view(*output_shape)if self.output_hidden_states:all_hidden_states = all_hidden_states + (hidden_states,)return (hidden_states, all_hidden_states, all_attentions)

监督微调

在已经预训练好的GPT上额外加一层线性层

并通过缩小目标与计算结果的误差进行模型优化

最终为加速模型收敛及提高模型的泛化性，融入pretrain时language modelling的优化目标

# 文本序列分类任务class GPTForSequenceClassification(GPTPreTrainedModel):"""The Original GPT Model transformer with a sequence classification head on top (linear layer).GPTForSequenceClassification uses the last token in order to do the classification, as other causalmodels (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of thelast token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a paddingtoken in each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Sinceit cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (takethe last value in each row of the batch)."""def __init__(self, config):super().__init__(config)self.config = configself.num_labels = config.num_labelsself.transformer = GPTModel(config)self.score = nn.Dense(config.n_embd, self.num_labels, has_bias=False)self.pad_token_id = self.config.pad_token_idproblem_type = config.problem_typeif problem_type is None:self.loss = Noneelse:if self.num_labels == 1:self.problem_type = "regression"self.loss = nn.MSELoss()elif self.num_labels > 1:self.problem_type = "single_label_classification"self.loss = nn.CrossEntropyLoss()else:self.problem_type = "multi_label_classification"self.loss = nn.BCEWithLogitsLoss()def construct(self,input_ids = None,attention_mask = None,token_type_ids = None,position_ids = None,head_mask = None,inputs_embeds = None,labels = None,):r"""labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):Labels for computing the sequence classification/regression loss. Indices should be in`[0, ...,config.num_labels - 1]`.If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If`config.num_labels > 1` a classification loss is computed (Cross-Entropy)."""transformer_outputs = self.transformer(input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,position_ids=position_ids,head_mask=head_mask,inputs_embeds=inputs_embeds,)hidden_states = transformer_outputs[0]logits = self.score(hidden_states)if input_ids is not None:batch_size, _ = input_ids.shape[:2]else:batch_size, _ = inputs_embeds.shape[:2]if self.pad_token_id is None and batch_size != 1:raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")if self.pad_token_id is None:sequence_lengths = -1else:if input_ids is not None:sequence_lengths = ops.ne(input_ids, self.pad_token_id).sum(-1) - 1else:sequence_lengths = -1pooled_logits = logits[:, sequence_lengths]loss = Noneoutput = (pooled_logits,) + transformer_outputs[1:]if labels is not None:if self.num_labels == 1:loss = self.loss(pooled_logits.squeeze(), labels.squeeze())elif self.num_labels > 1:loss = self.loss(pooled_logits.view(-1, self.num_labels), labels.view(-1))else:loss = self.loss(pooled_logits, labels)if loss is not None:output = (loss,) + outputreturn output

课程ppt及代码地址

github地址（网络不好的可以访问下面我克隆到gitee上的地址）：GPT
gitee地址：GPT

2、学习心得：

通过本次学习，熟悉了Mindspore这个国产深度学习框架，也对GPT的基本技术原理有所了解，同时也学会了如何在IMDb数据集上微调GPT完成一个情感分类的任务，比较有成就感！！！另外就是Selina小姐姐讲课的氛围比较轻松，学起来比较快乐！

3、经验分享：

在启智openI上的npu跑时记得使用mindspore1.7的镜像，同时安装对应mindnlp的版本，不然可能会因为版本不兼容而报错。另外就是IMDB情感分类任务的微调练习一定要做，这样能比较get到整个微调的全流程是怎样的，后面再去学习llama等模型的微调时就会更加得心应手。

4、课程反馈：

本次课程中的代码串讲我觉得是做的最好的地方，没有照着ppt一直念，而是在jupyter lab上把代码和原理结合到一块进行讲解，让学习者对代码的理解更加深入。我觉得内容的最后可以稍微推荐一下与Mindspore大模型相关的套件，让学习者在相关套件上可以开发出更多好玩和有趣的东西！

5、使用MindSpore昇思的体验和反馈：

MindSpore昇思的优点和喜欢的方面：

灵活性和可扩展性： MindSpore提供了灵活的编程模型，支持静态计算图和动态计算图。这种设计使得它适用于多种类型的机器学习和深度学习任务，并且具有一定的可扩展性。
跨平台支持： MindSpore支持多种硬件平台，包括CPU、GPU和NPU等，这使得它具有在不同设备上运行的能力，并能充分利用各种硬件加速。
自动并行和分布式训练： MindSpore提供了自动并行和分布式训练的功能，使得用户可以更轻松地处理大规模数据和模型，并更高效地进行训练。
生态系统和社区支持： MindSpore致力于建立开放的生态系统，并鼓励社区贡献，这对于一个开源框架来说非常重要，能够帮助用户更好地学习和解决问题。

一些建议和改进方面：