Lessons Learned with the Hugging Face Transformers Library

Table of Contents

  • transformers
    • transformers.Trainer
      • train()
      • tr_loss += self.training_step(model, inputs)
      • Computing the loss step by step in DAGN's forward

transformers

transformers.Trainer

class Trainer:
    # This method chooses an appropriate sampler for the training dataset based on
    # the dataset type and the hardware environment.
    def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
        if isinstance(self.train_dataset, torch.utils.data.IterableDataset):
            return None
        elif is_torch_tpu_available():
            return get_tpu_sampler(self.train_dataset)
        else:
            return (
                RandomSampler(self.train_dataset)
                if self.args.local_rank == -1
                else DistributedSampler(self.train_dataset)
            )
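To see which branch this sampler logic takes in the common single-process case, here is a minimal, self-contained sketch (the toy dataset and batch size are made up for illustration):

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler

# Toy dataset standing in for self.train_dataset.
dataset = TensorDataset(torch.arange(8).unsqueeze(1))

local_rank = -1  # -1 means "no distributed training", matching the Trainer check above
sampler = RandomSampler(dataset) if local_rank == -1 else DistributedSampler(dataset)

loader = DataLoader(dataset, batch_size=2, sampler=sampler)
for batch in loader:
    print(batch)  # batches arrive in shuffled order because of RandomSampler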
class Trainer:
    # This method builds and returns a DataLoader for the training dataset,
    # wiring in the sampler chosen above, so it can be used during training.
    def get_train_dataloader(self) -> DataLoader:
        """
        Returns the training :class:`~torch.utils.data.DataLoader`.

        Will use no sampler if :obj:`self.train_dataset` is a :obj:`torch.utils.data.IterableDataset`,
        a random sampler (adapted to distributed training if necessary) otherwise.

        Subclass and override this method if you want to inject some custom behavior.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        train_sampler = self._get_train_sampler()
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
        )
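As the docstring suggests, custom dataloading behavior goes in a subclass. A hedged sketch of such an override (the WeightedTrainer name and the uniform weights are illustrative assumptions, not part of the original code):

from torch.utils.data import DataLoader, WeightedRandomSampler
from transformers import Trainer

class WeightedTrainer(Trainer):
    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        # Hypothetical per-example weights; in practice derive them from label frequency.
        weights = [1.0] * len(self.train_dataset)
        sampler = WeightedRandomSampler(weights, num_samples=len(weights))
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
        )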
# This might change the seed so needs to run first.
self._hp_search_setup(trial)

# Model re-init
if self.model_init is not None:
    # Seed must be set before instantiating the model when using model_init.
    set_seed(self.args.seed)
    model = self.model_init()
    self.model = model.to(self.args.device)
    # Reinitializes optimizer and scheduler
    self.optimizer, self.lr_scheduler = None, None

# Data loader and number of training steps
train_dataloader = self.get_train_dataloader()
if self.args.max_steps > 0:
    t_total = self.args.max_steps
    num_train_epochs = (
        self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
    )
else:
    t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
    num_train_epochs = self.args.num_train_epochs
    self.args.max_steps = t_total

self.create_optimizer_and_scheduler(num_training_steps=t_total)

# Check if saved optimizer or scheduler states exist
if (
    model_path is not None
    and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
    and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
):
    # Load in optimizer and scheduler states
    self.optimizer.load_state_dict(
        torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
    )
    self.lr_scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

model = self.model
if self.args.fp16 and _use_apex:
    if not is_apex_available():
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)

# multi-gpu training (should be after apex fp16 initialization)
if self.args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Distributed training (should be after apex fp16 initialization)
if self.args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[self.args.local_rank],
        output_device=self.args.local_rank,
        find_unused_parameters=True,
    )

if self.tb_writer is not None:
    self.tb_writer.add_text("args", self.args.to_json_string())
    self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})
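A small worked example of the step arithmetic above when max_steps is not set (all numbers are assumed for illustration):

# Assumed toy values, not taken from the original post.
len_train_dataloader = 1000          # batches per epoch
gradient_accumulation_steps = 4
num_train_epochs = 3

# One optimizer update happens every `gradient_accumulation_steps` batches.
steps_per_epoch = len_train_dataloader // gradient_accumulation_steps   # 250
t_total = int(steps_per_epoch * num_train_epochs)                       # 750 optimizer steps in total
print(steps_per_epoch, t_total)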

train()

# 9. Training loop
# Train!
if is_torch_tpu_available():
    total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
else:
    total_train_batch_size = (
        self.args.train_batch_size
        * self.args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
    )
logger.info("***** Running training *****")
logger.info("  Num examples = %d", self.num_examples(train_dataloader))  # number of training examples
logger.info("  Num Epochs = %d", num_train_epochs)  # total number of training epochs
logger.info("  Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)  # batch size per device
logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)  # total batch size after parallelism, distribution and gradient accumulation
logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)  # gradient accumulation steps
logger.info("  Total optimization steps = %d", t_total)  # total number of optimization steps

# These variables track the global step, the current epoch, the number of epochs already trained,
# and the number of steps already trained in the current epoch.
self.global_step = 0
self.epoch = 0
epochs_trained = 0
steps_trained_in_current_epoch = 0

# Checkpoint resume: if model_path is given, try to parse global_step, epochs_trained and
# steps_trained_in_current_epoch from it so training can continue; if parsing fails, start from scratch.
# Check if continuing training from a checkpoint
if model_path is not None:
    # set global_step to global_step of last saved checkpoint from model path
    try:
        self.global_step = int(model_path.split("-")[-1].split(os.path.sep)[0])
        epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = self.global_step % (
            len(train_dataloader) // self.args.gradient_accumulation_steps
        )
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", self.global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    except ValueError:
        self.global_step = 0
        logger.info("  Starting fine-tuning.")

# Initialize the loss and zero the gradients:
#   tr_loss: running training loss.
#   logging_loss_scalar: loss scalar used for logging.
#   model.zero_grad(): zero out the model's gradients.
tr_loss = torch.tensor(0.0).to(self.args.device)
logging_loss_scalar = 0.0
model.zero_grad()

# Training loop
disable_tqdm = self.args.disable_tqdm or not self.is_local_process_zero()  # disable_tqdm: controls whether progress bars are shown
train_pbar = trange(epochs_trained, int(np.ceil(num_train_epochs)), desc="Epoch", disable=disable_tqdm)  # train_pbar: epoch-level progress bar
# range(epochs_trained, int(np.ceil(num_train_epochs))): start from the epochs already trained and run the
# remaining ones. When a distributed sampler is used, tell it which epoch we are in.
for epoch in range(epochs_trained, int(np.ceil(num_train_epochs))):
    if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
        train_dataloader.sampler.set_epoch(epoch)

    # Data loading and iteration: choose the loader based on the device type (TPU or GPU).
    if is_torch_tpu_available():
        parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(self.args.device)
        epoch_iterator = parallel_loader
    else:
        epoch_iterator = train_dataloader

    # Reset the past mems state at the beginning of each epoch if necessary (past_index >= 0).
    if self.args.past_index >= 0:
        self._past = None

    epoch_pbar = tqdm(epoch_iterator, desc="Iteration", disable=disable_tqdm)
    # Iterate over the batches of this epoch. `inputs` usually contains the keys:
    #   input_ids: token indices of the input text.
    #   attention_mask: mask indicating which tokens the model should attend to.
    #   labels: target labels.
    for step, inputs in enumerate(epoch_iterator):
        # Skip past any already trained steps if resuming training
        if steps_trained_in_current_epoch > 0:
            steps_trained_in_current_epoch -= 1
            epoch_pbar.update(1)
            continue
        tr_loss += self.training_step(model, inputs)
        # The inputs have been fed through the model; now decide whether this step triggers an optimizer update.
        if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
            # last step in epoch but step is always smaller than gradient_accumulation_steps
            len(epoch_iterator) <= self.args.gradient_accumulation_steps
            and (step + 1) == len(epoch_iterator)
        ):
            if self.args.fp16 and _use_native_amp:
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
            elif self.args.fp16 and _use_apex:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

            if is_torch_tpu_available():
                xm.optimizer_step(self.optimizer)
            elif self.args.fp16 and _use_native_amp:
                self.scaler.step(self.optimizer)
                self.scaler.update()
            else:
                self.optimizer.step()

            self.lr_scheduler.step()
            model.zero_grad()
            self.global_step += 1
            self.epoch = epoch + (step + 1) / len(epoch_iterator)

            if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                self.global_step == 1 and self.args.logging_first_step
            ):
                logs: Dict[str, float] = {}
                tr_loss_scalar = tr_loss.item()
                logs["loss"] = (tr_loss_scalar - logging_loss_scalar) / self.args.logging_steps
                # backward compatibility for pytorch schedulers
                logs["learning_rate"] = (
                    self.lr_scheduler.get_last_lr()[0]
                    if version.parse(torch.__version__) >= version.parse("1.4")
                    else self.lr_scheduler.get_lr()[0]
                )
                logging_loss_scalar = tr_loss_scalar
                self.log(logs)

            if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0:
                metrics = self.evaluate()
                self._report_to_hp_search(trial, epoch, metrics)

            if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                # In all cases (even distributed/parallel), self.model is always a reference
                # to the model we want to save.
                if hasattr(model, "module"):
                    assert model.module is self.model, f"Module {model.module} should be a reference to self.model"
                else:
                    assert model is self.model, f"Model {model} should be a reference to self.model"
                # Save model checkpoint
                checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}"
                if self.hp_search_backend is not None and trial is not None:
                    run_id = (
                        trial.number
                        if self.hp_search_backend == HPSearchBackend.OPTUNA
                        else tune.get_trial_id()
                    )
                    checkpoint_folder += f"-run-{run_id}"
                output_dir = os.path.join(self.args.output_dir, checkpoint_folder)

                self.save_model(output_dir)

                if self.is_world_process_zero():
                    self._rotate_checkpoints()

                if is_torch_tpu_available():
                    xm.rendezvous("saving_optimizer_states")
                    xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                elif self.is_world_process_zero():
                    torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

        epoch_pbar.update(1)
        if self.args.max_steps > 0 and self.global_step >= self.args.max_steps:
            break

    epoch_pbar.close()
    train_pbar.update(1)
    if self.args.tpu_metrics_debug or self.args.debug:
        if is_torch_tpu_available():
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())
        else:
            logger.warning(
                "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
                "configured. Check your training configuration if this is unexpected."
            )
    if self.args.max_steps > 0 and self.global_step >= self.args.max_steps:
        break
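The accumulate / clip / step / zero_grad pattern in the loop above, reduced to a minimal stand-alone sketch (the model, data and hyperparameters are made up; this is not the Trainer itself):

import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
gradient_accumulation_steps = 4
max_grad_norm = 1.0

data = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(16)]
loss_fn = nn.CrossEntropyLoss()

model.zero_grad()
for step, (x, y) in enumerate(data):
    loss = loss_fn(model(x), y) / gradient_accumulation_steps  # scale so the update averages over the window
    loss.backward()                                            # gradients accumulate across micro-batches
    if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        model.zero_grad()

Scaling the loss by the number of accumulation steps keeps the effective gradient an average over the accumulated micro-batches rather than a sum.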

[Figure: schematic of how training starts]

tr_loss += self.training_step(model, inputs)

def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
    """
    Perform a training step on a batch of inputs.

    Subclass and override to inject custom behavior.

    Args:
        model (:obj:`nn.Module`):
            The model to train.
        inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
            The inputs and targets of the model.

            The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
            argument :obj:`labels`. Check your model's documentation for all accepted arguments.

    Return:
        :obj:`torch.Tensor`: The tensor with training loss on this batch.
    """
    if hasattr(self, "_training_step"):
        warnings.warn(
            "The `_training_step` method is deprecated and won't be called in a future version, define `training_step` in your subclass.",
            FutureWarning,
        )
        return self._training_step(model, inputs, self.optimizer)

    model.train()
    inputs = self._prepare_inputs(inputs)

    if self.args.fp16 and _use_native_amp:
        with autocast():
            outputs = model(**inputs)
            loss = outputs[0]
    else:
        outputs = model(**inputs)  # run the forward pass to get the model outputs
        # We don't use .loss here since the model may return tuples instead of ModelOutput.
        loss = outputs[0]
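Because training_step calls the model with **inputs and takes outputs[0] as the loss, the model is expected to return the loss first when labels are provided and return_dict=False. A minimal sketch of that convention (the checkpoint name is just an example):

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Any small classification checkpoint works here; "distilbert-base-uncased" is only an example.
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

inputs = tok(["a short example"], return_tensors="pt")
inputs["labels"] = torch.tensor([1])

outputs = model(**inputs, return_dict=False)  # tuple output: (loss, logits, ...)
loss = outputs[0]                             # same convention as loss = outputs[0] above
print(loss)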

Obtaining these outputs calls the forward method of the DAGN class in DAGN.py:
class DAGN(nn.Module):
    def forward(self, input_ids, attention_mask, token_type_ids,
                passage_mask, question_mask,
                argument_bpe_ids, domain_bpe_ids, punct_bpe_ids):  # excerpt; remaining arguments omitted
        # input_ids.size() = torch.Size([32, 128])
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_passage_mask = passage_mask.view(-1, passage_mask.size(-1)) if passage_mask is not None else None
        flat_question_mask = question_mask.view(-1, question_mask.size(-1)) if question_mask is not None else None
        flat_argument_bpe_ids = argument_bpe_ids.view(-1, argument_bpe_ids.size(-1)) if argument_bpe_ids is not None else None
        flat_domain_bpe_ids = domain_bpe_ids.view(-1, domain_bpe_ids.size(-1)) if domain_bpe_ids is not None else None
        flat_punct_bpe_ids = punct_bpe_ids.view(-1, punct_bpe_ids.size(-1)) if punct_bpe_ids is not None else None

        # Get the last-layer output of the RoBERTa encoder from the input ids and attention mask;
        # last_hidden_state has shape torch.Size([32, 128, 768]).
        # Every input token, after encoding, yields a vector of size hidden_size (768 in the base model) at its position.
        last_hidden_state, p = self.roberta(
            flat_input_ids, attention_mask=flat_attention_mask, token_type_ids=None, return_dict=False
        )
        # last_hidden_state, p = self.bert(flat_input_ids, attention_mask=flat_attention_mask, token_type_ids=None, return_dict=False)
        sequence_output = last_hidden_state  # torch.Size([32, 128, 768])
        pooled_output = p  # torch.Size([32, 768])
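A minimal sketch of what the view(-1, last_dim) flattening does (the batch/choice split below is an assumption; the post only shows the flattened [32, 128] shape):

import torch

batch_size, num_choices, seq_len = 8, 4, 128  # assumed split; 8 * 4 = 32 rows after flattening
input_ids = torch.randint(0, 30000, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))
print(input_ids.shape)       # torch.Size([8, 4, 128])
print(flat_input_ids.shape)  # torch.Size([32, 128]): one row per (example, answer-choice) pair

view(-1, x.size(-1)) collapses all leading dimensions into one, so it is a no-op if the tensor is already two-dimensional.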

The returned values are documented in the following output class:

@dataclass
class BaseModelOutputWithPooling(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during pretraining.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
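A minimal sketch (the checkpoint name is only an example) of reading these fields when the encoder is called with return_dict=True, alongside the tuple form that the DAGN forward relies on:

import torch
from transformers import AutoTokenizer, AutoModel

tok = AutoTokenizer.from_pretrained("roberta-base")   # example checkpoint
model = AutoModel.from_pretrained("roberta-base")

enc = tok(["an example sentence"], return_tensors="pt")
out = model(**enc, return_dict=True)

print(out.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
print(out.pooler_output.shape)      # (batch_size, hidden_size)

# With return_dict=False the same values come back as a tuple, which is what
# `last_hidden_state, p = self.roberta(...)` in DAGN unpacks.
last_hidden_state, pooler_output = model(**enc, return_dict=False)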


Computing the loss step by step in DAGN's forward

At line 464 of dagn.py, the original code on GitHub calls the wrong function when computing the loss; change it to
loss2 = self.get_con_lossL(positive_keys2, negative_keys2)
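The post does not show the body of this contrastive-loss helper, so the following is only a hedged sketch of a generic InfoNCE-style contrastive loss. The function name, the explicit query argument, and the temperature are assumptions; the actual get_con_lossL takes only positive and negative keys, so this is an illustration of the idea, not a drop-in replacement.

import torch
import torch.nn.functional as F

def contrastive_loss(query, positive_keys, negative_keys, temperature=0.07):
    """InfoNCE-style loss: pull each query toward its positive key, push it away from its negatives.

    query:          (batch, dim)
    positive_keys:  (batch, dim)          one positive per query
    negative_keys:  (batch, n_neg, dim)   several negatives per query
    """
    query = F.normalize(query, dim=-1)
    positive_keys = F.normalize(positive_keys, dim=-1)
    negative_keys = F.normalize(negative_keys, dim=-1)

    pos_logits = (query * positive_keys).sum(dim=-1, keepdim=True)   # (batch, 1)
    neg_logits = torch.einsum("bd,bnd->bn", query, negative_keys)    # (batch, n_neg)

    logits = torch.cat([pos_logits, neg_logits], dim=1) / temperature
    labels = torch.zeros(logits.size(0), dtype=torch.long)           # the positive sits at index 0
    return F.cross_entropy(logits, labels)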
