HuggingFace's transformers library

pipeline

from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # the model and tokenizer are downloaded automatically
classifier("We are very happy to show you the 🤗 Transformers library.")
# [{'label': 'POSITIVE', 'score': 0.9998}]

# passing multiple sentences
results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

# a specific model can also be specified
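As a sketch of pinning a specific checkpoint instead of the task default (the nlptown model below is the same multilingual sentiment checkpoint used later in the AutoTokenizer section; the output shown is illustrative):

classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")
# e.g. [{'label': '5 stars', 'score': 0.72}]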
import torch
from transformers import pipeline
from datasets import load_dataset, Audio

speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
# resample the audio column to the sampling rate the model expects
dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
result = speech_recognizer(dataset[:4]["audio"])
print([d["text"] for d in result])

# specifying a device
transcriber = pipeline(model="openai/whisper-large-v2", device=0)

# automatic device placement
# pip install --upgrade accelerate
transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")

# batched inference
transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
texts = transcriber(audio_filenames)

# other parameters, e.g. generation arguments forwarded to the model
# pip install accelerate
import torch
from transformers import pipeline

pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)

There are many more task types than shown above; see the official documentation for the complete list.
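A few common task identifiers, as a non-exhaustive sketch (each call downloads that task's default model):

pipeline("text-generation")            # causal language modeling
pipeline("fill-mask")                  # masked language modeling
pipeline("question-answering")         # extractive QA
pipeline("summarization")
pipeline("translation_en_to_fr")
pipeline("zero-shot-classification")
pipeline("image-classification")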

AutoClass

AutoTokenizer

from transformers import AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)  # a dict-like BatchEncoding with input_ids, token_type_ids and attention_mask
# ask for PyTorch tensors
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",  # use "tf" for TensorFlow tensors
)
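A quick sanity check of the batch (a sketch; the exact sequence length depends on the tokenizer):

print(pt_batch["input_ids"].shape)    # e.g. torch.Size([2, 16]); both rows padded to the longer sentence
print(pt_batch["attention_mask"][1])  # trailing 0s mark the padding positions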

AutoModel

from transformers import AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
pt_outputs = pt_model(**pt_batch)
# the model returns raw logits, not probabilities; post-processing is up to you
from torch import nn

pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)
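To map the probabilities back to human-readable labels, the names stored in the model config can be used (a sketch; for this nlptown checkpoint the labels are star ratings):

import torch

pred_ids = torch.argmax(pt_predictions, dim=-1)
print([pt_model.config.id2label[i.item()] for i in pred_ids])  # e.g. ['5 stars', '1 star']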
# saving the model
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
pt_model.save_pretrained(pt_save_directory)
pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")

# loading a TensorFlow-saved model in PyTorch (assumes tf_save_directory holds a TF checkpoint)
tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)

AutoConfig

from transformers import AutoConfig

# Download configuration from huggingface.co and cache.
config = AutoConfig.from_pretrained("bert-base-uncased")
# Download a user-uploaded configuration from huggingface.co and cache.
config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
# If the configuration file is in a directory (e.g. saved via save_pretrained("./test/saved_model/")).
config = AutoConfig.from_pretrained("./test/bert_saved_model/")
# Load a specific configuration file.
config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
# Change some config attributes when loading a pretrained config.
config = AutoConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
config.output_attentions
config, unused_kwargs = AutoConfig.from_pretrained(
    "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
)

from transformers import AutoModel

my_model = AutoModel.from_config(config)
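Since from_config initializes fresh (random) weights rather than loading pretrained ones, the config route is handy for defining a scaled-down custom architecture — a sketch, assuming BERT-style config attributes:

from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained(
    "bert-base-uncased",
    num_hidden_layers=2,
    hidden_size=256,
    num_attention_heads=4,
    intermediate_size=1024,
)
tiny_model = AutoModel.from_config(config)  # randomly initialized, not pretrained
print(sum(p.numel() for p in tiny_model.parameters()))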

 

Trainer

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir="path/to/save/folder/",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("rotten_tomatoes")

def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])

dataset = dataset.map(tokenize_dataset, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
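To get evaluation metrics during training, a compute_metrics function can be passed to the Trainer — a minimal sketch computing accuracy (assumes numpy is available):

import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # Trainer passes an EvalPrediction, which unpacks like a tuple
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

# trainer = Trainer(..., compute_metrics=compute_metrics)
# trainer.evaluate()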

Installation

pip install transformers
pip install 'transformers[torch]'  # with the PyTorch backend only

# install from source
pip install git+https://github.com/huggingface/transformers

# editable install for development
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
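To verify the installation (the one-liner from the official docs; it downloads a small default model on first run):

python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"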

tokenizer

Suppose we load the tokenizer for an OPT model — what type is enc, and what methods does it provide?

from transformers import AutoTokenizer
enc = AutoTokenizer.from_pretrained('facebook/opt-125m')

print(enc) shows that enc is a GPT2TokenizerFast. Searching for this class, its definition lives in transformers/models/gpt2/tokenization_gpt2_fast.py inside the installed package:

class GPT2TokenizerFast(PreTrainedTokenizerFast):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = GPT2Tokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: ...
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding: ...
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: ...

class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})

    # @property turns a method into an attribute: it is accessed without
    # parentheses (and cannot be called with them)
    @property
    def is_fast(self) -> bool: ...
    @property
    def can_save_slow_tokenizer(self) -> bool: ...
    @property
    def vocab_size(self) -> int: ...
    def get_vocab(self) -> Dict[str, int]: ...
    @property
    def vocab(self) -> Dict[str, int]: ...
    @property
    def added_tokens_encoder(self) -> Dict[str, int]: ...
    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]: ...
    def get_added_vocab(self) -> Dict[str, int]: ...
    def __len__(self) -> int: ...
    @property
    def backend_tokenizer(self) -> TokenizerFast: ...
    @property
    def decoder(self) -> DecoderFast: ...
    def _convert_encoding(self, encoding: EncodingFast, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True) -> Tuple[Dict[str, Any], List[EncodingFast]]: ...
    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: ...
    def _convert_token_to_id_with_added_voc(self, token: str) -> int: ...
    def _convert_id_to_token(self, index: int) -> Optional[str]: ...
    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int: ...
    def num_special_tokens_to_add(self, pair: bool = False) -> int: ...
    def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False) -> Union[str, List[str]]: ...
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: ...
    def set_truncation_and_padding(self, padding_strategy: PaddingStrategy, truncation_strategy: TruncationStrategy, max_length: int, stride: int, pad_to_multiple_of: Optional[int]): ...
    def _batch_encode_plus(self, batch_text_or_text_pairs: Union[List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]], add_special_tokens: bool = True, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True) -> BatchEncoding: ...
    def _encode_plus(self, text: Union[TextInput, PreTokenizedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None, add_special_tokens: bool = True, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, **kwargs) -> BatchEncoding: ...
    def convert_tokens_to_string(self, tokens: List[str]) -> str: ...
    def _decode(self, token_ids: Union[int, List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = None, **kwargs) -> str: ...
    def _save_pretrained(self, save_directory: Union[str, os.PathLike], file_names: Tuple[str], legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None) -> Tuple[str]: ...
    def train_new_from_iterator(self, text_iterator, vocab_size, length=None, new_special_tokens=None, special_tokens_map=None, **kwargs): ...
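A few of the methods above in action (a sketch; the exact token strings depend on the GPT-2 BPE vocabulary):

tokens = enc.tokenize("Hello world")     # list of BPE token strings
ids = enc.convert_tokens_to_ids(tokens)  # token strings -> ids
print(tokens, ids)
print(enc.convert_ids_to_tokens(ids))    # ids -> token strings
print(enc.decode(ids))                   # ids -> text
print(len(enc), enc.is_fast)             # vocab size (incl. added tokens), True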

Streaming output

From the official guide:

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
streamer = TextStreamer(tok)

# Despite returning the usual output, the streamer will also print the generated text to stdout.
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
streamer = TextIteratorStreamer(tok)

# Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
    generated_text += new_text
print(generated_text)
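For chat-style UIs it is usually nicer to stream only the newly generated continuation; TextIteratorStreamer accepts skip_prompt, and extra keyword arguments are forwarded to the tokenizer's decode (a sketch under those assumptions):

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=20))
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)  # prompt tokens are skipped, only new text arrives
thread.join()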
