HuggingFace's transformers library

pipeline

from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # downloads the model and tokenizer automatically
classifier("We are very happy to show you the 🤗 Transformers library.")
# [{'label': 'POSITIVE', 'score': 0.9998}]

# multiple inputs at once
results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
# you can also specify the model explicitly
import torch
from transformers import pipeline
from datasets import load_dataset, Audio

speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
result = speech_recognizer(dataset[:4]["audio"])
print([d["text"] for d in result])

# pick a specific device
transcriber = pipeline(model="openai/whisper-large-v2", device=0)
# or let accelerate place the model across devices automatically
# pip install --upgrade accelerate
transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")

# batched inference
transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
texts = transcriber(audio_filenames)

# examples of other parameters
# pip install accelerate
import torch
from transformers import pipeline

pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)

There are many more task types; the full list is in the official pipeline documentation.
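
For example, zero-shot classification works the same way; a minimal sketch (the task's default model is downloaded on first use):

from transformers import pipeline

zero_shot = pipeline("zero-shot-classification")
result = zero_shot(
    "We are very happy to show you the 🤗 Transformers library.",
    candidate_labels=["education", "politics", "sports"],
)
print(result["labels"][0], round(result["scores"][0], 4))  # labels come back sorted by score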

AutoClass

AutoTokenizer

from transformers import AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

# ask for PyTorch tensors
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",  # use "tf" for TensorFlow tensors
)
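
To check what the tokenizer actually produced, the ids can be mapped back to tokens or decoded to text; a small sketch continuing from the encoding above:

# Round-trip the single-sentence encoding from above.
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
print(tokenizer.decode(encoding["input_ids"]))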

AutoModel

from transformers import AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
pt_outputs = pt_model(**pt_batch)

# the model returns raw logits only; post-processing (e.g., softmax) is up to you
from torch import nn

pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

# save the model
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
pt_model.save_pretrained(pt_save_directory)
pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")

# load a TensorFlow checkpoint in PyTorch (tf_save_directory was saved by the TF version of the model)
tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
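
The class indices in pt_predictions can be mapped to readable names through the model config's id2label dictionary; a minimal sketch using the sentiment model and pt_predictions from above (for this model the labels are star ratings):

# Turn each row's argmax into a human-readable label.
pred_ids = pt_predictions.argmax(dim=-1)
print([pt_model.config.id2label[i.item()] for i in pred_ids])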

AutoConfig

from transformers import AutoConfig

# Download configuration from huggingface.co and cache.
config = AutoConfig.from_pretrained("bert-base-uncased")
# Download configuration from huggingface.co (user-uploaded) and cache.
config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
# If the configuration file is in a directory (e.g., was saved using save_pretrained('./test/saved_model/')).
config = AutoConfig.from_pretrained("./test/bert_saved_model/")
# Load a specific configuration file.
config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
# Change some config attributes when loading a pretrained config; unused kwargs can be returned separately.
config = AutoConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
config.output_attentions
config, unused_kwargs = AutoConfig.from_pretrained(
    "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
)

# Instantiate a model from a config.
from transformers import AutoModel

my_model = AutoModel.from_config(config)
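
Since from_config builds a randomly initialized model, editing config attributes first is an easy way to get a smaller architecture for experiments; a sketch using standard BERT config attributes:

# Shrink BERT before instantiating it (weights are untrained).
from transformers import AutoConfig, AutoModel

small_config = AutoConfig.from_pretrained("bert-base-uncased")
small_config.num_hidden_layers = 4  # the base model has 12
small_model = AutoModel.from_config(small_config)
print(sum(p.numel() for p in small_model.parameters()))  # noticeably fewer parameters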

 

Trainer

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
training_args = TrainingArguments(
    output_dir="path/to/save/folder/",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("rotten_tomatoes")

def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])

dataset = dataset.map(tokenize_dataset, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
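
Trainer does not compute evaluation metrics by default; you can pass a compute_metrics function when constructing it. A minimal sketch, assuming plain accuracy is enough:

# Hypothetical accuracy metric; pass it via Trainer(..., compute_metrics=compute_metrics).
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # EvalPrediction unpacks to (predictions, label_ids)
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}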

Installation

pip install transformers
pip install 'transformers[torch]'  # install together with the PyTorch backend

# install from source
pip install git+https://github.com/huggingface/transformers

# developer (editable) install
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
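
A quick sanity check that the install worked (this downloads a small default model on first run):

python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"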

tokenizer

Suppose we load the tokenizer for an OPT model: what type is enc, and what methods does it expose?

from transformers import AutoTokenizer
enc = AutoTokenizer.from_pretrained('facebook/opt-125m')

print(enc) shows that enc is a GPT2TokenizerFast. Searching for the class definition, it lives in transformers/models/gpt2/tokenization_gpt2_fast.py inside the installed package; a condensed skeleton of the class follows below.
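
Before reading the source, this is easy to confirm from the interpreter; a small sketch:

# Inspect the concrete class, its inheritance chain, and a few public methods.
print(type(enc).__name__)                       # GPT2TokenizerFast
print([c.__name__ for c in type(enc).__mro__])  # ... -> PreTrainedTokenizerFast -> PreTrainedTokenizerBase -> ...
print(sorted(m for m in dir(enc) if not m.startswith("_"))[:10])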

class GPT2TokenizerFast(PreTrainedTokenizerFast):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = GPT2Tokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: ...
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding: ...
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: ...


class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
        ...

    # @property turns a method into a read-only attribute: it is accessed
    # without parentheses (and cannot be called with them)
    @property
    def is_fast(self) -> bool: ...
    @property
    def can_save_slow_tokenizer(self) -> bool: ...
    @property
    def vocab_size(self) -> int: ...
    def get_vocab(self) -> Dict[str, int]: ...
    @property
    def vocab(self) -> Dict[str, int]: ...
    @property
    def added_tokens_encoder(self) -> Dict[str, int]: ...
    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]: ...
    def get_added_vocab(self) -> Dict[str, int]: ...
    def __len__(self) -> int: ...
    @property
    def backend_tokenizer(self) -> TokenizerFast: ...
    @property
    def decoder(self) -> DecoderFast: ...

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]: ...

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: ...
    def _convert_token_to_id_with_added_voc(self, token: str) -> int: ...
    def _convert_id_to_token(self, index: int) -> Optional[str]: ...
    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int: ...
    def num_special_tokens_to_add(self, pair: bool = False) -> int: ...
    def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False) -> Union[str, List[str]]: ...
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: ...

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
    ): ...

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding: ...

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding: ...

    def convert_tokens_to_string(self, tokens: List[str]) -> str: ...

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str: ...

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]: ...

    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ): ...
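
A few of these methods in action on the OPT tokenizer loaded earlier (byte-level BPE marks a leading space with Ġ in the token strings):

# Exercising some of the methods listed above.
tokens = enc.tokenize("Hello world")
ids = enc.convert_tokens_to_ids(tokens)
print(tokens, ids)      # e.g. ['Hello', 'Ġworld'] and their ids
print(enc.decode(ids))  # back to "Hello world"
print(enc.vocab_size, enc.is_fast)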

Streaming output

From the official guide:

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
streamer = TextStreamer(tok)

# Despite returning the usual output, the streamer will also print the generated text to stdout.
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
streamer = TextIteratorStreamer(tok)

# Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ""
for new_text in streamer:
    generated_text += new_text
print(generated_text)
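
TextIteratorStreamer also accepts skip_prompt=True, which is handy when only the newly generated text should be surfaced (e.g., to a web client); a sketch wrapping generation in a plain Python generator, reusing tok and model from above:

# Sketch: expose streamed generation as a generator.
def stream_generate(prompt, max_new_tokens=20):
    inputs = tok([prompt], return_tensors="pt")
    streamer = TextIteratorStreamer(tok, skip_prompt=True)  # yield only the continuation
    kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
    Thread(target=model.generate, kwargs=kwargs).start()
    yield from streamer

for piece in stream_generate("An increasing sequence: one,"):
    print(piece, end="", flush=True)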
