import warnings
warnings.filterwarnings("ignore")
from transformers import pipeline#用人家设计好的流程完成一些简单的任务
classifier = pipeline("sentiment-analysis")
classifier(
[
"I've been waiting for a HuggingFace course my whole life.",
"I hate this so much!",
]
)
tokenizer要做的事:
- 分词,分字以及特殊字符(起始,终止,间隔,分类等特殊字符可以自己设计的)
- 对每一个token映射得到一个ID(每个词都会对应一个唯一的ID)
- 还有一些辅助信息也可以得到,比如当前词属于哪个句子(还有一些MASK,表示是否事原来的词还是特殊字符等)
from transformers import AutoTokenizer#自动判断
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"#根据这个模型所对应的来加载
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(5)
raw_inputs = [
"I've been waiting for a this course my whole life.",
"I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)