from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
tokenizer.save_pretrained("./bert-tiny/")

input_string = "Your input string here 我是中文"
token_ids = tokenizer.encode(input_string, add_special_tokens=True)
print(token_ids)

# Print the tokens that these ids map to in the vocabulary
print(tokenizer.convert_ids_to_tokens(token_ids))
Output:
[101, 2115, 7953, 5164, 2182, 1855, 100, 1746, 1861, 102]
['[CLS]', 'your', 'input', 'string', 'here', '我', '[UNK]', '中', '文', '[SEP]']
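
Note that 是 comes back as id 100, which is BERT's [UNK] token: the bert-tiny checkpoint uses a standard BERT WordPiece vocabulary, and any character missing from it is replaced and cannot be recovered. A minimal sketch of the round trip, assuming only the standard tokenizer API (decode and the *_token_id attributes):

# Decode the ids back to text; [UNK] is lossy, the original 是 is gone
print(tokenizer.decode(token_ids))
# expected: [CLS] your input string here 我 [UNK] 中 文 [SEP]

# Drop [CLS]/[SEP] for a cleaner round trip
print(tokenizer.decode(token_ids, skip_special_tokens=True))

# The special-token ids seen above: [UNK]=100, [CLS]=101, [SEP]=102
print(tokenizer.unk_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id)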