from tensorflow.keras.preprocessing.text import Tokenizer
import joblib
def dm01_onehot_gen():
    # Fit a Keras Tokenizer on a small vocabulary of singer names.
    vocabs = ["周杰伦", "陈奕迅", "王力宏", "李宗盛", "吴亦凡", "鹿晗"]
    mytokenizer = Tokenizer()
    mytokenizer.fit_on_texts(vocabs)
    print('mytokenizer.index_word-->', mytokenizer.index_word)
    print('mytokenizer.word_index-->', mytokenizer.word_index)
    # One-hot encode each word: a zero vector with a 1 at the word's slot.
    for vocab in vocabs:
        zero_list = [0] * len(mytokenizer.index_word)
        idx = mytokenizer.word_index[vocab] - 1  # word_index is 1-based
        zero_list[idx] = 1
        print(vocab, 'one-hot encoding:', zero_list)
    # Persist the fitted tokenizer for later reuse.
    joblib.dump(mytokenizer, './mytokenizer2')
    print('tokenizer saved ok')
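
# A minimal sketch, not part of the original demo: the Keras Tokenizer
# accepts an oov_token parameter, which reserves index 1 for a shared
# out-of-vocabulary marker. With it, an unseen word such as '狗蛋' maps to
# '<UNK>' instead of raising the KeyError demonstrated in dm_onehot_use()
# below. The function name dm_onehot_oov_sketch is hypothetical.
def dm_onehot_oov_sketch():
    vocabs = ["周杰伦", "陈奕迅", "王力宏", "李宗盛", "吴亦凡", "鹿晗"]
    mytokenizer = Tokenizer(oov_token='<UNK>')  # '<UNK>' gets index 1
    mytokenizer.fit_on_texts(vocabs)
    # texts_to_sequences maps any word outside the fitted vocabulary to
    # the oov index rather than failing.
    seq = mytokenizer.texts_to_sequences(['狗蛋'])
    idx = seq[0][0]
    print('狗蛋 maps to index', idx, '->', mytokenizer.index_word[idx])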
def dm_onehot_use():
    # Load the fitted tokenizer saved by dm01_onehot_gen().
    mytokenizer = joblib.load('./mytokenizer2')
    vocab = '李宗盛'
    idx = mytokenizer.word_index[vocab] - 1
    zero_list = [0] * len(mytokenizer.index_word)
    zero_list[idx] = 1
    print(vocab, 'one-hot encoding:', zero_list)
    # A word that was never fit has no entry in word_index, so plain
    # one-hot encoding cannot represent it: the lookup raises KeyError.
    print('狗蛋 start ...')
    vocab = '狗蛋'
    try:
        idx = mytokenizer.word_index[vocab] - 1
        zero_list = [0] * len(mytokenizer.index_word)
        zero_list[idx] = 1
        print(vocab, 'one-hot encoding:', zero_list)
    except KeyError:
        print(vocab, 'is not in the vocabulary, so it has no one-hot encoding')


if __name__ == '__main__':
    dm01_onehot_gen()  # generate and save the tokenizer first
    dm_onehot_use()
    print('one-hot encoding End')