import string

# Symbols to strip from word boundaries; the text also contains the
# full-width quote characters “ and ” in addition to ASCII punctuation.
remove_symbol = string.punctuation + string.whitespace + '“”'


def read_file(file_name):
    """Build a word-frequency histogram from a Project Gutenberg text.

    :param file_name: path to the text file
    :return: dict mapping lowercase word -> occurrence count
    """
    hist = dict()
    # 'with' guarantees the file is closed even if an error occurs.
    with open(file_name, encoding='utf8') as file_data:
        # Skip the Gutenberg header; file_data now starts at the body.
        skip_header(file_data)
        for line in file_data:
            # Stop at the Gutenberg footer.
            if line.startswith('*** END OF THIS PROJECT'):
                break
            # Many words are joined with '-'; treat '-' as a separator.
            words_line = line.replace('-', ' ')
            for word in words_line.split():
                # Strip symbols from the ends only; interior ones stay.
                word = word.strip(remove_symbol)
                lower_word = word.lower()
                hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist


def skip_header(file_obj):
    """Consume lines up to and including the '*** START OF THIS PROJECT' marker.

    :param file_obj: an open file object (advanced in place)
    :return: None
    """
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break


def make_words_dict(file_name):
    """Read a word list (one word per line) into a dict for O(1) lookup.

    :param file_name: path to the word-list file
    :return: dict whose keys are the words (values are empty strings)
    """
    words_dict = dict()
    with open(file_name, encoding='utf8') as fin:
        for line in fin:
            # strip() removes the trailing newline.
            word = line.strip()
            words_dict[word] = ''
    return words_dict


def difference():
    """Report how many book words are / are not in the word list."""
    res_hist = read_file('emma.txt')
    res_words_dict = make_words_dict('words.txt')
    # Words that appear in the book but not in the word list.
    not_exist_words_list = [w for w in res_hist if w not in res_words_dict]
    print('存在的单词有', len(res_hist) - len(not_exist_words_list), '个.')
    print('不存在的单词有', len(not_exist_words_list), '个.')


if __name__ == '__main__':
    # Guard so importing this module does not trigger file I/O.
    difference()
>>> t = ['a', 'a', 'b']
>>> hist = histogram(t)
>>> hist
{'a': 2, 'b': 1}
你的函数应该以 2/3 的概率返回 'a', 以 1/3 的概率返回 'b'.
import random


def choose_from_hist(hist_dict):
    """Pick a key at random, weighted by its frequency.

    :param hist_dict: histogram mapping value -> occurrence count
    :return: one key, chosen with probability count / total
    """
    # Expand the histogram into a list where each key is repeated
    # `count` times, so a uniform choice becomes frequency-weighted.
    frequency_list = [key for key, val in hist_dict.items() for _ in range(val)]
    return random.choice(frequency_list)


if __name__ == '__main__':
    # Demo: 'a' should come back about twice as often as 'b'.
    # (Hoisted the dict out of the loop; the original rebuilt it
    # every iteration.)
    histogram_dict = {'a': 2, 'b': 1}
    for _ in range(3):
        print(choose_from_hist(histogram_dict))
# Report the overall and distinct word counts for the histogram.
word_total = total_words(hist)
distinct_total = different_words(hist)
print('Total number of words:', word_total)
print('Number of different words', distinct_total)
以及结果:
Total number of words: 164120
Number of different words 8904
import string


def process_file(filename):
    """Build a word-frequency histogram for every line of *filename*.

    :param filename: path to a UTF-8 text file
    :return: dict mapping lowercase word -> occurrence count
    """
    hist = dict()
    # 'with' guarantees the file handle is released.
    with open(filename, encoding='utf8') as fp:
        for line in fp:
            process_line(line, hist)
    return hist


def process_line(line, hist):
    """Split *line* into words and fold them into *hist* in place.

    :param line: one line of text
    :param hist: histogram to update (mutated)
    :return: None
    """
    # Hyphenated compounds count as separate words.
    line = line.replace('-', ' ')
    for word in line.split():
        # Strip punctuation/whitespace from the ends only.
        word = word.strip(string.punctuation + string.whitespace)
        word = word.lower()
        hist[word] = hist.get(word, 0) + 1


def total_words(hist):
    """Return the total number of word occurrences."""
    return sum(hist.values())


def different_words(hist):
    """Return the number of distinct words."""
    return len(hist)


if __name__ == '__main__':
    # Guard so importing this module does not read emma.txt.
    hist = process_file('emma.txt')
    # Total word count.
    print('Total number of words:', total_words(hist))
    # Number of distinct words.
    print('Number of different words', different_words(hist))
# Full version: histogram plus a most-common-words report.
import string

# Symbols to strip from word boundaries; the text also contains the
# full-width quote characters “ and ”.
remove_symbol = string.punctuation + string.whitespace + '“”'


def read_file(file_name):
    """Build a word-frequency histogram from a Project Gutenberg text.

    :param file_name: path to the text file
    :return: dict mapping lowercase word -> occurrence count
    """
    hist = dict()
    # 'with' guarantees the file is closed even on error.
    with open(file_name, encoding='utf8') as file_data:
        # Skip the Gutenberg header; file_data now starts at the body.
        skip_header(file_data)
        for line in file_data:
            # Stop at the Gutenberg footer.
            if line.startswith('*** END OF THIS PROJECT'):
                break
            # Treat '-' as a word separator.
            words_line = line.replace('-', ' ')
            for word in words_line.split():
                # Strip symbols from the ends only.
                word = word.strip(remove_symbol)
                lower_word = word.lower()
                hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist


def skip_header(file_obj):
    """Consume lines up to and including the '*** START OF THIS PROJECT' marker.

    :param file_obj: an open file object (advanced in place)
    :return: None
    """
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break


def most_common(hist):
    """Return (frequency, word) pairs sorted from most to least common.

    :param hist: histogram mapping word -> count
    :return: list of (count, word) tuples, descending by count
    """
    return sorted(((value, key) for key, value in hist.items()), reverse=True)


if __name__ == '__main__':
    # Guard so importing this module does not read emma.txt.
    hist = read_file('emma.txt')
    t = most_common(hist)
    print('The most common word are:')
    for freq, word in t[:10]:
        print(word, freq, sep='\t')
# Simplified version. Note: the Gutenberg header and footer must be
# stripped before counting.
import string

# Symbols to strip from word boundaries; the text also contains the
# full-width quote characters “ and ”.
remove_symbol = string.punctuation + string.whitespace + '“”'


def read_file(file_name):
    """Build a word-frequency histogram from a Project Gutenberg text.

    :param file_name: path to the text file
    :return: dict mapping lowercase word -> occurrence count
    """
    hist = dict()
    # 'with' guarantees the file is closed even on error.
    with open(file_name, encoding='utf8') as file_data:
        # Skip the Gutenberg header; file_data now starts at the body.
        skip_header(file_data)
        for line in file_data:
            # Stop at the Gutenberg footer.
            if line.startswith('*** END OF THIS PROJECT'):
                break
            # Treat '-' as a word separator.
            words_line = line.replace('-', ' ')
            for word in words_line.split():
                # Strip symbols from the ends only.
                word = word.strip(remove_symbol)
                lower_word = word.lower()
                hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist


def skip_header(file_obj):
    """Consume lines up to and including the '*** START OF THIS PROJECT' marker."""
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break


if __name__ == '__main__':
    # Guard so importing this module does not read emma.txt.
    hist = read_file('emma.txt')
    # Sort the (word, count) pairs by count, most frequent first.
    hist_list = sorted(hist.items(), key=lambda x: x[1], reverse=True)
    print('The most common word are:')
    # BUG FIX: the items are (word, count) pairs; the original unpacked
    # them as (freq, word) and therefore printed count and word swapped.
    for word, freq in hist_list[:10]:
        print(word, freq, sep='\t')
# 完整代码import string# 移除符号, 文本中有中文符号“与”
remove_symbol = string.punctuation + string.whitespace +'“”'# 读取文件defread_file(file_name):hist =dict()file_data =open(file_name, encoding='utf8')# 跳过开头部分skip_header(file_data)# 现在的file_data以及没有了开头部分.for line in file_data:# 跳过结尾部分if line.startswith('*** END OF THIS PROJECT'):break# 单词很多使用'-'拼在一起, 将'-'替换为空格words_line = line.replace('-',' ')# 字符串按空格切分words_list = words_line.split()for word in words_list:# 处理单词前后的符号, 中间的不要管word = word.strip(remove_symbol)# 转为全小写lower_word = word.lower()# 统计频率hist:dicthist[lower_word]= hist.get(lower_word,0)+1return histdefskip_header(file_obj):for line in file_obj:if line.startswith('*** START OF THIS PROJECT'):breakdefmost_common(hist):t =[]for key, value in hist.items():t.append((value, key))t.sort(reverse=True)return tdefprint_most_common(hist, num=10):t = most_common(hist)print('The most common word are:')for freq, word in t[:num]:print(word, freq, sep='\t')hist = read_file('emma.txt')
print_most_common(hist)# print_most_common(hist, 20)
# Histogram the valid-word list (file 'words', one word per line).
words = process_file('words')
# Words that appear in the book's histogram but not in the word list.
diff = subtract(hist, words)
print("Words in the book that aren't in the word list:")
for word in diff:
    # Print all unknown words on one space-separated line.
    print(word, end=' ')
# 我的结果
Words in the book that aren't in the word list: emma austen i woodhouse a sister's remembrance taylor mr woodhouse's taylor's ...
# Full version. One difference: process_file takes a flag controlling
# whether the Gutenberg header/footer should be skipped.
import string

# Symbols to strip from word boundaries; the text also contains the
# full-width quote characters “ and ”.
remove_symbol = string.punctuation + string.whitespace + '“”'


def process_file(file_name, is_skip=True):
    """Build a word-frequency histogram from *file_name*.

    :param file_name: path to the text file
    :param is_skip: when True, skip the Project Gutenberg header
    :return: dict mapping lowercase word -> occurrence count
    """
    hist = dict()
    # 'with' guarantees the file is closed even on error.
    with open(file_name, encoding='utf8') as file_data:
        if is_skip:
            # Skip the Gutenberg header; file_data now starts at the body.
            skip_header(file_data)
        for line in file_data:
            # Stop at the Gutenberg footer.
            if line.startswith('*** END OF THIS PROJECT'):
                break
            # Treat '-' as a word separator.
            words_line = line.replace('-', ' ')
            for word in words_line.split():
                # Strip symbols from the ends only.
                word = word.strip(remove_symbol)
                lower_word = word.lower()
                hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist


def skip_header(file_obj):
    """Consume lines up to and including the '*** START OF THIS PROJECT' marker."""
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break


def subtract(d1, d2):
    """Dictionary subtraction: keys of *d1* that do not appear in *d2*.

    :param d1: dict to subtract from
    :param d2: dict whose keys are removed
    :return: dict with the surviving keys (all values are None)
    """
    return {key: None for key in d1 if key not in d2}


if __name__ == '__main__':
    # The book: skip the Gutenberg header/footer.
    hist = process_file('emma.txt')
    # The word list has no header.
    words = process_file('words.txt', False)
    diff = subtract(hist, words)
    print("Words in the book that aren't in the word list:")
    for word in diff:
        print(word, end=' ')
# Full version using set subtraction. One difference: process_file takes
# a flag controlling whether the Gutenberg header/footer is skipped.
import string

# Symbols to strip from word boundaries; the text also contains the
# full-width quote characters “ and ”.
remove_symbol = string.punctuation + string.whitespace + '“”'


def process_file(file_name, is_skip=True):
    """Build a word-frequency histogram from *file_name*.

    :param file_name: path to the text file
    :param is_skip: when True, skip the Project Gutenberg header
    :return: dict mapping lowercase word -> occurrence count
    """
    hist = dict()
    # 'with' guarantees the file is closed even on error.
    with open(file_name, encoding='utf8') as file_data:
        if is_skip:
            # Skip the Gutenberg header; file_data now starts at the body.
            skip_header(file_data)
        for line in file_data:
            # Stop at the Gutenberg footer.
            if line.startswith('*** END OF THIS PROJECT'):
                break
            # Treat '-' as a word separator.
            words_line = line.replace('-', ' ')
            for word in words_line.split():
                # Strip symbols from the ends only.
                word = word.strip(remove_symbol)
                lower_word = word.lower()
                hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist


def skip_header(file_obj):
    """Consume lines up to and including the '*** START OF THIS PROJECT' marker."""
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break


def subtract(d1, d2):
    """Set subtraction: keys in *d1* that are not in *d2*.

    Note: sets are unordered, so iteration order varies between runs.

    :return: set of keys present in d1 but absent from d2
    """
    return set(d1) - set(d2)


if __name__ == '__main__':
    # The book: skip the Gutenberg header/footer.
    hist = process_file('emma.txt')
    # The word list has no header.
    words = process_file('words.txt', False)
    diff = subtract(hist, words)
    print("Words in the book that aren't in the word list:")
    for word in diff:
        print(word, end=' ')
马尔可夫分析:
1. 编写一个程序从文件中读入文本, 并进行马尔可夫分析. 结果应该是一个字典, 将前缀映射到可能后缀的集合. 集合可以是列表, 元组或者字典, 由你来做出合适的选择. 你可以使用前缀长度 2 来测试程序, 但编写程序时应当考虑可以方便地改为其他前缀长度.
2. 在前面编写的程序中添加一个函数, 基于马尔可夫分析的结果随机生成文本. 下面是一个从《爱玛》中使用前缀长度 2 生成的例子:
He was very clever, be it sweetness or be angry, ashamed or only amused, at such a stroke. She had never thought of Hannah till you were never meant for me?" "I cannot make speeches, Emma:" he soon cut it all himself.
对这个例子, 我保留了每个单词后面的标点. 结果几乎是语法正确的, 但也不完全正确. 语义上, 它看起来像是有意义的, 但也不完全是. 当增加前缀长度时, 结果会怎么样? 随机生成的文本会不会看起来更有意义?
3. 一旦你的程序可以正常运行后, 可以考虑尝试一下混搭: 如果对两本或更多本书进行组合, 则生成的随机文本会以一种有趣的方式混合各书中的词汇和短语.
致谢: 本案例分析基于 Kernighan 和 Pike 的 The Practice of Programming (Addison-Wesley, 1999) 一书中的一个实例.
你应当在继续阅读前尝试这个练习, 接着可从下面的地址下载我的解答:
https://raw.githubusercontent.com/AllenDowney/ThinkPython2/master/code/markov.py
你也需要:
https://raw.githubusercontent.com/AllenDowney/ThinkPython2/master/code/emma.txt
public class NineNineMulTable{public static void main(String[] args){for(int i 1; i < 9; i ){for(int j 1; j < i; j ){System.out.print(j " * " i " " i * j "\t");//再次先输出j在输出i是打印出来是1*2,2*2}S…