函数介绍:
本次实验利用了上一次实验构建成功的倒排表
send为评测提交函数
and_search为AND逻辑布尔查询
and_search_sort增添了相关度排序的功能
mul_search函数接受不同参数来确定查询的逻辑
test支持用户手动输入测试
代码实现:
bool_search.py
import json
import requests
def send():
    """Submit the result file ``xxx.txt`` to the evaluation endpoint.

    The file is uploaded as the multipart form field ``file`` with an HTTP
    POST, and the text of the server's response is printed.
    """
    # Use a context manager so the file handle is released even when the
    # request raises (the original never closed it).
    with open(r'xxx.txt', "rb") as f:
        files = {'file': f}
        r = requests.post(url="http://121.37.1.35:5001/detectfile", files=files)
    print(r.text)
def and_search(query_str, inverted_index):
    """Return the IDs of the documents that contain every query term.

    Args:
        query_str: Query terms separated by whitespace.
        inverted_index: Mapping ``term -> {'df': ..., 'tf_list': [[doc_id, tf], ...]}``.

    Returns:
        A list of matching document IDs in no particular order. A term that
        is absent from the index matches no document, so the result is
        empty; an empty query also yields an empty list.
    """
    result_docs = None
    for word in query_str.split():
        # A term missing from the index has an empty posting list instead of
        # raising KeyError.
        postings = inverted_index.get(word, {}).get('tf_list', [])
        word_docs = {doc_id for doc_id, _ in postings}
        if result_docs is None:
            # First term: seed the result set.
            result_docs = word_docs
        else:
            # Keep only documents that also contain this term.
            result_docs &= word_docs
        # Nothing can match any more, so stop early.
        if not result_docs:
            break
    # result_docs is still None when the query contained no terms.
    return list(result_docs) if result_docs else []


def read_index(file_path='inverted_index.json'):
    """Load the inverted index from a JSON-lines file.

    Every non-blank line is a JSON object mapping terms to their posting
    records, for example::

        {"term1": {"df": 2, "tf_list": [[1, 2], [2, 1]]}}
        {"term2": {"df": 1, "tf_list": [[1, 1]]}}

    Args:
        file_path: Path of the index file (UTF-8). Defaults to
            ``inverted_index.json`` in the current working directory.

    Returns:
        A dict ``term -> {'df': ..., 'tf_list': [...]}``.
    """
    inverted_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines would make json.loads raise; skip them.
            if not line:
                continue
            record = json.loads(line)
            if record:
                # Merge every term of the record; popitem() kept only one
                # pair and silently dropped the rest.
                inverted_index.update(record)
    return inverted_index


def and_search_sort(query_str, inverted_index):
    """AND query whose results are ranked by summed term frequency.

    Args:
        query_str: Query terms separated by whitespace.
        inverted_index: Mapping ``term -> {'df': ..., 'tf_list': [[doc_id, tf], ...]}``.

    Returns:
        IDs of the documents containing every query term, ordered by the sum
        of the terms' frequencies in the document, highest first. Empty when
        any term is unknown or the query has no terms.
    """
    # None means "no term processed yet". An empty dict means "no document
    # matches", and must not be re-seeded by a later term.
    doc_frequencies = None
    for word in query_str.split():
        t_freq = {}
        for posting in inverted_index.get(word, {}).get('tf_list', []):
            if posting:
                t_freq[posting[0]] = posting[1]
        if doc_frequencies is None:
            doc_frequencies = t_freq
        else:
            # Intersect with the documents seen so far and add up the
            # frequencies.
            doc_frequencies = {
                doc_id: total + t_freq[doc_id]
                for doc_id, total in doc_frequencies.items()
                if doc_id in t_freq
            }
        if not doc_frequencies:
            break
    if not doc_frequencies:
        return []
    # Rank by total term frequency, highest first.
    ranked = sorted(doc_frequencies.items(), key=lambda item: item[1], reverse=True)
    return [doc_id for doc_id, _ in ranked]


def read_query(query_path='query-2024.txt', output_path='xxx.txt',
               index_path='inverted_index.json'):
    """Run every query of a query file and write the ranked results.

    Each line of the query file (GBK encoded) is one query. The output file
    (UTF-8) receives one line per query holding the matching document IDs
    separated by tabs; a query without matches produces an empty line.

    Args:
        query_path: File with one query per line.
        output_path: File the results are written to.
        index_path: Inverted index file passed to ``read_index``.
    """
    index = read_index(index_path)
    res = []
    with open(query_path, 'r', encoding='gbk') as query_file:
        for line in query_file:
            search_results = and_search_sort(query_str=line, inverted_index=index)
            res.append('\t'.join(str(doc_id) for doc_id in search_results))
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(row + '\n' for row in res)
#read_query()
#send()
增加OR逻辑与NOT逻辑:
import bool_search
all_docs=set(range(1,44973))
def mul_search(query_words,inverted_index,condition):result_docs=Nonefor word in query_words.split():word_docs = set(doc_id for doc_id, _ in inverted_index[word]['tf_list'])if result_docs is None:result_docs = word_docselse:if condition == "AND":result_docs &= word_docselse:result_docs |= word_docs# 可以添加NOT操作的处理if condition=="AND" or condition=="OR":return list(result_docs)else:return all_docs-result_docsdef test():inverted_index = bool_search.read_index()condition=input("输入查询类型:")query_words=input()res=mul_search(query_words,inverted_index,condition)print(res)return
# Start the interactive query prompt; this runs whenever the module is loaded.
test()
优化:
测试时,应先读入倒排表,再读取用户的输入。倒排表的读取耗时较长,若先读取输入再加载倒排表,用户输入后需要等待较长时间才能看到结果,影响用户体验。