目录
问题描述:
问题解决:
问题描述:
原始的标注的三元组格式如下:
需要转换的格式如下:
tips:有两个小的难点:
1. 一个句子可能对应多个三元组(即后面跟着多行标注),需要把它们合并到同一行输出
2. 最后一个样本后面没有新的句子行来触发写出,循环结束后需要单独处理
问题解决:
from pdb import set_trace as stop
import os
from tqdm import trange
generated_path = "/public/home/hongy/qtxu/UniCOQE_20230812/data/tuple/ele/train_new_generated.txt"
Unicoqe_path = "/public/home/hongy/qtxu/UniCOQE_20230812/data/tuple/ele/train.txt"


def _parse_indices(field):
    """Return the integer token indices of one bracketed element, e.g. ``[3&a 4&b]`` -> ``[3, 4]``."""
    # Only the text before the first '&' is the index. This handles a token
    # that is itself '&' (e.g. "14&&") as well as a bare index with no word.
    return [int(pair.partition('&')[0]) for pair in field.strip()[1:-1].split()]


def _parse_label(label):
    """Return ``(sub_index, obj_index, asp_index)`` for one annotation line.

    The line looks like ``[[sub];[obj];[asp];...]``; only the first three
    ';'-separated elements are used.

    Raises:
        ValueError: if the line has fewer than three elements or an index
            is not an integer.
    """
    fields = label.strip()[1:-1].split(';')
    if len(fields) < 3:
        raise ValueError("expected at least 3 ';'-separated elements in %r" % label.strip())
    return tuple(_parse_indices(field) for field in fields[:3])


def _format_sample(sentence, labels):
    """Return the output line ``sentence####[(sub, obj, asp), ...]`` for one sample."""
    try:
        spans = [_parse_label(label) for label in labels]
    except ValueError as err:
        # Report the offending sentence instead of dropping into a debugger.
        raise ValueError("cannot parse labels of sentence %r: %s" % (sentence, err)) from err
    return sentence + "####" + '[' + ', '.join(str(span) for span in spans) + "]\n"


def convert_lines(lines):
    """Convert annotated lines into ``sentence####[...]`` lines, one per sample.

    A line with exactly two tab-separated parts starts a new sample (only the
    part before the tab is kept). Every following non-blank line is one
    annotated tuple of that sample. Annotation lines that appear before the
    first sentence are ignored.

    Args:
        lines: iterable of raw input lines (with or without trailing newline).

    Yields:
        One formatted output line (ending in a newline) per sample, including
        the last one. A sample without annotation lines yields ``sentence####[]``.

    Raises:
        ValueError: if an annotation line is malformed.
    """
    sentence = None
    labels = []
    for line in lines:
        if len(line.split('\t')) == 2:
            # A new sentence closes the previous sample.
            if sentence is not None:
                yield _format_sample(sentence, labels)
            sentence = line.strip().split("\t")[0]
            labels = []
        elif sentence is not None and line.strip():
            labels.append(line)
    # The last sample has no following sentence line to trigger its output.
    if sentence is not None:
        yield _format_sample(sentence, labels)


def main():
    """Read ``generated_path``, convert it and write the result to ``Unicoqe_path``."""
    with open(generated_path, 'r', encoding='utf-8') as f:
        raw_data = f.readlines()
    # Feed the lines through trange so the progress bar advances per input line.
    progress = (raw_data[line_id] for line_id in trange(len(raw_data), desc="procesing data ……"))
    with open(Unicoqe_path, 'w', encoding='utf-8') as fw:
        fw.writelines(convert_lines(progress))


if __name__ == "__main__":
    main()