# Use this script to filter the results produced by "lin_20240321_calculating_rG4score.R" (the preceding step).
import csv


def read_file(file_path):
    """Read a tab-separated file with a header row into a list of row dicts.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A list with one mapping per data row, keyed by the header names.
        All values are strings, exactly as produced by ``csv.DictReader``.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are not rewritten by universal-newline mode.
    with open(file_path, 'r', newline='') as file:
        return list(csv.DictReader(file, delimiter='\t'))


def _overlaps(first, second):
    """Return True when two records' Start/End ranges overlap.

    The comparison is strict, so ranges that merely share an endpoint value
    (one record's End equals the other's Start) do not count as overlapping.
    """
    return first['Start'] < second['End'] and first['End'] > second['Start']


def process_sequences(data):
    """Select, per gene, the highest-scoring mutually non-overlapping sequences.

    Args:
        data: Iterable of row mappings providing the keys 'Id', 'Type',
            'Start', 'End', 'total_length', 'Sequence' and 'G4Hscore'.
            'Start', 'End' and 'total_length' must be parseable by ``int``
            and 'G4Hscore' by ``float``.

    Returns:
        A dict mapping each gene id to a list of records with the keys
        'Type', 'Start', 'End', 'Length', 'Sequence' and 'Score'. Each list
        is ordered by descending score; records with equal scores keep their
        input order.

    Raises:
        KeyError: If a row lacks one of the required keys.
        ValueError: If a numeric field cannot be converted.
    """
    # Group the parsed records by gene id, preserving input order.
    gene_sequences = {}
    for row in data:
        record = {
            'Type': row['Type'],
            'Start': int(row['Start']),
            'End': int(row['End']),
            'Length': int(row['total_length']),
            'Sequence': row['Sequence'],
            'Score': float(row['G4Hscore']),
        }
        gene_sequences.setdefault(row['Id'], []).append(record)

    # Greedy selection: visit each gene's records from the highest score down
    # and keep a record only if it overlaps none of the records already kept.
    final_selection = {}
    for gene_id, sequences in gene_sequences.items():
        kept = []
        # sorted() is stable, so ties stay in input order.
        for seq in sorted(sequences, key=lambda rec: rec['Score'], reverse=True):
            if not any(_overlaps(seq, chosen) for chosen in kept):
                kept.append(seq)
        final_selection[gene_id] = kept
    return final_selection


def write_results(gene_sequences, output_file):
    """Write the selected sequences to a tab-separated file.

    The header row is 'Id', 'Type', 'Start', 'End', 'Total_length',
    'Sequence', 'Score'. Note that the last three header names differ from
    the input column names read by ``process_sequences`` ('total_length',
    'G4Hscore').

    Args:
        gene_sequences: Mapping of gene id to a list of records as returned
            by ``process_sequences``.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(['Id', 'Type', 'Start', 'End', 'Total_length', 'Sequence', 'Score'])
        for gene_id, sequences in gene_sequences.items():
            for seq in sequences:
                writer.writerow([
                    gene_id,
                    seq['Type'],
                    seq['Start'],
                    seq['End'],
                    seq['Length'],
                    seq['Sequence'],
                    seq['Score'],
                ])


# Input and output file paths
#usage:python lin_filter_non-overlap_rg4.py -f1 lijinonextended_3utr_allrg4output1.fasta -f2 lijinonextended_3utr_allrg4output2.fasta
import argparse
# Command-line interface: -f1 is the table to filter, -f2 is where the
# filtered table is written. Both are required so that a missing path is
# reported as a usage error instead of failing later inside open().
parser = argparse.ArgumentParser(
    description=(
        "Keep, for each gene Id, only the highest-scoring sequences that "
        "do not overlap one another."
    )
)
parser.add_argument(
    "-f1", "--file1", required=True,
    help="input TSV with columns Id, Type, Start, End, total_length, "
         "Sequence, G4Hscore",
)
parser.add_argument(
    "-f2", "--file2", required=True,
    help="output TSV path for the filtered sequences",
)
args = parser.parse_args()

# Read the input table.
data = read_file(args.file1)
# Keep the highest-scoring, mutually non-overlapping sequences of each gene.
gene_sequences = process_sequences(data)
# Write the selection to the output file.
write_results(gene_sequences, args.file2)