lin_20240204_extract_rg4.py
1.使用正则表达式从utr,cds区域提取rG4,输出结果包括序列名称,序列内容,初始位置,终止位置,序列类型;
2.生成的文件,用于下一步计算g4score:lin_20240321_calculating_rG4score.R
import redef read_fasta(file_path):"""读取FASTA文件并返回一个字典,键是序列名称,值是序列。"""sequences = {}with open(file_path, 'r') as file:sequence_name = Nonesequence_data = ''for line in file:line = line.strip()if line.startswith('>'):if sequence_name:sequences[sequence_name] = sequence_datasequence_data = ''sequence_name = line[1:]else:sequence_data += lineif sequence_name:sequences[sequence_name] = sequence_datareturn sequencesdef find_rG4_sequences(sequences):"""查找并返回一个包含所有非重叠rG4序列、类型和位置的列表。"""patterns = {'G2L1-2': r'G{2}[ATCG]{1,2}G{2}[ATCG]{1,2}G{2}[ATCG]{1,2}G{2}','G2L1-4': r'G{2}[ATCG]{1,4}G{2}[ATCG]{1,4}G{2}[ATCG]{1,4}G{2}','G2L1-9': r'G{2}[ATCG]{1,4}G{2}[ATCG]{1,4}G{2}[ATCG]{1,4}G{2}','G3': r'G{3}[ATCG]{1,15}G{3}[ATCG]{1,15}G{3}[ATCG]{1,15}G{3}','G3B1': r'G{3}[ATCG]{1,9}G{2}[ATC]G[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}','G3B2': r'G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{2}[ATC]G[ATCG]{1,9}G{3}','G3V1': r'G{2}[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}','G3V2': r'G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{3}[ATCG]{1,9}G{2}'}results = []for name, seq in sequences.items():for rG4_type, pattern in patterns.items():matches = re.finditer(pattern, seq)for match in matches:result = {'sequence_name': name,'rG4_type': rG4_type,'rG4_sequence': match.group(),'start': match.start(),'end': match.end()}results.append(result)return resultsdef save_results(results, output_file):header="Id\tSequence\n""""将rG4序列及其信息保存到文件中。"""with open(output_file, 'w') as file:file.write(header)for result in results:# 获取原始FASTA序列的长度original_sequence_length = len(sequences[result['sequence_name']])file.write(f"{result['sequence_name']}_{result['rG4_type']}_{result['start']}_{result['end']}_{original_sequence_length}\t{result['rG4_sequence']}\n")#! /usr/bin/env python
#usage: python hash-always.py -f1 1.txt -f2 2.txt > out.txt
import argparse
parser = argparse.ArgumentParser(description="Advanced screening always by hash")
parser.add_argument("-f1","--file1",help="input1")
parser.add_argument("-f2","--file2",help="input2")
args = parser.parse_args()# 读取FASTA文件
sequences = read_fasta(args.file1)
# 查找rG4序列
rG4_results = find_rG4_sequences(sequences)
# 保存结果
save_results(rG4_results, args.file2)
# print(f"rG4 sequences have been successfully saved to {args.file2}")