1. 输入为基因组 FASTA 文件和 GFF3 注释文件,输出为 5'UTR 和 3'UTR 序列文件。输出已同时考虑正负链和可变剪接:如果某个 UTR 因可变剪接由多个片段组成,输出文件中的序列已按所在链的方向拼接完成。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# python lin_extract_5utr_cds_3utr2.py iwgsc_refseqv2.1_annotation_200916_HC_LC.gff3 ../iwgsc_refseqv2.1_assembly.fa output_5utr.fasta2 output_3utr.fasta2
import sys
from Bio import SeqIO
from collections import defaultdictdef extract_and_concatenate_utrs(gff_file, genome_file, output_5utr, output_3utr):genome = SeqIO.to_dict(SeqIO.parse(genome_file, "fasta"))utrs_5 = defaultdict(lambda: {'+': [], '-': []})utrs_3 = defaultdict(lambda: {'+': [], '-': []})with open(gff_file, 'r') as gff: