生信序列基本操作算法
建议在Jupyter实践,python版本3.9
1. 定义DNA序列
seq = 'ACGT'# 从序列指定索引的碱基
seq[1]
# 'C'# 序列长度
len(seq)
# 4
2. 序列拼接
# 序列拼接 - 字符串
seq1 = 'AACC'
seq2 = 'GGTT'
print(seq1 + seq2)
# AACCGGTT# 序列拼接 - 列表
seqs = ['A', 'C', 'G', 'T']
print(''.join(seqs))
3. 序列生成
# 随机产生ATCG碱基
import random
random.choice('ACGT')# 生成DNA序列
import random
seq = ''
for _ in range(100):seq += random.choice('ACGT')
print(seq)
# GTGTCGACACCCGAGTGTTCACAAGTGGTCTTTCGGCACTTTGCTCTGAGTGGCGCGCTTAGCTATAAGGGCAAGTATCCTAGTCATATCCCGAAGGCGT# 生成DNA序列方法2
seq = ''.join([random.choice('ACGT') for _ in range(10)])
print(seq)
# ATGGATGTCT
4. 序列截取
seq = "ATGGATGTCT"
seq[1:3]
# 'AT'seq[:3]
#'GAT'seq[7:]
# 'GAC'seq[-3:]
# 'GAC'
5. 序列比较
# 获取序列最长相同前缀
def longestCommonPrefix(s1, s2):i = 0while i < len(s1) and i < len(s2) and s1[i] == s2[i]:i += 1return s1[:i]
longestCommonPrefix('ACCATTG', 'ACCAAGTC')
# 'ACCA'# 比较序列是否相同
def match(s1, s2):if not len(s1) == len(s2):return Falsefor i in range(0, len(s1)):if not s1[i] == s2[i]:return Falsereturn True
match('ACCATTG', 'ACCATTG')
# False# 序列字符串比较
'ACCATTG' == 'ACCATTG'
True
5. 获取互补序列
def Complement(seq):complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}t = ''for base in seq:t = t + complement[base]return t
Complement('ACCATTG')
# 'TGGTAAC'
6. 获取反向互补序列
def reverseComplement(seq):complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}t = ''for base in seq:t = complement[base] + treturn t
reverseComplement('ACCATTG')
# 'CAATGGT'
7. 读取基因组fasta序列文件
def readGenome(filename):genome = ''with open(filename, 'r') as f:for line in f:# 忽略文件头信息if not line[0] == '>':genome += line.rstrip()return genome
genome = readGenome('lambda_virus.fa')len(genome)
# 48502genome[:100]
# 'GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACC'
7. 计算基因组fasta文件ATCG碱基的数量
# 循环遍历方法
counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
for base in genome:counts[base] += 1
print(counts)
# {'A': 12334, 'C': 11362, 'G': 12820, 'T': 11986}# 调用库
import collections
collections.Counter(genome)
# Counter({'G': 12820, 'A': 12334, 'T': 11986, 'C': 11362})