hhblits
是 HMM-HMM(Hidden Markov Model to Hidden Markov Model)比对方法的一部分,也是 HMMER 软件套件中的工具之一。与 hhsearch
类似,hhblits
也用于进行高效的蛋白质序列比对,特别擅长于检测远缘同源性。
hh-suite 源码 https://github.com/soedinglab/hh-suite
以下是 hhblits
的一些主要特点和用途:
-
HMM-HMM比对:
hhblits
使用隐藏马尔可夫模型(HMM)来表示蛋白质家族的模型,从而能够更全面地考虑多序列信息。这使得它相对于传统的序列-序列比对方法更能捕捉蛋白质家族的模式和结构。 -
迭代比对:
hhblits
支持迭代比对的策略,通过在每一轮中更新模型,逐渐提高比对的灵敏度。这种策略对于发现相对远缘同源蛋白质非常有用。 -
远缘同源性检测: 与其他 HMM-HMM比对方法类似,
hhblits
被设计用于检测远缘同源蛋白质,即那些在序列上相对较远但在结构和功能上相似的蛋白质。 -
数据库搜索: 用户可以使用
hhblits
在大型蛋白质数据库中搜索与给定蛋白质序列相似的蛋白质。 -
灵敏度和特异性:
hhblits
的设计目标是在保持高灵敏度的同时,降低假阳性的比对,以提高比对的特异性。
import glob
import os
import subprocess
from typing import Any, List, Mapping, Optional, Sequencefrom absl import logging
import contextlib
import shutil
import tempfile
import time_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):"""Context manager that deletes a temporary directory on exit."""tmpdir = tempfile.mkdtemp(dir=base_dir)try:yield tmpdirfinally:shutil.rmtree(tmpdir, ignore_errors=True)@contextlib.contextmanager
def timing(msg: str):logging.info('Started %s', msg)tic = time.time()yieldtoc = time.time()logging.info('Finished %s in %.3f seconds', msg, toc - tic)class HHBlits:"""Python wrapper of the HHblits binary."""def __init__(self,*,binary_path: str,databases: Sequence[str],n_cpu: int = 4,n_iter: int = 3,e_value: float = 0.001,maxseq: int = 1_000_000,realign_max: int = 100_000,maxfilt: int = 100_000,min_prefilter_hits: int = 1000,all_seqs: bool = False,alt: Optional[int] = None,p: int = _HHBLITS_DEFAULT_P,z: int = _HHBLITS_DEFAULT_Z):"""Initializes the Python HHblits wrapper.Args:binary_path: The path to the HHblits executable.databases: A sequence of HHblits database paths. This should be thecommon prefix for the database files (i.e. up to but not including_hhm.ffindex etc.)n_cpu: The number of CPUs to give HHblits.n_iter: The number of HHblits iterations.e_value: The E-value, see HHblits docs for more details.maxseq: The maximum number of rows in an input alignment. Note that thisparameter is only supported in HHBlits version 3.1 and higher.realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.maxfilt: Max number of hits allowed to pass the 2nd prefilter.HHblits default: 20000.min_prefilter_hits: Min number of hits to pass prefilter.HHblits default: 100.all_seqs: Return all sequences in the MSA / Do not filter the result MSA.HHblits default: False.alt: Show up to this many alternative alignments.p: Minimum Prob for a hit to be included in the output hhr file.HHblits default: 20.z: Hard cap on number of hits reported in the hhr file.HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.Raises:RuntimeError: If HHblits binary not found within the path."""self.binary_path = binary_pathself.databases = databasesself.n_cpu = n_cpuself.n_iter = n_iterself.e_value = e_valueself.maxseq = maxseqself.realign_max = realign_maxself.maxfilt = maxfiltself.min_prefilter_hits = min_prefilter_hitsself.all_seqs = all_seqsself.alt = altself.p = pself.z = zdef query(self, input_fasta_path: str) -> List[Mapping[str, Any]]:"""Queries the database using HHblits."""with tmpdir_manager() as query_tmp_dir:a3m_path = os.path.join(query_tmp_dir, 'output.a3m')db_cmd = []for db_path in self.databases:db_cmd.append('-d')db_cmd.append(db_path)cmd = [self.binary_path,'-i', input_fasta_path,'-cpu', str(self.n_cpu),'-oa3m', a3m_path,'-o', '/dev/null','-n', str(self.n_iter),'-e', str(self.e_value),'-maxseq', str(self.maxseq),'-realign_max', str(self.realign_max),'-maxfilt', str(self.maxfilt),'-min_prefilter_hits', str(self.min_prefilter_hits)]if self.all_seqs:cmd += ['-all']if self.alt:cmd += ['-alt', str(self.alt)]if self.p != _HHBLITS_DEFAULT_P:cmd += ['-p', str(self.p)]if self.z != _HHBLITS_DEFAULT_Z:cmd += ['-Z', str(self.z)]cmd += db_cmdprint("cmd:",' '.join(cmd))logging.info('Launching subprocess "%s"', ' '.join(cmd))process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)with timing('HHblits query'):stdout, stderr = process.communicate()retcode = process.wait()if retcode:# Logs have a 15k character limit, so log HHblits error line by line.logging.error('HHblits failed. HHblits stderr begin:')for error_line in stderr.decode('utf-8').splitlines():if error_line.strip():logging.error(error_line.strip())logging.error('HHblits stderr end')raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (stdout.decode('utf-8'), stderr[:500_000].decode('utf-8')))with open(a3m_path) as f:a3m = f.read()raw_output = dict(a3m=a3m,output=stdout,stderr=stderr,n_iter=self.n_iter,e_value=self.e_value)return [raw_output]if __name__ == "__main__":hhblits_binary_path = "/home/zheng/software/hhsuite-3.3.0-SSE2-Linux/bin/hhblits"input_fasta_path = "/home/zheng/test/Q94K49.fasta"#hhsuite数据库下载地址:https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/ scop70_database_path = "/home/zheng/database/scop70_1.75_hhsuite3/scop70_1.75"pdb70_database_path = "/home/zheng/database/pdb70_from_mmcif_latest/pdb70"## 单一数据库#hhblits_runner = HHBlits(binary_path = hhblits_binary_path, # databases = [scop70_database_path])## 多个数据库database_lst = [scop70_database_path, pdb70_database_path]hhblits_runner = HHBlits(binary_path = hhblits_binary_path,databases = database_lst) result = hhblits_runner.query(input_fasta_path)print(len(result))print(result[0]['a3m'])#print(result)