python运行hhblits二进制命令的包装器类

hhblits 是 HMM-HMM（Hidden Markov Model to Hidden Markov Model）比对方法的一部分，也是 HMMER 软件套件中的工具之一。与 hhsearch 类似，hhblits 也用于进行高效的蛋白质序列比对，特别擅长于检测远缘同源性。

hh-suite 源码 https://github.com/soedinglab/hh-suite

以下是 hhblits 的一些主要特点和用途：

HMM-HMM比对： hhblits 使用隐藏马尔可夫模型（HMM）来表示蛋白质家族的模型，从而能够更全面地考虑多序列信息。这使得它相对于传统的序列-序列比对方法更能捕捉蛋白质家族的模式和结构。
迭代比对： hhblits 支持迭代比对的策略，通过在每一轮中更新模型，逐渐提高比对的灵敏度。这种策略对于发现相对远缘同源蛋白质非常有用。
远缘同源性检测： 与其他 HMM-HMM比对方法类似，hhblits 被设计用于检测远缘同源蛋白质，即那些在序列上相对较远但在结构和功能上相似的蛋白质。
数据库搜索： 用户可以使用 hhblits 在大型蛋白质数据库中搜索与给定蛋白质序列相似的蛋白质。
灵敏度和特异性： hhblits 的设计目标是在保持高灵敏度的同时，降低假阳性的比对，以提高比对的特异性。

import glob
import os
import subprocess
from typing import Any, List, Mapping, Optional, Sequencefrom absl import logging
import contextlib
import shutil
import tempfile
import time_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):"""Context manager that deletes a temporary directory on exit."""tmpdir = tempfile.mkdtemp(dir=base_dir)try:yield tmpdirfinally:shutil.rmtree(tmpdir, ignore_errors=True)@contextlib.contextmanager
def timing(msg: str):logging.info('Started %s', msg)tic = time.time()yieldtoc = time.time()logging.info('Finished %s in %.3f seconds', msg, toc - tic)class HHBlits:"""Python wrapper of the HHblits binary."""def __init__(self,*,binary_path: str,databases: Sequence[str],n_cpu: int = 4,n_iter: int = 3,e_value: float = 0.001,maxseq: int = 1_000_000,realign_max: int = 100_000,maxfilt: int = 100_000,min_prefilter_hits: int = 1000,all_seqs: bool = False,alt: Optional[int] = None,p: int = _HHBLITS_DEFAULT_P,z: int = _HHBLITS_DEFAULT_Z):"""Initializes the Python HHblits wrapper.Args:binary_path: The path to the HHblits executable.databases: A sequence of HHblits database paths. This should be thecommon prefix for the database files (i.e. up to but not including_hhm.ffindex etc.)n_cpu: The number of CPUs to give HHblits.n_iter: The number of HHblits iterations.e_value: The E-value, see HHblits docs for more details.maxseq: The maximum number of rows in an input alignment. Note that thisparameter is only supported in HHBlits version 3.1 and higher.realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.maxfilt: Max number of hits allowed to pass the 2nd prefilter.HHblits default: 20000.min_prefilter_hits: Min number of hits to pass prefilter.HHblits default: 100.all_seqs: Return all sequences in the MSA / Do not filter the result MSA.HHblits default: False.alt: Show up to this many alternative alignments.p: Minimum Prob for a hit to be included in the output hhr file.HHblits default: 20.z: Hard cap on number of hits reported in the hhr file.HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.Raises:RuntimeError: If HHblits binary not found within the path."""self.binary_path = binary_pathself.databases = databasesself.n_cpu = n_cpuself.n_iter = n_iterself.e_value = e_valueself.maxseq = maxseqself.realign_max = realign_maxself.maxfilt = maxfiltself.min_prefilter_hits = min_prefilter_hitsself.all_seqs = all_seqsself.alt = altself.p = pself.z = zdef query(self, input_fasta_path: str) -> List[Mapping[str, Any]]:"""Queries the database using HHblits."""with tmpdir_manager() as query_tmp_dir:a3m_path = os.path.join(query_tmp_dir, 'output.a3m')db_cmd = []for db_path in self.databases:db_cmd.append('-d')db_cmd.append(db_path)cmd = [self.binary_path,'-i', input_fasta_path,'-cpu', str(self.n_cpu),'-oa3m', a3m_path,'-o', '/dev/null','-n', str(self.n_iter),'-e', str(self.e_value),'-maxseq', str(self.maxseq),'-realign_max', str(self.realign_max),'-maxfilt', str(self.maxfilt),'-min_prefilter_hits', str(self.min_prefilter_hits)]if self.all_seqs:cmd += ['-all']if self.alt:cmd += ['-alt', str(self.alt)]if self.p != _HHBLITS_DEFAULT_P:cmd += ['-p', str(self.p)]if self.z != _HHBLITS_DEFAULT_Z:cmd += ['-Z', str(self.z)]cmd += db_cmdprint("cmd:",' '.join(cmd))logging.info('Launching subprocess "%s"', ' '.join(cmd))process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)with timing('HHblits query'):stdout, stderr = process.communicate()retcode = process.wait()if retcode:# Logs have a 15k character limit, so log HHblits error line by line.logging.error('HHblits failed. HHblits stderr begin:')for error_line in stderr.decode('utf-8').splitlines():if error_line.strip():logging.error(error_line.strip())logging.error('HHblits stderr end')raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (stdout.decode('utf-8'), stderr[:500_000].decode('utf-8')))with open(a3m_path) as f:a3m = f.read()raw_output = dict(a3m=a3m,output=stdout,stderr=stderr,n_iter=self.n_iter,e_value=self.e_value)return [raw_output]if __name__ == "__main__":hhblits_binary_path = "/home/zheng/software/hhsuite-3.3.0-SSE2-Linux/bin/hhblits"input_fasta_path = "/home/zheng/test/Q94K49.fasta"#hhsuite数据库下载地址：https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/  scop70_database_path = "/home/zheng/database/scop70_1.75_hhsuite3/scop70_1.75"pdb70_database_path = "/home/zheng/database/pdb70_from_mmcif_latest/pdb70"## 单一数据库#hhblits_runner = HHBlits(binary_path = hhblits_binary_path,  #                         databases = [scop70_database_path])## 多个数据库database_lst = [scop70_database_path, pdb70_database_path]hhblits_runner = HHBlits(binary_path = hhblits_binary_path,databases = database_lst) result = hhblits_runner.query(input_fasta_path)print(len(result))print(result[0]['a3m'])#print(result)

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.mzph.cn/news/164541.shtml

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！