https://blog.csdn.net/missinghead/article/details/135699418
【生物信息】VFDB数据库预测毒力因子-CSDN博客
已知蛋白的 UniProt ID、需要提取蛋白的 fasta 格式序列时:先在 UniProt 网站下载该物种的所有蛋白序列文件
# Create and enter a dedicated conda environment for this analysis.
# -y answers the confirmation prompt, so pasting these lines as a batch does
# not feed the next command to conda as its "answer".
conda create -y -n kp_proteomic
conda activate kp_proteomic

# Option A: download the prebuilt DIAMOND release tarball.
# (It can also be downloaded on a local machine and uploaded to the server.)
wget https://github.com/bbuchfink/diamond/releases/download/v2.0.6/diamond-linux64.tar.gz
tar xvzf diamond-linux64.tar.gz # decompress and unpack the tarball into the current directory

# Option B: install DIAMOND into the active environment.
# The package is distributed through the bioconda channel, so name the
# channels explicitly instead of relying on the user's channel configuration.
conda install -y -c conda-forge -c bioconda diamond

# Print the path of the diamond executable; the commands that follow must use
# this path if it differs from the one written in these notes.
command -v diamond
# Prerequisite: download the VFDB protein FASTA, decompress it, and upload it
# to the working directory as VFDB_setA_pro.fas.

# Resolve the DIAMOND executable once: prefer whatever is on PATH (e.g. the
# activated conda environment) and fall back to the default per-user
# environment location, so the path no longer has to be edited by hand.
diamond_bin=$(command -v diamond) || diamond_bin="$HOME/.conda/envs/kp_proteomic/bin/diamond"

# Build the DIAMOND database from the VFDB protein sequences.
"$diamond_bin" makedb --in VFDB_setA_pro.fas --db VFDB_proteins

# Align the query proteins against the database; -f 6 selects tabular output.
"$diamond_bin" blastp -d VFDB_proteins -q protein.fasta -f 6 -o vf.txt
#######################################
# Append annotation columns taken from FASTA headers to tabular hit lines.
#
# Column 2 of every hit line is looked up, by exact match, against the first
# whitespace-delimited token of each FASTA header. Four tab-separated columns
# are appended to every hit line:
#   1. the full matching header, without the leading ">"
#   2. the text inside the first non-empty [...] group of that header
#   3. within (2), the text inside the parentheses directly followed by " -"
#   4. within (2), the text after the last "- " (no later hyphen allowed)
# Illustrative example: for a header containing "[Name (VF0001) - Group (VFC0001)]"
# the appended columns 2-4 are "Name (VF0001) - Group (VFC0001)", "VF0001"
# and "Group (VFC0001)".
# A column with nothing to extract, or a hit whose ID has no header, is left
# empty, so the output always has exactly one line per hit line.
#
# Arguments: $1 - tab-separated hits file (subject ID in column 2)
#            $2 - FASTA file whose headers carry the annotation
#            $3 - output file (replaced only when the join succeeds)
# Outputs:   writes $3; diagnostics go to stderr
# Returns:   0 on success, 1 on bad usage, unreadable input or awk failure
#######################################
annotate_vf_hits() {
  if [[ $# -ne 3 ]]; then
    printf 'usage: annotate_vf_hits HITS FASTA OUT\n' >&2
    return 1
  fi
  local hits=$1 fasta=$2 out=$3
  local tmp

  if [[ ! -r "$hits" || ! -r "$fasta" ]]; then
    printf 'annotate_vf_hits: cannot read "%s" and/or "%s"\n' "$hits" "$fasta" >&2
    return 1
  fi

  # Write next to the destination so the final mv is a same-filesystem rename.
  tmp=$(mktemp "${out}.XXXXXX") || return 1

  # One awk pass over each file replaces the per-hit grep + paste: the lookup
  # is keyed on the exact ID, nothing is appended to a leftover file from an
  # earlier run, and rows cannot drift out of alignment. Only POSIX awk
  # features are used (match/RSTART/RLENGTH), so gawk is not required.
  if awk -F'\t' '
    # Pass 1 (FASTA): remember each header under its first token.
    pass == 1 {
      if (substr($0, 1, 1) != ">") next
      hdr = substr($0, 2)
      id = hdr
      sub(/[ \t].*/, "", id)
      if (!(id in header)) header[id] = hdr
      next
    }
    # Pass 2 (hits): append the header and the three extracted pieces.
    {
      hdr = ($2 in header) ? header[$2] : ""
      group = ""; label = ""; tail = ""
      if (match(hdr, /\[[^]]+\]/)) group = substr(hdr, RSTART + 1, RLENGTH - 2)
      if (match(group, /\([^)]+\) -/)) label = substr(group, RSTART + 1, RLENGTH - 4)
      if (match(group, /- [^-]*$/)) tail = substr(group, RSTART + 2)
      print $0 "\t" hdr "\t" group "\t" label "\t" tail
    }
  ' pass=1 "$fasta" pass=2 "$hits" > "$tmp"; then
    mv -- "$tmp" "$out"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

# Keep the plain list of VFDB headers (leading ">" removed) for manual lookups.
sed -n 's/^>//p' VFDB_setA_pro.fas > VFDB_setA_id.txt

# Annotate the DIAMOND hits; vf_detail.txt gets one line per line of vf.txt.
# The former intermediate matches.txt is no longer needed or written.
annotate_vf_hits vf.txt VFDB_setA_pro.fas vf_detail.txt