比如病毒注释软件注释到了:
Viruses;Phixviricota;Malgrandaviricetes;Petitvirales;Microviridae;;
import os
import pandas as pd# 设置工作区
# Annotate vOTU taxonomy results with the full ICTV lineage.
#
# Every file in the working directory whose name contains "mother_95" is read
# as a tab-separated table with one header line.  For each record, the last
# non-empty name of the ";"-separated taxonomy in the final column (for
# example "Malgrandaviricetes" in "Viruses;Phixviricota;Malgrandaviricetes;;")
# is looked up in the ICTV table, and the ICTV columns from the first one up
# to and including the matching one are written to "<input>_TAX.txt".

WORK_DIR = "folder"
ICTV_txt = "ICTV去冗余.txt"
INPUT_MARKER = "mother_95"
OUTPUT_SUFFIX = "_TAX.txt"
HEADER = "vOTU\tRealm\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\t"


def last_rank(taxonomy):
    """Return the last non-empty name of a ";"-separated taxonomy string.

    Returns an empty string when the taxonomy holds no name at all
    (for example "" or ";;"), so callers can treat it as "not assigned"
    instead of crashing.
    """
    names = [part.strip() for part in taxonomy.split(";")]
    names = [part for part in names if part]
    return names[-1] if names else ""


def build_rank_lookup(ictv):
    """Map every name in the ICTV table to its first (row, column) position.

    Rows are scanned top to bottom and columns left to right, so each name
    keeps the position of its first occurrence.  Non-string cells (such as
    the NaN that pandas uses for empty fields) are ignored.  Building the
    mapping once replaces a full-table scan per input record.
    """
    lookup = {}
    for row_pos, row in enumerate(ictv.itertuples(index=False, name=None)):
        for col_pos, cell in enumerate(row):
            if isinstance(cell, str) and cell:
                lookup.setdefault(cell, (row_pos, col_pos))
    return lookup


def format_record(name, taxonomy, ictv, lookup):
    """Return the output row (without trailing newline) for one vOTU.

    name     -- vOTU identifier written in the first column.
    taxonomy -- ";"-separated taxonomy string reported for the vOTU.
    ictv     -- ICTV table as a DataFrame.
    lookup   -- mapping produced by build_rank_lookup(ictv).

    A matched record is the name followed by the ICTV columns up to and
    including the matched one, each field terminated by a tab.  A record
    whose last name is missing or not present in the table is
    "<name>\\tnot assigned".
    """
    rank = last_rank(taxonomy)
    print(f"Searching for last_Tax: {rank}")

    # Exact-name lookup: a substring or regex match could select a row that
    # has no cell equal to the name, which cannot be sliced afterwards.
    position = lookup.get(rank) if rank else None
    if position is None:
        print("No match found.")
        return f"{name}\tnot assigned"

    row_pos, col_pos = position
    matching_row = ictv.iloc[row_pos]
    print(f"Matching row found: {matching_row}")

    lineage = matching_row.iloc[: col_pos + 1]
    print(f"Output from start to last_Tax: {lineage.values}")
    return "".join(f"{field}\t" for field in (name, *lineage.values))


def annotate_lines(lines, ictv, lookup):
    """Yield the output rows for an input table, starting with the header.

    lines is any iterable of text lines; its first line is treated as the
    input header and skipped.  Blank lines are ignored.  The vOTU name is
    the first tab-separated field and the taxonomy is the last one.
    """
    records = iter(lines)
    next(records, None)  # skip the input header; tolerate an empty file
    yield HEADER
    for line in records:
        if not line.strip():
            continue
        fields = line.rstrip("\r\n").split("\t")
        yield format_record(fields[0].strip(), fields[-1], ictv, lookup)


def annotate_file(in_path, out_path, ictv, lookup):
    """Write the annotated table for in_path to out_path.

    The output is opened in write mode so that running the script again
    replaces the table instead of appending a second header and duplicate
    rows to it.
    """
    with open(in_path, "r") as src, open(out_path, "w") as out:
        for row in annotate_lines(src, ictv, lookup):
            print(row, file=out)


def main():
    """Annotate every result file found in the working directory."""
    os.chdir(WORK_DIR)
    ictv = pd.read_csv(ICTV_txt, sep="\t")
    lookup = build_rank_lookup(ictv)

    for file_name in os.listdir():
        if INPUT_MARKER not in file_name or not os.path.isfile(file_name):
            continue
        # Output names also contain the input marker; skip them so a second
        # run does not treat earlier results as input.
        if file_name.endswith(OUTPUT_SUFFIX):
            continue
        annotate_file(file_name, f"{file_name}{OUTPUT_SUFFIX}", ictv, lookup)


if __name__ == "__main__":
    main()
处理后的输出示例如下:
vOTU Realm Kingdom Phylum Class Order Family Genus
IC1_idba-ud_1037_4508 Monodnaviria Sangervirae Phixviricota Malgrandaviricetes Petitvirales Microviridae