AlphaFold3 的 generate_chain_data_cache 脚本位于源代码的 scripts 文件夹下。该脚本是一个批量解析指定目录中 mmCIF/PDB 文件的工具:它提取每条链的基本信息(序列、分辨率、所属聚类的大小等)并写入 JSON 文件,主要用于后续的蛋白质建模、过滤或训练数据准备。
源代码:
import argparse
from functools import partial
import json
import logging
from multiprocessing import Pool
import os
import string
from collections import defaultdict
from tqdm import tqdm
from src.data.mmcif_parsing import parse
from src.common import protein, residue_constants
import sys
sys.path.append("../../../Downloads") # an innocent hack to get this to run from the top leveldef parse_file(f, args,chain_cluster_size_dict
):file_id, ext = os.path.splitext(f)if ext == ".cif":with open(os.path.join(args.data_dir, f), "r") as fp:mmcif_string = fp.read()mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)if mmcif.mmcif_object is None:logging.info(f"Could not parse {f}. Skipping...")return {}else:mmcif = mmcif.mmcif_objectout = {}for chain_id, seq in mmcif.chain_to_seqres.items():full_name = "_".join([file_id, chain_id])out[full_name] = {}local_data = out[full_name]local_data["release_date"] = mmcif.header["release_date"]local_data["seq"] = seqlocal_data["resolution"] = mmcif.header["resolution"]if chain_cluster_size_dict is not None:cluster_size = chain_cluster_size_dict.get(full_name.upper(), -1)local_data["cluster_size"] = cluster_sizeelif ext == ".pdb":with open(os.path.join(args.data_dir, f), "r") as fp:pdb_string = fp.read()protein_object = protein.from_pdb_string(pdb_string, None)aatype = pr