蛋白质序列FeatureDict转化为TensorDict

核心转化语句为 tensor_dict = {k: tf.constant(v) for k, v in np_example.items() if k in features_metadata}。在此基础上,增加了对特征名称的筛选,并根据各特征的维度定义校验张量的元素个数、将其重塑为对应的形状。

from typing import Dict, Tuple, Sequence, Union, Mapping, Optional
#import tensorflow.compat.v1 as tf
import tensorflow as tf
import numpy as np
import pickle# Type aliases.
# Maps a feature name to its (dtype, shape) pair. Each shape entry is either a
# concrete int or a placeholder string that is substituted at reshape time.
FeaturesMetadata = Dict[str, Tuple[tf.dtypes.DType, Sequence[Union[str, int]]]]
# FeatureDict = Mapping[str, np.ndarray]
# Maps a feature name to the tensor holding its value.
TensorDict = Dict[str, tf.Tensor]

# Placeholder strings used inside feature shapes for dimensions that are only
# known once a concrete example has been read.
NUM_RES = 'num residues placeholder'
NUM_TEMPLATES = 'num templates placeholder'
NUM_SEQ = "length msa placeholder"

# Size of the atom axis used by the all-atom feature shapes below.
atom_type_num = 37

# Feature name -> (dtype, shape). String entries in a shape are placeholders
# that protein_features_shape() replaces with concrete sizes.
FEATURES = {
    #### Static features of a protein sequence ####
    "aatype": (tf.float32, [NUM_RES, 21]),
    "between_segment_residues": (tf.int64, [NUM_RES, 1]),
    "deletion_matrix": (tf.float32, [NUM_SEQ, NUM_RES, 1]),
    "domain_name": (tf.string, [1]),
    "msa": (tf.int64, [NUM_SEQ, NUM_RES, 1]),
    "num_alignments": (tf.int64, [NUM_RES, 1]),
    "residue_index": (tf.int64, [NUM_RES, 1]),
    "seq_length": (tf.int64, [NUM_RES, 1]),
    "sequence": (tf.string, [1]),
    "all_atom_positions": (tf.float32, [NUM_RES, atom_type_num, 3]),
    "all_atom_mask": (tf.int64, [NUM_RES, atom_type_num]),
    "resolution": (tf.float32, [1]),
    "template_domain_names": (tf.string, [NUM_TEMPLATES]),
    "template_sum_probs": (tf.float32, [NUM_TEMPLATES, 1]),
    "template_aatype": (tf.float32, [NUM_TEMPLATES, NUM_RES, 22]),
    "template_all_atom_positions": (
        tf.float32, [NUM_TEMPLATES, NUM_RES, atom_type_num, 3]),
    "template_all_atom_masks": (
        tf.float32, [NUM_TEMPLATES, NUM_RES, atom_type_num, 1]),
}


def _make_features_metadata(feature_names: Sequence[str]) -> FeaturesMetadata:
    """Makes a feature name to type and shape mapping from a list of names.

    Args:
      feature_names: Names of the requested features. Every name must be a key
        of FEATURES.

    Returns:
      A dict mapping each requested name, plus the always-required names
      "aatype", "sequence" and "seq_length", to its FEATURES entry.

    Raises:
      KeyError: If a requested name is not a key of FEATURES.
    """
    # Make sure these features are always read.
    required_features = ["aatype", "sequence", "seq_length"]
    feature_names = list(set(feature_names) | set(required_features))

    features_metadata = {name: FEATURES[name] for name in feature_names}
    return features_metadata


def np_to_tensor_dict(
    np_example: Mapping[str, np.ndarray],
    features: Sequence[str],
) -> TensorDict:
    """Creates dict of tensors from a dict of NumPy arrays.

    Args:
      np_example: A dict of NumPy feature arrays.
      features: A list of strings of feature names to be returned in the
        dataset.

    Returns:
      A dictionary of features mapping feature names to features. Only the
      given features (plus the always-required ones) are returned, all other
      ones are filtered out.
    """
    features_metadata = _make_features_metadata(features)
    print(f"features_metadata:{features_metadata}")
    tensor_dict = {
        k: tf.constant(v)
        for k, v in np_example.items()
        if k in features_metadata
    }

    # Ensures shapes are as expected. Needed for setting size of empty features
    # e.g. when no template hits were found.
    tensor_dict = parse_reshape_logic(tensor_dict, features_metadata)
    return tensor_dict


def protein_features_shape(
    feature_name: str,
    num_residues: int,
    msa_length: int,
    num_templates: Optional[int] = None,
    features: Optional[FeaturesMetadata] = None,
):
    """Get the shape for the given feature name.

    This is near identical to _get_tf_shape_no_placeholders() but with 2
    differences:
    * This method does not calculate a single placeholder from the total
      number of elements (eg given <NUM_RES, 3> and size := 12, this won't
      deduce NUM_RES must be 4)
    * This method will work with tensors

    Args:
      feature_name: String identifier for the feature. If the feature name
        ends with "_unnormalized", this suffix is stripped off.
      num_residues: The number of residues in the current domain - some
        elements of the shape can be dynamic and will be replaced by this
        value.
      msa_length: The number of sequences in the multiple sequence alignment,
        some elements of the shape can be dynamic and will be replaced by this
        value. If the number of alignments is unknown / not read, please pass
        None for msa_length.
      num_templates (optional): The number of templates in this tfexample.
      features: A feature_name to (tf_dtype, shape) lookup; defaults to
        FEATURES.

    Returns:
      List of ints representation the tensor size.

    Raises:
      ValueError: If a feature is requested but no concrete placeholder value
        is given.
    """
    features = features or FEATURES
    if feature_name.endswith("_unnormalized"):
        feature_name = feature_name[:-13]  # 13 == len("_unnormalized")

    # `features` is a FeaturesMetadata mapping: name -> (dtype, raw shape).
    unused_dtype, raw_sizes = features[feature_name]

    replacements = {NUM_RES: num_residues, NUM_SEQ: msa_length}
    if num_templates is not None:
        replacements[NUM_TEMPLATES] = num_templates

    # Substitute placeholders; dimensions without a replacement stay as-is.
    sizes = [replacements.get(dimension, dimension) for dimension in raw_sizes]
    for dimension in sizes:
        # A remaining string is a placeholder that was never given a value.
        if isinstance(dimension, str):
            raise ValueError(
                "Could not parse %s (shape: %s) with values: %s"
                % (feature_name, raw_sizes, replacements))
    return sizes


def parse_reshape_logic(
    parsed_features: TensorDict,
    features: FeaturesMetadata,
    key: Optional[str] = None,
) -> TensorDict:
    """Transforms parsed serial features to the correct shape.

    Args:
      parsed_features: Feature name to tensor mapping. Must contain
        "seq_length". It is updated in place.
      features: Feature name to (dtype, shape) lookup used to resolve the
        target shape of every entry in parsed_features.
      key: Optional value stored under "key" (wrapped in a list) when "key" is
        also present in `features`.

    Returns:
      The same `parsed_features` dict with every value reshaped to the shape
      derived from `features`.
    """
    # Find out what is the number of sequences and the number of alignments.
    num_residues = tf.cast(
        _first(parsed_features["seq_length"]), dtype=tf.int32)

    if "num_alignments" in parsed_features:
        num_msa = tf.cast(
            _first(parsed_features["num_alignments"]), dtype=tf.int32)
    else:
        num_msa = 0

    if "template_domain_names" in parsed_features:
        num_templates = tf.cast(
            tf.shape(parsed_features["template_domain_names"])[0],
            dtype=tf.int32)
    else:
        num_templates = 0

    if key is not None and "key" in features:
        parsed_features["key"] = [key]  # Expand dims from () to (1,).

    # Reshape the tensors according to the sequence length and num alignments.
    # Iterate over a snapshot because the dict's values are reassigned below.
    for k, v in list(parsed_features.items()):
        new_shape = protein_features_shape(
            feature_name=k,
            num_residues=num_residues,
            msa_length=num_msa,
            num_templates=num_templates,
            features=features)

        # Total number of elements implied by the target shape.
        new_shape_size = tf.constant(1, dtype=tf.int32)
        for dim in new_shape:
            new_shape_size *= tf.cast(dim, tf.int32)

        # Fails when the tensor's element count does not match the target
        # shape's element count.
        assert_equal = tf.assert_equal(
            tf.size(v), new_shape_size,
            name="assert_%s_shape_correct" % k,
            message="The size of feature %s (%s) could not be reshaped "
            "into %s" % (k, tf.size(v), new_shape))

        if "template" not in k:
            # Make sure the feature we are reshaping is not empty.
            assert_non_empty = tf.assert_greater(
                tf.size(v), 0, name="assert_%s_non_empty" % k,
                message="The feature %s is not set in the tf.Example. Either do not "
                "request the feature or use a tf.Example that has the "
                "feature set." % k)
            with tf.control_dependencies([assert_non_empty, assert_equal]):
                parsed_features[k] = tf.reshape(
                    v, new_shape, name="reshape_%s" % k)
        else:
            # Features with "template" in their name skip the non-empty check.
            with tf.control_dependencies([assert_equal]):
                parsed_features[k] = tf.reshape(
                    v, new_shape, name="reshape_%s" % k)

    return parsed_features


def _first(tensor: tf.Tensor) -> tf.Tensor:
    """Returns the 1st element - the input can be a tensor or a scalar."""
    # Flatten to 1-D first so that scalars and N-D tensors index the same way.
    return tf.reshape(tensor, shape=(-1,))[0]


## Read in the list of FeatureDicts.
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load pickle files that come from a trusted source.
with open("HBB_features_lst.pkl", 'rb') as f:
    HBB_features_lst = pickle.load(f)

# Convert the first entry of the loaded list.
Human_HBB_feature_dict = HBB_features_lst[0]
print(Human_HBB_feature_dict.keys())

# Request every feature defined in FEATURES; names that are missing from the
# example are simply not present in the resulting tensor dict.
features = FEATURES.keys()

Human_HBB_tensor_dict = np_to_tensor_dict(Human_HBB_feature_dict,
                                          features=features)
print(Human_HBB_tensor_dict.keys())

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/209609.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

postgresql_conf中常用配置项

在 PostgreSQL 的 postgresql.conf 配置文件中,有许多常用的配置项,这些配置项可以根据特定需求和性能优化进行调整。以下是一些常用的配置项及其作用: 1. shared_buffers 用于设置 PostgreSQL 实例使用的共享内存缓冲区大小。增加此值可以…

游戏被攻击该怎么办?游戏盾该如何使用,游戏盾如何防护攻击

随着Internet互联网络带宽的增加和多种DDOS黑客工具的不断发布,DDOS拒绝服务攻击的实施越来越容易,DDOS攻击事件正在成上升趋势。出于商业竞争、打击报复和网络敲诈等多种因素,导致很多商业站点、游戏服务器、聊天网络等网络服务商长期以来一…

Nacos 配置加密功能也太鸡肋了吧,有种更好的方式

大家好,我是风筝,微信搜「古时的风筝」,更多干货 当项目中用了 Nacos 做配置中心,是不是所有的配置都放到里面呢,大部分时候为了省事和统一,系统所有的配置都直接放在里面了,有时候,…

什么是自动化测试框架?常用的自动化测试框架有哪些?

无论是在自动化测试实践,还是日常交流中,经常听到一个词:框架。之前学习自动化测试的过程中,一直对“框架”这个词知其然不知其所以然。 最近看了很多自动化相关的资料,加上自己的一些实践,算是对“框架”…

Redis相关知识

yum安装redis 使用以下命令:直接将redis安装到Linux服务器(Xshell)中 yum -y install redis 启动redis 使用以下命令,以后台运行方式启动redis redis-server /etc/redis.conf & 操作redis 使用以下命令启动redis客户端 redis-…

RFID在新能源工厂大放异彩

RFID在新能源工厂大放异彩 我国在十四五规划中提出了建设绿色低碳发展的目标,新能源产业成为了国家发展的重点领域之一,开始大力支持各种新能源厂商发展。各个厂商之间不仅比产品、比技术。也比生产想要降本增效,为了实现这一目标,…

MBD Introduction

介绍 MATLAB是MathWorks公司的商业数学软件,应用于科学计算、可视化以及交互式程序设计等高科技计算环境。Simulink是MATLAB中的一种可视化仿真工具。 Simulink是一个模块图环境,用于多域仿真以及基于模型的设计。它支持系统设计、仿真、自动代码生成以…

Spring基于xml半注解开发

目录 Component的使用 依赖注解的使用 非自定义Bean的注解开发 Component的使用 基本Bean注解,主要是使用注解的方式替代原有的xml的<bean>标签及其标签属性的配置,使用Component注解替代<bean>标签中的id以及class属性,而对…

算法Day26 数位统计

数位统计 Description 给你一个整数n,统计并返回各位数字都不同的数字x的个数,其中0 ≤ x < 10^n。 Input 输入整数n 0≤n≤13 Output 输出整数个数 Sample 代码 import java.util.Scanner;public class Main {public static void main(String[] ar…

一个Oracle Application Container的实例

本例基本涵盖了Oracle Multitenant功能中application container的以下内容: 创建application container/root创建application PDB创建application SEED在application root中安装application在application root中升级application同步application 整个过程如下 创建…

Epoll服务器(ET工作模式)

目录 Epoll ET服务器设计思路Connection类TcpServer类 回调函数Accepter函数Recever函数Sender函数Excepter函数 事件处理套接字相关接口封装运行Epoll服务器 Epoll ET服务器 设计思路 在epoll ET服务器中,我们需要处理如下几种事件: 读事件:…

基于javeweb实现的图书借阅管理系统

一、系统架构 前端:jsp | js | css | jquery 后端:servlet | jdbc 环境:jdk1.7 | mysql | tomcat 二、代码及数据库 三、功能介绍 01. 登录页 02. 首页 03. 图书管理 04. 读者管理 05. 图书分类管理 06. 图书借阅信息 07. 图书归还信…

CDN加速技术:降低服务器与网站成本的智慧选择

随着互联网的飞速发展,网站的访问量不断攀升,服务器负载压力逐渐增大。为了提高用户体验、降低服务器成本,并确保网站的高可用性,CDN(内容分发网络)加速技术应运而生。本文将从服务器与网站成本的角度分析C…

NLP项目实战01--电影评论分类

介绍: 欢迎来到本篇文章!在这里,我们将探讨一个常见而重要的自然语言处理任务——文本分类。具体而言,我们将关注情感分析任务,即通过分析电影评论的情感来判断评论是正面的、负面的。 展示: 训练展示如下…

比较不同聚类方法的评估指标

归一化互信息(NMI) 要求:需要每个序列的真实标签(分类信息)

你在地铁上修过bug吗?

作为技术人员,有没有遇到下班路上收到老板电话,系统故障,然后地铁上掏出电脑,修bug的场景。自己负责的业务线上出现问题,负责人心里是很慌的,在这种心理状态下做事很容易二次犯错,造成更大的问题…

SAP UI5 walkthrough step10 Descriptor for Applications

在这一步,我们将会把所有的应用相关的描述性的文件独立放到manifest.json 新建一个manifest.json文件 webapp/manifest.json (New) {"_version": "1.58.0","sap.app": {"id": "ui5.walkthrough","i18n"…

【已解决】No module named ‘sklearn‘

问题描述 No module named ‘sklearn‘ 解决办法 pip install scikit-learn 完结撒花 契约、包容、感恩、原则……这些成年人该有的基本精神,为什么我在他们身上找不到呢?

图像叠加中文字体

目录 1) 前言2) freetype下载3) Demo3.1) 下载3.2) 编译3.3) 运行3.4) 结果3.5) 更详细的使用见目录中说明 4) 积少成多 1) 前言 最近在做图片、视频叠加文字,要求支持中文,基本原理是将图片或视频解码后叠加文字,之后做图片或视频编码即可。…

ASP.NET Core概述-微软已经收购了mono,为什么还搞.NET Core呢

一、.NET Core概述 1、相关历程 .NET在设计之初也是考虑像Java一样跨平台,.NET Framework是在Windows下运行的,大部分类是可以兼容移植到Linux下,但是没有人做这个工作。 2001年米格尔为Gnome寻找桌面开发技术,在研究了微软的.…