Table of Contents
SelfTalk Official Site
Dependency Installation
Installing psbody on Windows
templates.pkl download link:
Inference code:
DiffSpeaker
SelfTalk Official Site
GitHub - psyai-net/SelfTalk_release: This is the official source for our ACM MM 2023 paper "SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend 3D Talking Faces"
Pretrained weights are available.
Dependency Installation
GitHub - MPI-IS/mesh: MPI-IS Mesh Processing Library
Installing the psbody library on Linux
How to install the psbody library and the mesh package - CSDN Blog
Installing psbody on Windows
Use Python 3.8.
Installing psbody_mesh on Windows - Bilibili
Boost path used for the build: D:\Program Files\boost_1_73_0
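After the install (Linux or Windows), a quick way to confirm that psbody_mesh is usable is to import it and build a tiny mesh. This is just a sanity-check sketch, not part of the SelfTalk demo:

import numpy as np
from psbody.mesh import Mesh

# A single triangle: 3 vertices, 1 face.
v = np.array([[0.0, 0.0, 0.0],
              [1.0, 0.0, 0.0],
              [0.0, 1.0, 0.0]])
f = np.array([[0, 1, 2]])

m = Mesh(v=v, f=f)
print(m.v.shape, m.f.shape)  # expect (3, 3) and (1, 3)

If the import fails on Windows, double-check that the Boost path above was picked up when building psbody_mesh.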
templates.pkl download link:
DiffSpeaker/scripts/demo/demo_biwi.sh at 0f2ae2d4e88fe78bff8bb7a1fdbc628e8733f120 · theEricMa/DiffSpeaker · GitHub
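Before running inference, it helps to confirm that the downloaded templates.pkl matches what the demo expects: a dict with one neutral-face template per BIWI subject, loaded with latin1 encoding just as in the code below. A small inspection sketch, assuming the file is placed at BIWI/templates.pkl (the --dataset and --template_path defaults):

import pickle

with open("BIWI/templates.pkl", "rb") as fin:
    templates = pickle.load(fin, encoding="latin1")

print(list(templates.keys()))   # subject IDs such as 'F3', matching --subject
print(templates["F3"].shape)    # neutral-face vertices; flattens to 23370*3 for BIWI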
Inference code:
Note: the psbody.mesh import and the rendering calls in main() are commented out, so this script only saves the predicted vertices.
import numpy as np
import librosa
import os, argparse, pickle
from SelfTalk import SelfTalk
from transformers import Wav2Vec2Processor
import torch
import time
import cv2
import tempfile
from subprocess import call
import pyrender
# from psbody.mesh import Mesh  # only needed by the rendering helpers, which are commented out in main() below
import trimesh

# os.environ['PYOPENGL_PLATFORM'] = 'osmesa'  # egl


@torch.no_grad()
def test_model(args):
    if not os.path.exists(args.result_path):
        os.makedirs(args.result_path)

    # build model
    model = SelfTalk(args)
    model.load_state_dict(torch.load('BIWI.pth', map_location=torch.device(args.device)))
    model = model.to(torch.device(args.device))
    model.eval()

    with torch.no_grad():
        template_file = os.path.join(args.dataset, args.template_path)
        with open(template_file, 'rb') as fin:
            templates = pickle.load(fin, encoding='latin1')

        temp = templates[args.subject]
        template = temp.reshape((-1))
        template = np.reshape(template, (-1, template.shape[0]))
        template = torch.FloatTensor(template).to(device=args.device)

        wav_path = args.wav_path
        test_name = os.path.basename(wav_path).split(".")[0]
        speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
        processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
        audio_feature = np.squeeze(processor(speech_array, sampling_rate=16000).input_values)
        audio_feature = np.reshape(audio_feature, (-1, audio_feature.shape[0]))
        audio_feature = torch.FloatTensor(audio_feature).to(device=args.device)

        start = time.time()
        prediction, lip_features, logits = model.predict(audio_feature, template)
        end = time.time()
        print("Model predict time: ", end - start)

        prediction = prediction.squeeze()  # (seq_len, V*3)
        np.save(os.path.join(args.result_path, test_name), prediction.detach().cpu().numpy())


# The implementation of rendering is borrowed from VOCA: https://github.com/TimoBolkart/voca/blob/master/utils/rendering.py
def get_unit_factor(unit):
    if unit == 'mm':
        return 1000.0
    elif unit == 'cm':
        return 100.0
    elif unit == 'm':
        return 1.0
    else:
        raise ValueError('Unit not supported')


def render_mesh_helper(mesh, t_center, rot=np.zeros(3), tex_img=None, z_offset=0):
    camera_params = {'c': np.array([400, 400]),
                     'k': np.array([-0.19816071, 0.92822711, 0, 0, 0]),
                     'f': np.array([4754.97941935 / 8, 4754.97941935 / 8])}

    frustum = {'near': 0.01, 'far': 3.0, 'height': 800, 'width': 800}

    mesh_copy = Mesh(mesh.v, mesh.f)
    mesh_copy.v[:] = cv2.Rodrigues(rot)[0].dot((mesh_copy.v - t_center).T).T + t_center

    intensity = 2.0
    rgb_per_v = None

    primitive_material = pyrender.material.MetallicRoughnessMaterial(
        alphaMode='BLEND',
        baseColorFactor=[0.3, 0.3, 0.3, 1.0],
        metallicFactor=0.8,
        roughnessFactor=0.8)

    tri_mesh = trimesh.Trimesh(vertices=mesh_copy.v, faces=mesh_copy.f, vertex_colors=rgb_per_v)
    render_mesh = pyrender.Mesh.from_trimesh(tri_mesh, material=primitive_material, smooth=True)

    if 1 == 1:
        scene = pyrender.Scene(ambient_light=[.2, .2, .2], bg_color=[0, 0, 0])
    else:
        scene = pyrender.Scene(ambient_light=[.2, .2, .2], bg_color=[255, 255, 255])

    camera = pyrender.IntrinsicsCamera(fx=camera_params['f'][0],
                                       fy=camera_params['f'][1],
                                       cx=camera_params['c'][0],
                                       cy=camera_params['c'][1],
                                       znear=frustum['near'],
                                       zfar=frustum['far'])

    scene.add(render_mesh, pose=np.eye(4))

    camera_pose = np.eye(4)
    camera_pose[:3, 3] = np.array([0, 0, 1.0 - z_offset])
    scene.add(camera, pose=[[1, 0, 0, 0],
                            [0, 1, 0, 0],
                            [0, 0, 1, 1],
                            [0, 0, 0, 1]])

    angle = np.pi / 6.0
    pos = camera_pose[:3, 3]
    light_color = np.array([1., 1., 1.])
    light = pyrender.DirectionalLight(color=light_color, intensity=intensity)

    light_pose = np.eye(4)
    light_pose[:3, 3] = pos
    scene.add(light, pose=light_pose.copy())

    light_pose[:3, 3] = cv2.Rodrigues(np.array([angle, 0, 0]))[0].dot(pos)
    scene.add(light, pose=light_pose.copy())

    light_pose[:3, 3] = cv2.Rodrigues(np.array([-angle, 0, 0]))[0].dot(pos)
    scene.add(light, pose=light_pose.copy())

    light_pose[:3, 3] = cv2.Rodrigues(np.array([0, -angle, 0]))[0].dot(pos)
    scene.add(light, pose=light_pose.copy())

    light_pose[:3, 3] = cv2.Rodrigues(np.array([0, angle, 0]))[0].dot(pos)
    scene.add(light, pose=light_pose.copy())

    flags = pyrender.RenderFlags.SKIP_CULL_FACES
    # try:
    # egl
    r = pyrender.OffscreenRenderer(viewport_width=frustum['width'], viewport_height=frustum['height'])
    color, _ = r.render(scene, flags=flags)
    # except:
    #     print('pyrender: Failed rendering frame')
    #     color = np.zeros((frustum['height'], frustum['width'], 3), dtype='uint8')

    return color[..., ::-1]


def render_sequence_meshes(audio_fname, sequence_vertices, template, out_path, uv_template_fname='',
                           texture_img_fname=''):
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    tmp_video_file = tempfile.NamedTemporaryFile('w', suffix='.mp4', dir=out_path)
    if int(cv2.__version__[0]) < 3:
        writer = cv2.VideoWriter(tmp_video_file.name, cv2.cv.CV_FOURCC(*'mp4v'), 25, (800, 800), True)
    else:
        writer = cv2.VideoWriter(tmp_video_file.name, cv2.VideoWriter_fourcc(*'mp4v'), 25, (800, 800), True)

    if os.path.exists(uv_template_fname) and os.path.exists(texture_img_fname):
        uv_template = Mesh(filename=uv_template_fname)
        vt, ft = uv_template.vt, uv_template.ft
        tex_img = cv2.imread(texture_img_fname)[:, :, ::-1]
    else:
        vt, ft = None, None
        tex_img = None

    num_frames = sequence_vertices.shape[0]
    center = np.mean(sequence_vertices[0], axis=0)
    for i_frame in range(num_frames):
        render_mesh = Mesh(sequence_vertices[i_frame], template.f)
        if vt is not None and ft is not None:
            render_mesh.vt, render_mesh.ft = vt, ft
        img = render_mesh_helper(render_mesh, center)
        writer.write(img)
    writer.release()

    video_fname = os.path.join(out_path, 'video.mp4')
    cmd = ('ffmpeg' + ' -i {0} -i {1} -pix_fmt yuv420p -qscale 0 {2} -y'.format(
        audio_fname, tmp_video_file.name, video_fname)).split()
    call(cmd)


def output_sequence_meshes(sequence_vertices, template, out_path, uv_template_fname='', texture_img_fname=''):
    mesh_out_path = os.path.join(out_path, 'meshes')
    if not os.path.exists(mesh_out_path):
        os.makedirs(mesh_out_path)

    if os.path.exists(uv_template_fname):
        uv_template = Mesh(filename=uv_template_fname)
        vt, ft = uv_template.vt, uv_template.ft
    else:
        vt, ft = None, None

    num_frames = sequence_vertices.shape[0]
    for i_frame in range(num_frames):
        out_fname = os.path.join(mesh_out_path, '%05d.obj' % i_frame)
        out_mesh = Mesh(sequence_vertices[i_frame], template.f)
        if vt is not None and ft is not None:
            out_mesh.vt, out_mesh.ft = vt, ft
        if os.path.exists(texture_img_fname):
            out_mesh.set_texture_image(texture_img_fname)
        out_mesh.write_obj(out_fname)


def main():
    parser = argparse.ArgumentParser(
        description='SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend 3D Talking Faces')
    parser.add_argument("--model_name", type=str, default="BIWI", help='vocaset or BIWI')
    parser.add_argument("--dataset", type=str, default="BIWI", help='vocaset or BIWI')
    parser.add_argument("--fps", type=float, default=25, help='frame rate - 30 for vocaset; 25 for BIWI')
    parser.add_argument("--feature_dim", type=int, default=1024, help='512 for vocaset; 1024 for BIWI')
    parser.add_argument("--period", type=int, default=25, help='period in PPE - 30 for vocaset; 25 for BIWI')
    parser.add_argument("--vertice_dim", type=int, default=23370 * 3,
                        help='number of vertices - 5023*3 for vocaset; 23370*3 for BIWI')
    parser.add_argument("--device", type=str, default="cuda", help='cuda or cpu')
    parser.add_argument("--train_subjects", type=str, default="F2 F3 F4 M3 M4 M5")
    parser.add_argument("--test_subjects", type=str, default="F1 F5 F6 F7 F8 M1 M2 M6")
    parser.add_argument("--output_path", type=str, default="demo/output", help='path of the rendered video sequence')
    parser.add_argument("--wav_path", type=str, default="demo/wav/test.wav", help='path of the input audio signal')
    parser.add_argument("--result_path", type=str, default="demo/result", help='path of the predictions')
    parser.add_argument("--subject", type=str, default="F3",
                        help='select a subject from test_subjects or train_subjects')
    parser.add_argument("--background_black", type=bool, default=True, help='whether to use black background')
    parser.add_argument("--template_path", type=str, default="templates.pkl", help='path of the personalized templates')
    parser.add_argument("--render_template_path", type=str, default="templates",
                        help='path of the mesh in BIWI/FLAME topology')
    args = parser.parse_args()

    test_model(args)

    # fa_path = args.result_path + "/" + args.wav_path.split("/")[-1].split(".")[0] + ".npy"
    # temp = "./BIWI/BIWI.ply"
    # out_path = fa_path.split(".")[0]
    # audio_fname = args.wav_path
    #
    # template = Mesh(filename=temp)
    # predicted_vertices_out = np.load(fa_path).reshape(-1, 23370, 3)
    # print("Start rendering...")
    # output_sequence_meshes(predicted_vertices_out, template, out_path)
    #
    # render_sequence_meshes(audio_fname, predicted_vertices_out, template, out_path, uv_template_fname='',
    #                        texture_img_fname='')


if __name__ == "__main__":
    main()
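The script writes the predicted vertex sequence to a .npy file under --result_path. A small sketch for checking that output, assuming the default --result_path demo/result and --wav_path demo/wav/test.wav (the file name comes from the wav's base name):

import numpy as np

pred = np.load("demo/result/test.npy")   # (seq_len, 23370*3) for BIWI
verts = pred.reshape(-1, 23370, 3)       # per-frame vertex positions
print(pred.shape, verts.shape)

To also render a video, uncomment the psbody.mesh import and the rendering calls in main(), and make sure ffmpeg is on the PATH.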
DiffSpeaker
https://github.com/theEricMa/DiffSpeaker