使用LoFTR模型进行图像配准、重叠区提取

LoFTR模型源自2021年CVPR提出的一篇论文LoFTR: Detector-Free Local Feature Matching with Transformers，其基于pytorch实现图像配准，与基于superpoint+superglue的方法不同，
是一个端到端的图像配准方法。与LoFTR官方库相关的有loftr2onnx库，整体来说loftr2onnx库使用更方便，效果更好。但loftr2onnx转出的onnx模型是有问题的，不能使用。

项目地址：https://github.com/zju3dv/LoFTR
项目地址2：https://github.com/oooooha/loftr2onnx
demo体验：https://huggingface.co/spaces/kornia/Kornia-LoFTR

1、web demo体验

访问 https://huggingface.co/spaces/kornia/Kornia-LoFTR

点配准的效果如下所示，可以看到与sp+sg方法相比，点对的平行关系直观上要好很多。
在这里插入图片描述

2、使用LoFTR项目

2.1 下载代码

打开https://github.com/zju3dv/LoFTR 下载代码
在这里插入图片描述

2.2 安装依赖项

安装项目使用：解压代码，然后在终端进入目录，执行pip install -r .\requirements.txt
在这里插入图片描述

2.3 下载模型

从 https://drive.google.com/drive/folders/1DOcOPZb3-5cWxLqn256AhwUVjBPifhuf 下载权重。如果需要训练，还可以下载训练数据与测试数据。
在这里插入图片描述
将权重的压缩文件解压后放到项目根目录下

2.4 初步使用

将 https://hpg123.blog.csdn.net/article/details/124824892 中章节3的代码（对应标题为 superpoint中read_img_as_tensor函数）保存为imgutils.py

from src.loftr import LoFTR, default_cfg
import torch
from imgutils import *
import time
# Initialize LoFTR with the default config and the pretrained indoor weights.
matcher = LoFTR(config=default_cfg)
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1 = r'C:\Users\hpg\Pictures\t1.jpg'
p2 = r'C:\Users\hpg\Pictures\t2.jpg'
# read_img_as_tensor comes from imgutils (not shown here); it is unpacked as
# (network input tensor, image) -- presumably already on the GPU, TODO confirm.
t1, im1 = read_img_as_tensor(p1, (384, 384))
t2, im2 = read_img_as_tensor(p2, (384, 384))
batch = {'image0': t1, 'image1': t2}

# Inference
with torch.no_grad():
    # Warm-up call, deliberately excluded from the timing below.
    matcher(batch)

    # CUDA kernels are launched asynchronously; synchronize before reading the
    # clock so the measurement covers the actual GPU work.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    times = 10
    for _ in range(times):
        matcher(batch)
    torch.cuda.synchronize()
    rt1 = (time.perf_counter() - t0) / times  # average seconds per image pair

    # LoFTR writes its results into the input dict.
    mkpts0 = batch['mkpts0_f'].cpu().numpy()
    mkpts1 = batch['mkpts1_f'].cpu().numpy()
    mconf = batch['mconf'].cpu().numpy()
    print(f'运行时间：{rt1:.4f}', mkpts0.shape, mkpts1.shape, mconf)

代码运行效果如下所示，可以看到匹配一对图片平均需要约0.19s（笔记本，1060显卡）；换成台式机3060显卡，预计每对图片约0.05s。

运行时间：0.1933 (32, 2) (32, 2) [0.22855578 0.21740437 0.34927088 0.28389925 0.27157754 0.26966828
 0.22636016 0.22058277 0.20475665 0.20878278 0.22838292 0.25448585
 0.27047077 0.34403533 0.22612476 0.2044811  0.26239234 0.32797554
 0.2263804  0.26544347 0.3401669  0.39336586 0.3473139  0.28230694
 0.23061718 0.23949552 0.46178365 0.3540019  0.5322925  0.27200237
 0.26731068 0.39827508]

3、使用loftr2onnx库进行配准

3.1 基本使用

也可以基于 https://github.com/oooooha/loftr2onnx 项目进行图像配准
在这里插入图片描述
使用代码如下


from loftr_wrapper import LoFTRWrapper as LoFTR
import torch
from imgutils import *
import time
# Initialize the loftr2onnx wrapper (it uses its own default config) and load
# the same pretrained indoor weights as the official LoFTR demo.
matcher = LoFTR()
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1 = r'C:\Users\hpg\Pictures\t1.jpg'
p2 = r'C:\Users\hpg\Pictures\t2.jpg'
# read_img_as_tensor comes from imgutils (not shown here); it is unpacked as
# (network input tensor, image) -- presumably already on the GPU, TODO confirm.
t1, im1 = read_img_as_tensor(p1, (384, 384))
t2, im2 = read_img_as_tensor(p2, (384, 384))

# Inference
with torch.no_grad():
    # Warm-up call, deliberately excluded from the timing below.
    result = matcher(t1, t2)

    # CUDA kernels are launched asynchronously; synchronize before reading the
    # clock so the measurement covers the actual GPU work.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    times = 10
    for _ in range(times):
        result = matcher(t1, t2)
    torch.cuda.synchronize()
    rt1 = (time.perf_counter() - t0) / times  # average seconds per image pair

    # The wrapper returns a dict instead of mutating its input.
    mkpts0 = result['keypoints0'].cpu().numpy()
    mkpts1 = result['keypoints1'].cpu().numpy()
    mconf = result['confidence'].cpu().numpy()
    print(f'运行时间：{rt1:.4f}', mkpts0.shape, mkpts1.shape, mconf)

代码输出如下所示，可以看到与LoFTR项目的输出有所差异

运行时间：0.1925 (212, 2) (212, 2) [0.4566688  0.53420454 0.5319168  0.5320238  0.46744433 0.4068214
 0.5363396  0.45674214 0.60001785 0.6576139  0.53006035 0.59590924
 0.5725811  0.5505655  0.44364485 0.40315574 0.4293331  0.5060973
 0.6550978  0.52451503 0.553644   0.63088214 0.6906601  0.61668074
 0.4543735  0.4138872  0.4332955  0.47855106 0.60533136 0.6735143
 0.7912271  0.7220486  0.75414115 0.75669855 0.60389113 0.40305066
 0.71130437 0.6583284  0.5403245  0.5433615  0.40149704 0.6673844
 0.4093839  0.5410701  0.51509964 0.42121148 0.68238974 0.55247396
 0.5116625  0.8369319  0.53321654 0.5323315  0.5779519  0.64705926
 0.43591025 0.40134645 0.4599252  0.46620858 0.6388375  0.8354758
 0.515318   0.6521981  0.54744494 0.64528877 0.7466613  0.6359517
 0.58179545 0.4587202  0.4856584  0.42029297 0.43322447 0.43220758
 0.6896481  0.79645556 0.5817581  0.75245494 0.5786756  0.7251559
 0.814531   0.49031648 0.46484298 0.54241467 0.5943087  0.7245115
 0.6457875  0.8097793  0.7199513  0.49220178 0.5443373  0.4086104
 0.5046131  0.7193697  0.6752727  0.41796637 0.5513792  0.7087418
 0.7779165  0.75016826 0.68525094 0.58962977 0.6315668  0.4913085
 0.56355244 0.41288543 0.52281946 0.42782715 0.43921712 0.5216018
 0.5566503  0.78442967 0.6013023  0.42023212 0.43102428 0.61564064
 0.40717542 0.49634054 0.45509326 0.4511342  0.41775596 0.55897176
 0.56803375 0.6018254  0.71239305 0.44001386 0.43651453 0.6947733
 0.8648205  0.4988858  0.40208712 0.71607304 0.9030141  0.5543826
 0.49472648 0.5359598  0.74733096 0.6617334  0.7066015  0.725677
 0.43446922 0.5126569  0.52367914 0.45096788 0.4248741  0.43285275
 0.723374   0.86523044 0.65740126 0.427191   0.4776224  0.4801826
 0.4530296  0.4275035  0.527438   0.52301216 0.58992577 0.41727343
 0.48609605 0.7365703  0.6339512  0.6379226  0.4489899  0.41325048
 0.5010124  0.49238032 0.57079905 0.62783945 0.5092921  0.5726387
 0.60590863 0.44714844 0.6284152  0.40801758 0.40126294 0.4221419
 0.52245826 0.70989937 0.49206337 0.553483   0.4956581  0.4180697
 0.6228596  0.6543849  0.7747963  0.61180156 0.60290194 0.5421194
 0.6149054  0.48783877 0.40048426 0.47044232 0.40145218 0.42359856
 0.68902797 0.44713116 0.84827214 0.48961237 0.6137104  0.7752426
 0.7184252  0.71058017 0.47483382 0.7151901  0.78853625 0.66988254
 0.7502565  0.42592585 0.49173304 0.4657402  0.59592575 0.42850277
 0.4645101  0.5070625 ]

3.2 差异分析

1、这主要是loftr2onnx项目通过loftr_wrapper对LoFTR的forward流程进行了调整。

#!/usr/bin/env python
import copy
import os
import sys
from typing import Any, Dict

import torch
from einops.einops import rearrange

# Make the bundled LoFTR sources importable regardless of the working directory.
_CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(_CURRENT_DIR, "LoFTR"))

from loftr import LoFTR, default_cfg

# Private copy of the default config with the coarse-stage "temp_bug_fix" enabled.
DEFAULT_CFG = copy.deepcopy(default_cfg)
DEFAULT_CFG["coarse"]["temp_bug_fix"] = True


class LoFTRWrapper(LoFTR):
    """LoFTR variant whose forward takes two image tensors and returns a dict.

    The parent model is driven through an internal ``data`` dict; this wrapper
    builds that dict from the two inputs, runs the same stages, and returns
    only the matched keypoints and their confidence under renamed keys.
    """

    def __init__(
        self,
        config: Dict[str, Any] = DEFAULT_CFG,
    ):
        """Build the model.

        Args:
            config: Model configuration. Defaults to the module-level
                ``DEFAULT_CFG`` object, which is shared between instances.
        """
        super().__init__(config)

    def forward(
        self,
        image0: torch.Tensor,
        image1: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """Match two image batches.

        Args:
            image0: First image batch; dim 0 is the batch size and dims 2+
                are the spatial size.
            image1: Second image batch, same layout as ``image0``.

        Returns:
            Dict with the tensors ``keypoints0``, ``keypoints1`` and
            ``confidence``.

        Raises:
            TypeError: If one of the expected outputs is not a tensor.
        """
        data = {
            "image0": image0,
            "image1": image1,
        }
        del image0, image1

        data.update(
            {
                "bs": data["image0"].size(0),
                "hw0_i": data["image0"].shape[2:],
                "hw1_i": data["image1"].shape[2:],
            }
        )

        # 1. backbone features
        if data["hw0_i"] == data["hw1_i"]:  # faster & better BN convergence
            feats_c, feats_f = self.backbone(
                torch.cat([data["image0"], data["image1"]], dim=0)
            )
            (feat_c0, feat_c1), (feat_f0, feat_f1) = (
                feats_c.split(data["bs"]),
                feats_f.split(data["bs"]),
            )
        else:  # handle different input shapes
            (feat_c0, feat_f0), (feat_c1, feat_f1) = (
                self.backbone(data["image0"]),
                self.backbone(data["image1"]),
            )

        data.update(
            {
                "hw0_c": feat_c0.shape[2:],
                "hw1_c": feat_c1.shape[2:],
                "hw0_f": feat_f0.shape[2:],
                "hw1_f": feat_f1.shape[2:],
            }
        )

        # 2. coarse-level loftr module
        # add featmap with positional encoding, then flatten it to sequence [N, HW, C]
        feat_c0 = rearrange(self.pos_encoding(feat_c0), "n c h w -> n (h w) c")
        feat_c1 = rearrange(self.pos_encoding(feat_c1), "n c h w -> n (h w) c")

        mask_c0 = mask_c1 = None  # mask is useful in training
        if "mask0" in data:
            mask_c0, mask_c1 = data["mask0"].flatten(-2), data["mask1"].flatten(-2)
        feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, mask_c1)

        # 3. match coarse-level
        self.coarse_matching(feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1)

        # 4. fine-level refinement
        feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(
            feat_f0, feat_f1, feat_c0, feat_c1, data
        )
        if feat_f0_unfold.size(0) != 0:  # at least one coarse level predicted
            feat_f0_unfold, feat_f1_unfold = self.loftr_fine(
                feat_f0_unfold, feat_f1_unfold
            )

        # 5. match fine-level
        self.fine_matching(feat_f0_unfold, feat_f1_unfold, data)

        # Expose only the final matches, under wrapper-specific key names.
        rename_keys: Dict[str, str] = {
            "mkpts0_f": "keypoints0",
            "mkpts1_f": "keypoints1",
            "mconf": "confidence",
        }
        out: Dict[str, torch.Tensor] = {}
        for k, v in rename_keys.items():
            _d = data[k]
            if isinstance(_d, torch.Tensor):
                out[v] = _d
            else:
                raise TypeError(
                    f"Expected torch.Tensor for item `{k}`. Gotcha {type(_d)}"
                )
        del data

        return out

2、其次，两个项目的默认配置（cfg）也有所不同

loftr2onnx中的默认配置在loftr\utils\cvpr_ds_config.py中，如下所示。可以看到 _CN.MATCH_COARSE.THR与_CN.MATCH_COARSE.BORDER_RM 是做过修改的，与默认值不同

from yacs.config import CfgNode as CN


def lower_config(yacs_cfg):
    """Recursively convert a yacs ``CfgNode`` into a dict with lower-cased keys.

    Any value that is not a ``CfgNode`` is returned unchanged.
    """
    if not isinstance(yacs_cfg, CN):
        return yacs_cfg
    return {k.lower(): lower_config(v) for k, v in yacs_cfg.items()}


_CN = CN()
_CN.BACKBONE_TYPE = 'ResNetFPN'
_CN.RESOLUTION = (8, 2)  # options: [(8, 2), (16, 4)]
_CN.FINE_WINDOW_SIZE = 5  # window_size in fine_level, must be odd
_CN.FINE_CONCAT_COARSE_FEAT = True

# 1. LoFTR-backbone (local feature CNN) config
_CN.RESNETFPN = CN()
_CN.RESNETFPN.INITIAL_DIM = 128
_CN.RESNETFPN.BLOCK_DIMS = [128, 196, 256]  # s1, s2, s3

# 2. LoFTR-coarse module config
_CN.COARSE = CN()
_CN.COARSE.D_MODEL = 256
_CN.COARSE.D_FFN = 256
_CN.COARSE.NHEAD = 8
_CN.COARSE.LAYER_NAMES = ['self', 'cross'] * 4
_CN.COARSE.ATTENTION = 'linear'  # options: ['linear', 'full']
_CN.COARSE.TEMP_BUG_FIX = False

# 3. Coarse-Matching config
_CN.MATCH_COARSE = CN()
_CN.MATCH_COARSE.THR = 0.4  # thresh default=0.2
_CN.MATCH_COARSE.BORDER_RM = 4  # border default=2
_CN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax'  # options: ['dual_softmax', 'sinkhorn']
_CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1
_CN.MATCH_COARSE.SKH_ITERS = 3
_CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0
_CN.MATCH_COARSE.SKH_PREFILTER = True
_CN.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.4  # training tricks: save GPU memory
_CN.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200  # training tricks: avoid DDP deadlock

# 4. LoFTR-fine module config
_CN.FINE = CN()
_CN.FINE.D_MODEL = 128
_CN.FINE.D_FFN = 128
_CN.FINE.NHEAD = 8
_CN.FINE.LAYER_NAMES = ['self', 'cross'] * 1
_CN.FINE.ATTENTION = 'linear'

# Plain-dict view of the config with lower-case keys.
default_cfg = lower_config(_CN)

4、基于LoFTR提取重叠区域

4.1 使用LoFTR库

这里的imgutils 与前文说明的有细微区别，打开https://blog.csdn.net/a486259/article/details/124824892 将章节1的代码保存为imgutils.py即可

from src.loftr import LoFTR, default_cfg
import torch
from imgutils import *
import time
# Initialize LoFTR with the default config and the pretrained indoor weights.
matcher = LoFTR(config=default_cfg)
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1 = r'C:\Users\hpg\Pictures\t1.jpg'
p2 = r'C:\Users\hpg\Pictures\t2.jpg'
# read_img_as_tensor_gray comes from imgutils (not shown here); it is unpacked
# as (network input tensor, image) -- presumably already on the GPU, TODO confirm.
t1, im1 = read_img_as_tensor_gray(p1, (384, 384))
t2, im2 = read_img_as_tensor_gray(p2, (384, 384))
batch = {'image0': t1, 'image1': t2}

# Inference
with torch.no_grad():
    t0 = time.time()
    times = 1
    for _ in range(times):
        matcher(batch)
    # CUDA kernels run asynchronously; wait for them before reading the clock.
    torch.cuda.synchronize()
    rt1 = (time.time() - t0) / times

    # Keep the results as CPU tensors rather than NumPy arrays: the drawing
    # loop and getGoodMatchPoint below rely on tensor methods.
    mkpts0 = batch['mkpts0_f'].cpu()
    mkpts1 = batch['mkpts1_f'].cpu()
    confidence = batch['mconf'].cpu()
    print(f'运行时间：{rt1:.4f}', tuple(mkpts0.shape), tuple(mkpts1.shape))

import cv2 as cv

pt_num = mkpts0.shape[0]
im_dst, im_res = im1, im2

# Side-by-side canvas: im_dst on the left, a 10 px gap, im_res on the right.
# Assumes both images are H x W x 3 arrays -- TODO confirm against imgutils.
# `np` is expected to come from `from imgutils import *`.
h_dst, w_dst = im_dst.shape[:2]
h_res, w_res = im_res.shape[:2]
img = np.zeros((max(h_dst, h_res), w_dst + w_res + 10, 3))
img[:h_dst, :w_dst] = im_dst
img[:h_res, -w_res:] = im_res
img = img.astype(np.uint8)

# Column at which the right image starts on the canvas (includes the gap).
x_offset = img.shape[1] - w_res

match_threshold = 0.6
for pt0, pt1, conf in zip(mkpts0.tolist(), mkpts1.tolist(), confidence.tolist()):
    if conf > match_threshold:
        # OpenCV wants plain Python ints for point coordinates.
        start = (int(pt0[0]), int(pt0[1]))
        end = (int(pt1[0]) + x_offset, int(pt1[1]))
        cv.line(img, start, end, (0, 255, 0), 1)
myimshow(img, size=12)

import cv2
def getGoodMatchPoint(mkpts0, mkpts1, confidence, match_threshold: float = 0.5):
    """Return the matched point pairs whose confidence exceeds a threshold.

    Args:
        mkpts0: Matched points of the first image, one ``(x, y)`` row per
            match. A torch tensor (any device) or anything array-like.
        mkpts1: Matched points of the second image, same layout as ``mkpts0``.
        confidence: One confidence value per match.
        match_threshold: A pair is kept only when its confidence is strictly
            greater than this value. A value outside ``[0, 1]`` is reported
            on stdout but still used.

    Returns:
        Two float64 arrays of shape ``(k, 2)`` with the kept points of the
        first and second image; rows with the same index form a pair. When
        nothing is kept both arrays have shape ``(0, 2)``.
    """

    def _as_array(values):
        # Torch tensors expose detach(); move them to host memory first.
        if hasattr(values, "detach"):
            values = values.detach().cpu().numpy()
        return np.asarray(values, dtype=np.float64)

    if match_threshold > 1 or match_threshold < 0:
        print("match_threshold error!")

    pts0 = _as_array(mkpts0)
    pts1 = _as_array(mkpts1)
    conf = _as_array(confidence).reshape(-1)

    # Only indices present in both point sets can form a pair.
    n = min(len(pts0), len(pts1))
    if n == 0:
        return np.empty((0, 2)), np.empty((0, 2))

    keep = conf[:n] > match_threshold
    return pts0[:n, :2][keep], pts1[:n, :2][keep]
pts_src, pts_dst = getGoodMatchPoint(mkpts0, mkpts1, confidence)

# A homography has 8 degrees of freedom, so at least 4 point pairs are needed.
if len(pts_src) < 4:
    raise RuntimeError(
        f"Only {len(pts_src)} matches passed the confidence filter; "
        "at least 4 are required to estimate a homography."
    )

# h1 maps points of im_dst onto im_res; 8 is the RANSAC reprojection threshold in pixels.
h1, status = cv2.findHomography(pts_src, pts_dst, cv2.RANSAC, 8)
if h1 is None:
    raise RuntimeError("cv2.findHomography could not estimate a homography from the matches.")

out_size = (im_dst.shape[1], im_dst.shape[0])  # (width, height)
im_out1 = cv2.warpPerspective(im_dst, h1, out_size)
# flags must be passed by keyword: the 4th positional parameter of the Python
# binding is `dst`, so a bare positional 16 would not enable inverse mapping.
# NOTE(review): inverse mapping is the presumed intent of the original `16`.
im_out2 = cv2.warpPerspective(
    im_res, h1, out_size, flags=cv2.INTER_LINEAR | cv2.WARP_INVERSE_MAP
)
# At this point im_res and im_out1 are registered (aligned) with each other.
myimshowsCL([im_dst, im_out1, im_res, im_out2], rows=2, cols=2, size=6)

在这里插入图片描述
代码运行报错：匹配到的点太少（计算单应矩阵至少需要4对匹配点），无法计算变换矩阵，也就无法提取重叠区。

4.2 使用loftr2onnx库

这里的imgutils 与前文说明的有细微区别，打开https://blog.csdn.net/a486259/article/details/124824892 将章节1的代码保存为imgutils.py即可

使用代码如下


from loftr_wrapper import LoFTRWrapper as LoFTR
import torch
from imgutils import *
import time
# Initialize the loftr2onnx wrapper (it uses its own default config) and load
# the pretrained indoor weights.
matcher = LoFTR()
matcher.load_state_dict(torch.load("weights/indoor_ds_new.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

p1 = r'C:\Users\hpg\Pictures\t1.jpg'
p2 = r'C:\Users\hpg\Pictures\t2.jpg'
# read_img_as_tensor_gray comes from imgutils (not shown here); it is unpacked
# as (network input tensor, image) -- presumably already on the GPU, TODO confirm.
t1, im1 = read_img_as_tensor_gray(p1, (384, 384))
t2, im2 = read_img_as_tensor_gray(p2, (384, 384))

# Inference
with torch.no_grad():
    t0 = time.time()
    times = 1
    for _ in range(times):
        result = matcher(t1, t2)
    # CUDA kernels run asynchronously; wait for them before reading the clock.
    torch.cuda.synchronize()
    rt1 = (time.time() - t0) / times

    # Results are kept as tensors: getGoodMatchPoint below uses tensor methods.
    mkpts0 = result['keypoints0']
    mkpts1 = result['keypoints1']
    confidence = result['confidence']
    print(f'运行时间：{rt1:.4f}', mkpts0.shape, mkpts1.shape, confidence)

import cv2 as cv

pt_num = mkpts0.shape[0]
im_dst, im_res = im1, im2

# Side-by-side canvas: im_dst on the left, a 10 px gap, im_res on the right.
# Assumes both images are H x W x 3 arrays -- TODO confirm against imgutils.
# `np` is expected to come from `from imgutils import *`.
h_dst, w_dst = im_dst.shape[:2]
h_res, w_res = im_res.shape[:2]
img = np.zeros((max(h_dst, h_res), w_dst + w_res + 10, 3))
img[:h_dst, :w_dst] = im_dst
img[:h_res, -w_res:] = im_res
img = img.astype(np.uint8)

# Column at which the right image starts on the canvas (includes the gap).
x_offset = img.shape[1] - w_res

match_threshold = 0.01
for pt0, pt1, conf in zip(mkpts0.tolist(), mkpts1.tolist(), confidence.tolist()):
    if conf > match_threshold:
        # OpenCV wants plain Python ints for point coordinates.
        start = (int(pt0[0]), int(pt0[1]))
        end = (int(pt1[0]) + x_offset, int(pt1[1]))
        cv.line(img, start, end, (0, 255, 0), 1)
myimshow(img, size=12)

import cv2
def getGoodMatchPoint(mkpts0, mkpts1, confidence, match_threshold: float = 0.5):
    """Return the matched point pairs whose confidence exceeds a threshold.

    Args:
        mkpts0: Matched points of the first image, one ``(x, y)`` row per
            match. A torch tensor (any device) or anything array-like.
        mkpts1: Matched points of the second image, same layout as ``mkpts0``.
        confidence: One confidence value per match.
        match_threshold: A pair is kept only when its confidence is strictly
            greater than this value. A value outside ``[0, 1]`` is reported
            on stdout but still used.

    Returns:
        Two float64 arrays of shape ``(k, 2)`` with the kept points of the
        first and second image; rows with the same index form a pair. When
        nothing is kept both arrays have shape ``(0, 2)``.
    """

    def _as_array(values):
        # Torch tensors expose detach(); move them to host memory first.
        if hasattr(values, "detach"):
            values = values.detach().cpu().numpy()
        return np.asarray(values, dtype=np.float64)

    if match_threshold > 1 or match_threshold < 0:
        print("match_threshold error!")

    pts0 = _as_array(mkpts0)
    pts1 = _as_array(mkpts1)
    conf = _as_array(confidence).reshape(-1)

    # Only indices present in both point sets can form a pair.
    n = min(len(pts0), len(pts1))
    if n == 0:
        return np.empty((0, 2)), np.empty((0, 2))

    keep = conf[:n] > match_threshold
    return pts0[:n, :2][keep], pts1[:n, :2][keep]
pts_src, pts_dst = getGoodMatchPoint(mkpts0, mkpts1, confidence)

# A homography has 8 degrees of freedom, so at least 4 point pairs are needed.
if len(pts_src) < 4:
    raise RuntimeError(
        f"Only {len(pts_src)} matches passed the confidence filter; "
        "at least 4 are required to estimate a homography."
    )

# h1 maps points of im_dst onto im_res; 4 is the RANSAC reprojection threshold in pixels.
h1, status = cv2.findHomography(pts_src, pts_dst, cv2.RANSAC, 4)
if h1 is None:
    raise RuntimeError("cv2.findHomography could not estimate a homography from the matches.")

print(im_dst.shape, im_dst.dtype, im_dst.max(), im_res.shape, im_res.dtype, im_res.max(), h1)

out_size = (im_dst.shape[1], im_dst.shape[0])  # (width, height)
im_out1 = cv2.warpPerspective(im_dst, h1, out_size)
# flags must be passed by keyword: the 4th positional parameter of the Python
# binding is `dst`, so a bare positional 16 would not enable inverse mapping.
# NOTE(review): inverse mapping is the presumed intent of the original `16`.
im_out2 = cv2.warpPerspective(
    im_res, h1, out_size, flags=cv2.INTER_LINEAR | cv2.WARP_INVERSE_MAP
)
# At this point im_res and im_out1 are registered (aligned) with each other.
myimshowsCL([im_dst, im_out1, im_res, im_out2], rows=2, cols=2, size=6)