Paper: Decoupled Local Aggregation for Point Cloud Learning
Code: https://github.com/Matrix-ASC/DeLA
1. Dataloader
Data loading pipeline: preprocess the raw data into .pth files -> define the S3DIS dataset class -> instantiate S3DIS from the config file -> read batches through the DataLoader.
Config file: https://github.com/Matrix-ASC/DeLA/blob/main/S3DIS/config.py
```python
s3dis_args.k = [24, 24, 24, 24]
s3dis_args.grid_size = [0.04, 0.08, 0.16, 0.32]
s3dis_args.max_pts = 30000
```
Dataset instantiation code: https://github.com/Matrix-ASC/DeLA/blob/main/S3DIS/train.py
```python
traindlr = DataLoader(S3DIS(s3dis_args, partition="!5", loop=30),
                      batch_size=batch_size, collate_fn=s3dis_collate_fn, shuffle=True,
                      pin_memory=True, persistent_workers=True, drop_last=True, num_workers=16)
warmup_fn(model, S3DIS(s3dis_warmup_args, partition="!5", loop=batch_size, warmup=True))
```
Dataset definition code: https://github.com/Matrix-ASC/DeLA/blob/main/S3DIS/s3dis.py
- Parameters:
```python
class S3DIS(Dataset):
    r"""
    partition => areas, can be "2" "23", "!23" === "1456"
    train=True  => training   train=False => validating   test=True => testing
    warmup=True => warmup

    args:
    k         => k in knn, [k1, k2, ..., kn]
    grid_size => as in subsampling, [0.04, 0.06, ..., 0.3]
                 if warmup is True, should be estimated (lower) downsampling ratio
                 except the first: [0.04, 2, ..., 2.5]
    max_pts   => optional, max points per sample when training
    """
    def __init__(self, args, partition="!5", loop=30, train=True, test=False, warmup=False):
        ...
```
The dataset contains six areas, area1 through area6. Area 5 is held out as the test set; the remaining areas are used for training. k is the number of neighbors and is set to 24 at every level. The grid sizes are [0.04, 0.08, 0.16, 0.32]. loop=30 means each epoch iterates over the whole dataset 30 times.
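To make the partition and loop semantics concrete, here is a minimal sketch (hypothetical helpers, not the repo's code) of how an area filter such as "!5" and a loop factor can be implemented:

```python
# Minimal sketch, not the repo's code: "!5" keeps every area except 5,
# and loop repeats the scene list so one epoch visits each scene `loop` times.
def select_areas(scene_names, partition="!5"):
    invert = partition.startswith("!")
    areas = set(partition.lstrip("!"))                           # e.g. {"5"}
    return [n for n in scene_names if (n[5] in areas) != invert]  # n like "area_5_office_3"

class LoopedDataset:
    def __init__(self, scenes, loop=30):
        self.scenes, self.loop = scenes, loop
    def __len__(self):
        return len(self.scenes) * self.loop
    def __getitem__(self, idx):
        # the same scene is revisited `loop` times per epoch,
        # each time with fresh random augmentations
        return self.scenes[idx % len(self.scenes)]
```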
- Training-time data augmentation: random z-axis rotation with random scaling, small Gaussian jitter, and a shift so the minimum corner of the cloud sits at the origin.
```python
if self.train:
    angle = random.random() * 2 * math.pi
    cos, sin = math.cos(angle), math.sin(angle)
    rotmat = torch.tensor([[cos, sin, 0], [-sin, cos, 0], [0, 0, 1]])
    rotmat *= random.uniform(0.8, 1.2)
    xyz = xyz @ rotmat
    xyz += torch.empty_like(xyz).normal_(std=0.005)
    xyz -= xyz.min(dim=0)[0]
```
- Voxel grid subsampling:
```python
# here grid size is assumed 0.04, so estimated downsampling ratio is ~14
if self.train:
    indices = grid_subsampling(xyz, self.grid_size[0], 2.5 / 14)
else:
    indices = grid_subsampling_test(xyz, self.grid_size[0], 2.5 / 14, pick=0)
xyz = xyz[indices]
```
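grid_subsampling is a compiled extension in the repo; conceptually it keeps one representative point per occupied voxel. A rough pure-PyTorch sketch of the idea (the real op's extra ratio/pick arguments are not modeled here):

```python
import torch

def grid_subsample_sketch(xyz: torch.Tensor, grid_size: float) -> torch.Tensor:
    """Indices of one point per occupied voxel; conceptual stand-in for grid_subsampling."""
    voxel = torch.div(xyz - xyz.min(dim=0)[0], grid_size, rounding_mode="floor").long()
    _, inv = torch.unique(voxel, dim=0, return_inverse=True)   # voxel id per point
    n = xyz.shape[0]
    first = torch.full((int(inv.max()) + 1,), n, dtype=torch.long)
    first.scatter_reduce_(0, inv, torch.arange(n), reduce="amin")  # first point per voxel
    return first
```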
- If a training sample has more than max_pts = 30000 points, keep the 30000 points closest to a randomly chosen point:
```python
if xyz.shape[0] > self.max_pts and self.train:
    pt = random.choice(xyz)
    condition = (xyz - pt).square().sum(dim=1).argsort()[:self.max_pts].sort()[0]  # sort to preserve locality
    xyz = xyz[condition]
    indices = indices[condition]
```
- With probability 0.2 the color is dropped (zeroed); otherwise, with another probability of 0.2, a color contrast augmentation is applied, and the colors are rescaled by 1/250:
```python
if self.train and random.random() < 0.2:
    col.fill_(0.)
else:
    if self.train and random.random() < 0.2:
        colmin = col.min(dim=0, keepdim=True)[0]
        colmax = col.max(dim=0, keepdim=True)[0]
        scale = 255 / (colmax - colmin)
        alpha = random.random()
        col = (1 - alpha + alpha * scale) * col - alpha * colmin * scale
    col.mul_(1 / 250.)
```
- Height appending: the z coordinate is concatenated to the color as an extra feature:
```python
height = xyz[:, 2:]
feature = torch.cat([col, height], dim=1)
```
- KNN to obtain neighbor indices for all levels:
```python
indices = []
self.knn(xyz, self.grid_size[::-1], self.k[::-1], indices)
```
- xyz is scaled by 40 (note that 40 × the grid sizes [0.04, 0.08, 0.16, 0.32] gives exactly the cor_std values [1.6, 3.2, 6.4, 12.8] in the network config):
```python
xyz.mul_(40)
return xyz, feature, indices, lbl
```
- KNN code:
```python
def knn(self, xyz: torch.Tensor, grid_size: list, k: list, indices: list, full_xyz: torch.Tensor = None):
    """
    presubsampling and knn search
    return indices: knn1, sub1, knn2, sub2, knn3, back_knn1, back_knn2
    """
    first = full_xyz is None
    last = len(k) == 1
    gs = grid_size.pop()
    if first:
        full_xyz = xyz
    else:
        if self.warmup:
            sub_indices = torch.randperm(xyz.shape[0])[:int(xyz.shape[0] / gs)].contiguous()
        else:
            sub_indices = grid_subsampling(xyz, gs)
        xyz = xyz[sub_indices]
        indices.append(sub_indices)
    kdt = KDTree(xyz)
    indices.append(kdt.knn(xyz, k.pop(), False)[0])
    if not last:
        self.knn(xyz, grid_size, k, indices, full_xyz)
    if not first:
        back = kdt.knn(full_xyz, 1, False)[0].squeeze(-1)
        indices.append(back)
    return
```
The recursion produces the subsampling and KNN indices for every level, laid out as shown below.
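For the four-level S3DIS setting, the recursion leaves indices in the layout below (inferred from the docstring above; the training loop later reverses this list before handing it to the model):

```python
# [knn1, sub1, knn2, sub2, knn3, sub3, knn4, back4, back3, back2]
#   knni : k-NN indices among the points of level i
#   subi : subsample indices mapping level-i points -> level-(i+1) points
#   backi: 1-NN index of every level-1 point among the level-i points,
#          appended while the recursion unwinds; used to upsample every
#          scale directly back to level-1 resolution
```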
collate_fn code:
- Training collate: samples are concatenated along the point dimension, and each sample's indices are offset so they remain valid in the merged batch; the per-sample point counts at each level are collected into pts:
```python
def s3dis_collate_fn(batch):
    """
    [[xcil], [xcil], ...]
    """
    xyz, col, indices, lbl = list(zip(*batch))
    depth = (len(indices[0]) + 2) // 3
    cnt1 = [0] * depth
    pts = []
    for ids in indices:
        pts.extend(x.shape[0] for x in ids[:2*depth:2])
        cnt2 = []
        fix_indices(ids[::-1], cnt1[::-1], cnt2)
        cnt1 = cnt2
    xyz = torch.cat(xyz, dim=0)
    col = torch.cat(col, dim=0)
    lbl = torch.cat(lbl, dim=0)
    indices = [torch.cat(ids, dim=0) for ids in zip(*indices)]
    pts = torch.tensor(pts, dtype=torch.int64).view(-1, depth).transpose(0, 1).contiguous()
    return xyz, col, indices, pts, lbl
```
The collated batch: xyz holds the coordinates, col the color features, indices the KNN/subsampling indices, pts the per-sample point counts at each level, and lbl the labels.
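fix_indices itself is not shown here; judging from the surrounding code, it shifts each sample's indices by the point counts accumulated so far at the matching level. The idea in a few lines (illustrative, not the repo's code):

```python
import torch

# When samples with N1 and N2 points are concatenated, the second sample's
# neighbor indices must be shifted by N1 to stay valid in the merged tensor.
a_knn = torch.tensor([[0, 1], [1, 0]])            # sample A: 2 points
b_knn = torch.tensor([[0, 2], [2, 1], [1, 0]])    # sample B: 3 points
merged = torch.cat([a_knn, b_knn + a_knn.shape[0]], dim=0)
print(merged)  # B's rows now index points 2..4 of the merged cloud
```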
- At test time the batch size is 1, and the batch is simply the full point set of a single sample:
```python
def s3dis_test_collate_fn(batch):
    return batch[0]
```
2. Network
Config file: https://github.com/Matrix-ASC/DeLA/blob/main/S3DIS/config.py
```python
dela_args = SimpleNamespace()
dela_args.ks = s3dis_args.k
dela_args.depths = [4, 4, 8, 4]
dela_args.dims = [64, 128, 256, 512]
dela_args.nbr_dims = [32, 32]
dela_args.head_dim = 256
dela_args.num_classes = 13
drop_path = 0.1
drop_rates = torch.linspace(0., drop_path, sum(dela_args.depths)).split(dela_args.depths)
dela_args.drop_paths = [dpr.tolist() for dpr in drop_rates]
dela_args.head_drops = torch.linspace(0., 0.15, len(dela_args.depths)).tolist()
dela_args.bn_momentum = 0.02
dela_args.act = nn.GELU
dela_args.mlp_ratio = 2
# gradient checkpoint
dela_args.use_cp = False
dela_args.cor_std = [1.6, 3.2, 6.4, 12.8]
```
With drop_path = 0.1, the 20 per-block rates increase linearly from 0 to 0.1 and are split across the four stages according to depths [4, 4, 8, 4], so deeper blocks receive higher drop-path rates, as the snippet below confirms.
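Re-running the config's linspace/split logic shows the actual per-stage rates:

```python
import torch

depths = [4, 4, 8, 4]
drop_path = 0.1
rates = torch.linspace(0., drop_path, sum(depths)).split(depths)
for stage, dpr in enumerate(rates):
    print(stage, [round(v, 4) for v in dpr.tolist()])
# stage 0 starts at 0.0; the last block of stage 3 reaches the full 0.1
```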
Model instantiation code: https://github.com/Matrix-ASC/DeLA/blob/main/S3DIS/train.py
```python
model = DelaSemSeg(dela_args).cuda()
```
Model definition code:
- DelaSemSeg definition:
Multiple stages process the input in sequence; the result is then fed to the classification head.
```python
class DelaSemSeg(nn.Module):
    r"""
    DeLA for Semantic Segmentation

    args: examples
    depths: [4, 4, ..., 4]
    dims: [128, 256, ..., 512]
    nbr_dims: [32, 32], dims in spatial encoding || 7->16->32->out->pool | 3->8->16->32->pool->out
    head_dim: 256, hidden dim in cls head
    num_classes: 13
    drop_paths: [0., 0., ..., 0.1], in-stage drop path rate, can be list of lists, len(dp[i]) = depth[i]
    head_drops: [0., 0.05, ..., 0.2], scale wise drop rate before cls head
    bn_momentum: 0.02
    act: nn.GELU
    mlp_ratio: 2, can be float
    use_cp: False, enable gradient checkpoint to save memory
            If True, blocks and spatial encoding are checkpointed
    """
    def __init__(self, args):
        super().__init__()
        # bn momentum for checkpointed layers
        args.cp_bn_momentum = 1 - (1 - args.bn_momentum) ** 0.5
        self.stage = Stage(args)
        hid_dim = args.head_dim
        out_dim = args.num_classes
        self.head = nn.Sequential(
            nn.BatchNorm1d(hid_dim, momentum=args.bn_momentum),
            args.act(),
            nn.Linear(hid_dim, out_dim),
        )
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, xyz, x, indices, pts_list=None):
        indices = indices[:]
        x, closs = self.stage(x, xyz, None, indices, pts_list)
        if self.training:
            return self.head(x), closs
        return self.head(x)
```
- Stage definition:
(1) Downsampling:
```python
# downsampling
if not self.first:
    ids = indices.pop()
    xyz = xyz[ids]
    x = self.skip_proj(x)[ids] + self.lfp(x.unsqueeze(0), prev_knn).squeeze(0)[ids]
knn = indices.pop()
```
The popped indices select the points kept after downsampling; two branches, a skip projection and an LFP over the previous stage's KNN graph, are summed to form the downsampled features.
(2) Spatial encoding:
```python
N, k = knn.shape
nbr = xyz[knn] - xyz.unsqueeze(1)
nbr = torch.cat([nbr, x[knn]], dim=-1).view(-1, 7) if self.first else nbr.view(-1, 3)
if self.training and self.cp:
    nbr.requires_grad_()
nbr_embed_func = lambda x: self.nbr_embed(x).view(N, k, -1).max(dim=1)[0]
nbr = checkpoint(nbr_embed_func, nbr) if self.training and self.cp else nbr_embed_func(nbr)
nbr = self.nbr_proj(nbr)
nbr = self.nbr_bn(nbr)
x = nbr if self.first else nbr + x
```
The neighborhood tensor passes through nbr_embed (max-pooled over the k neighbors), then nbr_proj, then nbr_bn; at the first stage the result becomes x directly, at later stages it is added to x.
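The module definitions live in the repo's delasemseg.py. As a rough illustration of the docstring's "7->16->32->out->pool" pattern for the first stage, a hypothetical nbr_embed might be built like this (names and layer details are assumptions, not the actual code):

```python
import torch.nn as nn

def make_nbr_embed(in_dim=7, hidden=(16, 32), bn_momentum=0.02, act=nn.GELU):
    """Hypothetical point-wise MLP for spatial encoding (illustration only)."""
    dims = (in_dim,) + tuple(hidden)
    layers = []
    for d_in, d_out in zip(dims[:-1], dims[1:]):
        layers += [nn.Linear(d_in, d_out, bias=False),
                   nn.BatchNorm1d(d_out, momentum=bn_momentum),
                   act()]
    return nn.Sequential(*layers)
```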
(3) Main block:
```python
# main block
knn = knn.unsqueeze(0)
pts = pts_list.pop() if pts_list is not None else None
x = checkpoint(self.local_aggregation, x, knn, pts) if self.training and self.cp else self.local_aggregation(x, knn, pts)

# get subsequent feature maps
if not self.last:
    sub_x, sub_c = self.sub_stage(x, xyz, knn, indices, pts_list)
else:
    sub_x = sub_c = None
```
local_aggregation performs the aggregation first; if a deeper stage exists, sub_stage is invoked recursively on the subsampled points.
(4) Regularization:
For each point a random neighbor rel_k is drawn from its KNN set; a small head (cor_head) predicts the scaled coordinate offset xyz[rel_k] - xyz from the feature difference x[rel_k] - x, supervised by an MSE loss. The stage returns sub_c + closs so the losses accumulate across scales.
```python
# regularization
if self.training:
    rel_k = torch.randint(self.k, (N, 1), device=x.device)
    rel_k = torch.gather(knn.squeeze(0), 1, rel_k).squeeze(1)
    rel_cor = (xyz[rel_k] - xyz)
    rel_cor.mul_(self.cor_std)
    # print(rel_cor.std(dim=0))
    rel_p = x[rel_k] - x
    rel_p = self.cor_head(rel_p)
    closs = F.mse_loss(rel_p, rel_cor)
    sub_c = sub_c + closs if sub_c is not None else closs
```
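Pulled out of the Stage class, the regularizer is easy to state as a standalone function (a sketch mirroring the code above; cor_head is assumed to be a small head mapping the feature dim to 3):

```python
import torch
import torch.nn.functional as F

def coord_regularization(x, xyz, knn, cor_head, cor_std):
    """x: (N, C) features, xyz: (N, 3) coords, knn: (N, k) neighbor indices."""
    N, k = knn.shape
    rel_k = torch.randint(k, (N, 1), device=x.device)   # one random neighbor per point
    rel_k = torch.gather(knn, 1, rel_k).squeeze(1)      # its point index
    rel_cor = (xyz[rel_k] - xyz) * cor_std              # target: scaled coordinate offset
    rel_p = cor_head(x[rel_k] - x)                      # prediction from the feature offset
    return F.mse_loss(rel_p, rel_cor)
```

Because the branch only runs under self.training, the regularizer adds no cost at inference.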
(5) Upsampling:
Features are projected, gathered back to full resolution through the 1-NN back indices, and passed through a dropout layer:
```python
x = self.postproj(x)
if not self.first:
    back_nn = indices[self.depth - 1]
    x = x[back_nn]
x = self.drop(x)
sub_x = sub_x + x if sub_x is not None else x
```
- Aggregation block (Block) definition:
```python
class Block(nn.Module):
    def __init__(self, dim, depth, drop_path, mlp_ratio, bn_momentum, act):
        super().__init__()
        self.depth = depth
        self.lfps = nn.ModuleList([LFP(dim, dim, bn_momentum) for _ in range(depth)])
        self.mlp = Mlp(dim, mlp_ratio, bn_momentum, act, 0.2)
        self.mlps = nn.ModuleList([Mlp(dim, mlp_ratio, bn_momentum, act) for _ in range(depth // 2)])
        if isinstance(drop_path, list):
            drop_rates = drop_path
            self.dp = [dp > 0. for dp in drop_path]
        else:
            drop_rates = torch.linspace(0., drop_path, self.depth).tolist()
            self.dp = [drop_path > 0.] * depth
        # print(drop_rates)
        self.drop_paths = nn.ModuleList([DropPath(dpr) for dpr in drop_rates])

    def drop_path(self, x, i, pts):
        if not self.dp[i] or not self.training:
            return x
        return torch.cat([self.drop_paths[i](xx) for xx in torch.split(x, pts, dim=1)], dim=1)

    def forward(self, x, knn, pts=None):
        x = x + self.drop_path(self.mlp(x), 0, pts)
        for i in range(self.depth):
            x = x + self.drop_path(self.lfps[i](x, knn), i, pts)
            if i % 2 == 1:
                x = x + self.drop_path(self.mlps[i // 2](x), i, pts)
        return x
```
The block applies an initial MLP, then one LFP per layer, with an extra MLP after every odd-indexed LFP; for depth 4 the residual sequence is MLP, LFP, LFP+MLP, LFP, LFP+MLP, each wrapped in drop path.
- LFP layer definition:
```python
class LFP(nn.Module):
    r"""
    Local Feature Propagation Layer
    f = linear(f)
    f_i = bn(max{f_j | j in knn_i} - f_i)
    """
    def __init__(self, in_dim, out_dim, bn_momentum, init=0.):
        super().__init__()
        self.proj = nn.Linear(in_dim, out_dim, bias=False)
        self.bn = nn.BatchNorm1d(out_dim, momentum=bn_momentum)
        nn.init.constant_(self.bn.weight, init)

    def forward(self, x, knn):
        B, N, C = x.shape
        x = self.proj(x)
        x = knn_edge_maxpooling(x, knn, self.training)
        x = self.bn(x.view(B * N, -1)).view(B, N, -1)
        return x
```
The core computation is knn_edge_maxpooling: after a linear projection, each point takes the elementwise max of its neighbors' features minus its own, followed by batch norm whose weight is initialized to init (0 by default).
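knn_edge_maxpooling is a custom CUDA op shipped with the repo. A functionally equivalent (but much slower) pure-PyTorch reference, following the docstring f_i = max{f_j | j in knn_i} - f_i:

```python
import torch

def knn_edge_maxpooling_ref(x: torch.Tensor, knn: torch.Tensor) -> torch.Tensor:
    """x: (B, N, C) features, knn: (B, N, k) neighbor indices."""
    B, N, C = x.shape
    k = knn.shape[-1]
    idx = knn.reshape(B, N * k, 1).expand(-1, -1, C)
    nbr = torch.gather(x, 1, idx).view(B, N, k, C).max(dim=2)[0]  # max over neighbors
    return nbr - x
```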
3. Train
Code: https://github.com/Matrix-ASC/DeLA/blob/main/S3DIS/train.py
- Warmup definition:
```python
def warmup_fn(model, dataset):
    model.train()
    traindlr = DataLoader(dataset, batch_size=len(dataset), collate_fn=s3dis_collate_fn,
                          pin_memory=True, num_workers=6)
    for xyz, feature, indices, pts, y in traindlr:
        xyz = xyz.cuda(non_blocking=True)
        feature = feature.cuda(non_blocking=True)
        indices = [ii.cuda(non_blocking=True).long() for ii in indices[::-1]]
        pts = pts.tolist()[::-1]
        y = y.cuda(non_blocking=True)
        with autocast():
            p, closs = model(xyz, feature, indices, pts)
            loss = F.cross_entropy(p, y) + closs
        loss.backward()
```
- Warmup invocation:
```python
warmup_fn(model, S3DIS(s3dis_warmup_args, partition="!5", loop=batch_size, warmup=True))
```
- Training loop:
```python
for i in range(start_epoch, epoch):
    model.train()
    ttls.reset()
    corls.reset()
    metric.reset()
    now = time()
    for xyz, feature, indices, pts, y in traindlr:
        lam = scheduler_step / (epoch * step_per_epoch)
        lam = 3e-3 ** lam * .25
        scheduler.step(scheduler_step)
        scheduler_step += 1
        xyz = xyz.cuda(non_blocking=True)
        feature = feature.cuda(non_blocking=True)
        indices = [ii.cuda(non_blocking=True).long() for ii in indices[::-1]]
        pts = pts.tolist()[::-1]
        y = y.cuda(non_blocking=True)
        with autocast():
            p, closs = model(xyz, feature, indices, pts)
            loss = F.cross_entropy(p, y, label_smoothing=ls)
        metric.update(p.detach(), y)
        ttls.update(loss.item())
        corls.update(closs.item())
        optimizer.zero_grad(set_to_none=True)
        scaler.scale(loss + closs * lam).backward()
        scaler.step(optimizer)
        scaler.update()
```
lam, the weight of the regularization loss, decays exponentially from 0.25 at the start of training to 0.25 × 3e-3 ≈ 7.5e-4 at the end, so the coordinate regularizer matters mostly early on.
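A quick evaluation of the schedule confirms the decay:

```python
# lam = 0.25 * (3e-3 ** t), where t is the fraction of training completed
for t in (0.0, 0.5, 1.0):
    print(t, 3e-3 ** t * 0.25)   # 0.25 -> ~0.0137 -> 0.00075
```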
- Scheduler + optimizer:
```python
from utils.timm.scheduler.cosine_lr import CosineLRScheduler
from utils.timm.optim import create_optimizer_v2

optimizer = create_optimizer_v2(model, lr=lr, weight_decay=5e-2)
scheduler = CosineLRScheduler(optimizer, t_initial=epoch * step_per_epoch, lr_min=lr / 10000,
                              warmup_t=warmup * step_per_epoch, warmup_lr_init=lr / 20)
scaler = GradScaler()
```
4. Loss
```python
with autocast():
    p, closs = model(xyz, feature, indices, pts)
    loss = F.cross_entropy(p, y, label_smoothing=ls)
scaler.scale(loss + closs * lam).backward()
```
Two losses are mixed: the cross-entropy segmentation loss (with label smoothing) and the lam-weighted coordinate regularization loss closs.
5. Metric
Evaluation reports mIoU and overall accuracy.
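The metric utilities are not reproduced in these notes; a standard confusion-matrix implementation of mIoU and overall accuracy looks like this (a sketch, not the repo's util):

```python
import torch

def miou_oa(pred: torch.Tensor, target: torch.Tensor, num_classes: int = 13):
    """Mean IoU and overall accuracy from flat predicted/true label tensors."""
    cm = torch.bincount(target * num_classes + pred, minlength=num_classes ** 2)
    cm = cm.view(num_classes, num_classes).float()
    inter = cm.diag()
    union = cm.sum(0) + cm.sum(1) - inter
    iou = inter / union.clamp(min=1)    # classes absent from both count as IoU 0 here
    return iou.mean().item(), (inter.sum() / cm.sum()).item()
```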