Background
在2D的檢測中,當前檢測目标的Bbox(Bounding box)中心點周圍往往都有實際的像素存在。但是在3D的檢測中,3D傳感器的成像特性決定了它只能對物體表面進行掃描,物體的中心周圍不會存在實際的點,所以直接在3D空間中尋找Bbox面臨著巨大的挑戰。在這樣的基礎下,作者提出了VoteNet,将Hough Voting的方法運用到了3D目标檢測當中。這種方法也被3DIoUMatch和SESS作為點雲特征提取的方法。
Progress
- 提出了一種端到端的可微架構,在3D點雲中成功使用霍夫投票的方法
- 在SUN RGB-D和ScanNet上取得了好的成績
- 深入分析了投票方法對3D目标檢測的重要性
VoteNet Pipeline
VoteNet Architecture
在上圖的Pipeline中可以看出,VoteNet主要分為兩個部分,分别是Voting in Point Clouds和Object proposal and classification from votes。前半部分的主要作用是得到預測點(即投票),後半部分的作用是根據預測點給出分類結果。
Hough Voting的基本理解
這裡我參照了這個文章來理解霍夫投票,這裡的圖示可以方便地表示出霍夫投票的過程,不得不說這個圖很形象了。
Voting in Point Clouds
在Voting in point clouds中分為兩個部分,一部分是點雲特征提取層,另一部分是投票層。其中點雲特征提取層使用了PointNet++作為backbone。輸入點雲的大小為 $N \times 3$。在經過PointNet++之後,輸出大小為 $M \times (3+C)$(其中M為種子點個數,C為特征維度),同時每個種子點還會生成一個Vote。其中的PointNet++的基本結構如下圖
具體代碼如下:
class Pointnet2Backbone(nn.Module):
    r"""
    Backbone network for point cloud feature learning.
    Based on Pointnet++ single-scale grouping network: four set abstraction
    (SA) layers followed by two feature propagation (FP) layers.

    Parameters
    ----------
    input_feature_dim: int
        Number of input channels in the feature descriptor for each point.
        e.g. 3 for RGB.
    """

    def __init__(self, input_feature_dim=0):
        super().__init__()

        self.sa1 = PointnetSAModuleVotes(
            npoint=2048,  # number of center points
            radius=0.2,  # radius
            nsample=64,  # number of sampled points
            mlp=[input_feature_dim, 64, 64, 128],  # mlp channel widths
            use_xyz=True,
            normalize_xyz=True
        )

        self.sa2 = PointnetSAModuleVotes(
            npoint=1024,
            radius=0.4,
            nsample=32,
            mlp=[128, 128, 128, 256],
            use_xyz=True,
            normalize_xyz=True
        )

        self.sa3 = PointnetSAModuleVotes(
            npoint=512,
            radius=0.8,
            nsample=16,
            mlp=[256, 128, 128, 256],
            use_xyz=True,
            normalize_xyz=True
        )

        self.sa4 = PointnetSAModuleVotes(
            npoint=256,
            radius=1.2,
            nsample=16,
            mlp=[256, 128, 128, 256],
            use_xyz=True,
            normalize_xyz=True
        )

        # Each FP layer receives the concatenation of two 256-channel inputs.
        self.fp1 = PointnetFPModule(mlp=[256+256,256,256])
        self.fp2 = PointnetFPModule(mlp=[256+256,256,256])

    def _break_up_pc(self, pc):
        """Split a (..., 3 + C) point cloud into coordinates and features.

        Returns
        -------
        xyz: the first three channels, made contiguous.
        features: the remaining channels with dims 1 and 2 swapped (channel
            first), or None when the input carries no channels beyond xyz.
        """
        xyz = pc[..., 0:3].contiguous()
        features = (
            pc[..., 3:].transpose(1, 2).contiguous()
            if pc.size(-1) > 3 else None
        )

        return xyz, features

    def forward(self, pointcloud: torch.Tensor, end_points=None):
        r"""
        Forward pass of the network

        Parameters
        ----------
        pointcloud: torch.Tensor
            (B, N, 3 + input_feature_dim) tensor
            Point cloud to run predicts on
            Each point in the point-cloud MUST
            be formated as (x, y, z, features...)
        end_points: dict, optional
            Dictionary to store results in. A new one is created only when
            None is given, so a caller-supplied dict (even an empty one) is
            filled in place and returned.

        Returns
        ----------
        end_points: {XXX_xyz, XXX_features, XXX_inds}
            XXX_xyz: float32 Tensor of shape (B,K,3)
            XXX_features: float32 Tensor of shape (B,K,D)
            XXX-inds: int64 Tensor of shape (B,K) values in [0,N-1]
        """
        # `is None` rather than truthiness: an empty dict passed by the caller
        # must be populated, not silently replaced by a fresh one.
        if end_points is None:
            end_points = {}

        xyz, features = self._break_up_pc(pointcloud)

        # --------- 4 SET ABSTRACTION LAYERS ---------
        xyz, features, fps_inds = self.sa1(xyz, features)
        end_points['sa1_inds'] = fps_inds
        end_points['sa1_xyz'] = xyz
        end_points['sa1_features'] = features

        xyz, features, fps_inds = self.sa2(xyz, features)  # this fps_inds is just 0,1,...,1023
        end_points['sa2_inds'] = fps_inds
        end_points['sa2_xyz'] = xyz
        end_points['sa2_features'] = features

        xyz, features, fps_inds = self.sa3(xyz, features)  # this fps_inds is just 0,1,...,511
        end_points['sa3_xyz'] = xyz
        end_points['sa3_features'] = features

        xyz, features, fps_inds = self.sa4(xyz, features)  # this fps_inds is just 0,1,...,255
        end_points['sa4_xyz'] = xyz
        end_points['sa4_features'] = features

        # --------- 2 FEATURE UPSAMPLING LAYERS --------
        # sa4 -> sa3 resolution, then sa3 -> sa2 resolution.
        features = self.fp1(end_points['sa3_xyz'], end_points['sa4_xyz'], end_points['sa3_features'], end_points['sa4_features'])
        features = self.fp2(end_points['sa2_xyz'], end_points['sa3_xyz'], end_points['sa2_features'], features)
        end_points['fp2_features'] = features
        end_points['fp2_xyz'] = end_points['sa2_xyz']
        num_seed = end_points['fp2_xyz'].shape[1]
        end_points['fp2_inds'] = end_points['sa1_inds'][:,0:num_seed]  # indices among the entire input point clouds
        return end_points
if __name__ == '__main__':
    # Smoke test: build the backbone on the GPU, push one random batch
    # through it in eval mode and list the shape of every returned entry.
    backbone_net = Pointnet2Backbone(input_feature_dim=3).cuda()
    print(backbone_net)
    backbone_net.eval()
    dummy_cloud = torch.rand(16, 20000, 6).cuda()
    out = backbone_net(dummy_cloud)
    for key in sorted(out):
        print(key, '\t', out[key].shape)
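投票層使用MLP将種子點生成Vote。其中MLP以種子點特征 $f_i$ 作為輸入,并且輸出在歐式空間下的坐标偏移量 $\Delta x_i \in \mathbb{R}^3$ 和特征偏移量 $\Delta f_i \in \mathbb{R}^C$。這兩個偏移量加上原本的坐标和特征便成為了Vote,也就是 $y_i = x_i + \Delta x_i$ 和 $g_i = f_i + \Delta f_i$。對于預測的Loss如下:
$$L_{\text{vote-reg}} = \frac{1}{M_{pos}} \sum_i \lVert \Delta x_i - \Delta x_i^* \rVert \, \mathbb{1}[s_i \text{ on object}]$$
其中 $\Delta x_i^*$ 是種子點坐标 $x_i$ 到其所屬物體Bbox中心的真實偏移量,$M_{pos}$ 是位于物體表面上的種子點個數。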
代碼如下:
class VotingModule(nn.Module):
    """Turn seed points into votes with a shared per-point MLP (1x1 convs)."""

    def __init__(self, vote_factor, seed_feature_dim):
        """ Votes generation from seed point features.

        Args:
            vote_factor: int
                number of votes generated from each seed point
            seed_feature_dim: int
                number of channels of seed point features; the vote
                features have the same number of channels
        """
        super().__init__()
        self.vote_factor = vote_factor
        self.in_dim = seed_feature_dim  # seed feature dimension
        # The vote feature is seed feature + residual, so widths must match.
        self.out_dim = self.in_dim

        width = self.in_dim
        per_vote_channels = 3 + self.out_dim  # xyz offset + feature residual
        self.conv1 = torch.nn.Conv1d(width, width, 1)
        self.conv2 = torch.nn.Conv1d(width, width, 1)
        self.conv3 = torch.nn.Conv1d(width, per_vote_channels * self.vote_factor, 1)
        self.bn1 = torch.nn.BatchNorm1d(width)
        self.bn2 = torch.nn.BatchNorm1d(width)

    def forward(self, seed_xyz, seed_features):
        """ Forward pass.

        Arguments:
            seed_xyz: (batch_size, num_seed, 3) Pytorch tensor
            seed_features: (batch_size, feature_dim, num_seed) Pytorch tensor
        Returns:
            vote_xyz: (batch_size, num_seed*vote_factor, 3)
            vote_features: (batch_size, feature_dim, num_seed*vote_factor)
        """
        n_batch, n_seed = seed_xyz.shape[0], seed_xyz.shape[1]
        n_vote = n_seed * self.vote_factor  # number of votes = num_seed * vote_factor

        hidden = self.conv1(seed_features)
        hidden = F.relu(self.bn1(hidden))
        hidden = self.conv2(hidden)
        hidden = F.relu(self.bn2(hidden))
        raw = self.conv3(hidden)  # (batch_size, (3+out_dim)*vote_factor, num_seed)

        # Swap to channel-last (a transpose, not a matrix inverse) and split
        # the channel axis into one (3+out_dim) slice per vote.
        per_vote = raw.permute(0, 2, 1).view(n_batch, n_seed, self.vote_factor, 3 + self.out_dim)
        xyz_offset = per_vote[..., :3]
        feature_residual = per_vote[..., 3:]  # (batch_size, num_seed, vote_factor, out_dim)

        # Vote position = seed position + predicted offset.
        vote_xyz = seed_xyz.unsqueeze(2) + xyz_offset
        vote_xyz = vote_xyz.contiguous().view(n_batch, n_vote, 3)

        # Vote feature = seed feature + predicted residual, returned channel-first.
        seed_channel_last = seed_features.permute(0, 2, 1).unsqueeze(2)
        vote_features = seed_channel_last + feature_residual
        vote_features = vote_features.contiguous().view(n_batch, n_vote, self.out_dim)
        vote_features = vote_features.permute(0, 2, 1).contiguous()

        return vote_xyz, vote_features
在上面的步驟中,作者給出了一個比較形象化的圖示表示,其中不同顔色的字代表不同的部分
Object Proposal and classification from votes
在此部分主要目的是為了聚類之前的投票,并且對他們的特征對象進行分類。在這個過程中,VoteNet根據之前的種子點和種子點的投票結果來産生聚類的結果。
在此步驟中首先使用類似于PointNet++中的方法對當前的數據進行了Sampling & Grouping(其中投票分為兩部分,分别是 $M \times 3$ 的點雲坐标和 $M \times C$ 的特征)。其中Sampling使用FPS的方法選出K個中心點。之後的Grouping和PointNet++中一樣,使用Ball Query的方法根據之前的K個中心點來産生聚類結果。
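在Object Proposal階段使用了MLP,通過之前在Sampling & Grouping産生的K個聚類的結果提取特征并生成proposal。在這裡,每個點都進行了坐标的變換,從全局坐标系轉換到了當前Group所在的中心點的相對坐标系下。之後傳入PointNet++的模塊中進行運算。MLP1的輸出經過最大池化後傳給MLP2使用。具體公式如下:
$$p(\mathcal{C}) = \text{MLP}_2\Big\{ \max_{i=1,\dots,n} \big\{ \text{MLP}_1([z_i'; h_i]) \big\} \Big\}, \quad z_i' = (z_i - z_j)/r$$
其中 $z_i$ 和 $h_i$ 分别是聚類內第 $i$ 個Vote的坐标和特征,$z_j$ 是聚類中心的坐标,$r$ 是聚類半徑。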
代碼如下:
class ProposalModule(nn.Module):
    """Aggregate votes into clusters and predict one proposal per cluster.

    Votes are grouped by a PointnetSAModuleVotes layer around num_proposal
    sampled centers; a small 1x1-conv head then produces the raw proposal
    scores, which are handed to decode_scores.
    """

    def __init__(self, num_class, num_heading_bin, num_size_cluster, mean_size_arr, num_proposal, sampling, seed_feat_dim=256):
        """
        Args:
            num_class: int, number of extra output channels appended to the head
            num_heading_bin: int, number of heading bins
            num_size_cluster: int, number of size clusters
            mean_size_arr: forwarded unchanged to decode_scores
            num_proposal: int, number of cluster centers / proposals
            sampling: str, one of 'vote_fps', 'seed_fps', 'random'
            seed_feat_dim: int, number of input feature channels
        """
        super().__init__()

        self.num_class = num_class
        self.num_heading_bin = num_heading_bin
        self.num_size_cluster = num_size_cluster
        self.mean_size_arr = mean_size_arr
        self.num_proposal = num_proposal
        self.sampling = sampling
        self.seed_feat_dim = seed_feat_dim

        # Vote clustering
        self.vote_aggregation = PointnetSAModuleVotes(
            npoint=self.num_proposal,
            radius=0.3,
            nsample=16,
            mlp=[self.seed_feat_dim, 128, 128, 128],
            use_xyz=True,
            normalize_xyz=True
        )

        # Object proposal/detection
        # Objectness scores (2), center residual (3),
        # heading class+residual (num_heading_bin*2), size class+residual(num_size_cluster*4),
        # plus num_class channels (presumably semantic class scores; see decode_scores)
        self.conv1 = torch.nn.Conv1d(128,128,1)
        self.conv2 = torch.nn.Conv1d(128,128,1)
        self.conv3 = torch.nn.Conv1d(128,2+3+num_heading_bin*2+num_size_cluster*4+self.num_class,1)
        self.bn1 = torch.nn.BatchNorm1d(128)
        self.bn2 = torch.nn.BatchNorm1d(128)

    def forward(self, xyz, features, end_points):
        """
        Args:
            xyz: (B,K,3)
            features: (B,C,K)
            end_points: dict; must contain 'seed_xyz' when sampling is
                'seed_fps' or 'random'
        Returns:
            end_points: the dict returned by decode_scores, after
                'aggregated_vote_xyz' and 'aggregated_vote_inds' were added
        Raises:
            ValueError: if self.sampling is not a known strategy
        """
        if self.sampling == 'vote_fps':
            # Farthest point sampling (FPS) on votes
            xyz, features, fps_inds = self.vote_aggregation(xyz, features)
            sample_inds = fps_inds
        elif self.sampling == 'seed_fps':
            # FPS on seed and choose the votes corresponding to the seeds
            # This gets us a slightly better coverage of *object* votes than vote_fps (which tends to get more cluster votes)
            sample_inds = pointnet2_utils.furthest_point_sample(end_points['seed_xyz'], self.num_proposal)
            xyz, features, _ = self.vote_aggregation(xyz, features, sample_inds)
        elif self.sampling == 'random':
            # Random sampling from the votes
            num_seed = end_points['seed_xyz'].shape[1]
            batch_size = end_points['seed_xyz'].shape[0]
            # Follow the device of the votes instead of hard-coding .cuda(),
            # which would pick the current default GPU.
            sample_inds = torch.randint(0, num_seed, (batch_size, self.num_proposal), dtype=torch.int).to(xyz.device)
            xyz, features, _ = self.vote_aggregation(xyz, features, sample_inds)
        else:
            # Library code must not terminate the interpreter; report the
            # misconfiguration to the caller instead.
            raise ValueError('Unknown sampling strategy: %s' % (self.sampling,))
        end_points['aggregated_vote_xyz'] = xyz  # (batch_size, num_proposal, 3)
        end_points['aggregated_vote_inds'] = sample_inds  # (batch_size, num_proposal,) # should be 0,1,2,...,num_proposal

        # --------- PROPOSAL GENERATION ---------
        net = F.relu(self.bn1(self.conv1(features)))
        net = F.relu(self.bn2(self.conv2(net)))
        net = self.conv3(net)  # (batch_size, 2+3+num_heading_bin*2+num_size_cluster*4+num_class, num_proposal)

        end_points = decode_scores(net, end_points, self.num_class, self.num_heading_bin, self.num_size_cluster, self.mean_size_arr)
        return end_points
在上面的步驟中,作者給出了一個比較形象化的圖示表示,其中不同顔色的字代表不同的部分
Loss
對于目标檢測結果:交叉熵損失函數
對于Bbox的預測結果:交叉熵損失函數
Experiments
作者在SUN RGB-D上實驗了自己的結果,并對比了不同的方法。結果如下圖所示:
同時,作者也在ScanNetV2上驗證了自己的實驗結果:
除此之外,作者為了驗證Hough Vote确實在實驗中起到了作用,并且為了證明投票的效果确實要優于不投票的效果,還引入了BoxNet作為對比。對比結果如下:
Improvement
這篇文章可以說是PointNet和PointNet++的延續,之後也在SESS和3DIoUMatch中有使用,我也是看到後面這兩篇文章才開始關注VoteNet的。但是這篇文章可能有幾點我認為可以提升的地方,當然也在之後的一些文章中有證明。因為VoteNet使用的是傳統的聚類方法,在一些障礙物比較密集的場景中VoteNet的結果可能不是很好。因為聚類策略會受到周圍障礙物的影響,所以投票結果也會受到影響,最終導緻準确率下降。