BEVFusion code reading notes
build_from_cfg
This function builds a module object from a configuration dict. Its arguments are a config dict cfg, a registry (a Registry object), and an optional dict of default arguments default_args. The function first type-checks the arguments: cfg must be a dict, registry must be a Registry object, and default_args must be a dict or None. It then copies cfg into args and merges the key/value pairs of default_args into args (without overwriting keys that already exist). Next it pops the 'type' key from args, which names the object to build: if the value is a string, the corresponding class is looked up in the registry; if it is already a class object, it is used directly; otherwise a TypeError is raised. Finally, the class is instantiated with the remaining args and the resulting object is returned.
In short, build_from_cfg constructs a module from a config dict, so different modules can be built simply by changing the configuration.
import inspect   # needed for inspect.isclass below; Registry is defined in the same mmcv module

def build_from_cfg(cfg, registry, default_args=None):
    """Build a module from a config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        registry (:obj:`Registry`): The registry to search the type from.
        default_args (dict, optional): Default initialization arguments.

    Returns:
        object: The constructed object.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        if default_args is None or 'type' not in default_args:
            raise KeyError(
                '`cfg` or `default_args` must contain the key "type", '
                f'but got {cfg}\n{default_args}')
    if not isinstance(registry, Registry):
        raise TypeError('registry must be an mmcv.Registry object, '
                        f'but got {type(registry)}')
    if not (isinstance(default_args, dict) or default_args is None):
        raise TypeError('default_args must be a dict or None, '
                        f'but got {type(default_args)}')

    args = cfg.copy()
    if default_args is not None:
        for name, value in default_args.items():
            args.setdefault(name, value)

    obj_type = args.pop('type')
    if isinstance(obj_type, str):
        obj_cls = registry.get(obj_type)
        if obj_cls is None:
            raise KeyError(
                f'{obj_type} is not in the {registry.name} registry')
    elif inspect.isclass(obj_type):
        obj_cls = obj_type
    else:
        raise TypeError(
            f'type must be a str or valid type, but got {type(obj_type)}')
    try:
        return obj_cls(**args)
    except Exception as e:
        # Normal TypeError does not print class name.
        raise type(e)(f'{obj_cls.__name__}: {e}')
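A hypothetical usage sketch (the registry name MODELS and the class MyHead are made-up illustrations, not part of BEVFusion; the Registry import path matches mmcv 1.x, which is what this repo uses):

    import torch
    from mmcv.utils import Registry

    MODELS = Registry('models')           # a registry that maps type names to classes

    @MODELS.register_module()
    class MyHead:
        def __init__(self, in_channels, num_classes=10):
            self.in_channels = in_channels
            self.num_classes = num_classes

    cfg = dict(type='MyHead', in_channels=256)
    head = build_from_cfg(cfg, MODELS, default_args=dict(num_classes=3))
    print(type(head).__name__, head.num_classes)   # MyHead 3

Note that default_args only fills in keys missing from cfg, because args.setdefault never overwrites an existing key.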
BasicBlock(dla.py)
Basic block structure: two convolution layers plus a ReLU activation form one basic convolutional block; stacking several basic blocks builds a deeper convolutional network.
Difference between conv1d and conv2d: conv1d slides the kernel along a single axis (left to right), while conv2d slides it over both axes (left to right and top to bottom).
conv_cfg=dict(type="Conv2d"), norm_cfg=dict(type="BN2d")
conv_cfg: dict, the configuration of the convolution layer; it should contain:
type: str, e.g. 'Conv1d' or 'Conv2d'.
class BasicBlock(nn.Module):
    def __init__(
        self, inplanes, planes, stride=1, dilation=1,
        conv_cfg=dict(type="Conv2d"), norm_cfg=dict(type="BN2d"),
        act_cfg=None
    ):
        super(BasicBlock, self).__init__()
        self.conv1 = ConvModule(
            inplanes,
            planes,
            kernel_size=3,
            stride=stride,          # stride=1 by default
            padding=dilation,
            bias=norm_cfg is None,
            dilation=dilation,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg
        )
        self.conv2 = ConvModule(
            planes,                 # in_channels: depth (channels) of the input feature map; 1 for grayscale images
            planes,                 # out_channels: number of kernels; the output feature map has this many channels
            kernel_size=3,          # kernel size, 3x3
            stride=1,               # stride (int | tuple): kernel step; for a tuple the first value is height, the second width
            padding=dilation,       # padding (int | tuple): zero padding; an int pads all sides equally, a tuple like (2, 1) pads 2 rows top/bottom and 1 column left/right
            bias=norm_cfg is None,  # bias (bool | str): True / False / 'Auto'; with 'Auto' the bias follows norm_cfg: bias=True when norm_cfg is None, otherwise False
            dilation=dilation,      # dilation: spacing between kernel elements; 1 is an ordinary convolution, >1 a dilated (atrous) convolution
            conv_cfg=conv_cfg,      # dict: convolution layer config
            norm_cfg=norm_cfg,      # dict: normalization layer config, default None
            act_cfg=act_cfg         # dict: activation layer config, default dict(type='ReLU')
        )
        self.stride = stride

    def forward(self, x, residual=None):   # forward pass with an optional residual input
        if residual is None:
            residual = x
        out = self.conv1(x)    # pass x through self.conv1
        out = F.relu_(out)     # apply ReLU in place
        out = self.conv2(out)  # pass the result through self.conv2
        out = out + residual   # add the residual
        out = F.relu_(out)
        return out
For a more detailed walkthrough, see http://t.csdn.cn/R8wOC.
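A quick usage sketch of the block above (a minimal example, assuming ConvModule, nn and F are imported as in dla.py; note the residual add requires inplanes == planes and stride == 1 so the shapes match):

    import torch

    block = BasicBlock(inplanes=64, planes=64)   # the class defined above
    x = torch.randn(2, 64, 32, 32)
    out = block(x)                               # residual defaults to the input x
    print(out.shape)                             # torch.Size([2, 64, 32, 32])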
Camera(depth_lss.py base.py bev_pool.py)
@VTRANSFORMS.register_module()
...
        # dtransform: a small Sequential CNN that encodes the sparse depth map; Sequential chains layers in order
        self.dtransform = nn.Sequential(
            nn.Conv2d(1, 8, 1),                        # conv, 1 -> 8 channels, 1x1 kernel
            nn.BatchNorm2d(8),                         # batch-normalize each channel of the conv output
            nn.ReLU(True),                             # ReLU non-linearity on the normalized result
            nn.Conv2d(8, 32, 5, stride=4, padding=2),  # 8 -> 32 channels, 5x5 kernel, stride 4, padding 2
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.Conv2d(32, 64, 5, stride=2, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
        )
        # depthnet: a simple CNN (conv + BN + ReLU blocks) whose output has self.D + self.C channels
        self.depthnet = nn.Sequential(
            nn.Conv2d(in_channels + 64, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(True),
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(True),
            nn.Conv2d(in_channels, self.D + self.C, 1),
        )
        # downsample
        if downsample > 1:
            assert downsample == 2, downsample   # only a factor of 2 is supported; the assert message prints the actual value
            self.downsample = nn.Sequential(
                nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(True),
                nn.Conv2d(
                    out_channels,
                    out_channels,
                    3,
                    stride=downsample,
                    padding=1,
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(True),
                nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(True),
            )
        else:
            # no downsampling needed: use the identity mapping
            self.downsample = nn.Identity()

    @force_fp32()   # force the operators in this method to run in float32
    # extract features from the input tensors
    def get_cam_feats(self, x, d):
        B, N, C, fH, fW = x.shape   # batch size B, number of camera views N, channels C, feature height fH, width fW
- The image passes through a SwinTransformer backbone, producing the camera feature x of shape (W, H, C).
- LiDAR points are projected onto the image to obtain a per-pixel depth map d of shape (W, H, 1), which is passed through three conv layers (dtransform) to produce a depth feature.
- The camera feature and the depth feature are concatenated and passed through three conv layers (depthnet) to produce the fused feature.
- A softmax over the depth channels gives the actual depth distribution, which is multiplied with the feature channels to obtain the final depth-aware camera feature (see the small sketch after the code below).
        d = d.view(B * N, *d.shape[2:])        # merge the batch and camera dims of d, keep the rest
        x = x.view(B * N, C, fH, fW)

        d = self.dtransform(d)                 # extract depth features (the LSS-style depth branch)
        x = torch.cat([d, x], dim=1)           # concatenate d and x along dimension 1 (the channel dim)
        x = self.depthnet(x)                   # output has D + C channels

        depth = x[:, : self.D].softmax(dim=1)  # softmax over the first D channels -> depth distribution (per-depth weights)
        x = depth.unsqueeze(1) * x[:, self.D : (self.D + self.C)].unsqueeze(2)
        # unsqueeze adds a dimension; depth distribution * features lifts the 2D image features into the
        # 3D frustum, which is later flattened into the top-down (BEV) view

        x = x.view(B, N, self.C, self.D, fH, fW)
        x = x.permute(0, 1, 3, 4, 5, 2)        # reorder dims to (B, N, D, fH, fW, C)
        return x

    # forward pass
    def forward(self, *args, **kwargs):
        x = super().forward(*args, **kwargs)
        x = self.downsample(x)
        return x

The depth-aware camera feature then has to be projected into BEV using the camera intrinsics and extrinsics, which gives every feature its (x, y) grid location; only then do we have the actual BEV feature.
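A tiny sketch of the depth-softmax lift in get_cam_feats above (toy sizes assumed for illustration: D = 3 depth bins, C = 2 feature channels, a single 1x1 feature map):

    import torch

    D, C = 3, 2
    x = torch.tensor([[0.1, 1.0, 2.0,      # first D channels: depth logits
                       5.0, -1.0]]).view(1, D + C, 1, 1)   # last C channels: image features

    depth = x[:, :D].softmax(dim=1)                  # depth distribution, sums to 1 over the D bins
    feat = x[:, D:D + C]
    lifted = depth.unsqueeze(1) * feat.unsqueeze(2)  # every depth bin gets a depth-weighted copy of the features
    print(lifted.shape)                              # torch.Size([1, 2, 3, 1, 1])
    print(lifted[0, :, :, 0, 0].sum(dim=1))          # summing over depth bins recovers the features (5.0, -1.0)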
Since every image is projected into the BEV grid, several camera features can land in the same cell, so a BEV pooling step is needed (this is essentially LSS's voxel pooling).
    @force_fp32()
    def bev_pool(self, geom_feats, x):
        # geom_feats: pre-computed 3D coordinates of the frustum points; x: camera features
        B, N, D, H, W, C = x.shape
        Nprime = B * N * D * H * W

        # flatten x: the feature point cloud has B*N*D*H*W points in total
        x = x.reshape(Nprime, C)

        # flatten indices: convert ego-frame coordinates to voxel coordinates (compute grid indices and truncate)
        geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long()
        geom_feats = geom_feats.view(Nprime, 3)
        batch_ix = torch.cat(
            [
                torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long)
                for ix in range(B)
            ]
        )  # batch index of every point, concatenated over all batches, shape (Nprime, 1)
        geom_feats = torch.cat((geom_feats, batch_ix), 1)  # shape (Nprime, 4)

        # filter out points that fall outside the grid: x in 0~199, y in 0~199, z == 0
        kept = (
            (geom_feats[:, 0] >= 0)
            & (geom_feats[:, 0] < self.nx[0])
            & (geom_feats[:, 1] >= 0)
            & (geom_feats[:, 1] < self.nx[1])
            & (geom_feats[:, 2] >= 0)
            & (geom_feats[:, 2] < self.nx[2])
        )
        x = x[kept]
        geom_feats = geom_feats[kept]

        x = bev_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1])  # pooling

        # collapse Z
        final = torch.cat(x.unbind(dim=2), 1)
        return final  # final: bs x 64 x 200 x 200

The real bev_pool is implemented in mmdet3d.ops.
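A quick numeric check of the ego-to-voxel index conversion used above (the dx and bx values are assumptions for a 200x200 grid covering [-50, 50] m at 0.5 m resolution, not read from the config):

    import torch

    dx = torch.tensor([0.5, 0.5, 20.0])       # voxel size per axis (assumed)
    bx = torch.tensor([-49.75, -49.75, 0.0])  # center of the first voxel (assumed)
    pt = torch.tensor([[1.3, -0.2, 0.0]])     # one ego-frame point
    idx = ((pt - (bx - dx / 2.0)) / dx).long()
    print(idx)                                # tensor([[102,  99,   0]]) -> column 102, row 99, z-bin 0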
bev_pool pools the features by their grid coordinates: a rank is computed from every coordinate's position in the grid, the feature tensor (sorted by rank) is cumulatively summed, and the result is permuted and reshaped into the pooled feature map.
def bev_pool(feats, coords, B, D, H, W):
    assert feats.shape[0] == coords.shape[0]  # one coordinate row per feature row

    # rank: a unique id per (x, y, z, batch) cell, so points in the same cell become adjacent after sorting
    ranks = (
        coords[:, 0] * (W * D * B)
        + coords[:, 1] * (D * B)
        + coords[:, 2] * B
        + coords[:, 3]
    )
    indices = ranks.argsort()  # sort the ranks in ascending order and return the permutation
    feats, coords, ranks = feats[indices], coords[indices], ranks[indices]

    # cumulative-sum pooling on the GPU; arguments: features, coordinates, ranks, batch size, and grid depth/height/width
    x = QuickCumsumCuda.apply(feats, coords, ranks, B, D, H, W)
    x = x.permute(0, 4, 1, 2, 3).contiguous()
    return x
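QuickCumsumCuda sums all feature rows that share a rank. The same result can be obtained with LSS's pure-PyTorch "cumsum trick"; a minimal sketch of that idea (an illustration, not the mmdet3d.ops kernel):

    import torch

    def cumsum_pool(feats, ranks):
        """Sum rows of feats that share the same (already sorted) rank."""
        x = feats.cumsum(dim=0)
        # keep only the last row of each run of equal ranks
        kept = torch.ones(x.shape[0], dtype=torch.bool, device=x.device)
        kept[:-1] = ranks[1:] != ranks[:-1]
        x = x[kept]
        # subtract the previous group's cumulative sum to get per-group sums
        x = torch.cat((x[:1], x[1:] - x[:-1]))
        return x

    feats = torch.tensor([[1.], [2.], [3.], [4.]])
    ranks = torch.tensor([0, 0, 1, 1])        # first two rows share a cell, last two share another
    print(cumsum_pool(feats, ranks))          # tensor([[3.], [7.]])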
Reference: https://zhuanlan.zhihu.com/p/560000194
Lidar(pillar_encoder.py)
pillar_encoder.py
The point cloud encoder:
class PointPillarsEncoder(nn.Module):
    def __init__(
        self,
        pts_voxel_encoder: Dict[str, Any],
        pts_middle_encoder: Dict[str, Any],
        **kwargs,
    ):
        super().__init__()
        self.pts_voxel_encoder = build_backbone(pts_voxel_encoder)
        self.pts_middle_encoder = build_backbone(pts_middle_encoder)

    def forward(self, feats, coords, batch_size, sizes):
        x = self.pts_voxel_encoder(feats, sizes, coords)
        x = self.pts_middle_encoder(x, coords, batch_size)
        return x

    # forward of the middle encoder: point/pillar features are scattered into a pseudo-image (pillar feature map)
    def forward(self, voxel_features, coords, batch_size):
        # batch_canvas will be the final output
        batch_canvas = []
        for batch_itt in range(batch_size):
            # create the canvas for this sample
            canvas = torch.zeros(
                self.in_channels,
                self.nx * self.ny,   # number of cells (rows x columns) on the canvas
                dtype=voxel_features.dtype,
                device=voxel_features.device,
            )
            # only include non-empty pillars
            batch_mask = coords[:, 0] == batch_itt   # mask selecting the coordinates that belong to this sample
            this_coords = coords[batch_mask, :]      # coordinates of this sample
            # modified -> xyz coords
            indices = this_coords[:, 1] * self.ny + this_coords[:, 2]
            indices = indices.type(torch.long)
            voxels = voxel_features[batch_mask, :]   # select the voxels of this sample
            voxels = voxels.t()                      # transpose to (channels, num_pillars)
            # scatter the pillar features back onto the canvas at the computed positions
            canvas[:, indices] = voxels
            # append to a list for later stacking into a 3-dim tensor
            batch_canvas.append(canvas)
        # stack to a 3-dim tensor (batch_size, nchannels, nrows*ncols)
        batch_canvas = torch.stack(batch_canvas, 0)
        # undo the column stacking to get the final 4-dim tensor
        batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.nx, self.ny)
        return batch_canvas

class PillarFeatureNet(nn.Module):
    ...
        super().__init__()
        self.name = "PillarFeatureNet"
        assert len(feat_channels) > 0
        self.in_channels = in_channels
        in_channels += 5
        if with_distance:
            in_channels += 1
        self._with_distance = with_distance

        # create PillarFeatureNet layers
        feat_channels = [in_channels] + list(feat_channels)   # prepend in_channels to the feat_channels list
        pfn_layers = []
        for i in range(len(feat_channels) - 1):
            in_filters = feat_channels[i]        # channel count at the current index
            out_filters = feat_channels[i + 1]   # channel count at the next index
            if i < len(feat_channels) - 2:       # not the last layer
                last_layer = False
            else:
                last_layer = True
            pfn_layers.append(
                PFNLayer(
                    in_filters, out_filters, norm_cfg=norm_cfg, last_layer=last_layer
                )
            )
        self.pfn_layers = nn.ModuleList(pfn_layers)   # wrap as an nn.ModuleList

        # need pillar (voxel) size and x/y offset in order to calculate the pillar offset
        self.vx = voxel_size[0]
        self.vy = voxel_size[1]
        self.x_offset = self.vx / 2 + point_cloud_range[0]   # x offset of the first pillar center
        self.y_offset = self.vy / 2 + point_cloud_range[1]   # y offset of the first pillar center

    def forward(self, features, num_voxels, coors):
        device = features.device
        dtype = features.dtype
        # find distance of x, y, and z from the cluster center
        # features = features[:, :, :self.num_input]
        points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_voxels.type_as(
            features
        ).view(-1, 1, 1)   # mean of all points in each pillar, keeping the dim
        f_cluster = features[:, :, :3] - points_mean   # offset of (x, y, z) from the mean of the pillar's points

        # find distance of x and y from the pillar center
        # f_center = features[:, :, :2]  # modified according to xyz coords
        f_center = torch.zeros_like(features[:, :, :2])
        f_center[:, :, 0] = features[:, :, 0] - (
            coors[:, 1].to(dtype).unsqueeze(1) * self.vx + self.x_offset
        )
        f_center[:, :, 1] = features[:, :, 1] - (
            coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset
        )

        # combine together feature decorations
        features_ls = [features, f_cluster, f_center]
        if self._with_distance:
            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)   # Euclidean norm of each point
            features_ls.append(points_dist)
        features = torch.cat(features_ls, dim=-1)

        # the decorations were computed without regard to whether a pillar is empty;
        # make sure that empty pillars stay zero
        voxel_count = features.shape[1]
        mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
        mask = torch.unsqueeze(mask, -1).type_as(features)
        features *= mask

        # forward pass through the PFN layers
        for pfn in self.pfn_layers:
            features = pfn(features)
        return features.squeeze()
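A small numeric sketch of the feature decoration above (hedged: it assumes 4-channel input points (x, y, z, intensity), 0.25 m pillars with the point-cloud range starting at 0, and no distance feature), showing how 4 raw channels grow to 4 + 3 + 2 = 9 decorated channels:

    import torch

    vx = vy = 0.25                        # assumed pillar size
    x_offset, y_offset = vx / 2, vy / 2   # assumed offsets for a range starting at 0

    features = torch.tensor([[[1.10, 0.30, 0.0, 0.5],
                              [1.20, 0.40, 0.2, 0.7]]])   # one pillar holding two (x, y, z, r) points
    num_voxels = torch.tensor([2.0])
    coors = torch.tensor([[0, 4, 1]])     # (batch index, x index, y index) of the pillar

    points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_voxels.view(-1, 1, 1)
    f_cluster = features[:, :, :3] - points_mean           # offsets from the cluster (mean) center
    f_center = torch.zeros_like(features[:, :, :2])
    f_center[:, :, 0] = features[:, :, 0] - (coors[:, 1].float().unsqueeze(1) * vx + x_offset)
    f_center[:, :, 1] = features[:, :, 1] - (coors[:, 2].float().unsqueeze(1) * vy + y_offset)

    decorated = torch.cat([features, f_cluster, f_center], dim=-1)
    print(decorated.shape)   # torch.Size([1, 2, 9]) -> 4 raw + 3 cluster offsets + 2 center offsets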
bevfusion.py
Module loading and the BEVFusion pipeline.
        self.encoders = nn.ModuleDict()
        # build the camera encoder if one is configured
        if encoders.get("camera") is not None:
            self.encoders["camera"] = nn.ModuleDict(
                {
                    "backbone": build_backbone(encoders["camera"]["backbone"]),
                    "neck": build_neck(encoders["camera"]["neck"]),
                    "vtransform": build_vtransform(encoders["camera"]["vtransform"]),
                }
            )
        # build the lidar encoder if one is configured
        if encoders.get("lidar") is not None:
            if encoders["lidar"]["voxelize"].get("max_num_points", -1) > 0:
                voxelize_module = Voxelization(**encoders["lidar"]["voxelize"])
            else:
                voxelize_module = DynamicScatter(**encoders["lidar"]["voxelize"])
            self.encoders["lidar"] = nn.ModuleDict(
                {
                    "voxelize": voxelize_module,
                    "backbone": build_backbone(encoders["lidar"]["backbone"]),
                }
            )
            self.voxelize_reduce = encoders["lidar"].get("voxelize_reduce", True)

    # extract camera features
    def extract_camera_features(
        self,
        x,
        points,
        camera2ego,         # camera-to-ego transform
        lidar2ego,          # lidar-to-ego transform
        lidar2camera,       # lidar-to-camera transform
        lidar2image,        # lidar-to-image transform
        camera_intrinsics,  # camera intrinsics; the last two dims hold the 3x3 intrinsic matrix
        camera2lidar,       # camera-to-lidar transform
        img_aug_matrix,     # image augmentation matrix
        lidar_aug_matrix,   # lidar augmentation matrix
        img_metas,          # image meta information
    ) -> torch.Tensor:      # returns a torch.Tensor
        B, N, C, H, W = x.size()
        x = x.view(B * N, C, H, W)

        x = self.encoders["camera"]["backbone"](x)   # image encoding
        x = self.encoders["camera"]["neck"](x)

        if not isinstance(x, torch.Tensor):
            x = x[0]

        BN, C, H, W = x.size()
        x = x.view(B, int(BN / B), C, H, W)          # restore the original batch/camera layout

        x = self.encoders["camera"]["vtransform"](
            x,
            points,
            camera2ego,
            lidar2ego,
            lidar2camera,
            lidar2image,
            camera_intrinsics,
            camera2lidar,
            img_aug_matrix,
            lidar_aug_matrix,
            img_metas,
        )
        return x

    def extract_lidar_features(self, x) -> torch.Tensor:
        feats, coords, sizes = self.voxelize(x)   # convert the input point cloud x into voxels
        batch_size = coords[-1, 0] + 1
        x = self.encoders["lidar"]["backbone"](feats, coords, batch_size, sizes=sizes)
        return x

    @torch.no_grad()
    @force_fp32()
    def voxelize(self, points):
        feats, coords, sizes = [], [], []
        for k, res in enumerate(points):
            ret = self.encoders["lidar"]["voxelize"](res)   # discretize the point cloud
            if len(ret) == 3:
                # hard voxelization (more efficient)
                f, c, n = ret
            else:
                assert len(ret) == 2
                # dynamic/soft voxelization (more accurate)
                f, c = ret
                n = None
            feats.append(f)
            coords.append(F.pad(c, (1, 0), mode="constant", value=k))
            if n is not None:
                sizes.append(n)

        feats = torch.cat(feats, dim=0)
        coords = torch.cat(coords, dim=0)
        if len(sizes) > 0:   # per-voxel point counts are available
            sizes = torch.cat(sizes, dim=0)
            if self.voxelize_reduce:
                feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(
                    -1, 1
                )
                feats = feats.contiguous()   # lay the features out as one contiguous block of memory

        return feats, coords, sizes

    def forward_single(
        ...
        features = []
        for sensor in (
            self.encoders if self.training else list(self.encoders.keys())[::-1]
        ):   # iterate over the sensors in order during training, in reverse order during inference
            if sensor == "camera":
                feature = self.extract_camera_features(
                    img,
                    points,
                    camera2ego,
                    lidar2ego,
                    lidar2camera,
                    lidar2image,
                    camera_intrinsics,
                    camera2lidar,
                    img_aug_matrix,
                    lidar_aug_matrix,
                    metas,
                )
            elif sensor == "lidar":
                feature = self.extract_lidar_features(points)
            else:
                raise ValueError(f"unsupported sensor: {sensor}")
            features.append(feature)

        if not self.training:
            # avoid OOM (out of memory)
            features = features[::-1]

        if self.fuser is not None:
            x = self.fuser(features)   # fuse the per-sensor features
        else:
            assert len(features) == 1, features
            x = features[0]

        batch_size = x.shape[0]
        x = self.decoder["backbone"](x)
        x = self.decoder["neck"](x)

        if self.training:
            outputs = {}
            for type, head in self.heads.items():   # run every head (self.heads) and collect its losses
                if type == "object":
                    pred_dict = head(x, metas)
                    losses = head.loss(gt_bboxes_3d, gt_labels_3d, pred_dict)   # loss between predictions and labels
                elif type == "map":
                    losses = head(x, gt_masks_bev)   # loss between predictions and the ground-truth BEV masks
                else:
                    raise ValueError(f"unsupported head: {type}")
                for name, val in losses.items():
                    if val.requires_grad:
                        outputs[f"loss/{type}/{name}"] = val * self.loss_scale[type]
                    else:
                        outputs[f"stats/{type}/{name}"] = val
            return outputs
        else:
            outputs = [{} for _ in range(batch_size)]
            for type, head in self.heads.items():
                if type == "object":
                    pred_dict = head(x, metas)
                    bboxes = head.get_bboxes(pred_dict, metas)   # decode the predicted box coordinates
                    for k, (boxes, scores, labels) in enumerate(bboxes):
                        outputs[k].update(
                            {
                                "boxes_3d": boxes.to("cpu"),
                                "scores_3d": scores.cpu(),
                                "labels_3d": labels.cpu(),
                            }
                        )
                elif type == "map":
                    logits = head(x)
                    for k in range(batch_size):
                        outputs[k].update(
                            {
                                "masks_bev": logits[k].cpu(),
                                "gt_masks_bev": gt_masks_bev[k].cpu(),
                            }
                        )
                else:
                    raise ValueError(f"unsupported head: {type}")
            return outputs
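Returning to the voxelize step above, a small numeric sketch of voxelize_reduce (toy shapes assumed for illustration; padding rows are zero, so sum/size gives the per-voxel mean):

    import torch

    # toy hard-voxelization output: 3 voxels, up to 4 points each, 4 channels
    feats = torch.zeros(3, 4, 4)
    feats[0, :2] = torch.tensor([[1., 2., 3., 0.5], [3., 4., 5., 0.7]])   # voxel 0 holds 2 points
    feats[1, :1] = torch.tensor([[0., 1., 0., 0.2]])                      # voxel 1 holds 1 point
    feats[2, :3] = 1.0                                                    # voxel 2 holds 3 points
    sizes = torch.tensor([2, 1, 3])

    reduced = feats.sum(dim=1) / sizes.type_as(feats).view(-1, 1)
    print(reduced.shape)   # torch.Size([3, 4])
    print(reduced[0])      # tensor([2.0000, 3.0000, 4.0000, 0.6000])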
base.py: BaseTransform projects the image and point-cloud features into BEV
    # create the frustum of candidate 3D points
    def create_frustum(self):
        iH, iW = self.image_size
        fH, fW = self.feature_size

        ds = (
            torch.arange(*self.dbound, dtype=torch.float)   # self.dbound gives the depth range and step of the frustum
            .view(-1, 1, 1)                                  # -1 lets the size of this dim be inferred automatically
            .expand(-1, fH, fW)
        )
        D, _, _ = ds.shape

        xs = (
            torch.linspace(0, iW - 1, fW, dtype=torch.float)
            .view(1, 1, fW)
            .expand(D, fH, fW)
        )
        ys = (
            torch.linspace(0, iH - 1, fH, dtype=torch.float)
            .view(1, fH, 1)
            .expand(D, fH, fW)
        )

        frustum = torch.stack((xs, ys, ds), -1)   # stack along the last dim into a (D, fH, fW, 3) tensor
        return nn.Parameter(frustum, requires_grad=False)

    # geometric transform from the camera frame to the lidar frame
    def get_geometry(
        self,
        camera2lidar_rots,    # camera-to-lidar rotation matrices, shape B x N x 3 x 3
        camera2lidar_trans,   # camera-to-lidar translation vectors, shape B x N x 3
        intrins,              # camera intrinsic matrices, shape B x N x 3 x 3
        post_rots,            # rotation part of the image augmentation, shape B x N x 3 x 3
        post_trans,           # translation part of the image augmentation, shape B x N x 3
        **kwargs,             # other keyword arguments
    ):
        B, N, _ = camera2lidar_trans.shape

        # undo post-transformation: invert the image-space augmentation
        # B x N x D x H x W x 3
        points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
        points = (
            torch.inverse(post_rots)          # inverse of post_rots
            .view(B, N, 1, 1, 1, 3, 3)
            .matmul(points.unsqueeze(-1))     # (B, N, D, H, W, 3, 1)
        )
        # cam_to_lidar
        points = torch.cat(
            (
                points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
                points[:, :, :, :, :, 2:3],
            ),
            5,
        )
        combine = camera2lidar_rots.matmul(torch.inverse(intrins))   # rotation times the inverse intrinsics gives one combined transform
        points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
        points += camera2lidar_trans.view(B, N, 1, 1, 1, 3)

        if "extra_rots" in kwargs:
            extra_rots = kwargs["extra_rots"]
            points = (
                extra_rots.view(B, 1, 1, 1, 1, 3, 3)
                .repeat(1, N, 1, 1, 1, 1, 1)          # (B, N, 1, 1, 1, 3, 3)
                .matmul(points.unsqueeze(-1))
                .squeeze(-1)
            )
        if "extra_trans" in kwargs:
            extra_trans = kwargs["extra_trans"]
            points += extra_trans.view(B, 1, 1, 1, 1, 3).repeat(1, N, 1, 1, 1, 1)

        return points

    def bev_pool(self, geom_feats, x):
        # generate BEV-space features from these points (implementation shown in the Camera section above)
        ...

# BaseDepthTransform: builds a sparse depth image from the input point cloud, applies the
# corresponding coordinate transforms, and lifts the camera features into BEV
class BaseDepthTransform(BaseTransform):
    @force_fp32()
    def forward(
        self,
        img,
        points,
        sensor2ego,
        lidar2ego,
        lidar2camera,
        lidar2image,
        cam_intrinsic,
        camera2lidar,
        img_aug_matrix,
        lidar_aug_matrix,
        metas,
        **kwargs,
    ):
        rots = sensor2ego[..., :3, :3]
        trans = sensor2ego[..., :3, 3]
        intrins = cam_intrinsic[..., :3, :3]
        post_rots = img_aug_matrix[..., :3, :3]
        post_trans = img_aug_matrix[..., :3, 3]
        lidar2ego_rots = lidar2ego[..., :3, :3]
        lidar2ego_trans = lidar2ego[..., :3, 3]
        camera2lidar_rots = camera2lidar[..., :3, :3]
        camera2lidar_trans = camera2lidar[..., :3, 3]

        # print(img.shape, self.image_size, self.feature_size)

        batch_size = len(points)
        depth = torch.zeros(batch_size, img.shape[1], 1, *self.image_size).to(
            points[0].device
        )

        for b in range(batch_size):
            cur_coords = points[b][:, :3]
            cur_img_aug_matrix = img_aug_matrix[b]
            cur_lidar_aug_matrix = lidar_aug_matrix[b]
            cur_lidar2image = lidar2image[b]

            # inverse aug
            cur_coords -= cur_lidar_aug_matrix[:3, 3]
            cur_coords = torch.inverse(cur_lidar_aug_matrix[:3, :3]).matmul(
                cur_coords.transpose(1, 0)
            )
            # lidar2image
            cur_coords = cur_lidar2image[:, :3, :3].matmul(cur_coords)
            cur_coords += cur_lidar2image[:, :3, 3].reshape(-1, 3, 1)
            # get 2d coords
            dist = cur_coords[:, 2, :]
            cur_coords[:, 2, :] = torch.clamp(cur_coords[:, 2, :], 1e-5, 1e5)   # clamp the depth to [1e-5, 1e5]
            cur_coords[:, :2, :] /= cur_coords[:, 2:3, :]

            # imgaug
            cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords)
            cur_coords += cur_img_aug_matrix[:, :3, 3].reshape(-1, 3, 1)
            cur_coords = cur_coords[:, :2, :].transpose(1, 2)

            # normalize coords for grid sample
            cur_coords = cur_coords[..., [1, 0]]

            on_img = (
                (cur_coords[..., 0] < self.image_size[0])
                & (cur_coords[..., 0] >= 0)
                & (cur_coords[..., 1] < self.image_size[1])
                & (cur_coords[..., 1] >= 0)
            )   # keep only the points whose coordinates fall inside the image
            for c in range(on_img.shape[0]):
                masked_coords = cur_coords[c, on_img[c]].long()
                masked_dist = dist[c, on_img[c]]   # distance of the on-image points from the camera center
                depth[b, c, 0, masked_coords[:, 0], masked_coords[:, 1]] = masked_dist   # write the distance into the depth map

        extra_rots = lidar_aug_matrix[..., :3, :3]
        extra_trans = lidar_aug_matrix[..., :3, 3]
        geom = self.get_geometry(
            camera2lidar_rots,
            camera2lidar_trans,
            intrins,
            post_rots,
            post_trans,
            extra_rots=extra_rots,
            extra_trans=extra_trans,
        )

        x = self.get_cam_feats(img, depth)
        x = self.bev_pool(geom, x)
        return x
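A minimal sketch of the un-projection that get_geometry performs for one frustum point (the intrinsic matrix below is a made-up pinhole example, and the camera-to-lidar transform is taken as identity for simplicity):

    import torch

    # assumed pinhole intrinsics: fx = fy = 500, principal point at (320, 180)
    K = torch.tensor([[500., 0., 320.],
                      [0., 500., 180.],
                      [0., 0., 1.]])

    u, v, d = 400.0, 200.0, 10.0           # pixel (u, v) with a candidate depth of 10 m
    pix = torch.tensor([u * d, v * d, d])  # (u*d, v*d, d): the same multiplication done in get_geometry
    cam = torch.inverse(K).matmul(pix)     # camera-frame point
    print(cam)                             # tensor([ 1.6000,  0.4000, 10.0000])

    # with camera2lidar = identity the lidar-frame point equals the camera-frame point;
    # in the real code combine = camera2lidar_rots @ K^-1 is applied, then camera2lidar_trans is added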