BEVFusion code reading notes

build_from_cfg

This function builds a module object from a config dict. Its parameters are a config dict cfg, a registry registry, and an optional dict of default arguments default_args.

The function first type-checks its arguments: cfg must be a dict, registry must be a Registry object, and default_args must be a dict or None.

It then copies cfg into args and fills in any keys from default_args that are missing in args.

Next, it pops the 'type' key from args; this value specifies the type of object to build. If the value is a string, the corresponding class is looked up in the registry; if it is already a class, it is used directly; otherwise a TypeError is raised.

Finally, the function instantiates the resolved class with the remaining arguments and returns the resulting object.

In short, build_from_cfg constructs a module from a config dict, so different modules can be built simply by customizing the config. A usage sketch follows the code below.

def build_from_cfg(cfg, registry, default_args=None):
    """Build a module from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        registry (:obj:`Registry`): The registry to search the type from.
        default_args (dict, optional): Default initialization arguments.

    Returns:
        object: The constructed object.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        if default_args is None or 'type' not in default_args:
            raise KeyError(
                '`cfg` or `default_args` must contain the key "type", '
                f'but got {cfg}\n{default_args}')
    if not isinstance(registry, Registry):
        raise TypeError('registry must be an mmcv.Registry object, '
                        f'but got {type(registry)}')
    if not (isinstance(default_args, dict) or default_args is None):
        raise TypeError('default_args must be a dict or None, '
                        f'but got {type(default_args)}')

    args = cfg.copy()

    if default_args is not None:
        for name, value in default_args.items():
            args.setdefault(name, value)

    obj_type = args.pop('type')
    if isinstance(obj_type, str):
        obj_cls = registry.get(obj_type)
        if obj_cls is None:
            raise KeyError(
                f'{obj_type} is not in the {registry.name} registry')
    elif inspect.isclass(obj_type):
        obj_cls = obj_type
    else:
        raise TypeError(
            f'type must be a str or valid type, but got {type(obj_type)}')
    try:
        return obj_cls(**args)
    except Exception as e:
        # Normal TypeError does not print class name.
        raise type(e)(f'{obj_cls.__name__}: {e}')
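
A minimal usage sketch (the registry name MODELS and the ToyConv class below are made up for illustration; mmcv's Registry is assumed to be importable):

from mmcv.utils import Registry

MODELS = Registry("models")

@MODELS.register_module()
class ToyConv:
    def __init__(self, in_channels, out_channels):
        self.in_channels = in_channels
        self.out_channels = out_channels

# 'type' selects the registered class; the remaining keys become constructor kwargs
cfg = dict(type="ToyConv", in_channels=3, out_channels=16)
layer = build_from_cfg(cfg, MODELS, default_args=dict(out_channels=8))
print(layer.out_channels)  # 16 -- default_args only fills keys missing from cfg (via setdefault)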

BasicBlock (dla.py)

Basic block architecture: two convolution layers plus a ReLU activation form a basic convolutional block; multiple basic blocks can be stacked to build more complex convolutional networks.

Difference between Conv1d and Conv2d: Conv1d slides the kernel along a single (length) dimension, i.e. left to right, while Conv2d slides it over both width and height (left to right and top to bottom), as in the sketch below.
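
A small shape sketch of the difference (sizes chosen arbitrarily):

import torch
import torch.nn as nn

# Conv1d slides the kernel along one length dimension: input is (N, C, L)
conv1d = nn.Conv1d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
print(conv1d(torch.randn(2, 4, 32)).shape)      # torch.Size([2, 8, 32])

# Conv2d slides the kernel over height and width: input is (N, C, H, W)
conv2d = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
print(conv2d(torch.randn(2, 4, 32, 32)).shape)  # torch.Size([2, 8, 32, 32])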

conv_cfg=dict(type="Conv2d"),norm_cfg=dict(type="BN2d")

conv_cfg: dict, the configuration of the convolution layer. It should contain:

        type: str, e.g. 'Conv1d' or 'Conv2d'; selects which convolution operator is built

class BasicBlock(nn.Module):
    def __init__(
        self, inplanes, planes, stride=1, dilation=1, 
        conv_cfg=dict(type="Conv2d"), norm_cfg=dict(type="BN2d"),
        act_cfg=None
    ):
        super(BasicBlock, self).__init__()
        self.conv1 = ConvModule(
            inplanes,
            planes,
            kernel_size=3,
            stride=stride,#stride=1
            padding=dilation,
            bias=norm_cfg is None,
            dilation=dilation,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg
        )

        self.conv2 = ConvModule(
            planes,            # in_channels: channel depth of the input feature map (1 for a grayscale image)
            planes,            # out_channels: number of kernels; the output feature map has this many channels
            kernel_size=3,     # 3x3 kernel
            stride=1,          # stride (int | tuple); for a tuple, the first value is the vertical step, the second the horizontal step
            padding=dilation,  # padding (int | tuple): zero-padding around the input, default 0; an int such as 1 pads one ring of zeros,
                               # a tuple such as (2, 1) pads two rows top/bottom and one column left/right
            bias=norm_cfg is None,  # bias (bool | str): True / False / 'auto'; with 'auto' the bias follows norm_cfg
                                    # (bias=True when norm_cfg is None, otherwise bias=False)
            dilation=dilation, # kernel dilation: spacing between kernel elements; 1 is an ordinary convolution, >1 is a dilated (atrous) convolution
            conv_cfg=conv_cfg, # dict: convolution layer config
            norm_cfg=norm_cfg, # dict: normalization layer config, default None
            act_cfg=act_cfg,   # dict: activation layer config, default dict(type='ReLU')
        )
        self.stride = stride

    def forward(self, x, residual=None):  # forward pass, with an optional residual input
        if residual is None:
            residual = x

        out = self.conv1(x)   # pass x through self.conv1
        out = F.relu_(out)    # apply ReLU in place

        out = self.conv2(out) # pass the result through self.conv2

        out = out + residual  # add the residual
        out = F.relu_(out)

        return out
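
A quick shape check for the block above (a sketch only: it assumes mmcv's ConvModule is importable and BasicBlock is defined as shown; with stride=1 and inplanes == planes the residual addition works directly):

import torch

block = BasicBlock(inplanes=64, planes=64)  # stride=1, dilation=1
x = torch.randn(2, 64, 32, 32)
out = block(x)                              # residual defaults to x itself
print(out.shape)                            # torch.Size([2, 64, 32, 32])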

For more detail, see this write-up: http://t.csdn.cn/R8wOC

Camera (depth_lss.py, base.py, bev_pool.py)

@VTRANSFORMS.register_module()
self.dtransform = nn.Sequential(  # nn.Sequential chains the layers below into one sequential model
    nn.Conv2d(1, 8, 1),  # conv layer: 1 input channel, 8 output channels, 1x1 kernel
    nn.BatchNorm2d(8),   # batch normalization over each channel of the conv output
    nn.ReLU(True),       # ReLU activation applied to the normalized output
    nn.Conv2d(8, 32, 5, stride=4, padding=2),  # 8 -> 32 channels, 5x5 kernel, stride 4, padding 2
    nn.BatchNorm2d(32),
    nn.ReLU(True),
    nn.Conv2d(32, 64, 5, stride=2, padding=2),
    nn.BatchNorm2d(64),
    nn.ReLU(True),
)

# a small CNN: the input goes through a series of convolution, batch-norm and ReLU layers; the output has self.D + self.C channels
self.depthnet = nn.Sequential(
    nn.Conv2d(in_channels + 64, in_channels, 3, padding=1),
    nn.BatchNorm2d(in_channels),
    nn.ReLU(True),
    nn.Conv2d(in_channels, in_channels, 3, padding=1),
    nn.BatchNorm2d(in_channels),
    nn.ReLU(True),
    nn.Conv2d(in_channels, self.D + self.C, 1),
)

# downsampling

if downsample > 1:
    assert downsample == 2, downsample  # only a factor of 2 is supported; the value itself serves as the assertion message
    self.downsample = nn.Sequential(
        nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(True),
        nn.Conv2d(
            out_channels,
            out_channels,
            3,
            stride=downsample,
            padding=1,
            bias=False,
        ),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(True),
        nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(True),
    )
else:
    # when no downsampling is needed, self.downsample is nn.Identity(), i.e. an identity mapping
    self.downsample = nn.Identity()

The pipeline inside get_cam_feats:

  1. The images go through the SwinTransformer backbone to produce the camera feature x (W, H, C).
  2. LiDAR points are projected onto the image to get a per-pixel depth map d (W, H, 1), which is passed through three conv layers (dtransform) to obtain a depth feature.
  3. The camera feature and the depth feature are concatenated and passed through three conv layers (depthnet) to get the final feature.
  4. A softmax over the depth channels gives the actual depth distribution, which is multiplied with the feature to produce the final depth-aware camera feature.

@force_fp32()  # force the operators in this module to run in float32
# extract camera features from the input tensors
def get_cam_feats(self, x, d):
    B, N, C, fH, fW = x.shape  # batch size B, number of camera views N, channels C, feature height fH, feature width fW

    d = d.view(B * N, *d.shape[2:])  # merge the batch and camera dimensions of d, keeping the remaining dims
    x = x.view(B * N, C, fH, fW)

    d = self.dtransform(d)  # extract the depth feature (the LSS-style depth branch)
    x = torch.cat([d, x], dim=1)  # concatenate d and x along dim 1 (the channel dimension)
    x = self.depthnet(x)  # output has D + C channels

    depth = x[:, : self.D].softmax(dim=1)  # take the first self.D channels and softmax them: the depth distribution (weights over the depth candidates)
    x = depth.unsqueeze(1) * x[:, self.D : (self.D + self.C)].unsqueeze(2)
    # unsqueeze inserts a new dimension; depth distribution * feature lifts the 2D feature into 3D space

    x = x.view(B, N, self.C, self.D, fH, fW)
    x = x.permute(0, 1, 3, 4, 5, 2)  # reorder the dimensions
    return x
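
A shape-only sketch of the lift step above (the sizes D=118 and C=80 are made up for illustration):

import torch

BN, D, C, fH, fW = 6, 118, 80, 32, 88
x = torch.randn(BN, D + C, fH, fW)               # stand-in for the depthnet output

depth = x[:, :D].softmax(dim=1)                  # (BN, D, fH, fW): depth distribution
feat = x[:, D:D + C]                             # (BN, C, fH, fW): context feature
lifted = depth.unsqueeze(1) * feat.unsqueeze(2)  # broadcasting -> (BN, C, D, fH, fW)
print(lifted.shape)                              # torch.Size([6, 80, 118, 32, 88])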
# forward computation on the input
def forward(self, *args, **kwargs):
    x = super().forward(*args, **kwargs)
    x = self.downsample(x)
    return x

The depth-aware camera feature is then projected into BEV using the intrinsics and extrinsics, giving the x, y position of every pixel; this is the actual BEV feature.

Because every image is projected into BEV, several camera features can land in the same grid cell, so a BEV pooling step is needed (essentially the voxel pooling from LSS).

@force_fp32()
def bev_pool(self, geom_feats, x):
# geom_feats are the pre-computed 3D coordinates; x is the camera feature
    B, N, D, H, W, C = x.shape
    Nprime = B * N * D * H * W

    # flatten x
    x = x.reshape(Nprime, C)  # flatten the feature point cloud: B*N*D*H*W points in total

    # flatten indices
    geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long()  # convert ego-frame coordinates into voxel coordinates (compute grid indices and truncate to integers)
    geom_feats = geom_feats.view(Nprime, 3)
    batch_ix = torch.cat(
        [
            torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long)
            for ix in range(B)
        ]
    )  # for each sample, build a column filled with its batch index and concatenate them; shape (Nprime, 1): which batch each point belongs to

    geom_feats = torch.cat((geom_feats, batch_ix), 1)  # shape (Nprime, 4)

    # filter out points that fall outside the grid, e.g. x: 0~199, y: 0~199, z: 0
    kept = (
        (geom_feats[:, 0] >= 0)
        & (geom_feats[:, 0] < self.nx[0])
        & (geom_feats[:, 1] >= 0)
        & (geom_feats[:, 1] < self.nx[1])
        & (geom_feats[:, 2] >= 0)
        & (geom_feats[:, 2] < self.nx[2])
    )
    x = x[kept]
    geom_feats = geom_feats[kept]

    x = bev_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1])  # pooling over the BEV grid

    # collapse Z
    final = torch.cat(x.unbind(dim=2), 1)

    return final# final: bs x 64 x 200 x 200

The actual bev_pool implementation lives in mmdet3d.ops.

# pools the features by coordinate: a rank is computed for each coordinate from its position in the grid, a cumulative sum is run over the (sorted) feature tensor, and the result is permuted and reshaped to give the pooled output

def bev_pool(feats, coords, B, D, H, W):
    assert feats.shape[0] == coords.shape[0]  # the first dimensions must match

    ranks = (
        coords[:, 0] * (W * D * B)
        + coords[:, 1] * (D * B)
        + coords[:, 2] * B
        + coords[:, 3]
    )
    indices = ranks.argsort()  # sort the ranks in ascending order and return the index tensor
    feats, coords, ranks = feats[indices], coords[indices], ranks[indices]

    x = QuickCumsumCuda.apply(feats, coords, ranks, B, D, H, W)  # cumulative-sum pooling; arguments: features, coordinates, ranks, batch size, and the grid depth/height/width
    x = x.permute(0, 4, 1, 2, 3).contiguous()
    return x
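
For reference, a rough pure-PyTorch version of what QuickCumsumCuda computes (the LSS "cumsum trick"). This is only a sketch: the coordinate column order (x, y, z, batch) and the output layout (B, D, H, W, C) are assumed from the calling code above.

import torch

def bev_pool_reference(feats, coords, ranks, B, D, H, W):
    # feats / coords / ranks must already be sorted by ranks, as in bev_pool above
    x = feats.cumsum(dim=0)
    kept = torch.ones(x.shape[0], dtype=torch.bool, device=x.device)
    kept[:-1] = ranks[1:] != ranks[:-1]       # keep the last entry of every rank group
    x, coords = x[kept], coords[kept]
    x = torch.cat((x[:1], x[1:] - x[:-1]))    # undo the cumsum -> per-cell feature sums

    out = torch.zeros((B, D, H, W, x.shape[1]), dtype=x.dtype, device=x.device)
    out[coords[:, 3], coords[:, 2], coords[:, 0], coords[:, 1]] = x  # scatter into the dense grid
    return out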

Reference: https://zhuanlan.zhihu.com/p/560000194

Lidar (pillar_encoder.py)

pillar_encoder.py

# point cloud encoder

class PointPillarsEncoder(nn.Module):
    def __init__(
        self,
        pts_voxel_encoder: Dict[str, Any],
        pts_middle_encoder: Dict[str, Any],
        **kwargs,
    ):
        super().__init__()
        self.pts_voxel_encoder = build_backbone(pts_voxel_encoder)
        self.pts_middle_encoder = build_backbone(pts_middle_encoder)

    def forward(self, feats, coords, batch_size, sizes):
        x = self.pts_voxel_encoder(feats, sizes, coords)
        x = self.pts_middle_encoder(x, coords, batch_size)
        return x
# converts point features into pillar features (the middle encoder's scatter step)
def forward(self, voxel_features, coords, batch_size):
    # batch_canvas will be the final output canvas
    batch_canvas = []
    for batch_itt in range(batch_size):
        # Create the canvas for this sample
        canvas = torch.zeros(
            self.in_channels,
            self.nx * self.ny,  # number of canvas rows x columns
            dtype=voxel_features.dtype,
            device=voxel_features.device,
        )

        # Only include non-empty pillars
        batch_mask = coords[:, 0] == batch_itt  # mask selecting the coordinates belonging to this sample

        this_coords = coords[batch_mask, :]  # coordinates of this sample
        # modified -> xyz coords
        indices = this_coords[:, 1] * self.ny + this_coords[:, 2]
        indices = indices.type(torch.long)
        voxels = voxel_features[batch_mask, :]  # voxel features of this sample
        voxels = voxels.t()  # transpose to (channels, num_voxels)

        # Now scatter the blob back to the canvas: write each voxel's features into its position on the canvas
        canvas[:, indices] = voxels

        # Append to a list for later stacking into a 3-D tensor
        batch_canvas.append(canvas)

    # Stack to 3-dim tensor (batch-size, nchannels, nrows*ncols)
    batch_canvas = torch.stack(batch_canvas, 0)  # stack into a 3-D tensor

    # Undo the column stacking to get the final 4-D tensor
    batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.nx, self.ny)
    return batch_canvas
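
A toy illustration of the scatter step (sizes and coordinates made up): two non-empty pillars are written into a 3x2 canvas.

import torch

in_channels, nx, ny = 4, 3, 2
voxel_features = torch.arange(8, dtype=torch.float).view(2, in_channels)  # (num_pillars, C)
this_coords = torch.tensor([[0, 0, 1], [0, 2, 0]])                        # (batch, x, y) per pillar

canvas = torch.zeros(in_channels, nx * ny)
indices = (this_coords[:, 1] * ny + this_coords[:, 2]).long()  # flatten (x, y) into a column index
canvas[:, indices] = voxel_features.t()                        # scatter the pillar features
print(canvas.view(in_channels, nx, ny)[:, 0, 1])               # features of the pillar at x=0, y=1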
class PillarFeatureNet(nn.Module):
# excerpt from __init__:
super().__init__()
self.name = "PillarFeatureNet"
assert len(feat_channels) > 0

self.in_channels = in_channels
in_channels += 5
if with_distance:
    in_channels += 1
self._with_distance = with_distance

# Create PillarFeatureNet layers
feat_channels = [in_channels] + list(feat_channels)  # prepend in_channels to the feat_channels list
pfn_layers = []
for i in range(len(feat_channels) - 1):
    in_filters = feat_channels[i]       # channel count at the current index
    out_filters = feat_channels[i + 1]  # channel count at the next index
    if i < len(feat_channels) - 2:      # not the last layer
        last_layer = False
    else:
        last_layer = True
    pfn_layers.append(
        PFNLayer(
            in_filters, out_filters, norm_cfg=norm_cfg, last_layer=last_layer
        )
    )
self.pfn_layers = nn.ModuleList(pfn_layers)  # wrap in an nn.ModuleList

# Need pillar (voxel) size and x/y offset in order to calculate pillar offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.x_offset = self.vx / 2 + point_cloud_range[0]  # x offset of the pillar grid origin
self.y_offset = self.vy / 2 + point_cloud_range[1]  # y offset of the pillar grid origin
def forward(self, features, num_voxels, coors):
    device = features.device

    dtype = features.dtype

    # Find distance of x, y, and z from cluster center
    # features = features[:, :, :self.num_input]
    points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_voxels.type_as(
        features
    ).view(-1, 1, 1)  # mean of all points in each pillar (voxel), keeping the middle dimension
    f_cluster = features[:, :, :3] - points_mean  # offset of each point's (x, y, z) from its pillar's cluster center

    # Find distance of x, y, and z from pillar center
    # f_center = features[:, :, :2]
    # modified according to xyz coords
    # compute the x, y offsets relative to the pillar center
    f_center = torch.zeros_like(features[:, :, :2])
    f_center[:, :, 0] = features[:, :, 0] - (
        coors[:, 1].to(dtype).unsqueeze(1) * self.vx + self.x_offset
    )
    f_center[:, :, 1] = features[:, :, 1] - (
        coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset
    )

    # Combine together feature decorations
    features_ls = [features, f_cluster, f_center]
    if self._with_distance:
        points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)  # L2 norm (distance from the origin)
        features_ls.append(points_dist)
    features = torch.cat(features_ls, dim=-1)

    # The feature decorations were calculated without regard to whether pillar was empty. Need to ensure that
    # empty pillars remain set to zeros.
    voxel_count = features.shape[1]
    mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
    mask = torch.unsqueeze(mask, -1).type_as(features)
    features *= mask

    # Forward pass through PFNLayers
    for pfn in self.pfn_layers:
        features = pfn(features)

    return features.squeeze()
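
A shape-only sketch of the feature decoration above (P pillars, N points per pillar and 5 raw channels are made-up sizes; f_center is only a placeholder here rather than the real pillar-center offsets):

import torch

P, N, C_in = 100, 20, 5
features = torch.randn(P, N, C_in)
num_voxels = torch.randint(1, N + 1, (P,))

points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_voxels.view(-1, 1, 1)
f_cluster = features[:, :, :3] - points_mean   # (P, N, 3): offsets from the cluster mean
f_center = torch.randn(P, N, 2)                # (P, N, 2): placeholder for the pillar-center offsets
decorated = torch.cat([features, f_cluster, f_center], dim=-1)
print(decorated.shape)                         # torch.Size([100, 20, 10]) = C_in + 5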

bevfusion.py

Module loading: the BEVFusion pipeline

self.encoders = nn.ModuleDict()
# whether a camera encoder is used
if encoders.get("camera") is not None:
    self.encoders["camera"] = nn.ModuleDict(
        {
            "backbone": build_backbone(encoders["camera"]["backbone"]),
            "neck": build_neck(encoders["camera"]["neck"]),
            "vtransform": build_vtransform(encoders["camera"]["vtransform"]),
        }
    )
# whether a lidar encoder is used
if encoders.get("lidar") is not None:
    if encoders["lidar"]["voxelize"].get("max_num_points", -1) > 0:
        voxelize_module = Voxelization(**encoders["lidar"]["voxelize"])
    else:
        voxelize_module = DynamicScatter(**encoders["lidar"]["voxelize"])
    self.encoders["lidar"] = nn.ModuleDict(
        {
            "voxelize": voxelize_module,
            "backbone": build_backbone(encoders["lidar"]["backbone"]),
        }
    )
    self.voxelize_reduce = encoders["lidar"].get("voxelize_reduce", True)
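
The structure of the encoders config roughly looks like the sketch below (type names and hyper-parameters are illustrative only; the real values live in the repo's config files):

encoders = dict(
    camera=dict(
        backbone=dict(type="SwinTransformer"),      # image backbone
        neck=dict(type="GeneralizedLSSFPN"),        # feature pyramid neck
        vtransform=dict(type="DepthLSSTransform"),  # camera-to-BEV view transform
    ),
    lidar=dict(
        voxelize=dict(max_num_points=10,            # > 0 -> hard voxelization via Voxelization
                      voxel_size=[0.1, 0.1, 0.2],
                      point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]),
        backbone=dict(type="PointPillarsEncoder"),
        voxelize_reduce=True,
    ),
)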
# extract camera features
def extract_camera_features(
    self,
    x,
    points,
    camera2ego,  # transform from the camera frame to the ego frame
    lidar2ego,  # transform from the lidar frame to the ego frame
    lidar2camera,  # transform from the lidar frame to the camera frame
    lidar2image,  # transform from the lidar frame to the image plane
    camera_intrinsics,  # camera intrinsic matrices
    camera2lidar,  # transform from the camera frame to the lidar frame
    img_aug_matrix,  # image augmentation matrix
    lidar_aug_matrix,  # lidar (point cloud) augmentation matrix
    img_metas,  # image meta data
) -> torch.Tensor:  # returns a torch.Tensor
    B, N, C, H, W = x.size()
    x = x.view(B * N, C, H, W)

    x = self.encoders["camera"]["backbone"](x)  # image encoding
    x = self.encoders["camera"]["neck"](x)

    if not isinstance(x, torch.Tensor):
        x = x[0]

    BN, C, H, W = x.size()
    x = x.view(B, int(BN / B), C, H, W)  # reshape back to (B, N, C, H, W)

    x = self.encoders["camera"]["vtransform"](
        x,
        points,
        camera2ego,
        lidar2ego,
        lidar2camera,
        lidar2image,
        camera_intrinsics,
        camera2lidar,
        img_aug_matrix,
        lidar_aug_matrix,
        img_metas,
    )
    return x
def extract_lidar_features(self, x) -> torch.Tensor:
    feats, coords, sizes = self.voxelize(x)  # convert the input point cloud x into voxels
    batch_size = coords[-1, 0] + 1
    x = self.encoders["lidar"]["backbone"](feats, coords, batch_size, sizes=sizes)
    return x
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
    feats, coords, sizes = [], [], []
    for k, res in enumerate(points):
        ret = self.encoders["lidar"]["voxelize"](res)  # discretize the point cloud
        if len(ret) == 3:
            # hard voxelization (more efficient)
            f, c, n = ret
        else:
            assert len(ret) == 2
            # soft (dynamic) voxelization (more accurate)
            f, c = ret
            n = None
        feats.append(f)
        coords.append(F.pad(c, (1, 0), mode="constant", value=k))
        if n is not None:
            sizes.append(n)

    feats = torch.cat(feats, dim=0)
    coords = torch.cat(coords, dim=0)
    if len(sizes) > 0:  # per-voxel point counts are available
        sizes = torch.cat(sizes, dim=0)
        if self.voxelize_reduce:
            feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(
                -1, 1
            )
            feats = feats.contiguous()  # lay the feature tensor out in contiguous memory

    return feats, coords, sizes
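
What the F.pad call above does: it prepends a constant column holding the sample index k to the voxel coordinates (toy values below).

import torch
import torch.nn.functional as F

c = torch.tensor([[12, 40, 3], [7, 5, 0]])  # (num_voxels, 3) voxel coordinates of one sample
k = 1                                       # index of this sample within the batch
print(F.pad(c, (1, 0), mode="constant", value=k))
# tensor([[ 1, 12, 40,  3],
#         [ 1,  7,  5,  0]])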
def forward_single(
...
    features = []
    for sensor in (
        self.encoders if self.training else list(self.encoders.keys())[::-1]
    ):  # in training, iterate over self.encoders in order; otherwise iterate over the sensor keys in reverse
        if sensor == "camera":
            feature = self.extract_camera_features(
                img,
                points,
                camera2ego,
                lidar2ego,
                lidar2camera,
                lidar2image,
                camera_intrinsics,
                camera2lidar,
                img_aug_matrix,
                lidar_aug_matrix,
                metas,
            )
        elif sensor == "lidar":
            feature = self.extract_lidar_features(points)
        else:
            raise ValueError(f"unsupported sensor: {sensor}")
        features.append(feature)

    if not self.training:
        # avoid OOM (out of memory); restore the original feature order
        features = features[::-1]

    if self.fuser is not None:
        x = self.fuser(features)  # fuse the per-sensor features
    else:
        assert len(features) == 1, features
        x = features[0]

    batch_size = x.shape[0]

    x = self.decoder["backbone"](x)
    x = self.decoder["neck"](x)

    if self.training:
        outputs = {}
        for type, head in self.heads.items():  # iterate over each head network in self.heads and run the corresponding computation
            if type == "object":
                pred_dict = head(x, metas)
                losses = head.loss(gt_bboxes_3d, gt_labels_3d, pred_dict)  # loss between the predictions and the ground-truth labels
            elif type == "map":
                losses = head(x, gt_masks_bev)  # loss between the predictions and the ground-truth BEV masks
            else:
                raise ValueError(f"unsupported head: {type}")
            for name, val in losses.items():
                if val.requires_grad:
                    outputs[f"loss/{type}/{name}"] = val * self.loss_scale[type]
                else:
                    outputs[f"stats/{type}/{name}"] = val
        return outputs
    else:
        outputs = [{} for _ in range(batch_size)]
        for type, head in self.heads.items():
            if type == "object":
                pred_dict = head(x, metas)
                bboxes = head.get_bboxes(pred_dict, metas)  # decode the predicted box coordinates
                for k, (boxes, scores, labels) in enumerate(bboxes):
                    outputs[k].update(
                        {
                            "boxes_3d": boxes.to("cpu"),
                            "scores_3d": scores.cpu(),
                            "labels_3d": labels.cpu(),
                        }
                    )
            elif type == "map":
                logits = head(x)
                for k in range(batch_size):
                    outputs[k].update(
                        {
                            "masks_bev": logits[k].cpu(),
                            "gt_masks_bev": gt_masks_bev[k].cpu(),
                        }
                    )
            else:
                raise ValueError(f"unsupported head: {type}")
        return outputs

base.py: BaseTransform converts image and point-cloud features into BEV

# frustum creation
def create_frustum(self):
    iH, iW = self.image_size
    fH, fW = self.feature_size

    ds = (
        torch.arange(*self.dbound, dtype=torch.float)  # self.dbound gives the depth range (start, stop, step) of the frustum
        .view(-1, 1, 1)  # -1 lets the size of this dimension be inferred automatically
        .expand(-1, fH, fW)
    )
    D, _, _ = ds.shape

    xs = (
        torch.linspace(0, iW - 1, fW, dtype=torch.float)
        .view(1, 1, fW)
        .expand(D, fH, fW)
    )
    ys = (
        torch.linspace(0, iH - 1, fH, dtype=torch.float)
        .view(1, fH, 1)
        .expand(D, fH, fW)
    )

    frustum = torch.stack((xs, ys, ds), -1)  # stack along the last dimension into a tensor of shape (D, fH, fW, 3)
    return nn.Parameter(frustum, requires_grad=False)
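
A shape sketch of the frustum with made-up sizes (image 256x704, feature map 32x88, depth candidates from 1 m to 60 m in 0.5 m steps):

import torch

iH, iW, fH, fW = 256, 704, 32, 88
dbound = (1.0, 60.0, 0.5)

ds = torch.arange(*dbound).view(-1, 1, 1).expand(-1, fH, fW)
D = ds.shape[0]                                                      # 118 depth candidates
xs = torch.linspace(0, iW - 1, fW).view(1, 1, fW).expand(D, fH, fW)
ys = torch.linspace(0, iH - 1, fH).view(1, fH, 1).expand(D, fH, fW)

frustum = torch.stack((xs, ys, ds), -1)
print(frustum.shape)  # torch.Size([118, 32, 88, 3]): a (u, v, depth) triple per cell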
# geometric transformation from the camera frame to the lidar frame
def get_geometry(
    self,
    camera2lidar_rots,  # camera-to-lidar rotation matrices, shape B x N x 3 x 3
    camera2lidar_trans,  # camera-to-lidar translation vectors, shape B x N x 3
    intrins,  # camera intrinsic matrices
    post_rots,  # rotation part of the image-space augmentation, shape B x N x 3 x 3
    post_trans,  # translation part of the image-space augmentation, shape B x N x 3
    **kwargs,  # other keyword arguments
):
    B, N, _ = camera2lidar_trans.shape

    # undo post-transformation
    # i.e. apply the inverse of the image-space augmentation
    # B x N x D x H x W x 3
    points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
    points = (
        torch.inverse(post_rots)  # inverse of post_rots
        .view(B, N, 1, 1, 1, 3, 3)
        .matmul(points.unsqueeze(-1))  # points.unsqueeze(-1) has shape (B, N, D, H, W, 3, 1)
    )
    # cam_to_lidar
    points = torch.cat(
        (
            points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
            points[:, :, :, :, :, 2:3],
        ),
        5,
    )
    combine = camera2lidar_rots.matmul(torch.inverse(intrins))  # multiply the rotation matrices by the inverse intrinsics to get a single transform
    points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
    points += camera2lidar_trans.view(B, N, 1, 1, 1, 3)

    if "extra_rots" in kwargs:
        extra_rots = kwargs["extra_rots"]
        points = (
            extra_rots.view(B, 1, 1, 1, 1, 3, 3)
            .repeat(1, N, 1, 1, 1, 1, 1)#(B, N, 1, 1, 1, 3, 3)
            .matmul(points.unsqueeze(-1))
            .squeeze(-1)
        )
    if "extra_trans" in kwargs:
        extra_trans = kwargs["extra_trans"]
        points += extra_trans.view(B, 1, 1, 1, 1, 3).repeat(1, N, 1, 1, 1, 1)

    return points
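
The core un-projection inside get_geometry, as a single-pixel sketch (the intrinsic matrix K below is made up): a pixel (u, v) with depth d maps to camera coordinates via K^-1 @ [u*d, v*d, d].

import torch

K = torch.tensor([[800.0,   0.0, 352.0],
                  [  0.0, 800.0, 128.0],
                  [  0.0,   0.0,   1.0]])
u, v, d = 400.0, 150.0, 20.0
pixel = torch.tensor([u * d, v * d, d])  # this is what the torch.cat(...) above builds
cam_point = torch.inverse(K) @ pixel
print(cam_point)  # tensor([ 1.2000,  0.5500, 20.0000]) -> (x, y, z) in the camera frame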
def bev_pool(self, geom_feats, x):  # generate BEV-space features from these points
# depth transform: projects the input point cloud into a depth image, applying the corresponding coordinate transforms and normalization
class BaseDepthTransform(BaseTransform):
    @force_fp32()
    def forward(
        self,
        img,
        points,
        sensor2ego,
        lidar2ego,
        lidar2camera,
        lidar2image,
        cam_intrinsic,
        camera2lidar,
        img_aug_matrix,
        lidar_aug_matrix,
        metas,
        **kwargs,
    ):
        rots = sensor2ego[..., :3, :3]
        trans = sensor2ego[..., :3, 3]
        intrins = cam_intrinsic[..., :3, :3]
        post_rots = img_aug_matrix[..., :3, :3]
        post_trans = img_aug_matrix[..., :3, 3]
        lidar2ego_rots = lidar2ego[..., :3, :3]
        lidar2ego_trans = lidar2ego[..., :3, 3]
        camera2lidar_rots = camera2lidar[..., :3, :3]
        camera2lidar_trans = camera2lidar[..., :3, 3]

        # print(img.shape, self.image_size, self.feature_size)

        batch_size = len(points)
        depth = torch.zeros(batch_size, img.shape[1], 1, *self.image_size).to(
            points[0].device
        )

        for b in range(batch_size):
            cur_coords = points[b][:, :3]
            cur_img_aug_matrix = img_aug_matrix[b]
            cur_lidar_aug_matrix = lidar_aug_matrix[b]
            cur_lidar2image = lidar2image[b]

            # inverse aug
            cur_coords -= cur_lidar_aug_matrix[:3, 3]
            cur_coords = torch.inverse(cur_lidar_aug_matrix[:3, :3]).matmul(
                cur_coords.transpose(1, 0)
            )
            # lidar2image
            cur_coords = cur_lidar2image[:, :3, :3].matmul(cur_coords)
            cur_coords += cur_lidar2image[:, :3, 3].reshape(-1, 3, 1)
            # get 2d coords
            dist = cur_coords[:, 2, :]
            cur_coords[:, 2, :] = torch.clamp(cur_coords[:, 2, :], 1e-5, 1e5)  # clamp the depth to [1e-5, 1e5] to avoid dividing by zero
            cur_coords[:, :2, :] /= cur_coords[:, 2:3, :]

            # imgaug
            cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords)
            cur_coords += cur_img_aug_matrix[:, :3, 3].reshape(-1, 3, 1)
            cur_coords = cur_coords[:, :2, :].transpose(1, 2)

            # normalize coords for grid sample
            cur_coords = cur_coords[..., [1, 0]]

            on_img = (
                (cur_coords[..., 0] < self.image_size[0])
                & (cur_coords[..., 0] >= 0)
                & (cur_coords[..., 1] < self.image_size[1])
                & (cur_coords[..., 1] >= 0)
            )  # check whether each point falls inside the image bounds
            for c in range(on_img.shape[0]):
                masked_coords = cur_coords[c, on_img[c]].long()
                masked_dist = dist[c, on_img[c]]  # distance of each on-image point along the camera axis
                depth[b, c, 0, masked_coords[:, 0], masked_coords[:, 1]] = masked_dist  # write that distance into the corresponding pixel of the depth map

        extra_rots = lidar_aug_matrix[..., :3, :3]
        extra_trans = lidar_aug_matrix[..., :3, 3]
        geom = self.get_geometry(
            camera2lidar_rots,
            camera2lidar_trans,
            intrins,
            post_rots,
            post_trans,
            extra_rots=extra_rots,
            extra_trans=extra_trans,
        )

        x = self.get_cam_feats(img, depth)
        x = self.bev_pool(geom, x)
        return x
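
A toy version of the sparse depth-map fill at the end of the loop above (image size and point coordinates made up): each projected point's distance along the camera axis is written into the (1, H, W) depth image.

import torch

H, W = 8, 8
depth = torch.zeros(1, H, W)
masked_coords = torch.tensor([[2, 3], [5, 1]])  # (row, col) pixel indices of the projected points
masked_dist = torch.tensor([12.5, 30.0])        # distance of each point along the camera axis
depth[0, masked_coords[:, 0], masked_coords[:, 1]] = masked_dist
print(depth[0, 2, 3], depth[0, 5, 1])           # tensor(12.5000) tensor(30.)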