D2L学习笔记-计算机视觉

2023-03-10
Author AsanoSaki
~52.12K words

李沐动手学深度学习（PyTorch）课程学习笔记第七章：计算机视觉。

1. 图像增广

图像增广在对训练图像进行一系列的随机变化之后，生成相似但不同的训练样本，从而扩大了训练集的规模。此外，应用图像增广的原因是，随机改变训练样本可以减少模型对某些属性的依赖，从而提高模型的泛化能力。例如，我们可以以不同的方式裁剪图像，使感兴趣的对象出现在不同的位置，减少模型对于对象出现位置的依赖。我们还可以调整亮度、颜色等因素来降低模型对颜色的敏感度。

下面的代码有50%的几率使图像向左或向右翻转：

1	trans = torchvision.transforms.RandomHorizontalFlip()

有50%的几率向上或向下翻转，注意，上下翻转图像不如左右图像翻转那样常用，需要根据数据集的特征考虑是否可以将图像上下翻转：

1	trans = torchvision.transforms.RandomVerticalFlip()

随机裁剪一个面积为原始面积10%到100%的区域，该区域的宽高比从0.5~2之间随机取值。然后，区域的宽度和高度都被缩放到200像素：

1	trans = torchvision.transforms.RandomResizedCrop((200, 200), scale=(0.1, 1), ratio=(0.5, 2))

我们可以改变图像颜色的四个方面：亮度、对比度、饱和度和色调。在下面的示例中，我们随机更改图像的亮度，随机值为原始图像的50%(1 - 0.5)到150%(1 + 0.5)之间：

1	trans = torchvision.transforms.ColorJitter(brightness=0.5, contrast=0, saturation=0, hue=0)

在实践中，我们将结合多种图像增广方法。我们可以通过使用一个 Compose 实例来综合上面定义的不同的图像增广方法，并将它们应用到每个图像：

# Chain several augmentations; they are applied in order to each image.
trans = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),  # flip the image horizontally with 50% probability
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
    transforms.ToTensor()])

图像增广可以直接作用在图像数据上，也可以在使用 torchvision.datasets 导入数据集的时候通过 transform 参数指定：

1
2
3

X = trans(X)

cifar_train = torchvision.datasets.CIFAR10(root="../data", train=True, transform=trans, download=True)

2. 微调

微调（fine-tuning）是迁移学习（transfer learning）中的常见技巧，微调包括以下四个步骤：

在源数据集（例如 ImageNet 数据集）上预训练神经网络模型，即源模型。
创建一个新的神经网络模型，即目标模型。这将复制源模型上的所有模型设计及其参数（输出层除外）。我们假定这些模型参数包含从源数据集中学到的知识，这些知识也将适用于目标数据集。我们还假设源模型的输出层与源数据集的标签密切相关；因此不在目标模型中使用该层。
向目标模型添加输出层，其输出数是目标数据集中的类别数。然后随机初始化该层的模型参数。
在目标数据集（如椅子数据集）上训练目标模型。输出层将从头开始进行训练，而所有其他层的参数将根据源模型的参数进行微调。

当目标数据集比源数据集小得多时，微调有助于提高模型的泛化能力。

我们将在一个 CIFAR10 数据集上微调 ResNet-18 模型。该模型已在 ImageNet 数据集上进行了预训练：

import torch
import torch.nn as nn
import torchvision
from torch.utils import data
from torchvision import transforms
from torchvision import models
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# ---------- Data ----------
# Training pipeline: random resized crop to 224 plus random horizontal flip.
train_trans = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # Standardize each RGB channel with the ImageNet mean and standard deviation.
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])])

# Evaluation pipeline: deterministic resize to 256x256, then a 224 centre crop.
test_trans = transforms.Compose([
    transforms.Resize([256, 256]),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])])

batch_size = 128
cifar_train = torchvision.datasets.CIFAR10(root="../data", train=True, transform=train_trans, download=True)
cifar_test = torchvision.datasets.CIFAR10(root="../data", train=False, transform=test_trans, download=True)
# Only the training loader shuffles; num_workers=0 loads data in the main process.
train_iter = data.DataLoader(cifar_train, batch_size, shuffle=True, num_workers=0)
test_iter = data.DataLoader(cifar_test, batch_size, shuffle=False, num_workers=0)

# ---------- ResNet-18 ----------
def resnet_model(num_classes, use_pretrained=True):
    """Build a ResNet-18 whose final layer emits ``num_classes`` outputs.

    Args:
        num_classes: Number of output units of the replacement ``fc`` layer.
        use_pretrained: Forwarded to ``models.resnet18(pretrained=...)``.

    Returns:
        The ResNet-18 module with a new ``fc`` layer whose weight is
        Xavier-uniform initialized.
    """
    model = models.resnet18(pretrained=use_pretrained)
    # Replace the classification head, keeping its input width.
    head = nn.Linear(model.fc.in_features, num_classes)
    nn.init.xavier_uniform_(head.weight)
    model.fc = head
    return model

net = resnet_model(10)

# ---------- Train ----------
# If param_group=True, the output layer's parameters use a 10x learning rate.
def train(net, train_iter, test_iter, num_epochs, lr, device, wd, param_group=True):
    """Fine-tune ``net`` with SGD, log to TensorBoard and keep the best weights.

    Args:
        net: Model to train; must expose an ``fc`` layer when ``param_group`` is True.
        train_iter: Iterable of ``(img, label)`` training batches.
        test_iter: Iterable of ``(img, label)`` validation batches.
        num_epochs: Number of passes over ``train_iter``.
        lr: Base learning rate.
        device: Device the model and batches are moved to.
        wd: Weight decay passed to SGD.
        param_group: If True, ``fc`` parameters are trained with ``lr * 10``.

    Side effects:
        Writes scalars under ``../logs/FineTune_CIFAR10_train_log`` and saves the
        state dict with the best validation accuracy to
        ``../save/FineTune_CIFAR10_train.params``.
    """
    import os  # local import: only needed to create the checkpoint directory

    print('training on', device)
    net.to(device)
    loss_function = nn.CrossEntropyLoss()
    loss_function.to(device)

    if param_group:
        params_1x = [param for name, param in net.named_parameters() if name not in ["fc.weight", "fc.bias"]]
        optimizer = torch.optim.SGD([{'params': params_1x},
                                     {'params': net.fc.parameters(), 'lr': lr * 10}], lr=lr, weight_decay=wd)
    else:
        optimizer = torch.optim.SGD(net.parameters(), lr=lr, weight_decay=wd)

    save_path = '../save/FineTune_CIFAR10_train.params'
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    writer = SummaryWriter('../logs/FineTune_CIFAR10_train_log')

    best_acc = 0.0
    try:
        for epoch in range(num_epochs):
            net.train()
            train_loss = []
            train_acc = []
            for img, label in tqdm(train_iter):
                img, label = img.to(device), label.to(device)
                label_hat = net(img)

                loss = loss_function(label_hat, label)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                label_hat = label_hat.argmax(axis=1)
                acc = (label_hat.type(label.dtype) == label).float().mean()
                train_loss.append(loss.item())
                # Store a Python float so the metric lists hold no device tensors.
                train_acc.append(acc.item())

            # Per-batch means are averaged unweighted (a short last batch counts equally).
            train_loss = sum(train_loss) / len(train_loss)
            train_acc = sum(train_acc) / len(train_acc)
            print(f"[ Train | {epoch + 1:03d}/{num_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")

            net.eval()
            valid_loss = []
            valid_acc = []
            with torch.no_grad():
                for img, label in tqdm(test_iter):
                    img, label = img.to(device), label.to(device)
                    label_hat = net(img)
                    loss = loss_function(label_hat, label)
                    label_hat = label_hat.argmax(axis=1)
                    acc = (label_hat.type(label.dtype) == label).float().mean()
                    valid_loss.append(loss.item())
                    valid_acc.append(acc.item())

            valid_loss = sum(valid_loss) / len(valid_loss)
            valid_acc = sum(valid_acc) / len(valid_acc)
            print(f"[ Valid | {epoch + 1:03d}/{num_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")

            writer.add_scalar('train_loss', train_loss, epoch + 1)
            writer.add_scalar('train_acc', train_acc, epoch + 1)
            writer.add_scalar('valid_loss', valid_loss, epoch + 1)
            writer.add_scalar('valid_acc', valid_acc, epoch + 1)

            # Checkpoint only when validation accuracy improves.
            if valid_acc > best_acc:
                best_acc = valid_acc
                torch.save(net.state_dict(), save_path)
                print('saving model with acc {:.3f}'.format(best_acc))
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr, wd, num_epochs = 0.0005, 0.001, 50

train(net, train_iter, test_iter, num_epochs, lr, device, wd, param_group=True)

3. 目标检测和边界框

在图像分类任务中，我们假设图像中只有一个主要物体对象，我们只关注如何识别其类别。然而，很多时候图像里有多个我们感兴趣的目标，我们不仅想知道它们的类别，还想得到它们在图像中的具体位置。在计算机视觉里，我们将这类任务称为目标检测（object detection）或目标识别（object recognition）。

下面加载本节将使用的示例图像。图像左边是一只狗，右边是一只猫。它们是这张图像里的两个主要目标：

import torch
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

plt.figure(dpi=100, figsize=(8, 6))
img = plt.imread('../images/catdog.jpg')
plt.imshow(img)
plt.show()

在目标检测中，我们通常使用边界框（bounding box）来描述对象的空间位置。边界框是矩形的，由矩形左上角的以及右下角的 x 和 y 坐标决定。另一种常用的边界框表示方法是边界框中心的 (x, y) 轴坐标以及框的宽度和高度。

在这里，我们定义在这两种表示法之间进行转换的函数：box_corner_to_center 从两角表示法转换为中心宽度表示法，而 box_center_to_corner 反之亦然。输入参数 boxes 可以是长度为4的张量，也可以是形状为 (N, 4) 的二维张量，其中 N 是边界框的数量。

def box_corner_to_center(boxes):
    """Convert boxes from (x1, y1, x2, y2) corners to (cx, cy, w, h).

    Args:
        boxes: Tensor whose last dimension has size 4, e.g. shape ``(4,)``
            for a single box or ``(N, 4)`` for ``N`` boxes.

    Returns:
        Tensor of the same shape holding (centre x, centre y, width, height).
    """
    # Index the last axis with ``...`` so a bare length-4 tensor works as well.
    x1, y1, x2, y2 = boxes[..., 0], boxes[..., 1], boxes[..., 2], boxes[..., 3]
    cx = (x1 + x2) / 2
    cy = (y1 + y2) / 2
    w = x2 - x1
    h = y2 - y1
    return torch.stack((cx, cy, w, h), dim=-1)

def box_center_to_corner(boxes):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2) corners.

    Args:
        boxes: Tensor whose last dimension has size 4, e.g. shape ``(4,)``
            for a single box or ``(N, 4)`` for ``N`` boxes.

    Returns:
        Tensor of the same shape holding (upper-left x, upper-left y,
        lower-right x, lower-right y).
    """
    # Index the last axis with ``...`` so a bare length-4 tensor works as well.
    cx, cy, w, h = boxes[..., 0], boxes[..., 1], boxes[..., 2], boxes[..., 3]
    x1 = cx - 0.5 * w
    y1 = cy - 0.5 * h
    x2 = cx + 0.5 * w
    y2 = cy + 0.5 * h
    return torch.stack((x1, y1, x2, y2), dim=-1)

我们将根据坐标信息定义图像中狗和猫的边界框。图像中坐标的原点是图像的左上角，向右的方向为 x 轴的正方向，向下的方向为 y 轴的正方向：

# "bbox" is short for bounding box; each list is (x1, y1, x2, y2) in pixels.
dog_bbox, cat_bbox = [60.0, 45.0, 378.0, 516.0], [400.0, 112.0, 655.0, 493.0]
boxes = torch.tensor((dog_bbox, cat_bbox))
# Converting corner -> centre -> corner should reproduce the input exactly.
print(box_center_to_corner(box_corner_to_center(boxes)) == boxes)
# tensor([[True, True, True, True],
#         [True, True, True, True]])

我们可以将边界框在图中画出，以检查其是否准确。画之前，我们定义一个辅助函数 bbox_to_rect。它将边界框表示成 matplotlib 的边界框格式，在图像上添加边界框之后，我们可以看到两个物体的主要轮廓基本上在两个框内：

def bbox_to_rect(bbox, color):
    """Turn an (x1, y1, x2, y2) box into an unfilled matplotlib rectangle.

    matplotlib expects ``xy=(upper-left x, upper-left y)`` plus a width and a
    height, so the lower-right corner is converted into extents.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    return plt.Rectangle(xy=(left, top), width=right - left, height=bottom - top,
                         fill=False, edgecolor=color, linewidth=2)

fig = plt.imshow(img)
fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))
fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'))
plt.show()

4. 目标检测数据集

目标检测领域没有像 MNIST 和 Fashion-MNIST 那样的小数据集。为了快速测试目标检测模型，我们收集并标记了一个小型数据集。首先，我们拍摄了一组香蕉的照片，并生成了1000张不同角度和大小的香蕉图像。然后，我们在一些背景图片的随机位置上放一张香蕉的图像。最后，我们在图片上为这些香蕉标记了边界框。

包含所有图像和 CSV 标签文件的香蕉检测数据集可以直接从互联网下载，通过 read_data_bananas 函数，我们读取香蕉检测数据集的图像和标签。该数据集的 CSV 文件内含目标类别标签和位于左上角和右下角的真实边界框坐标：

import torch
import torchvision
import os
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
from d2l import torch as d2l
from torch.utils.data import Dataset, DataLoader

d2l.DATA_HUB['banana-detection'] = (d2l.DATA_URL + 'banana-detection.zip', '5de26c8fce5ccdea9f91267273464dc968d20d72')

def read_data_bananas(is_train=True):
    """Read the images and labels of the banana detection dataset.

    Args:
        is_train: Read from ``bananas_train`` when True, else ``bananas_val``.

    Returns:
        A pair ``(images, labels)``: a list with one image tensor per CSV row,
        and a tensor built from the CSV rows with an extra axis inserted at
        dimension 1 and every value divided by 256.
    """
    data_dir = d2l.download_extract('banana-detection')  # presumably extracted under ../data -- verify
    split_dir = os.path.join(data_dir, 'bananas_train' if is_train else 'bananas_val')
    csv_data = pd.read_csv(os.path.join(split_dir, 'label.csv')).set_index('img_name')
    images, targets = [], []
    for img_name, target in csv_data.iterrows():
        img_path = os.path.join(split_dir, 'images', f'{img_name}')
        images.append(torchvision.io.read_image(img_path))
        # Each target is (class, upper-left x, upper-left y, lower-right x, lower-right y);
        # every image shares the same banana class (index 0).
        targets.append(list(target))
    return images, torch.tensor(targets).unsqueeze(1) / 256

以下 BananasDataset 类别将允许我们创建一个自定义 Dataset 实例来加载香蕉检测数据集：

class BananasDataset(Dataset):
    """Custom dataset that serves the banana detection images and labels."""

    def __init__(self, is_train):
        # Load everything once; report how many examples were read.
        self.features, self.labels = read_data_bananas(is_train)
        suffix = ' training examples' if is_train else ' validation examples'
        print(f'read {len(self.features)}{suffix}')

    def __getitem__(self, idx):
        # Images are converted to float on access; labels are returned as stored.
        image = self.features[idx].float()
        return (image, self.labels[idx])

    def __len__(self):
        return len(self.features)

最后，我们定义 load_data_bananas 函数，来为训练集和测试集返回两个数据加载器实例。对于测试集，无须按随机顺序读取它：

def load_data_bananas(batch_size):
    """Return ``(train_iter, val_iter)`` loaders for the banana detection dataset.

    Only the training loader shuffles its examples.
    """
    train_set = BananasDataset(is_train=True)
    train_iter = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_set = BananasDataset(is_train=False)
    val_iter = DataLoader(val_set, batch_size=batch_size)
    return train_iter, val_iter

让我们读取一个小批量，并打印其中的图像和标签的形状。图像的小批量的形状为：(批量大小, 通道数, 高度, 宽度)，它与我们之前图像分类任务中的相同。标签的小批量的形状为：(批量大小, M, 5)，其中 M 是数据集的任何图像中边界框可能出现的最大数量。

小批量计算虽然高效，但它要求每张图像含有相同数量的边界框，以便放在同一个批量中。通常来说，图像可能拥有不同数量个边界框；因此，在达到 M 之前，边界框少于 M 的图像将被非法边界框填充。这样，每个边界框的标签将被长度为5的数组表示。数组中的第一个元素是边界框中对象的类别，其中-1表示用于填充的非法边界框。数组的其余四个元素是边界框左上角和右下角的 (x, y) 坐标值（值域在0~1之间）。对于香蕉数据集而言，由于每张图像上只有一个边界框，因此 M = 1。

batch_size, edge_size = 32, 256
train_iter, val_iter = load_data_bananas(batch_size)
features, labels = next(iter(train_iter))
print(features.shape, labels.shape)  # torch.Size([32, 3, 256, 256]) torch.Size([32, 1, 5])

接下来让我们展示10幅带有真实边界框的图像。我们可以看到在所有这些图像中香蕉的旋转角度、大小和位置都有所不同。当然，这只是一个简单的人工数据集，实践中真实世界的数据集通常要复杂得多：

# Re-implementation of d2l.show_images().
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Plot ``imgs`` on a ``num_rows`` x ``num_cols`` grid with hidden axes.

    Args:
        imgs: Iterable of images; torch tensors are converted to NumPy arrays,
            anything else is passed to ``imshow`` unchanged.
        num_rows: Number of grid rows.
        num_cols: Number of grid columns.
        titles: Optional sequence of per-image titles.
        scale: Size of each grid cell in inches.

    Returns:
        The flattened array of matplotlib axes.
    """
    figsize = (num_cols * scale, num_rows * scale)
    # squeeze=False always yields a 2-D axes array, so a 1x1 grid can be flattened too.
    _, axes = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()
    for i, (ax, img) in enumerate(zip(axes, imgs)):
        # Convert tensors explicitly instead of hiding every error behind a bare except.
        if torch.is_tensor(img):
            img = img.detach().cpu().numpy()
        ax.imshow(img)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[i])
    return axes

def bbox_to_rect(bbox, color):
    """Build an unfilled matplotlib rectangle from an (x1, y1, x2, y2) box."""
    upper_left = (bbox[0], bbox[1])
    box_width = bbox[2] - bbox[0]
    box_height = bbox[3] - bbox[1]
    return plt.Rectangle(xy=upper_left, width=box_width, height=box_height,
                         fill=False, edgecolor=color, linewidth=2)

# Move channels last for imshow and divide by 255 to scale pixel values.
imgs = (features[0:10].permute(0, 2, 3, 1)) / 255
axes = show_images(imgs, 2, 5, scale=2)
for ax, label in zip(axes, labels[0:10]):
    # label[0][1:5] holds the box coordinates; multiply by edge_size to get pixels.
    ax.add_patch(bbox_to_rect(label[0][1:5] * edge_size, color='white'))
    # d2l.show_bboxes(ax, [label[0][1:5] * edge_size], colors=['w'])  # does the same as the line above
plt.show()

5. 锚框

由于本节难度较大，因此详细分析见：D2L-计算机视觉-锚框。

import torch
import matplotlib.pyplot as plt
from d2l import torch as d2l

torch.set_printoptions(2)  # 精简输出精度

def multibox_prior(data, sizes, ratios):
    """Generate anchor boxes of different shapes centred on every pixel.

    Args:
        data: Tensor whose last two dimensions are read as (height, width);
            its device is reused for every tensor created here.
        sizes: Sequence of anchor scales.
        ratios: Sequence of aspect ratios.

    Returns:
        Tensor of shape (1, height * width * boxes_per_pixel, 4), where
        boxes_per_pixel = len(sizes) + len(ratios) - 1 and each row is
        (xmin, ymin, xmax, ymax) in coordinates scaled by 1/width and 1/height.
    """
    in_height, in_width = data.shape[-2:]
    device, num_sizes, num_ratios = data.device, len(sizes), len(ratios)
    boxes_per_pixel = (num_sizes + num_ratios - 1)
    size_tensor = torch.tensor(sizes, device=device)
    ratio_tensor = torch.tensor(ratios, device=device)

    # Offsets move each anchor centre to the middle of its pixel.
    # A pixel is 1 high and 1 wide, so the centre is shifted by 0.5.
    offset_h, offset_w = 0.5, 0.5
    steps_h = 1.0 / in_height  # scaled step along the y axis
    steps_w = 1.0 / in_width  # scaled step along the x axis

    # Generate the centre points of all anchor boxes.
    center_h = (torch.arange(in_height, device=device) + offset_h) * steps_h
    center_w = (torch.arange(in_width, device=device) + offset_w) * steps_w
    shift_y, shift_x = torch.meshgrid(center_h, center_w, indexing='ij')
    shift_y, shift_x = shift_y.reshape(-1), shift_x.reshape(-1)
    # Debug prints; the shapes in the trailing comments are for a 561x728 input.
    print(center_h.shape)  # torch.Size([561])
    print(shift_y.shape)  # torch.Size([408408])

    # Build "boxes_per_pixel" widths and heights: every size paired with the first
    # ratio, plus the first size paired with each remaining ratio. They are used
    # below to form the corner coordinates (xmin, ymin, xmax, ymax).
    w = torch.cat((size_tensor * torch.sqrt(ratio_tensor[0]),
                   sizes[0] * torch.sqrt(ratio_tensor[1:]))) * in_height / in_width  # handle rectangular inputs
    h = torch.cat((size_tensor / torch.sqrt(ratio_tensor[0]),
                   sizes[0] / torch.sqrt(ratio_tensor[1:])))
    print(w.shape)  # torch.Size([5])

    # Divide by 2 to get half widths/heights, i.e. the offsets from the centre to the
    # upper-left and lower-right corners. repeat(a, b) tiles a times along rows and
    # b times along columns.
    anchor_manipulations = torch.stack((-w, -h, w, h)).T.repeat(in_height * in_width, 1) / 2
    print(anchor_manipulations.shape)  # torch.Size([2042040, 4])

    # Every centre owns "boxes_per_pixel" anchors, so each centre row is repeated
    # "boxes_per_pixel" times to line up with anchor_manipulations.
    out_grid = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1).repeat_interleave(boxes_per_pixel, dim=0)
    print(out_grid.shape)  # torch.Size([2042040, 4])
    output = out_grid + anchor_manipulations
    return output.unsqueeze(0)  # add the batch dimension

img = plt.imread('../images/catdog.jpg')
h, w = img.shape[:2]
print(h, w)  # 561 728
X = torch.rand(size=(1, 3, h, w))
Y = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
print(Y.shape)  # torch.Size([1, 2042040, 4])

boxes = Y.reshape(h, w, 5, 4)
print(boxes[250, 250, 0, :])  # tensor([0.06, 0.07, 0.63, 0.82])

def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw every bounding box on ``axes``, optionally with text labels.

    Args:
        axes: Matplotlib axes to draw on.
        bboxes: Iterable of box tensors passed to ``d2l.bbox_to_rect``.
        labels: Optional label or list/tuple of labels, matched by position.
        colors: Optional colour or list/tuple of colours, cycled over the boxes.
    """
    def _as_list(value, fallback=None):
        # None -> fallback; lists/tuples pass through; a single value is wrapped.
        if value is None:
            return fallback
        if isinstance(value, (list, tuple)):
            return value
        return [value]

    labels = _as_list(labels)
    colors = _as_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for idx, bbox in enumerate(bboxes):
        color = colors[idx % len(colors)]
        rect = d2l.bbox_to_rect(bbox.detach().numpy(), color)
        axes.add_patch(rect)
        if not labels or idx >= len(labels):
            continue
        # Use black text on white boxes and white text otherwise.
        text_color = 'k' if color == 'w' else 'w'
        axes.text(rect.xy[0], rect.xy[1], labels[idx], va='center', ha='center',
                  fontsize=9, color=text_color, bbox=dict(facecolor=color, lw=0))

plt.figure(dpi=100)
bbox_scale = torch.tensor((w, h, w, h))  # rescales coordinates from 0~1 back to 0~w (or h)
# fig = plt.imshow(img)
# show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,
#             ['s=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2', 's=0.75, r=0.5'])
# plt.show()

def box_iou(boxes1, boxes2):
    """Compute the pairwise IoU between two lists of anchor or bounding boxes.

    Args:
        boxes1: Tensor of shape (n1, 4) in (x1, y1, x2, y2) form.
        boxes2: Tensor of shape (n2, 4) in (x1, y1, x2, y2) form.

    Returns:
        Tensor of shape (n1, n2); entry (i, j) is the IoU of boxes1[i] and boxes2[j].
    """
    def _area(b):
        return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    # areas1: (n1,), areas2: (n2,)
    areas1, areas2 = _area(boxes1), _area(boxes2)
    # Broadcasting boxes1 over a new middle axis gives (n1, n2, 2) corner tensors.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    # Negative extents mean no overlap, so clamp them to zero.
    overlap_wh = (bottom_right - top_left).clamp(min=0)
    # inter_areas, union_areas: (n1, n2)
    inter_areas = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]
    union_areas = areas1[:, None] + areas2 - inter_areas
    return inter_areas / union_areas

def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5):
    """Assign the closest ground-truth bounding box to each anchor box.

    Args:
        ground_truth: Tensor of shape (num_gt_boxes, 4).
        anchors: Tensor of shape (num_anchors, 4).
        device: Device on which the returned map is allocated.
        iou_threshold: Minimum IoU for the threshold-based assignment.

    Returns:
        Long tensor of shape (num_anchors,); entry i is the index of the
        ground-truth box assigned to anchor i, or -1 if none is assigned.
    """
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]
    # For each anchor, the index of its assigned ground-truth box (-1 = unassigned).
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long, device=device)
    # Nothing to assign; torch.max over an empty dimension would raise below.
    if num_anchors == 0 or num_gt_boxes == 0:
        return anchors_bbox_map
    # Element x_ij in row i and column j is the IoU of anchor i and ground-truth box j.
    jaccard = box_iou(anchors, ground_truth)
    # Threshold pass: give each anchor its best box if the IoU is high enough.
    max_ious, indices = torch.max(jaccard, dim=1)
    above_threshold = max_ious >= iou_threshold
    anchors_bbox_map[above_threshold] = indices[above_threshold]
    # Greedy pass: each ground-truth box claims the anchor with the largest remaining IoU.
    for _ in range(num_gt_boxes):
        max_idx = torch.argmax(jaccard)  # index into the flattened matrix
        box_idx = max_idx % num_gt_boxes
        # Integer floor division; true division goes through float32 and loses
        # precision for large flat indices.
        anc_idx = torch.div(max_idx, num_gt_boxes, rounding_mode='floor')
        anchors_bbox_map[anc_idx] = box_idx
        # Discard the used column and row so neither is picked again.
        jaccard[:, box_idx] = -1
        jaccard[anc_idx, :] = -1
    return anchors_bbox_map

def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Transform anchors and their assigned boxes into regression offsets.

    Args:
        anchors: Corner-format anchor tensor of shape (n, 4).
        assigned_bb: Corner-format assigned boxes of shape (n, 4).
        eps: Small constant added inside the logarithm.

    Returns:
        Tensor of shape (n, 4): scaled centre offsets followed by scaled
        log size ratios.
    """
    anc_c = d2l.box_corner_to_center(anchors)
    gt_c = d2l.box_corner_to_center(assigned_bb)
    anc_xy, anc_wh = anc_c[:, :2], anc_c[:, 2:]
    gt_xy, gt_wh = gt_c[:, :2], gt_c[:, 2:]
    # Centre shift relative to the anchor size, scaled by 10.
    offset_xy = 10 * (gt_xy - anc_xy) / anc_wh
    # Log of the size ratio, scaled by 5; eps is added before taking the log.
    offset_wh = 5 * torch.log(eps + gt_wh / anc_wh)
    return torch.cat([offset_xy, offset_wh], dim=1)

def multibox_target(anchors, labels):
    """Label anchor boxes using ground-truth bounding boxes.

    Args:
        anchors: Anchor tensor with a leading dimension that is squeezed away,
            leaving shape (num_anchors, 4).
        labels: Tensor indexed as (batch, box, 5); column 0 is the class and
            columns 1: are the box coordinates.

    Returns:
        Tuple ``(bbox_offset, bbox_mask, class_labels)``: offsets and mask have
        shape (batch, num_anchors * 4), class labels have shape (batch, num_anchors).
    """
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    device, num_anchors = anchors.device, anchors.shape[0]
    for i in range(batch_size):
        label = labels[i, :, :]
        anchors_bbox_map = assign_anchor_to_bbox(label[:, 1:], anchors, device)
        # 1.0 for anchors that received a box, 0.0 otherwise, repeated over the 4 coordinates.
        bbox_mask = ((anchors_bbox_map >= 0).float().unsqueeze(-1)).repeat(1, 4)
        # Initialize class labels and assigned box coordinates to zero.
        class_labels = torch.zeros(num_anchors, dtype=torch.long, device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32, device=device)
        # Label each anchor's class from its assigned ground-truth box.
        # Unassigned anchors stay background (value zero); real classes are shifted by +1.
        indices_true = torch.nonzero(anchors_bbox_map >= 0)
        bb_idx = anchors_bbox_map[indices_true]
        class_labels[indices_true] = label[bb_idx, 0].long() + 1
        assigned_bb[indices_true] = label[bb_idx, 1:]
        # Offset transformation; the mask zeroes the offsets of unassigned anchors.
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    class_labels = torch.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)

# Each ground-truth row is (class, x1, y1, x2, y2).
ground_truth = torch.tensor([[0, 0.1, 0.08, 0.52, 0.92],
                             [1, 0.55, 0.2, 0.9, 0.88]])
anchors = torch.tensor([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],
                        [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],
                        [0.57, 0.3, 0.92, 0.9]])

# fig = plt.imshow(img)
# show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')
# show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4'])
# plt.show()

# The result has three tensor elements. The first holds the four offset values labelled
# for each anchor box; note that the offsets of negative-class anchors are zero.
# The second is the mask, with shape (batch size, four times the number of anchors).
# The third holds the class labelled for each input anchor box.
labels = multibox_target(anchors.unsqueeze(0), ground_truth.unsqueeze(0))
print(labels[2])  # tensor([[0, 1, 2, 0, 2]])

def offset_inverse(anchors, offset_preds):
    """Predict bounding boxes from anchors and their predicted offsets.

    Args:
        anchors: Corner-format anchor tensor of shape (n, 4).
        offset_preds: Predicted offsets of shape (n, 4).

    Returns:
        Corner-format predicted boxes of shape (n, 4).
    """
    anc = d2l.box_corner_to_center(anchors)
    anc_xy, anc_wh = anc[:, :2], anc[:, 2:]
    # Undo the x10 centre scaling and the x5 log-size scaling.
    pred_xy = (offset_preds[:, :2] * anc_wh / 10) + anc_xy
    pred_wh = torch.exp(offset_preds[:, 2:] / 5) * anc_wh
    centre_form = torch.cat((pred_xy, pred_wh), dim=1)
    return d2l.box_center_to_corner(centre_form)

def nms(boxes, scores, iou_threshold):
    """Apply non-maximum suppression and return the indices of the kept boxes.

    Boxes are visited in descending score order; a box is dropped when its
    IoU with an already kept box exceeds ``iou_threshold``.

    Args:
        boxes: Tensor of shape (n, 4) in corner format.
        scores: Tensor of shape (n,) with one confidence per box.
        iou_threshold: IoU above which a lower-scored box is suppressed.

    Returns:
        Long tensor of kept indices into ``boxes``, highest score first
        (empty, but still of dtype long, when there are no boxes).
    """
    order = torch.argsort(scores, dim=-1, descending=True)
    keep = []  # indices of the predicted boxes to keep
    while order.numel() > 0:
        i = order[0]
        keep.append(int(i))
        if order.numel() == 1:
            break
        iou = box_iou(boxes[i, :].reshape(-1, 4), boxes[order[1:], :].reshape(-1, 4)).reshape(-1)
        # Positions (relative to order[1:]) of boxes that do not overlap box i too much.
        inds = torch.nonzero(iou <= iou_threshold).reshape(-1)
        order = order[inds + 1]
    # An explicit dtype keeps the result usable as an index even when it is empty.
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)

def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5, pos_threshold=0.009999999):
    """Predict bounding boxes using non-maximum suppression.

    Args:
        cls_probs: Tensor indexed as (batch, class, anchor); class row 0 is
            skipped when picking the best class.
        offset_preds: Per-batch predicted offsets, reshaped to (-1, 4).
        anchors: Anchor tensor with a leading dimension that is squeezed away.
        nms_threshold: IoU threshold forwarded to ``nms``.
        pos_threshold: Confidence below which a prediction is marked as -1.

    Returns:
        Tensor of shape (batch, num_anchors, 6); each row is
        (class id, confidence, four box coordinates), with kept boxes first.
    """
    device, batch_size = cls_probs.device, cls_probs.shape[0]
    anchors = anchors.squeeze(0)
    num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2]
    out = []
    for i in range(batch_size):
        cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape(-1, 4)
        # Best class per anchor, ignoring row 0; class_id is therefore 0-based.
        conf, class_id = torch.max(cls_prob[1:], 0)
        predicted_bb = offset_inverse(anchors, offset_pred)
        keep = nms(predicted_bb, conf, nms_threshold)

        # Find all non-kept indices and set their class to background (-1).
        # Indices that appear exactly once in keep + all_idx were not kept.
        all_idx = torch.arange(num_anchors, dtype=torch.long, device=device)
        combined = torch.cat((keep, all_idx))
        uniques, counts = combined.unique(return_counts=True)
        non_keep = uniques[counts == 1]
        all_id_sorted = torch.cat((keep, non_keep))
        class_id[non_keep] = -1
        class_id = class_id[all_id_sorted]
        conf, predicted_bb = conf[all_id_sorted], predicted_bb[all_id_sorted]
        # pos_threshold is the confidence threshold for non-background predictions.
        below_min_idx = (conf < pos_threshold)
        class_id[below_min_idx] = -1
        conf[below_min_idx] = 1 - conf[below_min_idx]
        pred_info = torch.cat((class_id.unsqueeze(1), conf.unsqueeze(1), predicted_bb), dim=1)
        out.append(pred_info)
    return torch.stack(out)

anchors = torch.tensor([[0.1, 0.08, 0.52, 0.92], [0.08, 0.2, 0.56, 0.95],
                        [0.15, 0.3, 0.62, 0.91], [0.55, 0.2, 0.9, 0.88]])
# All predicted offsets are zero, so the predicted boxes are the anchors themselves.
offset_preds = torch.tensor([0] * anchors.numel())
cls_probs = torch.tensor([[0] * 4,  # predicted probability of background
                          [0.9, 0.8, 0.7, 0.1],  # predicted probability of dog
                          [0.1, 0.2, 0.3, 0.9]])  # predicted probability of cat

# fig = plt.imshow(img)
# show_bboxes(fig.axes, anchors * bbox_scale, ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])
# plt.show()

# The result has shape (batch size, number of anchors, 6).
# The first element is the predicted class index, starting at 0 (0 is dog, 1 is cat);
# -1 means background or removed by non-maximum suppression.
# The second element is the confidence of the predicted bounding box.
# The remaining four are the upper-left and lower-right corner coordinates (range 0~1).
output = multibox_detection(cls_probs.unsqueeze(0),
                            offset_preds.unsqueeze(0),
                            anchors.unsqueeze(0),
                            nms_threshold=0.5)
print(output)
# tensor([[[ 0.00,  0.90,  0.10,  0.08,  0.52,  0.92],
#          [ 1.00,  0.90,  0.55,  0.20,  0.90,  0.88],
#          [-1.00,  0.80,  0.08,  0.20,  0.56,  0.95],
#          [-1.00,  0.70,  0.15,  0.30,  0.62,  0.91]]])

# After dropping the boxes of class -1 (background), draw the final predicted
# bounding boxes kept by non-maximum suppression.
fig = plt.imshow(img)
for i in output[0].detach().numpy():
    if i[0] == -1:
        continue
    label = ('dog=', 'cat=')[int(i[0])] + str(i[1])
    show_bboxes(fig.axes, [torch.tensor(i[2:]) * bbox_scale], label)
plt.show()

6. 多尺度目标检测

在上一节中，我们以输入图像的每个像素为中心，生成了多个锚框。基本而言，这些锚框代表了图像不同区域的样本。然而，如果为每个像素都生成锚框，我们最终可能会得到太多需要计算的锚框。想象一个561*728的输入图像，如果以每个像素为中心生成五个形状不同的锚框，就需要在图像上标记和预测超过200万个锚框（561*728*5）。

6.1 多尺度锚框

减少图像上的锚框数量并不困难。比如，我们可以在输入图像中均匀采样一小部分像素，并以它们为中心生成锚框。此外，在不同尺度下，我们可以生成不同数量和不同大小的锚框。直观地说，比起较大的目标，较小的目标在图像上出现的可能性更多样。例如，1*1、1*2和2*2的目标可以分别以4、2和1种可能的方式出现在2*2的图像上。因此，当使用较小的锚框检测较小的物体时，我们可以采样更多的区域，而对于较大的物体，我们可以采样较少的区域。

为了演示如何在多个尺度下生成锚框，让我们先读取一张图像。它的高度和宽度分别为561和728像素：

import torch
import matplotlib.pyplot as plt
from d2l import torch as d2l

img = plt.imread('../images/catdog.jpg')
h, w = img.shape[:2]
print(h, w)  # 561 728

display_anchors 函数定义如下。我们在特征图（fmap）上生成锚框（anchors），每个单位（像素）作为锚框的中心。由于锚框中的 (x, y) 轴坐标值（anchors）已经被除以特征图（fmap）的宽度和高度，因此这些值介于0和1之间，表示特征图中锚框的相对位置。

由于锚框（anchors）的中心分布于特征图（fmap）上的所有单位，因此这些中心必须根据其相对空间位置在任何输入图像上均匀分布。更具体地说，给定特征图的宽度和高度 fmap_w 和 fmap_h，以下函数将均匀地对任何输入图像中 fmap_h 行和 fmap_w 列中的像素进行采样。以这些均匀采样的像素为中心，将会生成大小为 s（假设列表 s 的长度为1）且宽高比（ratios）不同的锚框：

def display_anchors(fmap_w, fmap_h, s):
    """Draw the anchors generated on an ``fmap_h`` x ``fmap_w`` feature map over the image.

    Args:
        fmap_w: Feature-map width.
        fmap_h: Feature-map height.
        s: List of anchor sizes passed to ``d2l.multibox_prior``.
    """
    plt.figure(dpi=100)
    # The values of the first two dimensions do not affect the output.
    feature_map = torch.zeros((1, 10, fmap_h, fmap_w))
    anchors = d2l.multibox_prior(feature_map, sizes=s, ratios=[1, 2, 0.5])
    # Scale the anchor coordinates by the image width and height.
    bbox_scale = torch.tensor((w, h, w, h))
    image_axes = plt.imshow(img).axes
    d2l.show_bboxes(image_axes, anchors[0] * bbox_scale)
    plt.show()

首先，让我们考虑探测小目标。为了在显示时更容易分辨，在这里具有不同中心的锚框不会重叠：锚框的尺度设置为0.15，特征图的高度和宽度设置为4。我们可以看到，图像上4行和4列的锚框的中心是均匀分布的：

1	display_anchors(fmap_w=4, fmap_h=4, s=[0.15])

然后，我们将特征图的高度和宽度减小一半，然后使用较大的锚框来检测较大的目标。当尺度设置为0.4时，一些锚框将彼此重叠：

1	display_anchors(fmap_w=2, fmap_h=2, s=[0.4])

最后，我们进一步将特征图的高度和宽度减小一半，然后将锚框的尺度增加到0.8。此时，锚框的中心即是图像的中心：

1	display_anchors(fmap_w=1, fmap_h=1, s=[0.8])

6.2 多尺度检测

既然我们已经生成了多尺度的锚框，我们就将使用它们来检测不同尺度下各种大小的目标。下面，我们介绍一种基于 CNN 的多尺度目标检测方法，将在第8节（SSD）中实现。

在某种规模上，假设我们有 c 张形状为 h * w 的特征图。使用上一小节中的方法，我们生成了 hw 组锚框，其中每组都有 a 个中心相同的锚框。例如，在上一小节实验的第一个尺度上，给定10个（通道数量）4 * 4 的特征图，我们生成了16组锚框，每组包含3个中心相同的锚框。接下来，每个锚框都根据真实值边界框来标记了类和偏移量。在当前尺度下，目标检测模型需要预测输入图像上 hw 组锚框类别和偏移量，其中不同组锚框具有不同的中心。

假设此处的 c 张特征图是 CNN 基于输入图像的正向传播算法获得的中间输出。既然每张特征图上都有 hw 个不同的空间位置，那么相同空间位置可以看作含有 c 个单元。根据感受野的定义，特征图在相同空间位置的 c 个单元在输入图像上的感受野相同：它们表征了同一感受野内的输入图像信息。因此，我们可以将特征图在同一空间位置的 c 个单元变换为使用此空间位置生成的 a 个锚框类别和偏移量。本质上，我们用输入图像在某个感受野区域内的信息，来预测输入图像上与该区域位置相近的锚框类别和偏移量。

当不同层的特征图在输入图像上分别拥有不同大小的感受野时，它们可以用于检测不同大小的目标。例如，我们可以设计一个神经网络，其中靠近输出层的特征图单元具有更宽的感受野，这样它们就可以从输入图像中检测到较大的目标。

简言之，我们可以利用深层神经网络在多个层次上对图像进行分层表示，从而实现多尺度目标检测。在第8节我们将通过一个具体的例子来说明它是如何工作的。

7. 区域卷积神经网络（R-CNN）系列

7.1 R-CNN

R-CNN 首先从输入图像中选取若干（例如2000个）提议区域（如锚框也是一种选取方法），并标注它们的类别和边界框（如偏移量）。然后，用卷积神经网络对每个提议区域进行前向传播以抽取其特征。接下来，我们用每个提议区域的特征来预测类别和边界框。具体来说，R-CNN 包括以下四个步骤：

对输入图像使用选择性搜索来选取多个高质量的提议区域。这些提议区域通常是在多个尺度下选取的，并具有不同的形状和大小。每个提议区域都将被标注类别和真实边界框；
选择一个预训练的卷积神经网络，并将其在输出层之前截断。将每个提议区域变形为网络需要的输入尺寸，并通过前向传播输出抽取的提议区域特征；
将每个提议区域的特征连同其标注的类别作为一个样本。训练多个支持向量机对目标分类，其中每个支持向量机用来判断样本是否属于某一个类别；
将每个提议区域的特征连同其标注的边界框作为一个样本，训练线性回归模型来预测真实边界框。

尽管 R-CNN 模型通过预训练的卷积神经网络有效地抽取了图像特征，但它的速度很慢。想象一下，我们可能从一张图像中选出上千个提议区域，这需要上千次的卷积神经网络的前向传播来执行目标检测。这种庞大的计算量使得 R-CNN 在现实世界中难以被广泛应用。

7.2 Fast R-CNN

R-CNN 的主要性能瓶颈在于，对每个提议区域，卷积神经网络的前向传播是独立的，而没有共享计算。由于这些区域通常有重叠，独立的特征抽取会导致重复的计算。Fast R-CNN 对 R-CNN 的主要改进之一，是仅在整张图像上执行卷积神经网络的前向传播。Fast R-CNN 的主要计算如下：

与 R-CNN 相比，Fast R-CNN 用来提取特征的卷积神经网络的输入是整个图像，而不是各个提议区域。此外，这个网络通常会参与训练。设输入为一张图像，将卷积神经网络的输出的形状记为 1 * c * h1 * w1；
假设选择性搜索生成了 n 个提议区域。这些形状各异的提议区域在卷积神经网络的输出上分别标出了形状各异的兴趣区域。然后，这些感兴趣的区域需要进一步抽取出形状相同的特征（比如指定高度 h2 和宽度 w2），以便于连结后输出。为了实现这一目标，Fast R-CNN 引入了兴趣区域汇聚层（RoI pooling）：将卷积神经网络的输出和提议区域作为输入，输出连结后的各个提议区域抽取的特征，形状为 n * c * h2 * w2；
通过全连接层将输出形状变换为 n * d，其中超参数 d 取决于模型设计；
预测 n 个提议区域中每个区域的类别和边界框。更具体地说，在预测类别和边界框时，将全连接层的输出分别转换为形状为 n * q（q 是类别的数量）的输出和形状为 n * 4 的输出。其中预测类别时使用 Softmax 回归。

下面，我们演示了兴趣区域汇聚层的计算方法。假设卷积神经网络抽取的特征 X 的高度和宽度都是4，且只有单通道：

import torch
import torchvision

X = torch.arange(16.).reshape(1, 1, 4, 4)
print(X)
# tensor([[[[ 0.,  1.,  2.,  3.],
#           [ 4.,  5.,  6.,  7.],
#           [ 8.,  9., 10., 11.],
#           [12., 13., 14., 15.]]]])

让我们进一步假设输入图像的高度和宽度都是40像素，且选择性搜索在此图像上生成了两个提议区域。每个区域由5个元素表示：区域目标类别、左上角和右下角的 (x, y) 坐标：

1	rois = torch.Tensor([[0, 0, 0, 20, 20], [0, 0, 10, 30, 30]])

由于 X 的高和宽是输入图像高和宽的1/10，因此，两个提议区域的坐标先按 spatial_scale 乘以0.1。然后，在 X 上分别标出这两个兴趣区域 X[:, :, 0:3, 0:3] 和 X[:, :, 1:4, 0:4]。最后，在 2 * 2 的兴趣区域汇聚层中，每个兴趣区域被划分为子窗口网格，并进一步抽取相同形状 2 * 2 的特征：

print(torchvision.ops.roi_pool(X, rois, output_size=(2, 2), spatial_scale=0.1))
# tensor([[[[ 5.,  6.],
#           [ 9., 10.]]],
#
#
#         [[[ 9., 11.],
#           [13., 15.]]]])

7.3 Faster R-CNN

为了较精确地检测目标结果，Fast R-CNN 模型通常需要在选择性搜索中生成大量的提议区域。Faster R-CNN 提出将选择性搜索替换为区域提议网络（region proposal network），从而减少提议区域的生成数量，并保证目标检测的精度。具体来说，区域提议网络的计算步骤如下：

使用填充为1的 3 * 3 的卷积层变换卷积神经网络的输出，并将输出通道数记为 c。这样，卷积神经网络为图像抽取的特征图中的每个单元均得到一个长度为 c 的新特征；
以特征图的每个像素为中心，生成多个不同大小和宽高比的锚框并标注它们；
使用锚框中心单元长度为 c 的特征，分别预测该锚框的二元类别（含目标还是背景）和边界框；
使用非极大值抑制，从预测类别为目标的预测边界框中移除相似的结果。最终输出的预测边界框即是兴趣区域汇聚层所需的提议区域。

值得一提的是，区域提议网络作为 Faster R-CNN 模型的一部分，是和整个模型一起训练得到的。换句话说，Faster R-CNN 的目标函数不仅包括目标检测中的类别和边界框预测，还包括区域提议网络中锚框的二元类别和边界框预测。作为端到端训练的结果，区域提议网络能够学习到如何生成高质量的提议区域，从而在减少了从数据中学习的提议区域的数量的情况下，仍保持目标检测的精度。

7.4 Mask R-CNN

如果在训练集中还标注了每个目标在图像上的像素级位置，那么 Mask R-CNN 能够有效地利用这些详尽的标注信息进一步提升目标检测的精度。

Mask R-CNN 是基于 Faster R-CNN 修改而来的。具体来说，Mask R-CNN 将兴趣区域汇聚层替换为了兴趣区域对齐层（RoI Align），使用双线性插值（bilinear interpolation）来保留特征图上的空间信息，从而更适于像素级预测。兴趣区域对齐层的输出包含了所有与兴趣区域的形状相同的特征图。它们不仅被用于预测每个兴趣区域的类别和边界框，还通过额外的全卷积网络预测目标的像素级位置。本章的后续章节将更详细地介绍如何使用全卷积网络预测图像中像素级的语义。

8. 单发多框检测（SSD）

SSD 模型主要由基础网络组成，其后是几个多尺度特征块。基础网络用于从输入图像中提取特征，因此它可以使用深度卷积神经网络。单发多框检测论文中选用了在分类层之前截断的 VGG，现在也常用 ResNet 替代。我们可以设计基础网络，使它输出的高和宽较大。这样一来，基于该特征图生成的锚框数量较多，可以用来检测尺寸较小的目标。接下来的每个多尺度特征块将上一层提供的特征图的高和宽缩小（如减半），并使特征图中每个单元在输入图像上的感受野变得更广阔。

回想一下在第6节中，通过深度神经网络分层表示图像的多尺度目标检测的设计。由于接近顶部的多尺度特征图较小，但具有较大的感受野，它们适合检测较少但较大的物体。简而言之，通过多尺度特征块，单发多框检测生成不同大小的锚框，并通过预测边界框的类别和偏移量来检测大小不同的目标，因此这是一个多尺度目标检测模型。

8.1 类别预测层与边界框预测层

设目标类别的数量为 q。这样一来，锚框有 q + 1 个类别，其中第0类是背景。在某个尺度下，设特征图的高和宽分别为 h 和 w。如果以其中每个单元为中心生成 a 个锚框，那么我们需要对 hwa 个锚框进行分类。如果使用全连接层作为输出，很容易导致模型参数过多。回忆 NiN 一节介绍的使用卷积层的通道来输出类别预测的方法，单发多框检测采用同样的方法来降低模型复杂度。

具体来说，类别预测层使用一个保持输入高和宽的卷积层。这样一来，输出和输入在特征图宽和高上的空间坐标一一对应。考虑输出和输入同一空间坐标 (x, y)：输出特征图上 (x, y) 坐标的通道里包含了以输入特征图 (x, y) 坐标为中心生成的所有锚框的类别预测。因此输出通道数为 a * (q + 1)。

类别预测层的定义如下：

import torch
import torchvision
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from torch.nn import functional as F
from d2l import torch as d2l
from tqdm import tqdm

# num_inputs is the number of input channels; (num_classes + 1) adds a background class.
# Every anchor needs its own class prediction, so the output channel count is multiplied
# by num_anchors: each input position yields predictions for its num_anchors anchors.
def cls_predictor(num_inputs, num_anchors, num_classes):
    """Return the 3x3, padding-1 convolution used as the class prediction layer."""
    out_channels = num_anchors * (num_classes + 1)
    return nn.Conv2d(num_inputs, out_channels, kernel_size=3, padding=1)

边界框预测层的设计与类别预测层的设计类似。唯一不同的是，这里需要为每个锚框预测4个偏移量，而不是 q + 1 个类别：

1
2
3

# Predicts the offset between each anchor and its ground-truth box: 4 values per anchor.
def bbox_predictor(num_inputs, num_anchors):
    """Return the 3x3, padding-1 convolution used as the box offset prediction layer."""
    out_channels = num_anchors * 4
    return nn.Conv2d(num_inputs, out_channels, kernel_size=3, padding=1)

8.2 连结多尺度的预测

单发多框检测使用多尺度特征图来生成锚框并预测其类别和偏移量。在不同的尺度下，特征图的形状或以同一单元为中心的锚框的数量可能会有所不同。因此，不同尺度下预测输出的形状可能会有所不同。

在以下示例中，我们为同一个小批量构建两个不同比例（Y1 和 Y2）的特征图，其中 Y2 的高度和宽度是 Y1 的一半。以类别预测为例，假设 Y1 和 Y2 的每个单元分别生成了5个和3个锚框。进一步假设目标类别的数量为10，对于特征图 Y1 和 Y2，类别预测输出中的通道数分别为 5 * (10 + 1) = 55 和 3 * (10 + 1) = 33，其中任一输出的形状是 (批量大小, 通道数, 高度, 宽度)：

def forward(x, block):
    """Apply ``block`` to ``x`` and return the result."""
    output = block(x)
    return output

# Y1为对输入的400(20*20)个像素都会做55(5*(10+1))个预测
# 因此在不同尺度下的预测除了batch维之外另外三个维度都会发生变化
Y1 = forward(torch.zeros((2, 8, 20, 20)), cls_predictor(8, 5, 10))
Y2 = forward(torch.zeros((2, 16, 10, 10)), cls_predictor(16, 3, 10))
print(Y1.shape, Y2.shape)  # torch.Size([2, 55, 20, 20]) torch.Size([2, 33, 10, 10])

除了批量大小这一维度外，其他三个维度都具有不同的尺寸。为了将这两个预测输出链接起来以提高计算效率，我们将把这些张量转换为更一致的格式。

通道维包含中心相同的锚框的预测结果。我们首先将通道维移到最后一维。因为不同尺度下批量大小仍保持不变，我们可以将预测结果转成二维的 (批量大小, 高 * 宽 * 通道数) 的格式，以方便之后在维度1上的连结。这样一来，尽管 Y1 和 Y2 在通道数、高度和宽度方面具有不同的大小，我们仍然可以在同一个小批量的两个不同尺度上连接这两个预测输出：

# Flattening from dimension 1 collapses the last three dimensions into one.
# Channels are moved last first so the predictions of each pixel stay contiguous
# after flattening; otherwise they would be scattered.
def flatten_pred(pred):
    """Reshape a (batch, channels, height, width) prediction to (batch, height * width * channels)."""
    channels_last = pred.permute(0, 2, 3, 1)
    return channels_last.flatten(start_dim=1)

def concat_preds(preds):
    """Flatten each prediction with ``flatten_pred`` and concatenate them along dimension 1."""
    flattened = []
    for pred in preds:
        flattened.append(flatten_pred(pred))
    return torch.cat(flattened, dim=1)

print(concat_preds([Y1, Y2]).shape)  # torch.Size([2, 25300])

8.3 高和宽减半块

高和宽减半块将输入特征图的高度和宽度减半，会扩大每个单元在其输出特征图中的感受野，该模块此前已在 VGG 中使用过：

# Down-sampling block: halves the height and width of the input feature map.
def down_sample_blk(in_channels, out_channels):
    """Build two Conv-BatchNorm-ReLU stages followed by a 2x2 max-pooling layer.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both convolutions.

    Returns:
        An ``nn.Sequential`` containing the seven layers in order.
    """
    layers = []
    # The first convolution changes the channel count, the second keeps it.
    for conv_in in (in_channels, out_channels):
        layers.append(nn.Conv2d(conv_in, out_channels, kernel_size=3, padding=1))
        layers.append(nn.BatchNorm2d(out_channels))
        layers.append(nn.ReLU())
    layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)

print(forward(torch.zeros((2, 3, 20, 20)), down_sample_blk(3, 10)).shape)  # torch.Size([2, 10, 10, 10])

8.4 基本网络块

基本网络块用于从输入图像中抽取特征。为了计算简洁，我们构造了一个小的基础网络，该网络串联3个高和宽减半块，并逐步将通道数翻倍：

def base_net():
    """Build the base feature extractor: three chained down-sampling blocks.

    The channel count goes 3 -> 16 -> 32 -> 64, one ``down_sample_blk``
    per consecutive pair.

    Returns:
        An ``nn.Sequential`` of the three blocks.
    """
    widths = [3, 16, 32, 64]
    stages = [down_sample_blk(c_in, c_out) for c_in, c_out in zip(widths[:-1], widths[1:])]
    return nn.Sequential(*stages)

# Three halving blocks take 256 -> 128 -> 64 -> 32, ending with 64 channels.
print(forward(torch.zeros((2, 3, 256, 256)), base_net()).shape)  # torch.Size([2, 64, 32, 32])

8.5 完整的模型

完整的单发多框检测模型由五个模块组成，每个块生成的特征图既用于生成锚框，又用于预测这些锚框的类别和偏移量。在这五个模块中，第一个是基本网络块，第二个到第四个是高和宽减半块，最后一个模块使用全局最大池化层将高度和宽度都降到1。从技术上讲，第二到第五个区块都是 SSD 中的多尺度特征块：

def get_blk(i):
    """Return the network block for stage ``i``.

    Stage 0 is the base network, stage 1 a down-sampling block going from
    64 to 128 channels, stage 4 an adaptive max pooling to 1x1, and every
    other index a 128 -> 128 down-sampling block.
    """
    if i == 0:
        return base_net()
    if i == 1:
        return down_sample_blk(64, 128)
    if i == 4:
        return nn.AdaptiveMaxPool2d((1, 1))
    return down_sample_blk(128, 128)

现在我们为每个块定义前向传播。与图像分类任务不同，此处的输出包括：CNN 特征图 Y、在当前尺度下根据 Y 生成的锚框、预测的这些锚框的类别和偏移量（基于 Y）：

# Forward pass of a single stage. Here cls_predictor and bbox_predictor are
# already-constructed convolution layers, not the factory functions.
def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    """Run one stage and produce its anchors and predictions.

    Args:
        X: input to the stage.
        blk: module computing the stage's feature map.
        size: anchor sizes forwarded to ``d2l.multibox_prior`` as ``sizes``.
        ratio: anchor ratios forwarded to ``d2l.multibox_prior`` as ``ratios``.
        cls_predictor: layer applied to the feature map for class predictions.
        bbox_predictor: layer applied to the feature map for offset predictions.

    Returns:
        Tuple ``(feature_map, anchors, cls_preds, bbox_preds)``.
    """
    feature_map = blk(X)
    return (
        feature_map,
        d2l.multibox_prior(feature_map, sizes=size, ratios=ratio),
        cls_predictor(feature_map),
        bbox_predictor(feature_map),
    )

超参数的设置过程可以看：单发多框检测（SSD）。

1
2
3

# One pair of anchor sizes per stage (five stages), increasing from stage 0 to stage 4.
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79], [0.88, 0.961]]
# The same three aspect ratios are used at every stage.
# NOTE: `* 5` repeats one shared inner list, so the entries must never be mutated in place.
ratios = [[1, 2, 0.5]] * 5
# 2 sizes + 3 ratios - 1 = 4 anchors per position.
# NOTE(review): presumably this is how d2l.multibox_prior combines sizes and ratios — confirm in d2l.
num_anchors = len(sizes[0]) + len(ratios[0]) - 1

现在，我们就可以按如下方式定义完整的模型 TinySSD 了：

class TinySSD(nn.Module):
    """Five-stage single-shot detector built from ``get_blk`` stages.

    For every stage ``i`` the module owns three sub-modules registered as
    attributes: ``blk_i`` (feature extractor), ``cls_i`` (class head) and
    ``bbox_i`` (offset head).
    """

    def __init__(self, num_classes, **kwargs):
        """Create the five stages and their prediction heads.

        Args:
            num_classes: number of object classes; the class head predicts
                ``num_classes + 1`` scores per anchor (see the reshape in
                ``forward``).
            **kwargs: forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.num_classes = num_classes
        # Input channel count of the prediction heads at each stage.
        stage_channels = [64, 128, 128, 128, 128]
        for i, channels in enumerate(stage_channels):
            # Equivalent to the assignment self.blk_i = get_blk(i), and so on.
            setattr(self, f'blk_{i}', get_blk(i))
            setattr(self, f'cls_{i}', cls_predictor(channels, num_anchors, num_classes))
            setattr(self, f'bbox_{i}', bbox_predictor(channels, num_anchors))

    def forward(self, X):
        """Run all five stages and merge their outputs.

        Returns:
            Tuple ``(anchors, cls_preds, bbox_preds)``: anchors concatenated
            along dim 1, class predictions reshaped so the last axis has
            ``num_classes + 1`` entries, and the flattened offset predictions.
        """
        stage_anchors, stage_cls, stage_bbox = [], [], []
        for i in range(5):
            # getattr(self, f'blk_{i}') is the same as accessing self.blk_i.
            X, anchors_i, cls_i, bbox_i = blk_forward(
                X, getattr(self, f'blk_{i}'), sizes[i], ratios[i],
                getattr(self, f'cls_{i}'), getattr(self, f'bbox_{i}'))
            # To debug, print anchors_i.shape, cls_i.shape and bbox_i.shape here.
            stage_anchors.append(anchors_i)
            stage_cls.append(cls_i)
            stage_bbox.append(bbox_i)
        anchors = torch.cat(stage_anchors, dim=1)
        flat_cls = concat_preds(stage_cls)
        # Regroup the flat class scores as (batch, anchor, class).
        cls_preds = flat_cls.reshape(flat_cls.shape[0], -1, self.num_classes + 1)
        bbox_preds = concat_preds(stage_bbox)
        return anchors, cls_preds, bbox_preds

# Shapes recorded from a run with a batch of 32 images of size 256x256:
# f_map: (32, 3, 256, 256)->(32, 64, 32, 32)->(32, 128, 16, 16)->(32, 128, 8, 8)->(32, 128, 4, 4)->(32, 128, 1, 1)
# anchors[0], cls_preds[0], bbox_preds[0]: torch.Size([1, 4096, 4]) torch.Size([32, 8, 32, 32]) torch.Size([32, 16, 32, 32])
# anchors[1], cls_preds[1], bbox_preds[1]: torch.Size([1, 1024, 4]) torch.Size([32, 8, 16, 16]) torch.Size([32, 16, 16, 16])
# anchors[2], cls_preds[2], bbox_preds[2]: torch.Size([1, 256, 4]) torch.Size([32, 8, 8, 8]) torch.Size([32, 16, 8, 8])
# anchors[3], cls_preds[3], bbox_preds[3]: torch.Size([1, 64, 4]) torch.Size([32, 8, 4, 4]) torch.Size([32, 16, 4, 4])
# anchors[4], cls_preds[4], bbox_preds[4]: torch.Size([1, 4, 4]) torch.Size([32, 8, 1, 1]) torch.Size([32, 16, 1, 1])
net = TinySSD(num_classes=1)
X = torch.zeros((32, 3, 256, 256))
anchors, cls_preds, bbox_preds = net(X)

# 5444 = 4096 + 1024 + 256 + 64 + 4 anchors over the five stages; 21776 = 5444 * 4 offsets.
print('output anchors:', anchors.shape)  # output anchors: torch.Size([1, 5444, 4])
print('output class preds:', cls_preds.shape)  # output class preds: torch.Size([32, 5444, 2])
print('output bbox preds:', bbox_preds.shape)  # output bbox preds: torch.Size([32, 21776])

8.6 训练模型

首先读取数据集和设置超参数：

1
2
3

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Learning rate, number of epochs and mini-batch size.
lr, num_epochs, batch_size = 0.2, 20, 32
# NOTE(review): presumably returns (training, validation) iterators over the banana detection dataset — confirm in d2l.
train_iter, valid_iter = d2l.load_data_bananas(batch_size)

然后定义损失函数和评价函数，目标检测有两种类型的损失。第一种有关锚框类别的损失：我们可以简单地复用之前图像分类问题里一直使用的交叉熵损失函数来计算；第二种有关正类锚框偏移量的损失：预测偏移量是一个回归问题。但是，对于这个回归问题，我们在这里不使用平方损失，而是使用 L1 范数损失，即预测值和真实值之差的绝对值。掩码变量 bbox_masks 令负类锚框和填充锚框不参与损失的计算。最后，我们将锚框类别和偏移量的损失相加，以获得模型的最终损失函数：

# Both criteria keep per-element losses (reduction='none') so that calc_loss
# can average them per sample itself.
cls_loss = nn.CrossEntropyLoss(reduction='none')
bbox_loss = nn.L1Loss(reduction='none')


def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    """Return the per-sample sum of classification and bounding-box loss.

    Args:
        cls_preds: class scores; axis 0 is the batch and axis 2 the classes.
        cls_labels: class index for every anchor of every sample.
        bbox_preds: predicted offsets, one row per sample.
        bbox_labels: target offsets, same shape as ``bbox_preds``.
        bbox_masks: multiplied into both predictions and targets, so entries
            with mask 0 contribute zero L1 loss.

    Returns:
        1-D tensor with one loss value per sample in the batch.
    """
    n_samples = cls_preds.shape[0]
    n_classes = cls_preds.shape[2]

    # Cross entropy expects (N, classes) scores and (N,) targets.
    flat_scores = cls_preds.reshape(-1, n_classes)
    flat_targets = cls_labels.reshape(-1)
    per_anchor = cls_loss(flat_scores, flat_targets)
    cls_term = per_anchor.reshape(n_samples, -1).mean(dim=1)

    masked_preds = bbox_preds * bbox_masks
    masked_targets = bbox_labels * bbox_masks
    bbox_term = bbox_loss(masked_preds, masked_targets).mean(dim=1)

    return cls_term + bbox_term

我们可以沿用准确率评价分类结果。由于偏移量使用了 L1 范数损失，我们使用平均绝对误差（MAE）来评价边界框的预测结果。这些预测结果是从生成的锚框及其预测偏移量中获得的：

def cls_eval(cls_preds, cls_labels):
    """Count how many anchors were assigned the correct class.

    Args:
        cls_preds: class scores with the classes on the last axis.
        cls_labels: target class indices.

    Returns:
        The number of matching predictions as a Python float.
    """
    # The class scores live on the last axis, so argmax must reduce over it.
    predicted = cls_preds.argmax(dim=-1).type(cls_labels.dtype)
    n_correct = (predicted == cls_labels).sum()
    return float(n_correct)

def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    """Return the summed absolute offset error over the unmasked entries.

    The difference ``bbox_labels - bbox_preds`` is multiplied by
    ``bbox_masks`` before taking absolute values, so entries with mask 0
    add nothing. The result is a Python float.
    """
    masked_diff = (bbox_labels - bbox_preds) * bbox_masks
    return float(masked_diff.abs().sum())

最后是训练模型，在训练模型时，我们需要在模型的前向传播过程中生成多尺度锚框 anchors，并预测其类别 cls_preds 和偏移量 bbox_preds。然后，我们根据标签信息 label 为生成的锚框标记类别 cls_labels 和偏移量 bbox_labels。最后，我们根据类别和偏移量的预测和标注值计算损失函数：

def train(net, train_iter, valid_iter, num_epochs, lr, device):
    """Train ``net`` with SGD and keep the weights with the best validation accuracy.

    Every epoch runs one pass over ``train_iter`` (with parameter updates)
    and one pass over ``valid_iter`` (without gradients). Per-batch loss,
    class accuracy and bounding-box error are averaged over the batches,
    printed, and logged to TensorBoard under ``../logs/SSD_train_log``.
    Whenever the validation accuracy improves, the state dict is saved to
    ``../save/SSD_train.params``.

    Args:
        net: model returning ``(anchors, cls_preds, bbox_preds)`` for a batch.
        train_iter: iterable of ``(feature, label)`` training batches.
        valid_iter: iterable of ``(feature, label)`` validation batches.
        num_epochs: number of epochs to run.
        lr: learning rate passed to SGD.
        device: device the model and every batch are moved to.

    Raises:
        ZeroDivisionError: if an iterator yields no batches.
    """
    def init_weights(m):
        # isinstance also matches subclasses, unlike an exact type comparison.
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)

    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr, weight_decay=5e-4)

    def run_epoch(data_iter, training):
        """Run one pass over ``data_iter``; return mean (loss, acc, bbox_err) per batch."""
        net.train(training)  # train(False) is the same as eval()
        losses, accs, bbox_errs = [], [], []
        with torch.set_grad_enabled(training):
            for feature, label in tqdm(data_iter):
                feature, label = feature.to(device), label.to(device)
                # Generate multi-scale anchors and predict a class and offsets for each.
                anchors, cls_preds, bbox_preds = net(feature)
                # Assign a class and target offsets to every anchor.
                bbox_labels, bbox_masks, cls_labels = d2l.multibox_target(anchors, label)
                # Combine the predictions and targets into one scalar loss.
                loss = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks).mean()

                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                # Store plain floats: keeping the loss tensors would hold on to
                # their autograd history and device memory for the whole epoch.
                losses.append(loss.item())
                accs.append(cls_eval(cls_preds, cls_labels) / cls_labels.numel())
                bbox_errs.append(bbox_eval(bbox_preds, bbox_labels, bbox_masks) / bbox_labels.numel())

        return (sum(losses) / len(losses),
                sum(accs) / len(accs),
                sum(bbox_errs) / len(bbox_errs))

    writer = SummaryWriter('../logs/SSD_train_log')

    best_acc = 0.0
    try:
        for epoch in range(num_epochs):
            train_loss, train_acc, train_bbox_err = run_epoch(train_iter, training=True)
            print(f"[ Train | {epoch + 1:03d}/{num_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}, bbox_err = {train_bbox_err:.5f}")

            valid_loss, valid_acc, valid_bbox_err = run_epoch(valid_iter, training=False)
            print(f"[ Valid | {epoch + 1:03d}/{num_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}, bbox_err = {valid_bbox_err:.5f}")

            writer.add_scalars('train', {'loss': train_loss,
                                         'acc': train_acc,
                                         'bbox_err': train_bbox_err}, epoch + 1)
            writer.add_scalars('valid', {'loss': valid_loss,
                                         'acc': valid_acc,
                                         'bbox_err': valid_bbox_err}, epoch + 1)

            if valid_acc > best_acc:
                best_acc = valid_acc
                torch.save(net.state_dict(), '../save/SSD_train.params')
                print('saving model with acc {:.3f}'.format(best_acc))
    finally:
        # Flush and close the log even when training is interrupted.
        writer.close()

# Start training with the model, data iterators and hyperparameters defined above.
train(net, train_iter, valid_iter, num_epochs, lr, device)

8.7 预测目标

在预测阶段，我们希望能把图像里面所有我们感兴趣的目标检测出来。在下面，我们读取并调整测试图像的大小，然后将其转成卷积层需要的四维格式。使用 multibox_detection 函数，我们可以根据锚框及其预测偏移量得到预测边界框，然后通过非极大值抑制来移除相似的预测边界框。最后，我们筛选所有置信度不低于0.9的边界框，作为最终输出：

# Read the test image and add a leading batch axis: the 4-D float input the conv layers need.
X = torchvision.io.read_image('../images/banana.jpg').unsqueeze(0).float()
# Channel-last integer copy of the same image, used only for plotting.
img = X.squeeze(0).permute(1, 2, 0).long()  # (h, w, c)
net.to(device)
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
net.load_state_dict(torch.load('../save/SSD_train.params', map_location=device))

def predict(x, net, device):
    """Run the detector on ``x`` and return the rows of the first image that were kept.

    Args:
        x: input batch; it is moved to ``device`` before the forward pass.
        net: model returning ``(anchors, cls_preds, bbox_preds)``.
        device: device on which inference runs.

    Returns:
        The rows of ``output[0]`` whose first entry is not -1.
        NOTE(review): presumably -1 marks background or suppressed boxes in
        ``d2l.multibox_detection`` — confirm in d2l.
    """
    net.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # Use the argument ``x``; the previous code read the global ``X`` and
        # silently ignored whatever the caller passed in.
        anchors, cls_preds, bbox_preds = net(x.to(device))
        # Softmax over the class axis, then swap the last two axes to put the classes on axis 1.
        cls_probs = F.softmax(cls_preds, dim=2).permute(0, 2, 1)
        output = d2l.multibox_detection(cls_probs, bbox_preds, anchors)
    idx = [i for i, row in enumerate(output[0]) if row[0] != -1]
    return output[0, idx]

def display(img, output, threshold):
    """Plot ``img`` and draw every detection whose score reaches ``threshold``.

    Args:
        img: image with height and width as its first two dimensions.
        output: detection rows; entry 1 is read as the score and entries
            2..5 as box coordinates that get scaled by the image size.
        threshold: rows scoring below this value are skipped.
    """
    plt.figure(dpi=100)
    fig = plt.imshow(img)
    for detection in output:
        score = float(detection[1])
        if score < threshold:
            continue
        h, w = img.shape[0:2]
        # Scale the box coordinates by the image width and height.
        scale = torch.tensor((w, h, w, h), device=detection.device)
        boxes = [detection[2:6] * scale]
        d2l.show_bboxes(fig.axes, boxes, '%.2f' % score, 'w')
    plt.show()

# Detect objects in the test image, then draw only the boxes scoring at least 0.9.
output = predict(X, net, device)
display(img, output.cpu(), threshold=0.9)

9. 语义分割和数据集

在前几节中讨论的目标检测问题中，我们一直使用方形边界框来标注和预测图像中的目标。本节将探讨语义分割（semantic segmentation）问题，它重点关注于如何将图像分割成属于不同语义类别的区域。与目标检测不同，语义分割可以识别并理解图像中每一个像素的内容：其语义区域的标注和预测是像素级的。与目标检测相比，语义分割标注的像素级的边框显然更加精细。

9.1 图像分割和实例分割

计算机视觉领域还有2个与语义分割相似的重要问题，即图像分割（image segmentation）和实例分割（instance segmentation）。我们在这里将它们同语义分割简单区分一下：

图像分割将图像划分为若干组成区域，这类问题的方法通常利用图像中像素之间的相关性。它在训练时不需要有关图像像素的标签信息，在预测时也无法保证分割出的区域具有我们希望得到的语义。以图像 catdog.jpg 作为输入，图像分割可能会将狗分为两个区域：一个覆盖以黑色为主的嘴和眼睛，另一个覆盖以黄色为主的其余部分身体。
实例分割也叫同时检测并分割（simultaneous detection and segmentation），它研究如何识别图像中各个目标实例的像素级区域。与语义分割不同，实例分割不仅需要区分语义，还要区分不同的目标实例。例如，如果图像中有两条狗，则实例分割需要区分像素属于的两条狗中的哪一条。

9.2 Pascal VOC2012 语义分割数据集

最重要的语义分割数据集之一是 Pascal VOC2012，下面我们深入了解一下这个数据集：

import os
import torch
import torchvision
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from d2l import torch as d2l

# Register the archive under the key 'voc2012' as a (URL, hash string) pair.
# NOTE(review): the second element is presumably a SHA-1 checksum verified by d2l — confirm.
d2l.DATA_HUB['voc2012'] = (d2l.DATA_URL + 'VOCtrainval_11-May-2012.tar', '4e443f8a2eca6b1dac8a6c57641b67dd40621a49')
voc_dir = d2l.download_extract('voc2012', 'VOCdevkit/VOC2012')  # the extracted dataset ends up in ../data/VOCdevkit/VOC2012

进入路径 ../data/VOCdevkit/VOC2012 之后，我们可以看到数据集的不同组件。ImageSets/Segmentation 路径包含用于训练和测试样本的文本文件，而 JPEGImages 和 SegmentationClass 路径分别存储着每个示例的输入图像和标签。此处的标签也采用图像格式，其尺寸和它所标注的输入图像的尺寸相同。此外，标签中颜色相同的像素属于同一个语义类别。下面将 read_voc_images 函数定义为将所有输入的图像和标签读入内存：

def read_voc_images(voc_dir, is_train=True):
    """Read every VOC image of one split together with its label image.

    Args:
        voc_dir: root directory of the extracted VOC2012 dataset.
        is_train: read the names listed in ``train.txt`` when true,
            otherwise those in ``val.txt``.

    Returns:
        Tuple ``(features, labels)`` of two equally long lists of image
        tensors; labels are read in RGB mode.
    """
    split_file = 'train.txt' if is_train else 'val.txt'
    txt_fname = os.path.join(voc_dir, 'ImageSets', 'Segmentation', split_file)
    mode = torchvision.io.image.ImageReadMode.RGB
    with open(txt_fname, 'r') as f:
        names = f.read().split()

    features, labels = [], []
    for fname in names:
        jpg_path = os.path.join(voc_dir, 'JPEGImages', f'{fname}.jpg')
        png_path = os.path.join(voc_dir, 'SegmentationClass', f'{fname}.png')
        features.append(torchvision.io.read_image(jpg_path))
        labels.append(torchvision.io.read_image(png_path, mode))
    return features, labels

# Load the whole training split into memory and inspect its size and the first sample.
train_features, train_labels = read_voc_images(voc_dir, True)
print(len(train_features))  # 1464
print(train_features[0].shape, train_labels[0].shape)  # torch.Size([3, 281, 500]) torch.Size([3, 281, 500])

下面我们绘制前5个输入图像及其标签。在标签图像中，白色和黑色分别表示边框和背景，而其他颜色则对应不同的类别：

# First five inputs followed by their five label images.
imgs = train_features[0:5] + train_labels[0:5]
imgs = [img.permute(1, 2, 0) for img in imgs]  # move the channel axis to the last dimension
print(len(imgs), imgs[0].shape)  # 10 torch.Size([281, 500, 3])
d2l.show_images(imgs, 2, 5)
plt.show()

接下来，我们列举 RGB 颜色值和类名：

# RGB colour of each class in the label images; the position in this list is the class index.
VOC_COLORMAP = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], [128, 0, 128],
                [0, 128, 128], [128, 128, 128], [64, 0, 0], [192, 0, 0], [64, 128, 0],
                [192, 128, 0], [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
                [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]

# Class names in the same order as VOC_COLORMAP (21 entries each).
VOC_CLASSES = ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
               'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
               'person', 'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor']

通过上面定义的两个常量，我们可以方便地查找标签中每个像素的类索引。我们定义了 voc_colormap2label 函数来构建从上述 RGB 颜色值到类别索引的映射，而 voc_label_indices 函数将 RGB 值映射到在 Pascal VOC2012 数据集中的类别索引：

def voc_colormap2label():
    """Build the lookup table from packed RGB value to VOC class index.

    Returns:
        A long tensor with ``256 ** 3`` entries. Entry
        ``(r * 256 + g) * 256 + b`` holds the class index of colour
        ``(r, g, b)``; colours not in ``VOC_COLORMAP`` stay at 0.
    """
    table = torch.zeros(256 ** 3, dtype=torch.long)
    for class_idx, (r, g, b) in enumerate(VOC_COLORMAP):
        # Pack the three channels into one integer that serves as the key.
        table[(r * 256 + g) * 256 + b] = class_idx
    return table

def voc_label_indices(colormap, colormap2label):
    """将VOC标签中的RGB值映射到它们的类别索引"""
    colormap = colormap.permute(1, 2, 0).numpy().astype('int32')
    idx = ((colormap[:, :, 0] * 256 + colormap[:, :, 1]) * 256 + colormap[:, :, 2])
    return colormap2label[idx]

例如，在第一张样本图像中，飞机头部区域的类别索引为1，而背景索引为0：

# Convert the first label image to class indices and print a 10x10 patch of it.
colormap2label = voc_colormap2label()
y = voc_label_indices(train_labels[0], colormap2label)
print(y[105:115, 130:140], VOC_CLASSES[1])
# Recorded output:
# tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
#         [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
#         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
#         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
#         [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
#         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
#         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
#         [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
#         [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]]) aeroplane

之前的实验我们通过再缩放图像使其符合模型的输入形状。然而在语义分割中，这样做需要将预测的像素类别重新映射回原始尺寸的输入图像。这样的映射可能不够精确，尤其在不同语义的分割区域。为了避免这个问题，我们将图像裁剪为固定尺寸，而不是再缩放。具体来说，我们使用图像增广中的随机裁剪，裁剪输入图像和标签的相同区域：

def voc_rand_crop(feature, label, height, width):
    """Randomly crop the same region out of a feature image and its label.

    The crop window is sampled once from ``feature`` and then applied to
    both images, so the two crops stay aligned.

    Args:
        feature: input image.
        label: label image belonging to ``feature``.
        height: height of the crop.
        width: width of the crop.

    Returns:
        Tuple ``(cropped_feature, cropped_label)``.
    """
    crop = torchvision.transforms.functional.crop
    window = torchvision.transforms.RandomCrop.get_params(feature, (height, width))
    cropped_feature = crop(feature, *window)
    cropped_label = crop(label, *window)
    return cropped_feature, cropped_label

imgs = []
for _ in range(5):
    # Each call adds a (feature crop, label crop) pair, so imgs alternates feature, label, ...
    imgs += voc_rand_crop(train_features[0], train_labels[0], 200, 300)
imgs = [img.permute(1, 2, 0) for img in imgs]
# Even positions hold the feature crops, odd positions the label crops.
d2l.show_images(imgs[::2] + imgs[1::2], 2, 5)
plt.show()

我们通过继承高级 API 提供的 Dataset 类，自定义了一个语义分割数据集类 VOCSegDataset。通过实现 __getitem__ 函数，我们可以任意访问数据集中索引为 idx 的输入图像及其每个像素的类别索引。由于数据集中有些图像的尺寸可能小于随机裁剪所指定的输出尺寸，这些样本可以通过自定义的 filter 函数移除掉。此外，我们还定义了 normalize_image 函数，从而对输入图像的 RGB 三个通道的值分别做标准化：

class VOCSegDataset(Dataset):
    """Custom dataset that serves random crops of the VOC segmentation data."""

    def __init__(self, is_train, crop_size, voc_dir):
        """Read one split, drop images smaller than the crop, normalize the rest.

        Args:
            is_train: forwarded to ``read_voc_images`` to select the split.
            crop_size: ``(height, width)`` of the crops returned by
                ``__getitem__``.
            voc_dir: root directory of the extracted dataset.
        """
        self.transform = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        self.crop_size = crop_size
        raw_features, raw_labels = read_voc_images(voc_dir, is_train=is_train)
        kept_features = self.filter(raw_features)
        self.features = [self.normalize_image(img) for img in kept_features]
        self.labels = self.filter(raw_labels)
        self.colormap2label = voc_colormap2label()
        print(f'read {len(self.features)} examples')

    def normalize_image(self, img):
        """Scale ``img`` to floats in units of 1/255 and apply ``self.transform``."""
        scaled = img.float() / 255
        return self.transform(scaled)

    # Drop the images that are smaller than the crop size.
    def filter(self, imgs):
        """Return the images whose dims 1 and 2 are at least the crop height and width."""
        def big_enough(img):
            return img.shape[1] >= self.crop_size[0] and img.shape[2] >= self.crop_size[1]

        return [img for img in imgs if big_enough(img)]

    def __getitem__(self, idx):
        """Return a random crop of sample ``idx`` and the class indices of its label."""
        image, mask = voc_rand_crop(self.features[idx], self.labels[idx], *self.crop_size)
        class_map = voc_label_indices(mask, self.colormap2label)
        return (image, class_map)

    def __len__(self):
        """Return the number of samples that survived filtering."""
        return len(self.features)

# (height, width) of every random crop; images smaller than this are filtered out.
crop_size = (320, 480)
voc_train, voc_valid = VOCSegDataset(True, crop_size, voc_dir), VOCSegDataset(False, crop_size, voc_dir)

最后，我们定义以下 load_data_voc 函数，基于上面构建好的 voc_train 和 voc_valid 数据集来读取 Pascal VOC2012 语义分割数据集。它返回训练集和验证集的数据迭代器：

def load_data_voc(batch_size):
    """Wrap the VOC segmentation datasets in data loaders.

    Uses the module-level ``voc_train`` and ``voc_valid`` datasets. Both
    loaders drop the last incomplete batch; only the training loader
    shuffles.

    Args:
        batch_size: number of samples per batch.

    Returns:
        Tuple ``(train_iter, valid_iter)``.
    """
    shared = dict(drop_last=True, num_workers=0)
    train_loader = DataLoader(voc_train, batch_size, shuffle=True, **shared)
    valid_loader = DataLoader(voc_valid, batch_size, **shared)
    return train_loader, valid_loader

batch_size = 64
train_iter, valid_iter = load_data_voc(batch_size)

# Inspect the shapes of the first training batch only.
for features, labels in train_iter:
    print(features.shape)  # torch.Size([64, 3, 320, 480])
    print(labels.shape)  # torch.Size([64, 320, 480])
    break

AsanoSaki