在计算机视觉领域,图像分类是最基础也最具挑战性的任务之一。CIFAR-10数据集作为经典的基准测试集,包含10个类别的6万张32x32像素彩色图像,相比MNIST手写数字数据集更接近真实世界的复杂场景。这个项目将带您从零开始构建一个完整的深度学习流程,使用PyTorch框架实现CIFAR-10图像分类任务。
通过本实践,您将掌握:数据加载与预处理、CNN 模型设计、训练与评估流程,以及学习率调度、数据增强、混合精度训练等优化技巧。
在开始项目前,确保您的环境配置正确:
import torch
import torchvision

# Detect hardware: prefer CUDA, fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前使用设备: {device}")

# Report library versions for reproducibility.
print(f"PyTorch版本: {torch.__version__}")
print(f"Torchvision版本: {torchvision.__version__}")
if torch.cuda.is_available():
    print(f"CUDA版本: {torch.version.cuda}")
    print(f"GPU型号: {torch.cuda.get_device_name(0)}")
提示:如果Jupyter Notebook显示使用CPU,请检查内核选择是否正确。在Colab中,通过"修改->笔记本设置"确保选择GPU加速。
PyTorch的torchvision.datasets模块提供了便捷的数据集加载方式:
from torchvision import transforms

# Preprocessing pipeline: PIL image -> float tensor in [0, 1],
# then per-channel standardization to roughly [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Download (if needed) and wrap the CIFAR-10 train/test splits
# with the same preprocessing.
train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
使用DataLoader实现批量加载和数据打乱:
from torch.utils.data import DataLoader

BATCH_SIZE = 64

# Shuffle only the training stream; evaluation order stays fixed
# so metrics are comparable across epochs.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)
数据形状说明:每个 batch 的图像张量形状为 [64, 3, 32, 32](批大小、通道数、高、宽),对应的标签张量形状为 [64]。
理解数据是建模的第一步,我们先可视化部分样本:
import matplotlib.pyplot as plt
import numpy as np

# Pull one mini-batch so we can inspect a few samples.
images, labels = next(iter(train_loader))

# Human-readable names for the ten CIFAR-10 label indices.
class_names = ['飞机', '汽车', '鸟', '猫', '鹿',
               '狗', '蛙', '马', '船', '卡车']
# Display a single image tensor (undoing the Normalize transform first).
def imshow(img):
    # Inverse of Normalize((0.5,)*3, (0.5,)*3): maps [-1, 1] back to [0, 1].
    img = img / 2 + 0.5
    npimg = img.numpy()
    # Tensors are CHW; matplotlib expects HWC.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.axis('off')
    plt.show()

# Show the first sample of the batch together with its label.
imshow(images[0])
print(f"标签: {class_names[labels[0]]}")
查看多个样本有助于理解数据分布:
# Render a grid of samples from the current batch.
def show_batch(images, labels, nrows=4, ncols=8):
    plt.figure(figsize=(10, 5))
    for idx in range(nrows * ncols):
        plt.subplot(nrows, ncols, idx + 1)
        # Undo normalization before display.
        sample = images[idx] / 2 + 0.5
        plt.imshow(np.transpose(sample.numpy(), (1, 2, 0)))
        plt.title(class_names[labels[idx]], fontsize=8)
        plt.axis('off')
    plt.tight_layout()
    plt.show()

show_batch(images, labels)
针对CIFAR-10的特点,我们设计一个包含以下层的CNN:
import torch
import torch.nn as nn
import torch.nn.functional as F


class CIFAR10CNN(nn.Module):
    """Two conv/pool stages followed by a two-layer classifier head.

    Input:  [B, 3, 32, 32] normalized CIFAR-10 images.
    Output: [B, 10] raw class logits (no softmax; CrossEntropyLoss
    expects unnormalized logits).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)           # shared 2x2 downsampler
        self.fc1 = nn.Linear(64 * 8 * 8, 256)    # 8x8 after two poolings of 32x32
        self.fc2 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        out = self.pool(F.relu(self.conv1(x)))    # [B, 32, 16, 16]
        out = self.pool(F.relu(self.conv2(out)))  # [B, 64, 8, 8]
        out = torch.flatten(out, 1)               # [B, 4096]
        out = self.dropout(out)
        out = F.relu(self.fc1(out))
        return self.fc2(out)
# Instantiate the network on the selected device and print its layer structure.
model = CIFAR10CNN().to(device)
print(model)
了解模型参数量有助于评估复杂度:
# torchsummary prints per-layer output shapes and parameter counts
# for a given input size (channels, height, width).
from torchsummary import summary

summary(model, input_size=(3, 32, 32))
典型输出:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1           [-1, 32, 32, 32]             896
         MaxPool2d-2           [-1, 32, 16, 16]               0
            Conv2d-3           [-1, 64, 16, 16]          18,496
         MaxPool2d-4             [-1, 64, 8, 8]               0
           Dropout-5                 [-1, 4096]               0
            Linear-6                  [-1, 256]       1,048,832
            Linear-7                   [-1, 10]           2,570
================================================================
Total params: 1,070,794
Trainable params: 1,070,794
Non-trainable params: 0
----------------------------------------------------------------
设置损失函数和优化器:
import torch.optim as optim

# Cross-entropy over raw logits; Adam optimizer with a step-decay
# schedule (learning rate multiplied by 0.1 every 5 epochs).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
完整的训练过程包括:
def train(model, train_loader, criterion, optimizer, epoch):
    """Run one training epoch; return (mean_loss, accuracy_percent).

    BUG FIX: the original reused ``running_loss`` — which is reset to 0
    after every 100-batch progress report — to compute the returned
    epoch loss, so the result only covered the batches after the last
    report. The epoch total is now accumulated separately.

    Note: relies on the module-level ``device`` set during environment
    setup.
    """
    model.train()
    running_loss = 0.0   # windowed accumulator for periodic progress logs
    total_loss = 0.0     # full-epoch accumulator for the return value
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        # Print a progress line every 100 batches, then reset the window.
        if batch_idx % 100 == 99:
            print(f'Epoch: {epoch+1}, Batch: {batch_idx+1}, '
                  f'Loss: {running_loss/100:.3f}, Acc: {100.*correct/total:.1f}%')
            running_loss = 0.0
    train_loss = total_loss / len(train_loader)
    train_acc = 100. * correct / total
    return train_loss, train_acc
验证模型在测试集上的表现:
def test(model, test_loader, criterion):
    """Evaluate on the held-out set; return (mean_loss, accuracy_percent).

    Note: relies on the module-level ``device`` set during environment
    setup.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0
    # No gradients are needed during evaluation.
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss_sum += criterion(outputs, targets).item()
            _, predicted = outputs.max(1)
            seen += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    test_loss = loss_sum / len(test_loader)
    test_acc = 100. * correct / seen
    print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.1f}%')
    return test_loss, test_acc
运行多轮训练并记录指标:
EPOCHS = 15

# Per-epoch metric history for plotting later.
train_losses, test_losses = [], []
train_accs, test_accs = [], []

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch+1}/{EPOCHS}')
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, epoch)
    test_loss, test_acc = test(model, test_loader, criterion)
    scheduler.step()  # StepLR decays purely on epoch count
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accs.append(train_acc)
    test_accs.append(test_acc)
绘制训练曲线分析模型表现:
# Side-by-side loss and accuracy curves over the training run.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(train_accs, label='Train Acc')
plt.plot(test_accs, label='Test Acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.legend()

plt.show()
学习率是影响训练效果的关键参数:
# Adaptive alternative to StepLR: shrink the learning rate when the
# monitored metric plateaus.  NOTE: unlike StepLR, this scheduler must
# be stepped with the metric, e.g. ``scheduler.step(test_loss)``.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',    # we monitor a loss, so lower is better
    factor=0.1,    # multiply the LR by 0.1 on plateau
    patience=3,    # wait 3 stagnant epochs before decaying
    verbose=True,
)
增加数据多样性提升模型泛化能力:
from torchvision import transforms

# Training-time augmentation: random horizontal flips and padded crops
# increase effective data diversity; the Normalize parameters must
# match the test pipeline.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
更深的网络结构通常能获得更好性能:
class ImprovedCNN(nn.Module):
    """Deeper VGG-style network: two conv stages with BatchNorm, each
    ending in 2x2 max-pooling, followed by a dropout-regularized head.

    Input [B, 3, 32, 32] -> logits [B, 10].
    """

    def __init__(self):
        super().__init__()

        def conv_stage(c_in, c_out):
            # Two 3x3 convs (each BN + ReLU) and one pooling step.
            return [
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.Conv2d(c_out, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
            ]

        self.features = nn.Sequential(*conv_stage(3, 64), *conv_stage(64, 128))
        self.classifier = nn.Sequential(
            nn.Linear(128 * 8 * 8, 512),  # 8x8 spatial size after two poolings
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        feats = self.features(x)
        flat = torch.flatten(feats, 1)
        return self.classifier(flat)
常见问题排查:
过拟合问题:可通过数据增强和 Dropout 等正则化手段缓解。
训练不收敛:检查学习率设置与学习率调度策略。
GPU内存不足:可减小 batch size,或采用混合精度训练、梯度累积等技巧:
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

# Mixed-precision training step.
# FIX: the original snippet never cleared gradients, so they would
# accumulate across iterations.
optimizer.zero_grad()
with autocast():
    outputs = model(inputs)
    loss = criterion(outputs, targets)
# Scale the loss to avoid fp16 gradient underflow; scaler.step()
# unscales before applying the optimizer update.
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
accum_steps = 4  # effective batch size = accum_steps * BATCH_SIZE

optimizer.zero_grad()
for i, (inputs, targets) in enumerate(train_loader):
    # FIX: move the batch to the training device, matching train().
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model(inputs)
    # Divide so the gradients summed over accum_steps mini-batches
    # equal one large-batch gradient.
    loss = criterion(outputs, targets) / accum_steps
    loss.backward()
    if (i + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Post-training dynamic quantization: Linear-layer weights are stored
# as int8, shrinking the model and speeding up CPU inference.
model.eval()
quantized_model = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8,
)
# Export the model to ONNX, tracing it with a [1, 3, 32, 32] example input.
dummy_input = torch.randn(1, 3, 32, 32).to(device)
torch.onnx.export(
    model,
    dummy_input,
    "cifar10_model.onnx",
    input_names=["input"],
    output_names=["output"],
)
使用预训练模型提升小数据集表现:
from torchvision import models

# Load an ImageNet-pretrained ResNet-18 and swap its 1000-way head
# for a 10-way CIFAR-10 classifier; earlier layers keep their weights.
pretrained_model = models.resnet18(pretrained=True)
pretrained_model.fc = nn.Linear(pretrained_model.fc.in_features, 10)
pretrained_model = pretrained_model.to(device)
结合CNN与Transformer的优势:
class CNNWithAttention(nn.Module):
    """Conv front-end plus self-attention over spatial positions.

    The CNN downsamples to a 16x16 grid of 64-dim features; the 256
    positions are treated as a sequence for multi-head self-attention,
    mean-pooled over positions, and classified into 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        # 64-dim embeddings, 4 attention heads, sequence-first layout.
        self.attention = nn.MultiheadAttention(64, 4)
        self.classifier = nn.Linear(64, 10)

    def forward(self, x):
        feats = self.cnn(x)  # [B, 64, 16, 16]
        b, c, h, w = feats.shape
        # Flatten the spatial grid and reorder to (seq_len, batch, embed),
        # the default layout expected by nn.MultiheadAttention.
        seq = feats.view(b, c, h * w).permute(2, 0, 1)
        attended, _ = self.attention(seq, seq, seq)
        pooled = attended.mean(dim=0)  # average over positions -> [B, 64]
        return self.classifier(pooled)
使用Grad-CAM可视化模型关注区域:
from torchcam.methods import GradCAM

# Grad-CAM needs gradients with respect to the target conv layer's
# activations, so the hooks are registered before the forward pass.
cam_extractor = GradCAM(model, 'conv2')

# FIX: the original wrapped the forward pass in torch.no_grad(), which
# disables the very gradients Grad-CAM relies on.
out = model(inputs.unsqueeze(0))
activation_map = cam_extractor(out.squeeze(0).argmax().item(), out)

# FIX: draw the image first, then the heatmap on top with transparency;
# the original painted the opaque image over the heatmap, hiding it.
plt.imshow(inputs.permute(1, 2, 0) / 2 + 0.5)
plt.imshow(activation_map[0].squeeze(0).numpy(), cmap='jet', alpha=0.5)
plt.show()
经过这个完整的CIFAR-10图像分类项目实践,我总结了以下几点关键经验:
数据质量至关重要:相比模型结构,数据预处理和增强往往对最终效果影响更大。在实际项目中,应该投入足够时间理解数据分布和特性。
模型复杂度需要平衡:太简单的模型难以捕捉复杂特征,太复杂的模型容易过拟合。通过验证集监控找到合适的模型规模。
训练过程需要耐心:深度学习模型通常需要较长时间训练才能收敛。使用适当的回调函数(如早停、学习率调整)可以节省时间。
实验记录不可忽视:详细记录每次实验的超参数、修改点和结果,这对分析问题和优化方向非常有帮助。
工程实践技巧:如混合精度训练、梯度累积等技巧在实际项目中能显著提升效率,值得掌握。
这个项目从理论到实践全面锻炼了我的深度学习能力,特别是对CNN工作原理、PyTorch框架使用和模型调优有了更深入的理解。后续可以尝试更复杂的数据集(如CIFAR-100)或探索其他计算机视觉任务(如目标检测、图像分割)。