在计算机视觉领域,图像分类始终是最基础且应用最广泛的任务之一。从智能手机相册的自动分类到工业质检中的缺陷识别,这项技术已经深入到我们生活的方方面面。作为一名长期从事深度学习开发的工程师,我发现PyTorch框架因其直观的API设计和动态计算图特性,特别适合快速实现和迭代图像分类模型。
本文将带你完整实现一个基于ResNet架构的五类动物分类器(狗、马、大象、蝴蝶、鸡)。不同于简单的教程,我会重点分享在实际项目中容易被忽视的关键细节,比如数据增强策略、学习率调度、混合精度训练以及模型部署优化。
推荐使用conda创建独立的Python环境(3.8+版本):
conda create -n pytorch-classify python=3.8
conda activate pytorch-classify
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html
pip install matplotlib pandas scikit-learn
注意:CUDA版本需要与本地GPU驱动匹配,可通过 nvidia-smi 查询。如果使用CPU训练,安装命令去掉 +cu113 后缀。
一个良好的数据集目录结构能大幅减少后续开发中的麻烦。建议采用如下组织方式:
animal_dataset/
train/
cane/
img_001.jpg
img_002.jpg
...
cavallo/
...
val/
cane/
img_101.jpg
...
cavallo/
...
test/ # 可选
...
关键细节:
数据增强是提升小样本数据集性能的关键。以下是我在多个项目中验证有效的增强组合:
from torchvision import transforms

# Training-time augmentation pipeline: geometric + photometric jitter,
# then tensor conversion and ImageNet normalization.
_train_steps = [
    transforms.Resize(256),                  # enlarge first ...
    transforms.RandomCrop(224),              # ... then take a random 224x224 crop
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],          # ImageNet channel means
        std=[0.229, 0.224, 0.225]),          # ImageNet channel stds
]
train_transform = transforms.Compose(_train_steps)
验证集只需基础变换:
# Validation uses deterministic preprocessing only: resize, center crop,
# tensor conversion, and the same ImageNet normalization as training.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
经验:Normalize参数使用ImageNet的均值和标准差,这对预训练模型尤其重要。如果从头训练,可以计算自己数据集的统计量。
ResNet的核心创新在于残差块(Residual Block)设计。传统CNN随着深度增加会出现梯度消失问题,而残差连接允许梯度直接回传到浅层:
class BasicBlock(nn.Module):
    """Two 3x3 conv residual block (ResNet-18/34 style).

    The identity shortcut lets gradients flow straight back to earlier
    layers; a 1x1 projection replaces it whenever the main path changes
    the spatial size or channel count.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Shortcut connection: identity when shapes already match,
        # otherwise a strided 1x1 projection + BN.
        out_planes = self.expansion * planes
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        residual = self.shortcut(x)
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        # Residual addition before the final activation.
        return F.relu(y + residual)
基于PyTorch灵活的特性,我们可以轻松调整原始ResNet架构:
def ResNet18(num_classes=5):
    """Build a ResNet-18: four stages of two BasicBlocks each."""
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage, num_classes)
class ResNet(nn.Module):
    """Generic ResNet backbone: stem conv -> 4 residual stages -> pooled linear head.

    Args:
        block: residual block class exposing an ``expansion`` class attribute
            and accepting ``(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: output size of the final classification layer.
    """

    def __init__(self, block, num_blocks, num_classes):
        super().__init__()
        self.in_planes = 64

        # Stem: 7x7 stride-2 conv + BN, followed by a 3x3 stride-2 max pool
        # (applied in forward), quartering the spatial resolution.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7,
                               stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; stages 2-4 halve the spatial size.
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)

        # Classification head: global average pool + fully connected layer.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one may downsample."""
        layers = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.maxpool(F.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = torch.flatten(self.avgpool(x), 1)
        return self.fc(x)
关键改进点:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one full training pass over ``loader``.

    Returns:
        Tuple ``(mean batch loss, accuracy)`` for the epoch.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_inputs, batch_labels in loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()                  # clear stale gradients
        logits = model(batch_inputs)           # forward pass
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()                  # backward pass
        optimizer.step()                       # parameter update

        # Accumulate running statistics for this epoch.
        loss_sum += batch_loss.item()
        predictions = logits.max(1)[1]
        n_seen += batch_labels.size(0)
        n_correct += predictions.eq(batch_labels).sum().item()

    return loss_sum / len(loader), n_correct / n_seen
学习率对模型收敛至关重要,推荐使用余弦退火配合热重启:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

# Cosine annealing with warm restarts: the LR decays along a cosine
# curve, then periodically jumps back up to escape poor local minima.
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,        # length of the first restart cycle (epochs)
    T_mult=2,      # each cycle is twice as long as the previous one
    eta_min=1e-5,  # floor on the learning rate
)
利用NVIDIA的AMP技术可以显著减少显存占用:
from torch.cuda.amp import GradScaler, autocast

# Mixed-precision training loop: autocast runs the forward pass in half
# precision where safe, and GradScaler rescales the loss so that fp16
# gradients do not underflow.
scaler = GradScaler()
for inputs, labels in loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()
    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    scaler.scale(loss).backward()  # backward on the scaled loss
    scaler.step(optimizer)         # unscales gradients, then steps
    scaler.update()                # adapt the scale factor
除了准确率,还应关注:
# Bug fix: accuracy_score was called below but never imported.
from sklearn.metrics import accuracy_score, classification_report


def evaluate(model, loader, device):
    """Evaluate ``model`` on ``loader`` and print a per-class report.

    Returns:
        Overall accuracy in [0, 1].

    NOTE(review): relies on a module-level ``class_names`` list — confirm
    it matches the dataset's class ordering.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    # Per-class precision / recall / F1 — more informative than accuracy
    # alone when classes are imbalanced.
    print(classification_report(
        all_labels, all_preds,
        target_names=class_names))
    return accuracy_score(all_labels, all_preds)
部署时可以考虑以下优化:
# Dynamic quantization: store Linear weights as int8 for a smaller,
# faster model at inference time (activations stay float).
model = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8)
# Export to ONNX with a dynamic batch axis so the same graph serves
# any batch size at inference time.
dummy_input = torch.randn(1, 3, 224, 224).to(device)
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'}})
使用Flask创建API服务:
from flask import Flask, request, jsonify
import torchvision.transforms as transforms
from PIL import Image

app = Flask(__name__)

# Load the trained network once at startup and freeze it for inference.
model = load_model()
model.eval()

# Deterministic preprocessing, identical to the validation pipeline.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])
])


@app.route('/predict', methods=['POST'])
def predict():
    """Classify an uploaded image; expects a multipart 'file' field."""
    if 'file' not in request.files:
        return jsonify({'error': 'no file uploaded'}), 400
    upload = request.files['file']
    try:
        image = Image.open(upload.stream).convert('RGB')
        batch = transform(image).unsqueeze(0)  # add batch dimension
        with torch.no_grad():
            outputs = model(batch)
            _, pred = torch.max(outputs, 1)
        return jsonify({
            'class': class_names[pred.item()],
            'confidence': torch.softmax(outputs, 1)[0][pred].item()
        })
    except Exception as e:  # surface decode/inference failures as JSON
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
| 现象 | 可能原因 | 解决方案 |
|---|---|---|
| 训练准确率高但验证准确率低 | 模型复杂度过高 | 增加Dropout层(p=0.5) |
| 损失函数波动大 | 学习率过高 | 使用学习率预热(warmup) |
| 各类别准确率差异大 | 数据不均衡 | 使用类别加权损失函数 |
# Recommended DataLoader keyword arguments for loading throughput:
num_workers=min(4, os.cpu_count())  # parallel worker processes, capped at 4
pin_memory=True                     # faster host-to-GPU transfers
prefetch_factor=2                   # batches prefetched per worker (PyTorch 1.7+)
# Memory-map the image array from disk so the whole dataset does not
# have to fit in RAM; labels are small enough to load eagerly.
dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(np.load('images.npy', mmap_mode='r')),
    torch.from_numpy(np.load('labels.npy')),
)
# Gradient sanity check: flag parameters that received no gradient
# (frozen or disconnected) and print the gradient norm of the rest.
for name, param in model.named_parameters():
    grad = param.grad
    if grad is None:
        print(f"No gradient for {name}")
        continue
    print(f"{name} grad norm: {grad.norm().item():.4f}")
from torch.utils.tensorboard import SummaryWriter

# Log the classifier-head weight distribution to TensorBoard.
writer = SummaryWriter()
for batch_inputs, _ in train_loader:
    _ = model(batch_inputs)  # one forward pass (output unused here)
    writer.add_histogram('fc_layer', model.fc.weight)
    break  # a single batch is enough for the histogram
writer.close()
在实际项目中,我发现ResNet18在224x224输入尺寸下,使用Adam优化器(lr=3e-4)配合逐步学习率衰减(每20epoch衰减0.1倍),在5类动物数据集上通常能达到92%以上的验证准确率。关键是要确保数据增强策略能够覆盖实际应用场景中可能出现的图像变化。