人体姿态估计作为计算机视觉领域的核心任务之一,在动作识别、人机交互、运动分析等场景中发挥着重要作用。传统方法往往通过下采样-上采样的对称结构处理图像,而HRNet创新性地提出了全程保持高分辨率表示的网络架构。本文将带您从零开始实现HRNet-W32模型,这个在COCO关键点检测榜单上长期占据领先地位的经典网络。
推荐使用Python 3.8+和PyTorch 1.7+环境,这是HRNet官方代码验证过的稳定组合。以下是关键依赖的安装命令:
pip install torch==1.7.1 torchvision==0.8.2
pip install opencv-python numpy tqdm
对于GPU加速,需要额外安装对应CUDA版本的PyTorch。可以通过以下命令检查环境是否正常:
import torch
print(torch.__version__, torch.cuda.is_available())
COCO数据集是人体姿态估计最常用的基准数据集,包含超过20万张图像和25万个人体实例标注。数据预处理流程包括:
class COCOKeypointsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a COCO-layout directory.

    Expects ``root/images/`` to contain the image files; the per-index
    keypoint annotations are resolved by ``_load_annotations`` (defined
    elsewhere in this file — not shown in this excerpt).

    Args:
        root: dataset root directory containing an ``images`` subfolder.
        transforms: optional callable applied as ``transforms(img, annos)``.
    """

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # Sort for a deterministic index -> filename mapping across runs.
        self.image_ids = sorted(os.listdir(os.path.join(root, "images")))

    def __len__(self):
        # Required for map-style datasets: DataLoader uses it to size
        # epochs and build samplers. (Missing in the original version.)
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, "images", self.image_ids[idx])
        img = Image.open(img_path).convert("RGB")
        # Load the matching annotation record (helper not shown here).
        annos = self._load_annotations(idx)
        if self.transforms is not None:
            img, annos = self.transforms(img, annos)
        return img, annos
HRNet使用两种残差块作为基础构建单元:
| 模块类型 | 适用网络深度 | 结构特点 | 计算复杂度 |
|---|---|---|---|
| BasicBlock | 浅层网络 | 两个3x3卷积 | 较低 |
| Bottleneck | 深层网络 | 1x1-3x3-1x1的瓶颈结构 | 较高 |
class BasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions (the ResNet basic block).

    ``expansion`` is 1: the block's output width equals ``planes``.
    When the skip connection's shape differs from the main path's output,
    callers pass a ``downsample`` module that projects the identity branch.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Main path: conv-bn-relu-conv-bn; conv3x3 is this file's helper.
        # Attribute names are kept as in the reference implementation so
        # pretrained state_dicts load unchanged.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Project the identity branch only when a downsample module exists.
        identity = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += identity
        return self.relu(out)
HighResolutionModule是HRNet的核心创新,其工作流程可分为三个关键步骤:
class HighResolutionModule(nn.Module):
    """One HRNet stage: parallel multi-resolution branches plus fusion layers.

    Excerpt note: ``_check_branches``, ``_make_branches`` and ``forward``
    are defined elsewhere in the full implementation and are not shown here.

    Args:
        num_branches: number of parallel resolution branches.
        blocks: residual block class used inside each branch.
        num_blocks: blocks per branch.
        num_inchannels: input channel widths, one per branch.
        num_channels: target channel widths, one per branch.
        fuse_method: fusion strategy identifier (stored, used elsewhere).
        multi_scale_output: if False, only the highest-resolution output
            is produced (only one row of fuse layers is built).
    """

    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                 num_channels, fuse_method, multi_scale_output=True):
        super(HighResolutionModule, self).__init__()
        # Validate that the configuration lists agree (helper not shown).
        self._check_branches(num_branches, num_blocks, num_inchannels, num_channels)
        self.num_inchannels = num_inchannels
        self.fuse_method = fuse_method
        self.num_branches = num_branches
        self.multi_scale_output = multi_scale_output
        self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        # Presumably applied after summing fused branches — forward not shown.
        self.relu = nn.ReLU(inplace=True)

    def _make_fuse_layers(self):
        """Build the cross-resolution exchange layers.

        Returns a ModuleList of rows; row ``i`` holds, for every source
        branch ``j``, the module that maps branch ``j``'s features to
        branch ``i``'s resolution and width (``None`` when ``j == i``).
        Returns ``None`` when there is a single branch (nothing to fuse).
        """
        if self.num_branches == 1:
            return None
        fuse_layers = []
        # With multi_scale_output=False only the highest-resolution output
        # (row 0) is needed.
        for i in range(self.num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(self.num_branches):
                if j > i:
                    # Lower-resolution source: 1x1 conv to match channels,
                    # then nearest-neighbour upsampling by 2^(j-i).
                    fuse_layer.append(nn.Sequential(
                        nn.Conv2d(self.num_inchannels[j], self.num_inchannels[i], 1, 1, 0, bias=False),
                        nn.BatchNorm2d(self.num_inchannels[i]),
                        nn.Upsample(scale_factor=2**(j-i), mode='nearest')
                    ))
                elif j == i:
                    # Same resolution: identity (handled as None downstream).
                    fuse_layer.append(None)
                else:
                    # Higher-resolution source: chain of i-j stride-2 3x3
                    # convs, each halving the spatial size.
                    conv3x3s = []
                    for k in range(i-j):
                        if k == i - j - 1:
                            # Last step also switches to branch i's width;
                            # no ReLU here — activation follows the fusion sum.
                            conv3x3s.append(nn.Sequential(
                                nn.Conv2d(self.num_inchannels[j], self.num_inchannels[i], 3, 2, 1, bias=False),
                                nn.BatchNorm2d(self.num_inchannels[i])
                            ))
                        else:
                            # Intermediate steps keep branch j's width.
                            conv3x3s.append(nn.Sequential(
                                nn.Conv2d(self.num_inchannels[j], self.num_inchannels[j], 3, 2, 1, bias=False),
                                nn.BatchNorm2d(self.num_inchannels[j]),
                                nn.ReLU(inplace=True)
                            ))
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))
        return nn.ModuleList(fuse_layers)
HRNet-W32包含四个主要阶段,每个阶段通过transition layer进行分辨率切换:
python复制def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
transition_layers = []
for i in range(len(num_channels_cur_layer)):
if i < len(num_channels_pre_layer):
if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
transition_layers.append(nn.Sequential(
nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False),
nn.BatchNorm2d(num_channels_cur_layer[i]),
nn.ReLU(inplace=True)
))
else:
transition_layers.append(None)
else:
conv3x3s = []
for j in range(i+1-len(num_channels_pre_layer)):
inchannels = num_channels_pre_layer[-1]
outchannels = num_channels_cur_layer[i] if j == i-len(num_channels_pre_layer) else inchannels
conv3x3s.append(nn.Sequential(
nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
nn.BatchNorm2d(outchannels),
nn.ReLU(inplace=True)
))
transition_layers.append(nn.Sequential(*conv3x3s))
return nn.ModuleList(transition_layers)
最终输出层将最高分辨率的特征图转换为关键点热图:
python复制self.final_layer = nn.Conv2d(
in_channels=pre_stage_channels[0],
out_channels=cfg.MODEL.NUM_JOINTS,
kernel_size=extra.FINAL_CONV_KERNEL,
stride=1,
padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0
)
注意:输出热图的分辨率是输入图像的1/4,这是由初始stem网络的两步下采样决定的。
使用改进的Mean Squared Error作为损失函数,对难样本给予更高权重:
class KeypointLoss(nn.Module):
    """Heatmap MSE loss with optional per-joint visibility weighting.

    When ``use_target_weight`` is True, both predicted and ground-truth
    heatmaps are multiplied by ``target_weight`` (broadcast over the
    flattened spatial dimension) before the mean-squared error, so
    unlabeled or invisible joints contribute nothing to the loss.
    """

    def __init__(self, use_target_weight=True):
        super(KeypointLoss, self).__init__()
        self.criterion = nn.MSELoss(reduction='mean')
        self.use_target_weight = use_target_weight

    def forward(self, output, target, target_weight):
        n, k = output.size(0), output.size(1)
        # Flatten each joint heatmap to a vector: (N, K, H*W).
        pred = output.reshape((n, k, -1))
        gt = target.reshape((n, k, -1))
        if not self.use_target_weight:
            return self.criterion(pred, gt)
        # Mask per-joint contributions via broadcasting with target_weight.
        return self.criterion(pred * target_weight, gt * target_weight)
HRNet-W32的训练需要采用分阶段学习率策略:
| 训练阶段 | 学习率 | 数据增强 | 训练时长 | 验证精度(AP) |
|---|---|---|---|---|
| 初始阶段 | 1e-3 | 基础增强 | 20 epoch | ~60 |
| 中间阶段 | 1e-4 | 增强+ | 40 epoch | ~70 |
| 微调阶段 | 1e-5 | 完整增强 | 20 epoch | ~75 |
关键训练代码如下:
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.TRAIN.LR)
# Learning-rate scheduler: decays the LR by LR_FACTOR at each epoch
# listed in LR_STEP (implements the staged schedule described above).
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR)
for epoch in range(cfg.TRAIN.BEGIN_EPOCH, cfg.TRAIN.END_EPOCH):
    # Train for one epoch
    train_one_epoch(train_loader, model, criterion, optimizer)
    # Validate periodically
    if epoch % cfg.TRAIN.VAL_FREQ == 0:
        validate(val_loader, model, criterion)
    # Step the scheduler once per epoch, after the optimizer updates.
    lr_scheduler.step()
    # Save a checkpoint (model + optimizer state so training can resume)
    if epoch % cfg.TRAIN.CHECKPOINT_FREQ == 0:
        save_checkpoint({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, filename=f'checkpoint_{epoch}.pth.tar')
在COCO数据集上使用标准评估指标:
评估时可直接调用pycocotools提供的COCOeval接口,完整流程如下:
def evaluate(coco, coco_dt, annType='keypoints', res_file='results.json'):
    """Run COCO keypoint evaluation and return the summary statistics.

    Args:
        coco: ground-truth COCO object (pycocotools).
        coco_dt: COCO object whose ``loadRes`` is used to load detections.
        annType: evaluation type passed to COCOeval ('keypoints' here).
        res_file: path to the detections JSON file. Previously hard-coded
            to 'results.json'; now a parameter with the same default, so
            existing callers are unaffected.

    Returns:
        ``cocoEval.stats`` — the AP/AR summary vector printed by summarize().
    """
    # Evaluate over every ground-truth image id, in a stable order.
    imgIds = sorted(coco.getImgIds())
    dt = coco_dt.loadRes(res_file)
    cocoEval = COCOeval(coco, dt, annType)
    cocoEval.params.imgIds = imgIds
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
    return cocoEval.stats
通过以下方法可以显著提升HRNet的推理速度:
# FP16 inference example
model.half()  # convert model weights to half precision
for input, target in val_loader:
    # NOTE(review): assumes the model is already on the GPU — confirm.
    input = input.half().cuda()
    with torch.no_grad():  # no autograd bookkeeping during inference
        output = model(input)
针对显存不足的情况,可以采用以下策略:
# Gradient-checkpointing example: saves memory by not storing
# intermediate activations; they are recomputed during backward.
from torch.utils.checkpoint import checkpoint
def custom_forward(x):
    # Forward pass to be wrapped by the checkpoint
    return model(x)
output = checkpoint(custom_forward, input)
在不显著损失精度的情况下减小模型体积:
| 方法 | 压缩率 | 精度损失 | 实现难度 |
|---|---|---|---|
| 知识蒸馏 | 2-4x | <1% | 中等 |
| 量化(int8) | 4x | 1-2% | 简单 |
| 通道剪枝 | 2-3x | 2-3% | 困难 |
知识蒸馏示例代码:
python复制# 教师模型(原始HRNet-W32)
teacher_model = load_pretrained_hrnet()
# 学生模型(轻量版)
student_model = LiteHRNet()
# Distillation loss
def distillation_loss(student_output, teacher_output, temperature=3):
    """Soft-target KL distillation loss (Hinton et al., 2015).

    Softens both logit sets with ``temperature``, then computes
    KL(teacher || student) with 'batchmean' reduction. The result is
    scaled by T^2 so the soft-target gradients keep a magnitude
    comparable to a hard-label loss regardless of the temperature —
    the original version omitted this standard factor.

    Args:
        student_output: student logits, softmax taken over dim=1.
        teacher_output: teacher logits, same shape as the student's.
        temperature: softening temperature T (> 0).

    Returns:
        Scalar loss tensor.
    """
    # Local import keeps this snippet self-contained.
    from torch.nn import functional as F
    soft_target = F.softmax(teacher_output / temperature, dim=1)
    soft_student = F.log_softmax(student_output / temperature, dim=1)
    # kl_div expects log-probabilities as input and probabilities as target.
    return F.kl_div(soft_student, soft_target, reduction='batchmean') * (temperature ** 2)