从零实现PyTorch核心：张量、GPU与自动微分-代码聚汇网

从零实现PyTorch核心：张量、GPU与自动微分

投研帮

1. 从零开始构建支持GPU和自动求导的PyTorch框架

作为一名长期使用PyTorch的开发者，我一直对它的内部工作机制充满好奇。当我们调用loss.backward()时，梯度究竟是如何计算的？张量在GPU上的运算又是如何实现的？为了彻底理解这些机制，我决定从零开始构建一个简化版的PyTorch——我将其命名为Norch（NOT PyTorch的缩写，同时也包含我的姓氏Nogueira）。

1.1 为什么需要重新造轮子？

在深度学习领域，PyTorch和TensorFlow等框架已经非常成熟。但作为开发者，仅仅会使用这些框架是不够的。通过亲手实现核心功能，我们可以：

深入理解自动微分的工作原理
掌握GPU加速计算的底层机制
学习高效张量运算的实现方式
为定制化需求打下基础

这个项目将使用C/C++实现核心计算逻辑，通过Python提供友好接口，并支持CUDA进行GPU加速。让我们从最基础的张量表示开始。

2. 张量的本质与实现

2.1 张量的数据结构

张量(Tensor)是深度学习中的基本数据结构，可以看作是多维数组。在实现上，一个张量包含以下几个关键部分：

数据存储：实际存储数值的一维数组
形状(shape)：描述张量每个维度的大小
步幅(stride)：描述如何在内存中访问多维数据
设备(device)：标识数据存储在CPU还是GPU上

c复制typedef struct {
    float* data;      // 数据指针
    int* strides;     // 步幅数组
    int* shape;       // 形状数组
    int ndim;         // 维度数量
    int size;         // 元素总数
    char* device;     // 设备标识("cpu"或"cuda")
} Tensor;

2.2 步幅(stride)的奥秘

步幅是理解张量操作效率的关键。考虑一个形状为[4,8]的二维张量：

[[0, 1, 2, 3, 4, 5, 6, 7],
 [8, 9,10,11,12,13,14,15],
 [16,17,18,19,20,21,22,23],
 [24,25,26,27,28,29,30,31]]

实际上，这些数据在内存中是连续存储的一维数组：

[0,1,2,3,4,5,6,7,8,9,...,31]

要访问位置[i,j]的元素，我们需要计算偏移量：i * stride[0] + j * stride[1]。对于这个例子，步幅是[8,1]，因为每行有8个元素。

2.3 张量操作的实现

基于步幅的概念，我们可以高效实现各种张量操作：

2.3.1 创建张量

// Build a CPU tensor over `data` with the given shape.
//
//   data  - flat element buffer; the pointer is stored as-is (not copied)
//   shape - `ndim` extents; copied, so the tensor never aliases the caller's array
//   ndim  - number of dimensions (>= 0)
//
// Returns a heap-allocated Tensor with row-major strides and device "cpu",
// or NULL when an argument is invalid or an allocation fails.
Tensor* create_tensor(float* data, int* shape, int ndim) {
    if (ndim < 0 || (ndim > 0 && shape == NULL)) {
        return NULL;
    }

    Tensor* tensor = (Tensor*)malloc(sizeof(Tensor));
    if (tensor == NULL) {
        return NULL;
    }

    // malloc(0) may legally return NULL, so always request at least one slot.
    size_t slots = ndim > 0 ? (size_t)ndim : 1;
    tensor->shape = (int*)malloc(slots * sizeof(int));
    tensor->strides = (int*)malloc(slots * sizeof(int));
    if (tensor->shape == NULL || tensor->strides == NULL) {
        free(tensor->shape);
        free(tensor->strides);
        free(tensor);
        return NULL;
    }

    tensor->data = data;
    tensor->ndim = ndim;

    // Walk from the innermost dimension outwards: the innermost stride is 1 and
    // each outer stride is the product of all extents to its right.
    int stride = 1;
    for (int i = ndim - 1; i >= 0; i--) {
        tensor->shape[i] = shape[i];
        tensor->strides[i] = stride;
        stride *= shape[i];
    }

    // After the loop the running product equals the total element count
    // (1 for a zero-dimensional tensor).
    tensor->size = stride;

    tensor->device = "cpu";
    return tensor;
}

2.3.2 元素访问

#include <stdio.h>
#include <stdlib.h>

// Read the element at the given multi-dimensional position.
//
//   tensor  - tensor to read; its data pointer is dereferenced on the host
//   indices - one index per dimension (tensor->ndim entries)
//
// Returns the element at offset sum(indices[i] * strides[i]).
// Prints an error and exits if any index lies outside [0, shape[i]).
float get_item(Tensor* tensor, int* indices) {
    int offset = 0;
    for (int dim = 0; dim < tensor->ndim; dim++) {
        int idx = indices[dim];
        if (idx < 0 || idx >= tensor->shape[dim]) {
            fprintf(stderr, "索引越界: 维度 %d 的索引 %d 超出范围 [0, %d)\n",
                    dim, idx, tensor->shape[dim]);
            exit(1);
        }
        offset += idx * tensor->strides[dim];
    }
    return tensor->data[offset];
}

2.3.3 张量加法

#include <string.h>

// Element-wise sum of two tensors of identical shape.
//
// Both operands must have the same ndim, the same shape and live on the same
// device; any mismatch prints an error and exits.
//
// Returns a new tensor built by create_tensor over a freshly malloc'ed host
// buffer: both the CPU and the CUDA path write the sum into that host buffer.
Tensor* add_tensor(Tensor* tensor1, Tensor* tensor2) {
    if (tensor1->ndim != tensor2->ndim) {
        fprintf(stderr, "维度不匹配\n");
        exit(1);
    }

    for (int i = 0; i < tensor1->ndim; i++) {
        if (tensor1->shape[i] != tensor2->shape[i]) {
            fprintf(stderr, "形状不匹配\n");
            exit(1);
        }
    }

    // Compare device names by content: `device == "cuda"` would compare
    // pointer addresses and is not guaranteed to match even for equal strings.
    if (strcmp(tensor1->device, tensor2->device) != 0) {
        fprintf(stderr, "设备不匹配\n");
        exit(1);
    }

    float* result_data = (float*)malloc((size_t)tensor1->size * sizeof(float));
    if (result_data == NULL && tensor1->size > 0) {
        fprintf(stderr, "内存分配失败\n");
        exit(1);
    }

    if (strcmp(tensor1->device, "cuda") == 0) {
        add_tensor_cuda(tensor1, tensor2, result_data);
    } else {
        add_tensor_cpu(tensor1, tensor2, result_data);
    }

    return create_tensor(result_data, tensor1->shape, tensor1->ndim);
}

2.4 Python接口实现

为了让库更易用，我们使用ctypes提供Python接口：

import ctypes
import os

import numpy as np

class CTensor(ctypes.Structure):
    """ctypes mirror of the C ``Tensor`` struct.

    Field order and types must match the C declaration, because ctypes lays
    the structure out in exactly this order.
    """

    _fields_ = list(zip(
        ('data', 'strides', 'shape', 'ndim', 'size', 'device'),
        (
            ctypes.POINTER(ctypes.c_float),  # flat element buffer
            ctypes.POINTER(ctypes.c_int),    # per-dimension strides
            ctypes.POINTER(ctypes.c_int),    # per-dimension extents
            ctypes.c_int,                    # number of dimensions
            ctypes.c_int,                    # total element count
            ctypes.c_char_p,                 # device name
        ),
    ))

class Tensor:
    """Python handle around a C tensor created by ``libnorch.so``.

    Attributes:
        data: flat list of floats in row-major order.
        shape: list of dimension sizes.
        device: "cpu" or "cuda".
        ctensor: ctypes pointer to the C-side tensor.
    """

    # Loaded on first use, so importing this module does not require the
    # shared library to be present yet.
    _lib = None

    @classmethod
    def _load_lib(cls):
        """Load ``libnorch.so`` once, declare the C signatures, cache it on the class."""
        if cls._lib is None:
            lib = ctypes.CDLL(os.path.join(os.path.dirname(__file__), "libnorch.so"))

            # Matches the C prototype: create_tensor(float* data, int* shape, int ndim)
            lib.create_tensor.argtypes = [
                ctypes.POINTER(ctypes.c_float),
                ctypes.POINTER(ctypes.c_int),
                ctypes.c_int
            ]
            lib.create_tensor.restype = ctypes.POINTER(CTensor)

            lib.add_tensor.argtypes = [
                ctypes.POINTER(CTensor),
                ctypes.POINTER(CTensor)
            ]
            lib.add_tensor.restype = ctypes.POINTER(CTensor)

            Tensor._lib = lib
        return cls._lib

    def __init__(self, data, device="cpu"):
        """Create a tensor from a (possibly nested) list of numbers.

        Args:
            data: nested list/tuple of numbers with a regular (non-ragged) shape.
            device: "cpu" or "cuda".

        Raises:
            ValueError: if ``device`` is unknown or ``data`` is ragged.
            MemoryError: if the C library fails to allocate the tensor.
        """
        if device not in ["cpu", "cuda"]:
            raise ValueError("设备必须是'cpu'或'cuda'")

        self.data, self.shape = self._flatten(data)
        self.device = device

        # Keep the ctypes buffers referenced from the instance: the C tensor
        # stores the data pointer without copying, so letting these arrays be
        # garbage-collected would leave it pointing at freed memory.
        self._data_buf = (ctypes.c_float * len(self.data))(*self.data)
        self._shape_buf = (ctypes.c_int * len(self.shape))(*self.shape)

        lib = self._load_lib()
        self.ctensor = lib.create_tensor(
            self._data_buf,
            self._shape_buf,
            len(self.shape)
        )
        if not self.ctensor:
            raise MemoryError("create_tensor returned NULL")

        if device != "cpu":
            # create_tensor always builds a CPU tensor, so move it afterwards
            # through the same entry point that `to()` uses.
            # NOTE(review): cpu_to_cuda frees the host buffer with free(); this
            # buffer is owned by ctypes — confirm ownership on the C side.
            lib.to_device.argtypes = [
                ctypes.POINTER(CTensor),
                ctypes.c_char_p
            ]
            lib.to_device.restype = None
            lib.to_device(self.ctensor, device.encode('utf-8'))

    def _flatten(self, nested_list):
        """Flatten a nested list into ``(flat_floats, shape)``.

        A bare number yields ``([value], [])``. Every element of a list must
        have the same shape as its siblings.

        Raises:
            ValueError: if the nesting is ragged.
        """
        if not isinstance(nested_list, (list, tuple)):
            return [float(nested_list)], []

        flat = []
        inner_shape = None
        for item in nested_list:
            item_flat, item_shape = self._flatten(item)
            if inner_shape is None:
                inner_shape = item_shape
            elif item_shape != inner_shape:
                raise ValueError("嵌套列表的形状不一致")
            flat.extend(item_flat)

        return flat, [len(nested_list)] + (inner_shape or [])

    def __add__(self, other):
        """Element-wise addition, computed by the C library.

        Returns a new ``Tensor`` wrapping the C result; it carries ``ctensor``,
        ``shape`` and ``device`` only.
        """
        if not isinstance(other, Tensor):
            return NotImplemented

        lib = self._load_lib()
        result_ctensor = lib.add_tensor(self.ctensor, other.ctensor)

        # Bypass __init__: the C side already built the tensor.
        result = Tensor.__new__(Tensor)
        result.ctensor = result_ctensor
        result.shape = self.shape.copy()
        result.device = self.device
        return result

3. GPU加速实现

3.1 CUDA基础

GPU加速的核心思想是利用并行计算。与CPU顺序执行不同，GPU可以同时启动数千个线程并行执行相同指令。CUDA是NVIDIA提供的GPU计算平台，主要概念包括：

核函数(Kernel)：在GPU上执行的函数
线程层次：线程(Thread)→线程块(Block)→网格(Grid)
内存模型：全局内存、共享内存、寄存器等

3.2 张量的GPU实现

3.2.1 数据传输

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Move a tensor's data from host memory to GPU memory, in place.
//
// Does nothing if the tensor is already marked "cuda". On any CUDA failure
// the error is printed and the process exits; the host buffer is released
// only after the copy has succeeded.
void cpu_to_cuda(Tensor* tensor) {
    if (strcmp(tensor->device, "cuda") == 0) {
        return;
    }

    size_t bytes = (size_t)tensor->size * sizeof(float);
    float* gpu_data = NULL;

    cudaError_t err = cudaMalloc((void**)&gpu_data, bytes);
    if (err == cudaSuccess) {
        err = cudaMemcpy(gpu_data, tensor->data, bytes, cudaMemcpyHostToDevice);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaFree(gpu_data);
        exit(1);
    }

    // NOTE(review): assumes the host buffer came from malloc; buffers handed
    // in from Python through ctypes do not — confirm ownership.
    free(tensor->data);
    tensor->data = gpu_data;
    tensor->device = "cuda";
}

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Move a tensor's data from GPU memory back to host memory, in place.
//
// Does nothing if the tensor is already marked "cpu". On allocation or CUDA
// failure the error is printed and the process exits; the device buffer is
// released only after the copy has succeeded.
void cuda_to_cpu(Tensor* tensor) {
    if (strcmp(tensor->device, "cpu") == 0) {
        return;
    }

    size_t bytes = (size_t)tensor->size * sizeof(float);
    float* cpu_data = (float*)malloc(bytes);
    if (cpu_data == NULL && bytes > 0) {
        fprintf(stderr, "内存分配失败\n");
        exit(1);
    }

    cudaError_t err = cudaMemcpy(cpu_data, tensor->data, bytes,
                                 cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        free(cpu_data);
        exit(1);
    }

    cudaFree(tensor->data);
    tensor->data = cpu_data;
    tensor->device = "cpu";
}

3.2.2 GPU加法核函数

// Element-wise addition kernel: c[i] = a[i] + b[i] for i in [0, size).
//
// Each thread handles the single element at its global index. The bounds
// check is needed because the launch rounds the thread count up to a whole
// number of blocks, so trailing threads may fall past the end.
__global__ void add_kernel(const float* a, const float* b, float* c, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        c[idx] = a[idx] + b[idx];
    }
}

#include <stdio.h>
#include <stdlib.h>

// Add two GPU tensors element-wise and write the sum into a host buffer.
//
//   a, b   - operands whose data pointers are passed to the kernel as-is;
//            a->size elements are read from each
//   result - host buffer with room for a->size floats
//
// Prints the CUDA error and exits if allocation, launch or copy-back fails.
void add_tensor_cuda(Tensor* a, Tensor* b, float* result) {
    // Launching a grid with zero blocks is an invalid configuration.
    if (a->size <= 0) {
        return;
    }

    size_t bytes = (size_t)a->size * sizeof(float);
    float* d_result = NULL;

    cudaError_t err = cudaMalloc((void**)&d_result, bytes);
    if (err == cudaSuccess) {
        int threads_per_block = 256;
        // Ceiling division so every element is covered by some thread.
        int blocks_per_grid = (a->size + threads_per_block - 1) / threads_per_block;

        add_kernel<<<blocks_per_grid, threads_per_block>>>(a->data, b->data, d_result, a->size);

        // A kernel launch returns no status; launch errors are read back here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(result, d_result, bytes, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_result);

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        exit(1);
    }
}

3.3 Python接口

def to(self, device):
    """Move the tensor to ``device`` in place and return ``self``.

    Args:
        device: target device, "cpu" or "cuda".

    Returns:
        ``self``, so calls can be chained. Nothing is done when the tensor is
        already on the requested device.

    Raises:
        ValueError: if ``device`` is neither "cpu" nor "cuda".
    """
    if device not in ["cpu", "cuda"]:
        raise ValueError("设备必须是'cpu'或'cuda'")

    if self.device == device:
        return self

    to_device = self._lib.to_device
    to_device.argtypes = [
        ctypes.POINTER(CTensor),
        ctypes.c_char_p
    ]
    # The return value is never used, so skip ctypes' default int conversion.
    to_device.restype = None
    to_device(self.ctensor, device.encode('utf-8'))

    self.device = device
    return self

4. 自动微分实现

4.1 计算图与反向传播

自动微分(Autograd)是PyTorch的核心特性。其基本原理是：

记录所有操作，构建计算图
反向传播时，按照链式法则计算梯度
每个操作需要实现其反向传播规则

4.2 基本操作的反向传播

4.2.1 加法反向传播

对于z = x + y，梯度计算为：
∂L/∂x = ∂L/∂z * ∂z/∂x = ∂L/∂z * 1
∂L/∂y = ∂L/∂z * ∂z/∂y = ∂L/∂z * 1

class AddBackward:
    """Backward rule for ``z = x + y``.

    Since dz/dx = dz/dy = 1, the upstream gradient reaches both inputs unchanged.
    """

    def __init__(self, x, y):
        # Operands in call order; backward() returns gradients in this order.
        self.inputs = [x, y]

    def backward(self, grad_output):
        """Return ``[dL/dx, dL/dy]``; both entries are ``grad_output`` itself."""
        return [grad_output, grad_output]

4.2.2 乘法反向传播

对于z = x * y，梯度计算为：
∂L/∂x = ∂L/∂z * y
∂L/∂y = ∂L/∂z * x

class MulBackward:
    """Backward rule for ``z = x * y``: dL/dx = dL/dz * y and dL/dy = dL/dz * x."""

    def __init__(self, x, y):
        # Operands in call order; backward() returns gradients in this order.
        self.inputs = [x, y]

    def backward(self, grad_output):
        """Return ``[grad_output * y, grad_output * x]``."""
        x, y = self.inputs
        return [grad_output * y, grad_output * x]

4.2.3 Sigmoid反向传播

对于σ(x) = 1/(1+e^-x)，其导数为：
σ'(x) = σ(x)*(1-σ(x))

class SigmoidBackward:
    """Backward rule for ``y = sigmoid(x)``, using y' = y * (1 - y)."""

    def __init__(self, output, inputs=None):
        """
        Args:
            output: the forward result, kept so the derivative can be computed
                without re-evaluating the sigmoid.
            inputs: optional iterable holding the tensor the sigmoid was
                applied to. ``Tensor.backward`` pairs ``inputs`` with the list
                returned by ``backward``; it defaults to empty so that lookup
                never fails.
        """
        self.output = output
        self.inputs = list(inputs) if inputs is not None else []

    def backward(self, grad_output):
        """Return ``[grad_output * y * (1 - y)]``, where y is the stored output."""
        sigmoid = self.output
        return [grad_output * sigmoid * (1 - sigmoid)]

4.3 实现自动微分系统

4.3.1 张量扩展

class Tensor:
    """Tensor extended with the state needed for reverse-mode autodiff."""

    def __init__(self, data, requires_grad=False):
        # ...other initialization code elided...
        self.requires_grad = requires_grad
        self.grad = None      # accumulated gradient
        self.grad_fn = None   # backward rule of the operation that produced this tensor

    def backward(self, grad_output=None):
        """Accumulate gradients into this tensor and every tensor it depends on.

        Args:
            grad_output: gradient of the final objective with respect to this
                tensor. May be omitted only for a single-element tensor, in
                which case it defaults to 1.

        Raises:
            RuntimeError: if ``grad_output`` is omitted for a tensor with more
                than one element.

        Each path through the graph is walked separately, so a tensor that
        feeds several consumers is visited once per path.
        """
        if not self.requires_grad:
            return

        if grad_output is None:
            # Accept any single-element shape ([], [1], (1,), [1, 1], ...),
            # not only the literal list [1].
            if all(dim == 1 for dim in self.shape):
                grad_output = Tensor([1.0])
            else:
                raise RuntimeError("非标量张量需要指定grad_output")

        # Accumulate: the first contribution is stored directly, later ones are added.
        if self.grad is None:
            self.grad = grad_output
        else:
            self.grad = self.grad + grad_output

        if self.grad_fn is not None:
            # Gradients with respect to the inputs of the producing operation.
            input_grads = self.grad_fn.backward(grad_output)

            # Recurse into every input that tracks gradients.
            for input_tensor, input_grad in zip(self.grad_fn.inputs, input_grads):
                if input_tensor.requires_grad:
                    input_tensor.backward(input_grad)

4.3.2 操作重载

def __add__(self, other):
    """Element-wise sum of two tensors, recorded in the graph when needed.

    The result tracks gradients if either operand does; in that case its
    ``grad_fn`` is an ``AddBackward`` linking back to both operands.
    """
    result = Tensor(self.data + other.data)

    if self.requires_grad or other.requires_grad:
        result.requires_grad = True
        result.grad_fn = AddBackward(self, other)

    return result

def __mul__(self, other):
    """Element-wise product of two tensors, recorded in the graph when needed.

    The result tracks gradients if either operand does; in that case its
    ``grad_fn`` is a ``MulBackward`` linking back to both operands.
    """
    product = Tensor(self.data * other.data)

    # Neither operand tracks gradients: nothing to record.
    if not (self.requires_grad or other.requires_grad):
        return product

    product.requires_grad = True
    product.grad_fn = MulBackward(self, other)
    return product

def sigmoid(self):
    """Apply the logistic function 1 / (1 + e^-x) element-wise.

    When this tensor tracks gradients, the result does too and its
    ``grad_fn`` is a ``SigmoidBackward`` built from the forward result.
    """
    output = 1 / (1 + np.exp(-self.data))
    result = Tensor(output)

    if self.requires_grad:
        result.requires_grad = True
        grad_fn = SigmoidBackward(result)
        # Tensor.backward pairs grad_fn.inputs with the returned gradients, so
        # the source tensor must be listed or the gradient stops here.
        grad_fn.inputs = [self]
        result.grad_fn = grad_fn

    return result

5. 实战：训练简单模型

5.1 线性回归实现

class Linear:
    """Fully connected layer computing ``x @ weight + bias``."""

    def __init__(self, input_dim, output_dim):
        """
        Args:
            input_dim: number of input features.
            output_dim: number of output features.
        """
        # Small random weights (standard normal scaled by 0.1), zero bias.
        self.weight = Tensor(np.random.randn(input_dim, output_dim) * 0.1, requires_grad=True)
        self.bias = Tensor(np.zeros(output_dim), requires_grad=True)

    def __call__(self, x):
        """Apply the layer to ``x``."""
        return x @ self.weight + self.bias

def mse_loss(pred, target):
    """Mean squared error: the mean of the squared difference ``pred - target``."""
    residual = pred - target
    squared = residual ** 2
    return squared.mean()

# Prepare data: 100 samples with one feature each, drawn uniformly from [0, 1)
X = Tensor(np.random.rand(100, 1))
# Targets follow y = 3x + 2 plus Gaussian noise with standard deviation 0.1
y = 3 * X.data + 2 + np.random.randn(100, 1) * 0.1
y = Tensor(y)

# Build the model (one input feature, one output) and its optimizer
model = Linear(1, 1)
optimizer = SGD([model.weight, model.bias], lr=0.1)

# Training loop
for epoch in range(100):
    # Forward pass
    pred = model(X)
    loss = mse_loss(pred, y)
    
    # Backward pass
    loss.backward()
    
    # Update parameters
    optimizer.step()
    
    # Reset gradients so the next iteration does not add onto them
    optimizer.zero_grad()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.data}")

5.2 优化器实现

class SGD:
    """Plain stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, params, lr=0.01):
        """
        Args:
            params: iterable of parameters exposing ``data`` and ``grad``.
            lr: learning rate.
        """
        # Materialize once: step() and zero_grad() both iterate the parameters,
        # and a generator would be exhausted after the first pass.
        self.params = list(params)
        self.lr = lr

    def step(self):
        """Update every parameter that currently holds a gradient."""
        for param in self.params:
            if param.grad is not None:
                param.data -= self.lr * param.grad.data

    def zero_grad(self):
        """Drop all stored gradients."""
        for param in self.params:
            param.grad = None

6. 性能优化技巧

6.1 内存管理

内存池：预分配大块内存，避免频繁malloc/free
引用计数：自动管理张量生命周期
原地操作：如add_、mul_等减少内存分配

6.2 GPU优化

合并内存访问：确保线程访问连续内存
共享内存：用于线程块内的数据共享
异步操作：重叠计算和数据传输

6.3 自动微分优化

延迟计算：只在需要时计算梯度
梯度检查点：减少内存使用
符号微分：对特定操作进行优化

7. 常见问题与调试

7.1 张量形状不匹配

问题：操作时出现形状不匹配错误

解决：

检查输入张量的shape属性
使用reshape/view调整形状
注意广播规则

7.2 GPU内存不足

问题：CUDA out of memory

解决：

减小batch size
使用更小的模型
检查内存泄漏

7.3 梯度爆炸/消失

问题：训练不稳定

解决：

使用梯度裁剪
调整学习率
使用更好的权重初始化

8. 扩展功能

8.1 更多层类型

class ReLU:
    """Rectified linear unit: keeps positive entries and zeroes the rest."""

    def __call__(self, x):
        """Apply ReLU to ``x``; the result tracks gradients when ``x`` does."""
        mask = x.data > 0
        result = Tensor(x.data * mask, requires_grad=x.requires_grad)

        if x.requires_grad:
            grad_fn = ReLUBackward(result)
            # Tensor.backward pairs grad_fn.inputs with the returned gradients,
            # so the source tensor must be listed or the gradient stops here.
            grad_fn.inputs = [x]
            result.grad_fn = grad_fn

        return result

class ReLUBackward:
    """Backward rule for ReLU: gradients pass only where the output is positive."""

    def __init__(self, output, inputs=None):
        """
        Args:
            output: the forward result; its positive entries define the mask.
            inputs: optional iterable holding the tensor ReLU was applied to.
                ``Tensor.backward`` pairs ``inputs`` with the list returned by
                ``backward``; it defaults to empty so that lookup never fails.
        """
        self.mask = output.data > 0
        self.inputs = list(inputs) if inputs is not None else []

    def backward(self, grad_output):
        """Return ``[grad_output * mask]`` with the mask wrapped in a Tensor."""
        return [grad_output * Tensor(self.mask)]

8.2 更多优化器

class Adam:
    """Adam optimizer with bias-corrected first and second moment estimates."""

    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        """
        Args:
            params: iterable of parameters exposing ``data`` and ``grad``.
            lr: learning rate.
            beta1: decay rate of the first-moment (mean) estimate.
            beta2: decay rate of the second-moment (squared gradient) estimate.
            eps: added to the denominator to avoid division by zero.
        """
        # Materialize once: the parameters are iterated several times below and
        # on every step, which a generator would not survive.
        self.params = list(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.m = [np.zeros_like(p.data) for p in self.params]
        self.v = [np.zeros_like(p.data) for p in self.params]
        self.t = 0  # number of steps taken, used for bias correction

    def step(self):
        """Apply one Adam update to every parameter that holds a gradient."""
        self.t += 1
        for i, param in enumerate(self.params):
            if param.grad is not None:
                grad = param.grad.data
                self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
                self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)

                # Both moments start at zero; dividing by (1 - beta^t) removes
                # that bias during the first steps.
                m_hat = self.m[i] / (1 - self.beta1 ** self.t)
                v_hat = self.v[i] / (1 - self.beta2 ** self.t)

                param.data -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

    def zero_grad(self):
        """Drop all stored gradients."""
        for param in self.params:
            param.grad = None

9. 总结与展望

通过这个项目，我们实现了一个简化版的PyTorch，核心功能包括：

张量数据结构与基本操作
GPU加速支持
自动微分系统
简单模型训练

虽然功能上远不及PyTorch完善，但已经涵盖了深度学习框架的核心概念。这个练习让我对以下方面有了更深入的理解：

张量运算的底层实现
GPU并行计算的原理
自动微分的数学基础
计算图的构建与遍历

未来可能的改进方向包括：

实现更高效的内存管理
支持分布式训练
添加更多神经网络层类型
实现JIT编译优化

这个项目最宝贵的收获不是代码本身，而是对深度学习框架底层原理的深刻理解。建议每个希望深入理解PyTorch的开发者都尝试类似的实现练习。