1. Experiment Overview
2. Environment Setup
2.1 Hardware Environment
2.2 Software Environment
2.3 Installing the GPU Build of PyTorch
# Uninstall the CPU build
pip uninstall torch torchvision -y
# Install the GPU build (CUDA 12.8)
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
2.4 Verifying GPU Availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}") # True
print(f"CUDA version: {torch.version.cuda}") # 12.8
print(f"GPU name: {torch.cuda.get_device_name(0)}") # NVIDIA GeForce RTX 5090 Laptop GPU3. 数据集信息
3.1 CIFAR-10 Classes
The 10 classes (exposed as train_dataset.classes in the code below): airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck.
3.2 Data Preprocessing
# Training set: data augmentation + normalization
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),   # random crop
    transforms.RandomHorizontalFlip(),      # random horizontal flip
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
])
# Test set: normalization only
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
])
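The Normalize constants are the per-channel mean and standard deviation of the CIFAR-10 training set. A minimal sketch of how one might derive them (the exact digits can vary slightly with how the statistics are aggregated):
import torch
from torchvision import datasets, transforms

# Stack the raw, un-augmented training images into one tensor: (50000, 3, 32, 32)
raw_train = datasets.CIFAR10(root='./data', train=True, download=True,
                             transform=transforms.ToTensor())
images = torch.stack([img for img, _ in raw_train])

print(images.mean(dim=(0, 2, 3)))  # per-channel mean ~ (0.4914, 0.4822, 0.4465)
print(images.std(dim=(0, 2, 3)))   # per-channel std  ~ (0.2470, 0.2435, 0.2616)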
4. GPU Device Selection Logic
4.1 Device Auto-Detection Code
def get_device():
    """
    Auto-detect and select the best available compute device.
    Priority: CUDA > MPS > CPU
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        gpu_name = torch.cuda.get_device_name(0)
        gpu_count = torch.cuda.device_count()
        print(f"[GPU] Using CUDA device: {gpu_name}")
        print(f"[GPU] GPU count: {gpu_count}")
        print(f"[GPU] CUDA version: {torch.version.cuda}")
        # Report GPU memory usage
        mem_allocated = torch.cuda.memory_allocated(0) / 1024**2
        mem_reserved = torch.cuda.memory_reserved(0) / 1024**2
        print(f"[GPU] Memory allocated: {mem_allocated:.2f} MB, reserved: {mem_reserved:.2f} MB")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("[GPU] Using Apple MPS device")
    else:
        device = torch.device("cpu")
        print("[CPU] Using CPU device")
    return device
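Typical usage: call it once at startup, then move the model (CIFAR10Net, defined in section 5 and in the full listing) and every batch onto the returned device:
device = get_device()
model = CIFAR10Net().to(device)  # moves all parameters and buffers to the device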
4.2 Device Selection Flow
Start
│
├─ CUDA available? ──yes──> use CUDA GPU
│
├─ no
│   └─ MPS available? ──yes──> use Apple MPS
│
└─ no
    └─ use CPU
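As an aside, the fallback branches can be exercised on a CUDA machine by hiding the GPUs before torch first touches CUDA; CUDA_VISIBLE_DEVICES is a standard CUDA environment variable:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all GPUs; set before CUDA is first used

import torch
print(torch.cuda.is_available())  # False -> get_device() falls back to MPS/CPU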
5. Model Architecture
5.1 CIFAR10Net Architecture
CIFAR10Net(
  # Conv block 1: 32x32 -> 16x16
  (conv1): Conv2d(3, 32, 3x3, padding=1) + BatchNorm + ReLU
  (conv2): Conv2d(32, 32, 3x3, padding=1) + BatchNorm + ReLU
  (pool1): MaxPool2d(2x2)
  # Conv block 2: 16x16 -> 8x8
  (conv3): Conv2d(32, 64, 3x3, padding=1) + BatchNorm + ReLU
  (conv4): Conv2d(64, 64, 3x3, padding=1) + BatchNorm + ReLU
  (pool2): MaxPool2d(2x2)
  # Conv block 3: 8x8 -> 4x4
  (conv5): Conv2d(64, 128, 3x3, padding=1) + BatchNorm + ReLU
  (conv6): Conv2d(128, 128, 3x3, padding=1) + BatchNorm + ReLU
  (pool3): MaxPool2d(2x2)
  # Fully connected layers
  (fc1): Linear(2048 -> 256)
  (dropout): Dropout(0.5)
  (fc2): Linear(256 -> 10)
)
5.2 Model Parameter Counts
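The total of 815,018 parameters quoted in section 8.1 can be reproduced directly from the model object; a short sketch (CIFAR10Net as defined in the full listing at the end):
model = CIFAR10Net()

# Per-layer breakdown plus the overall total
for name, p in model.named_parameters():
    print(f"{name:20s} {str(tuple(p.shape)):18s} -> {p.numel():,}")
print(f"Total: {sum(p.numel() for p in model.parameters()):,}")  # 815,018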
6. Training Configuration
- Epochs: 15; batch size: 128 (train) / 256 (eval)
- Loss: CrossEntropyLoss; optimizer: Adam (lr=0.001)
- LR schedule: StepLR (step_size=10, gamma=0.5), halving the learning rate every 10 epochs
(Values as configured in sections 4.4-4.5 of the full listing.)
7. Training Process
7.1 Training Log Summary
7.2 Key Metrics
Best test accuracy: 85.19% (epoch 12); final-epoch accuracy: 85.03% (see the trends in 7.3).
7.3 Training Curve Analysis
Accuracy trend:
Epoch  1: 60.74% ████████████████
Epoch  5: 74.45% ██████████████████████
Epoch  9: 82.52% ████████████████████████████
Epoch 12: 85.19% ███████████████████████████████
Epoch 15: 85.03% ███████████████████████████████
Loss trend:
Epoch  1: 1.0922 ████████████████████████████████
Epoch  5: 0.7603 ████████████████████████████
Epoch  9: 0.5037 ███████████████
Epoch 12: 0.4479 ██████████████
Epoch 15: 0.4547 ██████████████
8. GPU Speedup Analysis
8.1 Performance Comparison Test
# Test conditions: same model, same data, 10 iterations
# Model: CIFAR10Net (815,018 parameters)
# Batch size: 256 samples
CPU training time: 1.28 s
GPU training time: 0.15 s
Speedup: 8.41x
(Measured with the benchmark_device() function from the full listing.)
8.2 Comparison Results
8.3 GPU Memory Usage
8.4 Speedup Analysis
- Speedup: the RTX 5090 delivered an 8.41x speedup over the CPU
- Applicability: GPU acceleration pays off most for large-scale neural network training
- Memory efficiency: only about 191 MB of the 24 GB of VRAM was used, so the model could be scaled up considerably
- Parallelism: the matrix operations at the core of deep learning map naturally onto the GPU's parallel architecture
9. Core GPU Training Code
9.1 Moving Data to the GPU
# During training
data, target = data.to(device), target.to(device)
# During inference
output = model(data.to(device))
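Because the DataLoaders here use pin_memory=True, these copies can additionally be issued asynchronously with non_blocking=True, letting the transfer overlap with GPU compute; a small variant of the lines above:
# Requires the source tensors to live in pinned (page-locked) host memory,
# which is exactly what pin_memory=True in the DataLoader provides.
data = data.to(device, non_blocking=True)
target = target.to(device, non_blocking=True)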
9.2 Full Training Loop
for epoch in range(1, epochs + 1):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # 1. Move the batch to the GPU
        data, target = data.to(device), target.to(device)
        # 2. Zero the gradients
        optimizer.zero_grad()
        # 3. Forward pass (computed on the GPU)
        output = model(data)
        # 4. Compute the loss
        loss = criterion(output, target)
        # 5. Backward pass (computed on the GPU)
        loss.backward()
        # 6. Update the parameters
        optimizer.step()
9.3 GPU Synchronization
# CUDA kernels launch asynchronously, so accurate host-side timing
# requires synchronizing with the GPU before reading the clock.
torch.cuda.synchronize()
start = time.time()
# ... GPU operations ...
torch.cuda.synchronize()
end = time.time()
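An alternative to bracketing the region with two synchronize() calls is timing with CUDA events, which are recorded on the GPU stream itself; a minimal sketch:
start_evt = torch.cuda.Event(enable_timing=True)
end_evt = torch.cuda.Event(enable_timing=True)

start_evt.record()
# ... GPU operations ...
end_evt.record()
torch.cuda.synchronize()  # wait until end_evt has actually been reached
print(f"{start_evt.elapsed_time(end_evt):.2f} ms")  # elapsed GPU time in milliseconds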
10. Caveats
10.1 Windows
# Recommended settings on Windows
num_workers = 0    # avoid multiprocessing issues with DataLoader workers
pin_memory = True  # speed up host-to-GPU data transfers
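If num_workers > 0 is wanted on Windows anyway, the entry point must be guarded, because Windows spawns worker processes by re-importing the script; a hedged sketch (train_dataset as in the full listing):
from torch.utils.data import DataLoader

if __name__ == "__main__":  # required on Windows when num_workers > 0
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True,
                              num_workers=2, pin_memory=True)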
10.2 PyTorch GPU Configuration
- torch.cuda.is_available() - check whether CUDA is usable
- torch.cuda.device_count() - number of visible GPUs
- torch.cuda.get_device_name(0) - name of the first GPU
- torch.cuda.memory_allocated() - VRAM currently allocated by tensors
- torch.cuda.synchronize() - block until all pending GPU work has finished
11. Suggested Follow-Up Experiments
- Deeper networks: try more complex architectures such as ResNet or VGG
- Data augmentation: add random rotation, color jitter, etc.
- Learning-rate schedules: try CosineAnnealing or ReduceLROnPlateau
- Regularization: tune the Dropout rate or add L2 weight decay
- Batch normalization: build a deeper understanding of what BatchNorm does
- Mixed-precision training: use FP16 for a further speedup (see the sketch after this list)
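For the mixed-precision suggestion, here is a minimal sketch of the section 9.2 training step rewritten with torch.cuda.amp (autocast + GradScaler); model, optimizer, criterion, and train_loader are assumed to be set up as in the full listing:
scaler = torch.cuda.amp.GradScaler()

for data, target in train_loader:
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():    # run the forward pass in FP16 where safe
        output = model(data)
        loss = criterion(output, target)
    scaler.scale(loss).backward()      # scale the loss to avoid FP16 underflow
    scaler.step(optimizer)             # unscales gradients, then steps
    scaler.update()                    # adjust the scale factor for the next step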
12. Conclusion
This experiment completed the CIFAR-10 image classification task with GPU-accelerated training. Through it, a beginner can learn:
- GPU device selection and configuration
- the basics of PyTorch CUDA programming
- designing and training a convolutional neural network
- GPU performance profiling and optimization
13. Output Files
- training_log_cifar10_<timestamp>.txt - training log (written by setup_logger)
- best_cifar10_model_<timestamp>.pth - weights of the best-accuracy model
- ./data/ - the downloaded CIFAR-10 dataset
Full Code
"""
CIFAR-10图像分类器 - GPU深度学习实验
基于PyTorch实现,适合初学者学习GPU加速
数据集: CIFAR-10 (10类彩色图像: 飞机、汽车、鸟、猫、鹿、狗、青蛙、马、船、卡车)
"""
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import logging
from datetime import datetime
import time
# ============================================================
# 1. GPU device configuration and selection logic
# ============================================================
def get_device():
    """
    Auto-detect and select the best available compute device.
    Priority: CUDA > MPS > CPU
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        gpu_name = torch.cuda.get_device_name(0)
        gpu_count = torch.cuda.device_count()
        print(f"[GPU] Using CUDA device: {gpu_name}")
        print(f"[GPU] GPU count: {gpu_count}")
        print(f"[GPU] CUDA version: {torch.version.cuda}")
        # Report GPU memory usage
        mem_allocated = torch.cuda.memory_allocated(0) / 1024**2
        mem_reserved = torch.cuda.memory_reserved(0) / 1024**2
        print(f"[GPU] Memory allocated: {mem_allocated:.2f} MB, reserved: {mem_reserved:.2f} MB")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("[GPU] Using Apple MPS device")
    else:
        device = torch.device("cpu")
        print("[CPU] Using CPU device")
    return device
def setup_logger(log_file):
    """Configure logging to both a file and the console."""
    logger = logging.getLogger("CIFAR10_Training")
    logger.setLevel(logging.INFO)
    logger.handlers.clear()
    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
    file_handler.setFormatter(formatter)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    return logger
# ============================================================
# 2. CNN model definition (tuned for CIFAR-10)
# ============================================================
class CIFAR10Net(nn.Module):
    """
    A convolutional network for CIFAR-10.
    Structure: three [Conv-BN-ReLU x2 + Pool] blocks, then FC + Dropout + FC.
    Input: 3x32x32 color images
    Output: 10 class logits
    """
    def __init__(self):
        super(CIFAR10Net, self).__init__()
        # Conv block 1
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2, 2)  # 32x32 -> 16x16
        # Conv block 2
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2, 2)  # 16x16 -> 8x8
        # Conv block 3
        self.conv5 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(128)
        self.conv6 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn6 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2, 2)  # 8x8 -> 4x4
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def _conv_block(self, x, conv1, bn1, conv2, bn2, pool):
        """Conv block: Conv -> BN -> ReLU -> Conv -> BN -> ReLU -> Pool"""
        x = self.relu(bn1(conv1(x)))
        x = self.relu(bn2(conv2(x)))
        return pool(x)

    def forward(self, x):
        # Three conv blocks
        x = self._conv_block(x, self.conv1, self.bn1, self.conv2, self.bn2, self.pool1)
        x = self._conv_block(x, self.conv3, self.bn3, self.conv4, self.bn4, self.pool2)
        x = self._conv_block(x, self.conv5, self.bn5, self.conv6, self.bn6, self.pool3)
        # Fully connected layers
        x = x.view(-1, 128 * 4 * 4)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)
# ============================================================
# 3. Training and evaluation functions
# ============================================================
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Train for one epoch; return (mean loss, accuracy %)."""
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for batch_idx, (data, target) in enumerate(loader):
        # --- move the batch to the GPU ---
        data, target = data.to(device), target.to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass (on the GPU)
        output = model(data)
        # Compute the loss
        loss = criterion(output, target)
        # Backward pass (on the GPU)
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Statistics
        running_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()
    return running_loss / len(loader), 100. * correct / total

def evaluate(model, loader, criterion, device):
    """Evaluate the model; return (mean loss, accuracy %)."""
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in loader:
            # --- GPU inference ---
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
    return test_loss / len(loader), 100. * correct / total
# ============================================================
# 4. Main training pipeline
# ============================================================
def main():
    # Set up logging
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"training_log_cifar10_{timestamp}.txt"
    logger = setup_logger(log_file)
    logger.info("=" * 70)
    logger.info("CIFAR-10 deep learning experiment - GPU-accelerated training")
    logger.info("=" * 70)

    # --------------------------------------------------
    # 4.1 GPU device selection
    # --------------------------------------------------
    device = get_device()
    logger.info(f"Selected device: {device}")
    # --------------------------------------------------
    # 4.2 Data loading
    # --------------------------------------------------
    logger.info("Loading the CIFAR-10 dataset...")
    # Data augmentation + normalization
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
    ])
    # Load the datasets
    train_dataset = datasets.CIFAR10(
        root='./data',
        train=True,
        download=True,
        transform=train_transform
    )
    test_dataset = datasets.CIFAR10(
        root='./data',
        train=False,
        download=True,
        transform=test_transform
    )
    logger.info(f"Training samples: {len(train_dataset)}")
    logger.info(f"Test samples: {len(test_dataset)}")
    logger.info(f"Classes: {', '.join(train_dataset.classes)}")
    # DataLoaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=128,
        shuffle=True,
        num_workers=0,
        pin_memory=True  # speeds up host-to-GPU transfers
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=256,
        shuffle=False,
        num_workers=0,
        pin_memory=True
    )
    # --------------------------------------------------
    # 4.3 Model initialization
    # --------------------------------------------------
    model = CIFAR10Net().to(device)
    logger.info(f"\nModel structure:\n{model}")
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"\nTotal parameters: {total_params:,}")
    logger.info(f"Trainable parameters: {trainable_params:,}")

    # --------------------------------------------------
    # 4.4 Loss function and optimizer
    # --------------------------------------------------
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    # --------------------------------------------------
    # 4.5 Timed GPU training
    # --------------------------------------------------
    logger.info("\n" + "=" * 70)
    logger.info("Starting GPU training")
    logger.info("=" * 70)
    epochs = 15
    best_acc = 0.0
    model_path = None
    for epoch in range(1, epochs + 1):
        epoch_start = time.time()
        # Train
        train_loss, train_acc = train_one_epoch(
            model, train_loader, criterion, optimizer, device
        )
        # Update the learning rate
        scheduler.step()
        # Evaluate
        test_loss, test_acc = evaluate(model, test_loader, criterion, device)
        epoch_time = time.time() - epoch_start
        # Log the epoch
        logger.info(
            f"Epoch [{epoch:2d}/{epochs}] | "
            f"Loss: {train_loss:.4f}/{test_loss:.4f} | "
            f"Acc: {train_acc:.2f}%/{test_acc:.2f}% | "
            f"Time: {epoch_time:.2f}s | "
            f"LR: {optimizer.param_groups[0]['lr']:.6f}"
        )
        # Save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            model_path = f"best_cifar10_model_{timestamp}.pth"
            torch.save(model.state_dict(), model_path)
            logger.info(f" -> New best accuracy: {best_acc:.2f}%, model saved")
    # --------------------------------------------------
    # 4.6 Final evaluation
    # --------------------------------------------------
    logger.info("\n" + "=" * 70)
    logger.info("Training finished - final evaluation")
    logger.info("=" * 70)
    final_loss, final_acc = evaluate(model, test_loader, criterion, device)
    logger.info(f"Test loss: {final_loss:.4f}")
    logger.info(f"Test accuracy: {final_acc:.2f}%")
    logger.info(f"Best accuracy: {best_acc:.2f}%")

    # --------------------------------------------------
    # 4.7 GPU memory statistics
    # --------------------------------------------------
    if torch.cuda.is_available():
        logger.info(f"\nGPU memory statistics:")
        logger.info(f" - allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
        logger.info(f" - reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")
        logger.info(f" - peak allocated: {torch.cuda.max_memory_allocated(0) / 1024**2:.2f} MB")
    logger.info(f"\nModel saved to: {model_path}")
    logger.info("\n" + "=" * 70)
    logger.info("Done!")
    logger.info("=" * 70)
# ============================================================
# 5. Standalone performance comparison
# ============================================================
def benchmark_device():
    """Measure the CPU vs GPU performance gap."""
    print("\n" + "=" * 70)
    print("GPU vs CPU performance comparison")
    print("=" * 70)
    device_cpu = torch.device("cpu")
    device_gpu = torch.device("cuda") if torch.cuda.is_available() else None
    # Build a test model and synthetic data
    model = CIFAR10Net()
    test_input = torch.randn(256, 3, 32, 32)
    test_target = torch.randint(0, 10, (256,))
    # CPU benchmark
    # (note: nn.Module.to() moves a module in place, so model, model_cpu,
    # and model_gpu below all refer to the same network object)
    model_cpu = model.to(device_cpu)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_cpu.parameters(), lr=0.001)
    start = time.time()
    for _ in range(10):
        optimizer.zero_grad()
        output = model_cpu(test_input)
        loss = criterion(output, test_target)
        loss.backward()
        optimizer.step()
    cpu_time = time.time() - start
    print(f"[CPU] Training time: {cpu_time:.2f}s")
    # GPU benchmark
    if device_gpu:
        model_gpu = model.to(device_gpu)
        test_input_gpu = test_input.to(device_gpu)
        test_target_gpu = test_target.to(device_gpu)
        optimizer_gpu = optim.Adam(model_gpu.parameters(), lr=0.001)
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(10):
            optimizer_gpu.zero_grad()
            output = model_gpu(test_input_gpu)
            loss = criterion(output, test_target_gpu)
            loss.backward()
            optimizer_gpu.step()
        torch.cuda.synchronize()
        gpu_time = time.time() - start
        print(f"[GPU] Training time: {gpu_time:.2f}s")
        print(f"[Speedup] GPU is {cpu_time/gpu_time:.2f}x faster than CPU")
if __name__ == "__main__":
    main()
    benchmark_device()
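Assuming the listing is saved as, say, cifar10_gpu.py (the filename is hypothetical), a run looks like:
python cifar10_gpu.py
# -> trains for 15 epochs, writes training_log_cifar10_<timestamp>.txt,
#    saves best_cifar10_model_<timestamp>.pth, then runs benchmark_device()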