Model Training and Inference

Below are common problems and solutions related to model training and inference.

Training

What should I do if I run out of GPU memory during training?

Running out of GPU memory is a common problem during training. Try the following approaches:

1. Reduce the batch size

# PyTorch example
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)  # reduced from 64 to 32
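
If you are unsure how far to reduce it, a small probing loop can find a workable batch size empirically. This is a minimal sketch, assuming model, dataset, and criterion are already defined and the model is on a CUDA device:

# hedged sketch: probe for the largest batch size that fits in GPU memory
import torch
from torch.utils.data import DataLoader

def find_max_batch_size(candidates=(128, 64, 32, 16, 8)):
    for bs in candidates:
        try:
            inputs, labels = next(iter(DataLoader(dataset, batch_size=bs)))
            loss = criterion(model(inputs.cuda()), labels.cuda())
            loss.backward()  # the backward pass is usually the memory peak
            return bs
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise
            torch.cuda.empty_cache()  # release the failed allocation, try smaller
    return None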

2. Use gradient accumulation

# PyTorch gradient accumulation example
accumulation_steps = 4  # accumulate gradients across 4 batches
optimizer.zero_grad()

for i, (inputs, labels) in enumerate(train_loader):
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss = loss / accumulation_steps  # scale the loss by the accumulation steps
    loss.backward()
    
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

3. Use mixed-precision training

# PyTorch automatic mixed precision (AMP)
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for inputs, labels in train_loader:
    optimizer.zero_grad()
    
    with autocast():  # run the forward pass in mixed precision
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

4. Use gradient checkpointing

# PyTorch gradient checkpointing example
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(1024, 2048)
        self.layer2 = nn.Linear(2048, 4096)
        self.layer3 = nn.Linear(4096, 10)
    
    def forward(self, x):
        x = self.layer1(x)
        # checkpoint layer2 to trade recomputation for memory
        x = checkpoint(self.layer2, x, use_reentrant=False)  # non-reentrant variant recommended on recent PyTorch
        x = self.layer3(x)
        return x

5. Optimize data loading

# PyTorch data-loading optimization
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,          # multi-process data loading
    pin_memory=True,        # pinned (page-locked) host memory for faster transfers
    prefetch_factor=2       # batches prefetched per worker
)

What should I do if training is slow?

1. Use multiple GPUs for parallel training

# PyTorch multi-GPU distributed training
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# initialize the process group
dist.init_process_group(backend='nccl')
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)

# wrap the model with DDP
model = model.to(local_rank)
model = DDP(model, device_ids=[local_rank])

# use a distributed sampler so each rank sees its own shard
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
train_loader = DataLoader(dataset, batch_size=32, sampler=train_sampler)
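
This script assumes it is launched with a distributed launcher such as torchrun (for example: torchrun --nproc_per_node=4 train.py), which sets the LOCAL_RANK environment variable read above. Also call train_sampler.set_epoch(epoch) at the start of every epoch so that shuffling differs between epochs.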

2. Optimize data loading

# PyTorch data-loading optimization
train_loader = DataLoader(
    dataset,
    batch_size=64,
    num_workers=8,          # more workers
    prefetch_factor=4,      # prefetch more batches per worker
    persistent_workers=True # keep workers alive between epochs
)

3. Use mixed precision to speed up training

# PyTorch automatic mixed precision
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for inputs, labels in train_loader:
    optimizer.zero_grad()
    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

4. Choose an appropriate GPU

Choose a GPU model that matches your model size. For details, see the GPU Selection Guide.

What should I do if training is interrupted?

1. Use checkpoints

# PyTorch: save and load checkpoints
# save a checkpoint
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, 'checkpoint.pth')

# load the checkpoint
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

2. Use nohup to keep training alive after disconnects

# run training in the background with nohup
nohup python train.py > train.log 2>&1 &

# follow the log
tail -f train.log

3. Use tmux

# create a tmux session
tmux new -s training

# run training inside tmux
python train.py

# detach from the session (Ctrl+B, then D)

# reattach to the session
tmux attach -t training

4. Implement automatic training resumption

# PyTorch: resume automatically from the latest checkpoint
import glob
import os

import torch

def load_latest_checkpoint(checkpoint_dir):
    checkpoints = glob.glob(os.path.join(checkpoint_dir, '*.pth'))
    if not checkpoints:
        return None, 0
    latest = max(checkpoints, key=os.path.getctime)
    checkpoint = torch.load(latest)
    return checkpoint['model_state_dict'], checkpoint['epoch']

# usage
model_state, start_epoch = load_latest_checkpoint('./checkpoints')
if model_state:
    model.load_state_dict(model_state)
    print(f'Resuming training from epoch {start_epoch}')

How do I choose the right GPU for training?

Choose based on model size:

  • Small models (<7B parameters): RTX 3090 (24GB), RTX 4090 (24GB)
  • Medium models (7B-30B): L40 (48GB), L40S (48GB), A100 40GB
  • Large models (30B+): A100 80GB, H100 SXM (80GB), H200-SXM-141G

For detailed guidance, see the GPU Selection Guide.
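
As a rough back-of-envelope check (a heuristic only, not exact sizing): FP16 inference needs about 2 bytes per parameter for the weights alone, while full training with Adam typically needs on the order of 16 bytes per parameter (weights, gradients, and optimizer states), before counting activations:

# rough VRAM estimate per model size (heuristic; real usage also includes activations)
def estimate_vram_gb(params_billion, mode='inference'):
    bytes_per_param = 2 if mode == 'inference' else 16  # FP16 weights vs. Adam training state
    return params_billion * 1e9 * bytes_per_param / 1024**3

print(f"7B inference: ~{estimate_vram_gb(7):.0f} GB")               # ~13 GB for weights alone
print(f"7B training:  ~{estimate_vram_gb(7, 'training'):.0f} GB")   # ~104 GB before activations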

What can I do about heavy communication overhead in multi-GPU training?

1. Tune the communication settings

# PyTorch: tune NCCL communication
import os
# environment variables
os.environ['NCCL_IB_DISABLE'] = '0'         # enable InfiniBand (0 = not disabled)
os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'   # pin the network interface
os.environ['NCCL_IB_HCA'] = 'mlx5_0'        # pin the InfiniBand device

# choose the NCCL backend at initialization
dist.init_process_group(backend='nccl')

2. Increase the batch size to synchronize less often

# a larger batch size means fewer gradient synchronizations per epoch
train_loader = DataLoader(dataset, batch_size=128, shuffle=True)

3. Use gradient accumulation to reduce communication

# same pattern as the gradient accumulation example above
accumulation_steps = 8  # more accumulation steps, fewer synchronizations
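
Note that a DDP-wrapped model synchronizes gradients on every backward() by default, so accumulation alone does not cut communication. Wrapping the intermediate steps in model.no_sync() defers the all-reduce to the final accumulation step. A minimal sketch, assuming model is already wrapped in DistributedDataParallel:

# skip the gradient all-reduce on intermediate accumulation steps
import contextlib

for i, (inputs, labels) in enumerate(train_loader):
    sync_now = (i + 1) % accumulation_steps == 0
    ctx = contextlib.nullcontext() if sync_now else model.no_sync()
    with ctx:
        loss = criterion(model(inputs), labels) / accumulation_steps
        loss.backward()  # the all-reduce only runs outside no_sync()
    if sync_now:
        optimizer.step()
        optimizer.zero_grad()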

Inference

What should I do if inference runs out of GPU memory?

1. Use model quantization

# PyTorch dynamic quantization
import torch

# dynamically quantize the model
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},  # quantize Linear layers
    dtype=torch.qint8
)

# run inference with the quantized model
with torch.no_grad():
    output = quantized_model(input_data)

2. Use vLLM for efficient inference

# efficient inference with vLLM
from vllm import LLM, SamplingParams

llm = LLM(model="your-model-name", quantization="awq")  # use AWQ quantization

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512
)

outputs = llm.generate(["Hello, how are you?"], sampling_params)

3. Use offloading

# offloading with Transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True  # CPU offloading
)

model = AutoModelForCausalLM.from_pretrained(
    "model-name",
    quantization_config=quantization_config,
    device_map="auto"
)

What should I do if inference is slow?

1. Use INT8 quantization

# PyTorch static quantization
import torch

# attach a quantization config and prepare the model for calibration
model.eval()
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)

# calibrate with representative data
with torch.no_grad():
    for data in calibration_data:
        model(data)

# convert to a quantized model
quantized_model = torch.quantization.convert(model, inplace=True)

2. Use TensorRT for acceleration

# TensorRT inference setup
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

# build a TensorRT engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

# load the ONNX model
with open("model.onnx", 'rb') as model_file:
    parser.parse(model_file.read())

# build a serialized engine (TensorRT 8+ API; build_cuda_engine was removed)
config = builder.create_builder_config()
engine = builder.build_serialized_network(network, config)

3. Batch inference to improve throughput

# PyTorch batched inference
def batch_inference(model, inputs, batch_size=8):
    outputs = []
    for i in range(0, len(inputs), batch_size):
        batch = inputs[i:i+batch_size]
        with torch.no_grad():
            batch_output = model(batch)
        outputs.append(batch_output)
    return torch.cat(outputs, dim=0)

How do I handle concurrent inference requests?

1. Use Triton Inference Server

# Triton HTTP client example
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# prepare the inputs
inputs = [
    httpclient.InferInput("input", [1, 3, 224, 224], "FP32")
]
inputs[0].set_data_from_numpy(input_data)

# run inference
outputs = [httpclient.InferRequestedOutput("output")]
response = client.infer(model_name="model", inputs=inputs, outputs=outputs)

2. Batch incoming requests

# Python async micro-batching (sketch; process_batch stands in for your own batched inference call)
import asyncio

async def batch_process(model, requests, batch_size=4, timeout=0.1):
    batch = []
    for request in requests:
        batch.append(request)
        if len(batch) >= batch_size:
            await process_batch(model, batch)  # run one batched forward pass
            batch = []
        else:
            await asyncio.sleep(timeout)  # wait briefly for more requests
    if batch:
        await process_batch(model, batch)  # flush the final partial batch

Model Optimization

How do I reduce model size?

1. Model quantization

# PyTorch quantization example
import torch

# Option 1: dynamic quantization (well suited to NLP models)
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear, torch.nn.LSTM},
    dtype=torch.qint8
)

# Option 2: static quantization (well suited to CNNs)
model.eval()
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model_prepared = torch.quantization.prepare(model, inplace=True)

# calibrate
with torch.no_grad():
    for data in calibration_data:
        model_prepared(data)

# convert
quantized_model = torch.quantization.convert(model_prepared, inplace=True)

2. Model pruning

# PyTorch pruning example
import torch
import torch.nn.utils.prune as prune

# global unstructured pruning over all Linear layers
parameters_to_prune = []
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        parameters_to_prune.append((module, 'weight'))

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2  # prune 20% of weights
)

# make pruning permanent by removing the reparameterization masks
for module, name in parameters_to_prune:
    prune.remove(module, name)
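
Note that unstructured pruning only zeroes individual weights: the tensors keep their original shape, so the saved model does not shrink and dense GPU kernels do not run faster unless you export to a sparse format or use structured pruning instead.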

3. Knowledge distillation

# knowledge distillation example
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistillationLoss(nn.Module):
    def __init__(self, temperature=4.0, alpha=0.7):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.criterion = nn.KLDivLoss(reduction='batchmean')
    
    def forward(self, student_logits, teacher_logits, labels):
        # soft-label loss against the teacher's distribution
        soft_loss = self.criterion(
            F.log_softmax(student_logits / self.temperature, dim=1),
            F.softmax(teacher_logits / self.temperature, dim=1)
        ) * (self.temperature ** 2)
        
        # hard-label loss against the ground truth
        hard_loss = F.cross_entropy(student_logits, labels)
        
        return self.alpha * soft_loss + (1 - self.alpha) * hard_loss

# training loop
distillation_loss = DistillationLoss()
teacher.eval()
for data, labels in train_loader:
    optimizer.zero_grad()
    student_outputs = student(data)
    with torch.no_grad():
        teacher_outputs = teacher(data)
    
    loss = distillation_loss(student_outputs, teacher_outputs, labels)
    loss.backward()
    optimizer.step()

How do I speed up a model?

1. Use TensorRT

# compile a PyTorch model with torch_tensorrt
import torch
import torch_tensorrt

# compile the model
trt_model = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input((1, 3, 224, 224))],
    enabled_precisions={torch.float, torch.half}  # allow FP16 kernels
)

# run inference
output = trt_model(input_data)

2. Use ONNX Runtime

# export the model to ONNX
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
)

# run inference with ONNX Runtime
import onnxruntime as ort

session = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])
output = session.run(None, {'input': input_data.numpy()})[0]

Common Framework Issues

What should I do about GPU memory leaks during PyTorch training?

1. Manually free cached memory

import torch

# release cached GPU memory back to the driver
torch.cuda.empty_cache()

# inspect memory usage
print(torch.cuda.memory_allocated())  # memory currently allocated to tensors
print(torch.cuda.memory_reserved())   # memory reserved by the caching allocator

2. Use torch.no_grad()

# wrap code that doesn't need gradients
with torch.no_grad():
    outputs = model(inputs)
    predictions = outputs.argmax(dim=1)

3. Don't retain computation history

# avoid keeping the computation graph alive across iterations
for inputs, labels in train_loader:
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    
    loss.backward(retain_graph=False)  # False is the default: free the graph after backward
    optimizer.step()
    optimizer.zero_grad()
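
A common source of apparent leaks is accumulating a tensor that still references the computation graph, such as summing the loss across batches. Detach it to a plain Python number instead:

# summing loss directly would keep every batch's graph alive
total_loss = 0.0
for inputs, labels in train_loader:
    loss = criterion(model(inputs), labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    total_loss += loss.item()  # .item() returns a float detached from the graph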

What should I do if TensorFlow training runs out of GPU memory?

1. Limit GPU memory usage

import tensorflow as tf

# Option 1: grow GPU memory allocation on demand
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Option 2: set a hard memory limit (in MB)
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10240)]
)

2. Use mixed precision

import tensorflow as tf

# enable mixed precision globally
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

# loss scaling is required; dynamic scaling is the default
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam()
)

3. Reduce the batch size

# lower the batch size
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.batch(16)  # reduced from 32 to 16

What should I do if Transformers models load slowly?

1. Shard the model across multiple GPUs automatically

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",  # spread layers across all available GPUs
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

2. Use a quantized model

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,      # 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto"
)

3. Use Flash Attention

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    attn_implementation="flash_attention_2",  # requires the flash-attn package
    torch_dtype=torch.float16,
    device_map="auto"
)

Environment Setup

How do I install CUDA and cuDNN?

Cloud hosts and container instances come with CUDA and cuDNN preinstalled, so there is no need to install them manually. If you need a specific version, please contact customer support.
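
You can verify the preinstalled versions from Python. A quick check, assuming PyTorch is installed:

# print the CUDA/cuDNN versions that PyTorch was built against
import torch

print(torch.version.cuda)               # CUDA version, e.g. 11.8
print(torch.backends.cudnn.version())   # cuDNN version, e.g. 8700
print(torch.cuda.is_available())        # True if a GPU is visible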

How do I set environment variables?

Add the following to ~/.bashrc or ~/.bash_profile:

# CUDA environment variables
export CUDA_HOME=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# optional: cuDNN paths
export CUDNN_PATH=/usr/local/cuda
export LD_LIBRARY_PATH=$CUDNN_PATH/lib64:$LD_LIBRARY_PATH

Then run:

source ~/.bashrc

What should I do about PyTorch/TensorFlow version incompatibilities?

1. Check version compatibility

# check the CUDA toolkit version
nvcc --version

# check the PyTorch version and the CUDA version it was built with
python -c "import torch; print(torch.__version__)"
python -c "import torch; print(torch.version.cuda)"

# check the TensorFlow version and its CUDA build
python -c "import tensorflow as tf; print(tf.__version__)"
python -c "import tensorflow as tf; print(tf.sysconfig.get_build_info()['cuda_version'])"

2. Reinstall compatible versions

# uninstall the current versions
pip uninstall torch torchvision torchaudio

# install a specific build (example: CUDA 11.8)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Performance Monitoring

How do I monitor GPU usage?

# check GPU status
nvidia-smi

# live monitoring (refresh every second)
watch -n 1 nvidia-smi

# inspect a specific GPU
nvidia-smi -i 0

# continuous monitoring, written to a file
nvidia-smi dmon -s pucvmet -c 100 > gpu_stats.log

Key metrics to watch:

  • Memory-Usage: GPU memory in use
  • GPU-Util: GPU utilization
  • Power Draw: power consumption
  • Temperature: GPU temperature

Monitoring GPU usage from Python

# PyTorch memory monitoring
import torch

print(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

# simple monitoring helper (torch.cuda.utilization() requires the pynvml package)
def monitor_gpu():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Utilization: {torch.cuda.utilization()}%")
    print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

monitor_gpu()

How do I monitor training progress?

1. Use TensorBoard

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('./logs')

for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        # training step
        loss = train_step(data, target)
        
        # log to TensorBoard
        global_step = epoch * len(train_loader) + batch_idx
        writer.add_scalar('Loss/train', loss, global_step)
        writer.add_scalar('Learning Rate', optimizer.param_groups[0]['lr'], global_step)

writer.close()

Start TensorBoard:

tensorboard --logdir ./logs

2. Use a progress bar

from tqdm import tqdm

for epoch in range(num_epochs):
    pbar = tqdm(train_loader, desc=f'Epoch {epoch}')
    for data, target in pbar:
        loss = train_step(data, target)
        pbar.set_postfix({'loss': f'{loss:.4f}'})

3. Save training logs

import logging

# configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)

# log training information
logging.info(f'Starting training for {num_epochs} epochs')
logging.info(f'Model: {model.__class__.__name__}')
logging.info(f'Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

Other Questions

What should I do if the training loss doesn't decrease?

1. Check the learning rate

# reduce the learning rate when the monitored metric plateaus
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# or use cosine annealing
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=num_epochs
)
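
Remember to step the scheduler, and note that the two schedulers above are stepped differently. A minimal sketch (train_one_epoch and validate are hypothetical helpers standing in for your own loop):

# stepping the scheduler once per epoch
for epoch in range(num_epochs):
    train_one_epoch(model, train_loader, optimizer)  # hypothetical training helper
    val_loss = validate(model, val_loader)           # hypothetical validation helper
    scheduler.step(val_loss)  # ReduceLROnPlateau: pass the monitored metric
    # scheduler.step()        # CosineAnnealingLR: called with no argument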

2. Set random seeds for reproducibility

import torch
import numpy as np
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

What should I do if inference results are inconsistent?

1. Set random seeds

import random

import numpy as np
import torch

# PyTorch seeds
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True

# NumPy seed
np.random.seed(42)

# Python seed
random.seed(42)

2. Make sure the model is in evaluation mode

# switch to evaluation mode before inference
model.eval()

# disable gradient tracking with torch.no_grad()
with torch.no_grad():
    outputs = model(inputs)

3. Check preprocessing consistency

# normalization preprocessing
def preprocess(input_data, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    # apply the same fixed preprocessing steps every time
    input_data = input_data / 255.0
    mean = torch.tensor(mean).view(3, 1, 1)
    std = torch.tensor(std).view(3, 1, 1)
    return (input_data - mean) / std

If you run into a problem not covered here, please contact customer support.