Model Training and Inference
Common questions and solutions related to model training and inference.
Training
What should I do if I run out of GPU memory during training?
Running out of GPU memory is a common problem during training. Try the following approaches:
1. Reduce the batch size
# PyTorch example
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)  # reduced from 64 to 32
2. Use gradient accumulation
# PyTorch gradient accumulation example
accumulation_steps = 4  # accumulate gradients over 4 batches
optimizer.zero_grad()
for i, (inputs, labels) in enumerate(train_loader):
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss = loss / accumulation_steps  # normalize the loss
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
3. Use mixed-precision training
# PyTorch automatic mixed precision
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
for inputs, labels in train_loader:
    optimizer.zero_grad()
    with autocast():  # automatic mixed precision
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
4. Use gradient checkpointing
# PyTorch gradient checkpointing example
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(1024, 2048)
        self.layer2 = nn.Linear(2048, 4096)
        self.layer3 = nn.Linear(4096, 10)

    def forward(self, x):
        x = self.layer1(x)
        # Recompute this layer's activations during backward to save memory
        x = checkpoint(self.layer2, x)
        x = self.layer3(x)
        return x
5. Optimize data loading
# PyTorch data loading optimization
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,      # multi-process data loading
    pin_memory=True,    # use pinned (page-locked) memory
    prefetch_factor=2   # batches prefetched per worker
)
What should I do if training is slow?
1. Use more GPUs for parallel training
# PyTorch multi-GPU distributed training
# Launch with: torchrun --nproc_per_node=<num_gpus> train.py
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Initialize the process group
dist.init_process_group(backend='nccl')
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
# Wrap the model
model = model.to(local_rank)
model = DDP(model, device_ids=[local_rank])
# Use a distributed sampler
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
train_loader = DataLoader(dataset, batch_size=32, sampler=train_sampler)
2. Optimize data loading
# PyTorch data loading optimization
train_loader = DataLoader(
    dataset,
    batch_size=64,
    num_workers=8,            # more loader workers
    prefetch_factor=4,        # prefetch more batches per worker
    persistent_workers=True   # keep workers alive between epochs
)
3. Use mixed precision for speed
# PyTorch automatic mixed precision
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
for inputs, labels in train_loader:
    optimizer.zero_grad()
    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
4. Choose an appropriate GPU
Pick a GPU model that matches your model size. For details, see the GPU Selection Guide.
What should I do if training is interrupted?
1. Use checkpoints
# PyTorch: saving and loading a checkpoint
# Save a checkpoint
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, 'checkpoint.pth')
# Load a checkpoint
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
2. Use nohup to survive disconnects
# Run training in the background with nohup
nohup python train.py > train.log 2>&1 &
# Follow the log
tail -f train.log
3. Use tmux
# Create a tmux session
tmux new -s training
# Run training inside tmux
python train.py
# Detach from the session (Ctrl+B, then D)
# Reattach to the session
tmux attach -t training
4. Implement automatic resume
# PyTorch: automatically resume from the latest checkpoint
import glob
import os
import torch

def load_latest_checkpoint(checkpoint_dir):
    checkpoints = glob.glob(os.path.join(checkpoint_dir, '*.pth'))
    if not checkpoints:
        return None, 0
    latest = max(checkpoints, key=os.path.getctime)
    checkpoint = torch.load(latest)
    return checkpoint['model_state_dict'], checkpoint['epoch']

# Usage
model_state, start_epoch = load_latest_checkpoint('./checkpoints')
if model_state:
    model.load_state_dict(model_state)
    print(f'Resuming training from epoch {start_epoch}')
How do I choose the right GPU for training?
Choose based on model size (a rough memory-estimation sketch follows this list):
- Small models (<7B parameters): RTX 3090 (24 GB), RTX 4090 (24 GB)
- Medium models (7B-30B): L40 (48 GB), L40S (48 GB), A100 40 GB
- Large models (30B+): A100 80 GB, H100 SXM (80 GB), H200-SXM-141G
For details, see the GPU Selection Guide.
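As a rough sanity check when sizing a GPU, weights take parameter count times bytes per parameter, and full fine-tuning adds gradients and optimizer states on top of that. A heuristic sketch, not an exact accounting; the 4x training multiplier assumes Adam-style optimizer states and excludes activations:
# Rough GPU-memory estimate (heuristic only)
def estimate_vram_gb(num_params_billions, bytes_per_param=2, training=False):
    weights_gb = num_params_billions * bytes_per_param  # 1e9 params x N bytes ~= N GB per billion
    if training:
        # Gradients plus Adam optimizer states are often estimated
        # as roughly 4x the weight memory in total
        return weights_gb * 4
    return weights_gb

print(estimate_vram_gb(7))                 # ~14 GB just to load a 7B model in FP16
print(estimate_vram_gb(7, training=True))  # ~56 GB for full FP16 fine-tuning, before activations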
What should I do about high communication overhead in multi-GPU training?
1. Tune communication settings
# PyTorch: NCCL communication settings
import os
# Environment variables (set before initializing the process group)
os.environ['NCCL_IB_DISABLE'] = '0'        # enable InfiniBand
os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'  # pin the network interface
os.environ['NCCL_IB_HCA'] = 'mlx5_0'       # pin the InfiniBand device
# Specify the backend when initializing
dist.init_process_group(backend='nccl')
2. Increase the batch size to reduce communication frequency
# A larger batch size means fewer gradient synchronizations per epoch
train_loader = DataLoader(dataset, batch_size=128, shuffle=True)
3. Use gradient accumulation to reduce synchronizations (see the DDP sketch below)
# Same pattern as the gradient accumulation example above
accumulation_steps = 8  # more accumulation steps, fewer synchronizations
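Note that with DistributedDataParallel, gradients are all-reduced on every backward() call by default, so accumulation alone does not cut communication; skipping synchronization on intermediate steps with no_sync() does. A minimal sketch, assuming model is wrapped in DDP as shown earlier:
from contextlib import nullcontext

accumulation_steps = 8
optimizer.zero_grad()
for i, (inputs, labels) in enumerate(train_loader):
    is_sync_step = (i + 1) % accumulation_steps == 0
    # Skip the gradient all-reduce on non-sync steps
    context = nullcontext() if is_sync_step else model.no_sync()
    with context:
        loss = criterion(model(inputs), labels) / accumulation_steps
        loss.backward()
    if is_sync_step:
        optimizer.step()
        optimizer.zero_grad()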
Inference
What should I do if I run out of GPU memory during inference?
1. Use model quantization
# PyTorch dynamic quantization (note: eager-mode quantized models run on CPU)
import torch

# Dynamically quantize the model
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},  # quantize Linear layers
    dtype=torch.qint8
)
# Run inference with the quantized model
with torch.no_grad():
    output = quantized_model(input_data)
2. Use vLLM for efficient inference
# vLLM efficient inference
from vllm import LLM, SamplingParams

llm = LLM(model="your-model-name", quantization="awq")  # use AWQ quantization
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512
)
outputs = llm.generate(["Hello, how are you?"], sampling_params)
3. Use offloading
# Transformers with CPU offloading
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True  # offload FP32 modules to CPU
)
model = AutoModelForCausalLM.from_pretrained(
    "model-name",
    quantization_config=quantization_config,
    device_map="auto"
)
What should I do if inference is slow?
1. Use INT8 quantization
# PyTorch static quantization
import torch

# Prepare for calibration
model.eval()
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)
# Calibrate with representative data
with torch.no_grad():
    for data in calibration_data:
        model(data)
# Convert to a quantized model
quantized_model = torch.quantization.convert(model, inplace=True)
2. Use TensorRT for acceleration
# Build a TensorRT engine from an ONNX model
import tensorrt as trt

# Build the TensorRT engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)
# Load the ONNX model
with open("model.onnx", 'rb') as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
# Build the engine (TensorRT 8+ API; build_cuda_engine was removed)
config = builder.create_builder_config()
serialized_engine = builder.build_serialized_network(network, config)
3. Batch requests to improve throughput
# PyTorch batched inference
import torch

def batch_inference(model, inputs, batch_size=8):
    # inputs: a tensor whose first dimension indexes samples
    outputs = []
    for i in range(0, len(inputs), batch_size):
        batch = inputs[i:i+batch_size]
        with torch.no_grad():
            batch_output = model(batch)
        outputs.append(batch_output)
    return torch.cat(outputs, dim=0)
How do I handle concurrent inference requests?
1. Use Triton Inference Server
# Triton HTTP client
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
# Prepare the input
inputs = [
    httpclient.InferInput("input", [1, 3, 224, 224], "FP32")
]
inputs[0].set_data_from_numpy(input_data)
# Run inference
outputs = [httpclient.InferRequestedOutput("output")]
response = client.infer(model_name="model", inputs=inputs, outputs=outputs)
2. Batch incoming requests
# Asyncio sketch: collect requests from a queue and run them in batches
import asyncio

async def batch_worker(model, queue, batch_size=4, timeout=0.1):
    while True:
        # Block until at least one request arrives
        batch = [await queue.get()]
        try:
            # Wait briefly for more requests to fill the batch
            while len(batch) < batch_size:
                batch.append(await asyncio.wait_for(queue.get(), timeout))
        except asyncio.TimeoutError:
            pass  # run a partial batch if no more requests arrive in time
        await process_batch(model, batch)  # user-defined batched inference
Model Optimization
How do I shrink a model?
1. Model quantization
# PyTorch quantization examples
import torch

# Option 1: dynamic quantization (well suited to NLP models)
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear, torch.nn.LSTM},
    dtype=torch.qint8
)
# Option 2: static quantization (well suited to CNNs)
model.eval()
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
model_prepared = torch.quantization.prepare(model, inplace=True)
# Calibrate
with torch.no_grad():
    for data in calibration_data:
        model_prepared(data)
# Convert
quantized_model = torch.quantization.convert(model_prepared, inplace=True)
2. Model pruning
# PyTorch pruning example
import torch
import torch.nn.utils.prune as prune

# Global unstructured pruning
parameters_to_prune = []
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        parameters_to_prune.append((module, 'weight'))
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2  # prune 20% of weights
)
# Make the pruning permanent by removing the reparameterization
for module, name in parameters_to_prune:
    prune.remove(module, name)
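Note that unstructured pruning zeroes weights without changing tensor shapes, so file size only shrinks if the model is then stored in a sparse format. To verify the effect, the resulting sparsity can be measured; a small sketch using parameters_to_prune from above:
# Measure global sparsity across the pruned Linear layers
import torch

zero_count = sum(float(torch.sum(module.weight == 0)) for module, _ in parameters_to_prune)
total_count = sum(module.weight.nelement() for module, _ in parameters_to_prune)
print(f"Global sparsity: {100.0 * zero_count / total_count:.2f}%")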
3. Knowledge distillation
# Knowledge distillation example
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistillationLoss(nn.Module):
    def __init__(self, temperature=4.0, alpha=0.7):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.criterion = nn.KLDivLoss(reduction='batchmean')

    def forward(self, student_logits, teacher_logits, labels):
        # Soft-label loss against the teacher
        soft_loss = self.criterion(
            F.log_softmax(student_logits / self.temperature, dim=1),
            F.softmax(teacher_logits / self.temperature, dim=1)
        ) * (self.temperature ** 2)
        # Hard-label loss against the ground truth
        hard_loss = F.cross_entropy(student_logits, labels)
        return self.alpha * soft_loss + (1 - self.alpha) * hard_loss

# Training loop
distillation_loss = DistillationLoss()
teacher.eval()
for data, labels in train_loader:
    optimizer.zero_grad()
    student_outputs = student(data)
    with torch.no_grad():
        teacher_outputs = teacher(data)
    loss = distillation_loss(student_outputs, teacher_outputs, labels)
    loss.backward()
    optimizer.step()
How do I speed up a model?
1. Use TensorRT
# PyTorch to TensorRT via torch_tensorrt
import torch
import torch_tensorrt

# Compile the model
trt_model = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input((1, 3, 224, 224))],
    enabled_precisions={torch.float, torch.half}  # allow FP16 kernels
)
# Run inference
output = trt_model(input_data)
2. Use ONNX Runtime
# Export the model to ONNX
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
)
# Run inference with ONNX Runtime
import onnxruntime as ort

session = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])
output = session.run(None, {'input': input_data.numpy()})[0]
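After exporting, it is worth checking that ONNX Runtime reproduces the PyTorch outputs. A minimal sketch, assuming model, dummy_input, and session from above (the tolerance is a judgment call):
import numpy as np
import torch

with torch.no_grad():
    torch_output = model(dummy_input)
ort_output = session.run(None, {'input': dummy_input.numpy()})[0]
# Small numerical differences between backends are expected
print(np.allclose(torch_output.numpy(), ort_output, atol=1e-4))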
Common Framework Issues
What should I do about GPU memory leaks in PyTorch training?
1. Manually release cached memory
import torch

# Release cached GPU memory
torch.cuda.empty_cache()
# Inspect GPU memory usage
print(torch.cuda.memory_allocated())  # memory allocated to tensors
print(torch.cuda.memory_reserved())   # memory reserved by the caching allocator
2. Use torch.no_grad()
# Wrap code that does not need gradients
with torch.no_grad():
    outputs = model(inputs)
    predictions = outputs.argmax(dim=1)
3. Avoid holding onto the computation graph
# Accumulating loss tensors (e.g. total_loss += loss) keeps the graph alive;
# use loss.item() when logging instead
for inputs, labels in train_loader:
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward(retain_graph=False)  # default is False: do not retain the graph
    optimizer.step()
    optimizer.zero_grad()
What should I do if TensorFlow runs out of GPU memory?
1. Limit TensorFlow's GPU memory usage
import tensorflow as tf

# Option 1: grow memory on demand
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# Option 2: set a hard memory limit (in MB)
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10240)]
)
2. Use mixed precision
import tensorflow as tf

# Enable mixed precision globally
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)
# Loss scaling is required; LossScaleOptimizer uses dynamic scaling by default
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam()
)
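With model.fit, loss scaling is applied automatically; a custom training loop has to scale the loss and unscale the gradients itself. A minimal sketch, assuming model, loss_fn, and train_dataset are already defined:
# Custom training loop with explicit loss scaling
import tensorflow as tf

for x, y in train_dataset:
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
        scaled_loss = optimizer.get_scaled_loss(loss)  # scale to avoid FP16 underflow
    scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
    grads = optimizer.get_unscaled_gradients(scaled_grads)  # unscale before applying
    optimizer.apply_gradients(zip(grads, model.trainable_variables))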
3. Reduce the batch size
# Lower the batch size
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.batch(16)  # reduced from 32 to 16
What should I do if loading models with the Transformers library is slow?
1. Automatically shard across multiple GPUs
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",  # shard across all available GPUs
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
2. Load a quantized model
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto"
)
3. Use Flash Attention
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    attn_implementation="flash_attention_2",  # use Flash Attention 2
    torch_dtype=torch.float16,
    device_map="auto"
)
Environment Setup
How do I install CUDA and cuDNN?
Cloud hosts and container instances come with CUDA and cuDNN preinstalled, so no manual installation is needed. If you need a specific version, please contact support.
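To confirm the preinstalled versions yourself, one quick check is to query them through PyTorch. A minimal sketch, assuming PyTorch is installed:
# Check CUDA/cuDNN versions from Python
import torch

print(torch.version.cuda)              # CUDA version PyTorch was built against
print(torch.backends.cudnn.version())  # cuDNN version, e.g. 8902
print(torch.cuda.is_available())       # True if the driver and GPU are usable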
How do I set environment variables?
Add the following to ~/.bashrc or ~/.bash_profile:
# CUDA environment variables
export CUDA_HOME=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
# Optional: cuDNN path
export CUDNN_PATH=/usr/local/cuda
export LD_LIBRARY_PATH=$CUDNN_PATH/lib64:$LD_LIBRARY_PATH
Then run:
source ~/.bashrc
What should I do about PyTorch/TensorFlow version incompatibilities?
1. Check version compatibility
# Check the CUDA version
nvcc --version
# Check the PyTorch version and its CUDA build
python -c "import torch; print(torch.__version__)"
python -c "import torch; print(torch.version.cuda)"
# Check the TensorFlow version and its CUDA build
python -c "import tensorflow as tf; print(tf.__version__)"
python -c "import tensorflow as tf; print(tf.sysconfig.get_build_info()['cuda_version'])"
2. Reinstall a compatible version
# Uninstall the current version
pip uninstall torch torchvision torchaudio
# Install a specific build (example: CUDA 11.8)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
Performance Monitoring
How do I monitor GPU usage?
# Show GPU status
nvidia-smi
# Live monitoring (refresh every second)
watch -n 1 nvidia-smi
# Show a specific GPU
nvidia-smi -i 0
# Continuous monitoring, logged to a file
nvidia-smi dmon -s pucvmet -c 100 > gpu_stats.log
Key metrics to watch:
- Memory-Usage: GPU memory in use
- GPU-Util: GPU utilization
- Power Draw: power consumption
- Temperature: GPU temperature
Monitoring GPU usage from Python
# PyTorch GPU memory monitoring
import torch

print(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

# Simple monitoring helper
def monitor_gpu():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Utilization: {torch.cuda.utilization()}%")  # requires the nvidia-ml-py (pynvml) package
    print(f"Memory in use: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

monitor_gpu()
How do I monitor training progress?
1. Use TensorBoard
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('./logs')
for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Training step
        loss = train_step(data, target)
        # Log to TensorBoard
        global_step = epoch * len(train_loader) + batch_idx
        writer.add_scalar('Loss/train', loss, global_step)
        writer.add_scalar('Learning Rate', optimizer.param_groups[0]['lr'], global_step)
writer.close()
Start TensorBoard with:
tensorboard --logdir ./logs
2. Use a progress bar
from tqdm import tqdm

for epoch in range(num_epochs):
    pbar = tqdm(train_loader, desc=f'Epoch {epoch}')
    for data, target in pbar:
        loss = train_step(data, target)
        pbar.set_postfix({'loss': f'{loss:.4f}'})
3. Save training logs
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)
# Log training information
logging.info(f'Starting training for {num_epochs} epochs')
logging.info(f'Model: {model.__class__.__name__}')
logging.info(f'Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}')
Other Issues
What should I do if the loss does not decrease during training?
1. Check the learning rate
# Reduce the learning rate when the loss plateaus
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)
# Or use cosine annealing
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=num_epochs
)
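Both schedulers must be stepped once per epoch, but note that ReduceLROnPlateau expects the monitored metric while CosineAnnealingLR takes no argument. A minimal usage sketch with hypothetical train_one_epoch/validate helpers:
for epoch in range(num_epochs):
    train_one_epoch(model, train_loader, optimizer)  # hypothetical training helper
    val_loss = validate(model, val_loader)           # hypothetical validation helper
    scheduler.step(val_loss)  # ReduceLROnPlateau: pass the monitored metric
    # For CosineAnnealingLR, call scheduler.step() with no argument instead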
2. Set random seeds for reproducibility
import torch
import numpy as np
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
What should I do if inference results are inconsistent?
1. Set random seeds
import random
import numpy as np
import torch

# PyTorch seeds
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
# NumPy seed
np.random.seed(42)
# Python seed
random.seed(42)
2. Make sure the model is in evaluation mode
# Switch to eval mode before inference (disables dropout, freezes BatchNorm statistics)
model.eval()
# Disable gradient tracking
with torch.no_grad():
    outputs = model(inputs)
3. Check that preprocessing is consistent
# Fixed normalization preprocessing
import torch

def preprocess(input_data, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    # Apply the same deterministic steps at training and inference time
    input_data = input_data / 255.0
    mean = torch.tensor(mean).view(3, 1, 1)
    std = torch.tensor(std).view(3, 1, 1)
    return (input_data - mean) / std
If you run into an issue not covered here, please contact support for technical help.
