第4课:RLHF与GRPO
第4课实验:迷你 DeepSeek-R1-Zero
使用 GRPO 训练 Qwen3-1.7B-Base 进行数学推理,观察推理能力的涌现过程,并与蒸馏模型和 Qwen3 思考模式对比
实验概述
| 项目 | 详情 |
|---|---|
| 目标 | 复现迷你 DeepSeek-R1-Zero:用 GRPO 从基座模型训练出数学推理能力 |
| 工具 | TRL (GRPOTrainer) |
| 数据集 | GSM8K 训练集(7,500 道小学数学题) |
| 基座模型 | Qwen/Qwen3-1.7B-Base(基座版,非 Instruct 版) |
| 预计时间 | 约 100 分钟 |
GPU 要求:本实验建议使用 A100-80G。GRPO 需要对每个提示采样多个回复(默认 G=8),内存需求较高。
- A100-80G:推荐配置,num_generations=8,Qwen3-1.7B-Base
- A100-40G:降级为 num_generations=4,或使用 Qwen3-0.6B-Base
- RTX 4090 (24GB):使用 Qwen3-0.6B-Base + QLoRA + num_generations=4
预计 GPU 时间:A100-80G 约 90 分钟(生成是主要瓶颈)。
教学亮点:Qwen3 Instruct 模型自带的思考模式正是通过类似的 RL 训练获得的。在实验最后,你将对比自己从 Base 模型用 GRPO 训出的推理能力与 Qwen3 Instruct 自带的思考模式——直观感受"课堂实验"与"工业级后训练"的差距,以及 RL 方法的内在一致性。
环境准备
# Environment check: print library versions and GPU availability.
import transformers
import trl
import torch
import re

print(f"Transformers: {transformers.__version__}")  # >= 4.51.0
print(f"TRL: {trl.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    # Bug fix: the property is `total_memory` (bytes); `total_mem` does not
    # exist and raised AttributeError.
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name} ({gpu_mem:.0f} GB)")
# 加载数据集
from datasets import load_dataset
gsm8k = load_dataset("openai/gsm8k", "main")
print(f"训练集: {len(gsm8k['train'])} 条")
print(f"测试集: {len(gsm8k['test'])} 条")
# 查看数据格式
sample = gsm8k['train'][0]
print(f"\n示例问题: {sample['question']}")
print(f"\n示例答案: {sample['answer']}")Step 1:实现奖励函数(20 分钟)
理解 GSM8K 的答案格式
# GSM8K answer format: free-form reasoning followed by "#### <final value>".
sample = gsm8k['train'][0]
print("完整答案:")
print(sample['answer'])
# Extract the ground-truth answer.
def extract_gsm8k_answer(answer_text: str) -> str:
    """Pull the final numeric answer out of a GSM8K answer string.

    GSM8K answers end with a line of the form ``#### <value>``.  The value
    is returned with surrounding whitespace and thousands separators
    removed; an empty string is returned when no marker is present.
    """
    marker = re.search(r'####\s*(.+)', answer_text)
    if marker is None:
        return ""
    return marker.group(1).strip().replace(",", "")
# Sanity-check the extractor on the first few training samples.
for i in range(5):
    sample = gsm8k['train'][i]
    gt = extract_gsm8k_answer(sample['answer'])
    print(f"问题 {i+1} 标准答案: {gt}")

实现完整的奖励函数
def extract_answer_from_response(response: str) -> str:
    """
    Extract the final numeric answer from a model response.

    Tries, in order: ``\\boxed{...}``, "answer is X"-style phrasings, the
    GSM8K ``#### X`` marker, and finally the last number in the text.
    Returns "" when nothing matches.  Thousands separators ("1,000") are
    stripped so the result can be passed to float().
    """
    # Number pattern that also accepts comma-grouped integers such as 1,000
    # (the original `-?\d+\.?\d*` split "1,000" into "1" and "000").
    num = r'-?\d{1,3}(?:,\d{3})+(?:\.\d+)?|-?\d+\.?\d*'
    # Strategy 1: \boxed{...} — the format our prompt asks for.
    boxed_match = re.search(r'\\boxed\{([^}]+)\}', response)
    if boxed_match:
        return boxed_match.group(1).strip().replace(",", "")
    # Strategy 2: "答案是 X" / "answer is X" phrasings.
    # Bug fix: the connector set lacked "is", so English "the answer is 42"
    # never matched this strategy.
    answer_match = re.search(
        rf'(?:答案|结果|answer|result)\s*(?:是|为|is|=|:)\s*({num})',
        response, re.IGNORECASE
    )
    if answer_match:
        return answer_match.group(1).strip().replace(",", "")
    # Strategy 3: GSM8K-style "#### X".
    hash_match = re.search(rf'####\s*({num})', response)
    if hash_match:
        return hash_match.group(1).strip().replace(",", "")
    # Strategy 4: fall back to the last number anywhere in the text.
    numbers = re.findall(num, response)
    if numbers:
        return numbers[-1].replace(",", "")
    return ""
def math_reward_fn(completions: list[str], answer: list[str], **kwargs) -> list[float]:
    """
    GRPO reward function: grade the mathematical correctness of each reply.

    Args:
        completions: generated replies, one per sampled completion.
        answer: ground-truth answers in raw GSM8K format (parallel list).
    Returns:
        One reward per completion: 1.0 when the extracted answer matches
        the ground truth (numerically when both parse as floats, exact
        string comparison otherwise), 0.0 when it does not.
    """
    rewards = []
    for completion, ans_text in zip(completions, answer):
        # Ground truth from the "#### value" marker.
        ground_truth = extract_gsm8k_answer(ans_text)
        # Model prediction via the multi-strategy extractor.
        predicted = extract_answer_from_response(completion)
        # Prefer a tolerant numeric comparison; fall back to string
        # equality when either side is not a parseable number.
        try:
            correct = abs(float(predicted) - float(ground_truth)) < 1e-5
        except (ValueError, TypeError):
            correct = predicted.strip() == ground_truth.strip()
        rewards.append(1.0 if correct else 0.0)
    return rewards

可选:格式奖励
def format_reward_fn(completions: list[str], **kwargs) -> list[float]:
"""
格式奖励:鼓励模型使用结构化推理格式。
如果回复包含 <think>...</think> 标签,给予额外奖励。
"""
rewards = []
for completion in zip(completions):
if "<think>" in completion and "</think>" in completion:
rewards.append(0.1) # 使用了思考标签
else:
rewards.append(0.0)
return rewards验证奖励函数
# 用 5 个样本验证奖励函数
test_cases = [
# (模型回复, 标准答案, 期望奖励)
("Let me think step by step...\nThe answer is \\boxed{42}",
"reasoning...\n#### 42", 1.0),
("I think the answer is 100",
"reasoning...\n#### 50", 0.0),
("经过计算,答案是 15",
"reasoning...\n#### 15", 1.0),
("I'm not sure, maybe 7?",
"reasoning...\n#### 7", 1.0),
("This is a hard problem",
"reasoning...\n#### 25", 0.0),
]
print("奖励函数验证:")
for i, (response, answer, expected) in enumerate(test_cases):
rewards = math_reward_fn([response], [answer])
status = "PASS" if rewards[0] == expected else "FAIL"
predicted = extract_answer_from_response(response)
gt = extract_gsm8k_answer(answer)
print(f" [{status}] Case {i+1}: predicted={predicted}, "
f"ground_truth={gt}, reward={rewards[0]}, expected={expected}")Step 2:配置并启动 GRPO 训练(15 分钟)
加载基座模型
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-1.7B-Base"  # NOTE: the Base model, not the Instruct one

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): requires the flash-attn package; remove this line if
    # it is not installed in your environment.
    attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Base checkpoints may ship without a pad token; reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"模型加载完成: {model_name}")
print(f"参数量: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")

准备数据集
# Turn GSM8K records into the prompt/answer pairs GRPO training expects.
def format_gsm8k_for_grpo(example):
    """Map one GSM8K record to a GRPO training example.

    Builds a plain-text math prompt from the question and carries the
    original answer through unchanged so the reward function can grade
    completions against it.
    """
    question = example["question"]
    prompt = (
        "Solve the following math problem step by step. "
        "Put your final answer in \\boxed{}.\n\n"
        f"Problem: {question}\n\nSolution:"
    )
    return {"prompt": prompt, "answer": example["answer"]}
# 应用格式化
train_dataset = gsm8k["train"].map(format_gsm8k_for_grpo)
print(f"训练数据量: {len(train_dataset)}")
print(f"示例提示:\n{train_dataset[0]['prompt']}")配置 GRPO 训练
from trl import GRPOConfig, GRPOTrainer

grpo_config = GRPOConfig(
    output_dir="./grpo-qwen3-1.7b-math",
    # --- GRPO core parameters ---
    num_generations=8,             # sample G completions per prompt
    max_completion_length=1024,    # max generated tokens per completion
    # Bug fix: TRL's GRPOConfig names the KL coefficient `beta`;
    # `kl_coef` is not a valid keyword and raises a TypeError.
    beta=0.001,                    # small KL penalty: let the policy move
    # --- optimization hyper-parameters ---
    learning_rate=5e-6,
    per_device_train_batch_size=1,  # one prompt (× G completions) per device
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    max_steps=500,                 # hard cap; takes precedence over epochs
    # --- efficiency settings ---
    bf16=True,
    gradient_checkpointing=True,
    optim="adamw_torch",
    # --- logging & checkpointing ---
    logging_steps=1,               # log every step: GRPO metrics move fast
    save_steps=100,
    save_total_limit=3,
    # --- misc ---
    seed=42,
    report_to="none",              # or "wandb"
    # vLLM generation speed-up (if installed):
    # use_vllm=True,
    # vllm_gpu_memory_utilization=0.7,  # note: not `vllm_gpu_utilization`
)

启动训练
# 创建 GRPOTrainer
trainer = GRPOTrainer(
model=model,
args=grpo_config,
train_dataset=train_dataset,
processing_class=tokenizer,
reward_funcs=math_reward_fn, # 传入奖励函数
)
print("=" * 60)
print("开始 GRPO 训练")
print(f"模型: {model_name}")
print(f"组大小 G: {grpo_config.num_generations}")
print(f"最大步数: {grpo_config.max_steps}")
print(f"学习率: {grpo_config.learning_rate}")
print(f"KL 系数: {grpo_config.kl_coef}")
print("=" * 60)
result = trainer.train()
print(f"\n训练完成! 总步数: {result.global_step}")
# 保存最终模型
trainer.save_model("./grpo-qwen3-1.7b-math/final")Step 3:训练与监控(40 分钟)
关键指标监控
在训练过程中,关注以下四个核心指标:
import json
import matplotlib.pyplot as plt
# 提取训练日志
log_history = trainer.state.log_history
# 提取关键指标
steps = []
reward_means = []
reward_stds = []
response_lengths = []
kl_values = []
for entry in log_history:
if "reward/mean" in entry or "rewards/mean" in entry:
steps.append(entry.get("step", 0))
reward_means.append(
entry.get("reward/mean", entry.get("rewards/mean", 0))
)
reward_stds.append(
entry.get("reward/std", entry.get("rewards/std", 0))
)
if "completion_length/mean" in entry:
response_lengths.append(entry["completion_length/mean"])
if "kl" in entry:
kl_values.append(entry["kl"])可视化训练曲线
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Mean reward: is the model learning to answer correctly?
axes[0, 0].plot(steps[:len(reward_means)], reward_means, 'b-', linewidth=1.5)
axes[0, 0].set_title('Reward Mean (accuracy)', fontsize=12)
axes[0, 0].set_xlabel('Training Steps')
axes[0, 0].set_ylabel('Mean Reward')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].set_ylim(0, 1)

# 2. Reward std: is within-group diversity changing?
if reward_stds:
    axes[0, 1].plot(steps[:len(reward_stds)], reward_stds, 'r-', linewidth=1.5)
    axes[0, 1].set_title('Reward Std (group diversity)', fontsize=12)
    axes[0, 1].set_xlabel('Training Steps')
    axes[0, 1].set_ylabel('Reward Std')
    axes[0, 1].grid(True, alpha=0.3)

# 3. Mean response length: is the model reasoning in more detail?
if response_lengths:
    axes[1, 0].plot(range(len(response_lengths)), response_lengths,
                    'g-', linewidth=1.5)
    axes[1, 0].set_title('Response Length (reasoning depth)', fontsize=12)
    axes[1, 0].set_xlabel('Training Steps')
    axes[1, 0].set_ylabel('Avg. Tokens')
    axes[1, 0].grid(True, alpha=0.3)

# 4. KL divergence: has the policy drifted too far from the reference?
if kl_values:
    axes[1, 1].plot(range(len(kl_values)), kl_values, 'm-', linewidth=1.5)
    axes[1, 1].set_title('KL Divergence', fontsize=12)
    axes[1, 1].set_xlabel('Training Steps')
    axes[1, 1].set_ylabel('KL')
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("./grpo_training_curves.png", dpi=150)
plt.show()
print("训练曲线已保存至 grpo_training_curves.png")

指标解读
| 指标 | 正常趋势 | 异常信号 |
|---|---|---|
| Reward Mean | 从 ~0.1 上升到 ~0.5+ | 不上升(学习率太小或奖励函数有 bug) |
| Reward Std | 先上升后下降 | 始终为 0(所有回复结果相同) |
| Response Length | 逐渐增加(学会推理) | 暴涨(可能在生成重复内容) |
| KL Divergence | 缓慢增加 | 快速增长 >10(策略偏离过远) |
训练时间提示:GRPO 的主要瓶颈在于生成步骤(每个提示需要采样 G=8 个完整回复),而非参数更新。如果有 vLLM,启用 use_vllm=True 可以显著加速。在 A100-80G 上,500步约需 90 分钟。
Step 4:分析涌现行为(15 分钟)
对比早期和后期的推理链
from transformers import AutoModelForCausalLM

# Load models from different training stages.
# If intermediate checkpoints were saved:
#   early_model = the checkpoint at step 10
#   late_model  = the checkpoint at step 300
# Here only the final model is used for the analysis.
grpo_model = AutoModelForCausalLM.from_pretrained(
    "./grpo-qwen3-1.7b-math/final",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Probe questions — math word problems used to elicit reasoning chains.
test_questions = [
    "Natalia sold clips to 48 of her friends in April, and then she sold "
    "half as many clips in May. How many clips did Natalia sell altogether "
    "in April and May?",
    "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 "
    "minutes of babysitting. How much did she earn?",
    "Betty is saving money for a new wallet which costs $100. Betty has "
    "only half of the money she needs. Her parents decided to give her $15 "
    "for that purpose, and her grandparents twice as much as her parents. "
    "How much more money does Betty need to make to buy the wallet?",
    "Julie is reading a 120-page book. Yesterday, she was able to read 12 "
    "pages and today, she read twice as many pages as yesterday. If she "
    "wants to read half of the remaining pages tomorrow, how many pages "
    "should she read?",
    "James writes a 3-page letter to 2 different friends twice a week. How "
    "many pages does he write a year?",
]
def generate_reasoning(model, tokenizer, question, max_new_tokens=1024):
    """Sample a step-by-step solution for *question* from *model*.

    Builds the same math prompt used during training, samples a
    completion, and returns only the newly generated text (the prompt
    tokens are stripped off before decoding).
    """
    prompt = (
        "Solve the following math problem step by step. "
        "Put your final answer in \\boxed{}.\n\n"
        f"Problem: {question}\n\nSolution:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
# Generate and inspect reasoning chains from the GRPO-trained model.
print("=" * 60)
print("GRPO 模型推理链分析")
print("=" * 60)

for i, q in enumerate(test_questions):
    print(f"\n{'='*60}")
    print(f"问题 {i+1}: {q[:100]}...")
    response = generate_reasoning(grpo_model, tokenizer, q)
    print(f"\n推理过程:")
    print(response[:600])
    if len(response) > 600:
        print("...")
    print(f"\n回复长度: {len(response.split())} 词")
    # Extract the final answer from the sampled solution.
    predicted = extract_answer_from_response(response)
    print(f"提取的答案: {predicted}")

涌现行为检查清单
# 分析推理链中的涌现行为
emergence_checklist = {
"逐步计算": ["step", "first", "then", "next", "首先", "然后", "接下来"],
"自我检验": ["verify", "check", "let me", "确认", "验证", "检查"],
"回溯纠正": ["wait", "actually", "reconsider", "mistake", "不对", "重新"],
"结构化推理": ["therefore", "so", "thus", "because", "因此", "所以"],
"数学符号": ["\\boxed", "=", "+", "-", "*", "/"],
}
for i, q in enumerate(test_questions[:3]):
response = generate_reasoning(grpo_model, tokenizer, q)
print(f"\n问题 {i+1} 涌现行为分析:")
for behavior, keywords in emergence_checklist.items():
found = [kw for kw in keywords if kw.lower() in response.lower()]
status = "YES" if found else "NO"
print(f" {behavior}: {status} {found if found else ''}")关键观察:
- 训练早期(~10步):模型通常直接给出答案,很少有推理过程。回复较短。
- 训练中期(~100步):开始出现简单的计算步骤,如"48 + 24 = 72"。
- 训练后期(~300步):出现逐步推理、中间变量、可能出现自我检验。
- 是否出现"顿悟时刻"? 注意观察训练奖励曲线是否有跳跃式上升。
注意:由于我们使用的是 1.7B 模型且训练步数有限,涌现效果可能不如 DeepSeek 论文中报告的那么显著。这正是"课堂实验"与"工业级训练"的差距所在。
Step 5:四方对比(10 分钟)
加载对比模型
# 1. Base model (untrained)
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-1.7B-Base",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# 2. GRPO-trained model (already loaded as grpo_model)
# 3. Distilled model
distill_model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
distill_tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
)
# 4. Qwen3 Instruct (thinking mode)
instruct_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-1.7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
instruct_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
# Evaluate all four models on a 20-question GSM8K test subset.
test_subset = gsm8k["test"].select(range(20))

results = {
    "Base": {"correct": 0, "total": 0},
    "GRPO": {"correct": 0, "total": 0},
    "Distill": {"correct": 0, "total": 0},
    "Instruct-Think": {"correct": 0, "total": 0},
}

for idx, sample in enumerate(test_subset):
    question = sample["question"]
    ground_truth = extract_gsm8k_answer(sample["answer"])

    # (name, model, tokenizer, use the thinking-mode chat template?)
    models_and_configs = [
        ("Base", base_model, tokenizer, False),
        ("GRPO", grpo_model, tokenizer, False),
        ("Distill", distill_model, distill_tokenizer, False),
        ("Instruct-Think", instruct_model, instruct_tokenizer, True),
    ]

    for name, model, tok, use_think in models_and_configs:
        if use_think:
            # Qwen3 Instruct thinking mode goes through the chat template.
            messages = [{"role": "user", "content":
                f"Solve: {question}\nPut your answer in \\boxed{{}}."
            }]
            text = tok.apply_chat_template(
                messages, tokenize=False,
                add_generation_prompt=True,
                enable_thinking=True,  # turn on thinking mode
            )
            inputs = tok(text, return_tensors="pt").to(model.device)
        else:
            # Base/GRPO/Distill use the plain training-style prompt.
            prompt = (
                f"Solve the following math problem step by step. "
                f"Put your final answer in \\boxed{{}}.\n\n"
                f"Problem: {question}\n\nSolution:"
            )
            inputs = tok(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=0.7,
                do_sample=True,
            )
        response = tok.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )
        predicted = extract_answer_from_response(response)

        # Numeric comparison when possible, string comparison otherwise.
        try:
            correct = abs(float(predicted) - float(ground_truth)) < 1e-5
        except (ValueError, TypeError):
            correct = predicted.strip() == ground_truth.strip()

        results[name]["total"] += 1
        if correct:
            results[name]["correct"] += 1

    # Progress report every five questions.
    if (idx + 1) % 5 == 0:
        print(f"已完成 {idx + 1}/20 题")

# Print the accuracy table.
print("\n" + "=" * 60)
print("GSM8K 测试集准确率对比 (20 题)")
print("=" * 60)
print(f"{'模型':<20} {'正确数':<10} {'准确率':<10}")
print("-" * 40)
for name, r in results.items():
    acc = r["correct"] / r["total"] * 100 if r["total"] > 0 else 0
    print(f"{name:<20} {r['correct']}/{r['total']:<10} {acc:.1f}%")

预期结果范围
| 模型 | 预期准确率 | 说明 |
|---|---|---|
| Base (Qwen3-1.7B-Base) | 5-15% | 基座模型,无指令跟随能力 |
| GRPO (500步训练) | 25-45% | 课堂实验规模的 GRPO |
| Distill (R1-Distill-1.5B) | 50-65% | 工业级蒸馏模型 |
| Instruct-Think (Qwen3-1.7B) | 60-75% | 完整后训练 + 思考模式 |
结果解读:
- Base → GRPO 的提升说明 GRPO 确实有效,模型学会了推理
- GRPO < Distill:说明蒸馏比从零训练更高效(但蒸馏需要教师模型)
- GRPO < Instruct-Think:说明工业级后训练(更多数据、更多步数、四阶段流程)远超课堂实验
- 但 GRPO 的方法论与工业级训练完全一致——差距在于规模而非方法
交付物清单
训练奖励曲线图
包含四个子图:奖励均值、奖励标准差、回复长度、KL 散度随训练步数的变化曲线。
5 个推理链示例
展示训练前后(或不同训练阶段)的推理链对比,突出显示涌现行为(逐步推理、自我检验等)。
GSM8K 四方对比表
Base vs GRPO vs Distill vs Qwen3 Instruct 思考模式的准确率对比表。
分析报告
1页分析报告,讨论以下内容:
- 是否观察到"推理涌现"?具体表现是什么?
- GRPO 训练的推理风格与蒸馏模型有何不同?
- 与 Qwen3 Instruct 思考模式相比,差距主要在哪里?
- 对 RLVR 方法的优势和局限的理解
- 如果有更多资源(更大模型、更长训练),你预期结果会如何改变?
加分项
调整 G 和 KL 系数
# 消融实验: 不同的组大小
for G in [4, 8, 16]:
config = GRPOConfig(
num_generations=G,
max_steps=100, # 短训练用于对比
# ... 其他参数相同
)
# trainer = GRPOTrainer(...)
# trainer.train()
print(f"G={G}: 训练完成, 最终奖励均值 = ...")格式奖励的影响
尝试加入格式奖励(鼓励使用 <think> 标签),观察是否影响推理质量:
def combined_reward_fn(completions, answer, **kwargs):
    """Sum of the correctness reward and the <think>-format bonus."""
    accuracy_scores = math_reward_fn(completions, answer)
    style_scores = format_reward_fn(completions)
    return [a + s for a, s in zip(accuracy_scores, style_scores)]