LLM 后训练实践
第4课:RLHF与GRPO

第4课实验:迷你 DeepSeek-R1-Zero

使用 GRPO 训练 Qwen3-1.7B-Base 进行数学推理,观察推理能力的涌现过程,并与蒸馏模型和 Qwen3 思考模式对比

实验概述

项目详情
目标:复现迷你 DeepSeek-R1-Zero:用 GRPO 从基座模型训练出数学推理能力
工具:TRL (GRPOTrainer)
数据集:GSM8K 训练集(7,500 道小学数学题)
基座模型:Qwen/Qwen3-1.7B-Base(基座版,非 Instruct 版)
预计时间:约 100 分钟

GPU 要求:本实验建议使用 A100-80G。GRPO 需要对每个提示采样多个回复(默认 G=8),内存需求较高。

  • A100-80G:推荐配置,num_generations=8,Qwen3-1.7B-Base
  • A100-40G:降级为 num_generations=4,或使用 Qwen3-0.6B-Base
  • RTX 4090 (24GB):使用 Qwen3-0.6B-Base + QLoRA + num_generations=4

预计 GPU 时间:A100-80G 约 90 分钟(生成是主要瓶颈)。

教学亮点:Qwen3 Instruct 模型自带的思考模式正是通过类似的 RL 训练获得的。在实验最后,你将对比自己从 Base 模型用 GRPO 训出的推理能力与 Qwen3 Instruct 自带的思考模式——直观感受"课堂实验"与"工业级后训练"的差距,以及 RL 方法的内在一致性。

环境准备

# Verify the runtime environment and report library/GPU versions.
import transformers
import trl
import torch
import re

print(f"Transformers: {transformers.__version__}")  # >= 4.51.0
print(f"TRL: {trl.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    # BUGFIX: the device-properties attribute is `total_memory` (bytes);
    # `total_mem` does not exist and raised AttributeError.
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name} ({gpu_mem:.0f} GB)")

# Load the GSM8K dataset (third-party `datasets` library; downloads on first use).
from datasets import load_dataset

gsm8k = load_dataset("openai/gsm8k", "main")
print(f"训练集: {len(gsm8k['train'])} 条")
print(f"测试集: {len(gsm8k['test'])} 条")

# Inspect one record to see the question/answer schema.
sample = gsm8k['train'][0]
print(f"\n示例问题: {sample['question']}")
print(f"\n示例答案: {sample['answer']}")

Step 1:实现奖励函数(20 分钟)

理解 GSM8K 的答案格式

# GSM8K answer format: free-form reasoning followed by "#### <final number>".
sample = gsm8k['train'][0]
print("完整答案:")
print(sample['answer'])

# Extract the ground-truth answer.
def extract_gsm8k_answer(answer_text: str) -> str:
    """Pull the final numeric answer out of a GSM8K answer string.

    GSM8K answers end with a line of the form "#### 42"; everything
    after "####" (whitespace-trimmed, commas removed) is the gold
    answer. Returns "" when the marker is absent.
    """
    found = re.search(r'####\s*(.+)', answer_text)
    if found is None:
        return ""
    return found.group(1).strip().replace(",", "")

# Sanity check: extract the gold answer from the first five training samples.
for i in range(5):
    sample = gsm8k['train'][i]
    gt = extract_gsm8k_answer(sample['answer'])
    print(f"问题 {i+1} 标准答案: {gt}")

实现完整的奖励函数

def extract_answer_from_response(response: str) -> str:
    """Extract the final numeric answer from a model completion.

    Tries, in order:
      1. a LaTeX \\boxed{...} answer,
      2. an explicit "answer is X" / "答案是 X" statement,
      3. a GSM8K-style "#### X" line,
      4. the last number appearing anywhere in the text.
    Returns "" if no candidate is found.
    """
    # A number with optional sign, optional thousands separators and an
    # optional decimal part. (?:,\d{3})* keeps "1,234" together while still
    # splitting "1,2,3" into separate numbers, and (?:\.\d+)? avoids
    # capturing a trailing period as part of the number.
    number = r'-?\d+(?:,\d{3})*(?:\.\d+)?'

    # Strategy 1: \boxed{...}
    boxed_match = re.search(r'\\boxed\{([^}]+)\}', response)
    if boxed_match:
        return boxed_match.group(1).strip().replace(",", "")

    # Strategy 2: "答案是 X" / "answer is X" style statements.
    # BUGFIX: the original separator group was "(?:||=|:)" — the CJK/verbal
    # connectors had degraded into empty alternatives, so the group matched
    # nothing useful. Restore them and make the separator optional so that
    # previously-matching inputs still match.
    answer_match = re.search(
        rf'(?:答案|结果|answer|result)\s*(?:是|为|is|=|:|:)?\s*({number})',
        response, re.IGNORECASE
    )
    if answer_match:
        return answer_match.group(1).strip().replace(",", "")

    # Strategy 3: "#### X" (GSM8K ground-truth format).
    hash_match = re.search(rf'####\s*({number})', response)
    if hash_match:
        return hash_match.group(1).strip().replace(",", "")

    # Strategy 4: fall back to the last number in the text.
    numbers = re.findall(number, response)
    if numbers:
        return numbers[-1].replace(",", "")

    return ""


def math_reward_fn(completions: list[str], answer: list[str], **kwargs) -> list[float]:
    """
    GRPO reward function: score each completion's mathematical correctness.

    Args:
        completions: model-generated responses.
        answer: ground-truth answers (raw GSM8K format), aligned with
            `completions`.
    Returns:
        One reward per completion: 1.0 if the extracted prediction matches
        the ground truth, otherwise 0.0.
    """
    def _is_correct(prediction: str, truth: str) -> bool:
        # Prefer a numeric comparison with a small tolerance; fall back to
        # exact string equality when either side is not a parseable float.
        try:
            return abs(float(prediction) - float(truth)) < 1e-5
        except (ValueError, TypeError):
            return prediction.strip() == truth.strip()

    return [
        1.0 if _is_correct(extract_answer_from_response(completion),
                           extract_gsm8k_answer(gold)) else 0.0
        for completion, gold in zip(completions, answer)
    ]

可选:格式奖励

def format_reward_fn(completions: list[str], **kwargs) -> list[float]:
    """
    Format reward: encourage structured reasoning.

    A completion containing both <think> and </think> tags earns a small
    bonus of 0.1; everything else gets 0.0.
    """
    rewards = []
    # BUGFIX: the original iterated `for completion in zip(completions)`,
    # which yields 1-tuples, so the `in` checks tested tuple membership
    # (exact-element equality) and the bonus was never awarded.
    for completion in completions:
        if "<think>" in completion and "</think>" in completion:
            rewards.append(0.1)  # used the thinking tags
        else:
            rewards.append(0.0)
    return rewards

验证奖励函数

# Validate the reward function against 5 hand-written cases.
test_cases = [
    # (model response, ground-truth answer, expected reward)
    ("Let me think step by step...\nThe answer is \\boxed{42}",
     "reasoning...\n#### 42", 1.0),
    ("I think the answer is 100",
     "reasoning...\n#### 50", 0.0),
    ("经过计算,答案是 15",
     "reasoning...\n#### 15", 1.0),
    # Expected 1.0 because the last-number fallback extracts "7".
    ("I'm not sure, maybe 7?",
     "reasoning...\n#### 7", 1.0),
    ("This is a hard problem",
     "reasoning...\n#### 25", 0.0),
]

print("奖励函数验证:")
for i, (response, answer, expected) in enumerate(test_cases):
    rewards = math_reward_fn([response], [answer])
    status = "PASS" if rewards[0] == expected else "FAIL"
    predicted = extract_answer_from_response(response)
    gt = extract_gsm8k_answer(answer)
    print(f"  [{status}] Case {i+1}: predicted={predicted}, "
          f"ground_truth={gt}, reward={rewards[0]}, expected={expected}")

Step 2:配置并启动 GRPO 训练(15 分钟)

加载基座模型

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-1.7B-Base"  # NOTE: deliberately the Base model, not Instruct

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # Requires the flash-attn package; remove this line if it is unavailable.
    attn_implementation="flash_attention_2",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Base models often ship without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

print(f"模型加载完成: {model_name}")
print(f"参数量: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")

准备数据集

# Convert GSM8K records into the prompt/answer pairs GRPOTrainer expects.
def format_gsm8k_for_grpo(example):
    """Build one GRPO training record from a GSM8K example.

    Returns a dict with a "prompt" (instruction + problem statement) and
    the untouched "answer" field, which the reward function reads later.
    """
    question = example['question']
    prompt = (
        f"Solve the following math problem step by step. "
        f"Put your final answer in \\boxed{{}}.\n\n"
        f"Problem: {question}\n\n"
        f"Solution:"
    )
    return {"prompt": prompt, "answer": example["answer"]}

# Apply the formatting over the whole training split.
train_dataset = gsm8k["train"].map(format_gsm8k_for_grpo)
print(f"训练数据量: {len(train_dataset)}")
print(f"示例提示:\n{train_dataset[0]['prompt']}")

配置 GRPO 训练

from trl import GRPOConfig, GRPOTrainer

grpo_config = GRPOConfig(
    output_dir="./grpo-qwen3-1.7b-math",

    # --- GRPO core parameters ---
    num_generations=8,             # sample G completions per prompt
    max_completion_length=1024,    # max generated tokens per completion
    # BUGFIX: GRPOConfig's KL-penalty coefficient is named `beta`;
    # `kl_coef` is a PPOConfig parameter and raises a TypeError here.
    beta=0.001,                    # small KL penalty: allow the policy to move

    # --- Optimization hyper-parameters ---
    learning_rate=5e-6,
    # BUGFIX: TRL requires the generation batch to be divisible by
    # num_generations; on a single GPU per_device_train_batch_size=1
    # fails that check. Use 8 so each step holds one full group.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    max_steps=500,                 # max_steps takes precedence over epochs

    # --- Memory / speed settings ---
    bf16=True,
    gradient_checkpointing=True,
    optim="adamw_torch",

    # --- Logging & checkpointing ---
    logging_steps=1,               # log every step (each GRPO step matters)
    save_steps=100,
    save_total_limit=3,

    # --- Misc ---
    seed=42,
    report_to="none",              # or "wandb"

    # vLLM generation speed-up (if available):
    # use_vllm=True,
    # vllm_gpu_memory_utilization=0.7,
)

启动训练

# Build the trainer; reward_funcs accepts a single callable or a list.
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    reward_funcs=math_reward_fn,  # correctness reward defined above
)

print("=" * 60)
print("开始 GRPO 训练")
print(f"模型: {model_name}")
print(f"组大小 G: {grpo_config.num_generations}")
print(f"最大步数: {grpo_config.max_steps}")
print(f"学习率: {grpo_config.learning_rate}")
# BUGFIX: GRPOConfig has no `kl_coef` attribute (the KL coefficient is
# `beta`); use getattr so this cell never crashes on an attribute lookup.
print(f"KL 系数: {getattr(grpo_config, 'beta', None)}")
print("=" * 60)

result = trainer.train()
print(f"\n训练完成! 总步数: {result.global_step}")

# Persist the final policy for the analysis steps below.
trainer.save_model("./grpo-qwen3-1.7b-math/final")

Step 3:训练与监控(40 分钟)

关键指标监控

在训练过程中,关注以下四个核心指标:

import json
import matplotlib.pyplot as plt

# Pull the metric history accumulated by the trainer.
log_history = trainer.state.log_history

steps = []
reward_means = []
reward_stds = []
response_lengths = []
kl_values = []

def _first_key(entry, *names, default=None):
    """Return the first metric present in `entry` among `names`."""
    for name in names:
        if name in entry:
            return entry[name]
    return default

# Metric key names vary across TRL versions ("reward"/"reward_std" in
# recent releases, "reward/mean"-style names in older ones), so probe
# several aliases for each metric.
for entry in log_history:
    reward = _first_key(entry, "reward", "reward/mean", "rewards/mean")
    if reward is not None:
        steps.append(entry.get("step", 0))
        reward_means.append(reward)
        reward_stds.append(
            _first_key(entry, "reward_std", "reward/std", "rewards/std",
                       default=0)
        )
    length = _first_key(entry, "completions/mean_length",
                        "completion_length/mean")
    if length is not None:
        response_lengths.append(length)
    if "kl" in entry:
        kl_values.append(entry["kl"])

可视化训练曲线

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Mean reward (is the model learning to solve problems?)
axes[0, 0].plot(steps[:len(reward_means)], reward_means, 'b-', linewidth=1.5)
axes[0, 0].set_title('Reward Mean (accuracy)', fontsize=12)
axes[0, 0].set_xlabel('Training Steps')
axes[0, 0].set_ylabel('Mean Reward')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].set_ylim(0, 1)

# 2. Reward standard deviation (is within-group diversity changing?)
if reward_stds:
    axes[0, 1].plot(steps[:len(reward_stds)], reward_stds, 'r-', linewidth=1.5)
    axes[0, 1].set_title('Reward Std (group diversity)', fontsize=12)
    axes[0, 1].set_xlabel('Training Steps')
    axes[0, 1].set_ylabel('Reward Std')
    axes[0, 1].grid(True, alpha=0.3)

# 3. Mean response length (is the model reasoning in more detail?)
# NOTE(review): x-axis here is the logging index, not the training step —
# lengths were collected independently of `steps`.
if response_lengths:
    axes[1, 0].plot(range(len(response_lengths)), response_lengths,
                     'g-', linewidth=1.5)
    axes[1, 0].set_title('Response Length (reasoning depth)', fontsize=12)
    axes[1, 0].set_xlabel('Training Steps')
    axes[1, 0].set_ylabel('Avg. Tokens')
    axes[1, 0].grid(True, alpha=0.3)

# 4. KL divergence (has the policy drifted too far from the reference?)
if kl_values:
    axes[1, 1].plot(range(len(kl_values)), kl_values, 'm-', linewidth=1.5)
    axes[1, 1].set_title('KL Divergence', fontsize=12)
    axes[1, 1].set_xlabel('Training Steps')
    axes[1, 1].set_ylabel('KL')
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("./grpo_training_curves.png", dpi=150)
plt.show()

print("训练曲线已保存至 grpo_training_curves.png")

指标解读

| 指标 | 正常趋势 | 异常信号 |
| --- | --- | --- |
| Reward Mean | 从 ~0.1 上升到 ~0.5+ | 不上升(学习率太小或奖励函数有 bug) |
| Reward Std | 先上升后下降 | 始终为 0(所有回复结果相同) |
| Response Length | 逐渐增加(学会推理) | 暴涨(可能在生成重复内容) |
| KL Divergence | 缓慢增加 | 快速增长 >10(策略偏离过远) |

训练时间提示:GRPO 的主要瓶颈在于生成步骤(每个提示需要采样 G=8 个完整回复),而非参数更新。如果有 vLLM,启用 use_vllm=True 可以显著加速。在 A100-80G 上,500步约需 90 分钟。

Step 4:分析涌现行为(15 分钟)

对比早期和后期的推理链

from transformers import AutoModelForCausalLM

# Compare checkpoints from different training stages.
# If intermediate checkpoints were saved:
# early_model = checkpoint from ~step 10
# late_model = checkpoint from ~step 300

# Here we analyse the final model only.
grpo_model = AutoModelForCausalLM.from_pretrained(
    "./grpo-qwen3-1.7b-math/final",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Five GSM8K-style probe questions used throughout the analysis below.
test_questions = [
    "Natalia sold clips to 48 of her friends in April, and then she sold "
    "half as many clips in May. How many clips did Natalia sell altogether "
    "in April and May?",
    "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 "
    "minutes of babysitting. How much did she earn?",
    "Betty is saving money for a new wallet which costs $100. Betty has "
    "only half of the money she needs. Her parents decided to give her $15 "
    "for that purpose, and her grandparents twice as much as her parents. "
    "How much more money does Betty need to make to buy the wallet?",
    "Julie is reading a 120-page book. Yesterday, she was able to read 12 "
    "pages and today, she read twice as many pages as yesterday. If she "
    "wants to read half of the remaining pages tomorrow, how many pages "
    "should she read?",
    "James writes a 3-page letter to 2 different friends twice a week. How "
    "many pages does he write a year?",
]

def generate_reasoning(model, tokenizer, question, max_new_tokens=1024):
    """Sample a step-by-step solution for `question` from `model`.

    Builds the same prompt format used during GRPO training, samples one
    completion (temperature 0.7, top-p 0.95), and returns only the newly
    generated text with special tokens stripped.
    """
    prompt = (
        f"Solve the following math problem step by step. "
        f"Put your final answer in \\boxed{{}}.\n\n"
        f"Problem: {question}\n\nSolution:"
    )
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
    # Drop the prompt tokens and decode only the continuation.
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)

# Generate and inspect reasoning chains from the GRPO-trained model.
print("=" * 60)
print("GRPO 模型推理链分析")
print("=" * 60)

for i, q in enumerate(test_questions):
    print(f"\n{'='*60}")
    print(f"问题 {i+1}: {q[:100]}...")
    response = generate_reasoning(grpo_model, tokenizer, q)
    print(f"\n推理过程:")
    print(response[:600])  # truncate long chains for readability
    if len(response) > 600:
        print("...")
    print(f"\n回复长度: {len(response.split())} 词")

    # Extract the predicted final answer from the chain.
    predicted = extract_answer_from_response(response)
    print(f"提取的答案: {predicted}")

涌现行为检查清单

# Scan responses for keyword markers of emergent reasoning behaviors.
# NOTE(review): the "数学符号" row contains "=", "+", "-", "/", which occur
# in nearly any math text, so that row will almost always report YES.
emergence_checklist = {
    "逐步计算": ["step", "first", "then", "next", "首先", "然后", "接下来"],
    "自我检验": ["verify", "check", "let me", "确认", "验证", "检查"],
    "回溯纠正": ["wait", "actually", "reconsider", "mistake", "不对", "重新"],
    "结构化推理": ["therefore", "so", "thus", "because", "因此", "所以"],
    "数学符号": ["\\boxed", "=", "+", "-", "*", "/"],
}

for i, q in enumerate(test_questions[:3]):
    # Fresh sample per question — generation is stochastic, so this
    # response differs from the one printed in the previous cell.
    response = generate_reasoning(grpo_model, tokenizer, q)
    print(f"\n问题 {i+1} 涌现行为分析:")
    for behavior, keywords in emergence_checklist.items():
        found = [kw for kw in keywords if kw.lower() in response.lower()]
        status = "YES" if found else "NO"
        print(f"  {behavior}: {status} {found if found else ''}")

关键观察

  • 训练早期(~10步):模型通常直接给出答案,很少有推理过程。回复较短。
  • 训练中期(~100步):开始出现简单的计算步骤,如"48 + 24 = 72"。
  • 训练后期(~300步):出现逐步推理、中间变量、可能出现自我检验。
  • 是否出现"顿悟时刻"? 注意观察训练奖励曲线是否有跳跃式上升。

注意:由于我们使用的是 1.7B 模型且训练步数有限,涌现效果可能不如 DeepSeek 论文中报告的那么显著。这正是"课堂实验"与"工业级训练"的差距所在。

Step 5:四方对比(10 分钟)

加载对比模型

# 1. Base model (untrained)
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-1.7B-Base",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# 2. GRPO-trained model (already loaded as grpo_model)

# 3. Distilled model — uses its own tokenizer/chat template.
distill_model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
distill_tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
)

# 4. Qwen3 Instruct (thinking mode)
instruct_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-1.7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
instruct_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")

在 GSM8K 测试子集上对比

# Evaluate all four models on 20 GSM8K test questions.
test_subset = gsm8k["test"].select(range(20))

results = {
    "Base": {"correct": 0, "total": 0},
    "GRPO": {"correct": 0, "total": 0},
    "Distill": {"correct": 0, "total": 0},
    "Instruct-Think": {"correct": 0, "total": 0},
}

for idx, sample in enumerate(test_subset):
    question = sample["question"]
    ground_truth = extract_gsm8k_answer(sample["answer"])

    # (name, model, tokenizer, whether to use chat-template thinking mode)
    models_and_configs = [
        ("Base", base_model, tokenizer, False),
        ("GRPO", grpo_model, tokenizer, False),
        ("Distill", distill_model, distill_tokenizer, False),
        ("Instruct-Think", instruct_model, instruct_tokenizer, True),
    ]

    for name, model, tok, use_think in models_and_configs:
        if use_think:
            # Qwen3 Instruct: route through the chat template with
            # thinking mode enabled.
            messages = [{"role": "user", "content":
                f"Solve: {question}\nPut your answer in \\boxed{{}}."
            }]
            text = tok.apply_chat_template(
                messages, tokenize=False,
                add_generation_prompt=True,
                enable_thinking=True,  # enable thinking mode
            )
            inputs = tok(text, return_tensors="pt").to(model.device)
        else:
            # Base/GRPO/Distill: plain completion-style prompt, same
            # format as used during GRPO training.
            prompt = (
                f"Solve the following math problem step by step. "
                f"Put your final answer in \\boxed{{}}.\n\n"
                f"Problem: {question}\n\nSolution:"
            )
            inputs = tok(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=0.7,
                do_sample=True,
            )
        # Decode only the newly generated tokens.
        response = tok.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )

        # Same correctness check as the reward function: numeric compare
        # with a string-equality fallback.
        predicted = extract_answer_from_response(response)
        try:
            correct = abs(float(predicted) - float(ground_truth)) < 1e-5
        except (ValueError, TypeError):
            correct = predicted.strip() == ground_truth.strip()

        results[name]["total"] += 1
        if correct:
            results[name]["correct"] += 1

    # Progress report every 5 questions.
    if (idx + 1) % 5 == 0:
        print(f"已完成 {idx + 1}/20 题")

# Print the accuracy table.
print("\n" + "=" * 60)
print("GSM8K 测试集准确率对比 (20 题)")
print("=" * 60)
print(f"{'模型':<20} {'正确数':<10} {'准确率':<10}")
print("-" * 40)
for name, r in results.items():
    acc = r["correct"] / r["total"] * 100 if r["total"] > 0 else 0
    print(f"{name:<20} {r['correct']}/{r['total']:<10} {acc:.1f}%")

预期结果范围

| 模型 | 预期准确率 | 说明 |
| --- | --- | --- |
| Base (Qwen3-1.7B-Base) | 5-15% | 基座模型,无指令跟随能力 |
| GRPO (500步训练) | 25-45% | 课堂实验规模的 GRPO |
| Distill (R1-Distill-1.5B) | 50-65% | 工业级蒸馏模型 |
| Instruct-Think (Qwen3-1.7B) | 60-75% | 完整后训练 + 思考模式 |

结果解读

  • Base → GRPO 的提升:说明 GRPO 确实有效,模型学会了推理
  • GRPO < Distill:说明蒸馏比从零训练更高效(但蒸馏需要教师模型)
  • GRPO < Instruct-Think:说明工业级后训练(更多数据、更多步数、四阶段流程)远超课堂实验
  • 但 GRPO 的方法论与工业级训练完全一致——差距在于规模而非方法

交付物清单

训练奖励曲线图

包含四个子图:奖励均值、奖励标准差、回复长度、KL 散度随训练步数的变化曲线。

5 个推理链示例

展示训练前后(或不同训练阶段)的推理链对比,突出显示涌现行为(逐步推理、自我检验等)。

GSM8K 四方对比表

Base vs GRPO vs Distill vs Qwen3 Instruct 思考模式的准确率对比表。

分析报告

1页分析报告,讨论以下内容:

  • 是否观察到"推理涌现"?具体表现是什么?
  • GRPO 训练的推理风格与蒸馏模型有何不同?
  • 与 Qwen3 Instruct 思考模式相比,差距主要在哪里?
  • 对 RLVR 方法的优势和局限的理解
  • 如果有更多资源(更大模型、更长训练),你预期结果会如何改变?

加分项

调整 G 和 KL 系数

# Ablation: vary the group size G (short runs, for comparison only).
for G in [4, 8, 16]:
    config = GRPOConfig(
        num_generations=G,
        max_steps=100,  # short run for the ablation
        # ... other parameters unchanged
    )
    # trainer = GRPOTrainer(...)
    # trainer.train()
    print(f"G={G}: 训练完成, 最终奖励均值 = ...")

格式奖励的影响

尝试加入格式奖励(鼓励使用 <think> 标签),观察是否影响推理质量:

def combined_reward_fn(completions, answer, **kwargs):
    """Correctness reward plus format bonus for each completion."""
    accuracy = math_reward_fn(completions, answer)
    bonus = format_reward_fn(completions)
    return [a + b for a, b in zip(accuracy, bonus)]