LLM 后训练实践
第5课:压缩部署与扩展

第5课 上机实验

量化实验(必做):多精度加载 Qwen3-8B 并评估压缩影响;能力扩展选做(三选一):蒸馏分析、多模态实验、工具调用实验

实验概览

本课实验分为两部分:

部分 | 内容 | 时长要求
实验 A(必做) | 量化实验——测量压缩对模型质量的影响 | ~60 分钟,全部完成
实验 B(选做) | 能力扩展——蒸馏/多模态/工具使用三选一 | ~50 分钟,完成一项

预计总计算时间:A100-40G 约 80 分钟。


实验 A:量化实验(必做)

实验目标

  • 以 FP16、INT8、INT4 三种精度加载 Qwen3-8B,对比显存占用和加载时间
  • 测量不同精度下的推理速度(tokens/sec、首 token 延迟)
  • 在 GSM8K 数学推理、指令跟随、中文任务上评估量化对质量的影响
  • 使用 LLM-as-Judge 进行系统化评分

环境准备

# Install dependencies
# pip install "transformers>=4.51.0" bitsandbytes accelerate torch datasets

import torch
import time
import json
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Verify GPU availability before running the lab.
print(f"GPU: {torch.cuda.get_device_name(0)}")
# BUG FIX: the property is `total_memory`, not `total_mem` (the original
# raised AttributeError).
print(f"GPU 显存: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Step 1:多精度加载对比(20 分钟)

定义加载函数

MODEL_NAME = "Qwen/Qwen3-8B"  # or a local path

def load_model(precision="fp16"):
    """Load Qwen3-8B at the requested precision.

    Args:
        precision: one of "fp16", "int8", "int4".

    Returns:
        (model, info) where info records the precision, wall-clock load
        time in seconds, and peak GPU memory in GB.

    Raises:
        ValueError: for an unrecognized precision string.
    """
    # Start from a clean slate so the peak-memory stat reflects this load only.
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()

    t0 = time.time()

    if precision == "fp16":
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    elif precision == "int8":
        # Plain bitsandbytes 8-bit weight quantization.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
        )
    elif precision == "int4":
        # NF4 with double quantization; compute runs in bfloat16.
        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=nf4_config,
            device_map="auto",
        )
    else:
        raise ValueError(f"不支持的精度: {precision}")

    elapsed = time.time() - t0
    peak_gb = torch.cuda.max_memory_allocated() / 1024**3

    info = {
        "precision": precision,
        "load_time_s": round(elapsed, 1),
        "memory_gb": round(peak_gb, 2),
    }
    return model, info

依次加载三种精度并记录数据

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load each precision in turn, record its stats, then free the GPU.
results = {}
for prec in ("fp16", "int8", "int4"):
    sep = "=" * 50
    print(f"\n{sep}")
    print(f"正在加载 {prec.upper()} 模型...")
    model, info = load_model(prec)
    results[prec] = info
    print(f"  加载时间: {info['load_time_s']}s")
    print(f"  显存占用: {info['memory_gb']} GB")

    # Sanity check: one short greedy generation.
    probe = tokenizer("你好,请自我介绍一下。", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**probe, max_new_tokens=50, do_sample=False)
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    print(f"  验证回复: {decoded[:100]}")

    # Release GPU memory before loading the next precision.
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Comparison table, memory normalized against FP16.
print("\n" + "=" * 60)
print(f"{'精度':<10} {'显存(GB)':<12} {'加载时间(s)':<14} {'相对FP16'}")
print("-" * 60)
fp16_mem = results["fp16"]["memory_gb"]
for prec, info in results.items():
    ratio = info["memory_gb"] / fp16_mem
    print(f"{prec.upper():<10} {info['memory_gb']:<12} {info['load_time_s']:<14} {ratio:.2f}x")

Step 2:推理速度测试(15 分钟)

# Ten fixed prompts (QA, code, translation, poetry, ...) used as the
# workload for the inference-speed benchmark below.
TEST_PROMPTS = [
    "请解释什么是量子计算?",
    "用Python写一个快速排序算法。",
    "请将以下中文翻译成英文:人工智能正在深刻改变我们的生活方式。",
    "写一首关于春天的七言绝句。",
    "请解释相对论的核心思想。",
    "什么是知识蒸馏?请用简单的语言解释。",
    "请列出机器学习的五个主要应用领域。",
    "解释什么是梯度下降法。",
    "为什么大语言模型需要后训练?",
    "请比较 Python 和 Java 的主要区别。",
]

def benchmark_inference(model, tokenizer, prompts, max_new_tokens=128):
    """Measure first-token latency and decode throughput.

    Args:
        model: causal LM exposing `.generate` and `.device`.
        tokenizer: matching tokenizer with a chat template.
        prompts: iterable of user-turn strings (must be non-empty).
        max_new_tokens: tokens generated per prompt in the timed run.

    Returns:
        dict with average first-token latency (ms), total generated
        tokens, and overall tokens/sec across all prompts.
    """
    total_tokens = 0
    total_gen_time = 0.0
    first_token_latencies = []

    for prompt in prompts:
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        input_len = inputs["input_ids"].shape[1]

        # First-token latency: time a 1-token greedy generation.
        start = time.time()
        with torch.no_grad():
            model.generate(**inputs, max_new_tokens=1, do_sample=False)
        first_token_latencies.append(time.time() - start)

        # Full generation, timed separately and accumulated.
        start = time.time()
        with torch.no_grad():
            output = model.generate(
                **inputs, max_new_tokens=max_new_tokens, do_sample=False
            )
        total_gen_time += time.time() - start
        total_tokens += output.shape[1] - input_len

    avg_first_token = sum(first_token_latencies) / len(first_token_latencies)
    # BUG FIX: the original divided by a meaningless fresh clock read
    # (`time.time() - start` after the loop) and never returned the rate.
    tokens_per_sec = total_tokens / total_gen_time if total_gen_time > 0 else 0.0

    return {
        "avg_first_token_latency_ms": round(avg_first_token * 1000, 1),
        "total_tokens": total_tokens,
        "tokens_per_sec": round(tokens_per_sec, 1),
    }

# Run the speed benchmark for each precision.
speed_results = {}
for precision in ["fp16", "int8", "int4"]:
    print(f"\n测试 {precision.upper()} 推理速度...")
    model, _ = load_model(precision)

    # Warm-up so CUDA kernels/caches are initialized before timing.
    warmup_input = tokenizer("hello", return_tensors="pt").to(model.device)
    with torch.no_grad():
        model.generate(**warmup_input, max_new_tokens=10)

    # Timed run.
    total_gen_time = 0.0
    total_gen_tokens = 0
    first_latencies = []

    for prompt in TEST_PROMPTS:
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        input_len = inputs["input_ids"].shape[1]

        # First-token latency (1-token generation).
        t0 = time.time()
        with torch.no_grad():
            model.generate(**inputs, max_new_tokens=1, do_sample=False)
        first_latencies.append(time.time() - t0)

        # Full generation. BUG FIX: time only the full generations — the
        # original's single outer timer also counted the 1-token latency
        # probes, which underestimates tokens/sec.
        t0 = time.time()
        with torch.no_grad():
            out_full = model.generate(
                **inputs, max_new_tokens=128, do_sample=False
            )
        total_gen_time += time.time() - t0
        total_gen_tokens += out_full.shape[1] - input_len

    speed_results[precision] = {
        "tokens_per_sec": round(total_gen_tokens / total_gen_time, 1),
        "avg_first_token_ms": round(
            sum(first_latencies) / len(first_latencies) * 1000, 1
        ),
    }

    print(f"  tokens/s: {speed_results[precision]['tokens_per_sec']}")
    print(f"  首token延迟: {speed_results[precision]['avg_first_token_ms']}ms")

    del model
    torch.cuda.empty_cache()
    gc.collect()

Step 3:质量评估(25 分钟)

# === Evaluation datasets ===

# 1. GSM8K-style math reasoning (5 problems, Chinese)
gsm8k_problems = [
    {
        "question": "小明有15个苹果,他给了小红3个,又买了7个。他现在有多少个苹果?",
        "answer": 19
    },
    {
        "question": "一个教室有35个学生,其中男生比女生多5人。男生有多少人?",
        "answer": 20
    },
    {
        "question": "火车以每小时120公里的速度行驶,3.5小时能走多少公里?",
        "answer": 420
    },
    {
        "question": "商店里一件衣服原价200元,打八折后又减30元,最终价格是多少?",
        "answer": 130
    },
    {
        "question": "一个长方形的长是12厘米,宽是8厘米。它的周长和面积分别是多少?",
        "answer": "周长40厘米,面积96平方厘米"
    },
]

# 2. Instruction following (5 prompts: word limits, JSON output, rewriting, ...)
instruction_prompts = [
    "请用恰好三个词概括机器学习。",
    "请以JSON格式输出以下信息:姓名张三,年龄25,职业工程师。",
    "请列出5个中国历史朝代,每个朝代用一句话描述其特点。",
    "请将以下句子改写为疑问句:大语言模型正在改变世界。",
    "请写一段不超过50字的产品描述,产品是一款智能手表。",
]

# 3. Chinese comprehension / generation (5 prompts)
chinese_prompts = [
    "请解释成语'画龙点睛'的含义,并造一个句子。",
    "请为一家新开的咖啡店写一段开业宣传语,风格温馨文艺。",
    "请分析鲁迅《狂人日记》的主题思想。",
    "请比较唐诗和宋词在风格上的主要差异。",
    "请用通俗的语言解释'内卷'这个网络用语的含义。",
]


def evaluate_model(model, tokenizer, prompts, max_new_tokens=256):
    """Generate one greedy reply per prompt.

    Args:
        model/tokenizer: a loaded HF causal LM pair.
        prompts: strings, or dicts with a "question" key (GSM8K items).
        max_new_tokens: generation budget per prompt.

    Returns:
        list of decoded responses with the prompt part stripped.
    """
    responses = []
    for prompt in prompts:
        # GSM8K items are dicts; the other task sets are raw strings.
        content = prompt["question"] if isinstance(prompt, dict) else prompt

        messages = [{"role": "user", "content": content}]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        # FIX: dropped `temperature=1.0` — it is ignored (with a warning)
        # when do_sample=False; greedy decoding is the intent here.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )
        response = tokenizer.decode(
            output[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
        )
        responses.append(response)

    return responses


# Collect responses from every precision on all three task sets.
all_responses = {}
for precision in ["fp16", "int8", "int4"]:
    print(f"\n评估 {precision.upper()} 模型...")
    model, _ = load_model(precision)

    all_responses[precision] = {
        "gsm8k": evaluate_model(model, tokenizer, gsm8k_problems),
        "instruction": evaluate_model(model, tokenizer, instruction_prompts),
        "chinese": evaluate_model(model, tokenizer, chinese_prompts),
    }

    # Free the GPU before loading the next precision.
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Persist the responses for the LLM-as-Judge scoring step.
with open("quantization_responses.json", "w", encoding="utf-8") as f:
    json.dump(all_responses, f, ensure_ascii=False, indent=2)

Step 4:LLM-as-Judge 评分

# Score with a strong judge model (e.g. Qwen3-32B via an API).
# Without API access, the locally loaded FP16 model can act as the judge.

JUDGE_PROMPT_TEMPLATE = """请你作为一个公正的评委,对以下AI助手的回复进行评分。

评分标准(1-10分):
- 准确性:回复内容是否正确
- 完整性:是否充分回答了问题
- 格式规范:是否遵循了指令要求的格式
- 语言质量:表达是否清晰、自然

用户问题:
{question}

AI回复:
{response}

请给出评分(1-10分)和简要评语。格式:
评分:X/10
评语:...
"""

def judge_responses(questions, responses_by_precision, judge_model=None):
    """Score responses with an LLM judge.

    Args:
        questions: task items (strings, or dicts with a "question" key).
        responses_by_precision: {precision: [one response per question]}.
        judge_model: locally loaded judge model. When None, no local
            scoring happens (hook an API call in instead) and the score
            lists stay empty.

    Returns:
        {precision: [int score per question]} — scores are 1-10, with a
        fallback of 5 when no score can be parsed from the judge output.

    NOTE(review): the judge path uses the module-level `tokenizer`,
    which assumes the judge shares the candidate model's tokenizer —
    confirm if the judge is a different model family.
    """
    import re  # FIX: hoisted out of the inner loop (was re-imported per response)

    scores = {p: [] for p in responses_by_precision}

    for i, question in enumerate(questions):
        q_text = question["question"] if isinstance(question, dict) else question

        for precision, responses in responses_by_precision.items():
            prompt = JUDGE_PROMPT_TEMPLATE.format(
                question=q_text,
                response=responses[i]
            )

            # Option 1: call a judge API here.
            # score = call_judge_api(prompt)

            # Option 2: use a locally loaded judge model.
            if judge_model:
                messages = [{"role": "user", "content": prompt}]
                text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                inputs = tokenizer(text, return_tensors="pt").to(judge_model.device)
                with torch.no_grad():
                    output = judge_model.generate(
                        **inputs, max_new_tokens=200, do_sample=False
                    )
                judge_response = tokenizer.decode(
                    output[0][inputs["input_ids"].shape[1]:],
                    skip_special_tokens=True
                )
                # Extract "评分:X" (full- or half-width colon); default 5
                # when the judge output does not follow the format.
                match = re.search(r'评分[::]\s*(\d+)', judge_response)
                score = int(match.group(1)) if match else 5
                scores[precision].append(score)

    return scores

# Run the judging pass (using the FP16 model as the judge).
# NOTE(review): FP16's own outputs are scored by the same FP16 model
# (self-judging) — a stronger external judge would reduce bias.
print("加载 Judge 模型(FP16)...")
judge_model, _ = load_model("fp16")

for task_name, questions in [
    ("gsm8k", gsm8k_problems),
    ("instruction", instruction_prompts),
    ("chinese", chinese_prompts),
]:
    # Regroup as {precision: [responses for this task]}.
    task_responses = {p: all_responses[p][task_name] for p in all_responses}
    scores = judge_responses(questions, task_responses, judge_model)

    print(f"\n{task_name} 评分结果:")
    print(f"{'精度':<10} {'平均分':<10} {'各题分数'}")
    for p, s in scores.items():
        avg = sum(s) / len(s) if s else 0  # empty when scoring was skipped
        print(f"{p.upper():<10} {avg:<10.1f} {s}")

结果记录模板

# Aggregate the Lab A measurements into one summary table
# (expects `results` from Step 1 and `speed_results` from Step 2).
print("\n" + "="*70)
print("量化实验结果汇总")
print("="*70)
print(f"\n{'指标':<20} {'FP16':<15} {'INT8':<15} {'INT4':<15}")
print("-"*65)
print(f"{'显存 (GB)':<20} {results['fp16']['memory_gb']:<15} "
      f"{results['int8']['memory_gb']:<15} {results['int4']['memory_gb']:<15}")
print(f"{'加载时间 (s)':<20} {results['fp16']['load_time_s']:<15} "
      f"{results['int8']['load_time_s']:<15} {results['int4']['load_time_s']:<15}")
print(f"{'tokens/s':<20} {speed_results['fp16']['tokens_per_sec']:<15} "
      f"{speed_results['int8']['tokens_per_sec']:<15} "
      f"{speed_results['int4']['tokens_per_sec']:<15}")
print(f"{'首token延迟 (ms)':<20} {speed_results['fp16']['avg_first_token_ms']:<15} "
      f"{speed_results['int8']['avg_first_token_ms']:<15} "
      f"{speed_results['int4']['avg_first_token_ms']:<15}")

实验 B:能力扩展选做(三选一)

选项 1:蒸馏模型分析

目标:分析 DeepSeek-R1-Distill-Qwen-1.5B 的推理能力和推理链风格。

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Step 1: load the distilled model ===
distill_model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
distill_model = AutoModelForCausalLM.from_pretrained(
    distill_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
distill_tokenizer = AutoTokenizer.from_pretrained(distill_model_name)

print(f"蒸馏模型参数量: {sum(p.numel() for p in distill_model.parameters()) / 1e9:.2f}B")
# NOTE(review): max_memory_allocated() reports the process-wide peak — it
# also counts anything allocated earlier in this session.
print(f"显存占用: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")


# === Step 2: GSM8K-style evaluation ===
gsm8k_test = [
    {"question": "一个商店卖苹果,每斤5元。小明买了3斤苹果,付了20元,应找回多少钱?", "answer": 5},
    {"question": "一列火车有12节车厢,每节车厢坐60人。如果火车满员,共有多少乘客?", "answer": 720},
    {"question": "甲、乙两人合作完成一项工作,甲单独做需10天,乙单独做需15天。两人合作几天能完成?", "answer": 6},
    {"question": "一个正方形的周长是24厘米,它的面积是多少平方厘米?", "answer": 36},
    {"question": "小红有一些糖果,她给了小明8颗后还剩12颗。她原来有多少颗糖果?", "answer": 20},
    {"question": "3个箱子共有45个球,第一个箱子比第二个多5个,第二个比第三个多5个。每个箱子各有多少球?", "answer": "20,15,10"},
    {"question": "一本书共300页,小明第一天读了全书的1/5,第二天读了剩下的1/4。第二天读了多少页?", "answer": 60},
    {"question": "把一根绳子对折3次后剪一刀,绳子被剪成几段?", "answer": 9},
    {"question": "一个数的3倍减去7等于20,这个数是多少?", "answer": 9},
    # BUG FIX: path area is (8+2)*(5+2) - 8*5 = 70 - 40 = 30, not 32.
    {"question": "长方形花坛长8米,宽5米。沿花坛外侧修一条1米宽的小路,小路面积是多少?", "answer": 30},
]

correct = 0
for i, problem in enumerate(gsm8k_test):
    messages = [{"role": "user", "content": problem["question"]}]
    text = distill_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = distill_tokenizer(text, return_tensors="pt").to(distill_model.device)

    # FIX: dropped `temperature=1.0` — it is ignored (with a warning)
    # when do_sample=False.
    with torch.no_grad():
        output = distill_model.generate(
            **inputs, max_new_tokens=1024, do_sample=False
        )
    response = distill_tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    print(f"\n问题 {i+1}: {problem['question']}")
    print(f"回复: {response[:300]}...")
    print(f"标准答案: {problem['answer']}")

    # Crude check: substring match against the reference answer.
    # Single-digit answers (e.g. 9) match spuriously — hence the
    # manual-verification caveat below.
    if str(problem["answer"]) in response:
        correct += 1
        print("✓ 正确")
    else:
        print("✗ 可能错误(需人工验证)")

print(f"\n准确率: {correct}/{len(gsm8k_test)} = {correct/len(gsm8k_test)*100:.1f}%")


# === Step 3: analyze the reasoning-chain style ===
print("\n" + "="*60)
print("推理链风格分析")
print("="*60)

# A multi-step word problem that forces an extended chain of thought.
analysis_prompt = "一个工厂第一天生产了120件产品,第二天比第一天多生产20%,第三天生产的数量是前两天总和的一半。三天共生产了多少件产品?"

messages = [{"role": "user", "content": analysis_prompt}]
text = distill_tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = distill_tokenizer(text, return_tensors="pt").to(distill_model.device)

with torch.no_grad():
    output = distill_model.generate(
        **inputs, max_new_tokens=2048, do_sample=False
    )
full_response = distill_tokenizer.decode(
    output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)

print(f"完整推理链:\n{full_response}")
print(f"\n推理链长度: {len(full_response)} 字符")

# Keyword heuristics for reasoning-chain traits (rough indicators only).
# NOTE(review): skip_special_tokens=True may strip <think> markers if they
# are special tokens in this tokenizer — confirm before relying on that flag.
features = {
    "有逐步计算": "步" in full_response or "step" in full_response.lower(),
    "有自我验证": "验证" in full_response or "检查" in full_response,
    "有think标签": "<think>" in full_response,
    "有回溯修正": "不对" in full_response or "重新" in full_response,
}
print("\n推理链特征:")
for feat, present in features.items():
    print(f"  {feat}: {'是' if present else '否'}")

与第4课 GRPO 模型对比:如果你在第4课完成了 GRPO 训练实验,请对比蒸馏模型和 GRPO 模型的推理链风格。蒸馏模型通常更"流畅"但可能不够"深入",而 GRPO 模型可能出现更多"探索性"推理。

选项 2:迷你多模态实验

目标:使用 Qwen3-VL-2B-Instruct 测试视觉理解能力并评估幻觉率。

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from PIL import Image
import requests
from io import BytesIO

# === Step 1: load the VLM ===
# FIX: the original used Qwen2_5_VLForConditionalGeneration, which does
# not match a Qwen3-VL checkpoint; the Auto class resolves the correct
# architecture from the model config.
vl_model_name = "Qwen/Qwen3-VL-2B-Instruct"
vl_model = AutoModelForImageTextToText.from_pretrained(
    vl_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(vl_model_name)

print(f"VLM 模型已加载,显存: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")


# === Step 2: prepare test images ===
# Publicly hosted test image URLs.
test_images = [
    {
        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png",
        "description": "带透明背景的骰子图片",
        "questions": [
            "请描述图片中的内容。",
            "图片中有几个骰子?",
            "图片中有猫吗?",  # hallucination probe
        ]
    },
    # Add more test images here...
]

# Local image paths also work.
def load_image(image_source):
    """Load an image from a URL or local path, normalized to RGB.

    Raises:
        requests.HTTPError: on a non-2xx download response.
    """
    if image_source.startswith("http"):
        # FIX: bounded timeout + explicit status check, instead of
        # hanging indefinitely or trying to decode an error page.
        response = requests.get(image_source, timeout=30)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    return Image.open(image_source).convert("RGB")


# === Step 3: visual question answering ===
def vqa_test(image, question):
    """Ask `question` about `image` using the global vl_model/processor.

    Args:
        image: PIL.Image (RGB).
        question: question text.

    Returns:
        The decoded reply, with a best-effort strip of the prompt echo.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[text], images=[image], return_tensors="pt"
    ).to(vl_model.device)

    with torch.no_grad():
        output = vl_model.generate(**inputs, max_new_tokens=256, do_sample=False)

    response = processor.decode(output[0], skip_special_tokens=True)
    # Keep only the text after the last "assistant" marker.
    # NOTE(review): brittle if the reply itself contains "assistant" —
    # decoding only the generated suffix (as in Lab A) would be safer.
    if "assistant" in response:
        response = response.split("assistant")[-1].strip()
    return response


# === Step 4: run the VQA tests ===
print("\n视觉问答测试结果:")
print("=" * 60)

for entry in test_images:
    try:
        img = load_image(entry["url"])
        print(f"\n图片: {entry['description']}")

        # Ask every prepared question about this image.
        for q in entry["questions"]:
            answer = vqa_test(img, q)
            print(f"\n  Q: {q}")
            print(f"  A: {answer[:200]}")
    except Exception as e:
        # Skip images that fail to download or decode.
        print(f"加载图片失败: {e}")
        continue


# === Step 5: hallucination test ===
print("\n" + "="*60)
print("幻觉测试")
print("="*60)

# Questions about content that is absent from the image — a well-behaved
# model should refuse rather than invent details.
hallucination_questions = [
    "图片中的文字写了什么?",      # when the image has no text
    "图片中的人在做什么?",        # when the image has no people
    "图片中有几辆汽车?",          # when the image has no cars
    "图片背景中的建筑是什么风格?",  # when there are no buildings
    "图片中动物的品种是什么?",      # when there are no animals
]

hallucination_count = 0
total_tests = 0

for img_info in test_images:
    try:
        image = load_image(img_info["url"])
        for question in hallucination_questions:
            response = vqa_test(image, question)
            total_tests += 1

            # Refusal heuristic: a reply counts as hallucinated unless it
            # contains a denial keyword.
            # NOTE(review): "not" substring-matches many English words
            # ("nothing", "note", ...) — expect some false "refusals".
            refusal_keywords = ["没有", "不存在", "看不到", "无法", "图中没有", "not"]
            is_refusal = any(kw in response for kw in refusal_keywords)

            if not is_refusal:
                hallucination_count += 1
                status = "⚠️ 可能幻觉"
            else:
                status = "✓ 正确拒绝"

            print(f"  Q: {question}")
            print(f"  A: {response[:150]}")
            print(f"  状态: {status}")
    except Exception as e:
        # Best-effort: skip images that fail to load/answer.
        # NOTE(review): failures are swallowed silently here, unlike Step 4
        # which prints the error.
        continue

if total_tests > 0:
    print(f"\n幻觉率: {hallucination_count}/{total_tests} "
          f"= {hallucination_count/total_tests*100:.1f}%")

选项 3:迷你工具使用实验

目标:测试 Qwen3-1.7B 的函数调用能力,构建简单的智能体循环。

import torch
import json
import re
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Step 1: load the model ===
model_name = "Qwen/Qwen3-1.7B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# === Step 2: tool definitions (OpenAI-style function schemas) ===
# Passed to apply_chat_template(tools=...) so the model can emit tool calls.
tools = [
    {
        "type": "function",
        "function": {
            "name": "calculator",
            "description": "一个简单的计算器,可以执行数学运算",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": "数学表达式,如 '2 + 3 * 4'"
                    }
                },
                "required": ["expression"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "获取指定城市的天气信息",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "城市名称"
                    }
                },
                "required": ["city"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "search_knowledge",
            "description": "搜索知识库获取信息",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "搜索关键词"
                    }
                },
                "required": ["query"]
            }
        }
    },
]


# === Step 3: simulated tool execution ===
def execute_tool(name, arguments):
    """Execute a (mock) tool call and return a JSON string.

    Args:
        name: tool name — "calculator", "get_weather" or "search_knowledge".
        arguments: arguments dict parsed from the model's tool call.

    Returns:
        JSON-encoded result, or {"error": ...} on failure/unknown tool.
    """
    if name == "calculator":
        expr = arguments.get("expression", "")
        # SECURITY FIX: eval() with an empty __builtins__ is NOT a
        # sandbox — it can be escaped via attribute chains. Accept only
        # plain arithmetic characters before evaluating.
        if not re.fullmatch(r"[\d\s+\-*/().%]+", expr):
            return json.dumps({"error": "invalid expression"})
        try:
            result = eval(expr, {"__builtins__": {}}, {})
            return json.dumps({"result": result})
        except Exception as e:
            return json.dumps({"error": str(e)})

    elif name == "get_weather":
        city = arguments.get("city", "未知")
        # Canned weather data; unknown cities fall back to a default reading.
        mock_weather = {
            "北京": {"temp": 22, "condition": "晴", "humidity": 35},
            "上海": {"temp": 26, "condition": "多云", "humidity": 65},
            "深圳": {"temp": 30, "condition": "阵雨", "humidity": 80},
        }
        weather = mock_weather.get(city, {"temp": 20, "condition": "晴", "humidity": 50})
        return json.dumps(weather, ensure_ascii=False)

    elif name == "search_knowledge":
        query = arguments.get("query", "")
        # Tiny mock knowledge base, matched by key substring in the query.
        knowledge = {
            "LoRA": "LoRA是一种参数高效微调方法,通过低秩分解减少可训练参数。",
            "量化": "量化将模型权重从高精度压缩到低精度,减少显存占用。",
            "DPO": "DPO是直接偏好优化方法,无需奖励模型即可进行偏好对齐。",
        }
        for key, value in knowledge.items():
            if key in query:
                return json.dumps({"result": value}, ensure_ascii=False)
        return json.dumps({"result": "未找到相关信息"}, ensure_ascii=False)

    return json.dumps({"error": "未知工具"})


# === Step 4: agent loop ===
def agent_loop(user_query, max_turns=3):
    """Run a simple tool-using agent loop against the global `model`.

    Generates a reply; when the reply contains a <tool_call> block, the
    (mock) tool is executed and its result fed back, for at most
    `max_turns` rounds. Returns the final text reply, or "达到最大轮数"
    if the model keeps calling tools until the turn budget runs out.
    """
    messages = [
        {"role": "system", "content": "你是一个有用的助手,可以使用工具来回答问题。"},
        {"role": "user", "content": user_query},
    ]

    print(f"\n用户: {user_query}")

    for turn in range(max_turns):
        # Generate an assistant turn with the tool schemas in context.
        text = tokenizer.apply_chat_template(
            messages, tools=tools, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            output = model.generate(
                **inputs, max_new_tokens=512, do_sample=False
            )
        response_text = tokenizer.decode(
            output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )

        print(f"\n助手 (Turn {turn+1}): {response_text[:300]}")

        # Look for a Qwen-style <tool_call>{...}</tool_call> block.
        # NOTE(review): only the first tool call per turn is handled.
        tool_call_match = re.search(
            r'<tool_call>\s*(\{.*?\})\s*</tool_call>',
            response_text, re.DOTALL
        )

        if tool_call_match:
            try:
                tool_call = json.loads(tool_call_match.group(1))
                tool_name = tool_call.get("name", "")
                tool_args = tool_call.get("arguments", {})

                print(f"  → 调用工具: {tool_name}({json.dumps(tool_args, ensure_ascii=False)})")

                # Execute the mock tool.
                result = execute_tool(tool_name, tool_args)
                print(f"  ← 工具返回: {result}")

                # Append the raw assistant turn and the tool result to history.
                messages.append({"role": "assistant", "content": response_text})
                messages.append({
                    "role": "tool",
                    "name": tool_name,
                    "content": result
                })
            except json.JSONDecodeError:
                print("  ⚠️ 工具调用解析失败")
                break
        else:
            # No tool call — treat this as the final answer.
            print(f"\n最终回复: {response_text}")
            return response_text

    return "达到最大轮数"


# === Step 5: test scenarios ===
test_queries = [
    "请帮我计算 (15 + 27) * 3 - 40 的结果。",
    "北京今天天气怎么样?",
    "帮我查一下 LoRA 是什么?",
    "上海的温度是多少?如果温度超过25度请提醒我带伞。",
    "请帮我计算一个圆的面积,半径是5厘米。(使用3.14159计算)",
]

correct = 0
total = len(test_queries)  # NOTE(review): correct/total are never updated below

for query in test_queries:
    print("\n" + "="*60)
    result = agent_loop(query)
    # Manual grading hook: the prompt is printed but no input is read —
    # record your judgment by hand (or automate the check).
    print(f"\n请评估此回复是否正确 (y/n): ", end="")
    # Record the outcome here in a real run.


# === Step 6: comparison with a smaller model (optional) ===
print("\n\n" + "="*60)
print("与 Qwen3-0.6B 对比测试")
print("="*60)

# Load the 0.6B model.
small_model_name = "Qwen/Qwen3-0.6B"
small_model = AutoModelForCausalLM.from_pretrained(
    small_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
small_tokenizer = AutoTokenizer.from_pretrained(small_model_name)

# Re-run the same scenarios (single turn only — no tool execution here,
# unlike agent_loop above).
for query in test_queries[:3]:  # first 3 only
    print(f"\n查询: {query}")

    messages = [
        {"role": "system", "content": "你是一个有用的助手,可以使用工具来回答问题。"},
        {"role": "user", "content": query},
    ]
    text = small_tokenizer.apply_chat_template(
        messages, tools=tools, tokenize=False, add_generation_prompt=True
    )
    inputs = small_tokenizer(text, return_tensors="pt").to(small_model.device)

    with torch.no_grad():
        output = small_model.generate(
            **inputs, max_new_tokens=512, do_sample=False
        )
    response = small_tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    print(f"Qwen3-0.6B 回复: {response[:200]}")

模型规模对工具使用的影响:实验中你可能会发现,1.7B 模型的函数调用准确率显著高于 0.6B 模型。工具使用需要模型同时具备理解用户意图、选择合适工具、生成正确参数格式三项能力,这对小模型来说是较大挑战。


交付物清单

完成本次实验后,请提交以下内容:

量化实验报告(必做)

  • 三种精度的显存占用对比表
  • 推理速度对比表(tokens/s、首 token 延迟)
  • 三类任务的质量评分对比表(GSM8K、指令跟随、中文任务)
  • "压缩率 vs 质量保持率"的关系图
  • 简要分析:量化对哪类任务影响最大?为什么?

选做实验报告(三选一)

  • 选项 1:蒸馏模型的推理链分析、与 GRPO 模型的风格对比
  • 选项 2:VLM 的视觉问答结果、幻觉率统计
  • 选项 3:函数调用准确率、智能体循环运行示例

总结反思(1 页)

  • 后训练各环节(SFT → DPO → GRPO → 量化/蒸馏)的关系与选择策略
  • 对于一个实际的 LLM 应用项目,你会如何选择后训练技术组合?