第5课:压缩部署与扩展
第5课 上机实验
量化实验(必做):多精度加载 Qwen3-8B 并评估压缩影响;能力扩展选做(三选一):蒸馏分析、多模态实验、工具调用实验
实验概览
本课实验分为两部分:
| 部分 | 内容 | 时长 | 要求 |
|---|---|---|---|
| 实验 A(必做) | 量化实验——测量压缩对模型质量的影响 | ~60 分钟 | 全部完成 |
| 实验 B(选做) | 能力扩展——蒸馏/多模态/工具使用三选一 | ~50 分钟 | 完成一项 |
预计总时长:实验 A 约 60 分钟 + 实验 B 约 50 分钟,A100-40G 上合计约 110 分钟(其中纯计算时间约 80 分钟)。
实验 A:量化实验(必做)
实验目标
- 以 FP16、INT8、INT4 三种精度加载 Qwen3-8B,对比显存占用和加载时间
- 测量不同精度下的推理速度(tokens/sec、首 token 延迟)
- 在 GSM8K 数学推理、指令跟随、中文任务上评估量化对质量的影响
- 使用 LLM-as-Judge 进行系统化评分
环境准备
# 安装依赖
# pip install "transformers>=4.51.0" bitsandbytes accelerate torch datasets
import torch
import time
import json
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# 验证 GPU
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU 显存: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB")Step 1:多精度加载对比(20 分钟)
定义加载函数
# Checkpoint to quantize: Hugging Face hub id, or a local path.
MODEL_NAME = "Qwen/Qwen3-8B"  # or a local path

def load_model(precision="fp16"):
    """Load MODEL_NAME at the requested precision.

    Args:
        precision: "fp16" (half precision), "int8" (bitsandbytes LLM.int8),
            or "int4" (bitsandbytes NF4 with double quantization).

    Returns:
        (model, info) where info records the precision, load time in
        seconds, and peak CUDA memory in GB.

    Raises:
        ValueError: for an unrecognized precision string.
    """
    # Reset CUDA caches/stats so the peak-memory reading below reflects
    # only this load, not whatever ran before.
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    if precision == "fp16":
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    elif precision == "int8":
        config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=config,
            device_map="auto",
        )
    elif precision == "int4":
        # NF4 quant type + double quantization; bf16 compute dtype for the
        # dequantized matmuls.
        config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=config,
            device_map="auto",
        )
    else:
        raise ValueError(f"不支持的精度: {precision}")
    load_time = time.time() - start_time
    # Peak allocation during loading approximates the model's footprint.
    memory_gb = torch.cuda.max_memory_allocated() / 1024**3
    return model, {
        "precision": precision,
        "load_time_s": round(load_time, 1),
        "memory_gb": round(memory_gb, 2),
    }
依次加载三种精度并记录数据
# The tokenizer is precision-independent; load it once and share it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
results = {}
# Load each precision in turn, record its stats, and smoke-test generation.
for precision in ["fp16", "int8", "int4"]:
    print(f"\n{'='*50}")
    print(f"正在加载 {precision.upper()} 模型...")
    model, info = load_model(precision)
    results[precision] = info
    print(f" 加载时间: {info['load_time_s']}s")
    print(f" 显存占用: {info['memory_gb']} GB")
    # Quick sanity check: generate one short greedy reply.
    test_input = tokenizer("你好,请自我介绍一下。", return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**test_input, max_new_tokens=50, do_sample=False)
    print(f" 验证回复: {tokenizer.decode(output[0], skip_special_tokens=True)[:100]}")
    # Release GPU memory before loading the next precision.
    del model
    torch.cuda.empty_cache()
    gc.collect()
# Comparison table, with memory shown relative to the FP16 baseline.
print("\n" + "="*60)
print(f"{'精度':<10} {'显存(GB)':<12} {'加载时间(s)':<14} {'相对FP16'}")
print("-"*60)
fp16_mem = results['fp16']['memory_gb']
for p, info in results.items():
    ratio = info['memory_gb'] / fp16_mem
    print(f"{p.upper():<10} {info['memory_gb']:<12} {info['load_time_s']:<14} {ratio:.2f}x")
Step 2:推理速度测试(15 分钟)
# Ten Chinese prompts for the throughput benchmark — a mix of QA, coding,
# translation, and creative-writing tasks.
TEST_PROMPTS = [
    "请解释什么是量子计算?",
    "用Python写一个快速排序算法。",
    "请将以下中文翻译成英文:人工智能正在深刻改变我们的生活方式。",
    "写一首关于春天的七言绝句。",
    "请解释相对论的核心思想。",
    "什么是知识蒸馏?请用简单的语言解释。",
    "请列出机器学习的五个主要应用领域。",
    "解释什么是梯度下降法。",
    "为什么大语言模型需要后训练?",
    "请比较 Python 和 Java 的主要区别。",
]
def benchmark_inference(model, tokenizer, prompts, max_new_tokens=128):
    """Measure first-token latency and decoding throughput.

    NOTE(review): Step 2 of the lab re-implements this inline; kept here as
    a reusable helper.

    Args:
        model: causal LM exposing `.generate()` and `.device`.
        tokenizer: matching tokenizer with a chat template.
        prompts: iterable of user-message strings.
        max_new_tokens: cap for the full-generation timing pass.

    Returns:
        dict with `avg_first_token_latency_ms`, `total_tokens`, and
        `tokens_per_sec` (generated tokens / summed generation time).
    """
    total_tokens = 0
    total_gen_time = 0.0
    first_token_latencies = []
    for prompt in prompts:
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        input_len = inputs["input_ids"].shape[1]
        # First-token latency: time a single-token greedy generation.
        start = time.time()
        with torch.no_grad():
            model.generate(**inputs, max_new_tokens=1, do_sample=False)
        first_token_latencies.append(time.time() - start)
        # Full-generation throughput pass.
        start = time.time()
        with torch.no_grad():
            output = model.generate(
                **inputs, max_new_tokens=max_new_tokens, do_sample=False
            )
        # BUG FIX: accumulate the per-prompt generation time. The original
        # divided by `sum([time.time() - start for _ in range(1)])` — i.e.
        # only the last prompt's elapsed time — and then discarded the
        # resulting tokens/sec without returning it.
        total_gen_time += time.time() - start
        total_tokens += output.shape[1] - input_len
    n = len(first_token_latencies)
    avg_first_token = sum(first_token_latencies) / n if n else 0.0
    tokens_per_sec = total_tokens / total_gen_time if total_gen_time > 0 else 0.0
    return {
        "avg_first_token_latency_ms": round(avg_first_token * 1000, 1),
        "total_tokens": total_tokens,
        "tokens_per_sec": round(tokens_per_sec, 1),
    }
# Run the speed benchmark for each precision.
speed_results = {}
for precision in ["fp16", "int8", "int4"]:
    print(f"\n测试 {precision.upper()} 推理速度...")
    model, _ = load_model(precision)
    # Warm-up: trigger CUDA kernel compilation/caching before timing.
    warmup_input = tokenizer("hello", return_tensors="pt").to(model.device)
    with torch.no_grad():
        model.generate(**warmup_input, max_new_tokens=10)
    # Timed run over all benchmark prompts.
    start_total = time.time()
    total_gen_tokens = 0
    first_latencies = []
    for prompt in TEST_PROMPTS:
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        input_len = inputs["input_ids"].shape[1]
        # First-token latency: a single-token generation.
        t0 = time.time()
        with torch.no_grad():
            out1 = model.generate(**inputs, max_new_tokens=1, do_sample=False)
        first_latencies.append(time.time() - t0)
        # Full generation (128 new tokens, greedy).
        with torch.no_grad():
            out_full = model.generate(
                **inputs, max_new_tokens=128, do_sample=False
            )
        total_gen_tokens += out_full.shape[1] - input_len
    # NOTE(review): total_time also includes the 1-token latency probes, so
    # tokens/s is slightly underestimated.
    total_time = time.time() - start_total
    speed_results[precision] = {
        "tokens_per_sec": round(total_gen_tokens / total_time, 1),
        "avg_first_token_ms": round(
            sum(first_latencies) / len(first_latencies) * 1000, 1
        ),
    }
    print(f" tokens/s: {speed_results[precision]['tokens_per_sec']}")
    print(f" 首token延迟: {speed_results[precision]['avg_first_token_ms']}ms")
    # Release the model before loading the next precision.
    del model
    torch.cuda.empty_cache()
    gc.collect()
Step 3:质量评估(25 分钟)
# === Evaluation datasets ===
# 1. GSM8K-style math word problems (5 items, with reference answers).
gsm8k_problems = [
    {
        "question": "小明有15个苹果,他给了小红3个,又买了7个。他现在有多少个苹果?",
        "answer": 19
    },
    {
        "question": "一个教室有35个学生,其中男生比女生多5人。男生有多少人?",
        "answer": 20
    },
    {
        "question": "火车以每小时120公里的速度行驶,3.5小时能走多少公里?",
        "answer": 420
    },
    {
        "question": "商店里一件衣服原价200元,打八折后又减30元,最终价格是多少?",
        "answer": 130
    },
    {
        "question": "一个长方形的长是12厘米,宽是8厘米。它的周长和面积分别是多少?",
        "answer": "周长40厘米,面积96平方厘米"
    },
]
# 2. Instruction-following prompts (5 items; formats/constraints to obey).
instruction_prompts = [
    "请用恰好三个词概括机器学习。",
    "请以JSON格式输出以下信息:姓名张三,年龄25,职业工程师。",
    "请列出5个中国历史朝代,每个朝代用一句话描述其特点。",
    "请将以下句子改写为疑问句:大语言模型正在改变世界。",
    "请写一段不超过50字的产品描述,产品是一款智能手表。",
]
# 3. Chinese comprehension / generation prompts (5 items).
chinese_prompts = [
    "请解释成语'画龙点睛'的含义,并造一个句子。",
    "请为一家新开的咖啡店写一段开业宣传语,风格温馨文艺。",
    "请分析鲁迅《狂人日记》的主题思想。",
    "请比较唐诗和宋词在风格上的主要差异。",
    "请用通俗的语言解释'内卷'这个网络用语的含义。",
]
def evaluate_model(model, tokenizer, prompts, max_new_tokens=256):
    """Generate one greedy response per prompt.

    Args:
        model / tokenizer: chat model pair.
        prompts: list of strings, or dicts with a "question" key
            (the GSM8K items above).
        max_new_tokens: per-prompt generation cap.

    Returns:
        list[str] of decoded responses with the prompt tokens stripped.
    """
    responses = []
    for prompt in prompts:
        # GSM8K items are dicts; the other task sets are plain strings.
        content = prompt["question"] if isinstance(prompt, dict) else prompt
        messages = [{"role": "user", "content": content}]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        with torch.no_grad():
            # FIX: dropped `temperature=1.0` — transformers ignores it (and
            # emits a warning) when do_sample=False.
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )
        response = tokenizer.decode(
            output[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )
        responses.append(response)
    return responses
# Collect responses from every precision on all three task sets.
all_responses = {}
for precision in ["fp16", "int8", "int4"]:
    print(f"\n评估 {precision.upper()} 模型...")
    model, _ = load_model(precision)
    all_responses[precision] = {
        "gsm8k": evaluate_model(model, tokenizer, gsm8k_problems),
        "instruction": evaluate_model(model, tokenizer, instruction_prompts),
        "chinese": evaluate_model(model, tokenizer, chinese_prompts),
    }
    del model
    torch.cuda.empty_cache()
    gc.collect()
# Persist the responses for the LLM-as-Judge scoring step below.
with open("quantization_responses.json", "w", encoding="utf-8") as f:
    json.dump(all_responses, f, ensure_ascii=False, indent=2)
Step 4:LLM-as-Judge 评分
# Use a strong model (e.g. Qwen3-32B via API) as the judge.
# Without API access, the locally loaded FP16 model can serve as the judge.
JUDGE_PROMPT_TEMPLATE = """请你作为一个公正的评委,对以下AI助手的回复进行评分。
评分标准(1-10分):
- 准确性:回复内容是否正确
- 完整性:是否充分回答了问题
- 格式规范:是否遵循了指令要求的格式
- 语言质量:表达是否清晰、自然
用户问题:
{question}
AI回复:
{response}
请给出评分(1-10分)和简要评语。格式:
评分:X/10
评语:...
"""
def judge_responses(questions, responses_by_precision, judge_model=None):
    """Score each precision's responses with an LLM judge.

    Uses the module-level `tokenizer` and `JUDGE_PROMPT_TEMPLATE`.

    Args:
        questions: list of prompt strings or {"question": ...} dicts.
        responses_by_precision: {precision: [response, ...]} aligned with
            `questions`.
        judge_model: local judge model. When None, no scoring is performed
            and every precision maps to an empty list.

    Returns:
        {precision: [int score per question]}; a score defaults to 5 when
        the judge reply contains no parseable "评分: X" line.
    """
    import re  # FIX: hoisted out of the innermost loop

    scores = {p: [] for p in responses_by_precision}
    if judge_model is None:
        # BUG FIX: the original fell through to `scores[...].append(score)`
        # with `score` undefined when no judge model was supplied.
        return scores
    for i, question in enumerate(questions):
        q_text = question["question"] if isinstance(question, dict) else question
        for precision, responses in responses_by_precision.items():
            prompt = JUDGE_PROMPT_TEMPLATE.format(
                question=q_text,
                response=responses[i]
            )
            # (Alternative: call a judge API here instead of a local model.)
            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            inputs = tokenizer(text, return_tensors="pt").to(judge_model.device)
            with torch.no_grad():
                output = judge_model.generate(
                    **inputs, max_new_tokens=200, do_sample=False
                )
            judge_response = tokenizer.decode(
                output[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True
            )
            # Extract "评分:X" (accepts full- or half-width colon); fall
            # back to a neutral 5 when the judge did not follow the format.
            match = re.search(r'评分[::]\s*(\d+)', judge_response)
            score = int(match.group(1)) if match else 5
            scores[precision].append(score)
    return scores
# Run the evaluation using the FP16 model as the judge.
# NOTE(review): the FP16 model judging its own outputs is self-preferential;
# use a stronger external judge when available.
print("加载 Judge 模型(FP16)...")
judge_model, _ = load_model("fp16")
for task_name, questions in [
    ("gsm8k", gsm8k_problems),
    ("instruction", instruction_prompts),
    ("chinese", chinese_prompts),
]:
    # Regroup stored responses as {precision: [responses for this task]}.
    task_responses = {p: all_responses[p][task_name] for p in all_responses}
    scores = judge_responses(questions, task_responses, judge_model)
    print(f"\n{task_name} 评分结果:")
    print(f"{'精度':<10} {'平均分':<10} {'各题分数'}")
    for p, s in scores.items():
        avg = sum(s) / len(s) if s else 0
        print(f"{p.upper():<10} {avg:<10.1f} {s}")
结果记录模板
# Aggregate every measurement into one summary table.
print("\n" + "="*70)
print("量化实验结果汇总")
print("="*70)
print(f"\n{'指标':<20} {'FP16':<15} {'INT8':<15} {'INT4':<15}")
print("-"*65)
print(f"{'显存 (GB)':<20} {results['fp16']['memory_gb']:<15} "
      f"{results['int8']['memory_gb']:<15} {results['int4']['memory_gb']:<15}")
print(f"{'加载时间 (s)':<20} {results['fp16']['load_time_s']:<15} "
      f"{results['int8']['load_time_s']:<15} {results['int4']['load_time_s']:<15}")
print(f"{'tokens/s':<20} {speed_results['fp16']['tokens_per_sec']:<15} "
      f"{speed_results['int8']['tokens_per_sec']:<15} "
      f"{speed_results['int4']['tokens_per_sec']:<15}")
print(f"{'首token延迟 (ms)':<20} {speed_results['fp16']['avg_first_token_ms']:<15} "
      f"{speed_results['int8']['avg_first_token_ms']:<15} "
      f"{speed_results['int4']['avg_first_token_ms']:<15}")
实验 B:能力扩展选做(三选一)
选项 1:蒸馏模型分析
目标:分析 DeepSeek-R1-Distill-Qwen-1.5B 的推理能力和推理链风格。
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# === Step 1: load the distilled model ===
distill_model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
distill_model = AutoModelForCausalLM.from_pretrained(
    distill_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
distill_tokenizer = AutoTokenizer.from_pretrained(distill_model_name)
print(f"蒸馏模型参数量: {sum(p.numel() for p in distill_model.parameters()) / 1e9:.2f}B")
# NOTE(review): max_memory_allocated reports the peak since the last reset,
# not just this model's footprint, if anything ran earlier in the process.
print(f"显存占用: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
# === Step 2: GSM8K-style evaluation set (10 problems with answer keys) ===
gsm8k_test = [
    {"question": "一个商店卖苹果,每斤5元。小明买了3斤苹果,付了20元,应找回多少钱?", "answer": 5},
    {"question": "一列火车有12节车厢,每节车厢坐60人。如果火车满员,共有多少乘客?", "answer": 720},
    {"question": "甲、乙两人合作完成一项工作,甲单独做需10天,乙单独做需15天。两人合作几天能完成?", "answer": 6},
    {"question": "一个正方形的周长是24厘米,它的面积是多少平方厘米?", "answer": 36},
    {"question": "小红有一些糖果,她给了小明8颗后还剩12颗。她原来有多少颗糖果?", "answer": 20},
    {"question": "3个箱子共有45个球,第一个箱子比第二个多5个,第二个比第三个多5个。每个箱子各有多少球?", "answer": "20,15,10"},
    {"question": "一本书共300页,小明第一天读了全书的1/5,第二天读了剩下的1/4。第二天读了多少页?", "answer": 60},
    {"question": "把一根绳子对折3次后剪一刀,绳子被剪成几段?", "answer": 9},
    {"question": "一个数的3倍减去7等于20,这个数是多少?", "answer": 9},
    # BUG FIX: answer key was 32. A 1 m wide path around an 8 m x 5 m bed
    # covers (8+2)*(5+2) - 8*5 = 70 - 40 = 30 square meters.
    {"question": "长方形花坛长8米,宽5米。沿花坛外侧修一条1米宽的小路,小路面积是多少?", "answer": 30},
]
# Run the distilled model on every problem and do a naive answer check.
correct = 0
for i, problem in enumerate(gsm8k_test):
    messages = [{"role": "user", "content": problem["question"]}]
    text = distill_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = distill_tokenizer(text, return_tensors="pt").to(distill_model.device)
    with torch.no_grad():
        # NOTE(review): temperature=1.0 is ignored (with a warning) when
        # do_sample=False; safe to drop.
        output = distill_model.generate(
            **inputs, max_new_tokens=1024, do_sample=False, temperature=1.0
        )
    response = distill_tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    print(f"\n问题 {i+1}: {problem['question']}")
    print(f"回复: {response[:300]}...")
    print(f"标准答案: {problem['answer']}")
    # Naive substring check — prone to false positives/negatives (e.g. the
    # "20,15,10" key is unlikely to appear verbatim); verify by hand.
    if str(problem["answer"]) in response:
        correct += 1
        print("✓ 正确")
    else:
        print("✗ 可能错误(需人工验证)")
print(f"\n准确率: {correct}/{len(gsm8k_test)} = {correct/len(gsm8k_test)*100:.1f}%")
# === Step 3: analyze the reasoning-chain style ===
print("\n" + "="*60)
print("推理链风格分析")
print("="*60)
analysis_prompt = "一个工厂第一天生产了120件产品,第二天比第一天多生产20%,第三天生产的数量是前两天总和的一半。三天共生产了多少件产品?"
messages = [{"role": "user", "content": analysis_prompt}]
text = distill_tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = distill_tokenizer(text, return_tensors="pt").to(distill_model.device)
with torch.no_grad():
    output = distill_model.generate(
        **inputs, max_new_tokens=2048, do_sample=False
    )
full_response = distill_tokenizer.decode(
    output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(f"完整推理链:\n{full_response}")
print(f"\n推理链长度: {len(full_response)} 字符")
# Keyword heuristics for common R1-style reasoning traits
# (step-by-step work, self-verification, <think> tags, backtracking).
features = {
    "有逐步计算": "步" in full_response or "step" in full_response.lower(),
    "有自我验证": "验证" in full_response or "检查" in full_response,
    "有think标签": "<think>" in full_response,
    "有回溯修正": "不对" in full_response or "重新" in full_response,
}
print("\n推理链特征:")
for feat, present in features.items():
    print(f" {feat}: {'是' if present else '否'}")
与第4课 GRPO 模型对比:如果你在第4课完成了 GRPO 训练实验,请对比蒸馏模型和 GRPO 模型的推理链风格。蒸馏模型通常更"流畅"但可能不够"深入",而 GRPO 模型可能出现更多"探索性"推理。
选项 2:迷你多模态实验
目标:使用 Qwen3-VL-2B-Instruct 测试视觉理解能力并评估幻觉率。
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from PIL import Image
import requests
from io import BytesIO

# === Step 1: load the VLM ===
vl_model_name = "Qwen/Qwen3-VL-2B-Instruct"
# BUG FIX: a Qwen3-VL checkpoint cannot be loaded with
# Qwen2_5_VLForConditionalGeneration (architecture/config mismatch). The
# Auto class resolves the correct model class from the checkpoint config.
vl_model = AutoModelForImageTextToText.from_pretrained(
    vl_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(vl_model_name)
print(f"VLM 模型已加载,显存: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
# === Step 2: prepare test images ===
# Public test-image URLs. Each entry carries a description and questions,
# including one hallucination probe about content absent from the image.
test_images = [
    {
        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png",
        "description": "带透明背景的骰子图片",
        "questions": [
            "请描述图片中的内容。",
            "图片中有几个骰子?",
            "图片中有猫吗?",  # hallucination probe: no cat in this image
        ]
    },
    # Add more test images here if desired.
]
# Local image files are also supported.
def load_image(image_source):
    """Load an image from an http(s) URL or a local path, as RGB.

    Args:
        image_source: URL (starting with "http") or filesystem path.

    Returns:
        PIL.Image.Image in RGB mode.

    Raises:
        requests.HTTPError: if the download fails. (FIX: the original had
            no timeout and fed error-page bytes straight to PIL.)
    """
    if image_source.startswith("http"):
        response = requests.get(image_source, timeout=30)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    return Image.open(image_source).convert("RGB")
# === Step 3: visual question answering ===
def vqa_test(image, question):
    """Ask the VLM one question about `image` and return the reply text.

    Uses the module-level `processor` and `vl_model`.

    Args:
        image: PIL image.
        question: question text.

    Returns:
        The assistant's reply (best-effort extraction from the decoded text).
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[text], images=[image], return_tensors="pt"
    ).to(vl_model.device)
    with torch.no_grad():
        output = vl_model.generate(**inputs, max_new_tokens=256, do_sample=False)
    response = processor.decode(output[0], skip_special_tokens=True)
    # Crude extraction of the assistant turn: take everything after the
    # last literal "assistant" marker in the decoded conversation.
    if "assistant" in response:
        response = response.split("assistant")[-1].strip()
    return response
# === Step 4: run the VQA tests ===
print("\n视觉问答测试结果:")
print("="*60)
for img_info in test_images:
    try:
        image = load_image(img_info["url"])
        print(f"\n图片: {img_info['description']}")
        for question in img_info["questions"]:
            response = vqa_test(image, question)
            print(f"\n Q: {question}")
            print(f" A: {response[:200]}")
    except Exception as e:
        # Network/image failures should not abort the whole test run.
        print(f"加载图片失败: {e}")
        continue
# === Step 5: hallucination test ===
print("\n" + "="*60)
print("幻觉测试")
print("="*60)
# Questions about content that is (presumably) NOT in the image; a faithful
# model should decline rather than invent details.
hallucination_questions = [
    "图片中的文字写了什么?",  # if the image contains no text
    "图片中的人在做什么?",  # if the image contains no people
    "图片中有几辆汽车?",  # if the image contains no cars
    "图片背景中的建筑是什么风格?",  # if there is no building
    "图片中动物的品种是什么?",  # if there is no animal
]
hallucination_count = 0
total_tests = 0
for img_info in test_images:
    try:
        image = load_image(img_info["url"])
        for question in hallucination_questions:
            response = vqa_test(image, question)
            total_tests += 1
            # Heuristic: any refusal keyword counts as "correctly declined";
            # everything else is flagged as a possible hallucination.
            # NOTE(review): the bare "not" keyword matches many English
            # words and may over-count refusals.
            refusal_keywords = ["没有", "不存在", "看不到", "无法", "图中没有", "not"]
            is_refusal = any(kw in response for kw in refusal_keywords)
            if not is_refusal:
                hallucination_count += 1
                status = "⚠️ 可能幻觉"
            else:
                status = "✓ 正确拒绝"
            print(f" Q: {question}")
            print(f" A: {response[:150]}")
            print(f" 状态: {status}")
    except Exception as e:
        # NOTE(review): failures are skipped silently; consider logging `e`.
        continue
if total_tests > 0:
    print(f"\n幻觉率: {hallucination_count}/{total_tests} "
          f"= {hallucination_count/total_tests*100:.1f}%")
选项 3:迷你工具使用实验
目标:测试 Qwen3-1.7B 的函数调用能力,构建简单的智能体循环。
import torch
import json
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
# === Step 1: load the tool-use model ===
model_name = "Qwen/Qwen3-1.7B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# === Step 2: define the tool schemas (OpenAI function-calling format) ===
tools = [
    # Arithmetic expression evaluator.
    {
        "type": "function",
        "function": {
            "name": "calculator",
            "description": "一个简单的计算器,可以执行数学运算",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": "数学表达式,如 '2 + 3 * 4'"
                    }
                },
                "required": ["expression"]
            }
        }
    },
    # Mock weather lookup by city name.
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "获取指定城市的天气信息",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "城市名称"
                    }
                },
                "required": ["city"]
            }
        }
    },
    # Mock knowledge-base keyword search.
    {
        "type": "function",
        "function": {
            "name": "search_knowledge",
            "description": "搜索知识库获取信息",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "搜索关键词"
                    }
                },
                "required": ["query"]
            }
        }
    },
]
# === Step 3: simulate tool execution ===
def execute_tool(name, arguments):
    """Simulate execution of one of the demo tools.

    Args:
        name: tool name — "calculator", "get_weather", or "search_knowledge".
        arguments: dict of tool arguments as parsed from the model's call.

    Returns:
        A JSON string with the tool's result, or an "error" key on failure
        or for an unknown tool name.
    """
    if name == "calculator":
        try:
            # NOTE: eval of a model-produced expression; builtins are
            # stripped as a mitigation, but never do this with untrusted
            # input in production.
            value = eval(arguments.get("expression", ""), {"__builtins__": {}}, {})
        except Exception as exc:
            return json.dumps({"error": str(exc)})
        return json.dumps({"result": value})

    if name == "get_weather":
        # Canned weather table; unknown cities get a default reading.
        table = {
            "北京": {"temp": 22, "condition": "晴", "humidity": 35},
            "上海": {"temp": 26, "condition": "多云", "humidity": 65},
            "深圳": {"temp": 30, "condition": "阵雨", "humidity": 80},
        }
        requested_city = arguments.get("city", "未知")
        report = table.get(requested_city, {"temp": 20, "condition": "晴", "humidity": 50})
        return json.dumps(report, ensure_ascii=False)

    if name == "search_knowledge":
        # Tiny keyword-matched knowledge base; first matching key wins.
        entries = {
            "LoRA": "LoRA是一种参数高效微调方法,通过低秩分解减少可训练参数。",
            "量化": "量化将模型权重从高精度压缩到低精度,减少显存占用。",
            "DPO": "DPO是直接偏好优化方法,无需奖励模型即可进行偏好对齐。",
        }
        wanted = arguments.get("query", "")
        for topic, summary in entries.items():
            if topic in wanted:
                return json.dumps({"result": summary}, ensure_ascii=False)
        return json.dumps({"result": "未找到相关信息"}, ensure_ascii=False)

    return json.dumps({"error": "未知工具"})
# === Step 4: the agent loop ===
def agent_loop(user_query, max_turns=3):
    """Run a tool-augmented chat loop for one user query.

    Uses the module-level `model`, `tokenizer`, `tools`, and `execute_tool`.
    Each turn, the model either emits a <tool_call> block (which is executed
    and fed back as a tool message) or a plain answer (which ends the loop).

    Args:
        user_query: the user's question.
        max_turns: safety cap on model/tool round-trips.

    Returns:
        The final answer string, or "达到最大轮数" if the cap is hit.
    """
    messages = [
        {"role": "system", "content": "你是一个有用的助手,可以使用工具来回答问题。"},
        {"role": "user", "content": user_query},
    ]
    print(f"\n用户: {user_query}")
    for turn in range(max_turns):
        # Render the conversation (tool schemas included) and generate.
        text = tokenizer.apply_chat_template(
            messages, tools=tools, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(
                **inputs, max_new_tokens=512, do_sample=False
            )
        response_text = tokenizer.decode(
            output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        print(f"\n助手 (Turn {turn+1}): {response_text[:300]}")
        # Qwen wraps tool calls in <tool_call>...</tool_call>.
        # NOTE(review): only the first tool call per turn is handled.
        tool_call_match = re.search(
            r'<tool_call>\s*(\{.*?\})\s*</tool_call>',
            response_text, re.DOTALL
        )
        if tool_call_match:
            try:
                tool_call = json.loads(tool_call_match.group(1))
                tool_name = tool_call.get("name", "")
                tool_args = tool_call.get("arguments", {})
                print(f" → 调用工具: {tool_name}({json.dumps(tool_args, ensure_ascii=False)})")
                # Execute the (simulated) tool.
                result = execute_tool(tool_name, tool_args)
                print(f" ← 工具返回: {result}")
                # Append the assistant turn and the tool result so the next
                # round can condition on them.
                messages.append({"role": "assistant", "content": response_text})
                messages.append({
                    "role": "tool",
                    "name": tool_name,
                    "content": result
                })
            except json.JSONDecodeError:
                # Malformed tool-call JSON: abort this query.
                print(" ⚠️ 工具调用解析失败")
                break
        else:
            # No tool call — treat the reply as the final answer.
            print(f"\n最终回复: {response_text}")
            return response_text
    return "达到最大轮数"
# === Step 5: test scenarios ===
# Queries exercising each tool, plus multi-step reasoning cases.
test_queries = [
    "请帮我计算 (15 + 27) * 3 - 40 的结果。",
    "北京今天天气怎么样?",
    "帮我查一下 LoRA 是什么?",
    "上海的温度是多少?如果温度超过25度请提醒我带伞。",
    "请帮我计算一个圆的面积,半径是5厘米。(使用3.14159计算)",
]
# NOTE(review): `correct`/`total` are initialized but never updated below —
# accuracy must be tallied manually during the experiment.
correct = 0
total = len(test_queries)
for query in test_queries:
    print("\n" + "="*60)
    result = agent_loop(query)
    # Manual correctness judgment (could be automated).
    print(f"\n请评估此回复是否正确 (y/n): ", end="")
    # Record the outcome here during the actual experiment.
# === Step 6: compare with a smaller model (optional) ===
print("\n\n" + "="*60)
print("与 Qwen3-0.6B 对比测试")
print("="*60)
# Load the 0.6B model.
small_model_name = "Qwen/Qwen3-0.6B"
small_model = AutoModelForCausalLM.from_pretrained(
    small_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
small_tokenizer = AutoTokenizer.from_pretrained(small_model_name)
# Re-run the same scenarios — single turn only, no tool-execution loop.
for query in test_queries[:3]:  # first three queries only
    print(f"\n查询: {query}")
    messages = [
        {"role": "system", "content": "你是一个有用的助手,可以使用工具来回答问题。"},
        {"role": "user", "content": query},
    ]
    text = small_tokenizer.apply_chat_template(
        messages, tools=tools, tokenize=False, add_generation_prompt=True
    )
    inputs = small_tokenizer(text, return_tensors="pt").to(small_model.device)
    with torch.no_grad():
        output = small_model.generate(
            **inputs, max_new_tokens=512, do_sample=False
        )
    response = small_tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    print(f"Qwen3-0.6B 回复: {response[:200]}")
模型规模对工具使用的影响:实验中你可能会发现,1.7B 模型的函数调用准确率显著高于 0.6B 模型。工具使用需要模型同时具备理解用户意图、选择合适工具、生成正确参数格式三项能力,这对小模型来说是较大挑战。
交付物清单
完成本次实验后,请提交以下内容:
量化实验报告(必做)
- 三种精度的显存占用对比表
- 推理速度对比表(tokens/s、首 token 延迟)
- 三类任务的质量评分对比表(GSM8K、指令跟随、中文任务)
- "压缩率 vs 质量保持率"的关系图
- 简要分析:量化对哪类任务影响最大?为什么?
选做实验报告(三选一)
- 选项 1:蒸馏模型的推理链分析、与 GRPO 模型的风格对比
- 选项 2:VLM 的视觉问答结果、幻觉率统计
- 选项 3:函数调用准确率、智能体循环运行示例
总结反思(1 页)
- 后训练各环节(SFT → DPO → GRPO → 量化/蒸馏)的关系与选择策略
- 对于一个实际的 LLM 应用项目,你会如何选择后训练技术组合?