What? You Still Can't Fine-Tune a T5 Model? A Hands-On Walkthrough!
What is the T5 architecture?
If you want the full details, see the paper: "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"
https://arxiv.org/pdf/1910.10683
Brief overview
The T5 architecture is very close to the original Transformer:
- It uses the full encoder + decoder Transformer architecture.
- It is a **sequence-to-sequence** model.
Differences from the standard Transformer:
- It uses a simplified form of layer normalization that only rescales the activations and applies no additive bias (a small sketch follows below).
- Positional information is encoded with relative position embeddings instead of sinusoidal (sin/cos) position encodings.
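To make the first difference concrete, here is a minimal sketch of that simplified layer normalization (the class name `T5StyleLayerNorm` is ours, for illustration only): activations are rescaled by their root mean square, with no mean subtraction and no additive bias.

```python
import torch
import torch.nn as nn

class T5StyleLayerNorm(nn.Module):
    """Simplified layer norm: rescale only, no mean subtraction, no bias."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # scale only, no bias term
        self.eps = eps

    def forward(self, hidden_states):
        # Variance is computed without subtracting the mean (RMS-style normalization).
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
        return self.weight * hidden_states
```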
What are the advantages?
Compared with GPT-style and BERT-style models:
- Multi-task handling: T5 is pretrained on many tasks at once, which gives it an edge on complex tasks that need several steps or several abilities combined.
- Generative tasks: for tasks that produce natural language (summarization, translation, question answering and question generation, and so on), T5 maps the task directly onto text generation, which makes it flexible and adaptable (see the prefix example after this list).
- Transfer learning: because pretraining already covers a wide range of tasks, T5 tends to transfer well to new ones.
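The text-to-text framing is easiest to see from the task prefixes used in T5's pretraining mixture; a small illustration (the sentences are made-up examples):

```python
# Every task is cast as "input text -> output text"; a prefix tells the model which task to run.
task_inputs = {
    "translation":    "translate English to German: The house is wonderful.",
    "summarization":  "summarize: state authorities dispatched emergency crews on Tuesday to ...",
    "classification": "cola sentence: The course is jumping well.",
}

# With a pretrained checkpoint loaded as `model`/`tokenizer`, inference is the same call for every task:
# outputs = model.generate(**tokenizer(task_inputs["translation"], return_tensors="pt"))
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```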
Fine-tuning (code walkthrough)
Here we do a simple fine-tune with the transformers and peft libraries.
Remember the three steps of fine-tuning:
- prepare the model
- prepare the data
- configure the arguments and train
Preparing the model
Load the model
```python
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

model_name = "/home/valiantsec/phb/models/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)  # needed later for preprocessing
```
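The import list above pulls in BitsAndBytesConfig but never uses it. If GPU memory is tight, here is a hedged sketch of loading the base model in 8-bit instead (this assumes the bitsandbytes package is installed, which this post does not cover):

```python
# Optional: 8-bit loading to reduce GPU memory (assumes `pip install bitsandbytes`).
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
# When combining quantization with LoRA, peft's prepare_model_for_kbit_training(model)
# is usually applied before wrapping the model with the adapter.
```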
Wrap the model with PEFT
```python
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],   # T5's attention query and value projections
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
model = get_peft_model(model, lora_config)
```
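After wrapping, it is worth confirming that only the injected LoRA matrices are trainable while the T5 backbone stays frozen; PEFT models expose a helper for exactly that:

```python
# Prints the number of trainable vs. total parameters of the wrapped model.
model.print_trainable_parameters()
# Typical output looks roughly like:
# trainable params: ... || all params: ... || trainable%: ...
```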
Preparing the data
Load and split the data
```python
# loading dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]

# map each integer label to its text form, e.g. 0 -> "negative"
classes = dataset["train"].features["label"].names
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True,
    num_proc=1,
)
```
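A quick sanity check confirms the mapping worked: each record should now carry both the integer label and its text form (the exact content depends on the random split):

```python
# Inspect one mapped record: the integer `label` and the new `text_label` should agree.
print(dataset["train"][0])
# e.g. {'sentence': '...', 'label': 1, 'text_label': 'neutral'}
```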
Tokenization + truncation/padding + labels
```python
# data preprocessing
text_column = "sentence"
label_column = "text_label"
max_length = 128

def preprocess_function(examples):
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100  # padding positions are ignored by the loss
    model_inputs["labels"] = labels
    return model_inputs

processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
```
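Before training, decoding one processed example is a cheap way to verify the tokenization and label masking (positions set to -100 are ignored by the loss):

```python
# Decode one processed example to make sure inputs and labels look sensible.
sample = train_dataset[0]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
label_ids = [t for t in sample["labels"] if t != -100]  # drop the masked padding positions
print(tokenizer.decode(label_ids, skip_special_tokens=True))  # should print the text label
```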
Training
Configure the arguments
```python
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    "temp",
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    gradient_accumulation_steps=1,
    auto_find_batch_size=True,
    num_train_epochs=1,
    save_steps=100,
    save_total_limit=8,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
```
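A plain Trainer is enough here because the targets are only a few tokens long. If you would rather evaluate by actually generating text, transformers also provides Seq2SeqTrainingArguments and Seq2SeqTrainer with predict_with_generate; a sketch of that alternative (not used in the rest of this post):

```python
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Alternative setup: evaluation runs model.generate() instead of comparing raw logits.
seq2seq_args = Seq2SeqTrainingArguments(
    "temp",
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    num_train_epochs=1,
    predict_with_generate=True,
)
seq2seq_trainer = Seq2SeqTrainer(
    model=model,
    args=seq2seq_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
```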
Start training
```python
trainer.train()
```
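Once training finishes, you can spot-check the fine-tuned model directly; a minimal inference sketch (the example sentence is made up):

```python
# Quick spot check: classify one sentence with the fine-tuned model.
model.eval()
inputs = tokenizer(
    "The company reported a strong increase in quarterly revenue.",  # made-up example
    return_tensors="pt",
).to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=3)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))  # expected: one of the class names
```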
One-click training script
The full scripts are included at the bottom of this post.
Additional scripts
Run from the command line
```bash
python train.py
```
YAML configuration file (config.yaml)
```yaml
# Model Configuration
model_path: "/home/valiantsec/phb/models/flan-t5-small"
# model_path: "/home/valiantsec/phb/models/codet5"
# or the path to any other T5 model of your choice

# Dataset Configuration
max_length: 128  # maximum length of the input sequence

# Training Configuration
output_dir: "./results"
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 1e-3
lr_scheduler_type: "linear"
warmup_ratio: 0.1
num_train_epochs: 2

# Saving and Logging Configuration
save_strategy: "steps"
save_steps: 500
logging_steps: 100
evaluation_strategy: "steps"
eval_steps: 500

# LoRA Configuration
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
```
Training script (train.py)
```python
import yaml
import logging
import traceback
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback
from peft import LoraConfig, get_peft_model

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class T5Classifier:
    def __init__(self, config_file='./config.yaml'):
        try:
            self.config = self.load_config(config_file)
            logger.info("Configuration loaded successfully.")
            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.config['model_path'], device_map="auto")
            self.tokenizer = AutoTokenizer.from_pretrained(self.config['model_path'])
            logger.info(f"Model and tokenizer loaded from {self.config['model_path']}")
        except Exception as e:
            logger.error(f"Error during initialization: {str(e)}")
            logger.error(traceback.format_exc())
            raise

    def load_config(self, config_file):
        try:
            with open(config_file, 'r') as file:
                config = yaml.safe_load(file)
            # Log the type and value of every configuration entry
            for key, value in config.items():
                logger.info(f"{key} - type: {type(value)}, value: {value}")
            return config
        except Exception as e:
            logger.error(f"Error while loading the configuration: {str(e)}")
            logger.error(traceback.format_exc())
            raise

    def prepare_datasets(self):
        try:
            logger.info("Loading dataset...")
            dataset = load_dataset("financial_phrasebank", "sentences_allagree")
            dataset = dataset["train"].train_test_split(test_size=0.1)
            dataset["validation"] = dataset["test"]
            del dataset["test"]
            classes = dataset["train"].features["label"].names
            dataset = dataset.map(
                lambda x: {"text_label": [classes[label] for label in x["label"]]},
                batched=True,
                num_proc=1,
            )
            logger.info("Dataset prepared and split into train and validation sets.")
            processed_datasets = dataset.map(
                self.preprocess_function,
                batched=True,
                num_proc=1,
                remove_columns=dataset["train"].column_names,
                load_from_cache_file=False,
                desc="Running tokenizer on dataset",
            )
            self.train_dataset = processed_datasets["train"]
            self.eval_dataset = processed_datasets["validation"]
            logger.info("Dataset processed and tokenized.")
        except Exception as e:
            logger.error(f"Error while preparing the datasets: {str(e)}")
            logger.error(traceback.format_exc())
            raise

    def configure_lora(self):
        try:
            lora_config = LoraConfig(
                r=int(self.config['lora_r']),
                lora_alpha=int(self.config['lora_alpha']),
                target_modules=["q", "v"],
                lora_dropout=float(self.config['lora_dropout']),
                bias="none",
                task_type="SEQ_2_SEQ_LM"
            )
            self.model = get_peft_model(self.model, lora_config)
            logger.info("LoRA configuration applied to the model.")
        except Exception as e:
            logger.error(f"Error while configuring LoRA: {str(e)}")
            logger.error(traceback.format_exc())
            raise

    def train(self):
        try:
            # Check the type and value of the critical parameters
            critical_params = ['learning_rate', 'num_train_epochs', 'per_device_train_batch_size',
                               'per_device_eval_batch_size', 'gradient_accumulation_steps',
                               'warmup_ratio', 'save_steps', 'logging_steps', 'eval_steps']
            for param in critical_params:
                logger.info(f"{param} - type: {type(self.config[param])}, value: {self.config[param]}")
            training_args = TrainingArguments(
                output_dir=self.config['output_dir'],
                per_device_train_batch_size=int(self.config['per_device_train_batch_size']),
                per_device_eval_batch_size=int(self.config['per_device_eval_batch_size']),
                gradient_accumulation_steps=int(self.config['gradient_accumulation_steps']),
                learning_rate=float(self.config['learning_rate']),
                lr_scheduler_type=self.config['lr_scheduler_type'],
                warmup_ratio=float(self.config['warmup_ratio']),
                num_train_epochs=float(self.config['num_train_epochs']),
                save_strategy=self.config['save_strategy'],
                save_steps=int(self.config['save_steps']),
                logging_steps=int(self.config['logging_steps']),
                evaluation_strategy=self.config['evaluation_strategy'],
                eval_steps=int(self.config['eval_steps'])
            )

            class LoggingCallback(TrainerCallback):
                def on_evaluate(self, args, state, control, metrics, **kwargs):
                    logger.info(f"Evaluation metrics: {metrics}")

                def on_log(self, args, state, control, logs=None, **kwargs):
                    if logs:
                        logger.info(f"Training log: {logs}")

            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=self.train_dataset,
                eval_dataset=self.eval_dataset,
                compute_metrics=self.compute_metrics,
                callbacks=[LoggingCallback()],
            )
            logger.info("Starting training...")
            trainer.train()
            logger.info("Training finished.")
            logger.info("Running the final evaluation...")
            eval_results = trainer.evaluate()
            logger.info(f"Final evaluation results: {eval_results}")
        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            logger.error(f"Error type: {type(e)}")
            logger.error(f"Error args: {e.args}")
            logger.error(f"Traceback:\n{traceback.format_exc()}")
            raise

    def preprocess_function(self, examples):
        try:
            text_column = "sentence"
            label_column = "text_label"
            inputs = examples[text_column]
            targets = examples[label_column]
            model_inputs = self.tokenizer(inputs, max_length=self.config['max_length'], padding="max_length", truncation=True)
            labels = self.tokenizer(targets, max_length=3, padding="max_length", truncation=True)
            # Replace padding token ids with -100 so they are ignored by the loss
            label_ids = [
                [(token if token != self.tokenizer.pad_token_id else -100) for token in seq]
                for seq in labels["input_ids"]
            ]
            model_inputs["labels"] = label_ids
            # Log a few samples
            logger.info(f"Sample input: {inputs[0]}")
            logger.info(f"Sample target: {targets[0]}")
            logger.info(f"Sample model input: {model_inputs['input_ids'][0]}")
            logger.info(f"Sample labels: {model_inputs['labels'][0]}")
            return model_inputs
        except Exception as e:
            logger.error(f"Error in the preprocessing function: {str(e)}")
            logger.error(traceback.format_exc())
            raise

    def compute_metrics(self, eval_pred):
        try:
            predictions, labels = eval_pred
            logger.info(f"Predictions type: {type(predictions)}")
            logger.info(f"Labels type: {type(labels)}")
            if isinstance(predictions, tuple):
                logger.info(f"Predictions is a tuple with {len(predictions)} elements")
                # Assume the first element contains the logits
                predictions = predictions[0]
            logger.info(f"Predictions shape: {predictions.shape}")
            logger.info(f"Labels shape: {labels.shape}")
            # Get the predicted token ids by taking the argmax over the vocabulary
            predicted_classes = np.argmax(predictions, axis=-1)
            decoded_preds = self.tokenizer.batch_decode(predicted_classes, skip_special_tokens=True)
            # Replace -100 with tokenizer.pad_token_id before decoding
            labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
            decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
            logger.info(f"Sample prediction: {decoded_preds[0]}")
            logger.info(f"Sample label: {decoded_labels[0]}")
            # Compare predictions and labels
            accuracy = sum([pred.strip() == label.strip() for pred, label in zip(decoded_preds, decoded_labels)]) / len(decoded_preds)
            logger.info(f"Computed accuracy: {accuracy}")
            return {"accuracy": accuracy}
        except Exception as e:
            logger.error(f"Error in compute_metrics: {str(e)}")
            logger.error(traceback.format_exc())
            return {"accuracy": 0.0}  # return a default value


def main():
    try:
        classifier = T5Classifier()
        classifier.prepare_datasets()
        classifier.configure_lora()
        classifier.train()
    except Exception as e:
        logger.error(f"Error in main: {str(e)}")
        logger.error(traceback.format_exc())


if __name__ == "__main__":
    main()
```
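The script above never persists the trained adapter. If you want to keep and reuse it, here is a short sketch of saving and reloading the LoRA weights (the `./lora-adapter` path is a placeholder):

```python
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM

# Inside main(), after classifier.train(), only the small LoRA adapter needs to be saved:
#     classifier.model.save_pretrained("./lora-adapter")

# Later, reload by attaching the saved adapter to a freshly loaded frozen base model:
base = AutoModelForSeq2SeqLM.from_pretrained("/home/valiantsec/phb/models/flan-t5-small")
restored = PeftModel.from_pretrained(base, "./lora-adapter")
```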